ia64/xen-unstable

xen/arch/x86/memory.c @ 4895:24dfd18ea63e

bitkeeper revision 1.1159.258.120 (42848bfe8kMyWWcBA64rq7h7l7AyoA)

Fix a shadow-code bug (found by Ian) that was breaking refcounts and subsequently
causing migration problems.
author mafetter@fleming.research
date Fri May 13 11:14:06 2005 +0000 (2005-05-13)
parents c1b75b4f338c
children
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame may be used in none of these three ways!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OSes, which will generally use the WP bit to simplify their copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
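/*
 * Illustrative sketch, not part of the original file: how a guest might
 * drive the API described above. The request encoding (command selected by
 * the low bits of 'ptr', extended command carried in 'val') is taken from
 * the decoding in do_mmu_update()/do_extended_command() below; the
 * HYPERVISOR_mmu_update() wrapper and the example values are assumptions.
 */
#if 0
static void example_pin_and_update(unsigned long l2_mfn,
                                   unsigned long pte_maddr,
                                   unsigned long new_pte)
{
    mmu_update_t req[2];
    unsigned int done = 0;

    /* Extended command: pin frame 'l2_mfn' as an L2 page table. */
    req[0].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[0].val = MMUEXT_PIN_L2_TABLE;

    /* Normal update: *pte_maddr = new_pte, subject to the checks above. */
    req[1].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req[1].val = new_pte;

    (void)HYPERVISOR_mmu_update(req, 2, &done);
}
#endif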
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
104 #include <asm/e820.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
109 current->id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 static int alloc_l2_table(struct pfn_info *page);
115 static int alloc_l1_table(struct pfn_info *page);
116 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
117 static int get_page_and_type_from_pagenr(unsigned long page_nr,
118 u32 type,
119 struct domain *d);
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
125 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
127 /* Used to defer flushing of memory structures. */
128 static struct {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
130 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
131 unsigned long deferred_ops;
132 /* If non-NULL, specifies a foreign subject domain for some operations. */
133 struct domain *foreign;
134 } __cacheline_aligned percpu_info[NR_CPUS];
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct pfn_info *frame_table;
147 unsigned long frame_table_size;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long i, p;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
159 {
160 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
161 if ( p == 0 )
162 panic("Not enough memory for frame table\n");
163 idle_pg_table[(FRAMETABLE_VIRT_START + i) >> L2_PAGETABLE_SHIFT] =
164 mk_l2_pgentry(p | __PAGE_HYPERVISOR | _PAGE_PSE);
165 }
167 memset(frame_table, 0, frame_table_size);
168 }
170 void arch_init_memory(void)
171 {
172 unsigned long i, j, pfn, nr_pfns;
173 struct pfn_info *page;
175 /*
176 * We are rather picky about the layout of 'struct pfn_info'. The
177 * count_info and domain fields must be adjacent, as we perform atomic
178 * 64-bit operations on them. Also, just for sanity, we assert the size
179 * of the structure here.
180 */
181 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
182 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
183 (sizeof(struct pfn_info) != 24) )
184 {
185 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
186 offsetof(struct pfn_info, count_info),
187 offsetof(struct pfn_info, u.inuse.domain),
188 sizeof(struct pfn_info));
189 for ( ; ; ) ;
190 }
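/*
 * Illustrative sketch, not part of the original file: the 'struct pfn_info'
 * layout that the checks above assume, reconstructed from this file's uses.
 * Field order and types are an assumption consistent with the asserted
 * 24-byte size and the count_info/domain adjacency (32-bit build); the
 * authoritative definition lives in the mm header, not here.
 */
#if 0
struct pfn_info
{
    struct list_head list;         /* 8 bytes: per-domain page list.        */
    u32 count_info;                /* 4 bytes: PGC_* flags + general count. */
    union {
        struct {
            struct domain *domain; /* Owner; sits right after count_info so
                                    * the (count_info, domain) pair can be
                                    * updated atomically with cmpxchg8b
                                    * (see MMUEXT_TRANSFER_PAGE below).     */
            u32 type_info;         /* PGT_* flags + type count.             */
        } inuse;
    } u;
    u32 tlbflush_timestamp;        /* 'TLB clock' stamp for safety flushes. */
};
#endif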
192 memset(percpu_info, 0, sizeof(percpu_info));
194 /* Initialise to a magic value of 0x55555555 so it's easier to spot bugs later. */
195 memset(machine_to_phys_mapping, 0x55, 4<<20);
197 /*
198 * Initialise our DOMID_XEN domain.
199 * Any Xen-heap pages that we will allow to be mapped will have
200 * their domain field set to dom_xen.
201 */
202 dom_xen = alloc_domain_struct();
203 atomic_set(&dom_xen->refcnt, 1);
204 dom_xen->id = DOMID_XEN;
206 /*
207 * Initialise our DOMID_IO domain.
208 * This domain owns I/O pages that are within the range of the pfn_info
209 * array. Mappings occur at the privilege level of the caller.
210 */
211 dom_io = alloc_domain_struct();
212 atomic_set(&dom_io->refcnt, 1);
213 dom_io->id = DOMID_IO;
215 /* M2P table is mappable read-only by privileged domains. */
216 for ( i = 0; i < 1024; i++ )
217 {
218 /* Ensure it's mapped read-only by guests (use GDT type). */
219 page = &frame_table[m2p_start_mfn+i];
220 page->count_info = PGC_allocated | 1;
221 page->u.inuse.type_info = PGT_gdt_page | PGT_validated | 1;
222 page->u.inuse.domain = dom_xen;
223 }
225 /* First 1MB of RAM is historically marked as I/O. */
226 for ( i = 0; i < 0x100; i++ )
227 {
228 page = &frame_table[i];
229 page->count_info = PGC_allocated | 1;
230 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
231 page->u.inuse.domain = dom_io;
232 }
234 /* Any non-RAM areas in the e820 map are considered to be for I/O. */
235 for ( i = 0; i < e820.nr_map; i++ )
236 {
237 if ( e820.map[i].type == E820_RAM )
238 continue;
239 pfn = e820.map[i].addr >> PAGE_SHIFT;
240 nr_pfns = (e820.map[i].size +
241 (e820.map[i].addr & ~PAGE_MASK) +
242 ~PAGE_MASK) >> PAGE_SHIFT;
243 for ( j = 0; j < nr_pfns; j++ )
244 {
245 if ( !pfn_valid(pfn+j) )
246 continue;
247 page = &frame_table[pfn+j];
248 page->count_info = PGC_allocated | 1;
249 page->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;
250 page->u.inuse.domain = dom_io;
251 }
252 }
253 }
255 static void __invalidate_shadow_ldt(struct domain *d)
256 {
257 int i;
258 unsigned long pfn;
259 struct pfn_info *page;
261 d->mm.shadow_ldt_mapcnt = 0;
263 for ( i = 16; i < 32; i++ )
264 {
265 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
266 if ( pfn == 0 ) continue;
267 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
268 page = &frame_table[pfn];
269 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
270 ASSERT_PAGE_IS_DOMAIN(page, d);
271 put_page_and_type(page);
272 }
274 /* Dispose of the (now possibly invalid) mappings from the TLB. */
275 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
276 }
279 static inline void invalidate_shadow_ldt(struct domain *d)
280 {
281 if ( d->mm.shadow_ldt_mapcnt != 0 )
282 __invalidate_shadow_ldt(d);
283 }
286 static int alloc_segdesc_page(struct pfn_info *page)
287 {
288 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
289 int i;
291 for ( i = 0; i < 512; i++ )
292 if ( unlikely(!check_descriptor(&descs[i*2])) )
293 goto fail;
295 unmap_domain_mem(descs);
296 return 1;
298 fail:
299 unmap_domain_mem(descs);
300 return 0;
301 }
304 /* Map shadow page at offset @off. */
305 int map_ldt_shadow_page(unsigned int off)
306 {
307 struct domain *d = current;
308 unsigned long l1e;
310 if ( unlikely(in_irq()) )
311 BUG();
313 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
314 PAGE_SHIFT) + off]);
316 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
317 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
318 d, PGT_ldt_page)) )
319 return 0;
321 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
322 d->mm.shadow_ldt_mapcnt++;
324 return 1;
325 }
328 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
329 {
330 struct pfn_info *page = &frame_table[page_nr];
332 if ( unlikely(!pfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
333 {
334 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
335 return 0;
336 }
338 return 1;
339 }
342 static int get_page_and_type_from_pagenr(unsigned long page_nr,
343 u32 type,
344 struct domain *d)
345 {
346 struct pfn_info *page = &frame_table[page_nr];
348 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
349 return 0;
351 if ( unlikely(!get_page_type(page, type)) )
352 {
353 #ifdef VERBOSE
354 if ( (type & PGT_type_mask) != PGT_l1_page_table )
355 MEM_LOG("Bad page type for pfn %08lx (%08x)",
356 page_nr, page->u.inuse.type_info);
357 #endif
358 put_page(page);
359 return 0;
360 }
362 return 1;
363 }
366 /*
367 * We allow L2 tables to map each other (a.k.a. linear page tables). This
368 * needs some special care with reference counts and access permissions:
369 * 1. The mapping entry must be read-only, or the guest may get write access
370 * to its own PTEs.
371 * 2. We must only bump the reference counts for an *already validated*
372 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
373 * on a validation that itself depends on the validation we are performing.
374 * 3. We only need to increment the reference counts for the mapped page
375 * frame if it is mapped by a different L2 table. This is sufficient and
376 * also necessary to allow validation of an L2 table mapping itself.
377 */
378 static int
379 get_linear_pagetable(
380 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
381 {
382 u32 x, y;
383 struct pfn_info *page;
385 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
386 {
387 MEM_LOG("Attempt to create linear p.t. with write perms");
388 return 0;
389 }
391 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
392 {
393 /* Make sure the mapped frame belongs to the correct domain. */
394 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
395 return 0;
397 /*
398 * Make sure that the mapped frame is an already-validated L2 table.
399 * If so, atomically increment the count (checking for overflow).
400 */
401 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
402 y = page->u.inuse.type_info;
403 do {
404 x = y;
405 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
406 unlikely((x & (PGT_type_mask|PGT_validated)) !=
407 (PGT_l2_page_table|PGT_validated)) )
408 {
409 put_page(page);
410 return 0;
411 }
412 }
413 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
414 }
416 return 1;
417 }
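/*
 * Illustrative sketch, not part of the original file: what a legal linear
 * mapping looks like from the guest's side, per the rules above. A guest
 * points one of its own L2 slots back at the L2 frame itself, read-only,
 * so it can read (but never write) its PTEs through that window. The
 * variable names are made up; the entry would be submitted through the
 * normal mmu_update path.
 */
#if 0
    unsigned long l2_mfn;       /* machine frame of the (validated) L2 table */
    unsigned long linear_l2e;
    linear_l2e = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;
    /* No _PAGE_RW: get_linear_pagetable() above rejects writable linear
     * mappings outright (rule 1). */
#endif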
420 static int
421 get_page_from_l1e(
422 l1_pgentry_t l1e, struct domain *d)
423 {
424 unsigned long l1v = l1_pgentry_val(l1e);
425 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
426 struct pfn_info *page = &frame_table[pfn];
427 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
429 if ( !(l1v & _PAGE_PRESENT) )
430 return 1;
432 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
433 {
434 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
435 return 0;
436 }
438 if ( unlikely(!pfn_valid(pfn)) ||
439 unlikely(page->u.inuse.domain == dom_io) )
440 {
441 /* DOMID_IO reverts to caller for privilege checks. */
442 if ( d == dom_io )
443 d = current;
445 if ( (!IS_PRIV(d)) &&
446 (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, pfn)) )
447 {
448 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
449 return 0;
450 }
452 /* No reference counting for out-of-range I/O pages. */
453 if ( !pfn_valid(pfn) )
454 return 1;
456 d = dom_io;
457 }
459 return ((l1v & _PAGE_RW) ?
460 get_page_and_type(page, d, PGT_writable_page) :
461 get_page(page, d));
462 }
465 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
466 static int
467 get_page_from_l2e(
468 l2_pgentry_t l2e, unsigned long pfn,
469 struct domain *d, unsigned long va_idx)
470 {
471 int rc;
473 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
474 return 1;
476 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
477 {
478 MEM_LOG("Bad L2 page type settings %04lx",
479 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
480 return 0;
481 }
483 rc = get_page_and_type_from_pagenr(
484 l2_pgentry_to_pagenr(l2e),
485 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
487 if ( unlikely(!rc) )
488 return get_linear_pagetable(l2e, pfn, d);
490 return 1;
491 }
494 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
495 {
496 unsigned long l1v = l1_pgentry_val(l1e);
497 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
498 struct pfn_info *page = &frame_table[pfn];
499 struct domain *e;
501 if ( !(l1v & _PAGE_PRESENT) || !pfn_valid(pfn) )
502 return;
504 e = page->u.inuse.domain;
505 if ( unlikely(e != d) )
506 {
507 /*
508 * Unmap a foreign page that may have been mapped via a grant table.
509 * Note that this can fail for a privileged domain that can map foreign
510 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
511 * counted via a grant entry and some counted directly in the page
512 * structure's reference count. Note that reference counts won't get
513 * dangerously confused as long as we always try to decrement the
514 * grant entry first. We may end up with a mismatch between which
515 * mappings and which unmappings are counted via the grant entry, but
516 * really it doesn't matter as privileged domains have carte blanche.
517 */
518 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
519 return;
520 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
521 }
523 if ( l1v & _PAGE_RW )
524 {
525 put_page_and_type(page);
526 }
527 else
528 {
529 /* We expect this to be rare, so we blow away the entire shadow LDT. */
530 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
531 PGT_ldt_page)) &&
532 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
533 invalidate_shadow_ldt(e);
534 put_page(page);
535 }
536 }
539 /*
540 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
541 * Note also that this automatically deals correctly with linear p.t.'s.
542 */
543 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
544 {
545 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
546 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
547 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
548 }
551 static int alloc_l2_table(struct pfn_info *page)
552 {
553 struct domain *d = page->u.inuse.domain;
554 unsigned long page_nr = page_to_pfn(page);
555 l2_pgentry_t *pl2e;
556 int i;
558 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
560 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
561 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
562 goto fail;
564 #if defined(__i386__)
565 /* Now we add our private high mappings. */
566 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
567 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
568 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
569 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
570 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
571 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
572 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
573 __PAGE_HYPERVISOR);
574 #endif
576 unmap_domain_mem(pl2e);
577 return 1;
579 fail:
580 while ( i-- > 0 )
581 put_page_from_l2e(pl2e[i], page_nr);
583 unmap_domain_mem(pl2e);
584 return 0;
585 }
588 static int alloc_l1_table(struct pfn_info *page)
589 {
590 struct domain *d = page->u.inuse.domain;
591 unsigned long page_nr = page_to_pfn(page);
592 l1_pgentry_t *pl1e;
593 int i;
595 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
597 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
598 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
599 goto fail;
601 unmap_domain_mem(pl1e);
602 return 1;
604 fail:
605 while ( i-- > 0 )
606 put_page_from_l1e(pl1e[i], d);
608 unmap_domain_mem(pl1e);
609 return 0;
610 }
613 static void free_l2_table(struct pfn_info *page)
614 {
615 unsigned long page_nr = page - frame_table;
616 l2_pgentry_t *pl2e;
617 int i;
619 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
621 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
622 put_page_from_l2e(pl2e[i], page_nr);
624 unmap_domain_mem(pl2e);
625 }
628 static void free_l1_table(struct pfn_info *page)
629 {
630 struct domain *d = page->u.inuse.domain;
631 unsigned long page_nr = page - frame_table;
632 l1_pgentry_t *pl1e;
633 int i;
635 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
637 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
638 put_page_from_l1e(pl1e[i], d);
640 unmap_domain_mem(pl1e);
641 }
644 static inline int update_l2e(l2_pgentry_t *pl2e,
645 l2_pgentry_t ol2e,
646 l2_pgentry_t nl2e)
647 {
648 unsigned long o = cmpxchg((unsigned long *)pl2e,
649 l2_pgentry_val(ol2e),
650 l2_pgentry_val(nl2e));
651 if ( o != l2_pgentry_val(ol2e) )
652 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
653 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
654 return (o == l2_pgentry_val(ol2e));
655 }
658 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
659 static int mod_l2_entry(l2_pgentry_t *pl2e,
660 l2_pgentry_t nl2e,
661 unsigned long pfn)
662 {
663 l2_pgentry_t ol2e;
664 unsigned long _ol2e;
666 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
667 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
668 {
669 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
670 return 0;
671 }
673 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
674 return 0;
675 ol2e = mk_l2_pgentry(_ol2e);
677 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
678 {
679 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
680 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
681 return update_l2e(pl2e, ol2e, nl2e);
683 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
684 ((unsigned long)pl2e &
685 ~PAGE_MASK) >> 2)) )
686 return 0;
688 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
689 {
690 put_page_from_l2e(nl2e, pfn);
691 return 0;
692 }
694 put_page_from_l2e(ol2e, pfn);
695 return 1;
696 }
698 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
699 return 0;
701 put_page_from_l2e(ol2e, pfn);
702 return 1;
703 }
706 static inline int update_l1e(l1_pgentry_t *pl1e,
707 l1_pgentry_t ol1e,
708 l1_pgentry_t nl1e)
709 {
710 unsigned long o = l1_pgentry_val(ol1e);
711 unsigned long n = l1_pgentry_val(nl1e);
713 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
714 unlikely(o != l1_pgentry_val(ol1e)) )
715 {
716 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
717 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
718 return 0;
719 }
721 return 1;
722 }
725 /* Update the L1 entry at pl1e to new value nl1e. */
726 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
727 {
728 l1_pgentry_t ol1e;
729 unsigned long _ol1e;
730 struct domain *d = current;
732 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
733 {
734 MEM_LOG("Bad get_user\n");
735 return 0;
736 }
738 ol1e = mk_l1_pgentry(_ol1e);
740 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
741 {
742 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
743 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
744 return update_l1e(pl1e, ol1e, nl1e);
746 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
747 return 0;
749 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
750 {
751 put_page_from_l1e(nl1e, d);
752 return 0;
753 }
755 put_page_from_l1e(ol1e, d);
756 return 1;
757 }
759 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
760 return 0;
762 put_page_from_l1e(ol1e, d);
763 return 1;
764 }
767 int alloc_page_type(struct pfn_info *page, unsigned int type)
768 {
769 switch ( type )
770 {
771 case PGT_l1_page_table:
772 return alloc_l1_table(page);
773 case PGT_l2_page_table:
774 return alloc_l2_table(page);
775 case PGT_gdt_page:
776 case PGT_ldt_page:
777 return alloc_segdesc_page(page);
778 default:
779 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
780 type, page->u.inuse.type_info,
781 page->count_info);
782 BUG();
783 }
785 return 0;
786 }
789 void free_page_type(struct pfn_info *page, unsigned int type)
790 {
791 struct domain *d = page->u.inuse.domain;
793 switch ( type )
794 {
795 case PGT_l1_page_table:
796 free_l1_table(page);
797 break;
799 case PGT_l2_page_table:
800 free_l2_table(page);
801 break;
803 default:
804 BUG();
805 }
807 if ( unlikely(d->mm.shadow_mode) &&
808 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
809 {
810 unshadow_table(page_to_pfn(page), type);
811 put_shadow_status(&d->mm);
812 }
813 }
816 void put_page_type(struct pfn_info *page)
817 {
818 u32 nx, x, y = page->u.inuse.type_info;
820 again:
821 do {
822 x = y;
823 nx = x - 1;
825 ASSERT((x & PGT_count_mask) != 0);
827 /*
828 * The page should always be validated while a reference is held. The
829 * exception is during domain destruction, when we forcibly invalidate
830 * page-table pages if we detect a referential loop.
831 * See domain.c:relinquish_list().
832 */
833 ASSERT((x & PGT_validated) ||
834 test_bit(DF_DYING, &page->u.inuse.domain->flags));
836 if ( unlikely((nx & PGT_count_mask) == 0) )
837 {
838 /* Record TLB information for flush later. Races are harmless. */
839 page->tlbflush_timestamp = tlbflush_current_time();
841 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
842 likely(nx & PGT_validated) )
843 {
844 /*
845 * Page-table pages must be unvalidated when count is zero. The
846 * 'free' is safe because the refcnt is non-zero and validated
847 * bit is clear => other ops will spin or fail.
848 */
849 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
850 x & ~PGT_validated)) != x) )
851 goto again;
852 /* We cleared the 'valid bit' so we do the clear up. */
853 free_page_type(page, x & PGT_type_mask);
854 /* Carry on, but with the 'valid bit' now clear. */
855 x &= ~PGT_validated;
856 nx &= ~PGT_validated;
857 }
858 }
859 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
860 (PGT_pinned | 1)) )
861 {
862 /* Page is now only pinned. Make the back pointer mutable again. */
863 nx |= PGT_va_mutable;
864 }
865 }
866 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
867 }
870 int get_page_type(struct pfn_info *page, u32 type)
871 {
872 u32 nx, x, y = page->u.inuse.type_info;
874 again:
875 do {
876 x = y;
877 nx = x + 1;
878 if ( unlikely((nx & PGT_count_mask) == 0) )
879 {
880 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
881 return 0;
882 }
883 else if ( unlikely((x & PGT_count_mask) == 0) )
884 {
885 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
886 {
887 /*
888 * On a type change we check whether stale TLB entries need to be
889 * flushed. This may be unnecessary (e.g., the page was a GDT/LDT),
890 * but such circumstances should be very rare.
891 */
892 struct domain *d = page->u.inuse.domain;
893 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
894 page->tlbflush_timestamp)) )
895 {
896 perfc_incr(need_flush_tlb_flush);
897 flush_tlb_cpu(d->processor);
898 }
900 /* We lose existing type, back pointer, and validity. */
901 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
902 nx |= type;
904 /* No special validation needed for writable pages. */
905 /* Page tables and GDT/LDT need to be scanned for validity. */
906 if ( type == PGT_writable_page )
907 nx |= PGT_validated;
908 }
909 }
910 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
911 {
912 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
913 {
914 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
915 ((type & PGT_type_mask) != PGT_l1_page_table) )
916 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
917 x & PGT_type_mask, type, page_to_pfn(page));
918 return 0;
919 }
920 else if ( (x & PGT_va_mask) == PGT_va_mutable )
921 {
922 /* The va backpointer is mutable, hence we update it. */
923 nx &= ~PGT_va_mask;
924 nx |= type; /* we know the actual type is correct */
925 }
926 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
927 {
928 /* This table is potentially mapped at multiple locations. */
929 nx &= ~PGT_va_mask;
930 nx |= PGT_va_unknown;
931 }
932 }
933 else if ( unlikely(!(x & PGT_validated)) )
934 {
935 /* Someone else is updating validation of this page. Wait... */
936 while ( (y = page->u.inuse.type_info) == x )
937 cpu_relax();
938 goto again;
939 }
940 }
941 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
943 if ( unlikely(!(nx & PGT_validated)) )
944 {
945 /* Try to validate page type; drop the new reference on failure. */
946 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
947 {
948 MEM_LOG("Error while validating pfn %08lx for type %08x."
949 " caf=%08x taf=%08x\n",
950 page_to_pfn(page), type,
951 page->count_info,
952 page->u.inuse.type_info);
953 /* No one else can get a reference. We hold the only ref. */
954 page->u.inuse.type_info = 0;
955 return 0;
956 }
958 /* No one else is updating simultaneously. */
959 __set_bit(_PGT_validated, &page->u.inuse.type_info);
960 }
962 return 1;
963 }
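/*
 * Illustrative sketch, not part of the original file: the get/put discipline
 * the two functions above implement, in terms of the tot_count/type_count
 * model from the header comment. The helper name and flow are hypothetical;
 * do_mmu_update() below follows this pattern for L1/L2 updates.
 */
#if 0
static int example_use_as_l1(struct pfn_info *page, struct domain *d)
{
    if ( !get_page(page, d) )          /* tot_count++ (fails if not owned by d) */
        return 0;
    if ( !get_page_type(page, PGT_l1_page_table | PGT_va_mutable) )
    {                                  /* type_count++; validates on 0 -> 1     */
        put_page(page);
        return 0;
    }
    /* ... the frame may now be treated as a validated L1 page table ... */
    put_page_type(page);               /* type_count--; frees type on 1 -> 0    */
    put_page(page);                    /* tot_count--                           */
    return 1;
}
#endif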
966 static int do_extended_command(unsigned long ptr, unsigned long val)
967 {
968 int okay = 1, cpu = smp_processor_id();
969 unsigned int cmd = val & MMUEXT_CMD_MASK;
970 unsigned long pfn = ptr >> PAGE_SHIFT;
971 unsigned long old_base_pfn;
972 struct pfn_info *page = &frame_table[pfn];
973 struct domain *d = current, *nd, *e;
974 u32 x, y;
975 domid_t domid;
976 grant_ref_t gntref;
978 switch ( cmd )
979 {
980 case MMUEXT_PIN_L1_TABLE:
981 case MMUEXT_PIN_L2_TABLE:
982 /*
983 * We insist that, if you pin an L1 page, it's the first thing that
984 * you do to it. This is because we require the backptr to still be
985 * mutable. This assumption seems safe.
986 */
987 okay = get_page_and_type_from_pagenr(
988 pfn,
989 ((cmd==MMUEXT_PIN_L2_TABLE) ?
990 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
991 FOREIGNDOM);
993 if ( unlikely(!okay) )
994 {
995 MEM_LOG("Error while pinning pfn %08lx", pfn);
996 break;
997 }
999 if ( unlikely(test_and_set_bit(_PGT_pinned,
1000 &page->u.inuse.type_info)) )
1002 MEM_LOG("Pfn %08lx already pinned", pfn);
1003 put_page_and_type(page);
1004 okay = 0;
1005 break;
1008 break;
1010 case MMUEXT_UNPIN_TABLE:
1011 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1013 MEM_LOG("Page %08lx bad domain (dom=%p)",
1014 ptr, page->u.inuse.domain);
1016 else if ( likely(test_and_clear_bit(_PGT_pinned,
1017 &page->u.inuse.type_info)) )
1019 put_page_and_type(page);
1020 put_page(page);
1022 else
1024 okay = 0;
1025 put_page(page);
1026 MEM_LOG("Pfn %08lx not pinned", pfn);
1028 break;
1030 case MMUEXT_NEW_BASEPTR:
1031 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
1032 if ( likely(okay) )
1034 invalidate_shadow_ldt(d);
1036 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1037 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
1038 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
1040 shadow_mk_pagetable(&d->mm);
1042 write_ptbase(&d->mm);
1044 put_page_and_type(&frame_table[old_base_pfn]);
1046 else
1048 MEM_LOG("Error while installing new baseptr %08lx", ptr);
1050 break;
1052 case MMUEXT_TLB_FLUSH:
1053 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1054 break;
1056 case MMUEXT_INVLPG:
1057 __flush_tlb_one(ptr);
1058 break;
1060 case MMUEXT_FLUSH_CACHE:
1061 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1063 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1064 okay = 0;
1066 else
1068 wbinvd();
1070 break;
1072 case MMUEXT_SET_LDT:
1074 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1075 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1076 (ents > 8192) ||
1077 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1078 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1080 okay = 0;
1081 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1083 else if ( (d->mm.ldt_ents != ents) ||
1084 (d->mm.ldt_base != ptr) )
1086 invalidate_shadow_ldt(d);
1087 d->mm.ldt_base = ptr;
1088 d->mm.ldt_ents = ents;
1089 load_LDT(d);
1090 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1091 if ( ents != 0 )
1092 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1094 break;
1097 case MMUEXT_SET_FOREIGNDOM:
1098 domid = (domid_t)(val >> 16);
1100 if ( (e = percpu_info[cpu].foreign) != NULL )
1101 put_domain(e);
1102 percpu_info[cpu].foreign = NULL;
1104 if ( !IS_PRIV(d) )
1106 switch ( domid )
1108 case DOMID_IO:
1109 get_knownalive_domain(dom_io);
1110 percpu_info[cpu].foreign = dom_io;
1111 break;
1112 default:
1113 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1114 okay = 0;
1115 break;
1118 else
1120 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1121 if ( e == NULL )
1123 switch ( domid )
1125 case DOMID_XEN:
1126 get_knownalive_domain(dom_xen);
1127 percpu_info[cpu].foreign = dom_xen;
1128 break;
1129 case DOMID_IO:
1130 get_knownalive_domain(dom_io);
1131 percpu_info[cpu].foreign = dom_io;
1132 break;
1133 default:
1134 MEM_LOG("Unknown domain '%u'", domid);
1135 okay = 0;
1136 break;
1140 break;
1142 case MMUEXT_TRANSFER_PAGE:
1143 domid = (domid_t)(val >> 16);
1144 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1146 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1147 unlikely(!pfn_valid(pfn)) ||
1148 unlikely((e = find_domain_by_id(domid)) == NULL) )
1150 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1151 okay = 0;
1152 break;
1155 spin_lock(&d->page_alloc_lock);
1157 /*
1158 * The tricky bit: atomically release ownership while there is just one
1159 * benign reference to the page (PGC_allocated). If that reference
1160 * disappears then the deallocation routine will safely spin.
1161 */
1162 nd = page->u.inuse.domain;
1163 y = page->count_info;
1164 do {
1165 x = y;
1166 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1167 (1|PGC_allocated)) ||
1168 unlikely(nd != d) )
1170 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1171 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1172 d, d->id, nd, x, page->u.inuse.type_info);
1173 spin_unlock(&d->page_alloc_lock);
1174 put_domain(e);
1175 return 0;
1177 __asm__ __volatile__(
1178 LOCK_PREFIX "cmpxchg8b %2"
1179 : "=d" (nd), "=a" (y),
1180 "=m" (*(volatile u64 *)(&page->count_info))
1181 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1183 while ( unlikely(nd != d) || unlikely(y != x) );
1185 /*
1186 * Unlink from 'd'. At least one reference remains (now anonymous), so
1187 * no one else is spinning to try to delete this page from 'd'.
1188 */
1189 d->tot_pages--;
1190 list_del(&page->list);
1192 spin_unlock(&d->page_alloc_lock);
1194 spin_lock(&e->page_alloc_lock);
1196 /*
1197 * Check that 'e' will accept the page and has reservation headroom.
1198 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1199 */
1200 ASSERT(e->tot_pages <= e->max_pages);
1201 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1202 unlikely(e->tot_pages == e->max_pages) ||
1203 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1205 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1206 "provided a bad grant ref, or is dying (%08lx).\n",
1207 e->tot_pages, e->max_pages, e->flags);
1208 spin_unlock(&e->page_alloc_lock);
1209 put_domain(e);
1210 okay = 0;
1211 break;
1214 /* Okay, add the page to 'e'. */
1215 if ( unlikely(e->tot_pages++ == 0) )
1216 get_knownalive_domain(e);
1217 list_add_tail(&page->list, &e->page_list);
1218 page->u.inuse.domain = e;
1220 spin_unlock(&e->page_alloc_lock);
1222 /* Transfer is all done: tell the guest about its new page frame. */
1223 gnttab_notify_transfer(e, gntref, pfn);
1225 put_domain(e);
1226 break;
1228 case MMUEXT_REASSIGN_PAGE:
1229 if ( unlikely(!IS_PRIV(d)) )
1231 MEM_LOG("Dom %u has no reassignment priv", d->id);
1232 okay = 0;
1233 break;
1236 e = percpu_info[cpu].foreign;
1237 if ( unlikely(e == NULL) )
1239 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1240 okay = 0;
1241 break;
1244 /*
1245 * Grab both page_list locks, in order. This prevents the page from
1246 * disappearing elsewhere while we modify the owner, and we'll need
1247 * both locks if we're successful so that we can change lists.
1248 */
1249 if ( d < e )
1251 spin_lock(&d->page_alloc_lock);
1252 spin_lock(&e->page_alloc_lock);
1254 else
1256 spin_lock(&e->page_alloc_lock);
1257 spin_lock(&d->page_alloc_lock);
1260 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1261 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1262 unlikely(IS_XEN_HEAP_FRAME(page)) )
1264 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1265 okay = 0;
1266 goto reassign_fail;
1269 /*
1270 * The tricky bit: atomically change owner while there is just one
1271 * benign reference to the page (PGC_allocated). If that reference
1272 * disappears then the deallocation routine will safely spin.
1273 */
1274 nd = page->u.inuse.domain;
1275 y = page->count_info;
1276 do {
1277 x = y;
1278 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1279 (1|PGC_allocated)) ||
1280 unlikely(nd != d) )
1282 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1283 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1284 d, d->id, nd, x, page->u.inuse.type_info);
1285 okay = 0;
1286 goto reassign_fail;
1288 __asm__ __volatile__(
1289 LOCK_PREFIX "cmpxchg8b %3"
1290 : "=d" (nd), "=a" (y), "=c" (e),
1291 "=m" (*(volatile u64 *)(&page->count_info))
1292 : "0" (d), "1" (x), "c" (e), "b" (x) );
1294 while ( unlikely(nd != d) || unlikely(y != x) );
1296 /*
1297 * Unlink from 'd'. We transferred at least one reference to 'e', so
1298 * no one else is spinning to try to delete this page from 'd'.
1299 */
1300 d->tot_pages--;
1301 list_del(&page->list);
1303 /*
1304 * Add the page to 'e'. Someone may already have removed the last
1305 * reference and want to remove the page from 'e'. However, we have
1306 * the lock so they'll spin waiting for us.
1307 */
1308 if ( unlikely(e->tot_pages++ == 0) )
1309 get_knownalive_domain(e);
1310 list_add_tail(&page->list, &e->page_list);
1312 reassign_fail:
1313 spin_unlock(&d->page_alloc_lock);
1314 spin_unlock(&e->page_alloc_lock);
1315 break;
1317 case MMUEXT_CLEAR_FOREIGNDOM:
1318 if ( (e = percpu_info[cpu].foreign) != NULL )
1319 put_domain(e);
1320 percpu_info[cpu].foreign = NULL;
1321 break;
1323 default:
1324 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1325 okay = 0;
1326 break;
1329 return okay;
1332 int do_mmu_update(
1333 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1335 /*
1336 * We steal the m.s.b. of the @count parameter to indicate whether this
1337 * invocation of do_mmu_update() is resuming a previously preempted call.
1338 * We steal the next 15 bits to remember the current FOREIGNDOM.
1339 */
1340 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1341 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1342 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
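/*
 * Illustrative sketch, not part of the original file: how 'count' is packed
 * for the continuation created further down, and unpacked on resumption.
 * With a 32-bit int, the m.s.b. flags preemption, bits 30-16 hold the
 * FOREIGNDOM id and bits 15-0 the number of requests still to process.
 * The variable names are made up for the example.
 */
#if 0
    unsigned int remaining = 100, fdom = 7;
    unsigned int packed = remaining
                        | (fdom << MMU_UPDATE_PREEMPT_FDOM_SHIFT) /* bits 30-16 */
                        | MMU_UPDATE_PREEMPTED;                   /* bit 31     */
    /* Resumption (the code below): clear MMU_UPDATE_PREEMPTED, read the id
     * as packed >> MMU_UPDATE_PREEMPT_FDOM_SHIFT (== 7), then clear
     * MMU_UPDATE_PREEMPT_FDOM_MASK to recover the 100 outstanding requests. */
#endif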
1344 mmu_update_t req;
1345 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1346 struct pfn_info *page;
1347 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1348 unsigned int cmd, done = 0;
1349 unsigned long prev_spfn = 0;
1350 l1_pgentry_t *prev_spl1e = 0;
1351 struct domain *d = current;
1352 u32 type_info;
1353 domid_t domid;
1355 cleanup_writable_pagetable(d);
1357 /*
1358 * If we are resuming after preemption, read how much work we have already
1359 * done. This allows us to set the @done output parameter correctly.
1360 * We also reset FOREIGNDOM here.
1361 */
1362 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1364 if ( !(count & MMU_UPDATE_PREEMPTED) )
1366 /* Count overflow into private FOREIGNDOM field. */
1367 MEM_LOG("do_mmu_update count is too large");
1368 rc = -EINVAL;
1369 goto out;
1371 count &= ~MMU_UPDATE_PREEMPTED;
1372 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1373 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1374 if ( unlikely(pdone != NULL) )
1375 (void)get_user(done, pdone);
1376 if ( (domid != current->id) &&
1377 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1379 rc = -EINVAL;
1380 goto out;
1384 perfc_incrc(calls_to_mmu_update);
1385 perfc_addc(num_page_updates, count);
1387 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1389 rc = -EFAULT;
1390 goto out;
1393 for ( i = 0; i < count; i++ )
1395 if ( hypercall_preempt_check() )
1397 rc = hypercall_create_continuation(
1398 __HYPERVISOR_mmu_update, 3, ureqs,
1399 (count - i) |
1400 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1401 MMU_UPDATE_PREEMPTED, pdone);
1402 break;
1405 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1407 MEM_LOG("Bad __copy_from_user");
1408 rc = -EFAULT;
1409 break;
1412 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1413 pfn = req.ptr >> PAGE_SHIFT;
1415 okay = 0;
1417 switch ( cmd )
1419 /*
1420 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1421 */
1422 case MMU_NORMAL_PT_UPDATE:
1423 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1425 MEM_LOG("Could not get page for normal update");
1426 break;
1429 if ( likely(prev_pfn == pfn) )
1431 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1433 else
1435 if ( prev_pfn != 0 )
1436 unmap_domain_mem((void *)va);
1437 va = (unsigned long)map_domain_mem(req.ptr);
1438 prev_pfn = pfn;
1441 page = &frame_table[pfn];
1442 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1444 case PGT_l1_page_table:
1445 if ( likely(get_page_type(
1446 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1448 okay = mod_l1_entry((l1_pgentry_t *)va,
1449 mk_l1_pgentry(req.val));
1451 if ( unlikely(d->mm.shadow_mode) && okay &&
1452 (get_shadow_status(&d->mm, page-frame_table) &
1453 PSH_shadowed) )
1455 shadow_l1_normal_pt_update(
1456 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1457 put_shadow_status(&d->mm);
1460 put_page_type(page);
1462 break;
1463 case PGT_l2_page_table:
1464 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1466 okay = mod_l2_entry((l2_pgentry_t *)va,
1467 mk_l2_pgentry(req.val),
1468 pfn);
1470 if ( unlikely(d->mm.shadow_mode) && okay &&
1471 (get_shadow_status(&d->mm, page-frame_table) &
1472 PSH_shadowed) )
1474 shadow_l2_normal_pt_update(req.ptr, req.val);
1475 put_shadow_status(&d->mm);
1478 put_page_type(page);
1480 break;
1481 default:
1482 if ( likely(get_page_type(page, PGT_writable_page)) )
1484 *(unsigned long *)va = req.val;
1485 okay = 1;
1486 put_page_type(page);
1488 break;
1491 put_page(page);
1492 break;
1494 case MMU_MACHPHYS_UPDATE:
1495 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1497 MEM_LOG("Could not get page for mach->phys update");
1498 break;
1501 machine_to_phys_mapping[pfn] = req.val;
1502 okay = 1;
1504 /*
1505 * If in log-dirty mode, mark the corresponding pseudo-physical
1506 * page as dirty.
1507 */
1508 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) &&
1509 mark_dirty(&d->mm, pfn) )
1510 d->mm.shadow_dirty_block_count++;
1512 put_page(&frame_table[pfn]);
1513 break;
1515 /*
1516 * MMU_EXTENDED_COMMAND: Extended command is specified
1517 * in the least-significant bits of the 'value' field.
1518 */
1519 case MMU_EXTENDED_COMMAND:
1520 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1521 okay = do_extended_command(req.ptr, req.val);
1522 break;
1524 default:
1525 MEM_LOG("Invalid page update command %08lx", req.ptr);
1526 break;
1529 if ( unlikely(!okay) )
1531 rc = -EINVAL;
1532 break;
1535 ureqs++;
1538 out:
1539 if ( prev_pfn != 0 )
1540 unmap_domain_mem((void *)va);
1542 if ( unlikely(prev_spl1e != 0) )
1543 unmap_domain_mem((void *)prev_spl1e);
1545 deferred_ops = percpu_info[cpu].deferred_ops;
1546 percpu_info[cpu].deferred_ops = 0;
1548 if ( deferred_ops & DOP_FLUSH_TLB )
1549 local_flush_tlb();
1551 if ( deferred_ops & DOP_RELOAD_LDT )
1552 (void)map_ldt_shadow_page(0);
1554 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1556 put_domain(percpu_info[cpu].foreign);
1557 percpu_info[cpu].foreign = NULL;
1560 /* Add incremental work we have done to the @done output parameter. */
1561 if ( unlikely(pdone != NULL) )
1562 __put_user(done + i, pdone);
1564 return rc;
1568 int do_update_va_mapping(unsigned long page_nr,
1569 unsigned long val,
1570 unsigned long flags)
1572 struct domain *d = current;
1573 int err = 0;
1574 unsigned int cpu = d->processor;
1575 unsigned long deferred_ops;
1577 perfc_incrc(calls_to_update_va);
1579 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1580 return -EINVAL;
1582 cleanup_writable_pagetable(d);
1584 /*
1585 * XXX When we make this support 4MB superpages we should also deal with
1586 * the case of updating L2 entries.
1587 */
1589 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1590 mk_l1_pgentry(val))) )
1591 err = -EINVAL;
1593 if ( unlikely(d->mm.shadow_mode) )
1595 unsigned long sval;
1597 l1pte_propagate_from_guest(&d->mm, &val, &sval);
1599 if ( unlikely(__put_user(sval, ((unsigned long *)(
1600 &shadow_linear_pg_table[page_nr])))) )
1602 /*
1603 * Since L2s are guaranteed RW, failure indicates either that the
1604 * page was not shadowed, or that the L2 entry has not yet been
1605 * updated to reflect the shadow.
1606 */
1607 unsigned l2_idx = page_nr >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT);
1608 l2_pgentry_t gpde = linear_l2_table[l2_idx];
1609 unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
1610 unsigned long spfn;
1612 if ((spfn = (get_shadow_status(&d->mm, gpfn) & PSH_pfn_mask)))
1614 unsigned long *sl1e = map_domain_mem(spfn << PAGE_SHIFT);
1615 unsigned l1_idx = page_nr & (ENTRIES_PER_L1_PAGETABLE - 1);
1616 sl1e[l1_idx] = sval;
1617 unmap_domain_mem(sl1e);
1618 put_shadow_status(&d->mm);
1620 perfc_incrc(shadow_update_va_fail1);
1622 else
1623 perfc_incrc(shadow_update_va_fail2);
1626 /*
1627 * If we're in log-dirty mode then we need to note that we've updated
1628 * the PTE in the PT-holding page. We need the machine frame number
1629 * for this.
1630 */
1631 if ( d->mm.shadow_mode == SHM_logdirty )
1632 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1634 check_pagetable(&d->mm, d->mm.pagetable, "va"); /* debug */
1637 deferred_ops = percpu_info[cpu].deferred_ops;
1638 percpu_info[cpu].deferred_ops = 0;
1640 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1641 unlikely(flags & UVMF_FLUSH_TLB) )
1642 local_flush_tlb();
1643 else if ( unlikely(flags & UVMF_INVLPG) )
1644 __flush_tlb_one(page_nr << PAGE_SHIFT);
1646 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1647 (void)map_ldt_shadow_page(0);
1649 return err;
1652 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1653 unsigned long val,
1654 unsigned long flags,
1655 domid_t domid)
1657 unsigned int cpu = smp_processor_id();
1658 struct domain *d;
1659 int rc;
1661 if ( unlikely(!IS_PRIV(current)) )
1662 return -EPERM;
1664 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1665 if ( unlikely(d == NULL) )
1667 MEM_LOG("Unknown domain '%u'", domid);
1668 return -ESRCH;
1671 rc = do_update_va_mapping(page_nr, val, flags);
1673 put_domain(d);
1674 percpu_info[cpu].foreign = NULL;
1676 return rc;
1681 /*************************
1682 * Writable Pagetables
1683 */
1685 ptwr_info_t ptwr_info[NR_CPUS];
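/*
 * Illustrative sketch, not part of the original file: the per-CPU state the
 * writable-pagetable code below relies on, reconstructed from its uses
 * (ptinfo[].l1va, .pl1e, .page and .l2_idx, indexed by PTWR_PT_ACTIVE /
 * PTWR_PT_INACTIVE). Field types and ordering are assumptions; the
 * authoritative definition lives in a header that is not part of this file.
 */
#if 0
typedef struct {
    struct {
        unsigned long  l1va;   /* VA of the unhooked, writable L1 page (0 = none). */
        l1_pgentry_t  *pl1e;   /* Xen mapping of that L1 frame.                    */
        l1_pgentry_t  *page;   /* Snapshot taken at fault time, diffed on flush.   */
        unsigned int   l2_idx; /* L2 slot the L1 is normally hooked into.          */
    } ptinfo[2];               /* [PTWR_PT_ACTIVE], [PTWR_PT_INACTIVE]             */
} ptwr_info_t;
#endif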
1687 #ifdef VERBOSE
1688 int ptwr_debug = 0x0;
1689 #define PTWR_PRINTK(_f, _a...) \
1690 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1691 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1692 #else
1693 #define PTWR_PRINTK(_f, _a...) ((void)0)
1694 #endif
1696 /* Flush the given writable p.t. page and write-protect it again. */
1697 void ptwr_flush(const int which)
1699 unsigned long sstat, spte, pte, *ptep, l1va;
1700 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1701 l2_pgentry_t *pl2e;
1702 int i, cpu = smp_processor_id();
1703 struct domain *d = current;
1705 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1706 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1708 /*
1709 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1710 */
1712 if ( unlikely(__get_user(pte, ptep)) )
1714 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1715 /*
1716 * Really a bug: we could read this PTE during the initial fault,
1717 * and the page tables cannot have changed in the meantime.
1718 */
1719 BUG();
1721 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1722 PTWR_PRINT_WHICH, ptep, pte);
1723 pte &= ~_PAGE_RW;
1725 if ( unlikely(d->mm.shadow_mode) )
1727 /* Write-protect the p.t. page in the shadow page table. */
1728 l1pte_propagate_from_guest(&d->mm, &pte, &spte);
1729 __put_user(
1730 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1732 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1733 sstat = get_shadow_status(&d->mm, pte >> PAGE_SHIFT);
1734 if ( sstat & PSH_shadowed )
1735 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1738 /* Write-protect the p.t. page in the guest page table. */
1739 if ( unlikely(__put_user(pte, ptep)) )
1741 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1742 /*
1743 * Really a bug: we could write this PTE during the initial fault,
1744 * and the page tables cannot have changed in the meantime.
1745 */
1746 BUG();
1749 /* Ensure that there are no stale writable mappings in any TLB. */
1750 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1751 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1752 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1753 PTWR_PRINT_WHICH, ptep, pte);
1755 /*
1756 * STEP 2. Validate any modified PTEs.
1757 */
1759 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1760 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1762 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1763 nl1e = pl1e[i];
1765 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1766 continue;
1768 /*
1769 * Fast path for PTEs that have merely been write-protected
1770 * (e.g., during a Unix fork()). A strict reduction in privilege.
1771 */
1772 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1774 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1776 if ( unlikely(sl1e != NULL) )
1777 l1pte_propagate_from_guest(
1778 &d->mm, &l1_pgentry_val(nl1e),
1779 &l1_pgentry_val(sl1e[i]));
1780 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1782 continue;
1785 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1787 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1788 /*
1789 * Make the remaining p.t's consistent before crashing, so the
1790 * reference counts are correct.
1791 */
1792 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1793 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1794 unmap_domain_mem(pl1e);
1795 ptwr_info[cpu].ptinfo[which].l1va = 0;
1796 if ( (which == PTWR_PT_ACTIVE) && likely(!d->mm.shadow_mode) )
1798 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1799 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1801 domain_crash();
1802 return;
1805 if ( unlikely(sl1e != NULL) )
1806 l1pte_propagate_from_guest(
1807 &d->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1809 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1810 put_page_from_l1e(ol1e, d);
1812 unmap_domain_mem(pl1e);
1814 /*
1815 * STEP 3. Reattach the L1 p.t. page into the current address space.
1816 */
1818 if ( (which == PTWR_PT_ACTIVE) && likely(!d->mm.shadow_mode) )
1820 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1821 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1824 /*
1825 * STEP 4. Final tidy-up.
1826 */
1828 ptwr_info[cpu].ptinfo[which].l1va = 0;
1830 if ( unlikely(sl1e != NULL) )
1832 unmap_domain_mem(sl1e);
1833 put_shadow_status(&d->mm);
1837 /* Write page fault handler: check if guest is trying to modify a PTE. */
1838 int ptwr_do_page_fault(unsigned long addr)
1840 unsigned long pte, pfn, l2e;
1841 struct pfn_info *page;
1842 l2_pgentry_t *pl2e;
1843 int which, cpu = smp_processor_id();
1844 u32 l2_idx;
1846 /*
1847 * Attempt to read the PTE that maps the VA being accessed. By checking for
1848 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1849 */
1850 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1851 _PAGE_PRESENT) ||
1852 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1853 return 0;
1855 pfn = pte >> PAGE_SHIFT;
1856 page = &frame_table[pfn];
1858 /* We are looking only for read-only mappings of p.t. pages. */
1859 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1860 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1861 return 0;
1863 /* Get the L2 index at which this L1 p.t. is always mapped. */
1864 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1865 if ( unlikely(l2_idx >= PGT_va_unknown) )
1867 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1868 return 0;
1870 l2_idx >>= PGT_va_shift;
1872 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1874 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1875 domain_crash();
1876 return 0;
1879 /*
1880 * Is the L1 p.t. mapped into the current address space? If so we call it
1881 * an ACTIVE p.t., otherwise it is INACTIVE.
1882 */
1883 pl2e = &linear_l2_table[l2_idx];
1884 l2e = l2_pgentry_val(*pl2e);
1885 which = PTWR_PT_INACTIVE;
1886 if ( (l2e >> PAGE_SHIFT) == pfn )
1888 /* Check the PRESENT bit to set ACTIVE. */
1889 if ( likely(l2e & _PAGE_PRESENT) )
1890 which = PTWR_PT_ACTIVE;
1891 else {
1892 /*
1893 * If the PRESENT bit is clear, we may be conflicting with
1894 * the current ACTIVE p.t. (it may be the same p.t. mapped
1895 * at another virt addr).
1896 * The ptwr_flush call below will restore the PRESENT bit.
1897 */
1898 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1899 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1900 which = PTWR_PT_ACTIVE;
1904 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1905 "pfn %08lx\n", PTWR_PRINT_WHICH,
1906 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1908 /*
1909 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1910 * time. If there is already one, we must flush it out.
1911 */
1912 if ( ptwr_info[cpu].ptinfo[which].l1va )
1913 ptwr_flush(which);
1915 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1916 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1918 /* For safety, disconnect the L1 p.t. page from current space. */
1919 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1921 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1922 flush_tlb(); /* XXX Multi-CPU guests? */
1925 /* Temporarily map the L1 page, and make a copy of it. */
1926 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1927 memcpy(ptwr_info[cpu].ptinfo[which].page,
1928 ptwr_info[cpu].ptinfo[which].pl1e,
1929 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1931 /* Finally, make the p.t. page writable by the guest OS. */
1932 pte |= _PAGE_RW;
1933 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1934 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1935 if ( unlikely(__put_user(pte, (unsigned long *)
1936 &linear_pg_table[addr>>PAGE_SHIFT])) )
1938 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1939 &linear_pg_table[addr>>PAGE_SHIFT]);
1940 /* Toss the writable pagetable state and crash. */
1941 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1942 ptwr_info[cpu].ptinfo[which].l1va = 0;
1943 domain_crash();
1944 return 0;
1947 return EXCRET_fault_fixed;
1950 static __init int ptwr_init(void)
1952 int i;
1954 for ( i = 0; i < smp_num_cpus; i++ )
1956 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1957 (void *)alloc_xenheap_page();
1958 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1959 (void *)alloc_xenheap_page();
1962 return 0;
1964 __initcall(ptwr_init);
1969 /************************************************************************/
1970 /************************************************************************/
1971 /************************************************************************/
1973 #ifndef NDEBUG
1975 void audit_domain(struct domain *d)
1977 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1979 void adjust (struct pfn_info *page, int dir, int adjtype)
1981 int count = page->count_info & PGC_count_mask;
1983 if ( adjtype )
1985 int tcount = page->u.inuse.type_info & PGT_count_mask;
1987 ttot++;
1989 tcount += dir;
1991 if ( tcount < 0 )
1993 /* This will only come out once. */
1994 printk("Audit %d: type count went below zero pfn=%x "
1995 "taf=%x otaf=%x\n",
1996 d->id, page-frame_table,
1997 page->u.inuse.type_info,
1998 page->tlbflush_timestamp);
2001 page->u.inuse.type_info =
2002 (page->u.inuse.type_info & ~PGT_count_mask) |
2003 (tcount & PGT_count_mask);
2006 ctot++;
2007 count += dir;
2008 if ( count < 0 )
2010 /* This will only come out once. */
2011 printk("Audit %d: general count went below zero pfn=%x "
2012 "taf=%x otaf=%x\n",
2013 d->id, page-frame_table,
2014 page->u.inuse.type_info,
2015 page->tlbflush_timestamp);
2018 page->count_info =
2019 (page->count_info & ~PGC_count_mask) |
2020 (count & PGC_count_mask);
2024 void scan_for_pfn(struct domain *d, unsigned long xpfn)
2026 unsigned long pfn, *pt;
2027 struct list_head *list_ent;
2028 struct pfn_info *page;
2029 int i;
2031 list_ent = d->page_list.next;
2032 for ( i = 0; (list_ent != &d->page_list); i++ )
2034 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2035 page = &frame_table[pfn];
2037 switch ( page->u.inuse.type_info & PGT_type_mask )
2039 case PGT_l1_page_table:
2040 case PGT_l2_page_table:
2041 pt = map_domain_mem(pfn<<PAGE_SHIFT);
2042 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2043 if ( (pt[i] & _PAGE_PRESENT) &&
2044 ((pt[i] >> PAGE_SHIFT) == xpfn) )
2045 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
2046 d->id, i, pfn, page->u.inuse.type_info,
2047 page->count_info);
2048 unmap_domain_mem(pt);
2051 list_ent = frame_table[pfn].list.next;
2056 void scan_for_pfn_remote(unsigned long xpfn)
2058 struct domain *e;
2059 for_each_domain ( e )
2060 scan_for_pfn( e, xpfn );
2063 int i, l1, l2;
2064 unsigned long pfn;
2065 struct list_head *list_ent;
2066 struct pfn_info *page;
2068 if ( d != current )
2069 domain_pause(d);
2070 synchronise_pagetables(~0UL);
2072 printk("pt base=%lx sh_info=%x\n",
2073 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
2074 virt_to_page(d->shared_info)-frame_table);
2076 spin_lock(&d->page_alloc_lock);
2078 /* PHASE 0 */
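/*
 * Phase 0: sanity-check every page on the domain's list -- it must belong
 * to this domain, its type count must not exceed its general count, and a
 * PGT_none page should not carry a type count above one.  The original
 * type_info is stashed in tlbflush_timestamp so later phases can report it.
 */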
2080 list_ent = d->page_list.next;
2081 for ( i = 0; (list_ent != &d->page_list); i++ )
2083 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2084 page = &frame_table[pfn];
2086 if ( page->u.inuse.domain != d )
2087 BUG();
2089 if ( (page->u.inuse.type_info & PGT_count_mask) >
2090 (page->count_info & PGC_count_mask) )
2091 printk("taf > caf %x %x pfn=%lx\n",
2092 page->u.inuse.type_info, page->count_info, pfn );
2094 #if 0 /* SYSV shared memory pages plus writeable files. */
2095 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2096 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2098 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2099 pfn,
2100 page->u.inuse.type_info,
2101 page->count_info );
2102 scan_for_pfn_remote(pfn);
2104 #endif
2105 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2106 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2108 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2109 pfn,
2110 page->u.inuse.type_info,
2111 page->count_info );
2114 /* Use tlbflush_timestamp to store original type_info. */
2115 page->tlbflush_timestamp = page->u.inuse.type_info;
2117 list_ent = frame_table[pfn].list.next;
2121 /* PHASE 1 */
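/*
 * Phase 1: subtract the references contributed by the domain's page tables,
 * starting with the base page table itself, then the pinned L2/L1 pages and
 * the present entries inside them.  Low-memory, I/O and foreign mappings
 * are tallied or skipped rather than adjusted.
 */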
2122 if( pagetable_val(d->mm.pagetable) )
2123 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1);
2125 list_ent = d->page_list.next;
2126 for ( i = 0; (list_ent != &d->page_list); i++ )
2128 unsigned long *pt;
2129 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2130 page = &frame_table[pfn];
2132 if ( page->u.inuse.domain != d )
2133 BUG();
2135 switch ( page->u.inuse.type_info & PGT_type_mask )
2137 case PGT_l2_page_table:
2139 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2140 printk("Audit %d: L2 not validated %x\n",
2141 d->id, page->u.inuse.type_info);
2143 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2144 printk("Audit %d: L2 not pinned %x\n",
2145 d->id, page->u.inuse.type_info);
2146 else
2147 adjust( page, -1, 1 );
2149 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2151 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2153 if ( pt[i] & _PAGE_PRESENT )
2155 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2156 struct pfn_info *l1page = &frame_table[l1pfn];
2158 if ( l1page->u.inuse.domain != d )
2160 printk("L2: Skip bizarre page belonging to other "
2161 "dom %p\n", l1page->u.inuse.domain);
2162 continue;
2165 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2166 PGT_l2_page_table )
2167 printk("Audit %d: [%x] Found %s Linear PT "
2168 "t=%x pfn=%lx\n", d->id, i,
2169 (l1pfn==pfn) ? "Self" : "Other",
2170 l1page->u.inuse.type_info,
2171 l1pfn);
2172 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2173 PGT_l1_page_table )
2174 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2175 d->id, i,
2176 l1page->u.inuse.type_info,
2177 l1pfn);
2179 adjust(l1page, -1, 1);
2183 unmap_domain_mem(pt);
2185 break;
2188 case PGT_l1_page_table:
2190 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2191 adjust( page, -1, 1 );
2193 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2194 printk("Audit %d: L1 not validated %x\n",
2195 d->id, page->u.inuse.type_info);
2196 #if 0
2197 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2198 printk("Audit %d: L1 not pinned %x\n",
2199 d->id, page->u.inuse.type_info);
2200 #endif
2201 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2203 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2205 if ( pt[i] & _PAGE_PRESENT )
2207 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2208 struct pfn_info *l1page = &frame_table[l1pfn];
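/*
 * Mappings of the first 1MB (pfn < 0x100) and of frames beyond max_page
 * (I/O or otherwise non-RAM) are only tallied below; their counts are not
 * audited.
 */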
2210 if ( l1pfn < 0x100 )
2212 lowmem_mappings++;
2213 continue;
2216 if ( l1pfn > max_page )
2218 io_mappings++;
2219 continue;
2222 if ( pt[i] & _PAGE_RW )
2225 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2226 PGT_l1_page_table ||
2227 (l1page->u.inuse.type_info & PGT_type_mask) ==
2228 PGT_l2_page_table )
2229 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2230 d->id, i,
2231 l1page->u.inuse.type_info,
2232 l1pfn);
2236 if ( l1page->u.inuse.domain != d )
2238 printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
2239 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2240 d->id, pfn, i,
2241 (unsigned long)l1page->u.inuse.domain,
2242 l1pfn,
2243 l1page->count_info,
2244 l1page->u.inuse.type_info,
2245 machine_to_phys_mapping[l1pfn]);
2246 continue;
2249 adjust(l1page, -1, 0);
2253 unmap_domain_mem(pt);
2255 break;
2258 list_ent = frame_table[pfn].list.next;
2261 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2262 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2263 d->id, lowmem_mappings, io_mappings);
2265 /* PHASE 2 */
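/*
 * Phase 2: with the page-table references removed, every L1/L2 page should
 * now have a zero type count and every page a general count of exactly one.
 * Anything else is reported, and scan_for_pfn_remote() is used to hunt down
 * the mappings responsible.
 */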
2267 ctot = ttot = 0;
2268 list_ent = d->page_list.next;
2269 for ( i = 0; (list_ent != &d->page_list); i++ )
2271 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2272 page = &frame_table[pfn];
2274 switch ( page->u.inuse.type_info & PGT_type_mask)
2276 case PGT_l1_page_table:
2277 case PGT_l2_page_table:
2278 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2280 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2281 d->id, page->u.inuse.type_info,
2282 page->tlbflush_timestamp,
2283 page->count_info, pfn );
2284 scan_for_pfn_remote(pfn);
2286 default:
2287 if ( (page->count_info & PGC_count_mask) != 1 )
2289 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2290 d->id,
2291 page->count_info,
2292 page->u.inuse.type_info,
2293 page->tlbflush_timestamp, pfn );
2294 scan_for_pfn_remote(pfn);
2296 break;
2299 list_ent = frame_table[pfn].list.next;
2302 /* PHASE 3 */
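/*
 * Phase 3: undo phase 1 -- re-add the references for pinned page tables and
 * the entries within them, and clear the tlbflush_timestamp scratch values,
 * leaving the counts exactly as they were before the audit.
 */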
2303 list_ent = d->page_list.next;
2304 l1 = l2 = 0;
2305 for ( i = 0; (list_ent != &d->page_list); i++ )
2307 unsigned long *pt;
2308 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2309 page = &frame_table[pfn];
2311 switch ( page->u.inuse.type_info & PGT_type_mask )
2313 case PGT_l2_page_table:
2314 l2++;
2315 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2316 adjust( page, 1, 1 );
2318 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2320 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2322 if ( pt[i] & _PAGE_PRESENT )
2324 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2325 struct pfn_info *l1page;
2327 if ( l1pfn > max_page )
2328 continue;
2330 l1page = &frame_table[l1pfn];
2332 if ( l1page->u.inuse.domain == d )
2333 adjust(l1page, 1, 1);
2337 unmap_domain_mem(pt);
2338 break;
2340 case PGT_l1_page_table:
2341 l1++;
2342 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2343 adjust( page, 1, 1 );
2345 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2347 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2349 if ( pt[i] & _PAGE_PRESENT )
2351 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2352 struct pfn_info *l1page;
2354 if ( l1pfn > max_page )
2355 continue;
2357 l1page = &frame_table[l1pfn];
2359 if ( (l1page->u.inuse.domain != d) ||
2360 (l1pfn < 0x100) || (l1pfn > max_page) )
2361 continue;
2363 adjust(l1page, 1, 0);
2367 unmap_domain_mem(pt);
2368 break;
2372 page->tlbflush_timestamp = 0;
2374 list_ent = frame_table[pfn].list.next;
2377 spin_unlock(&d->page_alloc_lock);
2379 if( pagetable_val(d->mm.pagetable) )
2380 adjust(&frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1);
2382 printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot );
2384 if ( d != current )
2385 domain_unpause(d);
2388 void audit_domains(void)
2390 struct domain *d;
2391 for_each_domain ( d )
2392 audit_domain(d);
2395 void audit_domains_key(unsigned char key)
2397 audit_domains();
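/*
 * audit_domains_key() is the hook intended for the debug-key handler; the
 * key registration itself is not part of this file.
 */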
2400 #endif