ia64/xen-unstable

view xen/arch/x86/memory.c @ 3385:b0d439448dfe

bitkeeper revision 1.1159.212.7 (41dda285BvmhILeGv4rnjJ6N_psxaA)

Patch from Leendert van Doorn <leendert@watson.ibm.com> to fix a bug that was affecting the mapping of I/O pages when Xen was built with certain versions of gcc.
author iap10@labyrinth.cl.cam.ac.uk
date Thu Jan 06 20:41:41 2005 +0000 (2005-01-06)
parents d1e0d9a8fde0
children a4621fab44b4
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be put to any of these three uses at all!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, the OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
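/*
 * Illustrative sketch (not part of the original file): how a guest might
 * batch page-table updates for do_mmu_update() below.  Each request is a
 * (ptr, val) pair; the low two bits of 'ptr' select the command, exactly
 * as decoded in do_mmu_update().  'HYPERVISOR_mmu_update' stands for the
 * guest-side hypercall wrapper and is assumed here rather than defined in
 * this file; the variable names are placeholders.
 *
 *     mmu_update_t req[2];
 *
 *     // Normal update: write new_val into the PTE at machine address pte_ma.
 *     req[0].ptr = pte_ma | MMU_NORMAL_PT_UPDATE;
 *     req[0].val = new_val;
 *
 *     // Keep the machine-to-physical table in sync for the mapped frame.
 *     req[1].ptr = (new_val & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
 *     req[1].val = pseudo_phys_frame;
 *
 *     if ( HYPERVISOR_mmu_update(req, 2, NULL) < 0 )
 *         BUG();
 */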
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/kernel.h>
90 #include <xen/lib.h>
91 #include <xen/mm.h>
92 #include <xen/sched.h>
93 #include <xen/errno.h>
94 #include <xen/perfc.h>
95 #include <xen/irq.h>
96 #include <xen/softirq.h>
97 #include <asm/shadow.h>
98 #include <asm/page.h>
99 #include <asm/flushtlb.h>
100 #include <asm/io.h>
101 #include <asm/uaccess.h>
102 #include <asm/domain_page.h>
103 #include <asm/ldt.h>
105 #ifdef VERBOSE
106 #define MEM_LOG(_f, _a...) \
107 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
108 current->domain->id , __LINE__ , ## _a )
109 #else
110 #define MEM_LOG(_f, _a...) ((void)0)
111 #endif
113 static int alloc_l2_table(struct pfn_info *page);
114 static int alloc_l1_table(struct pfn_info *page);
115 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
116 static int get_page_and_type_from_pagenr(unsigned long page_nr,
117 u32 type,
118 struct domain *d);
120 static void free_l2_table(struct pfn_info *page);
121 static void free_l1_table(struct pfn_info *page);
123 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
124 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
126 /* Used to defer flushing of memory structures. */
127 static struct {
128 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
129 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
130 unsigned long deferred_ops;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } __cacheline_aligned percpu_info[NR_CPUS];
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
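/*
 * NB. The '? :' above (with the middle operand omitted) is a GCC extension:
 * 'x ? : y' evaluates to 'x' when 'x' is non-NULL and to 'y' otherwise,
 * without evaluating 'x' twice.
 */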
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 /* Frame table and its size in pages. */
145 struct pfn_info *frame_table;
146 unsigned long frame_table_size;
147 unsigned long max_page;
149 void __init init_frametable(void)
150 {
151 unsigned long i, p;
153 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
154 frame_table_size = max_page * sizeof(struct pfn_info);
155 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
157 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
158 {
159 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
160 if ( p == 0 )
161 panic("Not enough memory for frame table\n");
162 idle_pg_table[(FRAMETABLE_VIRT_START + i) >> L2_PAGETABLE_SHIFT] =
163 mk_l2_pgentry(p | __PAGE_HYPERVISOR | _PAGE_PSE);
164 }
166 memset(frame_table, 0, frame_table_size);
167 }
169 void arch_init_memory(void)
170 {
171 unsigned long mfn, i;
173 /*
174 * We are rather picky about the layout of 'struct pfn_info'. The
175 * count_info and domain fields must be adjacent, as we perform atomic
176 * 64-bit operations on them. Also, just for sanity, we assert the size
177 * of the structure here.
178 */
179 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
180 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
181 (sizeof(struct pfn_info) != 24) )
182 {
183 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
184 offsetof(struct pfn_info, count_info),
185 offsetof(struct pfn_info, u.inuse.domain),
186 sizeof(struct pfn_info));
187 for ( ; ; ) ;
188 }
190 memset(percpu_info, 0, sizeof(percpu_info));
192 /* Initialise to a magic of 0x55555555 so easier to spot bugs later. */
193 memset(machine_to_phys_mapping, 0x55, 4<<20);
195 /*
196 * Initialise our DOMID_XEN domain.
197 * Any Xen-heap pages that we will allow to be mapped will have
198 * their domain field set to dom_xen.
199 */
200 dom_xen = alloc_domain_struct();
201 atomic_set(&dom_xen->refcnt, 1);
202 dom_xen->id = DOMID_XEN;
204 /*
205 * Initialise our DOMID_IO domain.
206 * This domain owns no pages but is considered a special case when
207 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
208 */
209 dom_io = alloc_domain_struct();
210 atomic_set(&dom_io->refcnt, 1);
211 dom_io->id = DOMID_IO;
213 /* M2P table is mappable read-only by privileged domains. */
214 mfn = l2_pgentry_to_pagenr(
215 idle_pg_table[RDWR_MPT_VIRT_START >> L2_PAGETABLE_SHIFT]);
216 for ( i = 0; i < 1024; i++ )
217 {
218 frame_table[mfn+i].count_info = PGC_allocated | 1;
219 frame_table[mfn+i].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
220 frame_table[mfn+i].u.inuse.domain = dom_xen;
221 }
222 }
224 static void __invalidate_shadow_ldt(struct exec_domain *d)
225 {
226 int i;
227 unsigned long pfn;
228 struct pfn_info *page;
230 d->mm.shadow_ldt_mapcnt = 0;
232 for ( i = 16; i < 32; i++ )
233 {
234 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_ptes[i]);
235 if ( pfn == 0 ) continue;
236 d->mm.perdomain_ptes[i] = mk_l1_pgentry(0);
237 page = &frame_table[pfn];
238 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
239 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
240 put_page_and_type(page);
241 }
243 /* Dispose of the (now possibly invalid) mappings from the TLB. */
244 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
245 }
248 static inline void invalidate_shadow_ldt(struct exec_domain *d)
249 {
250 if ( d->mm.shadow_ldt_mapcnt != 0 )
251 __invalidate_shadow_ldt(d);
252 }
255 static int alloc_segdesc_page(struct pfn_info *page)
256 {
257 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
258 int i;
260 for ( i = 0; i < 512; i++ )
261 if ( unlikely(!check_descriptor(&descs[i*2])) )
262 goto fail;
264 unmap_domain_mem(descs);
265 return 1;
267 fail:
268 unmap_domain_mem(descs);
269 return 0;
270 }
273 /* Map shadow page at offset @off. */
274 int map_ldt_shadow_page(unsigned int off)
275 {
276 struct exec_domain *ed = current;
277 struct domain *d = ed->domain;
278 unsigned long l1e;
280 if ( unlikely(in_irq()) )
281 BUG();
283 __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->mm.ldt_base >>
284 PAGE_SHIFT) + off]);
286 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
287 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
288 d, PGT_ldt_page)) )
289 return 0;
291 ed->mm.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
292 ed->mm.shadow_ldt_mapcnt++;
294 return 1;
295 }
298 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
299 {
300 struct pfn_info *page = &frame_table[page_nr];
302 if ( unlikely(!pfn_is_ram(page_nr)) )
303 {
304 MEM_LOG("Pfn %08lx is not RAM", page_nr);
305 return 0;
306 }
308 if ( unlikely(!get_page(page, d)) )
309 {
310 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
311 return 0;
312 }
314 return 1;
315 }
318 static int get_page_and_type_from_pagenr(unsigned long page_nr,
319 u32 type,
320 struct domain *d)
321 {
322 struct pfn_info *page = &frame_table[page_nr];
324 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
325 return 0;
327 if ( unlikely(!get_page_type(page, type)) )
328 {
329 #ifdef VERBOSE
330 if ( (type & PGT_type_mask) != PGT_l1_page_table )
331 MEM_LOG("Bad page type for pfn %08lx (%08x)",
332 page_nr, page->u.inuse.type_info);
333 #endif
334 put_page(page);
335 return 0;
336 }
338 return 1;
339 }
342 /*
343 * We allow L2 tables to map each other (a.k.a. linear page tables). This
344 * needs some special care with reference counts and access permissions:
345 * 1. The mapping entry must be read-only, or the guest may get write access
346 * to its own PTEs.
347 * 2. We must only bump the reference counts for an *already validated*
348 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
349 * on a validation that cannot complete until our own validation does.
350 * 3. We only need to increment the reference counts for the mapped page
351 * frame if it is mapped by a different L2 table. This is sufficient and
352 * also necessary to allow validation of an L2 table mapping itself.
353 */
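/*
 * In the code below, point 3 shows up as the 'pfn' check: when the entry
 * points back at the page table being validated (the self-map case), no
 * extra reference is taken at all, which is what allows an L2 to map
 * itself safely.
 */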
354 static int
355 get_linear_pagetable(
356 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
357 {
358 u32 x, y;
359 struct pfn_info *page;
361 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
362 {
363 MEM_LOG("Attempt to create linear p.t. with write perms");
364 return 0;
365 }
367 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
368 {
369 /* Make sure the mapped frame belongs to the correct domain. */
370 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
371 return 0;
373 /*
374 * Make sure that the mapped frame is an already-validated L2 table.
375 * If so, atomically increment the count (checking for overflow).
376 */
377 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
378 y = page->u.inuse.type_info;
379 do {
380 x = y;
381 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
382 unlikely((x & (PGT_type_mask|PGT_validated)) !=
383 (PGT_l2_page_table|PGT_validated)) )
384 {
385 put_page(page);
386 return 0;
387 }
388 }
389 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
390 }
392 return 1;
393 }
396 static int
397 get_page_from_l1e(
398 l1_pgentry_t l1e, struct domain *d)
399 {
400 unsigned long l1v = l1_pgentry_val(l1e);
401 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
402 struct pfn_info *page = &frame_table[pfn];
403 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
405 if ( !(l1v & _PAGE_PRESENT) )
406 return 1;
408 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
409 {
410 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
411 return 0;
412 }
414 if ( unlikely(!pfn_is_ram(pfn)) )
415 {
416 /* Revert to caller privileges if FD == DOMID_IO. */
417 if ( d == dom_io )
418 d = current->domain;
420 if ( IS_PRIV(d) )
421 return 1;
423 if ( IS_CAPABLE_PHYSDEV(d) )
424 return domain_iomem_in_pfn(d, pfn);
426 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
427 return 0;
428 }
430 return ((l1v & _PAGE_RW) ?
431 get_page_and_type(page, d, PGT_writable_page) :
432 get_page(page, d));
433 }
436 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
437 static int
438 get_page_from_l2e(
439 l2_pgentry_t l2e, unsigned long pfn,
440 struct domain *d, unsigned long va_idx)
441 {
442 int rc;
444 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
445 return 1;
447 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
448 {
449 MEM_LOG("Bad L2 page type settings %04lx",
450 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
451 return 0;
452 }
454 rc = get_page_and_type_from_pagenr(
455 l2_pgentry_to_pagenr(l2e),
456 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
458 if ( unlikely(!rc) )
459 return get_linear_pagetable(l2e, pfn, d);
461 return 1;
462 }
465 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
466 {
467 unsigned long l1v = l1_pgentry_val(l1e);
468 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
469 struct pfn_info *page = &frame_table[pfn];
470 struct domain *e;
472 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
473 return;
475 e = page->u.inuse.domain;
476 if ( unlikely(e != d) )
477 {
478 /*
479 * Unmap a foreign page that may have been mapped via a grant table.
480 * Note that this can fail for a privileged domain that can map foreign
481 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
482 * counted via a grant entry and some counted directly in the page
483 * structure's reference count. Note that reference counts won't get
484 * dangerously confused as long as we always try to decrement the
485 * grant entry first. We may end up with a mismatch between which
486 * mappings and which unmappings are counted via the grant entry, but
487 * really it doesn't matter as privileged domains have carte blanche.
488 */
489 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
490 return;
491 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
492 }
494 if ( l1v & _PAGE_RW )
495 {
496 put_page_and_type(page);
497 }
498 else
499 {
500 /* We expect this is rare so we blow the entire shadow LDT. */
501 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
502 PGT_ldt_page)) &&
503 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
504 invalidate_shadow_ldt(e->exec_domain[0]);
505 put_page(page);
506 }
507 }
510 /*
511 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
512 * Note also that this automatically deals correctly with linear p.t.'s.
513 */
514 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
515 {
516 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
517 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
518 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
519 }
522 static int alloc_l2_table(struct pfn_info *page)
523 {
524 struct domain *d = page->u.inuse.domain;
525 unsigned long page_nr = page_to_pfn(page);
526 l2_pgentry_t *pl2e;
527 int i;
529 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
531 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
532 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
533 goto fail;
535 #if defined(__i386__)
536 /* Now we add our private high mappings. */
537 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
538 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
539 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
540 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
541 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
542 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
543 mk_l2_pgentry(__pa(page->u.inuse.domain->mm_perdomain_pt) |
544 __PAGE_HYPERVISOR);
545 #endif
547 unmap_domain_mem(pl2e);
548 return 1;
550 fail:
551 while ( i-- > 0 )
552 put_page_from_l2e(pl2e[i], page_nr);
554 unmap_domain_mem(pl2e);
555 return 0;
556 }
559 static int alloc_l1_table(struct pfn_info *page)
560 {
561 struct domain *d = page->u.inuse.domain;
562 unsigned long page_nr = page_to_pfn(page);
563 l1_pgentry_t *pl1e;
564 int i;
566 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
568 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
569 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
570 goto fail;
572 unmap_domain_mem(pl1e);
573 return 1;
575 fail:
576 while ( i-- > 0 )
577 put_page_from_l1e(pl1e[i], d);
579 unmap_domain_mem(pl1e);
580 return 0;
581 }
584 static void free_l2_table(struct pfn_info *page)
585 {
586 unsigned long page_nr = page - frame_table;
587 l2_pgentry_t *pl2e;
588 int i;
590 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
592 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
593 put_page_from_l2e(pl2e[i], page_nr);
595 unmap_domain_mem(pl2e);
596 }
599 static void free_l1_table(struct pfn_info *page)
600 {
601 struct domain *d = page->u.inuse.domain;
602 unsigned long page_nr = page - frame_table;
603 l1_pgentry_t *pl1e;
604 int i;
606 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
608 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
609 put_page_from_l1e(pl1e[i], d);
611 unmap_domain_mem(pl1e);
612 }
615 static inline int update_l2e(l2_pgentry_t *pl2e,
616 l2_pgentry_t ol2e,
617 l2_pgentry_t nl2e)
618 {
619 unsigned long o = cmpxchg((unsigned long *)pl2e,
620 l2_pgentry_val(ol2e),
621 l2_pgentry_val(nl2e));
622 if ( o != l2_pgentry_val(ol2e) )
623 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
624 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
625 return (o == l2_pgentry_val(ol2e));
626 }
629 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
630 static int mod_l2_entry(l2_pgentry_t *pl2e,
631 l2_pgentry_t nl2e,
632 unsigned long pfn)
633 {
634 l2_pgentry_t ol2e;
635 unsigned long _ol2e;
637 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
638 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
639 {
640 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
641 return 0;
642 }
644 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
645 return 0;
646 ol2e = mk_l2_pgentry(_ol2e);
648 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
649 {
650 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
651 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
652 return update_l2e(pl2e, ol2e, nl2e);
654 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
655 ((unsigned long)pl2e &
656 ~PAGE_MASK) >> 2)) )
657 return 0;
659 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
660 {
661 put_page_from_l2e(nl2e, pfn);
662 return 0;
663 }
665 put_page_from_l2e(ol2e, pfn);
666 return 1;
667 }
669 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
670 return 0;
672 put_page_from_l2e(ol2e, pfn);
673 return 1;
674 }
677 static inline int update_l1e(l1_pgentry_t *pl1e,
678 l1_pgentry_t ol1e,
679 l1_pgentry_t nl1e)
680 {
681 unsigned long o = l1_pgentry_val(ol1e);
682 unsigned long n = l1_pgentry_val(nl1e);
684 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
685 unlikely(o != l1_pgentry_val(ol1e)) )
686 {
687 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
688 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
689 return 0;
690 }
692 return 1;
693 }
696 /* Update the L1 entry at pl1e to new value nl1e. */
697 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
698 {
699 l1_pgentry_t ol1e;
700 unsigned long _ol1e;
701 struct domain *d = current->domain;
703 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
704 {
705 MEM_LOG("Bad get_user\n");
706 return 0;
707 }
709 ol1e = mk_l1_pgentry(_ol1e);
711 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
712 {
713 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
714 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
715 return update_l1e(pl1e, ol1e, nl1e);
717 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
718 return 0;
720 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
721 {
722 put_page_from_l1e(nl1e, d);
723 return 0;
724 }
726 put_page_from_l1e(ol1e, d);
727 return 1;
728 }
730 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
731 return 0;
733 put_page_from_l1e(ol1e, d);
734 return 1;
735 }
738 int alloc_page_type(struct pfn_info *page, unsigned int type)
739 {
740 switch ( type )
741 {
742 case PGT_l1_page_table:
743 return alloc_l1_table(page);
744 case PGT_l2_page_table:
745 return alloc_l2_table(page);
746 case PGT_gdt_page:
747 case PGT_ldt_page:
748 return alloc_segdesc_page(page);
749 default:
750 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
751 type, page->u.inuse.type_info,
752 page->count_info);
753 BUG();
754 }
756 return 0;
757 }
760 void free_page_type(struct pfn_info *page, unsigned int type)
761 {
762 struct domain *d = page->u.inuse.domain;
764 switch ( type )
765 {
766 case PGT_l1_page_table:
767 free_l1_table(page);
768 break;
770 case PGT_l2_page_table:
771 free_l2_table(page);
772 break;
774 default:
775 BUG();
776 }
778 if ( unlikely(d->exec_domain[0]->mm.shadow_mode) &&
779 (get_shadow_status(&d->exec_domain[0]->mm, page_to_pfn(page)) & PSH_shadowed) )
780 {
781 unshadow_table(page_to_pfn(page), type);
782 put_shadow_status(&d->exec_domain[0]->mm);
783 }
784 }
787 void put_page_type(struct pfn_info *page)
788 {
789 u32 nx, x, y = page->u.inuse.type_info;
791 again:
792 do {
793 x = y;
794 nx = x - 1;
796 ASSERT((x & PGT_count_mask) != 0);
798 /*
799 * The page should always be validated while a reference is held. The
800 * exception is during domain destruction, when we forcibly invalidate
801 * page-table pages if we detect a referential loop.
802 * See domain.c:relinquish_list().
803 */
804 ASSERT((x & PGT_validated) ||
805 test_bit(DF_DYING, &page->u.inuse.domain->d_flags));
807 if ( unlikely((nx & PGT_count_mask) == 0) )
808 {
809 /* Record TLB information for flush later. Races are harmless. */
810 page->tlbflush_timestamp = tlbflush_current_time();
812 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
813 likely(nx & PGT_validated) )
814 {
815 /*
816 * Page-table pages must be unvalidated when count is zero. The
817 * 'free' is safe because the refcnt is non-zero and validated
818 * bit is clear => other ops will spin or fail.
819 */
820 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
821 x & ~PGT_validated)) != x) )
822 goto again;
823 /* We cleared the 'valid bit', so it is our job to do the clean-up. */
824 free_page_type(page, x & PGT_type_mask);
825 /* Carry on, but with the 'valid bit' now clear. */
826 x &= ~PGT_validated;
827 nx &= ~PGT_validated;
828 }
829 }
830 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
831 (PGT_pinned | 1)) )
832 {
833 /* Page is now only pinned. Make the back pointer mutable again. */
834 nx |= PGT_va_mutable;
835 }
836 }
837 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
838 }
841 int get_page_type(struct pfn_info *page, u32 type)
842 {
843 u32 nx, x, y = page->u.inuse.type_info;
845 again:
846 do {
847 x = y;
848 nx = x + 1;
849 if ( unlikely((nx & PGT_count_mask) == 0) )
850 {
851 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
852 return 0;
853 }
854 else if ( unlikely((x & PGT_count_mask) == 0) )
855 {
856 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
857 {
858 /*
859 * On a type change we check whether we need to flush stale TLB entries. This
860 * may be unnecessary (e.g., page was GDT/LDT) but those
861 * circumstances should be very rare.
862 */
863 struct domain *d = page->u.inuse.domain;
864 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
865 page->tlbflush_timestamp)) )
866 {
867 perfc_incr(need_flush_tlb_flush);
868 flush_tlb_cpu(d->exec_domain[0]->processor);
869 }
871 /* We lose existing type, back pointer, and validity. */
872 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
873 nx |= type;
875 /* No special validation needed for writable pages. */
876 /* Page tables and GDT/LDT need to be scanned for validity. */
877 if ( type == PGT_writable_page )
878 nx |= PGT_validated;
879 }
880 }
881 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
882 {
883 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
884 {
885 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
886 ((type & PGT_type_mask) != PGT_l1_page_table) )
887 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
888 x & PGT_type_mask, type, page_to_pfn(page));
889 return 0;
890 }
891 else if ( (x & PGT_va_mask) == PGT_va_mutable )
892 {
893 /* The va backpointer is mutable, hence we update it. */
894 nx &= ~PGT_va_mask;
895 nx |= type; /* we know the actual type is correct */
896 }
897 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
898 {
899 /* This table is potentially mapped at multiple locations. */
900 nx &= ~PGT_va_mask;
901 nx |= PGT_va_unknown;
902 }
903 }
904 else if ( unlikely(!(x & PGT_validated)) )
905 {
906 /* Someone else is updating validation of this page. Wait... */
907 while ( (y = page->u.inuse.type_info) == x )
908 {
909 rep_nop();
910 barrier();
911 }
912 goto again;
913 }
914 }
915 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
917 if ( unlikely(!(nx & PGT_validated)) )
918 {
919 /* Try to validate page type; drop the new reference on failure. */
920 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
921 {
922 MEM_LOG("Error while validating pfn %08lx for type %08x."
923 " caf=%08x taf=%08x\n",
924 page_to_pfn(page), type,
925 page->count_info,
926 page->u.inuse.type_info);
927 /* No one else can get a reference. We hold the only ref. */
928 page->u.inuse.type_info = 0;
929 return 0;
930 }
932 /* No one else is updating simultaneously. */
933 __set_bit(_PGT_validated, &page->u.inuse.type_info);
934 }
936 return 1;
937 }
940 static int do_extended_command(unsigned long ptr, unsigned long val)
941 {
942 int okay = 1, cpu = smp_processor_id();
943 unsigned int cmd = val & MMUEXT_CMD_MASK;
944 unsigned long pfn = ptr >> PAGE_SHIFT;
945 unsigned long old_base_pfn;
946 struct pfn_info *page = &frame_table[pfn];
947 struct exec_domain *ed = current;
948 struct domain *d = ed->domain, *nd, *e;
949 u32 x, y;
950 domid_t domid;
951 grant_ref_t gntref;
953 switch ( cmd )
954 {
955 case MMUEXT_PIN_L1_TABLE:
956 case MMUEXT_PIN_L2_TABLE:
957 /*
958 * We insist that, if you pin an L1 page, it's the first thing that
959 * you do to it. This is because we require the backptr to still be
960 * mutable. This assumption seems safe.
961 */
962 okay = get_page_and_type_from_pagenr(
963 pfn,
964 ((cmd==MMUEXT_PIN_L2_TABLE) ?
965 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
966 FOREIGNDOM);
968 if ( unlikely(!okay) )
969 {
970 MEM_LOG("Error while pinning pfn %08lx", pfn);
971 break;
972 }
974 if ( unlikely(test_and_set_bit(_PGT_pinned,
975 &page->u.inuse.type_info)) )
976 {
977 MEM_LOG("Pfn %08lx already pinned", pfn);
978 put_page_and_type(page);
979 okay = 0;
980 break;
981 }
983 break;
985 case MMUEXT_UNPIN_TABLE:
986 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
987 {
988 MEM_LOG("Page %08lx bad domain (dom=%p)",
989 ptr, page->u.inuse.domain);
990 }
991 else if ( likely(test_and_clear_bit(_PGT_pinned,
992 &page->u.inuse.type_info)) )
993 {
994 put_page_and_type(page);
995 put_page(page);
996 }
997 else
998 {
999 okay = 0;
1000 put_page(page);
1001 MEM_LOG("Pfn %08lx not pinned", pfn);
1003 break;
1005 case MMUEXT_NEW_BASEPTR:
1006 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
1007 if ( likely(okay) )
1009 invalidate_shadow_ldt(ed);
1011 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1012 old_base_pfn = pagetable_val(ed->mm.pagetable) >> PAGE_SHIFT;
1013 ed->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
1015 shadow_mk_pagetable(&ed->mm);
1017 write_ptbase(&ed->mm);
1019 put_page_and_type(&frame_table[old_base_pfn]);
1021 else
1023 MEM_LOG("Error while installing new baseptr %08lx", ptr);
1025 break;
1027 case MMUEXT_TLB_FLUSH:
1028 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1029 break;
1031 case MMUEXT_INVLPG:
1032 __flush_tlb_one(ptr);
1033 break;
1035 case MMUEXT_FLUSH_CACHE:
1036 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1038 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1039 okay = 0;
1041 else
1043 wbinvd();
1045 break;
1047 case MMUEXT_SET_LDT:
1049 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1050 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1051 (ents > 8192) ||
1052 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1053 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1055 okay = 0;
1056 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1058 else if ( (ed->mm.ldt_ents != ents) ||
1059 (ed->mm.ldt_base != ptr) )
1061 invalidate_shadow_ldt(ed);
1062 ed->mm.ldt_base = ptr;
1063 ed->mm.ldt_ents = ents;
1064 load_LDT(ed);
1065 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1066 if ( ents != 0 )
1067 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1069 break;
1072 case MMUEXT_SET_FOREIGNDOM:
1073 domid = (domid_t)(val >> 16);
1075 if ( (e = percpu_info[cpu].foreign) != NULL )
1076 put_domain(e);
1077 percpu_info[cpu].foreign = NULL;
1079 if ( !IS_PRIV(d) )
1081 switch ( domid )
1083 case DOMID_IO:
1084 get_knownalive_domain(dom_io);
1085 percpu_info[cpu].foreign = dom_io;
1086 break;
1087 default:
1088 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1089 okay = 0;
1090 break;
1093 else
1095 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1096 if ( e == NULL )
1098 switch ( domid )
1100 case DOMID_XEN:
1101 get_knownalive_domain(dom_xen);
1102 percpu_info[cpu].foreign = dom_xen;
1103 break;
1104 case DOMID_IO:
1105 get_knownalive_domain(dom_io);
1106 percpu_info[cpu].foreign = dom_io;
1107 break;
1108 default:
1109 MEM_LOG("Unknown domain '%u'", domid);
1110 okay = 0;
1111 break;
1115 break;
1117 case MMUEXT_TRANSFER_PAGE:
1118 domid = (domid_t)(val >> 16);
1119 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1121 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1122 unlikely(!pfn_is_ram(pfn)) ||
1123 unlikely((e = find_domain_by_id(domid)) == NULL) )
1125 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1126 okay = 0;
1127 break;
1130 spin_lock(&d->page_alloc_lock);
1132 /*
1133 * The tricky bit: atomically release ownership while there is just one
1134 * benign reference to the page (PGC_allocated). If that reference
1135 * disappears then the deallocation routine will safely spin.
1136 */
1137 nd = page->u.inuse.domain;
1138 y = page->count_info;
1139 do {
1140 x = y;
1141 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1142 (1|PGC_allocated)) ||
1143 unlikely(nd != d) )
1145 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1146 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1147 d, d->id, nd, x, page->u.inuse.type_info);
1148 spin_unlock(&d->page_alloc_lock);
1149 put_domain(e);
1150 return 0;
1152 __asm__ __volatile__(
1153 LOCK_PREFIX "cmpxchg8b %2"
1154 : "=d" (nd), "=a" (y),
1155 "=m" (*(volatile u64 *)(&page->count_info))
1156 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1158 while ( unlikely(nd != d) || unlikely(y != x) );
1160 /*
1161 * Unlink from 'd'. At least one reference remains (now anonymous), so
1162 * no one else is spinning to try to delete this page from 'd'.
1163 */
1164 d->tot_pages--;
1165 list_del(&page->list);
1167 spin_unlock(&d->page_alloc_lock);
1169 spin_lock(&e->page_alloc_lock);
1171 /*
1172 * Check that 'e' will accept the page and has reservation headroom.
1173 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1174 */
1175 ASSERT(e->tot_pages <= e->max_pages);
1176 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1177 unlikely(e->tot_pages == e->max_pages) ||
1178 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1180 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1181 "provided a bad grant ref, or is dying (%08lx).\n",
1182 e->tot_pages, e->max_pages, e->d_flags);
1183 spin_unlock(&e->page_alloc_lock);
1184 put_domain(e);
1185 okay = 0;
1186 break;
1189 /* Okay, add the page to 'e'. */
1190 if ( unlikely(e->tot_pages++ == 0) )
1191 get_knownalive_domain(e);
1192 list_add_tail(&page->list, &e->page_list);
1193 page->u.inuse.domain = e;
1195 spin_unlock(&e->page_alloc_lock);
1197 /* Transfer is all done: tell the guest about its new page frame. */
1198 gnttab_notify_transfer(e, gntref, pfn);
1200 put_domain(e);
1201 break;
1203 case MMUEXT_REASSIGN_PAGE:
1204 if ( unlikely(!IS_PRIV(d)) )
1206 MEM_LOG("Dom %u has no reassignment priv", d->id);
1207 okay = 0;
1208 break;
1211 e = percpu_info[cpu].foreign;
1212 if ( unlikely(e == NULL) )
1214 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1215 okay = 0;
1216 break;
1219 /*
1220 * Grab both page_list locks, in order. This prevents the page from
1221 * disappearing elsewhere while we modify the owner, and we'll need
1222 * both locks if we're successful so that we can change lists.
1223 */
1224 if ( d < e )
1226 spin_lock(&d->page_alloc_lock);
1227 spin_lock(&e->page_alloc_lock);
1229 else
1231 spin_lock(&e->page_alloc_lock);
1232 spin_lock(&d->page_alloc_lock);
1235 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1236 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1237 unlikely(IS_XEN_HEAP_FRAME(page)) )
1239 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1240 okay = 0;
1241 goto reassign_fail;
1244 /*
1245 * The tricky bit: atomically change owner while there is just one
1246 * benign reference to the page (PGC_allocated). If that reference
1247 * disappears then the deallocation routine will safely spin.
1248 */
1249 nd = page->u.inuse.domain;
1250 y = page->count_info;
1251 do {
1252 x = y;
1253 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1254 (1|PGC_allocated)) ||
1255 unlikely(nd != d) )
1257 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1258 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1259 d, d->id, nd, x, page->u.inuse.type_info);
1260 okay = 0;
1261 goto reassign_fail;
1263 __asm__ __volatile__(
1264 LOCK_PREFIX "cmpxchg8b %3"
1265 : "=d" (nd), "=a" (y), "=c" (e),
1266 "=m" (*(volatile u64 *)(&page->count_info))
1267 : "0" (d), "1" (x), "c" (e), "b" (x) );
1269 while ( unlikely(nd != d) || unlikely(y != x) );
1271 /*
1272 * Unlink from 'd'. We transferred at least one reference to 'e', so
1273 * no one else is spinning to try to delete this page from 'd'.
1274 */
1275 d->tot_pages--;
1276 list_del(&page->list);
1278 /*
1279 * Add the page to 'e'. Someone may already have removed the last
1280 * reference and want to remove the page from 'e'. However, we have
1281 * the lock so they'll spin waiting for us.
1282 */
1283 if ( unlikely(e->tot_pages++ == 0) )
1284 get_knownalive_domain(e);
1285 list_add_tail(&page->list, &e->page_list);
1287 reassign_fail:
1288 spin_unlock(&d->page_alloc_lock);
1289 spin_unlock(&e->page_alloc_lock);
1290 break;
1292 case MMUEXT_CLEAR_FOREIGNDOM:
1293 if ( (e = percpu_info[cpu].foreign) != NULL )
1294 put_domain(e);
1295 percpu_info[cpu].foreign = NULL;
1296 break;
1298 default:
1299 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1300 okay = 0;
1301 break;
1304 return okay;
1307 int do_mmu_update(
1308 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1310 /*
1311 * We steal the m.s.b. of the @count parameter to indicate whether this
1312 * invocation of do_mmu_update() is resuming a previously preempted call.
1313 * We steal the next 15 bits to remember the current FOREIGNDOM.
1314 */
1315 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1316 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1317 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
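/*
 * Worked example of the packing (assuming a 32-bit 'unsigned int'):
 *
 *   bit 31        bits 30-16        bits 15-0
 *   [preempted]   [FOREIGNDOM id]   [remaining request count]
 *
 * Resuming with 10 requests left on behalf of domain 3 would pass
 *   10 | (3 << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | MMU_UPDATE_PREEMPTED
 * i.e. 0x8003000a, as constructed when the continuation is created below.
 */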
1319 mmu_update_t req;
1320 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1321 struct pfn_info *page;
1322 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1323 unsigned int cmd, done = 0;
1324 unsigned long prev_spfn = 0;
1325 l1_pgentry_t *prev_spl1e = 0;
1326 struct exec_domain *ed = current;
1327 struct domain *d = ed->domain;
1328 u32 type_info;
1329 domid_t domid;
1331 LOCK_BIGLOCK(d);
1333 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1335 /*
1336 * If we are resuming after preemption, read how much work we have already
1337 * done. This allows us to set the @done output parameter correctly.
1338 * We also reset FOREIGNDOM here.
1339 */
1340 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1342 if ( !(count & MMU_UPDATE_PREEMPTED) )
1344 /* Count overflow into private FOREIGNDOM field. */
1345 MEM_LOG("do_mmu_update count is too large");
1346 rc = -EINVAL;
1347 goto out;
1349 count &= ~MMU_UPDATE_PREEMPTED;
1350 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1351 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1352 if ( unlikely(pdone != NULL) )
1353 (void)get_user(done, pdone);
1354 if ( (domid != current->domain->id) &&
1355 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1357 rc = -EINVAL;
1358 goto out;
1362 perfc_incrc(calls_to_mmu_update);
1363 perfc_addc(num_page_updates, count);
1365 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1367 rc = -EFAULT;
1368 goto out;
1371 for ( i = 0; i < count; i++ )
1373 if ( hypercall_preempt_check() )
1375 rc = hypercall_create_continuation(
1376 __HYPERVISOR_mmu_update, 3, ureqs,
1377 (count - i) |
1378 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1379 MMU_UPDATE_PREEMPTED, pdone);
1380 break;
1383 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1385 MEM_LOG("Bad __copy_from_user");
1386 rc = -EFAULT;
1387 break;
1390 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1391 pfn = req.ptr >> PAGE_SHIFT;
1393 okay = 0;
1395 switch ( cmd )
1397 /*
1398 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1399 */
1400 case MMU_NORMAL_PT_UPDATE:
1401 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1403 MEM_LOG("Could not get page for normal update");
1404 break;
1407 if ( likely(prev_pfn == pfn) )
1409 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1411 else
1413 if ( prev_pfn != 0 )
1414 unmap_domain_mem((void *)va);
1415 va = (unsigned long)map_domain_mem(req.ptr);
1416 prev_pfn = pfn;
1419 page = &frame_table[pfn];
1420 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1422 case PGT_l1_page_table:
1423 if ( likely(get_page_type(
1424 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1426 okay = mod_l1_entry((l1_pgentry_t *)va,
1427 mk_l1_pgentry(req.val));
1429 if ( unlikely(ed->mm.shadow_mode) && okay &&
1430 (get_shadow_status(&ed->mm, page-frame_table) &
1431 PSH_shadowed) )
1433 shadow_l1_normal_pt_update(
1434 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1435 put_shadow_status(&ed->mm);
1438 put_page_type(page);
1440 break;
1441 case PGT_l2_page_table:
1442 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1444 okay = mod_l2_entry((l2_pgentry_t *)va,
1445 mk_l2_pgentry(req.val),
1446 pfn);
1448 if ( unlikely(ed->mm.shadow_mode) && okay &&
1449 (get_shadow_status(&ed->mm, page-frame_table) &
1450 PSH_shadowed) )
1452 shadow_l2_normal_pt_update(req.ptr, req.val);
1453 put_shadow_status(&ed->mm);
1456 put_page_type(page);
1458 break;
1459 default:
1460 if ( likely(get_page_type(page, PGT_writable_page)) )
1462 *(unsigned long *)va = req.val;
1463 okay = 1;
1464 put_page_type(page);
1466 break;
1469 put_page(page);
1470 break;
1472 case MMU_MACHPHYS_UPDATE:
1473 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1475 MEM_LOG("Could not get page for mach->phys update");
1476 break;
1479 machine_to_phys_mapping[pfn] = req.val;
1480 okay = 1;
1482 /*
1483 * If in log-dirty mode, mark the corresponding pseudo-physical
1484 * page as dirty.
1485 */
1486 if ( unlikely(ed->mm.shadow_mode == SHM_logdirty) &&
1487 mark_dirty(&ed->mm, pfn) )
1488 ed->mm.shadow_dirty_block_count++;
1490 put_page(&frame_table[pfn]);
1491 break;
1493 /*
1494 * MMU_EXTENDED_COMMAND: Extended command is specified
1495 * in the least-significant bits of the 'value' field.
1496 */
1497 case MMU_EXTENDED_COMMAND:
1498 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1499 okay = do_extended_command(req.ptr, req.val);
1500 break;
1502 default:
1503 MEM_LOG("Invalid page update command %08lx", req.ptr);
1504 break;
1507 if ( unlikely(!okay) )
1509 rc = -EINVAL;
1510 break;
1513 ureqs++;
1516 out:
1517 if ( prev_pfn != 0 )
1518 unmap_domain_mem((void *)va);
1520 if ( unlikely(prev_spl1e != 0) )
1521 unmap_domain_mem((void *)prev_spl1e);
1523 deferred_ops = percpu_info[cpu].deferred_ops;
1524 percpu_info[cpu].deferred_ops = 0;
1526 if ( deferred_ops & DOP_FLUSH_TLB )
1527 local_flush_tlb();
1529 if ( deferred_ops & DOP_RELOAD_LDT )
1530 (void)map_ldt_shadow_page(0);
1532 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1534 put_domain(percpu_info[cpu].foreign);
1535 percpu_info[cpu].foreign = NULL;
1538 /* Add incremental work we have done to the @done output parameter. */
1539 if ( unlikely(pdone != NULL) )
1540 __put_user(done + i, pdone);
1542 UNLOCK_BIGLOCK(d);
1543 return rc;
1547 int do_update_va_mapping(unsigned long page_nr,
1548 unsigned long val,
1549 unsigned long flags)
1551 struct exec_domain *ed = current;
1552 struct domain *d = ed->domain;
1553 int err = 0;
1554 unsigned int cpu = ed->processor;
1555 unsigned long deferred_ops;
1557 perfc_incrc(calls_to_update_va);
1559 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1560 return -EINVAL;
1562 LOCK_BIGLOCK(d);
1564 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1566 /*
1567 * XXX When we make this support 4MB superpages we should also deal with
1568 * the case of updating L2 entries.
1569 */
1571 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1572 mk_l1_pgentry(val))) )
1573 err = -EINVAL;
1575 if ( unlikely(ed->mm.shadow_mode) )
1577 unsigned long sval;
1579 l1pte_propagate_from_guest(&ed->mm, &val, &sval);
1581 if ( unlikely(__put_user(sval, ((unsigned long *)(
1582 &shadow_linear_pg_table[page_nr])))) )
1584 /*
1585 * Since L2's are guaranteed RW, failure indicates the page was not
1586 * shadowed, so ignore.
1587 */
1588 perfc_incrc(shadow_update_va_fail);
1591 /*
1592 * If we're in log-dirty mode then we need to note that we've updated
1593 * the PTE in the PT-holding page. We need the machine frame number
1594 * for this.
1595 */
1596 if ( ed->mm.shadow_mode == SHM_logdirty )
1597 mark_dirty(&current->mm, va_to_l1mfn(page_nr << PAGE_SHIFT));
1599 check_pagetable(&ed->mm, ed->mm.pagetable, "va"); /* debug */
1602 deferred_ops = percpu_info[cpu].deferred_ops;
1603 percpu_info[cpu].deferred_ops = 0;
1605 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1606 unlikely(flags & UVMF_FLUSH_TLB) )
1607 local_flush_tlb();
1608 else if ( unlikely(flags & UVMF_INVLPG) )
1609 __flush_tlb_one(page_nr << PAGE_SHIFT);
1611 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1612 (void)map_ldt_shadow_page(0);
1614 UNLOCK_BIGLOCK(d);
1616 return err;
1619 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1620 unsigned long val,
1621 unsigned long flags,
1622 domid_t domid)
1624 unsigned int cpu = smp_processor_id();
1625 struct domain *d;
1626 int rc;
1628 if ( unlikely(!IS_PRIV(current->domain)) )
1629 return -EPERM;
1631 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1632 if ( unlikely(d == NULL) )
1634 MEM_LOG("Unknown domain '%u'", domid);
1635 return -ESRCH;
1638 rc = do_update_va_mapping(page_nr, val, flags);
1640 put_domain(d);
1641 percpu_info[cpu].foreign = NULL;
1643 return rc;
1648 /*************************
1649 * Writable Pagetables
1650 */
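/*
 * Summary of the mechanism implemented below: when a guest writes to one of
 * its (read-only) L1 page-table pages, ptwr_do_page_fault() disconnects that
 * L1 from the address space (in the ACTIVE case), snapshots its contents
 * into ptwr_info, and gives the guest a temporarily writable mapping.  The
 * page is later reverted by ptwr_flush() -- triggered by a fault on another
 * p.t. page or by cleanup_writable_pagetable() -- which re-write-protects it
 * and validates every PTE that changed relative to the snapshot using
 * get_page_from_l1e()/put_page_from_l1e().
 */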
1652 ptwr_info_t ptwr_info[NR_CPUS];
1654 #ifdef VERBOSE
1655 int ptwr_debug = 0x0;
1656 #define PTWR_PRINTK(_f, _a...) \
1657 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
1658 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
1659 #else
1660 #define PTWR_PRINTK(_f, _a...) ((void)0)
1661 #endif
1663 /* Flush the given writable p.t. page and write-protect it again. */
1664 void ptwr_flush(const int which)
1666 unsigned long sstat, spte, pte, *ptep, l1va;
1667 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
1668 l2_pgentry_t *pl2e;
1669 int i, cpu = smp_processor_id();
1670 struct exec_domain *ed = current;
1671 struct domain *d = ed->domain;
1673 l1va = ptwr_info[cpu].ptinfo[which].l1va;
1674 ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
1676 /*
1677 * STEP 1. Write-protect the p.t. page so no more updates can occur.
1678 */
1680 if ( unlikely(__get_user(pte, ptep)) )
1682 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1683 /*
1684 * Really a bug. We could read this PTE during the initial fault,
1685 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1686 */
1687 BUG();
1689 PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
1690 PTWR_PRINT_WHICH, ptep, pte);
1691 pte &= ~_PAGE_RW;
1693 if ( unlikely(ed->mm.shadow_mode) )
1695 /* Write-protect the p.t. page in the shadow page table. */
1696 l1pte_propagate_from_guest(&ed->mm, &pte, &spte);
1697 __put_user(
1698 spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
1700 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
1701 sstat = get_shadow_status(&ed->mm, pte >> PAGE_SHIFT);
1702 if ( sstat & PSH_shadowed )
1703 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
1706 /* Write-protect the p.t. page in the guest page table. */
1707 if ( unlikely(__put_user(pte, ptep)) )
1709 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
1710 /*
1711 * Really a bug. We could write this PTE during the initial fault,
1712 * and pagetables can't have changed meantime. XXX Multi-CPU guests?
1713 */
1714 BUG();
1717 /* Ensure that there are no stale writable mappings in any TLB. */
1718 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
1719 #if 1
1720 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
1721 #else
1722 flush_tlb_all();
1723 #endif
1724 PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
1725 PTWR_PRINT_WHICH, ptep, pte);
1727 /*
1728 * STEP 2. Validate any modified PTEs.
1729 */
1731 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
1732 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1734 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
1735 nl1e = pl1e[i];
1737 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
1738 continue;
1740 /*
1741 * Fast path for PTEs that have merely been write-protected
1742 * (e.g., during a Unix fork()). A strict reduction in privilege.
1743 */
1744 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
1746 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
1748 if ( unlikely(sl1e != NULL) )
1749 l1pte_propagate_from_guest(
1750 &ed->mm, &l1_pgentry_val(nl1e),
1751 &l1_pgentry_val(sl1e[i]));
1752 put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
1754 continue;
1757 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
1759 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1760 /*
1761 * Make the remaining p.t's consistent before crashing, so the
1762 * reference counts are correct.
1763 */
1764 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
1765 (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
1766 unmap_domain_mem(pl1e);
1767 ptwr_info[cpu].ptinfo[which].l1va = 0;
1768 UNLOCK_BIGLOCK(d);
1769 domain_crash();
1772 if ( unlikely(sl1e != NULL) )
1773 l1pte_propagate_from_guest(
1774 &ed->mm, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
1776 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
1777 put_page_from_l1e(ol1e, d);
1779 unmap_domain_mem(pl1e);
1781 /*
1782 * STEP 3. Reattach the L1 p.t. page into the current address space.
1783 */
1785 if ( (which == PTWR_PT_ACTIVE) && likely(!ed->mm.shadow_mode) )
1787 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
1788 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1791 /*
1792 * STEP 4. Final tidy-up.
1793 */
1795 ptwr_info[cpu].ptinfo[which].l1va = 0;
1797 if ( unlikely(sl1e != NULL) )
1799 unmap_domain_mem(sl1e);
1800 put_shadow_status(&ed->mm);
1804 /* Write page fault handler: check if guest is trying to modify a PTE. */
1805 int ptwr_do_page_fault(unsigned long addr)
1807 unsigned long pte, pfn, l2e;
1808 struct pfn_info *page;
1809 l2_pgentry_t *pl2e;
1810 int which, cpu = smp_processor_id();
1811 u32 l2_idx;
1813 /*
1814 * Attempt to read the PTE that maps the VA being accessed. By checking for
1815 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
1816 */
1817 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
1818 _PAGE_PRESENT) ||
1819 __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
1821 return 0;
1824 pfn = pte >> PAGE_SHIFT;
1825 page = &frame_table[pfn];
1827 /* We are looking only for read-only mappings of p.t. pages. */
1828 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
1829 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
1831 return 0;
1834 /* Get the L2 index at which this L1 p.t. is always mapped. */
1835 l2_idx = page->u.inuse.type_info & PGT_va_mask;
1836 if ( unlikely(l2_idx >= PGT_va_unknown) )
1838 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
1840 l2_idx >>= PGT_va_shift;
1842 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
1844 MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
1845 domain_crash();
1848 /*
1849 * Is the L1 p.t. mapped into the current address space? If so we call it
1850 * an ACTIVE p.t., otherwise it is INACTIVE.
1851 */
1852 pl2e = &linear_l2_table[l2_idx];
1853 l2e = l2_pgentry_val(*pl2e);
1854 which = PTWR_PT_INACTIVE;
1855 if ( (l2e >> PAGE_SHIFT) == pfn )
1857 /* Check the PRESENT bit to set ACTIVE. */
1858 if ( likely(l2e & _PAGE_PRESENT) )
1859 which = PTWR_PT_ACTIVE;
1860 else {
1861 /*
1862 * If the PRESENT bit is clear, we may be conflicting with
1863 * the current ACTIVE p.t. (it may be the same p.t. mapped
1864 * at another virt addr).
1865 * The ptwr_flush call below will restore the PRESENT bit.
1866 */
1867 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
1868 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
1869 which = PTWR_PT_ACTIVE;
1873 PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
1874 "pfn %08lx\n", PTWR_PRINT_WHICH,
1875 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
1877 /*
1878 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
1879 * time. If there is already one, we must flush it out.
1880 */
1881 if ( ptwr_info[cpu].ptinfo[which].l1va )
1882 ptwr_flush(which);
1884 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
1885 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
1887 /* For safety, disconnect the L1 p.t. page from current space. */
1888 if ( (which == PTWR_PT_ACTIVE) && likely(!current->mm.shadow_mode) )
1890 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
1891 #if 1
1892 flush_tlb(); /* XXX Multi-CPU guests? */
1893 #else
1894 flush_tlb_all();
1895 #endif
1898 /* Temporarily map the L1 page, and make a copy of it. */
1899 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
1900 memcpy(ptwr_info[cpu].ptinfo[which].page,
1901 ptwr_info[cpu].ptinfo[which].pl1e,
1902 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1904 /* Finally, make the p.t. page writable by the guest OS. */
1905 pte |= _PAGE_RW;
1906 PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
1907 &linear_pg_table[addr>>PAGE_SHIFT], pte);
1908 if ( unlikely(__put_user(pte, (unsigned long *)
1909 &linear_pg_table[addr>>PAGE_SHIFT])) )
1911 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1912 &linear_pg_table[addr>>PAGE_SHIFT]);
1913 /* Toss the writable pagetable state and crash. */
1914 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
1915 ptwr_info[cpu].ptinfo[which].l1va = 0;
1916 domain_crash();
1919 return EXCRET_fault_fixed;
1922 static __init int ptwr_init(void)
1924 int i;
1926 for ( i = 0; i < smp_num_cpus; i++ )
1928 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
1929 (void *)alloc_xenheap_page();
1930 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
1931 (void *)alloc_xenheap_page();
1934 return 0;
1936 __initcall(ptwr_init);
1941 /************************************************************************/
1942 /************************************************************************/
1943 /************************************************************************/
1945 #ifndef NDEBUG
1947 void ptwr_status(void)
1949 unsigned long pte, *ptep, pfn;
1950 struct pfn_info *page;
1951 int cpu = smp_processor_id();
1953 ptep = (unsigned long *)&linear_pg_table
1954 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
1956 if ( __get_user(pte, ptep) ) {
1957 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
1958 domain_crash();
1961 pfn = pte >> PAGE_SHIFT;
1962 page = &frame_table[pfn];
1963 printk("need to alloc l1 page %p\n", page);
1964 /* make pt page writable */
1965 printk("need to make read-only l1-page at %p is %08lx\n",
1966 ptep, pte);
1968 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
1969 return;
1971 if ( __get_user(pte, (unsigned long *)
1972 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
1973 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1974 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
1975 domain_crash();
1977 pfn = pte >> PAGE_SHIFT;
1978 page = &frame_table[pfn];
1981 void audit_domain(struct domain *d)
1983 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
1985 void adjust (struct pfn_info *page, int dir, int adjtype)
1987 int count = page->count_info & PGC_count_mask;
1989 if ( adjtype )
1991 int tcount = page->u.inuse.type_info & PGT_count_mask;
1993 ttot++;
1995 tcount += dir;
1997 if ( tcount < 0 )
1999 /* This will only come out once. */
2000 printk("Audit %d: type count went below zero pfn=%x "
2001 "taf=%x otaf=%x\n",
2002 d->id, page-frame_table,
2003 page->u.inuse.type_info,
2004 page->tlbflush_timestamp);
2007 page->u.inuse.type_info =
2008 (page->u.inuse.type_info & ~PGT_count_mask) |
2009 (tcount & PGT_count_mask);
2012 ctot++;
2013 count += dir;
2014 if ( count < 0 )
2016 /* This will only come out once. */
2017 printk("Audit %d: general count went below zero pfn=%x "
2018 "taf=%x otaf=%x\n",
2019 d->id, page-frame_table,
2020 page->u.inuse.type_info,
2021 page->tlbflush_timestamp);
2024 page->count_info =
2025 (page->count_info & ~PGC_count_mask) |
2026 (count & PGC_count_mask);
    void scan_for_pfn(struct domain *d, unsigned long xpfn)
    {
        unsigned long pfn, *pt;
        struct list_head *list_ent;
        struct pfn_info *page;
        int i;

        list_ent = d->page_list.next;
        for ( i = 0; (list_ent != &d->page_list); i++ )
        {
            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
            page = &frame_table[pfn];

            switch ( page->u.inuse.type_info & PGT_type_mask )
            {
            case PGT_l1_page_table:
            case PGT_l2_page_table:
                pt = map_domain_mem(pfn<<PAGE_SHIFT);
                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
                    if ( (pt[i] & _PAGE_PRESENT) &&
                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
                        printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
                               d->id, i, pfn, page->u.inuse.type_info,
                               page->count_info);
                unmap_domain_mem(pt);
            }

            list_ent = frame_table[pfn].list.next;
        }
    }
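
    /*
     * scan_for_pfn_remote(): run the same search across every domain in the
     * system, since a stray reference may live in another domain's tables.
     */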
    void scan_for_pfn_remote(unsigned long xpfn)
    {
        struct domain *e;
        for_each_domain ( e )
            scan_for_pfn( e, xpfn );
    }

    int i;
    unsigned long pfn;
    struct list_head *list_ent;
    struct pfn_info *page;

    if ( d != current->domain )
        domain_pause(d);
    synchronise_pagetables(~0UL);

    printk("pt base=%lx sh_info=%x\n",
           pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT,
           virt_to_page(d->shared_info)-frame_table);

    spin_lock(&d->page_alloc_lock);
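
    /*
     * Phase 0 only inspects state: check ownership and the count
     * relationship for every page, and stash the original type_info in
     * tlbflush_timestamp for later reporting.
     */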
    /* PHASE 0 */

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page->u.inuse.domain != d )
            BUG();

        if ( (page->u.inuse.type_info & PGT_count_mask) >
             (page->count_info & PGC_count_mask) )
            printk("taf > caf %x %x pfn=%lx\n",
                   page->u.inuse.type_info, page->count_info, pfn );

#if 0 /* SYSV shared memory pages plus writeable files. */
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
            scan_for_pfn_remote(pfn);
        }
#endif
        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
             (page->u.inuse.type_info & PGT_count_mask) > 1 )
        {
            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
                   pfn,
                   page->u.inuse.type_info,
                   page->count_info );
        }

        /* Use tlbflush_timestamp to store original type_info. */
        page->tlbflush_timestamp = page->u.inuse.type_info;

        list_ent = frame_table[pfn].list.next;
    }
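
    /*
     * Phase 1 subtracts the references the auditor can account for: the
     * reference held by the current page-table base, the pin reference on
     * each pinned L1/L2, and one reference per present page-table entry
     * (skipping low-memory and I/O frames, which are only tallied).
     */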
    /* PHASE 1 */

    adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], -1, 1);

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        if ( page->u.inuse.domain != d )
            BUG();

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L2 not validated %x\n",
                       d->id, page->u.inuse.type_info);

            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L2 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
            else
                adjust( page, -1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1page->u.inuse.domain != d )
                    {
                        printk("L2: Skip bizarre page belonging to other "
                               "dom %p\n", l1page->u.inuse.domain);
                        continue;
                    }

                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                         PGT_l2_page_table )
                        printk("Audit %d: [%x] Found %s Linear PT "
                               "t=%x pfn=%lx\n", d->id, i,
                               (l1pfn==pfn) ? "Self" : "Other",
                               l1page->u.inuse.type_info,
                               l1pfn);
                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
                              PGT_l1_page_table )
                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
                               d->id, i,
                               l1page->u.inuse.type_info,
                               l1pfn);

                    adjust(l1page, -1, 1);
                }
            }

            unmap_domain_mem(pt);

            break;

        case PGT_l1_page_table:

            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, -1, 1 );

            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
                printk("Audit %d: L1 not validated %x\n",
                       d->id, page->u.inuse.type_info);
#if 0
            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
                printk("Audit %d: L1 not pinned %x\n",
                       d->id, page->u.inuse.type_info);
#endif
            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1pfn < 0x100 )
                    {
                        lowmem_mappings++;
                        continue;
                    }

                    if ( l1pfn > max_page )
                    {
                        io_mappings++;
                        continue;
                    }

                    if ( pt[i] & _PAGE_RW )
                    {
                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l1_page_table ||
                             (l1page->u.inuse.type_info & PGT_type_mask) ==
                             PGT_l2_page_table )
                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
                                   d->id, i,
                                   l1page->u.inuse.type_info,
                                   l1pfn);
                    }

                    if ( l1page->u.inuse.domain != d )
                    {
                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%lx "
                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
                               d->id, pfn, i,
                               (unsigned long)l1page->u.inuse.domain,
                               l1pfn,
                               l1page->count_info,
                               l1page->u.inuse.type_info,
                               machine_to_phys_mapping[l1pfn]);
                        continue;
                    }

                    adjust(l1page, -1, 0);
                }
            }

            unmap_domain_mem(pt);

            break;
        }

        list_ent = frame_table[pfn].list.next;
    }

    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
               d->id, lowmem_mappings, io_mappings);
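
    /*
     * Phase 2: with all accountable references removed, a consistent domain
     * should show a type count of zero on every page table and a general
     * count of exactly one on every page; anything else is reported, and
     * scan_for_pfn_remote() is used to hunt down the extra references.
     */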
    /* PHASE 2 */

    ctot = ttot = 0;
    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask)
        {
        case PGT_l1_page_table:
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
            {
                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
                       d->id, page->u.inuse.type_info,
                       page->tlbflush_timestamp,
                       page->count_info, pfn );
                scan_for_pfn_remote(pfn);
            }
        default:
            if ( (page->count_info & PGC_count_mask) != 1 )
            {
                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
                       d->id,
                       page->count_info,
                       page->u.inuse.type_info,
                       page->tlbflush_timestamp, pfn );
                scan_for_pfn_remote(pfn);
            }
            break;
        }

        list_ent = frame_table[pfn].list.next;
    }
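
    /*
     * Phase 3 undoes phase 1: walk the tables again with dir=+1 so every
     * count returns to its pre-audit value, and clear the
     * tlbflush_timestamp scratch values.
     */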
    /* PHASE 3 */

    list_ent = d->page_list.next;
    for ( i = 0; (list_ent != &d->page_list); i++ )
    {
        unsigned long *pt;
        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
        page = &frame_table[pfn];

        switch ( page->u.inuse.type_info & PGT_type_mask )
        {
        case PGT_l2_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( l1page->u.inuse.domain == d)
                        adjust(l1page, 1, 1);
                }
            }

            unmap_domain_mem(pt);
            break;

        case PGT_l1_page_table:
            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
                adjust( page, 1, 1 );

            pt = map_domain_mem( pfn<<PAGE_SHIFT );

            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
            {
                if ( pt[i] & _PAGE_PRESENT )
                {
                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
                    struct pfn_info *l1page = &frame_table[l1pfn];

                    if ( (l1page->u.inuse.domain != d) ||
                         (l1pfn < 0x100) || (l1pfn > max_page) )
                        continue;

                    adjust(l1page, 1, 0);
                }
            }

            unmap_domain_mem(pt);
            break;
        }

        page->tlbflush_timestamp = 0;

        list_ent = frame_table[pfn].list.next;
    }

    spin_unlock(&d->page_alloc_lock);

    adjust(&frame_table[pagetable_val(d->exec_domain[0]->mm.pagetable)>>PAGE_SHIFT], 1, 1);

    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );

    if ( d != current->domain )
        domain_unpause(d);
}
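
/*
 * audit_domains() runs the audit over every domain; audit_domains_key() has
 * the signature of a debug-key handler so the whole audit can be triggered
 * from the console. The key binding itself (presumably via the key-handler
 * table elsewhere in the tree) is not set up in this file.
 */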
void audit_domains(void)
{
    struct domain *d;
    for_each_domain ( d )
        audit_domain(d);
}

void audit_domains_key(unsigned char key)
{
    audit_domains();
}

#endif