
view xen/arch/x86/memory.c @ 2464:eab6988779a6

bitkeeper revision 1.1159.1.142 (413fa2cd-7ateOlBoBUdfXmFsZV9Yw)

Fix build error.
author kaf24@freefall.cl.cam.ac.uk
date Thu Sep 09 00:24:45 2004 +0000 (2004-09-09)
parents 77bc1f0ea51f
children db0119e98043
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
22 /*
23 * A description of the x86 page table API:
24 *
25 * Domains trap to do_mmu_update with a list of update requests.
26 * This is a list of (ptr, val) pairs, where the requested operation
27 * is *ptr = val.
28 *
29 * Reference counting of pages:
30 * ----------------------------
31 * Each page has two refcounts: tot_count and type_count.
32 *
33 * TOT_COUNT is the obvious reference count. It counts all uses of a
34 * physical page frame by a domain, including uses as a page directory,
35 * a page table, or simple mappings via a PTE. This count prevents a
36 * domain from releasing a frame back to the free pool when it still holds
37 * a reference to it.
38 *
39 * TYPE_COUNT is more subtle. A frame can be put to one of three
40 * mutually-exclusive uses: it might be used as a page directory, or a
41 * page table, or it may be mapped writable by the domain [of course, a
42 * frame need not be used in any of these three ways at all!].
43 * So, type_count is a count of the number of times a frame is being
44 * referred to in its current incarnation. Therefore, a page can only
45 * change its type when its type count is zero.
46 *
47 * Pinning the page type:
48 * ----------------------
49 * The type of a page can be pinned/unpinned with the commands
50 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
51 * pinning is not reference counted, so it can't be nested).
52 * This is useful to prevent a page's type count falling to zero, at which
53 * point safety checks would need to be carried out next time the count
54 * is increased again.
55 *
56 * A further note on writable page mappings:
57 * -----------------------------------------
58 * For simplicity, the count of writable mappings for a page may not
59 * correspond to reality. The 'writable count' is incremented for every
60 * PTE which maps the page with the _PAGE_RW flag set. However, for
61 * write access to be possible the page directory entry must also have
62 * its _PAGE_RW bit set. We do not check this as it complicates the
63 * reference counting considerably [consider the case of multiple
64 * directory entries referencing a single page table, some with the RW
65 * bit set, others not -- it starts getting a bit messy].
66 * In normal use, this simplification shouldn't be a problem.
67 * However, the logic can be added if required.
68 *
69 * One more note on read-only page mappings:
70 * -----------------------------------------
71 * We want domains to be able to map pages for read-only access. The
72 * main reason is that page tables and directories should be readable
73 * by a domain, but it would not be safe for them to be writable.
74 * However, domains have free access to rings 1 & 2 of the Intel
75 * privilege model. In terms of page protection, these are considered
76 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
77 * read-only restrictions are respected in supervisor mode -- if the
78 * bit is clear then any mapped page is writable.
79 *
80 * We get round this by always setting the WP bit and disallowing
81 * updates to it. This is very unlikely to cause a problem for guest
82 * OS's, which will generally use the WP bit to simplify copy-on-write
83 * implementation (in that case, OS wants a fault when it writes to
84 * an application-supplied buffer).
85 */
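/*
 * Illustrative sketch (not part of the original file) of how a guest-side
 * request for the interface described above might be encoded. The field
 * names (ptr, val) and the command constants are the ones decoded by
 * do_mmu_update() and do_extended_command() below; the helper and variable
 * names here are purely hypothetical. In a real guest the request list is
 * handed to Xen via the mmu_update hypercall.
 */
#if 0
static void example_encode_requests(unsigned long pte_maddr,
                                     unsigned long new_pte,
                                     unsigned long l2_mfn)
{
    mmu_update_t req[2];

    /* 1. Normal PT update: write 'new_pte' at machine address 'pte_maddr'.
     *    The command lives in the low bits of 'ptr', as decoded below. */
    req[0].ptr = (pte_maddr & ~(sizeof(l1_pgentry_t)-1)) | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte;

    /* 2. Extended command: pin frame 'l2_mfn' as an L2 page table. The
     *    frame number goes in 'ptr', the MMUEXT_* code in 'val'. */
    req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_PIN_L2_TABLE;

    /* req[] would now be passed to Xen, arriving in do_mmu_update(). */
}
#endif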
87 #include <xen/config.h>
88 #include <xen/init.h>
89 #include <xen/lib.h>
90 #include <xen/mm.h>
91 #include <xen/sched.h>
92 #include <xen/errno.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <asm/shadow.h>
97 #include <asm/page.h>
98 #include <asm/flushtlb.h>
99 #include <asm/io.h>
100 #include <asm/uaccess.h>
101 #include <asm/domain_page.h>
102 #include <asm/ldt.h>
104 #ifdef VERBOSE
105 #define MEM_LOG(_f, _a...) \
106 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
107 current->domain , __LINE__ , ## _a )
108 #else
109 #define MEM_LOG(_f, _a...) ((void)0)
110 #endif
112 static int alloc_l2_table(struct pfn_info *page);
113 static int alloc_l1_table(struct pfn_info *page);
114 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
115 static int get_page_and_type_from_pagenr(unsigned long page_nr,
116 u32 type,
117 struct domain *d);
119 static void free_l2_table(struct pfn_info *page);
120 static void free_l1_table(struct pfn_info *page);
122 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
123 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
125 /* Used to defer flushing of memory structures. */
126 static struct {
127 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
128 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
129 unsigned long deferred_ops;
130 unsigned long cr0;
131 /* If non-NULL, specifies a foreign subject domain for some operations. */
132 struct domain *foreign;
133 } percpu_info[NR_CPUS] __cacheline_aligned;
135 /*
136 * Returns the current foreign domain; defaults to the currently-executing
137 * domain if a foreign override hasn't been specified.
138 */
139 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current)
141 /* Private domain structs for DOMID_XEN and DOMID_IO. */
142 static struct domain *dom_xen, *dom_io;
144 void arch_init_memory(void)
145 {
146 unsigned long mfn;
147 int i;
149 /*
150 * We are rather picky about the layout of 'struct pfn_info'. The
151 * count_info and domain fields must be adjacent, as we perform atomic
152 * 64-bit operations on them. Also, just for sanity, we assert the size
153 * of the structure here.
154 */
155 if ( (offsetof(struct pfn_info, u.inuse.domain) !=
156 (offsetof(struct pfn_info, count_info) + sizeof(u32))) ||
157 (sizeof(struct pfn_info) != 24) )
158 {
159 printk("Weird pfn_info layout (%ld,%ld,%d)\n",
160 offsetof(struct pfn_info, count_info),
161 offsetof(struct pfn_info, u.inuse.domain),
162 sizeof(struct pfn_info));
163 for ( ; ; ) ;
164 }
166 memset(percpu_info, 0, sizeof(percpu_info));
168 for ( mfn = 0; mfn < max_page; mfn++ )
169 frame_table[mfn].count_info |= PGC_always_set;
171 /* Initialise to a magic value of 0x55555555 so bugs are easier to spot later. */
172 memset(machine_to_phys_mapping, 0x55, 4<<20);
174 /*
175 * Initialise our DOMID_XEN domain.
176 * Any Xen-heap pages that we will allow to be mapped will have
177 * their domain field set to dom_xen.
178 */
179 dom_xen = alloc_domain_struct();
180 atomic_set(&dom_xen->refcnt, 1);
181 dom_xen->domain = DOMID_XEN;
183 /*
184 * Initialise our DOMID_IO domain.
185 * This domain owns no pages but is considered a special case when
186 * mapping I/O pages, as the mappings occur with the privileges of the caller.
187 */
188 dom_io = alloc_domain_struct();
189 atomic_set(&dom_io->refcnt, 1);
190 dom_io->domain = DOMID_IO;
192 /* M2P table is mappable read-only by privileged domains. */
193 for ( mfn = virt_to_phys(&machine_to_phys_mapping[0<<20])>>PAGE_SHIFT;
194 mfn < virt_to_phys(&machine_to_phys_mapping[1<<20])>>PAGE_SHIFT;
195 mfn++ )
196 {
197 frame_table[mfn].count_info |= PGC_allocated | 1;
198 frame_table[mfn].u.inuse.type_info = PGT_gdt_page | 1; /* non-RW */
199 frame_table[mfn].u.inuse.domain = dom_xen;
200 }
202 vm_assist_info[VMASST_TYPE_writable_pagetables].enable = NULL;
203 vm_assist_info[VMASST_TYPE_writable_pagetables].disable = NULL;
205 for ( i = 0; i < smp_num_cpus; i++ )
206 {
207 ptwr_info[i].disconnected_page = (void *)alloc_xenheap_page();
208 ptwr_info[i].writable_page = (void *)alloc_xenheap_page();
209 }
210 }
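/*
 * Tear down the shadow LDT mapping for domain 'd': slots 16-31 of the
 * per-domain page table hold the shadow LDT entries, so each present entry
 * is cleared and its page/type references are dropped. A TLB flush and LDT
 * reload are deferred to the end of the current hypercall.
 */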
212 static void __invalidate_shadow_ldt(struct domain *d)
213 {
214 int i;
215 unsigned long pfn;
216 struct pfn_info *page;
218 d->mm.shadow_ldt_mapcnt = 0;
220 for ( i = 16; i < 32; i++ )
221 {
222 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
223 if ( pfn == 0 ) continue;
224 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
225 page = &frame_table[pfn];
226 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
227 ASSERT_PAGE_IS_DOMAIN(page, d);
228 put_page_and_type(page);
229 }
231 /* Dispose of the (now possibly invalid) mappings from the TLB. */
232 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
233 }
236 static inline void invalidate_shadow_ldt(struct domain *d)
237 {
238 if ( d->mm.shadow_ldt_mapcnt != 0 )
239 __invalidate_shadow_ldt(d);
240 }
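/*
 * Validate a page that is to hold GDT or LDT descriptors: every one of the
 * 512 eight-byte descriptor slots must pass check_descriptor(). Returns 1 on
 * success, 0 if any descriptor is unsafe.
 */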
243 static int alloc_segdesc_page(struct pfn_info *page)
244 {
245 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
246 int i;
248 for ( i = 0; i < 512; i++ )
249 if ( unlikely(!check_descriptor(&descs[i*2])) )
250 goto fail;
252 unmap_domain_mem(descs);
253 return 1;
255 fail:
256 unmap_domain_mem(descs);
257 return 0;
258 }
261 /* Map shadow page at offset @off. */
262 int map_ldt_shadow_page(unsigned int off)
263 {
264 struct domain *d = current;
265 unsigned long l1e;
267 if ( unlikely(in_irq()) )
268 BUG();
270 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
271 PAGE_SHIFT) + off]);
273 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
274 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
275 d, PGT_ldt_page)) )
276 return 0;
278 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
279 d->mm.shadow_ldt_mapcnt++;
281 return 1;
282 }
285 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
286 {
287 struct pfn_info *page = &frame_table[page_nr];
289 if ( unlikely(!pfn_is_ram(page_nr)) )
290 {
291 MEM_LOG("Pfn %08lx is not RAM", page_nr);
292 return 0;
293 }
295 if ( unlikely(!get_page(page, d)) )
296 {
297 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
298 return 0;
299 }
301 return 1;
302 }
305 static int get_page_and_type_from_pagenr(unsigned long page_nr,
306 u32 type,
307 struct domain *d)
308 {
309 struct pfn_info *page = &frame_table[page_nr];
311 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
312 return 0;
314 if ( unlikely(!get_page_type(page, type)) )
315 {
316 #ifdef VERBOSE
317 if ( (type & PGT_type_mask) != PGT_l1_page_table )
318 MEM_LOG("Bad page type for pfn %08lx (%08x)",
319 page_nr, page->u.inuse.type_info);
320 #endif
321 put_page(page);
322 return 0;
323 }
325 return 1;
326 }
329 /*
330 * We allow L2 tables to map each other (a.k.a. linear page tables). This
331 * needs some special care with reference counts and access permissions:
332 * 1. The mapping entry must be read-only, or the guest may get write access
333 * to its own PTEs.
334 * 2. We must only bump the reference counts for an *already validated*
335 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
336 * on a validation that cannot complete until the waiter itself returns.
337 * 3. We only need to increment the reference counts for the mapped page
338 * frame if it is mapped by a different L2 table. This is sufficient and
339 * also necessary to allow validation of an L2 table mapping itself.
340 */
341 static int
342 get_linear_pagetable(
343 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
344 {
345 u32 x, y;
346 struct pfn_info *page;
348 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
349 {
350 MEM_LOG("Attempt to create linear p.t. with write perms");
351 return 0;
352 }
354 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
355 {
356 /* Make sure the mapped frame belongs to the correct domain. */
357 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
358 return 0;
360 /*
361 * Make sure that the mapped frame is an already-validated L2 table.
362 * If so, atomically increment the count (checking for overflow).
363 */
364 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
365 y = page->u.inuse.type_info;
366 do {
367 x = y;
368 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
369 unlikely((x & (PGT_type_mask|PGT_validated)) !=
370 (PGT_l2_page_table|PGT_validated)) )
371 {
372 put_page(page);
373 return 0;
374 }
375 }
376 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
377 }
379 return 1;
380 }
383 static inline int
384 readonly_page_from_l1e(
385 l1_pgentry_t l1e)
386 {
387 struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)];
388 unsigned long l1v = l1_pgentry_val(l1e);
390 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(l1v >> PAGE_SHIFT) )
391 return 0;
392 put_page_type(page);
393 return 1;
394 }
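/*
 * Take the references implied by a new L1 entry. Not-present entries are
 * trivially fine; I/O pages require the (possibly foreign) subject domain to
 * be suitably privileged; pages owned by another domain are handled via the
 * grant tables; and writable mappings additionally take a PGT_writable_page
 * type reference.
 */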
396 static int
397 get_page_from_l1e(
398 l1_pgentry_t l1e, struct domain *d)
399 {
400 unsigned long l1v = l1_pgentry_val(l1e);
401 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
402 struct pfn_info *page = &frame_table[pfn];
403 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
405 if ( !(l1v & _PAGE_PRESENT) )
406 return 1;
408 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
409 {
410 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
411 return 0;
412 }
414 if ( unlikely(!pfn_is_ram(pfn)) )
415 {
416 /* SPECIAL CASE 1. Mapping an I/O page. */
418 /* Revert to caller privileges if FD == DOMID_IO. */
419 if ( d == dom_io )
420 d = current;
422 if ( IS_PRIV(d) )
423 return 1;
425 if ( IS_CAPABLE_PHYSDEV(d) )
426 return domain_iomem_in_pfn(d, pfn);
428 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
429 return 0;
430 }
432 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
433 {
434 /* SPECIAL CASE 2. Mapping a foreign page via a grant table. */
436 int rc;
437 struct domain *e;
438 u32 count_info;
439 /*
440 * Yuk! Amazingly this is the simplest way to get a guaranteed atomic
441 * snapshot of a 64-bit value on IA32. x86/64 solves this of course!
442 * Basically it's a no-op CMPXCHG, to get us the current contents.
443 * No need for LOCK prefix -- we know that count_info is never zero
444 * because it contains PGC_always_set.
445 */
446 ASSERT(test_bit(_PGC_always_set, &page->count_info));
447 __asm__ __volatile__(
448 "cmpxchg8b %2"
449 : "=d" (e), "=a" (count_info),
450 "=m" (*(volatile u64 *)(&page->count_info))
451 : "0" (0), "1" (0), "c" (0), "b" (0) );
452 if ( unlikely((count_info & PGC_count_mask) == 0) ||
453 unlikely(e == NULL) || unlikely(!get_domain(e)) )
454 return 0;
455 rc = gnttab_try_map(
456 e, d, pfn, (l1v & _PAGE_RW) ? GNTTAB_MAP_RW : GNTTAB_MAP_RO);
457 put_domain(e);
458 return rc;
459 }
461 if ( l1v & _PAGE_RW )
462 {
463 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
464 return 0;
465 }
467 return 1;
468 }
471 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
472 static int
473 get_page_from_l2e(
474 l2_pgentry_t l2e, unsigned long pfn,
475 struct domain *d, unsigned long va_idx)
476 {
477 int rc;
479 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
480 return 1;
482 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
483 {
484 MEM_LOG("Bad L2 page type settings %04lx",
485 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
486 return 0;
487 }
489 rc = get_page_and_type_from_pagenr(
490 l2_pgentry_to_pagenr(l2e),
491 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
493 if ( unlikely(!rc) )
494 return get_linear_pagetable(l2e, pfn, d);
496 return 1;
497 }
500 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
501 {
502 unsigned long l1v = l1_pgentry_val(l1e);
503 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
504 struct pfn_info *page = &frame_table[pfn];
505 struct domain *e = page->u.inuse.domain;
507 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
508 return;
510 if ( unlikely(e != d) )
511 {
512 /*
513 * Unmap a foreign page that may have been mapped via a grant table.
514 * Note that this can fail for a privileged domain that can map foreign
515 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
516 * counted via a grant entry and some counted directly in the page
517 * structure's reference count. Note that reference counts won't get
518 * dangerously confused as long as we always try to decrement the
519 * grant entry first. We may end up with a mismatch between which
520 * mappings and which unmappings are counted via the grant entry, but
521 * really it doesn't matter as privileged domains have carte blanche.
522 */
523 if ( likely(gnttab_try_map(e, d, pfn, (l1v & _PAGE_RW) ?
524 GNTTAB_UNMAP_RW : GNTTAB_UNMAP_RO)) )
525 return;
526 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
527 }
529 if ( l1v & _PAGE_RW )
530 {
531 put_page_and_type(page);
532 }
533 else
534 {
535 /* We expect this is rare so we blow the entire shadow LDT. */
536 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
537 PGT_ldt_page)) &&
538 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
539 invalidate_shadow_ldt(e);
540 put_page(page);
541 }
542 }
545 /*
546 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
547 * Note also that this automatically deals correctly with linear p.t.'s.
548 */
549 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
550 {
551 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
552 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
553 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
554 }
557 static int alloc_l2_table(struct pfn_info *page)
558 {
559 struct domain *d = page->u.inuse.domain;
560 unsigned long page_nr = page_to_pfn(page);
561 l2_pgentry_t *pl2e;
562 int i;
564 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
566 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
567 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
568 goto fail;
569 }
571 #if defined(__i386__)
572 /* Now we add our private high mappings. */
573 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
574 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
575 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
576 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
577 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
578 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
579 mk_l2_pgentry(__pa(page->u.inuse.domain->mm.perdomain_pt) |
580 __PAGE_HYPERVISOR);
581 #endif
583 unmap_domain_mem(pl2e);
584 return 1;
586 fail:
587 while ( i-- > 0 )
588 put_page_from_l2e(pl2e[i], page_nr);
590 unmap_domain_mem(pl2e);
591 return 0;
592 }
595 static int alloc_l1_table(struct pfn_info *page)
596 {
597 struct domain *d = page->u.inuse.domain;
598 unsigned long page_nr = page_to_pfn(page);
599 l1_pgentry_t *pl1e;
600 int i;
602 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
604 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
605 if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
606 goto fail;
608 unmap_domain_mem(pl1e);
609 return 1;
611 fail:
612 while ( i-- > 0 )
613 put_page_from_l1e(pl1e[i], d);
615 unmap_domain_mem(pl1e);
616 return 0;
617 }
620 static void free_l2_table(struct pfn_info *page)
621 {
622 unsigned long page_nr = page - frame_table;
623 l2_pgentry_t *pl2e;
624 int i;
626 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
628 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
629 put_page_from_l2e(pl2e[i], page_nr);
631 unmap_domain_mem(pl2e);
632 }
635 static void free_l1_table(struct pfn_info *page)
636 {
637 struct domain *d = page->u.inuse.domain;
638 unsigned long page_nr = page - frame_table;
639 l1_pgentry_t *pl1e;
640 int i;
642 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
644 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
645 put_page_from_l1e(pl1e[i], d);
647 unmap_domain_mem(pl1e);
648 }
651 static inline int update_l2e(l2_pgentry_t *pl2e,
652 l2_pgentry_t ol2e,
653 l2_pgentry_t nl2e)
654 {
655 unsigned long o = cmpxchg((unsigned long *)pl2e,
656 l2_pgentry_val(ol2e),
657 l2_pgentry_val(nl2e));
658 if ( o != l2_pgentry_val(ol2e) )
659 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
660 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
661 return (o == l2_pgentry_val(ol2e));
662 }
665 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
666 static int mod_l2_entry(l2_pgentry_t *pl2e,
667 l2_pgentry_t nl2e,
668 unsigned long pfn)
669 {
670 l2_pgentry_t ol2e;
671 unsigned long _ol2e;
673 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
674 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
675 {
676 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
677 return 0;
678 }
680 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
681 return 0;
682 ol2e = mk_l2_pgentry(_ol2e);
684 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
685 {
686 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
687 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
688 return update_l2e(pl2e, ol2e, nl2e);
690 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
691 ((unsigned long)pl2e &
692 ~PAGE_MASK) >> 2)) )
693 return 0;
695 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
696 {
697 put_page_from_l2e(nl2e, pfn);
698 return 0;
699 }
701 put_page_from_l2e(ol2e, pfn);
702 return 1;
703 }
705 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
706 return 0;
708 put_page_from_l2e(ol2e, pfn);
709 return 1;
710 }
713 static inline int update_l1e(l1_pgentry_t *pl1e,
714 l1_pgentry_t ol1e,
715 l1_pgentry_t nl1e)
716 {
717 unsigned long o = l1_pgentry_val(ol1e);
718 unsigned long n = l1_pgentry_val(nl1e);
720 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
721 unlikely(o != l1_pgentry_val(ol1e)) )
722 {
723 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
724 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
725 return 0;
726 }
728 return 1;
729 }
732 /* Update the L1 entry at pl1e to new value nl1e. */
733 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
734 {
735 l1_pgentry_t ol1e;
736 unsigned long _ol1e;
737 struct domain *d = current;
739 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
740 {
741 MEM_LOG("Bad get_user\n");
742 return 0;
743 }
745 ol1e = mk_l1_pgentry(_ol1e);
747 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
748 {
749 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
750 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
751 return update_l1e(pl1e, ol1e, nl1e);
753 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
754 return 0;
756 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
757 {
758 put_page_from_l1e(nl1e, d);
759 return 0;
760 }
762 put_page_from_l1e(ol1e, d);
763 return 1;
764 }
766 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
767 return 0;
769 put_page_from_l1e(ol1e, d);
770 return 1;
771 }
774 int alloc_page_type(struct pfn_info *page, unsigned int type)
775 {
776 switch ( type )
777 {
778 case PGT_l1_page_table:
779 return alloc_l1_table(page);
780 case PGT_l2_page_table:
781 return alloc_l2_table(page);
782 case PGT_gdt_page:
783 case PGT_ldt_page:
784 return alloc_segdesc_page(page);
785 default:
786 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
787 type, page->u.inuse.type_info,
788 page->count_info);
789 BUG();
790 }
792 return 0;
793 }
796 void free_page_type(struct pfn_info *page, unsigned int type)
797 {
798 struct domain *d = page->u.inuse.domain;
800 switch ( type )
801 {
802 case PGT_l1_page_table:
803 free_l1_table(page);
804 break;
806 case PGT_l2_page_table:
807 free_l2_table(page);
808 break;
810 default:
811 BUG();
812 }
814 if ( unlikely(d->mm.shadow_mode) &&
815 (get_shadow_status(&d->mm, page_to_pfn(page)) & PSH_shadowed) )
816 {
817 unshadow_table(page_to_pfn(page), type);
818 put_shadow_status(&d->mm);
819 }
820 }
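/*
 * Drop one type reference. When the count falls to zero on a page-table page
 * the PGT_validated bit is cleared and free_page_type() releases the
 * references held by the page's entries; when only the pin remains, the va
 * backpointer is made mutable again.
 */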
823 void put_page_type(struct pfn_info *page)
824 {
825 u32 nx, x, y = page->u.inuse.type_info;
827 again:
828 do {
829 x = y;
830 nx = x - 1;
832 ASSERT((x & PGT_count_mask) != 0);
833 ASSERT(x & PGT_validated);
835 if ( unlikely((nx & PGT_count_mask) == 0) )
836 {
837 /* Record TLB information for flush later. Races are harmless. */
838 page->tlbflush_timestamp = tlbflush_clock;
840 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) )
841 {
842 /*
843 * Page-table pages must be unvalidated when count is zero. The
844 * 'free' is safe because the refcnt is non-zero and validated
845 * bit is clear => other ops will spin or fail.
846 */
847 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
848 x & ~PGT_validated)) != x) )
849 goto again;
850 /* We cleared the 'valid bit' so we do the clear up. */
851 free_page_type(page, x & PGT_type_mask);
852 /* Carry on, but with the 'valid bit' now clear. */
853 x &= ~PGT_validated;
854 nx &= ~PGT_validated;
855 }
856 }
857 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
858 (PGT_pinned | 1)) )
859 {
860 /* Page is now only pinned. Make the back pointer mutable again. */
861 nx |= PGT_va_mutable;
862 }
863 }
864 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
865 }
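/*
 * Take one type reference, changing the page's type if the current count is
 * zero. A type change may flush stale TLB entries and requires the page
 * contents to be re-validated via alloc_page_type(); validations already in
 * progress are waited for rather than duplicated.
 */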
868 int get_page_type(struct pfn_info *page, u32 type)
869 {
870 u32 nx, x, y = page->u.inuse.type_info;
872 again:
873 do {
874 x = y;
875 nx = x + 1;
876 if ( unlikely((nx & PGT_count_mask) == 0) )
877 {
878 MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
879 return 0;
880 }
881 else if ( unlikely((x & PGT_count_mask) == 0) )
882 {
883 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
884 {
885 /*
886 * On type change we check to flush stale TLB entries. This
887 * may be unnecessary (e.g., page was GDT/LDT) but those
888 * circumstances should be very rare.
889 */
890 struct domain *d = page->u.inuse.domain;
891 if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
892 page->tlbflush_timestamp)) )
893 {
894 perfc_incr(need_flush_tlb_flush);
895 flush_tlb_cpu(d->processor);
896 }
898 /* We lose existing type, back pointer, and validity. */
899 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
900 nx |= type;
902 /* No special validation needed for writable pages. */
903 /* Page tables and GDT/LDT need to be scanned for validity. */
904 if ( type == PGT_writable_page )
905 nx |= PGT_validated;
906 }
907 }
908 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
909 {
910 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
911 {
912 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
913 ((type & PGT_type_mask) != PGT_l1_page_table) )
914 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
915 x & PGT_type_mask, type, page_to_pfn(page));
916 return 0;
917 }
918 else if ( (x & PGT_va_mask) == PGT_va_mutable )
919 {
920 /* The va backpointer is mutable, hence we update it. */
921 nx &= ~PGT_va_mask;
922 nx |= type; /* we know the actual type is correct */
923 }
924 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
925 {
926 /* The va backpointer wasn't mutable, and is different. */
927 MEM_LOG("Unexpected va backpointer (saw %08x != exp %08x)"
928 " for pfn %08lx\n", x, type, page_to_pfn(page));
929 return 0;
930 }
931 }
932 else if ( unlikely(!(x & PGT_validated)) )
933 {
934 /* Someone else is updating validation of this page. Wait... */
935 while ( (y = page->u.inuse.type_info) == x )
936 {
937 rep_nop();
938 barrier();
939 }
940 goto again;
941 }
942 }
943 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
945 if ( unlikely(!(nx & PGT_validated)) )
946 {
947 /* Try to validate page type; drop the new reference on failure. */
948 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
949 {
950 MEM_LOG("Error while validating pfn %08lx for type %08x."
951 " caf=%08x taf=%08x\n",
952 page_to_pfn(page), type,
953 page->count_info,
954 page->u.inuse.type_info);
955 /* No one else can get a reference. We hold the only ref. */
956 page->u.inuse.type_info = 0;
957 return 0;
958 }
960 /* No one else is updating simultaneously. */
961 __set_bit(_PGT_validated, &page->u.inuse.type_info);
962 }
964 return 1;
965 }
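/*
 * Handle one MMU_EXTENDED_COMMAND request: pin/unpin page-table pages,
 * install a new page-table base, flush the TLB or a single entry, set the
 * LDT, select or clear a foreign subject domain, and transfer or reassign
 * page ownership. The frame number is taken from 'ptr', the MMUEXT_* code
 * from the low bits of 'val'.
 */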
968 static int do_extended_command(unsigned long ptr, unsigned long val)
969 {
970 int okay = 1, cpu = smp_processor_id();
971 unsigned int cmd = val & MMUEXT_CMD_MASK;
972 unsigned long pfn = ptr >> PAGE_SHIFT;
973 unsigned long old_base_pfn;
974 struct pfn_info *page = &frame_table[pfn];
975 struct domain *d = current, *nd, *e;
976 u32 x, y;
977 domid_t domid;
978 grant_ref_t gntref;
980 switch ( cmd )
981 {
982 case MMUEXT_PIN_L1_TABLE:
983 case MMUEXT_PIN_L2_TABLE:
984 /*
985 * We insist that, if you pin an L1 page, it's the first thing that
986 * you do to it. This is because we require the backptr to still be
987 * mutable. This assumption seems safe.
988 */
989 okay = get_page_and_type_from_pagenr(
990 pfn,
991 ((cmd==MMUEXT_PIN_L2_TABLE) ?
992 PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
993 FOREIGNDOM);
995 if ( unlikely(!okay) )
996 {
997 MEM_LOG("Error while pinning pfn %08lx", pfn);
998 put_page(page);
999 break;
1002 if ( unlikely(test_and_set_bit(_PGT_pinned,
1003 &page->u.inuse.type_info)) )
1005 MEM_LOG("Pfn %08lx already pinned", pfn);
1006 put_page_and_type(page);
1007 okay = 0;
1008 break;
1011 break;
1013 case MMUEXT_UNPIN_TABLE:
1014 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1016 MEM_LOG("Page %08lx bad domain (dom=%p)",
1017 ptr, page->u.inuse.domain);
1019 else if ( likely(test_and_clear_bit(_PGT_pinned,
1020 &page->u.inuse.type_info)) )
1022 put_page_and_type(page);
1023 put_page(page);
1025 else
1027 okay = 0;
1028 put_page(page);
1029 MEM_LOG("Pfn %08lx not pinned", pfn);
1031 break;
1033 case MMUEXT_NEW_BASEPTR:
1034 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
1035 if ( likely(okay) )
1037 invalidate_shadow_ldt(d);
1039 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1040 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
1041 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
1043 shadow_mk_pagetable(&d->mm);
1045 write_ptbase(&d->mm);
1047 put_page_and_type(&frame_table[old_base_pfn]);
1049 /*
1050 * Note that we tick the clock /after/ dropping the old base's
1051 * reference count. If the page tables got freed then this will
1052 * avoid unnecessary TLB flushes when the pages are reused. */
1053 tlb_clocktick();
1055 else
1057 MEM_LOG("Error while installing new baseptr %08lx", ptr);
1059 break;
1061 case MMUEXT_TLB_FLUSH:
1062 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1063 break;
1065 case MMUEXT_INVLPG:
1066 __flush_tlb_one(ptr);
1067 break;
1069 case MMUEXT_FLUSH_CACHE:
1070 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1072 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1073 okay = 0;
1075 else
1077 wbinvd();
1079 break;
1081 case MMUEXT_SET_LDT:
1083 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1084 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1085 (ents > 8192) ||
1086 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1087 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1089 okay = 0;
1090 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
1092 else if ( (d->mm.ldt_ents != ents) ||
1093 (d->mm.ldt_base != ptr) )
1095 invalidate_shadow_ldt(d);
1096 d->mm.ldt_base = ptr;
1097 d->mm.ldt_ents = ents;
1098 load_LDT(d);
1099 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1100 if ( ents != 0 )
1101 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1103 break;
1106 case MMUEXT_SET_FOREIGNDOM:
1107 domid = (domid_t)(val >> 16);
1109 if ( (e = percpu_info[cpu].foreign) != NULL )
1110 put_domain(e);
1111 percpu_info[cpu].foreign = NULL;
1113 if ( !IS_PRIV(d) )
1115 switch ( domid )
1117 case DOMID_IO:
1118 get_knownalive_domain(dom_io);
1119 percpu_info[cpu].foreign = dom_io;
1120 break;
1121 default:
1122 MEM_LOG("Dom %u cannot set foreign dom\n", d->domain);
1123 okay = 0;
1124 break;
1127 else
1129 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1130 if ( e == NULL )
1132 switch ( domid )
1134 case DOMID_XEN:
1135 get_knownalive_domain(dom_xen);
1136 percpu_info[cpu].foreign = dom_xen;
1137 break;
1138 case DOMID_IO:
1139 get_knownalive_domain(dom_io);
1140 percpu_info[cpu].foreign = dom_io;
1141 break;
1142 default:
1143 MEM_LOG("Unknown domain '%u'", domid);
1144 okay = 0;
1145 break;
1149 break;
1151 case MMUEXT_TRANSFER_PAGE:
1152 domid = (domid_t)(val >> 16);
1153 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1155 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1156 unlikely(!pfn_is_ram(pfn)) ||
1157 unlikely((e = find_domain_by_id(domid)) == NULL) )
1159 MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
1160 okay = 0;
1161 break;
1164 spin_lock(&d->page_alloc_lock);
1166 /*
1167 * The tricky bit: atomically release ownership while there is just one
1168 * benign reference to the page (PGC_allocated). If that reference
1169 * disappears then the deallocation routine will safely spin.
1170 */
1171 nd = page->u.inuse.domain;
1172 y = page->count_info;
1173 do {
1174 x = y;
1175 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1176 (1|PGC_allocated)) ||
1177 unlikely(nd != d) )
1179 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1180 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1181 d, d->domain, nd, x, page->u.inuse.type_info);
1182 spin_unlock(&d->page_alloc_lock);
1183 put_domain(e);
1184 okay = 0;
1185 break;
1187 __asm__ __volatile__(
1188 LOCK_PREFIX "cmpxchg8b %2"
1189 : "=d" (nd), "=a" (y),
1190 "=m" (*(volatile u64 *)(&page->count_info))
1191 : "0" (d), "1" (x), "c" (NULL), "b" (x) );
1193 while ( unlikely(nd != d) || unlikely(y != x) );
1195 /*
1196 * Unlink from 'd'. At least one reference remains (now anonymous), so
1197 * no one else is spinning to try to delete this page from 'd'.
1198 */
1199 d->tot_pages--;
1200 list_del(&page->list);
1202 spin_unlock(&d->page_alloc_lock);
1204 spin_lock(&e->page_alloc_lock);
1206 /*
1207 * Check that 'e' will accept the page and has reservation headroom.
1208 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1209 */
1210 ASSERT(e->tot_pages <= e->max_pages);
1211 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1212 unlikely(e->tot_pages == e->max_pages) ||
1213 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1215 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1216 "provided a bad grant ref, or is dying (%08lx).\n",
1217 e->tot_pages, e->max_pages, e->flags);
1218 spin_unlock(&e->page_alloc_lock);
1219 put_domain(e);
1220 okay = 0;
1221 break;
1224 /* Okay, add the page to 'e'. */
1225 if ( unlikely(e->tot_pages++ == 0) )
1226 get_knownalive_domain(e);
1227 list_add_tail(&page->list, &e->page_list);
1228 page->u.inuse.domain = e;
1230 spin_unlock(&e->page_alloc_lock);
1232 /* Transfer is all done: tell the guest about its new page frame. */
1233 gnttab_notify_transfer(e, gntref, pfn);
1235 put_domain(e);
1236 break;
1238 case MMUEXT_REASSIGN_PAGE:
1239 if ( unlikely(!IS_PRIV(d)) )
1241 MEM_LOG("Dom %u has no reassignment priv", d->domain);
1242 okay = 0;
1243 break;
1246 e = percpu_info[cpu].foreign;
1247 if ( unlikely(e == NULL) )
1249 MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
1250 okay = 0;
1251 break;
1254 /*
1255 * Grab both page_list locks, in order. This prevents the page from
1256 * disappearing elsewhere while we modify the owner, and we'll need
1257 * both locks if we're successful so that we can change lists.
1258 */
1259 if ( d < e )
1261 spin_lock(&d->page_alloc_lock);
1262 spin_lock(&e->page_alloc_lock);
1264 else
1266 spin_lock(&e->page_alloc_lock);
1267 spin_lock(&d->page_alloc_lock);
1270 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1271 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
1272 unlikely(IS_XEN_HEAP_FRAME(page)) )
1274 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1275 okay = 0;
1276 goto reassign_fail;
1279 /*
1280 * The tricky bit: atomically change owner while there is just one
1281 * benign reference to the page (PGC_allocated). If that reference
1282 * disappears then the deallocation routine will safely spin.
1283 */
1284 nd = page->u.inuse.domain;
1285 y = page->count_info;
1286 do {
1287 x = y;
1288 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1289 (1|PGC_allocated)) ||
1290 unlikely(nd != d) )
1292 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
1293 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1294 d, d->domain, nd, x, page->u.inuse.type_info);
1295 okay = 0;
1296 goto reassign_fail;
1298 __asm__ __volatile__(
1299 LOCK_PREFIX "cmpxchg8b %3"
1300 : "=d" (nd), "=a" (y), "=c" (e),
1301 "=m" (*(volatile u64 *)(&page->count_info))
1302 : "0" (d), "1" (x), "c" (e), "b" (x) );
1304 while ( unlikely(nd != d) || unlikely(y != x) );
1306 /*
1307 * Unlink from 'd'. We transferred at least one reference to 'e', so
1308 * no one else is spinning to try to delete this page from 'd'.
1309 */
1310 d->tot_pages--;
1311 list_del(&page->list);
1313 /*
1314 * Add the page to 'e'. Someone may already have removed the last
1315 * reference and want to remove the page from 'e'. However, we have
1316 * the lock so they'll spin waiting for us.
1317 */
1318 if ( unlikely(e->tot_pages++ == 0) )
1319 get_knownalive_domain(e);
1320 list_add_tail(&page->list, &e->page_list);
1322 reassign_fail:
1323 spin_unlock(&d->page_alloc_lock);
1324 spin_unlock(&e->page_alloc_lock);
1325 break;
1327 case MMUEXT_CLEAR_FOREIGNDOM:
1328 if ( (e = percpu_info[cpu].foreign) != NULL )
1329 put_domain(e);
1330 percpu_info[cpu].foreign = NULL;
1331 break;
1333 default:
1334 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
1335 okay = 0;
1336 break;
1339 return okay;
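/*
 * Top-level handler for the mmu_update hypercall: iterates over 'count'
 * (ptr, val) requests, dispatching on the command encoded in the low bits of
 * 'ptr' (normal PT update, machine-to-physical update, or extended command).
 * Deferred TLB flushes and LDT reloads are applied once the whole batch has
 * been processed.
 */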
1343 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
1345 mmu_update_t req;
1346 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1347 struct pfn_info *page;
1348 int rc = 0, okay = 1, i, cpu = smp_processor_id();
1349 unsigned int cmd;
1350 unsigned long prev_spfn = 0;
1351 l1_pgentry_t *prev_spl1e = 0;
1352 struct domain *d = current;
1353 u32 type_info;
1355 perfc_incrc(calls_to_mmu_update);
1356 perfc_addc(num_page_updates, count);
1358 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1360 if ( unlikely(!access_ok(VERIFY_READ, ureqs, count * sizeof(req))) )
1361 return -EFAULT;
1363 for ( i = 0; i < count; i++ )
1365 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1367 MEM_LOG("Bad __copy_from_user");
1368 rc = -EFAULT;
1369 break;
1372 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1373 pfn = req.ptr >> PAGE_SHIFT;
1375 okay = 0;
1377 switch ( cmd )
1379 /*
1380 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1381 */
1382 case MMU_NORMAL_PT_UPDATE:
1383 if ( unlikely(!get_page_from_pagenr(pfn, current)) )
1385 MEM_LOG("Could not get page for normal update");
1386 break;
1389 if ( likely(prev_pfn == pfn) )
1391 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1393 else
1395 if ( prev_pfn != 0 )
1396 unmap_domain_mem((void *)va);
1397 va = (unsigned long)map_domain_mem(req.ptr);
1398 prev_pfn = pfn;
1401 page = &frame_table[pfn];
1402 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1404 case PGT_l1_page_table:
1405 if ( likely(get_page_type(
1406 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1408 okay = mod_l1_entry((l1_pgentry_t *)va,
1409 mk_l1_pgentry(req.val));
1411 if ( unlikely(d->mm.shadow_mode) && okay &&
1412 (get_shadow_status(&d->mm, page-frame_table) &
1413 PSH_shadowed) )
1415 shadow_l1_normal_pt_update(
1416 req.ptr, req.val, &prev_spfn, &prev_spl1e);
1417 put_shadow_status(&d->mm);
1420 put_page_type(page);
1422 break;
1423 case PGT_l2_page_table:
1424 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1426 okay = mod_l2_entry((l2_pgentry_t *)va,
1427 mk_l2_pgentry(req.val),
1428 pfn);
1430 if ( unlikely(d->mm.shadow_mode) && okay &&
1431 (get_shadow_status(&d->mm, page-frame_table) &
1432 PSH_shadowed) )
1434 shadow_l2_normal_pt_update(req.ptr, req.val);
1435 put_shadow_status(&d->mm);
1438 put_page_type(page);
1440 break;
1441 default:
1442 if ( likely(get_page_type(page, PGT_writable_page)) )
1444 *(unsigned long *)va = req.val;
1445 okay = 1;
1446 put_page_type(page);
1448 break;
1451 put_page(page);
1452 break;
1454 case MMU_MACHPHYS_UPDATE:
1455 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1457 MEM_LOG("Could not get page for mach->phys update");
1458 break;
1461 machine_to_phys_mapping[pfn] = req.val;
1462 okay = 1;
1464 /*
1465 * If in log-dirty mode, mark the corresponding pseudo-physical
1466 * page as dirty.
1467 */
1468 if ( unlikely(d->mm.shadow_mode == SHM_logdirty) )
1469 mark_dirty(&d->mm, pfn);
1471 put_page(&frame_table[pfn]);
1472 break;
1474 /*
1475 * MMU_EXTENDED_COMMAND: Extended command is specified
1476 * in the least-significant bits of the 'value' field.
1477 */
1478 case MMU_EXTENDED_COMMAND:
1479 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1480 okay = do_extended_command(req.ptr, req.val);
1481 break;
1483 default:
1484 MEM_LOG("Invalid page update command %08lx", req.ptr);
1485 break;
1488 if ( unlikely(!okay) )
1490 rc = -EINVAL;
1491 break;
1494 ureqs++;
1497 if ( prev_pfn != 0 )
1498 unmap_domain_mem((void *)va);
1500 if ( unlikely(prev_spl1e != 0) )
1501 unmap_domain_mem((void *)prev_spl1e);
1503 deferred_ops = percpu_info[cpu].deferred_ops;
1504 percpu_info[cpu].deferred_ops = 0;
1506 if ( deferred_ops & DOP_FLUSH_TLB )
1507 local_flush_tlb();
1509 if ( deferred_ops & DOP_RELOAD_LDT )
1510 (void)map_ldt_shadow_page(0);
1512 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1514 put_domain(percpu_info[cpu].foreign);
1515 percpu_info[cpu].foreign = NULL;
1518 if ( unlikely(success_count != NULL) )
1519 put_user(count, success_count);
1521 return rc;
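/*
 * Update the PTE that maps linear address page 'page_nr' in the current
 * address space, going through the linear page table. Shadow page tables and
 * log-dirty tracking are updated if active, and the UVMF_* flags can force a
 * full TLB flush or a single-entry invalidation.
 */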
1525 int do_update_va_mapping(unsigned long page_nr,
1526 unsigned long val,
1527 unsigned long flags)
1529 struct domain *d = current;
1530 int err = 0;
1531 unsigned int cpu = d->processor;
1532 unsigned long deferred_ops;
1534 perfc_incrc(calls_to_update_va);
1536 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1537 return -EINVAL;
1539 cleanup_writable_pagetable(d, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
1541 /*
1542 * XXX When we make this support 4MB superpages we should also deal with
1543 * the case of updating L2 entries.
1544 */
1546 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1547 mk_l1_pgentry(val))) )
1548 err = -EINVAL;
1550 if ( unlikely(d->mm.shadow_mode) )
1552 unsigned long sval;
1554 l1pte_no_fault(&d->mm, &val, &sval);
1556 if ( unlikely(__put_user(sval, ((unsigned long *)(
1557 &shadow_linear_pg_table[page_nr])))) )
1559 /*
1560 * Since L2's are guaranteed RW, failure indicates the page was not
1561 * shadowed, so ignore.
1562 */
1563 perfc_incrc(shadow_update_va_fail);
1566 /*
1567 * If we're in log-dirty mode then we need to note that we've updated
1568 * the PTE in the PT-holding page. We need the machine frame number
1569 * for this.
1570 */
1571 if ( d->mm.shadow_mode == SHM_logdirty )
1572 mark_dirty( &current->mm, va_to_l1mfn(page_nr<<PAGE_SHIFT) );
1574 check_pagetable(d, d->mm.pagetable, "va"); /* debug */
1577 deferred_ops = percpu_info[cpu].deferred_ops;
1578 percpu_info[cpu].deferred_ops = 0;
1580 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1581 unlikely(flags & UVMF_FLUSH_TLB) )
1582 local_flush_tlb();
1583 else if ( unlikely(flags & UVMF_INVLPG) )
1584 __flush_tlb_one(page_nr << PAGE_SHIFT);
1586 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1587 (void)map_ldt_shadow_page(0);
1589 return err;
1592 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1593 unsigned long val,
1594 unsigned long flags,
1595 domid_t domid)
1597 unsigned int cpu = smp_processor_id();
1598 struct domain *d;
1599 int rc;
1601 if ( unlikely(!IS_PRIV(current)) )
1602 return -EPERM;
1604 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
1605 if ( unlikely(d == NULL) )
1607 MEM_LOG("Unknown domain '%u'", domid);
1608 return -ESRCH;
1611 rc = do_update_va_mapping(page_nr, val, flags);
1613 put_domain(d);
1614 percpu_info[cpu].foreign = NULL;
1616 return rc;
1621 /*************************
1622 * Writable Pagetables
1623 */
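/*
 * The writable-pagetable machinery below lets a guest write directly to its
 * L1 page tables: on a write fault to an L1 page the hypervisor either
 * tracks it as an "inactive" writable page or disconnects it from the
 * current L2, and redirects the writes to a private copy. When the page is
 * later flushed or reconnected, any modified entries are re-validated with
 * get_page_from_l1e()/put_page_from_l1e().
 */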
1625 ptwr_info_t ptwr_info[NR_CPUS] =
1626 { [ 0 ... NR_CPUS-1 ] =
1628 .disconnected_pteidx = -1,
1629 .disconnected_page = 0,
1630 .writable_l1va = 0,
1631 .writable_page = 0,
1633 };
1635 #ifdef VERBOSE
1636 int ptwr_debug = 0x0;
1637 #define PTWR_PRINTK(w, x) if (ptwr_debug & (w)) printk x
1638 #define PP_ALL 0xff
1639 #define PP_A 0x1
1640 #define PP_I 0x2
1641 #else
1642 #define PTWR_PRINTK(w, x)
1643 #endif
1645 void ptwr_reconnect_disconnected(void)
1647 unsigned long pte;
1648 #ifdef VERBOSE
1649 unsigned long pfn;
1650 #endif
1651 l2_pgentry_t *pl2e, nl2e;
1652 l1_pgentry_t *pl1e;
1653 int cpu = smp_processor_id();
1654 int i;
1655 unsigned long *writable_pte = (unsigned long *)&linear_pg_table
1656 [ptwr_info[cpu].disconnected_l1va>>PAGE_SHIFT];
1658 PTWR_PRINTK(PP_A, ("[A] page fault in disconn space %08lx\n",
1659 ptwr_info[cpu].disconnected_pteidx <<
1660 L2_PAGETABLE_SHIFT));
1661 pl2e = &linear_l2_table[ptwr_info[cpu].disconnected_pteidx];
1663 #ifdef VERBOSE
1664 pfn = ptwr_info[cpu].disconnected_pte >> PAGE_SHIFT;
1665 #endif
1666 PTWR_PRINTK(PP_A, ("[A] pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n",
1667 pl2e, l2_pgentry_val(*pl2e), l1_pgentry_val(
1668 linear_pg_table[(unsigned long)pl2e >>
1669 PAGE_SHIFT]) >> PAGE_SHIFT,
1670 frame_table[pfn].u.inuse.type_info,
1671 frame_table[pfn].count_info));
1673 nl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
1674 pl1e = ptwr_info[cpu].disconnected_pl1e;
1675 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) {
1676 l1_pgentry_t ol1e, nl1e;
1677 nl1e = ptwr_info[cpu].disconnected_page[i];
1678 ol1e = pl1e[i];
1679 if (likely(l1_pgentry_val(nl1e) == l1_pgentry_val(ol1e)))
1680 continue;
1681 if (likely(l1_pgentry_val(nl1e) == (l1_pgentry_val(ol1e) | _PAGE_RW)))
1683 if (likely(readonly_page_from_l1e(nl1e))) {
1684 pl1e[i] = ptwr_info[cpu].disconnected_page[i];
1685 continue;
1688 if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT))
1689 put_page_from_l1e(ol1e, current);
1690 if (unlikely(!get_page_from_l1e(nl1e, current))) {
1691 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1692 domain_crash();
1694 pl1e[i] = ptwr_info[cpu].disconnected_page[i];
1696 unmap_domain_mem(pl1e);
1697 /* reconnect l1 page */
1698 update_l2e(pl2e, *pl2e, nl2e);
1700 PTWR_PRINTK(PP_A,
1701 ("[A] now pl2e %p l2e %08lx taf %08x/%08x\n",
1702 pl2e, l2_pgentry_val(*pl2e),
1703 frame_table[pfn].u.inuse.type_info,
1704 frame_table[pfn].count_info));
1705 ptwr_info[cpu].disconnected_pteidx = -1;
1707 /* make pt page write protected */
1708 if (__get_user(pte, writable_pte)) {
1709 MEM_LOG("ptwr: Could not read pte at %p\n", writable_pte);
1710 domain_crash();
1712 PTWR_PRINTK(PP_A, ("[A] disconnected_l1va at %p is %08lx\n",
1713 writable_pte, pte));
1714 pte = (ptwr_info[cpu].disconnected_pte & PAGE_MASK) |
1715 (pte & ~(PAGE_MASK|_PAGE_RW));
1716 if (__put_user(pte, writable_pte)) {
1717 MEM_LOG("ptwr: Could not update pte at %p\n", writable_pte);
1718 domain_crash();
1720 __flush_tlb_one(ptwr_info[cpu].disconnected_l1va);
1721 PTWR_PRINTK(PP_A, ("[A] disconnected_l1va at %p now %08lx\n",
1722 writable_pte, pte));
1726 void ptwr_flush_inactive(void)
1728 unsigned long pte;
1729 l1_pgentry_t *pl1e;
1730 int cpu = smp_processor_id();
1731 int i;
1732 unsigned long *writable_pte = (unsigned long *)&linear_pg_table
1733 [ptwr_info[cpu].writable_l1va>>PAGE_SHIFT];
1735 pl1e = ptwr_info[cpu].writable_pl1e;
1736 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) {
1737 l1_pgentry_t ol1e, nl1e;
1738 nl1e = ptwr_info[cpu].writable_page[i];
1739 ol1e = pl1e[i];
1740 if (likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)))
1741 continue;
1742 if (unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT))
1743 put_page_from_l1e(ol1e, current);
1744 if (unlikely(!get_page_from_l1e(nl1e, current))) {
1745 MEM_LOG("ptwr: Could not re-validate l1 page\n");
1746 domain_crash();
1748 pl1e[i] = ptwr_info[cpu].writable_page[i];
1750 unmap_domain_mem(pl1e);
1752 /* make pt page writable */
1753 if (__get_user(pte, writable_pte)) {
1754 MEM_LOG("ptwr: Could not read pte at %p\n", writable_pte);
1755 domain_crash();
1757 PTWR_PRINTK(PP_I, ("[I] disconnected_l1va at %p is %08lx\n",
1758 writable_pte, pte));
1759 pte = (ptwr_info[cpu].writable_pte & PAGE_MASK) |
1760 (pte & ~(PAGE_MASK|_PAGE_RW));
1761 if (__put_user(pte, writable_pte)) {
1762 MEM_LOG("ptwr: Could not update pte at %p\n", writable_pte);
1763 domain_crash();
1765 __flush_tlb_one(ptwr_info[cpu].writable_l1va);
1766 PTWR_PRINTK(PP_I, ("[I] disconnected_l1va at %p now %08lx\n",
1767 writable_pte, pte));
1769 ptwr_info[cpu].writable_l1va = 0;
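/*
 * Write-fault hook: if the faulting address hits a page currently typed as
 * an L1 page table, give the guest a temporarily writable private copy of it
 * (disconnecting the L1 from the current L2 if it is in active use) and
 * return 1 to indicate the fault has been handled.
 */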
1772 int ptwr_do_page_fault(unsigned long addr)
1774 /* write page fault, check if we're trying to modify an l1 page table */
1775 unsigned long pte, pfn;
1776 struct pfn_info *page;
1777 l2_pgentry_t *pl2e;
1778 int cpu = smp_processor_id();
1780 #if 0
1781 PTWR_PRINTK(PP_ALL, ("get user %p for va %08lx\n",
1782 &linear_pg_table[addr>>PAGE_SHIFT], addr));
1783 #endif
1785 /* Testing for page_present in the L2 avoids lots of unnecessary fixups */
1786 if ( (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) &
1787 _PAGE_PRESENT) &&
1788 (__get_user(pte, (unsigned long *)
1789 &linear_pg_table[addr >> PAGE_SHIFT]) == 0) )
1791 pfn = pte >> PAGE_SHIFT;
1792 #if 0
1793 PTWR_PRINTK(PP_ALL, ("check pte %08lx = pfn %08lx for va %08lx\n", pte,
1794 pfn, addr));
1795 #endif
1796 page = &frame_table[pfn];
1797 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
1799 pl2e = &linear_l2_table[(page->u.inuse.type_info &
1800 PGT_va_mask) >> PGT_va_shift];
1801 PTWR_PRINTK(PP_ALL, ("page_fault on l1 pt at va %08lx, pt for %08x"
1802 ", pfn %08lx\n", addr,
1803 ((page->u.inuse.type_info & PGT_va_mask) >>
1804 PGT_va_shift) << L2_PAGETABLE_SHIFT, pfn));
1806 if ( l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn )
1808 /* this L1 is not in the current address space */
1809 PTWR_PRINTK(PP_I, ("[I] freeing l1 page %p taf %08x/%08x\n",
1810 page, page->u.inuse.type_info,
1811 page->count_info));
1812 if (ptwr_info[cpu].writable_l1va)
1813 ptwr_flush_inactive();
1814 ptwr_info[cpu].writable_l1va = addr | 1;
1816 ptwr_info[cpu].writable_pl1e =
1817 map_domain_mem(pfn << PAGE_SHIFT);
1818 memcpy(ptwr_info[cpu].writable_page,
1819 ptwr_info[cpu].writable_pl1e,
1820 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1822 /* make pt page writable */
1823 ptwr_info[cpu].writable_pte = pte;
1824 pte = (virt_to_phys(ptwr_info[cpu].writable_page) &
1825 PAGE_MASK) | _PAGE_RW | (pte & ~PAGE_MASK);
1827 else
1829 l2_pgentry_t nl2e;
1831 if ( ptwr_info[cpu].disconnected_pteidx >= 0 )
1832 ptwr_reconnect_disconnected();
1833 PTWR_PRINTK(PP_A, ("[A] pl2e %p l2e %08lx pfn %08lx "
1834 "taf %08x/%08x\n", pl2e,
1835 l2_pgentry_val(*pl2e),
1836 l1_pgentry_val(linear_pg_table
1837 [(unsigned long)pl2e >>
1838 PAGE_SHIFT]) >> PAGE_SHIFT,
1839 frame_table[pfn].u.inuse.type_info,
1840 frame_table[pfn].count_info));
1841 /* disconnect l1 page */
1842 nl2e = mk_l2_pgentry((l2_pgentry_val(*pl2e) & ~_PAGE_PRESENT));
1843 update_l2e(pl2e, *pl2e, nl2e);
1845 ptwr_info[cpu].disconnected_pteidx =
1846 (page->u.inuse.type_info & PGT_va_mask) >> PGT_va_shift;
1847 PTWR_PRINTK(PP_A, ("[A] now pl2e %p l2e %08lx "
1848 "taf %08x/%08x\n", pl2e,
1849 l2_pgentry_val(*pl2e),
1850 frame_table[pfn].u.inuse.type_info,
1851 frame_table[pfn].count_info));
1852 ptwr_info[cpu].disconnected_l1va = addr;
1853 ptwr_info[cpu].disconnected_pl1e =
1854 map_domain_mem(l2_pgentry_to_pagenr(nl2e) << PAGE_SHIFT);
1855 memcpy(&ptwr_info[cpu].disconnected_page[0],
1856 ptwr_info[cpu].disconnected_pl1e,
1857 ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
1859 /* make pt page writable */
1860 ptwr_info[cpu].disconnected_pte = pte;
1861 pte = (virt_to_phys(ptwr_info[cpu].disconnected_page) &
1862 PAGE_MASK) | _PAGE_RW | (pte & ~PAGE_MASK);
1865 PTWR_PRINTK(PP_ALL, ("update %p pte to %08lx\n",
1866 &linear_pg_table[addr>>PAGE_SHIFT], pte));
1867 if ( __put_user(pte, (unsigned long *)
1868 &linear_pg_table[addr>>PAGE_SHIFT]) ) {
1869 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
1870 &linear_pg_table[addr>>PAGE_SHIFT]);
1871 domain_crash();
1873 return 1;
1876 return 0;
1879 #ifndef NDEBUG
1880 void ptwr_status(void)
1882 unsigned long pte, pfn;
1883 struct pfn_info *page;
1884 l2_pgentry_t *pl2e;
1885 int cpu = smp_processor_id();
1887 unsigned long *writable_pte = (unsigned long *)&linear_pg_table
1888 [ptwr_info[cpu].writable_l1va>>PAGE_SHIFT];
1890 if ( __get_user(pte, writable_pte) ) {
1891 MEM_LOG("ptwr: Could not read pte at %p\n", writable_pte);
1892 domain_crash();
1895 pfn = pte >> PAGE_SHIFT;
1896 page = &frame_table[pfn];
1897 printk("need to alloc l1 page %p\n", page);
1898 /* make pt page writable */
1899 printk("need to make read-only l1-page at %p is %08lx\n",
1900 writable_pte, pte);
1902 if ( ptwr_info[cpu].disconnected_pteidx < 0 )
1903 return;
1905 printk("disconnected space: space %08lx\n",
1906 ptwr_info[cpu].disconnected_pteidx << L2_PAGETABLE_SHIFT);
1907 pl2e = &linear_l2_table[ptwr_info[cpu].disconnected_pteidx];
1909 if ( __get_user(pte, (unsigned long *)ptwr_info[cpu].disconnected_l1va) ) {
1910 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
1911 ptwr_info[cpu].disconnected_l1va);
1912 domain_crash();
1914 pfn = pte >> PAGE_SHIFT;
1915 page = &frame_table[pfn];
1917 PTWR_PRINTK(PP_ALL, (" pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n",
1918 pl2e, l2_pgentry_val(*pl2e), pfn,
1919 frame_table[pfn].u.inuse.type_info,
1920 frame_table[pfn].u.inuse.domain->domain));
1924 /************************************************************************/
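/*
 * Debug-only audit (see #ifndef NDEBUG above): walk a domain's page list,
 * recompute the general and type reference counts implied by its L1/L2
 * tables, and report any mismatch against count_info/type_info. The
 * tlbflush_timestamp field is borrowed to stash the original type_info while
 * the counts are temporarily adjusted.
 */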
1927 void audit_domain( struct domain *d)
1929 int ttot=0, ctot=0;
1930 void adjust ( struct pfn_info *page, int dir, int adjtype )
1932 int count = page->count_info & PGC_count_mask;
1934 if ( adjtype )
1936 int tcount = page->u.inuse.type_info & PGT_count_mask;
1938 ttot++;
1940 tcount += dir;
1942 if ( tcount < 0 )
1944 printk("Audit %d: type count whent below zero pfn=%x taf=%x otaf=%x\n",
1945 d->domain, page-frame_table,
1946 page->u.inuse.type_info,
1947 page->tlbflush_timestamp);
1948 return;
1951 page->u.inuse.type_info =
1952 (page->u.inuse.type_info & ~PGT_count_mask) | tcount;
1955 ctot++;
1956 count += dir;
1957 if ( count < 0 )
1959 printk("Audit %d: general count whent below zero pfn=%x taf=%x otaf=%x\n",
1960 d->domain, page-frame_table,
1961 page->u.inuse.type_info,
1962 page->tlbflush_timestamp);
1963 return;
1966 page->count_info =
1967 (page->count_info & ~PGC_count_mask) | count;
1971 void scan_for_pfn( struct domain *d, unsigned long xpfn )
1973 unsigned long pfn;
1974 struct list_head *list_ent;
1975 int i;
1977 list_ent = d->page_list.next;
1978 for ( i = 0; (list_ent != &d->page_list); i++ )
1980 unsigned long * pt;
1981 struct pfn_info *page;
1982 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
1983 page = &frame_table[pfn];
1985 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ||
1986 (page->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
1988 pt = map_domain_mem( pfn<<PAGE_SHIFT );
1990 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
1992 if ( pt[i] & _PAGE_PRESENT )
1994 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
1996 if ( l1pfn == xpfn )
1998 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
1999 d->domain,
2000 i,pfn,page->u.inuse.type_info,
2001 page->count_info);
2006 unmap_domain_mem(pt);
2009 list_ent = frame_table[pfn].list.next;
2014 void scan_for_pfn_remote( unsigned long xpfn )
2016 struct domain *e;
2018 for_each_domain ( e )
2020 scan_for_pfn( e, xpfn );
2024 int i;
2025 unsigned long pfn;
2026 struct list_head *list_ent;
2028 if ( d != current )
2029 domain_pause(d);
2030 synchronise_pagetables(~0UL);
2032 printk("pt base=%lx sh_info=%x\n",
2033 pagetable_val(d->mm.pagetable)>>PAGE_SHIFT,
2034 virt_to_page(d->shared_info)-frame_table);
2036 spin_lock(&d->page_alloc_lock);
2038 /* phase 0 */
2040 list_ent = d->page_list.next;
2041 for ( i = 0; (list_ent != &d->page_list); i++ )
2043 struct pfn_info *page;
2044 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2045 page = &frame_table[pfn];
2047 if ( page->u.inuse.domain != d )
2048 BUG();
2050 if ( (page->u.inuse.type_info & PGT_count_mask) >
2051 (page->count_info & PGC_count_mask) )
2052 printk("taf > caf %x %x pfn=%lx\n",
2053 page->u.inuse.type_info, page->count_info, pfn );
2055 #if 0 // SYSV shared memory pages plus writeable files
2056 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2057 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2059 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2060 pfn,
2061 page->u.inuse.type_info,
2062 page->count_info );
2063 scan_for_pfn_remote(pfn);
2065 #endif
2066 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2067 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2069 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2070 pfn,
2071 page->u.inuse.type_info,
2072 page->count_info );
2075 // use tlbflush_timestamp to store original type_info
2076 page->tlbflush_timestamp = page->u.inuse.type_info;
2078 list_ent = frame_table[pfn].list.next;
2082 /* phase 1 */
2084 adjust( &frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], -1, 1 );
2086 list_ent = d->page_list.next;
2087 for ( i = 0; (list_ent != &d->page_list); i++ )
2089 unsigned long * pt;
2090 struct pfn_info *page;
2091 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2092 page = &frame_table[pfn];
2094 if ( page->u.inuse.domain != d )
2095 BUG();
2097 switch ( page->u.inuse.type_info & PGT_type_mask )
2099 case PGT_l2_page_table:
2101 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2102 printk("Audit %d: L2 not validated %x\n",
2103 d->domain, page->u.inuse.type_info);
2105 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2106 printk("Audit %d: L2 not pinned %x\n",
2107 d->domain, page->u.inuse.type_info);
2108 else
2109 adjust( page, -1, 1 );
2111 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2113 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2115 if ( pt[i] & _PAGE_PRESENT )
2117 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2118 struct pfn_info *l1page = &frame_table[l1pfn];
2120 if ( l1page->u.inuse.domain != d )
2122 printk("Skip page belowing to other dom %p\n",
2123 l1page->u.inuse.domain);
2124 continue;
2127 if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2128 PGT_l1_page_table )
2129 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2130 d->domain, i,
2131 l1page->u.inuse.type_info,
2132 l1pfn);
2134 adjust( l1page, -1, 1 );
2138 unmap_domain_mem(pt);
2140 break;
2143 case PGT_l1_page_table:
2145 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2147 //printk("L1 is pinned\n");
2148 adjust( page, -1, 1 );
2151 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2152 printk("Audit %d: L1 not validated %x\n",
2153 d->domain, page->u.inuse.type_info);
2154 #if 0
2155 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2156 printk("Audit %d: L1 not pinned %x\n",
2157 d->domain, page->u.inuse.type_info);
2158 #endif
2159 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2161 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2163 if ( pt[i] & _PAGE_PRESENT )
2165 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2166 struct pfn_info *l1page = &frame_table[l1pfn];
2168 if ( pt[i] & _PAGE_RW )
2171 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2172 PGT_l1_page_table ||
2173 (l1page->u.inuse.type_info & PGT_type_mask) ==
2174 PGT_l2_page_table )
2175 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2176 d->domain, i,
2177 l1page->u.inuse.type_info,
2178 l1pfn);
2182 if ( l1page->u.inuse.domain != d )
2184 printk("Skip page belowing to other dom %p\n",
2185 l1page->u.inuse.domain);
2186 continue;
2189 adjust( l1page, -1, 0 );
2193 unmap_domain_mem(pt);
2195 break;
2200 list_ent = frame_table[pfn].list.next;
2203 /* phase 2 */
2205 ctot = ttot = 0;
2206 list_ent = d->page_list.next;
2207 for ( i = 0; (list_ent != &d->page_list); i++ )
2209 struct pfn_info *page;
2210 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2212 page = &frame_table[pfn];
2215 switch ( page->u.inuse.type_info & PGT_type_mask)
2217 case PGT_l1_page_table:
2218 case PGT_l2_page_table:
2219 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2221 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2222 d->domain, page->u.inuse.type_info,
2223 page->tlbflush_timestamp,
2224 page->count_info, pfn );
2225 scan_for_pfn_remote(pfn);
2227 default:
2228 if ( (page->count_info & PGC_count_mask) != 1 )
2230 printk("Audit %d: general count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2231 d->domain,
2232 page->count_info,
2233 page->u.inuse.type_info,
2234 page->tlbflush_timestamp, pfn );
2235 scan_for_pfn_remote(pfn);
2237 break;
2240 list_ent = frame_table[pfn].list.next;
2243 /* phase 3 */
2245 list_ent = d->page_list.next;
2246 for ( i = 0; (list_ent != &d->page_list); i++ )
2248 unsigned long * pt;
2249 struct pfn_info *page;
2250 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2252 page = &frame_table[pfn];
2254 switch ( page->u.inuse.type_info & PGT_type_mask )
2256 case PGT_l2_page_table:
2257 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2258 adjust( page, 1, 1 );
2260 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2262 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2264 if ( pt[i] & _PAGE_PRESENT )
2266 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2267 struct pfn_info *l1page = &frame_table[l1pfn];
2269 if ( l1page->u.inuse.domain == d)
2270 adjust( l1page, 1, 1 );
2274 unmap_domain_mem(pt);
2275 break;
2277 case PGT_l1_page_table:
2278 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2279 adjust( page, 1, 1 );
2281 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2283 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
2285 if ( pt[i] & _PAGE_PRESENT )
2287 #if 1
2289 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2290 struct pfn_info *l1page = &frame_table[l1pfn];
2292 if ( l1page->u.inuse.domain == d)
2293 adjust( l1page, 1, 0 );
2294 #endif
2298 unmap_domain_mem(pt);
2299 break;
2303 page->tlbflush_timestamp = 0; // put back
2306 list_ent = frame_table[pfn].list.next;
2309 spin_unlock(&d->page_alloc_lock);
2311 adjust( &frame_table[pagetable_val(d->mm.pagetable)>>PAGE_SHIFT], 1, 1 );
2313 printk("Audit %d: Done. ctot=%d ttot=%d\n",d->domain, ctot, ttot );
2315 if ( d != current )
2316 domain_unpause(d);
2320 void audit_domains(void)
2322 struct domain *d;
2324 for_each_domain ( d )
2326 if ( d->domain > 0 )
2327 audit_domain(d);
2331 void audit_domains_key(unsigned char key, void *dev_id,
2332 struct pt_regs *regs)
2334 open_softirq( MEMAUDIT_SOFTIRQ, audit_domains );
2335 raise_softirq( MEMAUDIT_SOFTIRQ );
2339 #endif