xen/arch/x86/mm.c @ 3894:ba1a314ce815 (ia64/xen-unstable)

bitkeeper revision 1.1230.4.1 (421b7d70g_kPPMvAgkUMMU-R8G4RrA)

add perfcounters for pagetable update histograms

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author   rneugeba@wyvis.research.intel-research.net
date     Tue Feb 22 18:44:00 2005 +0000 (2005-02-22)
parents  ad1d06d64313
children b57a97bb65bd

line source
1 /* -*- Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
2 /******************************************************************************
3 * arch/x86/mm.c
4 *
5 * Copyright (c) 2002-2005 K A Fraser
6 * Copyright (c) 2004 Christian Limpach
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
23 /*
24 * A description of the x86 page table API:
25 *
26 * Domains trap to do_mmu_update with a list of update requests.
27 * This is a list of (ptr, val) pairs, where the requested operation
28 * is *ptr = val.
29 *
30 * Reference counting of pages:
31 * ----------------------------
32 * Each page has two refcounts: tot_count and type_count.
33 *
34 * TOT_COUNT is the obvious reference count. It counts all uses of a
35 * physical page frame by a domain, including uses as a page directory,
36 * a page table, or simple mappings via a PTE. This count prevents a
37 * domain from releasing a frame back to the free pool when it still holds
38 * a reference to it.
39 *
40 * TYPE_COUNT is more subtle. A frame can be put to one of three
41 * mutually-exclusive uses: it might be used as a page directory, or a
42 * page table, or it may be mapped writable by the domain [of course, a
43 * frame may be used in none of these three ways!].
44 * So, type_count is a count of the number of times a frame is being
45 * referred to in its current incarnation. Therefore, a page can only
46 * change its type when its type count is zero.
47 *
48 * Pinning the page type:
49 * ----------------------
50 * The type of a page can be pinned/unpinned with the commands
51 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
52 * pinning is not reference counted, so it can't be nested).
53 * This is useful to prevent a page's type count falling to zero, at which
54 * point safety checks would need to be carried out next time the count
55 * is increased again.
56 *
57 * A further note on writable page mappings:
58 * -----------------------------------------
59 * For simplicity, the count of writable mappings for a page may not
60 * correspond to reality. The 'writable count' is incremented for every
61 * PTE which maps the page with the _PAGE_RW flag set. However, for
62 * write access to be possible the page directory entry must also have
63 * its _PAGE_RW bit set. We do not check this as it complicates the
64 * reference counting considerably [consider the case of multiple
65 * directory entries referencing a single page table, some with the RW
66 * bit set, others not -- it starts getting a bit messy].
67 * In normal use, this simplification shouldn't be a problem.
68 * However, the logic can be added if required.
69 *
70 * One more note on read-only page mappings:
71 * -----------------------------------------
72 * We want domains to be able to map pages for read-only access. The
73 * main reason is that page tables and directories should be readable
74 * by a domain, but it would not be safe for them to be writable.
75 * However, domains have free access to rings 1 & 2 of the Intel
76 * privilege model. In terms of page protection, these are considered
77 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
78 * read-only restrictions are respected in supervisor mode -- if the
79 * bit is clear then any mapped page is writable.
80 *
81 * We get round this by always setting the WP bit and disallowing
82 * updates to it. This is very unlikely to cause a problem for guest
83 * OS's, which will generally use the WP bit to simplify copy-on-write
84 * implementation (in that case, the OS wants a fault when it writes to
85 * an application-supplied buffer).
86 */
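/*
 * Illustrative guest-side use of the interface described above. This is a
 * sketch only: HYPERVISOR_mmu_update is the guest's hypercall wrapper, and
 * pte_maddr, new_pte, l2_mfn and done are placeholders, none of them
 * defined in this file.
 *
 *     mmu_update_t req[2];
 *     unsigned int done = 0;
 *
 *     req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;      update one PTE
 *     req[0].val = new_pte;
 *     req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
 *     req[1].val = MMUEXT_PIN_L2_TABLE;                   pin the L2 at l2_mfn
 *     (void)HYPERVISOR_mmu_update(req, 2, &done);
 *
 * Every request is validated by do_mmu_update() below before any page-table
 * write is performed.
 */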
88 #include <xen/config.h>
89 #include <xen/init.h>
90 #include <xen/kernel.h>
91 #include <xen/lib.h>
92 #include <xen/mm.h>
93 #include <xen/sched.h>
94 #include <xen/errno.h>
95 #include <xen/perfc.h>
96 #include <xen/irq.h>
97 #include <xen/softirq.h>
98 #include <asm/shadow.h>
99 #include <asm/page.h>
100 #include <asm/flushtlb.h>
101 #include <asm/io.h>
102 #include <asm/uaccess.h>
103 #include <asm/domain_page.h>
104 #include <asm/ldt.h>
106 #ifdef VERBOSE
107 #define MEM_LOG(_f, _a...) \
108 printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
109 current->domain->id , __LINE__ , ## _a )
110 #else
111 #define MEM_LOG(_f, _a...) ((void)0)
112 #endif
114 static int alloc_l2_table(struct pfn_info *page);
115 static int alloc_l1_table(struct pfn_info *page);
116 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
117 static int get_page_and_type_from_pagenr(unsigned long page_nr,
118 u32 type,
119 struct domain *d);
121 static void free_l2_table(struct pfn_info *page);
122 static void free_l1_table(struct pfn_info *page);
124 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
125 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
127 /* Used to defer flushing of memory structures. */
128 static struct {
129 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
130 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
131 unsigned long deferred_ops;
132 /* If non-NULL, specifies a foreign subject domain for some operations. */
133 struct domain *foreign;
134 } __cacheline_aligned percpu_info[NR_CPUS];
136 /*
137 * Returns the current foreign domain; defaults to the currently-executing
138 * domain if a foreign override hasn't been specified.
139 */
140 #define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
142 /* Private domain structs for DOMID_XEN and DOMID_IO. */
143 static struct domain *dom_xen, *dom_io;
145 /* Frame table and its size in pages. */
146 struct pfn_info *frame_table;
147 unsigned long frame_table_size;
148 unsigned long max_page;
150 void __init init_frametable(void)
151 {
152 unsigned long i, p;
154 frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
155 frame_table_size = max_page * sizeof(struct pfn_info);
156 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
158 for ( i = 0; i < frame_table_size; i += (4UL << 20) )
159 {
160 p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
161 if ( p == 0 )
162 panic("Not enough memory for frame table\n");
163 map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p,
164 4UL << 20, PAGE_HYPERVISOR);
165 }
167 memset(frame_table, 0, frame_table_size);
168 }
170 void arch_init_memory(void)
171 {
172 extern void subarch_init_memory(struct domain *);
174 memset(percpu_info, 0, sizeof(percpu_info));
176 /*
177 * Initialise our DOMID_XEN domain.
178 * Any Xen-heap pages that we will allow to be mapped will have
179 * their domain field set to dom_xen.
180 */
181 dom_xen = alloc_domain_struct();
182 atomic_set(&dom_xen->refcnt, 1);
183 dom_xen->id = DOMID_XEN;
185 /*
186 * Initialise our DOMID_IO domain.
187 * This domain owns no pages but is considered a special case when
188 * mapping I/O pages, as the mappings occur at the privilege level of the caller.
189 */
190 dom_io = alloc_domain_struct();
191 atomic_set(&dom_io->refcnt, 1);
192 dom_io->id = DOMID_IO;
194 subarch_init_memory(dom_xen);
195 }
197 void write_ptbase(struct exec_domain *ed)
198 {
199 write_cr3(pagetable_val(ed->arch.monitor_table));
200 }
202 static void __invalidate_shadow_ldt(struct exec_domain *d)
203 {
204 int i;
205 unsigned long pfn;
206 struct pfn_info *page;
208 d->arch.shadow_ldt_mapcnt = 0;
210 for ( i = 16; i < 32; i++ )
211 {
212 pfn = l1_pgentry_to_pfn(d->arch.perdomain_ptes[i]);
213 if ( pfn == 0 ) continue;
214 d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
215 page = &frame_table[pfn];
216 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
217 ASSERT_PAGE_IS_DOMAIN(page, d->domain);
218 put_page_and_type(page);
219 }
221 /* Dispose of the (now possibly invalid) mappings from the TLB. */
222 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
223 }
226 static inline void invalidate_shadow_ldt(struct exec_domain *d)
227 {
228 if ( d->arch.shadow_ldt_mapcnt != 0 )
229 __invalidate_shadow_ldt(d);
230 }
233 static int alloc_segdesc_page(struct pfn_info *page)
234 {
235 struct desc_struct *descs;
236 int i;
238 descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
240 for ( i = 0; i < 512; i++ )
241 if ( unlikely(!check_descriptor(&descs[i])) )
242 goto fail;
244 unmap_domain_mem(descs);
245 return 1;
247 fail:
248 unmap_domain_mem(descs);
249 return 0;
250 }
253 /* Map shadow page at offset @off. */
254 int map_ldt_shadow_page(unsigned int off)
255 {
256 struct exec_domain *ed = current;
257 struct domain *d = ed->domain;
258 unsigned long l1e;
260 if ( unlikely(in_irq()) )
261 BUG();
263 __get_user(l1e, (unsigned long *)
264 &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
266 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
267 unlikely(!get_page_and_type(
268 &frame_table[l1_pgentry_to_pfn(mk_l1_pgentry(l1e))],
269 d, PGT_ldt_page)) )
270 return 0;
272 ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
273 ed->arch.shadow_ldt_mapcnt++;
275 return 1;
276 }
279 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
280 {
281 struct pfn_info *page = &frame_table[page_nr];
283 if ( unlikely(!pfn_is_ram(page_nr)) )
284 {
285 MEM_LOG("Pfn %p is not RAM", page_nr);
286 return 0;
287 }
289 if ( unlikely(!get_page(page, d)) )
290 {
291 MEM_LOG("Could not get page ref for pfn %p", page_nr);
292 return 0;
293 }
295 return 1;
296 }
299 static int get_page_and_type_from_pagenr(unsigned long page_nr,
300 u32 type,
301 struct domain *d)
302 {
303 struct pfn_info *page = &frame_table[page_nr];
305 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
306 return 0;
308 if ( unlikely(!get_page_type(page, type)) )
309 {
310 if ( (type & PGT_type_mask) != PGT_l1_page_table )
311 MEM_LOG("Bad page type for pfn %p (%08x)",
312 page_nr, page->u.inuse.type_info);
313 put_page(page);
314 return 0;
315 }
317 return 1;
318 }
321 /*
322 * We allow root tables to map each other (a.k.a. linear page tables). It
323 * needs some special care with reference counts and access permissions:
324 * 1. The mapping entry must be read-only, or the guest may get write access
325 * to its own PTEs.
326 * 2. We must only bump the reference counts for an *already validated*
327 * L2 table, or we can end up in a deadlock in get_page_type(), waiting
328 * on a validation that may itself be waiting on ours to complete.
329 * 3. We only need to increment the reference counts for the mapped page
330 * frame if it is mapped by a different root table. This is sufficient and
331 * also necessary to allow validation of a root table mapping itself.
332 */
333 static int
334 get_linear_pagetable(
335 root_pgentry_t re, unsigned long re_pfn, struct domain *d)
336 {
337 u32 x, y;
338 struct pfn_info *page;
339 unsigned long pfn;
341 if ( (root_pgentry_val(re) & _PAGE_RW) )
342 {
343 MEM_LOG("Attempt to create linear p.t. with write perms");
344 return 0;
345 }
347 if ( (pfn = root_pgentry_to_pfn(re)) != re_pfn )
348 {
349 /* Make sure the mapped frame belongs to the correct domain. */
350 if ( unlikely(!get_page_from_pagenr(pfn, d)) )
351 return 0;
353 /*
354 * Make sure that the mapped frame is an already-validated L2 table.
355 * If so, atomically increment the count (checking for overflow).
356 */
357 page = &frame_table[pfn];
358 y = page->u.inuse.type_info;
359 do {
360 x = y;
361 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
362 unlikely((x & (PGT_type_mask|PGT_validated)) !=
363 (PGT_root_page_table|PGT_validated)) )
364 {
365 put_page(page);
366 return 0;
367 }
368 }
369 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
370 }
372 return 1;
373 }
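/*
 * A sketch of the rules above, for illustration only (req, l2_mfn and
 * LINEAR_SLOT are placeholders, not defined in this file). A guest asking
 * for a self-referencing L2 entry would submit something like:
 *
 *     req.ptr = (l2_mfn << PAGE_SHIFT) + LINEAR_SLOT * sizeof(l2_pgentry_t);
 *     req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;   NB. no _PAGE_RW
 *
 * get_linear_pagetable() accepts this (rule 3: same frame, so no extra
 * reference is needed) but rejects the same request with _PAGE_RW set
 * (rule 1).
 */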
376 static int
377 get_page_from_l1e(
378 l1_pgentry_t l1e, struct domain *d)
379 {
380 unsigned long l1v = l1_pgentry_val(l1e);
381 unsigned long pfn = l1_pgentry_to_pfn(l1e);
382 struct pfn_info *page = &frame_table[pfn];
383 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
385 if ( !(l1v & _PAGE_PRESENT) )
386 return 1;
388 if ( unlikely(l1v & L1_DISALLOW_MASK) )
389 {
390 MEM_LOG("Bad L1 type settings %p", l1v & L1_DISALLOW_MASK);
391 return 0;
392 }
394 if ( unlikely(!pfn_is_ram(pfn)) )
395 {
396 /* Revert to caller privileges if FD == DOMID_IO. */
397 if ( d == dom_io )
398 d = current->domain;
400 if ( IS_PRIV(d) )
401 return 1;
403 if ( IS_CAPABLE_PHYSDEV(d) )
404 return domain_iomem_in_pfn(d, pfn);
406 MEM_LOG("Non-privileged attempt to map I/O space %p", pfn);
407 return 0;
408 }
410 return ((l1v & _PAGE_RW) ?
411 get_page_and_type(page, d, PGT_writable_page) :
412 get_page(page, d));
413 }
416 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
417 static int
418 get_page_from_l2e(
419 l2_pgentry_t l2e, unsigned long pfn,
420 struct domain *d, unsigned long va_idx)
421 {
422 int rc;
424 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
425 return 1;
427 if ( unlikely((l2_pgentry_val(l2e) & L2_DISALLOW_MASK)) )
428 {
429 MEM_LOG("Bad L2 page type settings %p",
430 l2_pgentry_val(l2e) & L2_DISALLOW_MASK);
431 return 0;
432 }
434 rc = get_page_and_type_from_pagenr(
435 l2_pgentry_to_pfn(l2e),
436 PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
438 #if defined(__i386__)
439 return rc ? rc : get_linear_pagetable(l2e, pfn, d);
440 #elif defined(__x86_64__)
441 return rc;
442 #endif
443 }
446 #ifdef __x86_64__
448 static int
449 get_page_from_l3e(
450 l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
451 {
452 if ( !(l3_pgentry_val(l3e) & _PAGE_PRESENT) )
453 return 1;
455 if ( unlikely((l3_pgentry_val(l3e) & L3_DISALLOW_MASK)) )
456 {
457 MEM_LOG("Bad L3 page type settings %p",
458 l3_pgentry_val(l3e) & L3_DISALLOW_MASK);
459 return 0;
460 }
462 return get_page_and_type_from_pagenr(
463 l3_pgentry_to_pfn(l3e), PGT_l2_page_table, d);
464 }
467 static int
468 get_page_from_l4e(
469 l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
470 {
471 int rc;
473 if ( !(l4_pgentry_val(l4e) & _PAGE_PRESENT) )
474 return 1;
476 if ( unlikely((l4_pgentry_val(l4e) & L4_DISALLOW_MASK)) )
477 {
478 MEM_LOG("Bad L4 page type settings %p",
479 l4_pgentry_val(l4e) & L4_DISALLOW_MASK);
480 return 0;
481 }
483 rc = get_page_and_type_from_pagenr(
484 l4_pgentry_to_pfn(l4e), PGT_l3_page_table, d);
486 if ( unlikely(!rc) )
487 return get_linear_pagetable(l4e, pfn, d);
489 return 1;
490 }
492 #endif /* __x86_64__ */
495 static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
496 {
497 unsigned long l1v = l1_pgentry_val(l1e);
498 unsigned long pfn = l1_pgentry_to_pfn(l1e);
499 struct pfn_info *page = &frame_table[pfn];
500 struct domain *e;
502 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
503 return;
505 e = page_get_owner(page);
506 if ( unlikely(e != d) )
507 {
508 /*
509 * Unmap a foreign page that may have been mapped via a grant table.
510 * Note that this can fail for a privileged domain that can map foreign
511 * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
512 * counted via a grant entry and some counted directly in the page
513 * structure's reference count. Note that reference counts won't get
514 * dangerously confused as long as we always try to decrement the
515 * grant entry first. We may end up with a mismatch between which
516 * mappings and which unmappings are counted via the grant entry, but
517 * really it doesn't matter as privileged domains have carte blanche.
518 */
519 if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
520 return;
521 /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
522 }
524 if ( l1v & _PAGE_RW )
525 {
526 put_page_and_type(page);
527 }
528 else
529 {
530 /* We expect this to be rare, so we blow away the entire shadow LDT. */
531 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
532 PGT_ldt_page)) &&
533 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
534 invalidate_shadow_ldt(e->exec_domain[0]);
535 put_page(page);
536 }
537 }
540 /*
541 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
542 * Note also that this automatically deals correctly with linear p.t.'s.
543 */
544 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
545 {
546 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
547 (l2_pgentry_to_pfn(l2e) != pfn) )
548 put_page_and_type(&frame_table[l2_pgentry_to_pfn(l2e)]);
549 }
552 #ifdef __x86_64__
554 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
555 {
556 if ( (l3_pgentry_val(l3e) & _PAGE_PRESENT) &&
557 (l3_pgentry_to_pfn(l3e) != pfn) )
558 put_page_and_type(&frame_table[l3_pgentry_to_pfn(l3e)]);
559 }
562 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
563 {
564 if ( (l4_pgentry_val(l4e) & _PAGE_PRESENT) &&
565 (l4_pgentry_to_pfn(l4e) != pfn) )
566 put_page_and_type(&frame_table[l4_pgentry_to_pfn(l4e)]);
567 }
569 #endif /* __x86_64__ */
572 static int alloc_l1_table(struct pfn_info *page)
573 {
574 struct domain *d = page_get_owner(page);
575 unsigned long pfn = page_to_pfn(page);
576 l1_pgentry_t *pl1e;
577 int i;
579 pl1e = map_domain_mem(pfn << PAGE_SHIFT);
581 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
582 if ( is_guest_l1_slot(i) &&
583 unlikely(!get_page_from_l1e(pl1e[i], d)) )
584 goto fail;
586 unmap_domain_mem(pl1e);
587 return 1;
589 fail:
590 while ( i-- > 0 )
591 if ( is_guest_l1_slot(i) )
592 put_page_from_l1e(pl1e[i], d);
594 unmap_domain_mem(pl1e);
595 return 0;
596 }
599 static int alloc_l2_table(struct pfn_info *page)
600 {
601 struct domain *d = page_get_owner(page);
602 unsigned long pfn = page_to_pfn(page);
603 l2_pgentry_t *pl2e;
604 int i;
606 pl2e = map_domain_mem(pfn << PAGE_SHIFT);
608 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
609 if ( is_guest_l2_slot(i) &&
610 unlikely(!get_page_from_l2e(pl2e[i], pfn, d, i)) )
611 goto fail;
613 #if defined(__i386__)
614 /* Xen private mappings. */
615 memcpy(&pl2e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
616 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
617 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
618 pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
619 mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
620 pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
621 mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) |
622 __PAGE_HYPERVISOR);
623 #endif
625 unmap_domain_mem(pl2e);
626 return 1;
628 fail:
629 while ( i-- > 0 )
630 if ( is_guest_l2_slot(i) )
631 put_page_from_l2e(pl2e[i], pfn);
633 unmap_domain_mem(pl2e);
634 return 0;
635 }
638 #ifdef __x86_64__
640 static int alloc_l3_table(struct pfn_info *page)
641 {
642 struct domain *d = page_get_owner(page);
643 unsigned long pfn = page_to_pfn(page);
644 l3_pgentry_t *pl3e = page_to_virt(page);
645 int i;
647 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
648 if ( is_guest_l3_slot(i) &&
649 unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
650 goto fail;
652 return 1;
654 fail:
655 while ( i-- > 0 )
656 if ( is_guest_l3_slot(i) )
657 put_page_from_l3e(pl3e[i], pfn);
659 return 0;
660 }
663 static int alloc_l4_table(struct pfn_info *page)
664 {
665 struct domain *d = page_get_owner(page);
666 unsigned long pfn = page_to_pfn(page);
667 l4_pgentry_t *pl4e = page_to_virt(page);
668 int i;
670 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
671 if ( is_guest_l4_slot(i) &&
672 unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
673 goto fail;
675 /* Xen private mappings. */
676 memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
677 &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
678 ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
679 pl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
680 mk_l4_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
681 pl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
682 mk_l4_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_l3) |
683 __PAGE_HYPERVISOR);
685 return 1;
687 fail:
688 while ( i-- > 0 )
689 if ( is_guest_l4_slot(i) )
690 put_page_from_l4e(pl4e[i], pfn);
692 return 0;
693 }
695 #endif /* __x86_64__ */
698 static void free_l1_table(struct pfn_info *page)
699 {
700 struct domain *d = page_get_owner(page);
701 unsigned long pfn = page_to_pfn(page);
702 l1_pgentry_t *pl1e;
703 int i;
705 pl1e = map_domain_mem(pfn << PAGE_SHIFT);
707 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
708 if ( is_guest_l1_slot(i) )
709 put_page_from_l1e(pl1e[i], d);
711 unmap_domain_mem(pl1e);
712 }
715 static void free_l2_table(struct pfn_info *page)
716 {
717 unsigned long pfn = page_to_pfn(page);
718 l2_pgentry_t *pl2e;
719 int i;
721 pl2e = map_domain_mem(pfn << PAGE_SHIFT);
723 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
724 if ( is_guest_l2_slot(i) )
725 put_page_from_l2e(pl2e[i], pfn);
727 unmap_domain_mem(pl2e);
728 }
731 #ifdef __x86_64__
733 static void free_l3_table(struct pfn_info *page)
734 {
735 unsigned long pfn = page_to_pfn(page);
736 l3_pgentry_t *pl3e = page_to_virt(page);
737 int i;
739 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
740 if ( is_guest_l3_slot(i) )
741 put_page_from_l3e(pl3e[i], pfn);
742 }
745 static void free_l4_table(struct pfn_info *page)
746 {
747 unsigned long pfn = page_to_pfn(page);
748 l4_pgentry_t *pl4e = page_to_virt(page);
749 int i;
751 for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
752 if ( is_guest_l4_slot(i) )
753 put_page_from_l4e(pl4e[i], pfn);
754 }
756 #endif /* __x86_64__ */
759 static inline int update_l1e(l1_pgentry_t *pl1e,
760 l1_pgentry_t ol1e,
761 l1_pgentry_t nl1e)
762 {
763 unsigned long o = l1_pgentry_val(ol1e);
764 unsigned long n = l1_pgentry_val(nl1e);
766 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
767 unlikely(o != l1_pgentry_val(ol1e)) )
768 {
769 MEM_LOG("Failed to update %p -> %p: saw %p\n",
770 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
771 return 0;
772 }
774 return 1;
775 }
778 /* Update the L1 entry at pl1e to new value nl1e. */
779 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
780 {
781 l1_pgentry_t ol1e;
782 unsigned long _ol1e;
783 struct domain *d = current->domain;
785 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
786 return 0;
787 ol1e = mk_l1_pgentry(_ol1e);
789 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
790 {
791 if ( unlikely(l1_pgentry_val(nl1e) & L1_DISALLOW_MASK) )
792 {
793 MEM_LOG("Bad L1 type settings %p",
794 l1_pgentry_val(nl1e) & L1_DISALLOW_MASK);
795 return 0;
796 }
798 /* Fast path for identical mapping, r/w and presence. */
799 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) &
800 ((PADDR_MASK & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT)) == 0 )
801 return update_l1e(pl1e, ol1e, nl1e);
803 if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
804 return 0;
806 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
807 {
808 put_page_from_l1e(nl1e, d);
809 return 0;
810 }
812 put_page_from_l1e(ol1e, d);
813 return 1;
814 }
816 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
817 return 0;
819 put_page_from_l1e(ol1e, d);
820 return 1;
821 }
824 #define UPDATE_ENTRY(_t,_p,_o,_n) ({ \
825 unsigned long __o = cmpxchg((unsigned long *)(_p), \
826 _t ## _pgentry_val(_o), \
827 _t ## _pgentry_val(_n)); \
828 if ( __o != _t ## _pgentry_val(_o) ) \
829 MEM_LOG("Failed to update %p -> %p: saw %p\n", \
830 _t ## _pgentry_val(_o), _t ## _pgentry_val(_n), __o); \
831 (__o == _t ## _pgentry_val(_o)); })
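/*
 * For reference, UPDATE_ENTRY(l2, pl2e, ol2e, nl2e) pastes 'l2' onto the
 * accessor names and expands to roughly the following (an illustrative
 * sketch, not a literal expansion):
 *
 *     unsigned long __o = cmpxchg((unsigned long *)pl2e,
 *                                 l2_pgentry_val(ol2e),
 *                                 l2_pgentry_val(nl2e));
 *     if ( __o != l2_pgentry_val(ol2e) )
 *         MEM_LOG("Failed to update ...");
 *     result = (__o == l2_pgentry_val(ol2e));             1 on success
 */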
834 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
835 static int mod_l2_entry(l2_pgentry_t *pl2e,
836 l2_pgentry_t nl2e,
837 unsigned long pfn)
838 {
839 l2_pgentry_t ol2e;
840 unsigned long _ol2e;
842 if ( unlikely(!is_guest_l2_slot(pgentry_ptr_to_slot(pl2e))) )
843 {
844 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
845 return 0;
846 }
848 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
849 return 0;
850 ol2e = mk_l2_pgentry(_ol2e);
852 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
853 {
854 if ( unlikely(l2_pgentry_val(nl2e) & L2_DISALLOW_MASK) )
855 {
856 MEM_LOG("Bad L2 type settings %p",
857 l2_pgentry_val(nl2e) & L2_DISALLOW_MASK);
858 return 0;
859 }
861 /* Fast path for identical mapping and presence. */
862 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) &
863 ((PADDR_MASK & PAGE_MASK) | _PAGE_PRESENT)) == 0 )
864 return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
866 if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
867 ((unsigned long)pl2e &
868 ~PAGE_MASK) >> 2)) )
869 return 0;
871 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
872 {
873 put_page_from_l2e(nl2e, pfn);
874 return 0;
875 }
877 put_page_from_l2e(ol2e, pfn);
878 return 1;
879 }
881 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
882 return 0;
884 put_page_from_l2e(ol2e, pfn);
885 return 1;
886 }
889 #ifdef __x86_64__
891 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
892 static int mod_l3_entry(l3_pgentry_t *pl3e,
893 l3_pgentry_t nl3e,
894 unsigned long pfn)
895 {
896 l3_pgentry_t ol3e;
897 unsigned long _ol3e;
899 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
900 {
901 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
902 return 0;
903 }
905 if ( unlikely(__get_user(_ol3e, (unsigned long *)pl3e) != 0) )
906 return 0;
907 ol3e = mk_l3_pgentry(_ol3e);
909 if ( l3_pgentry_val(nl3e) & _PAGE_PRESENT )
910 {
911 if ( unlikely(l3_pgentry_val(nl3e) & L3_DISALLOW_MASK) )
912 {
913 MEM_LOG("Bad L3 type settings %p",
914 l3_pgentry_val(nl3e) & L3_DISALLOW_MASK);
915 return 0;
916 }
918 /* Fast path for identical mapping and presence. */
919 if ( ((l3_pgentry_val(ol3e) ^ l3_pgentry_val(nl3e)) &
920 ((PADDR_MASK & PAGE_MASK) | _PAGE_PRESENT)) == 0 )
921 return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
923 if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain)) )
924 return 0;
926 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
927 {
928 put_page_from_l3e(nl3e, pfn);
929 return 0;
930 }
932 put_page_from_l3e(ol3e, pfn);
933 return 1;
934 }
936 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
937 return 0;
939 put_page_from_l3e(ol3e, pfn);
940 return 1;
941 }
944 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
945 static int mod_l4_entry(l4_pgentry_t *pl4e,
946 l4_pgentry_t nl4e,
947 unsigned long pfn)
948 {
949 l4_pgentry_t ol4e;
950 unsigned long _ol4e;
952 if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) )
953 {
954 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
955 return 0;
956 }
958 if ( unlikely(__get_user(_ol4e, (unsigned long *)pl4e) != 0) )
959 return 0;
960 ol4e = mk_l4_pgentry(_ol4e);
962 if ( l4_pgentry_val(nl4e) & _PAGE_PRESENT )
963 {
964 if ( unlikely(l4_pgentry_val(nl4e) & L4_DISALLOW_MASK) )
965 {
966 MEM_LOG("Bad L4 type settings %p",
967 l4_pgentry_val(nl4e) & L4_DISALLOW_MASK);
968 return 0;
969 }
971 /* Fast path for identical mapping and presence. */
972 if ( ((l4_pgentry_val(ol4e) ^ l4_pgentry_val(nl4e)) &
973 ((PADDR_MASK & PAGE_MASK) | _PAGE_PRESENT)) == 0 )
974 return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
976 if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) )
977 return 0;
979 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
980 {
981 put_page_from_l4e(nl4e, pfn);
982 return 0;
983 }
985 put_page_from_l4e(ol4e, pfn);
986 return 1;
987 }
989 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
990 return 0;
992 put_page_from_l4e(ol4e, pfn);
993 return 1;
994 }
996 #endif /* __x86_64__ */
999 int alloc_page_type(struct pfn_info *page, unsigned int type)
1001 switch ( type )
1003 case PGT_l1_page_table:
1004 return alloc_l1_table(page);
1005 case PGT_l2_page_table:
1006 return alloc_l2_table(page);
1007 #ifdef __x86_64__
1008 case PGT_l3_page_table:
1009 return alloc_l3_table(page);
1010 case PGT_l4_page_table:
1011 return alloc_l4_table(page);
1012 #endif
1013 case PGT_gdt_page:
1014 case PGT_ldt_page:
1015 return alloc_segdesc_page(page);
1016 default:
1017 printk("Bad type in alloc_page_type %x t=%x c=%x\n",
1018 type, page->u.inuse.type_info,
1019 page->count_info);
1020 BUG();
1023 return 0;
1027 void free_page_type(struct pfn_info *page, unsigned int type)
1029 struct domain *d = page_get_owner(page);
1031 switch ( type )
1033 case PGT_l1_page_table:
1034 free_l1_table(page);
1035 break;
1037 case PGT_l2_page_table:
1038 free_l2_table(page);
1039 break;
1041 #ifdef __x86_64__
1042 case PGT_l3_page_table:
1043 free_l3_table(page);
1044 break;
1046 case PGT_l4_page_table:
1047 free_l4_table(page);
1048 break;
1049 #endif
1051 default:
1052 BUG();
1055 if ( unlikely(shadow_mode_enabled(d)) &&
1056 (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
1058 unshadow_table(page_to_pfn(page), type);
1059 put_shadow_status(d);
1064 void put_page_type(struct pfn_info *page)
1066 u32 nx, x, y = page->u.inuse.type_info;
1068 again:
1069 do {
1070 x = y;
1071 nx = x - 1;
1073 ASSERT((x & PGT_count_mask) != 0);
1075 /*
1076 * The page should always be validated while a reference is held. The
1077 * exception is during domain destruction, when we forcibly invalidate
1078 * page-table pages if we detect a referential loop.
1079 * See domain.c:relinquish_list().
1080 */
1081 ASSERT((x & PGT_validated) ||
1082 test_bit(DF_DYING, &page_get_owner(page)->d_flags));
1084 if ( unlikely((nx & PGT_count_mask) == 0) )
1086 /* Record TLB information for flush later. Races are harmless. */
1087 page->tlbflush_timestamp = tlbflush_current_time();
1089 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
1090 likely(nx & PGT_validated) )
1092 /*
1093 * Page-table pages must be unvalidated when count is zero. The
1094 * 'free' is safe because the refcnt is non-zero and validated
1095 * bit is clear => other ops will spin or fail.
1096 */
1097 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x,
1098 x & ~PGT_validated)) != x) )
1099 goto again;
1100 /* We cleared the 'valid bit', so we do the clean-up. */
1101 free_page_type(page, x & PGT_type_mask);
1102 /* Carry on, but with the 'valid bit' now clear. */
1103 x &= ~PGT_validated;
1104 nx &= ~PGT_validated;
1107 else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) ==
1108 (PGT_pinned | 1)) )
1110 /* Page is now only pinned. Make the back pointer mutable again. */
1111 nx |= PGT_va_mutable;
1114 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1118 int get_page_type(struct pfn_info *page, u32 type)
1120 u32 nx, x, y = page->u.inuse.type_info;
1122 again:
1123 do {
1124 x = y;
1125 nx = x + 1;
1126 if ( unlikely((nx & PGT_count_mask) == 0) )
1128 MEM_LOG("Type count overflow on pfn %p\n", page_to_pfn(page));
1129 return 0;
1131 else if ( unlikely((x & PGT_count_mask) == 0) )
1133 if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
1135 /*
1136 * On a type change we check whether stale TLB entries need to be
1137 * flushed. This may be unnecessary (e.g., the page was a GDT/LDT) but
1138 * such circumstances should be very rare.
1139 */
1140 struct domain *d = page_get_owner(page);
1141 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->
1142 processor],
1143 page->tlbflush_timestamp)) )
1145 perfc_incr(need_flush_tlb_flush);
1146 flush_tlb_cpu(d->exec_domain[0]->processor);
1149 /* We lose existing type, back pointer, and validity. */
1150 nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
1151 nx |= type;
1153 /* No special validation needed for writable pages. */
1154 /* Page tables and GDT/LDT need to be scanned for validity. */
1155 if ( type == PGT_writable_page )
1156 nx |= PGT_validated;
1159 else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
1161 if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
1163 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
1164 ((type & PGT_type_mask) != PGT_l1_page_table) )
1165 MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p\n",
1166 x & PGT_type_mask, type, page_to_pfn(page));
1167 return 0;
1169 else if ( (x & PGT_va_mask) == PGT_va_mutable )
1171 /* The va backpointer is mutable, hence we update it. */
1172 nx &= ~PGT_va_mask;
1173 nx |= type; /* we know the actual type is correct */
1175 else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
1177 /* This table is potentially mapped at multiple locations. */
1178 nx &= ~PGT_va_mask;
1179 nx |= PGT_va_unknown;
1182 else if ( unlikely(!(x & PGT_validated)) )
1184 /* Someone else is updating validation of this page. Wait... */
1185 while ( (y = page->u.inuse.type_info) == x )
1187 rep_nop();
1188 barrier();
1190 goto again;
1193 while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
1195 if ( unlikely(!(nx & PGT_validated)) )
1197 /* Try to validate page type; drop the new reference on failure. */
1198 if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
1200 MEM_LOG("Error while validating pfn %p for type %08x."
1201 " caf=%08x taf=%08x\n",
1202 page_to_pfn(page), type,
1203 page->count_info,
1204 page->u.inuse.type_info);
1205 /* No one else can get a reference. We hold the only ref. */
1206 page->u.inuse.type_info = 0;
1207 return 0;
1210 /* No one else is updating simultaneously. */
1211 __set_bit(_PGT_validated, &page->u.inuse.type_info);
1214 return 1;
1218 int new_guest_cr3(unsigned long pfn)
1220 struct exec_domain *ed = current;
1221 struct domain *d = ed->domain;
1222 int okay, cpu = smp_processor_id();
1223 unsigned long old_base_pfn;
1225 okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
1226 if ( likely(okay) )
1228 invalidate_shadow_ldt(ed);
1230 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
1231 old_base_pfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
1232 ed->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT);
1233 update_pagetables(ed); /* update shadow_table and monitor_table */
1235 write_ptbase(ed);
1237 put_page_and_type(&frame_table[old_base_pfn]);
1239 else
1241 MEM_LOG("Error while installing new baseptr %p", pfn);
1244 return okay;
1247 static int do_extended_command(unsigned long ptr, unsigned long val)
1249 int okay = 1, cpu = smp_processor_id();
1250 unsigned int cmd = val & MMUEXT_CMD_MASK, type;
1251 unsigned long pfn = ptr >> PAGE_SHIFT;
1252 struct pfn_info *page = &frame_table[pfn];
1253 struct exec_domain *ed = current;
1254 struct domain *d = ed->domain, *e;
1255 u32 x, y, _d, _nd;
1256 domid_t domid;
1257 grant_ref_t gntref;
1259 switch ( cmd )
1261 case MMUEXT_PIN_L1_TABLE:
1262 /*
1263 * We insist that, if you pin an L1 page, it's the first thing that
1264 * you do to it. This is because we require the backptr to still be
1265 * mutable. This assumption seems safe.
1266 */
1267 type = PGT_l1_page_table | PGT_va_mutable;
1269 pin_page:
1270 okay = get_page_and_type_from_pagenr(pfn, type, FOREIGNDOM);
1271 if ( unlikely(!okay) )
1273 MEM_LOG("Error while pinning pfn %p", pfn);
1274 break;
1277 if ( unlikely(test_and_set_bit(_PGT_pinned,
1278 &page->u.inuse.type_info)) )
1280 MEM_LOG("Pfn %p already pinned", pfn);
1281 put_page_and_type(page);
1282 okay = 0;
1283 break;
1286 break;
1288 case MMUEXT_PIN_L2_TABLE:
1289 type = PGT_l2_page_table;
1290 goto pin_page;
1292 #ifdef __x86_64__
1293 case MMUEXT_PIN_L3_TABLE:
1294 type = PGT_l3_page_table;
1295 goto pin_page;
1297 case MMUEXT_PIN_L4_TABLE:
1298 type = PGT_l4_page_table;
1299 goto pin_page;
1300 #endif /* __x86_64__ */
1302 case MMUEXT_UNPIN_TABLE:
1303 if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
1305 MEM_LOG("Page %p bad domain (dom=%p)",
1306 ptr, page_get_owner(page));
1308 else if ( likely(test_and_clear_bit(_PGT_pinned,
1309 &page->u.inuse.type_info)) )
1311 put_page_and_type(page);
1312 put_page(page);
1314 else
1316 okay = 0;
1317 put_page(page);
1318 MEM_LOG("Pfn %p not pinned", pfn);
1320 break;
1322 case MMUEXT_NEW_BASEPTR:
1323 okay = new_guest_cr3(pfn);
1324 break;
1326 #ifdef __x86_64__
1327 case MMUEXT_NEW_USER_BASEPTR:
1328 okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
1329 if ( unlikely(!okay) )
1331 MEM_LOG("Error while installing new baseptr %p", pfn);
1333 else
1335 unsigned long old_pfn =
1336 pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT;
1337 ed->arch.guest_table_user = mk_pagetable(pfn << PAGE_SHIFT);
1338 if ( old_pfn != 0 )
1339 put_page_and_type(&frame_table[old_pfn]);
1341 break;
1342 #endif
1344 case MMUEXT_TLB_FLUSH:
1345 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
1346 break;
1348 case MMUEXT_INVLPG:
1349 __flush_tlb_one(ptr);
1350 break;
1352 case MMUEXT_FLUSH_CACHE:
1353 if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
1355 MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
1356 okay = 0;
1358 else
1360 wbinvd();
1362 break;
1364 case MMUEXT_SET_LDT:
1366 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
1367 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
1368 (ents > 8192) ||
1369 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
1370 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
1372 okay = 0;
1373 MEM_LOG("Bad args to SET_LDT: ptr=%p, ents=%p", ptr, ents);
1375 else if ( (ed->arch.ldt_ents != ents) ||
1376 (ed->arch.ldt_base != ptr) )
1378 invalidate_shadow_ldt(ed);
1379 ed->arch.ldt_base = ptr;
1380 ed->arch.ldt_ents = ents;
1381 load_LDT(ed);
1382 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
1383 if ( ents != 0 )
1384 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
1386 break;
1389 case MMUEXT_SET_FOREIGNDOM:
1390 domid = (domid_t)(val >> 16);
1392 if ( (e = percpu_info[cpu].foreign) != NULL )
1393 put_domain(e);
1394 percpu_info[cpu].foreign = NULL;
1396 if ( !IS_PRIV(d) )
1398 switch ( domid )
1400 case DOMID_IO:
1401 get_knownalive_domain(dom_io);
1402 percpu_info[cpu].foreign = dom_io;
1403 break;
1404 default:
1405 MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
1406 okay = 0;
1407 break;
1410 else
1412 percpu_info[cpu].foreign = e = find_domain_by_id(domid);
1413 if ( e == NULL )
1415 switch ( domid )
1417 case DOMID_XEN:
1418 get_knownalive_domain(dom_xen);
1419 percpu_info[cpu].foreign = dom_xen;
1420 break;
1421 case DOMID_IO:
1422 get_knownalive_domain(dom_io);
1423 percpu_info[cpu].foreign = dom_io;
1424 break;
1425 default:
1426 MEM_LOG("Unknown domain '%u'", domid);
1427 okay = 0;
1428 break;
1432 break;
1434 case MMUEXT_TRANSFER_PAGE:
1435 domid = (domid_t)(val >> 16);
1436 gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
1438 if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
1439 unlikely(!pfn_is_ram(pfn)) ||
1440 unlikely((e = find_domain_by_id(domid)) == NULL) )
1442 MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
1443 okay = 0;
1444 break;
1447 spin_lock(&d->page_alloc_lock);
1449 /*
1450 * The tricky bit: atomically release ownership while there is just one
1451 * benign reference to the page (PGC_allocated). If that reference
1452 * disappears then the deallocation routine will safely spin.
1453 */
1454 _d = pickle_domptr(d);
1455 _nd = page->u.inuse._domain;
1456 y = page->count_info;
1457 do {
1458 x = y;
1459 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1460 (1|PGC_allocated)) ||
1461 unlikely(_nd != _d) )
1463 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
1464 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1465 d, d->id, unpickle_domptr(_nd), x,
1466 page->u.inuse.type_info);
1467 spin_unlock(&d->page_alloc_lock);
1468 put_domain(e);
1469 return 0;
1471 __asm__ __volatile__(
1472 LOCK_PREFIX "cmpxchg8b %2"
1473 : "=d" (_nd), "=a" (y),
1474 "=m" (*(volatile u64 *)(&page->count_info))
1475 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
1477 while ( unlikely(_nd != _d) || unlikely(y != x) );
1479 /*
1480 * Unlink from 'd'. At least one reference remains (now anonymous), so
1481 * no one else is spinning to try to delete this page from 'd'.
1482 */
1483 d->tot_pages--;
1484 list_del(&page->list);
1486 spin_unlock(&d->page_alloc_lock);
1488 spin_lock(&e->page_alloc_lock);
1490 /*
1491 * Check that 'e' will accept the page and has reservation headroom.
1492 * Also, a domain mustn't have PGC_allocated pages when it is dying.
1493 */
1494 ASSERT(e->tot_pages <= e->max_pages);
1495 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1496 unlikely(e->tot_pages == e->max_pages) ||
1497 unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
1499 MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
1500 "provided a bad grant ref, or is dying (%p).\n",
1501 e->tot_pages, e->max_pages, e->d_flags);
1502 spin_unlock(&e->page_alloc_lock);
1503 put_domain(e);
1504 okay = 0;
1505 break;
1508 /* Okay, add the page to 'e'. */
1509 if ( unlikely(e->tot_pages++ == 0) )
1510 get_knownalive_domain(e);
1511 list_add_tail(&page->list, &e->page_list);
1512 page_set_owner(page, e);
1514 spin_unlock(&e->page_alloc_lock);
1516 /* Transfer is all done: tell the guest about its new page frame. */
1517 gnttab_notify_transfer(e, gntref, pfn);
1519 put_domain(e);
1520 break;
1522 case MMUEXT_REASSIGN_PAGE:
1523 if ( unlikely(!IS_PRIV(d)) )
1525 MEM_LOG("Dom %u has no reassignment priv", d->id);
1526 okay = 0;
1527 break;
1530 e = percpu_info[cpu].foreign;
1531 if ( unlikely(e == NULL) )
1533 MEM_LOG("No FOREIGNDOM to reassign pfn %p to", pfn);
1534 okay = 0;
1535 break;
1538 /*
1539 * Grab both page_list locks, in order. This prevents the page from
1540 * disappearing elsewhere while we modify the owner, and we'll need
1541 * both locks if we're successful so that we can change lists.
1542 */
1543 if ( d < e )
1545 spin_lock(&d->page_alloc_lock);
1546 spin_lock(&e->page_alloc_lock);
1548 else
1550 spin_lock(&e->page_alloc_lock);
1551 spin_lock(&d->page_alloc_lock);
1554 /* A domain shouldn't have PGC_allocated pages when it is dying. */
1555 if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
1556 unlikely(IS_XEN_HEAP_FRAME(page)) )
1558 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
1559 okay = 0;
1560 goto reassign_fail;
1563 /*
1564 * The tricky bit: atomically change owner while there is just one
1565 * benign reference to the page (PGC_allocated). If that reference
1566 * disappears then the deallocation routine will safely spin.
1567 */
1568 _d = pickle_domptr(d);
1569 _nd = page->u.inuse._domain;
1570 y = page->count_info;
1571 do {
1572 x = y;
1573 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
1574 (1|PGC_allocated)) ||
1575 unlikely(_nd != _d) )
1577 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
1578 " caf=%08x, taf=%08x\n", page_to_pfn(page),
1579 d, d->id, unpickle_domptr(_nd), x,
1580 page->u.inuse.type_info);
1581 okay = 0;
1582 goto reassign_fail;
1584 __asm__ __volatile__(
1585 LOCK_PREFIX "cmpxchg8b %3"
1586 : "=d" (_nd), "=a" (y), "=c" (e),
1587 "=m" (*(volatile u64 *)(&page->count_info))
1588 : "0" (_d), "1" (x), "c" (e), "b" (x) );
1590 while ( unlikely(_nd != _d) || unlikely(y != x) );
1592 /*
1593 * Unlink from 'd'. We transferred at least one reference to 'e', so
1594 * no one else is spinning to try to delete this page from 'd'.
1595 */
1596 d->tot_pages--;
1597 list_del(&page->list);
1599 /*
1600 * Add the page to 'e'. Someone may already have removed the last
1601 * reference and want to remove the page from 'e'. However, we have
1602 * the lock so they'll spin waiting for us.
1603 */
1604 if ( unlikely(e->tot_pages++ == 0) )
1605 get_knownalive_domain(e);
1606 list_add_tail(&page->list, &e->page_list);
1608 reassign_fail:
1609 spin_unlock(&d->page_alloc_lock);
1610 spin_unlock(&e->page_alloc_lock);
1611 break;
1613 case MMUEXT_CLEAR_FOREIGNDOM:
1614 if ( (e = percpu_info[cpu].foreign) != NULL )
1615 put_domain(e);
1616 percpu_info[cpu].foreign = NULL;
1617 break;
1619 default:
1620 MEM_LOG("Invalid extended pt command 0x%p", val & MMUEXT_CMD_MASK);
1621 okay = 0;
1622 break;
1625 return okay;
1628 int do_mmu_update(
1629 mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
1631 /*
1632 * We steal the m.s.b. of the @count parameter to indicate whether this
1633 * invocation of do_mmu_update() is resuming a previously preempted call.
1634 * We steal the next 15 bits to remember the current FOREIGNDOM.
1635 */
1636 #define MMU_UPDATE_PREEMPTED (~(~0U>>1))
1637 #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
1638 #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
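/*
 * Worked example of the encoding: if the loop below is preempted after
 * processing i of the original 'count' requests on behalf of foreign domain
 * f (i and f are illustrative names for the loop index and FOREIGNDOM), the
 * continuation is created with a count argument of
 *
 *     (count - i) | (f->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 *                 | MMU_UPDATE_PREEMPTED
 *
 * and the resume path below strips MMU_UPDATE_PREEMPTED and recovers the
 * FOREIGNDOM id before carrying on.
 */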
1640 mmu_update_t req;
1641 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
1642 struct pfn_info *page;
1643 int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
1644 unsigned int cmd, done = 0;
1645 unsigned long prev_smfn = 0;
1646 l1_pgentry_t *prev_spl1e = 0;
1647 struct exec_domain *ed = current;
1648 struct domain *d = ed->domain;
1649 u32 type_info;
1650 domid_t domid;
1652 LOCK_BIGLOCK(d);
1654 cleanup_writable_pagetable(d);
1656 if ( unlikely(shadow_mode_enabled(d)) )
1657 check_pagetable(d, ed->arch.guest_table, "pre-mmu"); /* debug */
1659 if ( unlikely(shadow_mode_translate(d) ) )
1660 domain_crash();
1662 /*
1663 * If we are resuming after preemption, read how much work we have already
1664 * done. This allows us to set the @done output parameter correctly.
1665 * We also reset FOREIGNDOM here.
1666 */
1667 if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
1669 if ( !(count & MMU_UPDATE_PREEMPTED) )
1671 /* Count overflow into private FOREIGNDOM field. */
1672 MEM_LOG("do_mmu_update count is too large");
1673 rc = -EINVAL;
1674 goto out;
1676 count &= ~MMU_UPDATE_PREEMPTED;
1677 domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
1678 count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
1679 if ( unlikely(pdone != NULL) )
1680 (void)get_user(done, pdone);
1681 if ( (domid != current->domain->id) &&
1682 !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
1684 rc = -EINVAL;
1685 goto out;
1689 #ifdef PERF_COUNTERS
1690 perfc_incrc(calls_to_mmu_update);
1691 perfc_addc(num_page_updates, count);
1692 /*
1693 * Do a histogram for count:
1694 * the first bucket is for count == 0,
1695 * the second bucket is for count == 1,
1696 * the last bucket is for count >= (PERFC_MAX_PT_UPDATES - 3) * PERFC_PT_UPDATES_BUCKET_SIZE.
1697 */
1698 if ( count == 0 )
1700 perfc_incra(bpt_updates, 0);
1701 } else if ( count == 1 )
1703 perfc_incra(bpt_updates, 1);
1704 } else if ( (count / PERFC_PT_UPDATES_BUCKET_SIZE)
1705 < (PERFC_MAX_PT_UPDATES - 3) )
1707 perfc_incra(bpt_updates, (count / PERFC_PT_UPDATES_BUCKET_SIZE) + 2);
1708 } else
1710 perfc_incra(bpt_updates, PERFC_MAX_PT_UPDATES - 1);
1712 #endif
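/*
 * Worked example of the bucket mapping above, with purely illustrative
 * constants (assume PERFC_PT_UPDATES_BUCKET_SIZE == 8 and
 * PERFC_MAX_PT_UPDATES == 16; the real values are set by the perf-counter
 * definitions, not in this file):
 *
 *     count == 0               ->  bucket 0
 *     count == 1               ->  bucket 1
 *     count == 20              ->  bucket (20/8) + 2 == 4
 *     count >= (16-3)*8 == 104 ->  bucket 15 (the final bucket)
 */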
1714 if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
1716 rc = -EFAULT;
1717 goto out;
1720 for ( i = 0; i < count; i++ )
1722 if ( hypercall_preempt_check() )
1724 rc = hypercall3_create_continuation(
1725 __HYPERVISOR_mmu_update, ureqs,
1726 (count - i) |
1727 (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) |
1728 MMU_UPDATE_PREEMPTED, pdone);
1729 break;
1732 if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
1734 MEM_LOG("Bad __copy_from_user");
1735 rc = -EFAULT;
1736 break;
1739 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
1740 pfn = req.ptr >> PAGE_SHIFT;
1742 okay = 0;
1744 switch ( cmd )
1746 /*
1747 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
1748 */
1749 case MMU_NORMAL_PT_UPDATE:
1750 if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
1752 MEM_LOG("Could not get page for normal update");
1753 break;
1756 if ( likely(prev_pfn == pfn) )
1758 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
1760 else
1762 if ( prev_pfn != 0 )
1763 unmap_domain_mem((void *)va);
1764 va = (unsigned long)map_domain_mem(req.ptr);
1765 prev_pfn = pfn;
1768 page = &frame_table[pfn];
1769 switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
1771 case PGT_l1_page_table:
1772 if ( likely(get_page_type(
1773 page, type_info & (PGT_type_mask|PGT_va_mask))) )
1775 okay = mod_l1_entry((l1_pgentry_t *)va,
1776 mk_l1_pgentry(req.val));
1778 if ( unlikely(shadow_mode_enabled(d)) && okay &&
1779 (get_shadow_status(d, page-frame_table) &
1780 PSH_shadowed) )
1782 shadow_l1_normal_pt_update(
1783 req.ptr, req.val, &prev_smfn, &prev_spl1e);
1784 put_shadow_status(d);
1787 put_page_type(page);
1789 break;
1790 case PGT_l2_page_table:
1791 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1793 okay = mod_l2_entry((l2_pgentry_t *)va,
1794 mk_l2_pgentry(req.val),
1795 pfn);
1797 if ( unlikely(shadow_mode_enabled(d)) && okay &&
1798 (get_shadow_status(d, page-frame_table) &
1799 PSH_shadowed) )
1801 shadow_l2_normal_pt_update(req.ptr, req.val);
1802 put_shadow_status(d);
1805 put_page_type(page);
1807 break;
1808 #ifdef __x86_64__
1809 case PGT_l3_page_table:
1810 if ( likely(get_page_type(page, PGT_l3_page_table)) )
1812 okay = mod_l3_entry((l3_pgentry_t *)va,
1813 mk_l3_pgentry(req.val),
1814 pfn);
1816 if ( unlikely(shadow_mode_enabled(d)) && okay &&
1817 (get_shadow_status(d, page-frame_table) &
1818 PSH_shadowed) )
1820 /*XXXshadow_l3_normal_pt_update(req.ptr, req.val);*/
1821 put_shadow_status(d);
1824 put_page_type(page);
1826 break;
1827 case PGT_l4_page_table:
1828 if ( likely(get_page_type(page, PGT_l4_page_table)) )
1830 okay = mod_l4_entry((l4_pgentry_t *)va,
1831 mk_l4_pgentry(req.val),
1832 pfn);
1834 if ( unlikely(shadow_mode_enabled(d)) && okay &&
1835 (get_shadow_status(d, page-frame_table) &
1836 PSH_shadowed) )
1838 /*XXXshadow_l4_normal_pt_update(req.ptr, req.val);*/
1839 put_shadow_status(d);
1842 put_page_type(page);
1844 break;
1845 #endif /* __x86_64__ */
1846 default:
1847 if ( likely(get_page_type(page, PGT_writable_page)) )
1849 *(unsigned long *)va = req.val;
1850 okay = 1;
1851 put_page_type(page);
1853 break;
1856 put_page(page);
1857 break;
1859 case MMU_MACHPHYS_UPDATE:
1860 if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
1862 MEM_LOG("Could not get page for mach->phys update");
1863 break;
1866 machine_to_phys_mapping[pfn] = req.val;
1867 okay = 1;
1869 /*
1870 * If in log-dirty mode, mark the corresponding pseudo-physical
1871 * page as dirty.
1872 */
1873 if ( unlikely(shadow_mode_log_dirty(d)) &&
1874 mark_dirty(d, pfn) )
1875 d->arch.shadow_dirty_block_count++;
1877 put_page(&frame_table[pfn]);
1878 break;
1880 /*
1881 * MMU_EXTENDED_COMMAND: Extended command is specified
1882 * in the least-significant bits of the 'value' field.
1883 */
1884 case MMU_EXTENDED_COMMAND:
1885 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1886 okay = do_extended_command(req.ptr, req.val);
1887 break;
1889 default:
1890 MEM_LOG("Invalid page update command %p", req.ptr);
1891 break;
1894 if ( unlikely(!okay) )
1896 rc = -EINVAL;
1897 break;
1900 ureqs++;
1903 out:
1904 if ( prev_pfn != 0 )
1905 unmap_domain_mem((void *)va);
1907 if ( unlikely(prev_spl1e != 0) )
1908 unmap_domain_mem((void *)prev_spl1e);
1910 deferred_ops = percpu_info[cpu].deferred_ops;
1911 percpu_info[cpu].deferred_ops = 0;
1913 if ( deferred_ops & DOP_FLUSH_TLB )
1914 local_flush_tlb();
1916 if ( deferred_ops & DOP_RELOAD_LDT )
1917 (void)map_ldt_shadow_page(0);
1919 if ( unlikely(percpu_info[cpu].foreign != NULL) )
1921 put_domain(percpu_info[cpu].foreign);
1922 percpu_info[cpu].foreign = NULL;
1925 /* Add incremental work we have done to the @done output parameter. */
1926 if ( unlikely(pdone != NULL) )
1927 __put_user(done + i, pdone);
1929 if ( unlikely(shadow_mode_enabled(d)) )
1930 check_pagetable(d, ed->arch.guest_table, "post-mmu"); /* debug */
1932 UNLOCK_BIGLOCK(d);
1933 return rc;
1937 int do_update_va_mapping(unsigned long va,
1938 unsigned long val,
1939 unsigned long flags)
1941 struct exec_domain *ed = current;
1942 struct domain *d = ed->domain;
1943 int err = 0;
1944 unsigned int cpu = ed->processor;
1945 unsigned long deferred_ops;
1947 perfc_incrc(calls_to_update_va);
1949 if ( unlikely(!__addr_ok(va)) )
1950 return -EINVAL;
1952 if ( unlikely(shadow_mode_translate(d) ) )
1953 domain_crash();
1955 LOCK_BIGLOCK(d);
1957 cleanup_writable_pagetable(d);
1959 /*
1960 * XXX When we make this support 4MB superpages we should also deal with
1961 * the case of updating L2 entries.
1962 */
1964 if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
1965 mk_l1_pgentry(val))) )
1966 err = -EINVAL;
1968 if ( unlikely(shadow_mode_enabled(d)) )
1970 unsigned long sval = 0;
1972 l1pte_propagate_from_guest(d, &val, &sval);
1974 if ( unlikely(__put_user(sval, ((unsigned long *)(
1975 &shadow_linear_pg_table[l1_linear_offset(va)])))) )
1977 /*
1978 * Since L2's are guaranteed RW, failure indicates either that the
1979 * page was not shadowed, or that the L2 entry has not yet been
1980 * updated to reflect the shadow.
1981 */
1982 l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)];
1983 unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
1985 if (get_shadow_status(d, gpfn))
1987 unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
1988 unsigned long *gl1e = map_domain_mem(gmfn << PAGE_SHIFT);
1989 unsigned l1_idx = l1_table_offset(va);
1990 gl1e[l1_idx] = sval;
1991 unmap_domain_mem(gl1e);
1992 put_shadow_status(d);
1994 perfc_incrc(shadow_update_va_fail1);
1996 else
1997 perfc_incrc(shadow_update_va_fail2);
2000 /*
2001 * If we're in log-dirty mode then we need to note that we've updated
2002 * the PTE in the PT-holding page. We need the machine frame number
2003 * for this.
2004 */
2005 if ( shadow_mode_log_dirty(d) )
2006 mark_dirty(d, va_to_l1mfn(va));
2008 check_pagetable(d, ed->arch.guest_table, "va"); /* debug */
2011 deferred_ops = percpu_info[cpu].deferred_ops;
2012 percpu_info[cpu].deferred_ops = 0;
2014 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
2015 unlikely(flags & UVMF_FLUSH_TLB) )
2016 local_flush_tlb();
2017 else if ( unlikely(flags & UVMF_INVLPG) )
2018 __flush_tlb_one(va);
2020 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
2021 (void)map_ldt_shadow_page(0);
2023 UNLOCK_BIGLOCK(d);
2025 return err;
2028 int do_update_va_mapping_otherdomain(unsigned long va,
2029 unsigned long val,
2030 unsigned long flags,
2031 domid_t domid)
2033 unsigned int cpu = smp_processor_id();
2034 struct domain *d;
2035 int rc;
2037 if ( unlikely(!IS_PRIV(current->domain)) )
2038 return -EPERM;
2040 percpu_info[cpu].foreign = d = find_domain_by_id(domid);
2041 if ( unlikely(d == NULL) )
2043 MEM_LOG("Unknown domain '%u'", domid);
2044 return -ESRCH;
2047 rc = do_update_va_mapping(va, val, flags);
2049 put_domain(d);
2050 percpu_info[cpu].foreign = NULL;
2052 return rc;
2057 /*************************
2058 * Descriptor Tables
2059 */
2061 void destroy_gdt(struct exec_domain *ed)
2063 int i;
2064 unsigned long pfn;
2066 for ( i = 0; i < 16; i++ )
2068 if ( (pfn = l1_pgentry_to_pfn(ed->arch.perdomain_ptes[i])) != 0 )
2069 put_page_and_type(&frame_table[pfn]);
2070 ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
2075 long set_gdt(struct exec_domain *ed,
2076 unsigned long *frames,
2077 unsigned int entries)
2079 struct domain *d = ed->domain;
2080 /* NB. There are 512 8-byte entries per GDT page. */
2081 int i = 0, nr_pages = (entries + 511) / 512;
2082 struct desc_struct *vgdt;
2083 unsigned long pfn;
2085 /* Check the first page in the new GDT. */
2086 if ( (pfn = frames[0]) >= max_page )
2087 goto fail;
2089 /* The first page is special because Xen owns a range of entries in it. */
2090 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2092 /* GDT checks failed: try zapping the Xen reserved entries. */
2093 if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
2094 goto fail;
2095 vgdt = map_domain_mem(pfn << PAGE_SHIFT);
2096 memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
2097 NR_RESERVED_GDT_ENTRIES*8);
2098 unmap_domain_mem(vgdt);
2099 put_page_and_type(&frame_table[pfn]);
2101 /* Okay, we zapped the entries. Now try the GDT checks again. */
2102 if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2103 goto fail;
2106 /* Check the remaining pages in the new GDT. */
2107 for ( i = 1; i < nr_pages; i++ )
2108 if ( ((pfn = frames[i]) >= max_page) ||
2109 !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
2110 goto fail;
2112 /* Copy reserved GDT entries to the new GDT. */
2113 vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
2114 memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
2115 gdt_table + FIRST_RESERVED_GDT_ENTRY,
2116 NR_RESERVED_GDT_ENTRIES*8);
2117 unmap_domain_mem(vgdt);
2119 /* Tear down the old GDT. */
2120 destroy_gdt(ed);
2122 /* Install the new GDT. */
2123 for ( i = 0; i < nr_pages; i++ )
2124 ed->arch.perdomain_ptes[i] =
2125 mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
2127 SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
2128 SET_GDT_ENTRIES(ed, entries);
2130 return 0;
2132 fail:
2133 while ( i-- > 0 )
2134 put_page_and_type(&frame_table[frames[i]]);
2135 return -EINVAL;
2139 long do_set_gdt(unsigned long *frame_list, unsigned int entries)
2141 int nr_pages = (entries + 511) / 512;
2142 unsigned long frames[16];
2143 long ret;
2145 if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) )
2146 return -EINVAL;
2148 if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
2149 return -EFAULT;
2151 LOCK_BIGLOCK(current->domain);
2153 if ( (ret = set_gdt(current, frames, entries)) == 0 )
2155 local_flush_tlb();
2156 __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
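/* The flush above drops any stale translations for the GDT virtual range;
 * reloading GDTR makes this CPU pick up the new mapping immediately. */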
2159 UNLOCK_BIGLOCK(current->domain);
2161 return ret;
2165 long do_update_descriptor(
2166 unsigned long pa, unsigned long word1, unsigned long word2)
2168 unsigned long pfn = pa >> PAGE_SHIFT;
2169 struct desc_struct *gdt_pent, d;
2170 struct pfn_info *page;
2171 struct exec_domain *ed;
2172 long ret = -EINVAL;
2174 d.a = (u32)word1;
2175 d.b = (u32)word2;
2177 LOCK_BIGLOCK(current->domain);
2179 if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
2180 UNLOCK_BIGLOCK(current->domain);
2181 return -EINVAL;
2184 page = &frame_table[pfn];
2185 if ( unlikely(!get_page(page, current->domain)) ) {
2186 UNLOCK_BIGLOCK(current->domain);
2187 return -EINVAL;
2190 /* Check if the given frame is in use in an unsafe context. */
2191 switch ( page->u.inuse.type_info & PGT_type_mask )
2193 case PGT_gdt_page:
2194 /* Disallow updates of Xen-reserved descriptors in the current GDT. */
2195 for_each_exec_domain(current->domain, ed) {
2196 if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) &&
2197 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
2198 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
2199 goto out;
2201 if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
2202 goto out;
2203 break;
2204 case PGT_ldt_page:
2205 if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
2206 goto out;
2207 break;
2208 default:
2209 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
2210 goto out;
2211 break;
2214 /* All is good so make the update. */
2215 gdt_pent = map_domain_mem(pa);
2216 memcpy(gdt_pent, &d, 8);
2217 unmap_domain_mem(gdt_pent);
2219 put_page_type(page);
2221 ret = 0; /* success */
2223 out:
2224 put_page(page);
2226 UNLOCK_BIGLOCK(current->domain);
2228 return ret;
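#if 0
/*
 * Illustrative sketch only (never built): how a caller might form the
 * arguments to do_update_descriptor(). 'gdt_mfn' and 'slot' are hypothetical
 * values, and 'desc_lo'/'desc_hi' stand for the low and high 32 bits of an
 * 8-byte descriptor that must satisfy check_descriptor().
 */
static void example_update_descriptor(void)
{
    unsigned long gdt_mfn = 0x1234; /* machine frame holding the descriptors */
    unsigned int  slot    = 32;     /* descriptor index within that frame    */
    /* Physical address of the descriptor; must be 8-byte aligned. */
    unsigned long pa      = (gdt_mfn << PAGE_SHIFT) + (slot * 8);
    unsigned long desc_lo = 0, desc_hi = 0;
    (void)do_update_descriptor(pa, desc_lo, desc_hi);
}
#endif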
2233 /*************************
2234 * Writable Pagetables
2235 */
2237 ptwr_info_t ptwr_info[NR_CPUS];
2239 #ifdef VERBOSE
2240 int ptwr_debug = 0x0;
2241 #define PTWR_PRINTK(_f, _a...) \
2242 do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
2243 #define PTWR_PRINT_WHICH (which ? 'I' : 'A')
2244 #else
2245 #define PTWR_PRINTK(_f, _a...) ((void)0)
2246 #endif
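/*
 * ptwr_info[] holds the per-CPU writable-pagetable state: each CPU tracks at
 * most two temporarily-writable L1 pages, one ACTIVE (hooked into the current
 * address space) and one INACTIVE, recording the faulting virtual address
 * (l1va), the L2 slot the page belongs in (l2_idx), a Xen mapping of the page
 * (pl1e) and a snapshot of its original contents (page).
 */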
2248 /* Flush the given writable p.t. page and write-protect it again. */
2249 void ptwr_flush(const int which)
2251 unsigned long sstat, spte, pte, *ptep, l1va;
2252 l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e;
2253 l2_pgentry_t *pl2e;
2254 int i, cpu = smp_processor_id();
2255 struct exec_domain *ed = current;
2256 struct domain *d = ed->domain;
2257 unsigned int count;
2259 l1va = ptwr_info[cpu].ptinfo[which].l1va;
2260 ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
2262 /*
2263 * STEP 1. Write-protect the p.t. page so no more updates can occur.
2264 */
2266 if ( unlikely(__get_user(pte, ptep)) )
2268 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
2269 /*
2270 * Really a bug. We could read this PTE during the initial fault,
2271 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
2272 */
2273 BUG();
2275 PTWR_PRINTK("[%c] disconnected_l1va at %p is %p\n",
2276 PTWR_PRINT_WHICH, ptep, pte);
2277 pte &= ~_PAGE_RW;
2279 if ( unlikely(shadow_mode_enabled(d)) )
2281 /* Write-protect the p.t. page in the shadow page table. */
2282 l1pte_propagate_from_guest(d, &pte, &spte);
2283 __put_user(spte, (unsigned long *)
2284 &shadow_linear_pg_table[l1_linear_offset(l1va)]);
2286 /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
2287 sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
2288 if ( sstat & PSH_shadowed )
2289 sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
2292 /* Write-protect the p.t. page in the guest page table. */
2293 if ( unlikely(__put_user(pte, ptep)) )
2295 MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
2296 /*
2297 * Really a bug. We could write this PTE during the initial fault,
2298 * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
2299 */
2300 BUG();
2303 /* Ensure that there are no stale writable mappings in any TLB. */
2304 /* NB. INVLPG is a serialising instruction: flushes pending updates. */
2305 #if 1
2306 __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
2307 #else
2308 flush_tlb_all();
2309 #endif
2310 PTWR_PRINTK("[%c] disconnected_l1va at %p now %p\n",
2311 PTWR_PRINT_WHICH, ptep, pte);
2313 /*
2314 * STEP 2. Validate any modified PTEs.
2315 */
2316 count = 0;
2317 pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
2318 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2320 ol1e = ptwr_info[cpu].ptinfo[which].page[i];
2321 nl1e = pl1e[i];
2323 if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
2324 continue;
2326 /* Update the count of modified entries. */
2327 count++;
2329 /*
2330 * Fast path for PTEs that have merely been write-protected
2331 * (e.g., during a Unix fork()). A strict reduction in privilege.
2332 */
2333 if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
2335 if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
2337 if ( unlikely(sl1e != NULL) )
2338 l1pte_propagate_from_guest(
2339 d, &l1_pgentry_val(nl1e),
2340 &l1_pgentry_val(sl1e[i]));
2341 put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]);
2343 continue;
2346 if ( unlikely(!get_page_from_l1e(nl1e, d)) )
2348 MEM_LOG("ptwr: Could not re-validate l1 page\n");
2349 /*
2350 * Make the remaining p.t. entries consistent before crashing, so the
2351 * reference counts are correct.
2352 */
2353 memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
2354 (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
2355 unmap_domain_mem(pl1e);
2356 ptwr_info[cpu].ptinfo[which].l1va = 0;
2357 UNLOCK_BIGLOCK(d);
2358 domain_crash();
2361 if ( unlikely(sl1e != NULL) )
2362 l1pte_propagate_from_guest(
2363 d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
2365 if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
2366 put_page_from_l1e(ol1e, d);
2368 unmap_domain_mem(pl1e);
2370 #ifdef PERF_COUNTERS
2371 /*
2372 * Histogram the number of modified entries ('count'):
2373 * the first bucket is for count == 0,
2374 * the second bucket is for count == 1,
2375 * the last bucket is for count >= (PERFC_MAX_PT_UPDATES - 3) * PERFC_PT_UPDATES_BUCKET_SIZE.
2376 */
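/*
 * Bucket mapping implemented below: count 0 -> slot 0, count 1 -> slot 1,
 * otherwise slot = count / PERFC_PT_UPDATES_BUCKET_SIZE + 2, clamped to the
 * final slot PERFC_MAX_PT_UPDATES - 1.
 */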
2377 if ( count == 0 )
2379 perfc_incra(wpt_updates, 0);
2380 } else if ( count == 1 )
2382 perfc_incra(wpt_updates, 1);
2383 } else if ( (count / PERFC_PT_UPDATES_BUCKET_SIZE)
2384 < (PERFC_MAX_PT_UPDATES - 3) )
2386 perfc_incra(wpt_updates, (count / PERFC_PT_UPDATES_BUCKET_SIZE) + 2);
2387 } else
2389 perfc_incra(wpt_updates, PERFC_MAX_PT_UPDATES - 1);
2391 #endif
2394 /*
2395 * STEP 3. Reattach the L1 p.t. page into the current address space.
2396 */
2398 if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode_enabled(d)) )
2400 pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
2401 *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT);
2404 /*
2405 * STEP 4. Final tidy-up.
2406 */
2408 ptwr_info[cpu].ptinfo[which].l1va = 0;
2410 if ( unlikely(sl1e != NULL) )
2412 unmap_domain_mem(sl1e);
2413 put_shadow_status(d);
2417 /* Write page fault handler: check if guest is trying to modify a PTE. */
2418 int ptwr_do_page_fault(unsigned long addr)
2420 unsigned long pte, pfn, l2e;
2421 struct pfn_info *page;
2422 l2_pgentry_t *pl2e;
2423 int which, cpu = smp_processor_id();
2424 u32 l2_idx;
2426 #ifdef __x86_64__
2427 return 0; /* Writable pagetables need fixing for x86_64. */
2428 #endif
2430 /*
2431 * Attempt to read the PTE that maps the VA being accessed. By checking for
2432 * PDE validity in the L2 we avoid many expensive fixups in __get_user().
2433 */
2434 if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
2435 _PAGE_PRESENT) ||
2436 __get_user(pte, (unsigned long *)
2437 &linear_pg_table[l1_linear_offset(addr)]) )
2439 return 0;
2442 pfn = pte >> PAGE_SHIFT;
2443 page = &frame_table[pfn];
2445 /* We are looking only for read-only mappings of p.t. pages. */
2446 if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
2447 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
2449 return 0;
2452 /* Get the L2 index at which this L1 p.t. is always mapped. */
2453 l2_idx = page->u.inuse.type_info & PGT_va_mask;
2454 if ( unlikely(l2_idx >= PGT_va_unknown) )
2456 domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
2458 l2_idx >>= PGT_va_shift;
2460 if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
2462 MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
2463 domain_crash();
2466 /*
2467 * Is the L1 p.t. mapped into the current address space? If so we call it
2468 * an ACTIVE p.t., otherwise it is INACTIVE.
2469 */
2470 pl2e = &linear_l2_table[l2_idx];
2471 l2e = l2_pgentry_val(*pl2e);
2472 which = PTWR_PT_INACTIVE;
2473 if ( (l2e >> PAGE_SHIFT) == pfn )
2475 /* Check the PRESENT bit to set ACTIVE. */
2476 if ( likely(l2e & _PAGE_PRESENT) )
2477 which = PTWR_PT_ACTIVE;
2478 else {
2479 /*
2480 * If the PRESENT bit is clear, we may be conflicting with
2481 * the current ACTIVE p.t. (it may be the same p.t. mapped
2482 * at another virt addr).
2483 * The ptwr_flush call below will restore the PRESENT bit.
2484 */
2485 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
2486 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
2487 which = PTWR_PT_ACTIVE;
2491 PTWR_PRINTK("[%c] page_fault on l1 pt at va %p, pt for %08x, "
2492 "pfn %p\n", PTWR_PRINT_WHICH,
2493 addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
2495 /*
2496 * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
2497 * time. If there is already one, we must flush it out.
2498 */
2499 if ( ptwr_info[cpu].ptinfo[which].l1va )
2500 ptwr_flush(which);
2502 ptwr_info[cpu].ptinfo[which].l1va = addr | 1;
2503 ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
2505 /* For safety, disconnect the L1 p.t. page from current space. */
2506 if ( (which == PTWR_PT_ACTIVE) &&
2507 likely(!shadow_mode_enabled(current->domain)) )
2509 *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
2510 #if 1
2511 flush_tlb(); /* XXX Multi-CPU guests? */
2512 #else
2513 flush_tlb_all();
2514 #endif
2517 /* Temporarily map the L1 page, and make a copy of it. */
2518 ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
2519 memcpy(ptwr_info[cpu].ptinfo[which].page,
2520 ptwr_info[cpu].ptinfo[which].pl1e,
2521 L1_PAGETABLE_ENTRIES * sizeof(l1_pgentry_t));
2523 /* Finally, make the p.t. page writable by the guest OS. */
2524 pte |= _PAGE_RW;
2525 PTWR_PRINTK("[%c] update %p pte to %p\n", PTWR_PRINT_WHICH,
2526 &linear_pg_table[addr>>PAGE_SHIFT], pte);
2527 if ( unlikely(__put_user(pte, (unsigned long *)
2528 &linear_pg_table[addr>>PAGE_SHIFT])) )
2530 MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
2531 &linear_pg_table[addr>>PAGE_SHIFT]);
2532 /* Toss the writable pagetable state and crash. */
2533 unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
2534 ptwr_info[cpu].ptinfo[which].l1va = 0;
2535 domain_crash();
2538 return EXCRET_fault_fixed;
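/*
 * Lifecycle summary: ptwr_do_page_fault() unhooks the L1 page (if ACTIVE),
 * snapshots its contents and grants the guest a writable mapping; the later
 * ptwr_flush() write-protects it again, diffs the page against the snapshot
 * to (re)validate every modified PTE, and re-hooks it.
 */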
2541 static __init int ptwr_init(void)
2543 int i;
2545 for ( i = 0; i < smp_num_cpus; i++ )
2547 ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
2548 (void *)alloc_xenheap_page();
2549 ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
2550 (void *)alloc_xenheap_page();
2553 return 0;
2555 __initcall(ptwr_init);
2560 /************************************************************************/
2561 /************************************************************************/
2562 /************************************************************************/
2564 #ifndef NDEBUG
2566 void ptwr_status(void)
2568 unsigned long pte, *ptep, pfn;
2569 struct pfn_info *page;
2570 int cpu = smp_processor_id();
2572 ptep = (unsigned long *)&linear_pg_table
2573 [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
2575 if ( __get_user(pte, ptep) ) {
2576 MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
2577 domain_crash();
2580 pfn = pte >> PAGE_SHIFT;
2581 page = &frame_table[pfn];
2582 printk("need to alloc l1 page %p\n", page);
2583 /* make pt page writable */
2584 printk("need to make read-only l1-page at %p is %p\n",
2585 ptep, pte);
2587 if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
2588 return;
2590 if ( __get_user(pte, (unsigned long *)
2591 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
2592 MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
2593 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
2594 domain_crash();
2596 pfn = pte >> PAGE_SHIFT;
2597 page = &frame_table[pfn];
2600 void audit_domain(struct domain *d)
2602 int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
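/*
 * adjust(), scan_for_pfn() and scan_for_pfn_remote() below are GCC nested
 * helper functions. adjust(page, dir, adjtype) adds 'dir' (+1 or -1) to the
 * page's general count and, if adjtype is non-zero, to its type count as
 * well, complaining if either count would drop below zero.
 */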
2604 void adjust (struct pfn_info *page, int dir, int adjtype)
2606 int count = page->count_info & PGC_count_mask;
2608 if ( adjtype )
2610 int tcount = page->u.inuse.type_info & PGT_count_mask;
2612 ttot++;
2614 tcount += dir;
2616 if ( tcount < 0 )
2618 /* This will only come out once. */
2619 printk("Audit %d: type count whent below zero pfn=%x "
2620 "taf=%x otaf=%x\n",
2621 d->id, page-frame_table,
2622 page->u.inuse.type_info,
2623 page->tlbflush_timestamp);
2626 page->u.inuse.type_info =
2627 (page->u.inuse.type_info & ~PGT_count_mask) |
2628 (tcount & PGT_count_mask);
2631 ctot++;
2632 count += dir;
2633 if ( count < 0 )
2635 /* This will only come out once. */
2636 printk("Audit %d: general count whent below zero pfn=%x "
2637 "taf=%x otaf=%x\n",
2638 d->id, page-frame_table,
2639 page->u.inuse.type_info,
2640 page->tlbflush_timestamp);
2643 page->count_info =
2644 (page->count_info & ~PGC_count_mask) |
2645 (count & PGC_count_mask);
2649 void scan_for_pfn(struct domain *d, unsigned long xpfn)
2651 unsigned long pfn, *pt;
2652 struct list_head *list_ent;
2653 struct pfn_info *page;
2654 int i;
2656 list_ent = d->page_list.next;
2657 for ( i = 0; (list_ent != &d->page_list); i++ )
2659 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2660 page = &frame_table[pfn];
2662 switch ( page->u.inuse.type_info & PGT_type_mask )
2664 case PGT_l1_page_table:
2665 case PGT_l2_page_table:
2666 pt = map_domain_mem(pfn<<PAGE_SHIFT);
2667 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2668 if ( (pt[i] & _PAGE_PRESENT) &&
2669 ((pt[i] >> PAGE_SHIFT) == xpfn) )
2670 printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n",
2671 d->id, i, pfn, page->u.inuse.type_info,
2672 page->count_info);
2673 unmap_domain_mem(pt);
2676 list_ent = frame_table[pfn].list.next;
2681 void scan_for_pfn_remote(unsigned long xpfn)
2683 struct domain *e;
2684 for_each_domain ( e )
2685 scan_for_pfn( e, xpfn );
2688 int i, l1, l2;
2689 unsigned long pfn;
2690 struct list_head *list_ent;
2691 struct pfn_info *page;
2693 if ( d != current->domain )
2694 domain_pause(d);
2695 synchronise_pagetables(~0UL);
2697 printk("pt base=%lx sh_info=%x\n",
2698 pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
2699 virt_to_page(d->shared_info)-frame_table);
2701 spin_lock(&d->page_alloc_lock);
2703 /* PHASE 0 */
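/*
 * Phase 0: walk the page list, check that no page's type count exceeds its
 * general count, warn about PGT_none pages that nonetheless hold multiple
 * type references, and stash each page's current type_info in
 * tlbflush_timestamp so later phases can report the original value.
 */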
2705 list_ent = d->page_list.next;
2706 for ( i = 0; (list_ent != &d->page_list); i++ )
2708 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2709 page = &frame_table[pfn];
2711 if ( page_get_owner(page) != d )
2712 BUG();
2714 if ( (page->u.inuse.type_info & PGT_count_mask) >
2715 (page->count_info & PGC_count_mask) )
2716 printk("taf > caf %x %x pfn=%lx\n",
2717 page->u.inuse.type_info, page->count_info, pfn );
2719 #if 0 /* SYSV shared memory pages plus writeable files. */
2720 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page &&
2721 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2723 printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
2724 pfn,
2725 page->u.inuse.type_info,
2726 page->count_info );
2727 scan_for_pfn_remote(pfn);
2729 #endif
2730 if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none &&
2731 (page->u.inuse.type_info & PGT_count_mask) > 1 )
2733 printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
2734 pfn,
2735 page->u.inuse.type_info,
2736 page->count_info );
2739 /* Use tlbflush_timestamp to store original type_info. */
2740 page->tlbflush_timestamp = page->u.inuse.type_info;
2742 list_ent = frame_table[pfn].list.next;
2746 /* PHASE 1 */
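/*
 * Phase 1: subtract the references the audit expects to exist (one type
 * reference for the current guest_table and for each pinned L1/L2, plus one
 * reference per present pagetable entry), while counting mappings of low
 * memory (pfn < 0x100) and of frames beyond max_page (treated as I/O).
 */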
2747 if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
2748 adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.guest_table)
2749 >>PAGE_SHIFT], -1, 1);
2751 list_ent = d->page_list.next;
2752 for ( i = 0; (list_ent != &d->page_list); i++ )
2754 unsigned long *pt;
2755 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2756 page = &frame_table[pfn];
2758 if ( page_get_owner(page) != d )
2759 BUG();
2761 switch ( page->u.inuse.type_info & PGT_type_mask )
2763 case PGT_l2_page_table:
2765 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2766 printk("Audit %d: L2 not validated %x\n",
2767 d->id, page->u.inuse.type_info);
2769 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2770 printk("Audit %d: L2 not pinned %x\n",
2771 d->id, page->u.inuse.type_info);
2772 else
2773 adjust( page, -1, 1 );
2775 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2777 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2779 if ( pt[i] & _PAGE_PRESENT )
2781 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2782 struct pfn_info *l1page = &frame_table[l1pfn];
2784 if ( page_get_owner(l1page) != d )
2786 printk("L2: Skip bizarre page belonging to other "
2787 "dom %p\n", page_get_owner(l1page));
2788 continue;
2791 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2792 PGT_l2_page_table )
2793 printk("Audit %d: [%x] Found %s Linear PT "
2794 "t=%x pfn=%lx\n", d->id, i,
2795 (l1pfn==pfn) ? "Self" : "Other",
2796 l1page->u.inuse.type_info,
2797 l1pfn);
2798 else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
2799 PGT_l1_page_table )
2800 printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
2801 d->id, i,
2802 l1page->u.inuse.type_info,
2803 l1pfn);
2805 adjust(l1page, -1, 1);
2809 unmap_domain_mem(pt);
2811 break;
2814 case PGT_l1_page_table:
2816 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2817 adjust( page, -1, 1 );
2819 if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
2820 printk("Audit %d: L1 not validated %x\n",
2821 d->id, page->u.inuse.type_info);
2822 #if 0
2823 if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
2824 printk("Audit %d: L1 not pinned %x\n",
2825 d->id, page->u.inuse.type_info);
2826 #endif
2827 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2829 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2831 if ( pt[i] & _PAGE_PRESENT )
2833 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2834 struct pfn_info *l1page = &frame_table[l1pfn];
2836 if ( l1pfn < 0x100 )
2838 lowmem_mappings++;
2839 continue;
2842 if ( l1pfn > max_page )
2844 io_mappings++;
2845 continue;
2848 if ( pt[i] & _PAGE_RW )
2851 if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
2852 PGT_l1_page_table ||
2853 (l1page->u.inuse.type_info & PGT_type_mask) ==
2854 PGT_l2_page_table )
2855 printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
2856 d->id, i,
2857 l1page->u.inuse.type_info,
2858 l1pfn);
2862 if ( page_get_owner(l1page) != d )
2864 printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
2865 "pfn=%lx c=%08x t=%08x m2p=%lx\n",
2866 d->id, pfn, i,
2867 page_get_owner(l1page),
2868 l1pfn,
2869 l1page->count_info,
2870 l1page->u.inuse.type_info,
2871 machine_to_phys_mapping[l1pfn]);
2872 continue;
2875 adjust(l1page, -1, 0);
2879 unmap_domain_mem(pt);
2881 break;
2884 list_ent = frame_table[pfn].list.next;
2887 if ( (io_mappings > 0) || (lowmem_mappings > 0) )
2888 printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
2889 d->id, lowmem_mappings, io_mappings);
2891 /* PHASE 2 */
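/*
 * Phase 2: with the expected references removed, every pagetable page should
 * now have a zero type count and every page a general count of exactly one;
 * anything else is reported and traced with scan_for_pfn_remote().
 */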
2893 ctot = ttot = 0;
2894 list_ent = d->page_list.next;
2895 for ( i = 0; (list_ent != &d->page_list); i++ )
2897 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2898 page = &frame_table[pfn];
2900 switch ( page->u.inuse.type_info & PGT_type_mask)
2902 case PGT_l1_page_table:
2903 case PGT_l2_page_table:
2904 if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
2906 printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
2907 d->id, page->u.inuse.type_info,
2908 page->tlbflush_timestamp,
2909 page->count_info, pfn );
2910 scan_for_pfn_remote(pfn);
2912 default:
2913 if ( (page->count_info & PGC_count_mask) != 1 )
2915 printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
2916 d->id,
2917 page->count_info,
2918 page->u.inuse.type_info,
2919 page->tlbflush_timestamp, pfn );
2920 scan_for_pfn_remote(pfn);
2922 break;
2925 list_ent = frame_table[pfn].list.next;
2928 /* PHASE 3 */
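/*
 * Phase 3: undo phase 1 by re-adding the references that were subtracted and
 * clearing the tlbflush_timestamp scratch values, leaving the reference
 * counts exactly as the audit found them.
 */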
2929 list_ent = d->page_list.next;
2930 l1 = l2 = 0;
2931 for ( i = 0; (list_ent != &d->page_list); i++ )
2933 unsigned long *pt;
2934 pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
2935 page = &frame_table[pfn];
2937 switch ( page->u.inuse.type_info & PGT_type_mask )
2939 case PGT_l2_page_table:
2940 l2++;
2941 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2942 adjust( page, 1, 1 );
2944 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2946 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
2948 if ( pt[i] & _PAGE_PRESENT )
2950 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2951 struct pfn_info *l1page;
2953 if ( l1pfn > max_page )
2954 continue;
2956 l1page = &frame_table[l1pfn];
2958 if ( page_get_owner(l1page) == d )
2959 adjust(l1page, 1, 1);
2963 unmap_domain_mem(pt);
2964 break;
2966 case PGT_l1_page_table:
2967 l1++;
2968 if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
2969 adjust( page, 1, 1 );
2971 pt = map_domain_mem( pfn<<PAGE_SHIFT );
2973 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
2975 if ( pt[i] & _PAGE_PRESENT )
2977 unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
2978 struct pfn_info *l1page;
2980 if ( l1pfn > max_page )
2981 continue;
2983 l1page = &frame_table[l1pfn];
2985 if ( (page_get_owner(l1page) != d) ||
2986 (l1pfn < 0x100) || (l1pfn > max_page) )
2987 continue;
2989 adjust(l1page, 1, 0);
2993 unmap_domain_mem(pt);
2994 break;
2998 page->tlbflush_timestamp = 0;
3000 list_ent = frame_table[pfn].list.next;
3003 spin_unlock(&d->page_alloc_lock);
3005 if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
3006 adjust(&frame_table[pagetable_val(
3007 d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
3009 printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot );
3011 if ( d != current->domain )
3012 domain_unpause(d);
3015 void audit_domains(void)
3017 struct domain *d;
3018 for_each_domain ( d )
3019 audit_domain(d);
3022 void audit_domains_key(unsigned char key)
3024 audit_domains();
3027 #endif /* NDEBUG */