ia64/xen-unstable

view xen/arch/x86/memory.c @ 1880:020f58885ed4

bitkeeper revision 1.1108.2.22 (410083a0-dBCDjHcryIgCa_AFqhcNA)

memory.c:
Fix invalidate LDT code to invalidate the correct LDT.
author kaf24@scramble.cl.cam.ac.uk
date Fri Jul 23 03:18:56 2004 +0000 (2004-07-23)
parents 91ec71db01ce
children 381b2b637b12 a2d2b4ac2439
line source
1 /******************************************************************************
2 * arch/x86/memory.c
3 *
4 * Copyright (c) 2002-2004 K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * A description of the x86 page table API:
23 *
24 * Domains trap to do_mmu_update with a list of update requests.
25 * This is a list of (ptr, val) pairs, where the requested operation
26 * is *ptr = val.
27 *
28 * Reference counting of pages:
29 * ----------------------------
30 * Each page has two refcounts: tot_count and type_count.
31 *
32 * TOT_COUNT is the obvious reference count. It counts all uses of a
33 * physical page frame by a domain, including uses as a page directory,
34 * a page table, or simple mappings via a PTE. This count prevents a
35 * domain from releasing a frame back to the free pool when it still holds
36 * a reference to it.
37 *
38 * TYPE_COUNT is more subtle. A frame can be put to one of three
39 * mutually-exclusive uses: it might be used as a page directory, or a
40 * page table, or it may be mapped writeable by the domain [of course, a
41 * frame need not be used in any of these three ways!].
42 * So, type_count is a count of the number of times a frame is being
43 * referred to in its current incarnation. Therefore, a page can only
44 * change its type when its type count is zero.
45 *
46 * Pinning the page type:
47 * ----------------------
48 * The type of a page can be pinned/unpinned with the commands
49 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
50 * pinning is not reference counted, so it can't be nested).
51 * This is useful to prevent a page's type count falling to zero, at which
52 * point safety checks would need to be carried out next time the count
53 * is increased again.
54 *
55 * A further note on writeable page mappings:
56 * ------------------------------------------
57 * For simplicity, the count of writeable mappings for a page may not
58 * correspond to reality. The 'writeable count' is incremented for every
59 * PTE which maps the page with the _PAGE_RW flag set. However, for
60 * write access to be possible the page directory entry must also have
61 * its _PAGE_RW bit set. We do not check this as it complicates the
62 * reference counting considerably [consider the case of multiple
63 * directory entries referencing a single page table, some with the RW
64 * bit set, others not -- it starts getting a bit messy].
65 * In normal use, this simplification shouldn't be a problem.
66 * However, the logic can be added if required.
67 *
68 * One more note on read-only page mappings:
69 * -----------------------------------------
70 * We want domains to be able to map pages for read-only access. The
71 * main reason is that page tables and directories should be readable
72 * by a domain, but it would not be safe for them to be writeable.
73 * However, domains have free access to rings 1 & 2 of the Intel
74 * privilege model. In terms of page protection, these are considered
75 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
76 * read-only restrictions are respected in supervisor mode -- if the
77 * bit is clear then any mapped page is writeable.
78 *
79 * We get round this by always setting the WP bit and disallowing
80 * updates to it. This is very unlikely to cause a problem for guest
81 * OS's, which will generally use the WP bit to simplify copy-on-write
82 * implementation (in that case, OS wants a fault when it writes to
83 * an application-supplied buffer).
84 */
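/*
 * Illustrative sketch (not part of the original source): how a guest might
 * batch requests for the do_mmu_update() interface described above. The
 * mmu_update_t fields and command encodings follow the handling code in this
 * file; the hypercall wrapper HYPERVISOR_mmu_update() and the values l2_mfn,
 * pte_maddr and new_mfn are placeholder names, not definitions from Xen.
 */
#if 0 /* example only -- guest-side pseudo-code */
static void example_mmu_update_batch(unsigned long l2_mfn,
                                     unsigned long pte_maddr,
                                     unsigned long new_mfn)
{
    mmu_update_t req[2];
    int success;

    /* Extended command: the subcommand is carried in the low bits of 'val'. */
    req[0].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[0].val = MMUEXT_PIN_L2_TABLE;

    /* Normal update: 'ptr' is the machine address of the PTE to rewrite. */
    req[1].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
    req[1].val = (new_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_RW;

    (void)HYPERVISOR_mmu_update(req, 2, &success);
}
#endif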
86 #include <xen/config.h>
87 #include <xen/init.h>
88 #include <xen/lib.h>
89 #include <xen/mm.h>
90 #include <xen/sched.h>
91 #include <xen/errno.h>
92 #include <xen/perfc.h>
93 #include <xen/irq.h>
94 #include <asm/shadow.h>
95 #include <asm/page.h>
96 #include <asm/flushtlb.h>
97 #include <asm/io.h>
98 #include <asm/uaccess.h>
99 #include <asm/domain_page.h>
100 #include <asm/ldt.h>
102 #ifndef NDEBUG
103 #define MEM_LOG(_f, _a...) \
104 printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
105 current->domain , __LINE__ , ## _a )
106 #else
107 #define MEM_LOG(_f, _a...) ((void)0)
108 #endif
110 static int alloc_l2_table(struct pfn_info *page);
111 static int alloc_l1_table(struct pfn_info *page);
112 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
113 static int get_page_and_type_from_pagenr(unsigned long page_nr,
114 u32 type,
115 struct domain *d);
117 static void free_l2_table(struct pfn_info *page);
118 static void free_l1_table(struct pfn_info *page);
120 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
121 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
123 /* Used to defer flushing of memory structures. */
124 static struct {
125 #define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
126 #define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
127 unsigned long deferred_ops;
128 unsigned long cr0;
129 /* General-Purpose Subject, Page-Table Subject */
130 struct domain *gps, *pts;
131 } percpu_info[NR_CPUS] __cacheline_aligned;
133 /* Determine the current General-Purpose Subject or Page-Table Subject. */
134 #define PTS (percpu_info[smp_processor_id()].pts ? : current)
135 #define GPS (percpu_info[smp_processor_id()].gps ? : current)
138 void init_percpu_info(void)
139 {
140 memset(percpu_info, 0, sizeof(percpu_info));
141 }
143 static void __invalidate_shadow_ldt(struct domain *d)
144 {
145 int i;
146 unsigned long pfn;
147 struct pfn_info *page;
149 d->mm.shadow_ldt_mapcnt = 0;
151 for ( i = 16; i < 32; i++ )
152 {
153 pfn = l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
154 if ( pfn == 0 ) continue;
155 d->mm.perdomain_pt[i] = mk_l1_pgentry(0);
156 page = &frame_table[pfn];
157 ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
158 ASSERT_PAGE_IS_DOMAIN(page, d);
159 put_page_and_type(page);
160 }
162 /* Dispose of the (now possibly invalid) mappings from the TLB. */
163 percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
164 }
167 static inline void invalidate_shadow_ldt(struct domain *d)
168 {
169 if ( d->mm.shadow_ldt_mapcnt != 0 )
170 __invalidate_shadow_ldt(d);
171 }
174 int alloc_segdesc_page(struct pfn_info *page)
175 {
176 unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
177 int i;
179 for ( i = 0; i < 512; i++ )
180 if ( unlikely(!check_descriptor(&descs[i*2])) )
181 goto fail;
183 unmap_domain_mem(descs);
184 return 1;
186 fail:
187 unmap_domain_mem(descs);
188 return 0;
189 }
192 /* Map shadow page at offset @off. */
193 int map_ldt_shadow_page(unsigned int off)
194 {
195 struct domain *d = current;
196 unsigned long l1e;
198 if ( unlikely(in_irq()) )
199 BUG();
201 __get_user(l1e, (unsigned long *)&linear_pg_table[(d->mm.ldt_base >>
202 PAGE_SHIFT) + off]);
204 if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
205 unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
206 d, PGT_ldt_page)) )
207 return 0;
209 d->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
210 d->mm.shadow_ldt_mapcnt++;
212 return 1;
213 }
216 static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
217 {
218 struct pfn_info *page = &frame_table[page_nr];
220 if ( unlikely(!pfn_is_ram(page_nr)) )
221 {
222 MEM_LOG("Pfn %08lx is not RAM", page_nr);
223 return 0;
224 }
226 if ( unlikely(!get_page(page, d)) )
227 {
228 MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
229 return 0;
230 }
232 return 1;
233 }
236 static int get_page_and_type_from_pagenr(unsigned long page_nr,
237 u32 type,
238 struct domain *d)
239 {
240 struct pfn_info *page = &frame_table[page_nr];
242 if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
243 return 0;
245 if ( unlikely(!get_page_type(page, type)) )
246 {
247 MEM_LOG("Bad page type for pfn %08lx (%08x)",
248 page_nr, page->type_and_flags);
249 put_page(page);
250 return 0;
251 }
253 return 1;
254 }
257 /*
258 * We allow L2 tables to map each other (a.k.a. linear page tables). This
259 * needs some special care with reference counts and access permissions:
260 * 1. The mapping entry must be read-only, or the guest may get write access
261 * to its own PTEs.
262 * 2. We must only bump the reference counts for an *already validated*
263 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
264 * on a validation that cannot complete until the one we are performing does.
265 * 3. We only need to increment the reference counts for the mapped page
266 * frame if it is mapped by a different L2 table. This is sufficient and
267 * also necessary to allow validation of an L2 table mapping itself.
268 */
269 static int get_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn)
270 {
271 u32 x, y;
272 struct pfn_info *page;
274 if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
275 {
276 MEM_LOG("Attempt to create linear p.t. with write perms");
277 return 0;
278 }
280 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
281 {
282 /* Make sure the mapped frame belongs to the correct domain. */
283 if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), PTS)) )
284 return 0;
286 /*
287 * Make sure that the mapped frame is an already-validated L2 table.
288 * If so, atomically increment the count (checking for overflow).
289 */
290 page = &frame_table[l2_pgentry_to_pagenr(l2e)];
291 y = page->type_and_flags;
292 do {
293 x = y;
294 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
295 unlikely((x & (PGT_type_mask|PGT_validated)) !=
296 (PGT_l2_page_table|PGT_validated)) )
297 {
298 put_page(page);
299 return 0;
300 }
301 }
302 while ( (y = cmpxchg(&page->type_and_flags, x, x + 1)) != x );
303 }
305 return 1;
306 }
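/*
 * Illustrative sketch (not part of the original source): a guest creating a
 * linear (self-referencing) page-table mapping, which is the case that
 * get_linear_pagetable() above validates. HYPERVISOR_mmu_update(), l2_mfn and
 * slot are placeholder names; note the entry is installed read-only (no
 * _PAGE_RW), as required by rule 1 above.
 */
#if 0 /* example only -- guest-side pseudo-code */
static void example_linear_mapping(unsigned long l2_mfn, unsigned int slot)
{
    mmu_update_t req;
    int success;

    /* Point L2 entry 'slot' of the L2 table back at the L2 table itself. */
    req.ptr = ((l2_mfn << PAGE_SHIFT) + (slot * sizeof(l2_pgentry_t)))
              | MMU_NORMAL_PT_UPDATE;
    req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;   /* read-only */

    (void)HYPERVISOR_mmu_update(&req, 1, &success);
}
#endif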
309 static int get_page_from_l1e(l1_pgentry_t l1e)
310 {
311 unsigned long l1v = l1_pgentry_val(l1e);
312 unsigned long pfn = l1_pgentry_to_pagenr(l1e);
313 extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
315 if ( !(l1v & _PAGE_PRESENT) )
316 return 1;
318 if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
319 {
320 MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
321 return 0;
322 }
324 if ( unlikely(!pfn_is_ram(pfn)) )
325 {
326 if ( IS_PRIV(current) )
327 return 1;
329 if ( IS_CAPABLE_PHYSDEV(current) )
330 return domain_iomem_in_pfn(current, pfn);
332 MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
333 return 0;
334 }
336 if ( l1v & _PAGE_RW )
337 {
338 if ( unlikely(!get_page_and_type_from_pagenr(
339 pfn, PGT_writeable_page, GPS)) )
340 return 0;
341 set_bit(_PGC_tlb_flush_on_type_change,
342 &frame_table[pfn].count_and_flags);
343 return 1;
344 }
346 return get_page_from_pagenr(pfn, GPS);
347 }
350 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
351 static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
352 {
353 if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
354 return 1;
356 if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
357 {
358 MEM_LOG("Bad L2 page type settings %04lx",
359 l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
360 return 0;
361 }
363 if ( unlikely(!get_page_and_type_from_pagenr(
364 l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, PTS)) )
365 return get_linear_pagetable(l2e, pfn);
367 return 1;
368 }
371 static void put_page_from_l1e(l1_pgentry_t l1e)
372 {
373 struct pfn_info *page = &frame_table[l1_pgentry_to_pagenr(l1e)];
374 unsigned long l1v = l1_pgentry_val(l1e);
376 if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(l1v >> PAGE_SHIFT) )
377 return;
379 if ( l1v & _PAGE_RW )
380 {
381 put_page_and_type(page);
382 }
383 else
384 {
385 /* We expect this to be rare, so we blow away the entire shadow LDT. */
386 if ( unlikely(((page->type_and_flags & PGT_type_mask) ==
387 PGT_ldt_page)) &&
388 unlikely(((page->type_and_flags & PGT_count_mask) != 0)) )
389 invalidate_shadow_ldt(page->u.domain);
390 put_page(page);
391 }
392 }
395 /*
396 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
397 * Note also that this automatically deals correctly with linear p.t.'s.
398 */
399 static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
400 {
401 if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
402 ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
403 put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
404 }
407 static int alloc_l2_table(struct pfn_info *page)
408 {
409 unsigned long page_nr = page - frame_table;
410 l2_pgentry_t *pl2e;
411 int i;
413 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
415 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
416 if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr)) )
417 goto fail;
419 #if defined(__i386__)
420 /* Now we add our private high mappings. */
421 memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
422 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
423 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
424 pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
425 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
426 pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
427 mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) |
428 __PAGE_HYPERVISOR);
429 #endif
431 unmap_domain_mem(pl2e);
432 return 1;
434 fail:
435 while ( i-- > 0 )
436 put_page_from_l2e(pl2e[i], page_nr);
438 unmap_domain_mem(pl2e);
439 return 0;
440 }
443 static int alloc_l1_table(struct pfn_info *page)
444 {
445 unsigned long page_nr = page - frame_table;
446 l1_pgentry_t *pl1e;
447 int i;
449 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
451 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
452 if ( unlikely(!get_page_from_l1e(pl1e[i])) )
453 goto fail;
455 unmap_domain_mem(pl1e);
456 return 1;
458 fail:
459 while ( i-- > 0 )
460 put_page_from_l1e(pl1e[i]);
462 unmap_domain_mem(pl1e);
463 return 0;
464 }
467 static void free_l2_table(struct pfn_info *page)
468 {
469 unsigned long page_nr = page - frame_table;
470 l2_pgentry_t *pl2e;
471 int i;
473 pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
475 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
476 put_page_from_l2e(pl2e[i], page_nr);
478 unmap_domain_mem(pl2e);
479 }
482 static void free_l1_table(struct pfn_info *page)
483 {
484 unsigned long page_nr = page - frame_table;
485 l1_pgentry_t *pl1e;
486 int i;
488 pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
490 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
491 put_page_from_l1e(pl1e[i]);
493 unmap_domain_mem(pl1e);
494 }
497 static inline int update_l2e(l2_pgentry_t *pl2e,
498 l2_pgentry_t ol2e,
499 l2_pgentry_t nl2e)
500 {
501 unsigned long o = cmpxchg((unsigned long *)pl2e,
502 l2_pgentry_val(ol2e),
503 l2_pgentry_val(nl2e));
504 if ( o != l2_pgentry_val(ol2e) )
505 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
506 l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
507 return (o == l2_pgentry_val(ol2e));
508 }
511 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
512 static int mod_l2_entry(l2_pgentry_t *pl2e,
513 l2_pgentry_t nl2e,
514 unsigned long pfn)
515 {
516 l2_pgentry_t ol2e;
517 unsigned long _ol2e;
519 if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
520 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
521 {
522 MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
523 return 0;
524 }
526 if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
527 return 0;
528 ol2e = mk_l2_pgentry(_ol2e);
530 if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
531 {
532 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
533 if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
534 return update_l2e(pl2e, ol2e, nl2e);
536 if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
537 return 0;
539 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
540 {
541 put_page_from_l2e(nl2e, pfn);
542 return 0;
543 }
545 put_page_from_l2e(ol2e, pfn);
546 return 1;
547 }
549 if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
550 return 0;
552 put_page_from_l2e(ol2e, pfn);
553 return 1;
554 }
557 static inline int update_l1e(l1_pgentry_t *pl1e,
558 l1_pgentry_t ol1e,
559 l1_pgentry_t nl1e)
560 {
561 unsigned long o = l1_pgentry_val(ol1e);
562 unsigned long n = l1_pgentry_val(nl1e);
564 if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
565 unlikely(o != l1_pgentry_val(ol1e)) )
566 {
567 MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
568 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
569 return 0;
570 }
572 return 1;
573 }
576 /* Update the L1 entry at pl1e to new value nl1e. */
577 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
578 {
579 l1_pgentry_t ol1e;
580 unsigned long _ol1e;
582 if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
583 {
584 MEM_LOG("Bad get_user\n");
585 return 0;
586 }
588 ol1e = mk_l1_pgentry(_ol1e);
590 if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
591 {
592 /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
593 if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
594 return update_l1e(pl1e, ol1e, nl1e);
596 if ( unlikely(!get_page_from_l1e(nl1e)) )
597 return 0;
599 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
600 {
601 put_page_from_l1e(nl1e);
602 return 0;
603 }
605 put_page_from_l1e(ol1e);
606 return 1;
607 }
609 if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
610 return 0;
612 put_page_from_l1e(ol1e);
613 return 1;
614 }
617 int alloc_page_type(struct pfn_info *page, unsigned int type)
618 {
619 if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change,
620 &page->count_and_flags)) )
621 {
622 struct domain *p = page->u.domain;
623 if ( unlikely(NEED_FLUSH(tlbflush_time[p->processor],
624 page->tlbflush_timestamp)) )
625 {
626 perfc_incr(need_flush_tlb_flush);
627 flush_tlb_cpu(p->processor);
628 }
629 }
631 switch ( type )
632 {
633 case PGT_l1_page_table:
634 return alloc_l1_table(page);
635 case PGT_l2_page_table:
636 return alloc_l2_table(page);
637 case PGT_gdt_page:
638 case PGT_ldt_page:
639 return alloc_segdesc_page(page);
640 default:
641 BUG();
642 }
644 return 0;
645 }
648 void free_page_type(struct pfn_info *page, unsigned int type)
649 {
650 switch ( type )
651 {
652 case PGT_l1_page_table:
653 free_l1_table(page);
654 if ( unlikely(current->mm.shadow_mode) &&
655 (get_shadow_status(&current->mm,
656 page-frame_table) & PSH_shadowed) )
657 {
658 /*
659 * Using 'current->mm' is safe and correct because page-table pages
660 * are not shared across domains. Updates to such pages' types are
661 * thus only done within the context of the owning domain. The one
662 * exception is when destroying a domain; however, this is not a
663 * problem as the currently-executing domain will not have this MFN
664 * shadowed, and at domain end-of-day we explicitly unshadow
665 * everything so that nothing will get left lying around.
666 */
667 unshadow_table( page-frame_table, type );
668 put_shadow_status(&current->mm);
669 }
670 break;
672 case PGT_l2_page_table:
673 free_l2_table(page);
674 if ( unlikely(current->mm.shadow_mode) &&
675 (get_shadow_status(&current->mm,
676 page-frame_table) & PSH_shadowed) )
677 {
678 unshadow_table( page-frame_table, type );
679 put_shadow_status(&current->mm);
680 }
681 break;
683 default:
684 BUG();
685 }
686 }
689 static int do_extended_command(unsigned long ptr, unsigned long val)
690 {
691 int okay = 1, cpu = smp_processor_id();
692 unsigned int cmd = val & MMUEXT_CMD_MASK;
693 unsigned long pfn = ptr >> PAGE_SHIFT;
694 unsigned long old_base_pfn;
695 struct pfn_info *page = &frame_table[pfn];
696 struct domain *d = current, *nd, *e;
697 u32 x, y;
698 domid_t domid;
700 switch ( cmd )
701 {
702 case MMUEXT_PIN_L1_TABLE:
703 case MMUEXT_PIN_L2_TABLE:
704 okay = get_page_and_type_from_pagenr(
705 pfn,
706 (cmd==MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : PGT_l1_page_table,
707 PTS);
708 if ( unlikely(!okay) )
709 {
710 MEM_LOG("Error while pinning pfn %08lx", pfn);
711 put_page(page);
712 break;
713 }
715 if ( unlikely(test_and_set_bit(_PGC_guest_pinned,
716 &page->count_and_flags)) )
717 {
718 MEM_LOG("Pfn %08lx already pinned", pfn);
719 put_page_and_type(page);
720 okay = 0;
721 break;
722 }
724 break;
726 case MMUEXT_UNPIN_TABLE:
727 if ( unlikely(!(okay = get_page_from_pagenr(pfn, PTS))) )
728 {
729 MEM_LOG("Page %08lx bad domain (dom=%p)",
730 ptr, page->u.domain);
731 }
732 else if ( likely(test_and_clear_bit(_PGC_guest_pinned,
733 &page->count_and_flags)) )
734 {
735 put_page_and_type(page);
736 put_page(page);
737 }
738 else
739 {
740 okay = 0;
741 put_page(page);
742 MEM_LOG("Pfn %08lx not pinned", pfn);
743 }
744 break;
746 case MMUEXT_NEW_BASEPTR:
747 okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
748 if ( likely(okay) )
749 {
750 invalidate_shadow_ldt(d);
752 percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
753 old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
754 d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
756 shadow_mk_pagetable(&d->mm);
758 write_ptbase(&d->mm);
760 put_page_and_type(&frame_table[old_base_pfn]);
762 /*
763 * Note that we tick the clock /after/ dropping the old base's
764 * reference count. If the page tables got freed then this will
765 * avoid unnecessary TLB flushes when the pages are reused.
766 */
767 tlb_clocktick();
768 }
769 else
770 {
771 MEM_LOG("Error while installing new baseptr %08lx", ptr);
772 }
773 break;
775 case MMUEXT_TLB_FLUSH:
776 percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
777 break;
779 case MMUEXT_INVLPG:
780 __flush_tlb_one(ptr);
781 break;
783 case MMUEXT_SET_LDT:
784 {
785 unsigned long ents = val >> MMUEXT_CMD_SHIFT;
786 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
787 (ents > 8192) ||
788 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
789 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
790 {
791 okay = 0;
792 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
793 }
794 else if ( (d->mm.ldt_ents != ents) ||
795 (d->mm.ldt_base != ptr) )
796 {
797 invalidate_shadow_ldt(d);
798 d->mm.ldt_base = ptr;
799 d->mm.ldt_ents = ents;
800 load_LDT(d);
801 percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
802 if ( ents != 0 )
803 percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
804 }
805 break;
806 }
808 case MMUEXT_SET_SUBJECTDOM:
809 domid = ((domid_t)((ptr&~0xFFFF)|(val>>16)));
811 if ( !IS_PRIV(d) )
812 {
813 MEM_LOG("Dom %u has no privilege to set subject domain",
814 d->domain);
815 okay = 0;
816 }
817 else
818 {
819 if ( percpu_info[cpu].gps != NULL )
820 put_domain(percpu_info[cpu].gps);
821 percpu_info[cpu].gps = find_domain_by_id(domid);
822 percpu_info[cpu].pts = (val & SET_PAGETABLE_SUBJECTDOM) ?
823 percpu_info[cpu].gps : NULL;
824 if ( percpu_info[cpu].gps == NULL )
825 {
826 MEM_LOG("Unknown domain '%u'", domid);
827 okay = 0;
828 }
829 }
830 break;
832 case MMUEXT_REASSIGN_PAGE:
833 if ( unlikely(!IS_PRIV(d)) )
834 {
835 MEM_LOG("Dom %u has no reassignment priv", d->domain);
836 okay = 0;
837 break;
838 }
840 if ( unlikely((e = percpu_info[cpu].gps) == NULL) )
841 {
842 MEM_LOG("No GPS to reassign pfn %08lx to", pfn);
843 okay = 0;
844 break;
845 }
847 /*
848 * Grab both page_list locks, in order. This prevents the page from
849 * disappearing elsewhere while we modify the owner, and we'll need
850 * both locks if we're successful so that we can change lists.
851 */
852 if ( d < e )
853 {
854 spin_lock(&d->page_alloc_lock);
855 spin_lock(&e->page_alloc_lock);
856 }
857 else
858 {
859 spin_lock(&e->page_alloc_lock);
860 spin_lock(&d->page_alloc_lock);
861 }
863 /* A domain shouldn't have PGC_allocated pages when it is dying. */
864 if ( unlikely(test_bit(DF_DYING, &e->flags)) ||
865 unlikely(IS_XEN_HEAP_FRAME(page)) )
866 {
867 MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
868 okay = 0;
869 goto reassign_fail;
870 }
872 /*
873 * The tricky bit: atomically change owner while there is just one
874 * benign reference to the page (PGC_allocated). If that reference
875 * disappears then the deallocation routine will safely spin.
876 */
877 nd = page->u.domain;
878 y = page->count_and_flags;
879 do {
880 x = y;
881 if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
882 (1|PGC_allocated)) ||
883 unlikely(nd != d) )
884 {
885 MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
886 " caf=%08x, taf=%08x\n", page_to_pfn(page),
887 d, d->domain, nd, x, page->type_and_flags);
888 okay = 0;
889 goto reassign_fail;
890 }
891 __asm__ __volatile__(
892 LOCK_PREFIX "cmpxchg8b %3"
893 : "=a" (nd), "=d" (y), "=b" (e),
894 "=m" (*(volatile u64 *)(&page->u.domain))
895 : "0" (d), "1" (x), "b" (e), "c" (x) );
896 }
897 while ( unlikely(nd != d) || unlikely(y != x) );
899 /*
900 * Unlink from 'd'. We transferred at least one reference to 'e', so
901 * no one else is spinning to try to delete this page from 'd'.
902 */
903 d->tot_pages--;
904 list_del(&page->list);
906 /*
907 * Add the page to 'e'. Someone may already have removed the last
908 * reference and want to remove the page from 'e'. However, we have
909 * the lock so they'll spin waiting for us.
910 */
911 if ( unlikely(e->tot_pages++ == 0) )
912 get_domain(e);
913 list_add_tail(&page->list, &e->page_list);
915 reassign_fail:
916 spin_unlock(&d->page_alloc_lock);
917 spin_unlock(&e->page_alloc_lock);
918 break;
920 case MMUEXT_RESET_SUBJECTDOM:
921 if ( percpu_info[cpu].gps != NULL )
922 put_domain(percpu_info[cpu].gps);
923 percpu_info[cpu].gps = percpu_info[cpu].pts = NULL;
924 break;
926 default:
927 MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
928 okay = 0;
929 break;
930 }
932 return okay;
933 }
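/*
 * Illustrative sketch (not part of the original source): how two of the
 * extended commands handled above are encoded by a guest. The page-aligned
 * LDT base goes in 'ptr' with the entry count packed above MMUEXT_CMD_SHIFT,
 * and a new base pointer names the L2 frame in 'ptr'. HYPERVISOR_mmu_update(),
 * ldt_va, ents and l2_mfn are placeholder names.
 */
#if 0 /* example only -- guest-side pseudo-code */
static void example_extended_commands(unsigned long ldt_va,
                                      unsigned long ents,
                                      unsigned long l2_mfn)
{
    mmu_update_t req[2];
    int success;

    /* MMUEXT_SET_LDT: 'ptr' = page-aligned LDT base, ents packed into 'val'. */
    req[0].ptr = ldt_va | MMU_EXTENDED_COMMAND;
    req[0].val = MMUEXT_SET_LDT | (ents << MMUEXT_CMD_SHIFT);

    /* MMUEXT_NEW_BASEPTR: switch to a new L2 table; the handler validates
     * (and type-counts) the frame before installing it. */
    req[1].ptr = (l2_mfn << PAGE_SHIFT) | MMU_EXTENDED_COMMAND;
    req[1].val = MMUEXT_NEW_BASEPTR;

    (void)HYPERVISOR_mmu_update(req, 2, &success);
}
#endif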
936 int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
937 {
938 mmu_update_t req;
939 unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
940 struct pfn_info *page;
941 int rc = 0, okay = 1, i, cpu = smp_processor_id();
942 unsigned int cmd;
943 unsigned long prev_spfn = 0;
944 l1_pgentry_t *prev_spl1e = 0;
946 perfc_incrc(calls_to_mmu_update);
947 perfc_addc(num_page_updates, count);
949 for ( i = 0; i < count; i++ )
950 {
951 if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
952 {
953 MEM_LOG("Bad copy_from_user");
954 rc = -EFAULT;
955 break;
956 }
958 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
959 pfn = req.ptr >> PAGE_SHIFT;
961 okay = 0;
963 switch ( cmd )
964 {
965 /*
966 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
967 */
968 case MMU_NORMAL_PT_UPDATE:
969 if ( unlikely(!get_page_from_pagenr(pfn, PTS)) )
970 {
971 MEM_LOG("Could not get page for normal update");
972 break;
973 }
975 if ( likely(prev_pfn == pfn) )
976 {
977 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
978 }
979 else
980 {
981 if ( prev_pfn != 0 )
982 unmap_domain_mem((void *)va);
983 va = (unsigned long)map_domain_mem(req.ptr);
984 prev_pfn = pfn;
985 }
987 page = &frame_table[pfn];
988 switch ( (page->type_and_flags & PGT_type_mask) )
989 {
990 case PGT_l1_page_table:
991 if ( likely(get_page_type(page, PGT_l1_page_table)) )
992 {
993 okay = mod_l1_entry((l1_pgentry_t *)va,
994 mk_l1_pgentry(req.val));
996 if ( okay && unlikely(current->mm.shadow_mode) &&
997 (get_shadow_status(&current->mm, page-frame_table) &
998 PSH_shadowed) )
999 {
1000 shadow_l1_normal_pt_update( req.ptr, req.val,
1001 &prev_spfn, &prev_spl1e );
1002 put_shadow_status(&current->mm);
1003 }
1005 put_page_type(page);
1006 }
1007 break;
1008 case PGT_l2_page_table:
1009 if ( likely(get_page_type(page, PGT_l2_page_table)) )
1010 {
1011 okay = mod_l2_entry((l2_pgentry_t *)va,
1012 mk_l2_pgentry(req.val),
1013 pfn);
1015 if ( okay && unlikely(current->mm.shadow_mode) &&
1016 (get_shadow_status(&current->mm, page-frame_table) &
1017 PSH_shadowed) )
1018 {
1019 shadow_l2_normal_pt_update( req.ptr, req.val );
1020 put_shadow_status(&current->mm);
1021 }
1023 put_page_type(page);
1024 }
1025 break;
1026 default:
1027 if ( likely(get_page_type(page, PGT_writeable_page)) )
1028 {
1029 *(unsigned long *)va = req.val;
1030 okay = 1;
1031 put_page_type(page);
1032 }
1033 break;
1034 }
1036 put_page(page);
1038 break;
1040 case MMU_MACHPHYS_UPDATE:
1041 if ( unlikely(!get_page_from_pagenr(pfn, GPS)) )
1042 {
1043 MEM_LOG("Could not get page for mach->phys update");
1044 break;
1045 }
1047 machine_to_phys_mapping[pfn] = req.val;
1048 okay = 1;
1050 /*
1051 * If in log-dirty mode, mark the corresponding pseudo-physical
1052 * page as dirty.
1053 */
1054 if ( unlikely(current->mm.shadow_mode == SHM_logdirty) )
1055 mark_dirty(&current->mm, pfn);
1057 put_page(&frame_table[pfn]);
1058 break;
1060 /*
1061 * MMU_EXTENDED_COMMAND: Extended command is specified
1062 * in the least-significant bits of the 'value' field.
1063 */
1064 case MMU_EXTENDED_COMMAND:
1065 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
1066 okay = do_extended_command(req.ptr, req.val);
1067 break;
1069 default:
1070 MEM_LOG("Invalid page update command %08lx", req.ptr);
1071 break;
1072 }
1074 if ( unlikely(!okay) )
1075 {
1076 rc = -EINVAL;
1077 break;
1078 }
1080 ureqs++;
1081 }
1083 if ( prev_pfn != 0 )
1084 unmap_domain_mem((void *)va);
1086 if( prev_spl1e != 0 )
1087 unmap_domain_mem((void *)prev_spl1e);
1089 deferred_ops = percpu_info[cpu].deferred_ops;
1090 percpu_info[cpu].deferred_ops = 0;
1092 if ( deferred_ops & DOP_FLUSH_TLB )
1093 local_flush_tlb();
1095 if ( deferred_ops & DOP_RELOAD_LDT )
1096 (void)map_ldt_shadow_page(0);
1098 if ( unlikely(percpu_info[cpu].gps != NULL) )
1099 {
1100 put_domain(percpu_info[cpu].gps);
1101 percpu_info[cpu].gps = percpu_info[cpu].pts = NULL;
1102 }
1104 if ( unlikely(success_count != NULL) )
1105 put_user(count, success_count);
1107 return rc;
1108 }
1111 int do_update_va_mapping(unsigned long page_nr,
1112 unsigned long val,
1113 unsigned long flags)
1114 {
1115 struct domain *p = current;
1116 int err = 0;
1117 unsigned int cpu = p->processor;
1118 unsigned long deferred_ops;
1120 perfc_incrc(calls_to_update_va);
1122 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
1123 return -EINVAL;
1125 /*
1126 * XXX When we make this support 4MB superpages we should also deal with
1127 * the case of updating L2 entries.
1128 */
1130 if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
1131 mk_l1_pgentry(val))) )
1132 err = -EINVAL;
1134 if ( unlikely(p->mm.shadow_mode) )
1135 {
1136 unsigned long sval;
1138 l1pte_no_fault( &current->mm, &val, &sval );
1140 if ( unlikely(__put_user(sval, ((unsigned long *)(
1141 &shadow_linear_pg_table[page_nr])))) )
1142 {
1143 /*
1144 * Since L2's are guaranteed RW, failure indicates the page was not
1145 * shadowed, so ignore.
1146 */
1147 perfc_incrc(shadow_update_va_fail);
1148 }
1150 /*
1151 * If we're in log-dirty mode then we need to note that we've updated
1152 * the PTE in the PT-holding page. We need the machine frame number
1153 * for this.
1154 */
1155 if ( p->mm.shadow_mode == SHM_logdirty )
1156 mark_dirty( &current->mm, va_to_l1mfn(page_nr<<PAGE_SHIFT) );
1158 check_pagetable( p, p->mm.pagetable, "va" ); /* debug */
1159 }
1161 deferred_ops = percpu_info[cpu].deferred_ops;
1162 percpu_info[cpu].deferred_ops = 0;
1164 if ( unlikely(deferred_ops & DOP_FLUSH_TLB) ||
1165 unlikely(flags & UVMF_FLUSH_TLB) )
1166 local_flush_tlb();
1167 else if ( unlikely(flags & UVMF_INVLPG) )
1168 __flush_tlb_one(page_nr << PAGE_SHIFT);
1170 if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
1171 (void)map_ldt_shadow_page(0);
1173 return err;
1174 }
1176 int do_update_va_mapping_otherdomain(unsigned long page_nr,
1177 unsigned long val,
1178 unsigned long flags,
1179 domid_t domid)
1180 {
1181 unsigned int cpu = smp_processor_id();
1182 struct domain *d;
1183 int rc;
1185 if ( unlikely(!IS_PRIV(current)) )
1186 return -EPERM;
1188 percpu_info[cpu].gps = d = find_domain_by_id(domid);
1189 if ( unlikely(d == NULL) )
1190 {
1191 MEM_LOG("Unknown domain '%u'", domid);
1192 return -ESRCH;
1193 }
1195 rc = do_update_va_mapping(page_nr, val, flags);
1197 put_domain(d);
1198 percpu_info[cpu].gps = NULL;
1200 return rc;
1201 }