
xen/common/memory.c @ 847:39d0706234b7

bitkeeper revision 1.526 (3f8f4139NGtfXYTOvQIloULmYg2ktg)

Add a couple of software performance counters to the page table update routines.
author iap10@labyrinth.cl.cam.ac.uk
date Fri Oct 17 01:09:13 2003 +0000 (2003-10-17)
parents a9b037118083
children 21376a12e008
1 /******************************************************************************
2 * memory.c
3 *
4 * Copyright (c) 2002 K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * A description of the page table API:
23 *
24 * Domains trap to do_process_page_updates with a list of update requests.
25 * This is a list of (ptr, val) pairs, where the requested operation
26 * is *ptr = val.
27 *
28 * Reference counting of pages:
29 * ----------------------------
30 * Each page has two refcounts: tot_count and type_count.
31 *
32 * TOT_COUNT is the obvious reference count. It counts all uses of a
33 * physical page frame by a domain, including uses as a page directory,
34 * a page table, or simple mappings via a PTE. This count prevents a
35 * domain from releasing a frame back to the hypervisor's free pool when
36 * it is still referencing it!
37 *
38 * TYPE_COUNT is more subtle. A frame can be put to one of three
39 * mutually-exclusive uses: it might be used as a page directory, or a
40 * page table, or it may be mapped writeable by the domain [of course, a
41 * frame need not be used in any of these three ways!].
42 * So, type_count is a count of the number of times a frame is being
43 * referred to in its current incarnation. Therefore, a page can only
44 * change its type when its type count is zero.
45 *
46 * Pinning the page type:
47 * ----------------------
48 * The type of a page can be pinned with the PGEXT_PIN_L?_TABLE commands
49 * and unpinned with PGEXT_UNPIN_TABLE. Each page can be pinned exactly once
50 * (that is, pinning is not reference counted, so it cannot be nested).
51 * This is useful to prevent a page's type count from falling to zero, at
52 * which point safety checks would need to be carried out next time the
53 * count is increased again.
54 *
55 * A further note on writeable page mappings:
56 * ------------------------------------------
57 * For simplicity, the count of writeable mappings for a page may not
58 * correspond to reality. The 'writeable count' is incremented for every
59 * PTE which maps the page with the _PAGE_RW flag set. However, for
60 * write access to be possible the page directory entry must also have
61 * its _PAGE_RW bit set. We do not check this as it complicates the
62 * reference counting considerably [consider the case of multiple
63 * directory entries referencing a single page table, some with the RW
64 * bit set, others not -- it starts getting a bit messy].
65 * In normal use, this simplification shouldn't be a problem.
66 * However, the logic can be added if required.
67 *
68 * One more note on read-only page mappings:
69 * -----------------------------------------
70 * We want domains to be able to map pages for read-only access. The
71 * main reason is that page tables and directories should be readable
72 * by a domain, but it would not be safe for them to be writeable.
73 * However, domains have free access to rings 1 & 2 of the Intel
74 * privilege model. In terms of page protection, these are considered
75 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
76 * read-only restrictions are respected in supervisor mode -- if the
77 * bit is clear then any mapped page is writeable.
78 *
79 * We get round this by always setting the WP bit and disallowing
80 * updates to it. This is very unlikely to cause a problem for guest
81 * OSes, which will generally use the WP bit to simplify their copy-on-write
82 * implementation (in that case, the OS wants a fault when it writes to
83 * an application-supplied buffer).
84 */
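/*
 * Illustrative sketch only: how a batch of the (ptr, val) requests described
 * above might be assembled by a guest. The 'ptr' field holds the address of
 * the entry to update (resolved through linear_pg_table by the handler
 * below), tagged in its low bits with the operation type, and 'val' is the
 * value to install. The specific PGREQ_ and PGEXT_ constant values and the
 * hypercall wrapper queue_pt_updates() are assumptions for illustration only.
 */
#if 0 /* guest-side pseudocode; never compiled into the hypervisor */
static void guest_update_pte(l1_pgentry_t *pte, unsigned long new_val)
{
    page_update_request_t req[2];

    /* Checked update: *pte = new_val, subject to the type/refcnt rules. */
    req[0].ptr = (unsigned long)pte | PGREQ_NORMAL_UPDATE;
    req[0].val = new_val;

    /* Extended command: flush this CPU's TLB once the update is done. */
    req[1].ptr = PGREQ_EXTENDED_COMMAND;   /* no address needed for a flush */
    req[1].val = PGEXT_TLB_FLUSH;

    queue_pt_updates(req, 2);   /* hypothetical guest->Xen trap wrapper */
}
#endif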
87 /*
88 * THE FOLLOWING ARE ISSUES IF GUEST OPERATING SYSTEMS BECOME SMP-CAPABLE.
89 * -----------------------------------------------------------------------
90 *
91 * *********
92 * UPDATE 15/7/02: The interface has changed -- updates now specify the
93 * physical address of the page-table entry, rather than a virtual address,
94 * so the hypervisor no longer "walks" the page tables. Therefore the
95 * solution below cannot work. Another possibility is to add a new entry
96 * to our "struct page" which says to which top-level page table each
97 * lower-level page table or writeable mapping belongs. If it belongs to more
98 * than one, we'd probably just flush on all processors running the domain.
99 * *********
100 *
101 * The problem involves creating new page tables which might be mapped
102 * writeable in the TLB of another processor. As an example, a domain might be
103 * running in two contexts (i.e. on two processors) simultaneously, using the
104 * same top-level page table in both contexts. Now, if context 1 sends an
105 * update request [make page P read-only, add a reference to page P as a page
106 * table], that will succeed if there was only one writeable mapping of P.
107 * However, that mapping may persist in the TLB of context 2.
108 *
109 * Solution: when installing a new page table, we must flush foreign TLBs as
110 * necessary. Naive solution is to flush on any processor running our domain.
111 * Cleverer solution is to flush on any processor running same top-level page
112 * table, but this will sometimes fail (consider two different top-level page
113 * tables which have a shared lower-level page table).
114 *
115 * A better solution: when squashing a write reference, check how many times
116 * that lowest-level table entry is referenced by ORing refcounts of tables
117 * down the page-table hierarchy. If the result is != 1, we require flushing
118 * all instances of the current domain if a new table is installed (because
119 * the lowest-level entry may be referenced by many top-level page tables).
120 * However, the common case will be that the result == 1, so we only need to
121 * flush processors with the same top-level page table. Make the choice at
122 * table-installation time based on a `flush_level' flag, which is
123 * FLUSH_NONE, FLUSH_PAGETABLE, FLUSH_DOMAIN. A flush reduces this
124 * to FLUSH_NONE, while squashed write mappings can only promote up
125 * to more aggressive flush types.
126 */
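/*
 * The flush_level scheme above is a proposal only; nothing in this file
 * implements it. A minimal sketch of the intended promotion lattice (names
 * taken from the note above, everything else assumed) might look like:
 */
#if 0
typedef enum { FLUSH_NONE = 0, FLUSH_PAGETABLE = 1, FLUSH_DOMAIN = 2 } flush_level_t;

static flush_level_t pending_flush = FLUSH_NONE;

/* Squashing a write mapping may only make the pending flush more aggressive. */
static void promote_flush(flush_level_t required)
{
    if ( required > pending_flush )
        pending_flush = required;
}

/* Performing any flush discharges whatever was pending. */
static void flush_done(void)
{
    pending_flush = FLUSH_NONE;
}
#endif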
128 #include <xeno/config.h>
129 #include <xeno/init.h>
130 #include <xeno/lib.h>
131 #include <xeno/mm.h>
132 #include <xeno/sched.h>
133 #include <xeno/errno.h>
134 #include <xeno/perfc.h>
135 #include <asm/page.h>
136 #include <asm/flushtlb.h>
137 #include <asm/io.h>
138 #include <asm/uaccess.h>
139 #include <asm/domain_page.h>
141 #if 0
142 #define MEM_LOG(_f, _a...) printk("DOM%d: (file=memory.c, line=%d) " _f "\n", current->domain, __LINE__, ## _a )
143 #else
144 #define MEM_LOG(_f, _a...) ((void)0)
145 #endif
147 /* Domain 0 is allowed to submit requests on behalf of others. */
148 #define DOMAIN_OKAY(_f) \
149 ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
151 /* 'get' checks parameter for validity before inc'ing refcnt. */
152 static int get_l2_table(unsigned long page_nr);
153 static int get_l1_table(unsigned long page_nr);
154 static int get_page(unsigned long page_nr, int writeable);
155 static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
156 /* 'put' does no checking: if the refcnt is non-zero, the entity must be valid. */
157 static void put_l2_table(unsigned long page_nr);
158 static void put_l1_table(unsigned long page_nr);
159 static void put_page(unsigned long page_nr, int writeable);
160 static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
162 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t);
163 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
165 /* The frame table, its size in bytes, and the number of machine page frames. */
166 frame_table_t * frame_table;
167 unsigned long frame_table_size;
168 unsigned long max_page;
170 struct list_head free_list;
171 spinlock_t free_list_lock = SPIN_LOCK_UNLOCKED;
172 unsigned int free_pfns;
174 /* Used to defer flushing of memory structures. */
175 static struct {
176 int flush_tlb;
177 int refresh_ldt;
178 } deferred_op[NR_CPUS] __cacheline_aligned;
180 /*
181 * init_frametable:
182 * Initialise per-frame memory information. This goes directly after
183 * MAX_MONITOR_ADDRESS in physical memory.
184 */
185 void __init init_frametable(unsigned long nr_pages)
186 {
187 struct pfn_info *pf;
188 unsigned long page_index;
189 unsigned long flags;
191 memset(deferred_op, 0, sizeof(deferred_op));
193 max_page = nr_pages;
194 frame_table_size = nr_pages * sizeof(struct pfn_info);
195 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
196 frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
197 memset(frame_table, 0, frame_table_size);
199 free_pfns = 0;
201 /* Put all domain-allocatable memory on a free list. */
202 spin_lock_irqsave(&free_list_lock, flags);
203 INIT_LIST_HEAD(&free_list);
204 for( page_index = (__pa(frame_table) + frame_table_size) >> PAGE_SHIFT;
205 page_index < nr_pages;
206 page_index++ )
207 {
208 pf = list_entry(&frame_table[page_index].list, struct pfn_info, list);
209 list_add_tail(&pf->list, &free_list);
210 free_pfns++;
211 }
212 spin_unlock_irqrestore(&free_list_lock, flags);
213 }
216 static void __invalidate_shadow_ldt(struct task_struct *p)
217 {
218 int i, cpu = p->processor;
219 unsigned long pfn;
220 struct pfn_info *page;
222 p->mm.shadow_ldt_mapcnt = 0;
224 for ( i = 16; i < 32; i++ )
225 {
226 pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]);
227 if ( pfn == 0 ) continue;
228 p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
229 page = frame_table + pfn;
230 ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
231 ASSERT((page->flags & PG_domain_mask) == p->domain);
232 ASSERT((page->type_count != 0) && (page->tot_count != 0));
233 put_page_type(page);
234 put_page_tot(page);
235 }
237 /* Dispose of the (now possibly invalid) mappings from the TLB. */
238 deferred_op[cpu].flush_tlb = 1;
239 deferred_op[cpu].refresh_ldt = 1;
240 }
243 static inline void invalidate_shadow_ldt(void)
244 {
245 struct task_struct *p = current;
246 if ( p->mm.shadow_ldt_mapcnt != 0 )
247 __invalidate_shadow_ldt(p);
248 }
251 /* Map shadow page at offset @off. Returns 0 on success. */
252 int map_ldt_shadow_page(unsigned int off)
253 {
254 struct task_struct *p = current;
255 unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
256 unsigned long l1e, *ldt_page, flags;
257 struct pfn_info *page;
258 int i, ret = -1;
260 spin_lock_irqsave(&p->page_lock, flags);
262 __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
263 if ( unlikely(!(l1e & _PAGE_PRESENT)) )
264 goto out;
266 page = frame_table + (l1e >> PAGE_SHIFT);
267 if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) )
268 {
269 if ( unlikely(page->type_count != 0) )
270 goto out;
272 /* Check all potential LDT entries in the page. */
273 ldt_page = (unsigned long *)addr;
274 for ( i = 0; i < 512; i++ )
275 if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) )
276 goto out;
278 if ( unlikely(page->flags & PG_need_flush) )
279 {
280 perfc_incrc(need_flush_tlb_flush);
281 __write_cr3_counted(pagetable_val(p->mm.pagetable));
282 page->flags &= ~PG_need_flush;
283 }
285 page->flags &= ~PG_type_mask;
286 page->flags |= PGT_ldt_page;
287 }
289 /* Success! */
290 get_page_type(page);
291 get_page_tot(page);
292 p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW);
293 p->mm.shadow_ldt_mapcnt++;
295 ret = 0;
297 out:
298 spin_unlock_irqrestore(&p->page_lock, flags);
299 return ret;
300 }
303 /* Return original refcnt, or -1 on error. */
304 static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
305 {
306 struct pfn_info *page;
307 unsigned long flags;
309 if ( unlikely(page_nr >= max_page) )
310 {
311 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
312 return -1;
313 }
314 page = frame_table + page_nr;
315 flags = page->flags;
316 if ( unlikely(!DOMAIN_OKAY(flags)) )
317 {
318 MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
319 return -1;
320 }
321 if ( (flags & PG_type_mask) != type )
322 {
323 if ( page_type_count(page) != 0 )
324 {
325 MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
326 page_nr << PAGE_SHIFT,
327 flags & PG_type_mask, type, page_type_count(page));
328 return -1;
329 }
331 if ( unlikely(flags & PG_need_flush) )
332 {
333 deferred_op[smp_processor_id()].flush_tlb = 1;
334 page->flags &= ~PG_need_flush;
335 perfc_incrc(need_flush_tlb_flush);
336 }
338 page->flags &= ~PG_type_mask;
339 page->flags |= type;
340 }
342 get_page_tot(page);
343 return get_page_type(page);
344 }
347 /* Return new refcnt, or -1 on error. */
348 static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
349 {
350 struct pfn_info *page;
352 if ( unlikely(page_nr >= max_page) )
353 {
354 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
355 return -1;
356 }
357 page = frame_table + page_nr;
358 if ( unlikely(!DOMAIN_OKAY(page->flags)) ||
359 unlikely(((page->flags & PG_type_mask) != type)) )
360 {
361 MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
362 page->flags & PG_domain_mask, page->flags & PG_type_mask,
363 type);
364 return -1;
365 }
366 ASSERT(page_type_count(page) != 0);
367 put_page_tot(page);
368 return put_page_type(page);
369 }
372 /* We allow a L2 table to map itself, to achieve a linear pagetable. */
373 /* NB. There's no need for a put_twisted_l2_table() function!! */
374 static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
375 {
376 unsigned long l2v = l2_pgentry_val(l2e);
378 /* Clearly the mapping must be read-only :-) */
379 if ( (l2v & _PAGE_RW) )
380 {
381 MEM_LOG("Attempt to install twisted L2 entry with write permissions");
382 return -1;
383 }
385 /* This is a sufficient final check. */
386 if ( (l2v >> PAGE_SHIFT) != entry_pfn )
387 {
388 MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
389 return -1;
390 }
392 /* We don't bump the reference counts. */
393 return 0;
394 }
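/*
 * Why the self-map is worth having: once an L2 slot points back at the L2
 * page itself (see the LINEAR_PT_VIRT_START entry installed by get_l2_table()
 * below), every PTE becomes addressable as ordinary memory through
 * linear_pg_table, exactly as map_ldt_shadow_page() and
 * do_process_page_updates() do. A minimal sketch of that lookup:
 */
#if 0
static unsigned long read_pte_of_va(unsigned long va)
{
    unsigned long pte = 0;
    /* May fault if the covering PDE is absent, hence __get_user(). */
    (void)__get_user(pte, (unsigned long *)(linear_pg_table + (va >> PAGE_SHIFT)));
    return pte;
}
#endif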
397 static int get_l2_table(unsigned long page_nr)
398 {
399 l2_pgentry_t *p_l2_entry, l2_entry;
400 int i, ret=0;
402 ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
403 if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
405 /* NEW level-2 page table! Deal with every PDE in the table. */
406 p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
407 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
408 {
409 l2_entry = *p_l2_entry++;
410 if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
411 if ( (l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)) )
412 {
413 MEM_LOG("Bad L2 page type settings %04lx",
414 l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
415 ret = -1;
416 goto out;
417 }
418 /* Assume we're mapping an L1 table, falling back to twisted L2. */
419 ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
420 if ( ret ) ret = get_twisted_l2_table(page_nr, l2_entry);
421 if ( ret ) goto out;
422 }
424 /* Now we simply slap in our high mapping. */
425 memcpy(p_l2_entry,
426 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
427 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
428 p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
429 DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
430 mk_l2_pgentry(__pa(current->mm.perdomain_pt) | __PAGE_HYPERVISOR);
431 p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
432 DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
433 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
435 out:
436 unmap_domain_mem(p_l2_entry);
437 return ret;
438 }
441 static int get_l1_table(unsigned long page_nr)
442 {
443 l1_pgentry_t *p_l1_entry, l1_entry;
444 int i, ret;
446 /* Update ref count for page pointed at by PDE. */
447 ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
448 if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
450 /* NEW level-1 page table! Deal with every PTE in the table. */
451 p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
452 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
453 {
454 l1_entry = *p_l1_entry++;
455 if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
456 if ( (l1_pgentry_val(l1_entry) &
457 (_PAGE_GLOBAL|_PAGE_PAT)) )
458 {
459 MEM_LOG("Bad L1 page type settings %04lx",
460 l1_pgentry_val(l1_entry) &
461 (_PAGE_GLOBAL|_PAGE_PAT));
462 ret = -1;
463 goto out;
464 }
465 ret = get_page(l1_pgentry_to_pagenr(l1_entry),
466 l1_pgentry_val(l1_entry) & _PAGE_RW);
467 if ( ret ) goto out;
468 }
470 out:
471 /* Make sure we unmap the right page! */
472 unmap_domain_mem(p_l1_entry-1);
473 return ret;
474 }
477 static int get_page(unsigned long page_nr, int writeable)
478 {
479 struct pfn_info *page;
480 unsigned long flags;
482 /* Update ref count for page pointed at by PTE. */
483 if ( unlikely(page_nr >= max_page) )
484 {
485 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
486 return(-1);
487 }
488 page = frame_table + page_nr;
489 flags = page->flags;
490 if ( unlikely(!DOMAIN_OKAY(flags)) )
491 {
492 MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
493 return(-1);
494 }
496 if ( writeable )
497 {
498 if ( (flags & PG_type_mask) != PGT_writeable_page )
499 {
500 if ( page_type_count(page) != 0 )
501 {
502 MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
503 flags & PG_type_mask, PGT_writeable_page,
504 page_type_count(page));
505 return(-1);
506 }
507 page->flags &= ~PG_type_mask;
508 page->flags |= PGT_writeable_page;
509 }
510 page->flags |= PG_need_flush;
511 get_page_type(page);
512 }
514 get_page_tot(page);
516 return(0);
517 }
520 static void put_l2_table(unsigned long page_nr)
521 {
522 l2_pgentry_t *p_l2_entry, l2_entry;
523 int i;
525 if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return;
527 /* We held the last reference to the level-2 page table; release the references held by its PDEs. */
528 p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
529 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
530 {
531 l2_entry = *p_l2_entry++;
532 if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
533 put_l1_table(l2_pgentry_to_pagenr(l2_entry));
534 }
536 unmap_domain_mem(p_l2_entry);
537 }
540 static void put_l1_table(unsigned long page_nr)
541 {
542 l1_pgentry_t *p_l1_entry, l1_entry;
543 int i;
545 if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return;
547 /* We held the last reference to the level-1 page table; release the references held by its PTEs. */
548 p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
549 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
550 {
551 l1_entry = *p_l1_entry++;
552 if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) )
553 {
554 put_page(l1_pgentry_to_pagenr(l1_entry),
555 l1_pgentry_val(l1_entry) & _PAGE_RW);
556 }
557 }
559 /* Make sure we unmap the right page! */
560 unmap_domain_mem(p_l1_entry-1);
561 }
564 static void put_page(unsigned long page_nr, int writeable)
565 {
566 struct pfn_info *page;
567 ASSERT(page_nr < max_page);
568 page = frame_table + page_nr;
569 ASSERT(DOMAIN_OKAY(page->flags));
570 ASSERT((!writeable) ||
571 ((page_type_count(page) != 0) &&
572 ((page->flags & PG_type_mask) == PGT_writeable_page) &&
573 ((page->flags & PG_need_flush) == PG_need_flush)));
574 if ( writeable )
575 {
576 put_page_type(page);
577 }
578 else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
579 (page_type_count(page) != 0)) )
580 {
581 /* We expect this to be rare, so we just blow away the entire shadow LDT. */
582 invalidate_shadow_ldt();
583 }
584 put_page_tot(page);
585 }
588 static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry)
589 {
590 l2_pgentry_t old_l2_entry = *p_l2_entry;
592 if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
593 DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
594 {
595 MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
596 p_l2_entry);
597 goto fail;
598 }
600 if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
601 {
602 if ( unlikely((l2_pgentry_val(new_l2_entry) &
603 (_PAGE_GLOBAL|_PAGE_PSE))) )
604 {
605 MEM_LOG("Bad L2 entry val %04lx",
606 l2_pgentry_val(new_l2_entry) &
607 (_PAGE_GLOBAL|_PAGE_PSE));
608 goto fail;
609 }
610 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
611 if ( ((l2_pgentry_val(old_l2_entry) ^
612 l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
613 {
614 if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
615 put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
617 /* Assume we're mapping an L1 table, falling back to twisted L2. */
618 if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) )
619 {
620 /* NB. No need to sanity-check the VA: done already. */
621 unsigned long l1e = l1_pgentry_val(
622 linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]);
623 if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) )
624 goto fail;
625 }
626 }
627 }
628 else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
629 {
630 put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
631 }
633 *p_l2_entry = new_l2_entry;
634 return 0;
636 fail:
637 return -1;
638 }
641 static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry)
642 {
643 l1_pgentry_t old_l1_entry = *p_l1_entry;
645 if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
646 {
647 if ( unlikely((l1_pgentry_val(new_l1_entry) &
648 (_PAGE_GLOBAL|_PAGE_PAT))) )
649 {
650 MEM_LOG("Bad L1 entry val %04lx",
651 l1_pgentry_val(new_l1_entry) &
652 (_PAGE_GLOBAL|_PAGE_PAT));
653 goto fail;
654 }
655 /*
656 * Differ in mapping (bits 12-31), writeable (bit 1), or
657 * presence (bit 0)?
658 */
659 if ( ((l1_pgentry_val(old_l1_entry) ^
660 l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
661 {
662 if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
663 put_page(l1_pgentry_to_pagenr(old_l1_entry),
664 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
666 if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
667 l1_pgentry_val(new_l1_entry) & _PAGE_RW) )
668 goto fail;
669 }
670 }
671 else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
672 {
673 put_page(l1_pgentry_to_pagenr(old_l1_entry),
674 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
675 }
677 *p_l1_entry = new_l1_entry;
678 return 0;
680 fail:
681 return -1;
682 }
685 static int do_extended_command(unsigned long ptr, unsigned long val)
686 {
687 int err = 0, cpu = smp_processor_id();
688 unsigned int cmd = val & PGEXT_CMD_MASK;
689 unsigned long pfn = ptr >> PAGE_SHIFT;
690 struct pfn_info *page = frame_table + pfn;
692 /* 'ptr' must reference a valid frame, except for SET_LDT where it is a virtual address rather than a machine address. */
693 if ( (pfn >= max_page) && (cmd != PGEXT_SET_LDT) )
694 return 1;
696 switch ( cmd )
697 {
698 case PGEXT_PIN_L1_TABLE:
699 err = get_l1_table(pfn);
700 goto mark_as_pinned;
701 case PGEXT_PIN_L2_TABLE:
702 err = get_l2_table(pfn);
703 mark_as_pinned:
704 if ( unlikely(err) )
705 {
706 MEM_LOG("Error while pinning pfn %08lx", pfn);
707 break;
708 }
709 put_page_type(page);
710 put_page_tot(page);
711 if ( likely(!(page->type_count & REFCNT_PIN_BIT)) )
712 {
713 page->type_count |= REFCNT_PIN_BIT;
714 page->tot_count |= REFCNT_PIN_BIT;
715 }
716 else
717 {
718 MEM_LOG("Pfn %08lx already pinned", pfn);
719 err = 1;
720 }
721 break;
723 case PGEXT_UNPIN_TABLE:
724 if ( !DOMAIN_OKAY(page->flags) )
725 {
726 err = 1;
727 MEM_LOG("Page %08lx bad domain (dom=%ld)",
728 ptr, page->flags & PG_domain_mask);
729 }
730 else if ( (page->type_count & REFCNT_PIN_BIT) )
731 {
732 page->type_count &= ~REFCNT_PIN_BIT;
733 page->tot_count &= ~REFCNT_PIN_BIT;
734 get_page_type(page);
735 get_page_tot(page);
736 ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
737 put_l1_table(pfn) : put_l2_table(pfn);
738 }
739 else
740 {
741 err = 1;
742 MEM_LOG("Pfn %08lx not pinned", pfn);
743 }
744 break;
746 case PGEXT_NEW_BASEPTR:
747 err = get_l2_table(pfn);
748 if ( !err )
749 {
750 put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
751 current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
752 invalidate_shadow_ldt();
753 deferred_op[cpu].flush_tlb = 1;
754 }
755 else
756 {
757 MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
758 }
759 break;
761 case PGEXT_TLB_FLUSH:
762 deferred_op[cpu].flush_tlb = 1;
763 break;
765 case PGEXT_INVLPG:
766 __flush_tlb_one(val & ~PGEXT_CMD_MASK);
767 break;
769 case PGEXT_SET_LDT:
770 {
771 unsigned long ents = val >> PGEXT_CMD_SHIFT;
772 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
773 (ents > 8192) ||
774 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
775 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
776 {
777 err = 1;
778 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
779 }
780 else if ( (current->mm.ldt_ents != ents) ||
781 (current->mm.ldt_base != ptr) )
782 {
783 if ( current->mm.ldt_ents != 0 )
784 invalidate_shadow_ldt();
785 current->mm.ldt_base = ptr;
786 current->mm.ldt_ents = ents;
787 load_LDT(current);
788 deferred_op[cpu].refresh_ldt = (ents != 0);
789 }
790 break;
791 }
793 default:
794 MEM_LOG("Invalid extended pt command 0x%08lx", val & PGEXT_CMD_MASK);
795 err = 1;
796 break;
797 }
799 return err;
800 }
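/*
 * Illustrative encoding of the pin/unpin commands handled above: the frame's
 * machine address travels in 'ptr' (low bits tagged as an extended request)
 * and the PGEXT_ opcode in 'val'. The exact constant values and the
 * queue_pt_updates() wrapper are assumptions for illustration only.
 */
#if 0 /* guest-side pseudocode; never compiled into the hypervisor */
static void guest_pin_unpin_l2(unsigned long l2_maddr)
{
    page_update_request_t req;

    /* Pin the frame as an L2 page table so its type count never hits zero. */
    req.ptr = l2_maddr | PGREQ_EXTENDED_COMMAND;
    req.val = PGEXT_PIN_L2_TABLE;
    queue_pt_updates(&req, 1);

    /* ... later, release the pin (pinning is not nested). */
    req.ptr = l2_maddr | PGREQ_EXTENDED_COMMAND;
    req.val = PGEXT_UNPIN_TABLE;
    queue_pt_updates(&req, 1);
}
#endif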
803 int do_process_page_updates(page_update_request_t *ureqs, int count)
804 {
805 page_update_request_t req;
806 unsigned long flags, pfn, l1e;
807 struct pfn_info *page;
808 int err = 0, i, cpu = smp_processor_id();
809 unsigned int cmd;
810 unsigned long cr0 = 0;
812 perfc_incrc( calls_to_process_page_updates );
813 perfc_addc( num_page_updates, count );
815 for ( i = 0; i < count; i++ )
816 {
818 if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
819 {
820 if ( cr0 != 0 ) write_cr0(cr0);
821 kill_domain_with_errmsg("Cannot read page update request");
822 }
824 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
825 pfn = req.ptr >> PAGE_SHIFT;
827 err = 1;
829 spin_lock_irq(&current->page_lock);
831 /* Get the page-frame number that a non-extended command references. */
832 if ( (cmd == PGREQ_NORMAL_UPDATE) || (cmd == PGREQ_UNCHECKED_UPDATE) )
833 {
834 if ( cr0 == 0 )
835 {
836 cr0 = read_cr0();
837 write_cr0(cr0 & ~X86_CR0_WP);
838 }
839 /* Need to use 'get_user' since the VA's PGD may be absent. */
840 __get_user(l1e, (unsigned long *)(linear_pg_table+pfn));
841 /* Now check that the VA's PTE isn't absent. */
842 if ( unlikely(!(l1e & _PAGE_PRESENT)) )
843 {
844 MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e);
845 goto unlock;
846 }
847 /* Finally, get the underlying machine address. */
848 pfn = l1e >> PAGE_SHIFT;
849 }
851 /* Least significant bits of 'ptr' demux the operation type. */
852 switch ( cmd )
853 {
854 /*
855 * PGREQ_NORMAL_UPDATE: Normal update to any level of page table.
856 */
857 case PGREQ_NORMAL_UPDATE:
858 page = frame_table + pfn;
859 flags = page->flags;
861 if ( likely(DOMAIN_OKAY(flags)) )
862 {
863 switch ( (flags & PG_type_mask) )
864 {
865 case PGT_l1_page_table:
866 err = mod_l1_entry((l1_pgentry_t *)req.ptr,
867 mk_l1_pgentry(req.val));
868 break;
869 case PGT_l2_page_table:
870 err = mod_l2_entry((l2_pgentry_t *)req.ptr,
871 mk_l2_pgentry(req.val));
872 break;
873 default:
874 if ( page->type_count == 0 )
875 {
876 *(unsigned long *)req.ptr = req.val;
877 err = 0;
878 }
879 else
880 MEM_LOG("Update to bad page %08lx", req.ptr);
881 break;
882 }
883 }
884 else
885 {
886 MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
887 current->domain, pfn);
888 }
889 break;
891 case PGREQ_UNCHECKED_UPDATE:
892 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
893 if ( likely(IS_PRIV(current)) )
894 {
895 *(unsigned long *)req.ptr = req.val;
896 err = 0;
897 }
898 else
899 {
900 MEM_LOG("Bad unchecked update attempt");
901 }
902 break;
904 case PGREQ_MPT_UPDATE:
905 page = frame_table + pfn;
906 if ( unlikely(pfn >= max_page) )
907 {
908 MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
909 }
910 else if ( likely(DOMAIN_OKAY(page->flags)) )
911 {
912 machine_to_phys_mapping[pfn] = req.val;
913 err = 0;
914 }
915 else
916 {
917 MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
918 current->domain, pfn);
919 }
920 break;
922 /*
923 * PGREQ_EXTENDED_COMMAND: Extended command is specified
924 * in the least-significant bits of the 'value' field.
925 */
926 case PGREQ_EXTENDED_COMMAND:
927 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
928 err = do_extended_command(req.ptr, req.val);
929 break;
931 default:
932 MEM_LOG("Invalid page update command %08lx", req.ptr);
933 break;
934 }
936 unlock:
937 spin_unlock_irq(&current->page_lock);
939 if ( unlikely(err) )
940 {
941 if ( cr0 != 0 ) write_cr0(cr0);
942 kill_domain_with_errmsg("Illegal page update request");
943 }
945 ureqs++;
946 }
948 if ( deferred_op[cpu].flush_tlb )
949 {
950 deferred_op[cpu].flush_tlb = 0;
951 __write_cr3_counted(pagetable_val(current->mm.pagetable));
952 }
954 if ( deferred_op[cpu].refresh_ldt )
955 {
956 deferred_op[cpu].refresh_ldt = 0;
957 (void)map_ldt_shadow_page(0);
958 }
960 if ( cr0 != 0 )
961 write_cr0(cr0);
963 return 0;
964 }
967 int do_update_va_mapping(unsigned long page_nr,
968 unsigned long val,
969 unsigned long flags)
970 {
971 unsigned long _x, cr0 = 0;
972 struct task_struct *p = current;
973 int err = -EINVAL;
975 if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
976 goto out;
978 spin_lock_irq(&p->page_lock);
980 /* Check that the VA's page-directory entry is present. */
981 if ( unlikely((err = __get_user(_x, (unsigned long *)
982 (&linear_pg_table[page_nr]))) != 0) )
983 goto unlock_and_out;
985 /* If the VA's page-directory entry is read-only, we frob the WP bit. */
986 if ( unlikely(__put_user(_x, (unsigned long *)
987 (&linear_pg_table[page_nr]))) )
988 {
989 cr0 = read_cr0();
990 write_cr0(cr0 & ~X86_CR0_WP);
991 }
993 if ( unlikely((err = mod_l1_entry(&linear_pg_table[page_nr],
994 mk_l1_pgentry(val))) != 0) )
995 {
996 spin_unlock_irq(&p->page_lock);
997 kill_domain_with_errmsg("Illegal VA-mapping update request");
998 }
1000 if ( unlikely(flags & UVMF_INVLPG) )
1001 __flush_tlb_one(page_nr << PAGE_SHIFT);
1003 if ( unlikely(flags & UVMF_FLUSH_TLB) )
1004 __write_cr3_counted(pagetable_val(p->mm.pagetable));
1006 if ( unlikely(cr0 != 0) )
1007 write_cr0(cr0);
1009 unlock_and_out:
1010 spin_unlock_irq(&p->page_lock);
1011 out:
1012 return err;
1013 }