direct-io.hg: view xen/common/memory.c @ 803:54d82b047eb4

bitkeeper revision 1.492 (3f833580GWlNh3YcP007drav9Zondw)

memory.c:
    Another little fix.

author      kaf24@scramble.cl.cam.ac.uk
date        Tue Oct 07 21:52:00 2003 +0000 (2003-10-07)
parents     f4b23abe3038
children    a55c876d6d2e

line source
1 /******************************************************************************
2 * memory.c
3 *
4 * Copyright (c) 2002 K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
21 /*
22 * A description of the page table API:
23 *
24 * Domains trap to process_page_updates with a list of update requests.
25 * This is a list of (ptr, val) pairs, where the requested operation
26 * is *ptr = val.
27 *
28 * Reference counting of pages:
29 * ----------------------------
30 * Each page has two refcounts: tot_count and type_count.
31 *
32 * TOT_COUNT is the obvious reference count. It counts all uses of a
33 * physical page frame by a domain, including uses as a page directory,
34 * a page table, or simple mappings via a PTE. This count prevents a
35 * domain from releasing a frame back to the hypervisor's free pool when
36 * it is still referencing it!
37 *
38 * TYPE_COUNT is more subtle. A frame can be put to one of three
39 * mutually-exclusive uses: it might be used as a page directory, or a
40 * page table, or it may be mapped writeable by the domain [of course, a
41 * frame may also be used in none of these three ways].
42 * So, type_count is a count of the number of times a frame is being
43 * referred to in its current incarnation. Therefore, a page can only
44 * change its type when its type count is zero.
45 *
46 * Pinning the page type:
47 * ----------------------
48 * The type of a page can be pinned/unpinned with the commands
49 * PGEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
50 * pinning is not reference counted, so it can't be nested).
51 * This is useful to prevent a page's type count falling to zero, at which
52 * point safety checks would need to be carried out next time the count
53 * is increased again.
54 *
55 * A further note on writeable page mappings:
56 * ------------------------------------------
57 * For simplicity, the count of writeable mappings for a page may not
58 * correspond to reality. The 'writeable count' is incremented for every
59 * PTE which maps the page with the _PAGE_RW flag set. However, for
60 * write access to be possible the page directory entry must also have
61 * its _PAGE_RW bit set. We do not check this as it complicates the
62 * reference counting considerably [consider the case of multiple
63 * directory entries referencing a single page table, some with the RW
64 * bit set, others not -- it starts getting a bit messy].
65 * In normal use, this simplification shouldn't be a problem.
66 * However, the logic can be added if required.
67 *
68 * One more note on read-only page mappings:
69 * -----------------------------------------
70 * We want domains to be able to map pages for read-only access. The
71 * main reason is that page tables and directories should be readable
72 * by a domain, but it would not be safe for them to be writeable.
73 * However, domains have free access to rings 1 & 2 of the Intel
74 * privilege model. In terms of page protection, these are considered
75 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
76 * read-only restrictions are respected in supervisor mode -- if the
77 * bit is clear then any mapped page is writeable.
78 *
79 * We get round this by always setting the WP bit and disallowing
80 * updates to it. This is very unlikely to cause a problem for guest
81 * OSes, which will generally use the WP bit to simplify their copy-on-write
82 * implementation (in that case, the OS wants a fault when it writes to
83 * an application-supplied buffer).
84 */
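/*
 * Illustrative sketch (not part of this file): what a batch of update
 * requests might look like from the guest side, based on the (ptr, val)
 * format handled by do_process_page_updates() below.  The request type
 * lives in the low bits of 'ptr', and for a normal update 'ptr' is the
 * virtual address through which the guest currently maps the page-table
 * entry.  The names pte_va, new_mfn and guest_pt_update() are hypothetical;
 * the last stands in for whatever trap the guest uses to reach the
 * hypervisor.
 */
#if 0 /* illustrative sketch only */
static void example_remap_pte(unsigned long pte_va, unsigned long new_mfn)
{
    page_update_request_t req[2];

    /* Point the PTE at machine frame 'new_mfn', read-only for now
       (further protection bits can be ORed into 'val' as needed). */
    req[0].ptr = pte_va | PGREQ_NORMAL_UPDATE;
    req[0].val = (new_mfn << PAGE_SHIFT) | _PAGE_PRESENT;

    /* Follow up with a TLB flush, expressed as an extended command. */
    req[1].ptr = PGREQ_EXTENDED_COMMAND;
    req[1].val = PGEXT_TLB_FLUSH;

    guest_pt_update(req, 2); /* hypothetical guest->hypervisor trap */
}
#endif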
87 /*
88 * THE FOLLOWING ARE ISSUES IF GUEST OPERATING SYSTEMS BECOME SMP-CAPABLE.
89 * -----------------------------------------------------------------------
90 *
91 * *********
92 * UPDATE 15/7/02: Interface has changed -- updates now specify the physical
93 * address of a page-table entry, rather than specifying a virtual address,
94 * so the hypervisor no longer "walks" the page tables. Therefore the
95 * solution below cannot work. Another possibility is to add a new entry
96 * to our "struct page" which says to which top-level page table each
97 * lower-level page table or writeable mapping belongs. If it belongs to more
98 * than one, we'd probably just flush on all processors running the domain.
99 * *********
100 *
101 * The problem involves creating new page tables which might be mapped
102 * writeable in the TLB of another processor. As an example, a domain might be
103 * running in two contexts (ie. on two processors) simultaneously, using the
104 * same top-level page table in both contexts. Now, if context 1 sends an
105 * update request [make page P read-only, add a reference to page P as a page
106 * table], that will succeed if there was only one writeable mapping of P.
107 * However, that mapping may persist in the TLB of context 2.
108 *
109 * Solution: when installing a new page table, we must flush foreign TLBs as
110 * necessary. Naive solution is to flush on any processor running our domain.
111 * Cleverer solution is to flush on any processor running same top-level page
112 * table, but this will sometimes fail (consider two different top-level page
113 * tables which have a shared lower-level page table).
114 *
115 * A better solution: when squashing a write reference, check how many times
116 * that lowest-level table entry is referenced by ORing refcounts of tables
117 * down the page-table hierarchy. If the result is != 1, we require flushing all
118 * instances of current domain if a new table is installed (because the
119 * lowest-level entry may be referenced by many top-level page tables).
120 * However, common case will be that result == 1, so we only need to flush
121 * processors with the same top-level page table. Make choice at
122 * table-installation time based on a `flush_level' flag, which is
123 * FLUSH_NONE, FLUSH_PAGETABLE, FLUSH_DOMAIN. A flush reduces this
124 * to FLUSH_NONE, while squashed write mappings can only promote up
125 * to more aggressive flush types.
126 */
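/*
 * Illustrative sketch (not part of this file): one possible shape of the
 * `flush_level' scheme proposed above.  Nothing below is implemented here;
 * the point is simply that squashed write mappings may only promote the
 * pending flush to a more aggressive type, while performing a flush resets
 * it to FLUSH_NONE.
 */
#if 0 /* illustrative sketch only */
typedef enum { FLUSH_NONE, FLUSH_PAGETABLE, FLUSH_DOMAIN } flush_level_t;

static inline void promote_flush_level(flush_level_t *pending,
                                       flush_level_t wanted)
{
    if ( wanted > *pending )
        *pending = wanted;
}
#endif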
128 #include <xeno/config.h>
129 #include <xeno/init.h>
130 #include <xeno/lib.h>
131 #include <xeno/mm.h>
132 #include <xeno/sched.h>
133 #include <xeno/errno.h>
134 #include <xeno/perfc.h>
135 #include <asm/page.h>
136 #include <asm/flushtlb.h>
137 #include <asm/io.h>
138 #include <asm/uaccess.h>
139 #include <asm/domain_page.h>
141 #if 0
142 #define MEM_LOG(_f, _a...) printk("DOM%d: (file=memory.c, line=%d) " _f "\n", current->domain, __LINE__, ## _a )
143 #else
144 #define MEM_LOG(_f, _a...) ((void)0)
145 #endif
147 /* Domain 0 is allowed to submit requests on behalf of others. */
148 #define DOMAIN_OKAY(_f) \
149 ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
151 /* 'get' checks parameter for validity before inc'ing refcnt. */
152 static int get_l2_table(unsigned long page_nr);
153 static int get_l1_table(unsigned long page_nr);
154 static int get_page(unsigned long page_nr, int writeable);
155 static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
156 /* 'put' does no checking because if refcnt not zero, entity must be valid. */
157 static void put_l2_table(unsigned long page_nr);
158 static void put_l1_table(unsigned long page_nr);
159 static void put_page(unsigned long page_nr, int writeable);
160 static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
162 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t);
163 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
165 /* Frame table, its size in bytes, and the number of machine page frames. */
166 frame_table_t * frame_table;
167 unsigned long frame_table_size;
168 unsigned long max_page;
170 struct list_head free_list;
171 spinlock_t free_list_lock = SPIN_LOCK_UNLOCKED;
172 unsigned int free_pfns;
174 /* Used to defer flushing of memory structures. */
175 static int flush_tlb[NR_CPUS] __cacheline_aligned;
178 /*
179 * init_frametable:
180 * Initialise per-frame memory information. This goes directly after
181 * MAX_MONITOR_ADDRESS in physical memory.
182 */
183 void __init init_frametable(unsigned long nr_pages)
184 {
185 struct pfn_info *pf;
186 unsigned long page_index;
187 unsigned long flags;
189 memset(flush_tlb, 0, sizeof(flush_tlb));
191 max_page = nr_pages;
192 frame_table_size = nr_pages * sizeof(struct pfn_info);
193 frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
194 frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
195 memset(frame_table, 0, frame_table_size);
197 free_pfns = 0;
199 /* Put all domain-allocatable memory on a free list. */
200 spin_lock_irqsave(&free_list_lock, flags);
201 INIT_LIST_HEAD(&free_list);
202 for( page_index = (__pa(frame_table) + frame_table_size) >> PAGE_SHIFT;
203 page_index < nr_pages;
204 page_index++ )
205 {
206 pf = list_entry(&frame_table[page_index].list, struct pfn_info, list);
207 list_add_tail(&pf->list, &free_list);
208 free_pfns++;
209 }
210 spin_unlock_irqrestore(&free_list_lock, flags);
211 }
214 static void __invalidate_shadow_ldt(void)
215 {
216 int i;
217 unsigned long pfn;
218 struct pfn_info *page;
220 current->mm.shadow_ldt_mapcnt = 0;
222 for ( i = 16; i < 32; i++ )
223 {
224 pfn = l1_pgentry_to_pagenr(current->mm.perdomain_pt[i]);
225 if ( pfn == 0 ) continue;
226 current->mm.perdomain_pt[i] = mk_l1_pgentry(0);
227 page = frame_table + pfn;
228 ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
229 ASSERT((page->flags & PG_domain_mask) == current->domain);
230 ASSERT((page->type_count != 0) && (page->tot_count != 0));
231 put_page_type(page);
232 put_page_tot(page);
233 }
234 }
235 static inline void invalidate_shadow_ldt(void)
236 {
237 if ( current->mm.shadow_ldt_mapcnt != 0 )
238 __invalidate_shadow_ldt();
239 }
242 /* Return original refcnt, or -1 on error. */
243 static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
244 {
245 struct pfn_info *page;
246 unsigned long flags;
248 if ( page_nr >= max_page )
249 {
250 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
251 return -1;
252 }
253 page = frame_table + page_nr;
254 flags = page->flags;
255 if ( !DOMAIN_OKAY(flags) )
256 {
257 MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
258 return -1;
259 }
260 if ( (flags & PG_type_mask) != type )
261 {
262 if ( page_type_count(page) != 0 )
263 {
264 MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
265 page_nr << PAGE_SHIFT,
266 flags & PG_type_mask, type, page_type_count(page));
267 return -1;
268 }
270 if ( flags & PG_need_flush )
271 {
272 flush_tlb[smp_processor_id()] = 1;
273 page->flags &= ~PG_need_flush;
274 perfc_incrc(need_flush_tlb_flush);
275 }
277 page->flags &= ~PG_type_mask;
278 page->flags |= type;
279 }
281 get_page_tot(page);
282 return get_page_type(page);
283 }
286 /* Return new refcnt, or -1 on error. */
287 static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
288 {
289 struct pfn_info *page;
291 if ( page_nr >= max_page )
292 {
293 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
294 return -1;
295 }
296 page = frame_table + page_nr;
297 if ( !DOMAIN_OKAY(page->flags) ||
298 ((page->flags & PG_type_mask) != type) )
299 {
300 MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
301 page->flags & PG_domain_mask, page->flags & PG_type_mask,
302 type);
303 return -1;
304 }
305 ASSERT(page_type_count(page) != 0);
306 put_page_tot(page);
307 return put_page_type(page);
308 }
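/*
 * Worked example (not part of this file): how the two counters evolve for a
 * frame F that starts out unused (tot_count == 0, type_count == 0), following
 * the rules implemented by inc_page_refcnt()/dec_page_refcnt() above:
 *
 *   inc_page_refcnt(F, PGT_l1_page_table);  // tot=1, type=1; F typed as L1 table
 *   inc_page_refcnt(F, PGT_l1_page_table);  // tot=2, type=2; e.g. two L2 tables share F
 *   inc_page_refcnt(F, PGT_l2_page_table);  // fails: type_count != 0, F cannot change type
 *   dec_page_refcnt(F, PGT_l1_page_table);  // tot=1, type=1
 *   dec_page_refcnt(F, PGT_l1_page_table);  // tot=0, type=0; F may now take a new type
 */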
311 /* We allow a L2 table to map itself, to achieve a linear pagetable. */
312 /* NB. There's no need for a put_twisted_l2_table() function!! */
313 static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
314 {
315 unsigned long l2v = l2_pgentry_val(l2e);
317 /* Clearly the mapping must be read-only :-) */
318 if ( (l2v & _PAGE_RW) )
319 {
320 MEM_LOG("Attempt to install twisted L2 entry with write permissions");
321 return -1;
322 }
324 /* This is a sufficient final check. */
325 if ( (l2v >> PAGE_SHIFT) != entry_pfn )
326 {
327 MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
328 return -1;
329 }
331 /* We don't bump the reference counts. */
332 return 0;
333 }
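/*
 * Illustrative sketch (not part of this file): to obtain a linear pagetable,
 * the guest asks for exactly the self-mapping that get_twisted_l2_table()
 * permits -- an L2 slot in the domain-visible region pointing back at the L2
 * table's own frame, without _PAGE_RW.  The names l2_va (virtual address of
 * the guest's L2 table), l2_mfn (its machine frame), LINEAR_SLOT and
 * guest_pt_update() are hypothetical.
 */
#if 0 /* illustrative sketch only */
static void example_install_linear_mapping(l2_pgentry_t *l2_va,
                                           unsigned long l2_mfn)
{
    page_update_request_t req;

    /* LINEAR_SLOT is a guest-chosen index below DOMAIN_ENTRIES_PER_L2_PAGETABLE. */
    req.ptr = (unsigned long)&l2_va[LINEAR_SLOT] | PGREQ_NORMAL_UPDATE;
    req.val = (l2_mfn << PAGE_SHIFT) | _PAGE_PRESENT;   /* read-only! */

    guest_pt_update(&req, 1); /* hypothetical guest->hypervisor trap */
}
#endif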
336 static int get_l2_table(unsigned long page_nr)
337 {
338 l2_pgentry_t *p_l2_entry, l2_entry;
339 int i, ret=0;
341 ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
342 if ( ret != 0 ) return (ret < 0) ? ret : 0;
344 /* NEW level-2 page table! Deal with every PDE in the table. */
345 p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
346 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
347 {
348 l2_entry = *p_l2_entry++;
349 if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
350 if ( (l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)) )
351 {
352 MEM_LOG("Bad L2 page type settings %04lx",
353 l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
354 ret = -1;
355 goto out;
356 }
357 /* Assume we're mapping an L1 table, falling back to twisted L2. */
358 ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
359 if ( ret ) ret = get_twisted_l2_table(page_nr, l2_entry);
360 if ( ret ) goto out;
361 }
363 /* Now we simply slap in our high mapping. */
364 memcpy(p_l2_entry,
365 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
366 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
367 p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
368 DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
369 mk_l2_pgentry(__pa(current->mm.perdomain_pt) | __PAGE_HYPERVISOR);
370 p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
371 DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
372 mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
374 out:
375 unmap_domain_mem(p_l2_entry);
376 return ret;
377 }
380 static int get_l1_table(unsigned long page_nr)
381 {
382 l1_pgentry_t *p_l1_entry, l1_entry;
383 int i, ret;
385 /* Update ref count for page pointed at by PDE. */
386 ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
387 if ( ret != 0 ) return (ret < 0) ? ret : 0;
389 /* NEW level-1 page table! Deal with every PTE in the table. */
390 p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
391 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
392 {
393 l1_entry = *p_l1_entry++;
394 if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
395 if ( (l1_pgentry_val(l1_entry) &
396 (_PAGE_GLOBAL|_PAGE_PAT)) )
397 {
398 MEM_LOG("Bad L1 page type settings %04lx",
399 l1_pgentry_val(l1_entry) &
400 (_PAGE_GLOBAL|_PAGE_PAT));
401 ret = -1;
402 goto out;
403 }
404 ret = get_page(l1_pgentry_to_pagenr(l1_entry),
405 l1_pgentry_val(l1_entry) & _PAGE_RW);
406 if ( ret ) goto out;
407 }
409 out:
410 /* Make sure we unmap the right page! */
411 unmap_domain_mem(p_l1_entry-1);
412 return ret;
413 }
416 static int get_page(unsigned long page_nr, int writeable)
417 {
418 struct pfn_info *page;
419 unsigned long flags;
421 /* Update ref count for page pointed at by PTE. */
422 if ( page_nr >= max_page )
423 {
424 MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
425 return(-1);
426 }
427 page = frame_table + page_nr;
428 flags = page->flags;
429 if ( !DOMAIN_OKAY(flags) )
430 {
431 MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
432 return(-1);
433 }
435 if ( writeable )
436 {
437 if ( (flags & PG_type_mask) != PGT_writeable_page )
438 {
439 if ( page_type_count(page) != 0 )
440 {
441 MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
442 flags & PG_type_mask, PGT_writeable_page,
443 page_type_count(page));
444 return(-1);
445 }
446 page->flags &= ~PG_type_mask;
447 page->flags |= PGT_writeable_page;
448 }
449 page->flags |= PG_need_flush;
450 get_page_type(page);
451 }
453 get_page_tot(page);
455 return(0);
456 }
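/*
 * NB. The PG_need_flush bit set in get_page() above is what later triggers
 * the deferred TLB flush in inc_page_refcnt(): a frame that has been mapped
 * writeable and is subsequently presented as a page table must not remain
 * writeable through a stale TLB entry on this CPU.
 */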
459 static void put_l2_table(unsigned long page_nr)
460 {
461 l2_pgentry_t *p_l2_entry, l2_entry;
462 int i;
464 if ( dec_page_refcnt(page_nr, PGT_l2_page_table) ) return;
466 /* We had last reference to level-2 page table. Free the PDEs. */
467 p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
468 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
469 {
470 l2_entry = *p_l2_entry++;
471 if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
472 put_l1_table(l2_pgentry_to_pagenr(l2_entry));
473 }
475 unmap_domain_mem(p_l2_entry);
476 }
479 static void put_l1_table(unsigned long page_nr)
480 {
481 l1_pgentry_t *p_l1_entry, l1_entry;
482 int i;
484 if ( dec_page_refcnt(page_nr, PGT_l1_page_table) ) return;
486 /* We had last reference to level-1 page table. Free the PTEs. */
487 p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
488 for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
489 {
490 l1_entry = *p_l1_entry++;
491 if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) )
492 {
493 put_page(l1_pgentry_to_pagenr(l1_entry),
494 l1_pgentry_val(l1_entry) & _PAGE_RW);
495 }
496 }
498 /* Make sure we unmap the right page! */
499 unmap_domain_mem(p_l1_entry-1);
500 }
503 static void put_page(unsigned long page_nr, int writeable)
504 {
505 struct pfn_info *page;
506 ASSERT(page_nr < max_page);
507 page = frame_table + page_nr;
508 ASSERT(DOMAIN_OKAY(page->flags));
509 ASSERT((!writeable) ||
510 ((page_type_count(page) != 0) &&
511 ((page->flags & PG_type_mask) == PGT_writeable_page) &&
512 ((page->flags & PG_need_flush) == PG_need_flush)));
513 if ( writeable )
514 {
515 put_page_type(page);
516 }
517 else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
518 (page_type_count(page) != 0)) )
519 {
520 /* We expect this is rare so we just blow the entire shadow LDT. */
521 invalidate_shadow_ldt();
522 }
523 put_page_tot(page);
524 }
527 static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry)
528 {
529 l2_pgentry_t old_l2_entry = *p_l2_entry;
531 if ( (((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
532 DOMAIN_ENTRIES_PER_L2_PAGETABLE )
533 {
534 MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
535 p_l2_entry);
536 goto fail;
537 }
539 if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
540 {
541 if ( (l2_pgentry_val(new_l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)) )
542 {
543 MEM_LOG("Bad L2 entry val %04lx",
544 l2_pgentry_val(new_l2_entry) &
545 (_PAGE_GLOBAL|_PAGE_PSE));
546 goto fail;
547 }
548 /* Differ in mapping (bits 12-31) or presence (bit 0)? */
549 if ( ((l2_pgentry_val(old_l2_entry) ^
550 l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
551 {
552 if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
553 {
554 put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
555 }
557 /* Assume we're mapping an L1 table, falling back to twisted L2. */
558 if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) )
559 {
560 /* NB. No need to sanity-check the VA: done already. */
561 unsigned long l1e = l1_pgentry_val(
562 linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]);
563 if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) )
564 goto fail;
565 }
566 }
567 }
568 else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
569 {
570 put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
571 }
573 *p_l2_entry = new_l2_entry;
574 return 0;
576 fail:
577 return -1;
578 }
581 static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry)
582 {
583 l1_pgentry_t old_l1_entry = *p_l1_entry;
585 if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
586 {
587 if ( (l1_pgentry_val(new_l1_entry) &
588 (_PAGE_GLOBAL|_PAGE_PAT)) )
589 {
591 MEM_LOG("Bad L1 entry val %04lx",
592 l1_pgentry_val(new_l1_entry) &
593 (_PAGE_GLOBAL|_PAGE_PAT));
594 goto fail;
595 }
596 /*
597 * Differ in mapping (bits 12-31), writeable (bit 1), or
598 * presence (bit 0)?
599 */
600 if ( ((l1_pgentry_val(old_l1_entry) ^
601 l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
602 {
603 if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
604 {
605 put_page(l1_pgentry_to_pagenr(old_l1_entry),
606 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
607 }
609 if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
610 l1_pgentry_val(new_l1_entry) & _PAGE_RW) ){
611 goto fail;
612 }
613 }
614 }
615 else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
616 {
617 put_page(l1_pgentry_to_pagenr(old_l1_entry),
618 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
619 }
621 *p_l1_entry = new_l1_entry;
622 return 0;
624 fail:
625 return -1;
626 }
629 static int do_extended_command(unsigned long ptr, unsigned long val)
630 {
631 int err = 0;
632 unsigned int cmd = val & PGEXT_CMD_MASK;
633 unsigned long pfn = ptr >> PAGE_SHIFT;
634 struct pfn_info *page = frame_table + pfn;
636 /* 'ptr' must be a valid machine address for every command except PGEXT_SET_LDT, which takes a virtual address. */
637 if ( (pfn >= max_page) && (cmd != PGEXT_SET_LDT) )
638 return 1;
640 switch ( cmd )
641 {
642 case PGEXT_PIN_L1_TABLE:
643 err = get_l1_table(pfn);
644 goto mark_as_pinned;
645 case PGEXT_PIN_L2_TABLE:
646 err = get_l2_table(pfn);
647 mark_as_pinned:
648 if ( err )
649 {
650 MEM_LOG("Error while pinning pfn %08lx", pfn);
651 break;
652 }
653 put_page_type(page);
654 put_page_tot(page);
655 if ( !(page->type_count & REFCNT_PIN_BIT) )
656 {
657 page->type_count |= REFCNT_PIN_BIT;
658 page->tot_count |= REFCNT_PIN_BIT;
659 }
660 else
661 {
662 MEM_LOG("Pfn %08lx already pinned", pfn);
663 err = 1;
664 }
665 break;
667 case PGEXT_UNPIN_TABLE:
668 if ( !DOMAIN_OKAY(page->flags) )
669 {
670 err = 1;
671 MEM_LOG("Page %08lx bad domain (dom=%ld)",
672 ptr, page->flags & PG_domain_mask);
673 }
674 else if ( (page->type_count & REFCNT_PIN_BIT) )
675 {
676 page->type_count &= ~REFCNT_PIN_BIT;
677 page->tot_count &= ~REFCNT_PIN_BIT;
678 get_page_type(page);
679 get_page_tot(page);
680 ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
681 put_l1_table(pfn) : put_l2_table(pfn);
682 }
683 else
684 {
685 err = 1;
686 MEM_LOG("Pfn %08lx not pinned", pfn);
687 }
688 break;
690 case PGEXT_NEW_BASEPTR:
691 err = get_l2_table(pfn);
692 if ( !err )
693 {
694 put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
695 current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
696 invalidate_shadow_ldt();
697 flush_tlb[smp_processor_id()] = 1;
698 }
699 else
700 {
701 MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
702 }
703 break;
705 case PGEXT_TLB_FLUSH:
706 flush_tlb[smp_processor_id()] = 1;
707 break;
709 case PGEXT_INVLPG:
710 __flush_tlb_one(val & ~PGEXT_CMD_MASK);
711 break;
713 case PGEXT_SET_LDT:
714 {
715 unsigned long ents = val >> PGEXT_CMD_SHIFT;
716 if ( ((ptr & (PAGE_SIZE-1)) != 0) ||
717 (ents > 8192) ||
718 ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
719 ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
720 {
721 err = 1;
722 MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
723 }
724 else if ( (current->mm.ldt_ents != ents) ||
725 (current->mm.ldt_base != ptr) )
726 {
727 if ( current->mm.ldt_ents != 0 )
728 {
729 invalidate_shadow_ldt();
730 flush_tlb[smp_processor_id()] = 1;
731 }
732 current->mm.ldt_base = ptr;
733 current->mm.ldt_ents = ents;
734 load_LDT();
735 }
736 break;
737 }
739 default:
740 MEM_LOG("Invalid extended pt command 0x%08lx", val & PGEXT_CMD_MASK);
741 err = 1;
742 break;
743 }
745 return err;
746 }
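/*
 * Illustrative sketch (not part of this file): the extended commands above
 * carry the sub-command in the low bits of 'val', with 'ptr' holding the
 * machine address of the page being operated on.  A guest switching to a
 * freshly built L2 table in machine frame 'new_l2_mfn' (hypothetical name,
 * as is guest_pt_update()) might therefore submit:
 */
#if 0 /* illustrative sketch only */
static void example_switch_pagetable(unsigned long new_l2_mfn)
{
    page_update_request_t req[2];

    /* Pin the new L2 table so its type count can never fall to zero. */
    req[0].ptr = (new_l2_mfn << PAGE_SHIFT) | PGREQ_EXTENDED_COMMAND;
    req[0].val = PGEXT_PIN_L2_TABLE;

    /* Install it as the base pointer: do_extended_command() drops the
       reference on the old base and marks this CPU for a TLB flush. */
    req[1].ptr = (new_l2_mfn << PAGE_SHIFT) | PGREQ_EXTENDED_COMMAND;
    req[1].val = PGEXT_NEW_BASEPTR;

    guest_pt_update(req, 2); /* hypothetical guest->hypervisor trap */
}
#endif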
749 int do_process_page_updates(page_update_request_t *ureqs, int count)
750 {
751 page_update_request_t req;
752 unsigned long flags, pfn, l1e;
753 struct pfn_info *page;
754 int err = 0, i;
755 unsigned int cmd;
756 unsigned long cr0 = read_cr0();
758 /* Clear the WP bit so that we can write even read-only page mappings. */
759 write_cr0(cr0 & ~X86_CR0_WP);
761 for ( i = 0; i < count; i++ )
762 {
763 if ( copy_from_user(&req, ureqs, sizeof(req)) )
764 {
765 kill_domain_with_errmsg("Cannot read page update request");
766 }
768 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
769 pfn = req.ptr >> PAGE_SHIFT;
771 err = 1;
773 spin_lock_irq(&current->page_lock);
775 /* Get the page-frame number that a non-extended command references. */
776 if ( likely(cmd != PGREQ_EXTENDED_COMMAND) )
777 {
778 if ( likely(cmd != PGREQ_MPT_UPDATE) )
779 {
780 /* Need to use 'get_user' since the VA's PGD may be absent. */
781 __get_user(l1e, (unsigned long *)(linear_pg_table+pfn));
782 /* Now check that the VA's PTE isn't absent. */
783 if ( !(l1e & _PAGE_PRESENT) )
784 {
785 MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e);
786 goto unlock;
787 }
788 /* Finally, get the underlying machine address. */
789 pfn = l1e >> PAGE_SHIFT;
790 }
791 else if ( pfn >= max_page )
792 {
793 MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
794 goto unlock;
795 }
796 }
798 /* Least significant bits of 'ptr' demux the operation type. */
799 switch ( cmd )
800 {
801 /*
802 * PGREQ_NORMAL_UPDATE: Normal update to any level of page table.
803 */
804 case PGREQ_NORMAL_UPDATE:
805 page = frame_table + pfn;
806 flags = page->flags;
808 if ( DOMAIN_OKAY(flags) )
809 {
810 switch ( (flags & PG_type_mask) )
811 {
812 case PGT_l1_page_table:
813 err = mod_l1_entry((l1_pgentry_t *)req.ptr,
814 mk_l1_pgentry(req.val));
815 break;
816 case PGT_l2_page_table:
817 err = mod_l2_entry((l2_pgentry_t *)req.ptr,
818 mk_l2_pgentry(req.val));
819 break;
820 default:
821 if ( page->type_count == 0 )
822 {
823 *(unsigned long *)req.ptr = req.val;
824 err = 0;
825 }
826 else
827 MEM_LOG("Update to bad page %08lx", req.ptr);
828 break;
829 }
830 }
831 else
832 {
833 MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
834 current->domain, pfn);
835 }
836 break;
838 case PGREQ_UNCHECKED_UPDATE:
839 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
840 if ( IS_PRIV(current) )
841 {
842 *(unsigned long *)req.ptr = req.val;
843 err = 0;
844 }
845 else
846 {
847 MEM_LOG("Bad unchecked update attempt");
848 }
849 break;
851 case PGREQ_MPT_UPDATE:
852 page = frame_table + pfn;
853 if ( DOMAIN_OKAY(page->flags) )
854 {
855 machine_to_phys_mapping[pfn] = req.val;
856 err = 0;
857 }
858 else
859 {
860 MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
861 current->domain, pfn);
862 }
863 break;
865 /*
866 * PGREQ_EXTENDED_COMMAND: Extended command is specified
867 * in the least-significant bits of the 'value' field.
868 */
869 case PGREQ_EXTENDED_COMMAND:
870 req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
871 err = do_extended_command(req.ptr, req.val);
872 break;
874 default:
875 MEM_LOG("Invalid page update command %08lx", req.ptr);
876 break;
877 }
879 unlock:
880 spin_unlock_irq(&current->page_lock);
882 if ( err )
883 kill_domain_with_errmsg("Illegal page update request");
885 ureqs++;
886 }
888 if ( flush_tlb[smp_processor_id()] )
889 {
890 flush_tlb[smp_processor_id()] = 0;
891 __write_cr3_counted(pagetable_val(current->mm.pagetable));
893 }
895 /* Restore the WP bit before returning to guest. */
896 write_cr0(cr0);
898 return 0;
899 }