ia64/xen-unstable

linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c @ 8976:4f4625f80528

Rename XENFEAT_ring0_kernel to the slightly more generic
XENFEAT_supervisor_mode_kernel. Add comments to the public header
version.h explaining what each of the feature flags means. Add a new
flag, XENFEAT_pae_pgdir_above_4gb, to inform the guest that its PAE
page directories need not conform to the usual hardware restriction
that they be located below 4GB.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Thu Feb 23 11:59:27 2006 +0100 (2006-02-23)
parents 1ca3d63e7008
children 822a27d28afe
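
In this file, pgd_ctor() is the consumer of the new
XENFEAT_pae_pgdir_above_4gb flag. The sketch below is illustrative only,
reusing the same xen_feature() and xen_create_contiguous_region() helpers
that pgd_ctor() calls; the wrapper name make_pae_pgdir_acceptable is
hypothetical and not part of the tree.

#include <asm/page.h>		/* pgd_t */
#include <asm/hypervisor.h>	/* xen_create_contiguous_region() */
#include <xen/features.h>	/* xen_feature(), XENFEAT_pae_pgdir_above_4gb */

/* Illustrative sketch: make a PAE page directory acceptable to Xen. */
static int make_pae_pgdir_acceptable(pgd_t *pgd)
{
	/* With the feature advertised, the pgdir may live anywhere. */
	if (xen_feature(XENFEAT_pae_pgdir_above_4gb))
		return 0;

	/* Otherwise the single pgdir page (order 0) must be backed by a
	 * machine frame addressable with 32 bits, i.e. below 4GB. */
	return xen_create_contiguous_region((unsigned long)pgd, 0, 32);
}
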
line source
/*
 * linux/arch/i386/mm/pgtable.c
 */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/mmu_context.h>

#include <xen/features.h>
#include <xen/foreign_page.h>
#include <asm/hypervisor.h>

static void pgd_test_and_unpin(pgd_t *pgd);

void show_mem(void)
{
	int total = 0, reserved = 0;
	int shared = 0, cached = 0;
	int highmem = 0;
	struct page *page;
	pg_data_t *pgdat;
	unsigned long i;
	struct page_state ps;
	unsigned long flags;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
	for_each_pgdat(pgdat) {
		pgdat_resize_lock(pgdat, &flags);
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pgdat_page_nr(pgdat, i);
			total++;
			if (PageHighMem(page))
				highmem++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}
	printk(KERN_INFO "%d pages of RAM\n", total);
	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
	printk(KERN_INFO "%d reserved pages\n", reserved);
	printk(KERN_INFO "%d pages shared\n", shared);
	printk(KERN_INFO "%d pages swap cached\n", cached);

	get_page_state(&ps);
	printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
	printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
	printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
	printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
	printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Associate a virtual page frame with a given machine page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
			   pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte_ma(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
		return; /* BUG(); */
	}
	if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
		return; /* BUG(); */
	}
	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
		return; /* BUG(); */
	}
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	set_pmd(pmd, pfn_pmd(pfn, flags));
	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	switch (idx) {
	case FIX_WP_TEST:
	case FIX_VSYSCALL:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
		set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
		break;
	default:
		set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
		break;
	}
}

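/*
 * Allocate a pte page for kernel mappings.  The page is write-protected
 * in the lowmem 1:1 mapping straight away so Xen will accept it as a
 * page table; this is skipped when the writable_page_tables feature is
 * available.
 */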
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
	if (pte)
		make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
	return pte;
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

#ifdef CONFIG_HIGHPTE
	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
	if (pte) {
		SetPageForeign(pte, pte_free);
		set_page_count(pte, 1);
	}
#endif
	return pte;
}

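/*
 * Free a pte page.  If its lowmem mapping is still read-only (the page
 * was in use as a page table), restore a writable mapping before the
 * page is handed back to the allocator.
 */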
void pte_free(struct page *pte)
{
	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);

	if (!pte_write(*virt_to_ptep(va)))
		BUG_ON(HYPERVISOR_update_va_mapping(
			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));

	ClearPageForeign(pte);
	set_page_count(pte, 1);

	__free_page(pte);
}

void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
{
	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);
	page->index = (unsigned long)pgd_list;
	if (pgd_list)
		set_page_private(pgd_list, (unsigned long)&page->index);
	pgd_list = page;
	set_page_private(page, (unsigned long)&pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *next, **pprev, *page = virt_to_page(pgd);
	next = (struct page *)page->index;
	pprev = (struct page **)page_private(page);
	*pprev = next;
	if (next)
		set_page_private(next, (unsigned long)pprev);
}

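/*
 * pgd constructor.  For PAE (PTRS_PER_PMD > 1): unless Xen advertises
 * XENFEAT_pae_pgdir_above_4gb, the pgd page is placed in a machine frame
 * below 4GB via xen_create_contiguous_region(), and the kernel entries
 * are copied from swapper_pg_dir when the kernel pmd is shared.  For
 * non-PAE: kernel entries are copied, user entries cleared, and the pgd
 * is added to pgd_list under pgd_lock.
 */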
void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
	unsigned long flags;

	if (PTRS_PER_PMD > 1) {
		if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
			int rc = xen_create_contiguous_region(
				(unsigned long)pgd, 0, 32);
			BUG_ON(rc);
		}
		if (HAVE_SHARED_KERNEL_PMD)
			memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
			       swapper_pg_dir + USER_PTRS_PER_PGD,
			       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
	} else {
		spin_lock_irqsave(&pgd_lock, flags);
		memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
		       swapper_pg_dir + USER_PTRS_PER_PGD,
		       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
		pgd_list_add(pgd);
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);

	pgd_test_and_unpin(pgd);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	int i;
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);

	if (pgd)
		pgd_test_and_unpin(pgd);

	if (PTRS_PER_PMD == 1 || !pgd)
		return pgd;

	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
		if (!pmd)
			goto out_oom;
		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
	}

	if (!HAVE_SHARED_KERNEL_PMD) {
		unsigned long flags;

		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
			pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
			if (!pmd)
				goto out_oom;
			set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
		}

		spin_lock_irqsave(&pgd_lock, flags);
		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
			unsigned long v = (unsigned long)i << PGDIR_SHIFT;
			pgd_t *kpgd = pgd_offset_k(v);
			pud_t *kpud = pud_offset(kpgd, v);
			pmd_t *kpmd = pmd_offset(kpud, v);
			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
			memcpy(pmd, kpmd, PAGE_SIZE);
			make_lowmem_page_readonly(
				pmd, XENFEAT_writable_page_tables);
		}
		pgd_list_add(pgd);
		spin_unlock_irqrestore(&pgd_lock, flags);
	}

	return pgd;

out_oom:
	for (i--; i >= 0; i--)
		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
	kmem_cache_free(pgd_cache, pgd);
	return NULL;
}

void pgd_free(pgd_t *pgd)
{
	int i;

	pgd_test_and_unpin(pgd);

	/* in the PAE case user pgd entries are overwritten before usage */
	if (PTRS_PER_PMD > 1) {
		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
			kmem_cache_free(pmd_cache, pmd);
		}
		if (!HAVE_SHARED_KERNEL_PMD) {
			unsigned long flags;
			spin_lock_irqsave(&pgd_lock, flags);
			pgd_list_del(pgd);
			spin_unlock_irqrestore(&pgd_lock, flags);
			for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
				pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
				make_lowmem_page_writable(
					pmd, XENFEAT_writable_page_tables);
				memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
				kmem_cache_free(pmd_cache, pmd);
			}
		}
	}
	/* in the non-PAE case, free_pgtables() clears user pgd entries */
	kmem_cache_free(pgd_cache, pgd);
}

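/*
 * The make_*_readonly/writable helpers change the protection of a
 * page's kernel (lowmem) mapping with a va-mapping update hypercall.
 * Each is a no-op when the named Xen feature (e.g.
 * writable_page_tables) is present, since the guest then does not need
 * to remove writable mappings of its page-table pages.
 */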
void make_lowmem_page_readonly(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_wrprotect(*pte), 0);
	BUG_ON(rc);
}

void make_lowmem_page_writable(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_mkwrite(*pte), 0);
	BUG_ON(rc);
}

void make_page_readonly(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_wrprotect(*pte), 0);
	if (rc) /* fallback? */
		xen_l1_entry_update(pte, pte_wrprotect(*pte));
	if ((unsigned long)va >= (unsigned long)high_memory) {
		unsigned long pfn = pte_pfn(*pte);
#ifdef CONFIG_HIGHMEM
		if (pfn >= highstart_pfn)
			kmap_flush_unused(); /* flush stale writable kmaps */
		else
#endif
			make_lowmem_page_readonly(
				phys_to_virt(pfn << PAGE_SHIFT), feature);
	}
}

void make_page_writable(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_mkwrite(*pte), 0);
	if (rc) /* fallback? */
		xen_l1_entry_update(pte, pte_mkwrite(*pte));
	if ((unsigned long)va >= (unsigned long)high_memory) {
		unsigned long pfn = pte_pfn(*pte);
#ifdef CONFIG_HIGHMEM
		if (pfn < highstart_pfn)
#endif
			make_lowmem_page_writable(
				phys_to_virt(pfn << PAGE_SHIFT), feature);
	}
}

void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
{
	if (xen_feature(feature))
		return;

	while (nr-- != 0) {
		make_page_readonly(va, feature);
		va = (void *)((unsigned long)va + PAGE_SIZE);
	}
}

void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
{
	if (xen_feature(feature))
		return;

	while (nr-- != 0) {
		make_page_writable(va, feature);
		va = (void *)((unsigned long)va + PAGE_SIZE);
	}
}

static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
{
	struct page *page = virt_to_page(pt);
	unsigned long pfn = page_to_pfn(page);

	if (PageHighMem(page))
		return;
	BUG_ON(HYPERVISOR_update_va_mapping(
		(unsigned long)__va(pfn << PAGE_SHIFT),
		pfn_pte(pfn, flags), 0));
}

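/*
 * Walk every lowmem page-table page reachable from the user portion of
 * 'pgd_base' and remap it with 'flags', then remap the pgd page itself
 * and flush the TLB.  Used to make an entire page-table tree read-only
 * before pinning, and writable again after unpinning.
 */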
static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
{
	pgd_t *pgd = pgd_base;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int g, u, m;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, 0);
		if (PTRS_PER_PUD > 1) /* not folded */
			pgd_walk_set_prot(pud,flags);
		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
			if (pud_none(*pud))
				continue;
			pmd = pmd_offset(pud, 0);
			if (PTRS_PER_PMD > 1) /* not folded */
				pgd_walk_set_prot(pmd,flags);
			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
				if (pmd_none(*pmd))
					continue;
				pte = pte_offset_kernel(pmd,0);
				pgd_walk_set_prot(pte,flags);
			}
		}
	}

	BUG_ON(HYPERVISOR_update_va_mapping(
		(unsigned long)pgd_base,
		pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
		UVMF_TLB_FLUSH));
}

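/*
 * Pinning registers a pgd with Xen as a page-table root: its constituent
 * page-table pages are first made read-only via pgd_walk(), then
 * xen_pgd_pin() is issued and PG_pinned is set on the pgd's page.
 * Unpinning reverses the two steps in the opposite order.
 */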
static void __pgd_pin(pgd_t *pgd)
{
	pgd_walk(pgd, PAGE_KERNEL_RO);
	xen_pgd_pin(__pa(pgd));
	set_bit(PG_pinned, &virt_to_page(pgd)->flags);
}

static void __pgd_unpin(pgd_t *pgd)
{
	xen_pgd_unpin(__pa(pgd));
	pgd_walk(pgd, PAGE_KERNEL);
	clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
}

static void pgd_test_and_unpin(pgd_t *pgd)
{
	if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
		__pgd_unpin(pgd);
}

void mm_pin(struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	__pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

void mm_unpin(struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	__pgd_unpin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

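/* Pin every pgd currently on pgd_list that is not already pinned. */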
void mm_pin_all(void)
{
	struct page *page;
	for (page = pgd_list; page; page = (struct page *)page->index) {
		if (!test_bit(PG_pinned, &page->flags))
			__pgd_pin((pgd_t *)page_address(page));
	}
}

void _arch_exit_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk = current;

	task_lock(tsk);

	/*
	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
	 */
	if (tsk->active_mm == mm) {
		tsk->active_mm = &init_mm;
		atomic_inc(&init_mm.mm_count);

		switch_mm(mm, &init_mm, tsk);

		atomic_dec(&mm->mm_count);
		BUG_ON(atomic_read(&mm->mm_count) == 0);
	}

	task_unlock(tsk);

	if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
	    (atomic_read(&mm->mm_count) == 1))
		mm_unpin(mm);
}

/*
 * Local variables:
 *  c-file-style: "linux"
 *  indent-tabs-mode: t
 *  c-indent-level: 8
 *  c-basic-offset: 8
 *  tab-width: 8
 * End:
 */