ia64/xen-unstable

linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c @ 6294:1a0723cd37f1

Fix many uses of machine addresses in XenLinux. Primarily
this fixes users of virt_to_machine/machine_to_virt to
use virt_to_mfn/mfn_to_virt where that is more appropriate.

This should be a big step to improved PAE stability.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Aug 19 16:06:43 2005 +0000 (2005-08-19)
parents 3d187585c141
children f51fe43c5d1c 5f4724c13040 81576d3d1ca8 3a8f27c6d56c
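
For context on the conversion described above: virt_to_machine() yields a full machine (bus) address, while virt_to_mfn() yields a machine frame number. Under PAE a machine address can exceed 32 bits, so storing one in an unsigned long truncates it, whereas a frame number still fits; that is presumably the PAE issue the message refers to. A minimal sketch of the two forms follows, assuming the usual asm-xen page.h helpers (pfn_to_mfn(), __pa(), maddr_t); the example_* functions are illustrative only, not definitions taken from this tree.

/* Illustrative sketch only -- not part of pgtable.c. */
static inline unsigned long example_virt_to_mfn(void *v)
{
        /* Machine frame number: fits in an unsigned long even under PAE. */
        return pfn_to_mfn(__pa(v) >> PAGE_SHIFT);
}

static inline maddr_t example_virt_to_machine(void *v)
{
        /* Full machine address: may not fit in 32 bits under PAE, so it
         * belongs in a maddr_t rather than an unsigned long. */
        return ((maddr_t)pfn_to_mfn(__pa(v) >> PAGE_SHIFT) << PAGE_SHIFT) |
               ((unsigned long)v & ~PAGE_MASK);
}
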
/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/mmu_context.h>

#include <asm-xen/foreign_page.h>
#include <asm-xen/hypervisor.h>

void show_mem(void)
{
        int total = 0, reserved = 0;
        int shared = 0, cached = 0;
        int highmem = 0;
        struct page *page;
        pg_data_t *pgdat;
        unsigned long i;

        printk("Mem-info:\n");
        show_free_areas();
        printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pgdat->node_mem_map + i;
                        total++;
                        if (PageHighMem(page))
                                highmem++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk("%d pages of RAM\n", total);
        printk("%d pages of HIGHMEM\n", highmem);
        printk("%d reserved pages\n", reserved);
        printk("%d pages shared\n", shared);
        printk("%d pages swap cached\n", cached);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <pfn,flags> stored as-is, to permit clearing entries */
        set_pte(pte, pfn_pte(pfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/*
 * Associate a virtual page frame with a given machine page frame
 * (the pfn argument here is a machine frame number) and protection
 * flags for that frame.
 */
static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
                           pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <pfn,flags> stored as-is, to permit clearing entries */
        set_pte(pte, pfn_pte_ma(pfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
                printk ("set_pmd_pfn: vaddr misaligned\n");
                return; /* BUG(); */
        }
        if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
                printk ("set_pmd_pfn: pfn misaligned\n");
                return; /* BUG(); */
        }
        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                printk ("set_pmd_pfn: pgd_none\n");
                return; /* BUG(); */
        }
        pud = pud_offset(pgd, vaddr);
        pmd = pmd_offset(pud, vaddr);
        set_pmd(pmd, pfn_pmd(pfn, flags));
        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        switch (idx) {
        case FIX_WP_TEST:
        case FIX_VSYSCALL:
#ifdef CONFIG_X86_F00F_BUG
        case FIX_F00F_IDT:
#endif
                set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
                break;
        default:
                set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
                break;
        }
}

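/*
 * Xen does not permit writable guest mappings of pages that form part
 * of a pinned page table.  Kernel PTE pages are therefore made
 * read-only as soon as they are allocated; in the non-HIGHPTE case
 * below, user PTE pages are additionally tagged with SetPageForeign()
 * so that their release is routed back through pte_free().
 */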
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
        pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
        if (pte)
                make_page_readonly(pte);
        return pte;
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pte;

#ifdef CONFIG_HIGHPTE
        pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
        if (pte) {
                SetPageForeign(pte, pte_free);
                set_page_count(pte, 1);
        }
#endif

        return pte;
}

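/*
 * A PTE page being freed may still be read-only (it may have been part
 * of a pinned page table), so restore a writable PAGE_KERNEL mapping
 * through the hypervisor before returning the page to the allocator.
 */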
void pte_free(struct page *pte)
{
        unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);

        if (!pte_write(*virt_to_ptep(va)))
                BUG_ON(HYPERVISOR_update_va_mapping(
                        va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));

        ClearPageForeign(pte);
        set_page_count(pte, 1);

        __free_page(pte);
}

void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
{
        memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);
        page->index = (unsigned long)pgd_list;
        if (pgd_list)
                pgd_list->private = (unsigned long)&page->index;
        pgd_list = page;
        page->private = (unsigned long)&pgd_list;
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *next, **pprev, *page = virt_to_page(pgd);
        next = (struct page *)page->index;
        pprev = (struct page **)page->private;
        *pprev = next;
        if (next)
                next->private = (unsigned long)pprev;
}

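/*
 * pgd_cache constructor: copy the kernel mappings from swapper_pg_dir
 * into every new pgd.  Under PAE the pgd page is first exchanged for a
 * machine page below 4GB, since the value loaded into cr3 for a PAE
 * page-directory-pointer table is a 32-bit machine address.
 */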
void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
        unsigned long flags;

#ifdef CONFIG_X86_PAE
        /* this gives us a page below 4GB */
        xen_create_contiguous_region((unsigned long)pgd, 0);
#endif

        if (!HAVE_SHARED_KERNEL_PMD)
                spin_lock_irqsave(&pgd_lock, flags);

        memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
               swapper_pg_dir + USER_PTRS_PER_PGD,
               (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));

        if (HAVE_SHARED_KERNEL_PMD)
                return;

        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
        memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
}

void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
        unsigned long flags; /* can be called from interrupt context */

        if (HAVE_SHARED_KERNEL_PMD)
                return;

        spin_lock_irqsave(&pgd_lock, flags);
        pgd_list_del(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

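/*
 * With PAE (PTRS_PER_PMD > 1) each pgd gets its own pmds: one per user
 * pgd slot and, when the kernel pmd cannot be shared, a private copy of
 * the kernel pmd as well.  A pmd is linked in with __pgd(1 + __pa(pmd)):
 * the low bit is _PAGE_PRESENT and is stripped again (pgd_val() - 1)
 * when the pmd is freed.
 */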
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        int i = 0;
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);

        if (PTRS_PER_PMD == 1 || !pgd)
                return pgd;

        if (!HAVE_SHARED_KERNEL_PMD) {
                /* alloc and copy kernel pmd */
                unsigned long flags;
                pgd_t *copy_pgd = pgd_offset_k(PAGE_OFFSET);
                pud_t *copy_pud = pud_offset(copy_pgd, PAGE_OFFSET);
                pmd_t *copy_pmd = pmd_offset(copy_pud, PAGE_OFFSET);
                pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
                if (0 == pmd)
                        goto out_oom;

                spin_lock_irqsave(&pgd_lock, flags);
                memcpy(pmd, copy_pmd, PAGE_SIZE);
                spin_unlock_irqrestore(&pgd_lock, flags);
                make_page_readonly(pmd);
                set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
        }

        /* alloc user pmds */
        for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
                pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
                if (!pmd)
                        goto out_oom;
                set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
        }
        return pgd;

out_oom:
        for (i--; i >= 0; i--)
                kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
        kmem_cache_free(pgd_cache, pgd);
        return NULL;
}

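/*
 * If the pgd being freed is still read-only it is a pinned page-table
 * root: unpin it and restore a writable mapping before the pmds and the
 * pgd itself are returned to their caches.
 */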
void pgd_free(pgd_t *pgd)
{
        int i;
        pte_t *ptep = virt_to_ptep(pgd);

        if (!pte_write(*ptep)) {
                xen_pgd_unpin(__pa(pgd));
                BUG_ON(HYPERVISOR_update_va_mapping(
                        (unsigned long)pgd,
                        pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
                        0));
        }

        /* in the PAE case user pgd entries are overwritten before usage */
        if (PTRS_PER_PMD > 1) {
                for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
                        pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
                        make_page_writable(pmd);
                        kmem_cache_free(pmd_cache, pmd);
                }
                if (!HAVE_SHARED_KERNEL_PMD) {
                        pmd_t *pmd = (void *)__va(pgd_val(pgd[USER_PTRS_PER_PGD])-1);
                        make_page_writable(pmd);
                        memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
                        kmem_cache_free(pmd_cache, pmd);
                }
        }
        /* in the non-PAE case, free_pgtables() clears user pgd entries */
        kmem_cache_free(pgd_cache, pgd);
}

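/*
 * Helpers that toggle the write permission of the PTE mapping a given
 * kernel virtual address.  The make_page_* variants also cope with
 * addresses above high_memory (vmalloc/fixmap space): there the mapped
 * machine frame is translated back to its pseudo-physical address and
 * the lowmem alias of that page is adjusted as well.
 */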
#ifndef CONFIG_XEN_SHADOW_MODE
void make_lowmem_page_readonly(void *va)
{
        pte_t *pte = virt_to_ptep(va);
        set_pte(pte, pte_wrprotect(*pte));
}

void make_lowmem_page_writable(void *va)
{
        pte_t *pte = virt_to_ptep(va);
        set_pte(pte, pte_mkwrite(*pte));
}

void make_page_readonly(void *va)
{
        pte_t *pte = virt_to_ptep(va);
        set_pte(pte, pte_wrprotect(*pte));
        if ( (unsigned long)va >= (unsigned long)high_memory )
        {
                unsigned long phys;
                phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
#ifdef CONFIG_HIGHMEM
                if ( (phys >> PAGE_SHIFT) < highstart_pfn )
#endif
                        make_lowmem_page_readonly(phys_to_virt(phys));
        }
}

void make_page_writable(void *va)
{
        pte_t *pte = virt_to_ptep(va);
        set_pte(pte, pte_mkwrite(*pte));
        if ( (unsigned long)va >= (unsigned long)high_memory )
        {
                unsigned long phys;
                phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
#ifdef CONFIG_HIGHMEM
                if ( (phys >> PAGE_SHIFT) < highstart_pfn )
#endif
                        make_lowmem_page_writable(phys_to_virt(phys));
        }
}

void make_pages_readonly(void *va, unsigned int nr)
{
        while ( nr-- != 0 )
        {
                make_page_readonly(va);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned int nr)
{
        while ( nr-- != 0 )
        {
                make_page_writable(va);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}
#endif /* CONFIG_XEN_SHADOW_MODE */

LIST_HEAD(mm_unpinned);
DEFINE_SPINLOCK(mm_unpinned_lock);

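/*
 * mm_walk() applies the given protection to every page-table page of an
 * mm (pud, pmd and pte levels); highmem PTE pages are skipped since they
 * have no lowmem mapping to adjust.  It is used by mm_pin()/mm_unpin()
 * below to make a page table read-only before pinning and writable again
 * after unpinning.
 */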
static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
{
        struct page *page = virt_to_page(pt);
        unsigned long pfn = page_to_pfn(page);

        if (PageHighMem(page))
                return;
        BUG_ON(HYPERVISOR_update_va_mapping(
                (unsigned long)__va(pfn << PAGE_SHIFT),
                pfn_pte(pfn, flags), 0));
}

static void mm_walk(struct mm_struct *mm, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int g,u,m;

        pgd = mm->pgd;
        for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
                        mm_walk_set_prot(pud,flags);
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (pud_none(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
                                mm_walk_set_prot(pmd,flags);
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                if (pmd_none(*pmd))
                                        continue;
                                pte = pte_offset_kernel(pmd,0);
                                mm_walk_set_prot(pte,flags);
                        }
                }
        }
}

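/*
 * Pinning registers mm->pgd with Xen as a page-table root; the
 * hypervisor validates it and subsequently vets every update to it.
 * All of the page-table pages must be read-only in the guest before the
 * pin, and unpinning reverses the process.  The mm_unpinned list tracks
 * every mm that is not currently pinned so mm_pin_all() can pin them in
 * one pass.
 */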
void mm_pin(struct mm_struct *mm)
{
        spin_lock(&mm->page_table_lock);

        mm_walk(mm, PAGE_KERNEL_RO);
        BUG_ON(HYPERVISOR_update_va_mapping(
                (unsigned long)mm->pgd,
                pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
                UVMF_TLB_FLUSH));
        xen_pgd_pin(__pa(mm->pgd));
        mm->context.pinned = 1;
        spin_lock(&mm_unpinned_lock);
        list_del(&mm->context.unpinned);
        spin_unlock(&mm_unpinned_lock);

        spin_unlock(&mm->page_table_lock);
}

void mm_unpin(struct mm_struct *mm)
{
        spin_lock(&mm->page_table_lock);

        xen_pgd_unpin(__pa(mm->pgd));
        BUG_ON(HYPERVISOR_update_va_mapping(
                (unsigned long)mm->pgd,
                pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
        mm_walk(mm, PAGE_KERNEL);
        xen_tlb_flush();
        mm->context.pinned = 0;
        spin_lock(&mm_unpinned_lock);
        list_add(&mm->context.unpinned, &mm_unpinned);
        spin_unlock(&mm_unpinned_lock);

        spin_unlock(&mm->page_table_lock);
}

void mm_pin_all(void)
{
        while (!list_empty(&mm_unpinned))
                mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
                                  context.unpinned));
}

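/*
 * Hook invoked while an mm is being torn down.  If the mm is pinned and
 * only a single reference to it remains, the pgd is unpinned so that its
 * page-table pages become ordinary writable memory again before they are
 * freed.
 */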
void _arch_exit_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        task_lock(tsk);

        /*
         * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
         * *much* faster this way, as no tlb flushes means bigger wrpt batches.
         */
        if ( tsk->active_mm == mm )
        {
                tsk->active_mm = &init_mm;
                atomic_inc(&init_mm.mm_count);

                switch_mm(mm, &init_mm, tsk);

                atomic_dec(&mm->mm_count);
                BUG_ON(atomic_read(&mm->mm_count) == 0);
        }

        task_unlock(tsk);

        if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
                mm_unpin(mm);
}