ia64/xen-unstable

view linux-2.6-xen-sparse/mm/memory.c @ 6190:4ec947baae75

Add generic_page_range() -- generic page table operation.

Linux has several instances of repeated code to update a range of
PTEs. Mapping memory between domains in Xen also needs to do this
quite frequently, to ensure that page tables have been constructed and
to look up PTE addresses when making mapping-related hypercalls. This
patch adds a generic PTE walk-and-fill operation that takes a function
pointer to call on leaf entries. direct_remap_area_pages() is updated
to use the new call, as are the places that previously abused
__direct_remap_area_pages().
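
As an illustration (not part of the patch itself), a caller might use the
new interface roughly as below. The callback and wrapper names are made up
for this example; the argument order follows the fn(pte, pte_page, addr,
data) call made by generic_pte_range() in the listing further down.

    /* Example leaf callback: count PTEs that are currently present.
     * Returning non-zero from the callback aborts the walk. */
    static int count_present_pte(pte_t *pte, struct page *pte_page,
                                 unsigned long addr, void *data)
    {
        if (pte_present(*pte))
            ++*(unsigned long *)data;
        return 0;
    }

    /* Walk [addr, addr+size), allocating intermediate page tables as
     * needed, and invoke the callback on every leaf PTE slot.
     * (Error return from generic_page_range() ignored for brevity.) */
    static unsigned long count_present(struct mm_struct *mm,
                                       unsigned long addr,
                                       unsigned long size)
    {
        unsigned long n = 0;
        generic_page_range(mm, addr, size, count_present_pte, &n);
        return n;
    }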

This patch also introduces two new helper functions for working with
page tables when mapping memory between domains:
create_lookup_pte_addr() returns the machine address of a PTE,
allocating intermediate page tables as necessary. touch_pte_range()
ensures that page tables exist for a virtual address range.
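
To give a feel for how small such helpers become on top of the new
primitive: touch_pte_range() can be little more than a walk with a no-op
leaf callback. The sketch below is illustrative only; the exact code in
the patch may differ in detail.

    /* No-op leaf callback: the walk itself forces the intermediate
     * page tables for the range into existence. */
    static int noop_fn(pte_t *pte, struct page *pte_page,
                       unsigned long addr, void *data)
    {
        return 0;
    }

    int touch_pte_range(struct mm_struct *mm, unsigned long address,
                        unsigned long size)
    {
        return generic_page_range(mm, address, size, noop_fn, NULL);
    }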

Many of the existing Linux page table operations (e.g. the zap/remap
families) could be converted to this interface, which would shorten
mm/memory.c somewhat.
author akw27@arcadians.cl.cam.ac.uk
date Mon Aug 15 13:16:04 2005 +0000 (2005-08-15)
parents f294acb25858
children f51fe43c5d1c 5f4724c13040 1ae656509f02 23979fb12c49 84ee014ebd41 99914b54f7bf 81576d3d1ca8 3a8f27c6d56c
line source
1 /*
2 * linux/mm/memory.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */
7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus
10 */
12 /*
13 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
14 * pages started 02.12.91, seems to work. - Linus.
15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see.
19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */
23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */
31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de)
37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */
41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h>
43 #include <linux/hugetlb.h>
44 #include <linux/mman.h>
45 #include <linux/swap.h>
46 #include <linux/highmem.h>
47 #include <linux/pagemap.h>
48 #include <linux/rmap.h>
49 #include <linux/module.h>
50 #include <linux/init.h>
52 #include <asm/pgalloc.h>
53 #include <asm/uaccess.h>
54 #include <asm/tlb.h>
55 #include <asm/tlbflush.h>
56 #include <asm/pgtable.h>
58 #include <linux/swapops.h>
59 #include <linux/elf.h>
61 #ifndef CONFIG_DISCONTIGMEM
62 /* use the per-pgdat data instead for discontigmem - mbligh */
63 unsigned long max_mapnr;
64 struct page *mem_map;
66 EXPORT_SYMBOL(max_mapnr);
67 EXPORT_SYMBOL(mem_map);
68 #endif
70 unsigned long num_physpages;
71 /*
72 * A number of key systems in x86 including ioremap() rely on the assumption
73 * that high_memory defines the upper bound on direct map memory, then end
74 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
75 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
76 * and ZONE_HIGHMEM.
77 */
78 void * high_memory;
79 unsigned long vmalloc_earlyreserve;
81 EXPORT_SYMBOL(num_physpages);
82 EXPORT_SYMBOL(high_memory);
83 EXPORT_SYMBOL(vmalloc_earlyreserve);
85 /*
86 * If a p?d_bad entry is found while walking page tables, report
87 * the error, before resetting entry to p?d_none. Usually (but
88 * very seldom) called out from the p?d_none_or_clear_bad macros.
89 */
91 void pgd_clear_bad(pgd_t *pgd)
92 {
93 pgd_ERROR(*pgd);
94 pgd_clear(pgd);
95 }
97 void pud_clear_bad(pud_t *pud)
98 {
99 pud_ERROR(*pud);
100 pud_clear(pud);
101 }
103 void pmd_clear_bad(pmd_t *pmd)
104 {
105 pmd_ERROR(*pmd);
106 pmd_clear(pmd);
107 }
109 /*
110 * Note: this doesn't free the actual pages themselves. That
111 * has been handled earlier when unmapping all the memory regions.
112 */
113 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114 {
115 struct page *page = pmd_page(*pmd);
116 pmd_clear(pmd);
117 pte_free_tlb(tlb, page);
118 dec_page_state(nr_page_table_pages);
119 tlb->mm->nr_ptes--;
120 }
122 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
123 unsigned long addr, unsigned long end,
124 unsigned long floor, unsigned long ceiling)
125 {
126 pmd_t *pmd;
127 unsigned long next;
128 unsigned long start;
130 start = addr;
131 pmd = pmd_offset(pud, addr);
132 do {
133 next = pmd_addr_end(addr, end);
134 if (pmd_none_or_clear_bad(pmd))
135 continue;
136 free_pte_range(tlb, pmd);
137 } while (pmd++, addr = next, addr != end);
139 start &= PUD_MASK;
140 if (start < floor)
141 return;
142 if (ceiling) {
143 ceiling &= PUD_MASK;
144 if (!ceiling)
145 return;
146 }
147 if (end - 1 > ceiling - 1)
148 return;
150 pmd = pmd_offset(pud, start);
151 pud_clear(pud);
152 pmd_free_tlb(tlb, pmd);
153 }
155 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
156 unsigned long addr, unsigned long end,
157 unsigned long floor, unsigned long ceiling)
158 {
159 pud_t *pud;
160 unsigned long next;
161 unsigned long start;
163 start = addr;
164 pud = pud_offset(pgd, addr);
165 do {
166 next = pud_addr_end(addr, end);
167 if (pud_none_or_clear_bad(pud))
168 continue;
169 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
170 } while (pud++, addr = next, addr != end);
172 start &= PGDIR_MASK;
173 if (start < floor)
174 return;
175 if (ceiling) {
176 ceiling &= PGDIR_MASK;
177 if (!ceiling)
178 return;
179 }
180 if (end - 1 > ceiling - 1)
181 return;
183 pud = pud_offset(pgd, start);
184 pgd_clear(pgd);
185 pud_free_tlb(tlb, pud);
186 }
188 /*
189 * This function frees user-level page tables of a process.
190 *
191 * Must be called with pagetable lock held.
192 */
193 void free_pgd_range(struct mmu_gather **tlb,
194 unsigned long addr, unsigned long end,
195 unsigned long floor, unsigned long ceiling)
196 {
197 pgd_t *pgd;
198 unsigned long next;
199 unsigned long start;
201 /*
202 * The next few lines have given us lots of grief...
203 *
204 * Why are we testing PMD* at this top level? Because often
205 * there will be no work to do at all, and we'd prefer not to
206 * go all the way down to the bottom just to discover that.
207 *
208 * Why all these "- 1"s? Because 0 represents both the bottom
209 * of the address space and the top of it (using -1 for the
210 * top wouldn't help much: the masks would do the wrong thing).
211 * The rule is that addr 0 and floor 0 refer to the bottom of
212 * the address space, but end 0 and ceiling 0 refer to the top
213 * Comparisons need to use "end - 1" and "ceiling - 1" (though
214 * that end 0 case should be mythical).
215 *
216 * Wherever addr is brought up or ceiling brought down, we must
217 * be careful to reject "the opposite 0" before it confuses the
218 * subsequent tests. But what about where end is brought down
219 * by PMD_SIZE below? no, end can't go down to 0 there.
220 *
221 * Whereas we round start (addr) and ceiling down, by different
222 * masks at different levels, in order to test whether a table
223 * now has no other vmas using it, so can be freed, we don't
224 * bother to round floor or end up - the tests don't need that.
225 */
227 addr &= PMD_MASK;
228 if (addr < floor) {
229 addr += PMD_SIZE;
230 if (!addr)
231 return;
232 }
233 if (ceiling) {
234 ceiling &= PMD_MASK;
235 if (!ceiling)
236 return;
237 }
238 if (end - 1 > ceiling - 1)
239 end -= PMD_SIZE;
240 if (addr > end - 1)
241 return;
243 start = addr;
244 pgd = pgd_offset((*tlb)->mm, addr);
245 do {
246 next = pgd_addr_end(addr, end);
247 if (pgd_none_or_clear_bad(pgd))
248 continue;
249 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
250 } while (pgd++, addr = next, addr != end);
252 if (!tlb_is_full_mm(*tlb))
253 flush_tlb_pgtables((*tlb)->mm, start, end);
254 }
256 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
257 unsigned long floor, unsigned long ceiling)
258 {
259 while (vma) {
260 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start;
263 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
264 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
265 floor, next? next->vm_start: ceiling);
266 } else {
267 /*
268 * Optimization: gather nearby vmas into one call down
269 */
270 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
271 && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
272 HPAGE_SIZE)) {
273 vma = next;
274 next = vma->vm_next;
275 }
276 free_pgd_range(tlb, addr, vma->vm_end,
277 floor, next? next->vm_start: ceiling);
278 }
279 vma = next;
280 }
281 }
283 pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
284 unsigned long address)
285 {
286 if (!pmd_present(*pmd)) {
287 struct page *new;
289 spin_unlock(&mm->page_table_lock);
290 new = pte_alloc_one(mm, address);
291 spin_lock(&mm->page_table_lock);
292 if (!new)
293 return NULL;
294 /*
295 * Because we dropped the lock, we should re-check the
296 * entry, as somebody else could have populated it..
297 */
298 if (pmd_present(*pmd)) {
299 pte_free(new);
300 goto out;
301 }
302 mm->nr_ptes++;
303 inc_page_state(nr_page_table_pages);
304 pmd_populate(mm, pmd, new);
305 }
306 out:
307 return pte_offset_map(pmd, address);
308 }
310 pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
311 {
312 if (!pmd_present(*pmd)) {
313 pte_t *new;
315 spin_unlock(&mm->page_table_lock);
316 new = pte_alloc_one_kernel(mm, address);
317 spin_lock(&mm->page_table_lock);
318 if (!new)
319 return NULL;
321 /*
322 * Because we dropped the lock, we should re-check the
323 * entry, as somebody else could have populated it..
324 */
325 if (pmd_present(*pmd)) {
326 pte_free_kernel(new);
327 goto out;
328 }
329 pmd_populate_kernel(mm, pmd, new);
330 }
331 out:
332 return pte_offset_kernel(pmd, address);
333 }
335 /*
336 * copy one vm_area from one task to the other. Assumes the page tables
337 * already present in the new task to be cleared in the whole range
338 * covered by this vma.
339 *
340 * dst->page_table_lock is held on entry and exit,
341 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
342 */
344 static inline void
345 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
346 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
347 unsigned long addr)
348 {
349 pte_t pte = *src_pte;
350 struct page *page;
351 unsigned long pfn;
353 /* pte contains position in swap or file, so copy. */
354 if (unlikely(!pte_present(pte))) {
355 if (!pte_file(pte)) {
356 swap_duplicate(pte_to_swp_entry(pte));
357 /* make sure dst_mm is on swapoff's mmlist. */
358 if (unlikely(list_empty(&dst_mm->mmlist))) {
359 spin_lock(&mmlist_lock);
360 list_add(&dst_mm->mmlist, &src_mm->mmlist);
361 spin_unlock(&mmlist_lock);
362 }
363 }
364 set_pte_at(dst_mm, addr, dst_pte, pte);
365 return;
366 }
368 pfn = pte_pfn(pte);
369 /* the pte points outside of valid memory, the
370 * mapping is assumed to be good, meaningful
371 * and not mapped via rmap - duplicate the
372 * mapping as is.
373 */
374 page = NULL;
375 if (pfn_valid(pfn))
376 page = pfn_to_page(pfn);
378 if (!page || PageReserved(page)) {
379 set_pte_at(dst_mm, addr, dst_pte, pte);
380 return;
381 }
383 /*
384 * If it's a COW mapping, write protect it both
385 * in the parent and the child
386 */
387 if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
388 ptep_set_wrprotect(src_mm, addr, src_pte);
389 pte = *src_pte;
390 }
392 /*
393 * If it's a shared mapping, mark it clean in
394 * the child
395 */
396 if (vm_flags & VM_SHARED)
397 pte = pte_mkclean(pte);
398 pte = pte_mkold(pte);
399 get_page(page);
400 inc_mm_counter(dst_mm, rss);
401 if (PageAnon(page))
402 inc_mm_counter(dst_mm, anon_rss);
403 set_pte_at(dst_mm, addr, dst_pte, pte);
404 page_dup_rmap(page);
405 }
407 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
408 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
409 unsigned long addr, unsigned long end)
410 {
411 pte_t *src_pte, *dst_pte;
412 unsigned long vm_flags = vma->vm_flags;
413 int progress;
415 again:
416 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
417 if (!dst_pte)
418 return -ENOMEM;
419 src_pte = pte_offset_map_nested(src_pmd, addr);
421 progress = 0;
422 spin_lock(&src_mm->page_table_lock);
423 do {
424 /*
425 * We are holding two locks at this point - either of them
426 * could generate latencies in another task on another CPU.
427 */
428 if (progress >= 32 && (need_resched() ||
429 need_lockbreak(&src_mm->page_table_lock) ||
430 need_lockbreak(&dst_mm->page_table_lock)))
431 break;
432 if (pte_none(*src_pte)) {
433 progress++;
434 continue;
435 }
436 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
437 progress += 8;
438 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
439 spin_unlock(&src_mm->page_table_lock);
441 pte_unmap_nested(src_pte - 1);
442 pte_unmap(dst_pte - 1);
443 cond_resched_lock(&dst_mm->page_table_lock);
444 if (addr != end)
445 goto again;
446 return 0;
447 }
449 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
450 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
451 unsigned long addr, unsigned long end)
452 {
453 pmd_t *src_pmd, *dst_pmd;
454 unsigned long next;
456 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
457 if (!dst_pmd)
458 return -ENOMEM;
459 src_pmd = pmd_offset(src_pud, addr);
460 do {
461 next = pmd_addr_end(addr, end);
462 if (pmd_none_or_clear_bad(src_pmd))
463 continue;
464 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
465 vma, addr, next))
466 return -ENOMEM;
467 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
468 return 0;
469 }
471 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
472 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
473 unsigned long addr, unsigned long end)
474 {
475 pud_t *src_pud, *dst_pud;
476 unsigned long next;
478 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
479 if (!dst_pud)
480 return -ENOMEM;
481 src_pud = pud_offset(src_pgd, addr);
482 do {
483 next = pud_addr_end(addr, end);
484 if (pud_none_or_clear_bad(src_pud))
485 continue;
486 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
487 vma, addr, next))
488 return -ENOMEM;
489 } while (dst_pud++, src_pud++, addr = next, addr != end);
490 return 0;
491 }
493 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
494 struct vm_area_struct *vma)
495 {
496 pgd_t *src_pgd, *dst_pgd;
497 unsigned long next;
498 unsigned long addr = vma->vm_start;
499 unsigned long end = vma->vm_end;
501 if (is_vm_hugetlb_page(vma))
502 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
504 dst_pgd = pgd_offset(dst_mm, addr);
505 src_pgd = pgd_offset(src_mm, addr);
506 do {
507 next = pgd_addr_end(addr, end);
508 if (pgd_none_or_clear_bad(src_pgd))
509 continue;
510 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
511 vma, addr, next))
512 return -ENOMEM;
513 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
514 return 0;
515 }
517 static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
518 unsigned long addr, unsigned long end,
519 struct zap_details *details)
520 {
521 pte_t *pte;
523 pte = pte_offset_map(pmd, addr);
524 do {
525 pte_t ptent = *pte;
526 if (pte_none(ptent))
527 continue;
528 if (pte_present(ptent)) {
529 struct page *page = NULL;
530 unsigned long pfn = pte_pfn(ptent);
531 if (pfn_valid(pfn)) {
532 page = pfn_to_page(pfn);
533 if (PageReserved(page))
534 page = NULL;
535 }
536 if (unlikely(details) && page) {
537 /*
538 * unmap_shared_mapping_pages() wants to
539 * invalidate cache without truncating:
540 * unmap shared but keep private pages.
541 */
542 if (details->check_mapping &&
543 details->check_mapping != page->mapping)
544 continue;
545 /*
546 * Each page->index must be checked when
547 * invalidating or truncating nonlinear.
548 */
549 if (details->nonlinear_vma &&
550 (page->index < details->first_index ||
551 page->index > details->last_index))
552 continue;
553 }
554 ptent = ptep_get_and_clear(tlb->mm, addr, pte);
555 tlb_remove_tlb_entry(tlb, pte, addr);
556 if (unlikely(!page))
557 continue;
558 if (unlikely(details) && details->nonlinear_vma
559 && linear_page_index(details->nonlinear_vma,
560 addr) != page->index)
561 set_pte_at(tlb->mm, addr, pte,
562 pgoff_to_pte(page->index));
563 if (pte_dirty(ptent))
564 set_page_dirty(page);
565 if (PageAnon(page))
566 dec_mm_counter(tlb->mm, anon_rss);
567 else if (pte_young(ptent))
568 mark_page_accessed(page);
569 tlb->freed++;
570 page_remove_rmap(page);
571 tlb_remove_page(tlb, page);
572 continue;
573 }
574 /*
575 * If details->check_mapping, we leave swap entries;
576 * if details->nonlinear_vma, we leave file entries.
577 */
578 if (unlikely(details))
579 continue;
580 if (!pte_file(ptent))
581 free_swap_and_cache(pte_to_swp_entry(ptent));
582 pte_clear(tlb->mm, addr, pte);
583 } while (pte++, addr += PAGE_SIZE, addr != end);
584 pte_unmap(pte - 1);
585 }
587 static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
588 unsigned long addr, unsigned long end,
589 struct zap_details *details)
590 {
591 pmd_t *pmd;
592 unsigned long next;
594 pmd = pmd_offset(pud, addr);
595 do {
596 next = pmd_addr_end(addr, end);
597 if (pmd_none_or_clear_bad(pmd))
598 continue;
599 zap_pte_range(tlb, pmd, addr, next, details);
600 } while (pmd++, addr = next, addr != end);
601 }
603 static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
604 unsigned long addr, unsigned long end,
605 struct zap_details *details)
606 {
607 pud_t *pud;
608 unsigned long next;
610 pud = pud_offset(pgd, addr);
611 do {
612 next = pud_addr_end(addr, end);
613 if (pud_none_or_clear_bad(pud))
614 continue;
615 zap_pmd_range(tlb, pud, addr, next, details);
616 } while (pud++, addr = next, addr != end);
617 }
619 static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
620 unsigned long addr, unsigned long end,
621 struct zap_details *details)
622 {
623 pgd_t *pgd;
624 unsigned long next;
626 if (details && !details->check_mapping && !details->nonlinear_vma)
627 details = NULL;
629 BUG_ON(addr >= end);
630 tlb_start_vma(tlb, vma);
631 pgd = pgd_offset(vma->vm_mm, addr);
632 do {
633 next = pgd_addr_end(addr, end);
634 if (pgd_none_or_clear_bad(pgd))
635 continue;
636 zap_pud_range(tlb, pgd, addr, next, details);
637 } while (pgd++, addr = next, addr != end);
638 tlb_end_vma(tlb, vma);
639 }
641 #ifdef CONFIG_PREEMPT
642 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
643 #else
644 /* No preempt: go for improved straight-line efficiency */
645 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
646 #endif
648 /**
649 * unmap_vmas - unmap a range of memory covered by a list of vma's
650 * @tlbp: address of the caller's struct mmu_gather
651 * @mm: the controlling mm_struct
652 * @vma: the starting vma
653 * @start_addr: virtual address at which to start unmapping
654 * @end_addr: virtual address at which to end unmapping
655 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
656 * @details: details of nonlinear truncation or shared cache invalidation
657 *
658 * Returns the end address of the unmapping (restart addr if interrupted).
659 *
660 * Unmap all pages in the vma list. Called under page_table_lock.
661 *
662 * We aim to not hold page_table_lock for too long (for scheduling latency
663 * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
664 * return the ending mmu_gather to the caller.
665 *
666 * Only addresses between `start' and `end' will be unmapped.
667 *
668 * The VMA list must be sorted in ascending virtual address order.
669 *
670 * unmap_vmas() assumes that the caller will flush the whole unmapped address
671 * range after unmap_vmas() returns. So the only responsibility here is to
672 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
673 * drops the lock and schedules.
674 */
675 unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
676 struct vm_area_struct *vma, unsigned long start_addr,
677 unsigned long end_addr, unsigned long *nr_accounted,
678 struct zap_details *details)
679 {
680 unsigned long zap_bytes = ZAP_BLOCK_SIZE;
681 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
682 int tlb_start_valid = 0;
683 unsigned long start = start_addr;
684 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
685 int fullmm = tlb_is_full_mm(*tlbp);
687 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
688 unsigned long end;
690 start = max(vma->vm_start, start_addr);
691 if (start >= vma->vm_end)
692 continue;
693 end = min(vma->vm_end, end_addr);
694 if (end <= vma->vm_start)
695 continue;
697 if (vma->vm_flags & VM_ACCOUNT)
698 *nr_accounted += (end - start) >> PAGE_SHIFT;
700 while (start != end) {
701 unsigned long block;
703 if (!tlb_start_valid) {
704 tlb_start = start;
705 tlb_start_valid = 1;
706 }
708 if (is_vm_hugetlb_page(vma)) {
709 block = end - start;
710 unmap_hugepage_range(vma, start, end);
711 } else {
712 block = min(zap_bytes, end - start);
713 unmap_page_range(*tlbp, vma, start,
714 start + block, details);
715 }
717 start += block;
718 zap_bytes -= block;
719 if ((long)zap_bytes > 0)
720 continue;
722 tlb_finish_mmu(*tlbp, tlb_start, start);
724 if (need_resched() ||
725 need_lockbreak(&mm->page_table_lock) ||
726 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
727 if (i_mmap_lock) {
728 /* must reset count of rss freed */
729 *tlbp = tlb_gather_mmu(mm, fullmm);
730 goto out;
731 }
732 spin_unlock(&mm->page_table_lock);
733 cond_resched();
734 spin_lock(&mm->page_table_lock);
735 }
737 *tlbp = tlb_gather_mmu(mm, fullmm);
738 tlb_start_valid = 0;
739 zap_bytes = ZAP_BLOCK_SIZE;
740 }
741 }
742 out:
743 return start; /* which is now the end (or restart) address */
744 }
746 /**
747 * zap_page_range - remove user pages in a given range
748 * @vma: vm_area_struct holding the applicable pages
749 * @address: starting address of pages to zap
750 * @size: number of bytes to zap
751 * @details: details of nonlinear truncation or shared cache invalidation
752 */
753 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
754 unsigned long size, struct zap_details *details)
755 {
756 struct mm_struct *mm = vma->vm_mm;
757 struct mmu_gather *tlb;
758 unsigned long end = address + size;
759 unsigned long nr_accounted = 0;
761 if (is_vm_hugetlb_page(vma)) {
762 zap_hugepage_range(vma, address, size);
763 return end;
764 }
766 lru_add_drain();
767 spin_lock(&mm->page_table_lock);
768 tlb = tlb_gather_mmu(mm, 0);
769 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
770 tlb_finish_mmu(tlb, address, end);
771 spin_unlock(&mm->page_table_lock);
772 return end;
773 }
775 /*
776 * Do a quick page-table lookup for a single page.
777 * mm->page_table_lock must be held.
778 */
779 static struct page *
780 __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
781 {
782 pgd_t *pgd;
783 pud_t *pud;
784 pmd_t *pmd;
785 pte_t *ptep, pte;
786 unsigned long pfn;
787 struct page *page;
789 page = follow_huge_addr(mm, address, write);
790 if (! IS_ERR(page))
791 return page;
793 pgd = pgd_offset(mm, address);
794 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
795 goto out;
797 pud = pud_offset(pgd, address);
798 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
799 goto out;
801 pmd = pmd_offset(pud, address);
802 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
803 goto out;
804 if (pmd_huge(*pmd))
805 return follow_huge_pmd(mm, address, pmd, write);
807 ptep = pte_offset_map(pmd, address);
808 if (!ptep)
809 goto out;
811 pte = *ptep;
812 pte_unmap(ptep);
813 if (pte_present(pte)) {
814 if (write && !pte_write(pte))
815 goto out;
816 if (read && !pte_read(pte))
817 goto out;
818 pfn = pte_pfn(pte);
819 if (pfn_valid(pfn)) {
820 page = pfn_to_page(pfn);
821 if (write && !pte_dirty(pte) && !PageDirty(page))
822 set_page_dirty(page);
823 mark_page_accessed(page);
824 return page;
825 }
826 }
828 out:
829 return NULL;
830 }
832 struct page *
833 follow_page(struct mm_struct *mm, unsigned long address, int write)
834 {
835 return __follow_page(mm, address, /*read*/0, write);
836 }
838 int
839 check_user_page_readable(struct mm_struct *mm, unsigned long address)
840 {
841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
842 }
844 EXPORT_SYMBOL(check_user_page_readable);
846 /*
847 * Given a physical address, is there a useful struct page pointing to
848 * it? This may become more complex in the future if we start dealing
849 * with IO-aperture pages for direct-IO.
850 */
852 static inline struct page *get_page_map(struct page *page)
853 {
854 if (!pfn_valid(page_to_pfn(page)))
855 return NULL;
856 return page;
857 }
860 static inline int
861 untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
862 unsigned long address)
863 {
864 pgd_t *pgd;
865 pud_t *pud;
866 pmd_t *pmd;
868 /* Check if the vma is for an anonymous mapping. */
869 if (vma->vm_ops && vma->vm_ops->nopage)
870 return 0;
872 /* Check if page directory entry exists. */
873 pgd = pgd_offset(mm, address);
874 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
875 return 1;
877 pud = pud_offset(pgd, address);
878 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
879 return 1;
881 /* Check if page middle directory entry exists. */
882 pmd = pmd_offset(pud, address);
883 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
884 return 1;
886 /* There is a pte slot for 'address' in 'mm'. */
887 return 0;
888 }
891 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
892 unsigned long start, int len, int write, int force,
893 struct page **pages, struct vm_area_struct **vmas)
894 {
895 int i;
896 unsigned int flags;
898 /*
899 * Require read or write permissions.
900 * If 'force' is set, we only require the "MAY" flags.
901 */
902 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
903 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
904 i = 0;
906 do {
907 struct vm_area_struct * vma;
909 vma = find_extend_vma(mm, start);
910 if (!vma && in_gate_area(tsk, start)) {
911 unsigned long pg = start & PAGE_MASK;
912 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
913 pgd_t *pgd;
914 pud_t *pud;
915 pmd_t *pmd;
916 pte_t *pte;
917 if (write) /* user gate pages are read-only */
918 return i ? : -EFAULT;
919 if (pg > TASK_SIZE)
920 pgd = pgd_offset_k(pg);
921 else
922 pgd = pgd_offset_gate(mm, pg);
923 BUG_ON(pgd_none(*pgd));
924 pud = pud_offset(pgd, pg);
925 BUG_ON(pud_none(*pud));
926 pmd = pmd_offset(pud, pg);
927 BUG_ON(pmd_none(*pmd));
928 pte = pte_offset_map(pmd, pg);
929 BUG_ON(pte_none(*pte));
930 if (pages) {
931 pages[i] = pte_page(*pte);
932 get_page(pages[i]);
933 }
934 pte_unmap(pte);
935 if (vmas)
936 vmas[i] = gate_vma;
937 i++;
938 start += PAGE_SIZE;
939 len--;
940 continue;
941 }
943 if (vma && (vma->vm_flags & VM_FOREIGN))
944 {
945 struct page **map = vma->vm_private_data;
946 int offset = (start - vma->vm_start) >> PAGE_SHIFT;
948 if (map[offset] != NULL) {
949 if (pages) {
950 pages[i] = map[offset];
951 }
952 if (vmas)
953 vmas[i] = vma;
954 i++;
955 start += PAGE_SIZE;
956 len--;
957 printk(KERN_ALERT "HIT 0x%lx\n", start);
958 continue;
959 }
960 else printk(KERN_ALERT "MISS 0x%lx\n", start);
961 }
963 if (!vma || (vma->vm_flags & VM_IO)
964 || !(flags & vma->vm_flags))
965 return i ? : -EFAULT;
967 if (is_vm_hugetlb_page(vma)) {
968 i = follow_hugetlb_page(mm, vma, pages, vmas,
969 &start, &len, i);
970 continue;
971 }
972 spin_lock(&mm->page_table_lock);
973 do {
974 struct page *map;
975 int lookup_write = write;
977 cond_resched_lock(&mm->page_table_lock);
978 while (!(map = follow_page(mm, start, lookup_write))) {
979 /*
980 * Shortcut for anonymous pages. We don't want
981 * to force the creation of pages tables for
982 * insanly big anonymously mapped areas that
983 * nobody touched so far. This is important
984 * for doing a core dump for these mappings.
985 */
986 if (!lookup_write &&
987 untouched_anonymous_page(mm,vma,start)) {
988 map = ZERO_PAGE(start);
989 break;
990 }
991 spin_unlock(&mm->page_table_lock);
992 switch (handle_mm_fault(mm,vma,start,write)) {
993 case VM_FAULT_MINOR:
994 tsk->min_flt++;
995 break;
996 case VM_FAULT_MAJOR:
997 tsk->maj_flt++;
998 break;
999 case VM_FAULT_SIGBUS:
1000 return i ? i : -EFAULT;
1001 case VM_FAULT_OOM:
1002 return i ? i : -ENOMEM;
1003 default:
1004 BUG();
1006 /*
1007 * Now that we have performed a write fault
1008 * and surely no longer have a shared page we
1009 * shouldn't write, we shouldn't ignore an
1010 * unwritable page in the page table if
1011 * we are forcing write access.
1012 */
1013 lookup_write = write && !force;
1014 spin_lock(&mm->page_table_lock);
1016 if (pages) {
1017 pages[i] = get_page_map(map);
1018 if (!pages[i]) {
1019 spin_unlock(&mm->page_table_lock);
1020 while (i--)
1021 page_cache_release(pages[i]);
1022 i = -EFAULT;
1023 goto out;
1025 flush_dcache_page(pages[i]);
1026 if (!PageReserved(pages[i]))
1027 page_cache_get(pages[i]);
1029 if (vmas)
1030 vmas[i] = vma;
1031 i++;
1032 start += PAGE_SIZE;
1033 len--;
1034 } while(len && start < vma->vm_end);
1035 spin_unlock(&mm->page_table_lock);
1036 } while(len);
1037 out:
1038 return i;
1041 EXPORT_SYMBOL(get_user_pages);
1043 static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1044 unsigned long addr, unsigned long end, pgprot_t prot)
1046 pte_t *pte;
1048 pte = pte_alloc_map(mm, pmd, addr);
1049 if (!pte)
1050 return -ENOMEM;
1051 do {
1052 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
1053 BUG_ON(!pte_none(*pte));
1054 set_pte_at(mm, addr, pte, zero_pte);
1055 } while (pte++, addr += PAGE_SIZE, addr != end);
1056 pte_unmap(pte - 1);
1057 return 0;
1060 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1061 unsigned long addr, unsigned long end, pgprot_t prot)
1063 pmd_t *pmd;
1064 unsigned long next;
1066 pmd = pmd_alloc(mm, pud, addr);
1067 if (!pmd)
1068 return -ENOMEM;
1069 do {
1070 next = pmd_addr_end(addr, end);
1071 if (zeromap_pte_range(mm, pmd, addr, next, prot))
1072 return -ENOMEM;
1073 } while (pmd++, addr = next, addr != end);
1074 return 0;
1077 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1078 unsigned long addr, unsigned long end, pgprot_t prot)
1080 pud_t *pud;
1081 unsigned long next;
1083 pud = pud_alloc(mm, pgd, addr);
1084 if (!pud)
1085 return -ENOMEM;
1086 do {
1087 next = pud_addr_end(addr, end);
1088 if (zeromap_pmd_range(mm, pud, addr, next, prot))
1089 return -ENOMEM;
1090 } while (pud++, addr = next, addr != end);
1091 return 0;
1094 int zeromap_page_range(struct vm_area_struct *vma,
1095 unsigned long addr, unsigned long size, pgprot_t prot)
1097 pgd_t *pgd;
1098 unsigned long next;
1099 unsigned long end = addr + size;
1100 struct mm_struct *mm = vma->vm_mm;
1101 int err;
1103 BUG_ON(addr >= end);
1104 pgd = pgd_offset(mm, addr);
1105 flush_cache_range(vma, addr, end);
1106 spin_lock(&mm->page_table_lock);
1107 do {
1108 next = pgd_addr_end(addr, end);
1109 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1110 if (err)
1111 break;
1112 } while (pgd++, addr = next, addr != end);
1113 spin_unlock(&mm->page_table_lock);
1114 return err;
1117 /*
1118 * maps a range of physical memory into the requested pages. the old
1119 * mappings are removed. any references to nonexistent pages results
1120 * in null mappings (currently treated as "copy-on-access")
1121 */
1122 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1123 unsigned long addr, unsigned long end,
1124 unsigned long pfn, pgprot_t prot)
1126 pte_t *pte;
1128 pte = pte_alloc_map(mm, pmd, addr);
1129 if (!pte)
1130 return -ENOMEM;
1131 do {
1132 BUG_ON(!pte_none(*pte));
1133 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
1134 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1135 pfn++;
1136 } while (pte++, addr += PAGE_SIZE, addr != end);
1137 pte_unmap(pte - 1);
1138 return 0;
1141 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1142 unsigned long addr, unsigned long end,
1143 unsigned long pfn, pgprot_t prot)
1145 pmd_t *pmd;
1146 unsigned long next;
1148 pfn -= addr >> PAGE_SHIFT;
1149 pmd = pmd_alloc(mm, pud, addr);
1150 if (!pmd)
1151 return -ENOMEM;
1152 do {
1153 next = pmd_addr_end(addr, end);
1154 if (remap_pte_range(mm, pmd, addr, next,
1155 pfn + (addr >> PAGE_SHIFT), prot))
1156 return -ENOMEM;
1157 } while (pmd++, addr = next, addr != end);
1158 return 0;
1161 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1162 unsigned long addr, unsigned long end,
1163 unsigned long pfn, pgprot_t prot)
1165 pud_t *pud;
1166 unsigned long next;
1168 pfn -= addr >> PAGE_SHIFT;
1169 pud = pud_alloc(mm, pgd, addr);
1170 if (!pud)
1171 return -ENOMEM;
1172 do {
1173 next = pud_addr_end(addr, end);
1174 if (remap_pmd_range(mm, pud, addr, next,
1175 pfn + (addr >> PAGE_SHIFT), prot))
1176 return -ENOMEM;
1177 } while (pud++, addr = next, addr != end);
1178 return 0;
1181 /* Note: this is only safe if the mm semaphore is held when called. */
1182 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1183 unsigned long pfn, unsigned long size, pgprot_t prot)
1185 pgd_t *pgd;
1186 unsigned long next;
1187 unsigned long end = addr + size;
1188 struct mm_struct *mm = vma->vm_mm;
1189 int err;
1191 /*
1192 * Physically remapped pages are special. Tell the
1193 * rest of the world about it:
1194 * VM_IO tells people not to look at these pages
1195 * (accesses can have side effects).
1196 * VM_RESERVED tells swapout not to try to touch
1197 * this region.
1198 */
1199 vma->vm_flags |= VM_IO | VM_RESERVED;
1201 BUG_ON(addr >= end);
1202 pfn -= addr >> PAGE_SHIFT;
1203 pgd = pgd_offset(mm, addr);
1204 flush_cache_range(vma, addr, end);
1205 spin_lock(&mm->page_table_lock);
1206 do {
1207 next = pgd_addr_end(addr, end);
1208 err = remap_pud_range(mm, pgd, addr, next,
1209 pfn + (addr >> PAGE_SHIFT), prot);
1210 if (err)
1211 break;
1212 } while (pgd++, addr = next, addr != end);
1213 spin_unlock(&mm->page_table_lock);
1214 return err;
1216 EXPORT_SYMBOL(remap_pfn_range);
1218 static inline int generic_pte_range(struct mm_struct *mm,
1219 pmd_t *pmd,
1220 unsigned long addr,
1221 unsigned long end,
1222 pte_fn_t fn, void *data)
1223 {
1224 pte_t *pte;
1225 int err;
1226 struct page *pte_page;
1228 pte = (mm == &init_mm) ?
1229 pte_alloc_kernel(mm, pmd, addr) :
1230 pte_alloc_map(mm, pmd, addr);
1231 if (!pte)
1232 return -ENOMEM;
1234 pte_page = pmd_page(*pmd);
1236 do {
1237 err = fn(pte, pte_page, addr, data);
1238 if (err)
1239 break;
1240 } while (pte++, addr += PAGE_SIZE, addr != end);
1242 if (mm != &init_mm)
1243 pte_unmap(pte-1);
1244 return err;
1245 }
1248 static inline int generic_pmd_range(struct mm_struct *mm,
1249 pud_t *pud,
1250 unsigned long addr,
1251 unsigned long end,
1252 pte_fn_t fn, void *data)
1253 {
1254 pmd_t *pmd;
1255 unsigned long next;
1256 int err;
1258 pmd = pmd_alloc(mm, pud, addr);
1259 if (!pmd)
1260 return -ENOMEM;
1261 do {
1262 next = pmd_addr_end(addr, end);
1263 err = generic_pte_range(mm, pmd, addr, next, fn, data);
1264 if (err)
1265 break;
1266 } while (pmd++, addr = next, addr != end);
1267 return err;
1268 }
1270 static inline int generic_pud_range(struct mm_struct *mm, pgd_t *pgd,
1271 unsigned long addr,
1272 unsigned long end,
1273 pte_fn_t fn, void *data)
1274 {
1275 pud_t *pud;
1276 unsigned long next;
1277 int err;
1279 pud = pud_alloc(mm, pgd, addr);
1280 if (!pud)
1281 return -ENOMEM;
1282 do {
1283 next = pud_addr_end(addr, end);
1284 err = generic_pmd_range(mm, pud, addr, next, fn, data);
1285 if (err)
1286 break;
1287 } while (pud++, addr = next, addr != end);
1288 return err;
1289 }
1291 /*
1292 * Scan a region of virtual memory, filling in page tables as necessary
1293 * and calling a provided function on each leaf page table.
1294 */
1295 int generic_page_range(struct mm_struct *mm, unsigned long addr,
1296 unsigned long size, pte_fn_t fn, void *data)
1297 {
1298 pgd_t *pgd;
1299 unsigned long next;
1300 unsigned long end = addr + size;
1301 int err;
1303 BUG_ON(addr >= end);
1304 pgd = pgd_offset(mm, addr);
1305 spin_lock(&mm->page_table_lock);
1306 do {
1307 next = pgd_addr_end(addr, end);
1308 err = generic_pud_range(mm, pgd, addr, next, fn, data);
1309 if (err)
1310 break;
1311 } while (pgd++, addr = next, addr != end);
1312 spin_unlock(&mm->page_table_lock);
1313 return err;
1314 }
1316 /*
1317 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1318 * servicing faults for write access. In the normal case, do always want
1319 * pte_mkwrite. But get_user_pages can cause write faults for mappings
1320 * that do not have writing enabled, when used by access_process_vm.
1321 */
1322 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1324 if (likely(vma->vm_flags & VM_WRITE))
1325 pte = pte_mkwrite(pte);
1326 return pte;
1329 /*
1330 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
1331 */
1332 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
1333 pte_t *page_table)
1335 pte_t entry;
1337 entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
1338 vma);
1339 ptep_establish(vma, address, page_table, entry);
1340 update_mmu_cache(vma, address, entry);
1341 lazy_mmu_prot_update(entry);
1344 /*
1345 * This routine handles present pages, when users try to write
1346 * to a shared page. It is done by copying the page to a new address
1347 * and decrementing the shared-page counter for the old page.
1349 * Goto-purists beware: the only reason for goto's here is that it results
1350 * in better assembly code.. The "default" path will see no jumps at all.
1352 * Note that this routine assumes that the protection checks have been
1353 * done by the caller (the low-level page fault routine in most cases).
1354 * Thus we can safely just mark it writable once we've done any necessary
1355 * COW.
1357 * We also mark the page dirty at this point even though the page will
1358 * change only once the write actually happens. This avoids a few races,
1359 * and potentially makes it more efficient.
1361 * We hold the mm semaphore and the page_table_lock on entry and exit
1362 * with the page_table_lock released.
1363 */
1364 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1365 unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
1367 struct page *old_page, *new_page;
1368 unsigned long pfn = pte_pfn(pte);
1369 pte_t entry;
1371 if (unlikely(!pfn_valid(pfn))) {
1372 /*
1373 * This should really halt the system so it can be debugged or
1374 * at least the kernel stops what it's doing before it corrupts
1375 * data, but for the moment just pretend this is OOM.
1376 */
1377 pte_unmap(page_table);
1378 printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
1379 address);
1380 spin_unlock(&mm->page_table_lock);
1381 return VM_FAULT_OOM;
1383 old_page = pfn_to_page(pfn);
1385 if (!TestSetPageLocked(old_page)) {
1386 int reuse = can_share_swap_page(old_page);
1387 unlock_page(old_page);
1388 if (reuse) {
1389 flush_cache_page(vma, address, pfn);
1390 entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
1391 vma);
1392 ptep_set_access_flags(vma, address, page_table, entry, 1);
1393 update_mmu_cache(vma, address, entry);
1394 lazy_mmu_prot_update(entry);
1395 pte_unmap(page_table);
1396 spin_unlock(&mm->page_table_lock);
1397 return VM_FAULT_MINOR;
1400 pte_unmap(page_table);
1402 /*
1403 * Ok, we need to copy. Oh, well..
1404 */
1405 if (!PageReserved(old_page))
1406 page_cache_get(old_page);
1407 spin_unlock(&mm->page_table_lock);
1409 if (unlikely(anon_vma_prepare(vma)))
1410 goto no_new_page;
1411 if (old_page == ZERO_PAGE(address)) {
1412 new_page = alloc_zeroed_user_highpage(vma, address);
1413 if (!new_page)
1414 goto no_new_page;
1415 } else {
1416 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1417 if (!new_page)
1418 goto no_new_page;
1419 copy_user_highpage(new_page, old_page, address);
1421 /*
1422 * Re-check the pte - we dropped the lock
1423 */
1424 spin_lock(&mm->page_table_lock);
1425 page_table = pte_offset_map(pmd, address);
1426 if (likely(pte_same(*page_table, pte))) {
1427 if (PageAnon(old_page))
1428 dec_mm_counter(mm, anon_rss);
1429 if (PageReserved(old_page))
1430 inc_mm_counter(mm, rss);
1431 else
1432 page_remove_rmap(old_page);
1433 flush_cache_page(vma, address, pfn);
1434 break_cow(vma, new_page, address, page_table);
1435 lru_cache_add_active(new_page);
1436 page_add_anon_rmap(new_page, vma, address);
1438 /* Free the old page.. */
1439 new_page = old_page;
1441 pte_unmap(page_table);
1442 page_cache_release(new_page);
1443 page_cache_release(old_page);
1444 spin_unlock(&mm->page_table_lock);
1445 return VM_FAULT_MINOR;
1447 no_new_page:
1448 page_cache_release(old_page);
1449 return VM_FAULT_OOM;
1452 /*
1453 * Helper functions for unmap_mapping_range().
1455 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
1457 * We have to restart searching the prio_tree whenever we drop the lock,
1458 * since the iterator is only valid while the lock is held, and anyway
1459 * a later vma might be split and reinserted earlier while lock dropped.
1461 * The list of nonlinear vmas could be handled more efficiently, using
1462 * a placeholder, but handle it in the same way until a need is shown.
1463 * It is important to search the prio_tree before nonlinear list: a vma
1464 * may become nonlinear and be shifted from prio_tree to nonlinear list
1465 * while the lock is dropped; but never shifted from list to prio_tree.
1467 * In order to make forward progress despite restarting the search,
1468 * vm_truncate_count is used to mark a vma as now dealt with, so we can
1469 * quickly skip it next time around. Since the prio_tree search only
1470 * shows us those vmas affected by unmapping the range in question, we
1471 * can't efficiently keep all vmas in step with mapping->truncate_count:
1472 * so instead reset them all whenever it wraps back to 0 (then go to 1).
1473 * mapping->truncate_count and vma->vm_truncate_count are protected by
1474 * i_mmap_lock.
1476 * In order to make forward progress despite repeatedly restarting some
1477 * large vma, note the restart_addr from unmap_vmas when it breaks out:
1478 * and restart from that address when we reach that vma again. It might
1479 * have been split or merged, shrunk or extended, but never shifted: so
1480 * restart_addr remains valid so long as it remains in the vma's range.
1481 * unmap_mapping_range forces truncate_count to leap over page-aligned
1482 * values so we can save vma's restart_addr in its truncate_count field.
1483 */
1484 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
1486 static void reset_vma_truncate_counts(struct address_space *mapping)
1488 struct vm_area_struct *vma;
1489 struct prio_tree_iter iter;
1491 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
1492 vma->vm_truncate_count = 0;
1493 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1494 vma->vm_truncate_count = 0;
1497 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1498 unsigned long start_addr, unsigned long end_addr,
1499 struct zap_details *details)
1501 unsigned long restart_addr;
1502 int need_break;
1504 again:
1505 restart_addr = vma->vm_truncate_count;
1506 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
1507 start_addr = restart_addr;
1508 if (start_addr >= end_addr) {
1509 /* Top of vma has been split off since last time */
1510 vma->vm_truncate_count = details->truncate_count;
1511 return 0;
1515 restart_addr = zap_page_range(vma, start_addr,
1516 end_addr - start_addr, details);
1518 /*
1519 * We cannot rely on the break test in unmap_vmas:
1520 * on the one hand, we don't want to restart our loop
1521 * just because that broke out for the page_table_lock;
1522 * on the other hand, it does no test when vma is small.
1523 */
1524 need_break = need_resched() ||
1525 need_lockbreak(details->i_mmap_lock);
1527 if (restart_addr >= end_addr) {
1528 /* We have now completed this vma: mark it so */
1529 vma->vm_truncate_count = details->truncate_count;
1530 if (!need_break)
1531 return 0;
1532 } else {
1533 /* Note restart_addr in vma's truncate_count field */
1534 vma->vm_truncate_count = restart_addr;
1535 if (!need_break)
1536 goto again;
1539 spin_unlock(details->i_mmap_lock);
1540 cond_resched();
1541 spin_lock(details->i_mmap_lock);
1542 return -EINTR;
1545 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
1546 struct zap_details *details)
1548 struct vm_area_struct *vma;
1549 struct prio_tree_iter iter;
1550 pgoff_t vba, vea, zba, zea;
1552 restart:
1553 vma_prio_tree_foreach(vma, &iter, root,
1554 details->first_index, details->last_index) {
1555 /* Skip quickly over those we have already dealt with */
1556 if (vma->vm_truncate_count == details->truncate_count)
1557 continue;
1559 vba = vma->vm_pgoff;
1560 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
1561 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
1562 zba = details->first_index;
1563 if (zba < vba)
1564 zba = vba;
1565 zea = details->last_index;
1566 if (zea > vea)
1567 zea = vea;
1569 if (unmap_mapping_range_vma(vma,
1570 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
1571 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
1572 details) < 0)
1573 goto restart;
1577 static inline void unmap_mapping_range_list(struct list_head *head,
1578 struct zap_details *details)
1580 struct vm_area_struct *vma;
1582 /*
1583 * In nonlinear VMAs there is no correspondence between virtual address
1584 * offset and file offset. So we must perform an exhaustive search
1585 * across *all* the pages in each nonlinear VMA, not just the pages
1586 * whose virtual address lies outside the file truncation point.
1587 */
1588 restart:
1589 list_for_each_entry(vma, head, shared.vm_set.list) {
1590 /* Skip quickly over those we have already dealt with */
1591 if (vma->vm_truncate_count == details->truncate_count)
1592 continue;
1593 details->nonlinear_vma = vma;
1594 if (unmap_mapping_range_vma(vma, vma->vm_start,
1595 vma->vm_end, details) < 0)
1596 goto restart;
1600 /**
1601 * unmap_mapping_range - unmap the portion of all mmaps
1602 * in the specified address_space corresponding to the specified
1603 * page range in the underlying file.
1604 * @address_space: the address space containing mmaps to be unmapped.
1605 * @holebegin: byte in first page to unmap, relative to the start of
1606 * the underlying file. This will be rounded down to a PAGE_SIZE
1607 * boundary. Note that this is different from vmtruncate(), which
1608 * must keep the partial page. In contrast, we must get rid of
1609 * partial pages.
1610 * @holelen: size of prospective hole in bytes. This will be rounded
1611 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
1612 * end of the file.
1613 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
1614 * but 0 when invalidating pagecache, don't throw away private data.
1615 */
1616 void unmap_mapping_range(struct address_space *mapping,
1617 loff_t const holebegin, loff_t const holelen, int even_cows)
1619 struct zap_details details;
1620 pgoff_t hba = holebegin >> PAGE_SHIFT;
1621 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1623 /* Check for overflow. */
1624 if (sizeof(holelen) > sizeof(hlen)) {
1625 long long holeend =
1626 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1627 if (holeend & ~(long long)ULONG_MAX)
1628 hlen = ULONG_MAX - hba + 1;
1631 details.check_mapping = even_cows? NULL: mapping;
1632 details.nonlinear_vma = NULL;
1633 details.first_index = hba;
1634 details.last_index = hba + hlen - 1;
1635 if (details.last_index < details.first_index)
1636 details.last_index = ULONG_MAX;
1637 details.i_mmap_lock = &mapping->i_mmap_lock;
1639 spin_lock(&mapping->i_mmap_lock);
1641 /* serialize i_size write against truncate_count write */
1642 smp_wmb();
1643 /* Protect against page faults, and endless unmapping loops */
1644 mapping->truncate_count++;
1645 /*
1646 * For archs where spin_lock has inclusive semantics like ia64
1647 * this smp_mb() will prevent to read pagetable contents
1648 * before the truncate_count increment is visible to
1649 * other cpus.
1650 */
1651 smp_mb();
1652 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1653 if (mapping->truncate_count == 0)
1654 reset_vma_truncate_counts(mapping);
1655 mapping->truncate_count++;
1657 details.truncate_count = mapping->truncate_count;
1659 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
1660 unmap_mapping_range_tree(&mapping->i_mmap, &details);
1661 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
1662 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
1663 spin_unlock(&mapping->i_mmap_lock);
1665 EXPORT_SYMBOL(unmap_mapping_range);
1667 /*
1668 * Handle all mappings that got truncated by a "truncate()"
1669 * system call.
1671 * NOTE! We have to be ready to update the memory sharing
1672 * between the file and the memory map for a potential last
1673 * incomplete page. Ugly, but necessary.
1674 */
1675 int vmtruncate(struct inode * inode, loff_t offset)
1677 struct address_space *mapping = inode->i_mapping;
1678 unsigned long limit;
1680 if (inode->i_size < offset)
1681 goto do_expand;
1682 /*
1683 * truncation of in-use swapfiles is disallowed - it would cause
1684 * subsequent swapout to scribble on the now-freed blocks.
1685 */
1686 if (IS_SWAPFILE(inode))
1687 goto out_busy;
1688 i_size_write(inode, offset);
1689 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1690 truncate_inode_pages(mapping, offset);
1691 goto out_truncate;
1693 do_expand:
1694 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1695 if (limit != RLIM_INFINITY && offset > limit)
1696 goto out_sig;
1697 if (offset > inode->i_sb->s_maxbytes)
1698 goto out_big;
1699 i_size_write(inode, offset);
1701 out_truncate:
1702 if (inode->i_op && inode->i_op->truncate)
1703 inode->i_op->truncate(inode);
1704 return 0;
1705 out_sig:
1706 send_sig(SIGXFSZ, current, 0);
1707 out_big:
1708 return -EFBIG;
1709 out_busy:
1710 return -ETXTBSY;
1713 EXPORT_SYMBOL(vmtruncate);
1715 /*
1716 * Primitive swap readahead code. We simply read an aligned block of
1717 * (1 << page_cluster) entries in the swap area. This method is chosen
1718 * because it doesn't cost us any seek time. We also make sure to queue
1719 * the 'original' request together with the readahead ones...
1721 * This has been extended to use the NUMA policies from the mm triggering
1722 * the readahead.
1724 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
1725 */
1726 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
1728 #ifdef CONFIG_NUMA
1729 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
1730 #endif
1731 int i, num;
1732 struct page *new_page;
1733 unsigned long offset;
1735 /*
1736 * Get the number of handles we should do readahead io to.
1737 */
1738 num = valid_swaphandles(entry, &offset);
1739 for (i = 0; i < num; offset++, i++) {
1740 /* Ok, do the async read-ahead now */
1741 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
1742 offset), vma, addr);
1743 if (!new_page)
1744 break;
1745 page_cache_release(new_page);
1746 #ifdef CONFIG_NUMA
1747 /*
1748 * Find the next applicable VMA for the NUMA policy.
1749 */
1750 addr += PAGE_SIZE;
1751 if (addr == 0)
1752 vma = NULL;
1753 if (vma) {
1754 if (addr >= vma->vm_end) {
1755 vma = next_vma;
1756 next_vma = vma ? vma->vm_next : NULL;
1758 if (vma && addr < vma->vm_start)
1759 vma = NULL;
1760 } else {
1761 if (next_vma && addr >= next_vma->vm_start) {
1762 vma = next_vma;
1763 next_vma = vma->vm_next;
1766 #endif
1768 lru_add_drain(); /* Push any new pages onto the LRU now */
1771 /*
1772 * We hold the mm semaphore and the page_table_lock on entry and
1773 * should release the pagetable lock on exit..
1774 */
1775 static int do_swap_page(struct mm_struct * mm,
1776 struct vm_area_struct * vma, unsigned long address,
1777 pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
1779 struct page *page;
1780 swp_entry_t entry = pte_to_swp_entry(orig_pte);
1781 pte_t pte;
1782 int ret = VM_FAULT_MINOR;
1784 pte_unmap(page_table);
1785 spin_unlock(&mm->page_table_lock);
1786 page = lookup_swap_cache(entry);
1787 if (!page) {
1788 swapin_readahead(entry, address, vma);
1789 page = read_swap_cache_async(entry, vma, address);
1790 if (!page) {
1791 /*
1792 * Back out if somebody else faulted in this pte while
1793 * we released the page table lock.
1794 */
1795 spin_lock(&mm->page_table_lock);
1796 page_table = pte_offset_map(pmd, address);
1797 if (likely(pte_same(*page_table, orig_pte)))
1798 ret = VM_FAULT_OOM;
1799 else
1800 ret = VM_FAULT_MINOR;
1801 pte_unmap(page_table);
1802 spin_unlock(&mm->page_table_lock);
1803 goto out;
1806 /* Had to read the page from swap area: Major fault */
1807 ret = VM_FAULT_MAJOR;
1808 inc_page_state(pgmajfault);
1809 grab_swap_token();
1812 mark_page_accessed(page);
1813 lock_page(page);
1815 /*
1816 * Back out if somebody else faulted in this pte while we
1817 * released the page table lock.
1818 */
1819 spin_lock(&mm->page_table_lock);
1820 page_table = pte_offset_map(pmd, address);
1821 if (unlikely(!pte_same(*page_table, orig_pte))) {
1822 ret = VM_FAULT_MINOR;
1823 goto out_nomap;
1826 if (unlikely(!PageUptodate(page))) {
1827 ret = VM_FAULT_SIGBUS;
1828 goto out_nomap;
1831 /* The page isn't present yet, go ahead with the fault. */
1833 swap_free(entry);
1834 if (vm_swap_full())
1835 remove_exclusive_swap_page(page);
1837 inc_mm_counter(mm, rss);
1838 pte = mk_pte(page, vma->vm_page_prot);
1839 if (write_access && can_share_swap_page(page)) {
1840 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
1841 write_access = 0;
1843 unlock_page(page);
1845 flush_icache_page(vma, page);
1846 set_pte_at(mm, address, page_table, pte);
1847 page_add_anon_rmap(page, vma, address);
1849 if (write_access) {
1850 if (do_wp_page(mm, vma, address,
1851 page_table, pmd, pte) == VM_FAULT_OOM)
1852 ret = VM_FAULT_OOM;
1853 goto out;
1856 /* No need to invalidate - it was non-present before */
1857 update_mmu_cache(vma, address, pte);
1858 lazy_mmu_prot_update(pte);
1859 pte_unmap(page_table);
1860 spin_unlock(&mm->page_table_lock);
1861 out:
1862 return ret;
1863 out_nomap:
1864 pte_unmap(page_table);
1865 spin_unlock(&mm->page_table_lock);
1866 unlock_page(page);
1867 page_cache_release(page);
1868 goto out;
1871 /*
1872 * We are called with the MM semaphore and page_table_lock
1873 * spinlock held to protect against concurrent faults in
1874 * multithreaded programs.
1875 */
1876 static int
1877 do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1878 pte_t *page_table, pmd_t *pmd, int write_access,
1879 unsigned long addr)
1881 pte_t entry;
1882 struct page * page = ZERO_PAGE(addr);
1884 /* Read-only mapping of ZERO_PAGE. */
1885 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1887 /* ..except if it's a write access */
1888 if (write_access) {
1889 /* Allocate our own private page. */
1890 pte_unmap(page_table);
1891 spin_unlock(&mm->page_table_lock);
1893 if (unlikely(anon_vma_prepare(vma)))
1894 goto no_mem;
1895 page = alloc_zeroed_user_highpage(vma, addr);
1896 if (!page)
1897 goto no_mem;
1899 spin_lock(&mm->page_table_lock);
1900 page_table = pte_offset_map(pmd, addr);
1902 if (!pte_none(*page_table)) {
1903 pte_unmap(page_table);
1904 page_cache_release(page);
1905 spin_unlock(&mm->page_table_lock);
1906 goto out;
1908 inc_mm_counter(mm, rss);
1909 entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
1910 vma->vm_page_prot)),
1911 vma);
1912 lru_cache_add_active(page);
1913 SetPageReferenced(page);
1914 page_add_anon_rmap(page, vma, addr);
1917 set_pte_at(mm, addr, page_table, entry);
1918 pte_unmap(page_table);
1920 /* No need to invalidate - it was non-present before */
1921 update_mmu_cache(vma, addr, entry);
1922 lazy_mmu_prot_update(entry);
1923 spin_unlock(&mm->page_table_lock);
1924 out:
1925 return VM_FAULT_MINOR;
1926 no_mem:
1927 return VM_FAULT_OOM;
1930 /*
1931 * do_no_page() tries to create a new page mapping. It aggressively
1932 * tries to share with existing pages, but makes a separate copy if
1933 * the "write_access" parameter is true in order to avoid the next
1934 * page fault.
1936 * As this is called only for pages that do not currently exist, we
1937 * do not need to flush old virtual caches or the TLB.
1939 * This is called with the MM semaphore held and the page table
1940 * spinlock held. Exit with the spinlock released.
1941 */
1942 static int
1943 do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1944 unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
1945 {
1946 struct page * new_page;
1947 struct address_space *mapping = NULL;
1948 pte_t entry;
1949 unsigned int sequence = 0;
1950 int ret = VM_FAULT_MINOR;
1951 int anon = 0;
1953 if (!vma->vm_ops || !vma->vm_ops->nopage)
1954 return do_anonymous_page(mm, vma, page_table,
1955 pmd, write_access, address);
1956 pte_unmap(page_table);
1957 spin_unlock(&mm->page_table_lock);
1959 if (vma->vm_file) {
1960 mapping = vma->vm_file->f_mapping;
1961 sequence = mapping->truncate_count;
1962 smp_rmb(); /* serializes i_size against truncate_count */
1963 }
1964 retry:
1965 cond_resched();
1966 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
1967 /*
1968 * No smp_rmb is needed here as long as there's a full
1969 * spin_lock/unlock sequence inside the ->nopage callback
1970 * (for the pagecache lookup) that acts as an implicit
1971 * smp_mb() and prevents the i_size read to happen
1972 * after the next truncate_count read.
1973 */
1975 /* no page was available -- either SIGBUS or OOM */
1976 if (new_page == NOPAGE_SIGBUS)
1977 return VM_FAULT_SIGBUS;
1978 if (new_page == NOPAGE_OOM)
1979 return VM_FAULT_OOM;
1981 /*
1982 * Should we do an early C-O-W break?
1983 */
1984 if (write_access && !(vma->vm_flags & VM_SHARED)) {
1985 struct page *page;
1987 if (unlikely(anon_vma_prepare(vma)))
1988 goto oom;
1989 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1990 if (!page)
1991 goto oom;
1992 copy_user_highpage(page, new_page, address);
1993 page_cache_release(new_page);
1994 new_page = page;
1995 anon = 1;
1996 }
1998 spin_lock(&mm->page_table_lock);
1999 /*
2000 * For a file-backed vma, someone could have truncated or otherwise
2001 * invalidated this page. If unmap_mapping_range got called,
2002 * retry getting the page.
2003 */
2004 if (mapping && unlikely(sequence != mapping->truncate_count)) {
2005 sequence = mapping->truncate_count;
2006 spin_unlock(&mm->page_table_lock);
2007 page_cache_release(new_page);
2008 goto retry;
2009 }
2010 page_table = pte_offset_map(pmd, address);
2012 /*
2013 * This silly early PAGE_DIRTY setting removes a race
2014 * due to the bad i386 page protection. But it's valid
2015 * for other architectures too.
2017 * Note that if write_access is true, we either now have
2018 * an exclusive copy of the page, or this is a shared mapping,
2019 * so we can make it writable and dirty to avoid having to
2020 * handle that later.
2021 */
2022 /* Only go through if we didn't race with anybody else... */
2023 if (pte_none(*page_table)) {
2024 if (!PageReserved(new_page))
2025 inc_mm_counter(mm, rss);
2027 flush_icache_page(vma, new_page);
2028 entry = mk_pte(new_page, vma->vm_page_prot);
2029 if (write_access)
2030 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2031 set_pte_at(mm, address, page_table, entry);
2032 if (anon) {
2033 lru_cache_add_active(new_page);
2034 page_add_anon_rmap(new_page, vma, address);
2035 } else
2036 page_add_file_rmap(new_page);
2037 pte_unmap(page_table);
2038 } else {
2039 /* One of our sibling threads was faster, back out. */
2040 pte_unmap(page_table);
2041 page_cache_release(new_page);
2042 spin_unlock(&mm->page_table_lock);
2043 goto out;
2044 }
2046 /* no need to invalidate: a not-present page shouldn't be cached */
2047 update_mmu_cache(vma, address, entry);
2048 lazy_mmu_prot_update(entry);
2049 spin_unlock(&mm->page_table_lock);
2050 out:
2051 return ret;
2052 oom:
2053 page_cache_release(new_page);
2054 ret = VM_FAULT_OOM;
2055 goto out;
2056 }
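The early C-O-W branch above is what gives MAP_PRIVATE file mappings copy-on-write semantics when the very first touch of a page is a write. A minimal user-space sketch (not part of this file; "somefile" is a placeholder and error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("somefile", O_RDONLY);
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	               MAP_PRIVATE, fd, 0);
	if (fd < 0 || p == MAP_FAILED)
		return 1;

	/* First touch is a write: do_no_page() runs with write_access set,
	 * copies the ->nopage page into a private anonymous page and maps
	 * that copy writable.  The file on disk is never modified. */
	p[0] = 'x';

	munmap(p, 4096);
	close(fd);
	return 0;
}

A read-only first touch would instead map the page cache page directly and leave the copy to do_wp_page() on a later write.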
2058 /*
2059 * Fault of a previously existing named mapping. Repopulate the pte
2060 * from the encoded file_pte if possible. This enables swappable
2061 * nonlinear vmas.
2062 */
2063 static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
2064 unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
2065 {
2066 unsigned long pgoff;
2067 int err;
2069 BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
2070 /*
2071 * Fall back to the linear mapping if the fs does not support
2072 * ->populate:
2073 */
2074 if (!vma->vm_ops || !vma->vm_ops->populate ||
2075 (write_access && !(vma->vm_flags & VM_SHARED))) {
2076 pte_clear(mm, address, pte);
2077 return do_no_page(mm, vma, address, write_access, pte, pmd);
2078 }
2080 pgoff = pte_to_pgoff(*pte);
2082 pte_unmap(pte);
2083 spin_unlock(&mm->page_table_lock);
2085 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
2086 if (err == -ENOMEM)
2087 return VM_FAULT_OOM;
2088 if (err)
2089 return VM_FAULT_SIGBUS;
2090 return VM_FAULT_MAJOR;
2091 }
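For context, the pte_file entries handled here are created by the remap_file_pages(2) system call, which rearranges which file pages back which virtual pages of an existing MAP_SHARED mapping; when such a page is later unmapped under memory pressure, its file offset is encoded in the pte, and do_file_page() re-instantiates it through ->populate. A minimal user-space sketch ("data" is a placeholder file at least four pages long; error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	int fd = open("data", O_RDWR);
	char *base = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
	                  MAP_SHARED, fd, 0);
	if (fd < 0 || base == MAP_FAILED)
		return 1;

	/* Make the first virtual page show file page 3 instead of file page 0.
	 * The kernel records pgoff 3 in the pte for that page, which is what
	 * pte_to_pgoff() above later recovers. */
	if (remap_file_pages(base, pg, 0, 3, 0) != 0)
		return 1;

	munmap(base, 4 * pg);
	close(fd);
	return 0;
}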
2093 /*
2094 * These routines also need to handle stuff like marking pages dirty
2095 * and/or accessed for architectures that don't do it in hardware (most
2096 * RISC architectures). The early dirtying is also good on the i386.
2098 * There is also a hook called "update_mmu_cache()" that architectures
2099 * with external mmu caches can use to update those (ie the Sparc or
2100 * PowerPC hashed page tables that act as extended TLBs).
2102 * Note the "page_table_lock". It is to protect against kswapd removing
2103 * pages from under us. Note that kswapd only ever _removes_ pages, never
2104 * adds them. As such, once we have noticed that the page is not present,
2105 * we can drop the lock early.
2107 * The adding of pages is protected by the MM semaphore (which we hold),
2108 * so we don't need to worry about a page suddenly being added into
2109 * our VM.
2111 * We enter with the pagetable spinlock held, we are supposed to
2112 * release it when done.
2113 */
2114 static inline int handle_pte_fault(struct mm_struct *mm,
2115 struct vm_area_struct * vma, unsigned long address,
2116 int write_access, pte_t *pte, pmd_t *pmd)
2117 {
2118 pte_t entry;
2120 entry = *pte;
2121 if (!pte_present(entry)) {
2122 /*
2123 * If it truly wasn't present, we know that kswapd
2124 * and the PTE updates will not touch it later. So
2125 * drop the lock.
2126 */
2127 if (pte_none(entry))
2128 return do_no_page(mm, vma, address, write_access, pte, pmd);
2129 if (pte_file(entry))
2130 return do_file_page(mm, vma, address, write_access, pte, pmd);
2131 return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
2132 }
2134 if (write_access) {
2135 if (!pte_write(entry))
2136 return do_wp_page(mm, vma, address, pte, pmd, entry);
2138 entry = pte_mkdirty(entry);
2139 }
2140 entry = pte_mkyoung(entry);
2141 ptep_set_access_flags(vma, address, pte, entry, write_access);
2142 update_mmu_cache(vma, address, entry);
2143 lazy_mmu_prot_update(entry);
2144 pte_unmap(pte);
2145 spin_unlock(&mm->page_table_lock);
2146 return VM_FAULT_MINOR;
2147 }
2149 /*
2150 * By the time we get here, we already hold the mm semaphore
2151 */
2152 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2153 unsigned long address, int write_access)
2154 {
2155 pgd_t *pgd;
2156 pud_t *pud;
2157 pmd_t *pmd;
2158 pte_t *pte;
2160 __set_current_state(TASK_RUNNING);
2162 inc_page_state(pgfault);
2164 if (is_vm_hugetlb_page(vma))
2165 return VM_FAULT_SIGBUS; /* mapping truncation does this. */
2167 /*
2168 * We need the page table lock to synchronize with kswapd
2169 * and the SMP-safe atomic PTE updates.
2170 */
2171 pgd = pgd_offset(mm, address);
2172 spin_lock(&mm->page_table_lock);
2174 pud = pud_alloc(mm, pgd, address);
2175 if (!pud)
2176 goto oom;
2178 pmd = pmd_alloc(mm, pud, address);
2179 if (!pmd)
2180 goto oom;
2182 pte = pte_alloc_map(mm, pmd, address);
2183 if (!pte)
2184 goto oom;
2186 return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
2188 oom:
2189 spin_unlock(&mm->page_table_lock);
2190 return VM_FAULT_OOM;
2191 }
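For reference, this is roughly how the architecture fault handlers of this era consume handle_mm_fault(); the fragment below is condensed from the i386 do_page_fault() pattern, so mm, vma, tsk, write and the two labels belong to that caller, not to this file.

	/* inside an arch do_page_fault(), after validating the address and
	 * finding vma under down_read(&mm->mmap_sem): */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;          /* satisfied without I/O */
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;          /* had to read from swap or a file */
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	case VM_FAULT_OOM:
		goto out_of_memory;
	default:
		BUG();
	}
	up_read(&mm->mmap_sem);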
2193 #ifndef __PAGETABLE_PUD_FOLDED
2194 /*
2195 * Allocate page upper directory.
2197 * We've already handled the fast-path in-line, and we own the
2198 * page table lock.
2199 */
2200 pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2201 {
2202 pud_t *new;
2204 spin_unlock(&mm->page_table_lock);
2205 new = pud_alloc_one(mm, address);
2206 spin_lock(&mm->page_table_lock);
2207 if (!new)
2208 return NULL;
2210 /*
2211 * Because we dropped the lock, we should re-check the
2212 * entry, as somebody else could have populated it..
2213 */
2214 if (pgd_present(*pgd)) {
2215 pud_free(new);
2216 goto out;
2217 }
2218 pgd_populate(mm, pgd, new);
2219 out:
2220 return pud_offset(pgd, address);
2221 }
2222 #endif /* __PAGETABLE_PUD_FOLDED */
2224 #ifndef __PAGETABLE_PMD_FOLDED
2225 /*
2226 * Allocate page middle directory.
2228 * We've already handled the fast-path in-line, and we own the
2229 * page table lock.
2230 */
2231 pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2232 {
2233 pmd_t *new;
2235 spin_unlock(&mm->page_table_lock);
2236 new = pmd_alloc_one(mm, address);
2237 spin_lock(&mm->page_table_lock);
2238 if (!new)
2239 return NULL;
2241 /*
2242 * Because we dropped the lock, we should re-check the
2243 * entry, as somebody else could have populated it..
2244 */
2245 #ifndef __ARCH_HAS_4LEVEL_HACK
2246 if (pud_present(*pud)) {
2247 pmd_free(new);
2248 goto out;
2249 }
2250 pud_populate(mm, pud, new);
2251 #else
2252 if (pgd_present(*pud)) {
2253 pmd_free(new);
2254 goto out;
2255 }
2256 pgd_populate(mm, pud, new);
2257 #endif /* __ARCH_HAS_4LEVEL_HACK */
2259 out:
2260 return pmd_offset(pud, address);
2261 }
2262 #endif /* __PAGETABLE_PMD_FOLDED */
2264 int make_pages_present(unsigned long addr, unsigned long end)
2265 {
2266 int ret, len, write;
2267 struct vm_area_struct * vma;
2269 vma = find_vma(current->mm, addr);
2270 if (!vma)
2271 return -1;
2272 write = (vma->vm_flags & VM_WRITE) != 0;
2273 if (addr >= end)
2274 BUG();
2275 if (end > vma->vm_end)
2276 BUG();
2277 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
2278 ret = get_user_pages(current, current->mm, addr,
2279 len, write, 0, NULL, NULL);
2280 if (ret < 0)
2281 return ret;
2282 return ret == len ? 0 : -1;
2283 }
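make_pages_present() is how the mlock() path pre-faults a region once its vma carries VM_LOCKED. The fragment below is loosely modelled on mlock_fixup() in mm/mlock.c and is illustrative only; start, end, newflags and ret come from that caller.

	/* Publish the new flags, then fault every page in now; VM_LOCKED
	 * keeps the pages from being reclaimed afterwards. */
	vma->vm_flags = newflags;
	if (newflags & VM_LOCKED) {
		if (!(newflags & VM_IO))
			ret = make_pages_present(start, end);
	}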
2285 /*
2286 * Map a vmalloc()-space virtual address to the physical page.
2287 */
2288 struct page * vmalloc_to_page(void * vmalloc_addr)
2289 {
2290 unsigned long addr = (unsigned long) vmalloc_addr;
2291 struct page *page = NULL;
2292 pgd_t *pgd = pgd_offset_k(addr);
2293 pud_t *pud;
2294 pmd_t *pmd;
2295 pte_t *ptep, pte;
2297 if (!pgd_none(*pgd)) {
2298 pud = pud_offset(pgd, addr);
2299 if (!pud_none(*pud)) {
2300 pmd = pmd_offset(pud, addr);
2301 if (!pmd_none(*pmd)) {
2302 ptep = pte_offset_map(pmd, addr);
2303 pte = *ptep;
2304 if (pte_present(pte))
2305 page = pte_page(pte);
2306 pte_unmap(ptep);
2307 }
2308 }
2309 }
2310 return page;
2311 }
2313 EXPORT_SYMBOL(vmalloc_to_page);
2315 /*
2316 * Map a vmalloc()-space virtual address to the physical page frame number.
2317 */
2318 unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2319 {
2320 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2321 }
2323 EXPORT_SYMBOL(vmalloc_to_pfn);
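A common use of vmalloc_to_page() outside this file is implementing mmap() for a vmalloc()ed driver buffer: the memory is not physically contiguous, so instead of remapping it wholesale the driver resolves each fault with a ->nopage method, which do_no_page() above then installs. A hedged sketch follows; vbuf, vbuf_size and the function names are hypothetical.

static void *vbuf;                /* hypothetical: allocated with vmalloc() */
static size_t vbuf_size;

static struct page *vbuf_nopage(struct vm_area_struct *vma,
                                unsigned long address, int *type)
{
	unsigned long off = (address - vma->vm_start)
	                    + (vma->vm_pgoff << PAGE_SHIFT);
	struct page *page;

	if (off >= vbuf_size)
		return NOPAGE_SIGBUS;

	page = vmalloc_to_page((char *)vbuf + off);
	get_page(page);                   /* do_no_page() expects a held reference */
	if (type)
		*type = VM_FAULT_MINOR;
	return page;
}

static struct vm_operations_struct vbuf_vm_ops = {
	.nopage = vbuf_nopage,
};

static int vbuf_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_ops = &vbuf_vm_ops;
	return 0;
}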
2325 /*
2326 * update_mem_hiwater
2327 * - update per process rss and vm high water data
2328 */
2329 void update_mem_hiwater(struct task_struct *tsk)
2330 {
2331 if (tsk->mm) {
2332 unsigned long rss = get_mm_counter(tsk->mm, rss);
2334 if (tsk->mm->hiwater_rss < rss)
2335 tsk->mm->hiwater_rss = rss;
2336 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2337 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2338 }
2339 }
2341 #if !defined(__HAVE_ARCH_GATE_AREA)
2343 #if defined(AT_SYSINFO_EHDR)
2344 struct vm_area_struct gate_vma;
2346 static int __init gate_vma_init(void)
2347 {
2348 gate_vma.vm_mm = NULL;
2349 gate_vma.vm_start = FIXADDR_USER_START;
2350 gate_vma.vm_end = FIXADDR_USER_END;
2351 gate_vma.vm_page_prot = PAGE_READONLY;
2352 gate_vma.vm_flags = 0;
2353 return 0;
2354 }
2355 __initcall(gate_vma_init);
2356 #endif
2358 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2359 {
2360 #ifdef AT_SYSINFO_EHDR
2361 return &gate_vma;
2362 #else
2363 return NULL;
2364 #endif
2365 }
2367 int in_gate_area_no_task(unsigned long addr)
2368 {
2369 #ifdef AT_SYSINFO_EHDR
2370 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
2371 return 1;
2372 #endif
2373 return 0;
2374 }
2376 #endif /* __HAVE_ARCH_GATE_AREA */