ia64/linux-2.6.18-xen.hg

view mm/memory.c @ 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work on weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
parents ca1dd3c0daa9
children
1 /*
2 * linux/mm/memory.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */
7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus
10 */
12 /*
13 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
14 * pages started 02.12.91, seems to work. - Linus.
15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see.
19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */
23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */
31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de)
37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */
41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h>
43 #include <linux/hugetlb.h>
44 #include <linux/mman.h>
45 #include <linux/swap.h>
46 #include <linux/highmem.h>
47 #include <linux/pagemap.h>
48 #include <linux/rmap.h>
49 #include <linux/module.h>
50 #include <linux/delayacct.h>
51 #include <linux/init.h>
53 #include <asm/pgalloc.h>
54 #include <asm/uaccess.h>
55 #include <asm/tlb.h>
56 #include <asm/tlbflush.h>
57 #include <asm/pgtable.h>
59 #include <linux/swapops.h>
60 #include <linux/elf.h>
62 #ifndef CONFIG_NEED_MULTIPLE_NODES
63 /* use the per-pgdat data instead for discontigmem - mbligh */
64 unsigned long max_mapnr;
65 struct page *mem_map;
67 EXPORT_SYMBOL(max_mapnr);
68 EXPORT_SYMBOL(mem_map);
69 #endif
71 unsigned long num_physpages;
72 /*
73 * A number of key systems in x86 including ioremap() rely on the assumption
74 * that high_memory defines the upper bound on direct map memory, then end
75 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
76 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
77 * and ZONE_HIGHMEM.
78 */
79 void * high_memory;
80 unsigned long vmalloc_earlyreserve;
82 EXPORT_SYMBOL(num_physpages);
83 EXPORT_SYMBOL(high_memory);
84 EXPORT_SYMBOL(vmalloc_earlyreserve);
86 int randomize_va_space __read_mostly = 1;
88 static int __init disable_randmaps(char *s)
89 {
90 randomize_va_space = 0;
91 return 1;
92 }
93 __setup("norandmaps", disable_randmaps);
96 /*
97 * If a p?d_bad entry is found while walking page tables, report
98 * the error, before resetting entry to p?d_none. Usually (but
99 * very seldom) called out from the p?d_none_or_clear_bad macros.
100 */
102 void pgd_clear_bad(pgd_t *pgd)
103 {
104 pgd_ERROR(*pgd);
105 pgd_clear(pgd);
106 }
108 void pud_clear_bad(pud_t *pud)
109 {
110 pud_ERROR(*pud);
111 pud_clear(pud);
112 }
114 void pmd_clear_bad(pmd_t *pmd)
115 {
116 pmd_ERROR(*pmd);
117 pmd_clear(pmd);
118 }
120 /*
121 * Note: this doesn't free the actual pages themselves. That
122 * has been handled earlier when unmapping all the memory regions.
123 */
124 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
125 {
126 struct page *page = pmd_page(*pmd);
127 pmd_clear(pmd);
128 pte_lock_deinit(page);
129 pte_free_tlb(tlb, page);
130 dec_zone_page_state(page, NR_PAGETABLE);
131 tlb->mm->nr_ptes--;
132 }
134 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
135 unsigned long addr, unsigned long end,
136 unsigned long floor, unsigned long ceiling)
137 {
138 pmd_t *pmd;
139 unsigned long next;
140 unsigned long start;
142 start = addr;
143 pmd = pmd_offset(pud, addr);
144 do {
145 next = pmd_addr_end(addr, end);
146 if (pmd_none_or_clear_bad(pmd))
147 continue;
148 free_pte_range(tlb, pmd);
149 } while (pmd++, addr = next, addr != end);
151 start &= PUD_MASK;
152 if (start < floor)
153 return;
154 if (ceiling) {
155 ceiling &= PUD_MASK;
156 if (!ceiling)
157 return;
158 }
159 if (end - 1 > ceiling - 1)
160 return;
162 pmd = pmd_offset(pud, start);
163 pud_clear(pud);
164 pmd_free_tlb(tlb, pmd);
165 }
167 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
168 unsigned long addr, unsigned long end,
169 unsigned long floor, unsigned long ceiling)
170 {
171 pud_t *pud;
172 unsigned long next;
173 unsigned long start;
175 start = addr;
176 pud = pud_offset(pgd, addr);
177 do {
178 next = pud_addr_end(addr, end);
179 if (pud_none_or_clear_bad(pud))
180 continue;
181 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
182 } while (pud++, addr = next, addr != end);
184 start &= PGDIR_MASK;
185 if (start < floor)
186 return;
187 if (ceiling) {
188 ceiling &= PGDIR_MASK;
189 if (!ceiling)
190 return;
191 }
192 if (end - 1 > ceiling - 1)
193 return;
195 pud = pud_offset(pgd, start);
196 pgd_clear(pgd);
197 pud_free_tlb(tlb, pud);
198 }
200 /*
201 * This function frees user-level page tables of a process.
202 *
203 * Must be called with pagetable lock held.
204 */
205 void free_pgd_range(struct mmu_gather **tlb,
206 unsigned long addr, unsigned long end,
207 unsigned long floor, unsigned long ceiling)
208 {
209 pgd_t *pgd;
210 unsigned long next;
211 unsigned long start;
213 /*
214 * The next few lines have given us lots of grief...
215 *
216 * Why are we testing PMD* at this top level? Because often
217 * there will be no work to do at all, and we'd prefer not to
218 * go all the way down to the bottom just to discover that.
219 *
220 * Why all these "- 1"s? Because 0 represents both the bottom
221 * of the address space and the top of it (using -1 for the
222 * top wouldn't help much: the masks would do the wrong thing).
223 * The rule is that addr 0 and floor 0 refer to the bottom of
224 * the address space, but end 0 and ceiling 0 refer to the top
225 * Comparisons need to use "end - 1" and "ceiling - 1" (though
226 * that end 0 case should be mythical).
227 *
228 * Wherever addr is brought up or ceiling brought down, we must
229 * be careful to reject "the opposite 0" before it confuses the
230 * subsequent tests. But what about where end is brought down
231 * by PMD_SIZE below? no, end can't go down to 0 there.
232 *
233 * Whereas we round start (addr) and ceiling down, by different
234 * masks at different levels, in order to test whether a table
235 * now has no other vmas using it, so can be freed, we don't
236 * bother to round floor or end up - the tests don't need that.
237 */
239 addr &= PMD_MASK;
240 if (addr < floor) {
241 addr += PMD_SIZE;
242 if (!addr)
243 return;
244 }
245 if (ceiling) {
246 ceiling &= PMD_MASK;
247 if (!ceiling)
248 return;
249 }
250 if (end - 1 > ceiling - 1)
251 end -= PMD_SIZE;
252 if (addr > end - 1)
253 return;
255 start = addr;
256 pgd = pgd_offset((*tlb)->mm, addr);
257 do {
258 next = pgd_addr_end(addr, end);
259 if (pgd_none_or_clear_bad(pgd))
260 continue;
261 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
262 } while (pgd++, addr = next, addr != end);
264 if (!(*tlb)->fullmm)
265 flush_tlb_pgtables((*tlb)->mm, start, end);
266 }
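/*
 * Illustrative note (not part of the original memory.c): the "- 1"
 * comparisons above rely on unsigned wrap-around.  For example, with
 * ceiling == 0 (meaning "top of address space"), ceiling - 1 is
 * ULONG_MAX, so a test such as
 *
 *	if (end - 1 > ceiling - 1)
 *		return;
 *
 * can never trigger: an unbounded ceiling never causes an early return,
 * while a non-zero ceiling is first rounded down by that level's mask
 * before the same comparison is applied.
 */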
268 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
269 unsigned long floor, unsigned long ceiling)
270 {
271 while (vma) {
272 struct vm_area_struct *next = vma->vm_next;
273 unsigned long addr = vma->vm_start;
275 /*
276 * Hide vma from rmap and vmtruncate before freeing pgtables
277 */
278 anon_vma_unlink(vma);
279 unlink_file_vma(vma);
281 if (is_vm_hugetlb_page(vma)) {
282 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
283 floor, next? next->vm_start: ceiling);
284 } else {
285 /*
286 * Optimization: gather nearby vmas into one call down
287 */
288 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
289 && !is_vm_hugetlb_page(next)) {
290 vma = next;
291 next = vma->vm_next;
292 anon_vma_unlink(vma);
293 unlink_file_vma(vma);
294 }
295 free_pgd_range(tlb, addr, vma->vm_end,
296 floor, next? next->vm_start: ceiling);
297 }
298 vma = next;
299 }
300 }
302 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
303 {
304 struct page *new = pte_alloc_one(mm, address);
305 if (!new)
306 return -ENOMEM;
308 pte_lock_init(new);
309 spin_lock(&mm->page_table_lock);
310 if (pmd_present(*pmd)) { /* Another has populated it */
311 pte_lock_deinit(new);
312 pte_free(new);
313 } else {
314 mm->nr_ptes++;
315 inc_zone_page_state(new, NR_PAGETABLE);
316 pmd_populate(mm, pmd, new);
317 }
318 spin_unlock(&mm->page_table_lock);
319 return 0;
320 }
322 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
323 {
324 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
325 if (!new)
326 return -ENOMEM;
328 spin_lock(&init_mm.page_table_lock);
329 if (pmd_present(*pmd)) /* Another has populated it */
330 pte_free_kernel(new);
331 else
332 pmd_populate_kernel(&init_mm, pmd, new);
333 spin_unlock(&init_mm.page_table_lock);
334 return 0;
335 }
337 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
338 {
339 if (file_rss)
340 add_mm_counter(mm, file_rss, file_rss);
341 if (anon_rss)
342 add_mm_counter(mm, anon_rss, anon_rss);
343 }
345 /*
346 * This function is called to print an error when a bad pte
347 * is found. For example, we might have a PFN-mapped pte in
348 * a region that doesn't allow it.
349 *
350 * The calling function must still handle the error.
351 */
352 void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
353 {
354 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
355 "vm_flags = %lx, vaddr = %lx\n",
356 (long long)pte_val(pte),
357 (vma->vm_mm == current->mm ? current->comm : "???"),
358 vma->vm_flags, vaddr);
359 dump_stack();
360 }
362 static inline int is_cow_mapping(unsigned int flags)
363 {
364 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
365 }
367 /*
368 * This function gets the "struct page" associated with a pte.
369 *
370 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
371 * will have each page table entry just pointing to a raw page frame
372 * number, and as far as the VM layer is concerned, those do not have
373 * pages associated with them - even if the PFN might point to memory
374 * that otherwise is perfectly fine and has a "struct page".
375 *
376 * The way we recognize those mappings is through the rules set up
377 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
378 * and the vm_pgoff will point to the first PFN mapped: thus every
379 * page that is a raw mapping will always honor the rule
380 *
381 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
382 *
383 * and if that isn't true, the page has been COW'ed (in which case it
384 * _does_ have a "struct page" associated with it even if it is in a
385 * VM_PFNMAP range).
386 */
387 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
388 {
389 unsigned long pfn = pte_pfn(pte);
391 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
392 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
393 if (pfn == vma->vm_pgoff + off)
394 return NULL;
395 if (!is_cow_mapping(vma->vm_flags))
396 return NULL;
397 }
399 #if defined(CONFIG_XEN) && defined(CONFIG_X86)
400 /* XEN: Covers user-space grant mappings (even of local pages). */
401 if (unlikely(vma->vm_flags & VM_FOREIGN))
402 return NULL;
403 #endif
405 /*
406 * Add some anal sanity checks for now. Eventually,
407 * we should just do "return pfn_to_page(pfn)", but
408 * in the meantime we check that we get a valid pfn,
409 * and that the resulting page looks ok.
410 */
411 if (unlikely(!pfn_valid(pfn))) {
412 if (!(vma->vm_flags & VM_RESERVED))
413 print_bad_pte(vma, pte, addr);
414 return NULL;
415 }
417 /*
418 * NOTE! We still have PageReserved() pages in the page
419 * tables.
420 *
421 * The PAGE_ZERO() pages and various VDSO mappings can
422 * cause them to exist.
423 */
424 return pfn_to_page(pfn);
425 }
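/*
 * Illustrative sketch (not part of the original memory.c): the
 * remap_pfn_range() rule documented above means the pfn expected at a
 * given user address in a linear VM_PFNMAP vma can be computed
 * directly.  The helper name is hypothetical.
 */
static inline unsigned long pfnmap_expected_pfn(struct vm_area_struct *vma,
						unsigned long addr)
{
	/* vm_pgoff holds the first mapped pfn for a linear PFN mapping */
	return vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
}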
427 /*
428 * copy one vm_area from one task to the other. Assumes the page tables
429 * already present in the new task to be cleared in the whole range
430 * covered by this vma.
431 */
433 static inline void
434 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
435 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
436 unsigned long addr, int *rss)
437 {
438 unsigned long vm_flags = vma->vm_flags;
439 pte_t pte = *src_pte;
440 struct page *page;
442 /* pte contains position in swap or file, so copy. */
443 if (unlikely(!pte_present(pte))) {
444 if (!pte_file(pte)) {
445 swp_entry_t entry = pte_to_swp_entry(pte);
447 swap_duplicate(entry);
448 /* make sure dst_mm is on swapoff's mmlist. */
449 if (unlikely(list_empty(&dst_mm->mmlist))) {
450 spin_lock(&mmlist_lock);
451 if (list_empty(&dst_mm->mmlist))
452 list_add(&dst_mm->mmlist,
453 &src_mm->mmlist);
454 spin_unlock(&mmlist_lock);
455 }
456 if (is_write_migration_entry(entry) &&
457 is_cow_mapping(vm_flags)) {
458 /*
459 * COW mappings require pages in both parent
460 * and child to be set to read.
461 */
462 make_migration_entry_read(&entry);
463 pte = swp_entry_to_pte(entry);
464 set_pte_at(src_mm, addr, src_pte, pte);
465 }
466 }
467 goto out_set_pte;
468 }
470 /*
471 * If it's a COW mapping, write protect it both
472 * in the parent and the child
473 */
474 if (is_cow_mapping(vm_flags)) {
475 ptep_set_wrprotect(src_mm, addr, src_pte);
476 pte = *src_pte;
477 }
479 /*
480 * If it's a shared mapping, mark it clean in
481 * the child
482 */
483 if (vm_flags & VM_SHARED)
484 pte = pte_mkclean(pte);
485 pte = pte_mkold(pte);
487 page = vm_normal_page(vma, addr, pte);
488 if (page) {
489 get_page(page);
490 page_dup_rmap(page);
491 rss[!!PageAnon(page)]++;
492 }
494 out_set_pte:
495 set_pte_at(dst_mm, addr, dst_pte, pte);
496 }
498 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
499 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
500 unsigned long addr, unsigned long end)
501 {
502 pte_t *src_pte, *dst_pte;
503 spinlock_t *src_ptl, *dst_ptl;
504 int progress = 0;
505 int rss[2];
507 again:
508 rss[1] = rss[0] = 0;
509 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
510 if (!dst_pte)
511 return -ENOMEM;
512 src_pte = pte_offset_map_nested(src_pmd, addr);
513 src_ptl = pte_lockptr(src_mm, src_pmd);
514 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
516 do {
517 /*
518 * We are holding two locks at this point - either of them
519 * could generate latencies in another task on another CPU.
520 */
521 if (progress >= 32) {
522 progress = 0;
523 if (need_resched() ||
524 need_lockbreak(src_ptl) ||
525 need_lockbreak(dst_ptl))
526 break;
527 }
528 if (pte_none(*src_pte)) {
529 progress++;
530 continue;
531 }
532 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
533 progress += 8;
534 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
536 spin_unlock(src_ptl);
537 pte_unmap_nested(src_pte - 1);
538 add_mm_rss(dst_mm, rss[0], rss[1]);
539 pte_unmap_unlock(dst_pte - 1, dst_ptl);
540 cond_resched();
541 if (addr != end)
542 goto again;
543 return 0;
544 }
546 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
547 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
548 unsigned long addr, unsigned long end)
549 {
550 pmd_t *src_pmd, *dst_pmd;
551 unsigned long next;
553 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
554 if (!dst_pmd)
555 return -ENOMEM;
556 src_pmd = pmd_offset(src_pud, addr);
557 do {
558 next = pmd_addr_end(addr, end);
559 if (pmd_none_or_clear_bad(src_pmd))
560 continue;
561 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
562 vma, addr, next))
563 return -ENOMEM;
564 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
565 return 0;
566 }
568 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
569 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
570 unsigned long addr, unsigned long end)
571 {
572 pud_t *src_pud, *dst_pud;
573 unsigned long next;
575 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
576 if (!dst_pud)
577 return -ENOMEM;
578 src_pud = pud_offset(src_pgd, addr);
579 do {
580 next = pud_addr_end(addr, end);
581 if (pud_none_or_clear_bad(src_pud))
582 continue;
583 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
584 vma, addr, next))
585 return -ENOMEM;
586 } while (dst_pud++, src_pud++, addr = next, addr != end);
587 return 0;
588 }
590 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
591 struct vm_area_struct *vma)
592 {
593 pgd_t *src_pgd, *dst_pgd;
594 unsigned long next;
595 unsigned long addr = vma->vm_start;
596 unsigned long end = vma->vm_end;
598 /*
599 * Don't copy ptes where a page fault will fill them correctly.
600 * Fork becomes much lighter when there are big shared or private
601 * readonly mappings. The tradeoff is that copy_page_range is more
602 * efficient than faulting.
603 */
604 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
605 if (!vma->anon_vma)
606 return 0;
607 }
609 if (is_vm_hugetlb_page(vma))
610 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
612 dst_pgd = pgd_offset(dst_mm, addr);
613 src_pgd = pgd_offset(src_mm, addr);
614 do {
615 next = pgd_addr_end(addr, end);
616 if (pgd_none_or_clear_bad(src_pgd))
617 continue;
618 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
619 vma, addr, next))
620 return -ENOMEM;
621 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
622 return 0;
623 }
625 static unsigned long zap_pte_range(struct mmu_gather *tlb,
626 struct vm_area_struct *vma, pmd_t *pmd,
627 unsigned long addr, unsigned long end,
628 long *zap_work, struct zap_details *details)
629 {
630 struct mm_struct *mm = tlb->mm;
631 pte_t *pte;
632 spinlock_t *ptl;
633 int file_rss = 0;
634 int anon_rss = 0;
636 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
637 do {
638 pte_t ptent = *pte;
639 if (pte_none(ptent)) {
640 (*zap_work)--;
641 continue;
642 }
644 (*zap_work) -= PAGE_SIZE;
646 if (pte_present(ptent)) {
647 struct page *page;
649 page = vm_normal_page(vma, addr, ptent);
650 if (unlikely(details) && page) {
651 /*
652 * unmap_shared_mapping_pages() wants to
653 * invalidate cache without truncating:
654 * unmap shared but keep private pages.
655 */
656 if (details->check_mapping &&
657 details->check_mapping != page->mapping)
658 continue;
659 /*
660 * Each page->index must be checked when
661 * invalidating or truncating nonlinear.
662 */
663 if (details->nonlinear_vma &&
664 (page->index < details->first_index ||
665 page->index > details->last_index))
666 continue;
667 }
668 if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
669 ptent = vma->vm_ops->zap_pte(vma, addr, pte,
670 tlb->fullmm);
671 else
672 ptent = ptep_get_and_clear_full(mm, addr, pte,
673 tlb->fullmm);
674 tlb_remove_tlb_entry(tlb, pte, addr);
675 if (unlikely(!page))
676 continue;
677 if (unlikely(details) && details->nonlinear_vma
678 && linear_page_index(details->nonlinear_vma,
679 addr) != page->index)
680 set_pte_at(mm, addr, pte,
681 pgoff_to_pte(page->index));
682 if (PageAnon(page))
683 anon_rss--;
684 else {
685 if (pte_dirty(ptent))
686 set_page_dirty(page);
687 if (pte_young(ptent))
688 mark_page_accessed(page);
689 file_rss--;
690 }
691 page_remove_rmap(page);
692 tlb_remove_page(tlb, page);
693 continue;
694 }
695 /*
696 * If details->check_mapping, we leave swap entries;
697 * if details->nonlinear_vma, we leave file entries.
698 */
699 if (unlikely(details))
700 continue;
701 if (!pte_file(ptent))
702 free_swap_and_cache(pte_to_swp_entry(ptent));
703 pte_clear_full(mm, addr, pte, tlb->fullmm);
704 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
706 add_mm_rss(mm, file_rss, anon_rss);
707 pte_unmap_unlock(pte - 1, ptl);
709 return addr;
710 }
712 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
713 struct vm_area_struct *vma, pud_t *pud,
714 unsigned long addr, unsigned long end,
715 long *zap_work, struct zap_details *details)
716 {
717 pmd_t *pmd;
718 unsigned long next;
720 pmd = pmd_offset(pud, addr);
721 do {
722 next = pmd_addr_end(addr, end);
723 if (pmd_none_or_clear_bad(pmd)) {
724 (*zap_work)--;
725 continue;
726 }
727 next = zap_pte_range(tlb, vma, pmd, addr, next,
728 zap_work, details);
729 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
731 return addr;
732 }
734 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
735 struct vm_area_struct *vma, pgd_t *pgd,
736 unsigned long addr, unsigned long end,
737 long *zap_work, struct zap_details *details)
738 {
739 pud_t *pud;
740 unsigned long next;
742 pud = pud_offset(pgd, addr);
743 do {
744 next = pud_addr_end(addr, end);
745 if (pud_none_or_clear_bad(pud)) {
746 (*zap_work)--;
747 continue;
748 }
749 next = zap_pmd_range(tlb, vma, pud, addr, next,
750 zap_work, details);
751 } while (pud++, addr = next, (addr != end && *zap_work > 0));
753 return addr;
754 }
756 static unsigned long unmap_page_range(struct mmu_gather *tlb,
757 struct vm_area_struct *vma,
758 unsigned long addr, unsigned long end,
759 long *zap_work, struct zap_details *details)
760 {
761 pgd_t *pgd;
762 unsigned long next;
764 if (details && !details->check_mapping && !details->nonlinear_vma)
765 details = NULL;
767 BUG_ON(addr >= end);
769 tlb_start_vma(tlb, vma);
770 pgd = pgd_offset(vma->vm_mm, addr);
771 do {
772 next = pgd_addr_end(addr, end);
773 if (pgd_none_or_clear_bad(pgd)) {
774 (*zap_work)--;
775 continue;
776 }
777 next = zap_pud_range(tlb, vma, pgd, addr, next,
778 zap_work, details);
779 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
780 tlb_end_vma(tlb, vma);
782 return addr;
783 }
785 #ifdef CONFIG_PREEMPT
786 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
787 #else
788 /* No preempt: go for improved straight-line efficiency */
789 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
790 #endif
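/*
 * Illustrative note (not part of the original memory.c): with 4K pages
 * this batches unmapping in 8 * 4K = 32K chunks when CONFIG_PREEMPT is
 * set (short lock hold times), and in 1024 * 4K = 4M chunks otherwise
 * (fewer mmu_gather restarts).
 */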
792 /**
793 * unmap_vmas - unmap a range of memory covered by a list of vma's
794 * @tlbp: address of the caller's struct mmu_gather
795 * @vma: the starting vma
796 * @start_addr: virtual address at which to start unmapping
797 * @end_addr: virtual address at which to end unmapping
798 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
799 * @details: details of nonlinear truncation or shared cache invalidation
800 *
801 * Returns the end address of the unmapping (restart addr if interrupted).
802 *
803 * Unmap all pages in the vma list.
804 *
805 * We aim to not hold locks for too long (for scheduling latency reasons).
806 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
807 * return the ending mmu_gather to the caller.
808 *
809 * Only addresses between `start' and `end' will be unmapped.
810 *
811 * The VMA list must be sorted in ascending virtual address order.
812 *
813 * unmap_vmas() assumes that the caller will flush the whole unmapped address
814 * range after unmap_vmas() returns. So the only responsibility here is to
815 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
816 * drops the lock and schedules.
817 */
818 unsigned long unmap_vmas(struct mmu_gather **tlbp,
819 struct vm_area_struct *vma, unsigned long start_addr,
820 unsigned long end_addr, unsigned long *nr_accounted,
821 struct zap_details *details)
822 {
823 long zap_work = ZAP_BLOCK_SIZE;
824 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
825 int tlb_start_valid = 0;
826 unsigned long start = start_addr;
827 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
828 int fullmm = (*tlbp)->fullmm;
830 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
831 unsigned long end;
833 start = max(vma->vm_start, start_addr);
834 if (start >= vma->vm_end)
835 continue;
836 end = min(vma->vm_end, end_addr);
837 if (end <= vma->vm_start)
838 continue;
840 if (vma->vm_flags & VM_ACCOUNT)
841 *nr_accounted += (end - start) >> PAGE_SHIFT;
843 while (start != end) {
844 if (!tlb_start_valid) {
845 tlb_start = start;
846 tlb_start_valid = 1;
847 }
849 if (unlikely(is_vm_hugetlb_page(vma))) {
850 unmap_hugepage_range(vma, start, end);
851 zap_work -= (end - start) /
852 (HPAGE_SIZE / PAGE_SIZE);
853 start = end;
854 } else
855 start = unmap_page_range(*tlbp, vma,
856 start, end, &zap_work, details);
858 if (zap_work > 0) {
859 BUG_ON(start != end);
860 break;
861 }
863 tlb_finish_mmu(*tlbp, tlb_start, start);
865 if (need_resched() ||
866 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
867 if (i_mmap_lock) {
868 *tlbp = NULL;
869 goto out;
870 }
871 cond_resched();
872 }
874 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
875 tlb_start_valid = 0;
876 zap_work = ZAP_BLOCK_SIZE;
877 }
878 }
879 out:
880 return start; /* which is now the end (or restart) address */
881 }
883 /**
884 * zap_page_range - remove user pages in a given range
885 * @vma: vm_area_struct holding the applicable pages
886 * @address: starting address of pages to zap
887 * @size: number of bytes to zap
888 * @details: details of nonlinear truncation or shared cache invalidation
889 */
890 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
891 unsigned long size, struct zap_details *details)
892 {
893 struct mm_struct *mm = vma->vm_mm;
894 struct mmu_gather *tlb;
895 unsigned long end = address + size;
896 unsigned long nr_accounted = 0;
898 lru_add_drain();
899 tlb = tlb_gather_mmu(mm, 0);
900 update_hiwater_rss(mm);
901 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
902 if (tlb)
903 tlb_finish_mmu(tlb, address, end);
904 return end;
905 }
906 EXPORT_SYMBOL(zap_page_range);
908 /*
909 * Do a quick page-table lookup for a single page.
910 */
911 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
912 unsigned int flags)
913 {
914 pgd_t *pgd;
915 pud_t *pud;
916 pmd_t *pmd;
917 pte_t *ptep, pte;
918 spinlock_t *ptl;
919 struct page *page;
920 struct mm_struct *mm = vma->vm_mm;
922 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
923 if (!IS_ERR(page)) {
924 BUG_ON(flags & FOLL_GET);
925 goto out;
926 }
928 page = NULL;
929 pgd = pgd_offset(mm, address);
930 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
931 goto no_page_table;
933 pud = pud_offset(pgd, address);
934 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
935 goto no_page_table;
937 pmd = pmd_offset(pud, address);
938 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
939 goto no_page_table;
941 if (pmd_huge(*pmd)) {
942 BUG_ON(flags & FOLL_GET);
943 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
944 goto out;
945 }
947 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
948 if (!ptep)
949 goto out;
951 pte = *ptep;
952 if (!pte_present(pte))
953 goto unlock;
954 if ((flags & FOLL_WRITE) && !pte_write(pte))
955 goto unlock;
956 page = vm_normal_page(vma, address, pte);
957 if (unlikely(!page))
958 goto unlock;
960 if (flags & FOLL_GET)
961 get_page(page);
962 if (flags & FOLL_TOUCH) {
963 if ((flags & FOLL_WRITE) &&
964 !pte_dirty(pte) && !PageDirty(page))
965 set_page_dirty(page);
966 mark_page_accessed(page);
967 }
968 unlock:
969 pte_unmap_unlock(ptep, ptl);
970 out:
971 return page;
973 no_page_table:
974 /*
975 * When core dumping an enormous anonymous area that nobody
976 * has touched so far, we don't want to allocate page tables.
977 */
978 if (flags & FOLL_ANON) {
979 page = ZERO_PAGE(address);
980 if (flags & FOLL_GET)
981 get_page(page);
982 BUG_ON(flags & FOLL_WRITE);
983 }
984 return page;
985 }
987 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
988 unsigned long start, int len, int write, int force,
989 struct page **pages, struct vm_area_struct **vmas)
990 {
991 int i;
992 unsigned int vm_flags;
994 /*
995 * Require read or write permissions.
996 * If 'force' is set, we only require the "MAY" flags.
997 */
998 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
999 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1000 i = 0;
1002 do {
1003 struct vm_area_struct *vma;
1004 unsigned int foll_flags;
1006 vma = find_extend_vma(mm, start);
1007 if (!vma && in_gate_area(tsk, start)) {
1008 unsigned long pg = start & PAGE_MASK;
1009 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1010 pgd_t *pgd;
1011 pud_t *pud;
1012 pmd_t *pmd;
1013 pte_t *pte;
1014 if (write) /* user gate pages are read-only */
1015 return i ? : -EFAULT;
1016 if (pg > TASK_SIZE)
1017 pgd = pgd_offset_k(pg);
1018 else
1019 pgd = pgd_offset_gate(mm, pg);
1020 BUG_ON(pgd_none(*pgd));
1021 pud = pud_offset(pgd, pg);
1022 BUG_ON(pud_none(*pud));
1023 pmd = pmd_offset(pud, pg);
1024 if (pmd_none(*pmd))
1025 return i ? : -EFAULT;
1026 pte = pte_offset_map(pmd, pg);
1027 if (pte_none(*pte)) {
1028 pte_unmap(pte);
1029 return i ? : -EFAULT;
1030 }
1031 if (pages) {
1032 struct page *page = vm_normal_page(gate_vma, start, *pte);
1033 pages[i] = page;
1034 if (page)
1035 get_page(page);
1036 }
1037 pte_unmap(pte);
1038 if (vmas)
1039 vmas[i] = gate_vma;
1040 i++;
1041 start += PAGE_SIZE;
1042 len--;
1043 continue;
1044 }
1046 #ifdef CONFIG_XEN
1047 if (vma && (vma->vm_flags & VM_FOREIGN)) {
1048 struct vm_foreign_map *foreign_map =
1049 vma->vm_private_data;
1050 struct page **map = foreign_map->map;
1051 int offset = (start - vma->vm_start) >> PAGE_SHIFT;
1052 if (map[offset] != NULL) {
1053 if (pages) {
1054 struct page *page = map[offset];
1056 pages[i] = page;
1057 get_page(page);
1058 }
1059 if (vmas)
1060 vmas[i] = vma;
1061 i++;
1062 start += PAGE_SIZE;
1063 len--;
1064 continue;
1065 }
1066 }
1067 #endif
1068 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
1069 || !(vm_flags & vma->vm_flags))
1070 return i ? : -EFAULT;
1072 if (is_vm_hugetlb_page(vma)) {
1073 i = follow_hugetlb_page(mm, vma, pages, vmas,
1074 &start, &len, i);
1075 continue;
1076 }
1078 foll_flags = FOLL_TOUCH;
1079 if (pages)
1080 foll_flags |= FOLL_GET;
1081 if (!write && !(vma->vm_flags & VM_LOCKED) &&
1082 (!vma->vm_ops || !vma->vm_ops->nopage))
1083 foll_flags |= FOLL_ANON;
1085 do {
1086 struct page *page;
1088 if (write)
1089 foll_flags |= FOLL_WRITE;
1091 cond_resched();
1092 while (!(page = follow_page(vma, start, foll_flags))) {
1093 int ret;
1094 ret = __handle_mm_fault(mm, vma, start,
1095 foll_flags & FOLL_WRITE);
1096 /*
1097 * The VM_FAULT_WRITE bit tells us that do_wp_page has
1098 * broken COW when necessary, even if maybe_mkwrite
1099 * decided not to set pte_write. We can thus safely do
1100 * subsequent page lookups as if they were reads.
1101 */
1102 if (ret & VM_FAULT_WRITE)
1103 foll_flags &= ~FOLL_WRITE;
1105 switch (ret & ~VM_FAULT_WRITE) {
1106 case VM_FAULT_MINOR:
1107 tsk->min_flt++;
1108 break;
1109 case VM_FAULT_MAJOR:
1110 tsk->maj_flt++;
1111 break;
1112 case VM_FAULT_SIGBUS:
1113 return i ? i : -EFAULT;
1114 case VM_FAULT_OOM:
1115 return i ? i : -ENOMEM;
1116 default:
1117 BUG();
1118 }
1119 }
1120 if (pages) {
1121 pages[i] = page;
1123 flush_anon_page(page, start);
1124 flush_dcache_page(page);
1125 }
1126 if (vmas)
1127 vmas[i] = vma;
1128 i++;
1129 start += PAGE_SIZE;
1130 len--;
1131 } while (len && start < vma->vm_end);
1132 } while (len);
1133 return i;
1134 }
1135 EXPORT_SYMBOL(get_user_pages);
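/*
 * Illustrative sketch (not part of the original memory.c): a typical
 * in-kernel caller pinning a single user page with get_user_pages().
 * The helper name and error handling are assumptions for the example;
 * the caller is expected to put_page() the result when finished.
 */
static struct page *pin_one_user_page(unsigned long uaddr, int write)
{
	struct page *page;
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
			     1, write, 0 /* force */, &page, NULL);
	up_read(&current->mm->mmap_sem);

	return ret == 1 ? page : NULL;
}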
1137 static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1138 unsigned long addr, unsigned long end, pgprot_t prot)
1140 pte_t *pte;
1141 spinlock_t *ptl;
1142 int err = 0;
1144 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1145 if (!pte)
1146 return -EAGAIN;
1147 do {
1148 struct page *page = ZERO_PAGE(addr);
1149 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1151 if (unlikely(!pte_none(*pte))) {
1152 err = -EEXIST;
1153 pte++;
1154 break;
1156 page_cache_get(page);
1157 page_add_file_rmap(page);
1158 inc_mm_counter(mm, file_rss);
1159 set_pte_at(mm, addr, pte, zero_pte);
1160 } while (pte++, addr += PAGE_SIZE, addr != end);
1161 pte_unmap_unlock(pte - 1, ptl);
1162 return err;
1165 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1166 unsigned long addr, unsigned long end, pgprot_t prot)
1168 pmd_t *pmd;
1169 unsigned long next;
1170 int err;
1172 pmd = pmd_alloc(mm, pud, addr);
1173 if (!pmd)
1174 return -EAGAIN;
1175 do {
1176 next = pmd_addr_end(addr, end);
1177 err = zeromap_pte_range(mm, pmd, addr, next, prot);
1178 if (err)
1179 break;
1180 } while (pmd++, addr = next, addr != end);
1181 return err;
1184 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1185 unsigned long addr, unsigned long end, pgprot_t prot)
1187 pud_t *pud;
1188 unsigned long next;
1189 int err;
1191 pud = pud_alloc(mm, pgd, addr);
1192 if (!pud)
1193 return -EAGAIN;
1194 do {
1195 next = pud_addr_end(addr, end);
1196 err = zeromap_pmd_range(mm, pud, addr, next, prot);
1197 if (err)
1198 break;
1199 } while (pud++, addr = next, addr != end);
1200 return err;
1203 int zeromap_page_range(struct vm_area_struct *vma,
1204 unsigned long addr, unsigned long size, pgprot_t prot)
1206 pgd_t *pgd;
1207 unsigned long next;
1208 unsigned long end = addr + size;
1209 struct mm_struct *mm = vma->vm_mm;
1210 int err;
1212 BUG_ON(addr >= end);
1213 pgd = pgd_offset(mm, addr);
1214 flush_cache_range(vma, addr, end);
1215 do {
1216 next = pgd_addr_end(addr, end);
1217 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1218 if (err)
1219 break;
1220 } while (pgd++, addr = next, addr != end);
1221 return err;
1224 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1226 pgd_t * pgd = pgd_offset(mm, addr);
1227 pud_t * pud = pud_alloc(mm, pgd, addr);
1228 if (pud) {
1229 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1230 if (pmd)
1231 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1233 return NULL;
1236 /*
1237 * This is the old fallback for page remapping.
1239 * For historical reasons, it only allows reserved pages. Only
1240 * old drivers should use this, and they needed to mark their
1241 * pages reserved for the old functions anyway.
1242 */
1243 static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
1245 int retval;
1246 pte_t *pte;
1247 spinlock_t *ptl;
1249 retval = -EINVAL;
1250 if (PageAnon(page))
1251 goto out;
1252 retval = -ENOMEM;
1253 flush_dcache_page(page);
1254 pte = get_locked_pte(mm, addr, &ptl);
1255 if (!pte)
1256 goto out;
1257 retval = -EBUSY;
1258 if (!pte_none(*pte))
1259 goto out_unlock;
1261 /* Ok, finally just insert the thing.. */
1262 get_page(page);
1263 inc_mm_counter(mm, file_rss);
1264 page_add_file_rmap(page);
1265 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1267 retval = 0;
1268 out_unlock:
1269 pte_unmap_unlock(pte, ptl);
1270 out:
1271 return retval;
1274 /*
1275 * This allows drivers to insert individual pages they've allocated
1276 * into a user vma.
1278 * The page has to be a nice clean _individual_ kernel allocation.
1279 * If you allocate a compound page, you need to have marked it as
1280 * such (__GFP_COMP), or manually just split the page up yourself
1281 * (see split_page()).
1283 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1284 * took an arbitrary page protection parameter. This doesn't allow
1285 * that. Your vma protection will have to be set up correctly, which
1286 * means that if you want a shared writable mapping, you'd better
1287 * ask for a shared writable mapping!
1289 * The page does not need to be reserved.
1290 */
1291 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
1292 {
1293 if (addr < vma->vm_start || addr >= vma->vm_end)
1294 return -EFAULT;
1295 if (!page_count(page))
1296 return -EINVAL;
1297 vma->vm_flags |= VM_INSERTPAGE;
1298 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
1299 }
1300 EXPORT_SYMBOL(vm_insert_page);
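/*
 * Illustrative sketch (not part of the original memory.c): a driver
 * mapping one kernel-allocated page into user space from its mmap()
 * method.  "drv_page" is a hypothetical page the driver obtained
 * earlier, e.g. with alloc_page(GFP_KERNEL).
 */
static int example_mmap_one_page(struct vm_area_struct *vma,
				 struct page *drv_page)
{
	if (vma->vm_end - vma->vm_start < PAGE_SIZE)
		return -EINVAL;
	return vm_insert_page(vma, vma->vm_start, drv_page);
}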
1302 /*
1303 * maps a range of physical memory into the requested pages. the old
1304 * mappings are removed. any references to nonexistent pages results
1305 * in null mappings (currently treated as "copy-on-access")
1306 */
1307 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1308 unsigned long addr, unsigned long end,
1309 unsigned long pfn, pgprot_t prot)
1311 pte_t *pte;
1312 spinlock_t *ptl;
1314 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1315 if (!pte)
1316 return -ENOMEM;
1317 do {
1318 BUG_ON(!pte_none(*pte));
1319 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1320 pfn++;
1321 } while (pte++, addr += PAGE_SIZE, addr != end);
1322 pte_unmap_unlock(pte - 1, ptl);
1323 return 0;
1326 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1327 unsigned long addr, unsigned long end,
1328 unsigned long pfn, pgprot_t prot)
1330 pmd_t *pmd;
1331 unsigned long next;
1333 pfn -= addr >> PAGE_SHIFT;
1334 pmd = pmd_alloc(mm, pud, addr);
1335 if (!pmd)
1336 return -ENOMEM;
1337 do {
1338 next = pmd_addr_end(addr, end);
1339 if (remap_pte_range(mm, pmd, addr, next,
1340 pfn + (addr >> PAGE_SHIFT), prot))
1341 return -ENOMEM;
1342 } while (pmd++, addr = next, addr != end);
1343 return 0;
1346 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1347 unsigned long addr, unsigned long end,
1348 unsigned long pfn, pgprot_t prot)
1350 pud_t *pud;
1351 unsigned long next;
1353 pfn -= addr >> PAGE_SHIFT;
1354 pud = pud_alloc(mm, pgd, addr);
1355 if (!pud)
1356 return -ENOMEM;
1357 do {
1358 next = pud_addr_end(addr, end);
1359 if (remap_pmd_range(mm, pud, addr, next,
1360 pfn + (addr >> PAGE_SHIFT), prot))
1361 return -ENOMEM;
1362 } while (pud++, addr = next, addr != end);
1363 return 0;
1366 /* Note: this is only safe if the mm semaphore is held when called. */
1367 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1368 unsigned long pfn, unsigned long size, pgprot_t prot)
1369 {
1370 pgd_t *pgd;
1371 unsigned long next;
1372 unsigned long end = addr + PAGE_ALIGN(size);
1373 struct mm_struct *mm = vma->vm_mm;
1374 int err;
1376 /*
1377 * Physically remapped pages are special. Tell the
1378 * rest of the world about it:
1379 * VM_IO tells people not to look at these pages
1380 * (accesses can have side effects).
1381 * VM_RESERVED is specified all over the place, because
1382 * in 2.4 it kept swapout's vma scan off this vma; but
1383 * in 2.6 the LRU scan won't even find its pages, so this
1384 * flag means no more than count its pages in reserved_vm,
1385 * and omit it from core dump, even when VM_IO turned off.
1386 * VM_PFNMAP tells the core MM that the base pages are just
1387 * raw PFN mappings, and do not have a "struct page" associated
1388 * with them.
1390 * There's a horrible special case to handle copy-on-write
1391 * behaviour that some programs depend on. We mark the "original"
1392 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1393 */
1394 if (is_cow_mapping(vma->vm_flags)) {
1395 if (addr != vma->vm_start || end != vma->vm_end)
1396 return -EINVAL;
1397 vma->vm_pgoff = pfn;
1398 }
1400 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1402 BUG_ON(addr >= end);
1403 pfn -= addr >> PAGE_SHIFT;
1404 pgd = pgd_offset(mm, addr);
1405 flush_cache_range(vma, addr, end);
1406 do {
1407 next = pgd_addr_end(addr, end);
1408 err = remap_pud_range(mm, pgd, addr, next,
1409 pfn + (addr >> PAGE_SHIFT), prot);
1410 if (err)
1411 break;
1412 } while (pgd++, addr = next, addr != end);
1413 return err;
1414 }
1415 EXPORT_SYMBOL(remap_pfn_range);
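/*
 * Illustrative sketch (not part of the original memory.c): the classic
 * use of remap_pfn_range() from a character driver's mmap() method,
 * mapping a physically contiguous region.  "phys_base" is a
 * hypothetical physical address owned by the driver.
 */
static int example_remap_mmap(struct vm_area_struct *vma,
			      unsigned long phys_base)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* Covers the whole vma, so the COW special case above is satisfied */
	return remap_pfn_range(vma, vma->vm_start, phys_base >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}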
1417 #ifdef CONFIG_XEN
1418 static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1419 unsigned long addr, unsigned long end,
1420 pte_fn_t fn, void *data)
1422 pte_t *pte;
1423 int err;
1424 struct page *pmd_page;
1425 spinlock_t *ptl;
1427 pte = (mm == &init_mm) ?
1428 pte_alloc_kernel(pmd, addr) :
1429 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1430 if (!pte)
1431 return -ENOMEM;
1433 BUG_ON(pmd_huge(*pmd));
1435 pmd_page = pmd_page(*pmd);
1437 do {
1438 err = fn(pte, pmd_page, addr, data);
1439 if (err)
1440 break;
1441 } while (pte++, addr += PAGE_SIZE, addr != end);
1443 if (mm != &init_mm)
1444 pte_unmap_unlock(pte-1, ptl);
1445 return err;
1448 static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1449 unsigned long addr, unsigned long end,
1450 pte_fn_t fn, void *data)
1452 pmd_t *pmd;
1453 unsigned long next;
1454 int err;
1456 pmd = pmd_alloc(mm, pud, addr);
1457 if (!pmd)
1458 return -ENOMEM;
1459 do {
1460 next = pmd_addr_end(addr, end);
1461 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1462 if (err)
1463 break;
1464 } while (pmd++, addr = next, addr != end);
1465 return err;
1468 static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1469 unsigned long addr, unsigned long end,
1470 pte_fn_t fn, void *data)
1472 pud_t *pud;
1473 unsigned long next;
1474 int err;
1476 pud = pud_alloc(mm, pgd, addr);
1477 if (!pud)
1478 return -ENOMEM;
1479 do {
1480 next = pud_addr_end(addr, end);
1481 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1482 if (err)
1483 break;
1484 } while (pud++, addr = next, addr != end);
1485 return err;
1488 /*
1489 * Scan a region of virtual memory, filling in page tables as necessary
1490 * and calling a provided function on each leaf page table.
1491 */
1492 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1493 unsigned long size, pte_fn_t fn, void *data)
1494 {
1495 pgd_t *pgd;
1496 unsigned long next;
1497 unsigned long end = addr + size;
1498 int err;
1500 BUG_ON(addr >= end);
1501 pgd = pgd_offset(mm, addr);
1502 do {
1503 next = pgd_addr_end(addr, end);
1504 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1505 if (err)
1506 break;
1507 } while (pgd++, addr = next, addr != end);
1508 return err;
1509 }
1510 EXPORT_SYMBOL_GPL(apply_to_page_range);
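/*
 * Illustrative sketch (not part of the original memory.c): a callback
 * used with apply_to_page_range() to count the present leaf ptes in a
 * kernel virtual range.  The callback arguments follow the
 * fn(pte, pmd_page, addr, data) call made above; the function names
 * are hypothetical.
 */
static int count_present_pte(pte_t *pte, struct page *pmd_page,
			     unsigned long addr, void *data)
{
	if (pte_present(*pte))
		++*(unsigned long *)data;
	return 0;
}

static unsigned long count_present_ptes(unsigned long addr, unsigned long size)
{
	unsigned long count = 0;

	apply_to_page_range(&init_mm, addr, size, count_present_pte, &count);
	return count;
}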
1511 #endif
1513 /*
1514 * handle_pte_fault chooses page fault handler according to an entry
1515 * which was read non-atomically. Before making any commitment, on
1516 * those architectures or configurations (e.g. i386 with PAE) which
1517 * might give a mix of unmatched parts, do_swap_page and do_file_page
1518 * must check under lock before unmapping the pte and proceeding
1519 * (but do_wp_page is only called after already making such a check;
1520 * and do_anonymous_page and do_no_page can safely check later on).
1521 */
1522 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1523 pte_t *page_table, pte_t orig_pte)
1525 int same = 1;
1526 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1527 if (sizeof(pte_t) > sizeof(unsigned long)) {
1528 spinlock_t *ptl = pte_lockptr(mm, pmd);
1529 spin_lock(ptl);
1530 same = pte_same(*page_table, orig_pte);
1531 spin_unlock(ptl);
1533 #endif
1534 pte_unmap(page_table);
1535 return same;
1538 /*
1539 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1540 * servicing faults for write access. In the normal case, do always want
1541 * pte_mkwrite. But get_user_pages can cause write faults for mappings
1542 * that do not have writing enabled, when used by access_process_vm.
1543 */
1544 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1546 if (likely(vma->vm_flags & VM_WRITE))
1547 pte = pte_mkwrite(pte);
1548 return pte;
1551 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
1553 /*
1554 * If the source page was a PFN mapping, we don't have
1555 * a "struct page" for it. We do a best-effort copy by
1556 * just copying from the original user address. If that
1557 * fails, we just zero-fill it. Live with it.
1558 */
1559 if (unlikely(!src)) {
1560 void *kaddr = kmap_atomic(dst, KM_USER0);
1561 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1563 /*
1564 * This really shouldn't fail, because the page is there
1565 * in the page tables. But it might just be unreadable,
1566 * in which case we just give up and fill the result with
1567 * zeroes.
1568 */
1569 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1570 memset(kaddr, 0, PAGE_SIZE);
1571 kunmap_atomic(kaddr, KM_USER0);
1572 return;
1575 copy_user_highpage(dst, src, va);
1578 /*
1579 * This routine handles present pages, when users try to write
1580 * to a shared page. It is done by copying the page to a new address
1581 * and decrementing the shared-page counter for the old page.
1583 * Note that this routine assumes that the protection checks have been
1584 * done by the caller (the low-level page fault routine in most cases).
1585 * Thus we can safely just mark it writable once we've done any necessary
1586 * COW.
1588 * We also mark the page dirty at this point even though the page will
1589 * change only once the write actually happens. This avoids a few races,
1590 * and potentially makes it more efficient.
1592 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1593 * but allow concurrent faults), with pte both mapped and locked.
1594 * We return with mmap_sem still held, but pte unmapped and unlocked.
1595 */
1596 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1597 unsigned long address, pte_t *page_table, pmd_t *pmd,
1598 spinlock_t *ptl, pte_t orig_pte)
1600 struct page *old_page, *new_page;
1601 pte_t entry;
1602 int reuse, ret = VM_FAULT_MINOR;
1604 old_page = vm_normal_page(vma, address, orig_pte);
1605 if (!old_page)
1606 goto gotten;
1608 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
1609 (VM_SHARED|VM_WRITE))) {
1610 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1611 /*
1612 * Notify the address space that the page is about to
1613 * become writable so that it can prohibit this or wait
1614 * for the page to get into an appropriate state.
1616 * We do this without the lock held, so that it can
1617 * sleep if it needs to.
1618 */
1619 page_cache_get(old_page);
1620 pte_unmap_unlock(page_table, ptl);
1622 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1623 goto unwritable_page;
1625 page_cache_release(old_page);
1627 /*
1628 * Since we dropped the lock we need to revalidate
1629 * the PTE as someone else may have changed it. If
1630 * they did, we just return, as we can count on the
1631 * MMU to tell us if they didn't also make it writable.
1632 */
1633 page_table = pte_offset_map_lock(mm, pmd, address,
1634 &ptl);
1635 if (!pte_same(*page_table, orig_pte))
1636 goto unlock;
1639 reuse = 1;
1640 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1641 reuse = can_share_swap_page(old_page);
1642 unlock_page(old_page);
1643 } else {
1644 reuse = 0;
1647 if (reuse) {
1648 flush_cache_page(vma, address, pte_pfn(orig_pte));
1649 entry = pte_mkyoung(orig_pte);
1650 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1651 ptep_set_access_flags(vma, address, page_table, entry, 1);
1652 update_mmu_cache(vma, address, entry);
1653 lazy_mmu_prot_update(entry);
1654 ret |= VM_FAULT_WRITE;
1655 goto unlock;
1658 /*
1659 * Ok, we need to copy. Oh, well..
1660 */
1661 page_cache_get(old_page);
1662 gotten:
1663 pte_unmap_unlock(page_table, ptl);
1665 if (unlikely(anon_vma_prepare(vma)))
1666 goto oom;
1667 if (old_page == ZERO_PAGE(address)) {
1668 new_page = alloc_zeroed_user_highpage(vma, address);
1669 if (!new_page)
1670 goto oom;
1671 } else {
1672 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1673 if (!new_page)
1674 goto oom;
1675 cow_user_page(new_page, old_page, address);
1678 /*
1679 * Re-check the pte - we dropped the lock
1680 */
1681 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1682 if (likely(pte_same(*page_table, orig_pte))) {
1683 if (old_page) {
1684 page_remove_rmap(old_page);
1685 if (!PageAnon(old_page)) {
1686 dec_mm_counter(mm, file_rss);
1687 inc_mm_counter(mm, anon_rss);
1689 } else
1690 inc_mm_counter(mm, anon_rss);
1691 flush_cache_page(vma, address, pte_pfn(orig_pte));
1692 entry = mk_pte(new_page, vma->vm_page_prot);
1693 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1694 lazy_mmu_prot_update(entry);
1695 /*
1696 * Clear the pte entry and flush it first, before updating the
1697 * pte with the new entry. This will avoid a race condition
1698 * seen in the presence of one thread doing SMC and another
1699 * thread doing COW.
1700 */
1701 ptep_clear_flush(vma, address, page_table);
1702 set_pte_at(mm, address, page_table, entry);
1703 update_mmu_cache(vma, address, entry);
1704 lru_cache_add_active(new_page);
1705 page_add_new_anon_rmap(new_page, vma, address);
1707 /* Free the old page.. */
1708 new_page = old_page;
1709 ret |= VM_FAULT_WRITE;
1711 if (new_page)
1712 page_cache_release(new_page);
1713 if (old_page)
1714 page_cache_release(old_page);
1715 unlock:
1716 pte_unmap_unlock(page_table, ptl);
1717 return ret;
1718 oom:
1719 if (old_page)
1720 page_cache_release(old_page);
1721 return VM_FAULT_OOM;
1723 unwritable_page:
1724 page_cache_release(old_page);
1725 return VM_FAULT_SIGBUS;
1728 /*
1729 * Helper functions for unmap_mapping_range().
1731 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
1733 * We have to restart searching the prio_tree whenever we drop the lock,
1734 * since the iterator is only valid while the lock is held, and anyway
1735 * a later vma might be split and reinserted earlier while lock dropped.
1737 * The list of nonlinear vmas could be handled more efficiently, using
1738 * a placeholder, but handle it in the same way until a need is shown.
1739 * It is important to search the prio_tree before nonlinear list: a vma
1740 * may become nonlinear and be shifted from prio_tree to nonlinear list
1741 * while the lock is dropped; but never shifted from list to prio_tree.
1743 * In order to make forward progress despite restarting the search,
1744 * vm_truncate_count is used to mark a vma as now dealt with, so we can
1745 * quickly skip it next time around. Since the prio_tree search only
1746 * shows us those vmas affected by unmapping the range in question, we
1747 * can't efficiently keep all vmas in step with mapping->truncate_count:
1748 * so instead reset them all whenever it wraps back to 0 (then go to 1).
1749 * mapping->truncate_count and vma->vm_truncate_count are protected by
1750 * i_mmap_lock.
1752 * In order to make forward progress despite repeatedly restarting some
1753 * large vma, note the restart_addr from unmap_vmas when it breaks out:
1754 * and restart from that address when we reach that vma again. It might
1755 * have been split or merged, shrunk or extended, but never shifted: so
1756 * restart_addr remains valid so long as it remains in the vma's range.
1757 * unmap_mapping_range forces truncate_count to leap over page-aligned
1758 * values so we can save vma's restart_addr in its truncate_count field.
1759 */
1760 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
1762 static void reset_vma_truncate_counts(struct address_space *mapping)
1764 struct vm_area_struct *vma;
1765 struct prio_tree_iter iter;
1767 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
1768 vma->vm_truncate_count = 0;
1769 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1770 vma->vm_truncate_count = 0;
1773 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1774 unsigned long start_addr, unsigned long end_addr,
1775 struct zap_details *details)
1777 unsigned long restart_addr;
1778 int need_break;
1780 again:
1781 restart_addr = vma->vm_truncate_count;
1782 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
1783 start_addr = restart_addr;
1784 if (start_addr >= end_addr) {
1785 /* Top of vma has been split off since last time */
1786 vma->vm_truncate_count = details->truncate_count;
1787 return 0;
1791 restart_addr = zap_page_range(vma, start_addr,
1792 end_addr - start_addr, details);
1793 need_break = need_resched() ||
1794 need_lockbreak(details->i_mmap_lock);
1796 if (restart_addr >= end_addr) {
1797 /* We have now completed this vma: mark it so */
1798 vma->vm_truncate_count = details->truncate_count;
1799 if (!need_break)
1800 return 0;
1801 } else {
1802 /* Note restart_addr in vma's truncate_count field */
1803 vma->vm_truncate_count = restart_addr;
1804 if (!need_break)
1805 goto again;
1808 spin_unlock(details->i_mmap_lock);
1809 cond_resched();
1810 spin_lock(details->i_mmap_lock);
1811 return -EINTR;
1814 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
1815 struct zap_details *details)
1817 struct vm_area_struct *vma;
1818 struct prio_tree_iter iter;
1819 pgoff_t vba, vea, zba, zea;
1821 restart:
1822 vma_prio_tree_foreach(vma, &iter, root,
1823 details->first_index, details->last_index) {
1824 /* Skip quickly over those we have already dealt with */
1825 if (vma->vm_truncate_count == details->truncate_count)
1826 continue;
1828 vba = vma->vm_pgoff;
1829 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
1830 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
1831 zba = details->first_index;
1832 if (zba < vba)
1833 zba = vba;
1834 zea = details->last_index;
1835 if (zea > vea)
1836 zea = vea;
1838 if (unmap_mapping_range_vma(vma,
1839 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
1840 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
1841 details) < 0)
1842 goto restart;
1846 static inline void unmap_mapping_range_list(struct list_head *head,
1847 struct zap_details *details)
1849 struct vm_area_struct *vma;
1851 /*
1852 * In nonlinear VMAs there is no correspondence between virtual address
1853 * offset and file offset. So we must perform an exhaustive search
1854 * across *all* the pages in each nonlinear VMA, not just the pages
1855 * whose virtual address lies outside the file truncation point.
1856 */
1857 restart:
1858 list_for_each_entry(vma, head, shared.vm_set.list) {
1859 /* Skip quickly over those we have already dealt with */
1860 if (vma->vm_truncate_count == details->truncate_count)
1861 continue;
1862 details->nonlinear_vma = vma;
1863 if (unmap_mapping_range_vma(vma, vma->vm_start,
1864 vma->vm_end, details) < 0)
1865 goto restart;
1869 /**
1870 * unmap_mapping_range - unmap the portion of all mmaps
1871 * in the specified address_space corresponding to the specified
1872 * page range in the underlying file.
1873 * @mapping: the address space containing mmaps to be unmapped.
1874 * @holebegin: byte in first page to unmap, relative to the start of
1875 * the underlying file. This will be rounded down to a PAGE_SIZE
1876 * boundary. Note that this is different from vmtruncate(), which
1877 * must keep the partial page. In contrast, we must get rid of
1878 * partial pages.
1879 * @holelen: size of prospective hole in bytes. This will be rounded
1880 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
1881 * end of the file.
1882 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
1883 * but 0 when invalidating pagecache, don't throw away private data.
1884 */
1885 void unmap_mapping_range(struct address_space *mapping,
1886 loff_t const holebegin, loff_t const holelen, int even_cows)
1887 {
1888 struct zap_details details;
1889 pgoff_t hba = holebegin >> PAGE_SHIFT;
1890 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1892 /* Check for overflow. */
1893 if (sizeof(holelen) > sizeof(hlen)) {
1894 long long holeend =
1895 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1896 if (holeend & ~(long long)ULONG_MAX)
1897 hlen = ULONG_MAX - hba + 1;
1898 }
1900 details.check_mapping = even_cows? NULL: mapping;
1901 details.nonlinear_vma = NULL;
1902 details.first_index = hba;
1903 details.last_index = hba + hlen - 1;
1904 if (details.last_index < details.first_index)
1905 details.last_index = ULONG_MAX;
1906 details.i_mmap_lock = &mapping->i_mmap_lock;
1908 spin_lock(&mapping->i_mmap_lock);
1910 /* serialize i_size write against truncate_count write */
1911 smp_wmb();
1912 /* Protect against page faults, and endless unmapping loops */
1913 mapping->truncate_count++;
1914 /*
1915 * For archs where spin_lock has inclusive semantics like ia64
1916 * this smp_mb() prevents the pagetable contents from being read
1917 * before the truncate_count increment is visible to
1918 * other cpus.
1919 */
1920 smp_mb();
1921 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1922 if (mapping->truncate_count == 0)
1923 reset_vma_truncate_counts(mapping);
1924 mapping->truncate_count++;
1925 }
1926 details.truncate_count = mapping->truncate_count;
1928 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
1929 unmap_mapping_range_tree(&mapping->i_mmap, &details);
1930 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
1931 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
1932 spin_unlock(&mapping->i_mmap_lock);
1933 }
1934 EXPORT_SYMBOL(unmap_mapping_range);
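As a hedged illustration of the kernel-doc above, the two usage patterns look roughly like this; my_fs_invalidate_range() and my_fs_truncate_mappings() are hypothetical names, and the truncation call mirrors the one vmtruncate() makes further down in this file.

static void my_fs_invalidate_range(struct inode *inode, loff_t start, loff_t len)
{
	/* Invalidating pagecache only: keep private COWed copies (even_cows == 0) */
	unmap_mapping_range(inode->i_mapping, start, len, 0);
}

static void my_fs_truncate_mappings(struct inode *inode, loff_t new_size)
{
	/* Truncating: COWed pages beyond the new EOF must go too (even_cows == 1).
	 * Passing new_size + PAGE_SIZE - 1 rounds down to the next page boundary,
	 * which keeps the partial last page mapped, as vmtruncate() does below. */
	unmap_mapping_range(inode->i_mapping, new_size + PAGE_SIZE - 1, 0, 1);
}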
1936 /*
1937 * Handle all mappings that got truncated by a "truncate()"
1938 * system call.
1940 * NOTE! We have to be ready to update the memory sharing
1941 * between the file and the memory map for a potential last
1942 * incomplete page. Ugly, but necessary.
1943 */
1944 int vmtruncate(struct inode * inode, loff_t offset)
1945 {
1946 struct address_space *mapping = inode->i_mapping;
1947 unsigned long limit;
1949 if (inode->i_size < offset)
1950 goto do_expand;
1951 /*
1952 * truncation of in-use swapfiles is disallowed - it would cause
1953 * subsequent swapout to scribble on the now-freed blocks.
1954 */
1955 if (IS_SWAPFILE(inode))
1956 goto out_busy;
1957 i_size_write(inode, offset);
1958 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1959 truncate_inode_pages(mapping, offset);
1960 goto out_truncate;
1962 do_expand:
1963 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1964 if (limit != RLIM_INFINITY && offset > limit)
1965 goto out_sig;
1966 if (offset > inode->i_sb->s_maxbytes)
1967 goto out_big;
1968 i_size_write(inode, offset);
1970 out_truncate:
1971 if (inode->i_op && inode->i_op->truncate)
1972 inode->i_op->truncate(inode);
1973 return 0;
1974 out_sig:
1975 send_sig(SIGXFSZ, current, 0);
1976 out_big:
1977 return -EFBIG;
1978 out_busy:
1979 return -ETXTBSY;
1980 }
1981 EXPORT_SYMBOL(vmtruncate);
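For context, a hedged sketch of how a filesystem's setattr path typically reaches vmtruncate(); my_fs_setattr() is a hypothetical name and real implementations do more checking.

static int my_fs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		int error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;	/* -EFBIG or -ETXTBSY from the paths above */
	}
	return 0;
}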
1983 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1984 {
1985 struct address_space *mapping = inode->i_mapping;
1987 /*
1988 * If the underlying filesystem is not going to provide
1989 * a way to truncate a range of blocks (punch a hole) -
1990 * we should return failure right now.
1991 */
1992 if (!inode->i_op || !inode->i_op->truncate_range)
1993 return -ENOSYS;
1995 mutex_lock(&inode->i_mutex);
1996 down_write(&inode->i_alloc_sem);
1997 unmap_mapping_range(mapping, offset, (end - offset), 1);
1998 truncate_inode_pages_range(mapping, offset, end);
1999 inode->i_op->truncate_range(inode, offset, end);
2000 up_write(&inode->i_alloc_sem);
2001 mutex_unlock(&inode->i_mutex);
2003 return 0;
2004 }
2005 EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
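A minimal sketch, assuming a filesystem that wants hole punching: it must supply ->truncate_range in its inode_operations, matching the check above; my_fs_truncate_range() and my_fs_inode_ops are hypothetical names.

static void my_fs_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
	/* release the backing blocks/pages covering [start, end] here */
}

static struct inode_operations my_fs_inode_ops = {
	.truncate_range	= my_fs_truncate_range,
	/* .truncate, .setattr, ... */
};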
2007 /*
2008 * Primitive swap readahead code. We simply read an aligned block of
2009 * (1 << page_cluster) entries in the swap area. This method is chosen
2010 * because it doesn't cost us any seek time. We also make sure to queue
2011 * the 'original' request together with the readahead ones...
2013 * This has been extended to use the NUMA policies from the mm triggering
2014 * the readahead.
2016 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
2017 */
2018 void swapin_readahead(swp_entry_t entry, unsigned long addr, struct vm_area_struct *vma)
2019 {
2020 #ifdef CONFIG_NUMA
2021 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
2022 #endif
2023 int i, num;
2024 struct page *new_page;
2025 unsigned long offset;
2027 /*
2028 * Get the number of swap handles we should do readahead I/O for.
2029 */
2030 num = valid_swaphandles(entry, &offset);
2031 for (i = 0; i < num; offset++, i++) {
2032 /* Ok, do the async read-ahead now */
2033 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
2034 offset), vma, addr);
2035 if (!new_page)
2036 break;
2037 page_cache_release(new_page);
2038 #ifdef CONFIG_NUMA
2039 /*
2040 * Find the next applicable VMA for the NUMA policy.
2041 */
2042 addr += PAGE_SIZE;
2043 if (addr == 0)
2044 vma = NULL;
2045 if (vma) {
2046 if (addr >= vma->vm_end) {
2047 vma = next_vma;
2048 next_vma = vma ? vma->vm_next : NULL;
2049 }
2050 if (vma && addr < vma->vm_start)
2051 vma = NULL;
2052 } else {
2053 if (next_vma && addr >= next_vma->vm_start) {
2054 vma = next_vma;
2055 next_vma = vma->vm_next;
2056 }
2057 }
2058 #endif
2059 }
2060 lru_add_drain(); /* Push any new pages onto the LRU now */
2061 }
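The canonical caller pattern, mirroring do_swap_page() below: on a swap-cache miss, start the cluster readahead and then read the target entry itself. swapin_with_readahead() is a made-up wrapper for illustration; callers must still handle a NULL return.

static struct page *swapin_with_readahead(swp_entry_t entry, unsigned long addr,
					  struct vm_area_struct *vma)
{
	struct page *page = lookup_swap_cache(entry);

	if (!page) {
		swapin_readahead(entry, addr, vma);
		page = read_swap_cache_async(entry, vma, addr);
	}
	return page;
}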
2063 /*
2064 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2065 * but allow concurrent faults), and pte mapped but not yet locked.
2066 * We return with mmap_sem still held, but pte unmapped and unlocked.
2067 */
2068 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2069 unsigned long address, pte_t *page_table, pmd_t *pmd,
2070 int write_access, pte_t orig_pte)
2071 {
2072 spinlock_t *ptl;
2073 struct page *page;
2074 swp_entry_t entry;
2075 pte_t pte;
2076 int ret = VM_FAULT_MINOR;
2078 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2079 goto out;
2081 entry = pte_to_swp_entry(orig_pte);
2082 if (is_migration_entry(entry)) {
2083 migration_entry_wait(mm, pmd, address);
2084 goto out;
2085 }
2086 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2087 page = lookup_swap_cache(entry);
2088 if (!page) {
2089 swapin_readahead(entry, address, vma);
2090 page = read_swap_cache_async(entry, vma, address);
2091 if (!page) {
2092 /*
2093 * Back out if somebody else faulted in this pte
2094 * while we released the pte lock.
2095 */
2096 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2097 if (likely(pte_same(*page_table, orig_pte)))
2098 ret = VM_FAULT_OOM;
2099 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2100 goto unlock;
2101 }
2103 /* Had to read the page from swap area: Major fault */
2104 ret = VM_FAULT_MAJOR;
2105 count_vm_event(PGMAJFAULT);
2106 grab_swap_token();
2107 }
2109 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2110 mark_page_accessed(page);
2111 lock_page(page);
2113 /*
2114 * Back out if somebody else already faulted in this pte.
2115 */
2116 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2117 if (unlikely(!pte_same(*page_table, orig_pte)))
2118 goto out_nomap;
2120 if (unlikely(!PageUptodate(page))) {
2121 ret = VM_FAULT_SIGBUS;
2122 goto out_nomap;
2123 }
2125 /* The page isn't present yet, go ahead with the fault. */
2127 inc_mm_counter(mm, anon_rss);
2128 pte = mk_pte(page, vma->vm_page_prot);
2129 if (write_access && can_share_swap_page(page)) {
2130 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2131 write_access = 0;
2132 }
2134 flush_icache_page(vma, page);
2135 set_pte_at(mm, address, page_table, pte);
2136 page_add_anon_rmap(page, vma, address);
2138 swap_free(entry);
2139 if (vm_swap_full())
2140 remove_exclusive_swap_page(page);
2141 unlock_page(page);
2143 if (write_access) {
2144 if (do_wp_page(mm, vma, address,
2145 page_table, pmd, ptl, pte) == VM_FAULT_OOM)
2146 ret = VM_FAULT_OOM;
2147 goto out;
2148 }
2150 /* No need to invalidate - it was non-present before */
2151 update_mmu_cache(vma, address, pte);
2152 lazy_mmu_prot_update(pte);
2153 unlock:
2154 pte_unmap_unlock(page_table, ptl);
2155 out:
2156 return ret;
2157 out_nomap:
2158 pte_unmap_unlock(page_table, ptl);
2159 unlock_page(page);
2160 page_cache_release(page);
2161 return ret;
2162 }
2164 /*
2165 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2166 * but allow concurrent faults), and pte mapped but not yet locked.
2167 * We return with mmap_sem still held, but pte unmapped and unlocked.
2168 */
2169 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2170 unsigned long address, pte_t *page_table, pmd_t *pmd,
2171 int write_access)
2172 {
2173 struct page *page;
2174 spinlock_t *ptl;
2175 pte_t entry;
2177 if (write_access) {
2178 /* Allocate our own private page. */
2179 pte_unmap(page_table);
2181 if (unlikely(anon_vma_prepare(vma)))
2182 goto oom;
2183 page = alloc_zeroed_user_highpage(vma, address);
2184 if (!page)
2185 goto oom;
2187 entry = mk_pte(page, vma->vm_page_prot);
2188 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2190 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2191 if (!pte_none(*page_table))
2192 goto release;
2193 inc_mm_counter(mm, anon_rss);
2194 lru_cache_add_active(page);
2195 page_add_new_anon_rmap(page, vma, address);
2196 } else {
2197 /* Map the ZERO_PAGE - vm_page_prot is readonly */
2198 page = ZERO_PAGE(address);
2199 page_cache_get(page);
2200 entry = mk_pte(page, vma->vm_page_prot);
2202 ptl = pte_lockptr(mm, pmd);
2203 spin_lock(ptl);
2204 if (!pte_none(*page_table))
2205 goto release;
2206 inc_mm_counter(mm, file_rss);
2207 page_add_file_rmap(page);
2208 }
2210 set_pte_at(mm, address, page_table, entry);
2212 /* No need to invalidate - it was non-present before */
2213 update_mmu_cache(vma, address, entry);
2214 lazy_mmu_prot_update(entry);
2215 unlock:
2216 pte_unmap_unlock(page_table, ptl);
2217 return VM_FAULT_MINOR;
2218 release:
2219 page_cache_release(page);
2220 goto unlock;
2221 oom:
2222 return VM_FAULT_OOM;
2223 }
2225 /*
2226 * do_no_page() tries to create a new page mapping. It aggressively
2227 * tries to share with existing pages, but makes a separate copy if
2228 * the "write_access" parameter is true in order to avoid the next
2229 * page fault.
2231 * As this is called only for pages that do not currently exist, we
2232 * do not need to flush old virtual caches or the TLB.
2234 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2235 * but allow concurrent faults), and pte mapped but not yet locked.
2236 * We return with mmap_sem still held, but pte unmapped and unlocked.
2237 */
2238 static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2239 unsigned long address, pte_t *page_table, pmd_t *pmd,
2240 int write_access)
2241 {
2242 spinlock_t *ptl;
2243 struct page *new_page;
2244 struct address_space *mapping = NULL;
2245 pte_t entry;
2246 unsigned int sequence = 0;
2247 int ret = VM_FAULT_MINOR;
2248 int anon = 0;
2250 pte_unmap(page_table);
2251 BUG_ON(vma->vm_flags & VM_PFNMAP);
2253 if (vma->vm_file) {
2254 mapping = vma->vm_file->f_mapping;
2255 sequence = mapping->truncate_count;
2256 smp_rmb(); /* serializes i_size against truncate_count */
2257 }
2258 retry:
2259 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2260 /*
2261 * No smp_rmb is needed here as long as there's a full
2262 * spin_lock/unlock sequence inside the ->nopage callback
2263 * (for the pagecache lookup) that acts as an implicit
2264 * smp_mb() and prevents the i_size read from happening
2265 * after the next truncate_count read.
2266 */
2268 /* no page was available -- either SIGBUS or OOM */
2269 if (new_page == NOPAGE_SIGBUS)
2270 return VM_FAULT_SIGBUS;
2271 if (new_page == NOPAGE_OOM)
2272 return VM_FAULT_OOM;
2274 /*
2275 * Should we do an early C-O-W break?
2276 */
2277 if (write_access) {
2278 if (!(vma->vm_flags & VM_SHARED)) {
2279 struct page *page;
2281 if (unlikely(anon_vma_prepare(vma)))
2282 goto oom;
2283 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
2284 if (!page)
2285 goto oom;
2286 copy_user_highpage(page, new_page, address);
2287 page_cache_release(new_page);
2288 new_page = page;
2289 anon = 1;
2291 } else {
2292 /* if the page will be shareable, see if the backing
2293 * address space wants to know that the page is about
2294 * to become writable */
2295 if (vma->vm_ops->page_mkwrite &&
2296 vma->vm_ops->page_mkwrite(vma, new_page) < 0
2297 ) {
2298 page_cache_release(new_page);
2299 return VM_FAULT_SIGBUS;
2300 }
2301 }
2302 }
2304 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2305 /*
2306 * For a file-backed vma, someone could have truncated or otherwise
2307 * invalidated this page. If unmap_mapping_range got called,
2308 * retry getting the page.
2309 */
2310 if (mapping && unlikely(sequence != mapping->truncate_count)) {
2311 pte_unmap_unlock(page_table, ptl);
2312 page_cache_release(new_page);
2313 cond_resched();
2314 sequence = mapping->truncate_count;
2315 smp_rmb();
2316 goto retry;
2317 }
2319 /*
2320 * This silly early PAGE_DIRTY setting removes a race
2321 * due to the bad i386 page protection. But it's valid
2322 * for other architectures too.
2324 * Note that if write_access is true, we either now have
2325 * an exclusive copy of the page, or this is a shared mapping,
2326 * so we can make it writable and dirty to avoid having to
2327 * handle that later.
2328 */
2329 /* Only go through if we didn't race with anybody else... */
2330 if (pte_none(*page_table)) {
2331 flush_icache_page(vma, new_page);
2332 entry = mk_pte(new_page, vma->vm_page_prot);
2333 if (write_access)
2334 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2335 set_pte_at(mm, address, page_table, entry);
2336 if (anon) {
2337 inc_mm_counter(mm, anon_rss);
2338 lru_cache_add_active(new_page);
2339 page_add_new_anon_rmap(new_page, vma, address);
2340 } else {
2341 inc_mm_counter(mm, file_rss);
2342 page_add_file_rmap(new_page);
2343 }
2344 } else {
2345 /* One of our sibling threads was faster, back out. */
2346 page_cache_release(new_page);
2347 goto unlock;
2348 }
2350 /* no need to invalidate: a not-present page shouldn't be cached */
2351 update_mmu_cache(vma, address, entry);
2352 lazy_mmu_prot_update(entry);
2353 unlock:
2354 pte_unmap_unlock(page_table, ptl);
2355 return ret;
2356 oom:
2357 page_cache_release(new_page);
2358 return VM_FAULT_OOM;
2359 }
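To show the other side of the contract do_no_page() relies on, here is a deliberately trivial ->nopage sketch; example_nopage() is hypothetical, and real drivers return pages from their own backing store rather than freshly allocated ones.

static struct page *example_nopage(struct vm_area_struct *vma,
				   unsigned long address, int *type)
{
	/* Hand out a fresh zeroed page; the reference from alloc_page()
	 * is the one do_no_page() consumes (or drops on its error paths). */
	struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);

	if (!page)
		return NOPAGE_OOM;
	if (type)
		*type = VM_FAULT_MAJOR;
	return page;
}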
2361 /*
2362 * Fault of a previously existing named mapping. Repopulate the pte
2363 * from the encoded file_pte if possible. This enables swappable
2364 * nonlinear vmas.
2366 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2367 * but allow concurrent faults), and pte mapped but not yet locked.
2368 * We return with mmap_sem still held, but pte unmapped and unlocked.
2369 */
2370 static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
2371 unsigned long address, pte_t *page_table, pmd_t *pmd,
2372 int write_access, pte_t orig_pte)
2373 {
2374 pgoff_t pgoff;
2375 int err;
2377 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2378 return VM_FAULT_MINOR;
2380 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2381 /*
2382 * Page table corrupted: show pte and kill process.
2383 */
2384 print_bad_pte(vma, orig_pte, address);
2385 return VM_FAULT_OOM;
2386 }
2387 /* We can then assume vma->vm_ops && vma->vm_ops->populate */
2389 pgoff = pte_to_pgoff(orig_pte);
2390 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
2391 vma->vm_page_prot, pgoff, 0);
2392 if (err == -ENOMEM)
2393 return VM_FAULT_OOM;
2394 if (err)
2395 return VM_FAULT_SIGBUS;
2396 return VM_FAULT_MAJOR;
2397 }
2399 /*
2400 * These routines also need to handle stuff like marking pages dirty
2401 * and/or accessed for architectures that don't do it in hardware (most
2402 * RISC architectures). The early dirtying is also good on the i386.
2404 * There is also a hook called "update_mmu_cache()" that architectures
2405 * with external mmu caches can use to update those (ie the Sparc or
2406 * PowerPC hashed page tables that act as extended TLBs).
2408 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2409 * but allow concurrent faults), and pte mapped but not yet locked.
2410 * We return with mmap_sem still held, but pte unmapped and unlocked.
2411 */
2412 static inline int handle_pte_fault(struct mm_struct *mm,
2413 struct vm_area_struct *vma, unsigned long address,
2414 pte_t *pte, pmd_t *pmd, int write_access)
2415 {
2416 pte_t entry;
2417 pte_t old_entry;
2418 spinlock_t *ptl;
2420 old_entry = entry = *pte;
2421 if (!pte_present(entry)) {
2422 if (pte_none(entry)) {
2423 if (!vma->vm_ops || !vma->vm_ops->nopage)
2424 return do_anonymous_page(mm, vma, address,
2425 pte, pmd, write_access);
2426 return do_no_page(mm, vma, address,
2427 pte, pmd, write_access);
2428 }
2429 if (pte_file(entry))
2430 return do_file_page(mm, vma, address,
2431 pte, pmd, write_access, entry);
2432 return do_swap_page(mm, vma, address,
2433 pte, pmd, write_access, entry);
2434 }
2436 ptl = pte_lockptr(mm, pmd);
2437 spin_lock(ptl);
2438 if (unlikely(!pte_same(*pte, entry)))
2439 goto unlock;
2440 if (write_access) {
2441 if (!pte_write(entry))
2442 return do_wp_page(mm, vma, address,
2443 pte, pmd, ptl, entry);
2444 entry = pte_mkdirty(entry);
2445 }
2446 entry = pte_mkyoung(entry);
2447 if (!pte_same(old_entry, entry)) {
2448 ptep_set_access_flags(vma, address, pte, entry, write_access);
2449 update_mmu_cache(vma, address, entry);
2450 lazy_mmu_prot_update(entry);
2451 } else {
2452 /*
2453 * This is needed only for protection faults but the arch code
2454 * is not yet telling us if this is a protection fault or not.
2455 * This still avoids useless tlb flushes for .text page faults
2456 * with threads.
2457 */
2458 if (write_access)
2459 flush_tlb_page(vma, address);
2460 }
2461 unlock:
2462 pte_unmap_unlock(pte, ptl);
2463 return VM_FAULT_MINOR;
2464 }
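For reference, the dispatch implemented above, summarized (derived from the code, not an addition to it):

/*
 *   pte state                          handler
 *   ---------------------------------  --------------------------------
 *   pte_none, vma has no ->nopage      do_anonymous_page()
 *   pte_none, vma has ->nopage         do_no_page()
 *   !pte_present, pte_file()           do_file_page()
 *   !pte_present, otherwise (swap)     do_swap_page()
 *   present, write to !pte_write       do_wp_page()
 *   present, otherwise                 mark young/dirty, flush if needed
 */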
2466 /*
2467 * By the time we get here, we already hold the mm semaphore
2468 */
2469 int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2470 unsigned long address, int write_access)
2471 {
2472 pgd_t *pgd;
2473 pud_t *pud;
2474 pmd_t *pmd;
2475 pte_t *pte;
2477 __set_current_state(TASK_RUNNING);
2479 count_vm_event(PGFAULT);
2481 if (unlikely(is_vm_hugetlb_page(vma)))
2482 return hugetlb_fault(mm, vma, address, write_access);
2484 pgd = pgd_offset(mm, address);
2485 pud = pud_alloc(mm, pgd, address);
2486 if (!pud)
2487 return VM_FAULT_OOM;
2488 pmd = pmd_alloc(mm, pud, address);
2489 if (!pmd)
2490 return VM_FAULT_OOM;
2491 pte = pte_alloc_map(mm, pmd, address);
2492 if (!pte)
2493 return VM_FAULT_OOM;
2495 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2496 }
2498 EXPORT_SYMBOL_GPL(__handle_mm_fault);
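A hedged sketch of the typical caller, loosely modelled on the arch fault handlers in arch/*/mm/fault.c; example_fault_path() is hypothetical, assumes the usual mm headers, and omits stack expansion, access checks and signal delivery.

static int example_fault_path(struct mm_struct *mm, unsigned long address,
			      int write_access)
{
	struct vm_area_struct *vma;
	int ret;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return -EFAULT;		/* real handlers try expand_stack() first */
	}

	ret = __handle_mm_fault(mm, vma, address, write_access);
	up_read(&mm->mmap_sem);

	if (ret == VM_FAULT_OOM)
		return -ENOMEM;		/* arch code would start the OOM path */
	if (ret == VM_FAULT_SIGBUS)
		return -EFAULT;		/* arch code would deliver SIGBUS */
	return 0;			/* VM_FAULT_MINOR or VM_FAULT_MAJOR */
}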
2500 #ifndef __PAGETABLE_PUD_FOLDED
2501 /*
2502 * Allocate page upper directory.
2503 * We've already handled the fast-path in-line.
2504 */
2505 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2506 {
2507 pud_t *new = pud_alloc_one(mm, address);
2508 if (!new)
2509 return -ENOMEM;
2511 spin_lock(&mm->page_table_lock);
2512 if (pgd_present(*pgd)) /* Another has populated it */
2513 pud_free(new);
2514 else
2515 pgd_populate(mm, pgd, new);
2516 spin_unlock(&mm->page_table_lock);
2517 return 0;
2518 }
2519 #else
2520 /* Workaround for gcc 2.96 */
2521 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2522 {
2523 return 0;
2524 }
2525 #endif /* __PAGETABLE_PUD_FOLDED */
2527 #ifndef __PAGETABLE_PMD_FOLDED
2528 /*
2529 * Allocate page middle directory.
2530 * We've already handled the fast-path in-line.
2531 */
2532 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2533 {
2534 pmd_t *new = pmd_alloc_one(mm, address);
2535 if (!new)
2536 return -ENOMEM;
2538 spin_lock(&mm->page_table_lock);
2539 #ifndef __ARCH_HAS_4LEVEL_HACK
2540 if (pud_present(*pud)) /* Another has populated it */
2541 pmd_free(new);
2542 else
2543 pud_populate(mm, pud, new);
2544 #else
2545 if (pgd_present(*pud)) /* Another has populated it */
2546 pmd_free(new);
2547 else
2548 pgd_populate(mm, pud, new);
2549 #endif /* __ARCH_HAS_4LEVEL_HACK */
2550 spin_unlock(&mm->page_table_lock);
2551 return 0;
2552 }
2553 #else
2554 /* Workaround for gcc 2.96 */
2555 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2556 {
2557 return 0;
2558 }
2559 #endif /* __PAGETABLE_PMD_FOLDED */
2561 int make_pages_present(unsigned long addr, unsigned long end)
2562 {
2563 int ret, len, write;
2564 struct vm_area_struct * vma;
2566 vma = find_vma(current->mm, addr);
2567 if (!vma)
2568 return -1;
2569 write = (vma->vm_flags & VM_WRITE) != 0;
2570 BUG_ON(addr >= end);
2571 BUG_ON(end > vma->vm_end);
2572 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
2573 ret = get_user_pages(current, current->mm, addr,
2574 len, write, 0, NULL, NULL);
2575 if (ret < 0)
2576 return ret;
2577 return ret == len ? 0 : -1;
2578 }
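A worked example of the page-count arithmetic above plus a trivial caller; example_populate() is hypothetical and, like make_pages_present() itself, must stay within a single vma with addr < end.

/*
 * With 4K pages, addr = 0x1000 and end = 0x3001 span pages 0x1000, 0x2000
 * and 0x3000: (0x3001 + 0xfff)/0x1000 - 0x1000/0x1000 = 4 - 1 = 3.
 */
static int example_populate(unsigned long start, unsigned long len)
{
	return make_pages_present(start, start + len);	/* 0 on success, -1 otherwise */
}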
2580 /*
2581 * Map a vmalloc()-space virtual address to the physical page.
2582 */
2583 struct page * vmalloc_to_page(void * vmalloc_addr)
2584 {
2585 unsigned long addr = (unsigned long) vmalloc_addr;
2586 struct page *page = NULL;
2587 pgd_t *pgd = pgd_offset_k(addr);
2588 pud_t *pud;
2589 pmd_t *pmd;
2590 pte_t *ptep, pte;
2592 if (!pgd_none(*pgd)) {
2593 pud = pud_offset(pgd, addr);
2594 if (!pud_none(*pud)) {
2595 pmd = pmd_offset(pud, addr);
2596 if (!pmd_none(*pmd)) {
2597 ptep = pte_offset_map(pmd, addr);
2598 pte = *ptep;
2599 if (pte_present(pte))
2600 page = pte_page(pte);
2601 pte_unmap(ptep);
2602 }
2603 }
2604 }
2605 return page;
2606 }
2608 EXPORT_SYMBOL(vmalloc_to_page);
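A small, hedged driver-style sketch: collect the struct page behind every page of a vmalloc() buffer, e.g. in order to map it elsewhere later; example_vmalloc_pages() is a made-up name.

static void example_vmalloc_pages(void *buf, struct page **pages, int count)
{
	int i;

	/* vmalloc() returns page-aligned memory, so stepping by PAGE_SIZE
	 * visits each distinct page exactly once. */
	for (i = 0; i < count; i++)
		pages[i] = vmalloc_to_page((char *)buf + i * PAGE_SIZE);
}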
2610 /*
2611 * Map a vmalloc()-space virtual address to the physical page frame number.
2612 */
2613 unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2614 {
2615 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2616 }
2618 EXPORT_SYMBOL(vmalloc_to_pfn);
2620 #if !defined(__HAVE_ARCH_GATE_AREA)
2622 #if defined(AT_SYSINFO_EHDR)
2623 static struct vm_area_struct gate_vma;
2625 static int __init gate_vma_init(void)
2626 {
2627 gate_vma.vm_mm = NULL;
2628 gate_vma.vm_start = FIXADDR_USER_START;
2629 gate_vma.vm_end = FIXADDR_USER_END;
2630 gate_vma.vm_page_prot = PAGE_READONLY;
2631 gate_vma.vm_flags = 0;
2632 return 0;
2633 }
2634 __initcall(gate_vma_init);
2635 #endif
2637 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2638 {
2639 #ifdef AT_SYSINFO_EHDR
2640 return &gate_vma;
2641 #else
2642 return NULL;
2643 #endif
2644 }
2646 int in_gate_area_no_task(unsigned long addr)
2647 {
2648 #ifdef AT_SYSINFO_EHDR
2649 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
2650 return 1;
2651 #endif
2652 return 0;
2653 }
2655 #endif /* __HAVE_ARCH_GATE_AREA */