ia64/xen-unstable

view linux-2.6-xen-sparse/mm/memory.c @ 7446:18eb059ae471

New network-bridge script and associated gubbins.

This is Kurt Garloff's reworked network-bridge script:

* we got rid of ifconfig
* it works for netdev != eth0
* arp on and off are symmetric, as are ifdown and ifup
* ifup will be passed the ifcfg config file name if needed
(otherwise ifup may decide that the veth0 hardware is NOT the
same as the original ${netdev} and not use the same config --
this happens on SUSE. Charles Coffing tracked this one down.)

Plus Kurt's avoid-dash patch:

the network setup scripts on SUSE have trouble with the bridge
name xen-br0; they don't expect the '-'.
Arguably this should be fixed.
But I assume there are more scripts out there that may not like it,
so I suggest the following patch to rename xen-br0 to xenbr0.

Plus Charles Duffy's patch to support multiple bridges:

The attached patch allows the network-bridge script to be used to
generate multiple bridges corresponding to different physical
interfaces. It adds a new parameter, "vifnum", which both selects the
loopback interface to be used and sets the defaults for the physical
interface and bridge name.

Thus, if one wishes to start xenbr0 on eth0 and xenbr1 on eth1, one
need only call:

network-bridge start ## vifnum is 0 by default
network-bridge start vifnum=1

...well, that and set loopback.nloopbacks=2 in the Dom0 kernel
parameters.
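
As a rough illustration (an assumed GRUB setup -- the menu title, disk
device and file paths below are placeholders, not part of this changeset),
that parameter is simply appended to the Dom0 kernel's module line:

title Xen / Dom0
    root (hd0,0)
    kernel /boot/xen.gz
    module /boot/vmlinuz-2.6-xen root=/dev/sda1 ro console=tty0 loopback.nloopbacks=2
    module /boot/initrd-2.6-xen.img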

Plus renaming of virtnum to vifnum in Charles' patch, as requested by Ian Pratt.

Plus a fix to DevController to allocate vif IDs starting from 0 (i.e. vif2.0
is now domain 2's first vif, as opposed to vif2.1 in the recent past).

Plus tidying up inside network-bridge using some helper variables.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@leeni.uk.xensource.com
date Wed Oct 19 16:24:54 2005 +0100 (2005-10-19)
parents 06d84bf87159
children 990c009015e8
line source
1 /*
2 * linux/mm/memory.c
3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */
7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus
10 */
12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus.
15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see.
19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */
23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */
31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de)
37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */
41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h>
43 #include <linux/hugetlb.h>
44 #include <linux/mman.h>
45 #include <linux/swap.h>
46 #include <linux/highmem.h>
47 #include <linux/pagemap.h>
48 #include <linux/rmap.h>
49 #include <linux/module.h>
50 #include <linux/init.h>
52 #include <asm/pgalloc.h>
53 #include <asm/uaccess.h>
54 #include <asm/tlb.h>
55 #include <asm/tlbflush.h>
56 #include <asm/pgtable.h>
58 #include <linux/swapops.h>
59 #include <linux/elf.h>
61 #ifndef CONFIG_DISCONTIGMEM
62 /* use the per-pgdat data instead for discontigmem - mbligh */
63 unsigned long max_mapnr;
64 struct page *mem_map;
66 EXPORT_SYMBOL(max_mapnr);
67 EXPORT_SYMBOL(mem_map);
68 #endif
70 unsigned long num_physpages;
71 /*
72 * A number of key systems in x86 including ioremap() rely on the assumption
73 * that high_memory defines the upper bound on direct map memory, the end
74 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
75 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
76 * and ZONE_HIGHMEM.
77 */
78 void * high_memory;
79 unsigned long vmalloc_earlyreserve;
81 EXPORT_SYMBOL(num_physpages);
82 EXPORT_SYMBOL(high_memory);
83 EXPORT_SYMBOL(vmalloc_earlyreserve);
85 /*
86 * If a p?d_bad entry is found while walking page tables, report
87 * the error, before resetting entry to p?d_none. Usually (but
88 * very seldom) called out from the p?d_none_or_clear_bad macros.
89 */
91 void pgd_clear_bad(pgd_t *pgd)
92 {
93 pgd_ERROR(*pgd);
94 pgd_clear(pgd);
95 }
97 void pud_clear_bad(pud_t *pud)
98 {
99 pud_ERROR(*pud);
100 pud_clear(pud);
101 }
103 void pmd_clear_bad(pmd_t *pmd)
104 {
105 pmd_ERROR(*pmd);
106 pmd_clear(pmd);
107 }
109 /*
110 * Note: this doesn't free the actual pages themselves. That
111 * has been handled earlier when unmapping all the memory regions.
112 */
113 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114 {
115 struct page *page = pmd_page(*pmd);
116 pmd_clear(pmd);
117 pte_free_tlb(tlb, page);
118 dec_page_state(nr_page_table_pages);
119 tlb->mm->nr_ptes--;
120 }
122 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
123 unsigned long addr, unsigned long end,
124 unsigned long floor, unsigned long ceiling)
125 {
126 pmd_t *pmd;
127 unsigned long next;
128 unsigned long start;
130 start = addr;
131 pmd = pmd_offset(pud, addr);
132 do {
133 next = pmd_addr_end(addr, end);
134 if (pmd_none_or_clear_bad(pmd))
135 continue;
136 free_pte_range(tlb, pmd);
137 } while (pmd++, addr = next, addr != end);
139 start &= PUD_MASK;
140 if (start < floor)
141 return;
142 if (ceiling) {
143 ceiling &= PUD_MASK;
144 if (!ceiling)
145 return;
146 }
147 if (end - 1 > ceiling - 1)
148 return;
150 pmd = pmd_offset(pud, start);
151 pud_clear(pud);
152 pmd_free_tlb(tlb, pmd);
153 }
155 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
156 unsigned long addr, unsigned long end,
157 unsigned long floor, unsigned long ceiling)
158 {
159 pud_t *pud;
160 unsigned long next;
161 unsigned long start;
163 start = addr;
164 pud = pud_offset(pgd, addr);
165 do {
166 next = pud_addr_end(addr, end);
167 if (pud_none_or_clear_bad(pud))
168 continue;
169 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
170 } while (pud++, addr = next, addr != end);
172 start &= PGDIR_MASK;
173 if (start < floor)
174 return;
175 if (ceiling) {
176 ceiling &= PGDIR_MASK;
177 if (!ceiling)
178 return;
179 }
180 if (end - 1 > ceiling - 1)
181 return;
183 pud = pud_offset(pgd, start);
184 pgd_clear(pgd);
185 pud_free_tlb(tlb, pud);
186 }
188 /*
189 * This function frees user-level page tables of a process.
190 *
191 * Must be called with pagetable lock held.
192 */
193 void free_pgd_range(struct mmu_gather **tlb,
194 unsigned long addr, unsigned long end,
195 unsigned long floor, unsigned long ceiling)
196 {
197 pgd_t *pgd;
198 unsigned long next;
199 unsigned long start;
201 /*
202 * The next few lines have given us lots of grief...
203 *
204 * Why are we testing PMD* at this top level? Because often
205 * there will be no work to do at all, and we'd prefer not to
206 * go all the way down to the bottom just to discover that.
207 *
208 * Why all these "- 1"s? Because 0 represents both the bottom
209 * of the address space and the top of it (using -1 for the
210 * top wouldn't help much: the masks would do the wrong thing).
211 * The rule is that addr 0 and floor 0 refer to the bottom of
212 * the address space, but end 0 and ceiling 0 refer to the top
213 * Comparisons need to use "end - 1" and "ceiling - 1" (though
214 * that end 0 case should be mythical).
215 *
216 * Wherever addr is brought up or ceiling brought down, we must
217 * be careful to reject "the opposite 0" before it confuses the
218 * subsequent tests. But what about where end is brought down
219 * by PMD_SIZE below? no, end can't go down to 0 there.
220 *
221 * Whereas we round start (addr) and ceiling down, by different
222 * masks at different levels, in order to test whether a table
223 * now has no other vmas using it, so can be freed, we don't
224 * bother to round floor or end up - the tests don't need that.
225 */
227 addr &= PMD_MASK;
228 if (addr < floor) {
229 addr += PMD_SIZE;
230 if (!addr)
231 return;
232 }
233 if (ceiling) {
234 ceiling &= PMD_MASK;
235 if (!ceiling)
236 return;
237 }
238 if (end - 1 > ceiling - 1)
239 end -= PMD_SIZE;
240 if (addr > end - 1)
241 return;
243 start = addr;
244 pgd = pgd_offset((*tlb)->mm, addr);
245 do {
246 next = pgd_addr_end(addr, end);
247 if (pgd_none_or_clear_bad(pgd))
248 continue;
249 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
250 } while (pgd++, addr = next, addr != end);
252 if (!tlb_is_full_mm(*tlb))
253 flush_tlb_pgtables((*tlb)->mm, start, end);
254 }
256 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
257 unsigned long floor, unsigned long ceiling)
258 {
259 while (vma) {
260 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start;
263 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
264 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
265 floor, next? next->vm_start: ceiling);
266 } else {
267 /*
268 * Optimization: gather nearby vmas into one call down
269 */
270 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
271 && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
272 HPAGE_SIZE)) {
273 vma = next;
274 next = vma->vm_next;
275 }
276 free_pgd_range(tlb, addr, vma->vm_end,
277 floor, next? next->vm_start: ceiling);
278 }
279 vma = next;
280 }
281 }
283 pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
284 unsigned long address)
285 {
286 if (!pmd_present(*pmd)) {
287 struct page *new;
289 spin_unlock(&mm->page_table_lock);
290 new = pte_alloc_one(mm, address);
291 spin_lock(&mm->page_table_lock);
292 if (!new)
293 return NULL;
294 /*
295 * Because we dropped the lock, we should re-check the
296 * entry, as somebody else could have populated it..
297 */
298 if (pmd_present(*pmd)) {
299 pte_free(new);
300 goto out;
301 }
302 mm->nr_ptes++;
303 inc_page_state(nr_page_table_pages);
304 pmd_populate(mm, pmd, new);
305 }
306 out:
307 return pte_offset_map(pmd, address);
308 }
310 pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
311 {
312 if (!pmd_present(*pmd)) {
313 pte_t *new;
315 spin_unlock(&mm->page_table_lock);
316 new = pte_alloc_one_kernel(mm, address);
317 spin_lock(&mm->page_table_lock);
318 if (!new)
319 return NULL;
321 /*
322 * Because we dropped the lock, we should re-check the
323 * entry, as somebody else could have populated it..
324 */
325 if (pmd_present(*pmd)) {
326 pte_free_kernel(new);
327 goto out;
328 }
329 pmd_populate_kernel(mm, pmd, new);
330 }
331 out:
332 return pte_offset_kernel(pmd, address);
333 }
335 /*
336 * copy one vm_area from one task to the other. Assumes the page tables
337 * already present in the new task to be cleared in the whole range
338 * covered by this vma.
339 *
340 * dst->page_table_lock is held on entry and exit,
341 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
342 */
344 static inline void
345 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
346 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
347 unsigned long addr)
348 {
349 pte_t pte = *src_pte;
350 struct page *page;
351 unsigned long pfn;
353 /* pte contains position in swap or file, so copy. */
354 if (unlikely(!pte_present(pte))) {
355 if (!pte_file(pte)) {
356 swap_duplicate(pte_to_swp_entry(pte));
357 /* make sure dst_mm is on swapoff's mmlist. */
358 if (unlikely(list_empty(&dst_mm->mmlist))) {
359 spin_lock(&mmlist_lock);
360 list_add(&dst_mm->mmlist, &src_mm->mmlist);
361 spin_unlock(&mmlist_lock);
362 }
363 }
364 set_pte_at(dst_mm, addr, dst_pte, pte);
365 return;
366 }
368 pfn = pte_pfn(pte);
369 /* the pte points outside of valid memory, the
370 * mapping is assumed to be good, meaningful
371 * and not mapped via rmap - duplicate the
372 * mapping as is.
373 */
374 page = NULL;
375 if (pfn_valid(pfn))
376 page = pfn_to_page(pfn);
378 if (!page || PageReserved(page)) {
379 set_pte_at(dst_mm, addr, dst_pte, pte);
380 return;
381 }
383 /*
384 * If it's a COW mapping, write protect it both
385 * in the parent and the child
386 */
387 if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
388 ptep_set_wrprotect(src_mm, addr, src_pte);
389 pte = *src_pte;
390 }
392 /*
393 * If it's a shared mapping, mark it clean in
394 * the child
395 */
396 if (vm_flags & VM_SHARED)
397 pte = pte_mkclean(pte);
398 pte = pte_mkold(pte);
399 get_page(page);
400 inc_mm_counter(dst_mm, rss);
401 if (PageAnon(page))
402 inc_mm_counter(dst_mm, anon_rss);
403 set_pte_at(dst_mm, addr, dst_pte, pte);
404 page_dup_rmap(page);
405 }
407 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
408 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
409 unsigned long addr, unsigned long end)
410 {
411 pte_t *src_pte, *dst_pte;
412 unsigned long vm_flags = vma->vm_flags;
413 int progress;
415 again:
416 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
417 if (!dst_pte)
418 return -ENOMEM;
419 src_pte = pte_offset_map_nested(src_pmd, addr);
421 progress = 0;
422 spin_lock(&src_mm->page_table_lock);
423 do {
424 /*
425 * We are holding two locks at this point - either of them
426 * could generate latencies in another task on another CPU.
427 */
428 if (progress >= 32 && (need_resched() ||
429 need_lockbreak(&src_mm->page_table_lock) ||
430 need_lockbreak(&dst_mm->page_table_lock)))
431 break;
432 if (pte_none(*src_pte)) {
433 progress++;
434 continue;
435 }
436 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
437 progress += 8;
438 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
439 spin_unlock(&src_mm->page_table_lock);
441 pte_unmap_nested(src_pte - 1);
442 pte_unmap(dst_pte - 1);
443 cond_resched_lock(&dst_mm->page_table_lock);
444 if (addr != end)
445 goto again;
446 return 0;
447 }
449 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
450 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
451 unsigned long addr, unsigned long end)
452 {
453 pmd_t *src_pmd, *dst_pmd;
454 unsigned long next;
456 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
457 if (!dst_pmd)
458 return -ENOMEM;
459 src_pmd = pmd_offset(src_pud, addr);
460 do {
461 next = pmd_addr_end(addr, end);
462 if (pmd_none_or_clear_bad(src_pmd))
463 continue;
464 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
465 vma, addr, next))
466 return -ENOMEM;
467 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
468 return 0;
469 }
471 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
472 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
473 unsigned long addr, unsigned long end)
474 {
475 pud_t *src_pud, *dst_pud;
476 unsigned long next;
478 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
479 if (!dst_pud)
480 return -ENOMEM;
481 src_pud = pud_offset(src_pgd, addr);
482 do {
483 next = pud_addr_end(addr, end);
484 if (pud_none_or_clear_bad(src_pud))
485 continue;
486 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
487 vma, addr, next))
488 return -ENOMEM;
489 } while (dst_pud++, src_pud++, addr = next, addr != end);
490 return 0;
491 }
493 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
494 struct vm_area_struct *vma)
495 {
496 pgd_t *src_pgd, *dst_pgd;
497 unsigned long next;
498 unsigned long addr = vma->vm_start;
499 unsigned long end = vma->vm_end;
501 if (is_vm_hugetlb_page(vma))
502 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
504 dst_pgd = pgd_offset(dst_mm, addr);
505 src_pgd = pgd_offset(src_mm, addr);
506 do {
507 next = pgd_addr_end(addr, end);
508 if (pgd_none_or_clear_bad(src_pgd))
509 continue;
510 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
511 vma, addr, next))
512 return -ENOMEM;
513 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
514 return 0;
515 }
517 static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
518 unsigned long addr, unsigned long end,
519 struct zap_details *details)
520 {
521 pte_t *pte;
523 pte = pte_offset_map(pmd, addr);
524 do {
525 pte_t ptent = *pte;
526 if (pte_none(ptent))
527 continue;
528 if (pte_present(ptent)) {
529 struct page *page = NULL;
530 unsigned long pfn = pte_pfn(ptent);
531 if (pfn_valid(pfn)) {
532 page = pfn_to_page(pfn);
533 if (PageReserved(page))
534 page = NULL;
535 }
536 if (unlikely(details) && page) {
537 /*
538 * unmap_shared_mapping_pages() wants to
539 * invalidate cache without truncating:
540 * unmap shared but keep private pages.
541 */
542 if (details->check_mapping &&
543 details->check_mapping != page->mapping)
544 continue;
545 /*
546 * Each page->index must be checked when
547 * invalidating or truncating nonlinear.
548 */
549 if (details->nonlinear_vma &&
550 (page->index < details->first_index ||
551 page->index > details->last_index))
552 continue;
553 }
554 ptent = ptep_get_and_clear(tlb->mm, addr, pte);
555 tlb_remove_tlb_entry(tlb, pte, addr);
556 if (unlikely(!page))
557 continue;
558 if (unlikely(details) && details->nonlinear_vma
559 && linear_page_index(details->nonlinear_vma,
560 addr) != page->index)
561 set_pte_at(tlb->mm, addr, pte,
562 pgoff_to_pte(page->index));
563 if (pte_dirty(ptent))
564 set_page_dirty(page);
565 if (PageAnon(page))
566 dec_mm_counter(tlb->mm, anon_rss);
567 else if (pte_young(ptent))
568 mark_page_accessed(page);
569 tlb->freed++;
570 page_remove_rmap(page);
571 tlb_remove_page(tlb, page);
572 continue;
573 }
574 /*
575 * If details->check_mapping, we leave swap entries;
576 * if details->nonlinear_vma, we leave file entries.
577 */
578 if (unlikely(details))
579 continue;
580 if (!pte_file(ptent))
581 free_swap_and_cache(pte_to_swp_entry(ptent));
582 pte_clear(tlb->mm, addr, pte);
583 } while (pte++, addr += PAGE_SIZE, addr != end);
584 pte_unmap(pte - 1);
585 }
587 static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
588 unsigned long addr, unsigned long end,
589 struct zap_details *details)
590 {
591 pmd_t *pmd;
592 unsigned long next;
594 pmd = pmd_offset(pud, addr);
595 do {
596 next = pmd_addr_end(addr, end);
597 if (pmd_none_or_clear_bad(pmd))
598 continue;
599 zap_pte_range(tlb, pmd, addr, next, details);
600 } while (pmd++, addr = next, addr != end);
601 }
603 static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
604 unsigned long addr, unsigned long end,
605 struct zap_details *details)
606 {
607 pud_t *pud;
608 unsigned long next;
610 pud = pud_offset(pgd, addr);
611 do {
612 next = pud_addr_end(addr, end);
613 if (pud_none_or_clear_bad(pud))
614 continue;
615 zap_pmd_range(tlb, pud, addr, next, details);
616 } while (pud++, addr = next, addr != end);
617 }
619 static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
620 unsigned long addr, unsigned long end,
621 struct zap_details *details)
622 {
623 pgd_t *pgd;
624 unsigned long next;
626 if (details && !details->check_mapping && !details->nonlinear_vma)
627 details = NULL;
629 BUG_ON(addr >= end);
630 tlb_start_vma(tlb, vma);
631 pgd = pgd_offset(vma->vm_mm, addr);
632 do {
633 next = pgd_addr_end(addr, end);
634 if (pgd_none_or_clear_bad(pgd))
635 continue;
636 zap_pud_range(tlb, pgd, addr, next, details);
637 } while (pgd++, addr = next, addr != end);
638 tlb_end_vma(tlb, vma);
639 }
641 #ifdef CONFIG_PREEMPT
642 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
643 #else
644 /* No preempt: go for improved straight-line efficiency */
645 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
646 #endif
648 /**
649 * unmap_vmas - unmap a range of memory covered by a list of vma's
650 * @tlbp: address of the caller's struct mmu_gather
651 * @mm: the controlling mm_struct
652 * @vma: the starting vma
653 * @start_addr: virtual address at which to start unmapping
654 * @end_addr: virtual address at which to end unmapping
655 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
656 * @details: details of nonlinear truncation or shared cache invalidation
657 *
658 * Returns the end address of the unmapping (restart addr if interrupted).
659 *
660 * Unmap all pages in the vma list. Called under page_table_lock.
661 *
662 * We aim to not hold page_table_lock for too long (for scheduling latency
663 * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
664 * return the ending mmu_gather to the caller.
665 *
666 * Only addresses between `start' and `end' will be unmapped.
667 *
668 * The VMA list must be sorted in ascending virtual address order.
669 *
670 * unmap_vmas() assumes that the caller will flush the whole unmapped address
671 * range after unmap_vmas() returns. So the only responsibility here is to
672 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
673 * drops the lock and schedules.
674 */
675 unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
676 struct vm_area_struct *vma, unsigned long start_addr,
677 unsigned long end_addr, unsigned long *nr_accounted,
678 struct zap_details *details)
679 {
680 unsigned long zap_bytes = ZAP_BLOCK_SIZE;
681 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
682 int tlb_start_valid = 0;
683 unsigned long start = start_addr;
684 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
685 int fullmm = tlb_is_full_mm(*tlbp);
687 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
688 unsigned long end;
690 start = max(vma->vm_start, start_addr);
691 if (start >= vma->vm_end)
692 continue;
693 end = min(vma->vm_end, end_addr);
694 if (end <= vma->vm_start)
695 continue;
697 if (vma->vm_flags & VM_ACCOUNT)
698 *nr_accounted += (end - start) >> PAGE_SHIFT;
700 while (start != end) {
701 unsigned long block;
703 if (!tlb_start_valid) {
704 tlb_start = start;
705 tlb_start_valid = 1;
706 }
708 if (is_vm_hugetlb_page(vma)) {
709 block = end - start;
710 unmap_hugepage_range(vma, start, end);
711 } else {
712 block = min(zap_bytes, end - start);
713 unmap_page_range(*tlbp, vma, start,
714 start + block, details);
715 }
717 start += block;
718 zap_bytes -= block;
719 if ((long)zap_bytes > 0)
720 continue;
722 tlb_finish_mmu(*tlbp, tlb_start, start);
724 if (need_resched() ||
725 need_lockbreak(&mm->page_table_lock) ||
726 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
727 if (i_mmap_lock) {
728 /* must reset count of rss freed */
729 *tlbp = tlb_gather_mmu(mm, fullmm);
730 goto out;
731 }
732 spin_unlock(&mm->page_table_lock);
733 cond_resched();
734 spin_lock(&mm->page_table_lock);
735 }
737 *tlbp = tlb_gather_mmu(mm, fullmm);
738 tlb_start_valid = 0;
739 zap_bytes = ZAP_BLOCK_SIZE;
740 }
741 }
742 out:
743 return start; /* which is now the end (or restart) address */
744 }
746 /**
747 * zap_page_range - remove user pages in a given range
748 * @vma: vm_area_struct holding the applicable pages
749 * @address: starting address of pages to zap
750 * @size: number of bytes to zap
751 * @details: details of nonlinear truncation or shared cache invalidation
752 */
753 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
754 unsigned long size, struct zap_details *details)
755 {
756 struct mm_struct *mm = vma->vm_mm;
757 struct mmu_gather *tlb;
758 unsigned long end = address + size;
759 unsigned long nr_accounted = 0;
761 if (is_vm_hugetlb_page(vma)) {
762 zap_hugepage_range(vma, address, size);
763 return end;
764 }
766 lru_add_drain();
767 spin_lock(&mm->page_table_lock);
768 tlb = tlb_gather_mmu(mm, 0);
769 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
770 tlb_finish_mmu(tlb, address, end);
771 spin_unlock(&mm->page_table_lock);
772 return end;
773 }
775 /*
776 * Do a quick page-table lookup for a single page.
777 * mm->page_table_lock must be held.
778 */
779 static struct page *
780 __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
781 {
782 pgd_t *pgd;
783 pud_t *pud;
784 pmd_t *pmd;
785 pte_t *ptep, pte;
786 unsigned long pfn;
787 struct page *page;
789 page = follow_huge_addr(mm, address, write);
790 if (! IS_ERR(page))
791 return page;
793 pgd = pgd_offset(mm, address);
794 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
795 goto out;
797 pud = pud_offset(pgd, address);
798 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
799 goto out;
801 pmd = pmd_offset(pud, address);
802 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
803 goto out;
804 if (pmd_huge(*pmd))
805 return follow_huge_pmd(mm, address, pmd, write);
807 ptep = pte_offset_map(pmd, address);
808 if (!ptep)
809 goto out;
811 pte = *ptep;
812 pte_unmap(ptep);
813 if (pte_present(pte)) {
814 if (write && !pte_write(pte))
815 goto out;
816 if (read && !pte_read(pte))
817 goto out;
818 pfn = pte_pfn(pte);
819 if (pfn_valid(pfn)) {
820 page = pfn_to_page(pfn);
821 if (write && !pte_dirty(pte) && !PageDirty(page))
822 set_page_dirty(page);
823 mark_page_accessed(page);
824 return page;
825 }
826 }
828 out:
829 return NULL;
830 }
832 struct page *
833 follow_page(struct mm_struct *mm, unsigned long address, int write)
834 {
835 return __follow_page(mm, address, /*read*/0, write);
836 }
838 int
839 check_user_page_readable(struct mm_struct *mm, unsigned long address)
840 {
841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
842 }
844 EXPORT_SYMBOL(check_user_page_readable);
846 /*
847 * Given a physical address, is there a useful struct page pointing to
848 * it? This may become more complex in the future if we start dealing
849 * with IO-aperture pages for direct-IO.
850 */
852 static inline struct page *get_page_map(struct page *page)
853 {
854 if (!pfn_valid(page_to_pfn(page)))
855 return NULL;
856 return page;
857 }
860 static inline int
861 untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
862 unsigned long address)
863 {
864 pgd_t *pgd;
865 pud_t *pud;
866 pmd_t *pmd;
868 /* Check if the vma is for an anonymous mapping. */
869 if (vma->vm_ops && vma->vm_ops->nopage)
870 return 0;
872 /* Check if page directory entry exists. */
873 pgd = pgd_offset(mm, address);
874 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
875 return 1;
877 pud = pud_offset(pgd, address);
878 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
879 return 1;
881 /* Check if page middle directory entry exists. */
882 pmd = pmd_offset(pud, address);
883 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
884 return 1;
886 /* There is a pte slot for 'address' in 'mm'. */
887 return 0;
888 }
891 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
892 unsigned long start, int len, int write, int force,
893 struct page **pages, struct vm_area_struct **vmas)
894 {
895 int i;
896 unsigned int flags;
898 /*
899 * Require read or write permissions.
900 * If 'force' is set, we only require the "MAY" flags.
901 */
902 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
903 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
904 i = 0;
906 do {
907 struct vm_area_struct * vma;
909 vma = find_extend_vma(mm, start);
910 if (!vma && in_gate_area(tsk, start)) {
911 unsigned long pg = start & PAGE_MASK;
912 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
913 pgd_t *pgd;
914 pud_t *pud;
915 pmd_t *pmd;
916 pte_t *pte;
917 if (write) /* user gate pages are read-only */
918 return i ? : -EFAULT;
919 if (pg > TASK_SIZE)
920 pgd = pgd_offset_k(pg);
921 else
922 pgd = pgd_offset_gate(mm, pg);
923 BUG_ON(pgd_none(*pgd));
924 pud = pud_offset(pgd, pg);
925 BUG_ON(pud_none(*pud));
926 pmd = pmd_offset(pud, pg);
927 BUG_ON(pmd_none(*pmd));
928 pte = pte_offset_map(pmd, pg);
929 BUG_ON(pte_none(*pte));
930 if (pages) {
931 pages[i] = pte_page(*pte);
932 get_page(pages[i]);
933 }
934 pte_unmap(pte);
935 if (vmas)
936 vmas[i] = gate_vma;
937 i++;
938 start += PAGE_SIZE;
939 len--;
940 continue;
941 }
943 if (vma && (vma->vm_flags & VM_FOREIGN))
944 {
945 struct page **map = vma->vm_private_data;
946 int offset = (start - vma->vm_start) >> PAGE_SHIFT;
948 if (map[offset] != NULL) {
949 if (pages) {
950 pages[i] = map[offset];
951 }
952 if (vmas)
953 vmas[i] = vma;
954 i++;
955 start += PAGE_SIZE;
956 len--;
957 continue;
958 }
959 }
961 if (!vma || (vma->vm_flags & VM_IO)
962 || !(flags & vma->vm_flags))
963 return i ? : -EFAULT;
965 if (is_vm_hugetlb_page(vma)) {
966 i = follow_hugetlb_page(mm, vma, pages, vmas,
967 &start, &len, i);
968 continue;
969 }
970 spin_lock(&mm->page_table_lock);
971 do {
972 struct page *map;
973 int lookup_write = write;
975 cond_resched_lock(&mm->page_table_lock);
976 while (!(map = follow_page(mm, start, lookup_write))) {
977 /*
978 * Shortcut for anonymous pages. We don't want
979 * to force the creation of page tables for
980 * insanely big anonymously mapped areas that
981 * nobody touched so far. This is important
982 * for doing a core dump for these mappings.
983 */
984 if (!lookup_write &&
985 untouched_anonymous_page(mm,vma,start)) {
986 map = ZERO_PAGE(start);
987 break;
988 }
989 spin_unlock(&mm->page_table_lock);
990 switch (handle_mm_fault(mm,vma,start,write)) {
991 case VM_FAULT_MINOR:
992 tsk->min_flt++;
993 break;
994 case VM_FAULT_MAJOR:
995 tsk->maj_flt++;
996 break;
997 case VM_FAULT_SIGBUS:
998 return i ? i : -EFAULT;
999 case VM_FAULT_OOM:
1000 return i ? i : -ENOMEM;
1001 default:
1002 BUG();
1004 /*
1005 * Now that we have performed a write fault
1006 * and surely no longer have a shared page we
1007 * shouldn't write, we shouldn't ignore an
1008 * unwritable page in the page table if
1009 * we are forcing write access.
1010 */
1011 lookup_write = write && !force;
1012 spin_lock(&mm->page_table_lock);
1014 if (pages) {
1015 pages[i] = get_page_map(map);
1016 if (!pages[i]) {
1017 spin_unlock(&mm->page_table_lock);
1018 while (i--)
1019 page_cache_release(pages[i]);
1020 i = -EFAULT;
1021 goto out;
1023 flush_dcache_page(pages[i]);
1024 if (!PageReserved(pages[i]))
1025 page_cache_get(pages[i]);
1027 if (vmas)
1028 vmas[i] = vma;
1029 i++;
1030 start += PAGE_SIZE;
1031 len--;
1032 } while(len && start < vma->vm_end);
1033 spin_unlock(&mm->page_table_lock);
1034 } while(len);
1035 out:
1036 return i;
1039 EXPORT_SYMBOL(get_user_pages);
1041 static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1042 unsigned long addr, unsigned long end, pgprot_t prot)
1044 pte_t *pte;
1046 pte = pte_alloc_map(mm, pmd, addr);
1047 if (!pte)
1048 return -ENOMEM;
1049 do {
1050 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
1051 BUG_ON(!pte_none(*pte));
1052 set_pte_at(mm, addr, pte, zero_pte);
1053 } while (pte++, addr += PAGE_SIZE, addr != end);
1054 pte_unmap(pte - 1);
1055 return 0;
1058 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1059 unsigned long addr, unsigned long end, pgprot_t prot)
1061 pmd_t *pmd;
1062 unsigned long next;
1064 pmd = pmd_alloc(mm, pud, addr);
1065 if (!pmd)
1066 return -ENOMEM;
1067 do {
1068 next = pmd_addr_end(addr, end);
1069 if (zeromap_pte_range(mm, pmd, addr, next, prot))
1070 return -ENOMEM;
1071 } while (pmd++, addr = next, addr != end);
1072 return 0;
1075 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1076 unsigned long addr, unsigned long end, pgprot_t prot)
1078 pud_t *pud;
1079 unsigned long next;
1081 pud = pud_alloc(mm, pgd, addr);
1082 if (!pud)
1083 return -ENOMEM;
1084 do {
1085 next = pud_addr_end(addr, end);
1086 if (zeromap_pmd_range(mm, pud, addr, next, prot))
1087 return -ENOMEM;
1088 } while (pud++, addr = next, addr != end);
1089 return 0;
1092 int zeromap_page_range(struct vm_area_struct *vma,
1093 unsigned long addr, unsigned long size, pgprot_t prot)
1095 pgd_t *pgd;
1096 unsigned long next;
1097 unsigned long end = addr + size;
1098 struct mm_struct *mm = vma->vm_mm;
1099 int err;
1101 BUG_ON(addr >= end);
1102 pgd = pgd_offset(mm, addr);
1103 flush_cache_range(vma, addr, end);
1104 spin_lock(&mm->page_table_lock);
1105 do {
1106 next = pgd_addr_end(addr, end);
1107 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1108 if (err)
1109 break;
1110 } while (pgd++, addr = next, addr != end);
1111 spin_unlock(&mm->page_table_lock);
1112 return err;
1115 /*
1116 * maps a range of physical memory into the requested pages. the old
1117 * mappings are removed. any references to nonexistent pages result
1118 * in null mappings (currently treated as "copy-on-access")
1119 */
1120 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1121 unsigned long addr, unsigned long end,
1122 unsigned long pfn, pgprot_t prot)
1124 pte_t *pte;
1126 pte = pte_alloc_map(mm, pmd, addr);
1127 if (!pte)
1128 return -ENOMEM;
1129 do {
1130 BUG_ON(!pte_none(*pte));
1131 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
1132 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1133 pfn++;
1134 } while (pte++, addr += PAGE_SIZE, addr != end);
1135 pte_unmap(pte - 1);
1136 return 0;
1139 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1140 unsigned long addr, unsigned long end,
1141 unsigned long pfn, pgprot_t prot)
1143 pmd_t *pmd;
1144 unsigned long next;
1146 pfn -= addr >> PAGE_SHIFT;
1147 pmd = pmd_alloc(mm, pud, addr);
1148 if (!pmd)
1149 return -ENOMEM;
1150 do {
1151 next = pmd_addr_end(addr, end);
1152 if (remap_pte_range(mm, pmd, addr, next,
1153 pfn + (addr >> PAGE_SHIFT), prot))
1154 return -ENOMEM;
1155 } while (pmd++, addr = next, addr != end);
1156 return 0;
1159 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1160 unsigned long addr, unsigned long end,
1161 unsigned long pfn, pgprot_t prot)
1163 pud_t *pud;
1164 unsigned long next;
1166 pfn -= addr >> PAGE_SHIFT;
1167 pud = pud_alloc(mm, pgd, addr);
1168 if (!pud)
1169 return -ENOMEM;
1170 do {
1171 next = pud_addr_end(addr, end);
1172 if (remap_pmd_range(mm, pud, addr, next,
1173 pfn + (addr >> PAGE_SHIFT), prot))
1174 return -ENOMEM;
1175 } while (pud++, addr = next, addr != end);
1176 return 0;
1179 /* Note: this is only safe if the mm semaphore is held when called. */
1180 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1181 unsigned long pfn, unsigned long size, pgprot_t prot)
1183 pgd_t *pgd;
1184 unsigned long next;
1185 unsigned long end = addr + size;
1186 struct mm_struct *mm = vma->vm_mm;
1187 int err;
1189 /*
1190 * Physically remapped pages are special. Tell the
1191 * rest of the world about it:
1192 * VM_IO tells people not to look at these pages
1193 * (accesses can have side effects).
1194 * VM_RESERVED tells swapout not to try to touch
1195 * this region.
1196 */
1197 vma->vm_flags |= VM_IO | VM_RESERVED;
1199 BUG_ON(addr >= end);
1200 pfn -= addr >> PAGE_SHIFT;
1201 pgd = pgd_offset(mm, addr);
1202 flush_cache_range(vma, addr, end);
1203 spin_lock(&mm->page_table_lock);
1204 do {
1205 next = pgd_addr_end(addr, end);
1206 err = remap_pud_range(mm, pgd, addr, next,
1207 pfn + (addr >> PAGE_SHIFT), prot);
1208 if (err)
1209 break;
1210 } while (pgd++, addr = next, addr != end);
1211 spin_unlock(&mm->page_table_lock);
1212 return err;
1214 EXPORT_SYMBOL(remap_pfn_range);
1216 static inline int generic_pte_range(struct mm_struct *mm,
1217 pmd_t *pmd,
1218 unsigned long addr,
1219 unsigned long end,
1220 pte_fn_t fn, void *data)
1222 pte_t *pte;
1223 int err;
1224 struct page *pte_page;
1226 pte = (mm == &init_mm) ?
1227 pte_alloc_kernel(mm, pmd, addr) :
1228 pte_alloc_map(mm, pmd, addr);
1229 if (!pte)
1230 return -ENOMEM;
1232 pte_page = pmd_page(*pmd);
1234 do {
1235 err = fn(pte, pte_page, addr, data);
1236 if (err)
1237 break;
1238 } while (pte++, addr += PAGE_SIZE, addr != end);
1240 if (mm != &init_mm)
1241 pte_unmap(pte-1);
1242 return err;
1246 static inline int generic_pmd_range(struct mm_struct *mm,
1247 pud_t *pud,
1248 unsigned long addr,
1249 unsigned long end,
1250 pte_fn_t fn, void *data)
1252 pmd_t *pmd;
1253 unsigned long next;
1254 int err;
1256 pmd = pmd_alloc(mm, pud, addr);
1257 if (!pmd)
1258 return -ENOMEM;
1259 do {
1260 next = pmd_addr_end(addr, end);
1261 err = generic_pte_range(mm, pmd, addr, next, fn, data);
1262 if (err)
1263 break;
1264 } while (pmd++, addr = next, addr != end);
1265 return err;
1268 static inline int generic_pud_range(struct mm_struct *mm, pgd_t *pgd,
1269 unsigned long addr,
1270 unsigned long end,
1271 pte_fn_t fn, void *data)
1273 pud_t *pud;
1274 unsigned long next;
1275 int err;
1277 pud = pud_alloc(mm, pgd, addr);
1278 if (!pud)
1279 return -ENOMEM;
1280 do {
1281 next = pud_addr_end(addr, end);
1282 err = generic_pmd_range(mm, pud, addr, next, fn, data);
1283 if (err)
1284 break;
1285 } while (pud++, addr = next, addr != end);
1286 return err;
1289 /*
1290 * Scan a region of virtual memory, filling in page tables as necessary
1291 * and calling a provided function on each leaf page table.
1292 */
1293 int generic_page_range(struct mm_struct *mm, unsigned long addr,
1294 unsigned long size, pte_fn_t fn, void *data)
1296 pgd_t *pgd;
1297 unsigned long next;
1298 unsigned long end = addr + size;
1299 int err;
1301 BUG_ON(addr >= end);
1302 pgd = pgd_offset(mm, addr);
1303 spin_lock(&mm->page_table_lock);
1304 do {
1305 next = pgd_addr_end(addr, end);
1306 err = generic_pud_range(mm, pgd, addr, next, fn, data);
1307 if (err)
1308 break;
1309 } while (pgd++, addr = next, addr != end);
1310 spin_unlock(&mm->page_table_lock);
1311 return err;
1314 /*
1315 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1316 * servicing faults for write access. In the normal case, we always want
1317 * pte_mkwrite. But get_user_pages can cause write faults for mappings
1318 * that do not have writing enabled, when used by access_process_vm.
1319 */
1320 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1322 if (likely(vma->vm_flags & VM_WRITE))
1323 pte = pte_mkwrite(pte);
1324 return pte;
1327 /*
1328 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
1329 */
1330 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
1331 pte_t *page_table)
1333 pte_t entry;
1335 entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
1336 vma);
1337 ptep_establish(vma, address, page_table, entry);
1338 update_mmu_cache(vma, address, entry);
1339 lazy_mmu_prot_update(entry);
1342 /*
1343 * This routine handles present pages, when users try to write
1344 * to a shared page. It is done by copying the page to a new address
1345 * and decrementing the shared-page counter for the old page.
1347 * Goto-purists beware: the only reason for goto's here is that it results
1348 * in better assembly code.. The "default" path will see no jumps at all.
1350 * Note that this routine assumes that the protection checks have been
1351 * done by the caller (the low-level page fault routine in most cases).
1352 * Thus we can safely just mark it writable once we've done any necessary
1353 * COW.
1355 * We also mark the page dirty at this point even though the page will
1356 * change only once the write actually happens. This avoids a few races,
1357 * and potentially makes it more efficient.
1359 * We hold the mm semaphore and the page_table_lock on entry and exit
1360 * with the page_table_lock released.
1361 */
1362 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1363 unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
1365 struct page *old_page, *new_page;
1366 unsigned long pfn = pte_pfn(pte);
1367 pte_t entry;
1368 struct page invalid_page;
1370 if (unlikely(!pfn_valid(pfn))) {
1371 /* This can happen with /dev/mem (PROT_WRITE, MAP_PRIVATE). */
1372 invalid_page.flags = (1<<PG_reserved) | (1<<PG_locked);
1373 old_page = &invalid_page;
1374 } else {
1375 old_page = pfn_to_page(pfn);
1378 if (!TestSetPageLocked(old_page)) {
1379 int reuse = can_share_swap_page(old_page);
1380 unlock_page(old_page);
1381 if (reuse) {
1382 flush_cache_page(vma, address, pfn);
1383 entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
1384 vma);
1385 ptep_set_access_flags(vma, address, page_table, entry, 1);
1386 update_mmu_cache(vma, address, entry);
1387 lazy_mmu_prot_update(entry);
1388 pte_unmap(page_table);
1389 spin_unlock(&mm->page_table_lock);
1390 return VM_FAULT_MINOR;
1393 pte_unmap(page_table);
1395 /*
1396 * Ok, we need to copy. Oh, well..
1397 */
1398 if (!PageReserved(old_page))
1399 page_cache_get(old_page);
1400 spin_unlock(&mm->page_table_lock);
1402 if (unlikely(anon_vma_prepare(vma)))
1403 goto no_new_page;
1404 if (old_page == ZERO_PAGE(address)) {
1405 new_page = alloc_zeroed_user_highpage(vma, address);
1406 if (!new_page)
1407 goto no_new_page;
1408 } else {
1409 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1410 if (!new_page)
1411 goto no_new_page;
1412 if (old_page == &invalid_page) {
1413 char *vto = kmap_atomic(new_page, KM_USER1);
1414 copy_page(vto, (void *)(address & PAGE_MASK));
1415 kunmap_atomic(vto, KM_USER1);
1416 } else {
1417 copy_user_highpage(new_page, old_page, address);
1420 /*
1421 * Re-check the pte - we dropped the lock
1422 */
1423 spin_lock(&mm->page_table_lock);
1424 page_table = pte_offset_map(pmd, address);
1425 if (likely(pte_same(*page_table, pte))) {
1426 if (PageAnon(old_page))
1427 dec_mm_counter(mm, anon_rss);
1428 if (PageReserved(old_page))
1429 inc_mm_counter(mm, rss);
1430 else
1431 page_remove_rmap(old_page);
1432 flush_cache_page(vma, address, pfn);
1433 break_cow(vma, new_page, address, page_table);
1434 lru_cache_add_active(new_page);
1435 page_add_anon_rmap(new_page, vma, address);
1437 /* Free the old page.. */
1438 new_page = old_page;
1440 pte_unmap(page_table);
1441 page_cache_release(new_page);
1442 page_cache_release(old_page);
1443 spin_unlock(&mm->page_table_lock);
1444 return VM_FAULT_MINOR;
1446 no_new_page:
1447 page_cache_release(old_page);
1448 return VM_FAULT_OOM;
1451 /*
1452 * Helper functions for unmap_mapping_range().
1454 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
1456 * We have to restart searching the prio_tree whenever we drop the lock,
1457 * since the iterator is only valid while the lock is held, and anyway
1458 * a later vma might be split and reinserted earlier while lock dropped.
1460 * The list of nonlinear vmas could be handled more efficiently, using
1461 * a placeholder, but handle it in the same way until a need is shown.
1462 * It is important to search the prio_tree before nonlinear list: a vma
1463 * may become nonlinear and be shifted from prio_tree to nonlinear list
1464 * while the lock is dropped; but never shifted from list to prio_tree.
1466 * In order to make forward progress despite restarting the search,
1467 * vm_truncate_count is used to mark a vma as now dealt with, so we can
1468 * quickly skip it next time around. Since the prio_tree search only
1469 * shows us those vmas affected by unmapping the range in question, we
1470 * can't efficiently keep all vmas in step with mapping->truncate_count:
1471 * so instead reset them all whenever it wraps back to 0 (then go to 1).
1472 * mapping->truncate_count and vma->vm_truncate_count are protected by
1473 * i_mmap_lock.
1475 * In order to make forward progress despite repeatedly restarting some
1476 * large vma, note the restart_addr from unmap_vmas when it breaks out:
1477 * and restart from that address when we reach that vma again. It might
1478 * have been split or merged, shrunk or extended, but never shifted: so
1479 * restart_addr remains valid so long as it remains in the vma's range.
1480 * unmap_mapping_range forces truncate_count to leap over page-aligned
1481 * values so we can save vma's restart_addr in its truncate_count field.
1482 */
1483 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
1485 static void reset_vma_truncate_counts(struct address_space *mapping)
1487 struct vm_area_struct *vma;
1488 struct prio_tree_iter iter;
1490 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
1491 vma->vm_truncate_count = 0;
1492 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1493 vma->vm_truncate_count = 0;
1496 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1497 unsigned long start_addr, unsigned long end_addr,
1498 struct zap_details *details)
1500 unsigned long restart_addr;
1501 int need_break;
1503 again:
1504 restart_addr = vma->vm_truncate_count;
1505 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
1506 start_addr = restart_addr;
1507 if (start_addr >= end_addr) {
1508 /* Top of vma has been split off since last time */
1509 vma->vm_truncate_count = details->truncate_count;
1510 return 0;
1514 restart_addr = zap_page_range(vma, start_addr,
1515 end_addr - start_addr, details);
1517 /*
1518 * We cannot rely on the break test in unmap_vmas:
1519 * on the one hand, we don't want to restart our loop
1520 * just because that broke out for the page_table_lock;
1521 * on the other hand, it does no test when vma is small.
1522 */
1523 need_break = need_resched() ||
1524 need_lockbreak(details->i_mmap_lock);
1526 if (restart_addr >= end_addr) {
1527 /* We have now completed this vma: mark it so */
1528 vma->vm_truncate_count = details->truncate_count;
1529 if (!need_break)
1530 return 0;
1531 } else {
1532 /* Note restart_addr in vma's truncate_count field */
1533 vma->vm_truncate_count = restart_addr;
1534 if (!need_break)
1535 goto again;
1538 spin_unlock(details->i_mmap_lock);
1539 cond_resched();
1540 spin_lock(details->i_mmap_lock);
1541 return -EINTR;
1544 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
1545 struct zap_details *details)
1547 struct vm_area_struct *vma;
1548 struct prio_tree_iter iter;
1549 pgoff_t vba, vea, zba, zea;
1551 restart:
1552 vma_prio_tree_foreach(vma, &iter, root,
1553 details->first_index, details->last_index) {
1554 /* Skip quickly over those we have already dealt with */
1555 if (vma->vm_truncate_count == details->truncate_count)
1556 continue;
1558 vba = vma->vm_pgoff;
1559 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
1560 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
1561 zba = details->first_index;
1562 if (zba < vba)
1563 zba = vba;
1564 zea = details->last_index;
1565 if (zea > vea)
1566 zea = vea;
1568 if (unmap_mapping_range_vma(vma,
1569 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
1570 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
1571 details) < 0)
1572 goto restart;
1576 static inline void unmap_mapping_range_list(struct list_head *head,
1577 struct zap_details *details)
1579 struct vm_area_struct *vma;
1581 /*
1582 * In nonlinear VMAs there is no correspondence between virtual address
1583 * offset and file offset. So we must perform an exhaustive search
1584 * across *all* the pages in each nonlinear VMA, not just the pages
1585 * whose virtual address lies outside the file truncation point.
1586 */
1587 restart:
1588 list_for_each_entry(vma, head, shared.vm_set.list) {
1589 /* Skip quickly over those we have already dealt with */
1590 if (vma->vm_truncate_count == details->truncate_count)
1591 continue;
1592 details->nonlinear_vma = vma;
1593 if (unmap_mapping_range_vma(vma, vma->vm_start,
1594 vma->vm_end, details) < 0)
1595 goto restart;
1599 /**
1600 * unmap_mapping_range - unmap the portion of all mmaps
1601 * in the specified address_space corresponding to the specified
1602 * page range in the underlying file.
1603 * @address_space: the address space containing mmaps to be unmapped.
1604 * @holebegin: byte in first page to unmap, relative to the start of
1605 * the underlying file. This will be rounded down to a PAGE_SIZE
1606 * boundary. Note that this is different from vmtruncate(), which
1607 * must keep the partial page. In contrast, we must get rid of
1608 * partial pages.
1609 * @holelen: size of prospective hole in bytes. This will be rounded
1610 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
1611 * end of the file.
1612 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
1613 * but 0 when invalidating pagecache, don't throw away private data.
1614 */
1615 void unmap_mapping_range(struct address_space *mapping,
1616 loff_t const holebegin, loff_t const holelen, int even_cows)
1618 struct zap_details details;
1619 pgoff_t hba = holebegin >> PAGE_SHIFT;
1620 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1622 /* Check for overflow. */
1623 if (sizeof(holelen) > sizeof(hlen)) {
1624 long long holeend =
1625 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1626 if (holeend & ~(long long)ULONG_MAX)
1627 hlen = ULONG_MAX - hba + 1;
1630 details.check_mapping = even_cows? NULL: mapping;
1631 details.nonlinear_vma = NULL;
1632 details.first_index = hba;
1633 details.last_index = hba + hlen - 1;
1634 if (details.last_index < details.first_index)
1635 details.last_index = ULONG_MAX;
1636 details.i_mmap_lock = &mapping->i_mmap_lock;
1638 spin_lock(&mapping->i_mmap_lock);
1640 /* serialize i_size write against truncate_count write */
1641 smp_wmb();
1642 /* Protect against page faults, and endless unmapping loops */
1643 mapping->truncate_count++;
1644 /*
1645 * For archs where spin_lock has inclusive semantics like ia64
1646 * this smp_mb() will prevent to read pagetable contents
1647 * before the truncate_count increment is visible to
1648 * other cpus.
1649 */
1650 smp_mb();
1651 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1652 if (mapping->truncate_count == 0)
1653 reset_vma_truncate_counts(mapping);
1654 mapping->truncate_count++;
1656 details.truncate_count = mapping->truncate_count;
1658 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
1659 unmap_mapping_range_tree(&mapping->i_mmap, &details);
1660 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
1661 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
1662 spin_unlock(&mapping->i_mmap_lock);
1664 EXPORT_SYMBOL(unmap_mapping_range);
1666 /*
1667 * Handle all mappings that got truncated by a "truncate()"
1668 * system call.
1670 * NOTE! We have to be ready to update the memory sharing
1671 * between the file and the memory map for a potential last
1672 * incomplete page. Ugly, but necessary.
1673 */
1674 int vmtruncate(struct inode * inode, loff_t offset)
1676 struct address_space *mapping = inode->i_mapping;
1677 unsigned long limit;
1679 if (inode->i_size < offset)
1680 goto do_expand;
1681 /*
1682 * truncation of in-use swapfiles is disallowed - it would cause
1683 * subsequent swapout to scribble on the now-freed blocks.
1684 */
1685 if (IS_SWAPFILE(inode))
1686 goto out_busy;
1687 i_size_write(inode, offset);
1688 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1689 truncate_inode_pages(mapping, offset);
1690 goto out_truncate;
1692 do_expand:
1693 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1694 if (limit != RLIM_INFINITY && offset > limit)
1695 goto out_sig;
1696 if (offset > inode->i_sb->s_maxbytes)
1697 goto out_big;
1698 i_size_write(inode, offset);
1700 out_truncate:
1701 if (inode->i_op && inode->i_op->truncate)
1702 inode->i_op->truncate(inode);
1703 return 0;
1704 out_sig:
1705 send_sig(SIGXFSZ, current, 0);
1706 out_big:
1707 return -EFBIG;
1708 out_busy:
1709 return -ETXTBSY;
1712 EXPORT_SYMBOL(vmtruncate);
1714 /*
1715 * Primitive swap readahead code. We simply read an aligned block of
1716 * (1 << page_cluster) entries in the swap area. This method is chosen
1717 * because it doesn't cost us any seek time. We also make sure to queue
1718 * the 'original' request together with the readahead ones...
1720 * This has been extended to use the NUMA policies from the mm triggering
1721 * the readahead.
1723 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
1724 */
1725 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
1727 #ifdef CONFIG_NUMA
1728 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
1729 #endif
1730 int i, num;
1731 struct page *new_page;
1732 unsigned long offset;
1734 /*
1735 * Get the number of handles we should do readahead io to.
1736 */
1737 num = valid_swaphandles(entry, &offset);
1738 for (i = 0; i < num; offset++, i++) {
1739 /* Ok, do the async read-ahead now */
1740 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
1741 offset), vma, addr);
1742 if (!new_page)
1743 break;
1744 page_cache_release(new_page);
1745 #ifdef CONFIG_NUMA
1746 /*
1747 * Find the next applicable VMA for the NUMA policy.
1748 */
1749 addr += PAGE_SIZE;
1750 if (addr == 0)
1751 vma = NULL;
1752 if (vma) {
1753 if (addr >= vma->vm_end) {
1754 vma = next_vma;
1755 next_vma = vma ? vma->vm_next : NULL;
1757 if (vma && addr < vma->vm_start)
1758 vma = NULL;
1759 } else {
1760 if (next_vma && addr >= next_vma->vm_start) {
1761 vma = next_vma;
1762 next_vma = vma->vm_next;
1765 #endif
1767 lru_add_drain(); /* Push any new pages onto the LRU now */
1770 /*
1771 * We hold the mm semaphore and the page_table_lock on entry and
1772 * should release the pagetable lock on exit..
1773 */
1774 static int do_swap_page(struct mm_struct * mm,
1775 struct vm_area_struct * vma, unsigned long address,
1776 pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
1778 struct page *page;
1779 swp_entry_t entry = pte_to_swp_entry(orig_pte);
1780 pte_t pte;
1781 int ret = VM_FAULT_MINOR;
1783 pte_unmap(page_table);
1784 spin_unlock(&mm->page_table_lock);
1785 page = lookup_swap_cache(entry);
1786 if (!page) {
1787 swapin_readahead(entry, address, vma);
1788 page = read_swap_cache_async(entry, vma, address);
1789 if (!page) {
1790 /*
1791 * Back out if somebody else faulted in this pte while
1792 * we released the page table lock.
1793 */
1794 spin_lock(&mm->page_table_lock);
1795 page_table = pte_offset_map(pmd, address);
1796 if (likely(pte_same(*page_table, orig_pte)))
1797 ret = VM_FAULT_OOM;
1798 else
1799 ret = VM_FAULT_MINOR;
1800 pte_unmap(page_table);
1801 spin_unlock(&mm->page_table_lock);
1802 goto out;
1805 /* Had to read the page from swap area: Major fault */
1806 ret = VM_FAULT_MAJOR;
1807 inc_page_state(pgmajfault);
1808 grab_swap_token();
1811 mark_page_accessed(page);
1812 lock_page(page);
1814 /*
1815 * Back out if somebody else faulted in this pte while we
1816 * released the page table lock.
1817 */
1818 spin_lock(&mm->page_table_lock);
1819 page_table = pte_offset_map(pmd, address);
1820 if (unlikely(!pte_same(*page_table, orig_pte))) {
1821 ret = VM_FAULT_MINOR;
1822 goto out_nomap;
1825 if (unlikely(!PageUptodate(page))) {
1826 ret = VM_FAULT_SIGBUS;
1827 goto out_nomap;
1830 /* The page isn't present yet, go ahead with the fault. */
1832 swap_free(entry);
1833 if (vm_swap_full())
1834 remove_exclusive_swap_page(page);
1836 inc_mm_counter(mm, rss);
1837 pte = mk_pte(page, vma->vm_page_prot);
1838 if (write_access && can_share_swap_page(page)) {
1839 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
1840 write_access = 0;
1842 unlock_page(page);
1844 flush_icache_page(vma, page);
1845 set_pte_at(mm, address, page_table, pte);
1846 page_add_anon_rmap(page, vma, address);
1848 if (write_access) {
1849 if (do_wp_page(mm, vma, address,
1850 page_table, pmd, pte) == VM_FAULT_OOM)
1851 ret = VM_FAULT_OOM;
1852 goto out;
1855 /* No need to invalidate - it was non-present before */
1856 update_mmu_cache(vma, address, pte);
1857 lazy_mmu_prot_update(pte);
1858 pte_unmap(page_table);
1859 spin_unlock(&mm->page_table_lock);
1860 out:
1861 return ret;
1862 out_nomap:
1863 pte_unmap(page_table);
1864 spin_unlock(&mm->page_table_lock);
1865 unlock_page(page);
1866 page_cache_release(page);
1867 goto out;
1870 /*
1871 * We are called with the MM semaphore and page_table_lock
1872 * spinlock held to protect against concurrent faults in
1873 * multithreaded programs.
1874 */
1875 static int
1876 do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1877 pte_t *page_table, pmd_t *pmd, int write_access,
1878 unsigned long addr)
1880 pte_t entry;
1881 struct page * page = ZERO_PAGE(addr);
1883 /* Read-only mapping of ZERO_PAGE. */
1884 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1886 /* ..except if it's a write access */
1887 if (write_access) {
1888 /* Allocate our own private page. */
1889 pte_unmap(page_table);
1890 spin_unlock(&mm->page_table_lock);
1892 if (unlikely(anon_vma_prepare(vma)))
1893 goto no_mem;
1894 page = alloc_zeroed_user_highpage(vma, addr);
1895 if (!page)
1896 goto no_mem;
1898 spin_lock(&mm->page_table_lock);
1899 page_table = pte_offset_map(pmd, addr);
1901 if (!pte_none(*page_table)) {
1902 pte_unmap(page_table);
1903 page_cache_release(page);
1904 spin_unlock(&mm->page_table_lock);
1905 goto out;
1907 inc_mm_counter(mm, rss);
1908 entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
1909 vma->vm_page_prot)),
1910 vma);
1911 lru_cache_add_active(page);
1912 SetPageReferenced(page);
1913 page_add_anon_rmap(page, vma, addr);
1914 }
1916 set_pte_at(mm, addr, page_table, entry);
1917 pte_unmap(page_table);
1919 /* No need to invalidate - it was non-present before */
1920 update_mmu_cache(vma, addr, entry);
1921 lazy_mmu_prot_update(entry);
1922 spin_unlock(&mm->page_table_lock);
1923 out:
1924 return VM_FAULT_MINOR;
1925 no_mem:
1926 return VM_FAULT_OOM;
1927 }
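/*
 * Illustrative sketch (userspace, not from this file): because the read
 * path above maps the shared ZERO_PAGE read-only, merely reading fresh
 * anonymous memory allocates nothing; only the first write takes the
 * alloc_zeroed_user_highpage() path and charges rss.
 */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        size_t len = 1UL << 20;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        printf("untouched byte reads as %d\n", p[0]); /* zero page, read fault */
        p[0] = 1;                    /* first write: private zeroed page */
        return 0;
}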
1929 /*
1930 * do_no_page() tries to create a new page mapping. It aggressively
1931 * tries to share with existing pages, but makes a separate copy if
1932 * the "write_access" parameter is true in order to avoid the next
1933 * page fault.
1935 * As this is called only for pages that do not currently exist, we
1936 * do not need to flush old virtual caches or the TLB.
1938 * This is called with the MM semaphore held and the page table
1939 * spinlock held. Exit with the spinlock released.
1940 */
1941 static int
1942 do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1943 unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
1944 {
1945 struct page * new_page;
1946 struct address_space *mapping = NULL;
1947 pte_t entry;
1948 unsigned int sequence = 0;
1949 int ret = VM_FAULT_MINOR;
1950 int anon = 0;
1952 if (!vma->vm_ops || !vma->vm_ops->nopage)
1953 return do_anonymous_page(mm, vma, page_table,
1954 pmd, write_access, address);
1955 pte_unmap(page_table);
1956 spin_unlock(&mm->page_table_lock);
1958 if (vma->vm_file) {
1959 mapping = vma->vm_file->f_mapping;
1960 sequence = mapping->truncate_count;
1961 smp_rmb(); /* serializes i_size against truncate_count */
1962 }
1963 retry:
1964 cond_resched();
1965 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
1966 /*
1967 * No smp_rmb is needed here as long as there's a full
1968 * spin_lock/unlock sequence inside the ->nopage callback
1969 * (for the pagecache lookup) that acts as an implicit
1970 * smp_mb() and prevents the i_size read to happen
1971 * after the next truncate_count read.
1972 */
1974 /* no page was available -- either SIGBUS or OOM */
1975 if (new_page == NOPAGE_SIGBUS)
1976 return VM_FAULT_SIGBUS;
1977 if (new_page == NOPAGE_OOM)
1978 return VM_FAULT_OOM;
1980 /*
1981 * Should we do an early C-O-W break?
1982 */
1983 if (write_access && !(vma->vm_flags & VM_SHARED)) {
1984 struct page *page;
1986 if (unlikely(anon_vma_prepare(vma)))
1987 goto oom;
1988 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1989 if (!page)
1990 goto oom;
1991 copy_user_highpage(page, new_page, address);
1992 page_cache_release(new_page);
1993 new_page = page;
1994 anon = 1;
1995 }
1997 spin_lock(&mm->page_table_lock);
1998 /*
1999 * For a file-backed vma, someone could have truncated or otherwise
2000 * invalidated this page. If unmap_mapping_range got called,
2001 * retry getting the page.
2002 */
2003 if (mapping && unlikely(sequence != mapping->truncate_count)) {
2004 sequence = mapping->truncate_count;
2005 spin_unlock(&mm->page_table_lock);
2006 page_cache_release(new_page);
2007 goto retry;
2008 }
2009 page_table = pte_offset_map(pmd, address);
2011 /*
2012 * This silly early PAGE_DIRTY setting removes a race
2013 * due to the bad i386 page protection. But it's valid
2014 * for other architectures too.
2016 * Note that if write_access is true, we either now have
2017 * an exclusive copy of the page, or this is a shared mapping,
2018 * so we can make it writable and dirty to avoid having to
2019 * handle that later.
2020 */
2021 /* Only go through if we didn't race with anybody else... */
2022 if (pte_none(*page_table)) {
2023 if (!PageReserved(new_page))
2024 inc_mm_counter(mm, rss);
2026 flush_icache_page(vma, new_page);
2027 entry = mk_pte(new_page, vma->vm_page_prot);
2028 if (write_access)
2029 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2030 set_pte_at(mm, address, page_table, entry);
2031 if (anon) {
2032 lru_cache_add_active(new_page);
2033 page_add_anon_rmap(new_page, vma, address);
2034 } else
2035 page_add_file_rmap(new_page);
2036 pte_unmap(page_table);
2037 } else {
2038 /* One of our sibling threads was faster, back out. */
2039 pte_unmap(page_table);
2040 page_cache_release(new_page);
2041 spin_unlock(&mm->page_table_lock);
2042 goto out;
2043 }
2045 /* no need to invalidate: a not-present page shouldn't be cached */
2046 update_mmu_cache(vma, address, entry);
2047 lazy_mmu_prot_update(entry);
2048 spin_unlock(&mm->page_table_lock);
2049 out:
2050 return ret;
2051 oom:
2052 page_cache_release(new_page);
2053 ret = VM_FAULT_OOM;
2054 goto out;
2055 }
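/*
 * Illustrative sketch (not from this file) of the ->nopage handler that
 * do_no_page() calls above, in the style a driver of this era would use;
 * "struct my_dev" and its fields are hypothetical.  The handler must
 * return the page with an elevated refcount, or NOPAGE_SIGBUS.
 */
#include <linux/mm.h>

struct my_dev {
        struct page **pages;            /* pre-allocated backing pages */
        unsigned long npages;
};

static struct page *my_vma_nopage(struct vm_area_struct *vma,
                                  unsigned long address, int *type)
{
        struct my_dev *dev = vma->vm_private_data;
        unsigned long pgoff = vma->vm_pgoff +
                              ((address - vma->vm_start) >> PAGE_SHIFT);

        if (pgoff >= dev->npages)
                return NOPAGE_SIGBUS;

        get_page(dev->pages[pgoff]);
        if (type)
                *type = VM_FAULT_MINOR;
        return dev->pages[pgoff];
}

/* installed from the driver's ->mmap(): vma->vm_ops = &my_vm_ops; */
static struct vm_operations_struct my_vm_ops = {
        .nopage = my_vma_nopage,
};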
2057 /*
2058 * Fault of a previously existing named mapping. Repopulate the pte
2059 * from the encoded file_pte if possible. This enables swappable
2060 * nonlinear vmas.
2061 */
2062 static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
2063 unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
2064 {
2065 unsigned long pgoff;
2066 int err;
2068 BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
2069 /*
2070 * Fall back to the linear mapping if the fs does not support
2071 * ->populate:
2072 */
2073 if (!vma->vm_ops || !vma->vm_ops->populate ||
2074 (write_access && !(vma->vm_flags & VM_SHARED))) {
2075 pte_clear(mm, address, pte);
2076 return do_no_page(mm, vma, address, write_access, pte, pmd);
2077 }
2079 pgoff = pte_to_pgoff(*pte);
2081 pte_unmap(pte);
2082 spin_unlock(&mm->page_table_lock);
2084 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
2085 if (err == -ENOMEM)
2086 return VM_FAULT_OOM;
2087 if (err)
2088 return VM_FAULT_SIGBUS;
2089 return VM_FAULT_MAJOR;
2090 }
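/*
 * Illustrative sketch (userspace, not from this file): the nonlinear,
 * pte_file() entries that do_file_page() repopulates are created with
 * remap_file_pages().  Here the first page-sized window of a MAP_SHARED
 * mapping is pointed at page 3 of the file instead of page 0:
 */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

int make_nonlinear(int fd)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *base = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
                          MAP_SHARED, fd, 0);

        if (base == MAP_FAILED)
                return -1;
        /* prot must be 0; pgoff is in units of pages */
        return remap_file_pages(base, psz, 0, 3, 0);
}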
2092 /*
2093 * These routines also need to handle stuff like marking pages dirty
2094 * and/or accessed for architectures that don't do it in hardware (most
2095 * RISC architectures). The early dirtying is also good on the i386.
2097 * There is also a hook called "update_mmu_cache()" that architectures
2098 * with external mmu caches can use to update those (ie the Sparc or
2099 * PowerPC hashed page tables that act as extended TLBs).
2101 * Note the "page_table_lock". It is to protect against kswapd removing
2102 * pages from under us. Note that kswapd only ever _removes_ pages, never
2103 * adds them. As such, once we have noticed that the page is not present,
2104 * we can drop the lock early.
2106 * The adding of pages is protected by the MM semaphore (which we hold),
2107 * so we don't need to worry about a page suddenly being added into
2108 * our VM.
2110 * We enter with the pagetable spinlock held, we are supposed to
2111 * release it when done.
2112 */
2113 static inline int handle_pte_fault(struct mm_struct *mm,
2114 struct vm_area_struct * vma, unsigned long address,
2115 int write_access, pte_t *pte, pmd_t *pmd)
2116 {
2117 pte_t entry;
2119 entry = *pte;
2120 if (!pte_present(entry)) {
2121 /*
2122 * If it truly wasn't present, we know that kswapd
2123 * and the PTE updates will not touch it later. So
2124 * drop the lock.
2125 */
2126 if (pte_none(entry))
2127 return do_no_page(mm, vma, address, write_access, pte, pmd);
2128 if (pte_file(entry))
2129 return do_file_page(mm, vma, address, write_access, pte, pmd);
2130 return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
2131 }
2133 if (write_access) {
2134 if (!pte_write(entry))
2135 return do_wp_page(mm, vma, address, pte, pmd, entry);
2137 entry = pte_mkdirty(entry);
2138 }
2139 entry = pte_mkyoung(entry);
2140 ptep_set_access_flags(vma, address, pte, entry, write_access);
2141 update_mmu_cache(vma, address, entry);
2142 lazy_mmu_prot_update(entry);
2143 pte_unmap(pte);
2144 spin_unlock(&mm->page_table_lock);
2145 return VM_FAULT_MINOR;
2146 }
2148 /*
2149 * By the time we get here, we already hold the mm semaphore
2150 */
2151 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2152 unsigned long address, int write_access)
2153 {
2154 pgd_t *pgd;
2155 pud_t *pud;
2156 pmd_t *pmd;
2157 pte_t *pte;
2159 __set_current_state(TASK_RUNNING);
2161 inc_page_state(pgfault);
2163 if (is_vm_hugetlb_page(vma))
2164 return VM_FAULT_SIGBUS; /* mapping truncation does this. */
2166 /*
2167 * We need the page table lock to synchronize with kswapd
2168 * and the SMP-safe atomic PTE updates.
2169 */
2170 pgd = pgd_offset(mm, address);
2171 spin_lock(&mm->page_table_lock);
2173 pud = pud_alloc(mm, pgd, address);
2174 if (!pud)
2175 goto oom;
2177 pmd = pmd_alloc(mm, pud, address);
2178 if (!pmd)
2179 goto oom;
2181 pte = pte_alloc_map(mm, pmd, address);
2182 if (!pte)
2183 goto oom;
2185 return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
2187 oom:
2188 spin_unlock(&mm->page_table_lock);
2189 return VM_FAULT_OOM;
2190 }
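/*
 * Illustrative sketch (not from this file) of how an architecture's
 * do_page_fault() consumes the return value, with current->mm->mmap_sem
 * already held for read; the i386 handler follows this shape.
 */
#include <linux/mm.h>
#include <linux/sched.h>

static void example_fault_bookkeeping(struct vm_area_struct *vma,
                                      unsigned long address, int write)
{
        switch (handle_mm_fault(current->mm, vma, address, write)) {
        case VM_FAULT_MINOR:
                current->min_flt++;     /* satisfied without I/O */
                break;
        case VM_FAULT_MAJOR:
                current->maj_flt++;     /* had to wait for swap/file I/O */
                break;
        case VM_FAULT_SIGBUS:
                /* deliver SIGBUS to current */
                break;
        default:
                /* VM_FAULT_OOM: pick a victim or kill current */
                break;
        }
}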
2192 #ifndef __PAGETABLE_PUD_FOLDED
2193 /*
2194 * Allocate page upper directory.
2196 * We've already handled the fast-path in-line, and we own the
2197 * page table lock.
2198 */
2199 pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2200 {
2201 pud_t *new;
2203 spin_unlock(&mm->page_table_lock);
2204 new = pud_alloc_one(mm, address);
2205 spin_lock(&mm->page_table_lock);
2206 if (!new)
2207 return NULL;
2209 /*
2210 * Because we dropped the lock, we should re-check the
2211 * entry, as somebody else could have populated it.
2212 */
2213 if (pgd_present(*pgd)) {
2214 pud_free(new);
2215 goto out;
2216 }
2217 pgd_populate(mm, pgd, new);
2218 out:
2219 return pud_offset(pgd, address);
2220 }
2221 #endif /* __PAGETABLE_PUD_FOLDED */
2223 #ifndef __PAGETABLE_PMD_FOLDED
2224 /*
2225 * Allocate page middle directory.
2227 * We've already handled the fast-path in-line, and we own the
2228 * page table lock.
2229 */
2230 pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2231 {
2232 pmd_t *new;
2234 spin_unlock(&mm->page_table_lock);
2235 new = pmd_alloc_one(mm, address);
2236 spin_lock(&mm->page_table_lock);
2237 if (!new)
2238 return NULL;
2240 /*
2241 * Because we dropped the lock, we should re-check the
2242 * entry, as somebody else could have populated it.
2243 */
2244 #ifndef __ARCH_HAS_4LEVEL_HACK
2245 if (pud_present(*pud)) {
2246 pmd_free(new);
2247 goto out;
2248 }
2249 pud_populate(mm, pud, new);
2250 #else
2251 if (pgd_present(*pud)) {
2252 pmd_free(new);
2253 goto out;
2254 }
2255 pgd_populate(mm, pud, new);
2256 #endif /* __ARCH_HAS_4LEVEL_HACK */
2258 out:
2259 return pmd_offset(pud, address);
2260 }
2261 #endif /* __PAGETABLE_PMD_FOLDED */
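/*
 * Illustrative sketch (userspace analogue, not from this file) of the
 * pattern shared by __pud_alloc() and __pmd_alloc(): allocate with the
 * lock dropped, retake it, then re-check the slot because another
 * thread may have populated it in the meantime.  As in the kernel code,
 * the caller is assumed to already hold "lock".
 */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;

static void *get_slot(size_t size)
{
        void *new;

        pthread_mutex_unlock(&lock);
        new = malloc(size);
        pthread_mutex_lock(&lock);
        if (!new)
                return NULL;
        if (slot)                       /* raced: somebody else filled it */
                free(new);
        else
                slot = new;
        return slot;
}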
2263 int make_pages_present(unsigned long addr, unsigned long end)
2264 {
2265 int ret, len, write;
2266 struct vm_area_struct * vma;
2268 vma = find_vma(current->mm, addr);
2269 if (!vma)
2270 return -1;
2271 write = (vma->vm_flags & VM_WRITE) != 0;
2272 if (addr >= end)
2273 BUG();
2274 if (end > vma->vm_end)
2275 BUG();
2276 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
2277 ret = get_user_pages(current, current->mm, addr,
2278 len, write, 0, NULL, NULL);
2279 if (ret < 0)
2280 return ret;
2281 return ret == len ? 0 : -1;
2282 }
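/*
 * Worked example for the len calculation above, assuming 4K pages:
 * addr = 0x1000, end = 0x3001 touches pages 1, 2 and 3, and indeed
 * len = (0x3001 + 0xfff)/0x1000 - 0x1000/0x1000 = 4 - 1 = 3.
 */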
2284 /*
2285 * Map a vmalloc()-space virtual address to the physical page.
2286 */
2287 struct page * vmalloc_to_page(void * vmalloc_addr)
2288 {
2289 unsigned long addr = (unsigned long) vmalloc_addr;
2290 struct page *page = NULL;
2291 pgd_t *pgd = pgd_offset_k(addr);
2292 pud_t *pud;
2293 pmd_t *pmd;
2294 pte_t *ptep, pte;
2296 if (!pgd_none(*pgd)) {
2297 pud = pud_offset(pgd, addr);
2298 if (!pud_none(*pud)) {
2299 pmd = pmd_offset(pud, addr);
2300 if (!pmd_none(*pmd)) {
2301 ptep = pte_offset_map(pmd, addr);
2302 pte = *ptep;
2303 if (pte_present(pte))
2304 page = pte_page(pte);
2305 pte_unmap(ptep);
2306 }
2307 }
2308 }
2309 return page;
2310 }
2312 EXPORT_SYMBOL(vmalloc_to_page);
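/*
 * Illustrative sketch (not from this file): vmalloc() memory is only
 * virtually contiguous, so callers that need the underlying frames walk
 * the area page by page with vmalloc_to_page(); the helper name below
 * is hypothetical.
 */
#include <linux/mm.h>

static void for_each_vmalloc_page(void *buf, unsigned long size,
                                  void (*fn)(struct page *))
{
        unsigned long addr = (unsigned long)buf & PAGE_MASK;
        unsigned long end = (unsigned long)buf + size;

        for (; addr < end; addr += PAGE_SIZE)
                fn(vmalloc_to_page((void *)addr));
}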
2314 /*
2315 * Map a vmalloc()-space virtual address to the physical page frame number.
2316 */
2317 unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2318 {
2319 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2320 }
2322 EXPORT_SYMBOL(vmalloc_to_pfn);
2324 /*
2325 * update_mem_hiwater
2326 * - update per process rss and vm high water data
2327 */
2328 void update_mem_hiwater(struct task_struct *tsk)
2329 {
2330 if (tsk->mm) {
2331 unsigned long rss = get_mm_counter(tsk->mm, rss);
2333 if (tsk->mm->hiwater_rss < rss)
2334 tsk->mm->hiwater_rss = rss;
2335 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2336 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2337 }
2338 }
2340 #if !defined(__HAVE_ARCH_GATE_AREA)
2342 #if defined(AT_SYSINFO_EHDR)
2343 struct vm_area_struct gate_vma;
2345 static int __init gate_vma_init(void)
2346 {
2347 gate_vma.vm_mm = NULL;
2348 gate_vma.vm_start = FIXADDR_USER_START;
2349 gate_vma.vm_end = FIXADDR_USER_END;
2350 gate_vma.vm_page_prot = PAGE_READONLY;
2351 gate_vma.vm_flags = 0;
2352 return 0;
2353 }
2354 __initcall(gate_vma_init);
2355 #endif
2357 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2358 {
2359 #ifdef AT_SYSINFO_EHDR
2360 return &gate_vma;
2361 #else
2362 return NULL;
2363 #endif
2364 }
2366 int in_gate_area_no_task(unsigned long addr)
2367 {
2368 #ifdef AT_SYSINFO_EHDR
2369 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
2370 return 1;
2371 #endif
2372 return 0;
2373 }
2375 #endif /* __HAVE_ARCH_GATE_AREA */