ia64/xen-unstable

view linux-2.6-xen-sparse/mm/mmap.c @ 7446:18eb059ae471

New network-bridge script and associated gubbins.

This is Kurt Garloff's reworked network-bridge script:

* we got rid of ifconfig
* it works for netdev != eth0
* arp on and off are symmetric, as are ifdown and ifup
* ifup will be passed the ifcfg config file name if needed
  (otherwise ifup may decide that the veth0 hardware is
  NOT the same as the original ${netdev} and not use the
  same config -- this happens on SUSE. Charles Coffing
  tracked this one down.)

Plus Kurt's avoid-dash patch:

The network setup scripts on SUSE have trouble with the bridge
name xen-br0; they don't expect the '-'.
Arguably this should be fixed.
But I assume there are more scripts out there that may not like it,
so I suggest the following patch to rename xen-br0 to xenbr0.

Plus Charles Duffy's patch to support multiple bridges:

The attached patch allows the network-bridge script to be used to
generate multiple bridges corresponding to different physical
interfaces. It adds a new parameter, "vifnum", used both to select
the loopback interface and to set defaults for the physical
interface and bridge name.

Thus, if one wishes to start xenbr0 on eth0 and xenbr1 on eth1, one
need only call:

network-bridge start ## vifnum is 0 by default
network-bridge start vifnum=1

...well, that and set loopback.nloopbacks=2 in the Dom0 kernel
parameters.
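
For example, to bind the second bridge to an interface or name other
than the vifnum-derived defaults, the physical device and bridge can
presumably be named explicitly (assuming the script's existing
netdev= and bridge= arguments are still honoured alongside vifnum=):

network-bridge start vifnum=1 netdev=eth1 bridge=xenbr1  # hypothetical explicit override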

Plus renaming of virtnum to vifnum in Charles' patch, as requested by Ian Pratt.

Plus a fix to DevController to allocate vif IDs starting from 0 (i.e. vif2.0
is now domain 2's first vif, as opposed to vif2.1 in the recent past).

Plus tidying up inside network-bridge using some helper variables.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@leeni.uk.xensource.com
date Wed Oct 19 16:24:54 2005 +0100 (2005-10-19)
parents 06d84bf87159
children fd9b2c1bb577
line source
1 /*
2 * mm/mmap.c
3 *
4 * Written by obz.
5 *
6 * Address space accounting code <alan@redhat.com>
7 */
9 #include <linux/slab.h>
10 #include <linux/mm.h>
11 #include <linux/shm.h>
12 #include <linux/mman.h>
13 #include <linux/pagemap.h>
14 #include <linux/swap.h>
15 #include <linux/syscalls.h>
16 #include <linux/init.h>
17 #include <linux/file.h>
18 #include <linux/fs.h>
19 #include <linux/personality.h>
20 #include <linux/security.h>
21 #include <linux/hugetlb.h>
22 #include <linux/profile.h>
23 #include <linux/module.h>
24 #include <linux/mount.h>
25 #include <linux/mempolicy.h>
26 #include <linux/rmap.h>
28 #include <asm/uaccess.h>
29 #include <asm/cacheflush.h>
30 #include <asm/tlb.h>
32 static void unmap_region(struct mm_struct *mm,
33 struct vm_area_struct *vma, struct vm_area_struct *prev,
34 unsigned long start, unsigned long end);
36 /*
37 * WARNING: the debugging will use recursive algorithms so never enable this
38 * unless you know what you are doing.
39 */
40 #undef DEBUG_MM_RB
42 /* description of effects of mapping type and prot in current implementation.
43 * this is due to the limited x86 page protection hardware. The expected
44 * behavior is in parens:
45 *
46 * map_type prot
47 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
48 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
49 * w: (no) no w: (no) no w: (yes) yes w: (no) no
50 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
51 *
52 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
53 * w: (no) no w: (no) no w: (copy) copy w: (no) no
54 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
55 *
56 */
57 pgprot_t protection_map[16] = {
58 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
59 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
60 };
62 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
63 int sysctl_overcommit_ratio = 50; /* default is 50% */
64 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
65 atomic_t vm_committed_space = ATOMIC_INIT(0);
67 /*
68 * Check that a process has enough memory to allocate a new virtual
69 * mapping. 0 means there is enough memory for the allocation to
70 * succeed and -ENOMEM implies there is not.
71 *
72 * We currently support three overcommit policies, which are set via the
73 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
74 *
75 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
76 * Additional code 2002 Jul 20 by Robert Love.
77 *
78 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
79 *
80 * Note this is a helper function intended to be used by LSMs which
81 * wish to use this logic.
82 */
83 int __vm_enough_memory(long pages, int cap_sys_admin)
84 {
85 unsigned long free, allowed;
87 vm_acct_memory(pages);
89 /*
90 * Sometimes we want to use more memory than we have
91 */
92 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
93 return 0;
95 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
96 unsigned long n;
98 free = get_page_cache_size();
99 free += nr_swap_pages;
101 /*
102 * Any slabs which are created with the
103 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
104 * which are reclaimable, under pressure. The dentry
105 * cache and most inode caches should fall into this
106 */
107 free += atomic_read(&slab_reclaim_pages);
109 /*
110 * Leave the last 3% for root
111 */
112 if (!cap_sys_admin)
113 free -= free / 32;
115 if (free > pages)
116 return 0;
118 /*
119 * nr_free_pages() is very expensive on large systems,
120 * only call if we're about to fail.
121 */
122 n = nr_free_pages();
123 if (!cap_sys_admin)
124 n -= n / 32;
125 free += n;
127 if (free > pages)
128 return 0;
129 vm_unacct_memory(pages);
130 return -ENOMEM;
131 }
133 allowed = (totalram_pages - hugetlb_total_pages())
134 * sysctl_overcommit_ratio / 100;
135 /*
136 * Leave the last 3% for root
137 */
138 if (!cap_sys_admin)
139 allowed -= allowed / 32;
140 allowed += total_swap_pages;
142 /* Don't let a single process grow too big:
143 leave 3% of the size of this process for other processes */
144 allowed -= current->mm->total_vm / 32;
146 if (atomic_read(&vm_committed_space) < allowed)
147 return 0;
149 vm_unacct_memory(pages);
151 return -ENOMEM;
152 }
154 EXPORT_SYMBOL(sysctl_overcommit_memory);
155 EXPORT_SYMBOL(sysctl_overcommit_ratio);
156 EXPORT_SYMBOL(sysctl_max_map_count);
157 EXPORT_SYMBOL(vm_committed_space);
158 EXPORT_SYMBOL(__vm_enough_memory);
160 /*
161 * Requires inode->i_mapping->i_mmap_lock
162 */
163 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
164 struct file *file, struct address_space *mapping)
165 {
166 if (vma->vm_flags & VM_DENYWRITE)
167 atomic_inc(&file->f_dentry->d_inode->i_writecount);
168 if (vma->vm_flags & VM_SHARED)
169 mapping->i_mmap_writable--;
171 flush_dcache_mmap_lock(mapping);
172 if (unlikely(vma->vm_flags & VM_NONLINEAR))
173 list_del_init(&vma->shared.vm_set.list);
174 else
175 vma_prio_tree_remove(vma, &mapping->i_mmap);
176 flush_dcache_mmap_unlock(mapping);
177 }
179 /*
180 * Remove one vm structure and free it.
181 */
182 static void remove_vm_struct(struct vm_area_struct *vma)
183 {
184 struct file *file = vma->vm_file;
186 might_sleep();
187 if (file) {
188 struct address_space *mapping = file->f_mapping;
189 spin_lock(&mapping->i_mmap_lock);
190 __remove_shared_vm_struct(vma, file, mapping);
191 spin_unlock(&mapping->i_mmap_lock);
192 }
193 if (vma->vm_ops && vma->vm_ops->close)
194 vma->vm_ops->close(vma);
195 if (file)
196 fput(file);
197 anon_vma_unlink(vma);
198 mpol_free(vma_policy(vma));
199 kmem_cache_free(vm_area_cachep, vma);
200 }
202 /*
203 * sys_brk() for the most part doesn't need the global kernel
204 * lock, except when an application is doing something nasty
205 * like trying to un-brk an area that has already been mapped
206 * to a regular file. in this case, the unmapping will need
207 * to invoke file system routines that need the global lock.
208 */
209 asmlinkage unsigned long sys_brk(unsigned long brk)
210 {
211 unsigned long rlim, retval;
212 unsigned long newbrk, oldbrk;
213 struct mm_struct *mm = current->mm;
215 down_write(&mm->mmap_sem);
217 if (brk < mm->end_code)
218 goto out;
219 newbrk = PAGE_ALIGN(brk);
220 oldbrk = PAGE_ALIGN(mm->brk);
221 if (oldbrk == newbrk)
222 goto set_brk;
224 /* Always allow shrinking brk. */
225 if (brk <= mm->brk) {
226 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
227 goto set_brk;
228 goto out;
229 }
231 /* Check against rlimit.. */
232 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
233 if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
234 goto out;
236 /* Check against existing mmap mappings. */
237 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
238 goto out;
240 /* Ok, looks good - let it rip. */
241 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
242 goto out;
243 set_brk:
244 mm->brk = brk;
245 out:
246 retval = mm->brk;
247 up_write(&mm->mmap_sem);
248 return retval;
249 }
251 #ifdef DEBUG_MM_RB
252 static int browse_rb(struct rb_root *root)
253 {
254 int i = 0, j;
255 struct rb_node *nd, *pn = NULL;
256 unsigned long prev = 0, pend = 0;
258 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
259 struct vm_area_struct *vma;
260 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
261 if (vma->vm_start < prev)
262 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
263 if (vma->vm_start < pend)
264 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
265 if (vma->vm_start > vma->vm_end)
266 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
267 i++;
268 pn = nd;
269 }
270 j = 0;
271 for (nd = pn; nd; nd = rb_prev(nd)) {
272 j++;
273 }
274 if (i != j)
275 printk("backwards %d, forwards %d\n", j, i), i = 0;
276 return i;
277 }
279 void validate_mm(struct mm_struct *mm)
280 {
281 int bug = 0;
282 int i = 0;
283 struct vm_area_struct *tmp = mm->mmap;
284 while (tmp) {
285 tmp = tmp->vm_next;
286 i++;
287 }
288 if (i != mm->map_count)
289 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
290 i = browse_rb(&mm->mm_rb);
291 if (i != mm->map_count)
292 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
293 if (bug)
294 BUG();
295 }
296 #else
297 #define validate_mm(mm) do { } while (0)
298 #endif
300 static struct vm_area_struct *
301 find_vma_prepare(struct mm_struct *mm, unsigned long addr,
302 struct vm_area_struct **pprev, struct rb_node ***rb_link,
303 struct rb_node ** rb_parent)
304 {
305 struct vm_area_struct * vma;
306 struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
308 __rb_link = &mm->mm_rb.rb_node;
309 rb_prev = __rb_parent = NULL;
310 vma = NULL;
312 while (*__rb_link) {
313 struct vm_area_struct *vma_tmp;
315 __rb_parent = *__rb_link;
316 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
318 if (vma_tmp->vm_end > addr) {
319 vma = vma_tmp;
320 if (vma_tmp->vm_start <= addr)
321 return vma;
322 __rb_link = &__rb_parent->rb_left;
323 } else {
324 rb_prev = __rb_parent;
325 __rb_link = &__rb_parent->rb_right;
326 }
327 }
329 *pprev = NULL;
330 if (rb_prev)
331 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
332 *rb_link = __rb_link;
333 *rb_parent = __rb_parent;
334 return vma;
335 }
337 static inline void
338 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
339 struct vm_area_struct *prev, struct rb_node *rb_parent)
340 {
341 if (prev) {
342 vma->vm_next = prev->vm_next;
343 prev->vm_next = vma;
344 } else {
345 mm->mmap = vma;
346 if (rb_parent)
347 vma->vm_next = rb_entry(rb_parent,
348 struct vm_area_struct, vm_rb);
349 else
350 vma->vm_next = NULL;
351 }
352 }
354 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
355 struct rb_node **rb_link, struct rb_node *rb_parent)
356 {
357 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
358 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
359 }
361 static inline void __vma_link_file(struct vm_area_struct *vma)
362 {
363 struct file * file;
365 file = vma->vm_file;
366 if (file) {
367 struct address_space *mapping = file->f_mapping;
369 if (vma->vm_flags & VM_DENYWRITE)
370 atomic_dec(&file->f_dentry->d_inode->i_writecount);
371 if (vma->vm_flags & VM_SHARED)
372 mapping->i_mmap_writable++;
374 flush_dcache_mmap_lock(mapping);
375 if (unlikely(vma->vm_flags & VM_NONLINEAR))
376 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
377 else
378 vma_prio_tree_insert(vma, &mapping->i_mmap);
379 flush_dcache_mmap_unlock(mapping);
380 }
381 }
383 static void
384 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
385 struct vm_area_struct *prev, struct rb_node **rb_link,
386 struct rb_node *rb_parent)
387 {
388 __vma_link_list(mm, vma, prev, rb_parent);
389 __vma_link_rb(mm, vma, rb_link, rb_parent);
390 __anon_vma_link(vma);
391 }
393 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
394 struct vm_area_struct *prev, struct rb_node **rb_link,
395 struct rb_node *rb_parent)
396 {
397 struct address_space *mapping = NULL;
399 if (vma->vm_file)
400 mapping = vma->vm_file->f_mapping;
402 if (mapping) {
403 spin_lock(&mapping->i_mmap_lock);
404 vma->vm_truncate_count = mapping->truncate_count;
405 }
406 anon_vma_lock(vma);
408 __vma_link(mm, vma, prev, rb_link, rb_parent);
409 __vma_link_file(vma);
411 anon_vma_unlock(vma);
412 if (mapping)
413 spin_unlock(&mapping->i_mmap_lock);
415 mm->map_count++;
416 validate_mm(mm);
417 }
419 /*
420 * Helper for vma_adjust in the split_vma insert case:
421 * insert vm structure into list and rbtree and anon_vma,
422 * but it has already been inserted into prio_tree earlier.
423 */
424 static void
425 __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
426 {
427 struct vm_area_struct * __vma, * prev;
428 struct rb_node ** rb_link, * rb_parent;
430 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
431 if (__vma && __vma->vm_start < vma->vm_end)
432 BUG();
433 __vma_link(mm, vma, prev, rb_link, rb_parent);
434 mm->map_count++;
435 }
437 static inline void
438 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
439 struct vm_area_struct *prev)
440 {
441 prev->vm_next = vma->vm_next;
442 rb_erase(&vma->vm_rb, &mm->mm_rb);
443 if (mm->mmap_cache == vma)
444 mm->mmap_cache = prev;
445 }
447 /*
448 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
449 * is already present in an i_mmap tree without adjusting the tree.
450 * The following helper function should be used when such adjustments
451 * are necessary. The "insert" vma (if any) is to be inserted
452 * before we drop the necessary locks.
453 */
454 void vma_adjust(struct vm_area_struct *vma, unsigned long start,
455 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
456 {
457 struct mm_struct *mm = vma->vm_mm;
458 struct vm_area_struct *next = vma->vm_next;
459 struct vm_area_struct *importer = NULL;
460 struct address_space *mapping = NULL;
461 struct prio_tree_root *root = NULL;
462 struct file *file = vma->vm_file;
463 struct anon_vma *anon_vma = NULL;
464 long adjust_next = 0;
465 int remove_next = 0;
467 if (next && !insert) {
468 if (end >= next->vm_end) {
469 /*
470 * vma expands, overlapping all the next, and
471 * perhaps the one after too (mprotect case 6).
472 */
473 again: remove_next = 1 + (end > next->vm_end);
474 end = next->vm_end;
475 anon_vma = next->anon_vma;
476 importer = vma;
477 } else if (end > next->vm_start) {
478 /*
479 * vma expands, overlapping part of the next:
480 * mprotect case 5 shifting the boundary up.
481 */
482 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
483 anon_vma = next->anon_vma;
484 importer = vma;
485 } else if (end < vma->vm_end) {
486 /*
487 * vma shrinks, and !insert tells it's not
488 * split_vma inserting another: so it must be
489 * mprotect case 4 shifting the boundary down.
490 */
491 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
492 anon_vma = next->anon_vma;
493 importer = next;
494 }
495 }
497 if (file) {
498 mapping = file->f_mapping;
499 if (!(vma->vm_flags & VM_NONLINEAR))
500 root = &mapping->i_mmap;
501 spin_lock(&mapping->i_mmap_lock);
502 if (importer &&
503 vma->vm_truncate_count != next->vm_truncate_count) {
504 /*
505 * unmap_mapping_range might be in progress:
506 * ensure that the expanding vma is rescanned.
507 */
508 importer->vm_truncate_count = 0;
509 }
510 if (insert) {
511 insert->vm_truncate_count = vma->vm_truncate_count;
512 /*
513 * Put into prio_tree now, so instantiated pages
514 * are visible to arm/parisc __flush_dcache_page
515 * throughout; but we cannot insert into address
516 * space until vma start or end is updated.
517 */
518 __vma_link_file(insert);
519 }
520 }
522 /*
523 * When changing only vma->vm_end, we don't really need
524 * anon_vma lock: but is that case worth optimizing out?
525 */
526 if (vma->anon_vma)
527 anon_vma = vma->anon_vma;
528 if (anon_vma) {
529 spin_lock(&anon_vma->lock);
530 /*
531 * Easily overlooked: when mprotect shifts the boundary,
532 * make sure the expanding vma has anon_vma set if the
533 * shrinking vma had, to cover any anon pages imported.
534 */
535 if (importer && !importer->anon_vma) {
536 importer->anon_vma = anon_vma;
537 __anon_vma_link(importer);
538 }
539 }
541 if (root) {
542 flush_dcache_mmap_lock(mapping);
543 vma_prio_tree_remove(vma, root);
544 if (adjust_next)
545 vma_prio_tree_remove(next, root);
546 }
548 vma->vm_start = start;
549 vma->vm_end = end;
550 vma->vm_pgoff = pgoff;
551 if (adjust_next) {
552 next->vm_start += adjust_next << PAGE_SHIFT;
553 next->vm_pgoff += adjust_next;
554 }
556 if (root) {
557 if (adjust_next)
558 vma_prio_tree_insert(next, root);
559 vma_prio_tree_insert(vma, root);
560 flush_dcache_mmap_unlock(mapping);
561 }
563 if (remove_next) {
564 /*
565 * vma_merge has merged next into vma, and needs
566 * us to remove next before dropping the locks.
567 */
568 __vma_unlink(mm, next, vma);
569 if (file)
570 __remove_shared_vm_struct(next, file, mapping);
571 if (next->anon_vma)
572 __anon_vma_merge(vma, next);
573 } else if (insert) {
574 /*
575 * split_vma has split insert from vma, and needs
576 * us to insert it before dropping the locks
577 * (it may either follow vma or precede it).
578 */
579 __insert_vm_struct(mm, insert);
580 }
582 if (anon_vma)
583 spin_unlock(&anon_vma->lock);
584 if (mapping)
585 spin_unlock(&mapping->i_mmap_lock);
587 if (remove_next) {
588 if (file)
589 fput(file);
590 mm->map_count--;
591 mpol_free(vma_policy(next));
592 kmem_cache_free(vm_area_cachep, next);
593 /*
594 * In mprotect's case 6 (see comments on vma_merge),
595 * we must remove another next too. It would clutter
596 * up the code too much to do both in one go.
597 */
598 if (remove_next == 2) {
599 next = vma->vm_next;
600 goto again;
601 }
602 }
604 validate_mm(mm);
605 }
607 /*
608 * If the vma has a ->close operation then the driver probably needs to release
609 * per-vma resources, so we don't attempt to merge those.
610 */
611 #define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
613 static inline int is_mergeable_vma(struct vm_area_struct *vma,
614 struct file *file, unsigned long vm_flags)
615 {
616 if (vma->vm_flags != vm_flags)
617 return 0;
618 if (vma->vm_file != file)
619 return 0;
620 if (vma->vm_ops && vma->vm_ops->close)
621 return 0;
622 return 1;
623 }
625 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
626 struct anon_vma *anon_vma2)
627 {
628 return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
629 }
631 /*
632 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
633 * in front of (at a lower virtual address and file offset than) the vma.
634 *
635 * We cannot merge two vmas if they have differently assigned (non-NULL)
636 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
637 *
638 * We don't check here for the merged mmap wrapping around the end of pagecache
639 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
640 * wrap, nor mmaps which cover the final page at index -1UL.
641 */
642 static int
643 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
644 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
645 {
646 if (is_mergeable_vma(vma, file, vm_flags) &&
647 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
648 if (vma->vm_pgoff == vm_pgoff)
649 return 1;
650 }
651 return 0;
652 }
654 /*
655 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
656 * beyond (at a higher virtual address and file offset than) the vma.
657 *
658 * We cannot merge two vmas if they have differently assigned (non-NULL)
659 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
660 */
661 static int
662 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
663 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
664 {
665 if (is_mergeable_vma(vma, file, vm_flags) &&
666 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
667 pgoff_t vm_pglen;
668 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
669 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
670 return 1;
671 }
672 return 0;
673 }
675 /*
676 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
677 * whether that can be merged with its predecessor or its successor.
678 * Or both (it neatly fills a hole).
679 *
680 * In most cases - when called for mmap, brk or mremap - [addr,end) is
681 * certain not to be mapped by the time vma_merge is called; but when
682 * called for mprotect, it is certain to be already mapped (either at
683 * an offset within prev, or at the start of next), and the flags of
684 * this area are about to be changed to vm_flags - and the no-change
685 * case has already been eliminated.
686 *
687 * The following mprotect cases have to be considered, where AAAA is
688 * the area passed down from mprotect_fixup, never extending beyond one
689 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
690 *
691 * AAAA AAAA AAAA AAAA
692 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
693 * cannot merge might become might become might become
694 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
695 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
696 * mremap move: PPPPNNNNNNNN 8
697 * AAAA
698 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
699 * might become case 1 below case 2 below case 3 below
700 *
701 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
702 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
703 */
704 struct vm_area_struct *vma_merge(struct mm_struct *mm,
705 struct vm_area_struct *prev, unsigned long addr,
706 unsigned long end, unsigned long vm_flags,
707 struct anon_vma *anon_vma, struct file *file,
708 pgoff_t pgoff, struct mempolicy *policy)
709 {
710 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
711 struct vm_area_struct *area, *next;
713 /*
714 * We later require that vma->vm_flags == vm_flags,
715 * so this tests vma->vm_flags & VM_SPECIAL, too.
716 */
717 if (vm_flags & VM_SPECIAL)
718 return NULL;
720 if (prev)
721 next = prev->vm_next;
722 else
723 next = mm->mmap;
724 area = next;
725 if (next && next->vm_end == end) /* cases 6, 7, 8 */
726 next = next->vm_next;
728 /*
729 * Can it merge with the predecessor?
730 */
731 if (prev && prev->vm_end == addr &&
732 mpol_equal(vma_policy(prev), policy) &&
733 can_vma_merge_after(prev, vm_flags,
734 anon_vma, file, pgoff)) {
735 /*
736 * OK, it can. Can we now merge in the successor as well?
737 */
738 if (next && end == next->vm_start &&
739 mpol_equal(policy, vma_policy(next)) &&
740 can_vma_merge_before(next, vm_flags,
741 anon_vma, file, pgoff+pglen) &&
742 is_mergeable_anon_vma(prev->anon_vma,
743 next->anon_vma)) {
744 /* cases 1, 6 */
745 vma_adjust(prev, prev->vm_start,
746 next->vm_end, prev->vm_pgoff, NULL);
747 } else /* cases 2, 5, 7 */
748 vma_adjust(prev, prev->vm_start,
749 end, prev->vm_pgoff, NULL);
750 return prev;
751 }
753 /*
754 * Can this new request be merged in front of next?
755 */
756 if (next && end == next->vm_start &&
757 mpol_equal(policy, vma_policy(next)) &&
758 can_vma_merge_before(next, vm_flags,
759 anon_vma, file, pgoff+pglen)) {
760 if (prev && addr < prev->vm_end) /* case 4 */
761 vma_adjust(prev, prev->vm_start,
762 addr, prev->vm_pgoff, NULL);
763 else /* cases 3, 8 */
764 vma_adjust(area, addr, next->vm_end,
765 next->vm_pgoff - pglen, NULL);
766 return area;
767 }
769 return NULL;
770 }
772 /*
773 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
774 * neighbouring vmas for a suitable anon_vma, before it goes off
775 * to allocate a new anon_vma. It checks because a repetitive
776 * sequence of mprotects and faults may otherwise lead to distinct
777 * anon_vmas being allocated, preventing vma merge in subsequent
778 * mprotect.
779 */
780 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
781 {
782 struct vm_area_struct *near;
783 unsigned long vm_flags;
785 near = vma->vm_next;
786 if (!near)
787 goto try_prev;
789 /*
790 * Since only mprotect tries to remerge vmas, match flags
791 * which might be mprotected into each other later on.
792 * Neither mlock nor madvise tries to remerge at present,
793 * so leave their flags as obstructing a merge.
794 */
795 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
796 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
798 if (near->anon_vma && vma->vm_end == near->vm_start &&
799 mpol_equal(vma_policy(vma), vma_policy(near)) &&
800 can_vma_merge_before(near, vm_flags,
801 NULL, vma->vm_file, vma->vm_pgoff +
802 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
803 return near->anon_vma;
804 try_prev:
805 /*
806 * It is potentially slow to have to call find_vma_prev here.
807 * But it's only on the first write fault on the vma, not
808 * every time, and we could devise a way to avoid it later
809 * (e.g. stash info in next's anon_vma_node when assigning
810 * an anon_vma, or when trying vma_merge). Another time.
811 */
812 if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
813 BUG();
814 if (!near)
815 goto none;
817 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
818 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
820 if (near->anon_vma && near->vm_end == vma->vm_start &&
821 mpol_equal(vma_policy(near), vma_policy(vma)) &&
822 can_vma_merge_after(near, vm_flags,
823 NULL, vma->vm_file, vma->vm_pgoff))
824 return near->anon_vma;
825 none:
826 /*
827 * There's no absolute need to look only at touching neighbours:
828 * we could search further afield for "compatible" anon_vmas.
829 * But it would probably just be a waste of time searching,
830 * or lead to too many vmas hanging off the same anon_vma.
831 * We're trying to allow mprotect remerging later on,
832 * not trying to minimize memory used for anon_vmas.
833 */
834 return NULL;
835 }
837 #ifdef CONFIG_PROC_FS
838 void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
839 struct file *file, long pages)
840 {
841 const unsigned long stack_flags
842 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
844 #ifdef CONFIG_HUGETLB
845 if (flags & VM_HUGETLB) {
846 if (!(flags & VM_DONTCOPY))
847 mm->shared_vm += pages;
848 return;
849 }
850 #endif /* CONFIG_HUGETLB */
852 if (file) {
853 mm->shared_vm += pages;
854 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
855 mm->exec_vm += pages;
856 } else if (flags & stack_flags)
857 mm->stack_vm += pages;
858 if (flags & (VM_RESERVED|VM_IO))
859 mm->reserved_vm += pages;
860 }
861 #endif /* CONFIG_PROC_FS */
863 /*
864 * The caller must hold down_write(current->mm->mmap_sem).
865 */
867 unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
868 unsigned long len, unsigned long prot,
869 unsigned long flags, unsigned long pgoff)
870 {
871 struct mm_struct * mm = current->mm;
872 struct vm_area_struct * vma, * prev;
873 struct inode *inode;
874 unsigned int vm_flags;
875 int correct_wcount = 0;
876 int error;
877 struct rb_node ** rb_link, * rb_parent;
878 int accountable = 1;
879 unsigned long charged = 0, reqprot = prot;
881 if (file) {
882 if (is_file_hugepages(file))
883 accountable = 0;
885 if (!file->f_op || !file->f_op->mmap)
886 return -ENODEV;
888 if ((prot & PROT_EXEC) &&
889 (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
890 return -EPERM;
891 }
892 /*
893 * Does the application expect PROT_READ to imply PROT_EXEC?
894 *
895 * (the exception is when the underlying filesystem is noexec
896 * mounted, in which case we dont add PROT_EXEC.)
897 */
898 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
899 if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
900 prot |= PROT_EXEC;
902 if (!len)
903 return -EINVAL;
905 /* Careful about overflows.. */
906 len = PAGE_ALIGN(len);
907 if (!len || len > TASK_SIZE)
908 return -ENOMEM;
910 /* offset overflow? */
911 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
912 return -EOVERFLOW;
914 /* Too many mappings? */
915 if (mm->map_count > sysctl_max_map_count)
916 return -ENOMEM;
918 /* Obtain the address to map to. we verify (or select) it and ensure
919 * that it represents a valid section of the address space.
920 */
921 addr = get_unmapped_area(file, addr, len, pgoff, flags);
922 if (addr & ~PAGE_MASK)
923 return addr;
925 /* Do simple checking here so the lower-level routines won't have
926 * to. we assume access permissions have been handled by the open
927 * of the memory object, so we don't do any here.
928 */
929 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
930 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
932 if (flags & MAP_LOCKED) {
933 if (!can_do_mlock())
934 return -EPERM;
935 vm_flags |= VM_LOCKED;
936 }
937 /* mlock MCL_FUTURE? */
938 if (vm_flags & VM_LOCKED) {
939 unsigned long locked, lock_limit;
940 locked = len >> PAGE_SHIFT;
941 locked += mm->locked_vm;
942 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
943 lock_limit >>= PAGE_SHIFT;
944 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
945 return -EAGAIN;
946 }
948 inode = file ? file->f_dentry->d_inode : NULL;
950 if (file) {
951 switch (flags & MAP_TYPE) {
952 case MAP_SHARED:
953 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
954 return -EACCES;
956 /*
957 * Make sure we don't allow writing to an append-only
958 * file..
959 */
960 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
961 return -EACCES;
963 /*
964 * Make sure there are no mandatory locks on the file.
965 */
966 if (locks_verify_locked(inode))
967 return -EAGAIN;
969 vm_flags |= VM_SHARED | VM_MAYSHARE;
970 if (!(file->f_mode & FMODE_WRITE))
971 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
973 /* fall through */
974 case MAP_PRIVATE:
975 if (!(file->f_mode & FMODE_READ))
976 return -EACCES;
977 break;
979 default:
980 return -EINVAL;
981 }
982 } else {
983 switch (flags & MAP_TYPE) {
984 case MAP_SHARED:
985 vm_flags |= VM_SHARED | VM_MAYSHARE;
986 break;
987 case MAP_PRIVATE:
988 /*
989 * Set pgoff according to addr for anon_vma.
990 */
991 pgoff = addr >> PAGE_SHIFT;
992 break;
993 default:
994 return -EINVAL;
995 }
996 }
998 error = security_file_mmap(file, reqprot, prot, flags);
999 if (error)
1000 return error;
1002 /* Clear old maps */
1003 error = -ENOMEM;
1004 munmap_back:
1005 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1006 if (vma && vma->vm_start < addr + len) {
1007 if (do_munmap(mm, addr, len))
1008 return -ENOMEM;
1009 goto munmap_back;
1010 }
1012 /* Check against address space limit. */
1013 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1014 return -ENOMEM;
1016 if (accountable && (!(flags & MAP_NORESERVE) ||
1017 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1018 if (vm_flags & VM_SHARED) {
1019 /* Check memory availability in shmem_file_setup? */
1020 vm_flags |= VM_ACCOUNT;
1021 } else if (vm_flags & VM_WRITE) {
1022 /*
1023 * Private writable mapping: check memory availability
1024 */
1025 charged = len >> PAGE_SHIFT;
1026 if (security_vm_enough_memory(charged))
1027 return -ENOMEM;
1028 vm_flags |= VM_ACCOUNT;
1029 }
1030 }
1032 /*
1033 * Can we just expand an old private anonymous mapping?
1034 * The VM_SHARED test is necessary because shmem_zero_setup
1035 * will create the file object for a shared anonymous map below.
1036 */
1037 if (!file && !(vm_flags & VM_SHARED) &&
1038 vma_merge(mm, prev, addr, addr + len, vm_flags,
1039 NULL, NULL, pgoff, NULL))
1040 goto out;
1042 /*
1043 * Determine the object being mapped and call the appropriate
1044 * specific mapper. the address has already been validated, but
1045 * not unmapped, but the maps are removed from the list.
1046 */
1047 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1048 if (!vma) {
1049 error = -ENOMEM;
1050 goto unacct_error;
1051 }
1052 memset(vma, 0, sizeof(*vma));
1054 vma->vm_mm = mm;
1055 vma->vm_start = addr;
1056 vma->vm_end = addr + len;
1057 vma->vm_flags = vm_flags;
1058 vma->vm_page_prot = protection_map[vm_flags & 0x0f];
1059 vma->vm_pgoff = pgoff;
1061 if (file) {
1062 error = -EINVAL;
1063 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1064 goto free_vma;
1065 if (vm_flags & VM_DENYWRITE) {
1066 error = deny_write_access(file);
1067 if (error)
1068 goto free_vma;
1069 correct_wcount = 1;
1070 }
1071 vma->vm_file = file;
1072 get_file(file);
1073 error = file->f_op->mmap(file, vma);
1074 if (error)
1075 goto unmap_and_free_vma;
1076 } else if (vm_flags & VM_SHARED) {
1077 error = shmem_zero_setup(vma);
1078 if (error)
1079 goto free_vma;
1080 }
1082 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1083 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1084 * that memory reservation must be checked; but that reservation
1085 * belongs to shared memory object, not to vma: so now clear it.
1086 */
1087 if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
1088 vma->vm_flags &= ~VM_ACCOUNT;
1090 /* Can addr have changed??
1092 * Answer: Yes, several device drivers can do it in their
1093 * f_op->mmap method. -DaveM
1094 */
1095 addr = vma->vm_start;
1096 pgoff = vma->vm_pgoff;
1097 vm_flags = vma->vm_flags;
1099 if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
1100 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1101 file = vma->vm_file;
1102 vma_link(mm, vma, prev, rb_link, rb_parent);
1103 if (correct_wcount)
1104 atomic_inc(&inode->i_writecount);
1105 } else {
1106 if (file) {
1107 if (correct_wcount)
1108 atomic_inc(&inode->i_writecount);
1109 fput(file);
1110 }
1111 mpol_free(vma_policy(vma));
1112 kmem_cache_free(vm_area_cachep, vma);
1113 }
1114 out:
1115 mm->total_vm += len >> PAGE_SHIFT;
1116 __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1117 if (vm_flags & VM_LOCKED) {
1118 mm->locked_vm += len >> PAGE_SHIFT;
1119 make_pages_present(addr, addr + len);
1120 }
1121 if (flags & MAP_POPULATE) {
1122 up_write(&mm->mmap_sem);
1123 sys_remap_file_pages(addr, len, 0,
1124 pgoff, flags & MAP_NONBLOCK);
1125 down_write(&mm->mmap_sem);
1126 }
1127 return addr;
1129 unmap_and_free_vma:
1130 if (correct_wcount)
1131 atomic_inc(&inode->i_writecount);
1132 vma->vm_file = NULL;
1133 fput(file);
1135 /* Undo any partial mapping done by a device driver. */
1136 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1137 charged = 0;
1138 free_vma:
1139 kmem_cache_free(vm_area_cachep, vma);
1140 unacct_error:
1141 if (charged)
1142 vm_unacct_memory(charged);
1143 return error;
1144 }
1146 EXPORT_SYMBOL(do_mmap_pgoff);
1148 /* Get an address range which is currently unmapped.
1149 * For shmat() with addr=0.
1151 * Ugly calling convention alert:
1152 * Return value with the low bits set means error value,
1153 * ie
1154 * if (ret & ~PAGE_MASK)
1155 * error = ret;
1157 * This function "knows" that -ENOMEM has the bits set.
1158 */
1159 #ifndef HAVE_ARCH_UNMAPPED_AREA
1160 unsigned long
1161 arch_get_unmapped_area(struct file *filp, unsigned long addr,
1162 unsigned long len, unsigned long pgoff, unsigned long flags)
1164 struct mm_struct *mm = current->mm;
1165 struct vm_area_struct *vma;
1166 unsigned long start_addr;
1168 if (len > TASK_SIZE)
1169 return -ENOMEM;
1171 if (addr) {
1172 addr = PAGE_ALIGN(addr);
1173 vma = find_vma(mm, addr);
1174 if (TASK_SIZE - len >= addr &&
1175 (!vma || addr + len <= vma->vm_start))
1176 return addr;
1178 start_addr = addr = mm->free_area_cache;
1180 full_search:
1181 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
1182 /* At this point: (!vma || addr < vma->vm_end). */
1183 if (TASK_SIZE - len < addr) {
1184 /*
1185 * Start a new search - just in case we missed
1186 * some holes.
1187 */
1188 if (start_addr != TASK_UNMAPPED_BASE) {
1189 start_addr = addr = TASK_UNMAPPED_BASE;
1190 goto full_search;
1192 return -ENOMEM;
1194 if (!vma || addr + len <= vma->vm_start) {
1195 /*
1196 * Remember the place where we stopped the search:
1197 */
1198 mm->free_area_cache = addr + len;
1199 return addr;
1201 addr = vma->vm_end;
1204 #endif
1206 void arch_unmap_area(struct vm_area_struct *area)
1208 /*
1209 * Is this a new hole at the lowest possible address?
1210 */
1211 if (area->vm_start >= TASK_UNMAPPED_BASE &&
1212 area->vm_start < area->vm_mm->free_area_cache)
1213 area->vm_mm->free_area_cache = area->vm_start;
1216 /*
1217 * This mmap-allocator allocates new areas top-down from below the
1218 * stack's low limit (the base):
1219 */
1220 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1221 unsigned long
1222 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1223 const unsigned long len, const unsigned long pgoff,
1224 const unsigned long flags)
1226 struct vm_area_struct *vma;
1227 struct mm_struct *mm = current->mm;
1228 unsigned long addr = addr0;
1230 /* requested length too big for entire address space */
1231 if (len > TASK_SIZE)
1232 return -ENOMEM;
1234 /* requesting a specific address */
1235 if (addr) {
1236 addr = PAGE_ALIGN(addr);
1237 vma = find_vma(mm, addr);
1238 if (TASK_SIZE - len >= addr &&
1239 (!vma || addr + len <= vma->vm_start))
1240 return addr;
1243 /* either no address requested or can't fit in requested address hole */
1244 addr = mm->free_area_cache;
1246 /* make sure it can fit in the remaining address space */
1247 if (addr > len) {
1248 vma = find_vma(mm, addr-len);
1249 if (!vma || addr <= vma->vm_start)
1250 /* remember the address as a hint for next time */
1251 return (mm->free_area_cache = addr-len);
1254 addr = mm->mmap_base-len;
1256 do {
1257 /*
1258 * Lookup failure means no vma is above this address,
1259 * else if new region fits below vma->vm_start,
1260 * return with success:
1261 */
1262 vma = find_vma(mm, addr);
1263 if (!vma || addr+len <= vma->vm_start)
1264 /* remember the address as a hint for next time */
1265 return (mm->free_area_cache = addr);
1267 /* try just below the current vma->vm_start */
1268 addr = vma->vm_start-len;
1269 } while (len < vma->vm_start);
1271 /*
1272 * A failed mmap() very likely causes application failure,
1273 * so fall back to the bottom-up function here. This scenario
1274 * can happen with large stack limits and large mmap()
1275 * allocations.
1276 */
1277 mm->free_area_cache = TASK_UNMAPPED_BASE;
1278 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
1279 /*
1280 * Restore the topdown base:
1281 */
1282 mm->free_area_cache = mm->mmap_base;
1284 return addr;
1286 #endif
1288 void arch_unmap_area_topdown(struct vm_area_struct *area)
1290 /*
1291 * Is this a new hole at the highest possible address?
1292 */
1293 if (area->vm_end > area->vm_mm->free_area_cache)
1294 area->vm_mm->free_area_cache = area->vm_end;
1296 /* dont allow allocations above current base */
1297 if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
1298 area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
1301 unsigned long
1302 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1303 unsigned long pgoff, unsigned long flags)
1305 unsigned long ret;
1307 if (!(flags & MAP_FIXED)) {
1308 unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1310 get_area = current->mm->get_unmapped_area;
1311 if (file && file->f_op && file->f_op->get_unmapped_area)
1312 get_area = file->f_op->get_unmapped_area;
1313 addr = get_area(file, addr, len, pgoff, flags);
1314 if (IS_ERR_VALUE(addr))
1315 return addr;
1318 if (addr > TASK_SIZE - len)
1319 return -ENOMEM;
1320 if (addr & ~PAGE_MASK)
1321 return -EINVAL;
1322 if (file && is_file_hugepages(file)) {
1323 /*
1324 * Check if the given range is hugepage aligned, and
1325 * can be made suitable for hugepages.
1326 */
1327 ret = prepare_hugepage_range(addr, len);
1328 } else {
1329 /*
1330 * Ensure that a normal request is not falling in a
1331 * reserved hugepage range. For some archs like IA-64,
1332 * there is a separate region for hugepages.
1333 */
1334 ret = is_hugepage_only_range(current->mm, addr, len);
1336 if (ret)
1337 return -EINVAL;
1338 return addr;
1341 EXPORT_SYMBOL(get_unmapped_area);
1343 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1344 struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
1346 struct vm_area_struct *vma = NULL;
1348 if (mm) {
1349 /* Check the cache first. */
1350 /* (Cache hit rate is typically around 35%.) */
1351 vma = mm->mmap_cache;
1352 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1353 struct rb_node * rb_node;
1355 rb_node = mm->mm_rb.rb_node;
1356 vma = NULL;
1358 while (rb_node) {
1359 struct vm_area_struct * vma_tmp;
1361 vma_tmp = rb_entry(rb_node,
1362 struct vm_area_struct, vm_rb);
1364 if (vma_tmp->vm_end > addr) {
1365 vma = vma_tmp;
1366 if (vma_tmp->vm_start <= addr)
1367 break;
1368 rb_node = rb_node->rb_left;
1369 } else
1370 rb_node = rb_node->rb_right;
1372 if (vma)
1373 mm->mmap_cache = vma;
1376 return vma;
1379 EXPORT_SYMBOL(find_vma);
1381 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
1382 struct vm_area_struct *
1383 find_vma_prev(struct mm_struct *mm, unsigned long addr,
1384 struct vm_area_struct **pprev)
1386 struct vm_area_struct *vma = NULL, *prev = NULL;
1387 struct rb_node * rb_node;
1388 if (!mm)
1389 goto out;
1391 /* Guard against addr being lower than the first VMA */
1392 vma = mm->mmap;
1394 /* Go through the RB tree quickly. */
1395 rb_node = mm->mm_rb.rb_node;
1397 while (rb_node) {
1398 struct vm_area_struct *vma_tmp;
1399 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1401 if (addr < vma_tmp->vm_end) {
1402 rb_node = rb_node->rb_left;
1403 } else {
1404 prev = vma_tmp;
1405 if (!prev->vm_next || (addr < prev->vm_next->vm_end))
1406 break;
1407 rb_node = rb_node->rb_right;
1411 out:
1412 *pprev = prev;
1413 return prev ? prev->vm_next : vma;
1416 /*
1417 * Verify that the stack growth is acceptable and
1418 * update accounting. This is shared with both the
1419 * grow-up and grow-down cases.
1420 */
1421 static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
1423 struct mm_struct *mm = vma->vm_mm;
1424 struct rlimit *rlim = current->signal->rlim;
1426 /* address space limit tests */
1427 if (!may_expand_vm(mm, grow))
1428 return -ENOMEM;
1430 /* Stack limit test */
1431 if (size > rlim[RLIMIT_STACK].rlim_cur)
1432 return -ENOMEM;
1434 /* mlock limit tests */
1435 if (vma->vm_flags & VM_LOCKED) {
1436 unsigned long locked;
1437 unsigned long limit;
1438 locked = mm->locked_vm + grow;
1439 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
1440 if (locked > limit && !capable(CAP_IPC_LOCK))
1441 return -ENOMEM;
1444 /*
1445 * Overcommit.. This must be the final test, as it will
1446 * update security statistics.
1447 */
1448 if (security_vm_enough_memory(grow))
1449 return -ENOMEM;
1451 /* Ok, everything looks good - let it rip */
1452 mm->total_vm += grow;
1453 if (vma->vm_flags & VM_LOCKED)
1454 mm->locked_vm += grow;
1455 __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
1456 return 0;
1459 #ifdef CONFIG_STACK_GROWSUP
1460 /*
1461 * vma is the first one with address > vma->vm_end. Have to extend vma.
1462 */
1463 int expand_stack(struct vm_area_struct * vma, unsigned long address)
1465 int error;
1467 if (!(vma->vm_flags & VM_GROWSUP))
1468 return -EFAULT;
1470 /*
1471 * We must make sure the anon_vma is allocated
1472 * so that the anon_vma locking is not a noop.
1473 */
1474 if (unlikely(anon_vma_prepare(vma)))
1475 return -ENOMEM;
1476 anon_vma_lock(vma);
1478 /*
1479 * vma->vm_start/vm_end cannot change under us because the caller
1480 * is required to hold the mmap_sem in read mode. We need the
1481 * anon_vma lock to serialize against concurrent expand_stacks.
1482 */
1483 address += 4 + PAGE_SIZE - 1;
1484 address &= PAGE_MASK;
1485 error = 0;
1487 /* Somebody else might have raced and expanded it already */
1488 if (address > vma->vm_end) {
1489 unsigned long size, grow;
1491 size = address - vma->vm_start;
1492 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1494 error = acct_stack_growth(vma, size, grow);
1495 if (!error)
1496 vma->vm_end = address;
1498 anon_vma_unlock(vma);
1499 return error;
1502 struct vm_area_struct *
1503 find_extend_vma(struct mm_struct *mm, unsigned long addr)
1505 struct vm_area_struct *vma, *prev;
1507 addr &= PAGE_MASK;
1508 vma = find_vma_prev(mm, addr, &prev);
1509 if (vma && (vma->vm_start <= addr))
1510 return vma;
1511 if (!prev || expand_stack(prev, addr))
1512 return NULL;
1513 if (prev->vm_flags & VM_LOCKED) {
1514 make_pages_present(addr, prev->vm_end);
1516 return prev;
1518 #else
1519 /*
1520 * vma is the first one with address < vma->vm_start. Have to extend vma.
1521 */
1522 int expand_stack(struct vm_area_struct *vma, unsigned long address)
1524 int error;
1526 /*
1527 * We must make sure the anon_vma is allocated
1528 * so that the anon_vma locking is not a noop.
1529 */
1530 if (unlikely(anon_vma_prepare(vma)))
1531 return -ENOMEM;
1532 anon_vma_lock(vma);
1534 /*
1535 * vma->vm_start/vm_end cannot change under us because the caller
1536 * is required to hold the mmap_sem in read mode. We need the
1537 * anon_vma lock to serialize against concurrent expand_stacks.
1538 */
1539 address &= PAGE_MASK;
1540 error = 0;
1542 /* Somebody else might have raced and expanded it already */
1543 if (address < vma->vm_start) {
1544 unsigned long size, grow;
1546 size = vma->vm_end - address;
1547 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1549 error = acct_stack_growth(vma, size, grow);
1550 if (!error) {
1551 vma->vm_start = address;
1552 vma->vm_pgoff -= grow;
1555 anon_vma_unlock(vma);
1556 return error;
1559 struct vm_area_struct *
1560 find_extend_vma(struct mm_struct * mm, unsigned long addr)
1562 struct vm_area_struct * vma;
1563 unsigned long start;
1565 addr &= PAGE_MASK;
1566 vma = find_vma(mm,addr);
1567 if (!vma)
1568 return NULL;
1569 if (vma->vm_start <= addr)
1570 return vma;
1571 if (!(vma->vm_flags & VM_GROWSDOWN))
1572 return NULL;
1573 start = vma->vm_start;
1574 if (expand_stack(vma, addr))
1575 return NULL;
1576 if (vma->vm_flags & VM_LOCKED) {
1577 make_pages_present(addr, start);
1579 return vma;
1581 #endif
1583 /* Normal function to fix up a mapping
1584 * This function is the default for when an area has no specific
1585 * function. This may be used as part of a more specific routine.
1587 * By the time this function is called, the area struct has been
1588 * removed from the process mapping list.
1589 */
1590 static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1592 size_t len = area->vm_end - area->vm_start;
1594 area->vm_mm->total_vm -= len >> PAGE_SHIFT;
1595 if (area->vm_flags & VM_LOCKED)
1596 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1597 vm_stat_unaccount(area);
1598 area->vm_mm->unmap_area(area);
1599 remove_vm_struct(area);
1602 /*
1603 * Update the VMA and inode share lists.
1605 * Ok - we have the memory areas we should free on the 'free' list,
1606 * so release them, and do the vma updates.
1607 */
1608 static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1610 do {
1611 struct vm_area_struct *next = vma->vm_next;
1612 unmap_vma(mm, vma);
1613 vma = next;
1614 } while (vma);
1615 validate_mm(mm);
1618 /*
1619 * Get rid of page table information in the indicated region.
1621 * Called with the page table lock held.
1622 */
1623 static void unmap_region(struct mm_struct *mm,
1624 struct vm_area_struct *vma, struct vm_area_struct *prev,
1625 unsigned long start, unsigned long end)
1627 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1628 struct mmu_gather *tlb;
1629 unsigned long nr_accounted = 0;
1631 lru_add_drain();
1632 spin_lock(&mm->page_table_lock);
1633 tlb = tlb_gather_mmu(mm, 0);
1634 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1635 vm_unacct_memory(nr_accounted);
1636 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1637 next? next->vm_start: 0);
1638 tlb_finish_mmu(tlb, start, end);
1639 spin_unlock(&mm->page_table_lock);
1642 /*
1643 * Create a list of vma's touched by the unmap, removing them from the mm's
1644 * vma list as we go..
1645 */
1646 static void
1647 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1648 struct vm_area_struct *prev, unsigned long end)
1650 struct vm_area_struct **insertion_point;
1651 struct vm_area_struct *tail_vma = NULL;
1653 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1654 do {
1655 rb_erase(&vma->vm_rb, &mm->mm_rb);
1656 mm->map_count--;
1657 tail_vma = vma;
1658 vma = vma->vm_next;
1659 } while (vma && vma->vm_start < end);
1660 *insertion_point = vma;
1661 tail_vma->vm_next = NULL;
1662 mm->mmap_cache = NULL; /* Kill the cache. */
1665 /*
1666 * Split a vma into two pieces at address 'addr', a new vma is allocated
1667 * either for the first part or the the tail.
1668 */
1669 int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1670 unsigned long addr, int new_below)
1672 struct mempolicy *pol;
1673 struct vm_area_struct *new;
1675 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
1676 return -EINVAL;
1678 if (mm->map_count >= sysctl_max_map_count)
1679 return -ENOMEM;
1681 new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1682 if (!new)
1683 return -ENOMEM;
1685 /* most fields are the same, copy all, and then fixup */
1686 *new = *vma;
1688 if (new_below)
1689 new->vm_end = addr;
1690 else {
1691 new->vm_start = addr;
1692 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1695 pol = mpol_copy(vma_policy(vma));
1696 if (IS_ERR(pol)) {
1697 kmem_cache_free(vm_area_cachep, new);
1698 return PTR_ERR(pol);
1700 vma_set_policy(new, pol);
1702 if (new->vm_file)
1703 get_file(new->vm_file);
1705 if (new->vm_ops && new->vm_ops->open)
1706 new->vm_ops->open(new);
1708 if (new_below)
1709 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1710 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1711 else
1712 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1714 return 0;
1717 /* Munmap is split into 2 main parts -- this part which finds
1718 * what needs doing, and the areas themselves, which do the
1719 * work. This now handles partial unmappings.
1720 * Jeremy Fitzhardinge <jeremy@goop.org>
1721 */
1722 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1724 unsigned long end;
1725 struct vm_area_struct *vma, *prev, *last;
1727 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
1728 return -EINVAL;
1730 if ((len = PAGE_ALIGN(len)) == 0)
1731 return -EINVAL;
1733 /* Find the first overlapping VMA */
1734 vma = find_vma_prev(mm, start, &prev);
1735 if (!vma)
1736 return 0;
1737 /* we have start < vma->vm_end */
1739 /* if it doesn't overlap, we have nothing.. */
1740 end = start + len;
1741 if (vma->vm_start >= end)
1742 return 0;
1744 /*
1745 * If we need to split any vma, do it now to save pain later.
1747 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
1748 * unmapped vm_area_struct will remain in use: so lower split_vma
1749 * places tmp vma above, and higher split_vma places tmp vma below.
1750 */
1751 if (start > vma->vm_start) {
1752 int error = split_vma(mm, vma, start, 0);
1753 if (error)
1754 return error;
1755 prev = vma;
1758 /* Does it split the last one? */
1759 last = find_vma(mm, end);
1760 if (last && end > last->vm_start) {
1761 int error = split_vma(mm, last, end, 1);
1762 if (error)
1763 return error;
1765 vma = prev? prev->vm_next: mm->mmap;
1767 /*
1768 * Remove the vma's, and unmap the actual pages
1769 */
1770 detach_vmas_to_be_unmapped(mm, vma, prev, end);
1771 unmap_region(mm, vma, prev, start, end);
1773 /* Fix up all other VM information */
1774 unmap_vma_list(mm, vma);
1776 return 0;
1779 EXPORT_SYMBOL(do_munmap);
1781 asmlinkage long sys_munmap(unsigned long addr, size_t len)
1783 int ret;
1784 struct mm_struct *mm = current->mm;
1786 profile_munmap(addr);
1788 down_write(&mm->mmap_sem);
1789 ret = do_munmap(mm, addr, len);
1790 up_write(&mm->mmap_sem);
1791 return ret;
1794 static inline void verify_mm_writelocked(struct mm_struct *mm)
1796 #ifdef CONFIG_DEBUG_KERNEL
1797 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
1798 WARN_ON(1);
1799 up_read(&mm->mmap_sem);
1801 #endif
1804 /*
1805 * this is really a simplified "do_mmap". it only handles
1806 * anonymous maps. eventually we may be able to do some
1807 * brk-specific accounting here.
1808 */
1809 unsigned long do_brk(unsigned long addr, unsigned long len)
1811 struct mm_struct * mm = current->mm;
1812 struct vm_area_struct * vma, * prev;
1813 unsigned long flags;
1814 struct rb_node ** rb_link, * rb_parent;
1815 pgoff_t pgoff = addr >> PAGE_SHIFT;
1817 len = PAGE_ALIGN(len);
1818 if (!len)
1819 return addr;
1821 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
1822 return -EINVAL;
1824 /*
1825 * mlock MCL_FUTURE?
1826 */
1827 if (mm->def_flags & VM_LOCKED) {
1828 unsigned long locked, lock_limit;
1829 locked = len >> PAGE_SHIFT;
1830 locked += mm->locked_vm;
1831 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1832 lock_limit >>= PAGE_SHIFT;
1833 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1834 return -EAGAIN;
1837 /*
1838 * mm->mmap_sem is required to protect against another thread
1839 * changing the mappings in case we sleep.
1840 */
1841 verify_mm_writelocked(mm);
1843 /*
1844 * Clear old maps. this also does some error checking for us
1845 */
1846 munmap_back:
1847 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1848 if (vma && vma->vm_start < addr + len) {
1849 if (do_munmap(mm, addr, len))
1850 return -ENOMEM;
1851 goto munmap_back;
1854 /* Check against address space limits *after* clearing old maps... */
1855 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1856 return -ENOMEM;
1858 if (mm->map_count > sysctl_max_map_count)
1859 return -ENOMEM;
1861 if (security_vm_enough_memory(len >> PAGE_SHIFT))
1862 return -ENOMEM;
1864 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1866 /* Can we just expand an old private anonymous mapping? */
1867 if (vma_merge(mm, prev, addr, addr + len, flags,
1868 NULL, NULL, pgoff, NULL))
1869 goto out;
1871 /*
1872 * create a vma struct for an anonymous mapping
1873 */
1874 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1875 if (!vma) {
1876 vm_unacct_memory(len >> PAGE_SHIFT);
1877 return -ENOMEM;
1879 memset(vma, 0, sizeof(*vma));
1881 vma->vm_mm = mm;
1882 vma->vm_start = addr;
1883 vma->vm_end = addr + len;
1884 vma->vm_pgoff = pgoff;
1885 vma->vm_flags = flags;
1886 vma->vm_page_prot = protection_map[flags & 0x0f];
1887 vma_link(mm, vma, prev, rb_link, rb_parent);
1888 out:
1889 mm->total_vm += len >> PAGE_SHIFT;
1890 if (flags & VM_LOCKED) {
1891 mm->locked_vm += len >> PAGE_SHIFT;
1892 make_pages_present(addr, addr + len);
1894 return addr;
1897 EXPORT_SYMBOL(do_brk);
1899 /* Release all mmaps. */
1900 void exit_mmap(struct mm_struct *mm)
1902 struct mmu_gather *tlb;
1903 struct vm_area_struct *vma = mm->mmap;
1904 unsigned long nr_accounted = 0;
1905 unsigned long end;
1907 #ifdef arch_exit_mmap
1908 arch_exit_mmap(mm);
1909 #endif
1911 lru_add_drain();
1913 spin_lock(&mm->page_table_lock);
1915 flush_cache_mm(mm);
1916 tlb = tlb_gather_mmu(mm, 1);
1917 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1918 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1919 vm_unacct_memory(nr_accounted);
1920 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1921 tlb_finish_mmu(tlb, 0, end);
1923 mm->mmap = mm->mmap_cache = NULL;
1924 mm->mm_rb = RB_ROOT;
1925 set_mm_counter(mm, rss, 0);
1926 mm->total_vm = 0;
1927 mm->locked_vm = 0;
1929 spin_unlock(&mm->page_table_lock);
1931 /*
1932 * Walk the list again, actually closing and freeing it
1933 * without holding any MM locks.
1934 */
1935 while (vma) {
1936 struct vm_area_struct *next = vma->vm_next;
1937 remove_vm_struct(vma);
1938 vma = next;
1941 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
1944 /* Insert vm structure into process list sorted by address
1945 * and into the inode's i_mmap tree. If vm_file is non-NULL
1946 * then i_mmap_lock is taken here.
1947 */
1948 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
1950 struct vm_area_struct * __vma, * prev;
1951 struct rb_node ** rb_link, * rb_parent;
1953 /*
1954 * The vm_pgoff of a purely anonymous vma should be irrelevant
1955 * until its first write fault, when page's anon_vma and index
1956 * are set. But now set the vm_pgoff it will almost certainly
1957 * end up with (unless mremap moves it elsewhere before that
1958 * first wfault), so /proc/pid/maps tells a consistent story.
1960 * By setting it to reflect the virtual start address of the
1961 * vma, merges and splits can happen in a seamless way, just
1962 * using the existing file pgoff checks and manipulations.
1963 * Similarly in do_mmap_pgoff and in do_brk.
1964 */
1965 if (!vma->vm_file) {
1966 BUG_ON(vma->anon_vma);
1967 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
1969 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
1970 if (__vma && __vma->vm_start < vma->vm_end)
1971 return -ENOMEM;
1972 vma_link(mm, vma, prev, rb_link, rb_parent);
1973 return 0;
1976 /*
1977 * Copy the vma structure to a new location in the same mm,
1978 * prior to moving page table entries, to effect an mremap move.
1979 */
1980 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
1981 unsigned long addr, unsigned long len, pgoff_t pgoff)
1983 struct vm_area_struct *vma = *vmap;
1984 unsigned long vma_start = vma->vm_start;
1985 struct mm_struct *mm = vma->vm_mm;
1986 struct vm_area_struct *new_vma, *prev;
1987 struct rb_node **rb_link, *rb_parent;
1988 struct mempolicy *pol;
1990 /*
1991 * If anonymous vma has not yet been faulted, update new pgoff
1992 * to match new location, to increase its chance of merging.
1993 */
1994 if (!vma->vm_file && !vma->anon_vma)
1995 pgoff = addr >> PAGE_SHIFT;
1997 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1998 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
1999 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2000 if (new_vma) {
2001 /*
2002 * Source vma may have been merged into new_vma
2003 */
2004 if (vma_start >= new_vma->vm_start &&
2005 vma_start < new_vma->vm_end)
2006 *vmap = new_vma;
2007 } else {
2008 new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2009 if (new_vma) {
2010 *new_vma = *vma;
2011 pol = mpol_copy(vma_policy(vma));
2012 if (IS_ERR(pol)) {
2013 kmem_cache_free(vm_area_cachep, new_vma);
2014 return NULL;
2016 vma_set_policy(new_vma, pol);
2017 new_vma->vm_start = addr;
2018 new_vma->vm_end = addr + len;
2019 new_vma->vm_pgoff = pgoff;
2020 if (new_vma->vm_file)
2021 get_file(new_vma->vm_file);
2022 if (new_vma->vm_ops && new_vma->vm_ops->open)
2023 new_vma->vm_ops->open(new_vma);
2024 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2027 return new_vma;
2030 /*
2031 * Return true if the calling process may expand its vm space by the passed
2032 * number of pages
2033 */
2034 int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2036 unsigned long cur = mm->total_vm; /* pages */
2037 unsigned long lim;
2039 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
2041 if (cur + npages > lim)
2042 return 0;
2043 return 1;