ia64/linux-2.6.18-xen.hg

view mm/mmap.c @ 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the Linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
parents a533be77c572
children
1 /*
2 * mm/mmap.c
3 *
4 * Written by obz.
5 *
6 * Address space accounting code <alan@redhat.com>
7 */
9 #include <linux/slab.h>
10 #include <linux/mm.h>
11 #include <linux/shm.h>
12 #include <linux/mman.h>
13 #include <linux/pagemap.h>
14 #include <linux/swap.h>
15 #include <linux/syscalls.h>
16 #include <linux/capability.h>
17 #include <linux/init.h>
18 #include <linux/file.h>
19 #include <linux/fs.h>
20 #include <linux/personality.h>
21 #include <linux/security.h>
22 #include <linux/hugetlb.h>
23 #include <linux/profile.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mempolicy.h>
27 #include <linux/rmap.h>
29 #include <asm/uaccess.h>
30 #include <asm/cacheflush.h>
31 #include <asm/tlb.h>
33 #ifndef arch_mmap_check
34 #define arch_mmap_check(addr, len, flags) (0)
35 #endif
37 static void unmap_region(struct mm_struct *mm,
38 struct vm_area_struct *vma, struct vm_area_struct *prev,
39 unsigned long start, unsigned long end);
41 /*
42 * WARNING: the debugging will use recursive algorithms so never enable this
43 * unless you know what you are doing.
44 */
45 #undef DEBUG_MM_RB
47 /* description of effects of mapping type and prot in current implementation.
48 * this is due to the limited x86 page protection hardware. The expected
49 * behavior is in parens:
50 *
51 * map_type prot
52 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
53 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
54 * w: (no) no w: (no) no w: (yes) yes w: (no) no
55 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
56 *
57 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
58 * w: (no) no w: (no) no w: (copy) copy w: (no) no
59 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
60 *
61 */
62 pgprot_t protection_map[16] = {
63 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
64 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
65 };
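/*
 * Worked example of how this table is indexed: do_mmap_pgoff() below
 * sets vm_page_prot = protection_map[vm_flags & (VM_READ|VM_WRITE|
 * VM_EXEC|VM_SHARED)].  A MAP_PRIVATE, PROT_READ|PROT_WRITE mapping has
 * VM_READ|VM_WRITE in the low bits, selecting __P011: per the table
 * above, a read-only PTE whose first write is resolved by copy-on-write
 * in the fault handler.  The same protections with MAP_SHARED also set
 * VM_SHARED, selecting __S011, a genuinely writable PTE.
 */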
67 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
68 int sysctl_overcommit_ratio = 50; /* default is 50% */
69 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
70 atomic_t vm_committed_space = ATOMIC_INIT(0);
72 /*
73 * Check that a process has enough memory to allocate a new virtual
74 * mapping. 0 means there is enough memory for the allocation to
75 * succeed and -ENOMEM implies there is not.
76 *
77 * We currently support three overcommit policies, which are set via the
78 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
79 *
80 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
81 * Additional code 2002 Jul 20 by Robert Love.
82 *
83 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
84 *
85 * Note this is a helper function intended to be used by LSMs which
86 * wish to use this logic.
87 */
88 int __vm_enough_memory(long pages, int cap_sys_admin)
89 {
90 unsigned long free, allowed;
92 vm_acct_memory(pages);
94 /*
95 * Sometimes we want to use more memory than we have
96 */
97 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
98 return 0;
100 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
101 unsigned long n;
103 free = global_page_state(NR_FILE_PAGES);
104 free += nr_swap_pages;
106 /*
107 * Any slabs which are created with the
108 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
109 * which are reclaimable, under pressure. The dentry
110 * cache and most inode caches should fall into this
111 */
112 free += atomic_read(&slab_reclaim_pages);
114 /*
115 * Leave the last 3% for root
116 */
117 if (!cap_sys_admin)
118 free -= free / 32;
120 if (free > pages)
121 return 0;
123 /*
124 * nr_free_pages() is very expensive on large systems,
125 * only call if we're about to fail.
126 */
127 n = nr_free_pages();
129 /*
130 * Leave reserved pages. The pages are not for anonymous pages.
131 */
132 if (n <= totalreserve_pages)
133 goto error;
134 else
135 n -= totalreserve_pages;
137 /*
138 * Leave the last 3% for root
139 */
140 if (!cap_sys_admin)
141 n -= n / 32;
142 free += n;
144 if (free > pages)
145 return 0;
147 goto error;
148 }
150 allowed = (totalram_pages - hugetlb_total_pages())
151 * sysctl_overcommit_ratio / 100;
152 /*
153 * Leave the last 3% for root
154 */
155 if (!cap_sys_admin)
156 allowed -= allowed / 32;
157 allowed += total_swap_pages;
159 /* Don't let a single process grow too big:
160 leave 3% of the size of this process for other processes */
161 allowed -= current->mm->total_vm / 32;
163 /*
164 * cast `allowed' as a signed long because vm_committed_space
165 * sometimes has a negative value
166 */
167 if (atomic_read(&vm_committed_space) < (long)allowed)
168 return 0;
169 error:
170 vm_unacct_memory(pages);
172 return -ENOMEM;
173 }
175 EXPORT_SYMBOL(__vm_enough_memory);
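/*
 * Rough worked example of the OVERCOMMIT_NEVER branch above, on a
 * hypothetical machine: with 262144 pages of RAM (1GB at 4k pages), no
 * hugetlb pages, the default overcommit_ratio of 50 and 262144 pages of
 * swap, a non-root caller gets
 *
 *	allowed = 262144 * 50 / 100		= 131072
 *	allowed -= allowed / 32			-> 126976  (3% kept for root)
 *	allowed += total_swap_pages		-> 389120
 *	allowed -= current->mm->total_vm / 32
 *
 * and a commit succeeds only while vm_committed_space stays below that.
 */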
177 /*
178 * Requires inode->i_mapping->i_mmap_lock
179 */
180 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
181 struct file *file, struct address_space *mapping)
182 {
183 if (vma->vm_flags & VM_DENYWRITE)
184 atomic_inc(&file->f_dentry->d_inode->i_writecount);
185 if (vma->vm_flags & VM_SHARED)
186 mapping->i_mmap_writable--;
188 flush_dcache_mmap_lock(mapping);
189 if (unlikely(vma->vm_flags & VM_NONLINEAR))
190 list_del_init(&vma->shared.vm_set.list);
191 else
192 vma_prio_tree_remove(vma, &mapping->i_mmap);
193 flush_dcache_mmap_unlock(mapping);
194 }
196 /*
197 * Unlink a file-based vm structure from its prio_tree, to hide
198 * vma from rmap and vmtruncate before freeing its page tables.
199 */
200 void unlink_file_vma(struct vm_area_struct *vma)
201 {
202 struct file *file = vma->vm_file;
204 if (file) {
205 struct address_space *mapping = file->f_mapping;
206 spin_lock(&mapping->i_mmap_lock);
207 __remove_shared_vm_struct(vma, file, mapping);
208 spin_unlock(&mapping->i_mmap_lock);
209 }
210 }
212 /*
213 * Close a vm structure and free it, returning the next.
214 */
215 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
216 {
217 struct vm_area_struct *next = vma->vm_next;
219 might_sleep();
220 if (vma->vm_ops && vma->vm_ops->close)
221 vma->vm_ops->close(vma);
222 if (vma->vm_file)
223 fput(vma->vm_file);
224 mpol_free(vma_policy(vma));
225 kmem_cache_free(vm_area_cachep, vma);
226 return next;
227 }
229 asmlinkage unsigned long sys_brk(unsigned long brk)
230 {
231 unsigned long rlim, retval;
232 unsigned long newbrk, oldbrk;
233 struct mm_struct *mm = current->mm;
235 down_write(&mm->mmap_sem);
237 if (brk < mm->end_code)
238 goto out;
240 /*
241 * Check against rlimit here. If this check is done later after the test
242 * of oldbrk with newbrk then it can escape the test and let the data
243 * segment grow beyond its set limit the in case where the limit is
244 * not page aligned -Ram Gupta
245 */
246 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
247 if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
248 goto out;
250 newbrk = PAGE_ALIGN(brk);
251 oldbrk = PAGE_ALIGN(mm->brk);
252 if (oldbrk == newbrk)
253 goto set_brk;
255 /* Always allow shrinking brk. */
256 if (brk <= mm->brk) {
257 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
258 goto set_brk;
259 goto out;
260 }
262 /* Check against existing mmap mappings. */
263 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
264 goto out;
266 /* Ok, looks good - let it rip. */
267 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
268 goto out;
269 set_brk:
270 mm->brk = brk;
271 out:
272 retval = mm->brk;
273 up_write(&mm->mmap_sem);
274 return retval;
275 }
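/*
 * For reference, the usual user-space path into sys_brk() above is the
 * C library's brk()/sbrk() wrappers; a minimal sketch (illustration
 * only, assumes a glibc-style sbrk()):
 *
 *	void *old = sbrk(0);			current program break
 *	if (sbrk(4096) == (void *)-1)		grow the heap by one page;
 *		perror("sbrk");			fails once RLIMIT_DATA is hit
 *	assert((char *)sbrk(0) == (char *)old + 4096);
 */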
277 #ifdef DEBUG_MM_RB
278 static int browse_rb(struct rb_root *root)
279 {
280 int i = 0, j;
281 struct rb_node *nd, *pn = NULL;
282 unsigned long prev = 0, pend = 0;
284 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
285 struct vm_area_struct *vma;
286 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
287 if (vma->vm_start < prev)
288 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
289 if (vma->vm_start < pend)
290 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
291 if (vma->vm_start > vma->vm_end)
292 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
293 i++;
294 pn = nd;
295 }
296 j = 0;
297 for (nd = pn; nd; nd = rb_prev(nd)) {
298 j++;
299 }
300 if (i != j)
301 printk("backwards %d, forwards %d\n", j, i), i = 0;
302 return i;
303 }
305 void validate_mm(struct mm_struct *mm)
306 {
307 int bug = 0;
308 int i = 0;
309 struct vm_area_struct *tmp = mm->mmap;
310 while (tmp) {
311 tmp = tmp->vm_next;
312 i++;
313 }
314 if (i != mm->map_count)
315 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
316 i = browse_rb(&mm->mm_rb);
317 if (i != mm->map_count)
318 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
319 BUG_ON(bug);
320 }
321 #else
322 #define validate_mm(mm) do { } while (0)
323 #endif
325 static struct vm_area_struct *
326 find_vma_prepare(struct mm_struct *mm, unsigned long addr,
327 struct vm_area_struct **pprev, struct rb_node ***rb_link,
328 struct rb_node ** rb_parent)
329 {
330 struct vm_area_struct * vma;
331 struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
333 __rb_link = &mm->mm_rb.rb_node;
334 rb_prev = __rb_parent = NULL;
335 vma = NULL;
337 while (*__rb_link) {
338 struct vm_area_struct *vma_tmp;
340 __rb_parent = *__rb_link;
341 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
343 if (vma_tmp->vm_end > addr) {
344 vma = vma_tmp;
345 if (vma_tmp->vm_start <= addr)
346 return vma;
347 __rb_link = &__rb_parent->rb_left;
348 } else {
349 rb_prev = __rb_parent;
350 __rb_link = &__rb_parent->rb_right;
351 }
352 }
354 *pprev = NULL;
355 if (rb_prev)
356 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
357 *rb_link = __rb_link;
358 *rb_parent = __rb_parent;
359 return vma;
360 }
362 static inline void
363 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
364 struct vm_area_struct *prev, struct rb_node *rb_parent)
365 {
366 if (prev) {
367 vma->vm_next = prev->vm_next;
368 prev->vm_next = vma;
369 } else {
370 mm->mmap = vma;
371 if (rb_parent)
372 vma->vm_next = rb_entry(rb_parent,
373 struct vm_area_struct, vm_rb);
374 else
375 vma->vm_next = NULL;
376 }
377 }
379 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
380 struct rb_node **rb_link, struct rb_node *rb_parent)
381 {
382 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
383 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
384 }
386 static inline void __vma_link_file(struct vm_area_struct *vma)
387 {
388 struct file * file;
390 file = vma->vm_file;
391 if (file) {
392 struct address_space *mapping = file->f_mapping;
394 if (vma->vm_flags & VM_DENYWRITE)
395 atomic_dec(&file->f_dentry->d_inode->i_writecount);
396 if (vma->vm_flags & VM_SHARED)
397 mapping->i_mmap_writable++;
399 flush_dcache_mmap_lock(mapping);
400 if (unlikely(vma->vm_flags & VM_NONLINEAR))
401 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
402 else
403 vma_prio_tree_insert(vma, &mapping->i_mmap);
404 flush_dcache_mmap_unlock(mapping);
405 }
406 }
408 static void
409 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
410 struct vm_area_struct *prev, struct rb_node **rb_link,
411 struct rb_node *rb_parent)
412 {
413 __vma_link_list(mm, vma, prev, rb_parent);
414 __vma_link_rb(mm, vma, rb_link, rb_parent);
415 __anon_vma_link(vma);
416 }
418 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
419 struct vm_area_struct *prev, struct rb_node **rb_link,
420 struct rb_node *rb_parent)
421 {
422 struct address_space *mapping = NULL;
424 if (vma->vm_file)
425 mapping = vma->vm_file->f_mapping;
427 if (mapping) {
428 spin_lock(&mapping->i_mmap_lock);
429 vma->vm_truncate_count = mapping->truncate_count;
430 }
431 anon_vma_lock(vma);
433 __vma_link(mm, vma, prev, rb_link, rb_parent);
434 __vma_link_file(vma);
436 anon_vma_unlock(vma);
437 if (mapping)
438 spin_unlock(&mapping->i_mmap_lock);
440 mm->map_count++;
441 validate_mm(mm);
442 }
444 /*
445 * Helper for vma_adjust in the split_vma insert case:
446 * insert vm structure into list and rbtree and anon_vma,
447 * but it has already been inserted into prio_tree earlier.
448 */
449 static void
450 __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
451 {
452 struct vm_area_struct * __vma, * prev;
453 struct rb_node ** rb_link, * rb_parent;
455 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
456 BUG_ON(__vma && __vma->vm_start < vma->vm_end);
457 __vma_link(mm, vma, prev, rb_link, rb_parent);
458 mm->map_count++;
459 }
461 static inline void
462 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
463 struct vm_area_struct *prev)
464 {
465 prev->vm_next = vma->vm_next;
466 rb_erase(&vma->vm_rb, &mm->mm_rb);
467 if (mm->mmap_cache == vma)
468 mm->mmap_cache = prev;
469 }
471 /*
472 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
473 * is already present in an i_mmap tree without adjusting the tree.
474 * The following helper function should be used when such adjustments
475 * are necessary. The "insert" vma (if any) is to be inserted
476 * before we drop the necessary locks.
477 */
478 void vma_adjust(struct vm_area_struct *vma, unsigned long start,
479 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
480 {
481 struct mm_struct *mm = vma->vm_mm;
482 struct vm_area_struct *next = vma->vm_next;
483 struct vm_area_struct *importer = NULL;
484 struct address_space *mapping = NULL;
485 struct prio_tree_root *root = NULL;
486 struct file *file = vma->vm_file;
487 struct anon_vma *anon_vma = NULL;
488 long adjust_next = 0;
489 int remove_next = 0;
491 if (next && !insert) {
492 if (end >= next->vm_end) {
493 /*
494 * vma expands, overlapping all the next, and
495 * perhaps the one after too (mprotect case 6).
496 */
497 again: remove_next = 1 + (end > next->vm_end);
498 end = next->vm_end;
499 anon_vma = next->anon_vma;
500 importer = vma;
501 } else if (end > next->vm_start) {
502 /*
503 * vma expands, overlapping part of the next:
504 * mprotect case 5 shifting the boundary up.
505 */
506 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
507 anon_vma = next->anon_vma;
508 importer = vma;
509 } else if (end < vma->vm_end) {
510 /*
511 * vma shrinks, and !insert tells it's not
512 * split_vma inserting another: so it must be
513 * mprotect case 4 shifting the boundary down.
514 */
515 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
516 anon_vma = next->anon_vma;
517 importer = next;
518 }
519 }
521 if (file) {
522 mapping = file->f_mapping;
523 if (!(vma->vm_flags & VM_NONLINEAR))
524 root = &mapping->i_mmap;
525 spin_lock(&mapping->i_mmap_lock);
526 if (importer &&
527 vma->vm_truncate_count != next->vm_truncate_count) {
528 /*
529 * unmap_mapping_range might be in progress:
530 * ensure that the expanding vma is rescanned.
531 */
532 importer->vm_truncate_count = 0;
533 }
534 if (insert) {
535 insert->vm_truncate_count = vma->vm_truncate_count;
536 /*
537 * Put into prio_tree now, so instantiated pages
538 * are visible to arm/parisc __flush_dcache_page
539 * throughout; but we cannot insert into address
540 * space until vma start or end is updated.
541 */
542 __vma_link_file(insert);
543 }
544 }
546 /*
547 * When changing only vma->vm_end, we don't really need
548 * anon_vma lock: but is that case worth optimizing out?
549 */
550 if (vma->anon_vma)
551 anon_vma = vma->anon_vma;
552 if (anon_vma) {
553 spin_lock(&anon_vma->lock);
554 /*
555 * Easily overlooked: when mprotect shifts the boundary,
556 * make sure the expanding vma has anon_vma set if the
557 * shrinking vma had, to cover any anon pages imported.
558 */
559 if (importer && !importer->anon_vma) {
560 importer->anon_vma = anon_vma;
561 __anon_vma_link(importer);
562 }
563 }
565 if (root) {
566 flush_dcache_mmap_lock(mapping);
567 vma_prio_tree_remove(vma, root);
568 if (adjust_next)
569 vma_prio_tree_remove(next, root);
570 }
572 vma->vm_start = start;
573 vma->vm_end = end;
574 vma->vm_pgoff = pgoff;
575 if (adjust_next) {
576 next->vm_start += adjust_next << PAGE_SHIFT;
577 next->vm_pgoff += adjust_next;
578 }
580 if (root) {
581 if (adjust_next)
582 vma_prio_tree_insert(next, root);
583 vma_prio_tree_insert(vma, root);
584 flush_dcache_mmap_unlock(mapping);
585 }
587 if (remove_next) {
588 /*
589 * vma_merge has merged next into vma, and needs
590 * us to remove next before dropping the locks.
591 */
592 __vma_unlink(mm, next, vma);
593 if (file)
594 __remove_shared_vm_struct(next, file, mapping);
595 if (next->anon_vma)
596 __anon_vma_merge(vma, next);
597 } else if (insert) {
598 /*
599 * split_vma has split insert from vma, and needs
600 * us to insert it before dropping the locks
601 * (it may either follow vma or precede it).
602 */
603 __insert_vm_struct(mm, insert);
604 }
606 if (anon_vma)
607 spin_unlock(&anon_vma->lock);
608 if (mapping)
609 spin_unlock(&mapping->i_mmap_lock);
611 if (remove_next) {
612 if (file)
613 fput(file);
614 mm->map_count--;
615 mpol_free(vma_policy(next));
616 kmem_cache_free(vm_area_cachep, next);
617 /*
618 * In mprotect's case 6 (see comments on vma_merge),
619 * we must remove another next too. It would clutter
620 * up the code too much to do both in one go.
621 */
622 if (remove_next == 2) {
623 next = vma->vm_next;
624 goto again;
625 }
626 }
628 validate_mm(mm);
629 }
631 /*
632 * If the vma has a ->close operation then the driver probably needs to release
633 * per-vma resources, so we don't attempt to merge those.
634 */
635 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
637 static inline int is_mergeable_vma(struct vm_area_struct *vma,
638 struct file *file, unsigned long vm_flags)
639 {
640 if (vma->vm_flags != vm_flags)
641 return 0;
642 if (vma->vm_file != file)
643 return 0;
644 if (vma->vm_ops && vma->vm_ops->close)
645 return 0;
646 return 1;
647 }
649 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
650 struct anon_vma *anon_vma2)
651 {
652 return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
653 }
655 /*
656 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
657 * in front of (at a lower virtual address and file offset than) the vma.
658 *
659 * We cannot merge two vmas if they have differently assigned (non-NULL)
660 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
661 *
662 * We don't check here for the merged mmap wrapping around the end of pagecache
663 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
664 * wrap, nor mmaps which cover the final page at index -1UL.
665 */
666 static int
667 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
668 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
669 {
670 if (is_mergeable_vma(vma, file, vm_flags) &&
671 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
672 if (vma->vm_pgoff == vm_pgoff)
673 return 1;
674 }
675 return 0;
676 }
678 /*
679 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
680 * beyond (at a higher virtual address and file offset than) the vma.
681 *
682 * We cannot merge two vmas if they have differently assigned (non-NULL)
683 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
684 */
685 static int
686 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
687 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
688 {
689 if (is_mergeable_vma(vma, file, vm_flags) &&
690 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
691 pgoff_t vm_pglen;
692 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
693 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
694 return 1;
695 }
696 return 0;
697 }
699 /*
700 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
701 * whether that can be merged with its predecessor or its successor.
702 * Or both (it neatly fills a hole).
703 *
704 * In most cases - when called for mmap, brk or mremap - [addr,end) is
705 * certain not to be mapped by the time vma_merge is called; but when
706 * called for mprotect, it is certain to be already mapped (either at
707 * an offset within prev, or at the start of next), and the flags of
708 * this area are about to be changed to vm_flags - and the no-change
709 * case has already been eliminated.
710 *
711 * The following mprotect cases have to be considered, where AAAA is
712 * the area passed down from mprotect_fixup, never extending beyond one
713 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
714 *
715 * AAAA AAAA AAAA AAAA
716 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
717 * cannot merge might become might become might become
718 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
719 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
720 * mremap move: PPPPNNNNNNNN 8
721 * AAAA
722 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
723 * might become case 1 below case 2 below case 3 below
724 *
725 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
726 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
727 */
728 struct vm_area_struct *vma_merge(struct mm_struct *mm,
729 struct vm_area_struct *prev, unsigned long addr,
730 unsigned long end, unsigned long vm_flags,
731 struct anon_vma *anon_vma, struct file *file,
732 pgoff_t pgoff, struct mempolicy *policy)
733 {
734 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
735 struct vm_area_struct *area, *next;
737 /*
738 * We later require that vma->vm_flags == vm_flags,
739 * so this tests vma->vm_flags & VM_SPECIAL, too.
740 */
741 if (vm_flags & VM_SPECIAL)
742 return NULL;
744 if (prev)
745 next = prev->vm_next;
746 else
747 next = mm->mmap;
748 area = next;
749 if (next && next->vm_end == end) /* cases 6, 7, 8 */
750 next = next->vm_next;
752 /*
753 * Can it merge with the predecessor?
754 */
755 if (prev && prev->vm_end == addr &&
756 mpol_equal(vma_policy(prev), policy) &&
757 can_vma_merge_after(prev, vm_flags,
758 anon_vma, file, pgoff)) {
759 /*
760 * OK, it can. Can we now merge in the successor as well?
761 */
762 if (next && end == next->vm_start &&
763 mpol_equal(policy, vma_policy(next)) &&
764 can_vma_merge_before(next, vm_flags,
765 anon_vma, file, pgoff+pglen) &&
766 is_mergeable_anon_vma(prev->anon_vma,
767 next->anon_vma)) {
768 /* cases 1, 6 */
769 vma_adjust(prev, prev->vm_start,
770 next->vm_end, prev->vm_pgoff, NULL);
771 } else /* cases 2, 5, 7 */
772 vma_adjust(prev, prev->vm_start,
773 end, prev->vm_pgoff, NULL);
774 return prev;
775 }
777 /*
778 * Can this new request be merged in front of next?
779 */
780 if (next && end == next->vm_start &&
781 mpol_equal(policy, vma_policy(next)) &&
782 can_vma_merge_before(next, vm_flags,
783 anon_vma, file, pgoff+pglen)) {
784 if (prev && addr < prev->vm_end) /* case 4 */
785 vma_adjust(prev, prev->vm_start,
786 addr, prev->vm_pgoff, NULL);
787 else /* cases 3, 8 */
788 vma_adjust(area, addr, next->vm_end,
789 next->vm_pgoff - pglen, NULL);
790 return area;
791 }
793 return NULL;
794 }
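/*
 * Concrete illustration of case 1 above, with hypothetical addresses:
 * given anonymous vmas prev = [0x8000, 0x9000) and next = [0xa000, 0xb000),
 * both PROT_READ, an mprotect(0x9000, 0x1000, PROT_READ) of the PROT_WRITE
 * vma in between reaches vma_merge() with addr = 0x9000, end = 0xa000 and
 * matching vm_flags; assuming the anon_vmas are compatible, the single
 * vma_adjust(prev, 0x8000, 0xb000, ...) call collapses all three vmas into
 * one [0x8000, 0xb000) area.
 */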
796 /*
797 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
798 * neighbouring vmas for a suitable anon_vma, before it goes off
799 * to allocate a new anon_vma. It checks because a repetitive
800 * sequence of mprotects and faults may otherwise lead to distinct
801 * anon_vmas being allocated, preventing vma merge in subsequent
802 * mprotect.
803 */
804 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
805 {
806 struct vm_area_struct *near;
807 unsigned long vm_flags;
809 near = vma->vm_next;
810 if (!near)
811 goto try_prev;
813 /*
814 * Since only mprotect tries to remerge vmas, match flags
815 * which might be mprotected into each other later on.
816 * Neither mlock nor madvise tries to remerge at present,
817 * so leave their flags as obstructing a merge.
818 */
819 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
820 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
822 if (near->anon_vma && vma->vm_end == near->vm_start &&
823 mpol_equal(vma_policy(vma), vma_policy(near)) &&
824 can_vma_merge_before(near, vm_flags,
825 NULL, vma->vm_file, vma->vm_pgoff +
826 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
827 return near->anon_vma;
828 try_prev:
829 /*
830 * It is potentially slow to have to call find_vma_prev here.
831 * But it's only on the first write fault on the vma, not
832 * every time, and we could devise a way to avoid it later
833 * (e.g. stash info in next's anon_vma_node when assigning
834 * an anon_vma, or when trying vma_merge). Another time.
835 */
836 BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
837 if (!near)
838 goto none;
840 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
841 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
843 if (near->anon_vma && near->vm_end == vma->vm_start &&
844 mpol_equal(vma_policy(near), vma_policy(vma)) &&
845 can_vma_merge_after(near, vm_flags,
846 NULL, vma->vm_file, vma->vm_pgoff))
847 return near->anon_vma;
848 none:
849 /*
850 * There's no absolute need to look only at touching neighbours:
851 * we could search further afield for "compatible" anon_vmas.
852 * But it would probably just be a waste of time searching,
853 * or lead to too many vmas hanging off the same anon_vma.
854 * We're trying to allow mprotect remerging later on,
855 * not trying to minimize memory used for anon_vmas.
856 */
857 return NULL;
858 }
860 #ifdef CONFIG_PROC_FS
861 void vm_stat_account(struct mm_struct *mm, unsigned long flags,
862 struct file *file, long pages)
863 {
864 const unsigned long stack_flags
865 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
867 if (file) {
868 mm->shared_vm += pages;
869 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
870 mm->exec_vm += pages;
871 } else if (flags & stack_flags)
872 mm->stack_vm += pages;
873 if (flags & (VM_RESERVED|VM_IO))
874 mm->reserved_vm += pages;
875 }
876 #endif /* CONFIG_PROC_FS */
878 /*
879 * The caller must hold down_write(current->mm->mmap_sem).
880 */
882 unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
883 unsigned long len, unsigned long prot,
884 unsigned long flags, unsigned long pgoff)
885 {
886 struct mm_struct * mm = current->mm;
887 struct vm_area_struct * vma, * prev;
888 struct inode *inode;
889 unsigned int vm_flags;
890 int correct_wcount = 0;
891 int error;
892 struct rb_node ** rb_link, * rb_parent;
893 int accountable = 1;
894 unsigned long charged = 0, reqprot = prot;
896 if (file) {
897 if (is_file_hugepages(file))
898 accountable = 0;
900 if (!file->f_op || !file->f_op->mmap)
901 return -ENODEV;
903 if ((prot & PROT_EXEC) &&
904 (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
905 return -EPERM;
906 }
907 /*
908 * Does the application expect PROT_READ to imply PROT_EXEC?
909 *
910 * (the exception is when the underlying filesystem is noexec
911 * mounted, in which case we dont add PROT_EXEC.)
912 */
913 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
914 if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
915 prot |= PROT_EXEC;
917 if (!len)
918 return -EINVAL;
920 error = arch_mmap_check(addr, len, flags);
921 if (error)
922 return error;
924 /* Careful about overflows.. */
925 len = PAGE_ALIGN(len);
926 if (!len || len > TASK_SIZE)
927 return -ENOMEM;
929 /* offset overflow? */
930 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
931 return -EOVERFLOW;
933 /* Too many mappings? */
934 if (mm->map_count > sysctl_max_map_count)
935 return -ENOMEM;
937 /* Obtain the address to map to. we verify (or select) it and ensure
938 * that it represents a valid section of the address space.
939 */
940 addr = get_unmapped_area(file, addr, len, pgoff, flags);
941 if (addr & ~PAGE_MASK)
942 return addr;
944 /* Do simple checking here so the lower-level routines won't have
945 * to. we assume access permissions have been handled by the open
946 * of the memory object, so we don't do any here.
947 */
948 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
949 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
951 if (flags & MAP_LOCKED) {
952 if (!can_do_mlock())
953 return -EPERM;
954 vm_flags |= VM_LOCKED;
955 }
956 /* mlock MCL_FUTURE? */
957 if (vm_flags & VM_LOCKED) {
958 unsigned long locked, lock_limit;
959 locked = len >> PAGE_SHIFT;
960 locked += mm->locked_vm;
961 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
962 lock_limit >>= PAGE_SHIFT;
963 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
964 return -EAGAIN;
965 }
967 inode = file ? file->f_dentry->d_inode : NULL;
969 if (file) {
970 switch (flags & MAP_TYPE) {
971 case MAP_SHARED:
972 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
973 return -EACCES;
975 /*
976 * Make sure we don't allow writing to an append-only
977 * file..
978 */
979 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
980 return -EACCES;
982 /*
983 * Make sure there are no mandatory locks on the file.
984 */
985 if (locks_verify_locked(inode))
986 return -EAGAIN;
988 vm_flags |= VM_SHARED | VM_MAYSHARE;
989 if (!(file->f_mode & FMODE_WRITE))
990 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
992 /* fall through */
993 case MAP_PRIVATE:
994 if (!(file->f_mode & FMODE_READ))
995 return -EACCES;
996 break;
998 default:
999 return -EINVAL;
1000 }
1001 } else {
1002 switch (flags & MAP_TYPE) {
1003 case MAP_SHARED:
1004 vm_flags |= VM_SHARED | VM_MAYSHARE;
1005 break;
1006 case MAP_PRIVATE:
1007 /*
1008 * Set pgoff according to addr for anon_vma.
1009 */
1010 pgoff = addr >> PAGE_SHIFT;
1011 break;
1012 default:
1013 return -EINVAL;
1014 }
1015 }
1017 error = security_file_mmap(file, reqprot, prot, flags);
1018 if (error)
1019 return error;
1021 /* Clear old maps */
1022 error = -ENOMEM;
1023 munmap_back:
1024 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1025 if (vma && vma->vm_start < addr + len) {
1026 if (do_munmap(mm, addr, len))
1027 return -ENOMEM;
1028 goto munmap_back;
1029 }
1031 /* Check against address space limit. */
1032 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1033 return -ENOMEM;
1035 if (accountable && (!(flags & MAP_NORESERVE) ||
1036 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1037 if (vm_flags & VM_SHARED) {
1038 /* Check memory availability in shmem_file_setup? */
1039 vm_flags |= VM_ACCOUNT;
1040 } else if (vm_flags & VM_WRITE) {
1041 /*
1042 * Private writable mapping: check memory availability
1043 */
1044 charged = len >> PAGE_SHIFT;
1045 if (security_vm_enough_memory(charged))
1046 return -ENOMEM;
1047 vm_flags |= VM_ACCOUNT;
1048 }
1049 }
1051 /*
1052 * Can we just expand an old private anonymous mapping?
1053 * The VM_SHARED test is necessary because shmem_zero_setup
1054 * will create the file object for a shared anonymous map below.
1055 */
1056 if (!file && !(vm_flags & VM_SHARED) &&
1057 vma_merge(mm, prev, addr, addr + len, vm_flags,
1058 NULL, NULL, pgoff, NULL))
1059 goto out;
1061 /*
1062 * Determine the object being mapped and call the appropriate
1063 * specific mapper. the address has already been validated, but
1064 * not unmapped, but the maps are removed from the list.
1065 */
1066 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1067 if (!vma) {
1068 error = -ENOMEM;
1069 goto unacct_error;
1070 }
1072 vma->vm_mm = mm;
1073 vma->vm_start = addr;
1074 vma->vm_end = addr + len;
1075 vma->vm_flags = vm_flags;
1076 vma->vm_page_prot = protection_map[vm_flags &
1077 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1078 vma->vm_pgoff = pgoff;
1080 if (file) {
1081 error = -EINVAL;
1082 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1083 goto free_vma;
1084 if (vm_flags & VM_DENYWRITE) {
1085 error = deny_write_access(file);
1086 if (error)
1087 goto free_vma;
1088 correct_wcount = 1;
1089 }
1090 vma->vm_file = file;
1091 get_file(file);
1092 error = file->f_op->mmap(file, vma);
1093 if (error)
1094 goto unmap_and_free_vma;
1095 } else if (vm_flags & VM_SHARED) {
1096 error = shmem_zero_setup(vma);
1097 if (error)
1098 goto free_vma;
1099 }
1101 /* Don't make the VMA automatically writable if it's shared, but the
1102 * backer wishes to know when pages are first written to */
1103 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1104 vma->vm_page_prot =
1105 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1107 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1108 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1109 * that memory reservation must be checked; but that reservation
1110 * belongs to shared memory object, not to vma: so now clear it.
1111 */
1112 if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
1113 vma->vm_flags &= ~VM_ACCOUNT;
1115 /* Can addr have changed??
1117 * Answer: Yes, several device drivers can do it in their
1118 * f_op->mmap method. -DaveM
1119 */
1120 addr = vma->vm_start;
1121 pgoff = vma->vm_pgoff;
1122 vm_flags = vma->vm_flags;
1124 if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
1125 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1126 file = vma->vm_file;
1127 vma_link(mm, vma, prev, rb_link, rb_parent);
1128 if (correct_wcount)
1129 atomic_inc(&inode->i_writecount);
1130 } else {
1131 if (file) {
1132 if (correct_wcount)
1133 atomic_inc(&inode->i_writecount);
1134 fput(file);
1135 }
1136 mpol_free(vma_policy(vma));
1137 kmem_cache_free(vm_area_cachep, vma);
1138 }
1139 out:
1140 mm->total_vm += len >> PAGE_SHIFT;
1141 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1142 if (vm_flags & VM_LOCKED) {
1143 mm->locked_vm += len >> PAGE_SHIFT;
1144 make_pages_present(addr, addr + len);
1145 }
1146 if (flags & MAP_POPULATE) {
1147 up_write(&mm->mmap_sem);
1148 sys_remap_file_pages(addr, len, 0,
1149 pgoff, flags & MAP_NONBLOCK);
1150 down_write(&mm->mmap_sem);
1151 }
1152 return addr;
1154 unmap_and_free_vma:
1155 if (correct_wcount)
1156 atomic_inc(&inode->i_writecount);
1157 vma->vm_file = NULL;
1158 fput(file);
1160 /* Undo any partial mapping done by a device driver. */
1161 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1162 charged = 0;
1163 free_vma:
1164 kmem_cache_free(vm_area_cachep, vma);
1165 unacct_error:
1166 if (charged)
1167 vm_unacct_memory(charged);
1168 return error;
1169 }
1171 EXPORT_SYMBOL(do_mmap_pgoff);
1173 /* Get an address range which is currently unmapped.
1174 * For shmat() with addr=0.
1176 * Ugly calling convention alert:
1177 * Return value with the low bits set means error value,
1178 * ie
1179 * if (ret & ~PAGE_MASK)
1180 * error = ret;
1182 * This function "knows" that -ENOMEM has the bits set.
1183 */
1184 #ifndef HAVE_ARCH_UNMAPPED_AREA
1185 unsigned long
1186 arch_get_unmapped_area(struct file *filp, unsigned long addr,
1187 unsigned long len, unsigned long pgoff, unsigned long flags)
1189 struct mm_struct *mm = current->mm;
1190 struct vm_area_struct *vma;
1191 unsigned long start_addr;
1193 if (len > TASK_SIZE)
1194 return -ENOMEM;
1196 if (addr) {
1197 addr = PAGE_ALIGN(addr);
1198 vma = find_vma(mm, addr);
1199 if (TASK_SIZE - len >= addr &&
1200 (!vma || addr + len <= vma->vm_start))
1201 return addr;
1203 if (len > mm->cached_hole_size) {
1204 start_addr = addr = mm->free_area_cache;
1205 } else {
1206 start_addr = addr = TASK_UNMAPPED_BASE;
1207 mm->cached_hole_size = 0;
1210 full_search:
1211 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
1212 /* At this point: (!vma || addr < vma->vm_end). */
1213 if (TASK_SIZE - len < addr) {
1214 /*
1215 * Start a new search - just in case we missed
1216 * some holes.
1217 */
1218 if (start_addr != TASK_UNMAPPED_BASE) {
1219 addr = TASK_UNMAPPED_BASE;
1220 start_addr = addr;
1221 mm->cached_hole_size = 0;
1222 goto full_search;
1224 return -ENOMEM;
1226 if (!vma || addr + len <= vma->vm_start) {
1227 /*
1228 * Remember the place where we stopped the search:
1229 */
1230 mm->free_area_cache = addr + len;
1231 return addr;
1233 if (addr + mm->cached_hole_size < vma->vm_start)
1234 mm->cached_hole_size = vma->vm_start - addr;
1235 addr = vma->vm_end;
1238 #endif
1240 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1242 /*
1243 * Is this a new hole at the lowest possible address?
1244 */
1245 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
1246 mm->free_area_cache = addr;
1247 mm->cached_hole_size = ~0UL;
1251 /*
1252 * This mmap-allocator allocates new areas top-down from below the
1253 * stack's low limit (the base):
1254 */
1255 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1256 unsigned long
1257 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1258 const unsigned long len, const unsigned long pgoff,
1259 const unsigned long flags)
1261 struct vm_area_struct *vma;
1262 struct mm_struct *mm = current->mm;
1263 unsigned long addr = addr0;
1265 /* requested length too big for entire address space */
1266 if (len > TASK_SIZE)
1267 return -ENOMEM;
1269 /* requesting a specific address */
1270 if (addr) {
1271 addr = PAGE_ALIGN(addr);
1272 vma = find_vma(mm, addr);
1273 if (TASK_SIZE - len >= addr &&
1274 (!vma || addr + len <= vma->vm_start))
1275 return addr;
1278 /* check if free_area_cache is useful for us */
1279 if (len <= mm->cached_hole_size) {
1280 mm->cached_hole_size = 0;
1281 mm->free_area_cache = mm->mmap_base;
1284 /* either no address requested or can't fit in requested address hole */
1285 addr = mm->free_area_cache;
1287 /* make sure it can fit in the remaining address space */
1288 if (addr > len) {
1289 vma = find_vma(mm, addr-len);
1290 if (!vma || addr <= vma->vm_start)
1291 /* remember the address as a hint for next time */
1292 return (mm->free_area_cache = addr-len);
1295 if (mm->mmap_base < len)
1296 goto bottomup;
1298 addr = mm->mmap_base-len;
1300 do {
1301 /*
1302 * Lookup failure means no vma is above this address,
1303 * else if new region fits below vma->vm_start,
1304 * return with success:
1305 */
1306 vma = find_vma(mm, addr);
1307 if (!vma || addr+len <= vma->vm_start)
1308 /* remember the address as a hint for next time */
1309 return (mm->free_area_cache = addr);
1311 /* remember the largest hole we saw so far */
1312 if (addr + mm->cached_hole_size < vma->vm_start)
1313 mm->cached_hole_size = vma->vm_start - addr;
1315 /* try just below the current vma->vm_start */
1316 addr = vma->vm_start-len;
1317 } while (len < vma->vm_start);
1319 bottomup:
1320 /*
1321 * A failed mmap() very likely causes application failure,
1322 * so fall back to the bottom-up function here. This scenario
1323 * can happen with large stack limits and large mmap()
1324 * allocations.
1325 */
1326 mm->cached_hole_size = ~0UL;
1327 mm->free_area_cache = TASK_UNMAPPED_BASE;
1328 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
1329 /*
1330 * Restore the topdown base:
1331 */
1332 mm->free_area_cache = mm->mmap_base;
1333 mm->cached_hole_size = ~0UL;
1335 return addr;
1337 #endif
1339 void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1341 /*
1342 * Is this a new hole at the highest possible address?
1343 */
1344 if (addr > mm->free_area_cache)
1345 mm->free_area_cache = addr;
1347 /* dont allow allocations above current base */
1348 if (mm->free_area_cache > mm->mmap_base)
1349 mm->free_area_cache = mm->mmap_base;
1352 unsigned long
1353 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1354 unsigned long pgoff, unsigned long flags)
1356 unsigned long ret;
1358 if (!(flags & MAP_FIXED)) {
1359 unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1361 get_area = current->mm->get_unmapped_area;
1362 if (file && file->f_op && file->f_op->get_unmapped_area)
1363 get_area = file->f_op->get_unmapped_area;
1364 addr = get_area(file, addr, len, pgoff, flags);
1365 if (IS_ERR_VALUE(addr))
1366 return addr;
1369 if (addr > TASK_SIZE - len)
1370 return -ENOMEM;
1371 if (addr & ~PAGE_MASK)
1372 return -EINVAL;
1373 if (file && is_file_hugepages(file)) {
1374 /*
1375 * Check if the given range is hugepage aligned, and
1376 * can be made suitable for hugepages.
1377 */
1378 ret = prepare_hugepage_range(addr, len);
1379 } else {
1380 /*
1381 * Ensure that a normal request is not falling in a
1382 * reserved hugepage range. For some archs like IA-64,
1383 * there is a separate region for hugepages.
1384 */
1385 ret = is_hugepage_only_range(current->mm, addr, len);
1387 if (ret)
1388 return -EINVAL;
1389 return addr;
1390 }
1392 EXPORT_SYMBOL(get_unmapped_area);
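/*
 * User-visible effect of the hint handling above, as a sketch: for
 * mmap(NULL, len, ...) the search starts at TASK_UNMAPPED_BASE (or works
 * top-down from mmap_base); for mmap(hint, len, ...) without MAP_FIXED the
 * page-aligned hint is honoured only if [hint, hint+len) is still free,
 * otherwise the normal search is used; with MAP_FIXED the address is taken
 * as-is and merely range- and alignment-checked here.
 */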
1394 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1395 struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
1397 struct vm_area_struct *vma = NULL;
1399 if (mm) {
1400 /* Check the cache first. */
1401 /* (Cache hit rate is typically around 35%.) */
1402 vma = mm->mmap_cache;
1403 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1404 struct rb_node * rb_node;
1406 rb_node = mm->mm_rb.rb_node;
1407 vma = NULL;
1409 while (rb_node) {
1410 struct vm_area_struct * vma_tmp;
1412 vma_tmp = rb_entry(rb_node,
1413 struct vm_area_struct, vm_rb);
1415 if (vma_tmp->vm_end > addr) {
1416 vma = vma_tmp;
1417 if (vma_tmp->vm_start <= addr)
1418 break;
1419 rb_node = rb_node->rb_left;
1420 } else
1421 rb_node = rb_node->rb_right;
1423 if (vma)
1424 mm->mmap_cache = vma;
1427 return vma;
1428 }
1430 EXPORT_SYMBOL(find_vma);
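/*
 * Usage note: find_vma() returns the first vma with addr < vm_end, which
 * may lie entirely above addr.  A typical caller therefore re-checks the
 * start, along the lines of:
 *
 *	vma = find_vma(mm, addr);
 *	if (!vma || vma->vm_start > addr)
 *		... addr itself is not mapped ...
 */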
1432 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
1433 struct vm_area_struct *
1434 find_vma_prev(struct mm_struct *mm, unsigned long addr,
1435 struct vm_area_struct **pprev)
1437 struct vm_area_struct *vma = NULL, *prev = NULL;
1438 struct rb_node * rb_node;
1439 if (!mm)
1440 goto out;
1442 /* Guard against addr being lower than the first VMA */
1443 vma = mm->mmap;
1445 /* Go through the RB tree quickly. */
1446 rb_node = mm->mm_rb.rb_node;
1448 while (rb_node) {
1449 struct vm_area_struct *vma_tmp;
1450 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1452 if (addr < vma_tmp->vm_end) {
1453 rb_node = rb_node->rb_left;
1454 } else {
1455 prev = vma_tmp;
1456 if (!prev->vm_next || (addr < prev->vm_next->vm_end))
1457 break;
1458 rb_node = rb_node->rb_right;
1462 out:
1463 *pprev = prev;
1464 return prev ? prev->vm_next : vma;
1467 /*
1468 * Verify that the stack growth is acceptable and
1469 * update accounting. This is shared with both the
1470 * grow-up and grow-down cases.
1471 */
1472 static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
1474 struct mm_struct *mm = vma->vm_mm;
1475 struct rlimit *rlim = current->signal->rlim;
1477 /* address space limit tests */
1478 if (!may_expand_vm(mm, grow))
1479 return -ENOMEM;
1481 /* Stack limit test */
1482 if (size > rlim[RLIMIT_STACK].rlim_cur)
1483 return -ENOMEM;
1485 /* mlock limit tests */
1486 if (vma->vm_flags & VM_LOCKED) {
1487 unsigned long locked;
1488 unsigned long limit;
1489 locked = mm->locked_vm + grow;
1490 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
1491 if (locked > limit && !capable(CAP_IPC_LOCK))
1492 return -ENOMEM;
1495 /*
1496 * Overcommit.. This must be the final test, as it will
1497 * update security statistics.
1498 */
1499 if (security_vm_enough_memory(grow))
1500 return -ENOMEM;
1502 /* Ok, everything looks good - let it rip */
1503 mm->total_vm += grow;
1504 if (vma->vm_flags & VM_LOCKED)
1505 mm->locked_vm += grow;
1506 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
1507 return 0;
1510 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
1511 /*
1512 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
1513 * vma is the last one with address > vma->vm_end. Have to extend vma.
1514 */
1515 #ifndef CONFIG_IA64
1516 static inline
1517 #endif
1518 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1520 int error;
1522 if (!(vma->vm_flags & VM_GROWSUP))
1523 return -EFAULT;
1525 /*
1526 * We must make sure the anon_vma is allocated
1527 * so that the anon_vma locking is not a noop.
1528 */
1529 if (unlikely(anon_vma_prepare(vma)))
1530 return -ENOMEM;
1531 anon_vma_lock(vma);
1533 /*
1534 * vma->vm_start/vm_end cannot change under us because the caller
1535 * is required to hold the mmap_sem in read mode. We need the
1536 * anon_vma lock to serialize against concurrent expand_stacks.
1537 */
1538 address += 4 + PAGE_SIZE - 1;
1539 address &= PAGE_MASK;
1540 error = 0;
1542 /* Somebody else might have raced and expanded it already */
1543 if (address > vma->vm_end) {
1544 unsigned long size, grow;
1546 size = address - vma->vm_start;
1547 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1549 error = acct_stack_growth(vma, size, grow);
1550 if (!error)
1551 vma->vm_end = address;
1553 anon_vma_unlock(vma);
1554 return error;
1556 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
1558 #ifdef CONFIG_STACK_GROWSUP
1559 int expand_stack(struct vm_area_struct *vma, unsigned long address)
1561 return expand_upwards(vma, address);
1564 struct vm_area_struct *
1565 find_extend_vma(struct mm_struct *mm, unsigned long addr)
1567 struct vm_area_struct *vma, *prev;
1569 addr &= PAGE_MASK;
1570 vma = find_vma_prev(mm, addr, &prev);
1571 if (vma && (vma->vm_start <= addr))
1572 return vma;
1573 if (!prev || expand_stack(prev, addr))
1574 return NULL;
1575 if (prev->vm_flags & VM_LOCKED) {
1576 make_pages_present(addr, prev->vm_end);
1578 return prev;
1580 #else
1581 /*
1582 * vma is the first one with address < vma->vm_start. Have to extend vma.
1583 */
1584 int expand_stack(struct vm_area_struct *vma, unsigned long address)
1586 int error;
1588 /*
1589 * We must make sure the anon_vma is allocated
1590 * so that the anon_vma locking is not a noop.
1591 */
1592 if (unlikely(anon_vma_prepare(vma)))
1593 return -ENOMEM;
1594 anon_vma_lock(vma);
1596 /*
1597 * vma->vm_start/vm_end cannot change under us because the caller
1598 * is required to hold the mmap_sem in read mode. We need the
1599 * anon_vma lock to serialize against concurrent expand_stacks.
1600 */
1601 address &= PAGE_MASK;
1602 error = 0;
1604 /* Somebody else might have raced and expanded it already */
1605 if (address < vma->vm_start) {
1606 unsigned long size, grow;
1608 size = vma->vm_end - address;
1609 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1611 error = acct_stack_growth(vma, size, grow);
1612 if (!error) {
1613 vma->vm_start = address;
1614 vma->vm_pgoff -= grow;
1617 anon_vma_unlock(vma);
1618 return error;
1621 struct vm_area_struct *
1622 find_extend_vma(struct mm_struct * mm, unsigned long addr)
1624 struct vm_area_struct * vma;
1625 unsigned long start;
1627 addr &= PAGE_MASK;
1628 vma = find_vma(mm,addr);
1629 if (!vma)
1630 return NULL;
1631 if (vma->vm_start <= addr)
1632 return vma;
1633 if (!(vma->vm_flags & VM_GROWSDOWN))
1634 return NULL;
1635 start = vma->vm_start;
1636 if (expand_stack(vma, addr))
1637 return NULL;
1638 if (vma->vm_flags & VM_LOCKED) {
1639 make_pages_present(addr, start);
1641 return vma;
1643 #endif
1645 /*
1646 * Ok - we have the memory areas we should free on the vma list,
1647 * so release them, and do the vma updates.
1649 * Called with the mm semaphore held.
1650 */
1651 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1653 /* Update high watermark before we lower total_vm */
1654 update_hiwater_vm(mm);
1655 do {
1656 long nrpages = vma_pages(vma);
1658 mm->total_vm -= nrpages;
1659 if (vma->vm_flags & VM_LOCKED)
1660 mm->locked_vm -= nrpages;
1661 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1662 vma = remove_vma(vma);
1663 } while (vma);
1664 validate_mm(mm);
1667 /*
1668 * Get rid of page table information in the indicated region.
1670 * Called with the mm semaphore held.
1671 */
1672 static void unmap_region(struct mm_struct *mm,
1673 struct vm_area_struct *vma, struct vm_area_struct *prev,
1674 unsigned long start, unsigned long end)
1676 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1677 struct mmu_gather *tlb;
1678 unsigned long nr_accounted = 0;
1680 lru_add_drain();
1681 tlb = tlb_gather_mmu(mm, 0);
1682 update_hiwater_rss(mm);
1683 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1684 vm_unacct_memory(nr_accounted);
1685 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1686 next? next->vm_start: 0);
1687 tlb_finish_mmu(tlb, start, end);
1690 static inline void unmap_vma(struct vm_area_struct *vma)
1692 if (unlikely(vma->vm_ops && vma->vm_ops->unmap))
1693 vma->vm_ops->unmap(vma);
1696 /*
1697 * Create a list of vma's touched by the unmap, removing them from the mm's
1698 * vma list as we go..
1699 */
1700 static void
1701 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1702 struct vm_area_struct *prev, unsigned long end)
1704 struct vm_area_struct **insertion_point;
1705 struct vm_area_struct *tail_vma = NULL;
1706 unsigned long addr;
1708 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1709 do {
1710 rb_erase(&vma->vm_rb, &mm->mm_rb);
1711 unmap_vma(vma);
1712 mm->map_count--;
1713 tail_vma = vma;
1714 vma = vma->vm_next;
1715 } while (vma && vma->vm_start < end);
1716 *insertion_point = vma;
1717 tail_vma->vm_next = NULL;
1718 if (mm->unmap_area == arch_unmap_area)
1719 addr = prev ? prev->vm_end : mm->mmap_base;
1720 else
1721 addr = vma ? vma->vm_start : mm->mmap_base;
1722 mm->unmap_area(mm, addr);
1723 mm->mmap_cache = NULL; /* Kill the cache. */
1726 /*
1727 * Split a vma into two pieces at address 'addr', a new vma is allocated
1728 * either for the first part or the the tail.
1729 */
1730 int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1731 unsigned long addr, int new_below)
1733 struct mempolicy *pol;
1734 struct vm_area_struct *new;
1736 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
1737 return -EINVAL;
1739 if (mm->map_count >= sysctl_max_map_count)
1740 return -ENOMEM;
1742 new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1743 if (!new)
1744 return -ENOMEM;
1746 /* most fields are the same, copy all, and then fixup */
1747 *new = *vma;
1749 if (new_below)
1750 new->vm_end = addr;
1751 else {
1752 new->vm_start = addr;
1753 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1756 pol = mpol_copy(vma_policy(vma));
1757 if (IS_ERR(pol)) {
1758 kmem_cache_free(vm_area_cachep, new);
1759 return PTR_ERR(pol);
1761 vma_set_policy(new, pol);
1763 if (new->vm_file)
1764 get_file(new->vm_file);
1766 if (new->vm_ops && new->vm_ops->open)
1767 new->vm_ops->open(new);
1769 if (new_below)
1770 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1771 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1772 else
1773 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1775 return 0;
1778 /* Munmap is split into 2 main parts -- this part which finds
1779 * what needs doing, and the areas themselves, which do the
1780 * work. This now handles partial unmappings.
1781 * Jeremy Fitzhardinge <jeremy@goop.org>
1782 */
1783 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1785 unsigned long end;
1786 struct vm_area_struct *vma, *prev, *last;
1788 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
1789 return -EINVAL;
1791 if ((len = PAGE_ALIGN(len)) == 0)
1792 return -EINVAL;
1794 /* Find the first overlapping VMA */
1795 vma = find_vma_prev(mm, start, &prev);
1796 if (!vma)
1797 return 0;
1798 /* we have start < vma->vm_end */
1800 /* if it doesn't overlap, we have nothing.. */
1801 end = start + len;
1802 if (vma->vm_start >= end)
1803 return 0;
1805 /*
1806 * If we need to split any vma, do it now to save pain later.
1808 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
1809 * unmapped vm_area_struct will remain in use: so lower split_vma
1810 * places tmp vma above, and higher split_vma places tmp vma below.
1811 */
1812 if (start > vma->vm_start) {
1813 int error = split_vma(mm, vma, start, 0);
1814 if (error)
1815 return error;
1816 prev = vma;
1819 /* Does it split the last one? */
1820 last = find_vma(mm, end);
1821 if (last && end > last->vm_start) {
1822 int error = split_vma(mm, last, end, 1);
1823 if (error)
1824 return error;
1826 vma = prev? prev->vm_next: mm->mmap;
1828 /*
1829 * Remove the vma's, and unmap the actual pages
1830 */
1831 detach_vmas_to_be_unmapped(mm, vma, prev, end);
1832 unmap_region(mm, vma, prev, start, end);
1834 /* Fix up all other VM information */
1835 remove_vma_list(mm, vma);
1837 return 0;
1838 }
1840 EXPORT_SYMBOL(do_munmap);
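/*
 * Example of the partial-unmap path above, with hypothetical addresses:
 * for a single three-page vma [0x10000, 0x13000), munmap(0x11000, 0x1000)
 * first split_vma()s at 0x11000, then at 0x12000, and finally detaches and
 * unmaps only the middle [0x11000, 0x12000) piece, leaving two vmas behind.
 */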
1842 asmlinkage long sys_munmap(unsigned long addr, size_t len)
1844 int ret;
1845 struct mm_struct *mm = current->mm;
1847 profile_munmap(addr);
1849 down_write(&mm->mmap_sem);
1850 ret = do_munmap(mm, addr, len);
1851 up_write(&mm->mmap_sem);
1852 return ret;
1855 static inline void verify_mm_writelocked(struct mm_struct *mm)
1857 #ifdef CONFIG_DEBUG_VM
1858 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
1859 WARN_ON(1);
1860 up_read(&mm->mmap_sem);
1862 #endif
1865 /*
1866 * this is really a simplified "do_mmap". it only handles
1867 * anonymous maps. eventually we may be able to do some
1868 * brk-specific accounting here.
1869 */
1870 unsigned long do_brk(unsigned long addr, unsigned long len)
1872 struct mm_struct * mm = current->mm;
1873 struct vm_area_struct * vma, * prev;
1874 unsigned long flags;
1875 struct rb_node ** rb_link, * rb_parent;
1876 pgoff_t pgoff = addr >> PAGE_SHIFT;
1877 int error;
1879 len = PAGE_ALIGN(len);
1880 if (!len)
1881 return addr;
1883 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
1884 return -EINVAL;
1886 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1888 error = arch_mmap_check(addr, len, flags);
1889 if (error)
1890 return error;
1892 /*
1893 * mlock MCL_FUTURE?
1894 */
1895 if (mm->def_flags & VM_LOCKED) {
1896 unsigned long locked, lock_limit;
1897 locked = len >> PAGE_SHIFT;
1898 locked += mm->locked_vm;
1899 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1900 lock_limit >>= PAGE_SHIFT;
1901 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1902 return -EAGAIN;
1905 /*
1906 * mm->mmap_sem is required to protect against another thread
1907 * changing the mappings in case we sleep.
1908 */
1909 verify_mm_writelocked(mm);
1911 /*
1912 * Clear old maps. this also does some error checking for us
1913 */
1914 munmap_back:
1915 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1916 if (vma && vma->vm_start < addr + len) {
1917 if (do_munmap(mm, addr, len))
1918 return -ENOMEM;
1919 goto munmap_back;
1922 /* Check against address space limits *after* clearing old maps... */
1923 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1924 return -ENOMEM;
1926 if (mm->map_count > sysctl_max_map_count)
1927 return -ENOMEM;
1929 if (security_vm_enough_memory(len >> PAGE_SHIFT))
1930 return -ENOMEM;
1932 /* Can we just expand an old private anonymous mapping? */
1933 if (vma_merge(mm, prev, addr, addr + len, flags,
1934 NULL, NULL, pgoff, NULL))
1935 goto out;
1937 /*
1938 * create a vma struct for an anonymous mapping
1939 */
1940 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1941 if (!vma) {
1942 vm_unacct_memory(len >> PAGE_SHIFT);
1943 return -ENOMEM;
1946 vma->vm_mm = mm;
1947 vma->vm_start = addr;
1948 vma->vm_end = addr + len;
1949 vma->vm_pgoff = pgoff;
1950 vma->vm_flags = flags;
1951 vma->vm_page_prot = protection_map[flags &
1952 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1953 vma_link(mm, vma, prev, rb_link, rb_parent);
1954 out:
1955 mm->total_vm += len >> PAGE_SHIFT;
1956 if (flags & VM_LOCKED) {
1957 mm->locked_vm += len >> PAGE_SHIFT;
1958 make_pages_present(addr, addr + len);
1960 return addr;
1963 EXPORT_SYMBOL(do_brk);
1965 /* Release all mmaps. */
1966 void exit_mmap(struct mm_struct *mm)
1968 struct mmu_gather *tlb;
1969 struct vm_area_struct *vma_tmp, *vma = mm->mmap;
1970 unsigned long nr_accounted = 0;
1971 unsigned long end;
1973 #ifdef arch_exit_mmap
1974 arch_exit_mmap(mm);
1975 #endif
1977 for (vma_tmp = mm->mmap; vma_tmp; vma_tmp = vma_tmp->vm_next)
1978 unmap_vma(vma_tmp);
1980 lru_add_drain();
1981 flush_cache_mm(mm);
1982 tlb = tlb_gather_mmu(mm, 1);
1983 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1984 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1985 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
1986 vm_unacct_memory(nr_accounted);
1987 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1988 tlb_finish_mmu(tlb, 0, end);
1990 /*
1991 * Walk the list again, actually closing and freeing it,
1992 * with preemption enabled, without holding any MM locks.
1993 */
1994 while (vma)
1995 vma = remove_vma(vma);
1997 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2000 /* Insert vm structure into process list sorted by address
2001 * and into the inode's i_mmap tree. If vm_file is non-NULL
2002 * then i_mmap_lock is taken here.
2003 */
2004 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2006 struct vm_area_struct * __vma, * prev;
2007 struct rb_node ** rb_link, * rb_parent;
2009 /*
2010 * The vm_pgoff of a purely anonymous vma should be irrelevant
2011 * until its first write fault, when page's anon_vma and index
2012 * are set. But now set the vm_pgoff it will almost certainly
2013 * end up with (unless mremap moves it elsewhere before that
2014 * first wfault), so /proc/pid/maps tells a consistent story.
2016 * By setting it to reflect the virtual start address of the
2017 * vma, merges and splits can happen in a seamless way, just
2018 * using the existing file pgoff checks and manipulations.
2019 * Similarly in do_mmap_pgoff and in do_brk.
2020 */
2021 if (!vma->vm_file) {
2022 BUG_ON(vma->anon_vma);
2023 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2025 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
2026 if (__vma && __vma->vm_start < vma->vm_end)
2027 return -ENOMEM;
2028 if ((vma->vm_flags & VM_ACCOUNT) &&
2029 security_vm_enough_memory(vma_pages(vma)))
2030 return -ENOMEM;
2031 vma_link(mm, vma, prev, rb_link, rb_parent);
2032 return 0;
2035 /*
2036 * Copy the vma structure to a new location in the same mm,
2037 * prior to moving page table entries, to effect an mremap move.
2038 */
2039 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2040 unsigned long addr, unsigned long len, pgoff_t pgoff)
2042 struct vm_area_struct *vma = *vmap;
2043 unsigned long vma_start = vma->vm_start;
2044 struct mm_struct *mm = vma->vm_mm;
2045 struct vm_area_struct *new_vma, *prev;
2046 struct rb_node **rb_link, *rb_parent;
2047 struct mempolicy *pol;
2049 /*
2050 * If anonymous vma has not yet been faulted, update new pgoff
2051 * to match new location, to increase its chance of merging.
2052 */
2053 if (!vma->vm_file && !vma->anon_vma)
2054 pgoff = addr >> PAGE_SHIFT;
2056 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
2057 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2058 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2059 if (new_vma) {
2060 /*
2061 * Source vma may have been merged into new_vma
2062 */
2063 if (vma_start >= new_vma->vm_start &&
2064 vma_start < new_vma->vm_end)
2065 *vmap = new_vma;
2066 } else {
2067 new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2068 if (new_vma) {
2069 *new_vma = *vma;
2070 pol = mpol_copy(vma_policy(vma));
2071 if (IS_ERR(pol)) {
2072 kmem_cache_free(vm_area_cachep, new_vma);
2073 return NULL;
2075 vma_set_policy(new_vma, pol);
2076 new_vma->vm_start = addr;
2077 new_vma->vm_end = addr + len;
2078 new_vma->vm_pgoff = pgoff;
2079 if (new_vma->vm_file)
2080 get_file(new_vma->vm_file);
2081 if (new_vma->vm_ops && new_vma->vm_ops->open)
2082 new_vma->vm_ops->open(new_vma);
2083 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2086 return new_vma;
2089 /*
2090 * Return true if the calling process may expand its vm space by the passed
2091 * number of pages
2092 */
2093 int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2095 unsigned long cur = mm->total_vm; /* pages */
2096 unsigned long lim;
2098 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
2100 if (cur + npages > lim)
2101 return 0;
2102 return 1;
2103 }