ia64/xen-unstable

changeset 4669:6c0dd2c2ca58

bitkeeper revision 1.1385.1.6 (426fa4d8AQ8dQHrihxXpPojuqYnL0g)

Improve multi-processor XenLinux fork/exec/destroy times. We do this
by lazily pinning page tables on their first use as page tables, and
aggressively unpinning them on last use, putting as little pressure as
possible on the batched writable-pagetable (wrpt) interface. In practice
this means that the page-table copy and destroy loops can usually write
page tables directly, with no Xen intervention at all (implicit or
explicit). An illustrative sketch of the pin/unpin lifecycle follows the
file list below.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Apr 27 14:42:32 2005 +0000 (2005-04-27)
parents 435ff7469fc1
children 98d5be103415 251ac792d8c1
files .rootkeys linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h linux-2.6.11-xen-sparse/mm/mmap.c xen/arch/x86/mm.c
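The description above amounts to a small state machine per mm: page tables start
out as ordinary writable memory, are made read-only and pinned by Xen only once
the mm is actually loaded into cr3 (switch_mm calling mm_pin), and are unpinned
again as soon as the address space is being torn down (_arch_exit_mmap calling
mm_unpin), so both the fork-time copy loop and the exit-time destroy loop write
them directly. The following standalone, user-space C sketch models that
lifecycle. It is illustrative only: the struct layouts are trimmed, and
fake_hypercall() and the main() driver are invented stand-ins for the real
xen_pgd_pin/xen_pgd_unpin and HYPERVISOR_update_va_mapping traffic in the patch.

/*
 * User-space sketch (not kernel code) of the lazy-pin / eager-unpin
 * lifecycle introduced by this changeset.
 */
#include <stdio.h>
#include <stdbool.h>

struct mm_context { bool pinned; };          /* mirrors context.pinned */
struct mm_struct  { struct mm_context context; };

/* Stand-in for the pin/unpin and update_va_mapping hypercalls. */
static void fake_hypercall(const char *what)
{
    printf("  hypercall: %s\n", what);
}

/* Pin on first use as a page-table base (called from switch_mm()). */
static void mm_pin(struct mm_struct *mm)
{
    if (mm->context.pinned)
        return;
    fake_hypercall("make page tables read-only, then pin the pgd");
    mm->context.pinned = true;
}

/* Unpin on last use (called from _arch_exit_mmap()) so that teardown
 * writes the page tables directly, with no Xen validation. */
static void mm_unpin(struct mm_struct *mm)
{
    if (!mm->context.pinned)
        return;
    fake_hypercall("unpin the pgd, then make page tables writable");
    mm->context.pinned = false;
}

/* Show which path a page-table write takes in each phase. */
static void write_ptes(const struct mm_struct *mm, const char *phase)
{
    if (mm->context.pinned)
        printf("  %s: batched wrpt/hypercall path (tables pinned)\n", phase);
    else
        printf("  %s: plain memory writes, no Xen involvement\n", phase);
}

int main(void)
{
    struct mm_struct child = { .context = { .pinned = false } };

    puts("fork: copy page tables into the new mm");
    write_ptes(&child, "copy loop");      /* unpinned: direct writes */

    puts("first context switch to the child");
    mm_pin(&child);                       /* lazy pin */

    puts("exit: exit_mmap() tears the mm down");
    mm_unpin(&child);                     /* eager unpin on last use */
    write_ptes(&child, "destroy loop");   /* unpinned again: direct writes */
    return 0;
}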
line diff
     1.1 --- a/.rootkeys	Wed Apr 27 10:39:11 2005 +0000
     1.2 +++ b/.rootkeys	Wed Apr 27 14:42:32 2005 +0000
     1.3 @@ -351,6 +351,7 @@ 40f5623aKXkBBxgpLx2NcvkncQ1Yyw linux-2.6
     1.4  40f5623aDMCsWOFO0jktZ4e8sjwvEg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h
     1.5  40f5623arsFXkGdPvIqvFi3yFXGR0Q linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h
     1.6  41811f07Iri9hrvs97t-baxmhOwWDQ linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h
     1.7 +426fa4d7RzvcFMqff_M76HrvRQZHSg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h
     1.8  4120f807GCO0uqsLqdZj9csxR1Wthw linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
     1.9  40f5623adgjZq9nAgCt0IXdWl7udSA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
    1.10  40f5623a54NuG-7qHihGYmw4wWQnMA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/param.h
    1.11 @@ -418,6 +419,7 @@ 419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6
    1.12  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.11-xen-sparse/mkbuildtree
    1.13  42305f54Q6xJ1bXcQJlCQq1m-e2C8g linux-2.6.11-xen-sparse/mm/highmem.c
    1.14  412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.11-xen-sparse/mm/memory.c
    1.15 +426fa4d7ooLYmFcFjJMF_ut4GFVh2Q linux-2.6.11-xen-sparse/mm/mmap.c
    1.16  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.11-xen-sparse/mm/page_alloc.c
    1.17  413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
    1.18  413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
     2.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c	Wed Apr 27 10:39:11 2005 +0000
     2.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c	Wed Apr 27 14:42:32 2005 +0000
     2.3 @@ -100,8 +100,8 @@ int init_new_context(struct task_struct 
     2.4  	struct mm_struct * old_mm;
     2.5  	int retval = 0;
     2.6  
     2.7 +	memset(&mm->context, 0, sizeof(mm->context));
     2.8  	init_MUTEX(&mm->context.sem);
     2.9 -	mm->context.size = 0;
    2.10  	old_mm = current->mm;
    2.11  	if (old_mm && old_mm->context.size > 0) {
    2.12  		down(&old_mm->context.sem);
     3.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c	Wed Apr 27 10:39:11 2005 +0000
     3.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c	Wed Apr 27 14:42:32 2005 +0000
     3.3 @@ -710,18 +710,9 @@ void __init mem_init(void)
     3.4  
     3.5  kmem_cache_t *pgd_cache;
     3.6  kmem_cache_t *pmd_cache;
     3.7 -kmem_cache_t *pte_cache;
     3.8  
     3.9  void __init pgtable_cache_init(void)
    3.10  {
    3.11 -	pte_cache = kmem_cache_create("pte",
    3.12 -				PTRS_PER_PTE*sizeof(pte_t),
    3.13 -				PTRS_PER_PTE*sizeof(pte_t),
    3.14 -				0,
    3.15 -				pte_ctor,
    3.16 -				pte_dtor);
    3.17 -	if (!pte_cache)
    3.18 -		panic("pgtable_cache_init(): Cannot create pte cache");
    3.19  	if (PTRS_PER_PMD > 1) {
    3.20  		pmd_cache = kmem_cache_create("pmd",
    3.21  					PTRS_PER_PMD*sizeof(pmd_t),
     4.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Wed Apr 27 10:39:11 2005 +0000
     4.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Wed Apr 27 14:42:32 2005 +0000
     4.3 @@ -198,59 +198,35 @@ pte_t *pte_alloc_one_kernel(struct mm_st
     4.4  	return pte;
     4.5  }
     4.6  
     4.7 -void pte_ctor(void *pte, kmem_cache_t *cache, unsigned long unused)
     4.8 -{
     4.9 -	struct page *page = virt_to_page(pte);
    4.10 -	SetPageForeign(page, pte_free);
    4.11 -	set_page_count(page, 1);
    4.12 -
    4.13 -	clear_page(pte);
    4.14 -	make_page_readonly(pte);
    4.15 -	xen_pte_pin(__pa(pte));
    4.16 -}
    4.17 -
    4.18 -void pte_dtor(void *pte, kmem_cache_t *cache, unsigned long unused)
    4.19 -{
    4.20 -	struct page *page = virt_to_page(pte);
    4.21 -	ClearPageForeign(page);
    4.22 -
    4.23 -	xen_pte_unpin(__pa(pte));
    4.24 -	make_page_writable(pte);
    4.25 -}
    4.26 -
    4.27  struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
    4.28  {
    4.29 -	pte_t *ptep;
    4.30 -
    4.31 -#ifdef CONFIG_HIGHPTE
    4.32  	struct page *pte;
    4.33  
    4.34 +#ifdef CONFIG_HIGHPTE
    4.35  	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
    4.36 -	if (pte == NULL)
    4.37 -		return pte;
    4.38 -	if (PageHighMem(pte))
    4.39 -		return pte;
    4.40 -	/* not a highmem page -- free page and grab one from the cache */
    4.41 -	__free_page(pte);
    4.42 +#else
    4.43 +	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
    4.44 +	if (pte) {
    4.45 +		SetPageForeign(pte, pte_free);
    4.46 +		set_page_count(pte, 1);
    4.47 +	}
    4.48  #endif
    4.49 -	ptep = kmem_cache_alloc(pte_cache, GFP_KERNEL);
    4.50 -	if (ptep)
    4.51 -		return virt_to_page(ptep);
    4.52 -	return NULL;
    4.53 +
    4.54 +	return pte;
    4.55  }
    4.56  
    4.57  void pte_free(struct page *pte)
    4.58  {
    4.59 +	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
    4.60 +
    4.61 +	if (!pte_write(*virt_to_ptep(va)))
    4.62 +		HYPERVISOR_update_va_mapping(
    4.63 +			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
    4.64 +
    4.65 +	ClearPageForeign(pte);
    4.66  	set_page_count(pte, 1);
    4.67 -#ifdef CONFIG_HIGHPTE
    4.68 -	if (!PageHighMem(pte))
    4.69 -#endif
    4.70 -		kmem_cache_free(pte_cache,
    4.71 -				phys_to_virt(page_to_pseudophys(pte)));
    4.72 -#ifdef CONFIG_HIGHPTE
    4.73 -	else
    4.74 -		__free_page(pte);
    4.75 -#endif
    4.76 +
    4.77 +	__free_page(pte);
    4.78  }
    4.79  
    4.80  void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
    4.81 @@ -305,14 +281,11 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
    4.82  			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    4.83  
    4.84  	if (PTRS_PER_PMD > 1)
    4.85 -		goto out;
    4.86 +		return;
    4.87  
    4.88  	pgd_list_add(pgd);
    4.89  	spin_unlock_irqrestore(&pgd_lock, flags);
    4.90  	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
    4.91 - out:
    4.92 -	make_page_readonly(pgd);
    4.93 -	xen_pgd_pin(__pa(pgd));
    4.94  }
    4.95  
    4.96  /* never called when PTRS_PER_PMD > 1 */
    4.97 @@ -320,9 +293,6 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
    4.98  {
    4.99  	unsigned long flags; /* can be called from interrupt context */
   4.100  
   4.101 -	xen_pgd_unpin(__pa(pgd));
   4.102 -	make_page_writable(pgd);
   4.103 -
   4.104  	if (PTRS_PER_PMD > 1)
   4.105  		return;
   4.106  
   4.107 @@ -357,6 +327,15 @@ out_oom:
   4.108  void pgd_free(pgd_t *pgd)
   4.109  {
   4.110  	int i;
   4.111 +	pte_t *ptep = virt_to_ptep(pgd);
   4.112 +
   4.113 +	if (!pte_write(*ptep)) {
   4.114 +		xen_pgd_unpin(__pa(pgd));
   4.115 +		HYPERVISOR_update_va_mapping(
   4.116 +			(unsigned long)pgd,
   4.117 +			pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
   4.118 +			0);
   4.119 +	}
   4.120  
   4.121  	/* in the PAE case user pgd entries are overwritten before usage */
   4.122  	if (PTRS_PER_PMD > 1)
   4.123 @@ -369,28 +348,19 @@ void pgd_free(pgd_t *pgd)
   4.124  #ifndef CONFIG_XEN_SHADOW_MODE
   4.125  void make_lowmem_page_readonly(void *va)
   4.126  {
   4.127 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   4.128 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   4.129 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   4.130 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   4.131 +	pte_t *pte = virt_to_ptep(va);
   4.132  	set_pte(pte, pte_wrprotect(*pte));
   4.133  }
   4.134  
   4.135  void make_lowmem_page_writable(void *va)
   4.136  {
   4.137 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   4.138 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   4.139 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   4.140 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   4.141 +	pte_t *pte = virt_to_ptep(va);
   4.142  	set_pte(pte, pte_mkwrite(*pte));
   4.143  }
   4.144  
   4.145  void make_page_readonly(void *va)
   4.146  {
   4.147 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   4.148 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   4.149 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   4.150 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   4.151 +	pte_t *pte = virt_to_ptep(va);
   4.152  	set_pte(pte, pte_wrprotect(*pte));
   4.153  	if ( (unsigned long)va >= (unsigned long)high_memory )
   4.154  	{
   4.155 @@ -405,10 +375,7 @@ void make_page_readonly(void *va)
   4.156  
   4.157  void make_page_writable(void *va)
   4.158  {
   4.159 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   4.160 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   4.161 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   4.162 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   4.163 +	pte_t *pte = virt_to_ptep(va);
   4.164  	set_pte(pte, pte_mkwrite(*pte));
   4.165  	if ( (unsigned long)va >= (unsigned long)high_memory )
   4.166  	{
   4.167 @@ -439,3 +406,91 @@ void make_pages_writable(void *va, unsig
   4.168  	}
   4.169  }
   4.170  #endif /* CONFIG_XEN_SHADOW_MODE */
   4.171 +
   4.172 +void mm_pin(struct mm_struct *mm)
   4.173 +{
   4.174 +    pgd_t       *pgd;
   4.175 +    struct page *page;
   4.176 +    int          i;
   4.177 +
   4.178 +    spin_lock(&mm->page_table_lock);
   4.179 +
   4.180 +    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
   4.181 +    {
   4.182 +        if ( *(unsigned long *)pgd == 0 )
   4.183 +            continue;
   4.184 +        page = pmd_page(*(pmd_t *)pgd);
   4.185 +        if ( !PageHighMem(page) )
   4.186 +            HYPERVISOR_update_va_mapping(
   4.187 +                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
   4.188 +                pfn_pte(page_to_pfn(page), PAGE_KERNEL_RO), 0);
   4.189 +    }
   4.190 +
   4.191 +    HYPERVISOR_update_va_mapping(
   4.192 +        (unsigned long)mm->pgd,
   4.193 +        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), 0);
   4.194 +    xen_pgd_pin(__pa(mm->pgd));
   4.195 +
   4.196 +    mm->context.pinned = 1;
   4.197 +
   4.198 +    spin_unlock(&mm->page_table_lock);
   4.199 +}
   4.200 +
   4.201 +void mm_unpin(struct mm_struct *mm)
   4.202 +{
   4.203 +    pgd_t       *pgd;
   4.204 +    struct page *page;
   4.205 +    int          i;
   4.206 +
   4.207 +    spin_lock(&mm->page_table_lock);
   4.208 +
   4.209 +    xen_pgd_unpin(__pa(mm->pgd));
   4.210 +    HYPERVISOR_update_va_mapping(
   4.211 +        (unsigned long)mm->pgd,
   4.212 +        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
   4.213 +
   4.214 +    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
   4.215 +    {
   4.216 +        if ( *(unsigned long *)pgd == 0 )
   4.217 +            continue;
   4.218 +        page = pmd_page(*(pmd_t *)pgd);
   4.219 +        if ( !PageHighMem(page) )
   4.220 +            HYPERVISOR_update_va_mapping(
   4.221 +                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
   4.222 +                pfn_pte(page_to_pfn(page), PAGE_KERNEL), 0);
   4.223 +    }
   4.224 +
   4.225 +    mm->context.pinned = 0;
   4.226 +
   4.227 +    spin_unlock(&mm->page_table_lock);
   4.228 +}
   4.229 +
   4.230 +void _arch_exit_mmap(struct mm_struct *mm)
   4.231 +{
   4.232 +    unsigned int cpu = smp_processor_id();
   4.233 +    struct task_struct *tsk = current;
   4.234 +
   4.235 +    task_lock(tsk);
   4.236 +
   4.237 +    /*
   4.238 +     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
   4.239 +     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
   4.240 +     */
   4.241 +    if ( tsk->active_mm == mm )
   4.242 +    {
   4.243 +        tsk->active_mm = &init_mm;
   4.244 +        atomic_inc(&init_mm.mm_count);
   4.245 +
   4.246 +        cpu_set(cpu, init_mm.cpu_vm_mask);
   4.247 +        load_cr3(swapper_pg_dir);
   4.248 +        cpu_clear(cpu, mm->cpu_vm_mask);
   4.249 +
   4.250 +        atomic_dec(&mm->mm_count);
   4.251 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
   4.252 +    }
   4.253 +
   4.254 +    task_unlock(tsk);
   4.255 +
   4.256 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
   4.257 +        mm_unpin(mm);
   4.258 +}
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h	Wed Apr 27 14:42:32 2005 +0000
     5.3 @@ -0,0 +1,22 @@
     5.4 +#ifndef __i386_MMU_H
     5.5 +#define __i386_MMU_H
     5.6 +
     5.7 +#include <asm/semaphore.h>
     5.8 +/*
     5.9 + * The i386 doesn't have a mmu context, but
    5.10 + * we put the segment information here.
    5.11 + *
    5.12 + * cpu_vm_mask is used to optimize ldt flushing.
    5.13 + */
    5.14 +typedef struct { 
    5.15 +	int size;
    5.16 +	struct semaphore sem;
    5.17 +	void *ldt;
    5.18 +	unsigned pinned:1;
    5.19 +} mm_context_t;
    5.20 +
    5.21 +/* mm/memory.c:exit_mmap hook */
    5.22 +extern void _arch_exit_mmap(struct mm_struct *mm);
    5.23 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
    5.24 +
    5.25 +#endif
     6.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Wed Apr 27 10:39:11 2005 +0000
     6.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Wed Apr 27 14:42:32 2005 +0000
     6.3 @@ -41,6 +41,9 @@ static inline void __prepare_arch_switch
     6.4  		: : "r" (0) );
     6.5  }
     6.6  
     6.7 +extern void mm_pin(struct mm_struct *mm);
     6.8 +extern void mm_unpin(struct mm_struct *mm);
     6.9 +
    6.10  static inline void switch_mm(struct mm_struct *prev,
    6.11  			     struct mm_struct *next,
    6.12  			     struct task_struct *tsk)
    6.13 @@ -49,6 +52,9 @@ static inline void switch_mm(struct mm_s
    6.14  	struct mmuext_op _op[2], *op = _op;
    6.15  
    6.16  	if (likely(prev != next)) {
    6.17 +		if (!next->context.pinned)
    6.18 +			mm_pin(next);
    6.19 +
    6.20  		/* stop flush ipis for the previous mm */
    6.21  		cpu_clear(cpu, prev->cpu_vm_mask);
    6.22  #if 0 /* XEN: no lazy tlb */
    6.23 @@ -92,20 +98,10 @@ static inline void switch_mm(struct mm_s
    6.24  #endif
    6.25  }
    6.26  
    6.27 -/*
    6.28 - * XEN: We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
    6.29 - * *much* faster this way, as no tlb flushes means much bigger wrpt batches.
    6.30 - */
    6.31 -#define deactivate_mm(tsk, mm) do {					\
    6.32 -	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0));			\
    6.33 -	if ((mm) && cpu_isset(smp_processor_id(), (mm)->cpu_vm_mask)) {	\
    6.34 -		cpu_clear(smp_processor_id(), (mm)->cpu_vm_mask);	\
    6.35 -		load_cr3(swapper_pg_dir);				\
    6.36 -	}								\
    6.37 -} while (0)
    6.38 +#define deactivate_mm(tsk, mm) \
    6.39 +	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
    6.40  
    6.41 -#define activate_mm(prev, next) do {		\
    6.42 -	switch_mm((prev),(next),NULL);		\
    6.43 -} while (0)
    6.44 +#define activate_mm(prev, next) \
    6.45 +	switch_mm((prev),(next),NULL)
    6.46  
    6.47  #endif
     7.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Wed Apr 27 10:39:11 2005 +0000
     7.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Wed Apr 27 14:42:32 2005 +0000
     7.3 @@ -11,10 +11,23 @@
     7.4  #define pmd_populate_kernel(mm, pmd, pte) \
     7.5  		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
     7.6  
     7.7 -#define pmd_populate(mm, pmd, pte) 				\
     7.8 -	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
     7.9 -		((unsigned long long)page_to_pfn(pte) <<	\
    7.10 -			(unsigned long long) PAGE_SHIFT)))
    7.11 +#define pmd_populate(mm, pmd, pte) 					\
    7.12 +do {									\
    7.13 +	if (unlikely((mm)->context.pinned)) {				\
    7.14 +		if (!PageHighMem(pte))					\
    7.15 +			HYPERVISOR_update_va_mapping(			\
    7.16 +			  (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
    7.17 +			  pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0);\
    7.18 +		set_pmd(pmd, __pmd(_PAGE_TABLE +			\
    7.19 +			((unsigned long long)page_to_pfn(pte) <<	\
    7.20 +				(unsigned long long) PAGE_SHIFT)));	\
    7.21 +	} else {							\
    7.22 +		*(pmd) = __pmd(_PAGE_TABLE +				\
    7.23 +			((unsigned long long)page_to_pfn(pte) <<	\
    7.24 +				(unsigned long long) PAGE_SHIFT));	\
    7.25 +	}								\
    7.26 +} while (0)
    7.27 +
    7.28  /*
    7.29   * Allocate and free page tables.
    7.30   */
     8.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Apr 27 10:39:11 2005 +0000
     8.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Apr 27 14:42:32 2005 +0000
     8.3 @@ -35,12 +35,9 @@ extern unsigned long empty_zero_page[102
     8.4  extern pgd_t swapper_pg_dir[1024];
     8.5  extern kmem_cache_t *pgd_cache;
     8.6  extern kmem_cache_t *pmd_cache;
     8.7 -extern kmem_cache_t *pte_cache;
     8.8  extern spinlock_t pgd_lock;
     8.9  extern struct page *pgd_list;
    8.10  
    8.11 -void pte_ctor(void *, kmem_cache_t *, unsigned long);
    8.12 -void pte_dtor(void *, kmem_cache_t *, unsigned long);
    8.13  void pmd_ctor(void *, kmem_cache_t *, unsigned long);
    8.14  void pgd_ctor(void *, kmem_cache_t *, unsigned long);
    8.15  void pgd_dtor(void *, kmem_cache_t *, unsigned long);
    8.16 @@ -448,12 +445,17 @@ void make_pages_writable(void *va, unsig
    8.17  #define make_pages_writable(_va, _nr)  ((void)0)
    8.18  #endif
    8.19  
    8.20 -#define arbitrary_virt_to_machine(__va)					\
    8.21 +#define virt_to_ptep(__va)						\
    8.22  ({									\
    8.23  	pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));		\
    8.24  	pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));	\
    8.25  	pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));	\
    8.26 -	pte_t *__pte = pte_offset_kernel(__pmd, (unsigned long)(__va));	\
    8.27 +	pte_offset_kernel(__pmd, (unsigned long)(__va));		\
    8.28 +})
    8.29 +
    8.30 +#define arbitrary_virt_to_machine(__va)					\
    8.31 +({									\
    8.32 +	pte_t *__pte = virt_to_ptep(__va);				\
    8.33  	unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK;	\
    8.34  	__pa | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
    8.35  })
     9.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h	Wed Apr 27 10:39:11 2005 +0000
     9.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h	Wed Apr 27 14:42:32 2005 +0000
     9.3 @@ -40,24 +40,21 @@ extern unsigned long pgkern_mask;
     9.4  
     9.5  static inline void flush_tlb_mm(struct mm_struct *mm)
     9.6  {
     9.7 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
     9.8 -	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
     9.9 +	if (mm == current->active_mm)
    9.10  		__flush_tlb();
    9.11  }
    9.12  
    9.13  static inline void flush_tlb_page(struct vm_area_struct *vma,
    9.14  	unsigned long addr)
    9.15  {
    9.16 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    9.17 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
    9.18 +	if (vma->vm_mm == current->active_mm)
    9.19  		__flush_tlb_one(addr);
    9.20  }
    9.21  
    9.22  static inline void flush_tlb_range(struct vm_area_struct *vma,
    9.23  	unsigned long start, unsigned long end)
    9.24  {
    9.25 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    9.26 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
    9.27 +	if (vma->vm_mm == current->active_mm)
    9.28  		__flush_tlb();
    9.29  }
    9.30  
    10.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h	Wed Apr 27 10:39:11 2005 +0000
    10.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h	Wed Apr 27 14:42:32 2005 +0000
    10.3 @@ -44,24 +44,21 @@ extern unsigned long pgkern_mask;
    10.4  
    10.5  static inline void flush_tlb_mm(struct mm_struct *mm)
    10.6  {
    10.7 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    10.8 -	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
    10.9 +	if (mm == current->active_mm)
   10.10  		__flush_tlb();
   10.11  }
   10.12  
   10.13  static inline void flush_tlb_page(struct vm_area_struct *vma,
   10.14  	unsigned long addr)
   10.15  {
   10.16 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   10.17 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   10.18 +	if (vma->vm_mm == current->active_mm)
   10.19  		__flush_tlb_one(addr);
   10.20  }
   10.21  
   10.22  static inline void flush_tlb_range(struct vm_area_struct *vma,
   10.23  	unsigned long start, unsigned long end)
   10.24  {
   10.25 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   10.26 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   10.27 +	if (vma->vm_mm == current->active_mm)
   10.28  		__flush_tlb();
   10.29  }
   10.30  
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/linux-2.6.11-xen-sparse/mm/mmap.c	Wed Apr 27 14:42:32 2005 +0000
    11.3 @@ -0,0 +1,2108 @@
    11.4 +/*
    11.5 + * mm/mmap.c
    11.6 + *
    11.7 + * Written by obz.
    11.8 + *
    11.9 + * Address space accounting code	<alan@redhat.com>
   11.10 + */
   11.11 +
   11.12 +#include <linux/slab.h>
   11.13 +#include <linux/mm.h>
   11.14 +#include <linux/shm.h>
   11.15 +#include <linux/mman.h>
   11.16 +#include <linux/pagemap.h>
   11.17 +#include <linux/swap.h>
   11.18 +#include <linux/syscalls.h>
   11.19 +#include <linux/init.h>
   11.20 +#include <linux/file.h>
   11.21 +#include <linux/fs.h>
   11.22 +#include <linux/personality.h>
   11.23 +#include <linux/security.h>
   11.24 +#include <linux/hugetlb.h>
   11.25 +#include <linux/profile.h>
   11.26 +#include <linux/module.h>
   11.27 +#include <linux/acct.h>
   11.28 +#include <linux/mount.h>
   11.29 +#include <linux/mempolicy.h>
   11.30 +#include <linux/rmap.h>
   11.31 +
   11.32 +#include <asm/uaccess.h>
   11.33 +#include <asm/cacheflush.h>
   11.34 +#include <asm/tlb.h>
   11.35 +
   11.36 +/*
   11.37 + * WARNING: the debugging will use recursive algorithms so never enable this
   11.38 + * unless you know what you are doing.
   11.39 + */
   11.40 +#undef DEBUG_MM_RB
   11.41 +
   11.42 +/* description of effects of mapping type and prot in current implementation.
   11.43 + * this is due to the limited x86 page protection hardware.  The expected
   11.44 + * behavior is in parens:
   11.45 + *
   11.46 + * map_type	prot
   11.47 + *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
   11.48 + * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   11.49 + *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
   11.50 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   11.51 + *		
   11.52 + * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   11.53 + *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
   11.54 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   11.55 + *
   11.56 + */
   11.57 +pgprot_t protection_map[16] = {
   11.58 +	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
   11.59 +	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
   11.60 +};
   11.61 +
   11.62 +int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
   11.63 +int sysctl_overcommit_ratio = 50;	/* default is 50% */
   11.64 +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
   11.65 +atomic_t vm_committed_space = ATOMIC_INIT(0);
   11.66 +
   11.67 +/*
   11.68 + * Check that a process has enough memory to allocate a new virtual
   11.69 + * mapping. 0 means there is enough memory for the allocation to
   11.70 + * succeed and -ENOMEM implies there is not.
   11.71 + *
   11.72 + * We currently support three overcommit policies, which are set via the
   11.73 + * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
   11.74 + *
   11.75 + * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
   11.76 + * Additional code 2002 Jul 20 by Robert Love.
   11.77 + *
   11.78 + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
   11.79 + *
   11.80 + * Note this is a helper function intended to be used by LSMs which
   11.81 + * wish to use this logic.
   11.82 + */
   11.83 +int __vm_enough_memory(long pages, int cap_sys_admin)
   11.84 +{
   11.85 +	unsigned long free, allowed;
   11.86 +
   11.87 +	vm_acct_memory(pages);
   11.88 +
   11.89 +	/*
   11.90 +	 * Sometimes we want to use more memory than we have
   11.91 +	 */
   11.92 +	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
   11.93 +		return 0;
   11.94 +
   11.95 +	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
   11.96 +		unsigned long n;
   11.97 +
   11.98 +		free = get_page_cache_size();
   11.99 +		free += nr_swap_pages;
  11.100 +
  11.101 +		/*
  11.102 +		 * Any slabs which are created with the
  11.103 +		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
  11.104 +		 * which are reclaimable, under pressure.  The dentry
  11.105 +		 * cache and most inode caches should fall into this
  11.106 +		 */
  11.107 +		free += atomic_read(&slab_reclaim_pages);
  11.108 +
  11.109 +		/*
  11.110 +		 * Leave the last 3% for root
  11.111 +		 */
  11.112 +		if (!cap_sys_admin)
  11.113 +			free -= free / 32;
  11.114 +
  11.115 +		if (free > pages)
  11.116 +			return 0;
  11.117 +
  11.118 +		/*
  11.119 +		 * nr_free_pages() is very expensive on large systems,
  11.120 +		 * only call if we're about to fail.
  11.121 +		 */
  11.122 +		n = nr_free_pages();
  11.123 +		if (!cap_sys_admin)
  11.124 +			n -= n / 32;
  11.125 +		free += n;
  11.126 +
  11.127 +		if (free > pages)
  11.128 +			return 0;
  11.129 +		vm_unacct_memory(pages);
  11.130 +		return -ENOMEM;
  11.131 +	}
  11.132 +
  11.133 +	allowed = (totalram_pages - hugetlb_total_pages())
  11.134 +	       	* sysctl_overcommit_ratio / 100;
  11.135 +	/*
  11.136 +	 * Leave the last 3% for root
  11.137 +	 */
  11.138 +	if (!cap_sys_admin)
  11.139 +		allowed -= allowed / 32;
  11.140 +	allowed += total_swap_pages;
  11.141 +
  11.142 +	/* Don't let a single process grow too big:
  11.143 +	   leave 3% of the size of this process for other processes */
  11.144 +	allowed -= current->mm->total_vm / 32;
  11.145 +
  11.146 +	if (atomic_read(&vm_committed_space) < allowed)
  11.147 +		return 0;
  11.148 +
  11.149 +	vm_unacct_memory(pages);
  11.150 +
  11.151 +	return -ENOMEM;
  11.152 +}
  11.153 +
  11.154 +EXPORT_SYMBOL(sysctl_overcommit_memory);
  11.155 +EXPORT_SYMBOL(sysctl_overcommit_ratio);
  11.156 +EXPORT_SYMBOL(sysctl_max_map_count);
  11.157 +EXPORT_SYMBOL(vm_committed_space);
  11.158 +EXPORT_SYMBOL(__vm_enough_memory);
  11.159 +
  11.160 +/*
  11.161 + * Requires inode->i_mapping->i_mmap_lock
  11.162 + */
  11.163 +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
  11.164 +		struct file *file, struct address_space *mapping)
  11.165 +{
  11.166 +	if (vma->vm_flags & VM_DENYWRITE)
  11.167 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
  11.168 +	if (vma->vm_flags & VM_SHARED)
  11.169 +		mapping->i_mmap_writable--;
  11.170 +
  11.171 +	flush_dcache_mmap_lock(mapping);
  11.172 +	if (unlikely(vma->vm_flags & VM_NONLINEAR))
  11.173 +		list_del_init(&vma->shared.vm_set.list);
  11.174 +	else
  11.175 +		vma_prio_tree_remove(vma, &mapping->i_mmap);
  11.176 +	flush_dcache_mmap_unlock(mapping);
  11.177 +}
  11.178 +
  11.179 +/*
  11.180 + * Remove one vm structure and free it.
  11.181 + */
  11.182 +static void remove_vm_struct(struct vm_area_struct *vma)
  11.183 +{
  11.184 +	struct file *file = vma->vm_file;
  11.185 +
  11.186 +	might_sleep();
  11.187 +	if (file) {
  11.188 +		struct address_space *mapping = file->f_mapping;
  11.189 +		spin_lock(&mapping->i_mmap_lock);
  11.190 +		__remove_shared_vm_struct(vma, file, mapping);
  11.191 +		spin_unlock(&mapping->i_mmap_lock);
  11.192 +	}
  11.193 +	if (vma->vm_ops && vma->vm_ops->close)
  11.194 +		vma->vm_ops->close(vma);
  11.195 +	if (file)
  11.196 +		fput(file);
  11.197 +	anon_vma_unlink(vma);
  11.198 +	mpol_free(vma_policy(vma));
  11.199 +	kmem_cache_free(vm_area_cachep, vma);
  11.200 +}
  11.201 +
  11.202 +/*
  11.203 + *  sys_brk() for the most part doesn't need the global kernel
  11.204 + *  lock, except when an application is doing something nasty
  11.205 + *  like trying to un-brk an area that has already been mapped
  11.206 + *  to a regular file.  in this case, the unmapping will need
  11.207 + *  to invoke file system routines that need the global lock.
  11.208 + */
  11.209 +asmlinkage unsigned long sys_brk(unsigned long brk)
  11.210 +{
  11.211 +	unsigned long rlim, retval;
  11.212 +	unsigned long newbrk, oldbrk;
  11.213 +	struct mm_struct *mm = current->mm;
  11.214 +
  11.215 +	down_write(&mm->mmap_sem);
  11.216 +
  11.217 +	if (brk < mm->end_code)
  11.218 +		goto out;
  11.219 +	newbrk = PAGE_ALIGN(brk);
  11.220 +	oldbrk = PAGE_ALIGN(mm->brk);
  11.221 +	if (oldbrk == newbrk)
  11.222 +		goto set_brk;
  11.223 +
  11.224 +	/* Always allow shrinking brk. */
  11.225 +	if (brk <= mm->brk) {
  11.226 +		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
  11.227 +			goto set_brk;
  11.228 +		goto out;
  11.229 +	}
  11.230 +
  11.231 +	/* Check against rlimit.. */
  11.232 +	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
  11.233 +	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
  11.234 +		goto out;
  11.235 +
  11.236 +	/* Check against existing mmap mappings. */
  11.237 +	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
  11.238 +		goto out;
  11.239 +
  11.240 +	/* Ok, looks good - let it rip. */
  11.241 +	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
  11.242 +		goto out;
  11.243 +set_brk:
  11.244 +	mm->brk = brk;
  11.245 +out:
  11.246 +	retval = mm->brk;
  11.247 +	up_write(&mm->mmap_sem);
  11.248 +	return retval;
  11.249 +}
  11.250 +
  11.251 +#ifdef DEBUG_MM_RB
  11.252 +static int browse_rb(struct rb_root *root)
  11.253 +{
  11.254 +	int i = 0, j;
  11.255 +	struct rb_node *nd, *pn = NULL;
  11.256 +	unsigned long prev = 0, pend = 0;
  11.257 +
  11.258 +	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
  11.259 +		struct vm_area_struct *vma;
  11.260 +		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
  11.261 +		if (vma->vm_start < prev)
  11.262 +			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
  11.263 +		if (vma->vm_start < pend)
  11.264 +			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
  11.265 +		if (vma->vm_start > vma->vm_end)
  11.266 +			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
  11.267 +		i++;
  11.268 +		pn = nd;
  11.269 +	}
  11.270 +	j = 0;
  11.271 +	for (nd = pn; nd; nd = rb_prev(nd)) {
  11.272 +		j++;
  11.273 +	}
  11.274 +	if (i != j)
  11.275 +		printk("backwards %d, forwards %d\n", j, i), i = 0;
  11.276 +	return i;
  11.277 +}
  11.278 +
  11.279 +void validate_mm(struct mm_struct *mm)
  11.280 +{
  11.281 +	int bug = 0;
  11.282 +	int i = 0;
  11.283 +	struct vm_area_struct *tmp = mm->mmap;
  11.284 +	while (tmp) {
  11.285 +		tmp = tmp->vm_next;
  11.286 +		i++;
  11.287 +	}
  11.288 +	if (i != mm->map_count)
  11.289 +		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
  11.290 +	i = browse_rb(&mm->mm_rb);
  11.291 +	if (i != mm->map_count)
  11.292 +		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
  11.293 +	if (bug)
  11.294 +		BUG();
  11.295 +}
  11.296 +#else
  11.297 +#define validate_mm(mm) do { } while (0)
  11.298 +#endif
  11.299 +
  11.300 +static struct vm_area_struct *
  11.301 +find_vma_prepare(struct mm_struct *mm, unsigned long addr,
  11.302 +		struct vm_area_struct **pprev, struct rb_node ***rb_link,
  11.303 +		struct rb_node ** rb_parent)
  11.304 +{
  11.305 +	struct vm_area_struct * vma;
  11.306 +	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
  11.307 +
  11.308 +	__rb_link = &mm->mm_rb.rb_node;
  11.309 +	rb_prev = __rb_parent = NULL;
  11.310 +	vma = NULL;
  11.311 +
  11.312 +	while (*__rb_link) {
  11.313 +		struct vm_area_struct *vma_tmp;
  11.314 +
  11.315 +		__rb_parent = *__rb_link;
  11.316 +		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
  11.317 +
  11.318 +		if (vma_tmp->vm_end > addr) {
  11.319 +			vma = vma_tmp;
  11.320 +			if (vma_tmp->vm_start <= addr)
  11.321 +				return vma;
  11.322 +			__rb_link = &__rb_parent->rb_left;
  11.323 +		} else {
  11.324 +			rb_prev = __rb_parent;
  11.325 +			__rb_link = &__rb_parent->rb_right;
  11.326 +		}
  11.327 +	}
  11.328 +
  11.329 +	*pprev = NULL;
  11.330 +	if (rb_prev)
  11.331 +		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  11.332 +	*rb_link = __rb_link;
  11.333 +	*rb_parent = __rb_parent;
  11.334 +	return vma;
  11.335 +}
  11.336 +
  11.337 +static inline void
  11.338 +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
  11.339 +		struct vm_area_struct *prev, struct rb_node *rb_parent)
  11.340 +{
  11.341 +	if (prev) {
  11.342 +		vma->vm_next = prev->vm_next;
  11.343 +		prev->vm_next = vma;
  11.344 +	} else {
  11.345 +		mm->mmap = vma;
  11.346 +		if (rb_parent)
  11.347 +			vma->vm_next = rb_entry(rb_parent,
  11.348 +					struct vm_area_struct, vm_rb);
  11.349 +		else
  11.350 +			vma->vm_next = NULL;
  11.351 +	}
  11.352 +}
  11.353 +
  11.354 +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
  11.355 +		struct rb_node **rb_link, struct rb_node *rb_parent)
  11.356 +{
  11.357 +	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
  11.358 +	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  11.359 +}
  11.360 +
  11.361 +static inline void __vma_link_file(struct vm_area_struct *vma)
  11.362 +{
  11.363 +	struct file * file;
  11.364 +
  11.365 +	file = vma->vm_file;
  11.366 +	if (file) {
  11.367 +		struct address_space *mapping = file->f_mapping;
  11.368 +
  11.369 +		if (vma->vm_flags & VM_DENYWRITE)
  11.370 +			atomic_dec(&file->f_dentry->d_inode->i_writecount);
  11.371 +		if (vma->vm_flags & VM_SHARED)
  11.372 +			mapping->i_mmap_writable++;
  11.373 +
  11.374 +		flush_dcache_mmap_lock(mapping);
  11.375 +		if (unlikely(vma->vm_flags & VM_NONLINEAR))
  11.376 +			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
  11.377 +		else
  11.378 +			vma_prio_tree_insert(vma, &mapping->i_mmap);
  11.379 +		flush_dcache_mmap_unlock(mapping);
  11.380 +	}
  11.381 +}
  11.382 +
  11.383 +static void
  11.384 +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  11.385 +	struct vm_area_struct *prev, struct rb_node **rb_link,
  11.386 +	struct rb_node *rb_parent)
  11.387 +{
  11.388 +	__vma_link_list(mm, vma, prev, rb_parent);
  11.389 +	__vma_link_rb(mm, vma, rb_link, rb_parent);
  11.390 +	__anon_vma_link(vma);
  11.391 +}
  11.392 +
  11.393 +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  11.394 +			struct vm_area_struct *prev, struct rb_node **rb_link,
  11.395 +			struct rb_node *rb_parent)
  11.396 +{
  11.397 +	struct address_space *mapping = NULL;
  11.398 +
  11.399 +	if (vma->vm_file)
  11.400 +		mapping = vma->vm_file->f_mapping;
  11.401 +
  11.402 +	if (mapping) {
  11.403 +		spin_lock(&mapping->i_mmap_lock);
  11.404 +		vma->vm_truncate_count = mapping->truncate_count;
  11.405 +	}
  11.406 +	anon_vma_lock(vma);
  11.407 +
  11.408 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  11.409 +	__vma_link_file(vma);
  11.410 +
  11.411 +	anon_vma_unlock(vma);
  11.412 +	if (mapping)
  11.413 +		spin_unlock(&mapping->i_mmap_lock);
  11.414 +
  11.415 +	mm->map_count++;
  11.416 +	validate_mm(mm);
  11.417 +}
  11.418 +
  11.419 +/*
  11.420 + * Helper for vma_adjust in the split_vma insert case:
  11.421 + * insert vm structure into list and rbtree and anon_vma,
  11.422 + * but it has already been inserted into prio_tree earlier.
  11.423 + */
  11.424 +static void
  11.425 +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  11.426 +{
  11.427 +	struct vm_area_struct * __vma, * prev;
  11.428 +	struct rb_node ** rb_link, * rb_parent;
  11.429 +
  11.430 +	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
  11.431 +	if (__vma && __vma->vm_start < vma->vm_end)
  11.432 +		BUG();
  11.433 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  11.434 +	mm->map_count++;
  11.435 +}
  11.436 +
  11.437 +static inline void
  11.438 +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  11.439 +		struct vm_area_struct *prev)
  11.440 +{
  11.441 +	prev->vm_next = vma->vm_next;
  11.442 +	rb_erase(&vma->vm_rb, &mm->mm_rb);
  11.443 +	if (mm->mmap_cache == vma)
  11.444 +		mm->mmap_cache = prev;
  11.445 +}
  11.446 +
  11.447 +/*
  11.448 + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  11.449 + * is already present in an i_mmap tree without adjusting the tree.
  11.450 + * The following helper function should be used when such adjustments
  11.451 + * are necessary.  The "insert" vma (if any) is to be inserted
  11.452 + * before we drop the necessary locks.
  11.453 + */
  11.454 +void vma_adjust(struct vm_area_struct *vma, unsigned long start,
  11.455 +	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
  11.456 +{
  11.457 +	struct mm_struct *mm = vma->vm_mm;
  11.458 +	struct vm_area_struct *next = vma->vm_next;
  11.459 +	struct vm_area_struct *importer = NULL;
  11.460 +	struct address_space *mapping = NULL;
  11.461 +	struct prio_tree_root *root = NULL;
  11.462 +	struct file *file = vma->vm_file;
  11.463 +	struct anon_vma *anon_vma = NULL;
  11.464 +	long adjust_next = 0;
  11.465 +	int remove_next = 0;
  11.466 +
  11.467 +	if (next && !insert) {
  11.468 +		if (end >= next->vm_end) {
  11.469 +			/*
  11.470 +			 * vma expands, overlapping all the next, and
  11.471 +			 * perhaps the one after too (mprotect case 6).
  11.472 +			 */
  11.473 +again:			remove_next = 1 + (end > next->vm_end);
  11.474 +			end = next->vm_end;
  11.475 +			anon_vma = next->anon_vma;
  11.476 +			importer = vma;
  11.477 +		} else if (end > next->vm_start) {
  11.478 +			/*
  11.479 +			 * vma expands, overlapping part of the next:
  11.480 +			 * mprotect case 5 shifting the boundary up.
  11.481 +			 */
  11.482 +			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
  11.483 +			anon_vma = next->anon_vma;
  11.484 +			importer = vma;
  11.485 +		} else if (end < vma->vm_end) {
  11.486 +			/*
  11.487 +			 * vma shrinks, and !insert tells it's not
  11.488 +			 * split_vma inserting another: so it must be
  11.489 +			 * mprotect case 4 shifting the boundary down.
  11.490 +			 */
  11.491 +			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
  11.492 +			anon_vma = next->anon_vma;
  11.493 +			importer = next;
  11.494 +		}
  11.495 +	}
  11.496 +
  11.497 +	if (file) {
  11.498 +		mapping = file->f_mapping;
  11.499 +		if (!(vma->vm_flags & VM_NONLINEAR))
  11.500 +			root = &mapping->i_mmap;
  11.501 +		spin_lock(&mapping->i_mmap_lock);
  11.502 +		if (importer &&
  11.503 +		    vma->vm_truncate_count != next->vm_truncate_count) {
  11.504 +			/*
  11.505 +			 * unmap_mapping_range might be in progress:
  11.506 +			 * ensure that the expanding vma is rescanned.
  11.507 +			 */
  11.508 +			importer->vm_truncate_count = 0;
  11.509 +		}
  11.510 +		if (insert) {
  11.511 +			insert->vm_truncate_count = vma->vm_truncate_count;
  11.512 +			/*
  11.513 +			 * Put into prio_tree now, so instantiated pages
  11.514 +			 * are visible to arm/parisc __flush_dcache_page
  11.515 +			 * throughout; but we cannot insert into address
  11.516 +			 * space until vma start or end is updated.
  11.517 +			 */
  11.518 +			__vma_link_file(insert);
  11.519 +		}
  11.520 +	}
  11.521 +
  11.522 +	/*
  11.523 +	 * When changing only vma->vm_end, we don't really need
  11.524 +	 * anon_vma lock: but is that case worth optimizing out?
  11.525 +	 */
  11.526 +	if (vma->anon_vma)
  11.527 +		anon_vma = vma->anon_vma;
  11.528 +	if (anon_vma) {
  11.529 +		spin_lock(&anon_vma->lock);
  11.530 +		/*
  11.531 +		 * Easily overlooked: when mprotect shifts the boundary,
  11.532 +		 * make sure the expanding vma has anon_vma set if the
  11.533 +		 * shrinking vma had, to cover any anon pages imported.
  11.534 +		 */
  11.535 +		if (importer && !importer->anon_vma) {
  11.536 +			importer->anon_vma = anon_vma;
  11.537 +			__anon_vma_link(importer);
  11.538 +		}
  11.539 +	}
  11.540 +
  11.541 +	if (root) {
  11.542 +		flush_dcache_mmap_lock(mapping);
  11.543 +		vma_prio_tree_remove(vma, root);
  11.544 +		if (adjust_next)
  11.545 +			vma_prio_tree_remove(next, root);
  11.546 +	}
  11.547 +
  11.548 +	vma->vm_start = start;
  11.549 +	vma->vm_end = end;
  11.550 +	vma->vm_pgoff = pgoff;
  11.551 +	if (adjust_next) {
  11.552 +		next->vm_start += adjust_next << PAGE_SHIFT;
  11.553 +		next->vm_pgoff += adjust_next;
  11.554 +	}
  11.555 +
  11.556 +	if (root) {
  11.557 +		if (adjust_next)
  11.558 +			vma_prio_tree_insert(next, root);
  11.559 +		vma_prio_tree_insert(vma, root);
  11.560 +		flush_dcache_mmap_unlock(mapping);
  11.561 +	}
  11.562 +
  11.563 +	if (remove_next) {
  11.564 +		/*
  11.565 +		 * vma_merge has merged next into vma, and needs
  11.566 +		 * us to remove next before dropping the locks.
  11.567 +		 */
  11.568 +		__vma_unlink(mm, next, vma);
  11.569 +		if (file)
  11.570 +			__remove_shared_vm_struct(next, file, mapping);
  11.571 +		if (next->anon_vma)
  11.572 +			__anon_vma_merge(vma, next);
  11.573 +	} else if (insert) {
  11.574 +		/*
  11.575 +		 * split_vma has split insert from vma, and needs
  11.576 +		 * us to insert it before dropping the locks
  11.577 +		 * (it may either follow vma or precede it).
  11.578 +		 */
  11.579 +		__insert_vm_struct(mm, insert);
  11.580 +	}
  11.581 +
  11.582 +	if (anon_vma)
  11.583 +		spin_unlock(&anon_vma->lock);
  11.584 +	if (mapping)
  11.585 +		spin_unlock(&mapping->i_mmap_lock);
  11.586 +
  11.587 +	if (remove_next) {
  11.588 +		if (file)
  11.589 +			fput(file);
  11.590 +		mm->map_count--;
  11.591 +		mpol_free(vma_policy(next));
  11.592 +		kmem_cache_free(vm_area_cachep, next);
  11.593 +		/*
  11.594 +		 * In mprotect's case 6 (see comments on vma_merge),
  11.595 +		 * we must remove another next too. It would clutter
  11.596 +		 * up the code too much to do both in one go.
  11.597 +		 */
  11.598 +		if (remove_next == 2) {
  11.599 +			next = vma->vm_next;
  11.600 +			goto again;
  11.601 +		}
  11.602 +	}
  11.603 +
  11.604 +	validate_mm(mm);
  11.605 +}
  11.606 +
  11.607 +/*
  11.608 + * If the vma has a ->close operation then the driver probably needs to release
  11.609 + * per-vma resources, so we don't attempt to merge those.
  11.610 + */
  11.611 +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
  11.612 +
  11.613 +static inline int is_mergeable_vma(struct vm_area_struct *vma,
  11.614 +			struct file *file, unsigned long vm_flags)
  11.615 +{
  11.616 +	if (vma->vm_flags != vm_flags)
  11.617 +		return 0;
  11.618 +	if (vma->vm_file != file)
  11.619 +		return 0;
  11.620 +	if (vma->vm_ops && vma->vm_ops->close)
  11.621 +		return 0;
  11.622 +	return 1;
  11.623 +}
  11.624 +
  11.625 +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  11.626 +					struct anon_vma *anon_vma2)
  11.627 +{
  11.628 +	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
  11.629 +}
  11.630 +
  11.631 +/*
  11.632 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  11.633 + * in front of (at a lower virtual address and file offset than) the vma.
  11.634 + *
  11.635 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  11.636 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  11.637 + *
  11.638 + * We don't check here for the merged mmap wrapping around the end of pagecache
  11.639 + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
  11.640 + * wrap, nor mmaps which cover the final page at index -1UL.
  11.641 + */
  11.642 +static int
  11.643 +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  11.644 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  11.645 +{
  11.646 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  11.647 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  11.648 +		if (vma->vm_pgoff == vm_pgoff)
  11.649 +			return 1;
  11.650 +	}
  11.651 +	return 0;
  11.652 +}
  11.653 +
  11.654 +/*
  11.655 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  11.656 + * beyond (at a higher virtual address and file offset than) the vma.
  11.657 + *
  11.658 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  11.659 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  11.660 + */
  11.661 +static int
  11.662 +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  11.663 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  11.664 +{
  11.665 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  11.666 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  11.667 +		pgoff_t vm_pglen;
  11.668 +		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
  11.669 +		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
  11.670 +			return 1;
  11.671 +	}
  11.672 +	return 0;
  11.673 +}
  11.674 +
  11.675 +/*
  11.676 + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
  11.677 + * whether that can be merged with its predecessor or its successor.
  11.678 + * Or both (it neatly fills a hole).
  11.679 + *
  11.680 + * In most cases - when called for mmap, brk or mremap - [addr,end) is
  11.681 + * certain not to be mapped by the time vma_merge is called; but when
  11.682 + * called for mprotect, it is certain to be already mapped (either at
  11.683 + * an offset within prev, or at the start of next), and the flags of
  11.684 + * this area are about to be changed to vm_flags - and the no-change
  11.685 + * case has already been eliminated.
  11.686 + *
  11.687 + * The following mprotect cases have to be considered, where AAAA is
  11.688 + * the area passed down from mprotect_fixup, never extending beyond one
  11.689 + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
  11.690 + *
  11.691 + *     AAAA             AAAA                AAAA          AAAA
  11.692 + *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
  11.693 + *    cannot merge    might become    might become    might become
  11.694 + *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  11.695 + *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
  11.696 + *    mremap move:                                    PPPPNNNNNNNN 8
  11.697 + *        AAAA
  11.698 + *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  11.699 + *    might become    case 1 below    case 2 below    case 3 below
  11.700 + *
  11.701 + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  11.702 + * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  11.703 + */
  11.704 +struct vm_area_struct *vma_merge(struct mm_struct *mm,
  11.705 +			struct vm_area_struct *prev, unsigned long addr,
  11.706 +			unsigned long end, unsigned long vm_flags,
  11.707 +		     	struct anon_vma *anon_vma, struct file *file,
  11.708 +			pgoff_t pgoff, struct mempolicy *policy)
  11.709 +{
  11.710 +	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
  11.711 +	struct vm_area_struct *area, *next;
  11.712 +
  11.713 +	/*
  11.714 +	 * We later require that vma->vm_flags == vm_flags,
  11.715 +	 * so this tests vma->vm_flags & VM_SPECIAL, too.
  11.716 +	 */
  11.717 +	if (vm_flags & VM_SPECIAL)
  11.718 +		return NULL;
  11.719 +
  11.720 +	if (prev)
  11.721 +		next = prev->vm_next;
  11.722 +	else
  11.723 +		next = mm->mmap;
  11.724 +	area = next;
  11.725 +	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
  11.726 +		next = next->vm_next;
  11.727 +
  11.728 +	/*
  11.729 +	 * Can it merge with the predecessor?
  11.730 +	 */
  11.731 +	if (prev && prev->vm_end == addr &&
  11.732 +  			mpol_equal(vma_policy(prev), policy) &&
  11.733 +			can_vma_merge_after(prev, vm_flags,
  11.734 +						anon_vma, file, pgoff)) {
  11.735 +		/*
  11.736 +		 * OK, it can.  Can we now merge in the successor as well?
  11.737 +		 */
  11.738 +		if (next && end == next->vm_start &&
  11.739 +				mpol_equal(policy, vma_policy(next)) &&
  11.740 +				can_vma_merge_before(next, vm_flags,
  11.741 +					anon_vma, file, pgoff+pglen) &&
  11.742 +				is_mergeable_anon_vma(prev->anon_vma,
  11.743 +						      next->anon_vma)) {
  11.744 +							/* cases 1, 6 */
  11.745 +			vma_adjust(prev, prev->vm_start,
  11.746 +				next->vm_end, prev->vm_pgoff, NULL);
  11.747 +		} else					/* cases 2, 5, 7 */
  11.748 +			vma_adjust(prev, prev->vm_start,
  11.749 +				end, prev->vm_pgoff, NULL);
  11.750 +		return prev;
  11.751 +	}
  11.752 +
  11.753 +	/*
  11.754 +	 * Can this new request be merged in front of next?
  11.755 +	 */
  11.756 +	if (next && end == next->vm_start &&
  11.757 + 			mpol_equal(policy, vma_policy(next)) &&
  11.758 +			can_vma_merge_before(next, vm_flags,
  11.759 +					anon_vma, file, pgoff+pglen)) {
  11.760 +		if (prev && addr < prev->vm_end)	/* case 4 */
  11.761 +			vma_adjust(prev, prev->vm_start,
  11.762 +				addr, prev->vm_pgoff, NULL);
  11.763 +		else					/* cases 3, 8 */
  11.764 +			vma_adjust(area, addr, next->vm_end,
  11.765 +				next->vm_pgoff - pglen, NULL);
  11.766 +		return area;
  11.767 +	}
  11.768 +
  11.769 +	return NULL;
  11.770 +}
  11.771 +
  11.772 +/*
  11.773 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  11.774 + * neighbouring vmas for a suitable anon_vma, before it goes off
  11.775 + * to allocate a new anon_vma.  It checks because a repetitive
  11.776 + * sequence of mprotects and faults may otherwise lead to distinct
  11.777 + * anon_vmas being allocated, preventing vma merge in subsequent
  11.778 + * mprotect.
  11.779 + */
  11.780 +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
  11.781 +{
  11.782 +	struct vm_area_struct *near;
  11.783 +	unsigned long vm_flags;
  11.784 +
  11.785 +	near = vma->vm_next;
  11.786 +	if (!near)
  11.787 +		goto try_prev;
  11.788 +
  11.789 +	/*
  11.790 +	 * Since only mprotect tries to remerge vmas, match flags
  11.791 +	 * which might be mprotected into each other later on.
  11.792 +	 * Neither mlock nor madvise tries to remerge at present,
  11.793 +	 * so leave their flags as obstructing a merge.
  11.794 +	 */
  11.795 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  11.796 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  11.797 +
  11.798 +	if (near->anon_vma && vma->vm_end == near->vm_start &&
  11.799 + 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
  11.800 +			can_vma_merge_before(near, vm_flags,
  11.801 +				NULL, vma->vm_file, vma->vm_pgoff +
  11.802 +				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
  11.803 +		return near->anon_vma;
  11.804 +try_prev:
  11.805 +	/*
  11.806 +	 * It is potentially slow to have to call find_vma_prev here.
  11.807 +	 * But it's only on the first write fault on the vma, not
  11.808 +	 * every time, and we could devise a way to avoid it later
  11.809 +	 * (e.g. stash info in next's anon_vma_node when assigning
  11.810 +	 * an anon_vma, or when trying vma_merge).  Another time.
  11.811 +	 */
  11.812 +	if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
  11.813 +		BUG();
  11.814 +	if (!near)
  11.815 +		goto none;
  11.816 +
  11.817 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  11.818 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  11.819 +
  11.820 +	if (near->anon_vma && near->vm_end == vma->vm_start &&
  11.821 +  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
  11.822 +			can_vma_merge_after(near, vm_flags,
  11.823 +				NULL, vma->vm_file, vma->vm_pgoff))
  11.824 +		return near->anon_vma;
  11.825 +none:
  11.826 +	/*
  11.827 +	 * There's no absolute need to look only at touching neighbours:
  11.828 +	 * we could search further afield for "compatible" anon_vmas.
  11.829 +	 * But it would probably just be a waste of time searching,
  11.830 +	 * or lead to too many vmas hanging off the same anon_vma.
  11.831 +	 * We're trying to allow mprotect remerging later on,
  11.832 +	 * not trying to minimize memory used for anon_vmas.
  11.833 +	 */
  11.834 +	return NULL;
  11.835 +}
  11.836 +
  11.837 +#ifdef CONFIG_PROC_FS
  11.838 +void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
  11.839 +						struct file *file, long pages)
  11.840 +{
  11.841 +	const unsigned long stack_flags
  11.842 +		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
  11.843 +
  11.844 +#ifdef CONFIG_HUGETLB
  11.845 +	if (flags & VM_HUGETLB) {
  11.846 +		if (!(flags & VM_DONTCOPY))
  11.847 +			mm->shared_vm += pages;
  11.848 +		return;
  11.849 +	}
  11.850 +#endif /* CONFIG_HUGETLB */
  11.851 +
  11.852 +	if (file) {
  11.853 +		mm->shared_vm += pages;
  11.854 +		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
  11.855 +			mm->exec_vm += pages;
  11.856 +	} else if (flags & stack_flags)
  11.857 +		mm->stack_vm += pages;
  11.858 +	if (flags & (VM_RESERVED|VM_IO))
  11.859 +		mm->reserved_vm += pages;
  11.860 +}
  11.861 +#endif /* CONFIG_PROC_FS */
  11.862 +
  11.863 +/*
  11.864 + * The caller must hold down_write(current->mm->mmap_sem).
  11.865 + */
  11.866 +
  11.867 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
  11.868 +			unsigned long len, unsigned long prot,
  11.869 +			unsigned long flags, unsigned long pgoff)
  11.870 +{
  11.871 +	struct mm_struct * mm = current->mm;
  11.872 +	struct vm_area_struct * vma, * prev;
  11.873 +	struct inode *inode;
  11.874 +	unsigned int vm_flags;
  11.875 +	int correct_wcount = 0;
  11.876 +	int error;
  11.877 +	struct rb_node ** rb_link, * rb_parent;
  11.878 +	int accountable = 1;
  11.879 +	unsigned long charged = 0;
  11.880 +
  11.881 +	if (file) {
  11.882 +		if (is_file_hugepages(file))
  11.883 +			accountable = 0;
  11.884 +
  11.885 +		if (!file->f_op || !file->f_op->mmap)
  11.886 +			return -ENODEV;
  11.887 +
  11.888 +		if ((prot & PROT_EXEC) &&
  11.889 +		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
  11.890 +			return -EPERM;
  11.891 +	}
  11.892 +	/*
  11.893 +	 * Does the application expect PROT_READ to imply PROT_EXEC?
  11.894 +	 *
  11.895 +	 * (the exception is when the underlying filesystem is noexec
  11.896 +	 *  mounted, in which case we dont add PROT_EXEC.)
  11.897 +	 */
  11.898 +	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
  11.899 +		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
  11.900 +			prot |= PROT_EXEC;
  11.901 +
  11.902 +	if (!len)
  11.903 +		return addr;
  11.904 +
  11.905 +	/* Careful about overflows.. */
  11.906 +	len = PAGE_ALIGN(len);
  11.907 +	if (!len || len > TASK_SIZE)
  11.908 +		return -EINVAL;
  11.909 +
  11.910 +	/* offset overflow? */
  11.911 +	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  11.912 +		return -EINVAL;
  11.913 +
  11.914 +	/* Too many mappings? */
  11.915 +	if (mm->map_count > sysctl_max_map_count)
  11.916 +		return -ENOMEM;
  11.917 +
  11.918 +	/* Obtain the address to map to. we verify (or select) it and ensure
  11.919 +	 * that it represents a valid section of the address space.
  11.920 +	 */
  11.921 +	addr = get_unmapped_area(file, addr, len, pgoff, flags);
  11.922 +	if (addr & ~PAGE_MASK)
  11.923 +		return addr;
  11.924 +
  11.925 +	/* Do simple checking here so the lower-level routines won't have
  11.926 +	 * to. we assume access permissions have been handled by the open
  11.927 +	 * of the memory object, so we don't do any here.
  11.928 +	 */
  11.929 +	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
  11.930 +			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  11.931 +
  11.932 +	if (flags & MAP_LOCKED) {
  11.933 +		if (!can_do_mlock())
  11.934 +			return -EPERM;
  11.935 +		vm_flags |= VM_LOCKED;
  11.936 +	}
  11.937 +	/* mlock MCL_FUTURE? */
  11.938 +	if (vm_flags & VM_LOCKED) {
  11.939 +		unsigned long locked, lock_limit;
  11.940 +		locked = mm->locked_vm << PAGE_SHIFT;
  11.941 +		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
  11.942 +		locked += len;
  11.943 +		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
  11.944 +			return -EAGAIN;
  11.945 +	}
  11.946 +
  11.947 +	inode = file ? file->f_dentry->d_inode : NULL;
  11.948 +
  11.949 +	if (file) {
  11.950 +		switch (flags & MAP_TYPE) {
  11.951 +		case MAP_SHARED:
  11.952 +			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
  11.953 +				return -EACCES;
  11.954 +
  11.955 +			/*
  11.956 +			 * Make sure we don't allow writing to an append-only
  11.957 +			 * file..
  11.958 +			 */
  11.959 +			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  11.960 +				return -EACCES;
  11.961 +
  11.962 +			/*
  11.963 +			 * Make sure there are no mandatory locks on the file.
  11.964 +			 */
  11.965 +			if (locks_verify_locked(inode))
  11.966 +				return -EAGAIN;
  11.967 +
  11.968 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  11.969 +			if (!(file->f_mode & FMODE_WRITE))
  11.970 +				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  11.971 +
  11.972 +			/* fall through */
  11.973 +		case MAP_PRIVATE:
  11.974 +			if (!(file->f_mode & FMODE_READ))
  11.975 +				return -EACCES;
  11.976 +			break;
  11.977 +
  11.978 +		default:
  11.979 +			return -EINVAL;
  11.980 +		}
  11.981 +	} else {
  11.982 +		switch (flags & MAP_TYPE) {
  11.983 +		case MAP_SHARED:
  11.984 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  11.985 +			break;
  11.986 +		case MAP_PRIVATE:
  11.987 +			/*
  11.988 +			 * Set pgoff according to addr for anon_vma.
  11.989 +			 */
  11.990 +			pgoff = addr >> PAGE_SHIFT;
  11.991 +			break;
  11.992 +		default:
  11.993 +			return -EINVAL;
  11.994 +		}
  11.995 +	}
  11.996 +
  11.997 +	error = security_file_mmap(file, prot, flags);
  11.998 +	if (error)
  11.999 +		return error;
 11.1000 +		
 11.1001 +	/* Clear old maps */
 11.1002 +	error = -ENOMEM;
 11.1003 +munmap_back:
 11.1004 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 11.1005 +	if (vma && vma->vm_start < addr + len) {
 11.1006 +		if (do_munmap(mm, addr, len))
 11.1007 +			return -ENOMEM;
 11.1008 +		goto munmap_back;
 11.1009 +	}
 11.1010 +
 11.1011 +	/* Check against address space limit. */
 11.1012 +	if ((mm->total_vm << PAGE_SHIFT) + len
 11.1013 +	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
 11.1014 +		return -ENOMEM;
 11.1015 +
 11.1016 +	if (accountable && (!(flags & MAP_NORESERVE) ||
 11.1017 +			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
 11.1018 +		if (vm_flags & VM_SHARED) {
 11.1019 +			/* Check memory availability in shmem_file_setup? */
 11.1020 +			vm_flags |= VM_ACCOUNT;
 11.1021 +		} else if (vm_flags & VM_WRITE) {
 11.1022 +			/*
 11.1023 +			 * Private writable mapping: check memory availability
 11.1024 +			 */
 11.1025 +			charged = len >> PAGE_SHIFT;
 11.1026 +			if (security_vm_enough_memory(charged))
 11.1027 +				return -ENOMEM;
 11.1028 +			vm_flags |= VM_ACCOUNT;
 11.1029 +		}
 11.1030 +	}
 11.1031 +
 11.1032 +	/*
 11.1033 +	 * Can we just expand an old private anonymous mapping?
 11.1034 +	 * The VM_SHARED test is necessary because shmem_zero_setup
 11.1035 +	 * will create the file object for a shared anonymous map below.
 11.1036 +	 */
 11.1037 +	if (!file && !(vm_flags & VM_SHARED) &&
 11.1038 +	    vma_merge(mm, prev, addr, addr + len, vm_flags,
 11.1039 +					NULL, NULL, pgoff, NULL))
 11.1040 +		goto out;
 11.1041 +
 11.1042 +	/*
 11.1043 +	 * Determine the object being mapped and call the appropriate
 11.1044 +	 * specific mapper. The address has already been validated and
 11.1045 +	 * any overlapping mappings have already been removed from the list.
 11.1046 +	 */
 11.1047 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.1048 +	if (!vma) {
 11.1049 +		error = -ENOMEM;
 11.1050 +		goto unacct_error;
 11.1051 +	}
 11.1052 +	memset(vma, 0, sizeof(*vma));
 11.1053 +
 11.1054 +	vma->vm_mm = mm;
 11.1055 +	vma->vm_start = addr;
 11.1056 +	vma->vm_end = addr + len;
 11.1057 +	vma->vm_flags = vm_flags;
 11.1058 +	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
 11.1059 +	vma->vm_pgoff = pgoff;
 11.1060 +
 11.1061 +	if (file) {
 11.1062 +		error = -EINVAL;
 11.1063 +		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
 11.1064 +			goto free_vma;
 11.1065 +		if (vm_flags & VM_DENYWRITE) {
 11.1066 +			error = deny_write_access(file);
 11.1067 +			if (error)
 11.1068 +				goto free_vma;
 11.1069 +			correct_wcount = 1;
 11.1070 +		}
 11.1071 +		vma->vm_file = file;
 11.1072 +		get_file(file);
 11.1073 +		error = file->f_op->mmap(file, vma);
 11.1074 +		if (error)
 11.1075 +			goto unmap_and_free_vma;
 11.1076 +	} else if (vm_flags & VM_SHARED) {
 11.1077 +		error = shmem_zero_setup(vma);
 11.1078 +		if (error)
 11.1079 +			goto free_vma;
 11.1080 +	}
 11.1081 +
 11.1082 +	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 11.1083 +	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 11.1084 +	 * that memory reservation must be checked; but that reservation
 11.1085 +	 * belongs to shared memory object, not to vma: so now clear it.
 11.1086 +	 */
 11.1087 +	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
 11.1088 +		vma->vm_flags &= ~VM_ACCOUNT;
 11.1089 +
 11.1090 +	/* Can addr have changed??
 11.1091 +	 *
 11.1092 +	 * Answer: Yes, several device drivers can do it in their
 11.1093 +	 *         f_op->mmap method. -DaveM
 11.1094 +	 */
 11.1095 +	addr = vma->vm_start;
 11.1096 +	pgoff = vma->vm_pgoff;
 11.1097 +	vm_flags = vma->vm_flags;
 11.1098 +
 11.1099 +	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 11.1100 +			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 11.1101 +		file = vma->vm_file;
 11.1102 +		vma_link(mm, vma, prev, rb_link, rb_parent);
 11.1103 +		if (correct_wcount)
 11.1104 +			atomic_inc(&inode->i_writecount);
 11.1105 +	} else {
 11.1106 +		if (file) {
 11.1107 +			if (correct_wcount)
 11.1108 +				atomic_inc(&inode->i_writecount);
 11.1109 +			fput(file);
 11.1110 +		}
 11.1111 +		mpol_free(vma_policy(vma));
 11.1112 +		kmem_cache_free(vm_area_cachep, vma);
 11.1113 +	}
 11.1114 +out:	
 11.1115 +	mm->total_vm += len >> PAGE_SHIFT;
 11.1116 +	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 11.1117 +	if (vm_flags & VM_LOCKED) {
 11.1118 +		mm->locked_vm += len >> PAGE_SHIFT;
 11.1119 +		make_pages_present(addr, addr + len);
 11.1120 +	}
 11.1121 +	if (flags & MAP_POPULATE) {
 11.1122 +		up_write(&mm->mmap_sem);
 11.1123 +		sys_remap_file_pages(addr, len, 0,
 11.1124 +					pgoff, flags & MAP_NONBLOCK);
 11.1125 +		down_write(&mm->mmap_sem);
 11.1126 +	}
 11.1127 +	acct_update_integrals();
 11.1128 +	update_mem_hiwater();
 11.1129 +	return addr;
 11.1130 +
 11.1131 +unmap_and_free_vma:
 11.1132 +	if (correct_wcount)
 11.1133 +		atomic_inc(&inode->i_writecount);
 11.1134 +	vma->vm_file = NULL;
 11.1135 +	fput(file);
 11.1136 +
 11.1137 +	/* Undo any partial mapping done by a device driver. */
 11.1138 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
 11.1139 +free_vma:
 11.1140 +	kmem_cache_free(vm_area_cachep, vma);
 11.1141 +unacct_error:
 11.1142 +	if (charged)
 11.1143 +		vm_unacct_memory(charged);
 11.1144 +	return error;
 11.1145 +}
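
As an aside on the MAP_LOCKED path above: the RLIMIT_MEMLOCK test is easy to observe from userspace. The sketch below (illustrative only, not part of this changeset) caps the mlock limit at one page and then asks for a two-page locked mapping; without CAP_IPC_LOCK the call is expected to fail, with EAGAIN on kernels of this vintage (the exact errno has varied across versions):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/resource.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        struct rlimit rl = { .rlim_cur = (rlim_t)page, .rlim_max = (rlim_t)page };
        void *p;

        /* Allow at most one page of locked memory ... */
        if (setrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
            perror("setrlimit");
            return 1;
        }
        /* ... then request a two-page VM_LOCKED mapping, tripping the
         * "locked > lock_limit" test in do_mmap_pgoff() above. */
        p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
        if (p == MAP_FAILED)
            printf("mmap(MAP_LOCKED): %s\n", strerror(errno));
        else
            printf("mapping succeeded (caller probably has CAP_IPC_LOCK)\n");
        return 0;
    }
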
 11.1146 +
 11.1147 +EXPORT_SYMBOL(do_mmap_pgoff);
 11.1148 +
 11.1149 +/* Get an address range which is currently unmapped.
 11.1150 + * For shmat() with addr=0.
 11.1151 + *
 11.1152 + * Ugly calling convention alert:
 11.1153 + * Return value with the low bits set means error value,
 11.1154 + * ie
 11.1155 + *	if (ret & ~PAGE_MASK)
 11.1156 + *		error = ret;
 11.1157 + *
 11.1158 + * This function "knows" that -ENOMEM has the bits set.
 11.1159 + */
 11.1160 +#ifndef HAVE_ARCH_UNMAPPED_AREA
 11.1161 +unsigned long
 11.1162 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
 11.1163 +		unsigned long len, unsigned long pgoff, unsigned long flags)
 11.1164 +{
 11.1165 +	struct mm_struct *mm = current->mm;
 11.1166 +	struct vm_area_struct *vma;
 11.1167 +	unsigned long start_addr;
 11.1168 +
 11.1169 +	if (len > TASK_SIZE)
 11.1170 +		return -ENOMEM;
 11.1171 +
 11.1172 +	if (addr) {
 11.1173 +		addr = PAGE_ALIGN(addr);
 11.1174 +		vma = find_vma(mm, addr);
 11.1175 +		if (TASK_SIZE - len >= addr &&
 11.1176 +		    (!vma || addr + len <= vma->vm_start))
 11.1177 +			return addr;
 11.1178 +	}
 11.1179 +	start_addr = addr = mm->free_area_cache;
 11.1180 +
 11.1181 +full_search:
 11.1182 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 11.1183 +		/* At this point:  (!vma || addr < vma->vm_end). */
 11.1184 +		if (TASK_SIZE - len < addr) {
 11.1185 +			/*
 11.1186 +			 * Start a new search - just in case we missed
 11.1187 +			 * some holes.
 11.1188 +			 */
 11.1189 +			if (start_addr != TASK_UNMAPPED_BASE) {
 11.1190 +				start_addr = addr = TASK_UNMAPPED_BASE;
 11.1191 +				goto full_search;
 11.1192 +			}
 11.1193 +			return -ENOMEM;
 11.1194 +		}
 11.1195 +		if (!vma || addr + len <= vma->vm_start) {
 11.1196 +			/*
 11.1197 +			 * Remember the place where we stopped the search:
 11.1198 +			 */
 11.1199 +			mm->free_area_cache = addr + len;
 11.1200 +			return addr;
 11.1201 +		}
 11.1202 +		addr = vma->vm_end;
 11.1203 +	}
 11.1204 +}
 11.1205 +#endif	
 11.1206 +
 11.1207 +void arch_unmap_area(struct vm_area_struct *area)
 11.1208 +{
 11.1209 +	/*
 11.1210 +	 * Is this a new hole at the lowest possible address?
 11.1211 +	 */
 11.1212 +	if (area->vm_start >= TASK_UNMAPPED_BASE &&
 11.1213 +			area->vm_start < area->vm_mm->free_area_cache)
 11.1214 +		area->vm_mm->free_area_cache = area->vm_start;
 11.1215 +}
 11.1216 +
 11.1217 +/*
 11.1218 + * This mmap-allocator allocates new areas top-down from below the
 11.1219 + * stack's low limit (the base):
 11.1220 + */
 11.1221 +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 11.1222 +unsigned long
 11.1223 +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 11.1224 +			  const unsigned long len, const unsigned long pgoff,
 11.1225 +			  const unsigned long flags)
 11.1226 +{
 11.1227 +	struct vm_area_struct *vma, *prev_vma;
 11.1228 +	struct mm_struct *mm = current->mm;
 11.1229 +	unsigned long base = mm->mmap_base, addr = addr0;
 11.1230 +	int first_time = 1;
 11.1231 +
 11.1232 +	/* requested length too big for entire address space */
 11.1233 +	if (len > TASK_SIZE)
 11.1234 +		return -ENOMEM;
 11.1235 +
 11.1236 +	/* don't allow allocations above current base */
 11.1237 +	if (mm->free_area_cache > base)
 11.1238 +		mm->free_area_cache = base;
 11.1239 +
 11.1240 +	/* requesting a specific address */
 11.1241 +	if (addr) {
 11.1242 +		addr = PAGE_ALIGN(addr);
 11.1243 +		vma = find_vma(mm, addr);
 11.1244 +		if (TASK_SIZE - len >= addr &&
 11.1245 +				(!vma || addr + len <= vma->vm_start))
 11.1246 +			return addr;
 11.1247 +	}
 11.1248 +
 11.1249 +try_again:
 11.1250 +	/* make sure it can fit in the remaining address space */
 11.1251 +	if (mm->free_area_cache < len)
 11.1252 +		goto fail;
 11.1253 +
 11.1254 +	/* either no address requested or can't fit in requested address hole */
 11.1255 +	addr = (mm->free_area_cache - len) & PAGE_MASK;
 11.1256 +	do {
 11.1257 +		/*
 11.1258 +		 * Lookup failure means no vma is above this address,
 11.1259 +		 * i.e. return with success:
 11.1260 +		 */
 11.1261 + 	 	if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
 11.1262 +			return addr;
 11.1263 +
 11.1264 +		/*
 11.1265 +		 * new region fits between prev_vma->vm_end and
 11.1266 +		 * vma->vm_start, use it:
 11.1267 +		 */
 11.1268 +		if (addr+len <= vma->vm_start &&
 11.1269 +				(!prev_vma || (addr >= prev_vma->vm_end)))
 11.1270 +			/* remember the address as a hint for next time */
 11.1271 +			return (mm->free_area_cache = addr);
 11.1272 +		else
 11.1273 +			/* pull free_area_cache down to the first hole */
 11.1274 +			if (mm->free_area_cache == vma->vm_end)
 11.1275 +				mm->free_area_cache = vma->vm_start;
 11.1276 +
 11.1277 +		/* try just below the current vma->vm_start */
 11.1278 +		addr = vma->vm_start-len;
 11.1279 +	} while (len <= vma->vm_start);
 11.1280 +
 11.1281 +fail:
 11.1282 +	/*
 11.1283 +	 * if hint left us with no space for the requested
 11.1284 +	 * mapping then try again:
 11.1285 +	 */
 11.1286 +	if (first_time) {
 11.1287 +		mm->free_area_cache = base;
 11.1288 +		first_time = 0;
 11.1289 +		goto try_again;
 11.1290 +	}
 11.1291 +	/*
 11.1292 +	 * A failed mmap() very likely causes application failure,
 11.1293 +	 * so fall back to the bottom-up function here. This scenario
 11.1294 +	 * can happen with large stack limits and large mmap()
 11.1295 +	 * allocations.
 11.1296 +	 */
 11.1297 +	mm->free_area_cache = TASK_UNMAPPED_BASE;
 11.1298 +	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 11.1299 +	/*
 11.1300 +	 * Restore the topdown base:
 11.1301 +	 */
 11.1302 +	mm->free_area_cache = base;
 11.1303 +
 11.1304 +	return addr;
 11.1305 +}
 11.1306 +#endif
 11.1307 +
 11.1308 +void arch_unmap_area_topdown(struct vm_area_struct *area)
 11.1309 +{
 11.1310 +	/*
 11.1311 +	 * Is this a new hole at the highest possible address?
 11.1312 +	 */
 11.1313 +	if (area->vm_end > area->vm_mm->free_area_cache)
 11.1314 +		area->vm_mm->free_area_cache = area->vm_end;
 11.1315 +}
 11.1316 +
 11.1317 +unsigned long
 11.1318 +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 11.1319 +		unsigned long pgoff, unsigned long flags)
 11.1320 +{
 11.1321 +	if (flags & MAP_FIXED) {
 11.1322 +		unsigned long ret;
 11.1323 +
 11.1324 +		if (addr > TASK_SIZE - len)
 11.1325 +			return -ENOMEM;
 11.1326 +		if (addr & ~PAGE_MASK)
 11.1327 +			return -EINVAL;
 11.1328 +		if (file && is_file_hugepages(file))  {
 11.1329 +			/*
 11.1330 +			 * Check if the given range is hugepage aligned, and
 11.1331 +			 * can be made suitable for hugepages.
 11.1332 +			 */
 11.1333 +			ret = prepare_hugepage_range(addr, len);
 11.1334 +		} else {
 11.1335 +			/*
 11.1336 +			 * Ensure that a normal request is not falling in a
 11.1337 +			 * reserved hugepage range.  For some archs like IA-64,
 11.1338 +			 * there is a separate region for hugepages.
 11.1339 +			 */
 11.1340 +			ret = is_hugepage_only_range(addr, len);
 11.1341 +		}
 11.1342 +		if (ret)
 11.1343 +			return -EINVAL;
 11.1344 +		return addr;
 11.1345 +	}
 11.1346 +
 11.1347 +	if (file && file->f_op && file->f_op->get_unmapped_area)
 11.1348 +		return file->f_op->get_unmapped_area(file, addr, len,
 11.1349 +						pgoff, flags);
 11.1350 +
 11.1351 +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
 11.1352 +}
 11.1353 +
 11.1354 +EXPORT_SYMBOL(get_unmapped_area);
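
The MAP_FIXED branch of get_unmapped_area() refuses a misaligned hint with -EINVAL before any mapping work happens, as the (addr & ~PAGE_MASK) test above shows. A minimal userspace illustration, not part of this changeset (the hint value is arbitrary, chosen only to be non-page-aligned):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        /* A non-page-aligned hint combined with MAP_FIXED is rejected by
         * the (addr & ~PAGE_MASK) test before anything gets mapped. */
        void *hint = (void *)(16 * (unsigned long)page + 1);
        void *p = mmap(hint, page, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

        if (p == MAP_FAILED)
            printf("misaligned MAP_FIXED hint: %s\n", strerror(errno));
        else
            munmap(p, page);
        return 0;
    }
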
 11.1355 +
 11.1356 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 11.1357 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 11.1358 +{
 11.1359 +	struct vm_area_struct *vma = NULL;
 11.1360 +
 11.1361 +	if (mm) {
 11.1362 +		/* Check the cache first. */
 11.1363 +		/* (Cache hit rate is typically around 35%.) */
 11.1364 +		vma = mm->mmap_cache;
 11.1365 +		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 11.1366 +			struct rb_node * rb_node;
 11.1367 +
 11.1368 +			rb_node = mm->mm_rb.rb_node;
 11.1369 +			vma = NULL;
 11.1370 +
 11.1371 +			while (rb_node) {
 11.1372 +				struct vm_area_struct * vma_tmp;
 11.1373 +
 11.1374 +				vma_tmp = rb_entry(rb_node,
 11.1375 +						struct vm_area_struct, vm_rb);
 11.1376 +
 11.1377 +				if (vma_tmp->vm_end > addr) {
 11.1378 +					vma = vma_tmp;
 11.1379 +					if (vma_tmp->vm_start <= addr)
 11.1380 +						break;
 11.1381 +					rb_node = rb_node->rb_left;
 11.1382 +				} else
 11.1383 +					rb_node = rb_node->rb_right;
 11.1384 +			}
 11.1385 +			if (vma)
 11.1386 +				mm->mmap_cache = vma;
 11.1387 +		}
 11.1388 +	}
 11.1389 +	return vma;
 11.1390 +}
 11.1391 +
 11.1392 +EXPORT_SYMBOL(find_vma);
 11.1393 +
 11.1394 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 11.1395 +struct vm_area_struct *
 11.1396 +find_vma_prev(struct mm_struct *mm, unsigned long addr,
 11.1397 +			struct vm_area_struct **pprev)
 11.1398 +{
 11.1399 +	struct vm_area_struct *vma = NULL, *prev = NULL;
 11.1400 +	struct rb_node * rb_node;
 11.1401 +	if (!mm)
 11.1402 +		goto out;
 11.1403 +
 11.1404 +	/* Guard against addr being lower than the first VMA */
 11.1405 +	vma = mm->mmap;
 11.1406 +
 11.1407 +	/* Go through the RB tree quickly. */
 11.1408 +	rb_node = mm->mm_rb.rb_node;
 11.1409 +
 11.1410 +	while (rb_node) {
 11.1411 +		struct vm_area_struct *vma_tmp;
 11.1412 +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 11.1413 +
 11.1414 +		if (addr < vma_tmp->vm_end) {
 11.1415 +			rb_node = rb_node->rb_left;
 11.1416 +		} else {
 11.1417 +			prev = vma_tmp;
 11.1418 +			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
 11.1419 +				break;
 11.1420 +			rb_node = rb_node->rb_right;
 11.1421 +		}
 11.1422 +	}
 11.1423 +
 11.1424 +out:
 11.1425 +	*pprev = prev;
 11.1426 +	return prev ? prev->vm_next : vma;
 11.1427 +}
 11.1428 +
 11.1429 +/*
 11.1430 + * Verify that the stack growth is acceptable and
 11.1431 + * update accounting. This is shared with both the
 11.1432 + * grow-up and grow-down cases.
 11.1433 + */
 11.1434 +static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
 11.1435 +{
 11.1436 +	struct mm_struct *mm = vma->vm_mm;
 11.1437 +	struct rlimit *rlim = current->signal->rlim;
 11.1438 +
 11.1439 +	/* address space limit tests */
 11.1440 +	if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT)
 11.1441 +		return -ENOMEM;
 11.1442 +
 11.1443 +	/* Stack limit test */
 11.1444 +	if (size > rlim[RLIMIT_STACK].rlim_cur)
 11.1445 +		return -ENOMEM;
 11.1446 +
 11.1447 +	/* mlock limit tests */
 11.1448 +	if (vma->vm_flags & VM_LOCKED) {
 11.1449 +		unsigned long locked;
 11.1450 +		unsigned long limit;
 11.1451 +		locked = mm->locked_vm + grow;
 11.1452 +		limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
 11.1453 +		if (locked > limit && !capable(CAP_IPC_LOCK))
 11.1454 +			return -ENOMEM;
 11.1455 +	}
 11.1456 +
 11.1457 +	/*
 11.1458 +	 * Overcommit..  This must be the final test, as it will
 11.1459 +	 * update security statistics.
 11.1460 +	 */
 11.1461 +	if (security_vm_enough_memory(grow))
 11.1462 +		return -ENOMEM;
 11.1463 +
 11.1464 +	/* Ok, everything looks good - let it rip */
 11.1465 +	mm->total_vm += grow;
 11.1466 +	if (vma->vm_flags & VM_LOCKED)
 11.1467 +		mm->locked_vm += grow;
 11.1468 +	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 11.1469 +	acct_update_integrals();
 11.1470 +	update_mem_hiwater();
 11.1471 +	return 0;
 11.1472 +}
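
A detail worth keeping straight in acct_stack_growth(): total_vm, grow and locked_vm are page counts, while the RLIMIT_* values are byte counts, hence the >> PAGE_SHIFT conversions. The standalone sketch below replays the first two checks with made-up numbers (illustrative only, not kernel code; 4K pages assumed):

    /* Standalone sketch, not kernel code: the same unit handling as the
     * first two checks in acct_stack_growth(), with made-up numbers. */
    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumed 4K pages for the example */

    int main(void)
    {
        unsigned long total_vm   = 5000;        /* pages already mapped */
        unsigned long grow       = 4;           /* stack growth, in pages */
        unsigned long size       = 12UL << 20;  /* new stack size, in bytes */
        unsigned long rlim_as    = 64UL << 20;  /* RLIMIT_AS, in bytes */
        unsigned long rlim_stack = 8UL << 20;   /* RLIMIT_STACK, in bytes */

        /* Pages on the left, bytes shifted down to pages on the right. */
        printf("address-space check: %s\n",
               total_vm + grow > (rlim_as >> PAGE_SHIFT) ? "-ENOMEM" : "ok");
        /* Bytes compared with bytes directly. */
        printf("stack-limit check:   %s\n",
               size > rlim_stack ? "-ENOMEM" : "ok");
        return 0;
    }
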
 11.1473 +
 11.1474 +#ifdef CONFIG_STACK_GROWSUP
 11.1475 +/*
 11.1476 + * vma is the first one with address > vma->vm_end.  Have to extend vma.
 11.1477 + */
 11.1478 +int expand_stack(struct vm_area_struct * vma, unsigned long address)
 11.1479 +{
 11.1480 +	int error;
 11.1481 +
 11.1482 +	if (!(vma->vm_flags & VM_GROWSUP))
 11.1483 +		return -EFAULT;
 11.1484 +
 11.1485 +	/*
 11.1486 +	 * We must make sure the anon_vma is allocated
 11.1487 +	 * so that the anon_vma locking is not a noop.
 11.1488 +	 */
 11.1489 +	if (unlikely(anon_vma_prepare(vma)))
 11.1490 +		return -ENOMEM;
 11.1491 +	anon_vma_lock(vma);
 11.1492 +
 11.1493 +	/*
 11.1494 +	 * vma->vm_start/vm_end cannot change under us because the caller
 11.1495 +	 * is required to hold the mmap_sem in read mode.  We need the
 11.1496 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 11.1497 +	 */
 11.1498 +	address += 4 + PAGE_SIZE - 1;
 11.1499 +	address &= PAGE_MASK;
 11.1500 +	error = 0;
 11.1501 +
 11.1502 +	/* Somebody else might have raced and expanded it already */
 11.1503 +	if (address > vma->vm_end) {
 11.1504 +		unsigned long size, grow;
 11.1505 +
 11.1506 +		size = address - vma->vm_start;
 11.1507 +		grow = (address - vma->vm_end) >> PAGE_SHIFT;
 11.1508 +
 11.1509 +		error = acct_stack_growth(vma, size, grow);
 11.1510 +		if (!error)
 11.1511 +			vma->vm_end = address;
 11.1512 +	}
 11.1513 +	anon_vma_unlock(vma);
 11.1514 +	return error;
 11.1515 +}
 11.1516 +
 11.1517 +struct vm_area_struct *
 11.1518 +find_extend_vma(struct mm_struct *mm, unsigned long addr)
 11.1519 +{
 11.1520 +	struct vm_area_struct *vma, *prev;
 11.1521 +
 11.1522 +	addr &= PAGE_MASK;
 11.1523 +	vma = find_vma_prev(mm, addr, &prev);
 11.1524 +	if (vma && (vma->vm_start <= addr))
 11.1525 +		return vma;
 11.1526 +	if (!prev || expand_stack(prev, addr))
 11.1527 +		return NULL;
 11.1528 +	if (prev->vm_flags & VM_LOCKED) {
 11.1529 +		make_pages_present(addr, prev->vm_end);
 11.1530 +	}
 11.1531 +	return prev;
 11.1532 +}
 11.1533 +#else
 11.1534 +/*
 11.1535 + * vma is the first one with address < vma->vm_start.  Have to extend vma.
 11.1536 + */
 11.1537 +int expand_stack(struct vm_area_struct *vma, unsigned long address)
 11.1538 +{
 11.1539 +	int error;
 11.1540 +
 11.1541 +	/*
 11.1542 +	 * We must make sure the anon_vma is allocated
 11.1543 +	 * so that the anon_vma locking is not a noop.
 11.1544 +	 */
 11.1545 +	if (unlikely(anon_vma_prepare(vma)))
 11.1546 +		return -ENOMEM;
 11.1547 +	anon_vma_lock(vma);
 11.1548 +
 11.1549 +	/*
 11.1550 +	 * vma->vm_start/vm_end cannot change under us because the caller
 11.1551 +	 * is required to hold the mmap_sem in read mode.  We need the
 11.1552 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 11.1553 +	 */
 11.1554 +	address &= PAGE_MASK;
 11.1555 +	error = 0;
 11.1556 +
 11.1557 +	/* Somebody else might have raced and expanded it already */
 11.1558 +	if (address < vma->vm_start) {
 11.1559 +		unsigned long size, grow;
 11.1560 +
 11.1561 +		size = vma->vm_end - address;
 11.1562 +		grow = (vma->vm_start - address) >> PAGE_SHIFT;
 11.1563 +
 11.1564 +		error = acct_stack_growth(vma, size, grow);
 11.1565 +		if (!error) {
 11.1566 +			vma->vm_start = address;
 11.1567 +			vma->vm_pgoff -= grow;
 11.1568 +		}
 11.1569 +	}
 11.1570 +	anon_vma_unlock(vma);
 11.1571 +	return error;
 11.1572 +}
 11.1573 +
 11.1574 +struct vm_area_struct *
 11.1575 +find_extend_vma(struct mm_struct * mm, unsigned long addr)
 11.1576 +{
 11.1577 +	struct vm_area_struct * vma;
 11.1578 +	unsigned long start;
 11.1579 +
 11.1580 +	addr &= PAGE_MASK;
 11.1581 +	vma = find_vma(mm,addr);
 11.1582 +	if (!vma)
 11.1583 +		return NULL;
 11.1584 +	if (vma->vm_start <= addr)
 11.1585 +		return vma;
 11.1586 +	if (!(vma->vm_flags & VM_GROWSDOWN))
 11.1587 +		return NULL;
 11.1588 +	start = vma->vm_start;
 11.1589 +	if (expand_stack(vma, addr))
 11.1590 +		return NULL;
 11.1591 +	if (vma->vm_flags & VM_LOCKED) {
 11.1592 +		make_pages_present(addr, start);
 11.1593 +	}
 11.1594 +	return vma;
 11.1595 +}
 11.1596 +#endif
 11.1597 +
 11.1598 +/*
 11.1599 + * Try to free as many page directory entries as we can,
 11.1600 + * without having to work very hard at actually scanning
 11.1601 + * the page tables themselves.
 11.1602 + *
 11.1603 + * Right now we try to free page tables if we have a nice
 11.1604 + * PGDIR-aligned area that got free'd up. We could be more
 11.1605 + * granular if we want to, but this is fast and simple,
 11.1606 + * and covers the bad cases.
 11.1607 + *
 11.1608 + * "prev", if it exists, points to a vma before the one
 11.1609 + * we just free'd - but there's no telling how much before.
 11.1610 + */
 11.1611 +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
 11.1612 +	unsigned long start, unsigned long end)
 11.1613 +{
 11.1614 +	unsigned long first = start & PGDIR_MASK;
 11.1615 +	unsigned long last = end + PGDIR_SIZE - 1;
 11.1616 +	struct mm_struct *mm = tlb->mm;
 11.1617 +
 11.1618 +	if (last > MM_VM_SIZE(mm) || last < end)
 11.1619 +		last = MM_VM_SIZE(mm);
 11.1620 +
 11.1621 +	if (!prev) {
 11.1622 +		prev = mm->mmap;
 11.1623 +		if (!prev)
 11.1624 +			goto no_mmaps;
 11.1625 +		if (prev->vm_end > start) {
 11.1626 +			if (last > prev->vm_start)
 11.1627 +				last = prev->vm_start;
 11.1628 +			goto no_mmaps;
 11.1629 +		}
 11.1630 +	}
 11.1631 +	for (;;) {
 11.1632 +		struct vm_area_struct *next = prev->vm_next;
 11.1633 +
 11.1634 +		if (next) {
 11.1635 +			if (next->vm_start < start) {
 11.1636 +				prev = next;
 11.1637 +				continue;
 11.1638 +			}
 11.1639 +			if (last > next->vm_start)
 11.1640 +				last = next->vm_start;
 11.1641 +		}
 11.1642 +		if (prev->vm_end > first)
 11.1643 +			first = prev->vm_end;
 11.1644 +		break;
 11.1645 +	}
 11.1646 +no_mmaps:
 11.1647 +	if (last < first)	/* for arches with discontiguous pgd indices */
 11.1648 +		return;
 11.1649 +	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
 11.1650 +		first = FIRST_USER_PGD_NR * PGDIR_SIZE;
 11.1651 +	/* No point trying to free anything if we're in the same pte page */
 11.1652 +	if ((first & PMD_MASK) < (last & PMD_MASK)) {
 11.1653 +		clear_page_range(tlb, first, last);
 11.1654 +		flush_tlb_pgtables(mm, first, last);
 11.1655 +	}
 11.1656 +}
 11.1657 +
 11.1658 +/* Normal function to fix up a mapping
 11.1659 + * This function is the default for when an area has no specific
 11.1660 + * function.  This may be used as part of a more specific routine.
 11.1661 + *
 11.1662 + * By the time this function is called, the area struct has been
 11.1663 + * removed from the process mapping list.
 11.1664 + */
 11.1665 +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 11.1666 +{
 11.1667 +	size_t len = area->vm_end - area->vm_start;
 11.1668 +
 11.1669 +	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 11.1670 +	if (area->vm_flags & VM_LOCKED)
 11.1671 +		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 11.1672 +	vm_stat_unaccount(area);
 11.1673 +	area->vm_mm->unmap_area(area);
 11.1674 +	remove_vm_struct(area);
 11.1675 +}
 11.1676 +
 11.1677 +/*
 11.1678 + * Update the VMA and inode share lists.
 11.1679 + *
 11.1680 + * Ok - we have the memory areas we should free on the 'free' list,
 11.1681 + * so release them, and do the vma updates.
 11.1682 + */
 11.1683 +static void unmap_vma_list(struct mm_struct *mm,
 11.1684 +	struct vm_area_struct *mpnt)
 11.1685 +{
 11.1686 +	do {
 11.1687 +		struct vm_area_struct *next = mpnt->vm_next;
 11.1688 +		unmap_vma(mm, mpnt);
 11.1689 +		mpnt = next;
 11.1690 +	} while (mpnt != NULL);
 11.1691 +	validate_mm(mm);
 11.1692 +}
 11.1693 +
 11.1694 +/*
 11.1695 + * Get rid of page table information in the indicated region.
 11.1696 + *
 11.1697 + * Called with the page table lock held.
 11.1698 + */
 11.1699 +static void unmap_region(struct mm_struct *mm,
 11.1700 +	struct vm_area_struct *vma,
 11.1701 +	struct vm_area_struct *prev,
 11.1702 +	unsigned long start,
 11.1703 +	unsigned long end)
 11.1704 +{
 11.1705 +	struct mmu_gather *tlb;
 11.1706 +	unsigned long nr_accounted = 0;
 11.1707 +
 11.1708 +	lru_add_drain();
 11.1709 +	tlb = tlb_gather_mmu(mm, 0);
 11.1710 +	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 11.1711 +	vm_unacct_memory(nr_accounted);
 11.1712 +
 11.1713 +	if (is_hugepage_only_range(start, end - start))
 11.1714 +		hugetlb_free_pgtables(tlb, prev, start, end);
 11.1715 +	else
 11.1716 +		free_pgtables(tlb, prev, start, end);
 11.1717 +	tlb_finish_mmu(tlb, start, end);
 11.1718 +}
 11.1719 +
 11.1720 +/*
 11.1721 + * Create a list of vma's touched by the unmap, removing them from the mm's
 11.1722 + * vma list as we go..
 11.1723 + */
 11.1724 +static void
 11.1725 +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 11.1726 +	struct vm_area_struct *prev, unsigned long end)
 11.1727 +{
 11.1728 +	struct vm_area_struct **insertion_point;
 11.1729 +	struct vm_area_struct *tail_vma = NULL;
 11.1730 +
 11.1731 +	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 11.1732 +	do {
 11.1733 +		rb_erase(&vma->vm_rb, &mm->mm_rb);
 11.1734 +		mm->map_count--;
 11.1735 +		tail_vma = vma;
 11.1736 +		vma = vma->vm_next;
 11.1737 +	} while (vma && vma->vm_start < end);
 11.1738 +	*insertion_point = vma;
 11.1739 +	tail_vma->vm_next = NULL;
 11.1740 +	mm->mmap_cache = NULL;		/* Kill the cache. */
 11.1741 +}
 11.1742 +
 11.1743 +/*
 11.1744 + * Split a vma into two pieces at address 'addr'; a new vma is allocated
 11.1745 + * either for the first part or the tail.
 11.1746 + */
 11.1747 +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 11.1748 +	      unsigned long addr, int new_below)
 11.1749 +{
 11.1750 +	struct mempolicy *pol;
 11.1751 +	struct vm_area_struct *new;
 11.1752 +
 11.1753 +	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
 11.1754 +		return -EINVAL;
 11.1755 +
 11.1756 +	if (mm->map_count >= sysctl_max_map_count)
 11.1757 +		return -ENOMEM;
 11.1758 +
 11.1759 +	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.1760 +	if (!new)
 11.1761 +		return -ENOMEM;
 11.1762 +
 11.1763 +	/* most fields are the same, copy all, and then fixup */
 11.1764 +	*new = *vma;
 11.1765 +
 11.1766 +	if (new_below)
 11.1767 +		new->vm_end = addr;
 11.1768 +	else {
 11.1769 +		new->vm_start = addr;
 11.1770 +		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 11.1771 +	}
 11.1772 +
 11.1773 +	pol = mpol_copy(vma_policy(vma));
 11.1774 +	if (IS_ERR(pol)) {
 11.1775 +		kmem_cache_free(vm_area_cachep, new);
 11.1776 +		return PTR_ERR(pol);
 11.1777 +	}
 11.1778 +	vma_set_policy(new, pol);
 11.1779 +
 11.1780 +	if (new->vm_file)
 11.1781 +		get_file(new->vm_file);
 11.1782 +
 11.1783 +	if (new->vm_ops && new->vm_ops->open)
 11.1784 +		new->vm_ops->open(new);
 11.1785 +
 11.1786 +	if (new_below)
 11.1787 +		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 11.1788 +			((addr - new->vm_start) >> PAGE_SHIFT), new);
 11.1789 +	else
 11.1790 +		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 11.1791 +
 11.1792 +	return 0;
 11.1793 +}
 11.1794 +
 11.1795 +/* Munmap is split into 2 main parts -- this part which finds
 11.1796 + * what needs doing, and the areas themselves, which do the
 11.1797 + * work.  This now handles partial unmappings.
 11.1798 + * Jeremy Fitzhardinge <jeremy@goop.org>
 11.1799 + */
 11.1800 +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 11.1801 +{
 11.1802 +	unsigned long end;
 11.1803 +	struct vm_area_struct *mpnt, *prev, *last;
 11.1804 +
 11.1805 +	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
 11.1806 +		return -EINVAL;
 11.1807 +
 11.1808 +	if ((len = PAGE_ALIGN(len)) == 0)
 11.1809 +		return -EINVAL;
 11.1810 +
 11.1811 +	/* Find the first overlapping VMA */
 11.1812 +	mpnt = find_vma_prev(mm, start, &prev);
 11.1813 +	if (!mpnt)
 11.1814 +		return 0;
 11.1815 +	/* we have  start < mpnt->vm_end  */
 11.1816 +
 11.1817 +	/* if it doesn't overlap, we have nothing.. */
 11.1818 +	end = start + len;
 11.1819 +	if (mpnt->vm_start >= end)
 11.1820 +		return 0;
 11.1821 +
 11.1822 +	/*
 11.1823 +	 * If we need to split any vma, do it now to save pain later.
 11.1824 +	 *
 11.1825 +	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 11.1826 +	 * unmapped vm_area_struct will remain in use: so lower split_vma
 11.1827 +	 * places tmp vma above, and higher split_vma places tmp vma below.
 11.1828 +	 */
 11.1829 +	if (start > mpnt->vm_start) {
 11.1830 +		int error = split_vma(mm, mpnt, start, 0);
 11.1831 +		if (error)
 11.1832 +			return error;
 11.1833 +		prev = mpnt;
 11.1834 +	}
 11.1835 +
 11.1836 +	/* Does it split the last one? */
 11.1837 +	last = find_vma(mm, end);
 11.1838 +	if (last && end > last->vm_start) {
 11.1839 +		int error = split_vma(mm, last, end, 1);
 11.1840 +		if (error)
 11.1841 +			return error;
 11.1842 +	}
 11.1843 +	mpnt = prev? prev->vm_next: mm->mmap;
 11.1844 +
 11.1845 +	/*
 11.1846 +	 * Remove the vma's, and unmap the actual pages
 11.1847 +	 */
 11.1848 +	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
 11.1849 +	spin_lock(&mm->page_table_lock);
 11.1850 +	unmap_region(mm, mpnt, prev, start, end);
 11.1851 +	spin_unlock(&mm->page_table_lock);
 11.1852 +
 11.1853 +	/* Fix up all other VM information */
 11.1854 +	unmap_vma_list(mm, mpnt);
 11.1855 +
 11.1856 +	return 0;
 11.1857 +}
 11.1858 +
 11.1859 +EXPORT_SYMBOL(do_munmap);
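
Since do_munmap() handles partial unmappings by splitting the bordering vmas, punching a hole in the middle of a mapping is an easy way to see it at work from userspace. The sketch below (illustrative only, not part of this changeset) uses mincore(2) to confirm that only the middle page disappears:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        unsigned char vec;
        char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* Unmap only the middle page: do_munmap() splits the vma, leaving
         * [p, p+page) and [p+2*page, p+3*page) in place. */
        if (munmap(p + page, page) != 0) {
            perror("munmap");
            return 1;
        }
        /* mincore() fails with ENOMEM on the hole but not on its neighbours. */
        if (mincore(p + page, page, &vec) != 0)
            printf("middle page: %s (hole, as expected)\n", strerror(errno));
        if (mincore(p, page, &vec) == 0 && mincore(p + 2 * page, page, &vec) == 0)
            printf("first and last pages are still mapped\n");
        return 0;
    }
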
 11.1860 +
 11.1861 +asmlinkage long sys_munmap(unsigned long addr, size_t len)
 11.1862 +{
 11.1863 +	int ret;
 11.1864 +	struct mm_struct *mm = current->mm;
 11.1865 +
 11.1866 +	profile_munmap(addr);
 11.1867 +
 11.1868 +	down_write(&mm->mmap_sem);
 11.1869 +	ret = do_munmap(mm, addr, len);
 11.1870 +	up_write(&mm->mmap_sem);
 11.1871 +	return ret;
 11.1872 +}
 11.1873 +
 11.1874 +static inline void verify_mm_writelocked(struct mm_struct *mm)
 11.1875 +{
 11.1876 +#ifdef CONFIG_DEBUG_KERNEL
 11.1877 +	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
 11.1878 +		WARN_ON(1);
 11.1879 +		up_read(&mm->mmap_sem);
 11.1880 +	}
 11.1881 +#endif
 11.1882 +}
 11.1883 +
 11.1884 +/*
 11.1885 + *  this is really a simplified "do_mmap".  it only handles
 11.1886 + *  anonymous maps.  eventually we may be able to do some
 11.1887 + *  brk-specific accounting here.
 11.1888 + */
 11.1889 +unsigned long do_brk(unsigned long addr, unsigned long len)
 11.1890 +{
 11.1891 +	struct mm_struct * mm = current->mm;
 11.1892 +	struct vm_area_struct * vma, * prev;
 11.1893 +	unsigned long flags;
 11.1894 +	struct rb_node ** rb_link, * rb_parent;
 11.1895 +	pgoff_t pgoff = addr >> PAGE_SHIFT;
 11.1896 +
 11.1897 +	len = PAGE_ALIGN(len);
 11.1898 +	if (!len)
 11.1899 +		return addr;
 11.1900 +
 11.1901 +	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 11.1902 +		return -EINVAL;
 11.1903 +
 11.1904 +	/*
 11.1905 +	 * mlock MCL_FUTURE?
 11.1906 +	 */
 11.1907 +	if (mm->def_flags & VM_LOCKED) {
 11.1908 +		unsigned long locked, lock_limit;
 11.1909 +		locked = mm->locked_vm << PAGE_SHIFT;
 11.1910 +		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 11.1911 +		locked += len;
 11.1912 +		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 11.1913 +			return -EAGAIN;
 11.1914 +	}
 11.1915 +
 11.1916 +	/*
 11.1917 +	 * mm->mmap_sem is required to protect against another thread
 11.1918 +	 * changing the mappings in case we sleep.
 11.1919 +	 */
 11.1920 +	verify_mm_writelocked(mm);
 11.1921 +
 11.1922 +	/*
 11.1923 +	 * Clear old maps.  this also does some error checking for us
 11.1924 +	 */
 11.1925 + munmap_back:
 11.1926 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 11.1927 +	if (vma && vma->vm_start < addr + len) {
 11.1928 +		if (do_munmap(mm, addr, len))
 11.1929 +			return -ENOMEM;
 11.1930 +		goto munmap_back;
 11.1931 +	}
 11.1932 +
 11.1933 +	/* Check against address space limits *after* clearing old maps... */
 11.1934 +	if ((mm->total_vm << PAGE_SHIFT) + len
 11.1935 +	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
 11.1936 +		return -ENOMEM;
 11.1937 +
 11.1938 +	if (mm->map_count > sysctl_max_map_count)
 11.1939 +		return -ENOMEM;
 11.1940 +
 11.1941 +	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 11.1942 +		return -ENOMEM;
 11.1943 +
 11.1944 +	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 11.1945 +
 11.1946 +	/* Can we just expand an old private anonymous mapping? */
 11.1947 +	if (vma_merge(mm, prev, addr, addr + len, flags,
 11.1948 +					NULL, NULL, pgoff, NULL))
 11.1949 +		goto out;
 11.1950 +
 11.1951 +	/*
 11.1952 +	 * create a vma struct for an anonymous mapping
 11.1953 +	 */
 11.1954 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.1955 +	if (!vma) {
 11.1956 +		vm_unacct_memory(len >> PAGE_SHIFT);
 11.1957 +		return -ENOMEM;
 11.1958 +	}
 11.1959 +	memset(vma, 0, sizeof(*vma));
 11.1960 +
 11.1961 +	vma->vm_mm = mm;
 11.1962 +	vma->vm_start = addr;
 11.1963 +	vma->vm_end = addr + len;
 11.1964 +	vma->vm_pgoff = pgoff;
 11.1965 +	vma->vm_flags = flags;
 11.1966 +	vma->vm_page_prot = protection_map[flags & 0x0f];
 11.1967 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 11.1968 +out:
 11.1969 +	mm->total_vm += len >> PAGE_SHIFT;
 11.1970 +	if (flags & VM_LOCKED) {
 11.1971 +		mm->locked_vm += len >> PAGE_SHIFT;
 11.1972 +		make_pages_present(addr, addr + len);
 11.1973 +	}
 11.1974 +	acct_update_integrals();
 11.1975 +	update_mem_hiwater();
 11.1976 +	return addr;
 11.1977 +}
 11.1978 +
 11.1979 +EXPORT_SYMBOL(do_brk);
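
do_brk() is the anonymous-mapping helper behind sys_brk(), so the same path is reached from userspace through brk(2)/sbrk(3). A small illustration, not part of this changeset, that nudges the program break by one page:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        void *before = sbrk(0);   /* current program break */

        /* Growing the break ends up in do_brk(), which either extends the
         * existing anonymous vma or links a new one. */
        if (sbrk(4096) == (void *)-1) {
            perror("sbrk");
            return 1;
        }
        printf("program break moved from %p to %p\n", before, sbrk(0));
        return 0;
    }
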
 11.1980 +
 11.1981 +/* Release all mmaps. */
 11.1982 +void exit_mmap(struct mm_struct *mm)
 11.1983 +{
 11.1984 +	struct mmu_gather *tlb;
 11.1985 +	struct vm_area_struct *vma;
 11.1986 +	unsigned long nr_accounted = 0;
 11.1987 +
 11.1988 +#ifdef arch_exit_mmap
 11.1989 +	arch_exit_mmap(mm);
 11.1990 +#endif
 11.1991 +
 11.1992 +	lru_add_drain();
 11.1993 +
 11.1994 +	spin_lock(&mm->page_table_lock);
 11.1995 +
 11.1996 +	tlb = tlb_gather_mmu(mm, 1);
 11.1997 +	flush_cache_mm(mm);
 11.1998 +	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 11.1999 +	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
 11.2000 +					~0UL, &nr_accounted, NULL);
 11.2001 +	vm_unacct_memory(nr_accounted);
 11.2002 +	BUG_ON(mm->map_count);	/* This is just debugging */
 11.2003 +	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
 11.2004 +	
 11.2005 +	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 11.2006 +
 11.2007 +	vma = mm->mmap;
 11.2008 +	mm->mmap = mm->mmap_cache = NULL;
 11.2009 +	mm->mm_rb = RB_ROOT;
 11.2010 +	mm->rss = 0;
 11.2011 +	mm->total_vm = 0;
 11.2012 +	mm->locked_vm = 0;
 11.2013 +
 11.2014 +	spin_unlock(&mm->page_table_lock);
 11.2015 +
 11.2016 +	/*
 11.2017 +	 * Walk the list again, actually closing and freeing it
 11.2018 +	 * without holding any MM locks.
 11.2019 +	 */
 11.2020 +	while (vma) {
 11.2021 +		struct vm_area_struct *next = vma->vm_next;
 11.2022 +		remove_vm_struct(vma);
 11.2023 +		vma = next;
 11.2024 +	}
 11.2025 +}
 11.2026 +
 11.2027 +/* Insert vm structure into process list sorted by address
 11.2028 + * and into the inode's i_mmap tree.  If vm_file is non-NULL
 11.2029 + * then i_mmap_lock is taken here.
 11.2030 + */
 11.2031 +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 11.2032 +{
 11.2033 +	struct vm_area_struct * __vma, * prev;
 11.2034 +	struct rb_node ** rb_link, * rb_parent;
 11.2035 +
 11.2036 +	/*
 11.2037 +	 * The vm_pgoff of a purely anonymous vma should be irrelevant
 11.2038 +	 * until its first write fault, when page's anon_vma and index
 11.2039 +	 * are set.  But now set the vm_pgoff it will almost certainly
 11.2040 +	 * end up with (unless mremap moves it elsewhere before that
 11.2041 +	 * first wfault), so /proc/pid/maps tells a consistent story.
 11.2042 +	 *
 11.2043 +	 * By setting it to reflect the virtual start address of the
 11.2044 +	 * vma, merges and splits can happen in a seamless way, just
 11.2045 +	 * using the existing file pgoff checks and manipulations.
 11.2046 +	 * Similarly in do_mmap_pgoff and in do_brk.
 11.2047 +	 */
 11.2048 +	if (!vma->vm_file) {
 11.2049 +		BUG_ON(vma->anon_vma);
 11.2050 +		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 11.2051 +	}
 11.2052 +	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
 11.2053 +	if (__vma && __vma->vm_start < vma->vm_end)
 11.2054 +		return -ENOMEM;
 11.2055 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 11.2056 +	return 0;
 11.2057 +}
 11.2058 +
 11.2059 +/*
 11.2060 + * Copy the vma structure to a new location in the same mm,
 11.2061 + * prior to moving page table entries, to effect an mremap move.
 11.2062 + */
 11.2063 +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 11.2064 +	unsigned long addr, unsigned long len, pgoff_t pgoff)
 11.2065 +{
 11.2066 +	struct vm_area_struct *vma = *vmap;
 11.2067 +	unsigned long vma_start = vma->vm_start;
 11.2068 +	struct mm_struct *mm = vma->vm_mm;
 11.2069 +	struct vm_area_struct *new_vma, *prev;
 11.2070 +	struct rb_node **rb_link, *rb_parent;
 11.2071 +	struct mempolicy *pol;
 11.2072 +
 11.2073 +	/*
 11.2074 +	 * If anonymous vma has not yet been faulted, update new pgoff
 11.2075 +	 * to match new location, to increase its chance of merging.
 11.2076 +	 */
 11.2077 +	if (!vma->vm_file && !vma->anon_vma)
 11.2078 +		pgoff = addr >> PAGE_SHIFT;
 11.2079 +
 11.2080 +	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 11.2081 +	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 11.2082 +			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 11.2083 +	if (new_vma) {
 11.2084 +		/*
 11.2085 +		 * Source vma may have been merged into new_vma
 11.2086 +		 */
 11.2087 +		if (vma_start >= new_vma->vm_start &&
 11.2088 +		    vma_start < new_vma->vm_end)
 11.2089 +			*vmap = new_vma;
 11.2090 +	} else {
 11.2091 +		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.2092 +		if (new_vma) {
 11.2093 +			*new_vma = *vma;
 11.2094 +			pol = mpol_copy(vma_policy(vma));
 11.2095 +			if (IS_ERR(pol)) {
 11.2096 +				kmem_cache_free(vm_area_cachep, new_vma);
 11.2097 +				return NULL;
 11.2098 +			}
 11.2099 +			vma_set_policy(new_vma, pol);
 11.2100 +			new_vma->vm_start = addr;
 11.2101 +			new_vma->vm_end = addr + len;
 11.2102 +			new_vma->vm_pgoff = pgoff;
 11.2103 +			if (new_vma->vm_file)
 11.2104 +				get_file(new_vma->vm_file);
 11.2105 +			if (new_vma->vm_ops && new_vma->vm_ops->open)
 11.2106 +				new_vma->vm_ops->open(new_vma);
 11.2107 +			vma_link(mm, new_vma, prev, rb_link, rb_parent);
 11.2108 +		}
 11.2109 +	}
 11.2110 +	return new_vma;
 11.2111 +}
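
As the comment above notes, copy_vma() exists to effect an mremap move: the vma is duplicated at the destination before the page table entries follow. The userspace sketch below (illustrative only, not part of this changeset) forces such a move by turning the page after a one-page mapping into a separate PROT_NONE vma, so the mapping cannot grow in place:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        char *a = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *b;

        if (a == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* Give the second page different protections so it becomes a
         * separate, adjacent vma that blocks in-place growth ... */
        if (mprotect(a + page, page, PROT_NONE) != 0) {
            perror("mprotect");
            return 1;
        }
        /* ... so enlarging the first page makes mremap() relocate it,
         * i.e. the move path that goes through copy_vma() above. */
        b = mremap(a, page, 2 * page, MREMAP_MAYMOVE);
        if (b == MAP_FAILED) {
            perror("mremap");
            return 1;
        }
        printf("one-page mapping %s: %p -> %p\n",
               b == (void *)a ? "grew in place" : "was moved", (void *)a, b);
        return 0;
    }
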
    12.1 --- a/xen/arch/x86/mm.c	Wed Apr 27 10:39:11 2005 +0000
    12.2 +++ b/xen/arch/x86/mm.c	Wed Apr 27 14:42:32 2005 +0000
    12.3 @@ -482,7 +482,7 @@ get_page_from_l2e(
    12.4  {
    12.5      int rc;
    12.6  
    12.7 -    ASSERT( !shadow_mode_enabled(d) );
    12.8 +    ASSERT(!shadow_mode_enabled(d));
    12.9  
   12.10      if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
   12.11          return 1;
   12.12 @@ -641,7 +641,7 @@ static int alloc_l1_table(struct pfn_inf
   12.13      l1_pgentry_t  *pl1e;
   12.14      int            i;
   12.15  
   12.16 -    ASSERT( !shadow_mode_enabled(d) );
   12.17 +    ASSERT(!shadow_mode_enabled(d));
   12.18  
   12.19      pl1e = map_domain_mem(pfn << PAGE_SHIFT);
   12.20  
   12.21 @@ -2670,22 +2670,6 @@ static int ptwr_emulated_update(
   12.22      }
   12.23      unmap_domain_mem(pl1e);
   12.24  
   12.25 -    /* Propagate update to shadow cache. */
   12.26 -    if ( unlikely(shadow_mode_enabled(d)) )
   12.27 -    {
   12.28 -        BUG(); // XXX fix me...
   12.29 -#if 0
   12.30 -        sstat = get_shadow_status(d, page_to_pfn(page));
   12.31 -        if ( sstat & PSH_shadowed )
   12.32 -        {
   12.33 -            sl1e = map_domain_mem(
   12.34 -                ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK));
   12.35 -            l1pte_propagate_from_guest(d, &nl1e, sl1e);
   12.36 -            unmap_domain_mem(sl1e);
   12.37 -        }
   12.38 -#endif
   12.39 -    }
   12.40 -
   12.41      /* Finally, drop the old PTE. */
   12.42      put_page_from_l1e(ol1e, d);
   12.43  
   12.44 @@ -2748,6 +2732,7 @@ int ptwr_do_page_fault(struct domain *d,
   12.45      /* We are looking only for read-only mappings of p.t. pages. */
   12.46      if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
   12.47           ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
   12.48 +         ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
   12.49           (page_get_owner(page) != d) )
   12.50      {
   12.51          return 0;