ia64/xen-unstable

changeset 4673:98d5be103415

bitkeeper revision 1.1388 (426fc416kd_SxU1l3YCeVWTczbT41A)

Merge arcadians.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into arcadians.cl.cam.ac.uk:/local/scratch-2/vh249/xen-unstable.bk
author vh249@arcadians.cl.cam.ac.uk
date Wed Apr 27 16:55:50 2005 +0000 (2005-04-27)
parents 5b4ab00d85d1 6c0dd2c2ca58
children 3000c660f103
files .rootkeys BitKeeper/etc/ignore linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h linux-2.6.11-xen-sparse/mm/mmap.c tools/libxc/xc_domain.c tools/libxc/xc_linux_restore.c tools/libxc/xc_linux_save.c xen/arch/x86/mm.c xen/common/dom0_ops.c xen/common/dom_mem_ops.c
line diff
     1.1 --- a/.rootkeys	Wed Apr 27 16:55:30 2005 +0000
     1.2 +++ b/.rootkeys	Wed Apr 27 16:55:50 2005 +0000
     1.3 @@ -351,6 +351,7 @@ 40f5623aKXkBBxgpLx2NcvkncQ1Yyw linux-2.6
     1.4  40f5623aDMCsWOFO0jktZ4e8sjwvEg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h
     1.5  40f5623arsFXkGdPvIqvFi3yFXGR0Q linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h
     1.6  41811f07Iri9hrvs97t-baxmhOwWDQ linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h
     1.7 +426fa4d7RzvcFMqff_M76HrvRQZHSg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h
     1.8  4120f807GCO0uqsLqdZj9csxR1Wthw linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
     1.9  40f5623adgjZq9nAgCt0IXdWl7udSA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
    1.10  40f5623a54NuG-7qHihGYmw4wWQnMA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/param.h
    1.11 @@ -418,6 +419,7 @@ 419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6
    1.12  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.11-xen-sparse/mkbuildtree
    1.13  42305f54Q6xJ1bXcQJlCQq1m-e2C8g linux-2.6.11-xen-sparse/mm/highmem.c
    1.14  412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.11-xen-sparse/mm/memory.c
    1.15 +426fa4d7ooLYmFcFjJMF_ut4GFVh2Q linux-2.6.11-xen-sparse/mm/mmap.c
    1.16  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.11-xen-sparse/mm/page_alloc.c
    1.17  413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
    1.18  413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
     3.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c	Wed Apr 27 16:55:30 2005 +0000
     3.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c	Wed Apr 27 16:55:50 2005 +0000
     3.3 @@ -100,8 +100,8 @@ int init_new_context(struct task_struct 
     3.4  	struct mm_struct * old_mm;
     3.5  	int retval = 0;
     3.6  
     3.7 +	memset(&mm->context, 0, sizeof(mm->context));
     3.8  	init_MUTEX(&mm->context.sem);
     3.9 -	mm->context.size = 0;
    3.10  	old_mm = current->mm;
    3.11  	if (old_mm && old_mm->context.size > 0) {
    3.12  		down(&old_mm->context.sem);
     4.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Wed Apr 27 16:55:30 2005 +0000
     4.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Wed Apr 27 16:55:50 2005 +0000
     4.3 @@ -211,7 +211,8 @@ unsigned long allocate_empty_lowmem_regi
     4.4          pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
     4.5          pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
     4.6          HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
     4.7 -        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = INVALID_P2M_ENTRY;
     4.8 +        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
     4.9 +            INVALID_P2M_ENTRY;
    4.10      }
    4.11  
    4.12      flush_tlb_all();
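
The hunk above fixes an indexing bug in allocate_empty_lowmem_region(): the old code wrote INVALID_P2M_ENTRY into the same phys_to_machine_mapping slot (that of vstart) on every iteration, so only the first page of the returned region was marked as surrendered to Xen. With the "+ i" each page gets its own p2m slot invalidated. A hedged re-statement of the corrected loop as a standalone helper (the real function's locals and page-table walk sit outside the hunk and are assumed here; nr_pages stands in for the region size, and virt_to_ptep() is the macro added to pgtable.h later in this changeset):

    static void invalidate_region_p2m(unsigned long vstart,
                                      unsigned long nr_pages,
                                      unsigned long *pfn_array)
    {
        unsigned long i;
        pte_t *pte;

        for (i = 0; i < nr_pages; i++) {
            pte = virt_to_ptep((void *)(vstart + i * PAGE_SIZE));
            pfn_array[i] = pte->pte_low >> PAGE_SHIFT;        /* machine frame */
            HYPERVISOR_update_va_mapping(vstart + i * PAGE_SIZE,
                                         __pte_ma(0), 0);     /* unmap page i  */
            /* the fix: index page i's p2m slot, not always slot 0 */
            phys_to_machine_mapping[(__pa(vstart) >> PAGE_SHIFT) + i] =
                INVALID_P2M_ENTRY;
        }
    }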
     5.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c	Wed Apr 27 16:55:30 2005 +0000
     5.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c	Wed Apr 27 16:55:50 2005 +0000
     5.3 @@ -710,18 +710,9 @@ void __init mem_init(void)
     5.4  
     5.5  kmem_cache_t *pgd_cache;
     5.6  kmem_cache_t *pmd_cache;
     5.7 -kmem_cache_t *pte_cache;
     5.8  
     5.9  void __init pgtable_cache_init(void)
    5.10  {
    5.11 -	pte_cache = kmem_cache_create("pte",
    5.12 -				PTRS_PER_PTE*sizeof(pte_t),
    5.13 -				PTRS_PER_PTE*sizeof(pte_t),
    5.14 -				0,
    5.15 -				pte_ctor,
    5.16 -				pte_dtor);
    5.17 -	if (!pte_cache)
    5.18 -		panic("pgtable_cache_init(): Cannot create pte cache");
    5.19  	if (PTRS_PER_PMD > 1) {
    5.20  		pmd_cache = kmem_cache_create("pmd",
    5.21  					PTRS_PER_PMD*sizeof(pmd_t),
     6.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Wed Apr 27 16:55:30 2005 +0000
     6.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Wed Apr 27 16:55:50 2005 +0000
     6.3 @@ -198,59 +198,35 @@ pte_t *pte_alloc_one_kernel(struct mm_st
     6.4  	return pte;
     6.5  }
     6.6  
     6.7 -void pte_ctor(void *pte, kmem_cache_t *cache, unsigned long unused)
     6.8 -{
     6.9 -	struct page *page = virt_to_page(pte);
    6.10 -	SetPageForeign(page, pte_free);
    6.11 -	set_page_count(page, 1);
    6.12 -
    6.13 -	clear_page(pte);
    6.14 -	make_page_readonly(pte);
    6.15 -	xen_pte_pin(__pa(pte));
    6.16 -}
    6.17 -
    6.18 -void pte_dtor(void *pte, kmem_cache_t *cache, unsigned long unused)
    6.19 -{
    6.20 -	struct page *page = virt_to_page(pte);
    6.21 -	ClearPageForeign(page);
    6.22 -
    6.23 -	xen_pte_unpin(__pa(pte));
    6.24 -	make_page_writable(pte);
    6.25 -}
    6.26 -
    6.27  struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
    6.28  {
    6.29 -	pte_t *ptep;
    6.30 -
    6.31 -#ifdef CONFIG_HIGHPTE
    6.32  	struct page *pte;
    6.33  
    6.34 +#ifdef CONFIG_HIGHPTE
    6.35  	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
    6.36 -	if (pte == NULL)
    6.37 -		return pte;
    6.38 -	if (PageHighMem(pte))
    6.39 -		return pte;
    6.40 -	/* not a highmem page -- free page and grab one from the cache */
    6.41 -	__free_page(pte);
    6.42 +#else
    6.43 +	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
    6.44 +	if (pte) {
    6.45 +		SetPageForeign(pte, pte_free);
    6.46 +		set_page_count(pte, 1);
    6.47 +	}
    6.48  #endif
    6.49 -	ptep = kmem_cache_alloc(pte_cache, GFP_KERNEL);
    6.50 -	if (ptep)
    6.51 -		return virt_to_page(ptep);
    6.52 -	return NULL;
    6.53 +
    6.54 +	return pte;
    6.55  }
    6.56  
    6.57  void pte_free(struct page *pte)
    6.58  {
    6.59 +	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
    6.60 +
    6.61 +	if (!pte_write(*virt_to_ptep(va)))
    6.62 +		HYPERVISOR_update_va_mapping(
    6.63 +			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
    6.64 +
    6.65 +	ClearPageForeign(pte);
    6.66  	set_page_count(pte, 1);
    6.67 -#ifdef CONFIG_HIGHPTE
    6.68 -	if (!PageHighMem(pte))
    6.69 -#endif
    6.70 -		kmem_cache_free(pte_cache,
    6.71 -				phys_to_virt(page_to_pseudophys(pte)));
    6.72 -#ifdef CONFIG_HIGHPTE
    6.73 -	else
    6.74 -		__free_page(pte);
    6.75 -#endif
    6.76 +
    6.77 +	__free_page(pte);
    6.78  }
    6.79  
    6.80  void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
    6.81 @@ -305,14 +281,11 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
    6.82  			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    6.83  
    6.84  	if (PTRS_PER_PMD > 1)
    6.85 -		goto out;
    6.86 +		return;
    6.87  
    6.88  	pgd_list_add(pgd);
    6.89  	spin_unlock_irqrestore(&pgd_lock, flags);
    6.90  	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
    6.91 - out:
    6.92 -	make_page_readonly(pgd);
    6.93 -	xen_pgd_pin(__pa(pgd));
    6.94  }
    6.95  
    6.96  /* never called when PTRS_PER_PMD > 1 */
    6.97 @@ -320,9 +293,6 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
    6.98  {
    6.99  	unsigned long flags; /* can be called from interrupt context */
   6.100  
   6.101 -	xen_pgd_unpin(__pa(pgd));
   6.102 -	make_page_writable(pgd);
   6.103 -
   6.104  	if (PTRS_PER_PMD > 1)
   6.105  		return;
   6.106  
   6.107 @@ -357,6 +327,15 @@ out_oom:
   6.108  void pgd_free(pgd_t *pgd)
   6.109  {
   6.110  	int i;
   6.111 +	pte_t *ptep = virt_to_ptep(pgd);
   6.112 +
   6.113 +	if (!pte_write(*ptep)) {
   6.114 +		xen_pgd_unpin(__pa(pgd));
   6.115 +		HYPERVISOR_update_va_mapping(
   6.116 +			(unsigned long)pgd,
   6.117 +			pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
   6.118 +			0);
   6.119 +	}
   6.120  
   6.121  	/* in the PAE case user pgd entries are overwritten before usage */
   6.122  	if (PTRS_PER_PMD > 1)
   6.123 @@ -369,28 +348,19 @@ void pgd_free(pgd_t *pgd)
   6.124  #ifndef CONFIG_XEN_SHADOW_MODE
   6.125  void make_lowmem_page_readonly(void *va)
   6.126  {
   6.127 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   6.128 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   6.129 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   6.130 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   6.131 +	pte_t *pte = virt_to_ptep(va);
   6.132  	set_pte(pte, pte_wrprotect(*pte));
   6.133  }
   6.134  
   6.135  void make_lowmem_page_writable(void *va)
   6.136  {
   6.137 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   6.138 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   6.139 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   6.140 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   6.141 +	pte_t *pte = virt_to_ptep(va);
   6.142  	set_pte(pte, pte_mkwrite(*pte));
   6.143  }
   6.144  
   6.145  void make_page_readonly(void *va)
   6.146  {
   6.147 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   6.148 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   6.149 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   6.150 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   6.151 +	pte_t *pte = virt_to_ptep(va);
   6.152  	set_pte(pte, pte_wrprotect(*pte));
   6.153  	if ( (unsigned long)va >= (unsigned long)high_memory )
   6.154  	{
   6.155 @@ -405,10 +375,7 @@ void make_page_readonly(void *va)
   6.156  
   6.157  void make_page_writable(void *va)
   6.158  {
   6.159 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   6.160 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   6.161 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   6.162 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   6.163 +	pte_t *pte = virt_to_ptep(va);
   6.164  	set_pte(pte, pte_mkwrite(*pte));
   6.165  	if ( (unsigned long)va >= (unsigned long)high_memory )
   6.166  	{
   6.167 @@ -439,3 +406,91 @@ void make_pages_writable(void *va, unsig
   6.168  	}
   6.169  }
   6.170  #endif /* CONFIG_XEN_SHADOW_MODE */
   6.171 +
   6.172 +void mm_pin(struct mm_struct *mm)
   6.173 +{
   6.174 +    pgd_t       *pgd;
   6.175 +    struct page *page;
   6.176 +    int          i;
   6.177 +
   6.178 +    spin_lock(&mm->page_table_lock);
   6.179 +
   6.180 +    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
   6.181 +    {
   6.182 +        if ( *(unsigned long *)pgd == 0 )
   6.183 +            continue;
   6.184 +        page = pmd_page(*(pmd_t *)pgd);
   6.185 +        if ( !PageHighMem(page) )
   6.186 +            HYPERVISOR_update_va_mapping(
   6.187 +                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
   6.188 +                pfn_pte(page_to_pfn(page), PAGE_KERNEL_RO), 0);
   6.189 +    }
   6.190 +
   6.191 +    HYPERVISOR_update_va_mapping(
   6.192 +        (unsigned long)mm->pgd,
   6.193 +        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), 0);
   6.194 +    xen_pgd_pin(__pa(mm->pgd));
   6.195 +
   6.196 +    mm->context.pinned = 1;
   6.197 +
   6.198 +    spin_unlock(&mm->page_table_lock);
   6.199 +}
   6.200 +
   6.201 +void mm_unpin(struct mm_struct *mm)
   6.202 +{
   6.203 +    pgd_t       *pgd;
   6.204 +    struct page *page;
   6.205 +    int          i;
   6.206 +
   6.207 +    spin_lock(&mm->page_table_lock);
   6.208 +
   6.209 +    xen_pgd_unpin(__pa(mm->pgd));
   6.210 +    HYPERVISOR_update_va_mapping(
   6.211 +        (unsigned long)mm->pgd,
   6.212 +        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
   6.213 +
   6.214 +    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
   6.215 +    {
   6.216 +        if ( *(unsigned long *)pgd == 0 )
   6.217 +            continue;
   6.218 +        page = pmd_page(*(pmd_t *)pgd);
   6.219 +        if ( !PageHighMem(page) )
   6.220 +            HYPERVISOR_update_va_mapping(
   6.221 +                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
   6.222 +                pfn_pte(page_to_pfn(page), PAGE_KERNEL), 0);
   6.223 +    }
   6.224 +
   6.225 +    mm->context.pinned = 0;
   6.226 +
   6.227 +    spin_unlock(&mm->page_table_lock);
   6.228 +}
   6.229 +
   6.230 +void _arch_exit_mmap(struct mm_struct *mm)
   6.231 +{
   6.232 +    unsigned int cpu = smp_processor_id();
   6.233 +    struct task_struct *tsk = current;
   6.234 +
   6.235 +    task_lock(tsk);
   6.236 +
   6.237 +    /*
   6.238 +     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
   6.239 +     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
   6.240 +     */
   6.241 +    if ( tsk->active_mm == mm )
   6.242 +    {
   6.243 +        tsk->active_mm = &init_mm;
   6.244 +        atomic_inc(&init_mm.mm_count);
   6.245 +
   6.246 +        cpu_set(cpu, init_mm.cpu_vm_mask);
   6.247 +        load_cr3(swapper_pg_dir);
   6.248 +        cpu_clear(cpu, mm->cpu_vm_mask);
   6.249 +
   6.250 +        atomic_dec(&mm->mm_count);
   6.251 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
   6.252 +    }
   6.253 +
   6.254 +    task_unlock(tsk);
   6.255 +
   6.256 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
   6.257 +        mm_unpin(mm);
   6.258 +}
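
Taken together, the pgtable.c changes above move Xen page-table pinning from allocation time (the removed pte ctor/dtor and the pgd_ctor/pgd_dtor pinning) to first use: pages are handed out writable, mm_pin() write-protects the pgd and the lowmem pte pages it references and issues xen_pgd_pin() the first time the mm is activated, and _arch_exit_mmap() detaches a dying mm from cr3 and unpins it so the bulk teardown in unmap_vmas() runs against ordinary writable page tables with no per-update hypercalls. A minimal usage sketch of the new interface (illustration only, not part of the patch):

    static void lazy_pin_example(struct mm_struct *mm)
    {
        if (!mm->context.pinned)    /* first activation of this mm   */
            mm_pin(mm);             /* write-protect + xen_pgd_pin() */

        /* mm is now a Xen-validated page table and may be loaded into
         * cr3; subsequent PTE updates are routed through the hypervisor. */

        /* On exit, arch_exit_mmap(mm) -> _arch_exit_mmap(mm) switches the
         * CPU to init_mm/swapper_pg_dir and, once the last user is gone,
         * calls mm_unpin() so the tables become plain writable memory
         * again before they are torn down and freed. */
    }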
     7.1 --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c	Wed Apr 27 16:55:30 2005 +0000
     7.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c	Wed Apr 27 16:55:50 2005 +0000
     7.3 @@ -258,7 +258,8 @@ unsigned long allocate_empty_lowmem_regi
     7.4          pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
     7.5          pfn_array[i] = pte->pte >> PAGE_SHIFT;
     7.6          xen_l1_entry_update(pte, 0);
     7.7 -        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = (u32)INVALID_P2M_ENTRY;
     7.8 +        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
     7.9 +            (u32)INVALID_P2M_ENTRY;
    7.10      }
    7.11  
    7.12      /* Flush updates through and flush the TLB. */
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h	Wed Apr 27 16:55:50 2005 +0000
     8.3 @@ -0,0 +1,22 @@
     8.4 +#ifndef __i386_MMU_H
     8.5 +#define __i386_MMU_H
     8.6 +
     8.7 +#include <asm/semaphore.h>
     8.8 +/*
     8.9 + * The i386 doesn't have a mmu context, but
    8.10 + * we put the segment information here.
    8.11 + *
    8.12 + * cpu_vm_mask is used to optimize ldt flushing.
    8.13 + */
    8.14 +typedef struct { 
    8.15 +	int size;
    8.16 +	struct semaphore sem;
    8.17 +	void *ldt;
    8.18 +	unsigned pinned:1;
    8.19 +} mm_context_t;
    8.20 +
    8.21 +/* mm/memory.c:exit_mmap hook */
    8.22 +extern void _arch_exit_mmap(struct mm_struct *mm);
    8.23 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
    8.24 +
    8.25 +#endif
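
This new i386 mmu.h is the stock 2.6.11 header plus two Xen additions: a one-bit pinned flag in mm_context_t (zeroed by the ldt.c memset above, tested by switch_mm() and pmd_populate() below) and an arch_exit_mmap() hook wired to _arch_exit_mmap() from pgtable.c. The hook's intended call site is the generic exit_mmap() path; the call itself is not visible in the portion of the changeset shown here, so the following is only a hedged sketch of the expected shape:

    /* Sketch of the expected caller; the function name is invented. */
    static void exit_mmap_shape(struct mm_struct *mm)
    {
        arch_exit_mmap(mm);   /* Xen: unpin and drop the pgd from cr3 first */
        /* ... then the usual unmap_vmas(), free_pgtables(), etc. ... */
    }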
     9.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Wed Apr 27 16:55:30 2005 +0000
     9.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Wed Apr 27 16:55:50 2005 +0000
     9.3 @@ -41,6 +41,9 @@ static inline void __prepare_arch_switch
     9.4  		: : "r" (0) );
     9.5  }
     9.6  
     9.7 +extern void mm_pin(struct mm_struct *mm);
     9.8 +extern void mm_unpin(struct mm_struct *mm);
     9.9 +
    9.10  static inline void switch_mm(struct mm_struct *prev,
    9.11  			     struct mm_struct *next,
    9.12  			     struct task_struct *tsk)
    9.13 @@ -49,6 +52,9 @@ static inline void switch_mm(struct mm_s
    9.14  	struct mmuext_op _op[2], *op = _op;
    9.15  
    9.16  	if (likely(prev != next)) {
    9.17 +		if (!next->context.pinned)
    9.18 +			mm_pin(next);
    9.19 +
    9.20  		/* stop flush ipis for the previous mm */
    9.21  		cpu_clear(cpu, prev->cpu_vm_mask);
    9.22  #if 0 /* XEN: no lazy tlb */
    9.23 @@ -92,20 +98,10 @@ static inline void switch_mm(struct mm_s
    9.24  #endif
    9.25  }
    9.26  
    9.27 -/*
    9.28 - * XEN: We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
    9.29 - * *much* faster this way, as no tlb flushes means much bigger wrpt batches.
    9.30 - */
    9.31 -#define deactivate_mm(tsk, mm) do {					\
    9.32 -	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0));			\
    9.33 -	if ((mm) && cpu_isset(smp_processor_id(), (mm)->cpu_vm_mask)) {	\
    9.34 -		cpu_clear(smp_processor_id(), (mm)->cpu_vm_mask);	\
    9.35 -		load_cr3(swapper_pg_dir);				\
    9.36 -	}								\
    9.37 -} while (0)
    9.38 +#define deactivate_mm(tsk, mm) \
    9.39 +	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
    9.40  
    9.41 -#define activate_mm(prev, next) do {		\
    9.42 -	switch_mm((prev),(next),NULL);		\
    9.43 -} while (0)
    9.44 +#define activate_mm(prev, next) \
    9.45 +	switch_mm((prev),(next),NULL)
    9.46  
    9.47  #endif
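
With pinning now done lazily in switch_mm(), the Xen-specific deactivate_mm() (which cleared cpu_vm_mask and reloaded swapper_pg_dir) shrinks back to the stock segment-register reset; the cr3 switch away from a dying mm is handled by _arch_exit_mmap() instead, which also updates active_mm so it stays in step with cr3 and lets the tlbflush.h headers below revert to the plain active_mm checks. A hedged sketch of the resulting division of labour (function name invented):

    static void mm_handover_example(struct mm_struct *old_mm,
                                    struct mm_struct *new_mm)
    {
        deactivate_mm(current, old_mm);   /* now just zeroes %fs/%gs */
        activate_mm(old_mm, new_mm);      /* switch_mm(): mm_pin(new_mm) on
                                           * first activation, then load cr3 */
        /* old_mm is unpinned later, when its last user exits and
         * exit_mmap() runs arch_exit_mmap() -> mm_unpin(). */
    }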
    10.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Wed Apr 27 16:55:30 2005 +0000
    10.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Wed Apr 27 16:55:50 2005 +0000
    10.3 @@ -11,10 +11,23 @@
    10.4  #define pmd_populate_kernel(mm, pmd, pte) \
    10.5  		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
    10.6  
    10.7 -#define pmd_populate(mm, pmd, pte) 				\
    10.8 -	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
    10.9 -		((unsigned long long)page_to_pfn(pte) <<	\
   10.10 -			(unsigned long long) PAGE_SHIFT)))
   10.11 +#define pmd_populate(mm, pmd, pte) 					\
   10.12 +do {									\
   10.13 +	if (unlikely((mm)->context.pinned)) {				\
   10.14 +		if (!PageHighMem(pte))					\
   10.15 +			HYPERVISOR_update_va_mapping(			\
   10.16 +			  (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
   10.17 +			  pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0);\
   10.18 +		set_pmd(pmd, __pmd(_PAGE_TABLE +			\
   10.19 +			((unsigned long long)page_to_pfn(pte) <<	\
   10.20 +				(unsigned long long) PAGE_SHIFT)));	\
   10.21 +	} else {							\
   10.22 +		*(pmd) = __pmd(_PAGE_TABLE +				\
   10.23 +			((unsigned long long)page_to_pfn(pte) <<	\
   10.24 +				(unsigned long long) PAGE_SHIFT));	\
   10.25 +	}								\
   10.26 +} while (0)
   10.27 +
   10.28  /*
   10.29   * Allocate and free page tables.
   10.30   */
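
pmd_populate() now branches on whether the mm's page tables are pinned: only a pinned mm is registered with Xen, so only then must the freshly allocated pte page be write-protected before the pmd entry is installed via set_pmd(), which routes the write through the hypervisor; for an unpinned mm the tables are ordinary memory and a direct write suffices. The same logic expressed as a function for readability (a sketch equivalent to the macro above, not a replacement for it):

    static void pmd_populate_logic(struct mm_struct *mm, pmd_t *pmd,
                                   struct page *pte)
    {
        unsigned long long entry = _PAGE_TABLE +
            ((unsigned long long)page_to_pfn(pte) << PAGE_SHIFT);

        if (mm->context.pinned) {
            if (!PageHighMem(pte))
                HYPERVISOR_update_va_mapping(
                    (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
                    pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0);
            set_pmd(pmd, __pmd(entry));   /* validated update via Xen */
        } else {
            *pmd = __pmd(entry);          /* plain memory write */
        }
    }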
    11.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Apr 27 16:55:30 2005 +0000
    11.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Apr 27 16:55:50 2005 +0000
    11.3 @@ -35,12 +35,9 @@ extern unsigned long empty_zero_page[102
    11.4  extern pgd_t swapper_pg_dir[1024];
    11.5  extern kmem_cache_t *pgd_cache;
    11.6  extern kmem_cache_t *pmd_cache;
    11.7 -extern kmem_cache_t *pte_cache;
    11.8  extern spinlock_t pgd_lock;
    11.9  extern struct page *pgd_list;
   11.10  
   11.11 -void pte_ctor(void *, kmem_cache_t *, unsigned long);
   11.12 -void pte_dtor(void *, kmem_cache_t *, unsigned long);
   11.13  void pmd_ctor(void *, kmem_cache_t *, unsigned long);
   11.14  void pgd_ctor(void *, kmem_cache_t *, unsigned long);
   11.15  void pgd_dtor(void *, kmem_cache_t *, unsigned long);
   11.16 @@ -448,12 +445,17 @@ void make_pages_writable(void *va, unsig
   11.17  #define make_pages_writable(_va, _nr)  ((void)0)
   11.18  #endif
   11.19  
   11.20 -#define arbitrary_virt_to_machine(__va)					\
   11.21 +#define virt_to_ptep(__va)						\
   11.22  ({									\
   11.23  	pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));		\
   11.24  	pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));	\
   11.25  	pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));	\
   11.26 -	pte_t *__pte = pte_offset_kernel(__pmd, (unsigned long)(__va));	\
   11.27 +	pte_offset_kernel(__pmd, (unsigned long)(__va));		\
   11.28 +})
   11.29 +
   11.30 +#define arbitrary_virt_to_machine(__va)					\
   11.31 +({									\
   11.32 +	pte_t *__pte = virt_to_ptep(__va);				\
   11.33  	unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK;	\
   11.34  	__pa | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
   11.35  })
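
The four-level walk that used to be open-coded in arbitrary_virt_to_machine() (and in the make_page_readonly()/make_page_writable() family) is factored out into a virt_to_ptep() macro, which the pgtable.c changes above use to check whether a kernel mapping is currently read-only before restoring PAGE_KERNEL. A small usage sketch (the helper name is invented):

    static int is_kernel_va_readonly(void *va)
    {
        pte_t *ptep = virt_to_ptep(va);   /* pte for this lowmem address */
        return !pte_write(*ptep);
    }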
    12.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h	Wed Apr 27 16:55:30 2005 +0000
    12.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h	Wed Apr 27 16:55:50 2005 +0000
    12.3 @@ -40,24 +40,21 @@ extern unsigned long pgkern_mask;
    12.4  
    12.5  static inline void flush_tlb_mm(struct mm_struct *mm)
    12.6  {
    12.7 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    12.8 -	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
    12.9 +	if (mm == current->active_mm)
   12.10  		__flush_tlb();
   12.11  }
   12.12  
   12.13  static inline void flush_tlb_page(struct vm_area_struct *vma,
   12.14  	unsigned long addr)
   12.15  {
   12.16 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   12.17 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   12.18 +	if (vma->vm_mm == current->active_mm)
   12.19  		__flush_tlb_one(addr);
   12.20  }
   12.21  
   12.22  static inline void flush_tlb_range(struct vm_area_struct *vma,
   12.23  	unsigned long start, unsigned long end)
   12.24  {
   12.25 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   12.26 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   12.27 +	if (vma->vm_mm == current->active_mm)
   12.28  		__flush_tlb();
   12.29  }
   12.30  
    13.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h	Wed Apr 27 16:55:30 2005 +0000
    13.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h	Wed Apr 27 16:55:50 2005 +0000
    13.3 @@ -44,24 +44,21 @@ extern unsigned long pgkern_mask;
    13.4  
    13.5  static inline void flush_tlb_mm(struct mm_struct *mm)
    13.6  {
    13.7 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    13.8 -	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
    13.9 +	if (mm == current->active_mm)
   13.10  		__flush_tlb();
   13.11  }
   13.12  
   13.13  static inline void flush_tlb_page(struct vm_area_struct *vma,
   13.14  	unsigned long addr)
   13.15  {
   13.16 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   13.17 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   13.18 +	if (vma->vm_mm == current->active_mm)
   13.19  		__flush_tlb_one(addr);
   13.20  }
   13.21  
   13.22  static inline void flush_tlb_range(struct vm_area_struct *vma,
   13.23  	unsigned long start, unsigned long end)
   13.24  {
   13.25 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   13.26 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   13.27 +	if (vma->vm_mm == current->active_mm)
   13.28  		__flush_tlb();
   13.29  }
   13.30  
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/linux-2.6.11-xen-sparse/mm/mmap.c	Wed Apr 27 16:55:50 2005 +0000
    14.3 @@ -0,0 +1,2108 @@
    14.4 +/*
    14.5 + * mm/mmap.c
    14.6 + *
    14.7 + * Written by obz.
    14.8 + *
    14.9 + * Address space accounting code	<alan@redhat.com>
   14.10 + */
   14.11 +
   14.12 +#include <linux/slab.h>
   14.13 +#include <linux/mm.h>
   14.14 +#include <linux/shm.h>
   14.15 +#include <linux/mman.h>
   14.16 +#include <linux/pagemap.h>
   14.17 +#include <linux/swap.h>
   14.18 +#include <linux/syscalls.h>
   14.19 +#include <linux/init.h>
   14.20 +#include <linux/file.h>
   14.21 +#include <linux/fs.h>
   14.22 +#include <linux/personality.h>
   14.23 +#include <linux/security.h>
   14.24 +#include <linux/hugetlb.h>
   14.25 +#include <linux/profile.h>
   14.26 +#include <linux/module.h>
   14.27 +#include <linux/acct.h>
   14.28 +#include <linux/mount.h>
   14.29 +#include <linux/mempolicy.h>
   14.30 +#include <linux/rmap.h>
   14.31 +
   14.32 +#include <asm/uaccess.h>
   14.33 +#include <asm/cacheflush.h>
   14.34 +#include <asm/tlb.h>
   14.35 +
   14.36 +/*
   14.37 + * WARNING: the debugging will use recursive algorithms so never enable this
   14.38 + * unless you know what you are doing.
   14.39 + */
   14.40 +#undef DEBUG_MM_RB
   14.41 +
   14.42 +/* description of effects of mapping type and prot in current implementation.
   14.43 + * this is due to the limited x86 page protection hardware.  The expected
   14.44 + * behavior is in parens:
   14.45 + *
   14.46 + * map_type	prot
   14.47 + *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
   14.48 + * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   14.49 + *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
   14.50 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   14.51 + *		
   14.52 + * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   14.53 + *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
   14.54 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   14.55 + *
   14.56 + */
   14.57 +pgprot_t protection_map[16] = {
   14.58 +	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
   14.59 +	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
   14.60 +};
   14.61 +
   14.62 +int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
   14.63 +int sysctl_overcommit_ratio = 50;	/* default is 50% */
   14.64 +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
   14.65 +atomic_t vm_committed_space = ATOMIC_INIT(0);
   14.66 +
   14.67 +/*
   14.68 + * Check that a process has enough memory to allocate a new virtual
   14.69 + * mapping. 0 means there is enough memory for the allocation to
   14.70 + * succeed and -ENOMEM implies there is not.
   14.71 + *
   14.72 + * We currently support three overcommit policies, which are set via the
   14.73 + * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
   14.74 + *
   14.75 + * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
   14.76 + * Additional code 2002 Jul 20 by Robert Love.
   14.77 + *
   14.78 + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
   14.79 + *
   14.80 + * Note this is a helper function intended to be used by LSMs which
   14.81 + * wish to use this logic.
   14.82 + */
   14.83 +int __vm_enough_memory(long pages, int cap_sys_admin)
   14.84 +{
   14.85 +	unsigned long free, allowed;
   14.86 +
   14.87 +	vm_acct_memory(pages);
   14.88 +
   14.89 +	/*
   14.90 +	 * Sometimes we want to use more memory than we have
   14.91 +	 */
   14.92 +	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
   14.93 +		return 0;
   14.94 +
   14.95 +	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
   14.96 +		unsigned long n;
   14.97 +
   14.98 +		free = get_page_cache_size();
   14.99 +		free += nr_swap_pages;
  14.100 +
  14.101 +		/*
  14.102 +		 * Any slabs which are created with the
  14.103 +		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
  14.104 +		 * which are reclaimable, under pressure.  The dentry
  14.105 +		 * cache and most inode caches should fall into this
  14.106 +		 */
  14.107 +		free += atomic_read(&slab_reclaim_pages);
  14.108 +
  14.109 +		/*
  14.110 +		 * Leave the last 3% for root
  14.111 +		 */
  14.112 +		if (!cap_sys_admin)
  14.113 +			free -= free / 32;
  14.114 +
  14.115 +		if (free > pages)
  14.116 +			return 0;
  14.117 +
  14.118 +		/*
  14.119 +		 * nr_free_pages() is very expensive on large systems,
  14.120 +		 * only call if we're about to fail.
  14.121 +		 */
  14.122 +		n = nr_free_pages();
  14.123 +		if (!cap_sys_admin)
  14.124 +			n -= n / 32;
  14.125 +		free += n;
  14.126 +
  14.127 +		if (free > pages)
  14.128 +			return 0;
  14.129 +		vm_unacct_memory(pages);
  14.130 +		return -ENOMEM;
  14.131 +	}
  14.132 +
  14.133 +	allowed = (totalram_pages - hugetlb_total_pages())
  14.134 +	       	* sysctl_overcommit_ratio / 100;
  14.135 +	/*
  14.136 +	 * Leave the last 3% for root
  14.137 +	 */
  14.138 +	if (!cap_sys_admin)
  14.139 +		allowed -= allowed / 32;
  14.140 +	allowed += total_swap_pages;
  14.141 +
  14.142 +	/* Don't let a single process grow too big:
  14.143 +	   leave 3% of the size of this process for other processes */
  14.144 +	allowed -= current->mm->total_vm / 32;
  14.145 +
  14.146 +	if (atomic_read(&vm_committed_space) < allowed)
  14.147 +		return 0;
  14.148 +
  14.149 +	vm_unacct_memory(pages);
  14.150 +
  14.151 +	return -ENOMEM;
  14.152 +}
  14.153 +
  14.154 +EXPORT_SYMBOL(sysctl_overcommit_memory);
  14.155 +EXPORT_SYMBOL(sysctl_overcommit_ratio);
  14.156 +EXPORT_SYMBOL(sysctl_max_map_count);
  14.157 +EXPORT_SYMBOL(vm_committed_space);
  14.158 +EXPORT_SYMBOL(__vm_enough_memory);
  14.159 +
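/*
 * Annotation (not part of the file): a worked example of the strict
 * (OVERCOMMIT_NEVER) allowance computed at the end of __vm_enough_memory()
 * above.  With 1 GiB of RAM (262144 4 KiB pages), no hugepages, the default
 * overcommit ratio of 50, 2 GiB of swap (524288 pages), no CAP_SYS_ADMIN
 * and a caller whose total_vm is 100000 pages:
 *
 *   allowed  = 262144 * 50 / 100              = 131072
 *   allowed -= allowed / 32   (~3% for root)  = 126976
 *   allowed += 524288         (swap)          = 651264
 *   allowed -= 100000 / 32    (this process)  = 648139
 *
 * so, with the request already added by vm_acct_memory(), the mapping is
 * refused once vm_committed_space reaches 648139 pages (roughly 2.5 GiB).
 */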
  14.160 +/*
  14.161 + * Requires inode->i_mapping->i_mmap_lock
  14.162 + */
  14.163 +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
  14.164 +		struct file *file, struct address_space *mapping)
  14.165 +{
  14.166 +	if (vma->vm_flags & VM_DENYWRITE)
  14.167 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
  14.168 +	if (vma->vm_flags & VM_SHARED)
  14.169 +		mapping->i_mmap_writable--;
  14.170 +
  14.171 +	flush_dcache_mmap_lock(mapping);
  14.172 +	if (unlikely(vma->vm_flags & VM_NONLINEAR))
  14.173 +		list_del_init(&vma->shared.vm_set.list);
  14.174 +	else
  14.175 +		vma_prio_tree_remove(vma, &mapping->i_mmap);
  14.176 +	flush_dcache_mmap_unlock(mapping);
  14.177 +}
  14.178 +
  14.179 +/*
  14.180 + * Remove one vm structure and free it.
  14.181 + */
  14.182 +static void remove_vm_struct(struct vm_area_struct *vma)
  14.183 +{
  14.184 +	struct file *file = vma->vm_file;
  14.185 +
  14.186 +	might_sleep();
  14.187 +	if (file) {
  14.188 +		struct address_space *mapping = file->f_mapping;
  14.189 +		spin_lock(&mapping->i_mmap_lock);
  14.190 +		__remove_shared_vm_struct(vma, file, mapping);
  14.191 +		spin_unlock(&mapping->i_mmap_lock);
  14.192 +	}
  14.193 +	if (vma->vm_ops && vma->vm_ops->close)
  14.194 +		vma->vm_ops->close(vma);
  14.195 +	if (file)
  14.196 +		fput(file);
  14.197 +	anon_vma_unlink(vma);
  14.198 +	mpol_free(vma_policy(vma));
  14.199 +	kmem_cache_free(vm_area_cachep, vma);
  14.200 +}
  14.201 +
  14.202 +/*
  14.203 + *  sys_brk() for the most part doesn't need the global kernel
  14.204 + *  lock, except when an application is doing something nasty
  14.205 + *  like trying to un-brk an area that has already been mapped
  14.206 + *  to a regular file.  in this case, the unmapping will need
  14.207 + *  to invoke file system routines that need the global lock.
  14.208 + */
  14.209 +asmlinkage unsigned long sys_brk(unsigned long brk)
  14.210 +{
  14.211 +	unsigned long rlim, retval;
  14.212 +	unsigned long newbrk, oldbrk;
  14.213 +	struct mm_struct *mm = current->mm;
  14.214 +
  14.215 +	down_write(&mm->mmap_sem);
  14.216 +
  14.217 +	if (brk < mm->end_code)
  14.218 +		goto out;
  14.219 +	newbrk = PAGE_ALIGN(brk);
  14.220 +	oldbrk = PAGE_ALIGN(mm->brk);
  14.221 +	if (oldbrk == newbrk)
  14.222 +		goto set_brk;
  14.223 +
  14.224 +	/* Always allow shrinking brk. */
  14.225 +	if (brk <= mm->brk) {
  14.226 +		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
  14.227 +			goto set_brk;
  14.228 +		goto out;
  14.229 +	}
  14.230 +
  14.231 +	/* Check against rlimit.. */
  14.232 +	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
  14.233 +	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
  14.234 +		goto out;
  14.235 +
  14.236 +	/* Check against existing mmap mappings. */
  14.237 +	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
  14.238 +		goto out;
  14.239 +
  14.240 +	/* Ok, looks good - let it rip. */
  14.241 +	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
  14.242 +		goto out;
  14.243 +set_brk:
  14.244 +	mm->brk = brk;
  14.245 +out:
  14.246 +	retval = mm->brk;
  14.247 +	up_write(&mm->mmap_sem);
  14.248 +	return retval;
  14.249 +}
  14.250 +
  14.251 +#ifdef DEBUG_MM_RB
  14.252 +static int browse_rb(struct rb_root *root)
  14.253 +{
  14.254 +	int i = 0, j;
  14.255 +	struct rb_node *nd, *pn = NULL;
  14.256 +	unsigned long prev = 0, pend = 0;
  14.257 +
  14.258 +	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
  14.259 +		struct vm_area_struct *vma;
  14.260 +		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
  14.261 +		if (vma->vm_start < prev)
  14.262 +			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
  14.263 +		if (vma->vm_start < pend)
  14.264 +			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
  14.265 +		if (vma->vm_start > vma->vm_end)
  14.266 +			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
  14.267 +		i++;
  14.268 +		pn = nd;
  14.269 +	}
  14.270 +	j = 0;
  14.271 +	for (nd = pn; nd; nd = rb_prev(nd)) {
  14.272 +		j++;
  14.273 +	}
  14.274 +	if (i != j)
  14.275 +		printk("backwards %d, forwards %d\n", j, i), i = 0;
  14.276 +	return i;
  14.277 +}
  14.278 +
  14.279 +void validate_mm(struct mm_struct *mm)
  14.280 +{
  14.281 +	int bug = 0;
  14.282 +	int i = 0;
  14.283 +	struct vm_area_struct *tmp = mm->mmap;
  14.284 +	while (tmp) {
  14.285 +		tmp = tmp->vm_next;
  14.286 +		i++;
  14.287 +	}
  14.288 +	if (i != mm->map_count)
  14.289 +		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
  14.290 +	i = browse_rb(&mm->mm_rb);
  14.291 +	if (i != mm->map_count)
  14.292 +		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
  14.293 +	if (bug)
  14.294 +		BUG();
  14.295 +}
  14.296 +#else
  14.297 +#define validate_mm(mm) do { } while (0)
  14.298 +#endif
  14.299 +
  14.300 +static struct vm_area_struct *
  14.301 +find_vma_prepare(struct mm_struct *mm, unsigned long addr,
  14.302 +		struct vm_area_struct **pprev, struct rb_node ***rb_link,
  14.303 +		struct rb_node ** rb_parent)
  14.304 +{
  14.305 +	struct vm_area_struct * vma;
  14.306 +	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
  14.307 +
  14.308 +	__rb_link = &mm->mm_rb.rb_node;
  14.309 +	rb_prev = __rb_parent = NULL;
  14.310 +	vma = NULL;
  14.311 +
  14.312 +	while (*__rb_link) {
  14.313 +		struct vm_area_struct *vma_tmp;
  14.314 +
  14.315 +		__rb_parent = *__rb_link;
  14.316 +		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
  14.317 +
  14.318 +		if (vma_tmp->vm_end > addr) {
  14.319 +			vma = vma_tmp;
  14.320 +			if (vma_tmp->vm_start <= addr)
  14.321 +				return vma;
  14.322 +			__rb_link = &__rb_parent->rb_left;
  14.323 +		} else {
  14.324 +			rb_prev = __rb_parent;
  14.325 +			__rb_link = &__rb_parent->rb_right;
  14.326 +		}
  14.327 +	}
  14.328 +
  14.329 +	*pprev = NULL;
  14.330 +	if (rb_prev)
  14.331 +		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  14.332 +	*rb_link = __rb_link;
  14.333 +	*rb_parent = __rb_parent;
  14.334 +	return vma;
  14.335 +}
  14.336 +
  14.337 +static inline void
  14.338 +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
  14.339 +		struct vm_area_struct *prev, struct rb_node *rb_parent)
  14.340 +{
  14.341 +	if (prev) {
  14.342 +		vma->vm_next = prev->vm_next;
  14.343 +		prev->vm_next = vma;
  14.344 +	} else {
  14.345 +		mm->mmap = vma;
  14.346 +		if (rb_parent)
  14.347 +			vma->vm_next = rb_entry(rb_parent,
  14.348 +					struct vm_area_struct, vm_rb);
  14.349 +		else
  14.350 +			vma->vm_next = NULL;
  14.351 +	}
  14.352 +}
  14.353 +
  14.354 +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
  14.355 +		struct rb_node **rb_link, struct rb_node *rb_parent)
  14.356 +{
  14.357 +	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
  14.358 +	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  14.359 +}
  14.360 +
  14.361 +static inline void __vma_link_file(struct vm_area_struct *vma)
  14.362 +{
  14.363 +	struct file * file;
  14.364 +
  14.365 +	file = vma->vm_file;
  14.366 +	if (file) {
  14.367 +		struct address_space *mapping = file->f_mapping;
  14.368 +
  14.369 +		if (vma->vm_flags & VM_DENYWRITE)
  14.370 +			atomic_dec(&file->f_dentry->d_inode->i_writecount);
  14.371 +		if (vma->vm_flags & VM_SHARED)
  14.372 +			mapping->i_mmap_writable++;
  14.373 +
  14.374 +		flush_dcache_mmap_lock(mapping);
  14.375 +		if (unlikely(vma->vm_flags & VM_NONLINEAR))
  14.376 +			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
  14.377 +		else
  14.378 +			vma_prio_tree_insert(vma, &mapping->i_mmap);
  14.379 +		flush_dcache_mmap_unlock(mapping);
  14.380 +	}
  14.381 +}
  14.382 +
  14.383 +static void
  14.384 +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  14.385 +	struct vm_area_struct *prev, struct rb_node **rb_link,
  14.386 +	struct rb_node *rb_parent)
  14.387 +{
  14.388 +	__vma_link_list(mm, vma, prev, rb_parent);
  14.389 +	__vma_link_rb(mm, vma, rb_link, rb_parent);
  14.390 +	__anon_vma_link(vma);
  14.391 +}
  14.392 +
  14.393 +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  14.394 +			struct vm_area_struct *prev, struct rb_node **rb_link,
  14.395 +			struct rb_node *rb_parent)
  14.396 +{
  14.397 +	struct address_space *mapping = NULL;
  14.398 +
  14.399 +	if (vma->vm_file)
  14.400 +		mapping = vma->vm_file->f_mapping;
  14.401 +
  14.402 +	if (mapping) {
  14.403 +		spin_lock(&mapping->i_mmap_lock);
  14.404 +		vma->vm_truncate_count = mapping->truncate_count;
  14.405 +	}
  14.406 +	anon_vma_lock(vma);
  14.407 +
  14.408 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  14.409 +	__vma_link_file(vma);
  14.410 +
  14.411 +	anon_vma_unlock(vma);
  14.412 +	if (mapping)
  14.413 +		spin_unlock(&mapping->i_mmap_lock);
  14.414 +
  14.415 +	mm->map_count++;
  14.416 +	validate_mm(mm);
  14.417 +}
  14.418 +
  14.419 +/*
  14.420 + * Helper for vma_adjust in the split_vma insert case:
  14.421 + * insert vm structure into list and rbtree and anon_vma,
  14.422 + * but it has already been inserted into prio_tree earlier.
  14.423 + */
  14.424 +static void
  14.425 +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  14.426 +{
  14.427 +	struct vm_area_struct * __vma, * prev;
  14.428 +	struct rb_node ** rb_link, * rb_parent;
  14.429 +
  14.430 +	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
  14.431 +	if (__vma && __vma->vm_start < vma->vm_end)
  14.432 +		BUG();
  14.433 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  14.434 +	mm->map_count++;
  14.435 +}
  14.436 +
  14.437 +static inline void
  14.438 +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  14.439 +		struct vm_area_struct *prev)
  14.440 +{
  14.441 +	prev->vm_next = vma->vm_next;
  14.442 +	rb_erase(&vma->vm_rb, &mm->mm_rb);
  14.443 +	if (mm->mmap_cache == vma)
  14.444 +		mm->mmap_cache = prev;
  14.445 +}
  14.446 +
  14.447 +/*
  14.448 + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  14.449 + * is already present in an i_mmap tree without adjusting the tree.
  14.450 + * The following helper function should be used when such adjustments
  14.451 + * are necessary.  The "insert" vma (if any) is to be inserted
  14.452 + * before we drop the necessary locks.
  14.453 + */
  14.454 +void vma_adjust(struct vm_area_struct *vma, unsigned long start,
  14.455 +	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
  14.456 +{
  14.457 +	struct mm_struct *mm = vma->vm_mm;
  14.458 +	struct vm_area_struct *next = vma->vm_next;
  14.459 +	struct vm_area_struct *importer = NULL;
  14.460 +	struct address_space *mapping = NULL;
  14.461 +	struct prio_tree_root *root = NULL;
  14.462 +	struct file *file = vma->vm_file;
  14.463 +	struct anon_vma *anon_vma = NULL;
  14.464 +	long adjust_next = 0;
  14.465 +	int remove_next = 0;
  14.466 +
  14.467 +	if (next && !insert) {
  14.468 +		if (end >= next->vm_end) {
  14.469 +			/*
  14.470 +			 * vma expands, overlapping all the next, and
  14.471 +			 * perhaps the one after too (mprotect case 6).
  14.472 +			 */
  14.473 +again:			remove_next = 1 + (end > next->vm_end);
  14.474 +			end = next->vm_end;
  14.475 +			anon_vma = next->anon_vma;
  14.476 +			importer = vma;
  14.477 +		} else if (end > next->vm_start) {
  14.478 +			/*
  14.479 +			 * vma expands, overlapping part of the next:
  14.480 +			 * mprotect case 5 shifting the boundary up.
  14.481 +			 */
  14.482 +			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
  14.483 +			anon_vma = next->anon_vma;
  14.484 +			importer = vma;
  14.485 +		} else if (end < vma->vm_end) {
  14.486 +			/*
  14.487 +			 * vma shrinks, and !insert tells it's not
  14.488 +			 * split_vma inserting another: so it must be
  14.489 +			 * mprotect case 4 shifting the boundary down.
  14.490 +			 */
  14.491 +			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
  14.492 +			anon_vma = next->anon_vma;
  14.493 +			importer = next;
  14.494 +		}
  14.495 +	}
  14.496 +
  14.497 +	if (file) {
  14.498 +		mapping = file->f_mapping;
  14.499 +		if (!(vma->vm_flags & VM_NONLINEAR))
  14.500 +			root = &mapping->i_mmap;
  14.501 +		spin_lock(&mapping->i_mmap_lock);
  14.502 +		if (importer &&
  14.503 +		    vma->vm_truncate_count != next->vm_truncate_count) {
  14.504 +			/*
  14.505 +			 * unmap_mapping_range might be in progress:
  14.506 +			 * ensure that the expanding vma is rescanned.
  14.507 +			 */
  14.508 +			importer->vm_truncate_count = 0;
  14.509 +		}
  14.510 +		if (insert) {
  14.511 +			insert->vm_truncate_count = vma->vm_truncate_count;
  14.512 +			/*
  14.513 +			 * Put into prio_tree now, so instantiated pages
  14.514 +			 * are visible to arm/parisc __flush_dcache_page
  14.515 +			 * throughout; but we cannot insert into address
  14.516 +			 * space until vma start or end is updated.
  14.517 +			 */
  14.518 +			__vma_link_file(insert);
  14.519 +		}
  14.520 +	}
  14.521 +
  14.522 +	/*
  14.523 +	 * When changing only vma->vm_end, we don't really need
  14.524 +	 * anon_vma lock: but is that case worth optimizing out?
  14.525 +	 */
  14.526 +	if (vma->anon_vma)
  14.527 +		anon_vma = vma->anon_vma;
  14.528 +	if (anon_vma) {
  14.529 +		spin_lock(&anon_vma->lock);
  14.530 +		/*
  14.531 +		 * Easily overlooked: when mprotect shifts the boundary,
  14.532 +		 * make sure the expanding vma has anon_vma set if the
  14.533 +		 * shrinking vma had, to cover any anon pages imported.
  14.534 +		 */
  14.535 +		if (importer && !importer->anon_vma) {
  14.536 +			importer->anon_vma = anon_vma;
  14.537 +			__anon_vma_link(importer);
  14.538 +		}
  14.539 +	}
  14.540 +
  14.541 +	if (root) {
  14.542 +		flush_dcache_mmap_lock(mapping);
  14.543 +		vma_prio_tree_remove(vma, root);
  14.544 +		if (adjust_next)
  14.545 +			vma_prio_tree_remove(next, root);
  14.546 +	}
  14.547 +
  14.548 +	vma->vm_start = start;
  14.549 +	vma->vm_end = end;
  14.550 +	vma->vm_pgoff = pgoff;
  14.551 +	if (adjust_next) {
  14.552 +		next->vm_start += adjust_next << PAGE_SHIFT;
  14.553 +		next->vm_pgoff += adjust_next;
  14.554 +	}
  14.555 +
  14.556 +	if (root) {
  14.557 +		if (adjust_next)
  14.558 +			vma_prio_tree_insert(next, root);
  14.559 +		vma_prio_tree_insert(vma, root);
  14.560 +		flush_dcache_mmap_unlock(mapping);
  14.561 +	}
  14.562 +
  14.563 +	if (remove_next) {
  14.564 +		/*
  14.565 +		 * vma_merge has merged next into vma, and needs
  14.566 +		 * us to remove next before dropping the locks.
  14.567 +		 */
  14.568 +		__vma_unlink(mm, next, vma);
  14.569 +		if (file)
  14.570 +			__remove_shared_vm_struct(next, file, mapping);
  14.571 +		if (next->anon_vma)
  14.572 +			__anon_vma_merge(vma, next);
  14.573 +	} else if (insert) {
  14.574 +		/*
  14.575 +		 * split_vma has split insert from vma, and needs
  14.576 +		 * us to insert it before dropping the locks
  14.577 +		 * (it may either follow vma or precede it).
  14.578 +		 */
  14.579 +		__insert_vm_struct(mm, insert);
  14.580 +	}
  14.581 +
  14.582 +	if (anon_vma)
  14.583 +		spin_unlock(&anon_vma->lock);
  14.584 +	if (mapping)
  14.585 +		spin_unlock(&mapping->i_mmap_lock);
  14.586 +
  14.587 +	if (remove_next) {
  14.588 +		if (file)
  14.589 +			fput(file);
  14.590 +		mm->map_count--;
  14.591 +		mpol_free(vma_policy(next));
  14.592 +		kmem_cache_free(vm_area_cachep, next);
  14.593 +		/*
  14.594 +		 * In mprotect's case 6 (see comments on vma_merge),
  14.595 +		 * we must remove another next too. It would clutter
  14.596 +		 * up the code too much to do both in one go.
  14.597 +		 */
  14.598 +		if (remove_next == 2) {
  14.599 +			next = vma->vm_next;
  14.600 +			goto again;
  14.601 +		}
  14.602 +	}
  14.603 +
  14.604 +	validate_mm(mm);
  14.605 +}
  14.606 +
  14.607 +/*
  14.608 + * If the vma has a ->close operation then the driver probably needs to release
  14.609 + * per-vma resources, so we don't attempt to merge those.
  14.610 + */
  14.611 +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
  14.612 +
  14.613 +static inline int is_mergeable_vma(struct vm_area_struct *vma,
  14.614 +			struct file *file, unsigned long vm_flags)
  14.615 +{
  14.616 +	if (vma->vm_flags != vm_flags)
  14.617 +		return 0;
  14.618 +	if (vma->vm_file != file)
  14.619 +		return 0;
  14.620 +	if (vma->vm_ops && vma->vm_ops->close)
  14.621 +		return 0;
  14.622 +	return 1;
  14.623 +}
  14.624 +
  14.625 +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  14.626 +					struct anon_vma *anon_vma2)
  14.627 +{
  14.628 +	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
  14.629 +}
  14.630 +
  14.631 +/*
  14.632 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  14.633 + * in front of (at a lower virtual address and file offset than) the vma.
  14.634 + *
  14.635 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  14.636 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  14.637 + *
  14.638 + * We don't check here for the merged mmap wrapping around the end of pagecache
  14.639 + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
  14.640 + * wrap, nor mmaps which cover the final page at index -1UL.
  14.641 + */
  14.642 +static int
  14.643 +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  14.644 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  14.645 +{
  14.646 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  14.647 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  14.648 +		if (vma->vm_pgoff == vm_pgoff)
  14.649 +			return 1;
  14.650 +	}
  14.651 +	return 0;
  14.652 +}
  14.653 +
  14.654 +/*
  14.655 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  14.656 + * beyond (at a higher virtual address and file offset than) the vma.
  14.657 + *
  14.658 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  14.659 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  14.660 + */
  14.661 +static int
  14.662 +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  14.663 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  14.664 +{
  14.665 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  14.666 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  14.667 +		pgoff_t vm_pglen;
  14.668 +		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
  14.669 +		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
  14.670 +			return 1;
  14.671 +	}
  14.672 +	return 0;
  14.673 +}
  14.674 +
  14.675 +/*
  14.676 + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
  14.677 + * whether that can be merged with its predecessor or its successor.
  14.678 + * Or both (it neatly fills a hole).
  14.679 + *
  14.680 + * In most cases - when called for mmap, brk or mremap - [addr,end) is
  14.681 + * certain not to be mapped by the time vma_merge is called; but when
  14.682 + * called for mprotect, it is certain to be already mapped (either at
  14.683 + * an offset within prev, or at the start of next), and the flags of
  14.684 + * this area are about to be changed to vm_flags - and the no-change
  14.685 + * case has already been eliminated.
  14.686 + *
  14.687 + * The following mprotect cases have to be considered, where AAAA is
  14.688 + * the area passed down from mprotect_fixup, never extending beyond one
  14.689 + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
  14.690 + *
  14.691 + *     AAAA             AAAA                AAAA          AAAA
  14.692 + *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
  14.693 + *    cannot merge    might become    might become    might become
  14.694 + *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  14.695 + *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
  14.696 + *    mremap move:                                    PPPPNNNNNNNN 8
  14.697 + *        AAAA
  14.698 + *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  14.699 + *    might become    case 1 below    case 2 below    case 3 below
  14.700 + *
  14.701 + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  14.702 + * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  14.703 + */
  14.704 +struct vm_area_struct *vma_merge(struct mm_struct *mm,
  14.705 +			struct vm_area_struct *prev, unsigned long addr,
  14.706 +			unsigned long end, unsigned long vm_flags,
  14.707 +		     	struct anon_vma *anon_vma, struct file *file,
  14.708 +			pgoff_t pgoff, struct mempolicy *policy)
  14.709 +{
  14.710 +	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
  14.711 +	struct vm_area_struct *area, *next;
  14.712 +
  14.713 +	/*
  14.714 +	 * We later require that vma->vm_flags == vm_flags,
  14.715 +	 * so this tests vma->vm_flags & VM_SPECIAL, too.
  14.716 +	 */
  14.717 +	if (vm_flags & VM_SPECIAL)
  14.718 +		return NULL;
  14.719 +
  14.720 +	if (prev)
  14.721 +		next = prev->vm_next;
  14.722 +	else
  14.723 +		next = mm->mmap;
  14.724 +	area = next;
  14.725 +	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
  14.726 +		next = next->vm_next;
  14.727 +
  14.728 +	/*
  14.729 +	 * Can it merge with the predecessor?
  14.730 +	 */
  14.731 +	if (prev && prev->vm_end == addr &&
  14.732 +  			mpol_equal(vma_policy(prev), policy) &&
  14.733 +			can_vma_merge_after(prev, vm_flags,
  14.734 +						anon_vma, file, pgoff)) {
  14.735 +		/*
  14.736 +		 * OK, it can.  Can we now merge in the successor as well?
  14.737 +		 */
  14.738 +		if (next && end == next->vm_start &&
  14.739 +				mpol_equal(policy, vma_policy(next)) &&
  14.740 +				can_vma_merge_before(next, vm_flags,
  14.741 +					anon_vma, file, pgoff+pglen) &&
  14.742 +				is_mergeable_anon_vma(prev->anon_vma,
  14.743 +						      next->anon_vma)) {
  14.744 +							/* cases 1, 6 */
  14.745 +			vma_adjust(prev, prev->vm_start,
  14.746 +				next->vm_end, prev->vm_pgoff, NULL);
  14.747 +		} else					/* cases 2, 5, 7 */
  14.748 +			vma_adjust(prev, prev->vm_start,
  14.749 +				end, prev->vm_pgoff, NULL);
  14.750 +		return prev;
  14.751 +	}
  14.752 +
  14.753 +	/*
  14.754 +	 * Can this new request be merged in front of next?
  14.755 +	 */
  14.756 +	if (next && end == next->vm_start &&
  14.757 + 			mpol_equal(policy, vma_policy(next)) &&
  14.758 +			can_vma_merge_before(next, vm_flags,
  14.759 +					anon_vma, file, pgoff+pglen)) {
  14.760 +		if (prev && addr < prev->vm_end)	/* case 4 */
  14.761 +			vma_adjust(prev, prev->vm_start,
  14.762 +				addr, prev->vm_pgoff, NULL);
  14.763 +		else					/* cases 3, 8 */
  14.764 +			vma_adjust(area, addr, next->vm_end,
  14.765 +				next->vm_pgoff - pglen, NULL);
  14.766 +		return area;
  14.767 +	}
  14.768 +
  14.769 +	return NULL;
  14.770 +}
  14.771 +
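/*
 * Annotation (not part of the file): a concrete instance of case 1 from the
 * diagram above.  prev covers [0x1000, 0x2000), next covers [0x3000, 0x4000),
 * and a new mapping is about to fill the hole AAAA = [0x2000, 0x3000) with
 * identical vm_flags, the same file (or both anonymous), compatible pgoff and
 * compatible anon_vmas.  can_vma_merge_after(prev, ...) and
 * can_vma_merge_before(next, ...) both succeed, so vma_merge() calls
 *
 *   vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL);
 *
 * which extends prev to [0x1000, 0x4000) and, via remove_next == 1, unlinks
 * and frees next, leaving one vma where three ranges would otherwise sit.
 */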
  14.772 +/*
  14.773 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  14.774 + * neighbouring vmas for a suitable anon_vma, before it goes off
  14.775 + * to allocate a new anon_vma.  It checks because a repetitive
  14.776 + * sequence of mprotects and faults may otherwise lead to distinct
  14.777 + * anon_vmas being allocated, preventing vma merge in subsequent
  14.778 + * mprotect.
  14.779 + */
  14.780 +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
  14.781 +{
  14.782 +	struct vm_area_struct *near;
  14.783 +	unsigned long vm_flags;
  14.784 +
  14.785 +	near = vma->vm_next;
  14.786 +	if (!near)
  14.787 +		goto try_prev;
  14.788 +
  14.789 +	/*
  14.790 +	 * Since only mprotect tries to remerge vmas, match flags
  14.791 +	 * which might be mprotected into each other later on.
  14.792 +	 * Neither mlock nor madvise tries to remerge at present,
  14.793 +	 * so leave their flags as obstructing a merge.
  14.794 +	 */
  14.795 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  14.796 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  14.797 +
  14.798 +	if (near->anon_vma && vma->vm_end == near->vm_start &&
  14.799 + 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
  14.800 +			can_vma_merge_before(near, vm_flags,
  14.801 +				NULL, vma->vm_file, vma->vm_pgoff +
  14.802 +				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
  14.803 +		return near->anon_vma;
  14.804 +try_prev:
  14.805 +	/*
  14.806 +	 * It is potentially slow to have to call find_vma_prev here.
  14.807 +	 * But it's only on the first write fault on the vma, not
  14.808 +	 * every time, and we could devise a way to avoid it later
  14.809 +	 * (e.g. stash info in next's anon_vma_node when assigning
  14.810 +	 * an anon_vma, or when trying vma_merge).  Another time.
  14.811 +	 */
  14.812 +	if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
  14.813 +		BUG();
  14.814 +	if (!near)
  14.815 +		goto none;
  14.816 +
  14.817 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  14.818 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  14.819 +
  14.820 +	if (near->anon_vma && near->vm_end == vma->vm_start &&
  14.821 +  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
  14.822 +			can_vma_merge_after(near, vm_flags,
  14.823 +				NULL, vma->vm_file, vma->vm_pgoff))
  14.824 +		return near->anon_vma;
  14.825 +none:
  14.826 +	/*
  14.827 +	 * There's no absolute need to look only at touching neighbours:
  14.828 +	 * we could search further afield for "compatible" anon_vmas.
  14.829 +	 * But it would probably just be a waste of time searching,
  14.830 +	 * or lead to too many vmas hanging off the same anon_vma.
  14.831 +	 * We're trying to allow mprotect remerging later on,
  14.832 +	 * not trying to minimize memory used for anon_vmas.
  14.833 +	 */
  14.834 +	return NULL;
  14.835 +}
  14.836 +
  14.837 +#ifdef CONFIG_PROC_FS
  14.838 +void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
  14.839 +						struct file *file, long pages)
  14.840 +{
  14.841 +	const unsigned long stack_flags
  14.842 +		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
  14.843 +
  14.844 +#ifdef CONFIG_HUGETLB
  14.845 +	if (flags & VM_HUGETLB) {
  14.846 +		if (!(flags & VM_DONTCOPY))
  14.847 +			mm->shared_vm += pages;
  14.848 +		return;
  14.849 +	}
  14.850 +#endif /* CONFIG_HUGETLB */
  14.851 +
  14.852 +	if (file) {
  14.853 +		mm->shared_vm += pages;
  14.854 +		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
  14.855 +			mm->exec_vm += pages;
  14.856 +	} else if (flags & stack_flags)
  14.857 +		mm->stack_vm += pages;
  14.858 +	if (flags & (VM_RESERVED|VM_IO))
  14.859 +		mm->reserved_vm += pages;
  14.860 +}
  14.861 +#endif /* CONFIG_PROC_FS */
  14.862 +
  14.863 +/*
  14.864 + * The caller must hold down_write(current->mm->mmap_sem).
  14.865 + */
  14.866 +
  14.867 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
  14.868 +			unsigned long len, unsigned long prot,
  14.869 +			unsigned long flags, unsigned long pgoff)
  14.870 +{
  14.871 +	struct mm_struct * mm = current->mm;
  14.872 +	struct vm_area_struct * vma, * prev;
  14.873 +	struct inode *inode;
  14.874 +	unsigned int vm_flags;
  14.875 +	int correct_wcount = 0;
  14.876 +	int error;
  14.877 +	struct rb_node ** rb_link, * rb_parent;
  14.878 +	int accountable = 1;
  14.879 +	unsigned long charged = 0;
  14.880 +
  14.881 +	if (file) {
  14.882 +		if (is_file_hugepages(file))
  14.883 +			accountable = 0;
  14.884 +
  14.885 +		if (!file->f_op || !file->f_op->mmap)
  14.886 +			return -ENODEV;
  14.887 +
  14.888 +		if ((prot & PROT_EXEC) &&
  14.889 +		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
  14.890 +			return -EPERM;
  14.891 +	}
  14.892 +	/*
  14.893 +	 * Does the application expect PROT_READ to imply PROT_EXEC?
  14.894 +	 *
  14.895 +	 * (the exception is when the underlying filesystem is noexec
  14.896 +	 *  mounted, in which case we don't add PROT_EXEC.)
  14.897 +	 */
  14.898 +	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
  14.899 +		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
  14.900 +			prot |= PROT_EXEC;
  14.901 +
  14.902 +	if (!len)
  14.903 +		return addr;
  14.904 +
  14.905 +	/* Careful about overflows.. */
  14.906 +	len = PAGE_ALIGN(len);
  14.907 +	if (!len || len > TASK_SIZE)
  14.908 +		return -EINVAL;
  14.909 +
  14.910 +	/* offset overflow? */
  14.911 +	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  14.912 +		return -EINVAL;
  14.913 +
  14.914 +	/* Too many mappings? */
  14.915 +	if (mm->map_count > sysctl_max_map_count)
  14.916 +		return -ENOMEM;
  14.917 +
  14.918 +	/* Obtain the address to map to. We verify (or select) it and ensure
  14.919 +	 * that it represents a valid section of the address space.
  14.920 +	 */
  14.921 +	addr = get_unmapped_area(file, addr, len, pgoff, flags);
  14.922 +	if (addr & ~PAGE_MASK)
  14.923 +		return addr;
  14.924 +
  14.925 +	/* Do simple checking here so the lower-level routines won't have
  14.926 +	 * to. We assume access permissions have been handled by the open
  14.927 +	 * of the memory object, so we don't do any here.
  14.928 +	 */
  14.929 +	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
  14.930 +			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  14.931 +
  14.932 +	if (flags & MAP_LOCKED) {
  14.933 +		if (!can_do_mlock())
  14.934 +			return -EPERM;
  14.935 +		vm_flags |= VM_LOCKED;
  14.936 +	}
  14.937 +	/* mlock MCL_FUTURE? */
  14.938 +	if (vm_flags & VM_LOCKED) {
  14.939 +		unsigned long locked, lock_limit;
  14.940 +		locked = mm->locked_vm << PAGE_SHIFT;
  14.941 +		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
  14.942 +		locked += len;
  14.943 +		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
  14.944 +			return -EAGAIN;
  14.945 +	}
  14.946 +
  14.947 +	inode = file ? file->f_dentry->d_inode : NULL;
  14.948 +
  14.949 +	if (file) {
  14.950 +		switch (flags & MAP_TYPE) {
  14.951 +		case MAP_SHARED:
  14.952 +			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
  14.953 +				return -EACCES;
  14.954 +
  14.955 +			/*
  14.956 +			 * Make sure we don't allow writing to an append-only
  14.957 +			 * file..
  14.958 +			 */
  14.959 +			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  14.960 +				return -EACCES;
  14.961 +
  14.962 +			/*
  14.963 +			 * Make sure there are no mandatory locks on the file.
  14.964 +			 */
  14.965 +			if (locks_verify_locked(inode))
  14.966 +				return -EAGAIN;
  14.967 +
  14.968 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  14.969 +			if (!(file->f_mode & FMODE_WRITE))
  14.970 +				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  14.971 +
  14.972 +			/* fall through */
  14.973 +		case MAP_PRIVATE:
  14.974 +			if (!(file->f_mode & FMODE_READ))
  14.975 +				return -EACCES;
  14.976 +			break;
  14.977 +
  14.978 +		default:
  14.979 +			return -EINVAL;
  14.980 +		}
  14.981 +	} else {
  14.982 +		switch (flags & MAP_TYPE) {
  14.983 +		case MAP_SHARED:
  14.984 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  14.985 +			break;
  14.986 +		case MAP_PRIVATE:
  14.987 +			/*
  14.988 +			 * Set pgoff according to addr for anon_vma.
  14.989 +			 */
  14.990 +			pgoff = addr >> PAGE_SHIFT;
  14.991 +			break;
  14.992 +		default:
  14.993 +			return -EINVAL;
  14.994 +		}
  14.995 +	}
  14.996 +
  14.997 +	error = security_file_mmap(file, prot, flags);
  14.998 +	if (error)
  14.999 +		return error;
 14.1000 +		
 14.1001 +	/* Clear old maps */
 14.1002 +	error = -ENOMEM;
 14.1003 +munmap_back:
 14.1004 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 14.1005 +	if (vma && vma->vm_start < addr + len) {
 14.1006 +		if (do_munmap(mm, addr, len))
 14.1007 +			return -ENOMEM;
 14.1008 +		goto munmap_back;
 14.1009 +	}
 14.1010 +
 14.1011 +	/* Check against address space limit. */
 14.1012 +	if ((mm->total_vm << PAGE_SHIFT) + len
 14.1013 +	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
 14.1014 +		return -ENOMEM;
 14.1015 +
 14.1016 +	if (accountable && (!(flags & MAP_NORESERVE) ||
 14.1017 +			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
 14.1018 +		if (vm_flags & VM_SHARED) {
 14.1019 +			/* Check memory availability in shmem_file_setup? */
 14.1020 +			vm_flags |= VM_ACCOUNT;
 14.1021 +		} else if (vm_flags & VM_WRITE) {
 14.1022 +			/*
 14.1023 +			 * Private writable mapping: check memory availability
 14.1024 +			 */
 14.1025 +			charged = len >> PAGE_SHIFT;
 14.1026 +			if (security_vm_enough_memory(charged))
 14.1027 +				return -ENOMEM;
 14.1028 +			vm_flags |= VM_ACCOUNT;
 14.1029 +		}
 14.1030 +	}
 14.1031 +
 14.1032 +	/*
 14.1033 +	 * Can we just expand an old private anonymous mapping?
 14.1034 +	 * The VM_SHARED test is necessary because shmem_zero_setup
 14.1035 +	 * will create the file object for a shared anonymous map below.
 14.1036 +	 */
 14.1037 +	if (!file && !(vm_flags & VM_SHARED) &&
 14.1038 +	    vma_merge(mm, prev, addr, addr + len, vm_flags,
 14.1039 +					NULL, NULL, pgoff, NULL))
 14.1040 +		goto out;
 14.1041 +
 14.1042 +	/*
 14.1043 +	 * Determine the object being mapped and call the appropriate
 14.1044 +	 * specific mapper. The address has already been validated, but
 14.1045 +	 * not unmapped, and the old maps have been removed from the list.
 14.1046 +	 */
 14.1047 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 14.1048 +	if (!vma) {
 14.1049 +		error = -ENOMEM;
 14.1050 +		goto unacct_error;
 14.1051 +	}
 14.1052 +	memset(vma, 0, sizeof(*vma));
 14.1053 +
 14.1054 +	vma->vm_mm = mm;
 14.1055 +	vma->vm_start = addr;
 14.1056 +	vma->vm_end = addr + len;
 14.1057 +	vma->vm_flags = vm_flags;
 14.1058 +	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
 14.1059 +	vma->vm_pgoff = pgoff;
 14.1060 +
 14.1061 +	if (file) {
 14.1062 +		error = -EINVAL;
 14.1063 +		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
 14.1064 +			goto free_vma;
 14.1065 +		if (vm_flags & VM_DENYWRITE) {
 14.1066 +			error = deny_write_access(file);
 14.1067 +			if (error)
 14.1068 +				goto free_vma;
 14.1069 +			correct_wcount = 1;
 14.1070 +		}
 14.1071 +		vma->vm_file = file;
 14.1072 +		get_file(file);
 14.1073 +		error = file->f_op->mmap(file, vma);
 14.1074 +		if (error)
 14.1075 +			goto unmap_and_free_vma;
 14.1076 +	} else if (vm_flags & VM_SHARED) {
 14.1077 +		error = shmem_zero_setup(vma);
 14.1078 +		if (error)
 14.1079 +			goto free_vma;
 14.1080 +	}
 14.1081 +
 14.1082 +	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 14.1083 +	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 14.1084 +	 * that memory reservation must be checked; but that reservation
 14.1085 +	 * belongs to the shared memory object, not to the vma: so clear it now.
 14.1086 +	 */
 14.1087 +	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
 14.1088 +		vma->vm_flags &= ~VM_ACCOUNT;
 14.1089 +
 14.1090 +	/* Can addr have changed??
 14.1091 +	 *
 14.1092 +	 * Answer: Yes, several device drivers can do it in their
 14.1093 +	 *         f_op->mmap method. -DaveM
 14.1094 +	 */
 14.1095 +	addr = vma->vm_start;
 14.1096 +	pgoff = vma->vm_pgoff;
 14.1097 +	vm_flags = vma->vm_flags;
 14.1098 +
 14.1099 +	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 14.1100 +			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 14.1101 +		file = vma->vm_file;
 14.1102 +		vma_link(mm, vma, prev, rb_link, rb_parent);
 14.1103 +		if (correct_wcount)
 14.1104 +			atomic_inc(&inode->i_writecount);
 14.1105 +	} else {
 14.1106 +		if (file) {
 14.1107 +			if (correct_wcount)
 14.1108 +				atomic_inc(&inode->i_writecount);
 14.1109 +			fput(file);
 14.1110 +		}
 14.1111 +		mpol_free(vma_policy(vma));
 14.1112 +		kmem_cache_free(vm_area_cachep, vma);
 14.1113 +	}
 14.1114 +out:	
 14.1115 +	mm->total_vm += len >> PAGE_SHIFT;
 14.1116 +	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 14.1117 +	if (vm_flags & VM_LOCKED) {
 14.1118 +		mm->locked_vm += len >> PAGE_SHIFT;
 14.1119 +		make_pages_present(addr, addr + len);
 14.1120 +	}
 14.1121 +	if (flags & MAP_POPULATE) {
 14.1122 +		up_write(&mm->mmap_sem);
 14.1123 +		sys_remap_file_pages(addr, len, 0,
 14.1124 +					pgoff, flags & MAP_NONBLOCK);
 14.1125 +		down_write(&mm->mmap_sem);
 14.1126 +	}
 14.1127 +	acct_update_integrals();
 14.1128 +	update_mem_hiwater();
 14.1129 +	return addr;
 14.1130 +
 14.1131 +unmap_and_free_vma:
 14.1132 +	if (correct_wcount)
 14.1133 +		atomic_inc(&inode->i_writecount);
 14.1134 +	vma->vm_file = NULL;
 14.1135 +	fput(file);
 14.1136 +
 14.1137 +	/* Undo any partial mapping done by a device driver. */
 14.1138 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
 14.1139 +free_vma:
 14.1140 +	kmem_cache_free(vm_area_cachep, vma);
 14.1141 +unacct_error:
 14.1142 +	if (charged)
 14.1143 +		vm_unacct_memory(charged);
 14.1144 +	return error;
 14.1145 +}
 14.1146 +
 14.1147 +EXPORT_SYMBOL(do_mmap_pgoff);
 14.1148 +
 14.1149 +/* Get an address range which is currently unmapped.
 14.1150 + * For shmat() with addr=0.
 14.1151 + *
 14.1152 + * Ugly calling convention alert:
 14.1153 + * Return value with the low bits set means error value,
 14.1154 + * i.e.
 14.1155 + *	if (ret & ~PAGE_MASK)
 14.1156 + *		error = ret;
 14.1157 + *
 14.1158 + * This function "knows" that -ENOMEM has the bits set.
 14.1159 + */
 14.1160 +#ifndef HAVE_ARCH_UNMAPPED_AREA
 14.1161 +unsigned long
 14.1162 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
 14.1163 +		unsigned long len, unsigned long pgoff, unsigned long flags)
 14.1164 +{
 14.1165 +	struct mm_struct *mm = current->mm;
 14.1166 +	struct vm_area_struct *vma;
 14.1167 +	unsigned long start_addr;
 14.1168 +
 14.1169 +	if (len > TASK_SIZE)
 14.1170 +		return -ENOMEM;
 14.1171 +
 14.1172 +	if (addr) {
 14.1173 +		addr = PAGE_ALIGN(addr);
 14.1174 +		vma = find_vma(mm, addr);
 14.1175 +		if (TASK_SIZE - len >= addr &&
 14.1176 +		    (!vma || addr + len <= vma->vm_start))
 14.1177 +			return addr;
 14.1178 +	}
 14.1179 +	start_addr = addr = mm->free_area_cache;
 14.1180 +
 14.1181 +full_search:
 14.1182 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 14.1183 +		/* At this point:  (!vma || addr < vma->vm_end). */
 14.1184 +		if (TASK_SIZE - len < addr) {
 14.1185 +			/*
 14.1186 +			 * Start a new search - just in case we missed
 14.1187 +			 * some holes.
 14.1188 +			 */
 14.1189 +			if (start_addr != TASK_UNMAPPED_BASE) {
 14.1190 +				start_addr = addr = TASK_UNMAPPED_BASE;
 14.1191 +				goto full_search;
 14.1192 +			}
 14.1193 +			return -ENOMEM;
 14.1194 +		}
 14.1195 +		if (!vma || addr + len <= vma->vm_start) {
 14.1196 +			/*
 14.1197 +			 * Remember the place where we stopped the search:
 14.1198 +			 */
 14.1199 +			mm->free_area_cache = addr + len;
 14.1200 +			return addr;
 14.1201 +		}
 14.1202 +		addr = vma->vm_end;
 14.1203 +	}
 14.1204 +}
 14.1205 +#endif	
 14.1206 +
 14.1207 +void arch_unmap_area(struct vm_area_struct *area)
 14.1208 +{
 14.1209 +	/*
 14.1210 +	 * Is this a new hole at the lowest possible address?
 14.1211 +	 */
 14.1212 +	if (area->vm_start >= TASK_UNMAPPED_BASE &&
 14.1213 +			area->vm_start < area->vm_mm->free_area_cache)
 14.1214 +		area->vm_mm->free_area_cache = area->vm_start;
 14.1215 +}
 14.1216 +
 14.1217 +/*
 14.1218 + * This mmap-allocator allocates new areas top-down from below the
 14.1219 + * stack's low limit (the base):
 14.1220 + */
 14.1221 +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 14.1222 +unsigned long
 14.1223 +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 14.1224 +			  const unsigned long len, const unsigned long pgoff,
 14.1225 +			  const unsigned long flags)
 14.1226 +{
 14.1227 +	struct vm_area_struct *vma, *prev_vma;
 14.1228 +	struct mm_struct *mm = current->mm;
 14.1229 +	unsigned long base = mm->mmap_base, addr = addr0;
 14.1230 +	int first_time = 1;
 14.1231 +
 14.1232 +	/* requested length too big for entire address space */
 14.1233 +	if (len > TASK_SIZE)
 14.1234 +		return -ENOMEM;
 14.1235 +
 14.1236 +	/* don't allow allocations above the current base */
 14.1237 +	if (mm->free_area_cache > base)
 14.1238 +		mm->free_area_cache = base;
 14.1239 +
 14.1240 +	/* requesting a specific address */
 14.1241 +	if (addr) {
 14.1242 +		addr = PAGE_ALIGN(addr);
 14.1243 +		vma = find_vma(mm, addr);
 14.1244 +		if (TASK_SIZE - len >= addr &&
 14.1245 +				(!vma || addr + len <= vma->vm_start))
 14.1246 +			return addr;
 14.1247 +	}
 14.1248 +
 14.1249 +try_again:
 14.1250 +	/* make sure it can fit in the remaining address space */
 14.1251 +	if (mm->free_area_cache < len)
 14.1252 +		goto fail;
 14.1253 +
 14.1254 +	/* either no address requested or can't fit in the requested address hole */
 14.1255 +	addr = (mm->free_area_cache - len) & PAGE_MASK;
 14.1256 +	do {
 14.1257 +		/*
 14.1258 +		 * Lookup failure means no vma is above this address,
 14.1259 +		 * i.e. return with success:
 14.1260 +		 */
 14.1261 + 	 	if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
 14.1262 +			return addr;
 14.1263 +
 14.1264 +		/*
 14.1265 +		 * new region fits between prev_vma->vm_end and
 14.1266 +		 * vma->vm_start, use it:
 14.1267 +		 */
 14.1268 +		if (addr+len <= vma->vm_start &&
 14.1269 +				(!prev_vma || (addr >= prev_vma->vm_end)))
 14.1270 +			/* remember the address as a hint for next time */
 14.1271 +			return (mm->free_area_cache = addr);
 14.1272 +		else
 14.1273 +			/* pull free_area_cache down to the first hole */
 14.1274 +			if (mm->free_area_cache == vma->vm_end)
 14.1275 +				mm->free_area_cache = vma->vm_start;
 14.1276 +
 14.1277 +		/* try just below the current vma->vm_start */
 14.1278 +		addr = vma->vm_start-len;
 14.1279 +	} while (len <= vma->vm_start);
 14.1280 +
 14.1281 +fail:
 14.1282 +	/*
 14.1283 +	 * if the hint left us with no space for the requested
 14.1284 +	 * mapping, then try again:
 14.1285 +	 */
 14.1286 +	if (first_time) {
 14.1287 +		mm->free_area_cache = base;
 14.1288 +		first_time = 0;
 14.1289 +		goto try_again;
 14.1290 +	}
 14.1291 +	/*
 14.1292 +	 * A failed mmap() very likely causes application failure,
 14.1293 +	 * so fall back to the bottom-up function here. This scenario
 14.1294 +	 * can happen with large stack limits and large mmap()
 14.1295 +	 * allocations.
 14.1296 +	 */
 14.1297 +	mm->free_area_cache = TASK_UNMAPPED_BASE;
 14.1298 +	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 14.1299 +	/*
 14.1300 +	 * Restore the topdown base:
 14.1301 +	 */
 14.1302 +	mm->free_area_cache = base;
 14.1303 +
 14.1304 +	return addr;
 14.1305 +}
 14.1306 +#endif
 14.1307 +
 14.1308 +void arch_unmap_area_topdown(struct vm_area_struct *area)
 14.1309 +{
 14.1310 +	/*
 14.1311 +	 * Is this a new hole at the highest possible address?
 14.1312 +	 */
 14.1313 +	if (area->vm_end > area->vm_mm->free_area_cache)
 14.1314 +		area->vm_mm->free_area_cache = area->vm_end;
 14.1315 +}
 14.1316 +
 14.1317 +unsigned long
 14.1318 +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 14.1319 +		unsigned long pgoff, unsigned long flags)
 14.1320 +{
 14.1321 +	if (flags & MAP_FIXED) {
 14.1322 +		unsigned long ret;
 14.1323 +
 14.1324 +		if (addr > TASK_SIZE - len)
 14.1325 +			return -ENOMEM;
 14.1326 +		if (addr & ~PAGE_MASK)
 14.1327 +			return -EINVAL;
 14.1328 +		if (file && is_file_hugepages(file))  {
 14.1329 +			/*
 14.1330 +			 * Check if the given range is hugepage aligned, and
 14.1331 +			 * can be made suitable for hugepages.
 14.1332 +			 */
 14.1333 +			ret = prepare_hugepage_range(addr, len);
 14.1334 +		} else {
 14.1335 +			/*
 14.1336 +			 * Ensure that a normal request is not falling in a
 14.1337 +			 * reserved hugepage range.  For some archs like IA-64,
 14.1338 +			 * there is a separate region for hugepages.
 14.1339 +			 */
 14.1340 +			ret = is_hugepage_only_range(addr, len);
 14.1341 +		}
 14.1342 +		if (ret)
 14.1343 +			return -EINVAL;
 14.1344 +		return addr;
 14.1345 +	}
 14.1346 +
 14.1347 +	if (file && file->f_op && file->f_op->get_unmapped_area)
 14.1348 +		return file->f_op->get_unmapped_area(file, addr, len,
 14.1349 +						pgoff, flags);
 14.1350 +
 14.1351 +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
 14.1352 +}
 14.1353 +
 14.1354 +EXPORT_SYMBOL(get_unmapped_area);
 14.1355 +
 14.1356 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 14.1357 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 14.1358 +{
 14.1359 +	struct vm_area_struct *vma = NULL;
 14.1360 +
 14.1361 +	if (mm) {
 14.1362 +		/* Check the cache first. */
 14.1363 +		/* (Cache hit rate is typically around 35%.) */
 14.1364 +		vma = mm->mmap_cache;
 14.1365 +		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 14.1366 +			struct rb_node * rb_node;
 14.1367 +
 14.1368 +			rb_node = mm->mm_rb.rb_node;
 14.1369 +			vma = NULL;
 14.1370 +
 14.1371 +			while (rb_node) {
 14.1372 +				struct vm_area_struct * vma_tmp;
 14.1373 +
 14.1374 +				vma_tmp = rb_entry(rb_node,
 14.1375 +						struct vm_area_struct, vm_rb);
 14.1376 +
 14.1377 +				if (vma_tmp->vm_end > addr) {
 14.1378 +					vma = vma_tmp;
 14.1379 +					if (vma_tmp->vm_start <= addr)
 14.1380 +						break;
 14.1381 +					rb_node = rb_node->rb_left;
 14.1382 +				} else
 14.1383 +					rb_node = rb_node->rb_right;
 14.1384 +			}
 14.1385 +			if (vma)
 14.1386 +				mm->mmap_cache = vma;
 14.1387 +		}
 14.1388 +	}
 14.1389 +	return vma;
 14.1390 +}
 14.1391 +
 14.1392 +EXPORT_SYMBOL(find_vma);
 14.1393 +
 14.1394 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 14.1395 +struct vm_area_struct *
 14.1396 +find_vma_prev(struct mm_struct *mm, unsigned long addr,
 14.1397 +			struct vm_area_struct **pprev)
 14.1398 +{
 14.1399 +	struct vm_area_struct *vma = NULL, *prev = NULL;
 14.1400 +	struct rb_node * rb_node;
 14.1401 +	if (!mm)
 14.1402 +		goto out;
 14.1403 +
 14.1404 +	/* Guard against addr being lower than the first VMA */
 14.1405 +	vma = mm->mmap;
 14.1406 +
 14.1407 +	/* Go through the RB tree quickly. */
 14.1408 +	rb_node = mm->mm_rb.rb_node;
 14.1409 +
 14.1410 +	while (rb_node) {
 14.1411 +		struct vm_area_struct *vma_tmp;
 14.1412 +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 14.1413 +
 14.1414 +		if (addr < vma_tmp->vm_end) {
 14.1415 +			rb_node = rb_node->rb_left;
 14.1416 +		} else {
 14.1417 +			prev = vma_tmp;
 14.1418 +			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
 14.1419 +				break;
 14.1420 +			rb_node = rb_node->rb_right;
 14.1421 +		}
 14.1422 +	}
 14.1423 +
 14.1424 +out:
 14.1425 +	*pprev = prev;
 14.1426 +	return prev ? prev->vm_next : vma;
 14.1427 +}
 14.1428 +
 14.1429 +/*
 14.1430 + * Verify that the stack growth is acceptable and
 14.1431 + * update accounting. This is shared with both the
 14.1432 + * grow-up and grow-down cases.
 14.1433 + */
 14.1434 +static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
 14.1435 +{
 14.1436 +	struct mm_struct *mm = vma->vm_mm;
 14.1437 +	struct rlimit *rlim = current->signal->rlim;
 14.1438 +
 14.1439 +	/* address space limit tests */
 14.1440 +	if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT)
 14.1441 +		return -ENOMEM;
 14.1442 +
 14.1443 +	/* Stack limit test */
 14.1444 +	if (size > rlim[RLIMIT_STACK].rlim_cur)
 14.1445 +		return -ENOMEM;
 14.1446 +
 14.1447 +	/* mlock limit tests */
 14.1448 +	if (vma->vm_flags & VM_LOCKED) {
 14.1449 +		unsigned long locked;
 14.1450 +		unsigned long limit;
 14.1451 +		locked = mm->locked_vm + grow;
 14.1452 +		limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
 14.1453 +		if (locked > limit && !capable(CAP_IPC_LOCK))
 14.1454 +			return -ENOMEM;
 14.1455 +	}
 14.1456 +
 14.1457 +	/*
 14.1458 +	 * Overcommit..  This must be the final test, as it will
 14.1459 +	 * update security statistics.
 14.1460 +	 */
 14.1461 +	if (security_vm_enough_memory(grow))
 14.1462 +		return -ENOMEM;
 14.1463 +
 14.1464 +	/* Ok, everything looks good - let it rip */
 14.1465 +	mm->total_vm += grow;
 14.1466 +	if (vma->vm_flags & VM_LOCKED)
 14.1467 +		mm->locked_vm += grow;
 14.1468 +	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 14.1469 +	acct_update_integrals();
 14.1470 +	update_mem_hiwater();
 14.1471 +	return 0;
 14.1472 +}
 14.1473 +
 14.1474 +#ifdef CONFIG_STACK_GROWSUP
 14.1475 +/*
 14.1476 + * vma is the first one with address > vma->vm_end.  Have to extend vma.
 14.1477 + */
 14.1478 +int expand_stack(struct vm_area_struct * vma, unsigned long address)
 14.1479 +{
 14.1480 +	int error;
 14.1481 +
 14.1482 +	if (!(vma->vm_flags & VM_GROWSUP))
 14.1483 +		return -EFAULT;
 14.1484 +
 14.1485 +	/*
 14.1486 +	 * We must make sure the anon_vma is allocated
 14.1487 +	 * so that the anon_vma locking is not a noop.
 14.1488 +	 */
 14.1489 +	if (unlikely(anon_vma_prepare(vma)))
 14.1490 +		return -ENOMEM;
 14.1491 +	anon_vma_lock(vma);
 14.1492 +
 14.1493 +	/*
 14.1494 +	 * vma->vm_start/vm_end cannot change under us because the caller
 14.1495 +	 * is required to hold the mmap_sem in read mode.  We need the
 14.1496 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 14.1497 +	 */
 14.1498 +	address += 4 + PAGE_SIZE - 1;
 14.1499 +	address &= PAGE_MASK;
 14.1500 +	error = 0;
 14.1501 +
 14.1502 +	/* Somebody else might have raced and expanded it already */
 14.1503 +	if (address > vma->vm_end) {
 14.1504 +		unsigned long size, grow;
 14.1505 +
 14.1506 +		size = address - vma->vm_start;
 14.1507 +		grow = (address - vma->vm_end) >> PAGE_SHIFT;
 14.1508 +
 14.1509 +		error = acct_stack_growth(vma, size, grow);
 14.1510 +		if (!error)
 14.1511 +			vma->vm_end = address;
 14.1512 +	}
 14.1513 +	anon_vma_unlock(vma);
 14.1514 +	return error;
 14.1515 +}
 14.1516 +
 14.1517 +struct vm_area_struct *
 14.1518 +find_extend_vma(struct mm_struct *mm, unsigned long addr)
 14.1519 +{
 14.1520 +	struct vm_area_struct *vma, *prev;
 14.1521 +
 14.1522 +	addr &= PAGE_MASK;
 14.1523 +	vma = find_vma_prev(mm, addr, &prev);
 14.1524 +	if (vma && (vma->vm_start <= addr))
 14.1525 +		return vma;
 14.1526 +	if (!prev || expand_stack(prev, addr))
 14.1527 +		return NULL;
 14.1528 +	if (prev->vm_flags & VM_LOCKED) {
 14.1529 +		make_pages_present(addr, prev->vm_end);
 14.1530 +	}
 14.1531 +	return prev;
 14.1532 +}
 14.1533 +#else
 14.1534 +/*
 14.1535 + * vma is the first one with address < vma->vm_start.  Have to extend vma.
 14.1536 + */
 14.1537 +int expand_stack(struct vm_area_struct *vma, unsigned long address)
 14.1538 +{
 14.1539 +	int error;
 14.1540 +
 14.1541 +	/*
 14.1542 +	 * We must make sure the anon_vma is allocated
 14.1543 +	 * so that the anon_vma locking is not a noop.
 14.1544 +	 */
 14.1545 +	if (unlikely(anon_vma_prepare(vma)))
 14.1546 +		return -ENOMEM;
 14.1547 +	anon_vma_lock(vma);
 14.1548 +
 14.1549 +	/*
 14.1550 +	 * vma->vm_start/vm_end cannot change under us because the caller
 14.1551 +	 * is required to hold the mmap_sem in read mode.  We need the
 14.1552 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 14.1553 +	 */
 14.1554 +	address &= PAGE_MASK;
 14.1555 +	error = 0;
 14.1556 +
 14.1557 +	/* Somebody else might have raced and expanded it already */
 14.1558 +	if (address < vma->vm_start) {
 14.1559 +		unsigned long size, grow;
 14.1560 +
 14.1561 +		size = vma->vm_end - address;
 14.1562 +		grow = (vma->vm_start - address) >> PAGE_SHIFT;
 14.1563 +
 14.1564 +		error = acct_stack_growth(vma, size, grow);
 14.1565 +		if (!error) {
 14.1566 +			vma->vm_start = address;
 14.1567 +			vma->vm_pgoff -= grow;
 14.1568 +		}
 14.1569 +	}
 14.1570 +	anon_vma_unlock(vma);
 14.1571 +	return error;
 14.1572 +}
 14.1573 +
 14.1574 +struct vm_area_struct *
 14.1575 +find_extend_vma(struct mm_struct * mm, unsigned long addr)
 14.1576 +{
 14.1577 +	struct vm_area_struct * vma;
 14.1578 +	unsigned long start;
 14.1579 +
 14.1580 +	addr &= PAGE_MASK;
 14.1581 +	vma = find_vma(mm,addr);
 14.1582 +	if (!vma)
 14.1583 +		return NULL;
 14.1584 +	if (vma->vm_start <= addr)
 14.1585 +		return vma;
 14.1586 +	if (!(vma->vm_flags & VM_GROWSDOWN))
 14.1587 +		return NULL;
 14.1588 +	start = vma->vm_start;
 14.1589 +	if (expand_stack(vma, addr))
 14.1590 +		return NULL;
 14.1591 +	if (vma->vm_flags & VM_LOCKED) {
 14.1592 +		make_pages_present(addr, start);
 14.1593 +	}
 14.1594 +	return vma;
 14.1595 +}
 14.1596 +#endif
 14.1597 +
 14.1598 +/*
 14.1599 + * Try to free as many page directory entries as we can,
 14.1600 + * without having to work very hard at actually scanning
 14.1601 + * the page tables themselves.
 14.1602 + *
 14.1603 + * Right now we try to free page tables if we have a nice
 14.1604 + * PGDIR-aligned area that got free'd up. We could be more
 14.1605 + * granular if we want to, but this is fast and simple,
 14.1606 + * and covers the bad cases.
 14.1607 + *
 14.1608 + * "prev", if it exists, points to a vma before the one
 14.1609 + * we just free'd - but there's no telling how much before.
 14.1610 + */
 14.1611 +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
 14.1612 +	unsigned long start, unsigned long end)
 14.1613 +{
 14.1614 +	unsigned long first = start & PGDIR_MASK;
 14.1615 +	unsigned long last = end + PGDIR_SIZE - 1;
 14.1616 +	struct mm_struct *mm = tlb->mm;
 14.1617 +
 14.1618 +	if (last > MM_VM_SIZE(mm) || last < end)
 14.1619 +		last = MM_VM_SIZE(mm);
 14.1620 +
 14.1621 +	if (!prev) {
 14.1622 +		prev = mm->mmap;
 14.1623 +		if (!prev)
 14.1624 +			goto no_mmaps;
 14.1625 +		if (prev->vm_end > start) {
 14.1626 +			if (last > prev->vm_start)
 14.1627 +				last = prev->vm_start;
 14.1628 +			goto no_mmaps;
 14.1629 +		}
 14.1630 +	}
 14.1631 +	for (;;) {
 14.1632 +		struct vm_area_struct *next = prev->vm_next;
 14.1633 +
 14.1634 +		if (next) {
 14.1635 +			if (next->vm_start < start) {
 14.1636 +				prev = next;
 14.1637 +				continue;
 14.1638 +			}
 14.1639 +			if (last > next->vm_start)
 14.1640 +				last = next->vm_start;
 14.1641 +		}
 14.1642 +		if (prev->vm_end > first)
 14.1643 +			first = prev->vm_end;
 14.1644 +		break;
 14.1645 +	}
 14.1646 +no_mmaps:
 14.1647 +	if (last < first)	/* for arches with discontiguous pgd indices */
 14.1648 +		return;
 14.1649 +	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
 14.1650 +		first = FIRST_USER_PGD_NR * PGDIR_SIZE;
 14.1651 +	/* No point trying to free anything if we're in the same pte page */
 14.1652 +	if ((first & PMD_MASK) < (last & PMD_MASK)) {
 14.1653 +		clear_page_range(tlb, first, last);
 14.1654 +		flush_tlb_pgtables(mm, first, last);
 14.1655 +	}
 14.1656 +}
 14.1657 +
 14.1658 +/* Normal function to fix up a mapping.
 14.1659 + * This function is the default for when an area has no specific
 14.1660 + * function.  This may be used as part of a more specific routine.
 14.1661 + *
 14.1662 + * By the time this function is called, the area struct has been
 14.1663 + * removed from the process mapping list.
 14.1664 + */
 14.1665 +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 14.1666 +{
 14.1667 +	size_t len = area->vm_end - area->vm_start;
 14.1668 +
 14.1669 +	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 14.1670 +	if (area->vm_flags & VM_LOCKED)
 14.1671 +		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 14.1672 +	vm_stat_unaccount(area);
 14.1673 +	area->vm_mm->unmap_area(area);
 14.1674 +	remove_vm_struct(area);
 14.1675 +}
 14.1676 +
 14.1677 +/*
 14.1678 + * Update the VMA and inode share lists.
 14.1679 + *
 14.1680 + * Ok - we have the memory areas we should free on the 'free' list,
 14.1681 + * so release them, and do the vma updates.
 14.1682 + */
 14.1683 +static void unmap_vma_list(struct mm_struct *mm,
 14.1684 +	struct vm_area_struct *mpnt)
 14.1685 +{
 14.1686 +	do {
 14.1687 +		struct vm_area_struct *next = mpnt->vm_next;
 14.1688 +		unmap_vma(mm, mpnt);
 14.1689 +		mpnt = next;
 14.1690 +	} while (mpnt != NULL);
 14.1691 +	validate_mm(mm);
 14.1692 +}
 14.1693 +
 14.1694 +/*
 14.1695 + * Get rid of page table information in the indicated region.
 14.1696 + *
 14.1697 + * Called with the page table lock held.
 14.1698 + */
 14.1699 +static void unmap_region(struct mm_struct *mm,
 14.1700 +	struct vm_area_struct *vma,
 14.1701 +	struct vm_area_struct *prev,
 14.1702 +	unsigned long start,
 14.1703 +	unsigned long end)
 14.1704 +{
 14.1705 +	struct mmu_gather *tlb;
 14.1706 +	unsigned long nr_accounted = 0;
 14.1707 +
 14.1708 +	lru_add_drain();
 14.1709 +	tlb = tlb_gather_mmu(mm, 0);
 14.1710 +	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 14.1711 +	vm_unacct_memory(nr_accounted);
 14.1712 +
 14.1713 +	if (is_hugepage_only_range(start, end - start))
 14.1714 +		hugetlb_free_pgtables(tlb, prev, start, end);
 14.1715 +	else
 14.1716 +		free_pgtables(tlb, prev, start, end);
 14.1717 +	tlb_finish_mmu(tlb, start, end);
 14.1718 +}
 14.1719 +
 14.1720 +/*
 14.1721 + * Create a list of vma's touched by the unmap, removing them from the mm's
 14.1722 + * vma list as we go..
 14.1723 + */
 14.1724 +static void
 14.1725 +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 14.1726 +	struct vm_area_struct *prev, unsigned long end)
 14.1727 +{
 14.1728 +	struct vm_area_struct **insertion_point;
 14.1729 +	struct vm_area_struct *tail_vma = NULL;
 14.1730 +
 14.1731 +	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 14.1732 +	do {
 14.1733 +		rb_erase(&vma->vm_rb, &mm->mm_rb);
 14.1734 +		mm->map_count--;
 14.1735 +		tail_vma = vma;
 14.1736 +		vma = vma->vm_next;
 14.1737 +	} while (vma && vma->vm_start < end);
 14.1738 +	*insertion_point = vma;
 14.1739 +	tail_vma->vm_next = NULL;
 14.1740 +	mm->mmap_cache = NULL;		/* Kill the cache. */
 14.1741 +}
 14.1742 +
 14.1743 +/*
 14.1744 + * Split a vma into two pieces at address 'addr'; a new vma is allocated
 14.1745 + * either for the first part or for the tail.
 14.1746 + */
 14.1747 +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 14.1748 +	      unsigned long addr, int new_below)
 14.1749 +{
 14.1750 +	struct mempolicy *pol;
 14.1751 +	struct vm_area_struct *new;
 14.1752 +
 14.1753 +	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
 14.1754 +		return -EINVAL;
 14.1755 +
 14.1756 +	if (mm->map_count >= sysctl_max_map_count)
 14.1757 +		return -ENOMEM;
 14.1758 +
 14.1759 +	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 14.1760 +	if (!new)
 14.1761 +		return -ENOMEM;
 14.1762 +
 14.1763 +	/* most fields are the same, copy all, and then fixup */
 14.1764 +	*new = *vma;
 14.1765 +
 14.1766 +	if (new_below)
 14.1767 +		new->vm_end = addr;
 14.1768 +	else {
 14.1769 +		new->vm_start = addr;
 14.1770 +		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 14.1771 +	}
 14.1772 +
 14.1773 +	pol = mpol_copy(vma_policy(vma));
 14.1774 +	if (IS_ERR(pol)) {
 14.1775 +		kmem_cache_free(vm_area_cachep, new);
 14.1776 +		return PTR_ERR(pol);
 14.1777 +	}
 14.1778 +	vma_set_policy(new, pol);
 14.1779 +
 14.1780 +	if (new->vm_file)
 14.1781 +		get_file(new->vm_file);
 14.1782 +
 14.1783 +	if (new->vm_ops && new->vm_ops->open)
 14.1784 +		new->vm_ops->open(new);
 14.1785 +
 14.1786 +	if (new_below)
 14.1787 +		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 14.1788 +			((addr - new->vm_start) >> PAGE_SHIFT), new);
 14.1789 +	else
 14.1790 +		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 14.1791 +
 14.1792 +	return 0;
 14.1793 +}
 14.1794 +
 14.1795 +/* Munmap is split into 2 main parts -- this part which finds
 14.1796 + * what needs doing, and the areas themselves, which do the
 14.1797 + * work.  This now handles partial unmappings.
 14.1798 + * Jeremy Fitzhardinge <jeremy@goop.org>
 14.1799 + */
 14.1800 +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 14.1801 +{
 14.1802 +	unsigned long end;
 14.1803 +	struct vm_area_struct *mpnt, *prev, *last;
 14.1804 +
 14.1805 +	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
 14.1806 +		return -EINVAL;
 14.1807 +
 14.1808 +	if ((len = PAGE_ALIGN(len)) == 0)
 14.1809 +		return -EINVAL;
 14.1810 +
 14.1811 +	/* Find the first overlapping VMA */
 14.1812 +	mpnt = find_vma_prev(mm, start, &prev);
 14.1813 +	if (!mpnt)
 14.1814 +		return 0;
 14.1815 +	/* we have  start < mpnt->vm_end  */
 14.1816 +
 14.1817 +	/* if it doesn't overlap, we have nothing.. */
 14.1818 +	end = start + len;
 14.1819 +	if (mpnt->vm_start >= end)
 14.1820 +		return 0;
 14.1821 +
 14.1822 +	/*
 14.1823 +	 * If we need to split any vma, do it now to save pain later.
 14.1824 +	 *
 14.1825 +	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 14.1826 +	 * unmapped vm_area_struct will remain in use: so lower split_vma
 14.1827 +	 * places tmp vma above, and higher split_vma places tmp vma below.
 14.1828 +	 */
 14.1829 +	if (start > mpnt->vm_start) {
 14.1830 +		int error = split_vma(mm, mpnt, start, 0);
 14.1831 +		if (error)
 14.1832 +			return error;
 14.1833 +		prev = mpnt;
 14.1834 +	}
 14.1835 +
 14.1836 +	/* Does it split the last one? */
 14.1837 +	last = find_vma(mm, end);
 14.1838 +	if (last && end > last->vm_start) {
 14.1839 +		int error = split_vma(mm, last, end, 1);
 14.1840 +		if (error)
 14.1841 +			return error;
 14.1842 +	}
 14.1843 +	mpnt = prev? prev->vm_next: mm->mmap;
 14.1844 +
 14.1845 +	/*
 14.1846 +	 * Remove the vma's, and unmap the actual pages
 14.1847 +	 */
 14.1848 +	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
 14.1849 +	spin_lock(&mm->page_table_lock);
 14.1850 +	unmap_region(mm, mpnt, prev, start, end);
 14.1851 +	spin_unlock(&mm->page_table_lock);
 14.1852 +
 14.1853 +	/* Fix up all other VM information */
 14.1854 +	unmap_vma_list(mm, mpnt);
 14.1855 +
 14.1856 +	return 0;
 14.1857 +}
 14.1858 +
 14.1859 +EXPORT_SYMBOL(do_munmap);
 14.1860 +
 14.1861 +asmlinkage long sys_munmap(unsigned long addr, size_t len)
 14.1862 +{
 14.1863 +	int ret;
 14.1864 +	struct mm_struct *mm = current->mm;
 14.1865 +
 14.1866 +	profile_munmap(addr);
 14.1867 +
 14.1868 +	down_write(&mm->mmap_sem);
 14.1869 +	ret = do_munmap(mm, addr, len);
 14.1870 +	up_write(&mm->mmap_sem);
 14.1871 +	return ret;
 14.1872 +}
 14.1873 +
 14.1874 +static inline void verify_mm_writelocked(struct mm_struct *mm)
 14.1875 +{
 14.1876 +#ifdef CONFIG_DEBUG_KERNEL
 14.1877 +	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
 14.1878 +		WARN_ON(1);
 14.1879 +		up_read(&mm->mmap_sem);
 14.1880 +	}
 14.1881 +#endif
 14.1882 +}
 14.1883 +
 14.1884 +/*
 14.1885 + *  This is really a simplified "do_mmap".  It only handles
 14.1886 + *  anonymous maps.  Eventually we may be able to do some
 14.1887 + *  brk-specific accounting here.
 14.1888 + */
 14.1889 +unsigned long do_brk(unsigned long addr, unsigned long len)
 14.1890 +{
 14.1891 +	struct mm_struct * mm = current->mm;
 14.1892 +	struct vm_area_struct * vma, * prev;
 14.1893 +	unsigned long flags;
 14.1894 +	struct rb_node ** rb_link, * rb_parent;
 14.1895 +	pgoff_t pgoff = addr >> PAGE_SHIFT;
 14.1896 +
 14.1897 +	len = PAGE_ALIGN(len);
 14.1898 +	if (!len)
 14.1899 +		return addr;
 14.1900 +
 14.1901 +	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 14.1902 +		return -EINVAL;
 14.1903 +
 14.1904 +	/*
 14.1905 +	 * mlock MCL_FUTURE?
 14.1906 +	 */
 14.1907 +	if (mm->def_flags & VM_LOCKED) {
 14.1908 +		unsigned long locked, lock_limit;
 14.1909 +		locked = mm->locked_vm << PAGE_SHIFT;
 14.1910 +		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 14.1911 +		locked += len;
 14.1912 +		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 14.1913 +			return -EAGAIN;
 14.1914 +	}
 14.1915 +
 14.1916 +	/*
 14.1917 +	 * mm->mmap_sem is required to protect against another thread
 14.1918 +	 * changing the mappings in case we sleep.
 14.1919 +	 */
 14.1920 +	verify_mm_writelocked(mm);
 14.1921 +
 14.1922 +	/*
 14.1923 +	 * Clear old maps.  This also does some error checking for us
 14.1924 +	 */
 14.1925 + munmap_back:
 14.1926 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 14.1927 +	if (vma && vma->vm_start < addr + len) {
 14.1928 +		if (do_munmap(mm, addr, len))
 14.1929 +			return -ENOMEM;
 14.1930 +		goto munmap_back;
 14.1931 +	}
 14.1932 +
 14.1933 +	/* Check against address space limits *after* clearing old maps... */
 14.1934 +	if ((mm->total_vm << PAGE_SHIFT) + len
 14.1935 +	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
 14.1936 +		return -ENOMEM;
 14.1937 +
 14.1938 +	if (mm->map_count > sysctl_max_map_count)
 14.1939 +		return -ENOMEM;
 14.1940 +
 14.1941 +	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 14.1942 +		return -ENOMEM;
 14.1943 +
 14.1944 +	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 14.1945 +
 14.1946 +	/* Can we just expand an old private anonymous mapping? */
 14.1947 +	if (vma_merge(mm, prev, addr, addr + len, flags,
 14.1948 +					NULL, NULL, pgoff, NULL))
 14.1949 +		goto out;
 14.1950 +
 14.1951 +	/*
 14.1952 +	 * create a vma struct for an anonymous mapping
 14.1953 +	 */
 14.1954 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 14.1955 +	if (!vma) {
 14.1956 +		vm_unacct_memory(len >> PAGE_SHIFT);
 14.1957 +		return -ENOMEM;
 14.1958 +	}
 14.1959 +	memset(vma, 0, sizeof(*vma));
 14.1960 +
 14.1961 +	vma->vm_mm = mm;
 14.1962 +	vma->vm_start = addr;
 14.1963 +	vma->vm_end = addr + len;
 14.1964 +	vma->vm_pgoff = pgoff;
 14.1965 +	vma->vm_flags = flags;
 14.1966 +	vma->vm_page_prot = protection_map[flags & 0x0f];
 14.1967 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 14.1968 +out:
 14.1969 +	mm->total_vm += len >> PAGE_SHIFT;
 14.1970 +	if (flags & VM_LOCKED) {
 14.1971 +		mm->locked_vm += len >> PAGE_SHIFT;
 14.1972 +		make_pages_present(addr, addr + len);
 14.1973 +	}
 14.1974 +	acct_update_integrals();
 14.1975 +	update_mem_hiwater();
 14.1976 +	return addr;
 14.1977 +}
 14.1978 +
 14.1979 +EXPORT_SYMBOL(do_brk);
 14.1980 +
 14.1981 +/* Release all mmaps. */
 14.1982 +void exit_mmap(struct mm_struct *mm)
 14.1983 +{
 14.1984 +	struct mmu_gather *tlb;
 14.1985 +	struct vm_area_struct *vma;
 14.1986 +	unsigned long nr_accounted = 0;
 14.1987 +
 14.1988 +#ifdef arch_exit_mmap
 14.1989 +	arch_exit_mmap(mm);
 14.1990 +#endif
 14.1991 +
 14.1992 +	lru_add_drain();
 14.1993 +
 14.1994 +	spin_lock(&mm->page_table_lock);
 14.1995 +
 14.1996 +	tlb = tlb_gather_mmu(mm, 1);
 14.1997 +	flush_cache_mm(mm);
 14.1998 +	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 14.1999 +	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
 14.2000 +					~0UL, &nr_accounted, NULL);
 14.2001 +	vm_unacct_memory(nr_accounted);
 14.2002 +	BUG_ON(mm->map_count);	/* This is just debugging */
 14.2003 +	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
 14.2004 +	
 14.2005 +	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 14.2006 +
 14.2007 +	vma = mm->mmap;
 14.2008 +	mm->mmap = mm->mmap_cache = NULL;
 14.2009 +	mm->mm_rb = RB_ROOT;
 14.2010 +	mm->rss = 0;
 14.2011 +	mm->total_vm = 0;
 14.2012 +	mm->locked_vm = 0;
 14.2013 +
 14.2014 +	spin_unlock(&mm->page_table_lock);
 14.2015 +
 14.2016 +	/*
 14.2017 +	 * Walk the list again, actually closing and freeing it
 14.2018 +	 * without holding any MM locks.
 14.2019 +	 */
 14.2020 +	while (vma) {
 14.2021 +		struct vm_area_struct *next = vma->vm_next;
 14.2022 +		remove_vm_struct(vma);
 14.2023 +		vma = next;
 14.2024 +	}
 14.2025 +}
 14.2026 +
 14.2027 +/* Insert vm structure into process list sorted by address
 14.2028 + * and into the inode's i_mmap tree.  If vm_file is non-NULL
 14.2029 + * then i_mmap_lock is taken here.
 14.2030 + */
 14.2031 +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 14.2032 +{
 14.2033 +	struct vm_area_struct * __vma, * prev;
 14.2034 +	struct rb_node ** rb_link, * rb_parent;
 14.2035 +
 14.2036 +	/*
 14.2037 +	 * The vm_pgoff of a purely anonymous vma should be irrelevant
 14.2038 +	 * until its first write fault, when page's anon_vma and index
 14.2039 +	 * are set.  But now set the vm_pgoff it will almost certainly
 14.2040 +	 * end up with (unless mremap moves it elsewhere before that
 14.2041 +	 * first write fault), so /proc/pid/maps tells a consistent story.
 14.2042 +	 *
 14.2043 +	 * By setting it to reflect the virtual start address of the
 14.2044 +	 * vma, merges and splits can happen in a seamless way, just
 14.2045 +	 * using the existing file pgoff checks and manipulations.
 14.2046 +	 * Similarly in do_mmap_pgoff and in do_brk.
 14.2047 +	 */
 14.2048 +	if (!vma->vm_file) {
 14.2049 +		BUG_ON(vma->anon_vma);
 14.2050 +		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 14.2051 +	}
 14.2052 +	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
 14.2053 +	if (__vma && __vma->vm_start < vma->vm_end)
 14.2054 +		return -ENOMEM;
 14.2055 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 14.2056 +	return 0;
 14.2057 +}
 14.2058 +
 14.2059 +/*
 14.2060 + * Copy the vma structure to a new location in the same mm,
 14.2061 + * prior to moving page table entries, to effect an mremap move.
 14.2062 + */
 14.2063 +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 14.2064 +	unsigned long addr, unsigned long len, pgoff_t pgoff)
 14.2065 +{
 14.2066 +	struct vm_area_struct *vma = *vmap;
 14.2067 +	unsigned long vma_start = vma->vm_start;
 14.2068 +	struct mm_struct *mm = vma->vm_mm;
 14.2069 +	struct vm_area_struct *new_vma, *prev;
 14.2070 +	struct rb_node **rb_link, *rb_parent;
 14.2071 +	struct mempolicy *pol;
 14.2072 +
 14.2073 +	/*
 14.2074 +	 * If anonymous vma has not yet been faulted, update new pgoff
 14.2075 +	 * to match new location, to increase its chance of merging.
 14.2076 +	 */
 14.2077 +	if (!vma->vm_file && !vma->anon_vma)
 14.2078 +		pgoff = addr >> PAGE_SHIFT;
 14.2079 +
 14.2080 +	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 14.2081 +	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 14.2082 +			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 14.2083 +	if (new_vma) {
 14.2084 +		/*
 14.2085 +		 * Source vma may have been merged into new_vma
 14.2086 +		 */
 14.2087 +		if (vma_start >= new_vma->vm_start &&
 14.2088 +		    vma_start < new_vma->vm_end)
 14.2089 +			*vmap = new_vma;
 14.2090 +	} else {
 14.2091 +		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 14.2092 +		if (new_vma) {
 14.2093 +			*new_vma = *vma;
 14.2094 +			pol = mpol_copy(vma_policy(vma));
 14.2095 +			if (IS_ERR(pol)) {
 14.2096 +				kmem_cache_free(vm_area_cachep, new_vma);
 14.2097 +				return NULL;
 14.2098 +			}
 14.2099 +			vma_set_policy(new_vma, pol);
 14.2100 +			new_vma->vm_start = addr;
 14.2101 +			new_vma->vm_end = addr + len;
 14.2102 +			new_vma->vm_pgoff = pgoff;
 14.2103 +			if (new_vma->vm_file)
 14.2104 +				get_file(new_vma->vm_file);
 14.2105 +			if (new_vma->vm_ops && new_vma->vm_ops->open)
 14.2106 +				new_vma->vm_ops->open(new_vma);
 14.2107 +			vma_link(mm, new_vma, prev, rb_link, rb_parent);
 14.2108 +		}
 14.2109 +	}
 14.2110 +	return new_vma;
 14.2111 +}
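
The rb-tree walks in find_vma() and find_vma_prev() above share one invariant: the vma returned is the first one whose vm_end lies strictly above the lookup address, so an address inside a hole maps to the next vma up. A minimal userspace model of that invariant over a sorted array -- the struct, array and function names below are illustrative stand-ins, not part of this changeset:

    #include <stddef.h>
    #include <stdio.h>

    struct range { unsigned long start, end; };      /* stand-in for a vma */

    /* First range with end > addr, or NULL: mirrors the find_vma() contract. */
    static struct range *model_find_vma(struct range *r, size_t n, unsigned long addr)
    {
        size_t i;
        for (i = 0; i < n; i++)
            if (r[i].end > addr)
                return &r[i];
        return NULL;
    }

    int main(void)
    {
        struct range map[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 } };
        struct range *hit = model_find_vma(map, 2, 0x4000);

        /* 0x4000 falls in the hole, so the lookup returns the next range above it. */
        printf("hit: %lx-%lx\n", hit->start, hit->end);
        return 0;
    }
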
    16.1 --- a/tools/libxc/xc_linux_restore.c	Wed Apr 27 16:55:30 2005 +0000
    16.2 +++ b/tools/libxc/xc_linux_restore.c	Wed Apr 27 16:55:50 2005 +0000
    16.3 @@ -170,13 +170,13 @@ int xc_linux_restore(int xc_handle, XcIO
    16.4      if ( xc_domain_create(xc_handle, nr_pfns * (PAGE_SIZE / 1024),
    16.5                            -1, 1, &dom) )
    16.6      {
    16.7 -	xcio_error(ioctxt, "Could not create domain. pfns=%d, %dKB",
    16.8 -		   nr_pfns,nr_pfns * (PAGE_SIZE / 1024));
    16.9 +	xcio_error(ioctxt, "Could not create domain. pfns=%ld, %ldKB",
   16.10 +		   nr_pfns, nr_pfns * (PAGE_SIZE / 1024));
   16.11          goto out;
   16.12      }
   16.13      
   16.14      ioctxt->domain = dom;
   16.15 -    xcio_info(ioctxt, "Created domain %ld\n",dom);
   16.16 +    xcio_info(ioctxt, "Created domain %u\n", dom);
   16.17  
   16.18      /* Get the domain's shared-info frame. */
   16.19      op.cmd = DOM0_GETDOMAININFO;
   16.20 @@ -200,7 +200,8 @@ int xc_linux_restore(int xc_handle, XcIO
   16.21      }
   16.22  
   16.23      /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
   16.24 -    if ( xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) != nr_pfns )
   16.25 +    if ( xc_get_pfn_list(xc_handle, dom, 
   16.26 +                         pfn_to_mfn_table, nr_pfns) != nr_pfns )
   16.27      {
   16.28          xcio_error(ioctxt, "Did not read correct number of frame "
   16.29                     "numbers for new dom");
   16.30 @@ -657,7 +658,7 @@ int xc_linux_restore(int xc_handle, XcIO
   16.31      if ( rc == 0 )
   16.32      {
   16.33          /* Success: print the domain id. */
   16.34 -        xcio_info(ioctxt, "DOM=%lu\n", dom);
   16.35 +        xcio_info(ioctxt, "DOM=%u\n", dom);
   16.36          return 0;
   16.37      }
   16.38  
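
The xcio_error/xcio_info changes in this file (and the similar ones in xc_linux_save.c below) line the printf length modifiers up with the argument types -- judging by the new specifiers, nr_pfns is a long and the domain id a 32-bit unsigned value. A standalone sketch of why the modifier has to match; the variable types and values here are illustrative only, not taken from the tools:

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_pfns = 65536;   /* illustrative page count */
        unsigned int  dom     = 7;       /* illustrative domain id  */

        /* Passing an unsigned long where %d expects an int (or vice versa) is
         * undefined behaviour, can print garbage on 64-bit builds, and draws a
         * -Wformat warning from gcc; matching modifiers keeps the output sane. */
        printf("Could not create domain. pfns=%ld, %ldKB\n",
               nr_pfns, nr_pfns * (4096 / 1024));
        printf("Created domain %u\n", dom);
        return 0;
    }
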
    17.1 --- a/tools/libxc/xc_linux_save.c	Wed Apr 27 16:55:30 2005 +0000
    17.2 +++ b/tools/libxc/xc_linux_save.c	Wed Apr 27 16:55:50 2005 +0000
    17.3 @@ -167,7 +167,8 @@ static int burst_time_us = -1;
    17.4  #define RATE_TO_BTU 781250
    17.5  #define BURST_TIME_US burst_time_us
    17.6  
    17.7 -static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){
    17.8 +static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n)
    17.9 +{
   17.10      static int budget = 0;
   17.11      static struct timeval last_put = { 0 };
   17.12      struct timeval now;
   17.13 @@ -230,8 +231,8 @@ static int print_stats( int xc_handle, u
   17.14  
   17.15      gettimeofday(&wall_now, NULL);
   17.16  
   17.17 -    d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0, /* FIXME */ 0 )/1000;
   17.18 -    d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid, /* FIXME */ 0 )/1000;
   17.19 +    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   17.20 +    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   17.21  
   17.22      if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 
   17.23          printf("ARRHHH!!\n");
   17.24 @@ -273,10 +274,13 @@ static int print_stats( int xc_handle, u
   17.25   * @param ioctxt i/o context
   17.26   * @return 0 on success, non-zero on error.
   17.27   */
   17.28 -static int write_vmconfig(XcIOContext *ioctxt){
   17.29 +static int write_vmconfig(XcIOContext *ioctxt)
   17.30 +{
   17.31      int err = -1;
   17.32 -    if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit;
   17.33 -    if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit;
   17.34 +    if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) 
   17.35 +        goto exit;
   17.36 +    if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) 
   17.37 +        goto exit;
   17.38      err = 0;
   17.39    exit:
   17.40      return err;
   17.41 @@ -329,7 +333,8 @@ int suspend_and_state(int xc_handle, XcI
   17.42  
   17.43  retry:
   17.44  
   17.45 -    if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, info, ctxt) )
   17.46 +    if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, 
   17.47 +                               info, ctxt) )
   17.48      {
   17.49  	xcio_error(ioctxt, "Could not get full domain info");
   17.50  	return -1;
   17.51 @@ -347,7 +352,7 @@ retry:
   17.52  	// try unpausing domain, wait, and retest	
   17.53  	xc_domain_unpause( xc_handle, ioctxt->domain );
   17.54  
   17.55 -	xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)",
   17.56 +	xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%u)",
   17.57  		   info->flags);
   17.58  	usleep(10000);  // 10ms
   17.59  
   17.60 @@ -357,14 +362,12 @@ retry:
   17.61  
   17.62      if( ++i < 100 )
   17.63      {
   17.64 -	xcio_error(ioctxt, "Retry suspend domain (%lx)",
   17.65 -		   info->flags);
   17.66 +	xcio_error(ioctxt, "Retry suspend domain (%u)", info->flags);
   17.67  	usleep(10000);  // 10ms	
   17.68  	goto retry;
   17.69      }
   17.70  
   17.71 -    xcio_error(ioctxt, "Unable to suspend domain. (%lx)",
   17.72 -	       info->flags);
   17.73 +    xcio_error(ioctxt, "Unable to suspend domain. (%u)", info->flags);
   17.74  
   17.75      return -1;
   17.76  }
   17.77 @@ -442,7 +445,8 @@ int xc_linux_save(int xc_handle, XcIOCon
   17.78          return 1;
   17.79      }
   17.80  
   17.81 -    if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, &info, &ctxt) )
   17.82 +    if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, 
   17.83 +                                &info, &ctxt) )
   17.84      {
   17.85          xcio_error(ioctxt, "Could not get full domain info");
   17.86          goto out;
   17.87 @@ -459,7 +463,9 @@ int xc_linux_save(int xc_handle, XcIOCon
   17.88  
   17.89      /* cheesy sanity check */
   17.90      if ( nr_pfns > 1024*1024 ){
   17.91 -        xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns);
   17.92 +        xcio_error(ioctxt, 
   17.93 +                   "Invalid state record -- pfn count out of range: %lu", 
   17.94 +                   nr_pfns);
   17.95          goto out;
   17.96      }
   17.97  
   17.98 @@ -513,7 +519,8 @@ int xc_linux_save(int xc_handle, XcIOCon
   17.99  
  17.100      for ( i = 0; i < nr_pfns; i += 1024 ){
  17.101          if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
  17.102 -            xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys");
  17.103 +            xcio_error(ioctxt, 
  17.104 +                       "Frame# in pfn-to-mfn frame list is not in pseudophys");
  17.105              goto out;
  17.106          }
  17.107      }
  17.108 @@ -539,7 +546,7 @@ int xc_linux_save(int xc_handle, XcIOCon
  17.109  
  17.110  	if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
  17.111  	{
  17.112 -	    xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
  17.113 +	    xcio_error(ioctxt, "Domain appears not to have suspended: %u",
  17.114  		       info.flags);
  17.115  	    goto out;
  17.116  	}
  17.117 @@ -836,7 +843,8 @@ int xc_linux_save(int xc_handle, XcIOCon
  17.118                      } /* end of page table rewrite for loop */
  17.119        
  17.120                      if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){
  17.121 -                        xcio_error(ioctxt, "Error when writing to state file (4)");
  17.122 +                        xcio_error(ioctxt, 
  17.123 +                                   "Error when writing to state file (4)");
  17.124                          goto out;
  17.125                      }
  17.126        
  17.127 @@ -844,7 +852,8 @@ int xc_linux_save(int xc_handle, XcIOCon
  17.128  
  17.129                      if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j), 
  17.130                                       PAGE_SIZE) ){
  17.131 -                        xcio_error(ioctxt, "Error when writing to state file (5)");
  17.132 +                        xcio_error(ioctxt, 
  17.133 +                                   "Error when writing to state file (5)");
  17.134                          goto out;
  17.135                      }
  17.136                  }
  17.137 @@ -903,14 +912,15 @@ int xc_linux_save(int xc_handle, XcIOCon
  17.138  
  17.139  		if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
  17.140  		{
  17.141 -		    xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
  17.142 +		    xcio_error(ioctxt, 
  17.143 +                               "Domain appears not to have suspended: %u",
  17.144  			       info.flags);
  17.145  		    goto out;
  17.146  		}
  17.147  
  17.148  		xcio_info(ioctxt,
  17.149 -                          "SUSPEND flags %08lx shinfo %08lx eip %08lx "
  17.150 -                          "esi %08lx\n",info.flags,
  17.151 +                          "SUSPEND flags %08u shinfo %08lx eip %08u "
  17.152 +                          "esi %08u\n",info.flags,
  17.153                            info.shared_info_frame,
  17.154                            ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi );
  17.155              } 
  17.156 @@ -972,7 +982,8 @@ int xc_linux_save(int xc_handle, XcIOCon
  17.157  	    {
  17.158  		if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) )
  17.159  		{
  17.160 -		    xcio_error(ioctxt, "Error when writing to state file (6b)");
  17.161 +		    xcio_error(ioctxt, 
  17.162 +                               "Error when writing to state file (6b)");
  17.163  		    goto out;
  17.164  		}	
  17.165  		j = 0;
  17.166 @@ -1027,14 +1038,24 @@ int xc_linux_save(int xc_handle, XcIOCon
  17.167  
  17.168   out:
  17.169  
  17.170 -    if ( live_shinfo )          munmap(live_shinfo, PAGE_SIZE);
  17.171 -    if ( p_srec )               munmap(p_srec, sizeof(*p_srec));
  17.172 -    if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
  17.173 -    if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 );
  17.174 -    if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 );
  17.175 +    if(live_shinfo)
  17.176 +        munmap(live_shinfo, PAGE_SIZE);
  17.177 +
  17.178 +    if(p_srec) 
  17.179 +        munmap(p_srec, sizeof(*p_srec));
  17.180 +
  17.181 +    if(live_pfn_to_mfn_frame_list) 
  17.182 +        munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
  17.183  
  17.184 -    if ( pfn_type != NULL ) free(pfn_type);
  17.185 +    if(live_pfn_to_mfn_table) 
  17.186 +        munmap(live_pfn_to_mfn_table, nr_pfns*4);
  17.187 +
  17.188 +    if(live_mfn_to_pfn_table) 
  17.189 +        munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
  17.190 +
  17.191 +    if (pfn_type != NULL) 
  17.192 +        free(pfn_type);
  17.193 +
  17.194      DPRINTF("Save exit rc=%d\n",rc);
  17.195      return !!rc;
  17.196 -
  17.197  }
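
The reshaped exit path above is the usual single-label cleanup idiom: every failure jumps to out:, and out: releases only the resources that were actually acquired, testing each pointer before unmapping or freeing it. A minimal sketch of that pattern, assuming nothing about the tool's real state -- the function and buffer names are made up for illustration:

    #include <stdlib.h>

    /* Acquire two resources; on any failure fall through to the single
     * cleanup label, which releases only what was successfully obtained. */
    static int do_work(void)
    {
        int rc = -1;
        char *a = NULL, *b = NULL;

        a = malloc(64);
        if (a == NULL)
            goto out;

        b = malloc(64);
        if (b == NULL)
            goto out;

        rc = 0;                 /* success */
    out:
        if (b)                  /* release in reverse order; skip anything */
            free(b);            /* that was never acquired                 */
        if (a)
            free(a);
        return rc;
    }

    int main(void)
    {
        return do_work() ? 1 : 0;
    }
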
    18.1 --- a/xen/arch/x86/mm.c	Wed Apr 27 16:55:30 2005 +0000
    18.2 +++ b/xen/arch/x86/mm.c	Wed Apr 27 16:55:50 2005 +0000
    18.3 @@ -482,7 +482,7 @@ get_page_from_l2e(
    18.4  {
    18.5      int rc;
    18.6  
    18.7 -    ASSERT( !shadow_mode_enabled(d) );
    18.8 +    ASSERT(!shadow_mode_enabled(d));
    18.9  
   18.10      if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
   18.11          return 1;
   18.12 @@ -641,7 +641,7 @@ static int alloc_l1_table(struct pfn_inf
   18.13      l1_pgentry_t  *pl1e;
   18.14      int            i;
   18.15  
   18.16 -    ASSERT( !shadow_mode_enabled(d) );
   18.17 +    ASSERT(!shadow_mode_enabled(d));
   18.18  
   18.19      pl1e = map_domain_mem(pfn << PAGE_SHIFT);
   18.20  
   18.21 @@ -2670,22 +2670,6 @@ static int ptwr_emulated_update(
   18.22      }
   18.23      unmap_domain_mem(pl1e);
   18.24  
   18.25 -    /* Propagate update to shadow cache. */
   18.26 -    if ( unlikely(shadow_mode_enabled(d)) )
   18.27 -    {
   18.28 -        BUG(); // XXX fix me...
   18.29 -#if 0
   18.30 -        sstat = get_shadow_status(d, page_to_pfn(page));
   18.31 -        if ( sstat & PSH_shadowed )
   18.32 -        {
   18.33 -            sl1e = map_domain_mem(
   18.34 -                ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK));
   18.35 -            l1pte_propagate_from_guest(d, &nl1e, sl1e);
   18.36 -            unmap_domain_mem(sl1e);
   18.37 -        }
   18.38 -#endif
   18.39 -    }
   18.40 -
   18.41      /* Finally, drop the old PTE. */
   18.42      put_page_from_l1e(ol1e, d);
   18.43  
   18.44 @@ -2748,6 +2732,7 @@ int ptwr_do_page_fault(struct domain *d,
   18.45      /* We are looking only for read-only mappings of p.t. pages. */
   18.46      if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
   18.47           ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
   18.48 +         ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
   18.49           (page_get_owner(page) != d) )
   18.50      {
   18.51          return 0;