ia64/xen-unstable

changeset 4673:98d5be103415

bitkeeper revision 1.1388 (426fc416kd_SxU1l3YCeVWTczbT41A)

Merge arcadians.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into arcadians.cl.cam.ac.uk:/local/scratch-2/vh249/xen-unstable.bk
author vh249@arcadians.cl.cam.ac.uk
date Wed Apr 27 16:55:50 2005 +0000 (2005-04-27)
parents 5b4ab00d85d1 6c0dd2c2ca58
children 3000c660f103
files .rootkeys BitKeeper/etc/ignore linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h linux-2.6.11-xen-sparse/mm/mmap.c tools/libxc/xc_domain.c tools/libxc/xc_linux_restore.c tools/libxc/xc_linux_save.c xen/arch/x86/mm.c xen/common/dom0_ops.c xen/common/dom_mem_ops.c
line diff
     1.1 --- a/.rootkeys	Wed Apr 27 16:55:30 2005 +0000
     1.2 +++ b/.rootkeys	Wed Apr 27 16:55:50 2005 +0000
     1.3 @@ -351,6 +351,7 @@ 40f5623aKXkBBxgpLx2NcvkncQ1Yyw linux-2.6
     1.4  40f5623aDMCsWOFO0jktZ4e8sjwvEg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h
     1.5  40f5623arsFXkGdPvIqvFi3yFXGR0Q linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h
     1.6  41811f07Iri9hrvs97t-baxmhOwWDQ linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h
     1.7 +426fa4d7RzvcFMqff_M76HrvRQZHSg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h
     1.8  4120f807GCO0uqsLqdZj9csxR1Wthw linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
     1.9  40f5623adgjZq9nAgCt0IXdWl7udSA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
    1.10  40f5623a54NuG-7qHihGYmw4wWQnMA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/param.h
    1.11 @@ -418,6 +419,7 @@ 419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6
    1.12  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.11-xen-sparse/mkbuildtree
    1.13  42305f54Q6xJ1bXcQJlCQq1m-e2C8g linux-2.6.11-xen-sparse/mm/highmem.c
    1.14  412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.11-xen-sparse/mm/memory.c
    1.15 +426fa4d7ooLYmFcFjJMF_ut4GFVh2Q linux-2.6.11-xen-sparse/mm/mmap.c
    1.16  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.11-xen-sparse/mm/page_alloc.c
    1.17  413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
    1.18  413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
     2.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c	Wed Apr 27 16:55:30 2005 +0000
     2.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c	Wed Apr 27 16:55:50 2005 +0000
     2.3 @@ -100,8 +100,8 @@ int init_new_context(struct task_struct 
     2.4  	struct mm_struct * old_mm;
     2.5  	int retval = 0;
     2.6  
     2.7 +	memset(&mm->context, 0, sizeof(mm->context));
     2.8  	init_MUTEX(&mm->context.sem);
     2.9 -	mm->context.size = 0;
    2.10  	old_mm = current->mm;
    2.11  	if (old_mm && old_mm->context.size > 0) {
    2.12  		down(&old_mm->context.sem);
     3.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Wed Apr 27 16:55:30 2005 +0000
     3.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Wed Apr 27 16:55:50 2005 +0000
     3.3 @@ -211,7 +211,8 @@ unsigned long allocate_empty_lowmem_regi
     3.4          pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
     3.5          pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
     3.6          HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
     3.7 -        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = INVALID_P2M_ENTRY;
     3.8 +        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
     3.9 +            INVALID_P2M_ENTRY;
    3.10      }
    3.11  
    3.12      flush_tlb_all();
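
The hunk above fixes an indexing bug in allocate_empty_lowmem_region(): the old code wrote phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] on every pass of the loop, clobbering the same slot repeatedly, whereas the fixed code adds the loop counter so each page of the region gets its own INVALID_P2M_ENTRY. (The x86_64 variant of the same function receives the identical fix further down.) The fragment below is only a user-space analogy of that indexing pattern, with invented names (NPAGES, INVALID, p2m[]); it is not the kernel code.

#include <stdio.h>

#define NPAGES  4
#define INVALID 0xffffffffu              /* stands in for INVALID_P2M_ENTRY */

int main(void)
{
    unsigned int p2m[NPAGES] = { 10, 11, 12, 13 };   /* toy p2m table */
    unsigned int base = 0;               /* plays the role of __pa(vstart)>>PAGE_SHIFT */

    /* Old code: p2m[base] = INVALID; inside the loop -- only one entry changed.   */
    /* Fixed code: index with base + i so every page of the region is invalidated. */
    for (unsigned int i = 0; i < NPAGES; i++)
        p2m[base + i] = INVALID;

    for (unsigned int i = 0; i < NPAGES; i++)
        printf("p2m[%u] = %#x\n", i, p2m[i]);
    return 0;
}
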
     4.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c	Wed Apr 27 16:55:30 2005 +0000
     4.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c	Wed Apr 27 16:55:50 2005 +0000
     4.3 @@ -710,18 +710,9 @@ void __init mem_init(void)
     4.4  
     4.5  kmem_cache_t *pgd_cache;
     4.6  kmem_cache_t *pmd_cache;
     4.7 -kmem_cache_t *pte_cache;
     4.8  
     4.9  void __init pgtable_cache_init(void)
    4.10  {
    4.11 -	pte_cache = kmem_cache_create("pte",
    4.12 -				PTRS_PER_PTE*sizeof(pte_t),
    4.13 -				PTRS_PER_PTE*sizeof(pte_t),
    4.14 -				0,
    4.15 -				pte_ctor,
    4.16 -				pte_dtor);
    4.17 -	if (!pte_cache)
    4.18 -		panic("pgtable_cache_init(): Cannot create pte cache");
    4.19  	if (PTRS_PER_PMD > 1) {
    4.20  		pmd_cache = kmem_cache_create("pmd",
    4.21  					PTRS_PER_PMD*sizeof(pmd_t),
     5.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Wed Apr 27 16:55:30 2005 +0000
     5.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Wed Apr 27 16:55:50 2005 +0000
     5.3 @@ -198,59 +198,35 @@ pte_t *pte_alloc_one_kernel(struct mm_st
     5.4  	return pte;
     5.5  }
     5.6  
     5.7 -void pte_ctor(void *pte, kmem_cache_t *cache, unsigned long unused)
     5.8 -{
     5.9 -	struct page *page = virt_to_page(pte);
    5.10 -	SetPageForeign(page, pte_free);
    5.11 -	set_page_count(page, 1);
    5.12 -
    5.13 -	clear_page(pte);
    5.14 -	make_page_readonly(pte);
    5.15 -	xen_pte_pin(__pa(pte));
    5.16 -}
    5.17 -
    5.18 -void pte_dtor(void *pte, kmem_cache_t *cache, unsigned long unused)
    5.19 -{
    5.20 -	struct page *page = virt_to_page(pte);
    5.21 -	ClearPageForeign(page);
    5.22 -
    5.23 -	xen_pte_unpin(__pa(pte));
    5.24 -	make_page_writable(pte);
    5.25 -}
    5.26 -
    5.27  struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
    5.28  {
    5.29 -	pte_t *ptep;
    5.30 -
    5.31 -#ifdef CONFIG_HIGHPTE
    5.32  	struct page *pte;
    5.33  
    5.34 +#ifdef CONFIG_HIGHPTE
    5.35  	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
    5.36 -	if (pte == NULL)
    5.37 -		return pte;
    5.38 -	if (PageHighMem(pte))
    5.39 -		return pte;
    5.40 -	/* not a highmem page -- free page and grab one from the cache */
    5.41 -	__free_page(pte);
    5.42 +#else
    5.43 +	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
    5.44 +	if (pte) {
    5.45 +		SetPageForeign(pte, pte_free);
    5.46 +		set_page_count(pte, 1);
    5.47 +	}
    5.48  #endif
    5.49 -	ptep = kmem_cache_alloc(pte_cache, GFP_KERNEL);
    5.50 -	if (ptep)
    5.51 -		return virt_to_page(ptep);
    5.52 -	return NULL;
    5.53 +
    5.54 +	return pte;
    5.55  }
    5.56  
    5.57  void pte_free(struct page *pte)
    5.58  {
    5.59 +	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
    5.60 +
    5.61 +	if (!pte_write(*virt_to_ptep(va)))
    5.62 +		HYPERVISOR_update_va_mapping(
    5.63 +			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
    5.64 +
    5.65 +	ClearPageForeign(pte);
    5.66  	set_page_count(pte, 1);
    5.67 -#ifdef CONFIG_HIGHPTE
    5.68 -	if (!PageHighMem(pte))
    5.69 -#endif
    5.70 -		kmem_cache_free(pte_cache,
    5.71 -				phys_to_virt(page_to_pseudophys(pte)));
    5.72 -#ifdef CONFIG_HIGHPTE
    5.73 -	else
    5.74 -		__free_page(pte);
    5.75 -#endif
    5.76 +
    5.77 +	__free_page(pte);
    5.78  }
    5.79  
    5.80  void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
    5.81 @@ -305,14 +281,11 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
    5.82  			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    5.83  
    5.84  	if (PTRS_PER_PMD > 1)
    5.85 -		goto out;
    5.86 +		return;
    5.87  
    5.88  	pgd_list_add(pgd);
    5.89  	spin_unlock_irqrestore(&pgd_lock, flags);
    5.90  	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
    5.91 - out:
    5.92 -	make_page_readonly(pgd);
    5.93 -	xen_pgd_pin(__pa(pgd));
    5.94  }
    5.95  
    5.96  /* never called when PTRS_PER_PMD > 1 */
    5.97 @@ -320,9 +293,6 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
    5.98  {
    5.99  	unsigned long flags; /* can be called from interrupt context */
   5.100  
   5.101 -	xen_pgd_unpin(__pa(pgd));
   5.102 -	make_page_writable(pgd);
   5.103 -
   5.104  	if (PTRS_PER_PMD > 1)
   5.105  		return;
   5.106  
   5.107 @@ -357,6 +327,15 @@ out_oom:
   5.108  void pgd_free(pgd_t *pgd)
   5.109  {
   5.110  	int i;
   5.111 +	pte_t *ptep = virt_to_ptep(pgd);
   5.112 +
   5.113 +	if (!pte_write(*ptep)) {
   5.114 +		xen_pgd_unpin(__pa(pgd));
   5.115 +		HYPERVISOR_update_va_mapping(
   5.116 +			(unsigned long)pgd,
   5.117 +			pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
   5.118 +			0);
   5.119 +	}
   5.120  
   5.121  	/* in the PAE case user pgd entries are overwritten before usage */
   5.122  	if (PTRS_PER_PMD > 1)
   5.123 @@ -369,28 +348,19 @@ void pgd_free(pgd_t *pgd)
   5.124  #ifndef CONFIG_XEN_SHADOW_MODE
   5.125  void make_lowmem_page_readonly(void *va)
   5.126  {
   5.127 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   5.128 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   5.129 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   5.130 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   5.131 +	pte_t *pte = virt_to_ptep(va);
   5.132  	set_pte(pte, pte_wrprotect(*pte));
   5.133  }
   5.134  
   5.135  void make_lowmem_page_writable(void *va)
   5.136  {
   5.137 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   5.138 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   5.139 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   5.140 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   5.141 +	pte_t *pte = virt_to_ptep(va);
   5.142  	set_pte(pte, pte_mkwrite(*pte));
   5.143  }
   5.144  
   5.145  void make_page_readonly(void *va)
   5.146  {
   5.147 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   5.148 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   5.149 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   5.150 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   5.151 +	pte_t *pte = virt_to_ptep(va);
   5.152  	set_pte(pte, pte_wrprotect(*pte));
   5.153  	if ( (unsigned long)va >= (unsigned long)high_memory )
   5.154  	{
   5.155 @@ -405,10 +375,7 @@ void make_page_readonly(void *va)
   5.156  
   5.157  void make_page_writable(void *va)
   5.158  {
   5.159 -	pgd_t *pgd = pgd_offset_k((unsigned long)va);
   5.160 -	pud_t *pud = pud_offset(pgd, (unsigned long)va);
   5.161 -	pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
   5.162 -	pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
   5.163 +	pte_t *pte = virt_to_ptep(va);
   5.164  	set_pte(pte, pte_mkwrite(*pte));
   5.165  	if ( (unsigned long)va >= (unsigned long)high_memory )
   5.166  	{
   5.167 @@ -439,3 +406,91 @@ void make_pages_writable(void *va, unsig
   5.168  	}
   5.169  }
   5.170  #endif /* CONFIG_XEN_SHADOW_MODE */
   5.171 +
   5.172 +void mm_pin(struct mm_struct *mm)
   5.173 +{
   5.174 +    pgd_t       *pgd;
   5.175 +    struct page *page;
   5.176 +    int          i;
   5.177 +
   5.178 +    spin_lock(&mm->page_table_lock);
   5.179 +
   5.180 +    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
   5.181 +    {
   5.182 +        if ( *(unsigned long *)pgd == 0 )
   5.183 +            continue;
   5.184 +        page = pmd_page(*(pmd_t *)pgd);
   5.185 +        if ( !PageHighMem(page) )
   5.186 +            HYPERVISOR_update_va_mapping(
   5.187 +                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
   5.188 +                pfn_pte(page_to_pfn(page), PAGE_KERNEL_RO), 0);
   5.189 +    }
   5.190 +
   5.191 +    HYPERVISOR_update_va_mapping(
   5.192 +        (unsigned long)mm->pgd,
   5.193 +        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), 0);
   5.194 +    xen_pgd_pin(__pa(mm->pgd));
   5.195 +
   5.196 +    mm->context.pinned = 1;
   5.197 +
   5.198 +    spin_unlock(&mm->page_table_lock);
   5.199 +}
   5.200 +
   5.201 +void mm_unpin(struct mm_struct *mm)
   5.202 +{
   5.203 +    pgd_t       *pgd;
   5.204 +    struct page *page;
   5.205 +    int          i;
   5.206 +
   5.207 +    spin_lock(&mm->page_table_lock);
   5.208 +
   5.209 +    xen_pgd_unpin(__pa(mm->pgd));
   5.210 +    HYPERVISOR_update_va_mapping(
   5.211 +        (unsigned long)mm->pgd,
   5.212 +        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
   5.213 +
   5.214 +    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
   5.215 +    {
   5.216 +        if ( *(unsigned long *)pgd == 0 )
   5.217 +            continue;
   5.218 +        page = pmd_page(*(pmd_t *)pgd);
   5.219 +        if ( !PageHighMem(page) )
   5.220 +            HYPERVISOR_update_va_mapping(
   5.221 +                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
   5.222 +                pfn_pte(page_to_pfn(page), PAGE_KERNEL), 0);
   5.223 +    }
   5.224 +
   5.225 +    mm->context.pinned = 0;
   5.226 +
   5.227 +    spin_unlock(&mm->page_table_lock);
   5.228 +}
   5.229 +
   5.230 +void _arch_exit_mmap(struct mm_struct *mm)
   5.231 +{
   5.232 +    unsigned int cpu = smp_processor_id();
   5.233 +    struct task_struct *tsk = current;
   5.234 +
   5.235 +    task_lock(tsk);
   5.236 +
   5.237 +    /*
   5.238 +     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
   5.239 +     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
   5.240 +     */
   5.241 +    if ( tsk->active_mm == mm )
   5.242 +    {
   5.243 +        tsk->active_mm = &init_mm;
   5.244 +        atomic_inc(&init_mm.mm_count);
   5.245 +
   5.246 +        cpu_set(cpu, init_mm.cpu_vm_mask);
   5.247 +        load_cr3(swapper_pg_dir);
   5.248 +        cpu_clear(cpu, mm->cpu_vm_mask);
   5.249 +
   5.250 +        atomic_dec(&mm->mm_count);
   5.251 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
   5.252 +    }
   5.253 +
   5.254 +    task_unlock(tsk);
   5.255 +
   5.256 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
   5.257 +        mm_unpin(mm);
   5.258 +}
     6.1 --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c	Wed Apr 27 16:55:30 2005 +0000
     6.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c	Wed Apr 27 16:55:50 2005 +0000
     6.3 @@ -258,7 +258,8 @@ unsigned long allocate_empty_lowmem_regi
     6.4          pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
     6.5          pfn_array[i] = pte->pte >> PAGE_SHIFT;
     6.6          xen_l1_entry_update(pte, 0);
     6.7 -        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = (u32)INVALID_P2M_ENTRY;
     6.8 +        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
     6.9 +            (u32)INVALID_P2M_ENTRY;
    6.10      }
    6.11  
    6.12      /* Flush updates through and flush the TLB. */
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h	Wed Apr 27 16:55:50 2005 +0000
     7.3 @@ -0,0 +1,22 @@
     7.4 +#ifndef __i386_MMU_H
     7.5 +#define __i386_MMU_H
     7.6 +
     7.7 +#include <asm/semaphore.h>
     7.8 +/*
     7.9 + * The i386 doesn't have a mmu context, but
    7.10 + * we put the segment information here.
    7.11 + *
    7.12 + * cpu_vm_mask is used to optimize ldt flushing.
    7.13 + */
    7.14 +typedef struct { 
    7.15 +	int size;
    7.16 +	struct semaphore sem;
    7.17 +	void *ldt;
    7.18 +	unsigned pinned:1;
    7.19 +} mm_context_t;
    7.20 +
    7.21 +/* mm/memory.c:exit_mmap hook */
    7.22 +extern void _arch_exit_mmap(struct mm_struct *mm);
    7.23 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
    7.24 +
    7.25 +#endif
     8.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Wed Apr 27 16:55:30 2005 +0000
     8.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Wed Apr 27 16:55:50 2005 +0000
     8.3 @@ -41,6 +41,9 @@ static inline void __prepare_arch_switch
     8.4  		: : "r" (0) );
     8.5  }
     8.6  
     8.7 +extern void mm_pin(struct mm_struct *mm);
     8.8 +extern void mm_unpin(struct mm_struct *mm);
     8.9 +
    8.10  static inline void switch_mm(struct mm_struct *prev,
    8.11  			     struct mm_struct *next,
    8.12  			     struct task_struct *tsk)
    8.13 @@ -49,6 +52,9 @@ static inline void switch_mm(struct mm_s
    8.14  	struct mmuext_op _op[2], *op = _op;
    8.15  
    8.16  	if (likely(prev != next)) {
    8.17 +		if (!next->context.pinned)
    8.18 +			mm_pin(next);
    8.19 +
    8.20  		/* stop flush ipis for the previous mm */
    8.21  		cpu_clear(cpu, prev->cpu_vm_mask);
    8.22  #if 0 /* XEN: no lazy tlb */
    8.23 @@ -92,20 +98,10 @@ static inline void switch_mm(struct mm_s
    8.24  #endif
    8.25  }
    8.26  
    8.27 -/*
    8.28 - * XEN: We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
    8.29 - * *much* faster this way, as no tlb flushes means much bigger wrpt batches.
    8.30 - */
    8.31 -#define deactivate_mm(tsk, mm) do {					\
    8.32 -	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0));			\
    8.33 -	if ((mm) && cpu_isset(smp_processor_id(), (mm)->cpu_vm_mask)) {	\
    8.34 -		cpu_clear(smp_processor_id(), (mm)->cpu_vm_mask);	\
    8.35 -		load_cr3(swapper_pg_dir);				\
    8.36 -	}								\
    8.37 -} while (0)
    8.38 +#define deactivate_mm(tsk, mm) \
    8.39 +	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
    8.40  
    8.41 -#define activate_mm(prev, next) do {		\
    8.42 -	switch_mm((prev),(next),NULL);		\
    8.43 -} while (0)
    8.44 +#define activate_mm(prev, next) \
    8.45 +	switch_mm((prev),(next),NULL)
    8.46  
    8.47  #endif
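
Taken together, the pgtable.c and mmu_context.h hunks above move from eager to lazy pinning: pgd_ctor/pgd_dtor no longer write-protect and pin the pgd at allocation time; instead switch_mm() calls mm_pin() the first time an mm is actually loaded (write-protecting its page tables and issuing xen_pgd_pin), and _arch_exit_mmap() switches away to swapper_pg_dir and calls mm_unpin() once the mm is going away, so teardown runs on ordinary writable page tables. The sketch below is a minimal user-space analogy of that lifecycle, using mprotect() in place of the hypervisor update_va_mapping/pin calls; struct toy_mm, toy_switch_mm() and toy_exit_mmap() are invented names for illustration, not kernel interfaces.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

struct toy_mm {                     /* stands in for mm_struct + mm_context_t */
    void *pgd;                      /* one page standing in for the page tables */
    int   pinned;                   /* mirrors mm->context.pinned */
};

static void toy_pin(struct toy_mm *mm)
{
    /* mm_pin(): page tables become read-only while Xen treats them as a pgd */
    mprotect(mm->pgd, getpagesize(), PROT_READ);
    mm->pinned = 1;
}

static void toy_unpin(struct toy_mm *mm)
{
    /* mm_unpin(): page tables become ordinary writable memory again */
    mprotect(mm->pgd, getpagesize(), PROT_READ | PROT_WRITE);
    mm->pinned = 0;
}

static void toy_switch_mm(struct toy_mm *next)
{
    if (!next->pinned)              /* lazy: pin only on first real use */
        toy_pin(next);
    /* ...cr3 load / mmuext ops happen here in the real switch_mm()... */
}

static void toy_exit_mmap(struct toy_mm *mm)
{
    if (mm->pinned)                 /* unpin before the page tables are torn down */
        toy_unpin(mm);
}

int main(void)
{
    struct toy_mm mm;
    mm.pgd = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mm.pgd == MAP_FAILED)
        return 1;
    mm.pinned = 0;

    toy_switch_mm(&mm);             /* first switch pins */
    printf("after switch_mm: pinned=%d\n", mm.pinned);
    toy_exit_mmap(&mm);             /* teardown unpins */
    printf("after exit_mmap:  pinned=%d\n", mm.pinned);
    munmap(mm.pgd, getpagesize());
    return 0;
}
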
     9.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Wed Apr 27 16:55:30 2005 +0000
     9.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Wed Apr 27 16:55:50 2005 +0000
     9.3 @@ -11,10 +11,23 @@
     9.4  #define pmd_populate_kernel(mm, pmd, pte) \
     9.5  		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
     9.6  
     9.7 -#define pmd_populate(mm, pmd, pte) 				\
     9.8 -	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
     9.9 -		((unsigned long long)page_to_pfn(pte) <<	\
    9.10 -			(unsigned long long) PAGE_SHIFT)))
    9.11 +#define pmd_populate(mm, pmd, pte) 					\
    9.12 +do {									\
    9.13 +	if (unlikely((mm)->context.pinned)) {				\
    9.14 +		if (!PageHighMem(pte))					\
    9.15 +			HYPERVISOR_update_va_mapping(			\
    9.16 +			  (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
    9.17 +			  pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0);\
    9.18 +		set_pmd(pmd, __pmd(_PAGE_TABLE +			\
    9.19 +			((unsigned long long)page_to_pfn(pte) <<	\
    9.20 +				(unsigned long long) PAGE_SHIFT)));	\
    9.21 +	} else {							\
    9.22 +		*(pmd) = __pmd(_PAGE_TABLE +				\
    9.23 +			((unsigned long long)page_to_pfn(pte) <<	\
    9.24 +				(unsigned long long) PAGE_SHIFT));	\
    9.25 +	}								\
    9.26 +} while (0)
    9.27 +
    9.28  /*
    9.29   * Allocate and free page tables.
    9.30   */
    10.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Apr 27 16:55:30 2005 +0000
    10.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Apr 27 16:55:50 2005 +0000
    10.3 @@ -35,12 +35,9 @@ extern unsigned long empty_zero_page[102
    10.4  extern pgd_t swapper_pg_dir[1024];
    10.5  extern kmem_cache_t *pgd_cache;
    10.6  extern kmem_cache_t *pmd_cache;
    10.7 -extern kmem_cache_t *pte_cache;
    10.8  extern spinlock_t pgd_lock;
    10.9  extern struct page *pgd_list;
   10.10  
   10.11 -void pte_ctor(void *, kmem_cache_t *, unsigned long);
   10.12 -void pte_dtor(void *, kmem_cache_t *, unsigned long);
   10.13  void pmd_ctor(void *, kmem_cache_t *, unsigned long);
   10.14  void pgd_ctor(void *, kmem_cache_t *, unsigned long);
   10.15  void pgd_dtor(void *, kmem_cache_t *, unsigned long);
   10.16 @@ -448,12 +445,17 @@ void make_pages_writable(void *va, unsig
   10.17  #define make_pages_writable(_va, _nr)  ((void)0)
   10.18  #endif
   10.19  
   10.20 -#define arbitrary_virt_to_machine(__va)					\
   10.21 +#define virt_to_ptep(__va)						\
   10.22  ({									\
   10.23  	pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));		\
   10.24  	pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));	\
   10.25  	pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));	\
   10.26 -	pte_t *__pte = pte_offset_kernel(__pmd, (unsigned long)(__va));	\
   10.27 +	pte_offset_kernel(__pmd, (unsigned long)(__va));		\
   10.28 +})
   10.29 +
   10.30 +#define arbitrary_virt_to_machine(__va)					\
   10.31 +({									\
   10.32 +	pte_t *__pte = virt_to_ptep(__va);				\
   10.33  	unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK;	\
   10.34  	__pa | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
   10.35  })
    11.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h	Wed Apr 27 16:55:30 2005 +0000
    11.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h	Wed Apr 27 16:55:50 2005 +0000
    11.3 @@ -40,24 +40,21 @@ extern unsigned long pgkern_mask;
    11.4  
    11.5  static inline void flush_tlb_mm(struct mm_struct *mm)
    11.6  {
    11.7 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    11.8 -	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
    11.9 +	if (mm == current->active_mm)
   11.10  		__flush_tlb();
   11.11  }
   11.12  
   11.13  static inline void flush_tlb_page(struct vm_area_struct *vma,
   11.14  	unsigned long addr)
   11.15  {
   11.16 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   11.17 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   11.18 +	if (vma->vm_mm == current->active_mm)
   11.19  		__flush_tlb_one(addr);
   11.20  }
   11.21  
   11.22  static inline void flush_tlb_range(struct vm_area_struct *vma,
   11.23  	unsigned long start, unsigned long end)
   11.24  {
   11.25 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   11.26 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   11.27 +	if (vma->vm_mm == current->active_mm)
   11.28  		__flush_tlb();
   11.29  }
   11.30  
    12.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h	Wed Apr 27 16:55:30 2005 +0000
    12.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h	Wed Apr 27 16:55:50 2005 +0000
    12.3 @@ -44,24 +44,21 @@ extern unsigned long pgkern_mask;
    12.4  
    12.5  static inline void flush_tlb_mm(struct mm_struct *mm)
    12.6  {
    12.7 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
    12.8 -	if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
    12.9 +	if (mm == current->active_mm)
   12.10  		__flush_tlb();
   12.11  }
   12.12  
   12.13  static inline void flush_tlb_page(struct vm_area_struct *vma,
   12.14  	unsigned long addr)
   12.15  {
   12.16 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   12.17 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   12.18 +	if (vma->vm_mm == current->active_mm)
   12.19  		__flush_tlb_one(addr);
   12.20  }
   12.21  
   12.22  static inline void flush_tlb_range(struct vm_area_struct *vma,
   12.23  	unsigned long start, unsigned long end)
   12.24  {
   12.25 -	/* XEN: cpu_vm_mask is more accurate than active_mm. */
   12.26 -	if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
   12.27 +	if (vma->vm_mm == current->active_mm)
   12.28  		__flush_tlb();
   12.29  }
   12.30  
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/linux-2.6.11-xen-sparse/mm/mmap.c	Wed Apr 27 16:55:50 2005 +0000
    13.3 @@ -0,0 +1,2108 @@
    13.4 +/*
    13.5 + * mm/mmap.c
    13.6 + *
    13.7 + * Written by obz.
    13.8 + *
    13.9 + * Address space accounting code	<alan@redhat.com>
   13.10 + */
   13.11 +
   13.12 +#include <linux/slab.h>
   13.13 +#include <linux/mm.h>
   13.14 +#include <linux/shm.h>
   13.15 +#include <linux/mman.h>
   13.16 +#include <linux/pagemap.h>
   13.17 +#include <linux/swap.h>
   13.18 +#include <linux/syscalls.h>
   13.19 +#include <linux/init.h>
   13.20 +#include <linux/file.h>
   13.21 +#include <linux/fs.h>
   13.22 +#include <linux/personality.h>
   13.23 +#include <linux/security.h>
   13.24 +#include <linux/hugetlb.h>
   13.25 +#include <linux/profile.h>
   13.26 +#include <linux/module.h>
   13.27 +#include <linux/acct.h>
   13.28 +#include <linux/mount.h>
   13.29 +#include <linux/mempolicy.h>
   13.30 +#include <linux/rmap.h>
   13.31 +
   13.32 +#include <asm/uaccess.h>
   13.33 +#include <asm/cacheflush.h>
   13.34 +#include <asm/tlb.h>
   13.35 +
   13.36 +/*
   13.37 + * WARNING: the debugging will use recursive algorithms so never enable this
   13.38 + * unless you know what you are doing.
   13.39 + */
   13.40 +#undef DEBUG_MM_RB
   13.41 +
   13.42 +/* description of effects of mapping type and prot in current implementation.
   13.43 + * this is due to the limited x86 page protection hardware.  The expected
   13.44 + * behavior is in parens:
   13.45 + *
   13.46 + * map_type	prot
   13.47 + *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
   13.48 + * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   13.49 + *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
   13.50 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   13.51 + *		
   13.52 + * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   13.53 + *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
   13.54 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   13.55 + *
   13.56 + */
   13.57 +pgprot_t protection_map[16] = {
   13.58 +	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
   13.59 +	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
   13.60 +};
   13.61 +
   13.62 +int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
   13.63 +int sysctl_overcommit_ratio = 50;	/* default is 50% */
   13.64 +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
   13.65 +atomic_t vm_committed_space = ATOMIC_INIT(0);
   13.66 +
   13.67 +/*
   13.68 + * Check that a process has enough memory to allocate a new virtual
   13.69 + * mapping. 0 means there is enough memory for the allocation to
   13.70 + * succeed and -ENOMEM implies there is not.
   13.71 + *
   13.72 + * We currently support three overcommit policies, which are set via the
   13.73 + * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
   13.74 + *
   13.75 + * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
   13.76 + * Additional code 2002 Jul 20 by Robert Love.
   13.77 + *
   13.78 + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
   13.79 + *
   13.80 + * Note this is a helper function intended to be used by LSMs which
   13.81 + * wish to use this logic.
   13.82 + */
   13.83 +int __vm_enough_memory(long pages, int cap_sys_admin)
   13.84 +{
   13.85 +	unsigned long free, allowed;
   13.86 +
   13.87 +	vm_acct_memory(pages);
   13.88 +
   13.89 +	/*
   13.90 +	 * Sometimes we want to use more memory than we have
   13.91 +	 */
   13.92 +	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
   13.93 +		return 0;
   13.94 +
   13.95 +	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
   13.96 +		unsigned long n;
   13.97 +
   13.98 +		free = get_page_cache_size();
   13.99 +		free += nr_swap_pages;
  13.100 +
  13.101 +		/*
  13.102 +		 * Any slabs which are created with the
  13.103 +		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
  13.104 +		 * which are reclaimable, under pressure.  The dentry
  13.105 +		 * cache and most inode caches should fall into this
  13.106 +		 */
  13.107 +		free += atomic_read(&slab_reclaim_pages);
  13.108 +
  13.109 +		/*
  13.110 +		 * Leave the last 3% for root
  13.111 +		 */
  13.112 +		if (!cap_sys_admin)
  13.113 +			free -= free / 32;
  13.114 +
  13.115 +		if (free > pages)
  13.116 +			return 0;
  13.117 +
  13.118 +		/*
  13.119 +		 * nr_free_pages() is very expensive on large systems,
  13.120 +		 * only call if we're about to fail.
  13.121 +		 */
  13.122 +		n = nr_free_pages();
  13.123 +		if (!cap_sys_admin)
  13.124 +			n -= n / 32;
  13.125 +		free += n;
  13.126 +
  13.127 +		if (free > pages)
  13.128 +			return 0;
  13.129 +		vm_unacct_memory(pages);
  13.130 +		return -ENOMEM;
  13.131 +	}
  13.132 +
  13.133 +	allowed = (totalram_pages - hugetlb_total_pages())
  13.134 +	       	* sysctl_overcommit_ratio / 100;
  13.135 +	/*
  13.136 +	 * Leave the last 3% for root
  13.137 +	 */
  13.138 +	if (!cap_sys_admin)
  13.139 +		allowed -= allowed / 32;
  13.140 +	allowed += total_swap_pages;
  13.141 +
  13.142 +	/* Don't let a single process grow too big:
  13.143 +	   leave 3% of the size of this process for other processes */
  13.144 +	allowed -= current->mm->total_vm / 32;
  13.145 +
  13.146 +	if (atomic_read(&vm_committed_space) < allowed)
  13.147 +		return 0;
  13.148 +
  13.149 +	vm_unacct_memory(pages);
  13.150 +
  13.151 +	return -ENOMEM;
  13.152 +}
  13.153 +
  13.154 +EXPORT_SYMBOL(sysctl_overcommit_memory);
  13.155 +EXPORT_SYMBOL(sysctl_overcommit_ratio);
  13.156 +EXPORT_SYMBOL(sysctl_max_map_count);
  13.157 +EXPORT_SYMBOL(vm_committed_space);
  13.158 +EXPORT_SYMBOL(__vm_enough_memory);
  13.159 +
  13.160 +/*
  13.161 + * Requires inode->i_mapping->i_mmap_lock
  13.162 + */
  13.163 +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
  13.164 +		struct file *file, struct address_space *mapping)
  13.165 +{
  13.166 +	if (vma->vm_flags & VM_DENYWRITE)
  13.167 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
  13.168 +	if (vma->vm_flags & VM_SHARED)
  13.169 +		mapping->i_mmap_writable--;
  13.170 +
  13.171 +	flush_dcache_mmap_lock(mapping);
  13.172 +	if (unlikely(vma->vm_flags & VM_NONLINEAR))
  13.173 +		list_del_init(&vma->shared.vm_set.list);
  13.174 +	else
  13.175 +		vma_prio_tree_remove(vma, &mapping->i_mmap);
  13.176 +	flush_dcache_mmap_unlock(mapping);
  13.177 +}
  13.178 +
  13.179 +/*
  13.180 + * Remove one vm structure and free it.
  13.181 + */
  13.182 +static void remove_vm_struct(struct vm_area_struct *vma)
  13.183 +{
  13.184 +	struct file *file = vma->vm_file;
  13.185 +
  13.186 +	might_sleep();
  13.187 +	if (file) {
  13.188 +		struct address_space *mapping = file->f_mapping;
  13.189 +		spin_lock(&mapping->i_mmap_lock);
  13.190 +		__remove_shared_vm_struct(vma, file, mapping);
  13.191 +		spin_unlock(&mapping->i_mmap_lock);
  13.192 +	}
  13.193 +	if (vma->vm_ops && vma->vm_ops->close)
  13.194 +		vma->vm_ops->close(vma);
  13.195 +	if (file)
  13.196 +		fput(file);
  13.197 +	anon_vma_unlink(vma);
  13.198 +	mpol_free(vma_policy(vma));
  13.199 +	kmem_cache_free(vm_area_cachep, vma);
  13.200 +}
  13.201 +
  13.202 +/*
  13.203 + *  sys_brk() for the most part doesn't need the global kernel
  13.204 + *  lock, except when an application is doing something nasty
  13.205 + *  like trying to un-brk an area that has already been mapped
  13.206 + *  to a regular file.  in this case, the unmapping will need
  13.207 + *  to invoke file system routines that need the global lock.
  13.208 + */
  13.209 +asmlinkage unsigned long sys_brk(unsigned long brk)
  13.210 +{
  13.211 +	unsigned long rlim, retval;
  13.212 +	unsigned long newbrk, oldbrk;
  13.213 +	struct mm_struct *mm = current->mm;
  13.214 +
  13.215 +	down_write(&mm->mmap_sem);
  13.216 +
  13.217 +	if (brk < mm->end_code)
  13.218 +		goto out;
  13.219 +	newbrk = PAGE_ALIGN(brk);
  13.220 +	oldbrk = PAGE_ALIGN(mm->brk);
  13.221 +	if (oldbrk == newbrk)
  13.222 +		goto set_brk;
  13.223 +
  13.224 +	/* Always allow shrinking brk. */
  13.225 +	if (brk <= mm->brk) {
  13.226 +		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
  13.227 +			goto set_brk;
  13.228 +		goto out;
  13.229 +	}
  13.230 +
  13.231 +	/* Check against rlimit.. */
  13.232 +	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
  13.233 +	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
  13.234 +		goto out;
  13.235 +
  13.236 +	/* Check against existing mmap mappings. */
  13.237 +	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
  13.238 +		goto out;
  13.239 +
  13.240 +	/* Ok, looks good - let it rip. */
  13.241 +	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
  13.242 +		goto out;
  13.243 +set_brk:
  13.244 +	mm->brk = brk;
  13.245 +out:
  13.246 +	retval = mm->brk;
  13.247 +	up_write(&mm->mmap_sem);
  13.248 +	return retval;
  13.249 +}
  13.250 +
  13.251 +#ifdef DEBUG_MM_RB
  13.252 +static int browse_rb(struct rb_root *root)
  13.253 +{
  13.254 +	int i = 0, j;
  13.255 +	struct rb_node *nd, *pn = NULL;
  13.256 +	unsigned long prev = 0, pend = 0;
  13.257 +
  13.258 +	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
  13.259 +		struct vm_area_struct *vma;
  13.260 +		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
  13.261 +		if (vma->vm_start < prev)
  13.262 +			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
  13.263 +		if (vma->vm_start < pend)
  13.264 +			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
  13.265 +		if (vma->vm_start > vma->vm_end)
  13.266 +			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
  13.267 +		i++;
  13.268 +		pn = nd;
  13.269 +	}
  13.270 +	j = 0;
  13.271 +	for (nd = pn; nd; nd = rb_prev(nd)) {
  13.272 +		j++;
  13.273 +	}
  13.274 +	if (i != j)
  13.275 +		printk("backwards %d, forwards %d\n", j, i), i = 0;
  13.276 +	return i;
  13.277 +}
  13.278 +
  13.279 +void validate_mm(struct mm_struct *mm)
  13.280 +{
  13.281 +	int bug = 0;
  13.282 +	int i = 0;
  13.283 +	struct vm_area_struct *tmp = mm->mmap;
  13.284 +	while (tmp) {
  13.285 +		tmp = tmp->vm_next;
  13.286 +		i++;
  13.287 +	}
  13.288 +	if (i != mm->map_count)
  13.289 +		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
  13.290 +	i = browse_rb(&mm->mm_rb);
  13.291 +	if (i != mm->map_count)
  13.292 +		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
  13.293 +	if (bug)
  13.294 +		BUG();
  13.295 +}
  13.296 +#else
  13.297 +#define validate_mm(mm) do { } while (0)
  13.298 +#endif
  13.299 +
  13.300 +static struct vm_area_struct *
  13.301 +find_vma_prepare(struct mm_struct *mm, unsigned long addr,
  13.302 +		struct vm_area_struct **pprev, struct rb_node ***rb_link,
  13.303 +		struct rb_node ** rb_parent)
  13.304 +{
  13.305 +	struct vm_area_struct * vma;
  13.306 +	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
  13.307 +
  13.308 +	__rb_link = &mm->mm_rb.rb_node;
  13.309 +	rb_prev = __rb_parent = NULL;
  13.310 +	vma = NULL;
  13.311 +
  13.312 +	while (*__rb_link) {
  13.313 +		struct vm_area_struct *vma_tmp;
  13.314 +
  13.315 +		__rb_parent = *__rb_link;
  13.316 +		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
  13.317 +
  13.318 +		if (vma_tmp->vm_end > addr) {
  13.319 +			vma = vma_tmp;
  13.320 +			if (vma_tmp->vm_start <= addr)
  13.321 +				return vma;
  13.322 +			__rb_link = &__rb_parent->rb_left;
  13.323 +		} else {
  13.324 +			rb_prev = __rb_parent;
  13.325 +			__rb_link = &__rb_parent->rb_right;
  13.326 +		}
  13.327 +	}
  13.328 +
  13.329 +	*pprev = NULL;
  13.330 +	if (rb_prev)
  13.331 +		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  13.332 +	*rb_link = __rb_link;
  13.333 +	*rb_parent = __rb_parent;
  13.334 +	return vma;
  13.335 +}
  13.336 +
  13.337 +static inline void
  13.338 +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
  13.339 +		struct vm_area_struct *prev, struct rb_node *rb_parent)
  13.340 +{
  13.341 +	if (prev) {
  13.342 +		vma->vm_next = prev->vm_next;
  13.343 +		prev->vm_next = vma;
  13.344 +	} else {
  13.345 +		mm->mmap = vma;
  13.346 +		if (rb_parent)
  13.347 +			vma->vm_next = rb_entry(rb_parent,
  13.348 +					struct vm_area_struct, vm_rb);
  13.349 +		else
  13.350 +			vma->vm_next = NULL;
  13.351 +	}
  13.352 +}
  13.353 +
  13.354 +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
  13.355 +		struct rb_node **rb_link, struct rb_node *rb_parent)
  13.356 +{
  13.357 +	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
  13.358 +	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  13.359 +}
  13.360 +
  13.361 +static inline void __vma_link_file(struct vm_area_struct *vma)
  13.362 +{
  13.363 +	struct file * file;
  13.364 +
  13.365 +	file = vma->vm_file;
  13.366 +	if (file) {
  13.367 +		struct address_space *mapping = file->f_mapping;
  13.368 +
  13.369 +		if (vma->vm_flags & VM_DENYWRITE)
  13.370 +			atomic_dec(&file->f_dentry->d_inode->i_writecount);
  13.371 +		if (vma->vm_flags & VM_SHARED)
  13.372 +			mapping->i_mmap_writable++;
  13.373 +
  13.374 +		flush_dcache_mmap_lock(mapping);
  13.375 +		if (unlikely(vma->vm_flags & VM_NONLINEAR))
  13.376 +			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
  13.377 +		else
  13.378 +			vma_prio_tree_insert(vma, &mapping->i_mmap);
  13.379 +		flush_dcache_mmap_unlock(mapping);
  13.380 +	}
  13.381 +}
  13.382 +
  13.383 +static void
  13.384 +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  13.385 +	struct vm_area_struct *prev, struct rb_node **rb_link,
  13.386 +	struct rb_node *rb_parent)
  13.387 +{
  13.388 +	__vma_link_list(mm, vma, prev, rb_parent);
  13.389 +	__vma_link_rb(mm, vma, rb_link, rb_parent);
  13.390 +	__anon_vma_link(vma);
  13.391 +}
  13.392 +
  13.393 +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  13.394 +			struct vm_area_struct *prev, struct rb_node **rb_link,
  13.395 +			struct rb_node *rb_parent)
  13.396 +{
  13.397 +	struct address_space *mapping = NULL;
  13.398 +
  13.399 +	if (vma->vm_file)
  13.400 +		mapping = vma->vm_file->f_mapping;
  13.401 +
  13.402 +	if (mapping) {
  13.403 +		spin_lock(&mapping->i_mmap_lock);
  13.404 +		vma->vm_truncate_count = mapping->truncate_count;
  13.405 +	}
  13.406 +	anon_vma_lock(vma);
  13.407 +
  13.408 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  13.409 +	__vma_link_file(vma);
  13.410 +
  13.411 +	anon_vma_unlock(vma);
  13.412 +	if (mapping)
  13.413 +		spin_unlock(&mapping->i_mmap_lock);
  13.414 +
  13.415 +	mm->map_count++;
  13.416 +	validate_mm(mm);
  13.417 +}
  13.418 +
  13.419 +/*
  13.420 + * Helper for vma_adjust in the split_vma insert case:
  13.421 + * insert vm structure into list and rbtree and anon_vma,
  13.422 + * but it has already been inserted into prio_tree earlier.
  13.423 + */
  13.424 +static void
  13.425 +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  13.426 +{
  13.427 +	struct vm_area_struct * __vma, * prev;
  13.428 +	struct rb_node ** rb_link, * rb_parent;
  13.429 +
  13.430 +	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
  13.431 +	if (__vma && __vma->vm_start < vma->vm_end)
  13.432 +		BUG();
  13.433 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  13.434 +	mm->map_count++;
  13.435 +}
  13.436 +
  13.437 +static inline void
  13.438 +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  13.439 +		struct vm_area_struct *prev)
  13.440 +{
  13.441 +	prev->vm_next = vma->vm_next;
  13.442 +	rb_erase(&vma->vm_rb, &mm->mm_rb);
  13.443 +	if (mm->mmap_cache == vma)
  13.444 +		mm->mmap_cache = prev;
  13.445 +}
  13.446 +
  13.447 +/*
  13.448 + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  13.449 + * is already present in an i_mmap tree without adjusting the tree.
  13.450 + * The following helper function should be used when such adjustments
  13.451 + * are necessary.  The "insert" vma (if any) is to be inserted
  13.452 + * before we drop the necessary locks.
  13.453 + */
  13.454 +void vma_adjust(struct vm_area_struct *vma, unsigned long start,
  13.455 +	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
  13.456 +{
  13.457 +	struct mm_struct *mm = vma->vm_mm;
  13.458 +	struct vm_area_struct *next = vma->vm_next;
  13.459 +	struct vm_area_struct *importer = NULL;
  13.460 +	struct address_space *mapping = NULL;
  13.461 +	struct prio_tree_root *root = NULL;
  13.462 +	struct file *file = vma->vm_file;
  13.463 +	struct anon_vma *anon_vma = NULL;
  13.464 +	long adjust_next = 0;
  13.465 +	int remove_next = 0;
  13.466 +
  13.467 +	if (next && !insert) {
  13.468 +		if (end >= next->vm_end) {
  13.469 +			/*
  13.470 +			 * vma expands, overlapping all the next, and
  13.471 +			 * perhaps the one after too (mprotect case 6).
  13.472 +			 */
  13.473 +again:			remove_next = 1 + (end > next->vm_end);
  13.474 +			end = next->vm_end;
  13.475 +			anon_vma = next->anon_vma;
  13.476 +			importer = vma;
  13.477 +		} else if (end > next->vm_start) {
  13.478 +			/*
  13.479 +			 * vma expands, overlapping part of the next:
  13.480 +			 * mprotect case 5 shifting the boundary up.
  13.481 +			 */
  13.482 +			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
  13.483 +			anon_vma = next->anon_vma;
  13.484 +			importer = vma;
  13.485 +		} else if (end < vma->vm_end) {
  13.486 +			/*
  13.487 +			 * vma shrinks, and !insert tells it's not
  13.488 +			 * split_vma inserting another: so it must be
  13.489 +			 * mprotect case 4 shifting the boundary down.
  13.490 +			 */
  13.491 +			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
  13.492 +			anon_vma = next->anon_vma;
  13.493 +			importer = next;
  13.494 +		}
  13.495 +	}
  13.496 +
  13.497 +	if (file) {
  13.498 +		mapping = file->f_mapping;
  13.499 +		if (!(vma->vm_flags & VM_NONLINEAR))
  13.500 +			root = &mapping->i_mmap;
  13.501 +		spin_lock(&mapping->i_mmap_lock);
  13.502 +		if (importer &&
  13.503 +		    vma->vm_truncate_count != next->vm_truncate_count) {
  13.504 +			/*
  13.505 +			 * unmap_mapping_range might be in progress:
  13.506 +			 * ensure that the expanding vma is rescanned.
  13.507 +			 */
  13.508 +			importer->vm_truncate_count = 0;
  13.509 +		}
  13.510 +		if (insert) {
  13.511 +			insert->vm_truncate_count = vma->vm_truncate_count;
  13.512 +			/*
  13.513 +			 * Put into prio_tree now, so instantiated pages
  13.514 +			 * are visible to arm/parisc __flush_dcache_page
  13.515 +			 * throughout; but we cannot insert into address
  13.516 +			 * space until vma start or end is updated.
  13.517 +			 */
  13.518 +			__vma_link_file(insert);
  13.519 +		}
  13.520 +	}
  13.521 +
  13.522 +	/*
  13.523 +	 * When changing only vma->vm_end, we don't really need
  13.524 +	 * anon_vma lock: but is that case worth optimizing out?
  13.525 +	 */
  13.526 +	if (vma->anon_vma)
  13.527 +		anon_vma = vma->anon_vma;
  13.528 +	if (anon_vma) {
  13.529 +		spin_lock(&anon_vma->lock);
  13.530 +		/*
  13.531 +		 * Easily overlooked: when mprotect shifts the boundary,
  13.532 +		 * make sure the expanding vma has anon_vma set if the
  13.533 +		 * shrinking vma had, to cover any anon pages imported.
  13.534 +		 */
  13.535 +		if (importer && !importer->anon_vma) {
  13.536 +			importer->anon_vma = anon_vma;
  13.537 +			__anon_vma_link(importer);
  13.538 +		}
  13.539 +	}
  13.540 +
  13.541 +	if (root) {
  13.542 +		flush_dcache_mmap_lock(mapping);
  13.543 +		vma_prio_tree_remove(vma, root);
  13.544 +		if (adjust_next)
  13.545 +			vma_prio_tree_remove(next, root);
  13.546 +	}
  13.547 +
  13.548 +	vma->vm_start = start;
  13.549 +	vma->vm_end = end;
  13.550 +	vma->vm_pgoff = pgoff;
  13.551 +	if (adjust_next) {
  13.552 +		next->vm_start += adjust_next << PAGE_SHIFT;
  13.553 +		next->vm_pgoff += adjust_next;
  13.554 +	}
  13.555 +
  13.556 +	if (root) {
  13.557 +		if (adjust_next)
  13.558 +			vma_prio_tree_insert(next, root);
  13.559 +		vma_prio_tree_insert(vma, root);
  13.560 +		flush_dcache_mmap_unlock(mapping);
  13.561 +	}
  13.562 +
  13.563 +	if (remove_next) {
  13.564 +		/*
  13.565 +		 * vma_merge has merged next into vma, and needs
  13.566 +		 * us to remove next before dropping the locks.
  13.567 +		 */
  13.568 +		__vma_unlink(mm, next, vma);
  13.569 +		if (file)
  13.570 +			__remove_shared_vm_struct(next, file, mapping);
  13.571 +		if (next->anon_vma)
  13.572 +			__anon_vma_merge(vma, next);
  13.573 +	} else if (insert) {
  13.574 +		/*
  13.575 +		 * split_vma has split insert from vma, and needs
  13.576 +		 * us to insert it before dropping the locks
  13.577 +		 * (it may either follow vma or precede it).
  13.578 +		 */
  13.579 +		__insert_vm_struct(mm, insert);
  13.580 +	}
  13.581 +
  13.582 +	if (anon_vma)
  13.583 +		spin_unlock(&anon_vma->lock);
  13.584 +	if (mapping)
  13.585 +		spin_unlock(&mapping->i_mmap_lock);
  13.586 +
  13.587 +	if (remove_next) {
  13.588 +		if (file)
  13.589 +			fput(file);
  13.590 +		mm->map_count--;
  13.591 +		mpol_free(vma_policy(next));
  13.592 +		kmem_cache_free(vm_area_cachep, next);
  13.593 +		/*
  13.594 +		 * In mprotect's case 6 (see comments on vma_merge),
  13.595 +		 * we must remove another next too. It would clutter
  13.596 +		 * up the code too much to do both in one go.
  13.597 +		 */
  13.598 +		if (remove_next == 2) {
  13.599 +			next = vma->vm_next;
  13.600 +			goto again;
  13.601 +		}
  13.602 +	}
  13.603 +
  13.604 +	validate_mm(mm);
  13.605 +}
  13.606 +
  13.607 +/*
  13.608 + * If the vma has a ->close operation then the driver probably needs to release
  13.609 + * per-vma resources, so we don't attempt to merge those.
  13.610 + */
  13.611 +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
  13.612 +
  13.613 +static inline int is_mergeable_vma(struct vm_area_struct *vma,
  13.614 +			struct file *file, unsigned long vm_flags)
  13.615 +{
  13.616 +	if (vma->vm_flags != vm_flags)
  13.617 +		return 0;
  13.618 +	if (vma->vm_file != file)
  13.619 +		return 0;
  13.620 +	if (vma->vm_ops && vma->vm_ops->close)
  13.621 +		return 0;
  13.622 +	return 1;
  13.623 +}
  13.624 +
  13.625 +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  13.626 +					struct anon_vma *anon_vma2)
  13.627 +{
  13.628 +	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
  13.629 +}
  13.630 +
  13.631 +/*
  13.632 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  13.633 + * in front of (at a lower virtual address and file offset than) the vma.
  13.634 + *
  13.635 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  13.636 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  13.637 + *
  13.638 + * We don't check here for the merged mmap wrapping around the end of pagecache
  13.639 + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
  13.640 + * wrap, nor mmaps which cover the final page at index -1UL.
  13.641 + */
  13.642 +static int
  13.643 +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  13.644 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  13.645 +{
  13.646 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  13.647 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  13.648 +		if (vma->vm_pgoff == vm_pgoff)
  13.649 +			return 1;
  13.650 +	}
  13.651 +	return 0;
  13.652 +}
  13.653 +
  13.654 +/*
  13.655 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  13.656 + * beyond (at a higher virtual address and file offset than) the vma.
  13.657 + *
  13.658 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  13.659 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  13.660 + */
  13.661 +static int
  13.662 +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  13.663 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  13.664 +{
  13.665 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  13.666 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  13.667 +		pgoff_t vm_pglen;
  13.668 +		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
  13.669 +		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
  13.670 +			return 1;
  13.671 +	}
  13.672 +	return 0;
  13.673 +}
  13.674 +
  13.675 +/*
  13.676 + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
  13.677 + * whether that can be merged with its predecessor or its successor.
  13.678 + * Or both (it neatly fills a hole).
  13.679 + *
  13.680 + * In most cases - when called for mmap, brk or mremap - [addr,end) is
  13.681 + * certain not to be mapped by the time vma_merge is called; but when
  13.682 + * called for mprotect, it is certain to be already mapped (either at
  13.683 + * an offset within prev, or at the start of next), and the flags of
  13.684 + * this area are about to be changed to vm_flags - and the no-change
  13.685 + * case has already been eliminated.
  13.686 + *
  13.687 + * The following mprotect cases have to be considered, where AAAA is
  13.688 + * the area passed down from mprotect_fixup, never extending beyond one
  13.689 + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
  13.690 + *
  13.691 + *     AAAA             AAAA                AAAA          AAAA
  13.692 + *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
  13.693 + *    cannot merge    might become    might become    might become
  13.694 + *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  13.695 + *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
  13.696 + *    mremap move:                                    PPPPNNNNNNNN 8
  13.697 + *        AAAA
  13.698 + *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  13.699 + *    might become    case 1 below    case 2 below    case 3 below
  13.700 + *
  13.701 + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  13.702 + * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  13.703 + */
  13.704 +struct vm_area_struct *vma_merge(struct mm_struct *mm,
  13.705 +			struct vm_area_struct *prev, unsigned long addr,
  13.706 +			unsigned long end, unsigned long vm_flags,
  13.707 +		     	struct anon_vma *anon_vma, struct file *file,
  13.708 +			pgoff_t pgoff, struct mempolicy *policy)
  13.709 +{
  13.710 +	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
  13.711 +	struct vm_area_struct *area, *next;
  13.712 +
  13.713 +	/*
  13.714 +	 * We later require that vma->vm_flags == vm_flags,
  13.715 +	 * so this tests vma->vm_flags & VM_SPECIAL, too.
  13.716 +	 */
  13.717 +	if (vm_flags & VM_SPECIAL)
  13.718 +		return NULL;
  13.719 +
  13.720 +	if (prev)
  13.721 +		next = prev->vm_next;
  13.722 +	else
  13.723 +		next = mm->mmap;
  13.724 +	area = next;
  13.725 +	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
  13.726 +		next = next->vm_next;
  13.727 +
  13.728 +	/*
  13.729 +	 * Can it merge with the predecessor?
  13.730 +	 */
  13.731 +	if (prev && prev->vm_end == addr &&
  13.732 +  			mpol_equal(vma_policy(prev), policy) &&
  13.733 +			can_vma_merge_after(prev, vm_flags,
  13.734 +						anon_vma, file, pgoff)) {
  13.735 +		/*
  13.736 +		 * OK, it can.  Can we now merge in the successor as well?
  13.737 +		 */
  13.738 +		if (next && end == next->vm_start &&
  13.739 +				mpol_equal(policy, vma_policy(next)) &&
  13.740 +				can_vma_merge_before(next, vm_flags,
  13.741 +					anon_vma, file, pgoff+pglen) &&
  13.742 +				is_mergeable_anon_vma(prev->anon_vma,
  13.743 +						      next->anon_vma)) {
  13.744 +							/* cases 1, 6 */
  13.745 +			vma_adjust(prev, prev->vm_start,
  13.746 +				next->vm_end, prev->vm_pgoff, NULL);
  13.747 +		} else					/* cases 2, 5, 7 */
  13.748 +			vma_adjust(prev, prev->vm_start,
  13.749 +				end, prev->vm_pgoff, NULL);
  13.750 +		return prev;
  13.751 +	}
  13.752 +
  13.753 +	/*
  13.754 +	 * Can this new request be merged in front of next?
  13.755 +	 */
  13.756 +	if (next && end == next->vm_start &&
  13.757 + 			mpol_equal(policy, vma_policy(next)) &&
  13.758 +			can_vma_merge_before(next, vm_flags,
  13.759 +					anon_vma, file, pgoff+pglen)) {
  13.760 +		if (prev && addr < prev->vm_end)	/* case 4 */
  13.761 +			vma_adjust(prev, prev->vm_start,
  13.762 +				addr, prev->vm_pgoff, NULL);
  13.763 +		else					/* cases 3, 8 */
  13.764 +			vma_adjust(area, addr, next->vm_end,
  13.765 +				next->vm_pgoff - pglen, NULL);
  13.766 +		return area;
  13.767 +	}
  13.768 +
  13.769 +	return NULL;
  13.770 +}
  13.771 +
  13.772 +/*
  13.773 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  13.774 + * neighbouring vmas for a suitable anon_vma, before it goes off
  13.775 + * to allocate a new anon_vma.  It checks because a repetitive
  13.776 + * sequence of mprotects and faults may otherwise lead to distinct
  13.777 + * anon_vmas being allocated, preventing vma merge in subsequent
  13.778 + * mprotect.
  13.779 + */
  13.780 +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
  13.781 +{
  13.782 +	struct vm_area_struct *near;
  13.783 +	unsigned long vm_flags;
  13.784 +
  13.785 +	near = vma->vm_next;
  13.786 +	if (!near)
  13.787 +		goto try_prev;
  13.788 +
  13.789 +	/*
  13.790 +	 * Since only mprotect tries to remerge vmas, match flags
  13.791 +	 * which might be mprotected into each other later on.
  13.792 +	 * Neither mlock nor madvise tries to remerge at present,
  13.793 +	 * so leave their flags as obstructing a merge.
  13.794 +	 */
  13.795 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  13.796 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  13.797 +
  13.798 +	if (near->anon_vma && vma->vm_end == near->vm_start &&
  13.799 + 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
  13.800 +			can_vma_merge_before(near, vm_flags,
  13.801 +				NULL, vma->vm_file, vma->vm_pgoff +
  13.802 +				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
  13.803 +		return near->anon_vma;
  13.804 +try_prev:
  13.805 +	/*
  13.806 +	 * It is potentially slow to have to call find_vma_prev here.
  13.807 +	 * But it's only on the first write fault on the vma, not
  13.808 +	 * every time, and we could devise a way to avoid it later
  13.809 +	 * (e.g. stash info in next's anon_vma_node when assigning
  13.810 +	 * an anon_vma, or when trying vma_merge).  Another time.
  13.811 +	 */
  13.812 +	if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
  13.813 +		BUG();
  13.814 +	if (!near)
  13.815 +		goto none;
  13.816 +
  13.817 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  13.818 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  13.819 +
  13.820 +	if (near->anon_vma && near->vm_end == vma->vm_start &&
  13.821 +  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
  13.822 +			can_vma_merge_after(near, vm_flags,
  13.823 +				NULL, vma->vm_file, vma->vm_pgoff))
  13.824 +		return near->anon_vma;
  13.825 +none:
  13.826 +	/*
  13.827 +	 * There's no absolute need to look only at touching neighbours:
  13.828 +	 * we could search further afield for "compatible" anon_vmas.
  13.829 +	 * But it would probably just be a waste of time searching,
  13.830 +	 * or lead to too many vmas hanging off the same anon_vma.
  13.831 +	 * We're trying to allow mprotect remerging later on,
  13.832 +	 * not trying to minimize memory used for anon_vmas.
  13.833 +	 */
  13.834 +	return NULL;
  13.835 +}
  13.836 +
  13.837 +#ifdef CONFIG_PROC_FS
  13.838 +void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
  13.839 +						struct file *file, long pages)
  13.840 +{
  13.841 +	const unsigned long stack_flags
  13.842 +		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
  13.843 +
  13.844 +#ifdef CONFIG_HUGETLB
  13.845 +	if (flags & VM_HUGETLB) {
  13.846 +		if (!(flags & VM_DONTCOPY))
  13.847 +			mm->shared_vm += pages;
  13.848 +		return;
  13.849 +	}
  13.850 +#endif /* CONFIG_HUGETLB */
  13.851 +
  13.852 +	if (file) {
  13.853 +		mm->shared_vm += pages;
  13.854 +		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
  13.855 +			mm->exec_vm += pages;
  13.856 +	} else if (flags & stack_flags)
  13.857 +		mm->stack_vm += pages;
  13.858 +	if (flags & (VM_RESERVED|VM_IO))
  13.859 +		mm->reserved_vm += pages;
  13.860 +}
  13.861 +#endif /* CONFIG_PROC_FS */
  13.862 +
  13.863 +/*
  13.864 + * The caller must hold down_write(current->mm->mmap_sem).
  13.865 + */
  13.866 +
  13.867 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
  13.868 +			unsigned long len, unsigned long prot,
  13.869 +			unsigned long flags, unsigned long pgoff)
  13.870 +{
  13.871 +	struct mm_struct * mm = current->mm;
  13.872 +	struct vm_area_struct * vma, * prev;
  13.873 +	struct inode *inode;
  13.874 +	unsigned int vm_flags;
  13.875 +	int correct_wcount = 0;
  13.876 +	int error;
  13.877 +	struct rb_node ** rb_link, * rb_parent;
  13.878 +	int accountable = 1;
  13.879 +	unsigned long charged = 0;
  13.880 +
  13.881 +	if (file) {
  13.882 +		if (is_file_hugepages(file))
  13.883 +			accountable = 0;
  13.884 +
  13.885 +		if (!file->f_op || !file->f_op->mmap)
  13.886 +			return -ENODEV;
  13.887 +
  13.888 +		if ((prot & PROT_EXEC) &&
  13.889 +		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
  13.890 +			return -EPERM;
  13.891 +	}
  13.892 +	/*
  13.893 +	 * Does the application expect PROT_READ to imply PROT_EXEC?
  13.894 +	 *
  13.895 +	 * (the exception is when the underlying filesystem is noexec
  13.896 + *  mounted, in which case we don't add PROT_EXEC.)
  13.897 +	 */
  13.898 +	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
  13.899 +		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
  13.900 +			prot |= PROT_EXEC;
  13.901 +
  13.902 +	if (!len)
  13.903 +		return addr;
  13.904 +
  13.905 +	/* Careful about overflows.. */
  13.906 +	len = PAGE_ALIGN(len);
  13.907 +	if (!len || len > TASK_SIZE)
  13.908 +		return -EINVAL;
  13.909 +
  13.910 +	/* offset overflow? */
  13.911 +	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  13.912 +		return -EINVAL;
  13.913 +
  13.914 +	/* Too many mappings? */
  13.915 +	if (mm->map_count > sysctl_max_map_count)
  13.916 +		return -ENOMEM;
  13.917 +
  13.918 +	/* Obtain the address to map to. we verify (or select) it and ensure
  13.919 +	 * that it represents a valid section of the address space.
  13.920 +	 */
  13.921 +	addr = get_unmapped_area(file, addr, len, pgoff, flags);
  13.922 +	if (addr & ~PAGE_MASK)
  13.923 +		return addr;
  13.924 +
  13.925 +	/* Do simple checking here so the lower-level routines won't have
  13.926 +	 * to. we assume access permissions have been handled by the open
  13.927 +	 * of the memory object, so we don't do any here.
  13.928 +	 */
  13.929 +	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
  13.930 +			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  13.931 +
  13.932 +	if (flags & MAP_LOCKED) {
  13.933 +		if (!can_do_mlock())
  13.934 +			return -EPERM;
  13.935 +		vm_flags |= VM_LOCKED;
  13.936 +	}
  13.937 +	/* mlock MCL_FUTURE? */
  13.938 +	if (vm_flags & VM_LOCKED) {
  13.939 +		unsigned long locked, lock_limit;
  13.940 +		locked = mm->locked_vm << PAGE_SHIFT;
  13.941 +		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
  13.942 +		locked += len;
  13.943 +		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
  13.944 +			return -EAGAIN;
  13.945 +	}
  13.946 +
  13.947 +	inode = file ? file->f_dentry->d_inode : NULL;
  13.948 +
  13.949 +	if (file) {
  13.950 +		switch (flags & MAP_TYPE) {
  13.951 +		case MAP_SHARED:
  13.952 +			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
  13.953 +				return -EACCES;
  13.954 +
  13.955 +			/*
  13.956 +			 * Make sure we don't allow writing to an append-only
  13.957 +			 * file..
  13.958 +			 */
  13.959 +			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  13.960 +				return -EACCES;
  13.961 +
  13.962 +			/*
  13.963 +			 * Make sure there are no mandatory locks on the file.
  13.964 +			 */
  13.965 +			if (locks_verify_locked(inode))
  13.966 +				return -EAGAIN;
  13.967 +
  13.968 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  13.969 +			if (!(file->f_mode & FMODE_WRITE))
  13.970 +				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  13.971 +
  13.972 +			/* fall through */
  13.973 +		case MAP_PRIVATE:
  13.974 +			if (!(file->f_mode & FMODE_READ))
  13.975 +				return -EACCES;
  13.976 +			break;
  13.977 +
  13.978 +		default:
  13.979 +			return -EINVAL;
  13.980 +		}
  13.981 +	} else {
  13.982 +		switch (flags & MAP_TYPE) {
  13.983 +		case MAP_SHARED:
  13.984 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  13.985 +			break;
  13.986 +		case MAP_PRIVATE:
  13.987 +			/*
  13.988 +			 * Set pgoff according to addr for anon_vma.
  13.989 +			 */
  13.990 +			pgoff = addr >> PAGE_SHIFT;
  13.991 +			break;
  13.992 +		default:
  13.993 +			return -EINVAL;
  13.994 +		}
  13.995 +	}
  13.996 +
  13.997 +	error = security_file_mmap(file, prot, flags);
  13.998 +	if (error)
  13.999 +		return error;
 13.1000 +		
 13.1001 +	/* Clear old maps */
 13.1002 +	error = -ENOMEM;
 13.1003 +munmap_back:
 13.1004 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 13.1005 +	if (vma && vma->vm_start < addr + len) {
 13.1006 +		if (do_munmap(mm, addr, len))
 13.1007 +			return -ENOMEM;
 13.1008 +		goto munmap_back;
 13.1009 +	}
 13.1010 +
 13.1011 +	/* Check against address space limit. */
 13.1012 +	if ((mm->total_vm << PAGE_SHIFT) + len
 13.1013 +	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
 13.1014 +		return -ENOMEM;
 13.1015 +
 13.1016 +	if (accountable && (!(flags & MAP_NORESERVE) ||
 13.1017 +			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
 13.1018 +		if (vm_flags & VM_SHARED) {
 13.1019 +			/* Check memory availability in shmem_file_setup? */
 13.1020 +			vm_flags |= VM_ACCOUNT;
 13.1021 +		} else if (vm_flags & VM_WRITE) {
 13.1022 +			/*
 13.1023 +			 * Private writable mapping: check memory availability
 13.1024 +			 */
 13.1025 +			charged = len >> PAGE_SHIFT;
 13.1026 +			if (security_vm_enough_memory(charged))
 13.1027 +				return -ENOMEM;
 13.1028 +			vm_flags |= VM_ACCOUNT;
 13.1029 +		}
 13.1030 +	}
 13.1031 +
 13.1032 +	/*
 13.1033 +	 * Can we just expand an old private anonymous mapping?
 13.1034 +	 * The VM_SHARED test is necessary because shmem_zero_setup
 13.1035 +	 * will create the file object for a shared anonymous map below.
 13.1036 +	 */
 13.1037 +	if (!file && !(vm_flags & VM_SHARED) &&
 13.1038 +	    vma_merge(mm, prev, addr, addr + len, vm_flags,
 13.1039 +					NULL, NULL, pgoff, NULL))
 13.1040 +		goto out;
 13.1041 +
 13.1042 +	/*
 13.1043 +	 * Determine the object being mapped and call the appropriate
 13.1044 +	 * specific mapper. The address has already been validated but
 13.1045 +	 * not unmapped; the old maps have been removed from the list.
 13.1046 +	 */
 13.1047 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 13.1048 +	if (!vma) {
 13.1049 +		error = -ENOMEM;
 13.1050 +		goto unacct_error;
 13.1051 +	}
 13.1052 +	memset(vma, 0, sizeof(*vma));
 13.1053 +
 13.1054 +	vma->vm_mm = mm;
 13.1055 +	vma->vm_start = addr;
 13.1056 +	vma->vm_end = addr + len;
 13.1057 +	vma->vm_flags = vm_flags;
 13.1058 +	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
 13.1059 +	vma->vm_pgoff = pgoff;
 13.1060 +
 13.1061 +	if (file) {
 13.1062 +		error = -EINVAL;
 13.1063 +		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
 13.1064 +			goto free_vma;
 13.1065 +		if (vm_flags & VM_DENYWRITE) {
 13.1066 +			error = deny_write_access(file);
 13.1067 +			if (error)
 13.1068 +				goto free_vma;
 13.1069 +			correct_wcount = 1;
 13.1070 +		}
 13.1071 +		vma->vm_file = file;
 13.1072 +		get_file(file);
 13.1073 +		error = file->f_op->mmap(file, vma);
 13.1074 +		if (error)
 13.1075 +			goto unmap_and_free_vma;
 13.1076 +	} else if (vm_flags & VM_SHARED) {
 13.1077 +		error = shmem_zero_setup(vma);
 13.1078 +		if (error)
 13.1079 +			goto free_vma;
 13.1080 +	}
 13.1081 +
 13.1082 +	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 13.1083 +	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 13.1084 +	 * that memory reservation must be checked; but that reservation
 13.1085 +	 * belongs to shared memory object, not to vma: so now clear it.
 13.1086 +	 */
 13.1087 +	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
 13.1088 +		vma->vm_flags &= ~VM_ACCOUNT;
 13.1089 +
 13.1090 +	/* Can addr have changed??
 13.1091 +	 *
 13.1092 +	 * Answer: Yes, several device drivers can do it in their
 13.1093 +	 *         f_op->mmap method. -DaveM
 13.1094 +	 */
 13.1095 +	addr = vma->vm_start;
 13.1096 +	pgoff = vma->vm_pgoff;
 13.1097 +	vm_flags = vma->vm_flags;
 13.1098 +
 13.1099 +	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 13.1100 +			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 13.1101 +		file = vma->vm_file;
 13.1102 +		vma_link(mm, vma, prev, rb_link, rb_parent);
 13.1103 +		if (correct_wcount)
 13.1104 +			atomic_inc(&inode->i_writecount);
 13.1105 +	} else {
 13.1106 +		if (file) {
 13.1107 +			if (correct_wcount)
 13.1108 +				atomic_inc(&inode->i_writecount);
 13.1109 +			fput(file);
 13.1110 +		}
 13.1111 +		mpol_free(vma_policy(vma));
 13.1112 +		kmem_cache_free(vm_area_cachep, vma);
 13.1113 +	}
 13.1114 +out:	
 13.1115 +	mm->total_vm += len >> PAGE_SHIFT;
 13.1116 +	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 13.1117 +	if (vm_flags & VM_LOCKED) {
 13.1118 +		mm->locked_vm += len >> PAGE_SHIFT;
 13.1119 +		make_pages_present(addr, addr + len);
 13.1120 +	}
 13.1121 +	if (flags & MAP_POPULATE) {
 13.1122 +		up_write(&mm->mmap_sem);
 13.1123 +		sys_remap_file_pages(addr, len, 0,
 13.1124 +					pgoff, flags & MAP_NONBLOCK);
 13.1125 +		down_write(&mm->mmap_sem);
 13.1126 +	}
 13.1127 +	acct_update_integrals();
 13.1128 +	update_mem_hiwater();
 13.1129 +	return addr;
 13.1130 +
 13.1131 +unmap_and_free_vma:
 13.1132 +	if (correct_wcount)
 13.1133 +		atomic_inc(&inode->i_writecount);
 13.1134 +	vma->vm_file = NULL;
 13.1135 +	fput(file);
 13.1136 +
 13.1137 +	/* Undo any partial mapping done by a device driver. */
 13.1138 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
 13.1139 +free_vma:
 13.1140 +	kmem_cache_free(vm_area_cachep, vma);
 13.1141 +unacct_error:
 13.1142 +	if (charged)
 13.1143 +		vm_unacct_memory(charged);
 13.1144 +	return error;
 13.1145 +}
 13.1146 +
 13.1147 +EXPORT_SYMBOL(do_mmap_pgoff);
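
    For context when reviewing do_mmap_pgoff above: a minimal userspace sketch
    (not part of this changeset) that exercises the same checks the function
    performs -- an anonymous MAP_PRIVATE mapping with a page-aligned length,
    then MAP_LOCKED, which sets VM_LOCKED and so is subject to RLIMIT_MEMLOCK
    unless the caller has CAP_IPC_LOCK. Illustrative only; assumes a Linux
    toolchain.

    /* Hedged userspace sketch of the validation paths in do_mmap_pgoff. */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 4 * 4096;              /* a PAGE_ALIGN'd length */

        /* Plain anonymous private mapping: the vma_merge/expand path. */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        memset(p, 0xaa, len);

        /* MAP_LOCKED sets VM_LOCKED, so the kernel checks RLIMIT_MEMLOCK and
         * calls make_pages_present() before returning; may fail with
         * EAGAIN/EPERM when the limit is small. */
        void *q = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
        if (q == MAP_FAILED)
            perror("mmap(MAP_LOCKED)");     /* expected under a small rlimit */
        else
            munmap(q, len);

        munmap(p, len);
        return 0;
    }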
 13.1148 +
 13.1149 +/* Get an address range which is currently unmapped.
 13.1150 + * For shmat() with addr=0.
 13.1151 + *
 13.1152 + * Ugly calling convention alert:
 13.1153 + * Return value with the low bits set means error value,
 13.1154 + * ie
 13.1155 + *	if (ret & ~PAGE_MASK)
 13.1156 + *		error = ret;
 13.1157 + *
 13.1158 + * This function "knows" that -ENOMEM has the bits set.
 13.1159 + */
 13.1160 +#ifndef HAVE_ARCH_UNMAPPED_AREA
 13.1161 +unsigned long
 13.1162 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
 13.1163 +		unsigned long len, unsigned long pgoff, unsigned long flags)
 13.1164 +{
 13.1165 +	struct mm_struct *mm = current->mm;
 13.1166 +	struct vm_area_struct *vma;
 13.1167 +	unsigned long start_addr;
 13.1168 +
 13.1169 +	if (len > TASK_SIZE)
 13.1170 +		return -ENOMEM;
 13.1171 +
 13.1172 +	if (addr) {
 13.1173 +		addr = PAGE_ALIGN(addr);
 13.1174 +		vma = find_vma(mm, addr);
 13.1175 +		if (TASK_SIZE - len >= addr &&
 13.1176 +		    (!vma || addr + len <= vma->vm_start))
 13.1177 +			return addr;
 13.1178 +	}
 13.1179 +	start_addr = addr = mm->free_area_cache;
 13.1180 +
 13.1181 +full_search:
 13.1182 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 13.1183 +		/* At this point:  (!vma || addr < vma->vm_end). */
 13.1184 +		if (TASK_SIZE - len < addr) {
 13.1185 +			/*
 13.1186 +			 * Start a new search - just in case we missed
 13.1187 +			 * some holes.
 13.1188 +			 */
 13.1189 +			if (start_addr != TASK_UNMAPPED_BASE) {
 13.1190 +				start_addr = addr = TASK_UNMAPPED_BASE;
 13.1191 +				goto full_search;
 13.1192 +			}
 13.1193 +			return -ENOMEM;
 13.1194 +		}
 13.1195 +		if (!vma || addr + len <= vma->vm_start) {
 13.1196 +			/*
 13.1197 +			 * Remember the place where we stopped the search:
 13.1198 +			 */
 13.1199 +			mm->free_area_cache = addr + len;
 13.1200 +			return addr;
 13.1201 +		}
 13.1202 +		addr = vma->vm_end;
 13.1203 +	}
 13.1204 +}
 13.1205 +#endif	
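
    To make the bottom-up search and the "low bits set means error" convention
    concrete: a standalone sketch (not kernel code) over a sorted array of
    occupied regions. TASK_SIZE, UNMAPPED_BASE and the region list are made-up
    values.

    #include <stdio.h>

    #define PAGE_MASK     (~0xfffUL)
    #define TASK_SIZE     0xc0000000UL
    #define UNMAPPED_BASE 0x40000000UL
    #define ENOMEM_VAL    ((unsigned long)-12)   /* -ENOMEM has low bits set */

    struct region { unsigned long start, end; };

    /* First-fit, bottom-up search for a hole of 'len' bytes. */
    static unsigned long get_area(const struct region *r, int n, unsigned long len)
    {
        unsigned long addr = UNMAPPED_BASE;

        for (int i = 0; ; i++) {
            unsigned long next_start = (i < n) ? r[i].start : TASK_SIZE;

            if (TASK_SIZE - len < addr)
                return ENOMEM_VAL;            /* ran past the address space */
            if (addr + len <= next_start)
                return addr;                  /* this hole is big enough */
            if (i >= n)
                return ENOMEM_VAL;
            addr = r[i].end;                  /* skip past the occupied region */
        }
    }

    int main(void)
    {
        struct region used[] = {
            { 0x40000000UL, 0x40100000UL },
            { 0x40200000UL, 0x40300000UL },
        };
        unsigned long ret = get_area(used, 2, 0x00180000UL);

        if (ret & ~PAGE_MASK)                 /* the "ugly" error check */
            printf("error %ld\n", (long)ret);
        else
            printf("hole at %#lx\n", ret);
        return 0;
    }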
 13.1206 +
 13.1207 +void arch_unmap_area(struct vm_area_struct *area)
 13.1208 +{
 13.1209 +	/*
 13.1210 +	 * Is this a new hole at the lowest possible address?
 13.1211 +	 */
 13.1212 +	if (area->vm_start >= TASK_UNMAPPED_BASE &&
 13.1213 +			area->vm_start < area->vm_mm->free_area_cache)
 13.1214 +		area->vm_mm->free_area_cache = area->vm_start;
 13.1215 +}
 13.1216 +
 13.1217 +/*
 13.1218 + * This mmap-allocator allocates new areas top-down from below the
 13.1219 + * stack's low limit (the base):
 13.1220 + */
 13.1221 +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 13.1222 +unsigned long
 13.1223 +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 13.1224 +			  const unsigned long len, const unsigned long pgoff,
 13.1225 +			  const unsigned long flags)
 13.1226 +{
 13.1227 +	struct vm_area_struct *vma, *prev_vma;
 13.1228 +	struct mm_struct *mm = current->mm;
 13.1229 +	unsigned long base = mm->mmap_base, addr = addr0;
 13.1230 +	int first_time = 1;
 13.1231 +
 13.1232 +	/* requested length too big for entire address space */
 13.1233 +	if (len > TASK_SIZE)
 13.1234 +		return -ENOMEM;
 13.1235 +
 13.1236 +	/* don't allow allocations above current base */
 13.1237 +	if (mm->free_area_cache > base)
 13.1238 +		mm->free_area_cache = base;
 13.1239 +
 13.1240 +	/* requesting a specific address */
 13.1241 +	if (addr) {
 13.1242 +		addr = PAGE_ALIGN(addr);
 13.1243 +		vma = find_vma(mm, addr);
 13.1244 +		if (TASK_SIZE - len >= addr &&
 13.1245 +				(!vma || addr + len <= vma->vm_start))
 13.1246 +			return addr;
 13.1247 +	}
 13.1248 +
 13.1249 +try_again:
 13.1250 +	/* make sure it can fit in the remaining address space */
 13.1251 +	if (mm->free_area_cache < len)
 13.1252 +		goto fail;
 13.1253 +
 13.1254 +	/* either no address requested or can't fit in requested address hole */
 13.1255 +	addr = (mm->free_area_cache - len) & PAGE_MASK;
 13.1256 +	do {
 13.1257 +		/*
 13.1258 +		 * Lookup failure means no vma is above this address,
 13.1259 +		 * i.e. return with success:
 13.1260 +		 */
 13.1261 + 	 	if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
 13.1262 +			return addr;
 13.1263 +
 13.1264 +		/*
 13.1265 +		 * new region fits between prev_vma->vm_end and
 13.1266 +		 * vma->vm_start, use it:
 13.1267 +		 */
 13.1268 +		if (addr+len <= vma->vm_start &&
 13.1269 +				(!prev_vma || (addr >= prev_vma->vm_end)))
 13.1270 +			/* remember the address as a hint for next time */
 13.1271 +			return (mm->free_area_cache = addr);
 13.1272 +		else
 13.1273 +			/* pull free_area_cache down to the first hole */
 13.1274 +			if (mm->free_area_cache == vma->vm_end)
 13.1275 +				mm->free_area_cache = vma->vm_start;
 13.1276 +
 13.1277 +		/* try just below the current vma->vm_start */
 13.1278 +		addr = vma->vm_start-len;
 13.1279 +	} while (len <= vma->vm_start);
 13.1280 +
 13.1281 +fail:
 13.1282 +	/*
 13.1283 +	 * if hint left us with no space for the requested
 13.1284 +	 * mapping then try again:
 13.1285 +	 */
 13.1286 +	if (first_time) {
 13.1287 +		mm->free_area_cache = base;
 13.1288 +		first_time = 0;
 13.1289 +		goto try_again;
 13.1290 +	}
 13.1291 +	/*
 13.1292 +	 * A failed mmap() very likely causes application failure,
 13.1293 +	 * so fall back to the bottom-up function here. This scenario
 13.1294 +	 * can happen with large stack limits and large mmap()
 13.1295 +	 * allocations.
 13.1296 +	 */
 13.1297 +	mm->free_area_cache = TASK_UNMAPPED_BASE;
 13.1298 +	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 13.1299 +	/*
 13.1300 +	 * Restore the topdown base:
 13.1301 +	 */
 13.1302 +	mm->free_area_cache = base;
 13.1303 +
 13.1304 +	return addr;
 13.1305 +}
 13.1306 +#endif
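
    A matching sketch for the top-down allocator above, again with made-up
    regions; it keeps only the downward free_area_cache walk and omits the
    bottom-up fallback for brevity.

    #include <stdio.h>

    struct region { unsigned long start, end; };

    /* Scan from 'base' downwards for a hole of 'len' bytes. */
    static unsigned long get_area_topdown(const struct region *r, int n,
                                          unsigned long len, unsigned long base)
    {
        unsigned long cache = base;           /* never allocate above the base */

        for (int i = n - 1; i >= 0; i--) {
            if (cache >= r[i].end && cache - r[i].end >= len)
                return cache - len;           /* hole between r[i] and cache */
            if (cache > r[i].start)
                cache = r[i].start;           /* pull the hint down past r[i] */
        }
        return (cache >= len) ? cache - len : (unsigned long)-12; /* -ENOMEM */
    }

    int main(void)
    {
        struct region used[] = {
            { 0x10000000UL, 0x10100000UL },
            { 0x3fe00000UL, 0x40000000UL },
        };
        printf("%#lx\n", get_area_topdown(used, 2, 0x00200000UL, 0x40000000UL));
        return 0;
    }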
 13.1307 +
 13.1308 +void arch_unmap_area_topdown(struct vm_area_struct *area)
 13.1309 +{
 13.1310 +	/*
 13.1311 +	 * Is this a new hole at the highest possible address?
 13.1312 +	 */
 13.1313 +	if (area->vm_end > area->vm_mm->free_area_cache)
 13.1314 +		area->vm_mm->free_area_cache = area->vm_end;
 13.1315 +}
 13.1316 +
 13.1317 +unsigned long
 13.1318 +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 13.1319 +		unsigned long pgoff, unsigned long flags)
 13.1320 +{
 13.1321 +	if (flags & MAP_FIXED) {
 13.1322 +		unsigned long ret;
 13.1323 +
 13.1324 +		if (addr > TASK_SIZE - len)
 13.1325 +			return -ENOMEM;
 13.1326 +		if (addr & ~PAGE_MASK)
 13.1327 +			return -EINVAL;
 13.1328 +		if (file && is_file_hugepages(file))  {
 13.1329 +			/*
 13.1330 +			 * Check if the given range is hugepage aligned, and
 13.1331 +			 * can be made suitable for hugepages.
 13.1332 +			 */
 13.1333 +			ret = prepare_hugepage_range(addr, len);
 13.1334 +		} else {
 13.1335 +			/*
 13.1336 +			 * Ensure that a normal request is not falling in a
 13.1337 +			 * reserved hugepage range.  For some archs like IA-64,
 13.1338 +			 * there is a separate region for hugepages.
 13.1339 +			 */
 13.1340 +			ret = is_hugepage_only_range(addr, len);
 13.1341 +		}
 13.1342 +		if (ret)
 13.1343 +			return -EINVAL;
 13.1344 +		return addr;
 13.1345 +	}
 13.1346 +
 13.1347 +	if (file && file->f_op && file->f_op->get_unmapped_area)
 13.1348 +		return file->f_op->get_unmapped_area(file, addr, len,
 13.1349 +						pgoff, flags);
 13.1350 +
 13.1351 +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
 13.1352 +}
 13.1353 +
 13.1354 +EXPORT_SYMBOL(get_unmapped_area);
 13.1355 +
 13.1356 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 13.1357 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 13.1358 +{
 13.1359 +	struct vm_area_struct *vma = NULL;
 13.1360 +
 13.1361 +	if (mm) {
 13.1362 +		/* Check the cache first. */
 13.1363 +		/* (Cache hit rate is typically around 35%.) */
 13.1364 +		vma = mm->mmap_cache;
 13.1365 +		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 13.1366 +			struct rb_node * rb_node;
 13.1367 +
 13.1368 +			rb_node = mm->mm_rb.rb_node;
 13.1369 +			vma = NULL;
 13.1370 +
 13.1371 +			while (rb_node) {
 13.1372 +				struct vm_area_struct * vma_tmp;
 13.1373 +
 13.1374 +				vma_tmp = rb_entry(rb_node,
 13.1375 +						struct vm_area_struct, vm_rb);
 13.1376 +
 13.1377 +				if (vma_tmp->vm_end > addr) {
 13.1378 +					vma = vma_tmp;
 13.1379 +					if (vma_tmp->vm_start <= addr)
 13.1380 +						break;
 13.1381 +					rb_node = rb_node->rb_left;
 13.1382 +				} else
 13.1383 +					rb_node = rb_node->rb_right;
 13.1384 +			}
 13.1385 +			if (vma)
 13.1386 +				mm->mmap_cache = vma;
 13.1387 +		}
 13.1388 +	}
 13.1389 +	return vma;
 13.1390 +}
 13.1391 +
 13.1392 +EXPORT_SYMBOL(find_vma);
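
    The find_vma contract -- return the first area with addr < vm_end, or NULL
    -- restated as a standalone sketch using a sorted array and a one-entry
    cache in place of the rb-tree and mm->mmap_cache. Illustrative only.

    #include <stdio.h>

    struct vma { unsigned long start, end; };

    static const struct vma *cache;

    static const struct vma *find_vma_sketch(const struct vma *v, int n,
                                             unsigned long addr)
    {
        /* Cache hit only when the cached area actually contains addr. */
        if (cache && addr < cache->end && addr >= cache->start)
            return cache;

        int lo = 0, hi = n;                   /* binary search, like the rb walk */
        const struct vma *found = NULL;
        while (lo < hi) {
            int mid = (lo + hi) / 2;
            if (addr < v[mid].end) {
                found = &v[mid];
                hi = mid;                     /* an earlier area may also qualify */
            } else {
                lo = mid + 1;
            }
        }
        if (found)
            cache = found;
        return found;
    }

    int main(void)
    {
        struct vma v[] = { {0x1000, 0x3000}, {0x8000, 0x9000} };
        const struct vma *hit = find_vma_sketch(v, 2, 0x4000);
        if (hit)
            printf("%#lx-%#lx\n", hit->start, hit->end);   /* 0x8000-0x9000 */
        return 0;
    }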
 13.1393 +
 13.1394 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 13.1395 +struct vm_area_struct *
 13.1396 +find_vma_prev(struct mm_struct *mm, unsigned long addr,
 13.1397 +			struct vm_area_struct **pprev)
 13.1398 +{
 13.1399 +	struct vm_area_struct *vma = NULL, *prev = NULL;
 13.1400 +	struct rb_node * rb_node;
 13.1401 +	if (!mm)
 13.1402 +		goto out;
 13.1403 +
 13.1404 +	/* Guard against addr being lower than the first VMA */
 13.1405 +	vma = mm->mmap;
 13.1406 +
 13.1407 +	/* Go through the RB tree quickly. */
 13.1408 +	rb_node = mm->mm_rb.rb_node;
 13.1409 +
 13.1410 +	while (rb_node) {
 13.1411 +		struct vm_area_struct *vma_tmp;
 13.1412 +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 13.1413 +
 13.1414 +		if (addr < vma_tmp->vm_end) {
 13.1415 +			rb_node = rb_node->rb_left;
 13.1416 +		} else {
 13.1417 +			prev = vma_tmp;
 13.1418 +			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
 13.1419 +				break;
 13.1420 +			rb_node = rb_node->rb_right;
 13.1421 +		}
 13.1422 +	}
 13.1423 +
 13.1424 +out:
 13.1425 +	*pprev = prev;
 13.1426 +	return prev ? prev->vm_next : vma;
 13.1427 +}
 13.1428 +
 13.1429 +/*
 13.1430 + * Verify that the stack growth is acceptable and
 13.1431 + * update accounting. This is shared with both the
 13.1432 + * grow-up and grow-down cases.
 13.1433 + */
 13.1434 +static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
 13.1435 +{
 13.1436 +	struct mm_struct *mm = vma->vm_mm;
 13.1437 +	struct rlimit *rlim = current->signal->rlim;
 13.1438 +
 13.1439 +	/* address space limit tests */
 13.1440 +	if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT)
 13.1441 +		return -ENOMEM;
 13.1442 +
 13.1443 +	/* Stack limit test */
 13.1444 +	if (size > rlim[RLIMIT_STACK].rlim_cur)
 13.1445 +		return -ENOMEM;
 13.1446 +
 13.1447 +	/* mlock limit tests */
 13.1448 +	if (vma->vm_flags & VM_LOCKED) {
 13.1449 +		unsigned long locked;
 13.1450 +		unsigned long limit;
 13.1451 +		locked = mm->locked_vm + grow;
 13.1452 +		limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
 13.1453 +		if (locked > limit && !capable(CAP_IPC_LOCK))
 13.1454 +			return -ENOMEM;
 13.1455 +	}
 13.1456 +
 13.1457 +	/*
 13.1458 +	 * Overcommit..  This must be the final test, as it will
 13.1459 +	 * update security statistics.
 13.1460 +	 */
 13.1461 +	if (security_vm_enough_memory(grow))
 13.1462 +		return -ENOMEM;
 13.1463 +
 13.1464 +	/* Ok, everything looks good - let it rip */
 13.1465 +	mm->total_vm += grow;
 13.1466 +	if (vma->vm_flags & VM_LOCKED)
 13.1467 +		mm->locked_vm += grow;
 13.1468 +	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 13.1469 +	acct_update_integrals();
 13.1470 +	update_mem_hiwater();
 13.1471 +	return 0;
 13.1472 +}
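
    A hedged userspace illustration of the three rlimits acct_stack_growth
    consults (RLIMIT_AS, RLIMIT_STACK and, for VM_LOCKED stacks,
    RLIMIT_MEMLOCK); it merely prints the current soft limits.

    #include <stdio.h>
    #include <sys/resource.h>

    static void show(const char *name, int res)
    {
        struct rlimit rl;
        if (getrlimit(res, &rl) == 0)
            printf("%-15s soft=%lu\n", name, (unsigned long)rl.rlim_cur);
    }

    int main(void)
    {
        show("RLIMIT_AS", RLIMIT_AS);
        show("RLIMIT_STACK", RLIMIT_STACK);
        show("RLIMIT_MEMLOCK", RLIMIT_MEMLOCK);
        return 0;
    }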
 13.1473 +
 13.1474 +#ifdef CONFIG_STACK_GROWSUP
 13.1475 +/*
 13.1476 + * vma is the first one with address > vma->vm_end.  Have to extend vma.
 13.1477 + */
 13.1478 +int expand_stack(struct vm_area_struct * vma, unsigned long address)
 13.1479 +{
 13.1480 +	int error;
 13.1481 +
 13.1482 +	if (!(vma->vm_flags & VM_GROWSUP))
 13.1483 +		return -EFAULT;
 13.1484 +
 13.1485 +	/*
 13.1486 +	 * We must make sure the anon_vma is allocated
 13.1487 +	 * so that the anon_vma locking is not a noop.
 13.1488 +	 */
 13.1489 +	if (unlikely(anon_vma_prepare(vma)))
 13.1490 +		return -ENOMEM;
 13.1491 +	anon_vma_lock(vma);
 13.1492 +
 13.1493 +	/*
 13.1494 +	 * vma->vm_start/vm_end cannot change under us because the caller
 13.1495 +	 * is required to hold the mmap_sem in read mode.  We need the
 13.1496 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 13.1497 +	 */
 13.1498 +	address += 4 + PAGE_SIZE - 1;
 13.1499 +	address &= PAGE_MASK;
 13.1500 +	error = 0;
 13.1501 +
 13.1502 +	/* Somebody else might have raced and expanded it already */
 13.1503 +	if (address > vma->vm_end) {
 13.1504 +		unsigned long size, grow;
 13.1505 +
 13.1506 +		size = address - vma->vm_start;
 13.1507 +		grow = (address - vma->vm_end) >> PAGE_SHIFT;
 13.1508 +
 13.1509 +		error = acct_stack_growth(vma, size, grow);
 13.1510 +		if (!error)
 13.1511 +			vma->vm_end = address;
 13.1512 +	}
 13.1513 +	anon_vma_unlock(vma);
 13.1514 +	return error;
 13.1515 +}
 13.1516 +
 13.1517 +struct vm_area_struct *
 13.1518 +find_extend_vma(struct mm_struct *mm, unsigned long addr)
 13.1519 +{
 13.1520 +	struct vm_area_struct *vma, *prev;
 13.1521 +
 13.1522 +	addr &= PAGE_MASK;
 13.1523 +	vma = find_vma_prev(mm, addr, &prev);
 13.1524 +	if (vma && (vma->vm_start <= addr))
 13.1525 +		return vma;
 13.1526 +	if (!prev || expand_stack(prev, addr))
 13.1527 +		return NULL;
 13.1528 +	if (prev->vm_flags & VM_LOCKED) {
 13.1529 +		make_pages_present(addr, prev->vm_end);
 13.1530 +	}
 13.1531 +	return prev;
 13.1532 +}
 13.1533 +#else
 13.1534 +/*
 13.1535 + * vma is the first one with address < vma->vm_start.  Have to extend vma.
 13.1536 + */
 13.1537 +int expand_stack(struct vm_area_struct *vma, unsigned long address)
 13.1538 +{
 13.1539 +	int error;
 13.1540 +
 13.1541 +	/*
 13.1542 +	 * We must make sure the anon_vma is allocated
 13.1543 +	 * so that the anon_vma locking is not a noop.
 13.1544 +	 */
 13.1545 +	if (unlikely(anon_vma_prepare(vma)))
 13.1546 +		return -ENOMEM;
 13.1547 +	anon_vma_lock(vma);
 13.1548 +
 13.1549 +	/*
 13.1550 +	 * vma->vm_start/vm_end cannot change under us because the caller
 13.1551 +	 * is required to hold the mmap_sem in read mode.  We need the
 13.1552 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 13.1553 +	 */
 13.1554 +	address &= PAGE_MASK;
 13.1555 +	error = 0;
 13.1556 +
 13.1557 +	/* Somebody else might have raced and expanded it already */
 13.1558 +	if (address < vma->vm_start) {
 13.1559 +		unsigned long size, grow;
 13.1560 +
 13.1561 +		size = vma->vm_end - address;
 13.1562 +		grow = (vma->vm_start - address) >> PAGE_SHIFT;
 13.1563 +
 13.1564 +		error = acct_stack_growth(vma, size, grow);
 13.1565 +		if (!error) {
 13.1566 +			vma->vm_start = address;
 13.1567 +			vma->vm_pgoff -= grow;
 13.1568 +		}
 13.1569 +	}
 13.1570 +	anon_vma_unlock(vma);
 13.1571 +	return error;
 13.1572 +}
 13.1573 +
 13.1574 +struct vm_area_struct *
 13.1575 +find_extend_vma(struct mm_struct * mm, unsigned long addr)
 13.1576 +{
 13.1577 +	struct vm_area_struct * vma;
 13.1578 +	unsigned long start;
 13.1579 +
 13.1580 +	addr &= PAGE_MASK;
 13.1581 +	vma = find_vma(mm,addr);
 13.1582 +	if (!vma)
 13.1583 +		return NULL;
 13.1584 +	if (vma->vm_start <= addr)
 13.1585 +		return vma;
 13.1586 +	if (!(vma->vm_flags & VM_GROWSDOWN))
 13.1587 +		return NULL;
 13.1588 +	start = vma->vm_start;
 13.1589 +	if (expand_stack(vma, addr))
 13.1590 +		return NULL;
 13.1591 +	if (vma->vm_flags & VM_LOCKED) {
 13.1592 +		make_pages_present(addr, start);
 13.1593 +	}
 13.1594 +	return vma;
 13.1595 +}
 13.1596 +#endif
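
    A hedged userspace illustration of the expand_stack path above: recursing
    with a large local buffer pushes the stack pointer down and faults into
    the VM_GROWSDOWN case, subject to the accounting checks in
    acct_stack_growth. The depth and frame size below are chosen to stay well
    inside a typical 8MB stack limit.

    #include <stdio.h>

    static long grow(int depth)
    {
        volatile char buf[64 * 1024];         /* each frame pushes the stack down */
        buf[0] = (char)depth;
        buf[sizeof(buf) - 1] = (char)depth;   /* touch both ends of the frame */
        if (depth == 0)
            return buf[0];
        return buf[0] + grow(depth - 1);
    }

    int main(void)
    {
        /* ~64 frames of 64KB = about 4MB of stack, enough to force several
         * expand_stack() calls without hitting RLIMIT_STACK. */
        printf("%ld\n", grow(64));
        return 0;
    }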
 13.1597 +
 13.1598 +/*
 13.1599 + * Try to free as many page directory entries as we can,
 13.1600 + * without having to work very hard at actually scanning
 13.1601 + * the page tables themselves.
 13.1602 + *
 13.1603 + * Right now we try to free page tables if we have a nice
 13.1604 + * PGDIR-aligned area that got free'd up. We could be more
 13.1605 + * granular if we want to, but this is fast and simple,
 13.1606 + * and covers the bad cases.
 13.1607 + *
 13.1608 + * "prev", if it exists, points to a vma before the one
 13.1609 + * we just free'd - but there's no telling how much before.
 13.1610 + */
 13.1611 +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
 13.1612 +	unsigned long start, unsigned long end)
 13.1613 +{
 13.1614 +	unsigned long first = start & PGDIR_MASK;
 13.1615 +	unsigned long last = end + PGDIR_SIZE - 1;
 13.1616 +	struct mm_struct *mm = tlb->mm;
 13.1617 +
 13.1618 +	if (last > MM_VM_SIZE(mm) || last < end)
 13.1619 +		last = MM_VM_SIZE(mm);
 13.1620 +
 13.1621 +	if (!prev) {
 13.1622 +		prev = mm->mmap;
 13.1623 +		if (!prev)
 13.1624 +			goto no_mmaps;
 13.1625 +		if (prev->vm_end > start) {
 13.1626 +			if (last > prev->vm_start)
 13.1627 +				last = prev->vm_start;
 13.1628 +			goto no_mmaps;
 13.1629 +		}
 13.1630 +	}
 13.1631 +	for (;;) {
 13.1632 +		struct vm_area_struct *next = prev->vm_next;
 13.1633 +
 13.1634 +		if (next) {
 13.1635 +			if (next->vm_start < start) {
 13.1636 +				prev = next;
 13.1637 +				continue;
 13.1638 +			}
 13.1639 +			if (last > next->vm_start)
 13.1640 +				last = next->vm_start;
 13.1641 +		}
 13.1642 +		if (prev->vm_end > first)
 13.1643 +			first = prev->vm_end;
 13.1644 +		break;
 13.1645 +	}
 13.1646 +no_mmaps:
 13.1647 +	if (last < first)	/* for arches with discontiguous pgd indices */
 13.1648 +		return;
 13.1649 +	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
 13.1650 +		first = FIRST_USER_PGD_NR * PGDIR_SIZE;
 13.1651 +	/* No point trying to free anything if we're in the same pte page */
 13.1652 +	if ((first & PMD_MASK) < (last & PMD_MASK)) {
 13.1653 +		clear_page_range(tlb, first, last);
 13.1654 +		flush_tlb_pgtables(mm, first, last);
 13.1655 +	}
 13.1656 +}
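
    A standalone sketch (not kernel code, constants made up, i386-style 4MB
    page directories) of the clamping free_pgtables performs: round the freed
    hole out to page-directory boundaries, then pull it back in so it never
    covers a neighbouring vma.

    #include <stdio.h>

    #define PGDIR_SIZE 0x400000UL
    #define PGDIR_MASK (~(PGDIR_SIZE - 1))

    int main(void)
    {
        unsigned long start = 0x08a00000UL, end = 0x09200000UL;  /* the hole */
        unsigned long prev_end = 0x08900000UL;   /* end of the vma below */
        unsigned long next_start = 0x09400000UL; /* start of the vma above */

        unsigned long first = start & PGDIR_MASK;        /* round down */
        unsigned long last  = end + PGDIR_SIZE - 1;      /* round up */

        if (first < prev_end)
            first = prev_end;                    /* never free under prev */
        if (last > next_start)
            last = next_start;                   /* never free under next */

        if (first < last)
            printf("clear page tables for %#lx-%#lx\n", first, last);
        else
            printf("nothing PGDIR-aligned to free\n");
        return 0;
    }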
 13.1657 +
 13.1658 +/* Normal function to fix up a mapping
 13.1659 + * This function is the default for when an area has no specific
 13.1660 + * function.  This may be used as part of a more specific routine.
 13.1661 + *
 13.1662 + * By the time this function is called, the area struct has been
 13.1663 + * removed from the process mapping list.
 13.1664 + */
 13.1665 +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 13.1666 +{
 13.1667 +	size_t len = area->vm_end - area->vm_start;
 13.1668 +
 13.1669 +	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 13.1670 +	if (area->vm_flags & VM_LOCKED)
 13.1671 +		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 13.1672 +	vm_stat_unaccount(area);
 13.1673 +	area->vm_mm->unmap_area(area);
 13.1674 +	remove_vm_struct(area);
 13.1675 +}
 13.1676 +
 13.1677 +/*
 13.1678 + * Update the VMA and inode share lists.
 13.1679 + *
 13.1680 + * Ok - we have the memory areas we should free on the 'free' list,
 13.1681 + * so release them, and do the vma updates.
 13.1682 + */
 13.1683 +static void unmap_vma_list(struct mm_struct *mm,
 13.1684 +	struct vm_area_struct *mpnt)
 13.1685 +{
 13.1686 +	do {
 13.1687 +		struct vm_area_struct *next = mpnt->vm_next;
 13.1688 +		unmap_vma(mm, mpnt);
 13.1689 +		mpnt = next;
 13.1690 +	} while (mpnt != NULL);
 13.1691 +	validate_mm(mm);
 13.1692 +}
 13.1693 +
 13.1694 +/*
 13.1695 + * Get rid of page table information in the indicated region.
 13.1696 + *
 13.1697 + * Called with the page table lock held.
 13.1698 + */
 13.1699 +static void unmap_region(struct mm_struct *mm,
 13.1700 +	struct vm_area_struct *vma,
 13.1701 +	struct vm_area_struct *prev,
 13.1702 +	unsigned long start,
 13.1703 +	unsigned long end)
 13.1704 +{
 13.1705 +	struct mmu_gather *tlb;
 13.1706 +	unsigned long nr_accounted = 0;
 13.1707 +
 13.1708 +	lru_add_drain();
 13.1709 +	tlb = tlb_gather_mmu(mm, 0);
 13.1710 +	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 13.1711 +	vm_unacct_memory(nr_accounted);
 13.1712 +
 13.1713 +	if (is_hugepage_only_range(start, end - start))
 13.1714 +		hugetlb_free_pgtables(tlb, prev, start, end);
 13.1715 +	else
 13.1716 +		free_pgtables(tlb, prev, start, end);
 13.1717 +	tlb_finish_mmu(tlb, start, end);
 13.1718 +}
 13.1719 +
 13.1720 +/*
 13.1721 + * Create a list of vma's touched by the unmap, removing them from the mm's
 13.1722 + * vma list as we go..
 13.1723 + */
 13.1724 +static void
 13.1725 +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 13.1726 +	struct vm_area_struct *prev, unsigned long end)
 13.1727 +{
 13.1728 +	struct vm_area_struct **insertion_point;
 13.1729 +	struct vm_area_struct *tail_vma = NULL;
 13.1730 +
 13.1731 +	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 13.1732 +	do {
 13.1733 +		rb_erase(&vma->vm_rb, &mm->mm_rb);
 13.1734 +		mm->map_count--;
 13.1735 +		tail_vma = vma;
 13.1736 +		vma = vma->vm_next;
 13.1737 +	} while (vma && vma->vm_start < end);
 13.1738 +	*insertion_point = vma;
 13.1739 +	tail_vma->vm_next = NULL;
 13.1740 +	mm->mmap_cache = NULL;		/* Kill the cache. */
 13.1741 +}
 13.1742 +
 13.1743 +/*
 13.1744 + * Split a vma into two pieces at address 'addr', a new vma is allocated
 13.1745 + * either for the first part or the tail.
 13.1746 + */
 13.1747 +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 13.1748 +	      unsigned long addr, int new_below)
 13.1749 +{
 13.1750 +	struct mempolicy *pol;
 13.1751 +	struct vm_area_struct *new;
 13.1752 +
 13.1753 +	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
 13.1754 +		return -EINVAL;
 13.1755 +
 13.1756 +	if (mm->map_count >= sysctl_max_map_count)
 13.1757 +		return -ENOMEM;
 13.1758 +
 13.1759 +	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 13.1760 +	if (!new)
 13.1761 +		return -ENOMEM;
 13.1762 +
 13.1763 +	/* most fields are the same, copy all, and then fixup */
 13.1764 +	*new = *vma;
 13.1765 +
 13.1766 +	if (new_below)
 13.1767 +		new->vm_end = addr;
 13.1768 +	else {
 13.1769 +		new->vm_start = addr;
 13.1770 +		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 13.1771 +	}
 13.1772 +
 13.1773 +	pol = mpol_copy(vma_policy(vma));
 13.1774 +	if (IS_ERR(pol)) {
 13.1775 +		kmem_cache_free(vm_area_cachep, new);
 13.1776 +		return PTR_ERR(pol);
 13.1777 +	}
 13.1778 +	vma_set_policy(new, pol);
 13.1779 +
 13.1780 +	if (new->vm_file)
 13.1781 +		get_file(new->vm_file);
 13.1782 +
 13.1783 +	if (new->vm_ops && new->vm_ops->open)
 13.1784 +		new->vm_ops->open(new);
 13.1785 +
 13.1786 +	if (new_below)
 13.1787 +		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 13.1788 +			((addr - new->vm_start) >> PAGE_SHIFT), new);
 13.1789 +	else
 13.1790 +		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 13.1791 +
 13.1792 +	return 0;
 13.1793 +}
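
    A hedged userspace illustration of split_vma: changing the protection of
    only the middle page of a three-page anonymous mapping forces two splits,
    visible as three adjacent entries in /proc/self/maps while the program is
    paused.

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Change protection of the middle page only: [p+page, p+2*page). */
        if (mprotect(p + page, page, PROT_READ) != 0)
            perror("mprotect");

        printf("check /proc/%d/maps around %p\n", getpid(), (void *)p);
        getchar();                      /* pause so the split can be inspected */

        munmap(p, 3 * page);
        return 0;
    }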
 13.1794 +
 13.1795 +/* Munmap is split into 2 main parts -- this part which finds
 13.1796 + * what needs doing, and the areas themselves, which do the
 13.1797 + * work.  This now handles partial unmappings.
 13.1798 + * Jeremy Fitzhardinge <jeremy@goop.org>
 13.1799 + */
 13.1800 +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 13.1801 +{
 13.1802 +	unsigned long end;
 13.1803 +	struct vm_area_struct *mpnt, *prev, *last;
 13.1804 +
 13.1805 +	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
 13.1806 +		return -EINVAL;
 13.1807 +
 13.1808 +	if ((len = PAGE_ALIGN(len)) == 0)
 13.1809 +		return -EINVAL;
 13.1810 +
 13.1811 +	/* Find the first overlapping VMA */
 13.1812 +	mpnt = find_vma_prev(mm, start, &prev);
 13.1813 +	if (!mpnt)
 13.1814 +		return 0;
 13.1815 +	/* we have  start < mpnt->vm_end  */
 13.1816 +
 13.1817 +	/* if it doesn't overlap, we have nothing.. */
 13.1818 +	end = start + len;
 13.1819 +	if (mpnt->vm_start >= end)
 13.1820 +		return 0;
 13.1821 +
 13.1822 +	/*
 13.1823 +	 * If we need to split any vma, do it now to save pain later.
 13.1824 +	 *
 13.1825 +	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 13.1826 +	 * unmapped vm_area_struct will remain in use: so lower split_vma
 13.1827 +	 * places tmp vma above, and higher split_vma places tmp vma below.
 13.1828 +	 */
 13.1829 +	if (start > mpnt->vm_start) {
 13.1830 +		int error = split_vma(mm, mpnt, start, 0);
 13.1831 +		if (error)
 13.1832 +			return error;
 13.1833 +		prev = mpnt;
 13.1834 +	}
 13.1835 +
 13.1836 +	/* Does it split the last one? */
 13.1837 +	last = find_vma(mm, end);
 13.1838 +	if (last && end > last->vm_start) {
 13.1839 +		int error = split_vma(mm, last, end, 1);
 13.1840 +		if (error)
 13.1841 +			return error;
 13.1842 +	}
 13.1843 +	mpnt = prev? prev->vm_next: mm->mmap;
 13.1844 +
 13.1845 +	/*
 13.1846 +	 * Remove the vma's, and unmap the actual pages
 13.1847 +	 */
 13.1848 +	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
 13.1849 +	spin_lock(&mm->page_table_lock);
 13.1850 +	unmap_region(mm, mpnt, prev, start, end);
 13.1851 +	spin_unlock(&mm->page_table_lock);
 13.1852 +
 13.1853 +	/* Fix up all other VM information */
 13.1854 +	unmap_vma_list(mm, mpnt);
 13.1855 +
 13.1856 +	return 0;
 13.1857 +}
 13.1858 +
 13.1859 +EXPORT_SYMBOL(do_munmap);
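
    A hedged userspace illustration of the partial-unmap path in do_munmap:
    punching a hole in the middle of a mapping splits the vma at both ends and
    removes only the pages in between, leaving two smaller vmas behind.

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Punch a two-page hole in the middle: [p+page, p+3*page). */
        if (munmap(p + page, 2 * page) != 0)
            perror("munmap");

        p[0] = 1;                     /* first page still mapped */
        p[3 * page] = 1;              /* last page still mapped */
        printf("hole punched at %p\n", (void *)(p + page));

        munmap(p, page);              /* tear down what is left */
        munmap(p + 3 * page, page);
        return 0;
    }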
 13.1860 +
 13.1861 +asmlinkage long sys_munmap(unsigned long addr, size_t len)
 13.1862 +{
 13.1863 +	int ret;
 13.1864 +	struct mm_struct *mm = current->mm;
 13.1865 +
 13.1866 +	profile_munmap(addr);
 13.1867 +
 13.1868 +	down_write(&mm->mmap_sem);
 13.1869 +	ret = do_munmap(mm, addr, len);
 13.1870 +	up_write(&mm->mmap_sem);
 13.1871 +	return ret;
 13.1872 +}
 13.1873 +
 13.1874 +static inline void verify_mm_writelocked(struct mm_struct *mm)
 13.1875 +{
 13.1876 +#ifdef CONFIG_DEBUG_KERNEL
 13.1877 +	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
 13.1878 +		WARN_ON(1);
 13.1879 +		up_read(&mm->mmap_sem);
 13.1880 +	}
 13.1881 +#endif
 13.1882 +}
 13.1883 +
 13.1884 +/*
 13.1885 + *  this is really a simplified "do_mmap".  it only handles
 13.1886 + *  anonymous maps.  eventually we may be able to do some
 13.1887 + *  brk-specific accounting here.
 13.1888 + */
 13.1889 +unsigned long do_brk(unsigned long addr, unsigned long len)
 13.1890 +{
 13.1891 +	struct mm_struct * mm = current->mm;
 13.1892 +	struct vm_area_struct * vma, * prev;
 13.1893 +	unsigned long flags;
 13.1894 +	struct rb_node ** rb_link, * rb_parent;
 13.1895 +	pgoff_t pgoff = addr >> PAGE_SHIFT;
 13.1896 +
 13.1897 +	len = PAGE_ALIGN(len);
 13.1898 +	if (!len)
 13.1899 +		return addr;
 13.1900 +
 13.1901 +	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 13.1902 +		return -EINVAL;
 13.1903 +
 13.1904 +	/*
 13.1905 +	 * mlock MCL_FUTURE?
 13.1906 +	 */
 13.1907 +	if (mm->def_flags & VM_LOCKED) {
 13.1908 +		unsigned long locked, lock_limit;
 13.1909 +		locked = mm->locked_vm << PAGE_SHIFT;
 13.1910 +		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 13.1911 +		locked += len;
 13.1912 +		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 13.1913 +			return -EAGAIN;
 13.1914 +	}
 13.1915 +
 13.1916 +	/*
 13.1917 +	 * mm->mmap_sem is required to protect against another thread
 13.1918 +	 * changing the mappings in case we sleep.
 13.1919 +	 */
 13.1920 +	verify_mm_writelocked(mm);
 13.1921 +
 13.1922 +	/*
 13.1923 +	 * Clear old maps.  this also does some error checking for us
 13.1924 +	 */
 13.1925 + munmap_back:
 13.1926 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 13.1927 +	if (vma && vma->vm_start < addr + len) {
 13.1928 +		if (do_munmap(mm, addr, len))
 13.1929 +			return -ENOMEM;
 13.1930 +		goto munmap_back;
 13.1931 +	}
 13.1932 +
 13.1933 +	/* Check against address space limits *after* clearing old maps... */
 13.1934 +	if ((mm->total_vm << PAGE_SHIFT) + len
 13.1935 +	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
 13.1936 +		return -ENOMEM;
 13.1937 +
 13.1938 +	if (mm->map_count > sysctl_max_map_count)
 13.1939 +		return -ENOMEM;
 13.1940 +
 13.1941 +	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 13.1942 +		return -ENOMEM;
 13.1943 +
 13.1944 +	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 13.1945 +
 13.1946 +	/* Can we just expand an old private anonymous mapping? */
 13.1947 +	if (vma_merge(mm, prev, addr, addr + len, flags,
 13.1948 +					NULL, NULL, pgoff, NULL))
 13.1949 +		goto out;
 13.1950 +
 13.1951 +	/*
 13.1952 +	 * create a vma struct for an anonymous mapping
 13.1953 +	 */
 13.1954 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 13.1955 +	if (!vma) {
 13.1956 +		vm_unacct_memory(len >> PAGE_SHIFT);
 13.1957 +		return -ENOMEM;
 13.1958 +	}
 13.1959 +	memset(vma, 0, sizeof(*vma));
 13.1960 +
 13.1961 +	vma->vm_mm = mm;
 13.1962 +	vma->vm_start = addr;
 13.1963 +	vma->vm_end = addr + len;
 13.1964 +	vma->vm_pgoff = pgoff;
 13.1965 +	vma->vm_flags = flags;
 13.1966 +	vma->vm_page_prot = protection_map[flags & 0x0f];
 13.1967 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 13.1968 +out:
 13.1969 +	mm->total_vm += len >> PAGE_SHIFT;
 13.1970 +	if (flags & VM_LOCKED) {
 13.1971 +		mm->locked_vm += len >> PAGE_SHIFT;
 13.1972 +		make_pages_present(addr, addr + len);
 13.1973 +	}
 13.1974 +	acct_update_integrals();
 13.1975 +	update_mem_hiwater();
 13.1976 +	return addr;
 13.1977 +}
 13.1978 +
 13.1979 +EXPORT_SYMBOL(do_brk);
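
    A hedged userspace illustration of the do_brk path: growing the program
    break with sbrk(2) produces exactly the anonymous, accountable mapping
    this helper creates, which the kernel first tries to satisfy by simply
    extending the previous vma via vma_merge.

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        void *before = sbrk(0);               /* current program break */

        if (sbrk(1 << 20) == (void *)-1) {    /* ask for 1MB more heap */
            perror("sbrk");
            return 1;
        }
        void *after = sbrk(0);

        printf("break moved %ld bytes (%p -> %p)\n",
               (long)((char *)after - (char *)before), before, after);

        sbrk(-(1 << 20));                     /* give it back */
        return 0;
    }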
 13.1980 +
 13.1981 +/* Release all mmaps. */
 13.1982 +void exit_mmap(struct mm_struct *mm)
 13.1983 +{
 13.1984 +	struct mmu_gather *tlb;
 13.1985 +	struct vm_area_struct *vma;
 13.1986 +	unsigned long nr_accounted = 0;
 13.1987 +
 13.1988 +#ifdef arch_exit_mmap
 13.1989 +	arch_exit_mmap(mm);
 13.1990 +#endif
 13.1991 +
 13.1992 +	lru_add_drain();
 13.1993 +
 13.1994 +	spin_lock(&mm->page_table_lock);
 13.1995 +
 13.1996 +	tlb = tlb_gather_mmu(mm, 1);
 13.1997 +	flush_cache_mm(mm);
 13.1998 +	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 13.1999 +	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
 13.2000 +					~0UL, &nr_accounted, NULL);
 13.2001 +	vm_unacct_memory(nr_accounted);
 13.2002 +	BUG_ON(mm->map_count);	/* This is just debugging */
 13.2003 +	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
 13.2004 +	
 13.2005 +	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 13.2006 +
 13.2007 +	vma = mm->mmap;
 13.2008 +	mm->mmap = mm->mmap_cache = NULL;
 13.2009 +	mm->mm_rb = RB_ROOT;
 13.2010 +	mm->rss = 0;
 13.2011 +	mm->total_vm = 0;
 13.2012 +	mm->locked_vm = 0;
 13.2013 +
 13.2014 +	spin_unlock(&mm->page_table_lock);
 13.2015 +
 13.2016 +	/*
 13.2017 +	 * Walk the list again, actually closing and freeing it
 13.2018 +	 * without holding any MM locks.
 13.2019 +	 */
 13.2020 +	while (vma) {
 13.2021 +		struct vm_area_struct *next = vma->vm_next;
 13.2022 +		remove_vm_struct(vma);
 13.2023 +		vma = next;
 13.2024 +	}
 13.2025 +}
 13.2026 +
 13.2027 +/* Insert vm structure into process list sorted by address
 13.2028 + * and into the inode's i_mmap tree.  If vm_file is non-NULL
 13.2029 + * then i_mmap_lock is taken here.
 13.2030 + */
 13.2031 +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 13.2032 +{
 13.2033 +	struct vm_area_struct * __vma, * prev;
 13.2034 +	struct rb_node ** rb_link, * rb_parent;
 13.2035 +
 13.2036 +	/*
 13.2037 +	 * The vm_pgoff of a purely anonymous vma should be irrelevant
 13.2038 +	 * until its first write fault, when page's anon_vma and index
 13.2039 +	 * are set.  But now set the vm_pgoff it will almost certainly
 13.2040 +	 * end up with (unless mremap moves it elsewhere before that
 13.2041 +	 * first wfault), so /proc/pid/maps tells a consistent story.
 13.2042 +	 *
 13.2043 +	 * By setting it to reflect the virtual start address of the
 13.2044 +	 * vma, merges and splits can happen in a seamless way, just
 13.2045 +	 * using the existing file pgoff checks and manipulations.
 13.2046 +	 * Similarly in do_mmap_pgoff and in do_brk.
 13.2047 +	 */
 13.2048 +	if (!vma->vm_file) {
 13.2049 +		BUG_ON(vma->anon_vma);
 13.2050 +		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 13.2051 +	}
 13.2052 +	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
 13.2053 +	if (__vma && __vma->vm_start < vma->vm_end)
 13.2054 +		return -ENOMEM;
 13.2055 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 13.2056 +	return 0;
 13.2057 +}
 13.2058 +
 13.2059 +/*
 13.2060 + * Copy the vma structure to a new location in the same mm,
 13.2061 + * prior to moving page table entries, to effect an mremap move.
 13.2062 + */
 13.2063 +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 13.2064 +	unsigned long addr, unsigned long len, pgoff_t pgoff)
 13.2065 +{
 13.2066 +	struct vm_area_struct *vma = *vmap;
 13.2067 +	unsigned long vma_start = vma->vm_start;
 13.2068 +	struct mm_struct *mm = vma->vm_mm;
 13.2069 +	struct vm_area_struct *new_vma, *prev;
 13.2070 +	struct rb_node **rb_link, *rb_parent;
 13.2071 +	struct mempolicy *pol;
 13.2072 +
 13.2073 +	/*
 13.2074 +	 * If anonymous vma has not yet been faulted, update new pgoff
 13.2075 +	 * to match new location, to increase its chance of merging.
 13.2076 +	 */
 13.2077 +	if (!vma->vm_file && !vma->anon_vma)
 13.2078 +		pgoff = addr >> PAGE_SHIFT;
 13.2079 +
 13.2080 +	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 13.2081 +	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 13.2082 +			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 13.2083 +	if (new_vma) {
 13.2084 +		/*
 13.2085 +		 * Source vma may have been merged into new_vma
 13.2086 +		 */
 13.2087 +		if (vma_start >= new_vma->vm_start &&
 13.2088 +		    vma_start < new_vma->vm_end)
 13.2089 +			*vmap = new_vma;
 13.2090 +	} else {
 13.2091 +		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 13.2092 +		if (new_vma) {
 13.2093 +			*new_vma = *vma;
 13.2094 +			pol = mpol_copy(vma_policy(vma));
 13.2095 +			if (IS_ERR(pol)) {
 13.2096 +				kmem_cache_free(vm_area_cachep, new_vma);
 13.2097 +				return NULL;
 13.2098 +			}
 13.2099 +			vma_set_policy(new_vma, pol);
 13.2100 +			new_vma->vm_start = addr;
 13.2101 +			new_vma->vm_end = addr + len;
 13.2102 +			new_vma->vm_pgoff = pgoff;
 13.2103 +			if (new_vma->vm_file)
 13.2104 +				get_file(new_vma->vm_file);
 13.2105 +			if (new_vma->vm_ops && new_vma->vm_ops->open)
 13.2106 +				new_vma->vm_ops->open(new_vma);
 13.2107 +			vma_link(mm, new_vma, prev, rb_link, rb_parent);
 13.2108 +		}
 13.2109 +	}
 13.2110 +	return new_vma;
 13.2111 +}
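
    A hedged userspace illustration of when copy_vma runs: an mremap(2) that
    moves the mapping duplicates (or merges) the vma at the new address before
    the page table entries are moved across.

    #define _GNU_SOURCE                  /* for mremap and MREMAP_MAYMOVE */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t old_len = 1 << 16, new_len = 1 << 20;

        char *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        strcpy(p, "payload survives the move");

        /* Growing well beyond the old size usually forces a move. */
        char *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
        if (q == MAP_FAILED) {
            perror("mremap");
            return 1;
        }

        printf("%p -> %p: %s\n", (void *)p, (void *)q, q);
        munmap(q, new_len);
        return 0;
    }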
    14.1 --- a/tools/libxc/xc_linux_restore.c	Wed Apr 27 16:55:30 2005 +0000
    14.2 +++ b/tools/libxc/xc_linux_restore.c	Wed Apr 27 16:55:50 2005 +0000
    14.3 @@ -170,13 +170,13 @@ int xc_linux_restore(int xc_handle, XcIO
    14.4      if ( xc_domain_create(xc_handle, nr_pfns * (PAGE_SIZE / 1024),
    14.5                            -1, 1, &dom) )
    14.6      {
    14.7 -	xcio_error(ioctxt, "Could not create domain. pfns=%d, %dKB",
    14.8 -		   nr_pfns,nr_pfns * (PAGE_SIZE / 1024));
    14.9 +	xcio_error(ioctxt, "Could not create domain. pfns=%ld, %ldKB",
   14.10 +		   nr_pfns, nr_pfns * (PAGE_SIZE / 1024));
   14.11          goto out;
   14.12      }
   14.13      
   14.14      ioctxt->domain = dom;
   14.15 -    xcio_info(ioctxt, "Created domain %ld\n",dom);
   14.16 +    xcio_info(ioctxt, "Created domain %u\n", dom);
   14.17  
   14.18      /* Get the domain's shared-info frame. */
   14.19      op.cmd = DOM0_GETDOMAININFO;
   14.20 @@ -200,7 +200,8 @@ int xc_linux_restore(int xc_handle, XcIO
   14.21      }
   14.22  
   14.23      /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
   14.24 -    if ( xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) != nr_pfns )
   14.25 +    if ( xc_get_pfn_list(xc_handle, dom, 
   14.26 +                         pfn_to_mfn_table, nr_pfns) != nr_pfns )
   14.27      {
   14.28          xcio_error(ioctxt, "Did not read correct number of frame "
   14.29                     "numbers for new dom");
   14.30 @@ -657,7 +658,7 @@ int xc_linux_restore(int xc_handle, XcIO
   14.31      if ( rc == 0 )
   14.32      {
   14.33          /* Success: print the domain id. */
   14.34 -        xcio_info(ioctxt, "DOM=%lu\n", dom);
   14.35 +        xcio_info(ioctxt, "DOM=%u\n", dom);
   14.36          return 0;
   14.37      }
   14.38  
    15.1 --- a/tools/libxc/xc_linux_save.c	Wed Apr 27 16:55:30 2005 +0000
    15.2 +++ b/tools/libxc/xc_linux_save.c	Wed Apr 27 16:55:50 2005 +0000
    15.3 @@ -167,7 +167,8 @@ static int burst_time_us = -1;
    15.4  #define RATE_TO_BTU 781250
    15.5  #define BURST_TIME_US burst_time_us
    15.6  
    15.7 -static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){
    15.8 +static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n)
    15.9 +{
   15.10      static int budget = 0;
   15.11      static struct timeval last_put = { 0 };
   15.12      struct timeval now;
   15.13 @@ -230,8 +231,8 @@ static int print_stats( int xc_handle, u
   15.14  
   15.15      gettimeofday(&wall_now, NULL);
   15.16  
   15.17 -    d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0, /* FIXME */ 0 )/1000;
   15.18 -    d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid, /* FIXME */ 0 )/1000;
   15.19 +    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   15.20 +    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   15.21  
   15.22      if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 
   15.23          printf("ARRHHH!!\n");
   15.24 @@ -273,10 +274,13 @@ static int print_stats( int xc_handle, u
   15.25   * @param ioctxt i/o context
   15.26   * @return 0 on success, non-zero on error.
   15.27   */
   15.28 -static int write_vmconfig(XcIOContext *ioctxt){
   15.29 +static int write_vmconfig(XcIOContext *ioctxt)
   15.30 +{
   15.31      int err = -1;
   15.32 -    if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit;
   15.33 -    if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit;
   15.34 +    if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) 
   15.35 +        goto exit;
   15.36 +    if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) 
   15.37 +        goto exit;
   15.38      err = 0;
   15.39    exit:
   15.40      return err;
   15.41 @@ -329,7 +333,8 @@ int suspend_and_state(int xc_handle, XcI
   15.42  
   15.43  retry:
   15.44  
   15.45 -    if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, info, ctxt) )
   15.46 +    if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, 
   15.47 +                               info, ctxt) )
   15.48      {
   15.49  	xcio_error(ioctxt, "Could not get full domain info");
   15.50  	return -1;
   15.51 @@ -347,7 +352,7 @@ retry:
   15.52  	// try unpausing domain, wait, and retest	
   15.53  	xc_domain_unpause( xc_handle, ioctxt->domain );
   15.54  
   15.55 -	xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)",
   15.56 +	xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%u)",
   15.57  		   info->flags);
   15.58  	usleep(10000);  // 10ms
   15.59  
   15.60 @@ -357,14 +362,12 @@ retry:
   15.61  
   15.62      if( ++i < 100 )
   15.63      {
   15.64 -	xcio_error(ioctxt, "Retry suspend domain (%lx)",
   15.65 -		   info->flags);
   15.66 +	xcio_error(ioctxt, "Retry suspend domain (%u)", info->flags);
   15.67  	usleep(10000);  // 10ms	
   15.68  	goto retry;
   15.69      }
   15.70  
   15.71 -    xcio_error(ioctxt, "Unable to suspend domain. (%lx)",
   15.72 -	       info->flags);
   15.73 +    xcio_error(ioctxt, "Unable to suspend domain. (%u)", info->flags);
   15.74  
   15.75      return -1;
   15.76  }
   15.77 @@ -442,7 +445,8 @@ int xc_linux_save(int xc_handle, XcIOCon
   15.78          return 1;
   15.79      }
   15.80  
   15.81 -    if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, &info, &ctxt) )
   15.82 +    if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, 
   15.83 +                                &info, &ctxt) )
   15.84      {
   15.85          xcio_error(ioctxt, "Could not get full domain info");
   15.86          goto out;
   15.87 @@ -459,7 +463,9 @@ int xc_linux_save(int xc_handle, XcIOCon
   15.88  
   15.89      /* cheesy sanity check */
   15.90      if ( nr_pfns > 1024*1024 ){
   15.91 -        xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns);
   15.92 +        xcio_error(ioctxt, 
   15.93 +                   "Invalid state record -- pfn count out of range: %lu", 
   15.94 +                   nr_pfns);
   15.95          goto out;
   15.96      }
   15.97  
   15.98 @@ -513,7 +519,8 @@ int xc_linux_save(int xc_handle, XcIOCon
   15.99  
  15.100      for ( i = 0; i < nr_pfns; i += 1024 ){
  15.101          if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
  15.102 -            xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys");
  15.103 +            xcio_error(ioctxt, 
  15.104 +                       "Frame# in pfn-to-mfn frame list is not in pseudophys");
  15.105              goto out;
  15.106          }
  15.107      }
  15.108 @@ -539,7 +546,7 @@ int xc_linux_save(int xc_handle, XcIOCon
  15.109  
  15.110  	if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
  15.111  	{
  15.112 -	    xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
  15.113 +	    xcio_error(ioctxt, "Domain appears not to have suspended: %u",
  15.114  		       info.flags);
  15.115  	    goto out;
  15.116  	}
  15.117 @@ -836,7 +843,8 @@ int xc_linux_save(int xc_handle, XcIOCon
  15.118                      } /* end of page table rewrite for loop */
  15.119        
  15.120                      if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){
  15.121 -                        xcio_error(ioctxt, "Error when writing to state file (4)");
  15.122 +                        xcio_error(ioctxt, 
  15.123 +                                   "Error when writing to state file (4)");
  15.124                          goto out;
  15.125                      }
  15.126        
  15.127 @@ -844,7 +852,8 @@ int xc_linux_save(int xc_handle, XcIOCon
  15.128  
  15.129                      if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j), 
  15.130                                       PAGE_SIZE) ){
  15.131 -                        xcio_error(ioctxt, "Error when writing to state file (5)");
  15.132 +                        xcio_error(ioctxt, 
  15.133 +                                   "Error when writing to state file (5)");
  15.134                          goto out;
  15.135                      }
  15.136                  }
  15.137 @@ -903,14 +912,15 @@ int xc_linux_save(int xc_handle, XcIOCon
  15.138  
  15.139  		if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
  15.140  		{
  15.141 -		    xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
  15.142 +		    xcio_error(ioctxt, 
  15.143 +                               "Domain appears not to have suspended: %u",
  15.144  			       info.flags);
  15.145  		    goto out;
  15.146  		}
  15.147  
  15.148  		xcio_info(ioctxt,
  15.149 -                          "SUSPEND flags %08lx shinfo %08lx eip %08lx "
  15.150 -                          "esi %08lx\n",info.flags,
  15.151 +                          "SUSPEND flags %08u shinfo %08lx eip %08u "
  15.152 +                          "esi %08u\n",info.flags,
  15.153                            info.shared_info_frame,
  15.154                            ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi );
  15.155              } 
  15.156 @@ -972,7 +982,8 @@ int xc_linux_save(int xc_handle, XcIOCon
  15.157  	    {
  15.158  		if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) )
  15.159  		{
  15.160 -		    xcio_error(ioctxt, "Error when writing to state file (6b)");
  15.161 +		    xcio_error(ioctxt, 
  15.162 +                               "Error when writing to state file (6b)");
  15.163  		    goto out;
  15.164  		}	
  15.165  		j = 0;
  15.166 @@ -1027,14 +1038,24 @@ int xc_linux_save(int xc_handle, XcIOCon
  15.167  
  15.168   out:
  15.169  
  15.170 -    if ( live_shinfo )          munmap(live_shinfo, PAGE_SIZE);
  15.171 -    if ( p_srec )               munmap(p_srec, sizeof(*p_srec));
  15.172 -    if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
  15.173 -    if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 );
  15.174 -    if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 );
  15.175 +    if(live_shinfo)
  15.176 +        munmap(live_shinfo, PAGE_SIZE);
  15.177  
  15.178 -    if ( pfn_type != NULL ) free(pfn_type);
  15.179 +    if(p_srec) 
  15.180 +        munmap(p_srec, sizeof(*p_srec));
  15.181 +
  15.182 +    if(live_pfn_to_mfn_frame_list) 
  15.183 +        munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
  15.184 +
  15.185 +    if(live_pfn_to_mfn_table) 
  15.186 +        munmap(live_pfn_to_mfn_table, nr_pfns*4);
  15.187 +
  15.188 +    if(live_mfn_to_pfn_table) 
  15.189 +        munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
  15.190 +
  15.191 +    if (pfn_type != NULL) 
  15.192 +        free(pfn_type);
  15.193 +
  15.194      DPRINTF("Save exit rc=%d\n",rc);
  15.195      return !!rc;
  15.196 -
  15.197  }
    16.1 --- a/xen/arch/x86/mm.c	Wed Apr 27 16:55:30 2005 +0000
    16.2 +++ b/xen/arch/x86/mm.c	Wed Apr 27 16:55:50 2005 +0000
    16.3 @@ -482,7 +482,7 @@ get_page_from_l2e(
    16.4  {
    16.5      int rc;
    16.6  
    16.7 -    ASSERT( !shadow_mode_enabled(d) );
    16.8 +    ASSERT(!shadow_mode_enabled(d));
    16.9  
   16.10      if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
   16.11          return 1;
   16.12 @@ -641,7 +641,7 @@ static int alloc_l1_table(struct pfn_inf
   16.13      l1_pgentry_t  *pl1e;
   16.14      int            i;
   16.15  
   16.16 -    ASSERT( !shadow_mode_enabled(d) );
   16.17 +    ASSERT(!shadow_mode_enabled(d));
   16.18  
   16.19      pl1e = map_domain_mem(pfn << PAGE_SHIFT);
   16.20  
   16.21 @@ -2670,22 +2670,6 @@ static int ptwr_emulated_update(
   16.22      }
   16.23      unmap_domain_mem(pl1e);
   16.24  
   16.25 -    /* Propagate update to shadow cache. */
   16.26 -    if ( unlikely(shadow_mode_enabled(d)) )
   16.27 -    {
   16.28 -        BUG(); // XXX fix me...
   16.29 -#if 0
   16.30 -        sstat = get_shadow_status(d, page_to_pfn(page));
   16.31 -        if ( sstat & PSH_shadowed )
   16.32 -        {
   16.33 -            sl1e = map_domain_mem(
   16.34 -                ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK));
   16.35 -            l1pte_propagate_from_guest(d, &nl1e, sl1e);
   16.36 -            unmap_domain_mem(sl1e);
   16.37 -        }
   16.38 -#endif
   16.39 -    }
   16.40 -
   16.41      /* Finally, drop the old PTE. */
   16.42      put_page_from_l1e(ol1e, d);
   16.43  
   16.44 @@ -2748,6 +2732,7 @@ int ptwr_do_page_fault(struct domain *d,
   16.45      /* We are looking only for read-only mappings of p.t. pages. */
   16.46      if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) ||
   16.47           ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
   16.48 +         ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
   16.49           (page_get_owner(page) != d) )
   16.50      {
   16.51          return 0;
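
    On the extra PGT_count_mask clause added to ptwr_do_page_fault above: a
    standalone sketch with illustrative mask values (the real PGT_* constants
    are defined in Xen's headers) showing the combined "typed as an L1 page
    table and type count non-zero" test the fault handler now requires.

    #include <stdio.h>
    #include <stdint.h>

    #define PGT_type_mask     0xe0000000u     /* illustrative */
    #define PGT_l1_page_table 0x20000000u     /* illustrative */
    #define PGT_count_mask    0x0000ffffu     /* illustrative */

    static int looks_like_live_l1(uint32_t type_info)
    {
        if ((type_info & PGT_type_mask) != PGT_l1_page_table)
            return 0;                         /* not an L1 table at all */
        if ((type_info & PGT_count_mask) == 0)
            return 0;                         /* typed, but no outstanding uses */
        return 1;
    }

    int main(void)
    {
        printf("%d %d\n",
               looks_like_live_l1(PGT_l1_page_table | 1),   /* 1: live L1 */
               looks_like_live_l1(PGT_l1_page_table));      /* 0: count == 0 */
        return 0;
    }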