ia64/xen-unstable
changeset 4673:98d5be103415
bitkeeper revision 1.1388 (426fc416kd_SxU1l3YCeVWTczbT41A)
Merge arcadians.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into arcadians.cl.cam.ac.uk:/local/scratch-2/vh249/xen-unstable.bk
--- a/.rootkeys Wed Apr 27 16:55:30 2005 +0000
+++ b/.rootkeys Wed Apr 27 16:55:50 2005 +0000
@@ -351,6 +351,7 @@ 40f5623aKXkBBxgpLx2NcvkncQ1Yyw linux-2.6
 40f5623aDMCsWOFO0jktZ4e8sjwvEg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h
 40f5623arsFXkGdPvIqvFi3yFXGR0Q linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h
 41811f07Iri9hrvs97t-baxmhOwWDQ linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h
+426fa4d7RzvcFMqff_M76HrvRQZHSg linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h
 4120f807GCO0uqsLqdZj9csxR1Wthw linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
 40f5623adgjZq9nAgCt0IXdWl7udSA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
 40f5623a54NuG-7qHihGYmw4wWQnMA linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/param.h
@@ -418,6 +419,7 @@ 419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6
 40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.11-xen-sparse/mkbuildtree
 42305f54Q6xJ1bXcQJlCQq1m-e2C8g linux-2.6.11-xen-sparse/mm/highmem.c
 412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.11-xen-sparse/mm/memory.c
+426fa4d7ooLYmFcFjJMF_ut4GFVh2Q linux-2.6.11-xen-sparse/mm/mmap.c
 410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.11-xen-sparse/mm/page_alloc.c
 413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
 413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c Wed Apr 27 16:55:50 2005 +0000
@@ -100,8 +100,8 @@ int init_new_context(struct task_struct
     struct mm_struct * old_mm;
     int retval = 0;
 
+    memset(&mm->context, 0, sizeof(mm->context));
     init_MUTEX(&mm->context.sem);
-    mm->context.size = 0;
     old_mm = current->mm;
     if (old_mm && old_mm->context.size > 0) {
         down(&old_mm->context.sem);
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c Wed Apr 27 16:55:50 2005 +0000
@@ -211,7 +211,8 @@ unsigned long allocate_empty_lowmem_regi
         pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
         pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
         HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
-        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = INVALID_P2M_ENTRY;
+        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+            INVALID_P2M_ENTRY;
     }
 
     flush_tlb_all();
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c Wed Apr 27 16:55:50 2005 +0000
@@ -710,18 +710,9 @@ void __init mem_init(void)
 
 kmem_cache_t *pgd_cache;
 kmem_cache_t *pmd_cache;
-kmem_cache_t *pte_cache;
 
 void __init pgtable_cache_init(void)
 {
-    pte_cache = kmem_cache_create("pte",
-                    PTRS_PER_PTE*sizeof(pte_t),
-                    PTRS_PER_PTE*sizeof(pte_t),
-                    0,
-                    pte_ctor,
-                    pte_dtor);
-    if (!pte_cache)
-        panic("pgtable_cache_init(): Cannot create pte cache");
     if (PTRS_PER_PMD > 1) {
         pmd_cache = kmem_cache_create("pmd",
                     PTRS_PER_PMD*sizeof(pmd_t),
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c Wed Apr 27 16:55:50 2005 +0000
@@ -198,59 +198,35 @@ pte_t *pte_alloc_one_kernel(struct mm_st
     return pte;
 }
 
-void pte_ctor(void *pte, kmem_cache_t *cache, unsigned long unused)
-{
-    struct page *page = virt_to_page(pte);
-    SetPageForeign(page, pte_free);
-    set_page_count(page, 1);
-
-    clear_page(pte);
-    make_page_readonly(pte);
-    xen_pte_pin(__pa(pte));
-}
-
-void pte_dtor(void *pte, kmem_cache_t *cache, unsigned long unused)
-{
-    struct page *page = virt_to_page(pte);
-    ClearPageForeign(page);
-
-    xen_pte_unpin(__pa(pte));
-    make_page_writable(pte);
-}
-
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-    pte_t *ptep;
-
-#ifdef CONFIG_HIGHPTE
     struct page *pte;
 
+#ifdef CONFIG_HIGHPTE
     pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-    if (pte == NULL)
-        return pte;
-    if (PageHighMem(pte))
-        return pte;
-    /* not a highmem page -- free page and grab one from the cache */
-    __free_page(pte);
+#else
+    pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+    if (pte) {
+        SetPageForeign(pte, pte_free);
+        set_page_count(pte, 1);
+    }
 #endif
-    ptep = kmem_cache_alloc(pte_cache, GFP_KERNEL);
-    if (ptep)
-        return virt_to_page(ptep);
-    return NULL;
+
+    return pte;
 }
 
 void pte_free(struct page *pte)
 {
+    unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+    if (!pte_write(*virt_to_ptep(va)))
+        HYPERVISOR_update_va_mapping(
+            va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
+
+    ClearPageForeign(pte);
     set_page_count(pte, 1);
-#ifdef CONFIG_HIGHPTE
-    if (!PageHighMem(pte))
-#endif
-        kmem_cache_free(pte_cache,
-                phys_to_virt(page_to_pseudophys(pte)));
-#ifdef CONFIG_HIGHPTE
-    else
-        __free_page(pte);
-#endif
+
+    __free_page(pte);
 }
 
 void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
@@ -305,14 +281,11 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
            (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 
     if (PTRS_PER_PMD > 1)
-        goto out;
+        return;
 
     pgd_list_add(pgd);
     spin_unlock_irqrestore(&pgd_lock, flags);
     memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
- out:
-    make_page_readonly(pgd);
-    xen_pgd_pin(__pa(pgd));
 }
 
 /* never called when PTRS_PER_PMD > 1 */
@@ -320,9 +293,6 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
 {
     unsigned long flags; /* can be called from interrupt context */
 
-    xen_pgd_unpin(__pa(pgd));
-    make_page_writable(pgd);
-
     if (PTRS_PER_PMD > 1)
         return;
 
@@ -357,6 +327,15 @@ out_oom:
 void pgd_free(pgd_t *pgd)
 {
     int i;
+    pte_t *ptep = virt_to_ptep(pgd);
+
+    if (!pte_write(*ptep)) {
+        xen_pgd_unpin(__pa(pgd));
+        HYPERVISOR_update_va_mapping(
+            (unsigned long)pgd,
+            pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+            0);
+    }
 
     /* in the PAE case user pgd entries are overwritten before usage */
     if (PTRS_PER_PMD > 1)
@@ -369,28 +348,19 @@ void pgd_free(pgd_t *pgd)
 #ifndef CONFIG_XEN_SHADOW_MODE
 void make_lowmem_page_readonly(void *va)
 {
-    pgd_t *pgd = pgd_offset_k((unsigned long)va);
-    pud_t *pud = pud_offset(pgd, (unsigned long)va);
-    pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
-    pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
+    pte_t *pte = virt_to_ptep(va);
     set_pte(pte, pte_wrprotect(*pte));
 }
 
 void make_lowmem_page_writable(void *va)
 {
-    pgd_t *pgd = pgd_offset_k((unsigned long)va);
-    pud_t *pud = pud_offset(pgd, (unsigned long)va);
-    pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
-    pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
+    pte_t *pte = virt_to_ptep(va);
     set_pte(pte, pte_mkwrite(*pte));
 }
 
 void make_page_readonly(void *va)
 {
-    pgd_t *pgd = pgd_offset_k((unsigned long)va);
-    pud_t *pud = pud_offset(pgd, (unsigned long)va);
-    pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
-    pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
+    pte_t *pte = virt_to_ptep(va);
     set_pte(pte, pte_wrprotect(*pte));
     if ( (unsigned long)va >= (unsigned long)high_memory )
     {
@@ -405,10 +375,7 @@ void make_page_readonly(void *va)
 
 void make_page_writable(void *va)
 {
-    pgd_t *pgd = pgd_offset_k((unsigned long)va);
-    pud_t *pud = pud_offset(pgd, (unsigned long)va);
-    pmd_t *pmd = pmd_offset(pud, (unsigned long)va);
-    pte_t *pte = pte_offset_kernel(pmd, (unsigned long)va);
+    pte_t *pte = virt_to_ptep(va);
     set_pte(pte, pte_mkwrite(*pte));
     if ( (unsigned long)va >= (unsigned long)high_memory )
     {
@@ -439,3 +406,91 @@ void make_pages_writable(void *va, unsig
     }
 }
 #endif /* CONFIG_XEN_SHADOW_MODE */
+
+void mm_pin(struct mm_struct *mm)
+{
+    pgd_t *pgd;
+    struct page *page;
+    int i;
+
+    spin_lock(&mm->page_table_lock);
+
+    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
+    {
+        if ( *(unsigned long *)pgd == 0 )
+            continue;
+        page = pmd_page(*(pmd_t *)pgd);
+        if ( !PageHighMem(page) )
+            HYPERVISOR_update_va_mapping(
+                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
+                pfn_pte(page_to_pfn(page), PAGE_KERNEL_RO), 0);
+    }
+
+    HYPERVISOR_update_va_mapping(
+        (unsigned long)mm->pgd,
+        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), 0);
+    xen_pgd_pin(__pa(mm->pgd));
+
+    mm->context.pinned = 1;
+
+    spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+    pgd_t *pgd;
+    struct page *page;
+    int i;
+
+    spin_lock(&mm->page_table_lock);
+
+    xen_pgd_unpin(__pa(mm->pgd));
+    HYPERVISOR_update_va_mapping(
+        (unsigned long)mm->pgd,
+        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
+
+    for ( i = 0, pgd = mm->pgd; i < USER_PTRS_PER_PGD; i++, pgd++ )
+    {
+        if ( *(unsigned long *)pgd == 0 )
+            continue;
+        page = pmd_page(*(pmd_t *)pgd);
+        if ( !PageHighMem(page) )
+            HYPERVISOR_update_va_mapping(
+                (unsigned long)__va(page_to_pfn(page)<<PAGE_SHIFT),
+                pfn_pte(page_to_pfn(page), PAGE_KERNEL), 0);
+    }
+
+    mm->context.pinned = 0;
+
+    spin_unlock(&mm->page_table_lock);
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+    unsigned int cpu = smp_processor_id();
+    struct task_struct *tsk = current;
+
+    task_lock(tsk);
+
+    /*
+     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+     */
+    if ( tsk->active_mm == mm )
+    {
+        tsk->active_mm = &init_mm;
+        atomic_inc(&init_mm.mm_count);
+
+        cpu_set(cpu, init_mm.cpu_vm_mask);
+        load_cr3(swapper_pg_dir);
+        cpu_clear(cpu, mm->cpu_vm_mask);
+
+        atomic_dec(&mm->mm_count);
+        BUG_ON(atomic_read(&mm->mm_count) == 0);
+    }
+
+    task_unlock(tsk);
+
+    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+        mm_unpin(mm);
+}
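
Together with the mmu.h and mmu_context.h hunks further down, the pgtable.c changes above replace the old per-page pte_ctor/pte_dtor pinning with lazy pinning of a whole address space. The sketch below is not part of the changeset; it is a simplified, hand-condensed view of the lifecycle as it reads from these hunks (locking, PAE and highmem details omitted): page tables are write-protected and pinned the first time their mm is switched to, and unpinned once the address space is being torn down.

    /* Simplified sketch of the lazy pin/unpin flow introduced above. */
    static inline void switch_mm(struct mm_struct *prev,
                                 struct mm_struct *next,
                                 struct task_struct *tsk)
    {
        if (likely(prev != next)) {
            if (!next->context.pinned)
                mm_pin(next);   /* write-protect pagetables, pin pgd */
            /* ... load cr3 and flush exactly as before ... */
        }
    }

    /* Reached from exit_mmap() through the new arch_exit_mmap() hook. */
    void _arch_exit_mmap(struct mm_struct *mm)
    {
        /* drop the dying pgd from cr3 early; once we hold the last
         * reference, make the pagetables writable again */
        if (mm->context.pinned && atomic_read(&mm->mm_count) == 1)
            mm_unpin(mm);
    }
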
--- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/hypervisor.c Wed Apr 27 16:55:50 2005 +0000
@@ -258,7 +258,8 @@ unsigned long allocate_empty_lowmem_regi
         pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
         pfn_array[i] = pte->pte >> PAGE_SHIFT;
         xen_l1_entry_update(pte, 0);
-        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = (u32)INVALID_P2M_ENTRY;
+        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+            (u32)INVALID_P2M_ENTRY;
     }
 
     /* Flush updates through and flush the TLB. */
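
Both hypervisor.c hunks above fix the same indexing bug in the region-emptying loop: the old code overwrote the phys-to-machine entry of the region's first page on every iteration, so the entries for the remaining pages were never invalidated. A minimal sketch of the corrected loop, condensed from the i386 hunk (the loop bound is written as nr_pages, a placeholder, since the loop header lies outside the hunk):

    /* condensed from the region-emptying loop in the i386 hunk above */
    for ( i = 0; i < nr_pages; i++ )
    {
        pte = pte_offset_kernel(pmd, vstart + (i*PAGE_SIZE));
        pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
        HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
        /* invalidate the p2m entry of page i, not just the first page */
        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
            INVALID_P2M_ENTRY;
    }
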
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h Wed Apr 27 16:55:50 2005 +0000
@@ -0,0 +1,22 @@
+#ifndef __i386_MMU_H
+#define __i386_MMU_H
+
+#include <asm/semaphore.h>
+/*
+ * The i386 doesn't have a mmu context, but
+ * we put the segment information here.
+ *
+ * cpu_vm_mask is used to optimize ldt flushing.
+ */
+typedef struct {
+    int size;
+    struct semaphore sem;
+    void *ldt;
+    unsigned pinned:1;
+} mm_context_t;
+
+/* mm/memory.c:exit_mmap hook */
+extern void _arch_exit_mmap(struct mm_struct *mm);
+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
+
+#endif
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Wed Apr 27 16:55:50 2005 +0000
@@ -41,6 +41,9 @@ static inline void __prepare_arch_switch
         : : "r" (0) );
 }
 
+extern void mm_pin(struct mm_struct *mm);
+extern void mm_unpin(struct mm_struct *mm);
+
 static inline void switch_mm(struct mm_struct *prev,
                              struct mm_struct *next,
                              struct task_struct *tsk)
@@ -49,6 +52,9 @@ static inline void switch_mm(struct mm_s
     struct mmuext_op _op[2], *op = _op;
 
     if (likely(prev != next)) {
+        if (!next->context.pinned)
+            mm_pin(next);
+
         /* stop flush ipis for the previous mm */
         cpu_clear(cpu, prev->cpu_vm_mask);
 #if 0 /* XEN: no lazy tlb */
@@ -92,20 +98,10 @@ static inline void switch_mm(struct mm_s
 #endif
 }
 
-/*
- * XEN: We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
- * *much* faster this way, as no tlb flushes means much bigger wrpt batches.
- */
-#define deactivate_mm(tsk, mm) do { \
-    asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
-    if ((mm) && cpu_isset(smp_processor_id(), (mm)->cpu_vm_mask)) { \
-        cpu_clear(smp_processor_id(), (mm)->cpu_vm_mask); \
-        load_cr3(swapper_pg_dir); \
-    } \
-} while (0)
+#define deactivate_mm(tsk, mm) \
+    asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
 
-#define activate_mm(prev, next) do { \
-    switch_mm((prev),(next),NULL); \
-} while (0)
+#define activate_mm(prev, next) \
+    switch_mm((prev),(next),NULL)
 
 #endif
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Wed Apr 27 16:55:50 2005 +0000
@@ -11,10 +11,23 @@
 #define pmd_populate_kernel(mm, pmd, pte) \
     set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
 
-#define pmd_populate(mm, pmd, pte) \
-    set_pmd(pmd, __pmd(_PAGE_TABLE + \
-        ((unsigned long long)page_to_pfn(pte) << \
-        (unsigned long long) PAGE_SHIFT)))
+#define pmd_populate(mm, pmd, pte) \
+do { \
+    if (unlikely((mm)->context.pinned)) { \
+        if (!PageHighMem(pte)) \
+            HYPERVISOR_update_va_mapping( \
+                (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT), \
+                pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0); \
+        set_pmd(pmd, __pmd(_PAGE_TABLE + \
+            ((unsigned long long)page_to_pfn(pte) << \
+            (unsigned long long) PAGE_SHIFT))); \
+    } else { \
+        *(pmd) = __pmd(_PAGE_TABLE + \
+            ((unsigned long long)page_to_pfn(pte) << \
+            (unsigned long long) PAGE_SHIFT)); \
+    } \
+} while (0)
+
 /*
  * Allocate and free page tables.
  */
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h Wed Apr 27 16:55:50 2005 +0000
@@ -35,12 +35,9 @@ extern unsigned long empty_zero_page[102
 extern pgd_t swapper_pg_dir[1024];
 extern kmem_cache_t *pgd_cache;
 extern kmem_cache_t *pmd_cache;
-extern kmem_cache_t *pte_cache;
 extern spinlock_t pgd_lock;
 extern struct page *pgd_list;
 
-void pte_ctor(void *, kmem_cache_t *, unsigned long);
-void pte_dtor(void *, kmem_cache_t *, unsigned long);
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_dtor(void *, kmem_cache_t *, unsigned long);
@@ -448,12 +445,17 @@ void make_pages_writable(void *va, unsig
 #define make_pages_writable(_va, _nr) ((void)0)
 #endif
 
-#define arbitrary_virt_to_machine(__va) \
+#define virt_to_ptep(__va) \
 ({ \
     pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \
     pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \
     pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \
-    pte_t *__pte = pte_offset_kernel(__pmd, (unsigned long)(__va)); \
+    pte_offset_kernel(__pmd, (unsigned long)(__va)); \
+})
+
+#define arbitrary_virt_to_machine(__va) \
+({ \
+    pte_t *__pte = virt_to_ptep(__va); \
     unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK; \
     __pa | ((unsigned long)(__va) & (PAGE_SIZE-1)); \
 })
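
The pgtable.h hunk factors the kernel-virtual-address page-table walk out of arbitrary_virt_to_machine() into a reusable virt_to_ptep() macro. The snippet below only illustrates the usage pattern the rest of the changeset relies on (it mirrors the pgd_free() hunk in pgtable.c above rather than adding anything new): look up the PTE for a pinned page-table page and, if it is read-only, unpin it and restore a writable mapping before the page is freed.

    pte_t *ptep = virt_to_ptep(pgd);

    if ( !pte_write(*ptep) )
    {
        xen_pgd_unpin(__pa(pgd));
        HYPERVISOR_update_va_mapping(
            (unsigned long)pgd,
            pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
    }
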
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h Wed Apr 27 16:55:50 2005 +0000
@@ -40,24 +40,21 @@ extern unsigned long pgkern_mask;
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
-    /* XEN: cpu_vm_mask is more accurate than active_mm. */
-    if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
+    if (mm == current->active_mm)
         __flush_tlb();
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
                                   unsigned long addr)
 {
-    /* XEN: cpu_vm_mask is more accurate than active_mm. */
-    if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
+    if (vma->vm_mm == current->active_mm)
         __flush_tlb_one(addr);
 }
 
 static inline void flush_tlb_range(struct vm_area_struct *vma,
                                    unsigned long start, unsigned long end)
 {
-    /* XEN: cpu_vm_mask is more accurate than active_mm. */
-    if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
+    if (vma->vm_mm == current->active_mm)
         __flush_tlb();
 }
 
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Wed Apr 27 16:55:30 2005 +0000
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Wed Apr 27 16:55:50 2005 +0000
@@ -44,24 +44,21 @@ extern unsigned long pgkern_mask;
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
-    /* XEN: cpu_vm_mask is more accurate than active_mm. */
-    if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
+    if (mm == current->active_mm)
         __flush_tlb();
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
                                   unsigned long addr)
 {
-    /* XEN: cpu_vm_mask is more accurate than active_mm. */
-    if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
+    if (vma->vm_mm == current->active_mm)
         __flush_tlb_one(addr);
 }
 
 static inline void flush_tlb_range(struct vm_area_struct *vma,
                                    unsigned long start, unsigned long end)
 {
-    /* XEN: cpu_vm_mask is more accurate than active_mm. */
-    if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask))
+    if (vma->vm_mm == current->active_mm)
         __flush_tlb();
 }
 
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/linux-2.6.11-xen-sparse/mm/mmap.c Wed Apr 27 16:55:50 2005 +0000 14.3 @@ -0,0 +1,2108 @@ 14.4 +/* 14.5 + * mm/mmap.c 14.6 + * 14.7 + * Written by obz. 14.8 + * 14.9 + * Address space accounting code <alan@redhat.com> 14.10 + */ 14.11 + 14.12 +#include <linux/slab.h> 14.13 +#include <linux/mm.h> 14.14 +#include <linux/shm.h> 14.15 +#include <linux/mman.h> 14.16 +#include <linux/pagemap.h> 14.17 +#include <linux/swap.h> 14.18 +#include <linux/syscalls.h> 14.19 +#include <linux/init.h> 14.20 +#include <linux/file.h> 14.21 +#include <linux/fs.h> 14.22 +#include <linux/personality.h> 14.23 +#include <linux/security.h> 14.24 +#include <linux/hugetlb.h> 14.25 +#include <linux/profile.h> 14.26 +#include <linux/module.h> 14.27 +#include <linux/acct.h> 14.28 +#include <linux/mount.h> 14.29 +#include <linux/mempolicy.h> 14.30 +#include <linux/rmap.h> 14.31 + 14.32 +#include <asm/uaccess.h> 14.33 +#include <asm/cacheflush.h> 14.34 +#include <asm/tlb.h> 14.35 + 14.36 +/* 14.37 + * WARNING: the debugging will use recursive algorithms so never enable this 14.38 + * unless you know what you are doing. 14.39 + */ 14.40 +#undef DEBUG_MM_RB 14.41 + 14.42 +/* description of effects of mapping type and prot in current implementation. 14.43 + * this is due to the limited x86 page protection hardware. The expected 14.44 + * behavior is in parens: 14.45 + * 14.46 + * map_type prot 14.47 + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC 14.48 + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 14.49 + * w: (no) no w: (no) no w: (yes) yes w: (no) no 14.50 + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 14.51 + * 14.52 + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 14.53 + * w: (no) no w: (no) no w: (copy) copy w: (no) no 14.54 + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 14.55 + * 14.56 + */ 14.57 +pgprot_t protection_map[16] = { 14.58 + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, 14.59 + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 14.60 +}; 14.61 + 14.62 +int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 14.63 +int sysctl_overcommit_ratio = 50; /* default is 50% */ 14.64 +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 14.65 +atomic_t vm_committed_space = ATOMIC_INIT(0); 14.66 + 14.67 +/* 14.68 + * Check that a process has enough memory to allocate a new virtual 14.69 + * mapping. 0 means there is enough memory for the allocation to 14.70 + * succeed and -ENOMEM implies there is not. 14.71 + * 14.72 + * We currently support three overcommit policies, which are set via the 14.73 + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 14.74 + * 14.75 + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 14.76 + * Additional code 2002 Jul 20 by Robert Love. 14.77 + * 14.78 + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 14.79 + * 14.80 + * Note this is a helper function intended to be used by LSMs which 14.81 + * wish to use this logic. 
14.82 + */ 14.83 +int __vm_enough_memory(long pages, int cap_sys_admin) 14.84 +{ 14.85 + unsigned long free, allowed; 14.86 + 14.87 + vm_acct_memory(pages); 14.88 + 14.89 + /* 14.90 + * Sometimes we want to use more memory than we have 14.91 + */ 14.92 + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 14.93 + return 0; 14.94 + 14.95 + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 14.96 + unsigned long n; 14.97 + 14.98 + free = get_page_cache_size(); 14.99 + free += nr_swap_pages; 14.100 + 14.101 + /* 14.102 + * Any slabs which are created with the 14.103 + * SLAB_RECLAIM_ACCOUNT flag claim to have contents 14.104 + * which are reclaimable, under pressure. The dentry 14.105 + * cache and most inode caches should fall into this 14.106 + */ 14.107 + free += atomic_read(&slab_reclaim_pages); 14.108 + 14.109 + /* 14.110 + * Leave the last 3% for root 14.111 + */ 14.112 + if (!cap_sys_admin) 14.113 + free -= free / 32; 14.114 + 14.115 + if (free > pages) 14.116 + return 0; 14.117 + 14.118 + /* 14.119 + * nr_free_pages() is very expensive on large systems, 14.120 + * only call if we're about to fail. 14.121 + */ 14.122 + n = nr_free_pages(); 14.123 + if (!cap_sys_admin) 14.124 + n -= n / 32; 14.125 + free += n; 14.126 + 14.127 + if (free > pages) 14.128 + return 0; 14.129 + vm_unacct_memory(pages); 14.130 + return -ENOMEM; 14.131 + } 14.132 + 14.133 + allowed = (totalram_pages - hugetlb_total_pages()) 14.134 + * sysctl_overcommit_ratio / 100; 14.135 + /* 14.136 + * Leave the last 3% for root 14.137 + */ 14.138 + if (!cap_sys_admin) 14.139 + allowed -= allowed / 32; 14.140 + allowed += total_swap_pages; 14.141 + 14.142 + /* Don't let a single process grow too big: 14.143 + leave 3% of the size of this process for other processes */ 14.144 + allowed -= current->mm->total_vm / 32; 14.145 + 14.146 + if (atomic_read(&vm_committed_space) < allowed) 14.147 + return 0; 14.148 + 14.149 + vm_unacct_memory(pages); 14.150 + 14.151 + return -ENOMEM; 14.152 +} 14.153 + 14.154 +EXPORT_SYMBOL(sysctl_overcommit_memory); 14.155 +EXPORT_SYMBOL(sysctl_overcommit_ratio); 14.156 +EXPORT_SYMBOL(sysctl_max_map_count); 14.157 +EXPORT_SYMBOL(vm_committed_space); 14.158 +EXPORT_SYMBOL(__vm_enough_memory); 14.159 + 14.160 +/* 14.161 + * Requires inode->i_mapping->i_mmap_lock 14.162 + */ 14.163 +static void __remove_shared_vm_struct(struct vm_area_struct *vma, 14.164 + struct file *file, struct address_space *mapping) 14.165 +{ 14.166 + if (vma->vm_flags & VM_DENYWRITE) 14.167 + atomic_inc(&file->f_dentry->d_inode->i_writecount); 14.168 + if (vma->vm_flags & VM_SHARED) 14.169 + mapping->i_mmap_writable--; 14.170 + 14.171 + flush_dcache_mmap_lock(mapping); 14.172 + if (unlikely(vma->vm_flags & VM_NONLINEAR)) 14.173 + list_del_init(&vma->shared.vm_set.list); 14.174 + else 14.175 + vma_prio_tree_remove(vma, &mapping->i_mmap); 14.176 + flush_dcache_mmap_unlock(mapping); 14.177 +} 14.178 + 14.179 +/* 14.180 + * Remove one vm structure and free it. 
14.181 + */ 14.182 +static void remove_vm_struct(struct vm_area_struct *vma) 14.183 +{ 14.184 + struct file *file = vma->vm_file; 14.185 + 14.186 + might_sleep(); 14.187 + if (file) { 14.188 + struct address_space *mapping = file->f_mapping; 14.189 + spin_lock(&mapping->i_mmap_lock); 14.190 + __remove_shared_vm_struct(vma, file, mapping); 14.191 + spin_unlock(&mapping->i_mmap_lock); 14.192 + } 14.193 + if (vma->vm_ops && vma->vm_ops->close) 14.194 + vma->vm_ops->close(vma); 14.195 + if (file) 14.196 + fput(file); 14.197 + anon_vma_unlink(vma); 14.198 + mpol_free(vma_policy(vma)); 14.199 + kmem_cache_free(vm_area_cachep, vma); 14.200 +} 14.201 + 14.202 +/* 14.203 + * sys_brk() for the most part doesn't need the global kernel 14.204 + * lock, except when an application is doing something nasty 14.205 + * like trying to un-brk an area that has already been mapped 14.206 + * to a regular file. in this case, the unmapping will need 14.207 + * to invoke file system routines that need the global lock. 14.208 + */ 14.209 +asmlinkage unsigned long sys_brk(unsigned long brk) 14.210 +{ 14.211 + unsigned long rlim, retval; 14.212 + unsigned long newbrk, oldbrk; 14.213 + struct mm_struct *mm = current->mm; 14.214 + 14.215 + down_write(&mm->mmap_sem); 14.216 + 14.217 + if (brk < mm->end_code) 14.218 + goto out; 14.219 + newbrk = PAGE_ALIGN(brk); 14.220 + oldbrk = PAGE_ALIGN(mm->brk); 14.221 + if (oldbrk == newbrk) 14.222 + goto set_brk; 14.223 + 14.224 + /* Always allow shrinking brk. */ 14.225 + if (brk <= mm->brk) { 14.226 + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 14.227 + goto set_brk; 14.228 + goto out; 14.229 + } 14.230 + 14.231 + /* Check against rlimit.. */ 14.232 + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 14.233 + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) 14.234 + goto out; 14.235 + 14.236 + /* Check against existing mmap mappings. */ 14.237 + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) 14.238 + goto out; 14.239 + 14.240 + /* Ok, looks good - let it rip. 
*/ 14.241 + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 14.242 + goto out; 14.243 +set_brk: 14.244 + mm->brk = brk; 14.245 +out: 14.246 + retval = mm->brk; 14.247 + up_write(&mm->mmap_sem); 14.248 + return retval; 14.249 +} 14.250 + 14.251 +#ifdef DEBUG_MM_RB 14.252 +static int browse_rb(struct rb_root *root) 14.253 +{ 14.254 + int i = 0, j; 14.255 + struct rb_node *nd, *pn = NULL; 14.256 + unsigned long prev = 0, pend = 0; 14.257 + 14.258 + for (nd = rb_first(root); nd; nd = rb_next(nd)) { 14.259 + struct vm_area_struct *vma; 14.260 + vma = rb_entry(nd, struct vm_area_struct, vm_rb); 14.261 + if (vma->vm_start < prev) 14.262 + printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 14.263 + if (vma->vm_start < pend) 14.264 + printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 14.265 + if (vma->vm_start > vma->vm_end) 14.266 + printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 14.267 + i++; 14.268 + pn = nd; 14.269 + } 14.270 + j = 0; 14.271 + for (nd = pn; nd; nd = rb_prev(nd)) { 14.272 + j++; 14.273 + } 14.274 + if (i != j) 14.275 + printk("backwards %d, forwards %d\n", j, i), i = 0; 14.276 + return i; 14.277 +} 14.278 + 14.279 +void validate_mm(struct mm_struct *mm) 14.280 +{ 14.281 + int bug = 0; 14.282 + int i = 0; 14.283 + struct vm_area_struct *tmp = mm->mmap; 14.284 + while (tmp) { 14.285 + tmp = tmp->vm_next; 14.286 + i++; 14.287 + } 14.288 + if (i != mm->map_count) 14.289 + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 14.290 + i = browse_rb(&mm->mm_rb); 14.291 + if (i != mm->map_count) 14.292 + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 14.293 + if (bug) 14.294 + BUG(); 14.295 +} 14.296 +#else 14.297 +#define validate_mm(mm) do { } while (0) 14.298 +#endif 14.299 + 14.300 +static struct vm_area_struct * 14.301 +find_vma_prepare(struct mm_struct *mm, unsigned long addr, 14.302 + struct vm_area_struct **pprev, struct rb_node ***rb_link, 14.303 + struct rb_node ** rb_parent) 14.304 +{ 14.305 + struct vm_area_struct * vma; 14.306 + struct rb_node ** __rb_link, * __rb_parent, * rb_prev; 14.307 + 14.308 + __rb_link = &mm->mm_rb.rb_node; 14.309 + rb_prev = __rb_parent = NULL; 14.310 + vma = NULL; 14.311 + 14.312 + while (*__rb_link) { 14.313 + struct vm_area_struct *vma_tmp; 14.314 + 14.315 + __rb_parent = *__rb_link; 14.316 + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 14.317 + 14.318 + if (vma_tmp->vm_end > addr) { 14.319 + vma = vma_tmp; 14.320 + if (vma_tmp->vm_start <= addr) 14.321 + return vma; 14.322 + __rb_link = &__rb_parent->rb_left; 14.323 + } else { 14.324 + rb_prev = __rb_parent; 14.325 + __rb_link = &__rb_parent->rb_right; 14.326 + } 14.327 + } 14.328 + 14.329 + *pprev = NULL; 14.330 + if (rb_prev) 14.331 + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 14.332 + *rb_link = __rb_link; 14.333 + *rb_parent = __rb_parent; 14.334 + return vma; 14.335 +} 14.336 + 14.337 +static inline void 14.338 +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 14.339 + struct vm_area_struct *prev, struct rb_node *rb_parent) 14.340 +{ 14.341 + if (prev) { 14.342 + vma->vm_next = prev->vm_next; 14.343 + prev->vm_next = vma; 14.344 + } else { 14.345 + mm->mmap = vma; 14.346 + if (rb_parent) 14.347 + vma->vm_next = rb_entry(rb_parent, 14.348 + struct vm_area_struct, vm_rb); 14.349 + else 14.350 + vma->vm_next = NULL; 14.351 + } 14.352 +} 14.353 + 14.354 +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 14.355 + struct rb_node **rb_link, struct rb_node *rb_parent) 
14.356 +{ 14.357 + rb_link_node(&vma->vm_rb, rb_parent, rb_link); 14.358 + rb_insert_color(&vma->vm_rb, &mm->mm_rb); 14.359 +} 14.360 + 14.361 +static inline void __vma_link_file(struct vm_area_struct *vma) 14.362 +{ 14.363 + struct file * file; 14.364 + 14.365 + file = vma->vm_file; 14.366 + if (file) { 14.367 + struct address_space *mapping = file->f_mapping; 14.368 + 14.369 + if (vma->vm_flags & VM_DENYWRITE) 14.370 + atomic_dec(&file->f_dentry->d_inode->i_writecount); 14.371 + if (vma->vm_flags & VM_SHARED) 14.372 + mapping->i_mmap_writable++; 14.373 + 14.374 + flush_dcache_mmap_lock(mapping); 14.375 + if (unlikely(vma->vm_flags & VM_NONLINEAR)) 14.376 + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 14.377 + else 14.378 + vma_prio_tree_insert(vma, &mapping->i_mmap); 14.379 + flush_dcache_mmap_unlock(mapping); 14.380 + } 14.381 +} 14.382 + 14.383 +static void 14.384 +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 14.385 + struct vm_area_struct *prev, struct rb_node **rb_link, 14.386 + struct rb_node *rb_parent) 14.387 +{ 14.388 + __vma_link_list(mm, vma, prev, rb_parent); 14.389 + __vma_link_rb(mm, vma, rb_link, rb_parent); 14.390 + __anon_vma_link(vma); 14.391 +} 14.392 + 14.393 +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 14.394 + struct vm_area_struct *prev, struct rb_node **rb_link, 14.395 + struct rb_node *rb_parent) 14.396 +{ 14.397 + struct address_space *mapping = NULL; 14.398 + 14.399 + if (vma->vm_file) 14.400 + mapping = vma->vm_file->f_mapping; 14.401 + 14.402 + if (mapping) { 14.403 + spin_lock(&mapping->i_mmap_lock); 14.404 + vma->vm_truncate_count = mapping->truncate_count; 14.405 + } 14.406 + anon_vma_lock(vma); 14.407 + 14.408 + __vma_link(mm, vma, prev, rb_link, rb_parent); 14.409 + __vma_link_file(vma); 14.410 + 14.411 + anon_vma_unlock(vma); 14.412 + if (mapping) 14.413 + spin_unlock(&mapping->i_mmap_lock); 14.414 + 14.415 + mm->map_count++; 14.416 + validate_mm(mm); 14.417 +} 14.418 + 14.419 +/* 14.420 + * Helper for vma_adjust in the split_vma insert case: 14.421 + * insert vm structure into list and rbtree and anon_vma, 14.422 + * but it has already been inserted into prio_tree earlier. 14.423 + */ 14.424 +static void 14.425 +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 14.426 +{ 14.427 + struct vm_area_struct * __vma, * prev; 14.428 + struct rb_node ** rb_link, * rb_parent; 14.429 + 14.430 + __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 14.431 + if (__vma && __vma->vm_start < vma->vm_end) 14.432 + BUG(); 14.433 + __vma_link(mm, vma, prev, rb_link, rb_parent); 14.434 + mm->map_count++; 14.435 +} 14.436 + 14.437 +static inline void 14.438 +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 14.439 + struct vm_area_struct *prev) 14.440 +{ 14.441 + prev->vm_next = vma->vm_next; 14.442 + rb_erase(&vma->vm_rb, &mm->mm_rb); 14.443 + if (mm->mmap_cache == vma) 14.444 + mm->mmap_cache = prev; 14.445 +} 14.446 + 14.447 +/* 14.448 + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 14.449 + * is already present in an i_mmap tree without adjusting the tree. 14.450 + * The following helper function should be used when such adjustments 14.451 + * are necessary. The "insert" vma (if any) is to be inserted 14.452 + * before we drop the necessary locks. 
14.453 + */ 14.454 +void vma_adjust(struct vm_area_struct *vma, unsigned long start, 14.455 + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 14.456 +{ 14.457 + struct mm_struct *mm = vma->vm_mm; 14.458 + struct vm_area_struct *next = vma->vm_next; 14.459 + struct vm_area_struct *importer = NULL; 14.460 + struct address_space *mapping = NULL; 14.461 + struct prio_tree_root *root = NULL; 14.462 + struct file *file = vma->vm_file; 14.463 + struct anon_vma *anon_vma = NULL; 14.464 + long adjust_next = 0; 14.465 + int remove_next = 0; 14.466 + 14.467 + if (next && !insert) { 14.468 + if (end >= next->vm_end) { 14.469 + /* 14.470 + * vma expands, overlapping all the next, and 14.471 + * perhaps the one after too (mprotect case 6). 14.472 + */ 14.473 +again: remove_next = 1 + (end > next->vm_end); 14.474 + end = next->vm_end; 14.475 + anon_vma = next->anon_vma; 14.476 + importer = vma; 14.477 + } else if (end > next->vm_start) { 14.478 + /* 14.479 + * vma expands, overlapping part of the next: 14.480 + * mprotect case 5 shifting the boundary up. 14.481 + */ 14.482 + adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 14.483 + anon_vma = next->anon_vma; 14.484 + importer = vma; 14.485 + } else if (end < vma->vm_end) { 14.486 + /* 14.487 + * vma shrinks, and !insert tells it's not 14.488 + * split_vma inserting another: so it must be 14.489 + * mprotect case 4 shifting the boundary down. 14.490 + */ 14.491 + adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 14.492 + anon_vma = next->anon_vma; 14.493 + importer = next; 14.494 + } 14.495 + } 14.496 + 14.497 + if (file) { 14.498 + mapping = file->f_mapping; 14.499 + if (!(vma->vm_flags & VM_NONLINEAR)) 14.500 + root = &mapping->i_mmap; 14.501 + spin_lock(&mapping->i_mmap_lock); 14.502 + if (importer && 14.503 + vma->vm_truncate_count != next->vm_truncate_count) { 14.504 + /* 14.505 + * unmap_mapping_range might be in progress: 14.506 + * ensure that the expanding vma is rescanned. 14.507 + */ 14.508 + importer->vm_truncate_count = 0; 14.509 + } 14.510 + if (insert) { 14.511 + insert->vm_truncate_count = vma->vm_truncate_count; 14.512 + /* 14.513 + * Put into prio_tree now, so instantiated pages 14.514 + * are visible to arm/parisc __flush_dcache_page 14.515 + * throughout; but we cannot insert into address 14.516 + * space until vma start or end is updated. 14.517 + */ 14.518 + __vma_link_file(insert); 14.519 + } 14.520 + } 14.521 + 14.522 + /* 14.523 + * When changing only vma->vm_end, we don't really need 14.524 + * anon_vma lock: but is that case worth optimizing out? 14.525 + */ 14.526 + if (vma->anon_vma) 14.527 + anon_vma = vma->anon_vma; 14.528 + if (anon_vma) { 14.529 + spin_lock(&anon_vma->lock); 14.530 + /* 14.531 + * Easily overlooked: when mprotect shifts the boundary, 14.532 + * make sure the expanding vma has anon_vma set if the 14.533 + * shrinking vma had, to cover any anon pages imported. 
14.534 + */ 14.535 + if (importer && !importer->anon_vma) { 14.536 + importer->anon_vma = anon_vma; 14.537 + __anon_vma_link(importer); 14.538 + } 14.539 + } 14.540 + 14.541 + if (root) { 14.542 + flush_dcache_mmap_lock(mapping); 14.543 + vma_prio_tree_remove(vma, root); 14.544 + if (adjust_next) 14.545 + vma_prio_tree_remove(next, root); 14.546 + } 14.547 + 14.548 + vma->vm_start = start; 14.549 + vma->vm_end = end; 14.550 + vma->vm_pgoff = pgoff; 14.551 + if (adjust_next) { 14.552 + next->vm_start += adjust_next << PAGE_SHIFT; 14.553 + next->vm_pgoff += adjust_next; 14.554 + } 14.555 + 14.556 + if (root) { 14.557 + if (adjust_next) 14.558 + vma_prio_tree_insert(next, root); 14.559 + vma_prio_tree_insert(vma, root); 14.560 + flush_dcache_mmap_unlock(mapping); 14.561 + } 14.562 + 14.563 + if (remove_next) { 14.564 + /* 14.565 + * vma_merge has merged next into vma, and needs 14.566 + * us to remove next before dropping the locks. 14.567 + */ 14.568 + __vma_unlink(mm, next, vma); 14.569 + if (file) 14.570 + __remove_shared_vm_struct(next, file, mapping); 14.571 + if (next->anon_vma) 14.572 + __anon_vma_merge(vma, next); 14.573 + } else if (insert) { 14.574 + /* 14.575 + * split_vma has split insert from vma, and needs 14.576 + * us to insert it before dropping the locks 14.577 + * (it may either follow vma or precede it). 14.578 + */ 14.579 + __insert_vm_struct(mm, insert); 14.580 + } 14.581 + 14.582 + if (anon_vma) 14.583 + spin_unlock(&anon_vma->lock); 14.584 + if (mapping) 14.585 + spin_unlock(&mapping->i_mmap_lock); 14.586 + 14.587 + if (remove_next) { 14.588 + if (file) 14.589 + fput(file); 14.590 + mm->map_count--; 14.591 + mpol_free(vma_policy(next)); 14.592 + kmem_cache_free(vm_area_cachep, next); 14.593 + /* 14.594 + * In mprotect's case 6 (see comments on vma_merge), 14.595 + * we must remove another next too. It would clutter 14.596 + * up the code too much to do both in one go. 14.597 + */ 14.598 + if (remove_next == 2) { 14.599 + next = vma->vm_next; 14.600 + goto again; 14.601 + } 14.602 + } 14.603 + 14.604 + validate_mm(mm); 14.605 +} 14.606 + 14.607 +/* 14.608 + * If the vma has a ->close operation then the driver probably needs to release 14.609 + * per-vma resources, so we don't attempt to merge those. 14.610 + */ 14.611 +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) 14.612 + 14.613 +static inline int is_mergeable_vma(struct vm_area_struct *vma, 14.614 + struct file *file, unsigned long vm_flags) 14.615 +{ 14.616 + if (vma->vm_flags != vm_flags) 14.617 + return 0; 14.618 + if (vma->vm_file != file) 14.619 + return 0; 14.620 + if (vma->vm_ops && vma->vm_ops->close) 14.621 + return 0; 14.622 + return 1; 14.623 +} 14.624 + 14.625 +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 14.626 + struct anon_vma *anon_vma2) 14.627 +{ 14.628 + return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); 14.629 +} 14.630 + 14.631 +/* 14.632 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 14.633 + * in front of (at a lower virtual address and file offset than) the vma. 14.634 + * 14.635 + * We cannot merge two vmas if they have differently assigned (non-NULL) 14.636 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 14.637 + * 14.638 + * We don't check here for the merged mmap wrapping around the end of pagecache 14.639 + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which 14.640 + * wrap, nor mmaps which cover the final page at index -1UL. 
14.641 + */ 14.642 +static int 14.643 +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 14.644 + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 14.645 +{ 14.646 + if (is_mergeable_vma(vma, file, vm_flags) && 14.647 + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 14.648 + if (vma->vm_pgoff == vm_pgoff) 14.649 + return 1; 14.650 + } 14.651 + return 0; 14.652 +} 14.653 + 14.654 +/* 14.655 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 14.656 + * beyond (at a higher virtual address and file offset than) the vma. 14.657 + * 14.658 + * We cannot merge two vmas if they have differently assigned (non-NULL) 14.659 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 14.660 + */ 14.661 +static int 14.662 +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 14.663 + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 14.664 +{ 14.665 + if (is_mergeable_vma(vma, file, vm_flags) && 14.666 + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 14.667 + pgoff_t vm_pglen; 14.668 + vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 14.669 + if (vma->vm_pgoff + vm_pglen == vm_pgoff) 14.670 + return 1; 14.671 + } 14.672 + return 0; 14.673 +} 14.674 + 14.675 +/* 14.676 + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out 14.677 + * whether that can be merged with its predecessor or its successor. 14.678 + * Or both (it neatly fills a hole). 14.679 + * 14.680 + * In most cases - when called for mmap, brk or mremap - [addr,end) is 14.681 + * certain not to be mapped by the time vma_merge is called; but when 14.682 + * called for mprotect, it is certain to be already mapped (either at 14.683 + * an offset within prev, or at the start of next), and the flags of 14.684 + * this area are about to be changed to vm_flags - and the no-change 14.685 + * case has already been eliminated. 14.686 + * 14.687 + * The following mprotect cases have to be considered, where AAAA is 14.688 + * the area passed down from mprotect_fixup, never extending beyond one 14.689 + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: 14.690 + * 14.691 + * AAAA AAAA AAAA AAAA 14.692 + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX 14.693 + * cannot merge might become might become might become 14.694 + * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 14.695 + * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 14.696 + * mremap move: PPPPNNNNNNNN 8 14.697 + * AAAA 14.698 + * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 14.699 + * might become case 1 below case 2 below case 3 below 14.700 + * 14.701 + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 14.702 + * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 14.703 + */ 14.704 +struct vm_area_struct *vma_merge(struct mm_struct *mm, 14.705 + struct vm_area_struct *prev, unsigned long addr, 14.706 + unsigned long end, unsigned long vm_flags, 14.707 + struct anon_vma *anon_vma, struct file *file, 14.708 + pgoff_t pgoff, struct mempolicy *policy) 14.709 +{ 14.710 + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 14.711 + struct vm_area_struct *area, *next; 14.712 + 14.713 + /* 14.714 + * We later require that vma->vm_flags == vm_flags, 14.715 + * so this tests vma->vm_flags & VM_SPECIAL, too. 
14.716 + */ 14.717 + if (vm_flags & VM_SPECIAL) 14.718 + return NULL; 14.719 + 14.720 + if (prev) 14.721 + next = prev->vm_next; 14.722 + else 14.723 + next = mm->mmap; 14.724 + area = next; 14.725 + if (next && next->vm_end == end) /* cases 6, 7, 8 */ 14.726 + next = next->vm_next; 14.727 + 14.728 + /* 14.729 + * Can it merge with the predecessor? 14.730 + */ 14.731 + if (prev && prev->vm_end == addr && 14.732 + mpol_equal(vma_policy(prev), policy) && 14.733 + can_vma_merge_after(prev, vm_flags, 14.734 + anon_vma, file, pgoff)) { 14.735 + /* 14.736 + * OK, it can. Can we now merge in the successor as well? 14.737 + */ 14.738 + if (next && end == next->vm_start && 14.739 + mpol_equal(policy, vma_policy(next)) && 14.740 + can_vma_merge_before(next, vm_flags, 14.741 + anon_vma, file, pgoff+pglen) && 14.742 + is_mergeable_anon_vma(prev->anon_vma, 14.743 + next->anon_vma)) { 14.744 + /* cases 1, 6 */ 14.745 + vma_adjust(prev, prev->vm_start, 14.746 + next->vm_end, prev->vm_pgoff, NULL); 14.747 + } else /* cases 2, 5, 7 */ 14.748 + vma_adjust(prev, prev->vm_start, 14.749 + end, prev->vm_pgoff, NULL); 14.750 + return prev; 14.751 + } 14.752 + 14.753 + /* 14.754 + * Can this new request be merged in front of next? 14.755 + */ 14.756 + if (next && end == next->vm_start && 14.757 + mpol_equal(policy, vma_policy(next)) && 14.758 + can_vma_merge_before(next, vm_flags, 14.759 + anon_vma, file, pgoff+pglen)) { 14.760 + if (prev && addr < prev->vm_end) /* case 4 */ 14.761 + vma_adjust(prev, prev->vm_start, 14.762 + addr, prev->vm_pgoff, NULL); 14.763 + else /* cases 3, 8 */ 14.764 + vma_adjust(area, addr, next->vm_end, 14.765 + next->vm_pgoff - pglen, NULL); 14.766 + return area; 14.767 + } 14.768 + 14.769 + return NULL; 14.770 +} 14.771 + 14.772 +/* 14.773 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check 14.774 + * neighbouring vmas for a suitable anon_vma, before it goes off 14.775 + * to allocate a new anon_vma. It checks because a repetitive 14.776 + * sequence of mprotects and faults may otherwise lead to distinct 14.777 + * anon_vmas being allocated, preventing vma merge in subsequent 14.778 + * mprotect. 14.779 + */ 14.780 +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 14.781 +{ 14.782 + struct vm_area_struct *near; 14.783 + unsigned long vm_flags; 14.784 + 14.785 + near = vma->vm_next; 14.786 + if (!near) 14.787 + goto try_prev; 14.788 + 14.789 + /* 14.790 + * Since only mprotect tries to remerge vmas, match flags 14.791 + * which might be mprotected into each other later on. 14.792 + * Neither mlock nor madvise tries to remerge at present, 14.793 + * so leave their flags as obstructing a merge. 14.794 + */ 14.795 + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); 14.796 + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); 14.797 + 14.798 + if (near->anon_vma && vma->vm_end == near->vm_start && 14.799 + mpol_equal(vma_policy(vma), vma_policy(near)) && 14.800 + can_vma_merge_before(near, vm_flags, 14.801 + NULL, vma->vm_file, vma->vm_pgoff + 14.802 + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) 14.803 + return near->anon_vma; 14.804 +try_prev: 14.805 + /* 14.806 + * It is potentially slow to have to call find_vma_prev here. 14.807 + * But it's only on the first write fault on the vma, not 14.808 + * every time, and we could devise a way to avoid it later 14.809 + * (e.g. stash info in next's anon_vma_node when assigning 14.810 + * an anon_vma, or when trying vma_merge). Another time. 
14.811 + */ 14.812 + if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma) 14.813 + BUG(); 14.814 + if (!near) 14.815 + goto none; 14.816 + 14.817 + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); 14.818 + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); 14.819 + 14.820 + if (near->anon_vma && near->vm_end == vma->vm_start && 14.821 + mpol_equal(vma_policy(near), vma_policy(vma)) && 14.822 + can_vma_merge_after(near, vm_flags, 14.823 + NULL, vma->vm_file, vma->vm_pgoff)) 14.824 + return near->anon_vma; 14.825 +none: 14.826 + /* 14.827 + * There's no absolute need to look only at touching neighbours: 14.828 + * we could search further afield for "compatible" anon_vmas. 14.829 + * But it would probably just be a waste of time searching, 14.830 + * or lead to too many vmas hanging off the same anon_vma. 14.831 + * We're trying to allow mprotect remerging later on, 14.832 + * not trying to minimize memory used for anon_vmas. 14.833 + */ 14.834 + return NULL; 14.835 +} 14.836 + 14.837 +#ifdef CONFIG_PROC_FS 14.838 +void __vm_stat_account(struct mm_struct *mm, unsigned long flags, 14.839 + struct file *file, long pages) 14.840 +{ 14.841 + const unsigned long stack_flags 14.842 + = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 14.843 + 14.844 +#ifdef CONFIG_HUGETLB 14.845 + if (flags & VM_HUGETLB) { 14.846 + if (!(flags & VM_DONTCOPY)) 14.847 + mm->shared_vm += pages; 14.848 + return; 14.849 + } 14.850 +#endif /* CONFIG_HUGETLB */ 14.851 + 14.852 + if (file) { 14.853 + mm->shared_vm += pages; 14.854 + if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 14.855 + mm->exec_vm += pages; 14.856 + } else if (flags & stack_flags) 14.857 + mm->stack_vm += pages; 14.858 + if (flags & (VM_RESERVED|VM_IO)) 14.859 + mm->reserved_vm += pages; 14.860 +} 14.861 +#endif /* CONFIG_PROC_FS */ 14.862 + 14.863 +/* 14.864 + * The caller must hold down_write(current->mm->mmap_sem). 14.865 + */ 14.866 + 14.867 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, 14.868 + unsigned long len, unsigned long prot, 14.869 + unsigned long flags, unsigned long pgoff) 14.870 +{ 14.871 + struct mm_struct * mm = current->mm; 14.872 + struct vm_area_struct * vma, * prev; 14.873 + struct inode *inode; 14.874 + unsigned int vm_flags; 14.875 + int correct_wcount = 0; 14.876 + int error; 14.877 + struct rb_node ** rb_link, * rb_parent; 14.878 + int accountable = 1; 14.879 + unsigned long charged = 0; 14.880 + 14.881 + if (file) { 14.882 + if (is_file_hugepages(file)) 14.883 + accountable = 0; 14.884 + 14.885 + if (!file->f_op || !file->f_op->mmap) 14.886 + return -ENODEV; 14.887 + 14.888 + if ((prot & PROT_EXEC) && 14.889 + (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) 14.890 + return -EPERM; 14.891 + } 14.892 + /* 14.893 + * Does the application expect PROT_READ to imply PROT_EXEC? 14.894 + * 14.895 + * (the exception is when the underlying filesystem is noexec 14.896 + * mounted, in which case we dont add PROT_EXEC.) 14.897 + */ 14.898 + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 14.899 + if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))) 14.900 + prot |= PROT_EXEC; 14.901 + 14.902 + if (!len) 14.903 + return addr; 14.904 + 14.905 + /* Careful about overflows.. */ 14.906 + len = PAGE_ALIGN(len); 14.907 + if (!len || len > TASK_SIZE) 14.908 + return -EINVAL; 14.909 + 14.910 + /* offset overflow? */ 14.911 + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 14.912 + return -EINVAL; 14.913 + 14.914 + /* Too many mappings? 
*/ 14.915 + if (mm->map_count > sysctl_max_map_count) 14.916 + return -ENOMEM; 14.917 + 14.918 + /* Obtain the address to map to. we verify (or select) it and ensure 14.919 + * that it represents a valid section of the address space. 14.920 + */ 14.921 + addr = get_unmapped_area(file, addr, len, pgoff, flags); 14.922 + if (addr & ~PAGE_MASK) 14.923 + return addr; 14.924 + 14.925 + /* Do simple checking here so the lower-level routines won't have 14.926 + * to. we assume access permissions have been handled by the open 14.927 + * of the memory object, so we don't do any here. 14.928 + */ 14.929 + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 14.930 + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 14.931 + 14.932 + if (flags & MAP_LOCKED) { 14.933 + if (!can_do_mlock()) 14.934 + return -EPERM; 14.935 + vm_flags |= VM_LOCKED; 14.936 + } 14.937 + /* mlock MCL_FUTURE? */ 14.938 + if (vm_flags & VM_LOCKED) { 14.939 + unsigned long locked, lock_limit; 14.940 + locked = mm->locked_vm << PAGE_SHIFT; 14.941 + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 14.942 + locked += len; 14.943 + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 14.944 + return -EAGAIN; 14.945 + } 14.946 + 14.947 + inode = file ? file->f_dentry->d_inode : NULL; 14.948 + 14.949 + if (file) { 14.950 + switch (flags & MAP_TYPE) { 14.951 + case MAP_SHARED: 14.952 + if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 14.953 + return -EACCES; 14.954 + 14.955 + /* 14.956 + * Make sure we don't allow writing to an append-only 14.957 + * file.. 14.958 + */ 14.959 + if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 14.960 + return -EACCES; 14.961 + 14.962 + /* 14.963 + * Make sure there are no mandatory locks on the file. 14.964 + */ 14.965 + if (locks_verify_locked(inode)) 14.966 + return -EAGAIN; 14.967 + 14.968 + vm_flags |= VM_SHARED | VM_MAYSHARE; 14.969 + if (!(file->f_mode & FMODE_WRITE)) 14.970 + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 14.971 + 14.972 + /* fall through */ 14.973 + case MAP_PRIVATE: 14.974 + if (!(file->f_mode & FMODE_READ)) 14.975 + return -EACCES; 14.976 + break; 14.977 + 14.978 + default: 14.979 + return -EINVAL; 14.980 + } 14.981 + } else { 14.982 + switch (flags & MAP_TYPE) { 14.983 + case MAP_SHARED: 14.984 + vm_flags |= VM_SHARED | VM_MAYSHARE; 14.985 + break; 14.986 + case MAP_PRIVATE: 14.987 + /* 14.988 + * Set pgoff according to addr for anon_vma. 14.989 + */ 14.990 + pgoff = addr >> PAGE_SHIFT; 14.991 + break; 14.992 + default: 14.993 + return -EINVAL; 14.994 + } 14.995 + } 14.996 + 14.997 + error = security_file_mmap(file, prot, flags); 14.998 + if (error) 14.999 + return error; 14.1000 + 14.1001 + /* Clear old maps */ 14.1002 + error = -ENOMEM; 14.1003 +munmap_back: 14.1004 + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 14.1005 + if (vma && vma->vm_start < addr + len) { 14.1006 + if (do_munmap(mm, addr, len)) 14.1007 + return -ENOMEM; 14.1008 + goto munmap_back; 14.1009 + } 14.1010 + 14.1011 + /* Check against address space limit. */ 14.1012 + if ((mm->total_vm << PAGE_SHIFT) + len 14.1013 + > current->signal->rlim[RLIMIT_AS].rlim_cur) 14.1014 + return -ENOMEM; 14.1015 + 14.1016 + if (accountable && (!(flags & MAP_NORESERVE) || 14.1017 + sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 14.1018 + if (vm_flags & VM_SHARED) { 14.1019 + /* Check memory availability in shmem_file_setup? 
*/ 14.1020 + vm_flags |= VM_ACCOUNT; 14.1021 + } else if (vm_flags & VM_WRITE) { 14.1022 + /* 14.1023 + * Private writable mapping: check memory availability 14.1024 + */ 14.1025 + charged = len >> PAGE_SHIFT; 14.1026 + if (security_vm_enough_memory(charged)) 14.1027 + return -ENOMEM; 14.1028 + vm_flags |= VM_ACCOUNT; 14.1029 + } 14.1030 + } 14.1031 + 14.1032 + /* 14.1033 + * Can we just expand an old private anonymous mapping? 14.1034 + * The VM_SHARED test is necessary because shmem_zero_setup 14.1035 + * will create the file object for a shared anonymous map below. 14.1036 + */ 14.1037 + if (!file && !(vm_flags & VM_SHARED) && 14.1038 + vma_merge(mm, prev, addr, addr + len, vm_flags, 14.1039 + NULL, NULL, pgoff, NULL)) 14.1040 + goto out; 14.1041 + 14.1042 + /* 14.1043 + * Determine the object being mapped and call the appropriate 14.1044 + * specific mapper. the address has already been validated, but 14.1045 + * not unmapped, but the maps are removed from the list. 14.1046 + */ 14.1047 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 14.1048 + if (!vma) { 14.1049 + error = -ENOMEM; 14.1050 + goto unacct_error; 14.1051 + } 14.1052 + memset(vma, 0, sizeof(*vma)); 14.1053 + 14.1054 + vma->vm_mm = mm; 14.1055 + vma->vm_start = addr; 14.1056 + vma->vm_end = addr + len; 14.1057 + vma->vm_flags = vm_flags; 14.1058 + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; 14.1059 + vma->vm_pgoff = pgoff; 14.1060 + 14.1061 + if (file) { 14.1062 + error = -EINVAL; 14.1063 + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 14.1064 + goto free_vma; 14.1065 + if (vm_flags & VM_DENYWRITE) { 14.1066 + error = deny_write_access(file); 14.1067 + if (error) 14.1068 + goto free_vma; 14.1069 + correct_wcount = 1; 14.1070 + } 14.1071 + vma->vm_file = file; 14.1072 + get_file(file); 14.1073 + error = file->f_op->mmap(file, vma); 14.1074 + if (error) 14.1075 + goto unmap_and_free_vma; 14.1076 + } else if (vm_flags & VM_SHARED) { 14.1077 + error = shmem_zero_setup(vma); 14.1078 + if (error) 14.1079 + goto free_vma; 14.1080 + } 14.1081 + 14.1082 + /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 14.1083 + * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 14.1084 + * that memory reservation must be checked; but that reservation 14.1085 + * belongs to shared memory object, not to vma: so now clear it. 14.1086 + */ 14.1087 + if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) 14.1088 + vma->vm_flags &= ~VM_ACCOUNT; 14.1089 + 14.1090 + /* Can addr have changed?? 14.1091 + * 14.1092 + * Answer: Yes, several device drivers can do it in their 14.1093 + * f_op->mmap method. 
-DaveM 14.1094 + */ 14.1095 + addr = vma->vm_start; 14.1096 + pgoff = vma->vm_pgoff; 14.1097 + vm_flags = vma->vm_flags; 14.1098 + 14.1099 + if (!file || !vma_merge(mm, prev, addr, vma->vm_end, 14.1100 + vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 14.1101 + file = vma->vm_file; 14.1102 + vma_link(mm, vma, prev, rb_link, rb_parent); 14.1103 + if (correct_wcount) 14.1104 + atomic_inc(&inode->i_writecount); 14.1105 + } else { 14.1106 + if (file) { 14.1107 + if (correct_wcount) 14.1108 + atomic_inc(&inode->i_writecount); 14.1109 + fput(file); 14.1110 + } 14.1111 + mpol_free(vma_policy(vma)); 14.1112 + kmem_cache_free(vm_area_cachep, vma); 14.1113 + } 14.1114 +out: 14.1115 + mm->total_vm += len >> PAGE_SHIFT; 14.1116 + __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 14.1117 + if (vm_flags & VM_LOCKED) { 14.1118 + mm->locked_vm += len >> PAGE_SHIFT; 14.1119 + make_pages_present(addr, addr + len); 14.1120 + } 14.1121 + if (flags & MAP_POPULATE) { 14.1122 + up_write(&mm->mmap_sem); 14.1123 + sys_remap_file_pages(addr, len, 0, 14.1124 + pgoff, flags & MAP_NONBLOCK); 14.1125 + down_write(&mm->mmap_sem); 14.1126 + } 14.1127 + acct_update_integrals(); 14.1128 + update_mem_hiwater(); 14.1129 + return addr; 14.1130 + 14.1131 +unmap_and_free_vma: 14.1132 + if (correct_wcount) 14.1133 + atomic_inc(&inode->i_writecount); 14.1134 + vma->vm_file = NULL; 14.1135 + fput(file); 14.1136 + 14.1137 + /* Undo any partial mapping done by a device driver. */ 14.1138 + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); 14.1139 +free_vma: 14.1140 + kmem_cache_free(vm_area_cachep, vma); 14.1141 +unacct_error: 14.1142 + if (charged) 14.1143 + vm_unacct_memory(charged); 14.1144 + return error; 14.1145 +} 14.1146 + 14.1147 +EXPORT_SYMBOL(do_mmap_pgoff); 14.1148 + 14.1149 +/* Get an address range which is currently unmapped. 14.1150 + * For shmat() with addr=0. 14.1151 + * 14.1152 + * Ugly calling convention alert: 14.1153 + * Return value with the low bits set means error value, 14.1154 + * ie 14.1155 + * if (ret & ~PAGE_MASK) 14.1156 + * error = ret; 14.1157 + * 14.1158 + * This function "knows" that -ENOMEM has the bits set. 14.1159 + */ 14.1160 +#ifndef HAVE_ARCH_UNMAPPED_AREA 14.1161 +unsigned long 14.1162 +arch_get_unmapped_area(struct file *filp, unsigned long addr, 14.1163 + unsigned long len, unsigned long pgoff, unsigned long flags) 14.1164 +{ 14.1165 + struct mm_struct *mm = current->mm; 14.1166 + struct vm_area_struct *vma; 14.1167 + unsigned long start_addr; 14.1168 + 14.1169 + if (len > TASK_SIZE) 14.1170 + return -ENOMEM; 14.1171 + 14.1172 + if (addr) { 14.1173 + addr = PAGE_ALIGN(addr); 14.1174 + vma = find_vma(mm, addr); 14.1175 + if (TASK_SIZE - len >= addr && 14.1176 + (!vma || addr + len <= vma->vm_start)) 14.1177 + return addr; 14.1178 + } 14.1179 + start_addr = addr = mm->free_area_cache; 14.1180 + 14.1181 +full_search: 14.1182 + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 14.1183 + /* At this point: (!vma || addr < vma->vm_end). */ 14.1184 + if (TASK_SIZE - len < addr) { 14.1185 + /* 14.1186 + * Start a new search - just in case we missed 14.1187 + * some holes. 
14.1188 + */ 14.1189 + if (start_addr != TASK_UNMAPPED_BASE) { 14.1190 + start_addr = addr = TASK_UNMAPPED_BASE; 14.1191 + goto full_search; 14.1192 + } 14.1193 + return -ENOMEM; 14.1194 + } 14.1195 + if (!vma || addr + len <= vma->vm_start) { 14.1196 + /* 14.1197 + * Remember the place where we stopped the search: 14.1198 + */ 14.1199 + mm->free_area_cache = addr + len; 14.1200 + return addr; 14.1201 + } 14.1202 + addr = vma->vm_end; 14.1203 + } 14.1204 +} 14.1205 +#endif 14.1206 + 14.1207 +void arch_unmap_area(struct vm_area_struct *area) 14.1208 +{ 14.1209 + /* 14.1210 + * Is this a new hole at the lowest possible address? 14.1211 + */ 14.1212 + if (area->vm_start >= TASK_UNMAPPED_BASE && 14.1213 + area->vm_start < area->vm_mm->free_area_cache) 14.1214 + area->vm_mm->free_area_cache = area->vm_start; 14.1215 +} 14.1216 + 14.1217 +/* 14.1218 + * This mmap-allocator allocates new areas top-down from below the 14.1219 + * stack's low limit (the base): 14.1220 + */ 14.1221 +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 14.1222 +unsigned long 14.1223 +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 14.1224 + const unsigned long len, const unsigned long pgoff, 14.1225 + const unsigned long flags) 14.1226 +{ 14.1227 + struct vm_area_struct *vma, *prev_vma; 14.1228 + struct mm_struct *mm = current->mm; 14.1229 + unsigned long base = mm->mmap_base, addr = addr0; 14.1230 + int first_time = 1; 14.1231 + 14.1232 + /* requested length too big for entire address space */ 14.1233 + if (len > TASK_SIZE) 14.1234 + return -ENOMEM; 14.1235 + 14.1236 + /* dont allow allocations above current base */ 14.1237 + if (mm->free_area_cache > base) 14.1238 + mm->free_area_cache = base; 14.1239 + 14.1240 + /* requesting a specific address */ 14.1241 + if (addr) { 14.1242 + addr = PAGE_ALIGN(addr); 14.1243 + vma = find_vma(mm, addr); 14.1244 + if (TASK_SIZE - len >= addr && 14.1245 + (!vma || addr + len <= vma->vm_start)) 14.1246 + return addr; 14.1247 + } 14.1248 + 14.1249 +try_again: 14.1250 + /* make sure it can fit in the remaining address space */ 14.1251 + if (mm->free_area_cache < len) 14.1252 + goto fail; 14.1253 + 14.1254 + /* either no address requested or cant fit in requested address hole */ 14.1255 + addr = (mm->free_area_cache - len) & PAGE_MASK; 14.1256 + do { 14.1257 + /* 14.1258 + * Lookup failure means no vma is above this address, 14.1259 + * i.e. 
return with success: 14.1260 + */ 14.1261 + if (!(vma = find_vma_prev(mm, addr, &prev_vma))) 14.1262 + return addr; 14.1263 + 14.1264 + /* 14.1265 + * new region fits between prev_vma->vm_end and 14.1266 + * vma->vm_start, use it: 14.1267 + */ 14.1268 + if (addr+len <= vma->vm_start && 14.1269 + (!prev_vma || (addr >= prev_vma->vm_end))) 14.1270 + /* remember the address as a hint for next time */ 14.1271 + return (mm->free_area_cache = addr); 14.1272 + else 14.1273 + /* pull free_area_cache down to the first hole */ 14.1274 + if (mm->free_area_cache == vma->vm_end) 14.1275 + mm->free_area_cache = vma->vm_start; 14.1276 + 14.1277 + /* try just below the current vma->vm_start */ 14.1278 + addr = vma->vm_start-len; 14.1279 + } while (len <= vma->vm_start); 14.1280 + 14.1281 +fail: 14.1282 + /* 14.1283 + * if hint left us with no space for the requested 14.1284 + * mapping then try again: 14.1285 + */ 14.1286 + if (first_time) { 14.1287 + mm->free_area_cache = base; 14.1288 + first_time = 0; 14.1289 + goto try_again; 14.1290 + } 14.1291 + /* 14.1292 + * A failed mmap() very likely causes application failure, 14.1293 + * so fall back to the bottom-up function here. This scenario 14.1294 + * can happen with large stack limits and large mmap() 14.1295 + * allocations. 14.1296 + */ 14.1297 + mm->free_area_cache = TASK_UNMAPPED_BASE; 14.1298 + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 14.1299 + /* 14.1300 + * Restore the topdown base: 14.1301 + */ 14.1302 + mm->free_area_cache = base; 14.1303 + 14.1304 + return addr; 14.1305 +} 14.1306 +#endif 14.1307 + 14.1308 +void arch_unmap_area_topdown(struct vm_area_struct *area) 14.1309 +{ 14.1310 + /* 14.1311 + * Is this a new hole at the highest possible address? 14.1312 + */ 14.1313 + if (area->vm_end > area->vm_mm->free_area_cache) 14.1314 + area->vm_mm->free_area_cache = area->vm_end; 14.1315 +} 14.1316 + 14.1317 +unsigned long 14.1318 +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 14.1319 + unsigned long pgoff, unsigned long flags) 14.1320 +{ 14.1321 + if (flags & MAP_FIXED) { 14.1322 + unsigned long ret; 14.1323 + 14.1324 + if (addr > TASK_SIZE - len) 14.1325 + return -ENOMEM; 14.1326 + if (addr & ~PAGE_MASK) 14.1327 + return -EINVAL; 14.1328 + if (file && is_file_hugepages(file)) { 14.1329 + /* 14.1330 + * Check if the given range is hugepage aligned, and 14.1331 + * can be made suitable for hugepages. 14.1332 + */ 14.1333 + ret = prepare_hugepage_range(addr, len); 14.1334 + } else { 14.1335 + /* 14.1336 + * Ensure that a normal request is not falling in a 14.1337 + * reserved hugepage range. For some archs like IA-64, 14.1338 + * there is a separate region for hugepages. 14.1339 + */ 14.1340 + ret = is_hugepage_only_range(addr, len); 14.1341 + } 14.1342 + if (ret) 14.1343 + return -EINVAL; 14.1344 + return addr; 14.1345 + } 14.1346 + 14.1347 + if (file && file->f_op && file->f_op->get_unmapped_area) 14.1348 + return file->f_op->get_unmapped_area(file, addr, len, 14.1349 + pgoff, flags); 14.1350 + 14.1351 + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); 14.1352 +} 14.1353 + 14.1354 +EXPORT_SYMBOL(get_unmapped_area); 14.1355 + 14.1356 +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 14.1357 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) 14.1358 +{ 14.1359 + struct vm_area_struct *vma = NULL; 14.1360 + 14.1361 + if (mm) { 14.1362 + /* Check the cache first. */ 14.1363 + /* (Cache hit rate is typically around 35%.) 
*/ 14.1364 + vma = mm->mmap_cache; 14.1365 + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 14.1366 + struct rb_node * rb_node; 14.1367 + 14.1368 + rb_node = mm->mm_rb.rb_node; 14.1369 + vma = NULL; 14.1370 + 14.1371 + while (rb_node) { 14.1372 + struct vm_area_struct * vma_tmp; 14.1373 + 14.1374 + vma_tmp = rb_entry(rb_node, 14.1375 + struct vm_area_struct, vm_rb); 14.1376 + 14.1377 + if (vma_tmp->vm_end > addr) { 14.1378 + vma = vma_tmp; 14.1379 + if (vma_tmp->vm_start <= addr) 14.1380 + break; 14.1381 + rb_node = rb_node->rb_left; 14.1382 + } else 14.1383 + rb_node = rb_node->rb_right; 14.1384 + } 14.1385 + if (vma) 14.1386 + mm->mmap_cache = vma; 14.1387 + } 14.1388 + } 14.1389 + return vma; 14.1390 +} 14.1391 + 14.1392 +EXPORT_SYMBOL(find_vma); 14.1393 + 14.1394 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ 14.1395 +struct vm_area_struct * 14.1396 +find_vma_prev(struct mm_struct *mm, unsigned long addr, 14.1397 + struct vm_area_struct **pprev) 14.1398 +{ 14.1399 + struct vm_area_struct *vma = NULL, *prev = NULL; 14.1400 + struct rb_node * rb_node; 14.1401 + if (!mm) 14.1402 + goto out; 14.1403 + 14.1404 + /* Guard against addr being lower than the first VMA */ 14.1405 + vma = mm->mmap; 14.1406 + 14.1407 + /* Go through the RB tree quickly. */ 14.1408 + rb_node = mm->mm_rb.rb_node; 14.1409 + 14.1410 + while (rb_node) { 14.1411 + struct vm_area_struct *vma_tmp; 14.1412 + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); 14.1413 + 14.1414 + if (addr < vma_tmp->vm_end) { 14.1415 + rb_node = rb_node->rb_left; 14.1416 + } else { 14.1417 + prev = vma_tmp; 14.1418 + if (!prev->vm_next || (addr < prev->vm_next->vm_end)) 14.1419 + break; 14.1420 + rb_node = rb_node->rb_right; 14.1421 + } 14.1422 + } 14.1423 + 14.1424 +out: 14.1425 + *pprev = prev; 14.1426 + return prev ? prev->vm_next : vma; 14.1427 +} 14.1428 + 14.1429 +/* 14.1430 + * Verify that the stack growth is acceptable and 14.1431 + * update accounting. This is shared with both the 14.1432 + * grow-up and grow-down cases. 14.1433 + */ 14.1434 +static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) 14.1435 +{ 14.1436 + struct mm_struct *mm = vma->vm_mm; 14.1437 + struct rlimit *rlim = current->signal->rlim; 14.1438 + 14.1439 + /* address space limit tests */ 14.1440 + if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) 14.1441 + return -ENOMEM; 14.1442 + 14.1443 + /* Stack limit test */ 14.1444 + if (size > rlim[RLIMIT_STACK].rlim_cur) 14.1445 + return -ENOMEM; 14.1446 + 14.1447 + /* mlock limit tests */ 14.1448 + if (vma->vm_flags & VM_LOCKED) { 14.1449 + unsigned long locked; 14.1450 + unsigned long limit; 14.1451 + locked = mm->locked_vm + grow; 14.1452 + limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 14.1453 + if (locked > limit && !capable(CAP_IPC_LOCK)) 14.1454 + return -ENOMEM; 14.1455 + } 14.1456 + 14.1457 + /* 14.1458 + * Overcommit.. This must be the final test, as it will 14.1459 + * update security statistics. 
14.1460 + */ 14.1461 + if (security_vm_enough_memory(grow)) 14.1462 + return -ENOMEM; 14.1463 + 14.1464 + /* Ok, everything looks good - let it rip */ 14.1465 + mm->total_vm += grow; 14.1466 + if (vma->vm_flags & VM_LOCKED) 14.1467 + mm->locked_vm += grow; 14.1468 + __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 14.1469 + acct_update_integrals(); 14.1470 + update_mem_hiwater(); 14.1471 + return 0; 14.1472 +} 14.1473 + 14.1474 +#ifdef CONFIG_STACK_GROWSUP 14.1475 +/* 14.1476 + * vma is the first one with address > vma->vm_end. Have to extend vma. 14.1477 + */ 14.1478 +int expand_stack(struct vm_area_struct * vma, unsigned long address) 14.1479 +{ 14.1480 + int error; 14.1481 + 14.1482 + if (!(vma->vm_flags & VM_GROWSUP)) 14.1483 + return -EFAULT; 14.1484 + 14.1485 + /* 14.1486 + * We must make sure the anon_vma is allocated 14.1487 + * so that the anon_vma locking is not a noop. 14.1488 + */ 14.1489 + if (unlikely(anon_vma_prepare(vma))) 14.1490 + return -ENOMEM; 14.1491 + anon_vma_lock(vma); 14.1492 + 14.1493 + /* 14.1494 + * vma->vm_start/vm_end cannot change under us because the caller 14.1495 + * is required to hold the mmap_sem in read mode. We need the 14.1496 + * anon_vma lock to serialize against concurrent expand_stacks. 14.1497 + */ 14.1498 + address += 4 + PAGE_SIZE - 1; 14.1499 + address &= PAGE_MASK; 14.1500 + error = 0; 14.1501 + 14.1502 + /* Somebody else might have raced and expanded it already */ 14.1503 + if (address > vma->vm_end) { 14.1504 + unsigned long size, grow; 14.1505 + 14.1506 + size = address - vma->vm_start; 14.1507 + grow = (address - vma->vm_end) >> PAGE_SHIFT; 14.1508 + 14.1509 + error = acct_stack_growth(vma, size, grow); 14.1510 + if (!error) 14.1511 + vma->vm_end = address; 14.1512 + } 14.1513 + anon_vma_unlock(vma); 14.1514 + return error; 14.1515 +} 14.1516 + 14.1517 +struct vm_area_struct * 14.1518 +find_extend_vma(struct mm_struct *mm, unsigned long addr) 14.1519 +{ 14.1520 + struct vm_area_struct *vma, *prev; 14.1521 + 14.1522 + addr &= PAGE_MASK; 14.1523 + vma = find_vma_prev(mm, addr, &prev); 14.1524 + if (vma && (vma->vm_start <= addr)) 14.1525 + return vma; 14.1526 + if (!prev || expand_stack(prev, addr)) 14.1527 + return NULL; 14.1528 + if (prev->vm_flags & VM_LOCKED) { 14.1529 + make_pages_present(addr, prev->vm_end); 14.1530 + } 14.1531 + return prev; 14.1532 +} 14.1533 +#else 14.1534 +/* 14.1535 + * vma is the first one with address < vma->vm_start. Have to extend vma. 14.1536 + */ 14.1537 +int expand_stack(struct vm_area_struct *vma, unsigned long address) 14.1538 +{ 14.1539 + int error; 14.1540 + 14.1541 + /* 14.1542 + * We must make sure the anon_vma is allocated 14.1543 + * so that the anon_vma locking is not a noop. 14.1544 + */ 14.1545 + if (unlikely(anon_vma_prepare(vma))) 14.1546 + return -ENOMEM; 14.1547 + anon_vma_lock(vma); 14.1548 + 14.1549 + /* 14.1550 + * vma->vm_start/vm_end cannot change under us because the caller 14.1551 + * is required to hold the mmap_sem in read mode. We need the 14.1552 + * anon_vma lock to serialize against concurrent expand_stacks. 
14.1553 + */ 14.1554 + address &= PAGE_MASK; 14.1555 + error = 0; 14.1556 + 14.1557 + /* Somebody else might have raced and expanded it already */ 14.1558 + if (address < vma->vm_start) { 14.1559 + unsigned long size, grow; 14.1560 + 14.1561 + size = vma->vm_end - address; 14.1562 + grow = (vma->vm_start - address) >> PAGE_SHIFT; 14.1563 + 14.1564 + error = acct_stack_growth(vma, size, grow); 14.1565 + if (!error) { 14.1566 + vma->vm_start = address; 14.1567 + vma->vm_pgoff -= grow; 14.1568 + } 14.1569 + } 14.1570 + anon_vma_unlock(vma); 14.1571 + return error; 14.1572 +} 14.1573 + 14.1574 +struct vm_area_struct * 14.1575 +find_extend_vma(struct mm_struct * mm, unsigned long addr) 14.1576 +{ 14.1577 + struct vm_area_struct * vma; 14.1578 + unsigned long start; 14.1579 + 14.1580 + addr &= PAGE_MASK; 14.1581 + vma = find_vma(mm,addr); 14.1582 + if (!vma) 14.1583 + return NULL; 14.1584 + if (vma->vm_start <= addr) 14.1585 + return vma; 14.1586 + if (!(vma->vm_flags & VM_GROWSDOWN)) 14.1587 + return NULL; 14.1588 + start = vma->vm_start; 14.1589 + if (expand_stack(vma, addr)) 14.1590 + return NULL; 14.1591 + if (vma->vm_flags & VM_LOCKED) { 14.1592 + make_pages_present(addr, start); 14.1593 + } 14.1594 + return vma; 14.1595 +} 14.1596 +#endif 14.1597 + 14.1598 +/* 14.1599 + * Try to free as many page directory entries as we can, 14.1600 + * without having to work very hard at actually scanning 14.1601 + * the page tables themselves. 14.1602 + * 14.1603 + * Right now we try to free page tables if we have a nice 14.1604 + * PGDIR-aligned area that got free'd up. We could be more 14.1605 + * granular if we want to, but this is fast and simple, 14.1606 + * and covers the bad cases. 14.1607 + * 14.1608 + * "prev", if it exists, points to a vma before the one 14.1609 + * we just free'd - but there's no telling how much before. 14.1610 + */ 14.1611 +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, 14.1612 + unsigned long start, unsigned long end) 14.1613 +{ 14.1614 + unsigned long first = start & PGDIR_MASK; 14.1615 + unsigned long last = end + PGDIR_SIZE - 1; 14.1616 + struct mm_struct *mm = tlb->mm; 14.1617 + 14.1618 + if (last > MM_VM_SIZE(mm) || last < end) 14.1619 + last = MM_VM_SIZE(mm); 14.1620 + 14.1621 + if (!prev) { 14.1622 + prev = mm->mmap; 14.1623 + if (!prev) 14.1624 + goto no_mmaps; 14.1625 + if (prev->vm_end > start) { 14.1626 + if (last > prev->vm_start) 14.1627 + last = prev->vm_start; 14.1628 + goto no_mmaps; 14.1629 + } 14.1630 + } 14.1631 + for (;;) { 14.1632 + struct vm_area_struct *next = prev->vm_next; 14.1633 + 14.1634 + if (next) { 14.1635 + if (next->vm_start < start) { 14.1636 + prev = next; 14.1637 + continue; 14.1638 + } 14.1639 + if (last > next->vm_start) 14.1640 + last = next->vm_start; 14.1641 + } 14.1642 + if (prev->vm_end > first) 14.1643 + first = prev->vm_end; 14.1644 + break; 14.1645 + } 14.1646 +no_mmaps: 14.1647 + if (last < first) /* for arches with discontiguous pgd indices */ 14.1648 + return; 14.1649 + if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) 14.1650 + first = FIRST_USER_PGD_NR * PGDIR_SIZE; 14.1651 + /* No point trying to free anything if we're in the same pte page */ 14.1652 + if ((first & PMD_MASK) < (last & PMD_MASK)) { 14.1653 + clear_page_range(tlb, first, last); 14.1654 + flush_tlb_pgtables(mm, first, last); 14.1655 + } 14.1656 +} 14.1657 + 14.1658 +/* Normal function to fix up a mapping 14.1659 + * This function is the default for when an area has no specific 14.1660 + * function. 
This may be used as part of a more specific routine. 14.1661 + * 14.1662 + * By the time this function is called, the area struct has been 14.1663 + * removed from the process mapping list. 14.1664 + */ 14.1665 +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) 14.1666 +{ 14.1667 + size_t len = area->vm_end - area->vm_start; 14.1668 + 14.1669 + area->vm_mm->total_vm -= len >> PAGE_SHIFT; 14.1670 + if (area->vm_flags & VM_LOCKED) 14.1671 + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; 14.1672 + vm_stat_unaccount(area); 14.1673 + area->vm_mm->unmap_area(area); 14.1674 + remove_vm_struct(area); 14.1675 +} 14.1676 + 14.1677 +/* 14.1678 + * Update the VMA and inode share lists. 14.1679 + * 14.1680 + * Ok - we have the memory areas we should free on the 'free' list, 14.1681 + * so release them, and do the vma updates. 14.1682 + */ 14.1683 +static void unmap_vma_list(struct mm_struct *mm, 14.1684 + struct vm_area_struct *mpnt) 14.1685 +{ 14.1686 + do { 14.1687 + struct vm_area_struct *next = mpnt->vm_next; 14.1688 + unmap_vma(mm, mpnt); 14.1689 + mpnt = next; 14.1690 + } while (mpnt != NULL); 14.1691 + validate_mm(mm); 14.1692 +} 14.1693 + 14.1694 +/* 14.1695 + * Get rid of page table information in the indicated region. 14.1696 + * 14.1697 + * Called with the page table lock held. 14.1698 + */ 14.1699 +static void unmap_region(struct mm_struct *mm, 14.1700 + struct vm_area_struct *vma, 14.1701 + struct vm_area_struct *prev, 14.1702 + unsigned long start, 14.1703 + unsigned long end) 14.1704 +{ 14.1705 + struct mmu_gather *tlb; 14.1706 + unsigned long nr_accounted = 0; 14.1707 + 14.1708 + lru_add_drain(); 14.1709 + tlb = tlb_gather_mmu(mm, 0); 14.1710 + unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); 14.1711 + vm_unacct_memory(nr_accounted); 14.1712 + 14.1713 + if (is_hugepage_only_range(start, end - start)) 14.1714 + hugetlb_free_pgtables(tlb, prev, start, end); 14.1715 + else 14.1716 + free_pgtables(tlb, prev, start, end); 14.1717 + tlb_finish_mmu(tlb, start, end); 14.1718 +} 14.1719 + 14.1720 +/* 14.1721 + * Create a list of vma's touched by the unmap, removing them from the mm's 14.1722 + * vma list as we go.. 14.1723 + */ 14.1724 +static void 14.1725 +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, 14.1726 + struct vm_area_struct *prev, unsigned long end) 14.1727 +{ 14.1728 + struct vm_area_struct **insertion_point; 14.1729 + struct vm_area_struct *tail_vma = NULL; 14.1730 + 14.1731 + insertion_point = (prev ? &prev->vm_next : &mm->mmap); 14.1732 + do { 14.1733 + rb_erase(&vma->vm_rb, &mm->mm_rb); 14.1734 + mm->map_count--; 14.1735 + tail_vma = vma; 14.1736 + vma = vma->vm_next; 14.1737 + } while (vma && vma->vm_start < end); 14.1738 + *insertion_point = vma; 14.1739 + tail_vma->vm_next = NULL; 14.1740 + mm->mmap_cache = NULL; /* Kill the cache. */ 14.1741 +} 14.1742 + 14.1743 +/* 14.1744 + * Split a vma into two pieces at address 'addr', a new vma is allocated 14.1745 + * either for the first part or the the tail. 
14.1746 + */ 14.1747 +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 14.1748 + unsigned long addr, int new_below) 14.1749 +{ 14.1750 + struct mempolicy *pol; 14.1751 + struct vm_area_struct *new; 14.1752 + 14.1753 + if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 14.1754 + return -EINVAL; 14.1755 + 14.1756 + if (mm->map_count >= sysctl_max_map_count) 14.1757 + return -ENOMEM; 14.1758 + 14.1759 + new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 14.1760 + if (!new) 14.1761 + return -ENOMEM; 14.1762 + 14.1763 + /* most fields are the same, copy all, and then fixup */ 14.1764 + *new = *vma; 14.1765 + 14.1766 + if (new_below) 14.1767 + new->vm_end = addr; 14.1768 + else { 14.1769 + new->vm_start = addr; 14.1770 + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 14.1771 + } 14.1772 + 14.1773 + pol = mpol_copy(vma_policy(vma)); 14.1774 + if (IS_ERR(pol)) { 14.1775 + kmem_cache_free(vm_area_cachep, new); 14.1776 + return PTR_ERR(pol); 14.1777 + } 14.1778 + vma_set_policy(new, pol); 14.1779 + 14.1780 + if (new->vm_file) 14.1781 + get_file(new->vm_file); 14.1782 + 14.1783 + if (new->vm_ops && new->vm_ops->open) 14.1784 + new->vm_ops->open(new); 14.1785 + 14.1786 + if (new_below) 14.1787 + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 14.1788 + ((addr - new->vm_start) >> PAGE_SHIFT), new); 14.1789 + else 14.1790 + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 14.1791 + 14.1792 + return 0; 14.1793 +} 14.1794 + 14.1795 +/* Munmap is split into 2 main parts -- this part which finds 14.1796 + * what needs doing, and the areas themselves, which do the 14.1797 + * work. This now handles partial unmappings. 14.1798 + * Jeremy Fitzhardinge <jeremy@goop.org> 14.1799 + */ 14.1800 +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 14.1801 +{ 14.1802 + unsigned long end; 14.1803 + struct vm_area_struct *mpnt, *prev, *last; 14.1804 + 14.1805 + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 14.1806 + return -EINVAL; 14.1807 + 14.1808 + if ((len = PAGE_ALIGN(len)) == 0) 14.1809 + return -EINVAL; 14.1810 + 14.1811 + /* Find the first overlapping VMA */ 14.1812 + mpnt = find_vma_prev(mm, start, &prev); 14.1813 + if (!mpnt) 14.1814 + return 0; 14.1815 + /* we have start < mpnt->vm_end */ 14.1816 + 14.1817 + /* if it doesn't overlap, we have nothing.. */ 14.1818 + end = start + len; 14.1819 + if (mpnt->vm_start >= end) 14.1820 + return 0; 14.1821 + 14.1822 + /* 14.1823 + * If we need to split any vma, do it now to save pain later. 14.1824 + * 14.1825 + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 14.1826 + * unmapped vm_area_struct will remain in use: so lower split_vma 14.1827 + * places tmp vma above, and higher split_vma places tmp vma below. 14.1828 + */ 14.1829 + if (start > mpnt->vm_start) { 14.1830 + int error = split_vma(mm, mpnt, start, 0); 14.1831 + if (error) 14.1832 + return error; 14.1833 + prev = mpnt; 14.1834 + } 14.1835 + 14.1836 + /* Does it split the last one? */ 14.1837 + last = find_vma(mm, end); 14.1838 + if (last && end > last->vm_start) { 14.1839 + int error = split_vma(mm, last, end, 1); 14.1840 + if (error) 14.1841 + return error; 14.1842 + } 14.1843 + mpnt = prev? 
prev->vm_next: mm->mmap; 14.1844 + 14.1845 + /* 14.1846 + * Remove the vma's, and unmap the actual pages 14.1847 + */ 14.1848 + detach_vmas_to_be_unmapped(mm, mpnt, prev, end); 14.1849 + spin_lock(&mm->page_table_lock); 14.1850 + unmap_region(mm, mpnt, prev, start, end); 14.1851 + spin_unlock(&mm->page_table_lock); 14.1852 + 14.1853 + /* Fix up all other VM information */ 14.1854 + unmap_vma_list(mm, mpnt); 14.1855 + 14.1856 + return 0; 14.1857 +} 14.1858 + 14.1859 +EXPORT_SYMBOL(do_munmap); 14.1860 + 14.1861 +asmlinkage long sys_munmap(unsigned long addr, size_t len) 14.1862 +{ 14.1863 + int ret; 14.1864 + struct mm_struct *mm = current->mm; 14.1865 + 14.1866 + profile_munmap(addr); 14.1867 + 14.1868 + down_write(&mm->mmap_sem); 14.1869 + ret = do_munmap(mm, addr, len); 14.1870 + up_write(&mm->mmap_sem); 14.1871 + return ret; 14.1872 +} 14.1873 + 14.1874 +static inline void verify_mm_writelocked(struct mm_struct *mm) 14.1875 +{ 14.1876 +#ifdef CONFIG_DEBUG_KERNEL 14.1877 + if (unlikely(down_read_trylock(&mm->mmap_sem))) { 14.1878 + WARN_ON(1); 14.1879 + up_read(&mm->mmap_sem); 14.1880 + } 14.1881 +#endif 14.1882 +} 14.1883 + 14.1884 +/* 14.1885 + * this is really a simplified "do_mmap". it only handles 14.1886 + * anonymous maps. eventually we may be able to do some 14.1887 + * brk-specific accounting here. 14.1888 + */ 14.1889 +unsigned long do_brk(unsigned long addr, unsigned long len) 14.1890 +{ 14.1891 + struct mm_struct * mm = current->mm; 14.1892 + struct vm_area_struct * vma, * prev; 14.1893 + unsigned long flags; 14.1894 + struct rb_node ** rb_link, * rb_parent; 14.1895 + pgoff_t pgoff = addr >> PAGE_SHIFT; 14.1896 + 14.1897 + len = PAGE_ALIGN(len); 14.1898 + if (!len) 14.1899 + return addr; 14.1900 + 14.1901 + if ((addr + len) > TASK_SIZE || (addr + len) < addr) 14.1902 + return -EINVAL; 14.1903 + 14.1904 + /* 14.1905 + * mlock MCL_FUTURE? 14.1906 + */ 14.1907 + if (mm->def_flags & VM_LOCKED) { 14.1908 + unsigned long locked, lock_limit; 14.1909 + locked = mm->locked_vm << PAGE_SHIFT; 14.1910 + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 14.1911 + locked += len; 14.1912 + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 14.1913 + return -EAGAIN; 14.1914 + } 14.1915 + 14.1916 + /* 14.1917 + * mm->mmap_sem is required to protect against another thread 14.1918 + * changing the mappings in case we sleep. 14.1919 + */ 14.1920 + verify_mm_writelocked(mm); 14.1921 + 14.1922 + /* 14.1923 + * Clear old maps. this also does some error checking for us 14.1924 + */ 14.1925 + munmap_back: 14.1926 + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 14.1927 + if (vma && vma->vm_start < addr + len) { 14.1928 + if (do_munmap(mm, addr, len)) 14.1929 + return -ENOMEM; 14.1930 + goto munmap_back; 14.1931 + } 14.1932 + 14.1933 + /* Check against address space limits *after* clearing old maps... */ 14.1934 + if ((mm->total_vm << PAGE_SHIFT) + len 14.1935 + > current->signal->rlim[RLIMIT_AS].rlim_cur) 14.1936 + return -ENOMEM; 14.1937 + 14.1938 + if (mm->map_count > sysctl_max_map_count) 14.1939 + return -ENOMEM; 14.1940 + 14.1941 + if (security_vm_enough_memory(len >> PAGE_SHIFT)) 14.1942 + return -ENOMEM; 14.1943 + 14.1944 + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 14.1945 + 14.1946 + /* Can we just expand an old private anonymous mapping? 
*/ 14.1947 + if (vma_merge(mm, prev, addr, addr + len, flags, 14.1948 + NULL, NULL, pgoff, NULL)) 14.1949 + goto out; 14.1950 + 14.1951 + /* 14.1952 + * create a vma struct for an anonymous mapping 14.1953 + */ 14.1954 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 14.1955 + if (!vma) { 14.1956 + vm_unacct_memory(len >> PAGE_SHIFT); 14.1957 + return -ENOMEM; 14.1958 + } 14.1959 + memset(vma, 0, sizeof(*vma)); 14.1960 + 14.1961 + vma->vm_mm = mm; 14.1962 + vma->vm_start = addr; 14.1963 + vma->vm_end = addr + len; 14.1964 + vma->vm_pgoff = pgoff; 14.1965 + vma->vm_flags = flags; 14.1966 + vma->vm_page_prot = protection_map[flags & 0x0f]; 14.1967 + vma_link(mm, vma, prev, rb_link, rb_parent); 14.1968 +out: 14.1969 + mm->total_vm += len >> PAGE_SHIFT; 14.1970 + if (flags & VM_LOCKED) { 14.1971 + mm->locked_vm += len >> PAGE_SHIFT; 14.1972 + make_pages_present(addr, addr + len); 14.1973 + } 14.1974 + acct_update_integrals(); 14.1975 + update_mem_hiwater(); 14.1976 + return addr; 14.1977 +} 14.1978 + 14.1979 +EXPORT_SYMBOL(do_brk); 14.1980 + 14.1981 +/* Release all mmaps. */ 14.1982 +void exit_mmap(struct mm_struct *mm) 14.1983 +{ 14.1984 + struct mmu_gather *tlb; 14.1985 + struct vm_area_struct *vma; 14.1986 + unsigned long nr_accounted = 0; 14.1987 + 14.1988 +#ifdef arch_exit_mmap 14.1989 + arch_exit_mmap(mm); 14.1990 +#endif 14.1991 + 14.1992 + lru_add_drain(); 14.1993 + 14.1994 + spin_lock(&mm->page_table_lock); 14.1995 + 14.1996 + tlb = tlb_gather_mmu(mm, 1); 14.1997 + flush_cache_mm(mm); 14.1998 + /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ 14.1999 + mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, 14.2000 + ~0UL, &nr_accounted, NULL); 14.2001 + vm_unacct_memory(nr_accounted); 14.2002 + BUG_ON(mm->map_count); /* This is just debugging */ 14.2003 + clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); 14.2004 + 14.2005 + tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); 14.2006 + 14.2007 + vma = mm->mmap; 14.2008 + mm->mmap = mm->mmap_cache = NULL; 14.2009 + mm->mm_rb = RB_ROOT; 14.2010 + mm->rss = 0; 14.2011 + mm->total_vm = 0; 14.2012 + mm->locked_vm = 0; 14.2013 + 14.2014 + spin_unlock(&mm->page_table_lock); 14.2015 + 14.2016 + /* 14.2017 + * Walk the list again, actually closing and freeing it 14.2018 + * without holding any MM locks. 14.2019 + */ 14.2020 + while (vma) { 14.2021 + struct vm_area_struct *next = vma->vm_next; 14.2022 + remove_vm_struct(vma); 14.2023 + vma = next; 14.2024 + } 14.2025 +} 14.2026 + 14.2027 +/* Insert vm structure into process list sorted by address 14.2028 + * and into the inode's i_mmap tree. If vm_file is non-NULL 14.2029 + * then i_mmap_lock is taken here. 14.2030 + */ 14.2031 +int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 14.2032 +{ 14.2033 + struct vm_area_struct * __vma, * prev; 14.2034 + struct rb_node ** rb_link, * rb_parent; 14.2035 + 14.2036 + /* 14.2037 + * The vm_pgoff of a purely anonymous vma should be irrelevant 14.2038 + * until its first write fault, when page's anon_vma and index 14.2039 + * are set. But now set the vm_pgoff it will almost certainly 14.2040 + * end up with (unless mremap moves it elsewhere before that 14.2041 + * first wfault), so /proc/pid/maps tells a consistent story. 14.2042 + * 14.2043 + * By setting it to reflect the virtual start address of the 14.2044 + * vma, merges and splits can happen in a seamless way, just 14.2045 + * using the existing file pgoff checks and manipulations. 14.2046 + * Similarly in do_mmap_pgoff and in do_brk. 
14.2047 + */ 14.2048 + if (!vma->vm_file) { 14.2049 + BUG_ON(vma->anon_vma); 14.2050 + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 14.2051 + } 14.2052 + __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); 14.2053 + if (__vma && __vma->vm_start < vma->vm_end) 14.2054 + return -ENOMEM; 14.2055 + vma_link(mm, vma, prev, rb_link, rb_parent); 14.2056 + return 0; 14.2057 +} 14.2058 + 14.2059 +/* 14.2060 + * Copy the vma structure to a new location in the same mm, 14.2061 + * prior to moving page table entries, to effect an mremap move. 14.2062 + */ 14.2063 +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 14.2064 + unsigned long addr, unsigned long len, pgoff_t pgoff) 14.2065 +{ 14.2066 + struct vm_area_struct *vma = *vmap; 14.2067 + unsigned long vma_start = vma->vm_start; 14.2068 + struct mm_struct *mm = vma->vm_mm; 14.2069 + struct vm_area_struct *new_vma, *prev; 14.2070 + struct rb_node **rb_link, *rb_parent; 14.2071 + struct mempolicy *pol; 14.2072 + 14.2073 + /* 14.2074 + * If anonymous vma has not yet been faulted, update new pgoff 14.2075 + * to match new location, to increase its chance of merging. 14.2076 + */ 14.2077 + if (!vma->vm_file && !vma->anon_vma) 14.2078 + pgoff = addr >> PAGE_SHIFT; 14.2079 + 14.2080 + find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 14.2081 + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 14.2082 + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 14.2083 + if (new_vma) { 14.2084 + /* 14.2085 + * Source vma may have been merged into new_vma 14.2086 + */ 14.2087 + if (vma_start >= new_vma->vm_start && 14.2088 + vma_start < new_vma->vm_end) 14.2089 + *vmap = new_vma; 14.2090 + } else { 14.2091 + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 14.2092 + if (new_vma) { 14.2093 + *new_vma = *vma; 14.2094 + pol = mpol_copy(vma_policy(vma)); 14.2095 + if (IS_ERR(pol)) { 14.2096 + kmem_cache_free(vm_area_cachep, new_vma); 14.2097 + return NULL; 14.2098 + } 14.2099 + vma_set_policy(new_vma, pol); 14.2100 + new_vma->vm_start = addr; 14.2101 + new_vma->vm_end = addr + len; 14.2102 + new_vma->vm_pgoff = pgoff; 14.2103 + if (new_vma->vm_file) 14.2104 + get_file(new_vma->vm_file); 14.2105 + if (new_vma->vm_ops && new_vma->vm_ops->open) 14.2106 + new_vma->vm_ops->open(new_vma); 14.2107 + vma_link(mm, new_vma, prev, rb_link, rb_parent); 14.2108 + } 14.2109 + } 14.2110 + return new_vma; 14.2111 +}
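The new mm/mmap.c above tracks the upstream 2.6.11 file; the Xen-specific piece is the arch_exit_mmap() hook invoked at the top of exit_mmap(). One convention worth calling out is the in-band error return of get_unmapped_area() ("Return value with the low bits set means error value"): error codes are small negative integers, so cast to unsigned long they always have sub-page bits set, while a successful result is page aligned. A minimal user-space sketch of that test follows; fake_get_unmapped_area and the constants are illustrative stand-ins, not kernel APIs.

    /* Sketch of the get_unmapped_area() error convention.  Plain C,
     * nothing here touches the kernel. */
    #include <errno.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    static unsigned long fake_get_unmapped_area(int fail)
    {
        if (fail)
            return (unsigned long)-ENOMEM;   /* error encoded in-band */
        return 0x40000000UL;                 /* page-aligned "address" */
    }

    int main(void)
    {
        for (int fail = 0; fail <= 1; fail++) {
            unsigned long ret = fake_get_unmapped_area(fail);
            if (ret & ~PAGE_MASK)            /* low bits set => error */
                printf("error: %ld\n", (long)ret);
            else
                printf("mapped at 0x%lx\n", ret);
        }
        return 0;
    }

This is exactly the check do_mmap_pgoff applies right after calling get_unmapped_area(): if (addr & ~PAGE_MASK) return addr;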
16.1 --- a/tools/libxc/xc_linux_restore.c Wed Apr 27 16:55:30 2005 +0000 16.2 +++ b/tools/libxc/xc_linux_restore.c Wed Apr 27 16:55:50 2005 +0000 16.3 @@ -170,13 +170,13 @@ int xc_linux_restore(int xc_handle, XcIO 16.4 if ( xc_domain_create(xc_handle, nr_pfns * (PAGE_SIZE / 1024), 16.5 -1, 1, &dom) ) 16.6 { 16.7 - xcio_error(ioctxt, "Could not create domain. pfns=%d, %dKB", 16.8 - nr_pfns,nr_pfns * (PAGE_SIZE / 1024)); 16.9 + xcio_error(ioctxt, "Could not create domain. pfns=%ld, %ldKB", 16.10 + nr_pfns, nr_pfns * (PAGE_SIZE / 1024)); 16.11 goto out; 16.12 } 16.13 16.14 ioctxt->domain = dom; 16.15 - xcio_info(ioctxt, "Created domain %ld\n",dom); 16.16 + xcio_info(ioctxt, "Created domain %u\n", dom); 16.17 16.18 /* Get the domain's shared-info frame. */ 16.19 op.cmd = DOM0_GETDOMAININFO; 16.20 @@ -200,7 +200,8 @@ int xc_linux_restore(int xc_handle, XcIO 16.21 } 16.22 16.23 /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */ 16.24 - if ( xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) != nr_pfns ) 16.25 + if ( xc_get_pfn_list(xc_handle, dom, 16.26 + pfn_to_mfn_table, nr_pfns) != nr_pfns ) 16.27 { 16.28 xcio_error(ioctxt, "Did not read correct number of frame " 16.29 "numbers for new dom"); 16.30 @@ -657,7 +658,7 @@ int xc_linux_restore(int xc_handle, XcIO 16.31 if ( rc == 0 ) 16.32 { 16.33 /* Success: print the domain id. */ 16.34 - xcio_info(ioctxt, "DOM=%lu\n", dom); 16.35 + xcio_info(ioctxt, "DOM=%u\n", dom); 16.36 return 0; 16.37 } 16.38
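The xc_linux_restore.c hunks are format-string fixes: judging by the corrected specifiers, nr_pfns is an unsigned long and the domain id is a 32-bit unsigned value, so the old %d/%ld conversions were mismatched. A small stand-alone illustration of the corrected pairings (the variable types here are assumptions read off the new specifiers):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t dom = 7;               /* domain id: printed with %u   */
        unsigned long nr_pfns = 4096;   /* page count: printed with %lu */

        printf("Created domain %u\n", dom);
        printf("pfns=%lu, %luKB\n", nr_pfns, nr_pfns * (4096 / 1024));
        return 0;
    }

If the xcio_error()/xcio_info() helpers carry a printf-style format attribute (an assumption), gcc's -Wformat flags the original mismatches automatically.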
17.1 --- a/tools/libxc/xc_linux_save.c Wed Apr 27 16:55:30 2005 +0000 17.2 +++ b/tools/libxc/xc_linux_save.c Wed Apr 27 16:55:50 2005 +0000 17.3 @@ -167,7 +167,8 @@ static int burst_time_us = -1; 17.4 #define RATE_TO_BTU 781250 17.5 #define BURST_TIME_US burst_time_us 17.6 17.7 -static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){ 17.8 +static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n) 17.9 +{ 17.10 static int budget = 0; 17.11 static struct timeval last_put = { 0 }; 17.12 struct timeval now; 17.13 @@ -230,8 +231,8 @@ static int print_stats( int xc_handle, u 17.14 17.15 gettimeofday(&wall_now, NULL); 17.16 17.17 - d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0, /* FIXME */ 0 )/1000; 17.18 - d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid, /* FIXME */ 0 )/1000; 17.19 + d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000; 17.20 + d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000; 17.21 17.22 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 17.23 printf("ARRHHH!!\n"); 17.24 @@ -273,10 +274,13 @@ static int print_stats( int xc_handle, u 17.25 * @param ioctxt i/o context 17.26 * @return 0 on success, non-zero on error. 17.27 */ 17.28 -static int write_vmconfig(XcIOContext *ioctxt){ 17.29 +static int write_vmconfig(XcIOContext *ioctxt) 17.30 +{ 17.31 int err = -1; 17.32 - if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit; 17.33 - if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit; 17.34 + if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) 17.35 + goto exit; 17.36 + if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) 17.37 + goto exit; 17.38 err = 0; 17.39 exit: 17.40 return err; 17.41 @@ -329,7 +333,8 @@ int suspend_and_state(int xc_handle, XcI 17.42 17.43 retry: 17.44 17.45 - if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, info, ctxt) ) 17.46 + if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, 17.47 + info, ctxt) ) 17.48 { 17.49 xcio_error(ioctxt, "Could not get full domain info"); 17.50 return -1; 17.51 @@ -347,7 +352,7 @@ retry: 17.52 // try unpausing domain, wait, and retest 17.53 xc_domain_unpause( xc_handle, ioctxt->domain ); 17.54 17.55 - xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)", 17.56 + xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%u)", 17.57 info->flags); 17.58 usleep(10000); // 10ms 17.59 17.60 @@ -357,14 +362,12 @@ retry: 17.61 17.62 if( ++i < 100 ) 17.63 { 17.64 - xcio_error(ioctxt, "Retry suspend domain (%lx)", 17.65 - info->flags); 17.66 + xcio_error(ioctxt, "Retry suspend domain (%u)", info->flags); 17.67 usleep(10000); // 10ms 17.68 goto retry; 17.69 } 17.70 17.71 - xcio_error(ioctxt, "Unable to suspend domain. (%lx)", 17.72 - info->flags); 17.73 + xcio_error(ioctxt, "Unable to suspend domain. 
(%u)", info->flags); 17.74 17.75 return -1; 17.76 } 17.77 @@ -442,7 +445,8 @@ int xc_linux_save(int xc_handle, XcIOCon 17.78 return 1; 17.79 } 17.80 17.81 - if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, &info, &ctxt) ) 17.82 + if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, 17.83 + &info, &ctxt) ) 17.84 { 17.85 xcio_error(ioctxt, "Could not get full domain info"); 17.86 goto out; 17.87 @@ -459,7 +463,9 @@ int xc_linux_save(int xc_handle, XcIOCon 17.88 17.89 /* cheesy sanity check */ 17.90 if ( nr_pfns > 1024*1024 ){ 17.91 - xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns); 17.92 + xcio_error(ioctxt, 17.93 + "Invalid state record -- pfn count out of range: %lu", 17.94 + nr_pfns); 17.95 goto out; 17.96 } 17.97 17.98 @@ -513,7 +519,8 @@ int xc_linux_save(int xc_handle, XcIOCon 17.99 17.100 for ( i = 0; i < nr_pfns; i += 1024 ){ 17.101 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){ 17.102 - xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys"); 17.103 + xcio_error(ioctxt, 17.104 + "Frame# in pfn-to-mfn frame list is not in pseudophys"); 17.105 goto out; 17.106 } 17.107 } 17.108 @@ -539,7 +546,7 @@ int xc_linux_save(int xc_handle, XcIOCon 17.109 17.110 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) ) 17.111 { 17.112 - xcio_error(ioctxt, "Domain appears not to have suspended: %lx", 17.113 + xcio_error(ioctxt, "Domain appears not to have suspended: %u", 17.114 info.flags); 17.115 goto out; 17.116 } 17.117 @@ -836,7 +843,8 @@ int xc_linux_save(int xc_handle, XcIOCon 17.118 } /* end of page table rewrite for loop */ 17.119 17.120 if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){ 17.121 - xcio_error(ioctxt, "Error when writing to state file (4)"); 17.122 + xcio_error(ioctxt, 17.123 + "Error when writing to state file (4)"); 17.124 goto out; 17.125 } 17.126 17.127 @@ -844,7 +852,8 @@ int xc_linux_save(int xc_handle, XcIOCon 17.128 17.129 if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j), 17.130 PAGE_SIZE) ){ 17.131 - xcio_error(ioctxt, "Error when writing to state file (5)"); 17.132 + xcio_error(ioctxt, 17.133 + "Error when writing to state file (5)"); 17.134 goto out; 17.135 } 17.136 } 17.137 @@ -903,14 +912,15 @@ int xc_linux_save(int xc_handle, XcIOCon 17.138 17.139 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) ) 17.140 { 17.141 - xcio_error(ioctxt, "Domain appears not to have suspended: %lx", 17.142 + xcio_error(ioctxt, 17.143 + "Domain appears not to have suspended: %u", 17.144 info.flags); 17.145 goto out; 17.146 } 17.147 17.148 xcio_info(ioctxt, 17.149 - "SUSPEND flags %08lx shinfo %08lx eip %08lx " 17.150 - "esi %08lx\n",info.flags, 17.151 + "SUSPEND flags %08u shinfo %08lx eip %08u " 17.152 + "esi %08u\n",info.flags, 17.153 info.shared_info_frame, 17.154 ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi ); 17.155 } 17.156 @@ -972,7 +982,8 @@ int xc_linux_save(int xc_handle, XcIOCon 17.157 { 17.158 if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) ) 17.159 { 17.160 - xcio_error(ioctxt, "Error when writing to state file (6b)"); 17.161 + xcio_error(ioctxt, 17.162 + "Error when writing to state file (6b)"); 17.163 goto out; 17.164 } 17.165 j = 0; 17.166 @@ -1027,14 +1038,24 @@ int xc_linux_save(int xc_handle, XcIOCon 17.167 17.168 out: 17.169 17.170 - if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE); 17.171 - if ( p_srec ) munmap(p_srec, sizeof(*p_srec)); 17.172 - if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE); 17.173 - if ( 
live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 ); 17.174 - if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 ); 17.175 + if(live_shinfo) 17.176 + munmap(live_shinfo, PAGE_SIZE); 17.177 + 17.178 + if(p_srec) 17.179 + munmap(p_srec, sizeof(*p_srec)); 17.180 + 17.181 + if(live_pfn_to_mfn_frame_list) 17.182 + munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE); 17.183 17.184 - if ( pfn_type != NULL ) free(pfn_type); 17.185 + if(live_pfn_to_mfn_table) 17.186 + munmap(live_pfn_to_mfn_table, nr_pfns*4); 17.187 + 17.188 + if(live_mfn_to_pfn_table) 17.189 + munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024); 17.190 + 17.191 + if (pfn_type != NULL) 17.192 + free(pfn_type); 17.193 + 17.194 DPRINTF("Save exit rc=%d\n",rc); 17.195 return !!rc; 17.196 - 17.197 }
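Most of the xc_linux_save.c changes are line wrapping plus the same format-specifier cleanup; the final hunk rewrites the single-line conditional munmap()/free() calls at the out: label into one statement per line. The underlying shape is the usual goto-out cleanup: resources are released at one exit point, each guarded by whether it was actually set up, so early failures and the success path share the same unwinding. A user-space sketch of that pattern (buffer names and sizes are illustrative):

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    static int do_work(size_t len)
    {
        int rc = -1;
        void *buf_a = NULL, *buf_b = NULL;

        buf_a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf_a == MAP_FAILED) { buf_a = NULL; goto out; }

        buf_b = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf_b == MAP_FAILED) { buf_b = NULL; goto out; }

        rc = 0;                          /* real work would go here */

    out:
        /* Release only what was actually mapped. */
        if (buf_b)
            munmap(buf_b, len);
        if (buf_a)
            munmap(buf_a, len);
        return rc;
    }

    int main(void)
    {
        printf("do_work rc=%d\n", do_work(4096));
        return 0;
    }

In xc_linux_save() the guards matter because the function can jump to out: before some of the live_* mappings or pfn_type exist.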
18.1 --- a/xen/arch/x86/mm.c Wed Apr 27 16:55:30 2005 +0000 18.2 +++ b/xen/arch/x86/mm.c Wed Apr 27 16:55:50 2005 +0000 18.3 @@ -482,7 +482,7 @@ get_page_from_l2e( 18.4 { 18.5 int rc; 18.6 18.7 - ASSERT( !shadow_mode_enabled(d) ); 18.8 + ASSERT(!shadow_mode_enabled(d)); 18.9 18.10 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) 18.11 return 1; 18.12 @@ -641,7 +641,7 @@ static int alloc_l1_table(struct pfn_inf 18.13 l1_pgentry_t *pl1e; 18.14 int i; 18.15 18.16 - ASSERT( !shadow_mode_enabled(d) ); 18.17 + ASSERT(!shadow_mode_enabled(d)); 18.18 18.19 pl1e = map_domain_mem(pfn << PAGE_SHIFT); 18.20 18.21 @@ -2670,22 +2670,6 @@ static int ptwr_emulated_update( 18.22 } 18.23 unmap_domain_mem(pl1e); 18.24 18.25 - /* Propagate update to shadow cache. */ 18.26 - if ( unlikely(shadow_mode_enabled(d)) ) 18.27 - { 18.28 - BUG(); // XXX fix me... 18.29 -#if 0 18.30 - sstat = get_shadow_status(d, page_to_pfn(page)); 18.31 - if ( sstat & PSH_shadowed ) 18.32 - { 18.33 - sl1e = map_domain_mem( 18.34 - ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK)); 18.35 - l1pte_propagate_from_guest(d, &nl1e, sl1e); 18.36 - unmap_domain_mem(sl1e); 18.37 - } 18.38 -#endif 18.39 - } 18.40 - 18.41 /* Finally, drop the old PTE. */ 18.42 put_page_from_l1e(ol1e, d); 18.43 18.44 @@ -2748,6 +2732,7 @@ int ptwr_do_page_fault(struct domain *d, 18.45 /* We are looking only for read-only mappings of p.t. pages. */ 18.46 if ( ((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) != _PAGE_PRESENT) || 18.47 ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) || 18.48 + ((page->u.inuse.type_info & PGT_count_mask) == 0) || 18.49 (page_get_owner(page) != d) ) 18.50 { 18.51 return 0;
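The ptwr_do_page_fault() change tightens the test for "read-only mapping of a live L1 page table": besides the type check, the page must still hold a non-zero type reference count, i.e. (type_info & PGT_count_mask) != 0, or the writable-pagetable path is skipped. The sketch below shows how the mask tests compose in the positive sense (the original code bails out when any test fails); the bit layout used here is a simplifying assumption for illustration, not Xen's real PGT_* encoding.

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed layout, for illustration only. */
    #define PGT_type_mask     0xf0000000u
    #define PGT_l1_page_table 0x10000000u
    #define PGT_count_mask    0x0000ffffu

    static int live_l1_mapping(uint32_t type_info)
    {
        if ((type_info & PGT_type_mask) != PGT_l1_page_table)
            return 0;                    /* not an L1 page table */
        if ((type_info & PGT_count_mask) == 0)
            return 0;                    /* type count already dropped to zero */
        return 1;
    }

    int main(void)
    {
        printf("%d\n", live_l1_mapping(PGT_l1_page_table | 2));  /* 1 */
        printf("%d\n", live_l1_mapping(PGT_l1_page_table));      /* 0 */
        return 0;
    }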