ia64/xen-unstable

changeset 2371:a05901f24fa4

bitkeeper revision 1.1159.57.2 (412f4f96lcvl1zNFbliRXwffVN2DFg)

Add ptep_establish_new and use in page fault path.
author cl349@freefall.cl.cam.ac.uk
date Fri Aug 27 15:13:26 2004 +0000 (2004-08-27)
parents 8b6698017789
children 41dcf2060c59
files .rootkeys linux-2.6.8.1-xen-sparse/include/asm-generic/pgtable.h linux-2.6.8.1-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.8.1-xen-sparse/mm/memory.c
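For context: ptep_establish_new() installs a PTE where no previous translation existed, so the fault path can skip the TLB flush that ptep_establish() performs when replacing an existing entry. A minimal sketch of the intended use (hypothetical fault-handler fragment, not a verbatim excerpt of the patch; the helpers it calls are the ones introduced or already used in the diff below):

    /* Sketch only: install a brand-new PTE for a freshly allocated page.
     * vma, address, page_table and new_page stand for the usual fault-path
     * variables as they appear in the memory.c code further down.
     */
    pte_t entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), vma);

    /* No old translation to shoot down, so no flush_tlb_page() here:
     * the generic fallback is a plain set_pte(), and the Xen variant can
     * use a single HYPERVISOR_update_va_mapping() hypercall with no
     * invalidation flags.
     */
    ptep_establish_new(vma, address, page_table, entry);
    update_mmu_cache(vma, address, entry);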
line diff
     1.1 --- a/.rootkeys	Fri Aug 27 13:29:44 2004 +0000
     1.2 +++ b/.rootkeys	Fri Aug 27 15:13:26 2004 +0000
     1.3 @@ -202,6 +202,7 @@ 40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6
     1.4  405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.8.1-xen-sparse/drivers/xen/netfront/netfront.c
     1.5  4108f5c1ppFXVpQzCOAZ6xXYubsjKA linux-2.6.8.1-xen-sparse/drivers/xen/privcmd/Makefile
     1.6  3e5a4e65IUfzzMu2kZFlGEB8-rpTaA linux-2.6.8.1-xen-sparse/drivers/xen/privcmd/privcmd.c
     1.7 +412f47e4RKD-R5IS5gEXvcT8L4v8gA linux-2.6.8.1-xen-sparse/include/asm-generic/pgtable.h
     1.8  40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.8.1-xen-sparse/include/asm-xen/asm-i386/desc.h
     1.9  4107adf1E5O4ztGHNGMzCCNhcvqNow linux-2.6.8.1-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h
    1.10  40f5623anSzpuEHgiNmQ56fIRfCoaQ linux-2.6.8.1-xen-sparse/include/asm-xen/asm-i386/e820.h
    1.11 @@ -252,6 +253,7 @@ 4124d8c4aocX7A-jIbuGraWN84pxGQ linux-2.6
    1.12  4124f66fp5QwbDHEfoUIa7pqO5Xhag linux-2.6.8.1-xen-sparse/include/linux/page-flags.h
    1.13  4124f66f4NaKNa0xPiGGykn9QaZk3w linux-2.6.8.1-xen-sparse/include/linux/skbuff.h
    1.14  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.8.1-xen-sparse/mkbuildtree
    1.15 +412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.8.1-xen-sparse/mm/memory.c
    1.16  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.8.1-xen-sparse/mm/page_alloc.c
    1.17  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.18  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/linux-2.6.8.1-xen-sparse/include/asm-generic/pgtable.h	Fri Aug 27 15:13:26 2004 +0000
     2.3 @@ -0,0 +1,136 @@
     2.4 +#ifndef _ASM_GENERIC_PGTABLE_H
     2.5 +#define _ASM_GENERIC_PGTABLE_H
     2.6 +
     2.7 +#ifndef __HAVE_ARCH_PTEP_ESTABLISH
     2.8 +/*
     2.9 + * Establish a new mapping:
    2.10 + *  - flush the old one
    2.11 + *  - update the page tables
    2.12 + *  - inform the TLB about the new one
    2.13 + *
    2.14 + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock.
    2.15 + *
    2.16 + * Note: the old pte is known to not be writable, so we don't need to
    2.17 + * worry about dirty bits etc getting lost.
    2.18 + */
    2.19 +#define ptep_establish(__vma, __address, __ptep, __entry)		\
    2.20 +do {				  					\
    2.21 +	set_pte(__ptep, __entry);					\
    2.22 +	flush_tlb_page(__vma, __address);				\
    2.23 +} while (0)
    2.24 +#endif
    2.25 +
    2.26 +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
    2.27 +/*
    2.28 + * Largely same as above, but only sets the access flags (dirty,
    2.29 + * accessed, and writable). Furthermore, we know it always gets set
    2.30 + * to a "more permissive" setting, which allows most architectures
    2.31 + * to optimize this.
    2.32 + */
    2.33 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
    2.34 +do {				  					  \
    2.35 +	set_pte(__ptep, __entry);					  \
    2.36 +	flush_tlb_page(__vma, __address);				  \
    2.37 +} while (0)
    2.38 +#endif
    2.39 +
    2.40 +#ifndef __HAVE_ARCH_PTEP_ESTABLISH_NEW
    2.41 +#define ptep_establish_new(__vma, __address, __ptep, __entry)		\
    2.42 +do {									\
    2.43 +	set_pte(__ptep, __entry);					\
    2.44 +} while (0)
    2.45 +#endif
    2.46 +
    2.47 +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
    2.48 +static inline int ptep_test_and_clear_young(pte_t *ptep)
    2.49 +{
    2.50 +	pte_t pte = *ptep;
    2.51 +	if (!pte_young(pte))
    2.52 +		return 0;
    2.53 +	set_pte(ptep, pte_mkold(pte));
    2.54 +	return 1;
    2.55 +}
    2.56 +#endif
    2.57 +
    2.58 +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
    2.59 +#define ptep_clear_flush_young(__vma, __address, __ptep)		\
    2.60 +({									\
    2.61 +	int __young = ptep_test_and_clear_young(__ptep);		\
    2.62 +	if (__young)							\
    2.63 +		flush_tlb_page(__vma, __address);			\
    2.64 +	__young;							\
    2.65 +})
    2.66 +#endif
    2.67 +
    2.68 +#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
    2.69 +static inline int ptep_test_and_clear_dirty(pte_t *ptep)
    2.70 +{
    2.71 +	pte_t pte = *ptep;
    2.72 +	if (!pte_dirty(pte))
    2.73 +		return 0;
    2.74 +	set_pte(ptep, pte_mkclean(pte));
    2.75 +	return 1;
    2.76 +}
    2.77 +#endif
    2.78 +
    2.79 +#ifndef __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
    2.80 +#define ptep_clear_flush_dirty(__vma, __address, __ptep)		\
    2.81 +({									\
    2.82 +	int __dirty = ptep_test_and_clear_dirty(__ptep);		\
    2.83 +	if (__dirty)							\
    2.84 +		flush_tlb_page(__vma, __address);			\
    2.85 +	__dirty;							\
    2.86 +})
    2.87 +#endif
    2.88 +
    2.89 +#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
    2.90 +static inline pte_t ptep_get_and_clear(pte_t *ptep)
    2.91 +{
    2.92 +	pte_t pte = *ptep;
    2.93 +	pte_clear(ptep);
    2.94 +	return pte;
    2.95 +}
    2.96 +#endif
    2.97 +
    2.98 +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
    2.99 +#define ptep_clear_flush(__vma, __address, __ptep)			\
   2.100 +({									\
   2.101 +	pte_t __pte = ptep_get_and_clear(__ptep);			\
   2.102 +	flush_tlb_page(__vma, __address);				\
   2.103 +	__pte;								\
   2.104 +})
   2.105 +#endif
   2.106 +
   2.107 +#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
   2.108 +static inline void ptep_set_wrprotect(pte_t *ptep)
   2.109 +{
   2.110 +	pte_t old_pte = *ptep;
   2.111 +	set_pte(ptep, pte_wrprotect(old_pte));
   2.112 +}
   2.113 +#endif
   2.114 +
   2.115 +#ifndef __HAVE_ARCH_PTEP_MKDIRTY
   2.116 +static inline void ptep_mkdirty(pte_t *ptep)
   2.117 +{
   2.118 +	pte_t old_pte = *ptep;
   2.119 +	set_pte(ptep, pte_mkdirty(old_pte));
   2.120 +}
   2.121 +#endif
   2.122 +
   2.123 +#ifndef __HAVE_ARCH_PTE_SAME
   2.124 +#define pte_same(A,B)	(pte_val(A) == pte_val(B))
   2.125 +#endif
   2.126 +
   2.127 +#ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY
   2.128 +#define page_test_and_clear_dirty(page) (0)
   2.129 +#endif
   2.130 +
   2.131 +#ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
   2.132 +#define page_test_and_clear_young(page) (0)
   2.133 +#endif
   2.134 +
   2.135 +#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
   2.136 +#define pgd_offset_gate(mm, addr)	pgd_offset(mm, addr)
   2.137 +#endif
   2.138 +
   2.139 +#endif /* _ASM_GENERIC_PGTABLE_H */
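The new header follows the usual __HAVE_ARCH_* override pattern: asm-generic/pgtable.h is included at the end of an architecture's own pgtable.h, and any primitive the architecture already defines (together with its __HAVE_ARCH_* marker) compiles the generic fallback out. A hedged sketch of such an override (arch_set_new_pte() is illustrative, not a real kernel interface):

    /* In an architecture's <asm/pgtable.h>, before <asm-generic/pgtable.h>
     * is pulled in at the bottom of the file:
     */
    #define __HAVE_ARCH_PTEP_ESTABLISH_NEW
    #define ptep_establish_new(__vma, __address, __ptep, __entry)	\
    do {								\
    	arch_set_new_pte((__ptep), (__entry));				\
    } while (0)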
     3.1 --- a/linux-2.6.8.1-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Fri Aug 27 13:29:44 2004 +0000
     3.2 +++ b/linux-2.6.8.1-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Fri Aug 27 15:13:26 2004 +0000
     3.3 @@ -436,12 +436,31 @@ extern pte_t *lookup_address(unsigned lo
     3.4  			    HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG); \
     3.5  			} else {                                          \
     3.6                              xen_l1_entry_update((__ptep), (__entry).pte_low); \
     3.7 +			    flush_tlb_page(__vma, __address);             \
     3.8  			}                                                 \
     3.9  		}							  \
    3.10  	} while (0)
    3.11  
    3.12  #endif
    3.13  
    3.14 +#define __HAVE_ARCH_PTEP_ESTABLISH
    3.15 +#define ptep_establish(__vma, __address, __ptep, __entry)		\
    3.16 +do {				  					\
    3.17 +	ptep_set_access_flags(__vma, __address, __ptep, __entry, 1);	\
    3.18 +} while (0)
    3.19 +
    3.20 +#define __HAVE_ARCH_PTEP_ESTABLISH_NEW
    3.21 +#define ptep_establish_new(__vma, __address, __ptep, __entry)		\
    3.22 +do {				  					\
    3.23 +	if ( likely((__vma)->vm_mm == current->mm) ) {			\
    3.24 +		xen_flush_page_update_queue();				\
    3.25 +		HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT,	\
    3.26 +					     __entry, 0);		\
    3.27 +	} else {							\
    3.28 +		xen_l1_entry_update((__ptep), (__entry).pte_low);	\
    3.29 +	}								\
    3.30 +} while (0)
    3.31 +
    3.32  /* Encode and de-code a swap entry */
    3.33  #define __swp_type(x)			(((x).val >> 1) & 0x1f)
    3.34  #define __swp_offset(x)			((x).val >> 8)
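In the Xen i386 version above, the interesting case is a fault in the currently loaded address space: pending queued page-table updates are flushed and the new PTE is written with one HYPERVISOR_update_va_mapping() hypercall, passing flags of 0 because a new mapping has no stale TLB entry to invalidate; for a foreign mm the write goes through the batched L1-entry update mechanism instead. The same decision written out as a function, purely for readability (the wrapper name is illustrative; the calls are the ones the macro uses):

    /* Illustrative expansion of the Xen ptep_establish_new() macro;
     * xen_ptep_establish_new_sketch() itself is not part of the patch.
     */
    static inline void xen_ptep_establish_new_sketch(struct vm_area_struct *vma,
    						     unsigned long address,
    						     pte_t *ptep, pte_t entry)
    {
    	if (likely(vma->vm_mm == current->mm)) {
    		/* Current address space: flush queued updates, then have
    		 * the hypervisor install the PTE directly.  Flags are 0 -
    		 * nothing to invalidate for a brand-new mapping.
    		 */
    		xen_flush_page_update_queue();
    		HYPERVISOR_update_va_mapping(address >> PAGE_SHIFT, entry, 0);
    	} else {
    		/* Not the current address space: queue an L1 entry update. */
    		xen_l1_entry_update(ptep, entry.pte_low);
    	}
    }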
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/linux-2.6.8.1-xen-sparse/mm/memory.c	Fri Aug 27 15:13:26 2004 +0000
     4.3 @@ -0,0 +1,1822 @@
     4.4 +/*
     4.5 + *  linux/mm/memory.c
     4.6 + *
     4.7 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     4.8 + */
     4.9 +
    4.10 +/*
    4.11 + * demand-loading started 01.12.91 - seems it is high on the list of
    4.12 + * things wanted, and it should be easy to implement. - Linus
    4.13 + */
    4.14 +
    4.15 +/*
     4.16 + * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
    4.17 + * pages started 02.12.91, seems to work. - Linus.
    4.18 + *
    4.19 + * Tested sharing by executing about 30 /bin/sh: under the old kernel it
    4.20 + * would have taken more than the 6M I have free, but it worked well as
    4.21 + * far as I could see.
    4.22 + *
    4.23 + * Also corrected some "invalidate()"s - I wasn't doing enough of them.
    4.24 + */
    4.25 +
    4.26 +/*
    4.27 + * Real VM (paging to/from disk) started 18.12.91. Much more work and
    4.28 + * thought has to go into this. Oh, well..
    4.29 + * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
    4.30 + *		Found it. Everything seems to work now.
    4.31 + * 20.12.91  -  Ok, making the swap-device changeable like the root.
    4.32 + */
    4.33 +
    4.34 +/*
    4.35 + * 05.04.94  -  Multi-page memory management added for v1.1.
    4.36 + * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
    4.37 + *
    4.38 + * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
    4.39 + *		(Gerhard.Wichert@pdb.siemens.de)
    4.40 + */
    4.41 +
    4.42 +#include <linux/kernel_stat.h>
    4.43 +#include <linux/mm.h>
    4.44 +#include <linux/hugetlb.h>
    4.45 +#include <linux/mman.h>
    4.46 +#include <linux/swap.h>
    4.47 +#include <linux/highmem.h>
    4.48 +#include <linux/pagemap.h>
    4.49 +#include <linux/rmap.h>
    4.50 +#include <linux/module.h>
    4.51 +#include <linux/init.h>
    4.52 +
    4.53 +#include <asm/pgalloc.h>
    4.54 +#include <asm/uaccess.h>
    4.55 +#include <asm/tlb.h>
    4.56 +#include <asm/tlbflush.h>
    4.57 +#include <asm/pgtable.h>
    4.58 +
    4.59 +#include <linux/swapops.h>
    4.60 +#include <linux/elf.h>
    4.61 +
    4.62 +#ifndef CONFIG_DISCONTIGMEM
    4.63 +/* use the per-pgdat data instead for discontigmem - mbligh */
    4.64 +unsigned long max_mapnr;
    4.65 +struct page *mem_map;
    4.66 +
    4.67 +EXPORT_SYMBOL(max_mapnr);
    4.68 +EXPORT_SYMBOL(mem_map);
    4.69 +#endif
    4.70 +
    4.71 +unsigned long num_physpages;
    4.72 +/*
    4.73 + * A number of key systems in x86 including ioremap() rely on the assumption
     4.74 + * that high_memory defines the upper bound on direct map memory, the end
    4.75 + * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
    4.76 + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
    4.77 + * and ZONE_HIGHMEM.
    4.78 + */
    4.79 +void * high_memory;
    4.80 +struct page *highmem_start_page;
    4.81 +unsigned long vmalloc_earlyreserve;
    4.82 +
    4.83 +EXPORT_SYMBOL(num_physpages);
    4.84 +EXPORT_SYMBOL(highmem_start_page);
    4.85 +EXPORT_SYMBOL(high_memory);
    4.86 +EXPORT_SYMBOL(vmalloc_earlyreserve);
    4.87 +
    4.88 +/*
    4.89 + * We special-case the C-O-W ZERO_PAGE, because it's such
    4.90 + * a common occurrence (no need to read the page to know
    4.91 + * that it's zero - better for the cache and memory subsystem).
    4.92 + */
    4.93 +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
    4.94 +{
    4.95 +	if (from == ZERO_PAGE(address)) {
    4.96 +		clear_user_highpage(to, address);
    4.97 +		return;
    4.98 +	}
    4.99 +	copy_user_highpage(to, from, address);
   4.100 +}
   4.101 +
   4.102 +/*
   4.103 + * Note: this doesn't free the actual pages themselves. That
   4.104 + * has been handled earlier when unmapping all the memory regions.
   4.105 + */
   4.106 +static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir)
   4.107 +{
   4.108 +	struct page *page;
   4.109 +
   4.110 +	if (pmd_none(*dir))
   4.111 +		return;
   4.112 +	if (unlikely(pmd_bad(*dir))) {
   4.113 +		pmd_ERROR(*dir);
   4.114 +		pmd_clear(dir);
   4.115 +		return;
   4.116 +	}
   4.117 +	page = pmd_page(*dir);
   4.118 +	pmd_clear(dir);
   4.119 +	dec_page_state(nr_page_table_pages);
   4.120 +	pte_free_tlb(tlb, page);
   4.121 +}
   4.122 +
   4.123 +static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
   4.124 +{
   4.125 +	int j;
   4.126 +	pmd_t * pmd;
   4.127 +
   4.128 +	if (pgd_none(*dir))
   4.129 +		return;
   4.130 +	if (unlikely(pgd_bad(*dir))) {
   4.131 +		pgd_ERROR(*dir);
   4.132 +		pgd_clear(dir);
   4.133 +		return;
   4.134 +	}
   4.135 +	pmd = pmd_offset(dir, 0);
   4.136 +	pgd_clear(dir);
   4.137 +	for (j = 0; j < PTRS_PER_PMD ; j++)
   4.138 +		free_one_pmd(tlb, pmd+j);
   4.139 +	pmd_free_tlb(tlb, pmd);
   4.140 +}
   4.141 +
   4.142 +/*
   4.143 + * This function clears all user-level page tables of a process - this
   4.144 + * is needed by execve(), so that old pages aren't in the way.
   4.145 + *
   4.146 + * Must be called with pagetable lock held.
   4.147 + */
   4.148 +void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
   4.149 +{
   4.150 +	pgd_t * page_dir = tlb->mm->pgd;
   4.151 +
   4.152 +	page_dir += first;
   4.153 +	do {
   4.154 +		free_one_pgd(tlb, page_dir);
   4.155 +		page_dir++;
   4.156 +	} while (--nr);
   4.157 +}
   4.158 +
   4.159 +pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
   4.160 +{
   4.161 +	if (!pmd_present(*pmd)) {
   4.162 +		struct page *new;
   4.163 +
   4.164 +		spin_unlock(&mm->page_table_lock);
   4.165 +		new = pte_alloc_one(mm, address);
   4.166 +		spin_lock(&mm->page_table_lock);
   4.167 +		if (!new)
   4.168 +			return NULL;
   4.169 +
   4.170 +		/*
   4.171 +		 * Because we dropped the lock, we should re-check the
   4.172 +		 * entry, as somebody else could have populated it..
   4.173 +		 */
   4.174 +		if (pmd_present(*pmd)) {
   4.175 +			pte_free(new);
   4.176 +			goto out;
   4.177 +		}
   4.178 +		inc_page_state(nr_page_table_pages);
   4.179 +		pmd_populate(mm, pmd, new);
   4.180 +	}
   4.181 +out:
   4.182 +	return pte_offset_map(pmd, address);
   4.183 +}
   4.184 +
   4.185 +pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
   4.186 +{
   4.187 +	if (!pmd_present(*pmd)) {
   4.188 +		pte_t *new;
   4.189 +
   4.190 +		spin_unlock(&mm->page_table_lock);
   4.191 +		new = pte_alloc_one_kernel(mm, address);
   4.192 +		spin_lock(&mm->page_table_lock);
   4.193 +		if (!new)
   4.194 +			return NULL;
   4.195 +
   4.196 +		/*
   4.197 +		 * Because we dropped the lock, we should re-check the
   4.198 +		 * entry, as somebody else could have populated it..
   4.199 +		 */
   4.200 +		if (pmd_present(*pmd)) {
   4.201 +			pte_free_kernel(new);
   4.202 +			goto out;
   4.203 +		}
   4.204 +		pmd_populate_kernel(mm, pmd, new);
   4.205 +	}
   4.206 +out:
   4.207 +	return pte_offset_kernel(pmd, address);
   4.208 +}
   4.209 +#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
   4.210 +#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
   4.211 +
   4.212 +/*
   4.213 + * copy one vm_area from one task to the other. Assumes the page tables
   4.214 + * already present in the new task to be cleared in the whole range
   4.215 + * covered by this vma.
   4.216 + *
   4.217 + * 08Jan98 Merged into one routine from several inline routines to reduce
   4.218 + *         variable count and make things faster. -jj
   4.219 + *
   4.220 + * dst->page_table_lock is held on entry and exit,
   4.221 + * but may be dropped within pmd_alloc() and pte_alloc_map().
   4.222 + */
   4.223 +int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
   4.224 +			struct vm_area_struct *vma)
   4.225 +{
   4.226 +	pgd_t * src_pgd, * dst_pgd;
   4.227 +	unsigned long address = vma->vm_start;
   4.228 +	unsigned long end = vma->vm_end;
   4.229 +	unsigned long cow;
   4.230 +
   4.231 +	if (is_vm_hugetlb_page(vma))
   4.232 +		return copy_hugetlb_page_range(dst, src, vma);
   4.233 +
   4.234 +	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
   4.235 +	src_pgd = pgd_offset(src, address)-1;
   4.236 +	dst_pgd = pgd_offset(dst, address)-1;
   4.237 +
   4.238 +	for (;;) {
   4.239 +		pmd_t * src_pmd, * dst_pmd;
   4.240 +
   4.241 +		src_pgd++; dst_pgd++;
   4.242 +		
   4.243 +		/* copy_pmd_range */
   4.244 +		
   4.245 +		if (pgd_none(*src_pgd))
   4.246 +			goto skip_copy_pmd_range;
   4.247 +		if (unlikely(pgd_bad(*src_pgd))) {
   4.248 +			pgd_ERROR(*src_pgd);
   4.249 +			pgd_clear(src_pgd);
   4.250 +skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
   4.251 +			if (!address || (address >= end))
   4.252 +				goto out;
   4.253 +			continue;
   4.254 +		}
   4.255 +
   4.256 +		src_pmd = pmd_offset(src_pgd, address);
   4.257 +		dst_pmd = pmd_alloc(dst, dst_pgd, address);
   4.258 +		if (!dst_pmd)
   4.259 +			goto nomem;
   4.260 +
   4.261 +		do {
   4.262 +			pte_t * src_pte, * dst_pte;
   4.263 +		
   4.264 +			/* copy_pte_range */
   4.265 +		
   4.266 +			if (pmd_none(*src_pmd))
   4.267 +				goto skip_copy_pte_range;
   4.268 +			if (unlikely(pmd_bad(*src_pmd))) {
   4.269 +				pmd_ERROR(*src_pmd);
   4.270 +				pmd_clear(src_pmd);
   4.271 +skip_copy_pte_range:
   4.272 +				address = (address + PMD_SIZE) & PMD_MASK;
   4.273 +				if (address >= end)
   4.274 +					goto out;
   4.275 +				goto cont_copy_pmd_range;
   4.276 +			}
   4.277 +
   4.278 +			dst_pte = pte_alloc_map(dst, dst_pmd, address);
   4.279 +			if (!dst_pte)
   4.280 +				goto nomem;
   4.281 +			spin_lock(&src->page_table_lock);	
   4.282 +			src_pte = pte_offset_map_nested(src_pmd, address);
   4.283 +			do {
   4.284 +				pte_t pte = *src_pte;
   4.285 +				struct page *page;
   4.286 +				unsigned long pfn;
   4.287 +
   4.288 +				/* copy_one_pte */
   4.289 +
   4.290 +				if (pte_none(pte))
   4.291 +					goto cont_copy_pte_range_noset;
   4.292 +				/* pte contains position in swap, so copy. */
   4.293 +				if (!pte_present(pte)) {
   4.294 +					if (!pte_file(pte))
   4.295 +						swap_duplicate(pte_to_swp_entry(pte));
   4.296 +					set_pte(dst_pte, pte);
   4.297 +					goto cont_copy_pte_range_noset;
   4.298 +				}
   4.299 +				pfn = pte_pfn(pte);
   4.300 +				/* the pte points outside of valid memory, the
   4.301 +				 * mapping is assumed to be good, meaningful
   4.302 +				 * and not mapped via rmap - duplicate the
   4.303 +				 * mapping as is.
   4.304 +				 */
   4.305 +				page = NULL;
   4.306 +				if (pfn_valid(pfn)) 
   4.307 +					page = pfn_to_page(pfn); 
   4.308 +
   4.309 +				if (!page || PageReserved(page)) {
   4.310 +					set_pte(dst_pte, pte);
   4.311 +					goto cont_copy_pte_range_noset;
   4.312 +				}
   4.313 +
   4.314 +				/*
   4.315 +				 * If it's a COW mapping, write protect it both
   4.316 +				 * in the parent and the child
   4.317 +				 */
   4.318 +				if (cow) {
   4.319 +					ptep_set_wrprotect(src_pte);
   4.320 +					pte = *src_pte;
   4.321 +				}
   4.322 +
   4.323 +				/*
   4.324 +				 * If it's a shared mapping, mark it clean in
   4.325 +				 * the child
   4.326 +				 */
   4.327 +				if (vma->vm_flags & VM_SHARED)
   4.328 +					pte = pte_mkclean(pte);
   4.329 +				pte = pte_mkold(pte);
   4.330 +				get_page(page);
   4.331 +				dst->rss++;
   4.332 +				set_pte(dst_pte, pte);
   4.333 +				page_dup_rmap(page);
   4.334 +cont_copy_pte_range_noset:
   4.335 +				address += PAGE_SIZE;
   4.336 +				if (address >= end) {
   4.337 +					pte_unmap_nested(src_pte);
   4.338 +					pte_unmap(dst_pte);
   4.339 +					goto out_unlock;
   4.340 +				}
   4.341 +				src_pte++;
   4.342 +				dst_pte++;
   4.343 +			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
   4.344 +			pte_unmap_nested(src_pte-1);
   4.345 +			pte_unmap(dst_pte-1);
   4.346 +			spin_unlock(&src->page_table_lock);
   4.347 +			cond_resched_lock(&dst->page_table_lock);
   4.348 +cont_copy_pmd_range:
   4.349 +			src_pmd++;
   4.350 +			dst_pmd++;
   4.351 +		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
   4.352 +	}
   4.353 +out_unlock:
   4.354 +	spin_unlock(&src->page_table_lock);
   4.355 +out:
   4.356 +	return 0;
   4.357 +nomem:
   4.358 +	return -ENOMEM;
   4.359 +}
   4.360 +
   4.361 +static void zap_pte_range(struct mmu_gather *tlb,
   4.362 +		pmd_t *pmd, unsigned long address,
   4.363 +		unsigned long size, struct zap_details *details)
   4.364 +{
   4.365 +	unsigned long offset;
   4.366 +	pte_t *ptep;
   4.367 +
   4.368 +	if (pmd_none(*pmd))
   4.369 +		return;
   4.370 +	if (unlikely(pmd_bad(*pmd))) {
   4.371 +		pmd_ERROR(*pmd);
   4.372 +		pmd_clear(pmd);
   4.373 +		return;
   4.374 +	}
   4.375 +	ptep = pte_offset_map(pmd, address);
   4.376 +	offset = address & ~PMD_MASK;
   4.377 +	if (offset + size > PMD_SIZE)
   4.378 +		size = PMD_SIZE - offset;
   4.379 +	size &= PAGE_MASK;
   4.380 +	if (details && !details->check_mapping && !details->nonlinear_vma)
   4.381 +		details = NULL;
   4.382 +	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
   4.383 +		pte_t pte = *ptep;
   4.384 +		if (pte_none(pte))
   4.385 +			continue;
   4.386 +		if (pte_present(pte)) {
   4.387 +			struct page *page = NULL;
   4.388 +			unsigned long pfn = pte_pfn(pte);
   4.389 +			if (pfn_valid(pfn)) {
   4.390 +				page = pfn_to_page(pfn);
   4.391 +				if (PageReserved(page))
   4.392 +					page = NULL;
   4.393 +			}
   4.394 +			if (unlikely(details) && page) {
   4.395 +				/*
   4.396 +				 * unmap_shared_mapping_pages() wants to
   4.397 +				 * invalidate cache without truncating:
   4.398 +				 * unmap shared but keep private pages.
   4.399 +				 */
   4.400 +				if (details->check_mapping &&
   4.401 +				    details->check_mapping != page->mapping)
   4.402 +					continue;
   4.403 +				/*
   4.404 +				 * Each page->index must be checked when
   4.405 +				 * invalidating or truncating nonlinear.
   4.406 +				 */
   4.407 +				if (details->nonlinear_vma &&
   4.408 +				    (page->index < details->first_index ||
   4.409 +				     page->index > details->last_index))
   4.410 +					continue;
   4.411 +			}
   4.412 +			pte = ptep_get_and_clear(ptep);
   4.413 +			tlb_remove_tlb_entry(tlb, ptep, address+offset);
   4.414 +			if (unlikely(!page))
   4.415 +				continue;
   4.416 +			if (unlikely(details) && details->nonlinear_vma
   4.417 +			    && linear_page_index(details->nonlinear_vma,
   4.418 +					address+offset) != page->index)
   4.419 +				set_pte(ptep, pgoff_to_pte(page->index));
   4.420 +			if (pte_dirty(pte))
   4.421 +				set_page_dirty(page);
   4.422 +			if (pte_young(pte) && !PageAnon(page))
   4.423 +				mark_page_accessed(page);
   4.424 +			tlb->freed++;
   4.425 +			page_remove_rmap(page);
   4.426 +			tlb_remove_page(tlb, page);
   4.427 +			continue;
   4.428 +		}
   4.429 +		/*
   4.430 +		 * If details->check_mapping, we leave swap entries;
   4.431 +		 * if details->nonlinear_vma, we leave file entries.
   4.432 +		 */
   4.433 +		if (unlikely(details))
   4.434 +			continue;
   4.435 +		if (!pte_file(pte))
   4.436 +			free_swap_and_cache(pte_to_swp_entry(pte));
   4.437 +		pte_clear(ptep);
   4.438 +	}
   4.439 +	pte_unmap(ptep-1);
   4.440 +}
   4.441 +
   4.442 +static void zap_pmd_range(struct mmu_gather *tlb,
   4.443 +		pgd_t * dir, unsigned long address,
   4.444 +		unsigned long size, struct zap_details *details)
   4.445 +{
   4.446 +	pmd_t * pmd;
   4.447 +	unsigned long end;
   4.448 +
   4.449 +	if (pgd_none(*dir))
   4.450 +		return;
   4.451 +	if (unlikely(pgd_bad(*dir))) {
   4.452 +		pgd_ERROR(*dir);
   4.453 +		pgd_clear(dir);
   4.454 +		return;
   4.455 +	}
   4.456 +	pmd = pmd_offset(dir, address);
   4.457 +	end = address + size;
   4.458 +	if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
   4.459 +		end = ((address + PGDIR_SIZE) & PGDIR_MASK);
   4.460 +	do {
   4.461 +		zap_pte_range(tlb, pmd, address, end - address, details);
   4.462 +		address = (address + PMD_SIZE) & PMD_MASK; 
   4.463 +		pmd++;
   4.464 +	} while (address && (address < end));
   4.465 +}
   4.466 +
   4.467 +static void unmap_page_range(struct mmu_gather *tlb,
   4.468 +		struct vm_area_struct *vma, unsigned long address,
   4.469 +		unsigned long end, struct zap_details *details)
   4.470 +{
   4.471 +	pgd_t * dir;
   4.472 +
   4.473 +	BUG_ON(address >= end);
   4.474 +	dir = pgd_offset(vma->vm_mm, address);
   4.475 +	tlb_start_vma(tlb, vma);
   4.476 +	do {
   4.477 +		zap_pmd_range(tlb, dir, address, end - address, details);
   4.478 +		address = (address + PGDIR_SIZE) & PGDIR_MASK;
   4.479 +		dir++;
   4.480 +	} while (address && (address < end));
   4.481 +	tlb_end_vma(tlb, vma);
   4.482 +}
   4.483 +
   4.484 +/* Dispose of an entire struct mmu_gather per rescheduling point */
   4.485 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
   4.486 +#define ZAP_BLOCK_SIZE	(FREE_PTE_NR * PAGE_SIZE)
   4.487 +#endif
   4.488 +
   4.489 +/* For UP, 256 pages at a time gives nice low latency */
   4.490 +#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
   4.491 +#define ZAP_BLOCK_SIZE	(256 * PAGE_SIZE)
   4.492 +#endif
   4.493 +
   4.494 +/* No preempt: go for improved straight-line efficiency */
   4.495 +#if !defined(CONFIG_PREEMPT)
   4.496 +#define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
   4.497 +#endif
   4.498 +
   4.499 +/**
   4.500 + * unmap_vmas - unmap a range of memory covered by a list of vma's
   4.501 + * @tlbp: address of the caller's struct mmu_gather
   4.502 + * @mm: the controlling mm_struct
   4.503 + * @vma: the starting vma
   4.504 + * @start_addr: virtual address at which to start unmapping
   4.505 + * @end_addr: virtual address at which to end unmapping
   4.506 + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
   4.507 + * @details: details of nonlinear truncation or shared cache invalidation
   4.508 + *
   4.509 + * Returns the number of vma's which were covered by the unmapping.
   4.510 + *
   4.511 + * Unmap all pages in the vma list.  Called under page_table_lock.
   4.512 + *
   4.513 + * We aim to not hold page_table_lock for too long (for scheduling latency
   4.514 + * reasons).  So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
   4.515 + * return the ending mmu_gather to the caller.
   4.516 + *
   4.517 + * Only addresses between `start' and `end' will be unmapped.
   4.518 + *
   4.519 + * The VMA list must be sorted in ascending virtual address order.
   4.520 + *
   4.521 + * unmap_vmas() assumes that the caller will flush the whole unmapped address
   4.522 + * range after unmap_vmas() returns.  So the only responsibility here is to
   4.523 + * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
   4.524 + * drops the lock and schedules.
   4.525 + */
   4.526 +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
   4.527 +		struct vm_area_struct *vma, unsigned long start_addr,
   4.528 +		unsigned long end_addr, unsigned long *nr_accounted,
   4.529 +		struct zap_details *details)
   4.530 +{
   4.531 +	unsigned long zap_bytes = ZAP_BLOCK_SIZE;
   4.532 +	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
   4.533 +	int tlb_start_valid = 0;
   4.534 +	int ret = 0;
   4.535 +	int atomic = details && details->atomic;
   4.536 +
   4.537 +	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
   4.538 +		unsigned long start;
   4.539 +		unsigned long end;
   4.540 +
   4.541 +		start = max(vma->vm_start, start_addr);
   4.542 +		if (start >= vma->vm_end)
   4.543 +			continue;
   4.544 +		end = min(vma->vm_end, end_addr);
   4.545 +		if (end <= vma->vm_start)
   4.546 +			continue;
   4.547 +
   4.548 +		if (vma->vm_flags & VM_ACCOUNT)
   4.549 +			*nr_accounted += (end - start) >> PAGE_SHIFT;
   4.550 +
   4.551 +		ret++;
   4.552 +		while (start != end) {
   4.553 +			unsigned long block;
   4.554 +
   4.555 +			if (!tlb_start_valid) {
   4.556 +				tlb_start = start;
   4.557 +				tlb_start_valid = 1;
   4.558 +			}
   4.559 +
   4.560 +			if (is_vm_hugetlb_page(vma)) {
   4.561 +				block = end - start;
   4.562 +				unmap_hugepage_range(vma, start, end);
   4.563 +			} else {
   4.564 +				block = min(zap_bytes, end - start);
   4.565 +				unmap_page_range(*tlbp, vma, start,
   4.566 +						start + block, details);
   4.567 +			}
   4.568 +
   4.569 +			start += block;
   4.570 +			zap_bytes -= block;
   4.571 +			if ((long)zap_bytes > 0)
   4.572 +				continue;
   4.573 +			if (!atomic && need_resched()) {
   4.574 +				int fullmm = tlb_is_full_mm(*tlbp);
   4.575 +				tlb_finish_mmu(*tlbp, tlb_start, start);
   4.576 +				cond_resched_lock(&mm->page_table_lock);
   4.577 +				*tlbp = tlb_gather_mmu(mm, fullmm);
   4.578 +				tlb_start_valid = 0;
   4.579 +			}
   4.580 +			zap_bytes = ZAP_BLOCK_SIZE;
   4.581 +		}
   4.582 +	}
   4.583 +	return ret;
   4.584 +}
   4.585 +
   4.586 +/**
   4.587 + * zap_page_range - remove user pages in a given range
   4.588 + * @vma: vm_area_struct holding the applicable pages
   4.589 + * @address: starting address of pages to zap
   4.590 + * @size: number of bytes to zap
   4.591 + * @details: details of nonlinear truncation or shared cache invalidation
   4.592 + */
   4.593 +void zap_page_range(struct vm_area_struct *vma, unsigned long address,
   4.594 +		unsigned long size, struct zap_details *details)
   4.595 +{
   4.596 +	struct mm_struct *mm = vma->vm_mm;
   4.597 +	struct mmu_gather *tlb;
   4.598 +	unsigned long end = address + size;
   4.599 +	unsigned long nr_accounted = 0;
   4.600 +
   4.601 +	if (is_vm_hugetlb_page(vma)) {
   4.602 +		zap_hugepage_range(vma, address, size);
   4.603 +		return;
   4.604 +	}
   4.605 +
   4.606 +	lru_add_drain();
   4.607 +	spin_lock(&mm->page_table_lock);
   4.608 +	tlb = tlb_gather_mmu(mm, 0);
   4.609 +	unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
   4.610 +	tlb_finish_mmu(tlb, address, end);
   4.611 +	spin_unlock(&mm->page_table_lock);
   4.612 +}
   4.613 +
   4.614 +/*
   4.615 + * Do a quick page-table lookup for a single page.
   4.616 + * mm->page_table_lock must be held.
   4.617 + */
   4.618 +struct page *
   4.619 +follow_page(struct mm_struct *mm, unsigned long address, int write) 
   4.620 +{
   4.621 +	pgd_t *pgd;
   4.622 +	pmd_t *pmd;
   4.623 +	pte_t *ptep, pte;
   4.624 +	unsigned long pfn;
   4.625 +	struct page *page;
   4.626 +
   4.627 +	page = follow_huge_addr(mm, address, write);
   4.628 +	if (! IS_ERR(page))
   4.629 +		return page;
   4.630 +
   4.631 +	pgd = pgd_offset(mm, address);
   4.632 +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
   4.633 +		goto out;
   4.634 +
   4.635 +	pmd = pmd_offset(pgd, address);
   4.636 +	if (pmd_none(*pmd))
   4.637 +		goto out;
   4.638 +	if (pmd_huge(*pmd))
   4.639 +		return follow_huge_pmd(mm, address, pmd, write);
   4.640 +	if (unlikely(pmd_bad(*pmd)))
   4.641 +		goto out;
   4.642 +
   4.643 +	ptep = pte_offset_map(pmd, address);
   4.644 +	if (!ptep)
   4.645 +		goto out;
   4.646 +
   4.647 +	pte = *ptep;
   4.648 +	pte_unmap(ptep);
   4.649 +	if (pte_present(pte)) {
   4.650 +		if (write && !pte_write(pte))
   4.651 +			goto out;
   4.652 +		pfn = pte_pfn(pte);
   4.653 +		if (pfn_valid(pfn)) {
   4.654 +			page = pfn_to_page(pfn);
   4.655 +			if (write && !pte_dirty(pte) && !PageDirty(page))
   4.656 +				set_page_dirty(page);
   4.657 +			mark_page_accessed(page);
   4.658 +			return page;
   4.659 +		}
   4.660 +	}
   4.661 +
   4.662 +out:
   4.663 +	return NULL;
   4.664 +}
   4.665 +
   4.666 +/* 
   4.667 + * Given a physical address, is there a useful struct page pointing to
   4.668 + * it?  This may become more complex in the future if we start dealing
   4.669 + * with IO-aperture pages for direct-IO.
   4.670 + */
   4.671 +
   4.672 +static inline struct page *get_page_map(struct page *page)
   4.673 +{
   4.674 +	if (!pfn_valid(page_to_pfn(page)))
   4.675 +		return NULL;
   4.676 +	return page;
   4.677 +}
   4.678 +
   4.679 +
   4.680 +static inline int
   4.681 +untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
   4.682 +			 unsigned long address)
   4.683 +{
   4.684 +	pgd_t *pgd;
   4.685 +	pmd_t *pmd;
   4.686 +
   4.687 +	/* Check if the vma is for an anonymous mapping. */
   4.688 +	if (vma->vm_ops && vma->vm_ops->nopage)
   4.689 +		return 0;
   4.690 +
   4.691 +	/* Check if page directory entry exists. */
   4.692 +	pgd = pgd_offset(mm, address);
   4.693 +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
   4.694 +		return 1;
   4.695 +
   4.696 +	/* Check if page middle directory entry exists. */
   4.697 +	pmd = pmd_offset(pgd, address);
   4.698 +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
   4.699 +		return 1;
   4.700 +
   4.701 +	/* There is a pte slot for 'address' in 'mm'. */
   4.702 +	return 0;
   4.703 +}
   4.704 +
   4.705 +
   4.706 +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
   4.707 +		unsigned long start, int len, int write, int force,
   4.708 +		struct page **pages, struct vm_area_struct **vmas)
   4.709 +{
   4.710 +	int i;
   4.711 +	unsigned int flags;
   4.712 +
   4.713 +	/* 
   4.714 +	 * Require read or write permissions.
   4.715 +	 * If 'force' is set, we only require the "MAY" flags.
   4.716 +	 */
   4.717 +	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
   4.718 +	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
   4.719 +	i = 0;
   4.720 +
   4.721 +	do {
   4.722 +		struct vm_area_struct *	vma;
   4.723 +
   4.724 +		vma = find_extend_vma(mm, start);
   4.725 +		if (!vma && in_gate_area(tsk, start)) {
   4.726 +			unsigned long pg = start & PAGE_MASK;
   4.727 +			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
   4.728 +			pgd_t *pgd;
   4.729 +			pmd_t *pmd;
   4.730 +			pte_t *pte;
   4.731 +			if (write) /* user gate pages are read-only */
   4.732 +				return i ? : -EFAULT;
   4.733 +			pgd = pgd_offset_gate(mm, pg);
   4.734 +			if (!pgd)
   4.735 +				return i ? : -EFAULT;
   4.736 +			pmd = pmd_offset(pgd, pg);
   4.737 +			if (!pmd)
   4.738 +				return i ? : -EFAULT;
   4.739 +			pte = pte_offset_map(pmd, pg);
   4.740 +			if (!pte)
   4.741 +				return i ? : -EFAULT;
   4.742 +			if (!pte_present(*pte)) {
   4.743 +				pte_unmap(pte);
   4.744 +				return i ? : -EFAULT;
   4.745 +			}
   4.746 +			if (pages) {
   4.747 +				pages[i] = pte_page(*pte);
   4.748 +				get_page(pages[i]);
   4.749 +			}
   4.750 +			pte_unmap(pte);
   4.751 +			if (vmas)
   4.752 +				vmas[i] = gate_vma;
   4.753 +			i++;
   4.754 +			start += PAGE_SIZE;
   4.755 +			len--;
   4.756 +			continue;
   4.757 +		}
   4.758 +
   4.759 +		if (!vma || (pages && (vma->vm_flags & VM_IO))
   4.760 +				|| !(flags & vma->vm_flags))
   4.761 +			return i ? : -EFAULT;
   4.762 +
   4.763 +		if (is_vm_hugetlb_page(vma)) {
   4.764 +			i = follow_hugetlb_page(mm, vma, pages, vmas,
   4.765 +						&start, &len, i);
   4.766 +			continue;
   4.767 +		}
   4.768 +		spin_lock(&mm->page_table_lock);
   4.769 +		do {
   4.770 +			struct page *map;
   4.771 +			int lookup_write = write;
   4.772 +			while (!(map = follow_page(mm, start, lookup_write))) {
   4.773 +				/*
   4.774 +				 * Shortcut for anonymous pages. We don't want
    4.775 +				 * to force the creation of page tables for
    4.776 +				 * insanely big anonymously mapped areas that
   4.777 +				 * nobody touched so far. This is important
   4.778 +				 * for doing a core dump for these mappings.
   4.779 +				 */
   4.780 +				if (!lookup_write &&
   4.781 +				    untouched_anonymous_page(mm,vma,start)) {
   4.782 +					map = ZERO_PAGE(start);
   4.783 +					break;
   4.784 +				}
   4.785 +				spin_unlock(&mm->page_table_lock);
   4.786 +				switch (handle_mm_fault(mm,vma,start,write)) {
   4.787 +				case VM_FAULT_MINOR:
   4.788 +					tsk->min_flt++;
   4.789 +					break;
   4.790 +				case VM_FAULT_MAJOR:
   4.791 +					tsk->maj_flt++;
   4.792 +					break;
   4.793 +				case VM_FAULT_SIGBUS:
   4.794 +					return i ? i : -EFAULT;
   4.795 +				case VM_FAULT_OOM:
   4.796 +					return i ? i : -ENOMEM;
   4.797 +				default:
   4.798 +					BUG();
   4.799 +				}
   4.800 +				/*
   4.801 +				 * Now that we have performed a write fault
   4.802 +				 * and surely no longer have a shared page we
   4.803 +				 * shouldn't write, we shouldn't ignore an
   4.804 +				 * unwritable page in the page table if
   4.805 +				 * we are forcing write access.
   4.806 +				 */
   4.807 +				lookup_write = write && !force;
   4.808 +				spin_lock(&mm->page_table_lock);
   4.809 +			}
   4.810 +			if (pages) {
   4.811 +				pages[i] = get_page_map(map);
   4.812 +				if (!pages[i]) {
   4.813 +					spin_unlock(&mm->page_table_lock);
   4.814 +					while (i--)
   4.815 +						page_cache_release(pages[i]);
   4.816 +					i = -EFAULT;
   4.817 +					goto out;
   4.818 +				}
   4.819 +				flush_dcache_page(pages[i]);
   4.820 +				if (!PageReserved(pages[i]))
   4.821 +					page_cache_get(pages[i]);
   4.822 +			}
   4.823 +			if (vmas)
   4.824 +				vmas[i] = vma;
   4.825 +			i++;
   4.826 +			start += PAGE_SIZE;
   4.827 +			len--;
   4.828 +		} while(len && start < vma->vm_end);
   4.829 +		spin_unlock(&mm->page_table_lock);
   4.830 +	} while(len);
   4.831 +out:
   4.832 +	return i;
   4.833 +}
   4.834 +
   4.835 +EXPORT_SYMBOL(get_user_pages);
   4.836 +
   4.837 +static void zeromap_pte_range(pte_t * pte, unsigned long address,
   4.838 +                                     unsigned long size, pgprot_t prot)
   4.839 +{
   4.840 +	unsigned long end;
   4.841 +
   4.842 +	address &= ~PMD_MASK;
   4.843 +	end = address + size;
   4.844 +	if (end > PMD_SIZE)
   4.845 +		end = PMD_SIZE;
   4.846 +	do {
   4.847 +		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
   4.848 +		BUG_ON(!pte_none(*pte));
   4.849 +		set_pte(pte, zero_pte);
   4.850 +		address += PAGE_SIZE;
   4.851 +		pte++;
   4.852 +	} while (address && (address < end));
   4.853 +}
   4.854 +
   4.855 +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
   4.856 +                                    unsigned long size, pgprot_t prot)
   4.857 +{
   4.858 +	unsigned long base, end;
   4.859 +
   4.860 +	base = address & PGDIR_MASK;
   4.861 +	address &= ~PGDIR_MASK;
   4.862 +	end = address + size;
   4.863 +	if (end > PGDIR_SIZE)
   4.864 +		end = PGDIR_SIZE;
   4.865 +	do {
   4.866 +		pte_t * pte = pte_alloc_map(mm, pmd, base + address);
   4.867 +		if (!pte)
   4.868 +			return -ENOMEM;
   4.869 +		zeromap_pte_range(pte, base + address, end - address, prot);
   4.870 +		pte_unmap(pte);
   4.871 +		address = (address + PMD_SIZE) & PMD_MASK;
   4.872 +		pmd++;
   4.873 +	} while (address && (address < end));
   4.874 +	return 0;
   4.875 +}
   4.876 +
   4.877 +int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, pgprot_t prot)
   4.878 +{
   4.879 +	int error = 0;
   4.880 +	pgd_t * dir;
   4.881 +	unsigned long beg = address;
   4.882 +	unsigned long end = address + size;
   4.883 +	struct mm_struct *mm = vma->vm_mm;
   4.884 +
   4.885 +	dir = pgd_offset(mm, address);
   4.886 +	flush_cache_range(vma, beg, end);
   4.887 +	if (address >= end)
   4.888 +		BUG();
   4.889 +
   4.890 +	spin_lock(&mm->page_table_lock);
   4.891 +	do {
   4.892 +		pmd_t *pmd = pmd_alloc(mm, dir, address);
   4.893 +		error = -ENOMEM;
   4.894 +		if (!pmd)
   4.895 +			break;
   4.896 +		error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
   4.897 +		if (error)
   4.898 +			break;
   4.899 +		address = (address + PGDIR_SIZE) & PGDIR_MASK;
   4.900 +		dir++;
   4.901 +	} while (address && (address < end));
   4.902 +	/*
   4.903 +	 * Why flush? zeromap_pte_range has a BUG_ON for !pte_none()
   4.904 +	 */
   4.905 +	flush_tlb_range(vma, beg, end);
   4.906 +	spin_unlock(&mm->page_table_lock);
   4.907 +	return error;
   4.908 +}
   4.909 +
   4.910 +/*
   4.911 + * maps a range of physical memory into the requested pages. the old
    4.912 + * mappings are removed. any references to nonexistent pages result
   4.913 + * in null mappings (currently treated as "copy-on-access")
   4.914 + */
   4.915 +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
   4.916 +	unsigned long phys_addr, pgprot_t prot)
   4.917 +{
   4.918 +	unsigned long end;
   4.919 +	unsigned long pfn;
   4.920 +
   4.921 +	address &= ~PMD_MASK;
   4.922 +	end = address + size;
   4.923 +	if (end > PMD_SIZE)
   4.924 +		end = PMD_SIZE;
   4.925 +	pfn = phys_addr >> PAGE_SHIFT;
   4.926 +	do {
   4.927 +		BUG_ON(!pte_none(*pte));
   4.928 +		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
   4.929 + 			set_pte(pte, pfn_pte(pfn, prot));
   4.930 +		address += PAGE_SIZE;
   4.931 +		pfn++;
   4.932 +		pte++;
   4.933 +	} while (address && (address < end));
   4.934 +}
   4.935 +
   4.936 +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
   4.937 +	unsigned long phys_addr, pgprot_t prot)
   4.938 +{
   4.939 +	unsigned long base, end;
   4.940 +
   4.941 +	base = address & PGDIR_MASK;
   4.942 +	address &= ~PGDIR_MASK;
   4.943 +	end = address + size;
   4.944 +	if (end > PGDIR_SIZE)
   4.945 +		end = PGDIR_SIZE;
   4.946 +	phys_addr -= address;
   4.947 +	do {
   4.948 +		pte_t * pte = pte_alloc_map(mm, pmd, base + address);
   4.949 +		if (!pte)
   4.950 +			return -ENOMEM;
   4.951 +		remap_pte_range(pte, base + address, end - address, address + phys_addr, prot);
   4.952 +		pte_unmap(pte);
   4.953 +		address = (address + PMD_SIZE) & PMD_MASK;
   4.954 +		pmd++;
   4.955 +	} while (address && (address < end));
   4.956 +	return 0;
   4.957 +}
   4.958 +
   4.959 +/*  Note: this is only safe if the mm semaphore is held when called. */
   4.960 +int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
   4.961 +{
   4.962 +	int error = 0;
   4.963 +	pgd_t * dir;
   4.964 +	unsigned long beg = from;
   4.965 +	unsigned long end = from + size;
   4.966 +	struct mm_struct *mm = vma->vm_mm;
   4.967 +
   4.968 +	phys_addr -= from;
   4.969 +	dir = pgd_offset(mm, from);
   4.970 +	flush_cache_range(vma, beg, end);
   4.971 +	if (from >= end)
   4.972 +		BUG();
   4.973 +
   4.974 +	spin_lock(&mm->page_table_lock);
   4.975 +	do {
   4.976 +		pmd_t *pmd = pmd_alloc(mm, dir, from);
   4.977 +		error = -ENOMEM;
   4.978 +		if (!pmd)
   4.979 +			break;
   4.980 +		error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
   4.981 +		if (error)
   4.982 +			break;
   4.983 +		from = (from + PGDIR_SIZE) & PGDIR_MASK;
   4.984 +		dir++;
   4.985 +	} while (from && (from < end));
   4.986 +	/*
   4.987 +	 * Why flush? remap_pte_range has a BUG_ON for !pte_none()
   4.988 +	 */
   4.989 +	flush_tlb_range(vma, beg, end);
   4.990 +	spin_unlock(&mm->page_table_lock);
   4.991 +	return error;
   4.992 +}
   4.993 +
   4.994 +EXPORT_SYMBOL(remap_page_range);
   4.995 +
   4.996 +/*
   4.997 + * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
   4.998 + * servicing faults for write access.  In the normal case, we always want
   4.999 + * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  4.1000 + * that do not have writing enabled, when used by access_process_vm.
  4.1001 + */
  4.1002 +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  4.1003 +{
  4.1004 +	if (likely(vma->vm_flags & VM_WRITE))
  4.1005 +		pte = pte_mkwrite(pte);
  4.1006 +	return pte;
  4.1007 +}
  4.1008 +
  4.1009 +/*
  4.1010 + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  4.1011 + */
  4.1012 +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
  4.1013 +		pte_t *page_table)
  4.1014 +{
  4.1015 +	pte_t entry;
  4.1016 +
  4.1017 +	flush_cache_page(vma, address);
  4.1018 +	entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
  4.1019 +			      vma);
  4.1020 +	ptep_establish(vma, address, page_table, entry);
  4.1021 +	update_mmu_cache(vma, address, entry);
  4.1022 +}
  4.1023 +
  4.1024 +/*
  4.1025 + * This routine handles present pages, when users try to write
  4.1026 + * to a shared page. It is done by copying the page to a new address
  4.1027 + * and decrementing the shared-page counter for the old page.
  4.1028 + *
  4.1029 + * Goto-purists beware: the only reason for goto's here is that it results
  4.1030 + * in better assembly code.. The "default" path will see no jumps at all.
  4.1031 + *
  4.1032 + * Note that this routine assumes that the protection checks have been
  4.1033 + * done by the caller (the low-level page fault routine in most cases).
  4.1034 + * Thus we can safely just mark it writable once we've done any necessary
  4.1035 + * COW.
  4.1036 + *
  4.1037 + * We also mark the page dirty at this point even though the page will
  4.1038 + * change only once the write actually happens. This avoids a few races,
  4.1039 + * and potentially makes it more efficient.
  4.1040 + *
  4.1041 + * We hold the mm semaphore and the page_table_lock on entry and exit
  4.1042 + * with the page_table_lock released.
  4.1043 + */
  4.1044 +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
  4.1045 +	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
  4.1046 +{
  4.1047 +	struct page *old_page, *new_page;
  4.1048 +	unsigned long pfn = pte_pfn(pte);
  4.1049 +	pte_t entry;
  4.1050 +
  4.1051 +	if (unlikely(!pfn_valid(pfn))) {
  4.1052 +		/*
  4.1053 +		 * This should really halt the system so it can be debugged or
  4.1054 +		 * at least the kernel stops what it's doing before it corrupts
  4.1055 +		 * data, but for the moment just pretend this is OOM.
  4.1056 +		 */
  4.1057 +		pte_unmap(page_table);
  4.1058 +		printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
  4.1059 +				address);
  4.1060 +		spin_unlock(&mm->page_table_lock);
  4.1061 +		return VM_FAULT_OOM;
  4.1062 +	}
  4.1063 +	old_page = pfn_to_page(pfn);
  4.1064 +
  4.1065 +	if (!TestSetPageLocked(old_page)) {
  4.1066 +		int reuse = can_share_swap_page(old_page);
  4.1067 +		unlock_page(old_page);
  4.1068 +		if (reuse) {
  4.1069 +			flush_cache_page(vma, address);
  4.1070 +			entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
  4.1071 +					      vma);
  4.1072 +			ptep_set_access_flags(vma, address, page_table, entry, 1);
  4.1073 +			update_mmu_cache(vma, address, entry);
  4.1074 +			pte_unmap(page_table);
  4.1075 +			spin_unlock(&mm->page_table_lock);
  4.1076 +			return VM_FAULT_MINOR;
  4.1077 +		}
  4.1078 +	}
  4.1079 +	pte_unmap(page_table);
  4.1080 +
  4.1081 +	/*
  4.1082 +	 * Ok, we need to copy. Oh, well..
  4.1083 +	 */
  4.1084 +	if (!PageReserved(old_page))
  4.1085 +		page_cache_get(old_page);
  4.1086 +	spin_unlock(&mm->page_table_lock);
  4.1087 +
  4.1088 +	if (unlikely(anon_vma_prepare(vma)))
  4.1089 +		goto no_new_page;
  4.1090 +	new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
  4.1091 +	if (!new_page)
  4.1092 +		goto no_new_page;
  4.1093 +	copy_cow_page(old_page,new_page,address);
  4.1094 +
  4.1095 +	/*
  4.1096 +	 * Re-check the pte - we dropped the lock
  4.1097 +	 */
  4.1098 +	spin_lock(&mm->page_table_lock);
  4.1099 +	page_table = pte_offset_map(pmd, address);
  4.1100 +	if (likely(pte_same(*page_table, pte))) {
  4.1101 +		if (PageReserved(old_page))
  4.1102 +			++mm->rss;
  4.1103 +		else
  4.1104 +			page_remove_rmap(old_page);
  4.1105 +		break_cow(vma, new_page, address, page_table);
  4.1106 +		lru_cache_add_active(new_page);
  4.1107 +		page_add_anon_rmap(new_page, vma, address);
  4.1108 +
  4.1109 +		/* Free the old page.. */
  4.1110 +		new_page = old_page;
  4.1111 +	}
  4.1112 +	pte_unmap(page_table);
  4.1113 +	page_cache_release(new_page);
  4.1114 +	page_cache_release(old_page);
  4.1115 +	spin_unlock(&mm->page_table_lock);
  4.1116 +	return VM_FAULT_MINOR;
  4.1117 +
  4.1118 +no_new_page:
  4.1119 +	page_cache_release(old_page);
  4.1120 +	return VM_FAULT_OOM;
  4.1121 +}
  4.1122 +
  4.1123 +/*
  4.1124 + * Helper function for unmap_mapping_range().
  4.1125 + */
  4.1126 +static inline void unmap_mapping_range_list(struct prio_tree_root *root,
  4.1127 +					    struct zap_details *details)
  4.1128 +{
  4.1129 +	struct vm_area_struct *vma = NULL;
  4.1130 +	struct prio_tree_iter iter;
  4.1131 +	pgoff_t vba, vea, zba, zea;
  4.1132 +
  4.1133 +	while ((vma = vma_prio_tree_next(vma, root, &iter,
  4.1134 +			details->first_index, details->last_index)) != NULL) {
  4.1135 +		vba = vma->vm_pgoff;
  4.1136 +		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
  4.1137 +		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
  4.1138 +		zba = details->first_index;
  4.1139 +		if (zba < vba)
  4.1140 +			zba = vba;
  4.1141 +		zea = details->last_index;
  4.1142 +		if (zea > vea)
  4.1143 +			zea = vea;
  4.1144 +		zap_page_range(vma,
  4.1145 +			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
  4.1146 +			(zea - zba + 1) << PAGE_SHIFT, details);
  4.1147 +	}
  4.1148 +}
  4.1149 +
  4.1150 +/**
  4.1151 + * unmap_mapping_range - unmap the portion of all mmaps
  4.1152 + * in the specified address_space corresponding to the specified
  4.1153 + * page range in the underlying file.
  4.1154 + * @address_space: the address space containing mmaps to be unmapped.
  4.1155 + * @holebegin: byte in first page to unmap, relative to the start of
  4.1156 + * the underlying file.  This will be rounded down to a PAGE_SIZE
  4.1157 + * boundary.  Note that this is different from vmtruncate(), which
  4.1158 + * must keep the partial page.  In contrast, we must get rid of
  4.1159 + * partial pages.
  4.1160 + * @holelen: size of prospective hole in bytes.  This will be rounded
  4.1161 + * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
  4.1162 + * end of the file.
  4.1163 + * @even_cows: 1 when truncating a file, unmap even private COWed pages;
  4.1164 + * but 0 when invalidating pagecache, don't throw away private data.
  4.1165 + */
  4.1166 +void unmap_mapping_range(struct address_space *mapping,
  4.1167 +		loff_t const holebegin, loff_t const holelen, int even_cows)
  4.1168 +{
  4.1169 +	struct zap_details details;
  4.1170 +	pgoff_t hba = holebegin >> PAGE_SHIFT;
  4.1171 +	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
  4.1172 +
  4.1173 +	/* Check for overflow. */
  4.1174 +	if (sizeof(holelen) > sizeof(hlen)) {
  4.1175 +		long long holeend =
  4.1176 +			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
  4.1177 +		if (holeend & ~(long long)ULONG_MAX)
  4.1178 +			hlen = ULONG_MAX - hba + 1;
  4.1179 +	}
  4.1180 +
  4.1181 +	details.check_mapping = even_cows? NULL: mapping;
  4.1182 +	details.nonlinear_vma = NULL;
  4.1183 +	details.first_index = hba;
  4.1184 +	details.last_index = hba + hlen - 1;
  4.1185 +	details.atomic = 1;	/* A spinlock is held */
  4.1186 +	if (details.last_index < details.first_index)
  4.1187 +		details.last_index = ULONG_MAX;
  4.1188 +
  4.1189 +	spin_lock(&mapping->i_mmap_lock);
  4.1190 +	/* Protect against page fault */
  4.1191 +	atomic_inc(&mapping->truncate_count);
  4.1192 +
  4.1193 +	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
  4.1194 +		unmap_mapping_range_list(&mapping->i_mmap, &details);
  4.1195 +
  4.1196 +	/*
  4.1197 +	 * In nonlinear VMAs there is no correspondence between virtual address
  4.1198 +	 * offset and file offset.  So we must perform an exhaustive search
  4.1199 +	 * across *all* the pages in each nonlinear VMA, not just the pages
  4.1200 +	 * whose virtual address lies outside the file truncation point.
  4.1201 +	 */
  4.1202 +	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) {
  4.1203 +		struct vm_area_struct *vma;
  4.1204 +		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
  4.1205 +						shared.vm_set.list) {
  4.1206 +			details.nonlinear_vma = vma;
  4.1207 +			zap_page_range(vma, vma->vm_start,
  4.1208 +				vma->vm_end - vma->vm_start, &details);
  4.1209 +		}
  4.1210 +	}
  4.1211 +	spin_unlock(&mapping->i_mmap_lock);
  4.1212 +}
  4.1213 +EXPORT_SYMBOL(unmap_mapping_range);
  4.1214 +
  4.1215 +/*
  4.1216 + * Handle all mappings that got truncated by a "truncate()"
  4.1217 + * system call.
  4.1218 + *
  4.1219 + * NOTE! We have to be ready to update the memory sharing
  4.1220 + * between the file and the memory map for a potential last
  4.1221 + * incomplete page.  Ugly, but necessary.
  4.1222 + */
  4.1223 +int vmtruncate(struct inode * inode, loff_t offset)
  4.1224 +{
  4.1225 +	struct address_space *mapping = inode->i_mapping;
  4.1226 +	unsigned long limit;
  4.1227 +
  4.1228 +	if (inode->i_size < offset)
  4.1229 +		goto do_expand;
  4.1230 +	/*
  4.1231 +	 * truncation of in-use swapfiles is disallowed - it would cause
  4.1232 +	 * subsequent swapout to scribble on the now-freed blocks.
  4.1233 +	 */
  4.1234 +	if (IS_SWAPFILE(inode))
  4.1235 +		goto out_busy;
  4.1236 +	i_size_write(inode, offset);
  4.1237 +	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
  4.1238 +	truncate_inode_pages(mapping, offset);
  4.1239 +	goto out_truncate;
  4.1240 +
  4.1241 +do_expand:
  4.1242 +	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
  4.1243 +	if (limit != RLIM_INFINITY && offset > limit)
  4.1244 +		goto out_sig;
  4.1245 +	if (offset > inode->i_sb->s_maxbytes)
  4.1246 +		goto out_big;
  4.1247 +	i_size_write(inode, offset);
  4.1248 +
  4.1249 +out_truncate:
  4.1250 +	if (inode->i_op && inode->i_op->truncate)
  4.1251 +		inode->i_op->truncate(inode);
  4.1252 +	return 0;
  4.1253 +out_sig:
  4.1254 +	send_sig(SIGXFSZ, current, 0);
  4.1255 +out_big:
  4.1256 +	return -EFBIG;
  4.1257 +out_busy:
  4.1258 +	return -ETXTBSY;
  4.1259 +}
  4.1260 +
  4.1261 +EXPORT_SYMBOL(vmtruncate);
  4.1262 +
  4.1263 +/* 
  4.1264 + * Primitive swap readahead code. We simply read an aligned block of
  4.1265 + * (1 << page_cluster) entries in the swap area. This method is chosen
  4.1266 + * because it doesn't cost us any seek time.  We also make sure to queue
  4.1267 + * the 'original' request together with the readahead ones...  
  4.1268 + *
  4.1269 + * This has been extended to use the NUMA policies from the mm triggering
  4.1270 + * the readahead.
  4.1271 + *
  4.1272 + * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
  4.1273 + */
  4.1274 +void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
  4.1275 +{
  4.1276 +#ifdef CONFIG_NUMA
  4.1277 +	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
  4.1278 +#endif
  4.1279 +	int i, num;
  4.1280 +	struct page *new_page;
  4.1281 +	unsigned long offset;
  4.1282 +
  4.1283 +	/*
  4.1284 +	 * Get the number of handles we should do readahead io to.
  4.1285 +	 */
  4.1286 +	num = valid_swaphandles(entry, &offset);
  4.1287 +	for (i = 0; i < num; offset++, i++) {
  4.1288 +		/* Ok, do the async read-ahead now */
  4.1289 +		new_page = read_swap_cache_async(swp_entry(swp_type(entry),
  4.1290 +							   offset), vma, addr);
  4.1291 +		if (!new_page)
  4.1292 +			break;
  4.1293 +		page_cache_release(new_page);
  4.1294 +#ifdef CONFIG_NUMA
  4.1295 +		/*
  4.1296 +		 * Find the next applicable VMA for the NUMA policy.
  4.1297 +		 */
  4.1298 +		addr += PAGE_SIZE;
  4.1299 +		if (addr == 0)
  4.1300 +			vma = NULL;
  4.1301 +		if (vma) {
  4.1302 +			if (addr >= vma->vm_end) {
  4.1303 +				vma = next_vma;
  4.1304 +				next_vma = vma ? vma->vm_next : NULL;
  4.1305 +			}
  4.1306 +			if (vma && addr < vma->vm_start)
  4.1307 +				vma = NULL;
  4.1308 +		} else {
  4.1309 +			if (next_vma && addr >= next_vma->vm_start) {
  4.1310 +				vma = next_vma;
  4.1311 +				next_vma = vma->vm_next;
  4.1312 +			}
  4.1313 +		}
  4.1314 +#endif
  4.1315 +	}
  4.1316 +	lru_add_drain();	/* Push any new pages onto the LRU now */
  4.1317 +}
  4.1318 +
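The comment above swapin_readahead() says the window is an aligned block of (1 << page_cluster) swap slots, so the extra reads cost no additional seeks. A hedged sketch of just that alignment arithmetic (the real window is also clipped by valid_swaphandles() against the swap map; the page_cluster value and names below are assumptions for illustration):

    #include <stdio.h>

    static const unsigned int page_cluster = 3;    /* illustrative; really the vm.page-cluster tunable */

    /* Start of the aligned block of (1 << page_cluster) slots containing 'offset'. */
    static unsigned long readahead_start(unsigned long offset)
    {
            return (offset >> page_cluster) << page_cluster;
    }

    int main(void)
    {
            unsigned long offset = 37;
            unsigned long start = readahead_start(offset);

            printf("fault on slot %lu -> read slots %lu..%lu\n",
                   offset, start, start + (1UL << page_cluster) - 1);
            return 0;
    }
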
  4.1319 +/*
  4.1320 + * We hold the mm semaphore and the page_table_lock on entry and
  4.1321 + * should release the page table lock on exit.
  4.1322 + */
  4.1323 +static int do_swap_page(struct mm_struct * mm,
  4.1324 +	struct vm_area_struct * vma, unsigned long address,
  4.1325 +	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
  4.1326 +{
  4.1327 +	struct page *page;
  4.1328 +	swp_entry_t entry = pte_to_swp_entry(orig_pte);
  4.1329 +	pte_t pte;
  4.1330 +	int ret = VM_FAULT_MINOR;
  4.1331 +
  4.1332 +	pte_unmap(page_table);
  4.1333 +	spin_unlock(&mm->page_table_lock);
  4.1334 +	page = lookup_swap_cache(entry);
  4.1335 +	if (!page) {
  4.1336 + 		swapin_readahead(entry, address, vma);
  4.1337 + 		page = read_swap_cache_async(entry, vma, address);
  4.1338 +		if (!page) {
  4.1339 +			/*
  4.1340 +			 * Back out if somebody else faulted in this pte while
  4.1341 +			 * we released the page table lock.
  4.1342 +			 */
  4.1343 +			spin_lock(&mm->page_table_lock);
  4.1344 +			page_table = pte_offset_map(pmd, address);
  4.1345 +			if (likely(pte_same(*page_table, orig_pte)))
  4.1346 +				ret = VM_FAULT_OOM;
  4.1347 +			else
  4.1348 +				ret = VM_FAULT_MINOR;
  4.1349 +			pte_unmap(page_table);
  4.1350 +			spin_unlock(&mm->page_table_lock);
  4.1351 +			goto out;
  4.1352 +		}
  4.1353 +
  4.1354 +		/* Had to read the page from swap area: Major fault */
  4.1355 +		ret = VM_FAULT_MAJOR;
  4.1356 +		inc_page_state(pgmajfault);
  4.1357 +	}
  4.1358 +
  4.1359 +	mark_page_accessed(page);
  4.1360 +	lock_page(page);
  4.1361 +
  4.1362 +	/*
  4.1363 +	 * Back out if somebody else faulted in this pte while we
  4.1364 +	 * released the page table lock.
  4.1365 +	 */
  4.1366 +	spin_lock(&mm->page_table_lock);
  4.1367 +	page_table = pte_offset_map(pmd, address);
  4.1368 +	if (unlikely(!pte_same(*page_table, orig_pte))) {
  4.1369 +		pte_unmap(page_table);
  4.1370 +		spin_unlock(&mm->page_table_lock);
  4.1371 +		unlock_page(page);
  4.1372 +		page_cache_release(page);
  4.1373 +		ret = VM_FAULT_MINOR;
  4.1374 +		goto out;
  4.1375 +	}
  4.1376 +
  4.1377 +	/* The page isn't present yet, go ahead with the fault. */
  4.1378 +		
  4.1379 +	swap_free(entry);
  4.1380 +	if (vm_swap_full())
  4.1381 +		remove_exclusive_swap_page(page);
  4.1382 +
  4.1383 +	mm->rss++;
  4.1384 +	pte = mk_pte(page, vma->vm_page_prot);
  4.1385 +	if (write_access && can_share_swap_page(page)) {
  4.1386 +		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  4.1387 +		write_access = 0;
  4.1388 +	}
  4.1389 +	unlock_page(page);
  4.1390 +
  4.1391 +	flush_icache_page(vma, page);
  4.1392 +	set_pte(page_table, pte);
  4.1393 +	page_add_anon_rmap(page, vma, address);
  4.1394 +
  4.1395 +	if (write_access) {
  4.1396 +		if (do_wp_page(mm, vma, address,
  4.1397 +				page_table, pmd, pte) == VM_FAULT_OOM)
  4.1398 +			ret = VM_FAULT_OOM;
  4.1399 +		goto out;
  4.1400 +	}
  4.1401 +
  4.1402 +	/* No need to invalidate - it was non-present before */
  4.1403 +	update_mmu_cache(vma, address, pte);
  4.1404 +	pte_unmap(page_table);
  4.1405 +	spin_unlock(&mm->page_table_lock);
  4.1406 +out:
  4.1407 +	return ret;
  4.1408 +}
  4.1409 +
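do_swap_page() above has to drop page_table_lock around the blocking swap-in, and therefore re-checks pte_same(*page_table, orig_pte) once it retakes the lock, backing out if another thread already serviced the fault. A minimal sketch of that drop-lock, block, retake-and-recheck pattern using a pthread mutex; the names and the sleep standing in for the swap I/O are illustrative assumptions:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long pte;                   /* stand-in for the faulting pte slot */

    static void fault(unsigned long new_pte)
    {
            pthread_mutex_lock(&page_table_lock);
            unsigned long orig_pte = pte;       /* remember the value we faulted on */
            pthread_mutex_unlock(&page_table_lock);

            usleep(1000);                       /* stand-in for the blocking swap-in I/O */

            pthread_mutex_lock(&page_table_lock);
            if (pte != orig_pte) {              /* someone else faulted this pte in */
                    pthread_mutex_unlock(&page_table_lock);
                    printf("lost the race, backing out\n");
                    return;
            }
            pte = new_pte;                      /* still ours: install the mapping */
            pthread_mutex_unlock(&page_table_lock);
            printf("installed pte %#lx\n", new_pte);
    }

    int main(void)
    {
            fault(0x1000);
            return 0;
    }
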
  4.1410 +/*
  4.1411 + * We are called with the MM semaphore and page_table_lock
  4.1412 + * spinlock held to protect against concurrent faults in
  4.1413 + * multithreaded programs. 
  4.1414 + */
  4.1415 +static int
  4.1416 +do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
  4.1417 +		pte_t *page_table, pmd_t *pmd, int write_access,
  4.1418 +		unsigned long addr)
  4.1419 +{
  4.1420 +	pte_t entry;
  4.1421 +	struct page * page = ZERO_PAGE(addr);
  4.1422 +
  4.1423 +	/* Read-only mapping of ZERO_PAGE. */
  4.1424 +	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
  4.1425 +
  4.1426 +	/* ..except if it's a write access */
  4.1427 +	if (write_access) {
  4.1428 +		/* Allocate our own private page. */
  4.1429 +		pte_unmap(page_table);
  4.1430 +		spin_unlock(&mm->page_table_lock);
  4.1431 +
  4.1432 +		if (unlikely(anon_vma_prepare(vma)))
  4.1433 +			goto no_mem;
  4.1434 +		page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
  4.1435 +		if (!page)
  4.1436 +			goto no_mem;
  4.1437 +		clear_user_highpage(page, addr);
  4.1438 +
  4.1439 +		spin_lock(&mm->page_table_lock);
  4.1440 +		page_table = pte_offset_map(pmd, addr);
  4.1441 +
  4.1442 +		if (!pte_none(*page_table)) {
  4.1443 +			pte_unmap(page_table);
  4.1444 +			page_cache_release(page);
  4.1445 +			spin_unlock(&mm->page_table_lock);
  4.1446 +			goto out;
  4.1447 +		}
  4.1448 +		mm->rss++;
  4.1449 +		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
  4.1450 +							 vma->vm_page_prot)),
  4.1451 +				      vma);
  4.1452 +		lru_cache_add_active(page);
  4.1453 +		mark_page_accessed(page);
  4.1454 +		page_add_anon_rmap(page, vma, addr);
  4.1455 +	}
  4.1456 +
  4.1457 +	ptep_establish_new(vma, addr, page_table, entry);
  4.1458 +	pte_unmap(page_table);
  4.1459 +
  4.1460 +	/* No need to invalidate - it was non-present before */
  4.1461 +	update_mmu_cache(vma, addr, entry);
  4.1462 +	spin_unlock(&mm->page_table_lock);
  4.1463 +out:
  4.1464 +	return VM_FAULT_MINOR;
  4.1465 +no_mem:
  4.1466 +	return VM_FAULT_OOM;
  4.1467 +}
  4.1468 +
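do_anonymous_page() above maps the shared ZERO_PAGE read-only for a read fault and only allocates and zeroes a private page when the access is a write. That behaviour is visible from user space: reading an untouched anonymous mapping is backed by the shared zero page, while the first write to a page triggers a private allocation. A small demonstration of the two fault kinds (measuring the RSS difference is left out; the point is just the fault pattern):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 1 << 20;               /* 1 MiB anonymous mapping */
            volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return EXIT_FAILURE;
            }

            char before = p[0];                 /* read fault: backed by the zero page */
            p[0] = 1;                           /* write fault: private page allocated */

            printf("byte before write: %d, after write: %d\n", before, p[0]);
            munmap((void *)p, len);
            return 0;
    }
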
  4.1469 +/*
  4.1470 + * do_no_page() tries to create a new page mapping. It aggressively
  4.1471 + * tries to share with existing pages, but makes a separate copy if
  4.1472 + * the "write_access" parameter is true in order to avoid the next
  4.1473 + * page fault.
  4.1474 + *
  4.1475 + * As this is called only for pages that do not currently exist, we
  4.1476 + * do not need to flush old virtual caches or the TLB.
  4.1477 + *
  4.1478 + * This is called with the MM semaphore held and the page table
  4.1479 + * spinlock held. Exit with the spinlock released.
  4.1480 + */
  4.1481 +static int
  4.1482 +do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
  4.1483 +	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
  4.1484 +{
  4.1485 +	struct page * new_page;
  4.1486 +	struct address_space *mapping = NULL;
  4.1487 +	pte_t entry;
  4.1488 +	int sequence = 0;
  4.1489 +	int ret = VM_FAULT_MINOR;
  4.1490 +	int anon = 0;
  4.1491 +
  4.1492 +	if (!vma->vm_ops || !vma->vm_ops->nopage)
  4.1493 +		return do_anonymous_page(mm, vma, page_table,
  4.1494 +					pmd, write_access, address);
  4.1495 +	pte_unmap(page_table);
  4.1496 +	spin_unlock(&mm->page_table_lock);
  4.1497 +
  4.1498 +	if (vma->vm_file) {
  4.1499 +		mapping = vma->vm_file->f_mapping;
  4.1500 +		sequence = atomic_read(&mapping->truncate_count);
  4.1501 +	}
  4.1502 +	smp_rmb();  /* Prevent CPU from reordering lock-free ->nopage() */
  4.1503 +retry:
  4.1504 +	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
  4.1505 +
  4.1506 +	/* no page was available -- either SIGBUS or OOM */
  4.1507 +	if (new_page == NOPAGE_SIGBUS)
  4.1508 +		return VM_FAULT_SIGBUS;
  4.1509 +	if (new_page == NOPAGE_OOM)
  4.1510 +		return VM_FAULT_OOM;
  4.1511 +
  4.1512 +	/*
  4.1513 +	 * Should we do an early C-O-W break?
  4.1514 +	 */
  4.1515 +	if (write_access && !(vma->vm_flags & VM_SHARED)) {
  4.1516 +		struct page *page;
  4.1517 +
  4.1518 +		if (unlikely(anon_vma_prepare(vma)))
  4.1519 +			goto oom;
  4.1520 +		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
  4.1521 +		if (!page)
  4.1522 +			goto oom;
  4.1523 +		copy_user_highpage(page, new_page, address);
  4.1524 +		page_cache_release(new_page);
  4.1525 +		new_page = page;
  4.1526 +		anon = 1;
  4.1527 +	}
  4.1528 +
  4.1529 +	spin_lock(&mm->page_table_lock);
  4.1530 +	/*
  4.1531 +	 * For a file-backed vma, someone could have truncated or otherwise
  4.1532 +	 * invalidated this page.  If unmap_mapping_range got called,
  4.1533 +	 * retry getting the page.
  4.1534 +	 */
  4.1535 +	if (mapping &&
  4.1536 +	      (unlikely(sequence != atomic_read(&mapping->truncate_count)))) {
  4.1537 +		sequence = atomic_read(&mapping->truncate_count);
  4.1538 +		spin_unlock(&mm->page_table_lock);
  4.1539 +		page_cache_release(new_page);
  4.1540 +		goto retry;
  4.1541 +	}
  4.1542 +	page_table = pte_offset_map(pmd, address);
  4.1543 +
  4.1544 +	/*
  4.1545 +	 * This silly early PAGE_DIRTY setting removes a race
  4.1546 +	 * due to the bad i386 page protection. But it's valid
  4.1547 +	 * for other architectures too.
  4.1548 +	 *
  4.1549 +	 * Note that if write_access is true, we either now have
  4.1550 +	 * an exclusive copy of the page, or this is a shared mapping,
  4.1551 +	 * so we can make it writable and dirty to avoid having to
  4.1552 +	 * handle that later.
  4.1553 +	 */
  4.1554 +	/* Only go through if we didn't race with anybody else... */
  4.1555 +	if (pte_none(*page_table)) {
  4.1556 +		if (!PageReserved(new_page))
  4.1557 +			++mm->rss;
  4.1558 +		flush_icache_page(vma, new_page);
  4.1559 +		entry = mk_pte(new_page, vma->vm_page_prot);
  4.1560 +		if (write_access)
  4.1561 +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  4.1562 +		ptep_establish_new(vma, address, page_table, entry);
  4.1563 +		if (anon) {
  4.1564 +			lru_cache_add_active(new_page);
  4.1565 +			page_add_anon_rmap(new_page, vma, address);
  4.1566 +		} else
  4.1567 +			page_add_file_rmap(new_page);
  4.1568 +		pte_unmap(page_table);
  4.1569 +	} else {
  4.1570 +		/* One of our sibling threads was faster, back out. */
  4.1571 +		pte_unmap(page_table);
  4.1572 +		page_cache_release(new_page);
  4.1573 +		spin_unlock(&mm->page_table_lock);
  4.1574 +		goto out;
  4.1575 +	}
  4.1576 +
  4.1577 +	/* no need to invalidate: a not-present page shouldn't be cached */
  4.1578 +	update_mmu_cache(vma, address, entry);
  4.1579 +	spin_unlock(&mm->page_table_lock);
  4.1580 +out:
  4.1581 +	return ret;
  4.1582 +oom:
  4.1583 +	page_cache_release(new_page);
  4.1584 +	ret = VM_FAULT_OOM;
  4.1585 +	goto out;
  4.1586 +}
  4.1587 +
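The retry loop in do_no_page() above samples mapping->truncate_count before the ->nopage() call and compares it again after retaking page_table_lock, so a truncation that raced with the lookup forces another pass instead of installing a pte for a page that was just invalidated. A hedged user-space sketch of that sample, work, recheck-and-retry idiom (the atomic counter stands in for truncate_count; nothing bumps it here, so the loop runs once):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int truncate_count;           /* bumped by the (not shown) truncate side */

    static void fault_in_page(void)
    {
            int attempts = 0;
            int sequence;

            do {
                    sequence = atomic_load(&truncate_count);
                    /* ... drop the lock, look the page up, possibly block ... */
                    attempts++;
            } while (sequence != atomic_load(&truncate_count));
            /* No truncation raced with the lookup above: safe to install the page. */

            printf("page installed after %d attempt(s)\n", attempts);
    }

    int main(void)
    {
            fault_in_page();
            return 0;
    }
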
  4.1588 +/*
  4.1589 + * Fault of a previously existing named mapping. Repopulate the pte
  4.1590 + * from the encoded file_pte if possible. This enables swappable
  4.1591 + * nonlinear vmas.
  4.1592 + */
  4.1593 +static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
  4.1594 +	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
  4.1595 +{
  4.1596 +	unsigned long pgoff;
  4.1597 +	int err;
  4.1598 +
  4.1599 +	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
  4.1600 +	/*
  4.1601 +	 * Fall back to the linear mapping if the fs does not support
  4.1602 +	 * ->populate:
  4.1603 +	 */
  4.1604 +	if (!vma->vm_ops || !vma->vm_ops->populate || 
  4.1605 +			(write_access && !(vma->vm_flags & VM_SHARED))) {
  4.1606 +		pte_clear(pte);
  4.1607 +		return do_no_page(mm, vma, address, write_access, pte, pmd);
  4.1608 +	}
  4.1609 +
  4.1610 +	pgoff = pte_to_pgoff(*pte);
  4.1611 +
  4.1612 +	pte_unmap(pte);
  4.1613 +	spin_unlock(&mm->page_table_lock);
  4.1614 +
  4.1615 +	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
  4.1616 +	if (err == -ENOMEM)
  4.1617 +		return VM_FAULT_OOM;
  4.1618 +	if (err)
  4.1619 +		return VM_FAULT_SIGBUS;
  4.1620 +	return VM_FAULT_MAJOR;
  4.1621 +}
  4.1622 +
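The nonlinear vmas that do_file_page() serves are set up from user space with the remap_file_pages() system call, which rewires the file offset of individual pages inside an existing shared mapping; the per-page offset it encodes into the pte is what pte_to_pgoff() recovers above. A hedged usage sketch ("datafile" is a hypothetical file of at least two pages; the call is the 2.6-era interface and was deprecated in later kernels):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            int fd = open("datafile", O_RDONLY);        /* hypothetical input file */
            char *map;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            map = mmap(NULL, 2 * page, PROT_READ, MAP_SHARED, fd, 0);
            if (map == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            /* Make the first virtual page show file page 1 instead of file page 0:
             * the vma becomes nonlinear and its ptes carry encoded file offsets. */
            if (remap_file_pages(map, page, 0, 1, 0) < 0)
                    perror("remap_file_pages");

            munmap(map, 2 * page);
            close(fd);
            return 0;
    }
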
  4.1623 +/*
  4.1624 + * These routines also need to handle stuff like marking pages dirty
  4.1625 + * and/or accessed for architectures that don't do it in hardware (most
  4.1626 + * RISC architectures).  The early dirtying is also good on the i386.
  4.1627 + *
  4.1628 + * There is also a hook called "update_mmu_cache()" that architectures
  4.1629 + * with external mmu caches can use to update those (ie the Sparc or
  4.1630 + * PowerPC hashed page tables that act as extended TLBs).
  4.1631 + *
  4.1632 + * Note the "page_table_lock". It is to protect against kswapd removing
  4.1633 + * pages from under us. Note that kswapd only ever _removes_ pages, never
  4.1634 + * adds them. As such, once we have noticed that the page is not present,
  4.1635 + * we can drop the lock early.
  4.1636 + *
  4.1637 + * The adding of pages is protected by the MM semaphore (which we hold),
  4.1638 + * so we don't need to worry about a page suddenly being added into
  4.1639 + * our VM.
  4.1640 + *
  4.1641 + * We enter with the page table spinlock held; we are supposed to
  4.1642 + * release it when done.
  4.1643 + */
  4.1644 +static inline int handle_pte_fault(struct mm_struct *mm,
  4.1645 +	struct vm_area_struct * vma, unsigned long address,
  4.1646 +	int write_access, pte_t *pte, pmd_t *pmd)
  4.1647 +{
  4.1648 +	pte_t entry;
  4.1649 +
  4.1650 +	entry = *pte;
  4.1651 +	if (!pte_present(entry)) {
  4.1652 +		/*
  4.1653 +		 * If it truly wasn't present, we know that kswapd
  4.1654 +		 * and the PTE updates will not touch it later. So
  4.1655 +		 * drop the lock.
  4.1656 +		 */
  4.1657 +		if (pte_none(entry))
  4.1658 +			return do_no_page(mm, vma, address, write_access, pte, pmd);
  4.1659 +		if (pte_file(entry))
  4.1660 +			return do_file_page(mm, vma, address, write_access, pte, pmd);
  4.1661 +		return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
  4.1662 +	}
  4.1663 +
  4.1664 +	if (write_access) {
  4.1665 +		if (!pte_write(entry))
  4.1666 +			return do_wp_page(mm, vma, address, pte, pmd, entry);
  4.1667 +
  4.1668 +		entry = pte_mkdirty(entry);
  4.1669 +	}
  4.1670 +	entry = pte_mkyoung(entry);
  4.1671 +	ptep_set_access_flags(vma, address, pte, entry, write_access);
  4.1672 +	update_mmu_cache(vma, address, entry);
  4.1673 +	pte_unmap(pte);
  4.1674 +	spin_unlock(&mm->page_table_lock);
  4.1675 +	return VM_FAULT_MINOR;
  4.1676 +}
  4.1677 +
  4.1678 +/*
  4.1679 + * By the time we get here, we already hold the mm semaphore
  4.1680 + */
  4.1681 +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
  4.1682 +	unsigned long address, int write_access)
  4.1683 +{
  4.1684 +	pgd_t *pgd;
  4.1685 +	pmd_t *pmd;
  4.1686 +
  4.1687 +	__set_current_state(TASK_RUNNING);
  4.1688 +	pgd = pgd_offset(mm, address);
  4.1689 +
  4.1690 +	inc_page_state(pgfault);
  4.1691 +
  4.1692 +	if (is_vm_hugetlb_page(vma))
  4.1693 +		return VM_FAULT_SIGBUS;	/* mapping truncation does this. */
  4.1694 +
  4.1695 +	/*
  4.1696 +	 * We need the page table lock to synchronize with kswapd
  4.1697 +	 * and the SMP-safe atomic PTE updates.
  4.1698 +	 */
  4.1699 +	spin_lock(&mm->page_table_lock);
  4.1700 +	pmd = pmd_alloc(mm, pgd, address);
  4.1701 +
  4.1702 +	if (pmd) {
  4.1703 +		pte_t * pte = pte_alloc_map(mm, pmd, address);
  4.1704 +		if (pte)
  4.1705 +			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
  4.1706 +	}
  4.1707 +	spin_unlock(&mm->page_table_lock);
  4.1708 +	return VM_FAULT_OOM;
  4.1709 +}
  4.1710 +
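handle_mm_fault() above walks pgd, then pmd, then pte before handing the entry to handle_pte_fault(). On a two-level i386 configuration the pmd level is folded away (see the __pmd_alloc() comment below), and the walk amounts to an index split of the 32-bit virtual address. A sketch of that split; the constants are the usual i386 non-PAE values and are assumptions for illustration:

    #include <stdio.h>

    /* Illustrative i386 non-PAE layout: 1024 pgd entries x 1024 ptes x 4 KiB pages. */
    #define PAGE_SHIFT      12
    #define PGDIR_SHIFT     22
    #define PTRS_PER_PTE    1024

    int main(void)
    {
            unsigned long address = 0x0804a123UL;       /* arbitrary user address */

            unsigned long pgd_index = address >> PGDIR_SHIFT;
            unsigned long pte_index = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
            unsigned long offset    = address & ((1UL << PAGE_SHIFT) - 1);

            printf("addr %#lx -> pgd[%lu], pte[%lu], page offset %#lx\n",
                   address, pgd_index, pte_index, offset);
            return 0;
    }
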
  4.1711 +/*
  4.1712 + * Allocate page middle directory.
  4.1713 + *
  4.1714 + * We've already handled the fast-path in-line, and we own the
  4.1715 + * page table lock.
  4.1716 + *
  4.1717 + * On a two-level page table, this ends up actually being entirely
  4.1718 + * optimized away.
  4.1719 + */
  4.1720 +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  4.1721 +{
  4.1722 +	pmd_t *new;
  4.1723 +
  4.1724 +	spin_unlock(&mm->page_table_lock);
  4.1725 +	new = pmd_alloc_one(mm, address);
  4.1726 +	spin_lock(&mm->page_table_lock);
  4.1727 +	if (!new)
  4.1728 +		return NULL;
  4.1729 +
  4.1730 +	/*
  4.1731 +	 * Because we dropped the lock, we should re-check the
  4.1732 +	 * entry, as somebody else could have populated it.
  4.1733 +	 */
  4.1734 +	if (pgd_present(*pgd)) {
  4.1735 +		pmd_free(new);
  4.1736 +		goto out;
  4.1737 +	}
  4.1738 +	pgd_populate(mm, pgd, new);
  4.1739 +out:
  4.1740 +	return pmd_offset(pgd, address);
  4.1741 +}
  4.1742 +
  4.1743 +int make_pages_present(unsigned long addr, unsigned long end)
  4.1744 +{
  4.1745 +	int ret, len, write;
  4.1746 +	struct vm_area_struct * vma;
  4.1747 +
  4.1748 +	vma = find_vma(current->mm, addr);
  4.1749 +	write = (vma->vm_flags & VM_WRITE) != 0;
  4.1750 +	if (addr >= end)
  4.1751 +		BUG();
  4.1752 +	if (end > vma->vm_end)
  4.1753 +		BUG();
  4.1754 +	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
  4.1755 +	ret = get_user_pages(current, current->mm, addr,
  4.1756 +			len, write, 0, NULL, NULL);
  4.1757 +	if (ret < 0)
  4.1758 +		return ret;
  4.1759 +	return ret == len ? 0 : -1;
  4.1760 +}
  4.1761 +
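make_pages_present() above converts the byte range [addr, end) into a page count with (end + PAGE_SIZE - 1)/PAGE_SIZE - addr/PAGE_SIZE: end is rounded up and addr rounded down, so a partially covered page at either boundary is still faulted in by get_user_pages(). A quick self-check of that arithmetic (PAGE_SIZE assumed to be 4096 for the example):

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL    /* illustrative */

    static unsigned long pages_in_range(unsigned long addr, unsigned long end)
    {
            /* Same expression as make_pages_present(): round end up, addr down. */
            return (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
    }

    int main(void)
    {
            assert(pages_in_range(0, 4096) == 1);
            assert(pages_in_range(0, 4097) == 2);
            assert(pages_in_range(4096, 12288) == 2);
            printf("page-count arithmetic checks out\n");
            return 0;
    }
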
  4.1762 +/* 
  4.1763 + * Map a vmalloc()-space virtual address to the physical page.
  4.1764 + */
  4.1765 +struct page * vmalloc_to_page(void * vmalloc_addr)
  4.1766 +{
  4.1767 +	unsigned long addr = (unsigned long) vmalloc_addr;
  4.1768 +	struct page *page = NULL;
  4.1769 +	pgd_t *pgd = pgd_offset_k(addr);
  4.1770 +	pmd_t *pmd;
  4.1771 +	pte_t *ptep, pte;
  4.1772 +  
  4.1773 +	if (!pgd_none(*pgd)) {
  4.1774 +		pmd = pmd_offset(pgd, addr);
  4.1775 +		if (!pmd_none(*pmd)) {
  4.1776 +			preempt_disable();
  4.1777 +			ptep = pte_offset_map(pmd, addr);
  4.1778 +			pte = *ptep;
  4.1779 +			if (pte_present(pte))
  4.1780 +				page = pte_page(pte);
  4.1781 +			pte_unmap(ptep);
  4.1782 +			preempt_enable();
  4.1783 +		}
  4.1784 +	}
  4.1785 +	return page;
  4.1786 +}
  4.1787 +
  4.1788 +EXPORT_SYMBOL(vmalloc_to_page);
  4.1789 +
  4.1790 +#if !defined(CONFIG_ARCH_GATE_AREA)
  4.1791 +
  4.1792 +#if defined(AT_SYSINFO_EHDR)
  4.1793 +struct vm_area_struct gate_vma;
  4.1794 +
  4.1795 +static int __init gate_vma_init(void)
  4.1796 +{
  4.1797 +	gate_vma.vm_mm = NULL;
  4.1798 +	gate_vma.vm_start = FIXADDR_USER_START;
  4.1799 +	gate_vma.vm_end = FIXADDR_USER_END;
  4.1800 +	gate_vma.vm_page_prot = PAGE_READONLY;
  4.1801 +	gate_vma.vm_flags = 0;
  4.1802 +	return 0;
  4.1803 +}
  4.1804 +__initcall(gate_vma_init);
  4.1805 +#endif
  4.1806 +
  4.1807 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
  4.1808 +{
  4.1809 +#ifdef AT_SYSINFO_EHDR
  4.1810 +	return &gate_vma;
  4.1811 +#else
  4.1812 +	return NULL;
  4.1813 +#endif
  4.1814 +}
  4.1815 +
  4.1816 +int in_gate_area(struct task_struct *task, unsigned long addr)
  4.1817 +{
  4.1818 +#ifdef AT_SYSINFO_EHDR
  4.1819 +	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
  4.1820 +		return 1;
  4.1821 +#endif
  4.1822 +	return 0;
  4.1823 +}
  4.1824 +
  4.1825 +#endif
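
The AT_SYSINFO_EHDR branches above cover the fixmap gate page (the i386 vsyscall/vDSO image) that the kernel maps into every process between FIXADDR_USER_START and FIXADDR_USER_END. The same constant is handed to user space in the ELF auxiliary vector; a hedged sketch that prints where it landed, using glibc's getauxval (a later convenience wrapper, not something provided by this tree):

    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void)
    {
            unsigned long ehdr = getauxval(AT_SYSINFO_EHDR);

            if (ehdr)
                    printf("gate/vDSO ELF header mapped at %#lx\n", ehdr);
            else
                    printf("AT_SYSINFO_EHDR not provided on this system\n");
            return 0;
    }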