ia64/xen-unstable

changeset 14736:14c25e48a557

linux: improve x86 page table handling performance

Where possible,
- use hypercalls instead of writing to read-only pages
- fold TLB flushes into page table update hypercalls
- on PAE, use single-access updates instead of two-access ones

The single change to the PAE pte_clear() yields a 25-30% boost for
kernel builds on a 4x2x2-CPU, 8GB box; the other changes together
yield improvements of 2-5%.

Also, adjust backward compatibility handling in a few more places.
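
For illustration, the pattern behind the first two bullets, condensed
from the ptep_establish()/ptep_clear_flush() definitions in the diff
below (helper name illustrative only; error handling and the
foreign-mm fallback are omitted):

	/* Sketch only: fold "write PTE" and "flush TLB entry" into one
	 * update_va_mapping hypercall, instead of trapping twice (once
	 * for the write to the read-only page table, once for the
	 * flush). */
	static inline void establish_pte(struct vm_area_struct *vma,
					 unsigned long addr,
					 pte_t *ptep, pte_t pteval)
	{
		/* The low bits of the flags word select the flush type
		 * (UVMF_INVLPG = flush just this entry); the pointer to
		 * the mm's CPU bitmap is word-aligned, so it is OR'ed
		 * into the same word, and UVMF_MULTI tells Xen to flush
		 * exactly the CPUs named in that bitmap. */
		BUG_ON(HYPERVISOR_update_va_mapping(addr, pteval,
			(unsigned long)vma->vm_mm->cpu_vm_mask.bits |
			UVMF_INVLPG | UVMF_MULTI));
	}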

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Thu Apr 05 09:10:33 2007 +0100 (2007-04-05)
parents 07d3208c0ca3
children e5931b5e6cc5
files linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level.h linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h	Thu Apr 05 08:59:12 2007 +0100
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h	Thu Apr 05 09:10:33 2007 +0100
@@ -36,8 +36,37 @@
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 
-#define ptep_get_and_clear(mm,addr,xp)	__pte_ma(xchg(&(xp)->pte_low, 0))
+#define pte_none(x) (!(x).pte_low)
+
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (!pte_none(pte)) {
+		if (mm != &init_mm)
+			pte = __pte_ma(xchg(&ptep->pte_low, 0));
+		else
+			HYPERVISOR_update_va_mapping(addr, __pte(0), 0);
+	}
+	return pte;
+}
+
+#define ptep_clear_flush(vma, addr, ptep)			\
+({								\
+	pte_t *__ptep = (ptep);					\
+	pte_t __res = *__ptep;					\
+	if (!pte_none(__res) &&					\
+	    ((vma)->vm_mm != current->mm ||			\
+	     HYPERVISOR_update_va_mapping(addr, __pte(0),	\
+			(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+				UVMF_INVLPG|UVMF_MULTI))) {	\
+		__ptep->pte_low = 0;				\
+		flush_tlb_page(vma, addr);			\
+	}							\
+	__res;							\
+})
+
 #define pte_same(a, b)		((a).pte_low == (b).pte_low)
+
 #define __pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
 #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
 	__pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
@@ -46,7 +75,6 @@
 
 #define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
 
-#define pte_none(x)		(!(x).pte_low)
 #define pfn_pte(pfn, prot)	__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 #define pfn_pmd(pfn, prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level.h	Thu Apr 05 08:59:12 2007 +0100
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level.h	Thu Apr 05 09:10:33 2007 +0100
@@ -99,6 +99,11 @@ static inline void pud_clear (pud_t * pu
 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
 			pmd_index(address))
 
+static inline int pte_none(pte_t pte)
+{
+	return !(pte.pte_low | pte.pte_high);
+}
+
 /*
  * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
  * entry, so clear the bottom half first and enforce ordering with a compiler
@@ -106,24 +111,50 @@ static inline void pud_clear (pud_t * pu
  */
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	ptep->pte_low = 0;
-	smp_wmb();
-	ptep->pte_high = 0;
+	if ((mm != current->mm && mm != &init_mm)
+	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
+		ptep->pte_low = 0;
+		smp_wmb();
+		ptep->pte_high = 0;
+	}
 }
 
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	pte_t res;
+	pte_t pte = *ptep;
+	if (!pte_none(pte)) {
+		if (mm != &init_mm) {
+			uint64_t val = pte_val_ma(pte);
+			if (__cmpxchg64(ptep, val, 0) != val) {
+				/* xchg acts as a barrier before the setting of the high bits */
+				pte.pte_low = xchg(&ptep->pte_low, 0);
+				pte.pte_high = ptep->pte_high;
+				ptep->pte_high = 0;
+			}
+		} else
+			HYPERVISOR_update_va_mapping(addr, __pte(0), 0);
+	}
+	return pte;
+}
 
-	/* xchg acts as a barrier before the setting of the high bits */
-	res.pte_low = xchg(&ptep->pte_low, 0);
-	res.pte_high = ptep->pte_high;
-	ptep->pte_high = 0;
-
-	return res;
-}
+#define ptep_clear_flush(vma, addr, ptep)			\
+({								\
+	pte_t *__ptep = (ptep);					\
+	pte_t __res = *__ptep;					\
+	if (!pte_none(__res) &&					\
+	    ((vma)->vm_mm != current->mm ||			\
+	     HYPERVISOR_update_va_mapping(addr, __pte(0),	\
+			(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+				UVMF_INVLPG|UVMF_MULTI))) {	\
+		__ptep->pte_low = 0;				\
+		smp_wmb();					\
+		__ptep->pte_high = 0;				\
+		flush_tlb_page(vma, addr);			\
+	}							\
+	__res;							\
+})
 
 static inline int pte_same(pte_t a, pte_t b)
 {
@@ -132,11 +163,6 @@ static inline int pte_same(pte_t a, pte_
 
 #define pte_page(x)	pfn_to_page(pte_pfn(x))
 
-static inline int pte_none(pte_t pte)
-{
-	return !pte.pte_low && !pte.pte_high;
-}
-
 #define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
 			 ((_pte).pte_high << (32-PAGE_SHIFT)))
 #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h	Thu Apr 05 08:59:12 2007 +0100
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h	Thu Apr 05 09:10:33 2007 +0100
@@ -210,9 +210,13 @@ extern unsigned long pg0[];
 
 /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
 #define pmd_none(x)	(!(unsigned long)pmd_val(x))
+#ifdef CONFIG_XEN_COMPAT_030002
 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
    can temporarily clear it. */
 #define pmd_present(x)	(pmd_val(x))
+#else
+#define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
+#endif
 #define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
 
 
@@ -252,36 +256,48 @@ static inline pte_t pte_mkhuge(pte_t pte
 # include <asm/pgtable-2level.h>
 #endif
 
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	if (!pte_dirty(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
-}
+#define ptep_test_and_clear_dirty(vma, addr, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __ret = pte_dirty(__pte);					\
+	if (__ret) {							\
+		__pte = pte_mkclean(__pte);				\
+		if ((vma)->vm_mm != current->mm ||			\
+		    HYPERVISOR_update_va_mapping(addr, __pte, 0))	\
+			(ptep)->pte_low = __pte.pte_low;		\
+	}								\
+	__ret;								\
+})
 
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	if (!pte_young(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
-}
+#define ptep_test_and_clear_young(vma, addr, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __ret = pte_young(__pte);					\
+	if (__ret) {							\
+		__pte = pte_mkold(__pte);				\
+		if ((vma)->vm_mm != current->mm ||			\
+		    HYPERVISOR_update_va_mapping(addr, __pte, 0))	\
+			(ptep)->pte_low = __pte.pte_low;		\
+	}								\
+	__ret;								\
+})
 
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
-{
-	pte_t pte;
-	if (full) {
-		pte = *ptep;
-		pte_clear(mm, addr, ptep);
-	} else {
-		pte = ptep_get_and_clear(mm, addr, ptep);
-	}
-	return pte;
-}
+#define ptep_get_and_clear_full(mm, addr, ptep, full)			\
+	((full) ? ({							\
+		pte_t __res = *(ptep);					\
+		if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \
+			xen_l1_entry_update(ptep, __pte(0));		\
+		else							\
+			*(ptep) = __pte(0);				\
+		__res;							\
+	 }) :								\
+	 ptep_get_and_clear(mm, addr, ptep))
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	if (pte_write(*ptep))
-		clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
+	pte_t pte = *ptep;
+	if (pte_write(pte))
+		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
 }
 
 /*
@@ -418,6 +434,20 @@ extern void noexec_setup(const char *str
 #define pte_unmap_nested(pte) do { } while (0)
 #endif
 
+#define __HAVE_ARCH_PTEP_ESTABLISH
+#define ptep_establish(vma, address, ptep, pteval)			\
+	do {								\
+		if ( likely((vma)->vm_mm == current->mm) ) {		\
+			BUG_ON(HYPERVISOR_update_va_mapping(address,	\
+				pteval,					\
+				(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+					UVMF_INVLPG|UVMF_MULTI));	\
+		} else {						\
+			xen_l1_entry_update(ptep, pteval);		\
+			flush_tlb_page(vma, address);			\
+		}							\
+	} while (0)
+
 /*
  * The i386 doesn't have any external MMU info: the kernel page
  * tables contain all the necessary information.
@@ -430,27 +460,12 @@ extern void noexec_setup(const char *str
  */
 #define update_mmu_cache(vma,address,pte) do { } while (0)
 #define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-	do {								  \
-		if (__dirty) {						  \
-			if ( likely((__vma)->vm_mm == current->mm) ) {	  \
-				BUG_ON(HYPERVISOR_update_va_mapping(__address, \
-					__entry,			  \
-					(unsigned long)(__vma)->vm_mm->cpu_vm_mask.bits| \
-					UVMF_INVLPG|UVMF_MULTI));	  \
-			} else {					  \
-				xen_l1_entry_update(__ptep, __entry);	  \
-				flush_tlb_page(__vma, __address);	  \
-			}						  \
-		}							  \
+#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
+	do {								\
+		if (dirty)						\
+			ptep_establish(vma, address, ptep, entry);	\
 	} while (0)
 
-#define __HAVE_ARCH_PTEP_ESTABLISH
-#define ptep_establish(__vma, __address, __ptep, __entry)		\
-do {				  					\
-	ptep_set_access_flags(__vma, __address, __ptep, __entry, 1);	\
-} while (0)
-
 #include <xen/features.h>
 void make_lowmem_page_readonly(void *va, unsigned int feature);
 void make_lowmem_page_writable(void *va, unsigned int feature);
@@ -508,6 +523,7 @@ direct_remap_pfn_range(vma,from,pfn,size
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
 #include <asm-generic/pgtable.h>
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h	Thu Apr 05 08:59:12 2007 +0100
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h	Thu Apr 05 09:10:33 2007 +0100
@@ -93,11 +93,6 @@ extern unsigned long empty_zero_page[PAG
 #define pgd_none(x)	(!pgd_val(x))
 #define pud_none(x)	(!pud_val(x))
 
-#define set_pte_batched(pteptr, pteval) \
-	queue_l1_entry_update(pteptr, (pteval))
-
-extern inline int pud_present(pud_t pud)	{ return !pud_none(pud); }
-
 static inline void set_pte(pte_t *dst, pte_t val)
 {
 	*dst = val;
@@ -123,41 +118,6 @@ static inline void pgd_clear (pgd_t * pg
 #define pud_page(pud) \
     ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
 
-/*
- * A note on implementation of this atomic 'get-and-clear' operation.
- * This is actually very simple because Xen Linux can only run on a single
- * processor. Therefore, we cannot race other processors setting the 'accessed'
- * or 'dirty' bits on a page-table entry.
- * Even if pages are shared between domains, that is not a problem because
- * each domain will have separate page tables, with their own versions of
- * accessed & dirty state.
- */
-#define ptep_get_and_clear(mm,addr,xp)	__pte_ma(xchg(&(xp)->pte, 0))
-
-#if 0
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
-{
-        pte_t pte = *xp;
-        if (pte.pte)
-                set_pte(xp, __pte_ma(0));
-        return pte;
-}
-#endif
-
-struct mm_struct;
-
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
-{
-	pte_t pte;
-	if (full) {
-		pte = *ptep;
-		*ptep = __pte(0);
-	} else {
-		pte = ptep_get_and_clear(mm, addr, ptep);
-	}
-	return pte;
-}
-
 #define pte_same(a, b)		((a).pte == (b).pte)
 
 #define pte_pgprot(a)	(__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
@@ -318,6 +278,46 @@ static inline pte_t pfn_pte(unsigned lon
 	return __pte(pte);
 }
 
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (!pte_none(pte)) {
+		if (mm != &init_mm)
+			pte = __pte_ma(xchg(&ptep->pte, 0));
+		else
+			HYPERVISOR_update_va_mapping(addr, __pte(0), 0);
+	}
+	return pte;
+}
+
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
+{
+	if (full) {
+		pte_t pte = *ptep;
+		if (mm->context.pinned)
+			xen_l1_entry_update(ptep, __pte(0));
+		else
+			*ptep = __pte(0);
+		return pte;
+	}
+	return ptep_get_and_clear(mm, addr, ptep);
+}
+
+#define ptep_clear_flush(vma, addr, ptep)			\
+({								\
+	pte_t *__ptep = (ptep);					\
+	pte_t __res = *__ptep;					\
+	if (!pte_none(__res) &&					\
+	    ((vma)->vm_mm != current->mm ||			\
+	     HYPERVISOR_update_va_mapping(addr, __pte(0),	\
+			(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+				UVMF_INVLPG|UVMF_MULTI))) {	\
+		__ptep->pte = 0;				\
+		flush_tlb_page(vma, addr);			\
+	}							\
+	__res;							\
+})
+
 /*
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
@@ -346,31 +346,29 @@ static inline pte_t pte_mkyoung(pte_t pt
 static inline pte_t pte_mkwrite(pte_t pte)	{ __pte_val(pte) |= _PAGE_RW; return pte; }
 static inline pte_t pte_mkhuge(pte_t pte)	{ __pte_val(pte) |= _PAGE_PSE; return pte; }
 
-struct vm_area_struct;
+#define ptep_test_and_clear_dirty(vma, addr, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __ret = pte_dirty(__pte);					\
+	if (__ret)							\
+		set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
+	__ret;								\
+})
 
-static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	int ret = pte_dirty(pte);
-	if (ret)
-		set_pte(ptep, pte_mkclean(pte));
-	return ret;
-}
-
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	int ret = pte_young(pte);
-	if (ret)
-		set_pte(ptep, pte_mkold(pte));
-	return ret;
-}
+#define ptep_test_and_clear_young(vma, addr, ptep)			\
+({									\
+	pte_t __pte = *(ptep);						\
+	int __ret = pte_young(__pte);					\
+	if (__ret)							\
+		set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
+	__ret;								\
+})
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
 	if (pte_write(pte))
-		set_pte(ptep, pte_wrprotect(pte));
+		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
 }
 
 /*
@@ -403,6 +401,7 @@ static inline int pmd_large(pmd_t pte) {
 /* to find an entry in a page-table-directory. */
 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 #define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
+#define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
 
 /* PMD  - Level 2 access */
 #define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
@@ -412,9 +411,13 @@ static inline int pmd_large(pmd_t pte) {
 #define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
                                   pmd_index(address))
 #define pmd_none(x)	(!pmd_val(x))
+#ifdef CONFIG_XEN_COMPAT_030002
 /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
    can temporarily clear it. */
 #define pmd_present(x)	(pmd_val(x))
+#else
+#define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
+#endif
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define pmd_bad(x) ((pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
 		    != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
@@ -468,25 +471,34 @@ static inline pte_t pte_modify(pte_t pte
 
 #define update_mmu_cache(vma,address,pte) do { } while (0)
 
+/*
+ * Rules for using ptep_establish: the pte MUST be a user pte, and
+ * must be a present->present transition.
+ */
+#define __HAVE_ARCH_PTEP_ESTABLISH
+#define ptep_establish(vma, address, ptep, pteval)			\
+	do {								\
+		if ( likely((vma)->vm_mm == current->mm) ) {		\
+			BUG_ON(HYPERVISOR_update_va_mapping(address,	\
+				pteval,					\
+				(unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
+					UVMF_INVLPG|UVMF_MULTI));	\
+		} else {						\
+			xen_l1_entry_update(ptep, pteval);		\
+			flush_tlb_page(vma, address);			\
+		}							\
+	} while (0)
+
 /* We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPU's that might be updating the dirty
 * bit at the same time. */
 #define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-	do {								  \
-		if (__dirty) {						  \
-			if ( likely((__vma)->vm_mm == current->mm) ) {	  \
-				BUG_ON(HYPERVISOR_update_va_mapping(__address, \
-					__entry,			  \
-					(unsigned long)(__vma)->vm_mm->cpu_vm_mask.bits| \
-						UVMF_INVLPG|UVMF_MULTI)); \
-			} else {					  \
-				xen_l1_entry_update(__ptep, __entry);	  \
-				flush_tlb_page(__vma, __address);	  \
-			}						  \
-		}							  \
+#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
+	do {								\
+		if (dirty)						\
+			ptep_establish(vma, address, ptep, entry);	\
 	} while (0)
 
 /* Encode and de-code a swap entry */
@@ -506,6 +518,8 @@ extern int kern_addr_valid(unsigned long
 
 #define DOMID_LOCAL (0xFFFFU)
 
+struct vm_area_struct;
+
 int direct_remap_pfn_range(struct vm_area_struct *vma,
                             unsigned long address,
                             unsigned long mfn,
@@ -551,6 +565,7 @@ int touch_pte_range(struct mm_struct *mm
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
 #include <asm-generic/pgtable.h>
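
A note on the third bullet of the description: a PAE PTE is 64 bits
wide, so native i386 code clears it as two 32-bit stores (low word
first, so that _PAGE_PRESENT vanishes before the high word changes),
and under Xen each of those stores to a write-protected page table is
a separate trap. The new 3-level ptep_get_and_clear() above instead
clears both words in a single access; a minimal sketch of the
technique (function name illustrative only):

	static inline pte_t pae_read_and_clear(pte_t *ptep)
	{
		pte_t pte = *ptep;
		uint64_t val = pte_val_ma(pte);

		/* One cmpxchg8b clears the whole 64-bit entry, so a
		 * pinned (read-only) page table traps into Xen once,
		 * not twice. */
		if (__cmpxchg64(ptep, val, 0) != val) {
			/* The entry changed under us (e.g. a hardware
			 * accessed/dirty-bit update): fall back to the
			 * classic two-access sequence, low word first. */
			pte.pte_low = xchg(&ptep->pte_low, 0);
			pte.pte_high = ptep->pte_high;
			ptep->pte_high = 0;
		}
		return pte;
	}

pte_clear() goes one step further for the current and kernel mms,
letting a single update_va_mapping hypercall clear the slot outright;
that is the change behind the 25-30% kernel-build improvement quoted
in the description.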