direct-io.hg

changeset 10425:533bad7c0883

[LINUX] Add spurious page-fault detection, intended primarily
for spurious write faults on mappings that have been
changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on
the next write access without re-walking the page table.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@dhcp93.uk.xensource.com
date Fri Jun 16 18:19:40 2006 +0100 (2006-06-16)
parents e1ae7b3cb5b7
children 840f33e54054
files linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c
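The race described above, as a minimal sketch; make_page_writable() is a hypothetical stand-in for whichever path upgrades a kernel PTE from read-only to read/write, not a function in this tree:

	/* CPU 0: upgrade the PTE in place.  No TLB flush is needed on x86
	 * for a permission upgrade; the worst case is a spurious fault. */
	make_page_writable(addr);	/* hypothetical: PTE RO -> RW */

	/* CPU 1: may still hold the stale read-only translation in its
	 * TLB and is architecturally allowed to raise #PF on the next
	 * write without re-walking the page table. */
	*(int *)addr = 1;		/* can fault spuriously */

	/* do_page_fault() -> spurious_fault() walks the live page table,
	 * finds a present, writable PTE and simply returns; the faulting
	 * write is restarted and succeeds. */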
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c	Fri Jun 16 18:18:55 2006 +0100
     1.2 +++ b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c	Fri Jun 16 18:19:40 2006 +0100
     1.3 @@ -273,6 +273,49 @@ static void dump_fault_path(unsigned lon
     1.4  }
     1.5  #endif
     1.6  
     1.7 +static int spurious_fault(struct pt_regs *regs,
     1.8 +			  unsigned long address,
     1.9 +			  unsigned long error_code)
    1.10 +{
    1.11 +	pgd_t *pgd;
    1.12 +	pud_t *pud;
    1.13 +	pmd_t *pmd;
    1.14 +	pte_t *pte;
    1.15 +
    1.16 +#ifdef CONFIG_XEN
    1.17 +	/* Faults in hypervisor area are never spurious. */
    1.18 +	if (address >= HYPERVISOR_VIRT_START)
    1.19 +		return 0;
    1.20 +#endif
    1.21 +
    1.22 +	/* Reserved-bit violation or user access to kernel space? */
    1.23 +	if (error_code & 0x0c)
    1.24 +		return 0;
    1.25 +
    1.26 +	pgd = init_mm.pgd + pgd_index(address);
    1.27 +	if (!pgd_present(*pgd))
    1.28 +		return 0;
    1.29 +
    1.30 +	pud = pud_offset(pgd, address);
    1.31 +	if (!pud_present(*pud))
    1.32 +		return 0;
    1.33 +
    1.34 +	pmd = pmd_offset(pud, address);
    1.35 +	if (!pmd_present(*pmd))
    1.36 +		return 0;
    1.37 +
    1.38 +	pte = pte_offset_kernel(pmd, address);
    1.39 +	if (!pte_present(*pte))
    1.40 +		return 0;
    1.41 +	if ((error_code & 0x02) && !pte_write(*pte))
    1.42 +		return 0;
    1.43 +#ifdef CONFIG_X86_PAE
    1.44 +	if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
    1.45 +		return 0;
    1.46 +#endif
    1.47 +
    1.48 +	return 1;
    1.49 +}
    1.50  
    1.51  /*
    1.52   * This routine handles page faults.  It determines the address,
    1.53 @@ -327,8 +370,16 @@ fastcall void __kprobes do_page_fault(st
    1.54  	 * protection error (error_code & 1) == 0.
    1.55  	 */
    1.56  	if (unlikely(address >= TASK_SIZE)) { 
    1.57 +#ifdef CONFIG_XEN
    1.58 +		/* Faults in hypervisor area can never be patched up. */
    1.59 +		if (address >= HYPERVISOR_VIRT_START)
    1.60 +			goto bad_area_nosemaphore;
    1.61 +#endif
    1.62  		if (!(error_code & 5))
    1.63  			goto vmalloc_fault;
    1.64 +		/* Can take a spurious fault if mapping changes R/O -> R/W. */
    1.65 +		if (spurious_fault(regs, address, error_code))
    1.66 +			return;
    1.67  		/* 
    1.68  		 * Don't take the mm semaphore here. If we fixup a prefetch
    1.69  		 * fault we could otherwise deadlock.
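For reference, the raw masks in the i386 hunk above are the architectural x86 page-fault error-code bits; the x86_64 file below already spells them with PF_* names:

	#define PF_PROT		0x01	/* 0: not-present page; 1: protection violation */
	#define PF_WRITE	0x02	/* fault was caused by a write access */
	#define PF_USER		0x04	/* fault originated in user mode */
	#define PF_RSVD		0x08	/* reserved bit set in a paging-structure entry */
	#define PF_INSTR	0x10	/* fault was an instruction fetch (NX) */

So in the i386 walk, 0x0c tests PF_RSVD|PF_USER, 0x02 tests PF_WRITE, and 0x10 tests PF_INSTR.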
     2.1 --- a/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c	Fri Jun 16 18:18:55 2006 +0100
     2.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c	Fri Jun 16 18:19:40 2006 +0100
     2.3 @@ -307,6 +307,49 @@ int exception_trace = 1;
     2.4  #define MEM_LOG(_f, _a...) ((void)0)
     2.5  #endif
     2.6  
     2.7 +static int spurious_fault(struct pt_regs *regs,
     2.8 +			  unsigned long address,
     2.9 +			  unsigned long error_code)
    2.10 +{
    2.11 +	pgd_t *pgd;
    2.12 +	pud_t *pud;
    2.13 +	pmd_t *pmd;
    2.14 +	pte_t *pte;
    2.15 +
    2.16 +#ifdef CONFIG_XEN
    2.17 +	/* Faults in hypervisor area are never spurious. */
    2.18 +	if ((address >= HYPERVISOR_VIRT_START) &&
    2.19 +	    (address < HYPERVISOR_VIRT_END))
    2.20 +		return 0;
    2.21 +#endif
    2.22 +
    2.23 +	/* Reserved-bit violation or user access to kernel space? */
     2.24 +	if (error_code & (PF_RSVD|PF_USER))
    2.25 +		return 0;
    2.26 +
    2.27 +	pgd = init_mm.pgd + pgd_index(address);
    2.28 +	if (!pgd_present(*pgd))
    2.29 +		return 0;
    2.30 +
    2.31 +	pud = pud_offset(pgd, address);
    2.32 +	if (!pud_present(*pud))
    2.33 +		return 0;
    2.34 +
    2.35 +	pmd = pmd_offset(pud, address);
    2.36 +	if (!pmd_present(*pmd))
    2.37 +		return 0;
    2.38 +
    2.39 +	pte = pte_offset_kernel(pmd, address);
    2.40 +	if (!pte_present(*pte))
    2.41 +		return 0;
    2.42 +	if ((error_code & PF_WRITE) && !pte_write(*pte))
    2.43 +		return 0;
    2.44 +	if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
    2.45 +		return 0;
    2.46 +
    2.47 +	return 1;
    2.48 +}
    2.49 +
    2.50  /*
    2.51   * This routine handles page faults.  It determines the address,
    2.52   * and the problem, and then passes it off to one of the appropriate
    2.53 @@ -361,16 +404,19 @@ asmlinkage void __kprobes do_page_fault(
    2.54  	 */
    2.55  	if (unlikely(address >= TASK_SIZE64)) {
    2.56  		/*
    2.57 -		 * Must check for the entire kernel range here: with writable
    2.58 -		 * page tables the hypervisor may temporarily clear PMD
    2.59 -		 * entries.
    2.60 +		 * Don't check for the module range here: its PML4
    2.61 +		 * is always initialized because it's shared with the main
    2.62 +		 * kernel text. Only vmalloc may need PML4 syncups.
    2.63  		 */
    2.64  		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
    2.65 -		    address >= PAGE_OFFSET) {
     2.66 +		    (address >= VMALLOC_START && address < VMALLOC_END)) {
    2.67  			if (vmalloc_fault(address) < 0)
    2.68  				goto bad_area_nosemaphore;
    2.69  			return;
    2.70  		}
    2.71 +		/* Can take a spurious fault if mapping changes R/O -> R/W. */
    2.72 +		if (spurious_fault(regs, address, error_code))
    2.73 +			return;
    2.74  		/*
    2.75  		 * Don't take the mm semaphore here. If we fixup a prefetch
    2.76  		 * fault we could otherwise deadlock.
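One note on the RSVD/USER test in the x86_64 spurious_fault(): in C, & binds tighter than |, so the mask needs the inner parentheses shown above. Without them the expression parses as (error_code & PF_RSVD) | PF_USER, which is nonzero for every fault and would make spurious_fault() reject everything. A standalone check of the two parses:

	#include <assert.h>

	#define PF_USER 0x04
	#define PF_RSVD 0x08

	int main(void)
	{
		unsigned long error_code = 0x02; /* plain kernel write fault */

		/* Unparenthesized: (error_code & PF_RSVD) | PF_USER,
		 * nonzero no matter what error_code holds. */
		assert((error_code & PF_RSVD | PF_USER) != 0);

		/* Intended test: is RSVD or USER actually set? */
		assert((error_code & (PF_RSVD | PF_USER)) == 0);
		return 0;
	}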