direct-io.hg

changeset 10424:e1ae7b3cb5b7

[XEN] Make the spurious page-fault detection logic
more robust. In particular, it must be able to handle
spurious write faults on mappings that have been
changed from read-only to writable: if a CPU has a
stale read-only entry in its TLB, it is permitted to
fault on the next write access without re-walking the
page table.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@dhcp93.uk.xensource.com
date Fri Jun 16 18:18:55 2006 +0100 (2006-06-16)
parents 05ab081f3c67
children 533bad7c0883
files xen/arch/x86/traps.c xen/arch/x86/x86_32/traps.c xen/arch/x86/x86_64/traps.c xen/include/asm-x86/processor.h
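
Background for the walker added below: the patch recomputes, from the
hardware #PF error code, the set of PTE flags the faulting access needed
(required) and the flags it must not have seen (disallowed). If the
in-memory page tables already satisfy the access, the fault can only have
come from a stale TLB entry and may be dropped. A minimal, self-contained
sketch of that test, not Xen code -- the PTE_* values are illustrative
x86 bit positions, and the check is shown in its all-required-bits form:

    #include <stdint.h>
    #include <stdio.h>

    /* #PF error-code bits, mirroring the PGERR_* values the patch adds. */
    #define PGERR_page_present (1U << 0)
    #define PGERR_write_access (1U << 1)
    #define PGERR_user_mode    (1U << 2)
    #define PGERR_instr_fetch  (1U << 4)

    /* Illustrative PTE flag bits (x86 positions; NX is bit 63). */
    #define PTE_PRESENT (1ULL << 0)
    #define PTE_RW      (1ULL << 1)
    #define PTE_USER    (1ULL << 2)
    #define PTE_NX      (1ULL << 63)

    int main(void)
    {
        /* Supervisor write fault on a present mapping: error code 0x03. */
        uint32_t ec = PGERR_page_present | PGERR_write_access;

        uint64_t required = PTE_PRESENT;
        if (ec & PGERR_write_access) required |= PTE_RW;
        if (ec & PGERR_user_mode)    required |= PTE_USER;
        uint64_t disallowed = (ec & PGERR_instr_fetch) ? PTE_NX : 0;

        /* The mapping has meanwhile been changed from read-only to
         * writable, but this CPU's TLB still holds the old entry. */
        uint64_t pte = PTE_PRESENT | PTE_RW;

        int spurious = ((pte & required) == required) && !(pte & disallowed);
        printf("spurious=%d\n", spurious);   /* prints spurious=1 */
        return 0;
    }
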
line diff
     1.1 --- a/xen/arch/x86/traps.c	Fri Jun 16 18:08:27 2006 +0100
     1.2 +++ b/xen/arch/x86/traps.c	Fri Jun 16 18:18:55 2006 +0100
     1.3 @@ -511,9 +511,9 @@ void propagate_page_fault(unsigned long 
     1.4      v->vcpu_info->arch.cr2           = addr;
     1.5  
     1.6      /* Re-set error_code.user flag appropriately for the guest. */
     1.7 -    error_code &= ~4;
     1.8 +    error_code &= ~PGERR_user_mode;
     1.9      if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
    1.10 -        error_code |= 4;
    1.11 +        error_code |= PGERR_user_mode;
    1.12  
    1.13      ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
    1.14      tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
    1.15 @@ -578,10 +578,125 @@ static int handle_gdt_ldt_mapping_fault(
    1.16      (((va) >= HYPERVISOR_VIRT_START))
    1.17  #endif
    1.18  
    1.19 +static int __spurious_page_fault(
    1.20 +    unsigned long addr, struct cpu_user_regs *regs)
    1.21 +{
    1.22 +    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
    1.23 +#if CONFIG_PAGING_LEVELS >= 4
    1.24 +    l4_pgentry_t l4e, *l4t;
    1.25 +#endif
    1.26 +#if CONFIG_PAGING_LEVELS >= 3
    1.27 +    l3_pgentry_t l3e, *l3t;
    1.28 +#endif
    1.29 +    l2_pgentry_t l2e, *l2t;
    1.30 +    l1_pgentry_t l1e, *l1t;
    1.31 +    unsigned int required_flags, disallowed_flags;
    1.32 +
    1.33 +    required_flags  = _PAGE_PRESENT;
    1.34 +    if ( regs->error_code & PGERR_write_access )
    1.35 +        required_flags |= _PAGE_RW;
    1.36 +    if ( regs->error_code & PGERR_user_mode )
    1.37 +        required_flags |= _PAGE_USER;
    1.38 +
    1.39 +    disallowed_flags = 0;
    1.40 +    if ( regs->error_code & PGERR_instr_fetch )
    1.41 +        disallowed_flags |= _PAGE_NX;
    1.42 +
    1.43 +#if CONFIG_PAGING_LEVELS >= 4
    1.44 +    l4t = map_domain_page(mfn);
    1.45 +    l4e = l4t[l4_table_offset(addr)];
    1.46 +    mfn = l4e_get_pfn(l4e);
    1.47 +    unmap_domain_page(l4t);
     1.48 +    if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
    1.49 +         (l4e_get_flags(l4e) & disallowed_flags) )
    1.50 +        return 0;
    1.51 +#endif
    1.52 +
    1.53 +#if CONFIG_PAGING_LEVELS >= 3
    1.54 +    l3t = map_domain_page(mfn);
    1.55 +    l3e = l3t[l3_table_offset(addr)];
    1.56 +    mfn = l3e_get_pfn(l3e);
    1.57 +    unmap_domain_page(l3t);
    1.58 +#ifdef CONFIG_X86_PAE
    1.59 +    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
    1.60 +        return 0;
    1.61 +#else
     1.62 +    if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
    1.63 +         (l3e_get_flags(l3e) & disallowed_flags) )
    1.64 +        return 0;
    1.65 +#endif
    1.66 +#endif
    1.67 +
    1.68 +    l2t = map_domain_page(mfn);
    1.69 +    l2e = l2t[l2_table_offset(addr)];
    1.70 +    mfn = l2e_get_pfn(l2e);
    1.71 +    unmap_domain_page(l2t);
     1.72 +    if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
    1.73 +         (l2e_get_flags(l2e) & disallowed_flags) )
    1.74 +        return 0;
    1.75 +    if ( l2e_get_flags(l2e) & _PAGE_PSE )
    1.76 +        return 1;
    1.77 +
    1.78 +    l1t = map_domain_page(mfn);
    1.79 +    l1e = l1t[l1_table_offset(addr)];
    1.80 +    mfn = l1e_get_pfn(l1e);
    1.81 +    unmap_domain_page(l1t);
     1.82 +    if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
    1.83 +         (l1e_get_flags(l1e) & disallowed_flags) )
    1.84 +        return 0;
    1.85 +    return 1;
    1.86 +}
    1.87 +
    1.88 +static int spurious_page_fault(
    1.89 +    unsigned long addr, struct cpu_user_regs *regs)
    1.90 +{
    1.91 +    struct vcpu   *v = current;
    1.92 +    struct domain *d = v->domain;
    1.93 +    int            is_spurious;
    1.94 +
    1.95 +    /* Reserved bit violations are never spurious faults. */
    1.96 +    if ( regs->error_code & PGERR_reserved_bit )
    1.97 +        return 0;
    1.98 +
    1.99 +    LOCK_BIGLOCK(d);
   1.100 +
   1.101 +    is_spurious = __spurious_page_fault(addr, regs);
   1.102 +    if ( is_spurious )
   1.103 +        goto out;
   1.104 +
   1.105 +    /*
   1.106 +     * The only possible reason for a spurious page fault not to be picked
   1.107 +     * up already is that a page directory was unhooked by writable page table
   1.108 +     * logic and then reattached before the faulting VCPU could detect it.
   1.109 +     */
   1.110 +    if ( is_idle_domain(d) ||               /* no ptwr in idle domain       */
   1.111 +         IN_HYPERVISOR_RANGE(addr) ||       /* no ptwr on hypervisor addrs  */
   1.112 +         shadow_mode_enabled(d) ||          /* no ptwr logic in shadow mode */
   1.113 +         (regs->error_code & PGERR_page_present) ) /* not-present fault?    */
   1.114 +        goto out;
   1.115 +
   1.116 +    /*
   1.117 +     * The page directory could have been detached again while we weren't
   1.118 +     * holding the per-domain lock. Detect that and fix up if it's the case.
   1.119 +     */
   1.120 +    if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
   1.121 +         unlikely(l2_linear_offset(addr) ==
   1.122 +                  d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
   1.123 +    {
   1.124 +        ptwr_flush(d, PTWR_PT_ACTIVE);
   1.125 +        is_spurious = 1;
   1.126 +    }
   1.127 +
   1.128 + out:
   1.129 +    UNLOCK_BIGLOCK(d);
   1.130 +    return is_spurious;
   1.131 +}
   1.132 +
   1.133  static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
   1.134  {
   1.135      struct vcpu   *v = current;
   1.136      struct domain *d = v->domain;
   1.137 +    int            rc;
   1.138  
   1.139      if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
   1.140      {
   1.141 @@ -590,12 +705,20 @@ static int fixup_page_fault(unsigned lon
   1.142          if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
   1.143              return handle_gdt_ldt_mapping_fault(
   1.144                  addr - GDT_LDT_VIRT_START, regs);
   1.145 +        /*
   1.146 +         * Do not propagate spurious faults in the hypervisor area to the
   1.147 +         * guest. It cannot fix them up.
   1.148 +         */
   1.149 +        LOCK_BIGLOCK(d);
   1.150 +        rc = __spurious_page_fault(addr, regs);
   1.151 +        UNLOCK_BIGLOCK(d);
   1.152 +        return rc;
   1.153      }
   1.154 -    else if ( unlikely(shadow_mode_enabled(d)) )
   1.155 -    {
   1.156 +
   1.157 +    if ( unlikely(shadow_mode_enabled(d)) )
   1.158          return shadow_fault(addr, regs);
   1.159 -    }
   1.160 -    else if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
   1.161 +
   1.162 +    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
   1.163      {
   1.164          LOCK_BIGLOCK(d);
   1.165          if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
   1.166 @@ -609,7 +732,10 @@ static int fixup_page_fault(unsigned lon
   1.167  
   1.168          if ( guest_kernel_mode(v, regs) &&
   1.169               /* Protection violation on write? No reserved-bit violation? */
   1.170 -             ((regs->error_code & 0xb) == 0x3) &&
   1.171 +             ((regs->error_code & (PGERR_page_present |
   1.172 +                                   PGERR_write_access |
   1.173 +                                   PGERR_reserved_bit)) ==
   1.174 +              (PGERR_page_present | PGERR_write_access)) &&
   1.175               ptwr_do_page_fault(d, addr, regs) )
   1.176          {
   1.177              UNLOCK_BIGLOCK(d);
   1.178 @@ -621,46 +747,6 @@ static int fixup_page_fault(unsigned lon
   1.179      return 0;
   1.180  }
   1.181  
   1.182 -static int spurious_page_fault(unsigned long addr, struct cpu_user_regs *regs)
   1.183 -{
   1.184 -    struct vcpu   *v = current;
   1.185 -    struct domain *d = v->domain;
   1.186 -    int            rc;
   1.187 -
   1.188 -    /*
   1.189 -     * The only possible reason for a spurious page fault not to be picked
   1.190 -     * up already is that a page directory was unhooked by writable page table
   1.191 -     * logic and then reattached before the faulting VCPU could detect it.
   1.192 -     */
   1.193 -    if ( is_idle_domain(d) ||               /* no ptwr in idle domain       */
   1.194 -         IN_HYPERVISOR_RANGE(addr) ||       /* no ptwr on hypervisor addrs  */
   1.195 -         shadow_mode_enabled(d) ||          /* no ptwr logic in shadow mode */
   1.196 -         ((regs->error_code & 0x1d) != 0) ) /* simple not-present fault?    */
   1.197 -        return 0;
   1.198 -
   1.199 -    LOCK_BIGLOCK(d);
   1.200 -
   1.201 -    /*
   1.202 -     * The page directory could have been detached again while we weren't
   1.203 -     * holding the per-domain lock. Detect that and fix up if it's the case.
   1.204 -     */
   1.205 -    if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
   1.206 -         unlikely(l2_linear_offset(addr) ==
   1.207 -                  d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
   1.208 -    {
   1.209 -        ptwr_flush(d, PTWR_PT_ACTIVE);
   1.210 -        rc = 1;
   1.211 -    }
   1.212 -    else
   1.213 -    {
   1.214 -        /* Okay, walk the page tables. Only check for not-present faults.*/
   1.215 -        rc = __spurious_page_fault(addr);
   1.216 -    }
   1.217 -
   1.218 -    UNLOCK_BIGLOCK(d);
   1.219 -    return rc;
   1.220 -}
   1.221 -
   1.222  /*
   1.223   * #PF error code:
   1.224   *  Bit 0: Protection violation (=1) ; Page not present (=0)
   1.225 @@ -784,8 +870,8 @@ static inline int admin_io_okay(
   1.226      (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
   1.227  
   1.228  /* Propagate a fault back to the guest kernel. */
   1.229 -#define USER_READ_FAULT  4 /* user mode, read fault */
   1.230 -#define USER_WRITE_FAULT 6 /* user mode, write fault */
   1.231 +#define USER_READ_FAULT  (PGERR_user_mode)
   1.232 +#define USER_WRITE_FAULT (PGERR_user_mode | PGERR_write_access)
   1.233  #define PAGE_FAULT(_faultaddr, _errcode)        \
   1.234  ({  propagate_page_fault(_faultaddr, _errcode); \
   1.235      return EXCRET_fault_fixed;                  \
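
A quick arithmetic check on the symbolic rewrite of the ptwr test above:
the new mask/value pair is bit-for-bit the old literal
((regs->error_code & 0xb) == 0x3). A tiny standalone verification using
only the PGERR_* values from the patch:

    #include <assert.h>

    #define PGERR_page_present (1U << 0)
    #define PGERR_write_access (1U << 1)
    #define PGERR_reserved_bit (1U << 3)

    int main(void)
    {
        unsigned int mask = PGERR_page_present | PGERR_write_access |
                            PGERR_reserved_bit;                /* 1|2|8 = 0xb */
        unsigned int want = PGERR_page_present | PGERR_write_access; /* 0x3 */
        assert(mask == 0xb && want == 0x3);
        /* So ((ec & mask) == want) reads: protection violation on a write,
         * with no reserved-bit violation -- exactly the old 0xb/0x3 test. */
        return 0;
    }
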
     2.1 --- a/xen/arch/x86/x86_32/traps.c	Fri Jun 16 18:08:27 2006 +0100
     2.2 +++ b/xen/arch/x86/x86_32/traps.c	Fri Jun 16 18:18:55 2006 +0100
     2.3 @@ -113,40 +113,6 @@ void show_page_walk(unsigned long addr)
     2.4      unmap_domain_page(l1t);
     2.5  }
     2.6  
     2.7 -int __spurious_page_fault(unsigned long addr)
     2.8 -{
     2.9 -    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
    2.10 -#ifdef CONFIG_X86_PAE
    2.11 -    l3_pgentry_t l3e, *l3t;
    2.12 -#endif
    2.13 -    l2_pgentry_t l2e, *l2t;
    2.14 -    l1_pgentry_t l1e, *l1t;
    2.15 -
    2.16 -#ifdef CONFIG_X86_PAE
    2.17 -    l3t = map_domain_page(mfn);
    2.18 -    l3e = l3t[l3_table_offset(addr)];
    2.19 -    mfn = l3e_get_pfn(l3e);
    2.20 -    unmap_domain_page(l3t);
    2.21 -    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
    2.22 -        return 0;
    2.23 -#endif
    2.24 -
    2.25 -    l2t = map_domain_page(mfn);
    2.26 -    l2e = l2t[l2_table_offset(addr)];
    2.27 -    mfn = l2e_get_pfn(l2e);
    2.28 -    unmap_domain_page(l2t);
    2.29 -    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
    2.30 -        return 0;
    2.31 -    if ( l2e_get_flags(l2e) & _PAGE_PSE )
    2.32 -        return 1;
    2.33 -
    2.34 -    l1t = map_domain_page(mfn);
    2.35 -    l1e = l1t[l1_table_offset(addr)];
    2.36 -    mfn = l1e_get_pfn(l1e);
    2.37 -    unmap_domain_page(l1t);
    2.38 -    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
    2.39 -}
    2.40 -
    2.41  #define DOUBLEFAULT_STACK_SIZE 1024
    2.42  static struct tss_struct doublefault_tss;
    2.43  static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
     3.1 --- a/xen/arch/x86/x86_64/traps.c	Fri Jun 16 18:08:27 2006 +0100
     3.2 +++ b/xen/arch/x86/x86_64/traps.c	Fri Jun 16 18:18:55 2006 +0100
     3.3 @@ -115,40 +115,6 @@ void show_page_walk(unsigned long addr)
     3.4      printk("    L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
     3.5  }
     3.6  
     3.7 -int __spurious_page_fault(unsigned long addr)
     3.8 -{
     3.9 -    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
    3.10 -    l4_pgentry_t l4e, *l4t;
    3.11 -    l3_pgentry_t l3e, *l3t;
    3.12 -    l2_pgentry_t l2e, *l2t;
    3.13 -    l1_pgentry_t l1e, *l1t;
    3.14 -
    3.15 -    l4t = mfn_to_virt(mfn);
    3.16 -    l4e = l4t[l4_table_offset(addr)];
    3.17 -    mfn = l4e_get_pfn(l4e);
    3.18 -    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
    3.19 -        return 0;
    3.20 -
    3.21 -    l3t = mfn_to_virt(mfn);
    3.22 -    l3e = l3t[l3_table_offset(addr)];
    3.23 -    mfn = l3e_get_pfn(l3e);
    3.24 -    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
    3.25 -        return 0;
    3.26 -
    3.27 -    l2t = mfn_to_virt(mfn);
    3.28 -    l2e = l2t[l2_table_offset(addr)];
    3.29 -    mfn = l2e_get_pfn(l2e);
    3.30 -    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
    3.31 -        return 0;
    3.32 -    if ( l2e_get_flags(l2e) & _PAGE_PSE )
    3.33 -        return 1;
    3.34 -
    3.35 -    l1t = mfn_to_virt(mfn);
    3.36 -    l1e = l1t[l1_table_offset(addr)];
    3.37 -    mfn = l1e_get_pfn(l1e);
    3.38 -    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
    3.39 -}
    3.40 -
    3.41  asmlinkage void double_fault(void);
    3.42  asmlinkage void do_double_fault(struct cpu_user_regs *regs)
    3.43  {
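
The two per-architecture walkers deleted above answered only "is the
translation present?", which cannot classify a write fault against a
mapping that has just become writable. The common walker added to
xen/arch/x86/traps.c folds the access rights into the walk. A minimal
sketch of the semantic difference, with illustrative flag values rather
than Xen's:

    #include <stdint.h>
    #include <stdio.h>

    #define PTE_PRESENT (1ULL << 0)
    #define PTE_RW      (1ULL << 1)

    /* Old semantics: spurious iff the translation merely exists. */
    static int present_only(uint64_t pte)
    {
        return !!(pte & PTE_PRESENT);
    }

    /* New semantics: spurious only if the translation also grants the
     * faulting access (all required flags set, no disallowed flag set). */
    static int rights_aware(uint64_t pte, uint64_t required,
                            uint64_t disallowed)
    {
        return ((pte & required) == required) && !(pte & disallowed);
    }

    int main(void)
    {
        uint64_t ro  = PTE_PRESENT;            /* mapping still read-only  */
        uint64_t rw  = PTE_PRESENT | PTE_RW;   /* mapping made writable    */
        uint64_t req = PTE_PRESENT | PTE_RW;   /* a write access faulted   */

        /* Presence alone cannot tell the two cases apart... */
        printf("present_only: ro=%d rw=%d\n",
               present_only(ro), present_only(rw));
        /* ...while the rights-aware walk flags only the stale-TLB case. */
        printf("rights_aware: ro=%d rw=%d\n",
               rights_aware(ro, req, 0), rights_aware(rw, req, 0));
        return 0;
    }
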
     4.1 --- a/xen/include/asm-x86/processor.h	Fri Jun 16 18:08:27 2006 +0100
     4.2 +++ b/xen/include/asm-x86/processor.h	Fri Jun 16 18:18:55 2006 +0100
     4.3 @@ -129,6 +129,13 @@
     4.4  #define _TF_kernel_mode        0
     4.5  #define TF_kernel_mode         (1<<_TF_kernel_mode)
     4.6  
     4.7 +/* #PF error code values. */
     4.8 +#define PGERR_page_present   (1U<<0)
     4.9 +#define PGERR_write_access   (1U<<1)
    4.10 +#define PGERR_user_mode      (1U<<2)
    4.11 +#define PGERR_reserved_bit   (1U<<3)
    4.12 +#define PGERR_instr_fetch    (1U<<4)
    4.13 +
    4.14  #ifndef __ASSEMBLY__
    4.15  
    4.16  struct domain;
    4.17 @@ -524,7 +531,6 @@ extern always_inline void prefetchw(cons
    4.18  void show_stack(struct cpu_user_regs *regs);
    4.19  void show_registers(struct cpu_user_regs *regs);
    4.20  void show_page_walk(unsigned long addr);
    4.21 -int __spurious_page_fault(unsigned long addr);
    4.22  asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs);
    4.23  
    4.24  extern void mtrr_ap_init(void);
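
One hypothetical use of the new PGERR_* names beyond this patch: decoding
a raw #PF error code for a log message. The describe_pf helper below is
illustrative only, not part of Xen:

    #include <stdio.h>

    #define PGERR_page_present (1U << 0)
    #define PGERR_write_access (1U << 1)
    #define PGERR_user_mode    (1U << 2)
    #define PGERR_reserved_bit (1U << 3)
    #define PGERR_instr_fetch  (1U << 4)

    /* Hypothetical helper: render an error code as readable text. */
    static void describe_pf(unsigned int ec)
    {
        printf("%s-mode %s, %s%s\n",
               (ec & PGERR_user_mode)    ? "user" : "supervisor",
               (ec & PGERR_instr_fetch)  ? "instruction fetch" :
               (ec & PGERR_write_access) ? "write" : "read",
               (ec & PGERR_page_present) ? "protection violation"
                                         : "page not present",
               (ec & PGERR_reserved_bit) ? " (reserved bit set)" : "");
    }

    int main(void)
    {
        describe_pf(0x3);   /* supervisor-mode write, protection violation */
        describe_pf(0x4);   /* user-mode read, page not present */
        return 0;
    }
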