ia64/xen-unstable

changeset 17151:8612d3d9578a

x86 shadow: Remove lock on first guest table walk.

The existing shadow fault path grabs the big shadow lock before
walking the guest tables, to keep the walk consistent with the
shadow content in case a misbehaving OS changes the tables
concurrently from another vcpu. But this lock causes heavy
contention when scaling up, even for a well-behaved guest that
already prevents such concurrent changes. So this patch removes
the lock from the first guest table walk and defers the
consistency checks to a few special points.

The key is to detect whether any guest table update happened
between the first (lockless) walk and the taking of the shadow
lock. Two operations serve as hints of a guest table update:
* write permission removal
* write emulation
If either operation is observed within the race window, the
result of the earlier walk may be stale and must be re-checked
against the guest tables. On a mismatch, we simply return to
trigger another fault.
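
In outline, the post-patch fault path works as in the sketch below
(condensed from sh_page_fault() and guest_walk_tables() in the diff
that follows; declarations and error handling omitted):

    /* 1st walk, no lock held: snapshot the version before reading
     * any guest entries (guest_walk_tables() does this internally). */
    gw.version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
    rmb();
    /* ... lockless read of the guest pagetable entries into gw ... */

    shadow_lock(d);

    /* Hint 1: removing write access means overlapping walks on other
     * vcpus may now be stale, so advance the version and flush. */
    if ( gw_remove_write_accesses(v, va, &gw) )
    {
        atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
        flush_tlb_mask(d->domain_dirty_cpumask);
    }
    /* (Hint 2: write emulation bumps the version in emulate_unmap_dest().) */

    /* If the version moved since the snapshot, re-read the guest
     * entries under the lock; on mismatch, return and re-fault. */
    if ( !shadow_check_gwalk(v, va, &gw) )
    {
        shadow_unlock(d);
        return EXCRET_fault_fixed;
    }
    /* ... safe to propagate gw into the shadow tables under the lock ... */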

I ran some experiments sampling perfc counts:
<64bit guest>
3.7% of gwalks are re-checked
Of the re-checks, 68% come from write permission removal
<32bit pae guest>
7.2% of gwalks are re-checked
Of the re-checks, 54.9% come from write permission removal

Note that the earlier fast-emulation optimization already skips
many guest table walks, so the ratios above would be smaller if
compared against the total shadow fault count.

Shadow promotion with write permission removal does incur higher
overhead, but the benefit of reduced lock contention clearly
outweighs it.

Kernel-compile improvement with this patch:
(64bit Xen)
32bit guest: 1.1%
pae guest: 0.4%
64bit guest: 0.5%

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Feb 28 13:18:29 2008 +0000 (2008-02-28)
parents 6b875abd0a9e
children 36529ef3ef23
files xen/arch/x86/mm/shadow/multi.c xen/arch/x86/mm/shadow/types.h xen/include/asm-x86/domain.h xen/include/asm-x86/perfc_defn.h
line diff
     1.1 --- a/xen/arch/x86/mm/shadow/multi.c	Thu Feb 28 13:10:28 2008 +0000
     1.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Thu Feb 28 13:18:29 2008 +0000
     1.3 @@ -55,12 +55,6 @@
     1.4   * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
     1.5   * to share l2h pages again. 
     1.6   *
     1.7 - * GUEST_WALK_TABLES TLB FLUSH COALESCE
     1.8 - * guest_walk_tables can do up to three remote TLB flushes as it walks to
     1.9 - * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
    1.10 - * and if we do flush, re-do the walk.  If anything has changed, then 
    1.11 - * pause all the other vcpus and do the walk *again*.
    1.12 - *
    1.13   * PSE disabled / PSE36
    1.14   * We don't support any modes other than PSE enabled, PSE36 disabled.
    1.15   * Neither of those would be hard to change, but we'd need to be able to 
    1.16 @@ -246,10 +240,97 @@ static uint32_t set_ad_bits(void *guest_
    1.17      return 0;
    1.18  }
    1.19  
     1.20 +/* This validation is called with the shadow lock held, and after any
     1.21 + * write permission removal, so the check is atomic: no further
     1.22 + * inconsistent state can be introduced before the lock is released.
     1.23 + *
     1.24 + * Returns 1 if the walk is still consistent, 0 on inconsistency.
     1.25 + */
    1.26 +static inline uint32_t
    1.27 +shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
    1.28 +{
    1.29 +    struct domain *d = v->domain;
    1.30 +    guest_l1e_t *l1p;
    1.31 +    guest_l2e_t *l2p;
    1.32 +#if GUEST_PAGING_LEVELS >= 4
    1.33 +    guest_l3e_t *l3p;
    1.34 +    guest_l4e_t *l4p;
    1.35 +#endif
     1.36 +    int mismatch = 0;
    1.37 +    ASSERT(shadow_locked_by_me(d));
    1.38 +
    1.39 +    if ( gw->version ==
    1.40 +         atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
    1.41 +        return 1;
    1.42 +
     1.43 +    /* We could cache the guest page mappings from the last
     1.44 +     * guest table walk.  However, this check happens relatively
     1.45 +     * infrequently, so the small cost of remapping the guest
     1.46 +     * pages here is better than caching the mappings on every
     1.47 +     * guest table walk.
     1.48 +     *
     1.49 +     * Also, when an inconsistency is found, we simply return to
     1.50 +     * trigger another fault rather than re-validating the new
     1.51 +     * walk, which keeps the logic simple.
     1.52 +     */
    1.53 +    perfc_incr(shadow_check_gwalk);
    1.54 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    1.55 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    1.56 +    l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
    1.57 +    if ( gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4 )
    1.58 +        return 0;
    1.59 +    l3p = sh_map_domain_page(gw->l3mfn);
     1.60 +    mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
     1.61 +    sh_unmap_domain_page(l3p);
    1.62 +#else
    1.63 +    if ( gw->l3e.l3 !=
    1.64 +         v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3 )
    1.65 +        return 0;
    1.66 +#endif
    1.67 +    l2p = sh_map_domain_page(gw->l2mfn);
     1.68 +    mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
     1.69 +    sh_unmap_domain_page(l2p);
    1.70 +#else
    1.71 +    l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
    1.72 +    if ( gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2 )
    1.73 +        return 0;
    1.74 +#endif
    1.75 +    if ( !(guest_supports_superpages(v) &&
    1.76 +           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
    1.77 +    {
    1.78 +        l1p = sh_map_domain_page(gw->l1mfn);
     1.79 +        mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
     1.80 +        sh_unmap_domain_page(l1p);
    1.81 +    }
    1.82 +
     1.83 +    return !mismatch;
    1.84 +}
    1.85 +
     1.86 +/* Remove write access permissions from a walk_t's pagetable mfns in
     1.87 + * a batch, and return the OR-ed result as a TLB-flush hint.
     1.88 + */
    1.89 +static inline uint32_t
    1.90 +gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
    1.91 +{
    1.92 +    int rc = 0;
    1.93 +
    1.94 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    1.95 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    1.96 +    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
    1.97 +#endif
    1.98 +    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
    1.99 +#endif
   1.100 +    if ( !(guest_supports_superpages(v) &&
   1.101 +           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
   1.102 +        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
   1.103 +
   1.104 +    return rc;
   1.105 +}
   1.106 +
   1.107  /* Walk the guest pagetables, after the manner of a hardware walker. 
   1.108   *
   1.109   * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
   1.110 - *         pointer to a pagefault code, and a flag "shadow_op".
   1.111 + *         pointer to a pagefault code
   1.112   * 
   1.113   * We walk the vcpu's guest pagetables, filling the walk_t with what we
   1.114   * see and adding any Accessed and Dirty bits that are needed in the
   1.115 @@ -257,10 +338,9 @@ static uint32_t set_ad_bits(void *guest_
   1.116   * we go.  For the purposes of reading pagetables we treat all non-RAM
   1.117   * memory as contining zeroes.
   1.118   * 
   1.119 - * If "shadow_op" is non-zero, we are serving a genuine guest memory access, 
   1.120 - * and must (a) be under the shadow lock, and (b) remove write access
   1.121 - * from any guest PT pages we see, as we will be shadowing them soon
   1.122 - * and will rely on the contents' not having changed.
    1.123 + * The walk is done in a lock-free style, with some sanity checks
    1.124 + * postponed until after the shadow lock is taken. Those delayed checks
    1.125 + * ensure no inconsistent mapping is translated into the shadow page tables.
   1.126   * 
   1.127   * Returns 0 for success, or the set of permission bits that we failed on 
   1.128   * if the walk did not complete.
   1.129 @@ -268,8 +348,7 @@ static uint32_t set_ad_bits(void *guest_
   1.130   * checked the old return code anyway.
   1.131   */
   1.132  static uint32_t
   1.133 -guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
   1.134 -                  uint32_t pfec, int shadow_op)
   1.135 +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
   1.136  {
   1.137      struct domain *d = v->domain;
   1.138      p2m_type_t p2mt;
   1.139 @@ -282,12 +361,13 @@ guest_walk_tables(struct vcpu *v, unsign
   1.140      uint32_t gflags, mflags, rc = 0;
   1.141      int pse;
   1.142  
   1.143 -    ASSERT(!shadow_op || shadow_locked_by_me(d));
   1.144 -    
   1.145      perfc_incr(shadow_guest_walk);
   1.146      memset(gw, 0, sizeof(*gw));
   1.147      gw->va = va;
   1.148  
   1.149 +    gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
   1.150 +    rmb();
   1.151 +
   1.152      /* Mandatory bits that must be set in every entry.  We invert NX, to
   1.153       * calculate as if there were an "X" bit that allowed access. 
   1.154       * We will accumulate, in rc, the set of flags that are missing. */
   1.155 @@ -312,9 +392,7 @@ guest_walk_tables(struct vcpu *v, unsign
   1.156          goto out;
   1.157      }
   1.158      ASSERT(mfn_valid(gw->l3mfn));
   1.159 -    /* This mfn is a pagetable: make sure the guest can't write to it. */
   1.160 -    if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
   1.161 -        flush_tlb_mask(d->domain_dirty_cpumask); 
   1.162 +
   1.163      /* Get the l3e and check its flags*/
   1.164      l3p = sh_map_domain_page(gw->l3mfn);
   1.165      gw->l3e = l3p[guest_l3_table_offset(va)];
   1.166 @@ -343,9 +421,7 @@ guest_walk_tables(struct vcpu *v, unsign
   1.167          goto out;
   1.168      }
   1.169      ASSERT(mfn_valid(gw->l2mfn));
   1.170 -    /* This mfn is a pagetable: make sure the guest can't write to it. */
   1.171 -    if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
   1.172 -        flush_tlb_mask(d->domain_dirty_cpumask); 
   1.173 +
   1.174      /* Get the l2e */
   1.175      l2p = sh_map_domain_page(gw->l2mfn);
   1.176      gw->l2e = l2p[guest_l2_table_offset(va)];
   1.177 @@ -403,10 +479,6 @@ guest_walk_tables(struct vcpu *v, unsign
   1.178              goto out;
   1.179          }
   1.180          ASSERT(mfn_valid(gw->l1mfn));
   1.181 -        /* This mfn is a pagetable: make sure the guest can't write to it. */
   1.182 -        if ( shadow_op 
   1.183 -             && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
   1.184 -            flush_tlb_mask(d->domain_dirty_cpumask); 
   1.185          l1p = sh_map_domain_page(gw->l1mfn);
   1.186          gw->l1e = l1p[guest_l1_table_offset(va)];
   1.187          gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
   1.188 @@ -548,8 +620,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
   1.189      // XXX -- this is expensive, but it's easy to cobble together...
   1.190      // FIXME!
   1.191  
   1.192 -    shadow_lock(v->domain);
   1.193 -    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0 
   1.194 +    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
   1.195           && mfn_valid(gw.l1mfn) )
   1.196      {
   1.197          if ( gl1mfn )
   1.198 @@ -558,8 +629,6 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
   1.199              (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
   1.200      }
   1.201  
   1.202 -    shadow_unlock(v->domain);
   1.203 -
   1.204      return pl1e;
   1.205  }
   1.206  
   1.207 @@ -573,10 +642,8 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
   1.208      // XXX -- this is expensive, but it's easy to cobble together...
   1.209      // FIXME!
   1.210  
   1.211 -    shadow_lock(v->domain);
   1.212 -    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
   1.213 +    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
   1.214      *(guest_l1e_t *)eff_l1e = gw.l1e;
   1.215 -    shadow_unlock(v->domain);
   1.216  }
   1.217  #endif /* CONFIG==SHADOW==GUEST */
   1.218  
   1.219 @@ -2842,14 +2909,12 @@ static int sh_page_fault(struct vcpu *v,
   1.220          return 0;
   1.221      }
   1.222  
   1.223 -    shadow_lock(d);
   1.224 -    
   1.225 -    shadow_audit_tables(v);
   1.226 -    
   1.227 -    if ( guest_walk_tables(v, va, &gw, regs->error_code, 1) != 0 )
   1.228 +    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
   1.229      {
   1.230          perfc_incr(shadow_fault_bail_real_fault);
   1.231 -        goto not_a_shadow_fault;
   1.232 +        SHADOW_PRINTK("not a shadow fault\n");
   1.233 +        reset_early_unshadow(v);
   1.234 +        return 0;
   1.235      }
   1.236  
   1.237      /* It's possible that the guest has put pagetables in memory that it has 
   1.238 @@ -2859,12 +2924,9 @@ static int sh_page_fault(struct vcpu *v,
   1.239      if ( unlikely(d->is_shutting_down) )
   1.240      {
   1.241          SHADOW_PRINTK("guest is shutting down\n");
   1.242 -        shadow_unlock(d);
   1.243          return 0;
   1.244      }
   1.245  
   1.246 -    sh_audit_gw(v, &gw);
   1.247 -
   1.248      /* What kind of access are we dealing with? */
   1.249      ft = ((regs->error_code & PFEC_write_access)
   1.250            ? ft_demand_write : ft_demand_read);
   1.251 @@ -2879,7 +2941,8 @@ static int sh_page_fault(struct vcpu *v,
   1.252          perfc_incr(shadow_fault_bail_bad_gfn);
   1.253          SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", 
   1.254                        gfn_x(gfn), mfn_x(gmfn));
   1.255 -        goto not_a_shadow_fault;
   1.256 +        reset_early_unshadow(v);
   1.257 +        return 0;
   1.258      }
   1.259  
   1.260  #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
   1.261 @@ -2888,6 +2951,27 @@ static int sh_page_fault(struct vcpu *v,
   1.262                  regs->error_code | PFEC_page_present);
   1.263  #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
   1.264  
   1.265 +    shadow_lock(d);
   1.266 +    shadow_audit_tables(v);
   1.267 +    sh_audit_gw(v, &gw);
   1.268 +
   1.269 +    if ( gw_remove_write_accesses(v, va, &gw) )
   1.270 +    {
    1.271 +        /* Write permission removal is also a hint that other gwalks
    1.272 +         * overlapping with this one may now be inconsistent.
    1.273 +         */
   1.274 +        perfc_incr(shadow_rm_write_flush_tlb);
   1.275 +        atomic_inc(&d->arch.paging.shadow.gtable_dirty_version);
   1.276 +        flush_tlb_mask(d->domain_dirty_cpumask);
   1.277 +    }
   1.278 +
   1.279 +    if ( !shadow_check_gwalk(v, va, &gw) )
   1.280 +    {
   1.281 +        perfc_incr(shadow_inconsistent_gwalk);
   1.282 +        shadow_unlock(d);
   1.283 +        return EXCRET_fault_fixed;
   1.284 +    }
   1.285 +
   1.286      /* Make sure there is enough free shadow memory to build a chain of
   1.287       * shadow tables. (We never allocate a top-level shadow on this path,
   1.288       * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
   1.289 @@ -3223,7 +3307,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
   1.290          return vtlb_gfn;
   1.291  #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
   1.292  
   1.293 -    if ( guest_walk_tables(v, va, &gw, pfec[0], 0) != 0 )
   1.294 +    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
   1.295      {
   1.296          if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
   1.297              pfec[0] &= ~PFEC_page_present;
   1.298 @@ -4276,6 +4360,8 @@ static void emulate_unmap_dest(struct vc
   1.299      }
   1.300      else 
   1.301          sh_unmap_domain_page(addr);
   1.302 +
   1.303 +    atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version);
   1.304  }
   1.305  
   1.306  int
     2.1 --- a/xen/arch/x86/mm/shadow/types.h	Thu Feb 28 13:10:28 2008 +0000
     2.2 +++ b/xen/arch/x86/mm/shadow/types.h	Thu Feb 28 13:18:29 2008 +0000
     2.3 @@ -435,6 +435,7 @@ struct shadow_walk_t
     2.4  #endif
     2.5      mfn_t l2mfn;                /* MFN that the level 2 entry was in */
     2.6      mfn_t l1mfn;                /* MFN that the level 1 entry was in */
     2.7 +    int version;                /* Saved guest dirty version */
     2.8  };
     2.9  
    2.10  /* macros for dealing with the naming of the internal function names of the
     3.1 --- a/xen/include/asm-x86/domain.h	Thu Feb 28 13:10:28 2008 +0000
     3.2 +++ b/xen/include/asm-x86/domain.h	Thu Feb 28 13:18:29 2008 +0000
     3.3 @@ -97,6 +97,11 @@ struct shadow_domain {
     3.4  
     3.5      /* Fast MMIO path heuristic */
     3.6      int has_fast_mmio_entries;
     3.7 +
      3.8 +    /* Reflects guest table dirty status; incremented on write
      3.9 +     * emulation and on write permission removal.
    3.10 +     */
    3.11 +    atomic_t          gtable_dirty_version;
    3.12  };
    3.13  
    3.14  struct shadow_vcpu {
     4.1 --- a/xen/include/asm-x86/perfc_defn.h	Thu Feb 28 13:10:28 2008 +0000
     4.2 +++ b/xen/include/asm-x86/perfc_defn.h	Thu Feb 28 13:18:29 2008 +0000
     4.3 @@ -88,6 +88,11 @@ PERFCOUNTER(shadow_up_pointer,     "shad
     4.4  PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
     4.5  PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
     4.6  PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
     4.7 +PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
     4.8 +PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
     4.9 +PERFCOUNTER(shadow_rm_write_flush_tlb,
    4.10 +                                   "shadow flush tlb by removing write perm")
    4.11 +
    4.12  PERFCOUNTER(shadow_invlpg,         "shadow emulates invlpg")
    4.13  PERFCOUNTER(shadow_invlpg_fault,   "shadow invlpg faults")
    4.14