ia64/xen-unstable

changeset 16321:bfb1cb958632

[SHADOW] Fix error paths in guest-pagetable walker.
Real hardware sets PFEC_page_present regardless of the access bits,
and doesn't write back _PAGE_ACCESSED except after a successful walk.
Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir@xensource.com>
date Mon Nov 05 16:38:47 2007 +0000 (2007-11-05)
parents d945240821e7
children ed20c4232e16
files xen/arch/x86/mm/shadow/multi.c
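
Before the diff: a minimal standalone sketch of the error-code behaviour the fix
describes. Real hardware reports the present bit of the fault error code
according to whether the translation was present, independently of which access
right failed. All names and masks below are local stand-ins, not the Xen
definitions.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the bits involved (the values match the
 * x86 architecture, but the names are local to this sketch). */
#define P_PRESENT    0x01u   /* _PAGE_PRESENT */
#define P_RW         0x02u   /* _PAGE_RW */

#define PFEC_PRESENT 0x01u   /* PFEC_page_present */

/* Given the set of mandatory flags a walk found missing, derive the
 * "present" bit of the error code the way real hardware does: set it
 * whenever every level of the walk was present, no matter which
 * access check failed. */
static uint32_t fault_code(uint32_t missing, uint32_t pfec_in)
{
    if ( missing == 0 )
        return 0;                        /* successful walk: no fault */
    if ( missing & P_PRESENT )
        return pfec_in & ~PFEC_PRESENT;  /* a not-present entry was hit */
    return pfec_in | PFEC_PRESENT;       /* present, but rights failed */
}

int main(void)
{
    /* Write to a present read-only page: PFEC_page_present is set. */
    printf("%#x\n", fault_code(P_RW, 0x02u));
    /* Access through a not-present entry: it stays clear. */
    printf("%#x\n", fault_code(P_PRESENT | P_RW, 0x02u));
    return 0;
}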
     1.1 --- a/xen/arch/x86/mm/shadow/multi.c	Mon Nov 05 16:37:48 2007 +0000
     1.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Mon Nov 05 16:38:47 2007 +0000
     1.3 @@ -226,51 +226,24 @@ static uint32_t mandatory_flags(struct v
     1.4      return f;
     1.5  }
     1.6  
     1.7 -/* Read, check and modify a guest pagetable entry.  Returns 0 if the
     1.8 - * flags are OK.  Although we use l1e types here, the logic and the bits
     1.9 - * are the same for all types except PAE l3es. */
    1.10 -static int guest_walk_entry(struct vcpu *v, mfn_t gmfn, 
    1.11 -                            void *gp, void *wp,
    1.12 -                            uint32_t flags, int level)
    1.13 +/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
    1.14 + * Returns non-zero if it actually writes to guest memory. */
    1.15 +static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
    1.16  {
    1.17 -    guest_l1e_t e, old_e;
    1.18 -    uint32_t gflags;
    1.19 -    int rc;
    1.20 -
    1.21 -    /* Read the guest entry */
    1.22 -    e = *(guest_l1e_t *)gp;
    1.23 -
    1.24 -    /* Check that all the mandatory flag bits are there.  Invert NX, to
    1.25 -     * calculate as if there were an "X" bit that allowed access. */
    1.26 -    gflags = guest_l1e_get_flags(e) ^ _PAGE_NX_BIT;
    1.27 -    rc = ((gflags & flags) != flags);
    1.28 -    
    1.29 -    /* Set the accessed/dirty bits */
    1.30 -    if ( rc == 0 ) 
    1.31 +    guest_intpte_t old, new;
    1.32 +
    1.33 +    old = *(guest_intpte_t *)walk_p;
    1.34 +    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    1.35 +    if ( old != new ) 
    1.36      {
    1.37 -        uint32_t bits = _PAGE_ACCESSED;
    1.38 -        if ( (flags & _PAGE_RW) // Implies that the action is a write
    1.39 -             && ((level == 1) || ((level == 2) && (gflags & _PAGE_PSE))) )
    1.40 -            bits |= _PAGE_DIRTY;
    1.41 -        old_e = e;
    1.42 -        e.l1 |= bits;
    1.43 -        SHADOW_PRINTK("flags %lx bits %lx old_e %llx e %llx\n",
    1.44 -                      (unsigned long) flags, 
    1.45 -                      (unsigned long) bits, 
    1.46 -                      (unsigned long long) old_e.l1, 
    1.47 -                      (unsigned long long) e.l1);
    1.48 -        /* Try to write the entry back.  If it's changed under out feet 
    1.49 -         * then leave it alone */
    1.50 -        if ( e.l1 != old_e.l1 )
    1.51 -        {
    1.52 -            (void) cmpxchg(((guest_intpte_t *)gp), old_e.l1, e.l1);
    1.53 -            paging_mark_dirty(v->domain, mfn_x(gmfn));
    1.54 -        }
    1.55 +        /* Write the new entry into the walk, and try to write it back
    1.56 +         * into the guest table as well.  If the guest table has changed
    1.57 +         * under our feet then leave it alone. */
    1.58 +        *(guest_intpte_t *)walk_p = new;
    1.59 +        if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
    1.60 +            return 1;
    1.61      }
    1.62 -
    1.63 -    /* Record the entry in the walk */
    1.64 -    *(guest_l1e_t *)wp = e;
    1.65 -    return rc;
    1.66 +    return 0;
    1.67  }
    1.68  
    1.69  /* Walk the guest pagetables, after the manner of a hardware walker. 
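
The set_ad_bits() helper introduced above follows a read-modify-cmpxchg
pattern. A compilable sketch of the same shape, with uint64_t standing in for
guest_intpte_t and a GCC atomic builtin standing in for Xen's cmpxchg():

#include <stdint.h>

typedef uint64_t pte_t;            /* stand-in for guest_intpte_t */

#define A_BIT (1ULL << 5)          /* _PAGE_ACCESSED */
#define D_BIT (1ULL << 6)          /* _PAGE_DIRTY */

/* Set Accessed (and optionally Dirty) in the local walk copy, then
 * try to write the same update to the guest table.  Returns 1 only
 * if guest memory was actually modified, so the caller knows to log
 * the page as dirty. */
static int sketch_set_ad_bits(pte_t *guest_p, pte_t *walk_p, int set_dirty)
{
    pte_t old = *walk_p;
    pte_t new = old | A_BIT | (set_dirty ? D_BIT : 0);

    if ( old == new )
        return 0;                  /* bits already set: nothing to do */

    *walk_p = new;                 /* the walk always records the new value */

    /* If the guest entry changed under our feet, leave it alone. */
    return __sync_bool_compare_and_swap(guest_p, old, new);
}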
    1.70 @@ -293,21 +266,20 @@ static int guest_walk_entry(struct vcpu 
    1.71   * N.B. This is different from the old return code but almost no callers
    1.72   * checked the old return code anyway.
    1.73   */
    1.74 -static int 
    1.75 +static uint32_t
    1.76  guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
    1.77                    uint32_t pfec, int shadow_op)
    1.78  {
    1.79      struct domain *d = v->domain;
    1.80      p2m_type_t p2mt;
    1.81 -    guest_l1e_t *l1p;
    1.82 -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    1.83 -    guest_l1e_t *l2p;
    1.84 +    guest_l1e_t *l1p = NULL;
    1.85 +    guest_l2e_t *l2p = NULL;
    1.86  #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    1.87 -    guest_l1e_t *l3p;
    1.88 -#endif    
    1.89 +    guest_l3e_t *l3p = NULL;
    1.90 +    guest_l4e_t *l4p;
    1.91  #endif
    1.92 -    uint32_t flags = mandatory_flags(v, pfec);
    1.93 -    int rc;
    1.94 +    uint32_t gflags, mflags, rc = 0;
    1.95 +    int pse;
    1.96  
    1.97      ASSERT(!shadow_op || shadow_locked_by_me(d));
    1.98      
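
The rc declared above accumulates, across the whole walk, the mandatory flags
that were found missing, using the expression (gflags & mflags) ^ mflags that
appears in the next hunk. It evaluates to exactly the mandatory bits absent
from gflags (the same value as mflags & ~gflags); a worked example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t mflags = 0x07;        /* e.g. present|rw|user required */
    uint32_t gflags = 0x05;        /* entry has present|user, no rw */
    uint32_t missing = (gflags & mflags) ^ mflags;

    printf("missing = %#x\n", missing);   /* prints 0x2: rw is missing */
    return 0;
}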
    1.99 @@ -315,66 +287,86 @@ guest_walk_tables(struct vcpu *v, unsign
   1.100      memset(gw, 0, sizeof(*gw));
   1.101      gw->va = va;
   1.102  
   1.103 +    /* Mandatory bits that must be set in every entry.  We invert NX, to
   1.104 +     * calculate as if there were an "X" bit that allowed access. 
   1.105 +     * We will accumulate, in rc, the set of flags that are missing. */
   1.106 +    mflags = mandatory_flags(v, pfec);
   1.107 +
   1.108  #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
   1.109  #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   1.110 +
   1.111      /* Get the l4e from the top level table and check its flags*/
   1.112      gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
   1.113 -    rc = guest_walk_entry(v, gw->l4mfn,
   1.114 -                          (guest_l4e_t *)v->arch.paging.shadow.guest_vtable
   1.115 -                          + guest_l4_table_offset(va),
   1.116 -                          &gw->l4e, flags, 4);
   1.117 -    if ( rc != 0 ) return rc;
   1.118 +    l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
   1.119 +    gw->l4e = l4p[guest_l4_table_offset(va)];
   1.120 +    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
   1.121 +    rc |= ((gflags & mflags) ^ mflags);
   1.122 +    if ( rc & _PAGE_PRESENT ) goto out;
   1.123  
   1.124      /* Map the l3 table */
   1.125      gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
   1.126 -    if ( !p2m_is_ram(p2mt) ) return 1;
   1.127 +    if ( !p2m_is_ram(p2mt) ) 
   1.128 +    {
   1.129 +        rc |= _PAGE_PRESENT;
   1.130 +        goto out;
   1.131 +    }
   1.132      ASSERT(mfn_valid(gw->l3mfn));
   1.133      /* This mfn is a pagetable: make sure the guest can't write to it. */
   1.134      if ( shadow_op && sh_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
   1.135          flush_tlb_mask(d->domain_dirty_cpumask); 
   1.136      /* Get the l3e and check its flags*/
   1.137      l3p = sh_map_domain_page(gw->l3mfn);
   1.138 -    rc = guest_walk_entry(v, gw->l3mfn, l3p + guest_l3_table_offset(va), 
   1.139 -                          &gw->l3e, flags, 3);
   1.140 -    sh_unmap_domain_page(l3p);
   1.141 -    if ( rc != 0 ) return rc;
   1.142 +    gw->l3e = l3p[guest_l3_table_offset(va)];
   1.143 +    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
   1.144 +    rc |= ((gflags & mflags) ^ mflags);
   1.145 +    if ( rc & _PAGE_PRESENT )
   1.146 +        goto out;
   1.147  
   1.148  #else /* PAE only... */
   1.149  
   1.150      /* Get l3e from the cache of the top level table and check its flag */
   1.151      gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
   1.152 -    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) return 1;
   1.153 +    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
   1.154 +    {
   1.155 +        rc |= _PAGE_PRESENT;
   1.156 +        goto out;
   1.157 +    }
   1.158  
   1.159  #endif /* PAE or 64... */
   1.160  
   1.161      /* Map the l2 table */
   1.162      gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
   1.163 -    if ( !p2m_is_ram(p2mt) ) return 1;
   1.164 +    if ( !p2m_is_ram(p2mt) )
   1.165 +    {
   1.166 +        rc |= _PAGE_PRESENT;
   1.167 +        goto out;
   1.168 +    }
   1.169      ASSERT(mfn_valid(gw->l2mfn));
   1.170      /* This mfn is a pagetable: make sure the guest can't write to it. */
   1.171      if ( shadow_op && sh_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
   1.172          flush_tlb_mask(d->domain_dirty_cpumask); 
   1.173      /* Get the l2e */
   1.174      l2p = sh_map_domain_page(gw->l2mfn);
   1.175 -    rc = guest_walk_entry(v, gw->l2mfn, l2p + guest_l2_table_offset(va),
   1.176 -                          &gw->l2e, flags, 2);
   1.177 -    sh_unmap_domain_page(l2p);
   1.178 -    if ( rc != 0 ) return rc;
   1.179 +    gw->l2e = l2p[guest_l2_table_offset(va)];
   1.180  
   1.181  #else /* 32-bit only... */
   1.182  
   1.183 -    /* Get l2e from the top level table and check its flags */
   1.184 +    /* Get l2e from the top level table */
   1.185      gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
   1.186 -    rc = guest_walk_entry(v, gw->l2mfn, 
   1.187 -                          (guest_l2e_t *)v->arch.paging.shadow.guest_vtable
   1.188 -                          + guest_l2_table_offset(va),
   1.189 -                          &gw->l2e, flags, 2);
   1.190 -    if ( rc != 0 ) return rc;
   1.191 +    l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
   1.192 +    gw->l2e = l2p[guest_l2_table_offset(va)];
   1.193  
   1.194  #endif /* All levels... */
   1.195  
   1.196 -    if ( guest_supports_superpages(v) &&
   1.197 -         (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE) ) 
   1.198 +    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
   1.199 +    rc |= ((gflags & mflags) ^ mflags);
   1.200 +    if ( rc & _PAGE_PRESENT )
   1.201 +        goto out;
   1.202 +
   1.203 +    pse = (guest_supports_superpages(v) && 
   1.204 +           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
   1.205 +
   1.206 +    if ( pse )
   1.207      {
   1.208          /* Special case: this guest VA is in a PSE superpage, so there's
   1.209           * no guest l1e.  We make one up so that the propagation code
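
Taken together, the hunk above gives the walker its new shape: accumulate the
missing mandatory bits level by level, stop descending only when a level is not
present, and leave through a single exit path. A much-simplified standalone
model of that control flow (a flat array stands in for the page-table
hierarchy; names are illustrative, not the Xen ones):

#include <stdint.h>

#define PRESENT 0x01u
#define A_BIT   (1u << 5)

/* Simplified model: OR each level's missing mandatory bits into rc
 * and stop as soon as a level is not present.  Accessed bits are
 * written back only when the whole walk succeeded (rc == 0), which is
 * the behaviour the patch attributes to real hardware walkers. */
static uint32_t sketch_walk(uint32_t *entries, int levels, uint32_t mflags)
{
    uint32_t rc = 0;
    int i;

    for ( i = 0; i < levels; i++ )
    {
        rc |= (entries[i] & mflags) ^ mflags;  /* missing at this level */
        if ( rc & PRESENT )
            goto out;            /* not present: nothing below to read */
    }

    for ( i = 0; i < levels; i++ )
        entries[i] |= A_BIT;     /* success: now write Accessed back */

 out:
    /* single exit: the real code unmaps whatever tables were mapped */
    return rc;
}

int main(void)
{
    uint32_t pt[4] = { 0x07, 0x07, 0x07, 0x07 };
    return (int)sketch_walk(pt, 4, 0x07u);     /* 0: all levels present */
}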
   1.210 @@ -404,20 +396,55 @@ guest_walk_tables(struct vcpu *v, unsign
   1.211      {
   1.212          /* Not a superpage: carry on and find the l1e. */
   1.213          gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
   1.214 -        if ( !p2m_is_ram(p2mt) ) return 1;
   1.215 +        if ( !p2m_is_ram(p2mt) )
   1.216 +        {
   1.217 +            rc |= _PAGE_PRESENT;
   1.218 +            goto out;
   1.219 +        }
   1.220          ASSERT(mfn_valid(gw->l1mfn));
   1.221          /* This mfn is a pagetable: make sure the guest can't write to it. */
   1.222          if ( shadow_op 
   1.223               && sh_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
   1.224              flush_tlb_mask(d->domain_dirty_cpumask); 
   1.225          l1p = sh_map_domain_page(gw->l1mfn);
   1.226 -        rc = guest_walk_entry(v, gw->l2mfn, l1p + guest_l1_table_offset(va),
   1.227 -                              &gw->l1e, flags, 1);
   1.228 -        sh_unmap_domain_page(l1p);
   1.229 -        if ( rc != 0 ) return rc;
   1.230 +        gw->l1e = l1p[guest_l1_table_offset(va)];
   1.231 +        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
   1.232 +        rc |= ((gflags & mflags) ^ mflags);
   1.233      }
   1.234  
   1.235 -    return 0;
   1.236 +    /* Go back and set accessed and dirty bits only if the walk was a
   1.237 +     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
   1.238 +     * get set whenever a lower-level PT is used, at least some hardware
   1.239 +     * walkers behave this way. */
   1.240 +    if ( rc == 0 ) 
   1.241 +    {
   1.242 +#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
   1.243 +        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
   1.244 +            paging_mark_dirty(d, mfn_x(gw->l4mfn));
   1.245 +        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
   1.246 +            paging_mark_dirty(d, mfn_x(gw->l3mfn));
   1.247 +#endif
   1.248 +        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
   1.249 +                         (pse && (pfec & PFEC_write_access))) )
   1.250 +            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
   1.251 +        if ( !pse ) 
   1.252 +        {
   1.253 +            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
   1.254 +                             (pfec & PFEC_write_access)) )
   1.255 +                paging_mark_dirty(d, mfn_x(gw->l1mfn));
   1.256 +        }
   1.257 +    }
   1.258 +
   1.259 + out:
   1.260 +#if GUEST_PAGING_LEVELS == 4
   1.261 +    if ( l3p ) sh_unmap_domain_page(l3p);
   1.262 +#endif
   1.263 +#if GUEST_PAGING_LEVELS >= 3
   1.264 +    if ( l2p ) sh_unmap_domain_page(l2p);
   1.265 +#endif
   1.266 +    if ( l1p ) sh_unmap_domain_page(l1p);
   1.267 +
   1.268 +    return rc;
   1.269  }
   1.270  
   1.271  /* Given a walk_t, translate the gw->va into the guest's notion of the
   1.272 @@ -521,9 +548,8 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
   1.273      // FIXME!
   1.274  
   1.275      shadow_lock(v->domain);
   1.276 -    guest_walk_tables(v, addr, &gw, 0, 1);
   1.277 -
   1.278 -    if ( mfn_valid(gw.l1mfn) )
   1.279 +    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present, 1) == 0 
   1.280 +         && mfn_valid(gw.l1mfn) )
   1.281      {
   1.282          if ( gl1mfn )
   1.283              *gl1mfn = mfn_x(gw.l1mfn);
   1.284 @@ -547,7 +573,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
   1.285      // FIXME!
   1.286  
   1.287      shadow_lock(v->domain);
   1.288 -    guest_walk_tables(v, addr, &gw, 0, 1);
   1.289 +    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present, 1);
   1.290      *(guest_l1e_t *)eff_l1e = gw.l1e;
   1.291      shadow_unlock(v->domain);
   1.292  }
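
Finally, the two callers above are fixed to pass PFEC_page_present rather than
0, because mandatory_flags() derives the per-level requirements from the error
code, and a PFEC of 0 would demand nothing of the entries. A hypothetical
sketch of that derivation (the general idea only, not Xen's actual
implementation of mandatory_flags()):

#include <stdint.h>

/* Hypothetical PFEC bits and page-flag bits (architectural values,
 * sketch-local names). */
#define PFEC_PRESENT 0x01u
#define PFEC_WRITE   0x02u
#define PFEC_USER    0x04u

#define P_PRESENT    0x01u
#define P_RW         0x02u
#define P_USER       0x04u

/* Translate what the access needs (the error code) into bits every
 * level of the walk must carry.  Passing pfec == 0 would require
 * nothing, which is why the callers above now pass PFEC_page_present
 * explicitly. */
static uint32_t sketch_mandatory_flags(uint32_t pfec)
{
    uint32_t f = 0;

    if ( pfec & PFEC_PRESENT )
        f |= P_PRESENT;
    if ( pfec & PFEC_WRITE )
        f |= P_PRESENT | P_RW;
    if ( pfec & PFEC_USER )
        f |= P_PRESENT | P_USER;
    return f;
}

int main(void)
{
    /* A user-mode write needs present, writable, user entries. */
    return (int)sketch_mandatory_flags(PFEC_WRITE | PFEC_USER) != 0x07;
}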