direct-io.hg

changeset 12190:9b553a9787cf

Merge.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author Ewan Mellor <ewan@xensource.com>
date Wed Nov 01 10:41:44 2006 +0000 (2006-11-01)
parents 33e9c88aab02 0b6f49d25d4f
children 444496ecb14e
files xen/arch/x86/mm/shadow/common.c xen/arch/x86/mm/shadow/multi.c xen/arch/x86/mm/shadow/private.h xen/arch/x86/mm/shadow/types.h xen/arch/x86/x86_32/seg_fixup.c xen/include/asm-x86/perfc_defn.h xen/include/asm-x86/shadow.h
line diff
     1.1 --- a/xen/arch/x86/mm/shadow/common.c	Wed Nov 01 10:40:46 2006 +0000
     1.2 +++ b/xen/arch/x86/mm/shadow/common.c	Wed Nov 01 10:41:44 2006 +0000
     1.3 @@ -1327,8 +1327,18 @@ static void sh_hash_audit_bucket(struct 
     1.4               && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
     1.5               && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
     1.6          {
     1.7 +            struct page_info *gpg = mfn_to_page(_mfn(e->n));
     1.8              /* Bad shadow flags on guest page? */
     1.9 -            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
    1.10 +            BUG_ON( !(gpg->shadow_flags & (1<<e->t)) );
    1.11 +            /* Bad type count on guest page? */
    1.12 +            if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
    1.13 +                 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
    1.14 +            {
    1.15 +                SHADOW_ERROR("MFN %#"SH_PRI_mfn" shadowed (by %#"SH_PRI_mfn")"
    1.16 +                             " but has typecount %#lx\n",
    1.17 +                             e->n, mfn_x(e->smfn), gpg->u.inuse.type_info);
    1.18 +                BUG();
    1.19 +            }
    1.20          }
    1.21          /* That entry was OK; on we go */
    1.22          e = e->next;
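The hunk above extends the hash-bucket audit: a page that is still typed writable with a non-zero type count must not be shadowed. An editor's standalone sketch (not part of this changeset) of that predicate; the PGT_* values below are illustrative stand-ins, not Xen's real layout:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-ins only; Xen's real PGT_* encoding differs. */
    #define PGT_TYPE_MASK      0xe0000000u   /* top bits hold the page type  */
    #define PGT_WRITABLE_PAGE  0xe0000000u   /* "guest may write" type       */
    #define PGT_COUNT_MASK     0x0fffffffu   /* low bits hold the type count */

    /* Returns 1 if a shadowed page still has outstanding writable typings,
     * which is exactly the condition the audit turns into a BUG(). */
    static int bad_writable_typecount(uint32_t type_info)
    {
        return (type_info & PGT_TYPE_MASK) == PGT_WRITABLE_PAGE
            && (type_info & PGT_COUNT_MASK) != 0;
    }

    int main(void)
    {
        printf("%d\n", bad_writable_typecount(PGT_WRITABLE_PAGE | 2)); /* 1 */
        printf("%d\n", bad_writable_typecount(PGT_WRITABLE_PAGE | 0)); /* 0 */
        return 0;
    }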
     2.1 --- a/xen/arch/x86/mm/shadow/multi.c	Wed Nov 01 10:40:46 2006 +0000
     2.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Wed Nov 01 10:41:44 2006 +0000
     2.3 @@ -36,10 +36,7 @@
     2.4  #include "private.h"
     2.5  #include "types.h"
     2.6  
     2.7 -/* The first cut: an absolutely synchronous, trap-and-emulate version,
     2.8 - * supporting only HVM guests (and so only "external" shadow mode). 
     2.9 - *
    2.10 - * THINGS TO DO LATER:
    2.11 +/* THINGS TO DO LATER:
    2.12   * 
    2.13   * TEARDOWN HEURISTICS
    2.14   * Also: have a heuristic for when to destroy a previous paging-mode's 
    2.15 @@ -56,14 +53,6 @@
    2.16   * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
    2.17   * to share l2h pages again. 
    2.18   *
    2.19 - * PAE L3 COPYING
    2.20 - * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
    2.21 - * entry in it, and every time we change CR3.  We copy it for the linear 
    2.22 - * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
    2.23 - * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
    2.24 - * by using the shadow directly in some places. 
    2.25 - * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending.
    2.26 - *
    2.27   * GUEST_WALK_TABLES TLB FLUSH COALESCE
    2.28   * guest_walk_tables can do up to three remote TLB flushes as it walks to
    2.29   * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
    2.30 @@ -99,9 +88,6 @@ static char *fetch_type_names[] = {
    2.31  };
    2.32  #endif
    2.33  
    2.34 -/* XXX forward declarations */
    2.35 -static inline void sh_update_linear_entries(struct vcpu *v);
    2.36 -
    2.37  /**************************************************************************/
    2.38  /* Hash table mapping from guest pagetables to shadows
    2.39   *
    2.40 @@ -460,16 +446,20 @@ static u32 guest_set_ad_bits(struct vcpu
    2.41      u32 flags;
    2.42      int res = 0;
    2.43  
    2.44 +    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
    2.45 +    ASSERT(level <= GUEST_PAGING_LEVELS);
    2.46 +    ASSERT(shadow_lock_is_acquired(v->domain));
    2.47 +
    2.48 +    flags = guest_l1e_get_flags(*ep);
    2.49 +
    2.50 +    /* Only set A and D bits for guest-initiated accesses */
    2.51 +    if ( !(ft & FETCH_TYPE_DEMAND) )
    2.52 +        return flags;
    2.53 +
    2.54      ASSERT(valid_mfn(gmfn)
    2.55             && (sh_mfn_is_a_page_table(gmfn)
    2.56                 || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) 
    2.57                     == 0)));
    2.58 -    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
    2.59 -    ASSERT(level <= GUEST_PAGING_LEVELS);
    2.60 -    ASSERT(ft == ft_demand_read || ft == ft_demand_write);
    2.61 -    ASSERT(shadow_lock_is_acquired(v->domain));
    2.62 -
    2.63 -    flags = guest_l1e_get_flags(*ep);
    2.64  
    2.65      /* PAE l3s do not have A and D bits */
    2.66      ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
    2.67 @@ -496,12 +486,20 @@ static u32 guest_set_ad_bits(struct vcpu
    2.68      /* Set the bit(s) */
    2.69      sh_mark_dirty(v->domain, gmfn);
    2.70      SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
    2.71 -                  "old flags = %#x, new flags = %#x\n", 
    2.72 -                  gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep), flags);
    2.73 +                 "old flags = %#x, new flags = %#x\n", 
    2.74 +                 gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep), 
    2.75 +                 flags);
    2.76      *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
    2.77      
    2.78 -    /* Propagate this change to any existing shadows */
    2.79 -    res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
    2.80 +    /* Propagate this change to any other shadows of the page 
    2.81 +     * (only necessary if there is more than one shadow) */
    2.82 +    if ( mfn_to_page(gmfn)->count_info & PGC_page_table )
    2.83 +    {
    2.84 +        u32 shflags = mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask;
    2.85 +        /* More than one type bit set in shadow-flags? */
    2.86 +        if ( shflags & ~(1UL << find_first_set_bit(shflags)) )
    2.87 +            res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
    2.88 +    }
    2.89  
    2.90      /* We should never need to flush the TLB or recopy PAE entries */
    2.91      ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
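The new guard above only revalidates other shadows when more than one type bit is set in shadow_flags; the expression shflags & ~(1UL << find_first_set_bit(shflags)) is non-zero exactly when a second bit remains after clearing the lowest one. An editor's standalone sketch of that idiom (not part of the changeset); Xen's find_first_set_bit() is replaced by a simple loop here:

    #include <stdio.h>

    /* Stand-in for Xen's find_first_set_bit(): index of the lowest set bit.
     * Precondition: x != 0 (as in the hunk, where PGC_page_table implies
     * at least one shadow type bit is set). */
    static unsigned int find_first_set_bit(unsigned long x)
    {
        unsigned int i = 0;
        while ( !(x & 1UL) ) { x >>= 1; i++; }
        return i;
    }

    /* Non-zero iff more than one bit is set: clear the lowest set bit and
     * see whether anything remains. */
    static unsigned long more_than_one_bit(unsigned long x)
    {
        return x & ~(1UL << find_first_set_bit(x));
    }

    int main(void)
    {
        printf("%lu\n", more_than_one_bit(0x4UL));  /* 0: a single shadow type */
        printf("%lu\n", more_than_one_bit(0x14UL)); /* non-zero: two types     */
        return 0;
    }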
    2.92 @@ -637,79 +635,70 @@ shadow_l4_index(mfn_t *smfn, u32 guest_i
    2.93  
    2.94  
    2.95  /**************************************************************************/
    2.96 -/* Functions which compute shadow entries from their corresponding guest
    2.97 - * entries.
    2.98 - *
    2.99 - * These are the "heart" of the shadow code.
   2.100 - *
   2.101 - * There are two sets of these: those that are called on demand faults (read
   2.102 - * faults and write faults), and those that are essentially called to
   2.103 - * "prefetch" (or propagate) entries from the guest into the shadow.  The read
   2.104 - * fault and write fault are handled as two separate cases for L1 entries (due
   2.105 - * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
   2.106 - * into the respective demand_fault functions.
   2.107 +/* Function which computes shadow entries from their corresponding guest
   2.108 + * entries.  This is the "heart" of the shadow code. It operates using
   2.109 + * level-1 shadow types, but handles all levels of entry.
   2.110 + * Don't call it directly, but use the four wrappers below.
   2.111   */
   2.112 -// The function below tries to capture all of the flag manipulation for the
   2.113 -// demand and propagate functions into one place.
   2.114 -//
   2.115 -static always_inline u32
   2.116 -sh_propagate_flags(struct vcpu *v, mfn_t target_mfn, 
   2.117 -                    u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, 
   2.118 -                    int mmio, int level, fetch_type_t ft)
   2.119 +
   2.120 +static always_inline void
   2.121 +_sh_propagate(struct vcpu *v, 
   2.122 +              void *guest_entry_ptr, 
   2.123 +              mfn_t guest_table_mfn, 
   2.124 +              mfn_t target_mfn, 
   2.125 +              void *shadow_entry_ptr,
   2.126 +              int level,
   2.127 +              fetch_type_t ft, 
   2.128 +              int mmio)
   2.129  {
   2.130 -#define CHECK(_cond)                                    \
   2.131 -do {                                                    \
   2.132 -    if (unlikely(!(_cond)))                             \
   2.133 -    {                                                   \
   2.134 -        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
   2.135 -               __func__, __FILE__, __LINE__, #_cond);   \
   2.136 -        domain_crash(d);                                \
   2.137 -    }                                                   \
   2.138 -} while (0);
   2.139 -
   2.140 +    guest_l1e_t *gp = guest_entry_ptr;
   2.141 +    shadow_l1e_t *sp = shadow_entry_ptr;
   2.142      struct domain *d = v->domain;
   2.143      u32 pass_thru_flags;
   2.144 -    u32 sflags;
   2.145 +    u32 gflags, sflags;
   2.146  
   2.147      /* We don't shadow PAE l3s */
   2.148      ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
   2.149  
   2.150 -    // XXX -- might want to think about PAT support for HVM guests...
   2.151 -
   2.152 -#ifndef NDEBUG
   2.153 -    // MMIO can only occur from L1e's
   2.154 -    //
   2.155 -    if ( mmio )
   2.156 -        CHECK(level == 1);
   2.157 -
   2.158 -    // We should always have a pointer to the guest entry if it's a non-PSE
   2.159 -    // non-MMIO demand access.
   2.160 -    if ( ft & FETCH_TYPE_DEMAND )
   2.161 -        CHECK(guest_entry_ptr || level == 1);
   2.162 -#endif
   2.163 -
   2.164 -    // A not-present guest entry has a special signature in the shadow table,
   2.165 -    // so that we do not have to consult the guest tables multiple times...
   2.166 -    //
   2.167 +    if ( valid_mfn(guest_table_mfn) )
   2.168 +        /* Handle A and D bit propagation into the guest */
   2.169 +        gflags = guest_set_ad_bits(v, guest_table_mfn, gp, level, ft);
   2.170 +    else 
   2.171 +    {
   2.172 +        /* Must be an fl1e or a prefetch */
   2.173 +        ASSERT(level==1 || !(ft & FETCH_TYPE_DEMAND));
   2.174 +        gflags = guest_l1e_get_flags(*gp);
   2.175 +    }
   2.176 +
   2.177      if ( unlikely(!(gflags & _PAGE_PRESENT)) )
   2.178 -        return _PAGE_SHADOW_GUEST_NOT_PRESENT;
   2.179 -
   2.180 -    // Must have a valid target_mfn, unless this is mmio, or unless this is a
   2.181 -    // prefetch.  In the case of a prefetch, an invalid mfn means that we can
   2.182 -    // not usefully shadow anything, and so we return early.
   2.183 +    {
   2.184 +        /* If a guest l1 entry is not present, shadow with the magic 
   2.185 +         * guest-not-present entry. */
   2.186 +        if ( level == 1 )
   2.187 +            *sp = sh_l1e_gnp();
   2.188 +        else 
   2.189 +            *sp = shadow_l1e_empty();
   2.190 +        goto done;
   2.191 +    }
   2.192 +
   2.193 +    if ( level == 1 && mmio )
   2.194 +    {
   2.195 +        /* Guest l1e maps MMIO space */
   2.196 +        *sp = sh_l1e_mmio(guest_l1e_get_gfn(*gp), gflags);
   2.197 +        goto done;
   2.198 +    }
   2.199 +
   2.200 +    // Must have a valid target_mfn, unless this is a prefetch.  In the
   2.201 +    // case of a prefetch, an invalid mfn means that we can not usefully
   2.202 +    // shadow anything, and so we return early.
   2.203      //
   2.204      if ( !valid_mfn(target_mfn) )
   2.205      {
   2.206 -        CHECK((ft == ft_prefetch) || mmio);
   2.207 -        if ( !mmio )
   2.208 -            return 0;
   2.209 +        ASSERT((ft == ft_prefetch));
   2.210 +        *sp = shadow_l1e_empty();
   2.211 +        goto done;
   2.212      }
   2.213  
   2.214 -    // Set the A and D bits in the guest entry, if we need to.
   2.215 -    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
   2.216 -        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
   2.217 -    
   2.218 -
   2.219      // Propagate bits from the guest to the shadow.
   2.220      // Some of these may be overwritten, below.
   2.221      // Since we know the guest's PRESENT bit is set, we also set the shadow's
   2.222 @@ -719,12 +708,7 @@ do {                                    
   2.223                         _PAGE_RW | _PAGE_PRESENT);
   2.224      if ( guest_supports_nx(v) )
   2.225          pass_thru_flags |= _PAGE_NX_BIT;
   2.226 -    sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
   2.227 -
   2.228 -    // Copy the guest's RW bit into the SHADOW_RW bit.
   2.229 -    //
   2.230 -    if ( gflags & _PAGE_RW )
   2.231 -        sflags |= _PAGE_SHADOW_RW;
   2.232 +    sflags = gflags & pass_thru_flags;
   2.233  
   2.234      // Set the A&D bits for higher level shadows.
   2.235      // Higher level entries do not, strictly speaking, have dirty bits, but
   2.236 @@ -750,49 +734,35 @@ do {                                    
   2.237                    && !(gflags & _PAGE_DIRTY)) )
   2.238          sflags &= ~_PAGE_RW;
   2.239  
   2.240 -    // MMIO caching
   2.241 +    // shadow_mode_log_dirty support
   2.242      //
   2.243 -    // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
   2.244 -    // to cache the fact that this entry  is in MMIO space.
   2.245 +    // Only allow the guest write access to a page a) on a demand fault,
   2.246 +    // or b) if the page is already marked as dirty.
   2.247      //
   2.248 -    if ( (level == 1) && mmio )
   2.249 +    if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
   2.250      {
   2.251 -        sflags &= ~(_PAGE_PRESENT);
   2.252 -        sflags |= _PAGE_SHADOW_MMIO;
   2.253 +        if ( ft & FETCH_TYPE_WRITE ) 
   2.254 +            sh_mark_dirty(d, target_mfn);
   2.255 +        else if ( !sh_mfn_is_dirty(d, target_mfn) )
   2.256 +            sflags &= ~_PAGE_RW;
   2.257      }
   2.258 -    else 
   2.259 +    
   2.260 +    // protect guest page tables
   2.261 +    //
   2.262 +    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
   2.263      {
   2.264 -        // shadow_mode_log_dirty support
   2.265 -        //
   2.266 -        // Only allow the guest write access to a page a) on a demand fault,
   2.267 -        // or b) if the page is already marked as dirty.
   2.268 -        //
   2.269 -        if ( unlikely((level == 1) &&
   2.270 -                      !(ft & FETCH_TYPE_WRITE) &&
   2.271 -                      shadow_mode_log_dirty(d) &&
   2.272 -                      !sh_mfn_is_dirty(d, target_mfn)) )
   2.273 +        if ( shadow_mode_trap_reads(d) )
   2.274          {
   2.275 -            sflags &= ~_PAGE_RW;
   2.276 +            // if we are trapping both reads & writes, then mark this page
   2.277 +            // as not present...
   2.278 +            //
   2.279 +            sflags &= ~_PAGE_PRESENT;
   2.280          }
   2.281 -        
   2.282 -        // protect guest page tables
   2.283 -        //
   2.284 -        if ( unlikely((level == 1) &&
   2.285 -                      sh_mfn_is_a_page_table(target_mfn)) )
   2.286 +        else
   2.287          {
   2.288 -            if ( shadow_mode_trap_reads(d) )
   2.289 -            {
   2.290 -                // if we are trapping both reads & writes, then mark this page
   2.291 -                // as not present...
   2.292 -                //
   2.293 -                sflags &= ~_PAGE_PRESENT;
   2.294 -            }
   2.295 -            else
   2.296 -            {
   2.297 -                // otherwise, just prevent any writes...
   2.298 -                //
   2.299 -                sflags &= ~_PAGE_RW;
   2.300 -            }
   2.301 +            // otherwise, just prevent any writes...
   2.302 +            //
   2.303 +            sflags &= ~_PAGE_RW;
   2.304          }
   2.305      }
   2.306  
   2.307 @@ -804,29 +774,28 @@ do {                                    
   2.308          sflags |= _PAGE_USER;
   2.309      }
   2.310  
   2.311 -    return sflags;
   2.312 -#undef CHECK
   2.313 +    *sp = shadow_l1e_from_mfn(target_mfn, sflags);
   2.314 + done:
   2.315 +    SHADOW_DEBUG(PROPAGATE,
   2.316 +                 "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
   2.317 +                 fetch_type_names[ft], level, gp->l1, sp->l1);
   2.318  }
   2.319  
   2.320 +
   2.321 +/* These four wrappers give us a little bit of type-safety back around the 
   2.322 + * use of void-* pointers in _sh_propagate(), and allow the compiler to 
   2.323 + * optimize out some level checks. */
   2.324 +
   2.325  #if GUEST_PAGING_LEVELS >= 4
   2.326  static void
   2.327  l4e_propagate_from_guest(struct vcpu *v, 
   2.328                           guest_l4e_t *gl4e,
   2.329                           mfn_t gl4mfn,
   2.330                           mfn_t sl3mfn,
   2.331 -                         shadow_l4e_t *sl4p,
   2.332 +                         shadow_l4e_t *sl4e,
   2.333                           fetch_type_t ft)
   2.334  {
   2.335 -    u32 gflags = guest_l4e_get_flags(*gl4e);
   2.336 -    u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
   2.337 -                                     gl4mfn, 0, 4, ft);
   2.338 -
   2.339 -    *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
   2.340 -
   2.341 -    SHADOW_DEBUG(PROPAGATE,
   2.342 -                  "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n",
   2.343 -                  fetch_type_names[ft], gl4e->l4, sl4p->l4);
   2.344 -    ASSERT(sflags != -1);
   2.345 +    _sh_propagate(v, gl4e, gl4mfn, sl3mfn, sl4e, 4, ft, 0);
   2.346  }
   2.347  
   2.348  static void
   2.349 @@ -834,19 +803,10 @@ l3e_propagate_from_guest(struct vcpu *v,
   2.350                           guest_l3e_t *gl3e,
   2.351                           mfn_t gl3mfn, 
   2.352                           mfn_t sl2mfn, 
   2.353 -                         shadow_l3e_t *sl3p,
   2.354 +                         shadow_l3e_t *sl3e,
   2.355                           fetch_type_t ft)
   2.356  {
   2.357 -    u32 gflags = guest_l3e_get_flags(*gl3e);
   2.358 -    u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
   2.359 -                                     gl3mfn, 0, 3, ft);
   2.360 -
   2.361 -    *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
   2.362 -
   2.363 -    SHADOW_DEBUG(PROPAGATE,
   2.364 -                  "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n",
   2.365 -                  fetch_type_names[ft], gl3e->l3, sl3p->l3);
   2.366 -    ASSERT(sflags != -1);
   2.367 +    _sh_propagate(v, gl3e, gl3mfn, sl2mfn, sl3e, 3, ft, 0);
   2.368  }
   2.369  #endif // GUEST_PAGING_LEVELS >= 4
   2.370  
   2.371 @@ -854,95 +814,23 @@ static void
   2.372  l2e_propagate_from_guest(struct vcpu *v, 
   2.373                           guest_l2e_t *gl2e,
   2.374                           mfn_t gl2mfn,
   2.375 -                         mfn_t sl1mfn, 
   2.376 -                         shadow_l2e_t *sl2p,
   2.377 +                         mfn_t sl1mfn,
   2.378 +                         shadow_l2e_t *sl2e,
   2.379                           fetch_type_t ft)
   2.380  {
   2.381 -    u32 gflags = guest_l2e_get_flags(*gl2e);
   2.382 -    u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, 
   2.383 -                                     gl2mfn, 0, 2, ft);
   2.384 -
   2.385 -    *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
   2.386 -
   2.387 -    SHADOW_DEBUG(PROPAGATE,
   2.388 -                  "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n",
   2.389 -                  fetch_type_names[ft], gl2e->l2, sl2p->l2);
   2.390 -    ASSERT(sflags != -1);
   2.391 +    _sh_propagate(v, gl2e, gl2mfn, sl1mfn, sl2e, 2, ft, 0);
   2.392  }
   2.393  
   2.394 -static inline int
   2.395 -l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
   2.396 -               int mmio)
   2.397 -/* returns 1 if emulation is required, and 0 otherwise */
   2.398 -{
   2.399 -    struct domain *d = v->domain;
   2.400 -    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
   2.401 -    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
   2.402 -                                     mmio, 1, ft_demand_read);
   2.403 -
   2.404 -    if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) )
   2.405 -    {
   2.406 -        // emulation required!
   2.407 -        *sl1p = shadow_l1e_empty();
   2.408 -        return 1;
   2.409 -    }
   2.410 -
   2.411 -    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
   2.412 -
   2.413 -    SHADOW_DEBUG(PROPAGATE,
   2.414 -                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
   2.415 -                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
   2.416 -
   2.417 -    ASSERT(sflags != -1);
   2.418 -    return 0;
   2.419 -}
   2.420 -
   2.421 -static inline int
   2.422 -l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
   2.423 -                int mmio)
   2.424 -/* returns 1 if emulation is required, and 0 otherwise */
   2.425 -{
   2.426 -    struct domain *d = v->domain;
   2.427 -    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
   2.428 -    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
   2.429 -                                     mmio, 1, ft_demand_write);
   2.430 -
   2.431 -    sh_mark_dirty(d, gmfn);
   2.432 -
   2.433 -    if ( !mmio && sh_mfn_is_a_page_table(gmfn) )
   2.434 -    {
   2.435 -        // emulation required!
   2.436 -        *sl1p = shadow_l1e_empty();
   2.437 -        return 1;
   2.438 -    }
   2.439 -
   2.440 -    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
   2.441 -
   2.442 -    SHADOW_DEBUG(PROPAGATE,
   2.443 -                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
   2.444 -                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
   2.445 -
   2.446 -    ASSERT(sflags != -1);
   2.447 -    return 0;
   2.448 -}
   2.449 -
   2.450 -static inline void
   2.451 -l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
   2.452 +static void
   2.453 +l1e_propagate_from_guest(struct vcpu *v, 
   2.454 +                         guest_l1e_t *gl1e,
   2.455 +                         mfn_t gl1mfn,
   2.456 +                         mfn_t gmfn, 
   2.457 +                         shadow_l1e_t *sl1e,
   2.458 +                         fetch_type_t ft, 
   2.459                           int mmio)
   2.460  {
   2.461 -    gfn_t gfn = guest_l1e_get_gfn(gl1e);
   2.462 -    mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
   2.463 -    u32 gflags = guest_l1e_get_flags(gl1e);
   2.464 -    u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), 
   2.465 -                                     mmio, 1, ft_prefetch);
   2.466 -
   2.467 -    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
   2.468 -
   2.469 -    SHADOW_DEBUG(PROPAGATE,
   2.470 -                  "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
   2.471 -                  gl1e.l1, sl1p->l1);
   2.472 -
   2.473 -    ASSERT(sflags != -1);
   2.474 +    _sh_propagate(v, gl1e, gl1mfn, gmfn, sl1e, 1, ft, mmio);
   2.475  }
   2.476  
   2.477  
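The four wrappers above restore type-safety around _sh_propagate()'s void* parameters and pass a constant level so the compiler can drop dead branches when inlining. An editor's standalone sketch of the same idiom (not part of the changeset); the entry types and the core function below are hypothetical, only showing the shape:

    #include <stdio.h>

    typedef struct { unsigned long l1; } l1e_t;
    typedef struct { unsigned long l2; } l2e_t;

    /* Generic core: works on untyped pointers plus an explicit level. */
    static void propagate_core(void *guest_entry, void *shadow_entry, int level)
    {
        /* A real implementation would compute the shadow entry here;
         * this sketch just records which level it was called for. */
        printf("propagating a level-%d entry\n", level);
        (void)guest_entry; (void)shadow_entry;
    }

    /* Thin typed wrappers: callers never touch void* directly, and the
     * constant level argument lets the compiler specialise the core. */
    static void l1e_propagate(l1e_t *ge, l1e_t *se) { propagate_core(ge, se, 1); }
    static void l2e_propagate(l2e_t *ge, l2e_t *se) { propagate_core(ge, se, 2); }

    int main(void)
    {
        l1e_t g1 = { 0 }, s1 = { 0 };
        l2e_t g2 = { 0 }, s2 = { 0 };
        l1e_propagate(&g1, &s1);
        l2e_propagate(&g2, &s2);
        return 0;
    }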
   2.478 @@ -956,8 +844,6 @@ l1e_propagate_from_guest(struct vcpu *v,
   2.479   * SHADOW_SET_FLUSH   -- the caller must cause a TLB flush.
   2.480   * SHADOW_SET_ERROR   -- the input is not a valid entry (for example, if
   2.481   *                        shadow_get_page_from_l1e() fails).
   2.482 - * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
   2.483 - *                             copies of their PAE L3 entries re-copied.
   2.484   */
   2.485  
   2.486  static inline void safe_write_entry(void *dst, void *src) 
   2.487 @@ -1041,16 +927,13 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
   2.488      int res;
   2.489      mfn_t mfn;
   2.490      struct domain *owner;
   2.491 -    shadow_l1e_t sanitized_sl1e =
   2.492 -        shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
   2.493 -
   2.494 -    //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
   2.495 -    //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
   2.496 +
   2.497 +    ASSERT(!sh_l1e_is_magic(sl1e));
   2.498  
   2.499      if ( !shadow_mode_refcounts(d) )
   2.500          return 1;
   2.501  
   2.502 -    res = get_page_from_l1e(sanitized_sl1e, d);
   2.503 +    res = get_page_from_l1e(sl1e, d);
   2.504  
   2.505      // If a privileged domain is attempting to install a map of a page it does
   2.506      // not own, we let it succeed anyway.
   2.507 @@ -1062,7 +945,7 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
   2.508           (owner = page_get_owner(mfn_to_page(mfn))) &&
   2.509           (d != owner) )
   2.510      {
   2.511 -        res = get_page_from_l1e(sanitized_sl1e, owner);
   2.512 +        res = get_page_from_l1e(sl1e, owner);
   2.513          SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
   2.514                         "which is owned by domain %d: %s\n",
   2.515                         d->domain_id, mfn_x(mfn), owner->domain_id,
   2.516 @@ -1250,7 +1133,8 @@ static int shadow_set_l1e(struct vcpu *v
   2.517  
   2.518      if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
   2.519      
   2.520 -    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) 
   2.521 +    if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT)
   2.522 +         && !sh_l1e_is_magic(new_sl1e) ) 
   2.523      {
   2.524          /* About to install a new reference */        
   2.525          if ( shadow_mode_refcounts(d) ) {
   2.526 @@ -1267,7 +1151,8 @@ static int shadow_set_l1e(struct vcpu *v
   2.527      shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
   2.528      flags |= SHADOW_SET_CHANGED;
   2.529  
   2.530 -    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) 
   2.531 +    if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT) 
   2.532 +         && !sh_l1e_is_magic(old_sl1e) )
   2.533      {
   2.534          /* We lost a reference to an old mfn. */
   2.535          /* N.B. Unlike higher-level sets, never need an extra flush 
   2.536 @@ -2133,7 +2018,8 @@ void sh_destroy_l1_shadow(struct vcpu *v
   2.537          /* Decrement refcounts of all the old entries */
   2.538          mfn_t sl1mfn = smfn; 
   2.539          SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
   2.540 -            if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) 
   2.541 +            if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT)
   2.542 +                 && !sh_l1e_is_magic(*sl1e) )
   2.543                  shadow_put_page_from_l1e(*sl1e, d);
   2.544          });
   2.545      }
   2.546 @@ -2399,16 +2285,17 @@ static int validate_gl1e(struct vcpu *v,
   2.547      guest_l1e_t *new_gl1e = new_ge;
   2.548      shadow_l1e_t *sl1p = se;
   2.549      gfn_t gfn;
   2.550 -    mfn_t mfn;
   2.551 -    int result = 0;
   2.552 +    mfn_t gmfn;
   2.553 +    int result = 0, mmio;
   2.554  
   2.555      perfc_incrc(shadow_validate_gl1e_calls);
   2.556  
   2.557      gfn = guest_l1e_get_gfn(*new_gl1e);
   2.558 -    mfn = vcpu_gfn_to_mfn(v, gfn);
   2.559 -
   2.560 -    l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, 
   2.561 -                             /* mmio? */ !valid_mfn(mfn));
   2.562 +    gmfn = vcpu_gfn_to_mfn(v, gfn);
   2.563 +
   2.564 +    mmio = (hvm_guest(v) && shadow_vcpu_mode_translate(v) && !valid_mfn(gmfn));
   2.565 +    l1e_propagate_from_guest(v, new_gl1e, _mfn(INVALID_MFN), gmfn, &new_sl1e, 
   2.566 +                             ft_prefetch, mmio);
   2.567      
   2.568      result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
   2.569      return result;
   2.570 @@ -2579,6 +2466,80 @@ static inline void reset_early_unshadow(
   2.571  
   2.572  
   2.573  /**************************************************************************/
   2.574 +/* Optimization: Prefetch multiple L1 entries.  This is called after we have 
   2.575 + * demand-faulted a shadow l1e in the fault handler, to see if it's
   2.576 + * worth fetching some more.
   2.577 + */
   2.578 +
   2.579 +#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
   2.580 +
   2.581 +/* XXX magic number */
   2.582 +#define PREFETCH_DISTANCE 32
   2.583 +
   2.584 +static void sh_prefetch(struct vcpu *v, walk_t *gw, 
   2.585 +                        shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn)
   2.586 +{
   2.587 +    int i, dist, mmio;
   2.588 +    gfn_t gfn;
   2.589 +    mfn_t gmfn;
   2.590 +    guest_l1e_t gl1e;
   2.591 +    shadow_l1e_t sl1e;
   2.592 +    u32 gflags;
   2.593 +
   2.594 +    /* Prefetch no further than the end of the _shadow_ l1 MFN */
   2.595 +    dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
   2.596 +    /* And no more than a maximum fetches-per-fault */
   2.597 +    if ( dist > PREFETCH_DISTANCE )
   2.598 +        dist = PREFETCH_DISTANCE;
   2.599 +
   2.600 +    for ( i = 1; i < dist ; i++ ) 
   2.601 +    {
   2.602 +        /* No point in prefetching if there's already a shadow */
   2.603 +        if ( ptr_sl1e[i].l1 != 0 )
   2.604 +            break;
   2.605 +
   2.606 +        if ( gw->l1e )
   2.607 +        {
   2.608 +            /* Normal guest page; grab the next guest entry */
   2.609 +            gl1e = gw->l1e[i];
   2.610 +            /* Not worth continuing if we hit an entry that will need another
   2.611 +             * fault for A/D-bit propagation anyway */
   2.612 +            gflags = guest_l1e_get_flags(gl1e);
   2.613 +            if ( (gflags & _PAGE_PRESENT) 
   2.614 +                 && (!(gflags & _PAGE_ACCESSED)
   2.615 +                     || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) )
   2.616 +                break;
   2.617 +        } 
   2.618 +        else 
   2.619 +        {
   2.620 +            /* Fragmented superpage, unless we've been called wrongly */
   2.621 +            ASSERT(guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE);
   2.622 +            /* Increment the l1e's GFN by the right number of guest pages */
   2.623 +            gl1e = guest_l1e_from_gfn(
   2.624 +                _gfn(gfn_x(guest_l1e_get_gfn(gw->eff_l1e)) + i), 
   2.625 +                guest_l1e_get_flags(gw->eff_l1e));
   2.626 +        }
   2.627 +
   2.628 +        /* Look at the gfn that the l1e is pointing at */
   2.629 +        gfn = guest_l1e_get_gfn(gl1e);
   2.630 +        gmfn = vcpu_gfn_to_mfn(v, gfn);
   2.631 +        mmio = ( hvm_guest(v) 
   2.632 +                 && shadow_vcpu_mode_translate(v) 
   2.633 +                 && mmio_space(gfn_to_paddr(gfn)) );
   2.634 +
   2.635 +        /* Propagate the entry.  Safe to use a pointer to our local 
   2.636 +         * gl1e, since this is not a demand-fetch so there will be no 
   2.637 +         * write-back to the guest. */
   2.638 +        l1e_propagate_from_guest(v, &gl1e, _mfn(INVALID_MFN),
   2.639 +                                 gmfn, &sl1e, ft_prefetch, mmio);
   2.640 +        (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
   2.641 +    }
   2.642 +}
   2.643 +
   2.644 +#endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
   2.645 +
   2.646 +
   2.647 +/**************************************************************************/
   2.648  /* Entry points into the shadow code */
   2.649  
   2.650  /* Called from pagefault handler in Xen, and from the HVM trap handlers
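sh_prefetch() in the hunk above never walks past the end of the shadow l1 page: the distance is the number of entries between the faulting slot and the end of its page, clamped to PREFETCH_DISTANCE. An editor's standalone sketch of that arithmetic (not part of the changeset), with illustrative page and entry sizes:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE          4096UL
    #define PAGE_MASK          (~(PAGE_SIZE - 1))
    #define PREFETCH_DISTANCE  32
    #define ENTRY_SIZE         8UL          /* one 64-bit shadow l1e */

    /* How many entries may we touch, starting at 'ptr', without leaving
     * the page it lives in or exceeding the per-fault budget? */
    static unsigned long prefetch_distance(uintptr_t ptr)
    {
        unsigned long dist = (PAGE_SIZE - (ptr & ~PAGE_MASK)) / ENTRY_SIZE;
        if ( dist > PREFETCH_DISTANCE )
            dist = PREFETCH_DISTANCE;
        return dist;
    }

    int main(void)
    {
        /* Slot in the middle of a page: budget-limited to 32. */
        printf("%lu\n", prefetch_distance(0x10000800));
        /* Last slot of a page: only 1 entry (the one just faulted). */
        printf("%lu\n", prefetch_distance(0x10000ff8));
        return 0;
    }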
   2.651 @@ -2602,16 +2563,70 @@ static int sh_page_fault(struct vcpu *v,
   2.652      int r, mmio;
   2.653      fetch_type_t ft = 0;
   2.654  
   2.655 +    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
   2.656 +                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
   2.657 +
   2.658      //
   2.659      // XXX: Need to think about eventually mapping superpages directly in the
   2.660      //      shadow (when possible), as opposed to splintering them into a
   2.661      //      bunch of 4K maps.
   2.662      //
   2.663  
   2.664 +#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
   2.665 +    if ( (regs->error_code & PFEC_reserved_bit) )
   2.666 +    {
   2.667 +        /* The only reasons for reserved bits to be set in shadow entries 
   2.668 +         * are the two "magic" shadow_l1e entries. */
   2.669 +        if ( likely((__copy_from_user(&sl1e, 
   2.670 +                                      (sh_linear_l1_table(v) 
   2.671 +                                       + shadow_l1_linear_offset(va)),
   2.672 +                                      sizeof(sl1e)) == 0)
   2.673 +                    && sh_l1e_is_magic(sl1e)) )
   2.674 +        {
   2.675 +            if ( sh_l1e_is_gnp(sl1e) )
   2.676 +            {
   2.677 +                if ( likely(!hvm_guest(v) || shadow_vcpu_mode_translate(v)) )
   2.678 +                { 
   2.679 +                    /* Not-present in a guest PT: pass to the guest as
   2.680 +                     * a not-present fault (by flipping two bits). */
   2.681 +                    ASSERT(regs->error_code & PFEC_page_present);
   2.682 +                    regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
   2.683 +                    perfc_incrc(shadow_fault_fast_gnp);
   2.684 +                    SHADOW_PRINTK("fast path not-present\n");
   2.685 +                    return 0;
   2.686 +                }
   2.687 +                else 
   2.688 +                {
   2.689 +                    /* Not-present in the P2M: MMIO */
   2.690 +                    gpa = va;
   2.691 +                }
   2.692 +            }
   2.693 +            else
   2.694 +            {
   2.695 +                /* Magic MMIO marker: extract gfn for MMIO address */
   2.696 +                ASSERT(sh_l1e_is_mmio(sl1e));
   2.697 +                gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e)))) 
   2.698 +                       << PAGE_SHIFT) 
   2.699 +                    | (va & ~PAGE_MASK);
   2.700 +            }
   2.701 +            perfc_incrc(shadow_fault_fast_mmio);
   2.702 +            SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
   2.703 +            reset_early_unshadow(v);
   2.704 +            handle_mmio(gpa);
   2.705 +            return EXCRET_fault_fixed;
   2.706 +        }
   2.707 +        else
   2.708 +        {
   2.709 +            /* This should be exceptionally rare: another vcpu has fixed
   2.710 +             * the tables between the fault and our reading the l1e.
   2.711 +             * Fall through to the normal fault handing logic */
    2.712 +             * Fall through to the normal fault handling logic */
   2.713 +            SHADOW_PRINTK("fast path false alarm!\n");
   2.714 +        }
   2.715 +    }
   2.716 +#endif /* SHOPT_FAST_FAULT_PATH */
   2.717 +
   2.718      shadow_lock(d);
   2.719 -
   2.720 -    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
   2.721 -                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
   2.722      
   2.723      shadow_audit_tables(v);
   2.724                     
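In the fast not-present path of the hunk above, the fault Xen actually took carries PFEC_reserved_bit and PFEC_page_present (the magic shadow entry is present but has reserved bits set), while the guest should see a plain not-present fault; XORing with both bits clears them in one step. An editor's standalone sketch (not part of the changeset), using the standard x86 page-fault error-code bit values, which are assumed here rather than taken from the diff:

    #include <stdio.h>
    #include <stdint.h>

    /* Standard x86 page-fault error-code bits. */
    #define PFEC_page_present  0x01u  /* fault on a present entry           */
    #define PFEC_write_access  0x02u  /* fault was a write                  */
    #define PFEC_reserved_bit  0x08u  /* reserved bit set in a paging entry */

    int main(void)
    {
        /* What the hardware reported: a write hit a "present" shadow entry
         * whose reserved bits are set (the magic guest-not-present marker). */
        uint32_t error_code = PFEC_page_present | PFEC_write_access
                              | PFEC_reserved_bit;

        /* Flip two bits and the same code describes a write to a
         * not-present guest entry, which is what the guest should see. */
        error_code ^= (PFEC_reserved_bit | PFEC_page_present);

        printf("error code for the guest: %#x\n", error_code); /* 0x2 */
        return 0;
    }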
   2.725 @@ -2659,8 +2674,9 @@ static int sh_page_fault(struct vcpu *v,
   2.726      }
   2.727  
   2.728      // Was it a write fault?
   2.729 -    //
   2.730 -    if ( regs->error_code & PFEC_write_access )
   2.731 +    ft = ((regs->error_code & PFEC_write_access)
   2.732 +          ? ft_demand_write : ft_demand_read);
   2.733 +    if ( ft == ft_demand_write )
   2.734      {
   2.735          if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
   2.736          {
   2.737 @@ -2685,26 +2701,19 @@ static int sh_page_fault(struct vcpu *v,
   2.738          }
   2.739      }
   2.740  
   2.741 -    /* Is this an MMIO access? */
   2.742 +    /* What mfn is the guest trying to access? */
   2.743      gfn = guest_l1e_get_gfn(gw.eff_l1e);
   2.744 +    gmfn = vcpu_gfn_to_mfn(v, gfn);
   2.745      mmio = ( hvm_guest(v) 
   2.746               && shadow_vcpu_mode_translate(v) 
   2.747               && mmio_space(gfn_to_paddr(gfn)) );
   2.748  
   2.749 -    /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds 
   2.750 -     * the equivalent mfn. */
   2.751 -    if ( mmio ) 
   2.752 -        gmfn = _mfn(gfn_x(gfn));
   2.753 -    else
   2.754 +    if ( !mmio && !valid_mfn(gmfn) )
   2.755      {
   2.756 -        gmfn = vcpu_gfn_to_mfn(v, gfn);
   2.757 -        if ( !valid_mfn(gmfn) )
   2.758 -        {
   2.759 -            perfc_incrc(shadow_fault_bail_bad_gfn);
   2.760 -            SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", 
   2.761 -                           gfn_x(gfn), mfn_x(gmfn));
   2.762 -            goto not_a_shadow_fault;
   2.763 -        }
   2.764 +        perfc_incrc(shadow_fault_bail_bad_gfn);
   2.765 +        SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", 
   2.766 +                      gfn_x(gfn), mfn_x(gmfn));
   2.767 +        goto not_a_shadow_fault;
   2.768      }
   2.769  
   2.770      /* Make sure there is enough free shadow memory to build a chain of
   2.771 @@ -2717,45 +2726,40 @@ static int sh_page_fault(struct vcpu *v,
   2.772       * for the shadow entry, since we might promote a page here. */
   2.773      // XXX -- this code will need to change somewhat if/when the shadow code
   2.774      // can directly map superpages...
   2.775 -    ft = ((regs->error_code & PFEC_write_access) ?
   2.776 -          ft_demand_write : ft_demand_read);
   2.777      ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
   2.778      ASSERT(ptr_sl1e);
   2.779  
   2.780 -    /* Calculate the shadow entry */
   2.781 -    if ( ft == ft_demand_write )
   2.782 +    /* Calculate the shadow entry and write it */
   2.783 +    l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn, 
   2.784 +                             gmfn, &sl1e, ft, mmio);
   2.785 +    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
   2.786 +
   2.787 +#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
   2.788 +    /* Prefetch some more shadow entries */
   2.789 +    sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
   2.790 +#endif
   2.791 +
   2.792 +    /* Need to emulate accesses to page tables */
   2.793 +    if ( sh_mfn_is_a_page_table(gmfn) )
   2.794      {
   2.795 -        if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
   2.796 +        if ( ft == ft_demand_write )
   2.797          {
   2.798              perfc_incrc(shadow_fault_emulate_write);
   2.799              goto emulate;
   2.800          }
   2.801 -    }
   2.802 -    else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
   2.803 -    {
   2.804 -        perfc_incrc(shadow_fault_emulate_read);
   2.805 -        goto emulate;
   2.806 +        else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
   2.807 +        {
   2.808 +            perfc_incrc(shadow_fault_emulate_read);
   2.809 +            goto emulate;
   2.810 +        }
   2.811      }
   2.812  
   2.813 -    /* Quick sanity check: we never make an MMIO entry that's got the 
   2.814 -     * _PAGE_PRESENT flag set in it. */
   2.815 -    ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
   2.816 -
   2.817 -    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
   2.818 -
   2.819      if ( mmio ) 
   2.820      {
   2.821          gpa = guest_walk_to_gpa(&gw);
   2.822          goto mmio;
   2.823      }
   2.824  
   2.825 -#if 0
   2.826 -    if ( !(r & SHADOW_SET_CHANGED) )
   2.827 -        debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte
   2.828 -                          ") did not change anything\n",
   2.829 -                          __func__, gw.va, l1e_get_intpte(sl1e));
   2.830 -#endif
   2.831 -
   2.832      perfc_incrc(shadow_fault_fixed);
   2.833      d->arch.shadow.fault_count++;
   2.834      reset_early_unshadow(v);
   2.835 @@ -2769,7 +2773,6 @@ static int sh_page_fault(struct vcpu *v,
   2.836      return EXCRET_fault_fixed;
   2.837  
   2.838   emulate:
   2.839 -
   2.840      /* Take the register set we were called with */
   2.841      emul_regs = *regs;
   2.842      if ( hvm_guest(v) )
   2.843 @@ -3932,25 +3935,48 @@ int sh_audit_l1_table(struct vcpu *v, mf
   2.844      gfn_t gfn;
   2.845      char *s;
   2.846      int done = 0;
   2.847 -
   2.848 +    
   2.849      /* Follow the backpointer */
   2.850      gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
   2.851      gl1e = gp = sh_map_domain_page(gl1mfn);
   2.852      SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
   2.853  
   2.854 -        s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
   2.855 -                            shadow_l1e_get_flags(*sl1e));
   2.856 -        if ( s ) AUDIT_FAIL(1, "%s", s);
   2.857 -
   2.858 -        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
   2.859 +        if ( sh_l1e_is_magic(*sl1e) ) 
   2.860          {
   2.861 -            gfn = guest_l1e_get_gfn(*gl1e);
   2.862 -            mfn = shadow_l1e_get_mfn(*sl1e);
   2.863 -            gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
   2.864 -            if ( mfn_x(gmfn) != mfn_x(mfn) )
   2.865 -                AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
   2.866 -                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
   2.867 -                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
   2.868 +#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
   2.869 +            if ( sh_l1e_is_gnp(*sl1e) )
   2.870 +            {
   2.871 +                if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT )
   2.872 +                    AUDIT_FAIL(1, "shadow is GNP magic but guest is present");
   2.873 +            } 
   2.874 +            else 
   2.875 +            {
   2.876 +                ASSERT(sh_l1e_is_mmio(*sl1e));
   2.877 +                gfn = sh_l1e_mmio_get_gfn(*sl1e);
   2.878 +                if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) )
   2.879 +                    AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn 
   2.880 +                               " but guest gfn is %" SH_PRI_gfn,
   2.881 +                               gfn_x(gfn),
   2.882 +                               gfn_x(guest_l1e_get_gfn(*gl1e)));
   2.883 +            }
   2.884 +#endif
   2.885 +        }
   2.886 +        else 
   2.887 +        {
   2.888 +            s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
   2.889 +                               shadow_l1e_get_flags(*sl1e));
   2.890 +            if ( s ) AUDIT_FAIL(1, "%s", s);
   2.891 +            
   2.892 +            if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
   2.893 +            {
   2.894 +                gfn = guest_l1e_get_gfn(*gl1e);
   2.895 +                mfn = shadow_l1e_get_mfn(*sl1e);
   2.896 +                gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
   2.897 +                if ( mfn_x(gmfn) != mfn_x(mfn) )
   2.898 +                    AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
   2.899 +                               " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
   2.900 +                               gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
   2.901 +            }
   2.902          }
   2.903      });
   2.904      sh_unmap_domain_page(gp);
   2.905 @@ -3973,7 +3999,8 @@ int sh_audit_fl1_table(struct vcpu *v, m
   2.906          if ( !(f == 0 
   2.907                 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
   2.908                          _PAGE_ACCESSED|_PAGE_DIRTY) 
   2.909 -               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
   2.910 +               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)
   2.911 +               || sh_l1e_is_magic(*sl1e)) )
   2.912              AUDIT_FAIL(1, "fl1e has bad flags");
   2.913      });
   2.914      return 0;
   2.915 @@ -4011,7 +4038,7 @@ int sh_audit_l2_table(struct vcpu *v, mf
   2.916              if ( mfn_x(gmfn) != mfn_x(mfn) )
   2.917                  AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
   2.918                             " (--> %" SH_PRI_mfn ")"
   2.919 -                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
   2.920 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
   2.921                             gfn_x(gfn), 
   2.922                             (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
   2.923                             : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
   2.924 @@ -4053,7 +4080,7 @@ int sh_audit_l3_table(struct vcpu *v, mf
   2.925                                       : PGC_SH_l2_shadow);
   2.926              if ( mfn_x(gmfn) != mfn_x(mfn) )
   2.927                  AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
   2.928 -                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
   2.929 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
   2.930                             gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
   2.931          }
   2.932      });
   2.933 @@ -4088,7 +4115,7 @@ int sh_audit_l4_table(struct vcpu *v, mf
   2.934                                       PGC_SH_l3_shadow);
   2.935              if ( mfn_x(gmfn) != mfn_x(mfn) )
   2.936                  AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
   2.937 -                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
   2.938 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn,
   2.939                             gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
   2.940          }
   2.941      });
     3.1 --- a/xen/arch/x86/mm/shadow/private.h	Wed Nov 01 10:40:46 2006 +0000
     3.2 +++ b/xen/arch/x86/mm/shadow/private.h	Wed Nov 01 10:41:44 2006 +0000
     3.3 @@ -33,111 +33,6 @@
     3.4  
     3.5  
     3.6  /******************************************************************************
     3.7 - * Definitions for the use of the "available" bits in the shadow PTEs.
     3.8 - *
     3.9 - * Review of the low 12 bits of a shadow page table entry:
    3.10 - *
    3.11 - *         in a guest:                      in a shadow:
    3.12 - * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
    3.13 - * Bit 10: _PAGE_AVAIL1                     _PAGE_SHADOW_RW ("SW" below)
    3.14 - * Bit  9: _PAGE_AVAIL0                     _PAGE_SHADOW_PRESENT ("SP" below)
    3.15 - * Bit  8: _PAGE_GLOBAL                     _PAGE_SHADOW_MMIO ("MMIO" below),
    3.16 - *                                          aka _PAGE_SHADOW_GUEST_NOT_PRESENT
    3.17 - * Bit  7: _PAGE_PSE, aka _PAGE_PAT
    3.18 - * Bit  6: _PAGE_DIRTY
    3.19 - * Bit  5: _PAGE_ACCESSED
    3.20 - * Bit  4: _PAGE_PCD
    3.21 - * Bit  3: _PAGE_PWT
    3.22 - * Bit  2: _PAGE_USER
    3.23 - * Bit  1: _PAGE_RW ("GW" below)
    3.24 - * Bit  0: _PAGE_PRESENT ("GP" below)
    3.25 - *
    3.26 - * Given a guest entry, as shown below, we can expect the following in the
    3.27 - * corresponding shadow entry:
    3.28 - *
    3.29 - * Guest entry  Shadow entry      Commentary
    3.30 - * -----------  ----------------  ---------------------------------------------
    3.31 - *       Maps     
    3.32 - * GP GW  IO    GP SP GW SW MMIO 
    3.33 - * -- -- ----   -- -- -- -- ----
    3.34 - *  -  -   -     0  0  0  0   0   The guest entry has not yet been shadowed.
    3.35 - *  0  -   -     0  0  0  0   1   The guest entry is marked not-present.
    3.36 - *  1  1  no     ?  1  ?  1   0   Writable entry in the guest.
    3.37 - *  1  0  no     ?  1  0  0   0   Read-only entry in the guest.
    3.38 - *  1  1  yes    0  1  ?  1   1   Writable MMIO mapping in the guest.
    3.39 - *  1  0  yes    0  1  0  0   1   Read-only MMIO mapping in the guest.
    3.40 - *
    3.41 - * Normally, we would expect that GP=1 in the guest to imply GP=1 in the
    3.42 - * shadow, and similarly for GW=1.  However, various functionality that may be
    3.43 - * implemented via the shadow can cause GP or GW to be cleared in such cases.
    3.44 - * A & D bit emulation is a prime example of such functionality.
    3.45 - *
    3.46 - * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
    3.47 - * entry will always be zero, too.
    3.48 -
    3.49 - * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests.  It is
    3.50 - * currently available for random (ab)use in shadow entries.
    3.51 - *
    3.52 - * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
    3.53 - * but currently there is no benefit, as the guest's TLB is flushed on every
    3.54 - * transition of CR3 anyway due to the HVM exit/re-entry.
    3.55 - *
    3.56 - * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used
    3.57 - * as the _PAGE_SHADOW_MMIO bit.  In such entries, if _PAGE_SHADOW_MMIO is
    3.58 - * set, then the entry contains the *gfn* directly from the corresponding
    3.59 - * guest entry (not an mfn!!).
    3.60 - *
    3.61 - * Bit 7 is set in a guest L2 to signify a superpage entry.  The current
    3.62 - * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
    3.63 - * resulting shadow L1 table is called an FL1.  Note that there is no guest
    3.64 - * page that corresponds to an FL1.
    3.65 - *
    3.66 - * Bit 7 in a guest L1 is the PAT2 bit.  Currently we do not support PAT in
    3.67 - * this shadow code.
    3.68 - *
    3.69 - * Bit 6 is the dirty bit.
    3.70 - *
    3.71 - * Bit 5 is the accessed bit.
    3.72 - *
    3.73 - * Bit 4 is the cache disable bit.  If set in a guest, the hardware is
    3.74 - * supposed to refuse to cache anything found via this entry.  It can be set
    3.75 - * in an L4e, L3e, L2e, or L1e.  This shadow code currently does not support
    3.76 - * cache disable bits.  They are silently ignored.
    3.77 - *
    3.78 - * Bit 4 is a guest L1 is also the PAT1 bit.  Currently we do not support PAT
    3.79 - * in this shadow code.
    3.80 - *
    3.81 - * Bit 3 is the cache write-thru bit.  If set in a guest, the hardware is
    3.82 - * supposed to use write-thru instead of write-back caching for anything found
    3.83 - * via this entry.  It can be set in an L4e, L3e, L2e, or L1e.  This shadow
    3.84 - * code currently does not support cache write-thru bits.  They are silently
    3.85 - * ignored.
    3.86 - *
    3.87 - * Bit 3 is a guest L1 is also the PAT0 bit.  Currently we do not support PAT
    3.88 - * in this shadow code.
    3.89 - *
    3.90 - * Bit 2 is the user bit.
    3.91 - *
    3.92 - * Bit 1 is the read-write bit.
    3.93 - *
    3.94 - * Bit 0 is the present bit.
    3.95 - */
    3.96 -
    3.97 -// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by
    3.98 -// the appropriate shadow rules.
    3.99 -#define _PAGE_SHADOW_RW                 _PAGE_AVAIL1
   3.100 -
   3.101 -// Copy of the _PAGE_PRESENT bit from the guest's PTE
   3.102 -#define _PAGE_SHADOW_PRESENT            _PAGE_AVAIL0
   3.103 -
   3.104 -// The matching guest entry maps MMIO space
   3.105 -#define _PAGE_SHADOW_MMIO               _PAGE_GLOBAL
   3.106 -
   3.107 -// Shadow flags value used when the guest is not present
   3.108 -#define _PAGE_SHADOW_GUEST_NOT_PRESENT  _PAGE_GLOBAL
   3.109 -
   3.110 -
   3.111 -/******************************************************************************
   3.112   * Debug and error-message output
   3.113   */
   3.114  #define SHADOW_PRINTK(_f, _a...)                                     \
   3.115 @@ -151,13 +46,13 @@
   3.116      } while (0)
   3.117  
   3.118  // The flags for use with SHADOW_DEBUG:
   3.119 -#define SHADOW_DEBUG_PROPAGATE         0
   3.120 -#define SHADOW_DEBUG_MAKE_SHADOW       0
   3.121 -#define SHADOW_DEBUG_DESTROY_SHADOW    0
   3.122 +#define SHADOW_DEBUG_PROPAGATE         1
   3.123 +#define SHADOW_DEBUG_MAKE_SHADOW       1
   3.124 +#define SHADOW_DEBUG_DESTROY_SHADOW    1
   3.125  #define SHADOW_DEBUG_P2M               0
   3.126 -#define SHADOW_DEBUG_A_AND_D           0
   3.127 -#define SHADOW_DEBUG_EMULATE           0
   3.128 -#define SHADOW_DEBUG_LOGDIRTY          1
   3.129 +#define SHADOW_DEBUG_A_AND_D           1
   3.130 +#define SHADOW_DEBUG_EMULATE           1
   3.131 +#define SHADOW_DEBUG_LOGDIRTY          0
   3.132  
   3.133  
   3.134  /******************************************************************************
     4.1 --- a/xen/arch/x86/mm/shadow/types.h	Wed Nov 01 10:40:46 2006 +0000
     4.2 +++ b/xen/arch/x86/mm/shadow/types.h	Wed Nov 01 10:41:44 2006 +0000
     4.3 @@ -591,6 +591,77 @@ accumulate_guest_flags(struct vcpu *v, w
     4.4      return accumulated_flags;
     4.5  }
     4.6  
     4.7 +
     4.8 +#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
     4.9 +/******************************************************************************
    4.10 + * We implement a "fast path" for two special cases: faults that require
    4.11 + * MMIO emulation, and faults where the guest PTE is not present.  We
    4.12 + * record these as shadow l1 entries that have reserved bits set in
    4.13 + * them, so we can spot them immediately in the fault handler and handle
    4.14 + * them without needing to hold the shadow lock or walk the guest
    4.15 + * pagetables.
    4.16 + *
    4.17 + * This is only feasible for PAE and 64bit Xen: 32-bit non-PAE PTEs don't
    4.18 + * have reserved bits that we can use for this.
    4.19 + */
    4.20 +
    4.21 +#define SH_L1E_MAGIC 0xffffffff00000000ULL
    4.22 +static inline int sh_l1e_is_magic(shadow_l1e_t sl1e)
    4.23 +{
    4.24 +    return ((sl1e.l1 & SH_L1E_MAGIC) == SH_L1E_MAGIC);
    4.25 +}
    4.26 +
    4.27 +/* Guest not present: a single magic value */
    4.28 +static inline shadow_l1e_t sh_l1e_gnp(void) 
    4.29 +{
    4.30 +    return (shadow_l1e_t){ -1ULL };
    4.31 +}
    4.32 +
    4.33 +static inline int sh_l1e_is_gnp(shadow_l1e_t sl1e) 
    4.34 +{
    4.35 +    return (sl1e.l1 == sh_l1e_gnp().l1);
    4.36 +}
    4.37 +
    4.38 +/* MMIO: an invalid PTE that contains the GFN of the equivalent guest l1e.
    4.39 + * We store 28 bits of GFN in bits 4:32 of the entry.
    4.40 + * The present bit is set, and the U/S and R/W bits are taken from the guest.
    4.41 + * Bit 3 is always 0, to differentiate from gnp above.  */
    4.42 +#define SH_L1E_MMIO_MAGIC       0xffffffff00000001ULL
    4.43 +#define SH_L1E_MMIO_MAGIC_MASK  0xffffffff00000009ULL
    4.44 +#define SH_L1E_MMIO_GFN_MASK    0x00000000fffffff0ULL
    4.45 +#define SH_L1E_MMIO_GFN_SHIFT   4
    4.46 +
    4.47 +static inline shadow_l1e_t sh_l1e_mmio(gfn_t gfn, u32 gflags) 
    4.48 +{
    4.49 +    return (shadow_l1e_t) { (SH_L1E_MMIO_MAGIC 
    4.50 +                             | (gfn_x(gfn) << SH_L1E_MMIO_GFN_SHIFT) 
    4.51 +                             | (gflags & (_PAGE_USER|_PAGE_RW))) };
    4.52 +}
    4.53 +
    4.54 +static inline int sh_l1e_is_mmio(shadow_l1e_t sl1e) 
    4.55 +{
    4.56 +    return ((sl1e.l1 & SH_L1E_MMIO_MAGIC_MASK) == SH_L1E_MMIO_MAGIC);
    4.57 +}
    4.58 +
    4.59 +static inline gfn_t sh_l1e_mmio_get_gfn(shadow_l1e_t sl1e) 
    4.60 +{
    4.61 +    return _gfn((sl1e.l1 & SH_L1E_MMIO_GFN_MASK) >> SH_L1E_MMIO_GFN_SHIFT);
    4.62 +}
    4.63 +
    4.64 +static inline u32 sh_l1e_mmio_get_flags(shadow_l1e_t sl1e) 
    4.65 +{
    4.66 +    return (u32)((sl1e.l1 & (_PAGE_USER|_PAGE_RW)));
    4.67 +}
    4.68 +
    4.69 +#else
    4.70 +
    4.71 +#define sh_l1e_gnp() shadow_l1e_empty()
    4.72 +#define sh_l1e_mmio(_gfn, _flags) shadow_l1e_empty()
    4.73 +#define sh_l1e_is_magic(_e) (0)
    4.74 +
    4.75 +#endif /* SHOPT_FAST_FAULT_PATH */
    4.76 +
    4.77 +
    4.78  #endif /* _XEN_SHADOW_TYPES_H */
    4.79  
    4.80  /*
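The sh_l1e_mmio helpers added to types.h above pack the guest frame number plus the U/S and R/W flags into an entry whose high word is all-ones, so the hardware raises a reserved-bit fault and the handler can recover the MMIO address without walking the guest tables; bit 3 stays clear to keep it distinct from the all-ones guest-not-present marker. An editor's standalone round-trip sketch (not part of the changeset) using the same constants; the _PAGE_RW/_PAGE_USER values are the standard x86 bits, assumed here:

    #include <stdio.h>
    #include <stdint.h>

    #define _PAGE_RW                0x002ULL  /* standard x86 PTE bits */
    #define _PAGE_USER              0x004ULL

    #define SH_L1E_MMIO_MAGIC       0xffffffff00000001ULL
    #define SH_L1E_MMIO_MAGIC_MASK  0xffffffff00000009ULL
    #define SH_L1E_MMIO_GFN_MASK    0x00000000fffffff0ULL
    #define SH_L1E_MMIO_GFN_SHIFT   4

    /* Build a magic MMIO entry: reserved high bits force a #PF, and the
     * guest's gfn is stored in bits 4..31. */
    static uint64_t mmio_entry(uint64_t gfn, uint64_t gflags)
    {
        return SH_L1E_MMIO_MAGIC
             | (gfn << SH_L1E_MMIO_GFN_SHIFT)
             | (gflags & (_PAGE_USER | _PAGE_RW));
    }

    static int is_mmio(uint64_t e)
    {
        return (e & SH_L1E_MMIO_MAGIC_MASK) == SH_L1E_MMIO_MAGIC;
    }

    static uint64_t mmio_gfn(uint64_t e)
    {
        return (e & SH_L1E_MMIO_GFN_MASK) >> SH_L1E_MMIO_GFN_SHIFT;
    }

    int main(void)
    {
        uint64_t e = mmio_entry(0xfee00ULL, _PAGE_RW | _PAGE_USER);
        printf("is_mmio=%d gfn=%#llx gnp_is_mmio=%d\n",
               is_mmio(e), (unsigned long long)mmio_gfn(e),
               is_mmio(~0ULL));   /* the all-ones GNP marker is not MMIO */
        return 0;
    }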
     5.1 --- a/xen/arch/x86/x86_32/seg_fixup.c	Wed Nov 01 10:40:46 2006 +0000
     5.2 +++ b/xen/arch/x86/x86_32/seg_fixup.c	Wed Nov 01 10:41:44 2006 +0000
     5.3 @@ -296,8 +296,8 @@ int gpf_emulate_4gb(struct cpu_user_regs
     5.4          if ( get_user(b, pb) )
     5.5          {
     5.6              dprintk(XENLOG_DEBUG,
     5.7 -                    "Fault while accessing byte %d of instruction\n",
     5.8 -                    pb-eip);
     5.9 +                    "Fault while accessing byte %ld of instruction\n",
    5.10 +                    (long)(pb-eip));
    5.11              goto page_fault;
    5.12          }
    5.13  
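The seg_fixup.c hunk is a printf-format fix: the difference of two byte pointers is not an int, so it cannot be printed with %d on 64-bit builds. The patch casts to long and uses %ld; C99's %td with ptrdiff_t is the other common spelling. A tiny editor's sketch (not part of the changeset):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        unsigned char insn[16] = { 0 };
        unsigned char *eip = insn, *pb = insn + 3;

        /* Pointer difference: cast to long for %ld (as the patch does) ... */
        printf("Fault while accessing byte %ld of instruction\n",
               (long)(pb - eip));
        /* ... or use ptrdiff_t with the dedicated %td conversion. */
        printf("Fault while accessing byte %td of instruction\n",
               (ptrdiff_t)(pb - eip));
        return 0;
    }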
     6.1 --- a/xen/include/asm-x86/perfc_defn.h	Wed Nov 01 10:40:46 2006 +0000
     6.2 +++ b/xen/include/asm-x86/perfc_defn.h	Wed Nov 01 10:41:44 2006 +0000
     6.3 @@ -43,6 +43,9 @@ PERFCOUNTER_CPU(shadow_linear_map_failed
     6.4  PERFCOUNTER_CPU(shadow_a_update,       "shadow A bit update")
     6.5  PERFCOUNTER_CPU(shadow_ad_update,      "shadow A&D bit update")
     6.6  PERFCOUNTER_CPU(shadow_fault,          "calls to shadow_fault")
     6.7 +PERFCOUNTER_CPU(shadow_fault_fast_gnp, "shadow_fault fast path n/p")
     6.8 +PERFCOUNTER_CPU(shadow_fault_fast_mmio, "shadow_fault fast path mmio")
     6.9 +PERFCOUNTER_CPU(shadow_fault_fast_fail, "shadow_fault fast path error")
    6.10  PERFCOUNTER_CPU(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn")
    6.11  PERFCOUNTER_CPU(shadow_fault_bail_not_present, 
    6.12                                          "shadow_fault guest not-present")
     7.1 --- a/xen/include/asm-x86/shadow.h	Wed Nov 01 10:40:46 2006 +0000
     7.2 +++ b/xen/include/asm-x86/shadow.h	Wed Nov 01 10:41:44 2006 +0000
     7.3 @@ -161,8 +161,10 @@ extern int shadow_audit_enable;
     7.4   */
     7.5  #define SHOPT_WRITABLE_HEURISTIC  0x01  /* Guess at RW PTEs via linear maps */
     7.6  #define SHOPT_EARLY_UNSHADOW      0x02  /* Unshadow l1s on fork or exit */
     7.7 +#define SHOPT_FAST_FAULT_PATH     0x04  /* Fast-path MMIO and not-present */
     7.8 +#define SHOPT_PREFETCH            0x08  /* Shadow multiple entries per fault */
     7.9  
    7.10 -#define SHADOW_OPTIMIZATIONS      0x03
    7.11 +#define SHADOW_OPTIMIZATIONS      0x0f
    7.12  
    7.13  
    7.14  /* With shadow pagetables, the different kinds of address start