ia64/xen-unstable

changeset 4160:7a281bd1e0f5

bitkeeper revision 1.1236.34.3 (4237063cE2rat5RdEGCsTzuaC6XCcA)

Tidy the x86 emulator interface, and use it from within the
writable pagetable algorithm to deal with otherwise unhandleable cases.
For example: an L1 page table mapped at multiple L2 slots; an L1 that
maps itself; an L1 that also maps the code making the update, or the
kernel stack.
This provides a proof-of-concept for the emulator, which the VMX code
can later pick up to improve the device-model emulation.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Mar 15 15:58:52 2005 +0000 (2005-03-15)
parents 5c5ca35a900a
children fc1e5218f616
files tools/tests/test_x86_emulator.c xen/arch/x86/mm.c xen/arch/x86/x86_emulate.c xen/include/asm-x86/mm.h xen/include/asm-x86/x86_emulate.h xen/include/xen/perfc_defn.h
line diff
     1.1 --- a/tools/tests/test_x86_emulator.c	Tue Mar 15 14:31:42 2005 +0000
     1.2 +++ b/tools/tests/test_x86_emulator.c	Tue Mar 15 15:58:52 2005 +0000
     1.3 @@ -26,7 +26,7 @@ static int read_any(
     1.4      case 4: *val = *(u32 *)addr; break;
     1.5      case 8: *val = *(unsigned long *)addr; break;
     1.6      }
     1.7 -    return 0;
     1.8 +    return X86EMUL_CONTINUE;
     1.9  }
    1.10  
    1.11  static int write_any(
    1.12 @@ -41,17 +41,15 @@ static int write_any(
    1.13      case 4: *(u32 *)addr = (u32)val; break;
    1.14      case 8: *(unsigned long *)addr = val; break;
    1.15      }
    1.16 -    return 0;
    1.17 +    return X86EMUL_CONTINUE;
    1.18  }
    1.19  
    1.20  static int cmpxchg_any(
    1.21      unsigned long addr,
    1.22      unsigned long old,
    1.23      unsigned long new,
    1.24 -    unsigned long *seen,
    1.25      unsigned int bytes)
    1.26  {
    1.27 -    *seen = old;
    1.28      switch ( bytes )
    1.29      {
    1.30      case 1: *(u8 *)addr = (u8)new; break;
    1.31 @@ -59,7 +57,7 @@ static int cmpxchg_any(
    1.32      case 4: *(u32 *)addr = (u32)new; break;
    1.33      case 8: *(unsigned long *)addr = new; break;
    1.34      }
    1.35 -    return 0;
    1.36 +    return X86EMUL_CONTINUE;
    1.37  }
    1.38  
    1.39  static struct x86_mem_emulator emulops = {
     2.1 --- a/xen/arch/x86/mm.c	Tue Mar 15 14:31:42 2005 +0000
     2.2 +++ b/xen/arch/x86/mm.c	Tue Mar 15 15:58:52 2005 +0000
     2.3 @@ -101,6 +101,7 @@
     2.4  #include <asm/uaccess.h>
     2.5  #include <asm/domain_page.h>
     2.6  #include <asm/ldt.h>
     2.7 +#include <asm/x86_emulate.h>
     2.8  
     2.9  #ifdef VERBOSE
    2.10  #define MEM_LOG(_f, _a...)                           \
    2.11 @@ -265,8 +266,7 @@ int map_ldt_shadow_page(unsigned int off
    2.12  #define TOGGLE_MODE() ((void)0)
    2.13  #endif
    2.14  
    2.15 -    if ( unlikely(in_irq()) )
    2.16 -        BUG();
    2.17 +    BUG_ON(unlikely(in_irq()));
    2.18  
    2.19      TOGGLE_MODE();
    2.20      __get_user(l1e, (unsigned long *)
    2.21 @@ -1939,12 +1939,13 @@ void update_shadow_va_mapping(unsigned l
    2.22          &shadow_linear_pg_table[l1_linear_offset(va)])))) )
    2.23      {
    2.24          /*
    2.25 -         * Since L2's are guranteed RW, failure indicates either that the
    2.26 +         * Since L2's are guaranteed RW, failure indicates either that the
    2.27           * page was not shadowed, or that the L2 entry has not yet been
    2.28           * updated to reflect the shadow.
    2.29           */
    2.30 -        if ( shadow_mode_external(current->domain) )
    2.31 -            BUG(); // can't use linear_l2_table with external tables.
    2.32 +
    2.33 +        /* Can't use linear_l2_table with external tables. */
    2.34 +        BUG_ON(shadow_mode_external(current->domain));
    2.35  
    2.36          l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)];
    2.37          unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
    2.38 @@ -2294,9 +2295,7 @@ void ptwr_flush(const int which)
    2.39      int            i, cpu = smp_processor_id();
    2.40      struct exec_domain *ed = current;
    2.41      struct domain *d = ed->domain;
    2.42 -#ifdef PERF_COUNTERS
    2.43      unsigned int   modified = 0;
    2.44 -#endif
    2.45  
    2.46      l1va = ptwr_info[cpu].ptinfo[which].l1va;
    2.47      ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
    2.48 @@ -2344,11 +2343,7 @@ void ptwr_flush(const int which)
    2.49  
    2.50      /* Ensure that there are no stale writable mappings in any TLB. */
    2.51      /* NB. INVLPG is a serialising instruction: flushes pending updates. */
    2.52 -#if 1
    2.53      __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
    2.54 -#else
    2.55 -    flush_tlb_all();
    2.56 -#endif
    2.57      PTWR_PRINTK("[%c] disconnected_l1va at %p now %p\n",
    2.58                  PTWR_PRINT_WHICH, ptep, pte);
    2.59  
    2.60 @@ -2365,10 +2360,8 @@ void ptwr_flush(const int which)
    2.61          if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
    2.62              continue;
    2.63  
    2.64 -#ifdef PERF_COUNTERS
    2.65          /* Update number of entries modified. */
    2.66          modified++;
    2.67 -#endif
    2.68  
    2.69          /*
    2.70           * Fast path for PTEs that have merely been write-protected
    2.71 @@ -2411,6 +2404,8 @@ void ptwr_flush(const int which)
    2.72      unmap_domain_mem(pl1e);
    2.73  
    2.74      perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
    2.75 +    ptwr_info[cpu].ptinfo[which].prev_exec_domain = ed;
    2.76 +    ptwr_info[cpu].ptinfo[which].prev_nr_updates  = modified;
    2.77  
    2.78      /*
    2.79       * STEP 3. Reattach the L1 p.t. page into the current address space.
    2.80 @@ -2435,6 +2430,133 @@ void ptwr_flush(const int which)
    2.81      }
    2.82  }
    2.83  
    2.84 +static int ptwr_emulated_update(
    2.85 +    unsigned long addr,
    2.86 +    unsigned long old,
    2.87 +    unsigned long val,
    2.88 +    unsigned int bytes,
    2.89 +    unsigned int do_cmpxchg)
    2.90 +{
    2.91 +    unsigned long sstat, pte, pfn;
    2.92 +    struct pfn_info *page;
    2.93 +    l1_pgentry_t ol1e, nl1e, *pl1e, *sl1e;
    2.94 +    struct domain *d = current->domain;
    2.95 +
    2.96 +    /* Aligned access only, thank you. */
    2.97 +    if ( (addr & (bytes-1)) != 0 )
    2.98 +    {
    2.99 +        MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %p)\n",
   2.100 +                bytes, addr);
   2.101 +        return X86EMUL_UNHANDLEABLE;
   2.102 +    }
   2.103 +
   2.104 +    /* Turn a sub-word access into a full-word access. */
   2.105 +    if ( (addr & ((BITS_PER_LONG/8)-1)) != 0 )
   2.106 +    {
   2.107 +        int           rc;
   2.108 +        unsigned long full;
   2.109 +        unsigned int  mask = addr & ((BITS_PER_LONG/8)-1);
   2.110 +        /* Align address; read full word. */
   2.111 +        addr &= ~((BITS_PER_LONG/8)-1);
   2.112 +        if ( (rc = x86_emulate_read_std(addr, &full, BITS_PER_LONG/8)) )
   2.113 +            return rc;
   2.114 +        /* Mask out bits provided by caller. */
   2.115 +        full &= ~((1UL << (bytes*8)) - 1UL) << (mask*8);
   2.116 +        /* Shift the caller value and OR in the missing bits. */
   2.117 +        val  &= (1UL << (bytes*8)) - 1UL;
   2.118 +        val <<= mask*8;
   2.119 +        val  |= full;
   2.120 +    }
   2.121 +
   2.122 +    /* Read the PTE that maps the page being updated. */
   2.123 +    if ( __get_user(pte, (unsigned long *)
   2.124 +                    &linear_pg_table[l1_linear_offset(addr)]) )
   2.125 +    {
   2.126 +        MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table\n");
   2.127 +        return X86EMUL_UNHANDLEABLE;
   2.128 +    }
   2.129 +
   2.130 +    pfn  = pte >> PAGE_SHIFT;
   2.131 +    page = &frame_table[pfn];
   2.132 +
   2.133 +    /* We are looking only for read-only mappings of p.t. pages. */
   2.134 +    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
   2.135 +         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
   2.136 +    {
   2.137 +        MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%p, %x)\n",
   2.138 +                pte, page->u.inuse.type_info);
   2.139 +        return X86EMUL_UNHANDLEABLE;
   2.140 +    }
   2.141 +
   2.142 +    /* Check the new PTE. */
   2.143 +    nl1e = mk_l1_pgentry(val);
   2.144 +    if ( unlikely(!get_page_from_l1e(nl1e, d)) )
   2.145 +        return X86EMUL_UNHANDLEABLE;
   2.146 +
   2.147 +    /* Checked successfully: do the update (write or cmpxchg). */
   2.148 +    pl1e = map_domain_mem(page_to_phys(page) + (addr & ~PAGE_MASK));
   2.149 +    if ( do_cmpxchg )
   2.150 +    {
   2.151 +        ol1e = mk_l1_pgentry(old);
   2.152 +        if ( cmpxchg((unsigned long *)pl1e, old, val) != old )
   2.153 +        {
   2.154 +            unmap_domain_mem(pl1e);
   2.155 +            return X86EMUL_CMPXCHG_FAILED;
   2.156 +        }
   2.157 +    }
   2.158 +    else
   2.159 +    {
   2.160 +        ol1e  = *pl1e;
   2.161 +        *pl1e = nl1e;
   2.162 +    }
   2.163 +    unmap_domain_mem(pl1e);
   2.164 +
   2.165 +    /* Propagate update to shadow cache. */
   2.166 +    if ( unlikely(shadow_mode_enabled(d)) )
   2.167 +    {
   2.168 +        sstat = get_shadow_status(d, page_to_pfn(page));
   2.169 +        if ( sstat & PSH_shadowed )
   2.170 +        {
   2.171 +            sl1e = map_domain_mem(
   2.172 +                ((sstat & PSH_pfn_mask) << PAGE_SHIFT) + (addr & ~PAGE_MASK));
   2.173 +            l1pte_propagate_from_guest(
   2.174 +                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(*sl1e));
   2.175 +            unmap_domain_mem(sl1e);
   2.176 +        }
   2.177 +    }
   2.178 +
   2.179 +    /* Finally, drop the old PTE. */
   2.180 +    if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
   2.181 +        put_page_from_l1e(ol1e, d);
   2.182 +
   2.183 +    return X86EMUL_CONTINUE;
   2.184 +}
   2.185 +
   2.186 +static int ptwr_emulated_write(
   2.187 +    unsigned long addr,
   2.188 +    unsigned long val,
   2.189 +    unsigned int bytes)
   2.190 +{
   2.191 +    return ptwr_emulated_update(addr, 0, val, bytes, 0);
   2.192 +}
   2.193 +
   2.194 +static int ptwr_emulated_cmpxchg(
   2.195 +    unsigned long addr,
   2.196 +    unsigned long old,
   2.197 +    unsigned long new,
   2.198 +    unsigned int bytes)
   2.199 +{
   2.200 +    return ptwr_emulated_update(addr, old, new, bytes, 1);
   2.201 +}
   2.202 +
   2.203 +static struct x86_mem_emulator ptwr_mem_emulator = {
   2.204 +    .read_std         = x86_emulate_read_std,
   2.205 +    .write_std        = x86_emulate_write_std,
   2.206 +    .read_emulated    = x86_emulate_read_std,
   2.207 +    .write_emulated   = ptwr_emulated_write,
   2.208 +    .cmpxchg_emulated = ptwr_emulated_cmpxchg
   2.209 +};
   2.210 +
   2.211  /* Write page fault handler: check if guest is trying to modify a PTE. */
   2.212  int ptwr_do_page_fault(unsigned long addr)
   2.213  {
   2.214 @@ -2448,13 +2570,13 @@ int ptwr_do_page_fault(unsigned long add
   2.215      return 0; /* Writable pagetables need fixing for x86_64. */
   2.216  #endif
   2.217  
   2.218 +    /* Can't use linear_l2_table with external tables. */
   2.219 +    BUG_ON(shadow_mode_external(current->domain));
   2.220 +
   2.221      /*
   2.222       * Attempt to read the PTE that maps the VA being accessed. By checking for
   2.223       * PDE validity in the L2 we avoid many expensive fixups in __get_user().
   2.224       */
   2.225 -    if ( shadow_mode_external(current->domain) )
   2.226 -        BUG(); // can't use linear_l2_table with external tables.
   2.227 -
   2.228      if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
   2.229             _PAGE_PRESENT) ||
   2.230           __get_user(pte, (unsigned long *)
   2.231 @@ -2472,47 +2594,35 @@ int ptwr_do_page_fault(unsigned long add
   2.232      {
   2.233          return 0;
   2.234      }
   2.235 -    
   2.236 +
   2.237      /* Get the L2 index at which this L1 p.t. is always mapped. */
   2.238      l2_idx = page->u.inuse.type_info & PGT_va_mask;
   2.239      if ( unlikely(l2_idx >= PGT_va_unknown) )
   2.240 -    {
   2.241 -        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
   2.242 -    }
   2.243 +        goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */
   2.244      l2_idx >>= PGT_va_shift;
   2.245  
   2.246 -    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
   2.247 -    {
   2.248 -        MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
   2.249 -        domain_crash();
   2.250 -    }
   2.251 +    if ( unlikely(l2_idx == (addr >> L2_PAGETABLE_SHIFT)) )
   2.252 +        goto emulate; /* Urk! Pagetable maps itself! */
   2.253  
   2.254      /*
   2.255       * Is the L1 p.t. mapped into the current address space? If so we call it
   2.256       * an ACTIVE p.t., otherwise it is INACTIVE.
   2.257       */
   2.258 -    if ( shadow_mode_external(current->domain) )
   2.259 -        BUG(); // can't use linear_l2_table with external tables.
   2.260 -
   2.261      pl2e = &linear_l2_table[l2_idx];
   2.262      l2e  = l2_pgentry_val(*pl2e);
   2.263      which = PTWR_PT_INACTIVE;
   2.264      if ( (l2e >> PAGE_SHIFT) == pfn )
   2.265      {
   2.266 -        /* Check the PRESENT bit to set ACTIVE. */
   2.267 -        if ( likely(l2e & _PAGE_PRESENT) )
   2.268 +        /*
   2.269 +         * Check the PRESENT bit to set ACTIVE mode.
   2.270 +         * If the PRESENT bit is clear, we may be conflicting with the current 
   2.271 +         * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
   2.272 +         * The ptwr_flush call below will restore the PRESENT bit.
   2.273 +         */
   2.274 +        if ( likely(l2e & _PAGE_PRESENT) ||
   2.275 +             (ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
   2.276 +              (l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx)) )
   2.277              which = PTWR_PT_ACTIVE;
   2.278 -        else {
   2.279 -            /*
   2.280 -             * If the PRESENT bit is clear, we may be conflicting with
   2.281 -             * the current ACTIVE p.t. (it may be the same p.t. mapped
   2.282 -             * at another virt addr).
   2.283 -             * The ptwr_flush call below will restore the PRESENT bit.
   2.284 -             */
   2.285 -            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
   2.286 -                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
   2.287 -                which = PTWR_PT_ACTIVE;
   2.288 -        }
   2.289      }
   2.290      
   2.291      PTWR_PRINTK("[%c] page_fault on l1 pt at va %p, pt for %08x, "
   2.292 @@ -2526,6 +2636,18 @@ int ptwr_do_page_fault(unsigned long add
   2.293      if ( ptwr_info[cpu].ptinfo[which].l1va )
   2.294          ptwr_flush(which);
   2.295  
   2.296 +    /*
   2.297 +     * If last batch made no updates then we are probably stuck. Emulate this 
   2.298 +     * update to ensure we make progress.
   2.299 +     */
   2.300 +    if ( (ptwr_info[cpu].ptinfo[which].prev_exec_domain == current) &&
   2.301 +         (ptwr_info[cpu].ptinfo[which].prev_nr_updates  == 0) )
   2.302 +    {
   2.303 +        /* Force non-emul next time, or we can get stuck emulating forever. */
   2.304 +        ptwr_info[cpu].ptinfo[which].prev_exec_domain = NULL;
   2.305 +        goto emulate;
   2.306 +    }
   2.307 +
   2.308      ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
   2.309      ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
   2.310      
   2.311 @@ -2534,11 +2656,7 @@ int ptwr_do_page_fault(unsigned long add
   2.312           likely(!shadow_mode_enabled(current->domain)) )
   2.313      {
   2.314          *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
   2.315 -#if 1
   2.316          flush_tlb(); /* XXX Multi-CPU guests? */
   2.317 -#else
   2.318 -        flush_tlb_all();
   2.319 -#endif
   2.320      }
   2.321      
   2.322      /* Temporarily map the L1 page, and make a copy of it. */
   2.323 @@ -2563,6 +2681,13 @@ int ptwr_do_page_fault(unsigned long add
   2.324      }
   2.325      
   2.326      return EXCRET_fault_fixed;
   2.327 +
   2.328 + emulate:
   2.329 +    if ( x86_emulate_memop(get_execution_context(), addr,
   2.330 +                           &ptwr_mem_emulator, BITS_PER_LONG/8) )
   2.331 +        return 0;
   2.332 +    perfc_incrc(ptwr_emulations);
   2.333 +    return EXCRET_fault_fixed;
   2.334  }
   2.335  
   2.336  static __init int ptwr_init(void)
   2.337 @@ -2762,8 +2887,7 @@ void audit_domain(struct domain *d)
   2.338          pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
   2.339          page = &frame_table[pfn];
   2.340  
   2.341 -        if ( page_get_owner(page) != d )
   2.342 -            BUG();
   2.343 +        BUG_ON(page_get_owner(page) != d);
   2.344  
   2.345          if ( (page->u.inuse.type_info & PGT_count_mask) >
   2.346               (page->count_info & PGC_count_mask) )
   2.347 @@ -2809,8 +2933,7 @@ void audit_domain(struct domain *d)
   2.348          pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
   2.349          page = &frame_table[pfn];
   2.350  
   2.351 -        if ( page_get_owner(page) != d )
   2.352 -            BUG();
   2.353 +        BUG_ON(page_get_owner(page) != d);
   2.354  
   2.355          switch ( page->u.inuse.type_info & PGT_type_mask )
   2.356          {
   2.357 @@ -3060,7 +3183,10 @@ void audit_domain(struct domain *d)
   2.358              d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
   2.359  
   2.360      spin_unlock(&d->page_alloc_lock);
   2.361 -    printk("Audit %d: Done. ref=%d xenpages=%d pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, atomic_read(&d->refcnt), d->xenheap_pages, d->tot_pages, l1, l2, ctot, ttot );
   2.362 +    printk("Audit %d: Done. ref=%d xenpages=%d pages=%d l1=%d"
   2.363 +           " l2=%d ctot=%d ttot=%d\n", 
   2.364 +           d->id, atomic_read(&d->refcnt), d->xenheap_pages, d->tot_pages,
   2.365 +           l1, l2, ctot, ttot );
   2.366  
   2.367      if ( d != current->domain )
   2.368          domain_unpause(d);
     3.1 --- a/xen/arch/x86/x86_emulate.c	Tue Mar 15 14:31:42 2005 +0000
     3.2 +++ b/xen/arch/x86/x86_emulate.c	Tue Mar 15 15:58:52 2005 +0000
     3.3 @@ -363,7 +363,7 @@ do{ __asm__ __volatile__ (              
     3.4  /* Fetch next part of the instruction being emulated. */
     3.5  #define insn_fetch(_type, _size, _eip) \
     3.6  ({ unsigned long _x; \
     3.7 -   if ( ops->read_std((unsigned long)(_eip), &_x, (_size)) ) \
     3.8 +   if ( (rc = ops->read_std((unsigned long)(_eip), &_x, (_size))) != 0 ) \
     3.9         goto done; \
    3.10     (_eip) += (_size); \
    3.11     (_type)_x; \
    3.12 @@ -422,6 +422,7 @@ x86_emulate_memop(
    3.13      u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
    3.14      unsigned int op_bytes = (mode == 8) ? 4 : mode, ad_bytes = mode;
    3.15      unsigned int lock_prefix = 0, rep_prefix = 0, i;
    3.16 +    int rc = 0;
    3.17      struct operand src, dst;
    3.18  
    3.19      /* Shadow copy of register state. Committed on successful emulation. */
    3.20 @@ -556,7 +557,8 @@ x86_emulate_memop(
    3.21          dst.ptr   = (unsigned long *)cr2;
    3.22          dst.bytes = (d & ByteOp) ? 1 : op_bytes;
    3.23          if ( !(d & Mov) && /* optimisation - avoid slow emulated read */
    3.24 -             ops->read_emulated((unsigned long)dst.ptr, &dst.val, dst.bytes) )
    3.25 +             ((rc = ops->read_emulated((unsigned long)dst.ptr,
    3.26 +                                       &dst.val, dst.bytes)) != 0) )
    3.27               goto done;
    3.28          break;
    3.29      }
    3.30 @@ -590,7 +592,8 @@ x86_emulate_memop(
    3.31          src.type  = OP_MEM;
    3.32          src.ptr   = (unsigned long *)cr2;
    3.33          src.bytes = (d & ByteOp) ? 1 : op_bytes;
    3.34 -        if ( ops->read_emulated((unsigned long)src.ptr, &src.val, src.bytes) )
    3.35 +        if ( (rc = ops->read_emulated((unsigned long)src.ptr, 
    3.36 +                                      &src.val, src.bytes)) != 0 )
    3.37              goto done;
    3.38          src.orig_val = src.val;
    3.39          break;
    3.40 @@ -664,6 +667,7 @@ x86_emulate_memop(
    3.41          src.val ^= dst.val;
    3.42          dst.val ^= src.val;
    3.43          src.val ^= dst.val;
    3.44 +        lock_prefix = 1;
    3.45          break;
    3.46      case 0xa0 ... 0xa1: /* mov */
    3.47          dst.ptr = (unsigned long *)&_regs.eax;
    3.48 @@ -682,7 +686,7 @@ x86_emulate_memop(
    3.49          /* 64-bit mode: POP defaults to 64-bit operands. */
    3.50          if ( (mode == 8) && (dst.bytes == 4) )
    3.51              dst.bytes = 8;
    3.52 -        if ( ops->read_std(_regs.esp, &dst.val, dst.bytes) )
    3.53 +        if ( (rc = ops->read_std(_regs.esp, &dst.val, dst.bytes)) != 0 )
    3.54              goto done;
    3.55          _regs.esp += dst.bytes;
    3.56          break;
    3.57 @@ -759,11 +763,12 @@ x86_emulate_memop(
    3.58              if ( (mode == 8) && (dst.bytes == 4) )
    3.59              {
    3.60                  dst.bytes = 8;
    3.61 -                if ( ops->read_std((unsigned long)dst.ptr, &dst.val, 8) )
    3.62 +                if ( (rc = ops->read_std((unsigned long)dst.ptr,
    3.63 +                                         &dst.val, 8)) != 0 )
    3.64                      goto done;
    3.65              }
    3.66              _regs.esp -= dst.bytes;
    3.67 -            if ( ops->write_std(_regs.esp, dst.val, dst.bytes) )
    3.68 +            if ( (rc = ops->write_std(_regs.esp, dst.val, dst.bytes)) != 0 )
    3.69                  goto done;
    3.70              dst.val = dst.orig_val; /* skanky: disable writeback */
    3.71              break;
    3.72 @@ -790,22 +795,13 @@ x86_emulate_memop(
    3.73              break;
    3.74          case OP_MEM:
    3.75              if ( lock_prefix )
    3.76 -            {
    3.77 -                unsigned long seen;
    3.78 -                if ( ops->cmpxchg_emulated((unsigned long)dst.ptr,
    3.79 -                                           dst.orig_val, dst.val,
    3.80 -                                           &seen, dst.bytes) )
    3.81 -                    goto done;
    3.82 -                if ( seen != dst.orig_val )
    3.83 -                    goto done; /* Try again... */
    3.84 -            }
    3.85 +                rc = ops->cmpxchg_emulated(
    3.86 +                    (unsigned long)dst.ptr, dst.orig_val, dst.val, dst.bytes);
    3.87              else
    3.88 -            {
    3.89 -                if ( ops->write_emulated((unsigned long)dst.ptr,
    3.90 -                                         dst.val, dst.bytes) )
    3.91 -                    goto done;
    3.92 -            }
    3.93 -            break;
    3.94 +                rc = ops->write_emulated(
    3.95 +                    (unsigned long)dst.ptr, dst.val, dst.bytes);
    3.96 +            if ( rc != 0 )
    3.97 +                goto done;
    3.98          default:
    3.99              break;
   3.100          }
   3.101 @@ -815,7 +811,7 @@ x86_emulate_memop(
   3.102      *regs = _regs;
   3.103  
   3.104   done:
   3.105 -    return 0;
   3.106 +    return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
   3.107  
   3.108   special_insn:
   3.109      if ( twobyte )
   3.110 @@ -839,15 +835,15 @@ x86_emulate_memop(
   3.111          {
   3.112              /* Write fault: destination is special memory. */
   3.113              dst.ptr = (unsigned long *)cr2;
   3.114 -            if ( ops->read_std(_regs.esi - _regs.edi + cr2, 
   3.115 -                               &dst.val, dst.bytes) )
   3.116 +            if ( (rc = ops->read_std(_regs.esi - _regs.edi + cr2, 
   3.117 +                                     &dst.val, dst.bytes)) != 0 )
   3.118                  goto done;
   3.119          }
   3.120          else
   3.121          {
   3.122              /* Read fault: source is special memory. */
   3.123              dst.ptr = (unsigned long *)(_regs.edi - _regs.esi + cr2);
   3.124 -            if ( ops->read_emulated(cr2, &dst.val, dst.bytes) )
   3.125 +            if ( (rc = ops->read_emulated(cr2, &dst.val, dst.bytes)) != 0 )
   3.126                  goto done;
   3.127          }
   3.128          _regs.esi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes;
   3.129 @@ -867,7 +863,7 @@ x86_emulate_memop(
   3.130          dst.type  = OP_REG;
   3.131          dst.bytes = (d & ByteOp) ? 1 : op_bytes;
   3.132          dst.ptr   = (unsigned long *)&_regs.eax;
   3.133 -        if ( ops->read_emulated(cr2, &dst.val, dst.bytes) )
   3.134 +        if ( (rc = ops->read_emulated(cr2, &dst.val, dst.bytes)) != 0 )
   3.135              goto done;
   3.136          _regs.esi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes;
   3.137          break;
   3.138 @@ -971,3 +967,39 @@ x86_emulate_memop(
   3.139      DPRINTF("Cannot emulate %02x\n", b);
   3.140      return -1;
   3.141  }
   3.142 +
   3.143 +#ifndef __TEST_HARNESS__
   3.144 +
   3.145 +#include <asm/mm.h>
   3.146 +#include <asm/uaccess.h>
   3.147 +
   3.148 +int
   3.149 +x86_emulate_read_std(
   3.150 +    unsigned long addr,
   3.151 +    unsigned long *val,
   3.152 +    unsigned int bytes)
   3.153 +{
   3.154 +    *val = 0;
   3.155 +    if ( copy_from_user((void *)val, (void *)addr, bytes) )
   3.156 +    {
   3.157 +        propagate_page_fault(addr, 4); /* user mode, read fault */
   3.158 +        return X86EMUL_PROPAGATE_FAULT;
   3.159 +    }
   3.160 +    return X86EMUL_CONTINUE;
   3.161 +}
   3.162 +
   3.163 +int
   3.164 +x86_emulate_write_std(
   3.165 +    unsigned long addr,
   3.166 +    unsigned long val,
   3.167 +    unsigned int bytes)
   3.168 +{
   3.169 +    if ( copy_to_user((void *)addr, (void *)&val, bytes) )
   3.170 +    {
   3.171 +        propagate_page_fault(addr, 6); /* user mode, write fault */
   3.172 +        return X86EMUL_PROPAGATE_FAULT;
   3.173 +    }
   3.174 +    return X86EMUL_CONTINUE;
   3.175 +}
   3.176 +
   3.177 +#endif
     4.1 --- a/xen/include/asm-x86/mm.h	Tue Mar 15 14:31:42 2005 +0000
     4.2 +++ b/xen/include/asm-x86/mm.h	Tue Mar 15 15:58:52 2005 +0000
     4.3 @@ -289,6 +289,9 @@ typedef struct {
     4.4      l1_pgentry_t *pl1e;
     4.5      /* Index in L2 page table where this L1 p.t. is always hooked. */
     4.6      unsigned int l2_idx; /* NB. Only used for PTWR_PT_ACTIVE. */
     4.7 +    /* Info about last ptwr update batch. */
     4.8 +    struct exec_domain *prev_exec_domain; /* domain making the update */
     4.9 +    unsigned int        prev_nr_updates;  /* size of update batch */
    4.10  } ptwr_ptinfo_t;
    4.11  
    4.12  typedef struct {
     5.1 --- a/xen/include/asm-x86/x86_emulate.h	Tue Mar 15 14:31:42 2005 +0000
     5.2 +++ b/xen/include/asm-x86/x86_emulate.h	Tue Mar 15 15:58:52 2005 +0000
     5.3 @@ -32,9 +32,17 @@
     5.4   *  2. If the access fails (cannot emulate, or a standard access faults) then
     5.5   *     it is up to the memop to propagate the fault to the guest VM via
     5.6   *     some out-of-band mechanism, unknown to the emulator. The memop signals
     5.7 - *     failure by returning a non-zero value to the emulator, which will then
     5.8 - *     immediately bail.
     5.9 + *     failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
    5.10 + *     then immediately bail.
    5.11   */
    5.12 +/* Access completed successfully: continue emulation as normal. */
    5.13 +#define X86EMUL_CONTINUE        0
    5.14 +/* Access is unhandleable: bail from emulation and return error to caller. */
    5.15 +#define X86EMUL_UNHANDLEABLE    1
    5.16 +/* Terminate emulation but return success to the caller. */
    5.17 +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
    5.18 +#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
    5.19 +#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
    5.20  struct x86_mem_emulator
    5.21  {
    5.22      /*
    5.23 @@ -89,17 +97,26 @@ struct x86_mem_emulator
    5.24       *  @addr:  [IN ] Linear address to access.
    5.25       *  @old:   [IN ] Value expected to be current at @addr.
    5.26       *  @new:   [IN ] Value to write to @addr.
    5.27 -     *  @seen:  [OUT] Value actually seen at @addr, zero-extended to 'u_long'.
    5.28       *  @bytes: [IN ] Number of bytes to access using CMPXCHG.
    5.29       */
    5.30      int (*cmpxchg_emulated)(
    5.31          unsigned long addr,
    5.32 -        unsigned long old, 
    5.33 +        unsigned long old,
    5.34          unsigned long new,
    5.35 -        unsigned long *seen,
    5.36          unsigned int bytes);
    5.37  };
    5.38  
    5.39 +/* Standard reader/writer functions that callers may wish to use. */
    5.40 +extern int
    5.41 +x86_emulate_read_std(
    5.42 +    unsigned long addr,
    5.43 +    unsigned long *val,
    5.44 +    unsigned int bytes);
    5.45 +extern int
    5.46 +x86_emulate_write_std(
    5.47 +    unsigned long addr,
    5.48 +    unsigned long val,
    5.49 +    unsigned int bytes);
    5.50  
    5.51  struct xen_regs;
    5.52  
     6.1 --- a/xen/include/xen/perfc_defn.h	Tue Mar 15 14:31:42 2005 +0000
     6.2 +++ b/xen/include/xen/perfc_defn.h	Tue Mar 15 15:58:52 2005 +0000
     6.3 @@ -20,6 +20,7 @@ PERFCOUNTER_CPU( calls_to_update_va, "ca
     6.4  PERFCOUNTER_CPU( page_faults, "page faults" )
     6.5  PERFCOUNTER_CPU( copy_user_faults, "copy_user faults" )
     6.6  PERFCOUNTER_CPU( map_domain_mem_count, "map_domain_mem count" )
     6.7 +PERFCOUNTER_CPU( ptwr_emulations, "writable pt emulations" )
     6.8  
     6.9  PERFCOUNTER_CPU( shadow_l2_table_count, "shadow_l2_table count" )
    6.10  PERFCOUNTER_CPU( shadow_l1_table_count, "shadow_l1_table count" )