direct-io.hg

changeset 11649:b6ee084892da

[XEN] Support lightweight shadow-translate PV guests, for paravirt-ops.
This is a modified subset of Michael Fetterman's shadow-translate work.
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <tim.deegan@xensource.com>
date Thu Sep 28 17:10:54 2006 +0100 (2006-09-28)
parents 5f42b4824e45
children f9929b7e009e
files xen/arch/x86/domain.c xen/arch/x86/mm.c xen/arch/x86/mm/shadow/common.c xen/arch/x86/mm/shadow/multi.c xen/arch/x86/mm/shadow/multi.h xen/arch/x86/mm/shadow/private.h xen/arch/x86/mm/shadow/types.h xen/arch/x86/traps.c xen/include/asm-x86/domain.h xen/include/asm-x86/guest_access.h xen/include/asm-x86/mm.h xen/include/asm-x86/shadow.h
line diff
     1.1 --- a/xen/arch/x86/domain.c	Thu Sep 28 17:09:11 2006 +0100
     1.2 +++ b/xen/arch/x86/domain.c	Thu Sep 28 17:10:54 2006 +0100
     1.3 @@ -334,8 +334,10 @@ int arch_set_info_guest(
     1.4      }
     1.5      else
     1.6      {
     1.7 -        if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
     1.8 -                                PGT_base_page_table) )
     1.9 +        if ( shadow_mode_refcounts(d)
    1.10 +             ? !get_page(mfn_to_page(cr3_pfn), d)
    1.11 +             : !get_page_and_type(mfn_to_page(cr3_pfn), d,
    1.12 +                                  PGT_base_page_table) )
    1.13          {
    1.14              destroy_gdt(v);
    1.15              return -EINVAL;
    1.16 @@ -952,7 +954,10 @@ void domain_relinquish_resources(struct 
    1.17          pfn = pagetable_get_pfn(v->arch.guest_table_user);
    1.18          if ( pfn != 0 )
    1.19          {
    1.20 -            put_page_and_type(mfn_to_page(pfn));
    1.21 +            if ( shadow_mode_refcounts(d) )
    1.22 +                put_page(mfn_to_page(pfn));
    1.23 +            else
    1.24 +                put_page_and_type(mfn_to_page(pfn));
    1.25              v->arch.guest_table_user = pagetable_null();
    1.26          }
    1.27  #endif
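
The domain.c hunks above introduce the pattern this patch applies throughout: a shadow-refcounted domain takes only a plain get_page() reference on the guest's top-level pagetable, while other domains still take the typed PGT_base_page_table reference, and the matching put_page()/put_page_and_type() is chosen the same way on teardown. Below is a minimal self-contained sketch of that conditional; the struct layouts, return conventions and main() harness are stand-ins for illustration, not the real Xen definitions.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-ins for Xen's struct page_info / struct domain. */
    struct page_info { int count; int type_count; };
    struct domain    { bool shadow_refcounts; };

    static bool shadow_mode_refcounts(struct domain *d) { return d->shadow_refcounts; }
    static bool get_page(struct page_info *pg, struct domain *d)
    { (void)d; pg->count++; return true; }
    static bool get_page_and_type(struct page_info *pg, struct domain *d, int type)
    { (void)d; (void)type; pg->count++; pg->type_count++; return true; }

    static bool take_cr3_ref(struct page_info *pg, struct domain *d, int pgt_base)
    {
        /* Shadow-refcounted domains take only a plain reference on the
         * guest's top-level pagetable; others still need the typed one,
         * mirroring the arch_set_info_guest() hunk above. */
        return shadow_mode_refcounts(d) ? get_page(pg, d)
                                        : get_page_and_type(pg, d, pgt_base);
    }

    int main(void)
    {
        struct domain d = { .shadow_refcounts = true };
        struct page_info pg = { 0, 0 };
        printf("ok=%d count=%d type_count=%d\n",
               take_cr3_ref(&pg, &d, /* PGT_base_page_table */ 1),
               pg.count, pg.type_count);
        return 0;
    }

The rough idea, as far as the patch shows, is that when shadow refcounting is enabled the shadow code tracks pagetable usage itself, so the guest's own table only needs a plain reference to stay allocated.
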
     2.1 --- a/xen/arch/x86/mm.c	Thu Sep 28 17:09:11 2006 +0100
     2.2 +++ b/xen/arch/x86/mm.c	Thu Sep 28 17:10:54 2006 +0100
     2.3 @@ -427,23 +427,11 @@ int map_ldt_shadow_page(unsigned int off
     2.4      unsigned long gmfn, mfn;
     2.5      l1_pgentry_t l1e, nl1e;
     2.6      unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT);
     2.7 -    int res;
     2.8 -
     2.9 -#if defined(__x86_64__)
    2.10 -    /* If in user mode, switch to kernel mode just to read LDT mapping. */
    2.11 -    int user_mode = !(v->arch.flags & TF_kernel_mode);
    2.12 -#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
    2.13 -#elif defined(__i386__)
    2.14 -#define TOGGLE_MODE() ((void)0)
    2.15 -#endif
    2.16 +    int okay;
    2.17  
    2.18      BUG_ON(unlikely(in_irq()));
    2.19  
    2.20 -    TOGGLE_MODE();
    2.21 -    __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
    2.22 -                     sizeof(l1e));
    2.23 -    TOGGLE_MODE();
    2.24 -
    2.25 +    guest_get_eff_kern_l1e(v, gva, &l1e);
    2.26      if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) )
    2.27          return 0;
    2.28  
    2.29 @@ -452,17 +440,17 @@ int map_ldt_shadow_page(unsigned int off
    2.30      if ( unlikely(!VALID_MFN(mfn)) )
    2.31          return 0;
    2.32  
    2.33 -    res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
    2.34 -
    2.35 -    if ( !res && unlikely(shadow_mode_refcounts(d)) )
    2.36 +    okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
    2.37 +
    2.38 +    if ( !okay && unlikely(shadow_mode_refcounts(d)) )
    2.39      {
    2.40          shadow_lock(d);
    2.41          shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
    2.42 -        res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
    2.43 +        okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
    2.44          shadow_unlock(d);
    2.45      }
    2.46  
    2.47 -    if ( unlikely(!res) )
    2.48 +    if ( unlikely(!okay) )
    2.49          return 0;
    2.50  
    2.51      nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
    2.52 @@ -1233,7 +1221,7 @@ static inline int update_l1e(l1_pgentry_
    2.53          }
    2.54      }
    2.55  #endif
    2.56 -    if ( unlikely(shadow_mode_enabled(v->domain)) )
    2.57 +    if ( unlikely(shadow_mode_enabled(v->domain)) && rv )
    2.58      {
    2.59          shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
    2.60          shadow_unlock(v->domain);    
    2.61 @@ -1252,6 +1240,9 @@ static int mod_l1_entry(l1_pgentry_t *pl
    2.62      if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
    2.63          return 0;
    2.64  
    2.65 +    if ( unlikely(shadow_mode_refcounts(d)) )
    2.66 +        return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
    2.67 +
    2.68      if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
    2.69      {
    2.70          if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
    2.71 @@ -1871,6 +1862,14 @@ static int set_foreigndom(domid_t domid)
    2.72          }
    2.73      }
    2.74  
    2.75 +    if ( unlikely(shadow_mode_translate(d)) )
    2.76 +    {
    2.77 +        MEM_LOG("%s: can not mix foreign mappings with translated domains",
    2.78 +                __func__);
    2.79 +        info->foreign = NULL;
    2.80 +        okay = 0; 
    2.81 +    }
    2.82 +
    2.83   out:
    2.84      return okay;
    2.85  }
    2.86 @@ -1902,7 +1901,7 @@ int do_mmuext_op(
    2.87  {
    2.88      struct mmuext_op op;
    2.89      int rc = 0, i = 0, okay;
    2.90 -    unsigned long mfn, type;
    2.91 +    unsigned long mfn = 0, gmfn = 0, type;
    2.92      unsigned int done = 0;
    2.93      struct page_info *page;
    2.94      struct vcpu *v = current;
    2.95 @@ -1947,7 +1946,8 @@ int do_mmuext_op(
    2.96          }
    2.97  
    2.98          okay = 1;
    2.99 -        mfn  = op.arg1.mfn;
   2.100 +        gmfn  = op.arg1.mfn;
   2.101 +        mfn = gmfn_to_mfn(FOREIGNDOM, gmfn);
   2.102          page = mfn_to_page(mfn);
   2.103  
   2.104          switch ( op.cmd )
   2.105 @@ -2022,7 +2022,6 @@ int do_mmuext_op(
   2.106              break;
   2.107  
   2.108          case MMUEXT_NEW_BASEPTR:
   2.109 -            mfn = gmfn_to_mfn(current->domain, mfn);
   2.110              okay = new_guest_cr3(mfn);
   2.111              this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
   2.112              break;
   2.113 @@ -2031,8 +2030,13 @@ int do_mmuext_op(
   2.114          case MMUEXT_NEW_USER_BASEPTR:
   2.115              okay = 1;
   2.116              if (likely(mfn != 0))
   2.117 -                okay = get_page_and_type_from_pagenr(
   2.118 -                    mfn, PGT_root_page_table, d);
   2.119 +            {
   2.120 +                if ( shadow_mode_refcounts(d) )
   2.121 +                    okay = get_page_from_pagenr(mfn, d);
   2.122 +                else
   2.123 +                    okay = get_page_and_type_from_pagenr(
   2.124 +                        mfn, PGT_root_page_table, d);
   2.125 +            }
   2.126              if ( unlikely(!okay) )
   2.127              {
   2.128                  MEM_LOG("Error while installing new mfn %lx", mfn);
   2.129 @@ -2043,7 +2047,12 @@ int do_mmuext_op(
   2.130                      pagetable_get_pfn(v->arch.guest_table_user);
   2.131                  v->arch.guest_table_user = pagetable_from_pfn(mfn);
   2.132                  if ( old_mfn != 0 )
   2.133 -                    put_page_and_type(mfn_to_page(old_mfn));
   2.134 +                {
   2.135 +                    if ( shadow_mode_refcounts(d) )
   2.136 +                        put_page(mfn_to_page(old_mfn));
   2.137 +                    else
   2.138 +                        put_page_and_type(mfn_to_page(old_mfn));
   2.139 +                }
   2.140              }
   2.141              break;
   2.142  #endif
   2.143 @@ -2504,17 +2513,26 @@ static int create_grant_va_mapping(
   2.144  {
   2.145      l1_pgentry_t *pl1e, ol1e;
   2.146      struct domain *d = v->domain;
   2.147 +    unsigned long gl1mfn;
   2.148 +    int okay;
   2.149      
   2.150      ASSERT(spin_is_locked(&d->big_lock));
   2.151  
   2.152      adjust_guest_l1e(nl1e);
   2.153  
   2.154 -    pl1e = &linear_pg_table[l1_linear_offset(va)];
   2.155 -
   2.156 -    if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
   2.157 -         !update_l1e(pl1e, ol1e, nl1e, 
   2.158 -                    l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
   2.159 +    pl1e = guest_map_l1e(v, va, &gl1mfn);
   2.160 +    if ( !pl1e )
   2.161 +    {
   2.162 +        MEM_LOG("Could not find L1 PTE for address %lx", va);
   2.163          return GNTST_general_error;
   2.164 +    }
   2.165 +    ol1e = *pl1e;
   2.166 +    okay = update_l1e(pl1e, ol1e, nl1e, gl1mfn, v);
   2.167 +    guest_unmap_l1e(v, pl1e);
   2.168 +    pl1e = NULL;
   2.169 +
   2.170 +    if ( !okay )
   2.171 +            return GNTST_general_error;
   2.172  
   2.173      if ( !shadow_mode_refcounts(d) )
   2.174          put_page_from_l1e(ol1e, d);
   2.175 @@ -2523,17 +2541,19 @@ static int create_grant_va_mapping(
   2.176  }
   2.177  
   2.178  static int destroy_grant_va_mapping(
   2.179 -    unsigned long addr, unsigned long frame, struct domain *d)
   2.180 +    unsigned long addr, unsigned long frame, struct vcpu *v)
   2.181  {
   2.182      l1_pgentry_t *pl1e, ol1e;
   2.183 +    unsigned long gl1mfn;
   2.184 +    int rc = 0;
   2.185      
   2.186 -    pl1e = &linear_pg_table[l1_linear_offset(addr)];
   2.187 -
   2.188 -    if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) )
   2.189 +    pl1e = guest_map_l1e(v, addr, &gl1mfn);
   2.190 +    if ( !pl1e )
   2.191      {
   2.192 -        MEM_LOG("Could not find PTE entry for address %lx", addr);
   2.193 +        MEM_LOG("Could not find L1 PTE for address %lx", addr);
   2.194          return GNTST_general_error;
   2.195      }
   2.196 +    ol1e = *pl1e;
   2.197  
   2.198      /*
   2.199       * Check that the virtual address supplied is actually mapped to
   2.200 @@ -2543,19 +2563,21 @@ static int destroy_grant_va_mapping(
   2.201      {
   2.202          MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
   2.203                  l1e_get_pfn(ol1e), addr, frame);
   2.204 -        return GNTST_general_error;
   2.205 +        rc = GNTST_general_error;
   2.206 +        goto out;
   2.207      }
   2.208  
   2.209      /* Delete pagetable entry. */
   2.210 -    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), 
   2.211 -                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
   2.212 -                      d->vcpu[0] /* Change for per-vcpu shadows */)) )
   2.213 +    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), gl1mfn, v)) )
   2.214      {
   2.215          MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
   2.216 -        return GNTST_general_error;
   2.217 +        rc = GNTST_general_error;
   2.218 +        goto out; // this is redundant & unnecessary, but informative
   2.219      }
   2.220  
   2.221 -    return 0;
   2.222 + out:
   2.223 +    guest_unmap_l1e(v, pl1e);
   2.224 +    return rc;
   2.225  }
   2.226  
   2.227  int create_grant_host_mapping(
   2.228 @@ -2578,7 +2600,7 @@ int destroy_grant_host_mapping(
   2.229  {
   2.230      if ( flags & GNTMAP_contains_pte )
   2.231          return destroy_grant_pte_mapping(addr, frame, current->domain);
   2.232 -    return destroy_grant_va_mapping(addr, frame, current->domain);
   2.233 +    return destroy_grant_va_mapping(addr, frame, current);
   2.234  }
   2.235  
   2.236  int steal_page(
   2.237 @@ -2634,7 +2656,8 @@ int do_update_va_mapping(unsigned long v
   2.238      l1_pgentry_t   val = l1e_from_intpte(val64);
   2.239      struct vcpu   *v   = current;
   2.240      struct domain *d   = v->domain;
   2.241 -    unsigned long  vmask, bmap_ptr;
   2.242 +    l1_pgentry_t  *pl1e;
   2.243 +    unsigned long  vmask, bmap_ptr, gl1mfn;
   2.244      cpumask_t      pmask;
   2.245      int            rc  = 0;
   2.246  
   2.247 @@ -2643,35 +2666,17 @@ int do_update_va_mapping(unsigned long v
   2.248      if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
   2.249          return -EINVAL;
   2.250  
   2.251 -    if ( unlikely(shadow_mode_refcounts(d)) )
   2.252 -    {
   2.253 -        DPRINTK("Grant op on a shadow-refcounted domain\n");
   2.254 -        return -EINVAL; 
   2.255 -    }
   2.256 -
   2.257      LOCK_BIGLOCK(d);
   2.258  
   2.259 -    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
   2.260 -    {
   2.261 -        if ( unlikely(this_cpu(percpu_mm_info).foreign &&
   2.262 -                      (shadow_mode_translate(d) ||
   2.263 -                       shadow_mode_translate(
   2.264 -                           this_cpu(percpu_mm_info).foreign))) )
   2.265 -        {
   2.266 -            /*
   2.267 -             * The foreign domain's pfn's are in a different namespace. There's
   2.268 -             * not enough information in just a gpte to figure out how to   
   2.269 -             * (re-)shadow this entry.
   2.270 -             */
   2.271 -            domain_crash(d);
   2.272 -        }
   2.273 -    }
   2.274 -
   2.275 -    if ( unlikely(!mod_l1_entry(
   2.276 -                      &linear_pg_table[l1_linear_offset(va)], val,
   2.277 -                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
   2.278 +    pl1e = guest_map_l1e(v, va, &gl1mfn);
   2.279 +
   2.280 +    if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) )
   2.281          rc = -EINVAL;
   2.282 -    
   2.283 +
   2.284 +    if ( pl1e )
   2.285 +        guest_unmap_l1e(v, pl1e);
   2.286 +    pl1e = NULL;
   2.287 +
   2.288      switch ( flags & UVMF_FLUSHTYPE_MASK )
   2.289      {
   2.290      case UVMF_TLB_FLUSH:
   2.291 @@ -3033,7 +3038,7 @@ static int ptwr_emulated_update(
   2.292      unsigned int bytes,
   2.293      unsigned int do_cmpxchg)
   2.294  {
   2.295 -    unsigned long pfn;
   2.296 +    unsigned long gmfn, mfn;
   2.297      struct page_info *page;
   2.298      l1_pgentry_t pte, ol1e, nl1e, *pl1e;
   2.299      struct vcpu *v = current;
   2.300 @@ -3073,15 +3078,17 @@ static int ptwr_emulated_update(
   2.301      }
   2.302  
   2.303      /* Read the PTE that maps the page being updated. */
   2.304 -    if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
   2.305 -                          sizeof(pte)) )
   2.306 +    guest_get_eff_l1e(v, addr, &pte);
   2.307 +    if ( unlikely(!(l1e_get_flags(pte) & _PAGE_PRESENT)) )
   2.308      {
   2.309 -        MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table");
   2.310 +        MEM_LOG("%s: Cannot get L1 PTE for guest address %lx",
   2.311 +                __func__, addr);
   2.312          return X86EMUL_UNHANDLEABLE;
   2.313      }
   2.314  
   2.315 -    pfn  = l1e_get_pfn(pte);
   2.316 -    page = mfn_to_page(pfn);
   2.317 +    gmfn  = l1e_get_pfn(pte);
   2.318 +    mfn = gmfn_to_mfn(d, gmfn);
   2.319 +    page = mfn_to_page(mfn);
   2.320  
   2.321      /* We are looking only for read-only mappings of p.t. pages. */
   2.322      ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT);
   2.323 @@ -3091,7 +3098,7 @@ static int ptwr_emulated_update(
   2.324  
   2.325      /* Check the new PTE. */
   2.326      nl1e = l1e_from_intpte(val);
   2.327 -    if ( unlikely(!get_page_from_l1e(nl1e, d)) )
   2.328 +    if ( unlikely(!get_page_from_l1e(gl1e_to_ml1e(d, nl1e), d)) )
   2.329      {
   2.330          if ( (CONFIG_PAGING_LEVELS == 3) &&
   2.331               (bytes == 4) &&
   2.332 @@ -3130,13 +3137,13 @@ static int ptwr_emulated_update(
   2.333              if ( shadow_mode_enabled(d) )
   2.334                  shadow_unlock(d);
   2.335              unmap_domain_page(pl1e);
   2.336 -            put_page_from_l1e(nl1e, d);
   2.337 +            put_page_from_l1e(gl1e_to_ml1e(d, nl1e), d);
   2.338              return X86EMUL_CMPXCHG_FAILED;
   2.339          }
   2.340 -        if ( unlikely(shadow_mode_enabled(v->domain)) )
   2.341 +        if ( unlikely(shadow_mode_enabled(d)) )
   2.342          {
   2.343              shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
   2.344 -            shadow_unlock(v->domain);    
   2.345 +            shadow_unlock(d);    
   2.346          }
   2.347      }
   2.348      else
   2.349 @@ -3149,7 +3156,7 @@ static int ptwr_emulated_update(
   2.350      unmap_domain_page(pl1e);
   2.351  
   2.352      /* Finally, drop the old PTE. */
   2.353 -    put_page_from_l1e(ol1e, d);
   2.354 +    put_page_from_l1e(gl1e_to_ml1e(d, ol1e), d);
   2.355  
   2.356      return X86EMUL_CONTINUE;
   2.357  }
   2.358 @@ -3198,13 +3205,13 @@ static struct x86_emulate_ops ptwr_emula
   2.359  };
   2.360  
   2.361  /* Write page fault handler: check if guest is trying to modify a PTE. */
   2.362 -int ptwr_do_page_fault(struct domain *d, unsigned long addr, 
   2.363 +int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, 
   2.364                         struct cpu_user_regs *regs)
   2.365  {
   2.366 +    struct domain *d = v->domain;
   2.367      unsigned long     pfn;
   2.368      struct page_info *page;
   2.369      l1_pgentry_t      pte;
   2.370 -    l2_pgentry_t     *pl2e, l2e;
   2.371      struct x86_emulate_ctxt emul_ctxt;
   2.372  
   2.373      LOCK_BIGLOCK(d);
   2.374 @@ -3213,13 +3220,9 @@ int ptwr_do_page_fault(struct domain *d,
   2.375       * Attempt to read the PTE that maps the VA being accessed. By checking for
   2.376       * PDE validity in the L2 we avoid many expensive fixups in __get_user().
   2.377       */
   2.378 -    pl2e = &__linear_l2_table[l2_linear_offset(addr)];
   2.379 -    if ( __copy_from_user(&l2e, pl2e, sizeof(l2e)) ||
   2.380 -        !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
   2.381 -         __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)],
   2.382 -                          sizeof(pte)) )
   2.383 +    guest_get_eff_l1e(v, addr, &pte);
   2.384 +    if ( !(l1e_get_flags(pte) & _PAGE_PRESENT) )
   2.385          goto bail;
   2.386 -
   2.387      pfn  = l1e_get_pfn(pte);
   2.388      page = mfn_to_page(pfn);
   2.389  
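
The mm.c changes above replace direct walks of linear_pg_table with a guest_map_l1e() / guest_unmap_l1e() pair (see create_grant_va_mapping(), destroy_grant_va_mapping() and do_update_va_mapping()), since translated PV guests no longer get a linear mapping of their own tables. Below is a rough sketch of the map / snapshot / update / unmap discipline; every type and helper here is a simplified stand-in rather than the real Xen API, and the mfn value is made up.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t l1; } l1_pgentry_t;
    struct vcpu { l1_pgentry_t fake_l1e; };      /* pretend guest L1 entry */

    /* Stand-in for guest_map_l1e(): return a pointer to the guest L1e that
     * covers 'va', or NULL if no L1 table is present there. */
    static l1_pgentry_t *guest_map_l1e(struct vcpu *v, unsigned long va,
                                       unsigned long *gl1mfn)
    {
        (void)va;
        *gl1mfn = 0x1234;                        /* hypothetical L1-table mfn */
        return &v->fake_l1e;
    }

    static void guest_unmap_l1e(struct vcpu *v, void *pl1e)
    {
        (void)v; (void)pl1e;                     /* real code unmaps the page */
    }

    static int update_l1e(l1_pgentry_t *pl1e, l1_pgentry_t ol1e, l1_pgentry_t nl1e,
                          unsigned long gl1mfn, struct vcpu *v)
    {
        (void)ol1e; (void)gl1mfn; (void)v;
        *pl1e = nl1e;                            /* pretend the update succeeded */
        return 1;
    }

    /* The pattern: map, snapshot the old entry, try the update, always unmap. */
    static int set_guest_l1e(struct vcpu *v, unsigned long va, l1_pgentry_t nl1e)
    {
        unsigned long gl1mfn;
        l1_pgentry_t *pl1e, ol1e;
        int okay;

        pl1e = guest_map_l1e(v, va, &gl1mfn);
        if ( pl1e == NULL )
            return 0;                            /* GNTST_general_error upstream */
        ol1e = *pl1e;
        okay = update_l1e(pl1e, ol1e, nl1e, gl1mfn, v);
        guest_unmap_l1e(v, pl1e);
        return okay;
    }

    int main(void)
    {
        struct vcpu v = { { 0 } };
        l1_pgentry_t nl1e = { 0xdeadbeefULL };
        printf("okay=%d l1=%#llx\n", set_guest_l1e(&v, 0x1000, nl1e),
               (unsigned long long)v.fake_l1e.l1);
        return 0;
    }

The point the hunks above make is that once the map has succeeded, guest_unmap_l1e() must be reached on every path, including the error paths, which is why destroy_grant_va_mapping() is restructured around an rc value and a single out: label.
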
     3.1 --- a/xen/arch/x86/mm/shadow/common.c	Thu Sep 28 17:09:11 2006 +0100
     3.2 +++ b/xen/arch/x86/mm/shadow/common.c	Thu Sep 28 17:10:54 2006 +0100
     3.3 @@ -75,35 +75,27 @@ sh_x86_emulate_read_std(unsigned long ad
     3.4                           unsigned int bytes,
     3.5                           struct x86_emulate_ctxt *ctxt)
     3.6  {
     3.7 -    struct vcpu *v = current;
     3.8 -    if ( hvm_guest(v) )
     3.9 +    *val = 0;
    3.10 +    // XXX -- this is WRONG.
    3.11 +    //        It entirely ignores the permissions in the page tables.
    3.12 +    //        In this case, that is only a user vs supervisor access check.
    3.13 +    //
    3.14 +    if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
    3.15      {
    3.16 -        *val = 0;
    3.17 -        // XXX -- this is WRONG.
    3.18 -        //        It entirely ignores the permissions in the page tables.
    3.19 -        //        In this case, that is only a user vs supervisor access check.
    3.20 -        //
    3.21 -        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
    3.22 -        {
    3.23  #if 0
    3.24 -            SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
    3.25 -                           v->domain->domain_id, v->vcpu_id, 
    3.26 -                           addr, *val, bytes);
    3.27 +        struct vcpu *v = current;
    3.28 +        SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
    3.29 +                       v->domain->domain_id, v->vcpu_id, 
    3.30 +                       addr, *val, bytes);
    3.31  #endif
    3.32 -            return X86EMUL_CONTINUE;
    3.33 -        }
    3.34 -
    3.35 -        /* If we got here, there was nothing mapped here, or a bad GFN 
    3.36 -         * was mapped here.  This should never happen: we're here because
    3.37 -         * of a write fault at the end of the instruction we're emulating. */ 
    3.38 -        SHADOW_PRINTK("read failed to va %#lx\n", addr);
    3.39 -        return X86EMUL_PROPAGATE_FAULT;
    3.40 +        return X86EMUL_CONTINUE;
    3.41      }
    3.42 -    else 
    3.43 -    {
    3.44 -        SHADOW_PRINTK("this operation is not emulated yet\n");
    3.45 -        return X86EMUL_UNHANDLEABLE;
    3.46 -    }
    3.47 +
    3.48 +    /* If we got here, there was nothing mapped here, or a bad GFN 
    3.49 +     * was mapped here.  This should never happen: we're here because
    3.50 +     * of a write fault at the end of the instruction we're emulating. */ 
    3.51 +    SHADOW_PRINTK("read failed to va %#lx\n", addr);
    3.52 +    return X86EMUL_PROPAGATE_FAULT;
    3.53  }
    3.54  
    3.55  static int
    3.56 @@ -112,33 +104,26 @@ sh_x86_emulate_write_std(unsigned long a
    3.57                            unsigned int bytes,
    3.58                            struct x86_emulate_ctxt *ctxt)
    3.59  {
    3.60 +#if 0
    3.61      struct vcpu *v = current;
    3.62 -#if 0
    3.63      SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
    3.64                    v->domain->domain_id, v->vcpu_id, addr, val, bytes);
    3.65  #endif
    3.66 -    if ( hvm_guest(v) )
    3.67 -    {
    3.68 -        // XXX -- this is WRONG.
    3.69 -        //        It entirely ignores the permissions in the page tables.
    3.70 -        //        In this case, that includes user vs supervisor, and
    3.71 -        //        write access.
    3.72 -        //
    3.73 -        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
    3.74 -            return X86EMUL_CONTINUE;
    3.75 -
    3.76 -        /* If we got here, there was nothing mapped here, or a bad GFN 
    3.77 -         * was mapped here.  This should never happen: we're here because
    3.78 -         * of a write fault at the end of the instruction we're emulating,
    3.79 -         * which should be handled by sh_x86_emulate_write_emulated. */ 
    3.80 -        SHADOW_PRINTK("write failed to va %#lx\n", addr);
    3.81 -        return X86EMUL_PROPAGATE_FAULT;
    3.82 -    }
    3.83 -    else 
    3.84 -    {
    3.85 -        SHADOW_PRINTK("this operation is not emulated yet\n");
    3.86 -        return X86EMUL_UNHANDLEABLE;
    3.87 -    }
    3.88 +
    3.89 +    // XXX -- this is WRONG.
    3.90 +    //        It entirely ignores the permissions in the page tables.
    3.91 +    //        In this case, that includes user vs supervisor, and
    3.92 +    //        write access.
    3.93 +    //
    3.94 +    if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
    3.95 +        return X86EMUL_CONTINUE;
    3.96 +
    3.97 +    /* If we got here, there was nothing mapped here, or a bad GFN 
    3.98 +     * was mapped here.  This should never happen: we're here because
    3.99 +     * of a write fault at the end of the instruction we're emulating,
   3.100 +     * which should be handled by sh_x86_emulate_write_emulated. */ 
   3.101 +    SHADOW_PRINTK("write failed to va %#lx\n", addr);
   3.102 +    return X86EMUL_PROPAGATE_FAULT;
   3.103  }
   3.104  
   3.105  static int
   3.106 @@ -152,15 +137,7 @@ sh_x86_emulate_write_emulated(unsigned l
   3.107      SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
   3.108                    v->domain->domain_id, v->vcpu_id, addr, val, bytes);
   3.109  #endif
   3.110 -    if ( hvm_guest(v) )
   3.111 -    {
   3.112 -        return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
   3.113 -    }
   3.114 -    else 
   3.115 -    {
   3.116 -        SHADOW_PRINTK("this operation is not emulated yet\n");
   3.117 -        return X86EMUL_UNHANDLEABLE;
   3.118 -    }
   3.119 +    return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
   3.120  }
   3.121  
   3.122  static int 
   3.123 @@ -175,16 +152,8 @@ sh_x86_emulate_cmpxchg_emulated(unsigned
   3.124      SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
   3.125                     v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
   3.126  #endif
   3.127 -    if ( hvm_guest(v) )
   3.128 -    {
   3.129 -        return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, 
   3.130 -                                                    bytes, ctxt);
   3.131 -    }
   3.132 -    else 
   3.133 -    {
   3.134 -        SHADOW_PRINTK("this operation is not emulated yet\n");
   3.135 -        return X86EMUL_UNHANDLEABLE;
   3.136 -    }
   3.137 +    return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new,
   3.138 +                                                     bytes, ctxt);
   3.139  }
   3.140  
   3.141  static int 
   3.142 @@ -201,16 +170,8 @@ sh_x86_emulate_cmpxchg8b_emulated(unsign
   3.143                     v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
   3.144                     new_hi, new_lo, ctxt);
   3.145  #endif
   3.146 -    if ( hvm_guest(v) )
   3.147 -    {
   3.148 -        return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
   3.149 -                                                      new_lo, new_hi, ctxt);
   3.150 -    }
   3.151 -    else 
   3.152 -    {
   3.153 -        SHADOW_PRINTK("this operation is not emulated yet\n");
   3.154 -        return X86EMUL_UNHANDLEABLE;
   3.155 -    }
   3.156 +    return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
   3.157 +                                                       new_lo, new_hi, ctxt);
   3.158  }
   3.159  
   3.160  
   3.161 @@ -267,7 +228,7 @@ void shadow_demote(struct vcpu *v, mfn_t
   3.162  /* Validate a pagetable change from the guest and update the shadows.
   3.163   * Returns a bitmask of SHADOW_SET_* flags. */
   3.164  
   3.165 -static int
   3.166 +int
   3.167  __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
   3.168                                 void *entry, u32 size)
   3.169  {
   3.170 @@ -367,7 +328,9 @@ shadow_validate_guest_entry(struct vcpu 
   3.171  void
   3.172  shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
   3.173                                  void *entry, u32 size)
   3.174 -/* This is the entry point for emulated writes to pagetables in HVM guests */
   3.175 +/* This is the entry point for emulated writes to pagetables in HVM guests and
   3.176 + * PV translated guests.
   3.177 + */
   3.178  {
   3.179      struct domain *d = v->domain;
   3.180      int rc;
   3.181 @@ -806,7 +769,7 @@ void shadow_free(struct domain *d, mfn_t
   3.182  
   3.183  /* Divert some memory from the pool to be used by the p2m mapping.
   3.184   * This action is irreversible: the p2m mapping only ever grows.
   3.185 - * That's OK because the p2m table only exists for external domains,
   3.186 + * That's OK because the p2m table only exists for translated domains,
   3.187   * and those domains can't ever turn off shadow mode.
   3.188   * Also, we only ever allocate a max-order chunk, so as to preserve
   3.189   * the invariant that shadow_prealloc() always works.
   3.190 @@ -830,7 +793,12 @@ shadow_alloc_p2m_pages(struct domain *d)
   3.191      d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
   3.192      for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
   3.193      {
   3.194 -        /* Unlike shadow pages, mark p2m pages as owned by the domain */
   3.195 +        /* Unlike shadow pages, mark p2m pages as owned by the domain.
   3.196 +         * Marking the domain as the owner would normally allow the guest to
   3.197 +         * create mappings of these pages, but these p2m pages will never be
   3.198 +         * in the domain's guest-physical address space, and so that is not
   3.199 +         * believed to be a concern.
   3.200 +         */
   3.201          page_set_owner(&pg[i], d);
   3.202          list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
   3.203      }
   3.204 @@ -2269,7 +2237,7 @@ void sh_update_paging_modes(struct vcpu 
   3.205      //
   3.206      if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
   3.207      {
   3.208 -        printk("%s: postponing determination of shadow mode\n", __func__);
   3.209 +        SHADOW_PRINTK("%s: postponing determination of shadow mode\n", __func__);
   3.210          return;
   3.211      }
   3.212  
   3.213 @@ -2294,6 +2262,7 @@ void sh_update_paging_modes(struct vcpu 
   3.214  #else
   3.215  #error unexpected paging mode
   3.216  #endif
   3.217 +        v->arch.shadow.translate_enabled = !!shadow_mode_translate(d);
   3.218      }
   3.219      else
   3.220      {
   3.221 @@ -2303,8 +2272,8 @@ void sh_update_paging_modes(struct vcpu 
   3.222          ASSERT(shadow_mode_translate(d));
   3.223          ASSERT(shadow_mode_external(d));
   3.224  
   3.225 -        v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v);
   3.226 -        if ( !v->arch.shadow.hvm_paging_enabled )
   3.227 +        v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v);
   3.228 +        if ( !v->arch.shadow.translate_enabled )
   3.229          {
   3.230              
   3.231              /* Set v->arch.guest_table to use the p2m map, and choose
   3.232 @@ -2381,13 +2350,14 @@ void sh_update_paging_modes(struct vcpu 
   3.233  
   3.234          if ( v->arch.shadow.mode != old_mode )
   3.235          {
   3.236 -            SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
   3.237 -                           "(was g=%u s=%u)\n",
   3.238 -                           d->domain_id, v->vcpu_id, 
   3.239 -                           v->arch.shadow.mode->guest_levels,
   3.240 -                           v->arch.shadow.mode->shadow_levels,
   3.241 -                           old_mode ? old_mode->guest_levels : 0,
   3.242 -                           old_mode ? old_mode->shadow_levels : 0);
   3.243 +            SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u "
   3.244 +                          "(was g=%u s=%u)\n",
   3.245 +                          d->domain_id, v->vcpu_id,
   3.246 +                          hvm_guest(v) ? !!hvm_paging_enabled(v) : 1,
   3.247 +                          v->arch.shadow.mode->guest_levels,
   3.248 +                          v->arch.shadow.mode->shadow_levels,
   3.249 +                          old_mode ? old_mode->guest_levels : 0,
   3.250 +                          old_mode ? old_mode->shadow_levels : 0);
   3.251              if ( old_mode &&
   3.252                   (v->arch.shadow.mode->shadow_levels !=
   3.253                    old_mode->shadow_levels) )
   3.254 @@ -2467,6 +2437,7 @@ static int shadow_enable(struct domain *
   3.255      /* Sanity check the arguments */
   3.256      if ( (d == current->domain) ||
   3.257           shadow_mode_enabled(d) ||
   3.258 +         ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) ||
   3.259           ((mode & SHM2_external) && !(mode & SHM2_translate)) )
   3.260      {
   3.261          rv = -EINVAL;
   3.262 @@ -2522,7 +2493,7 @@ static int shadow_enable(struct domain *
   3.263   out:
   3.264      shadow_unlock(d);
   3.265      domain_unpause(d);
   3.266 -    return 0;
   3.267 +    return rv;
   3.268  }
   3.269  
   3.270  void shadow_teardown(struct domain *d)
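
The shadow_enable() hunk above tightens the mode sanity check: SHM2_translate now requires SHM2_refcounts, SHM2_external continues to require SHM2_translate, and the function now returns rv rather than unconditionally returning 0. A toy version of that flag check is below; the bit values and the shadow_mode_ok() wrapper are invented for illustration, and only the flag names and the two implications come from the patch itself.

    #include <stdio.h>

    #define SHM2_refcounts (1u << 0)   /* hypothetical bit assignments */
    #define SHM2_translate (1u << 1)
    #define SHM2_external  (1u << 2)

    static int shadow_mode_ok(unsigned int mode)
    {
        if ( (mode & SHM2_translate) && !(mode & SHM2_refcounts) )
            return 0;   /* translated guests must also be refcounted */
        if ( (mode & SHM2_external) && !(mode & SHM2_translate) )
            return 0;   /* external guests must be translated */
        return 1;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               shadow_mode_ok(SHM2_refcounts | SHM2_translate),                  /* 1 */
               shadow_mode_ok(SHM2_translate),                                   /* 0 */
               shadow_mode_ok(SHM2_external | SHM2_translate | SHM2_refcounts)); /* 1 */
        return 0;
    }
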
     4.1 --- a/xen/arch/x86/mm/shadow/multi.c	Thu Sep 28 17:09:11 2006 +0100
     4.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Thu Sep 28 17:10:54 2006 +0100
     4.3 @@ -483,8 +483,7 @@ static u32 guest_set_ad_bits(struct vcpu
     4.4                               unsigned int level, 
     4.5                               fetch_type_t ft)
     4.6  {
     4.7 -    u32 flags, shflags, bit;
     4.8 -    struct page_info *pg;
     4.9 +    u32 flags;
    4.10      int res = 0;
    4.11  
    4.12      ASSERT(valid_mfn(gmfn)
    4.13 @@ -502,11 +501,10 @@ static u32 guest_set_ad_bits(struct vcpu
    4.14      if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
    4.15          return flags;
    4.16  
    4.17 -    /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */
    4.18 +    /* Need the D bit as well for writes, in L1es and PSE L2es. */
    4.19      if ( ft == ft_demand_write  
    4.20 -         && (level == 1 || 
    4.21 -             (level == 2 && GUEST_PAGING_LEVELS < 4 
    4.22 -              && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
    4.23 +         && (level == 1 ||
    4.24 +             (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
    4.25      {
    4.26          if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) 
    4.27               == (_PAGE_DIRTY | _PAGE_ACCESSED) )
    4.28 @@ -524,77 +522,70 @@ static u32 guest_set_ad_bits(struct vcpu
    4.29  
    4.30      /* Set the bit(s) */
    4.31      sh_mark_dirty(v->domain, gmfn);
    4.32 -    SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", "
    4.33 +    SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", "
    4.34                    "old flags = %#x, new flags = %#x\n", 
    4.35 -                  guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
    4.36 +                  gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep), flags);
    4.37      *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
    4.38      
    4.39 -    /* May need to propagate this change forward to other kinds of shadow */
    4.40 -    pg = mfn_to_page(gmfn);
    4.41 -    if ( !sh_mfn_is_a_page_table(gmfn) ) 
    4.42 -    {
    4.43 -        /* This guest pagetable is not yet shadowed at all. */
    4.44 -        // MAF: I think this assert is busted...  If this gmfn has not yet
    4.45 -        // been promoted, then it seems perfectly reasonable for there to be
    4.46 -        // outstanding type refs to it...
    4.47 -        /* TJD: No. If the gmfn has not been promoted, we must at least 
    4.48 -         * have recognised that it is a pagetable, and pulled write access.
    4.49 -         * The type count should only be non-zero if it is actually a page 
    4.50 -         * table.  The test above was incorrect, though, so I've fixed it. */
    4.51 -        ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
    4.52 -        return flags;  
    4.53 -    }
    4.54 -
    4.55 -    shflags = pg->shadow_flags & SHF_page_type_mask;
    4.56 -    while ( shflags )
    4.57 -    {
    4.58 -        bit = find_first_set_bit(shflags);
    4.59 -        ASSERT(shflags & (1u << bit));
    4.60 -        shflags &= ~(1u << bit);
    4.61 -        if ( !(pg->shadow_flags & (1u << bit)) )
    4.62 -            continue;
    4.63 -        switch ( bit )
    4.64 -        {
    4.65 -        case PGC_SH_type_to_index(PGC_SH_l1_shadow):
    4.66 -            if (level != 1) 
    4.67 -                res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
    4.68 -            break;
    4.69 -        case PGC_SH_type_to_index(PGC_SH_l2_shadow):
    4.70 -            if (level != 2) 
    4.71 -                res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
    4.72 -            break;
    4.73 -#if GUEST_PAGING_LEVELS == 3 /* PAE only */
    4.74 -        case PGC_SH_type_to_index(PGC_SH_l2h_shadow):
    4.75 -            if (level != 2) 
    4.76 -                res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
    4.77 -            break;
    4.78 -#endif
    4.79 -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    4.80 -        case PGC_SH_type_to_index(PGC_SH_l3_shadow):
    4.81 -            if (level != 3) 
    4.82 -                res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
    4.83 -            break;
    4.84 -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    4.85 -        case PGC_SH_type_to_index(PGC_SH_l4_shadow):
    4.86 -            if (level != 4) 
    4.87 -                res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
    4.88 -            break;
    4.89 -#endif 
    4.90 -#endif
    4.91 -        default:
    4.92 -            SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple "
    4.93 -                          "modes: A&D bits may be out of sync (flags=%#x).\n", 
    4.94 -                          mfn_x(gmfn), pg->shadow_flags); 
    4.95 -            /* XXX Shadows in other modes will not be updated, so will
    4.96 -             * have their A and D bits out of sync. */
    4.97 -        }
    4.98 -    }
    4.99 -    
   4.100 +    /* Propagate this change to any existing shadows */
   4.101 +    res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep));
   4.102 +
   4.103      /* We should never need to flush the TLB or recopy PAE entries */
   4.104 -    ASSERT( res == 0 || res == SHADOW_SET_CHANGED );
   4.105 +    ASSERT((res == 0) || (res == SHADOW_SET_CHANGED));
   4.106 +
   4.107      return flags;
   4.108  }
   4.109  
   4.110 +#if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS)
   4.111 +void *
   4.112 +sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
   4.113 +                  unsigned long *gl1mfn)
   4.114 +{
   4.115 +    void *pl1e = NULL;
   4.116 +    walk_t gw;
   4.117 +
   4.118 +    ASSERT(shadow_mode_translate(v->domain));
   4.119 +        
   4.120 +    // XXX -- this is expensive, but it's easy to cobble together...
   4.121 +    // FIXME!
   4.122 +
   4.123 +    shadow_lock(v->domain);
   4.124 +    guest_walk_tables(v, addr, &gw, 1);
   4.125 +
   4.126 +    if ( gw.l2e &&
   4.127 +         (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) &&
   4.128 +         !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) )
   4.129 +    {
   4.130 +        if ( gl1mfn )
   4.131 +            *gl1mfn = mfn_x(gw.l1mfn);
   4.132 +        pl1e = map_domain_page(mfn_x(gw.l1mfn)) +
   4.133 +            (guest_l1_table_offset(addr) * sizeof(guest_l1e_t));
   4.134 +    }
   4.135 +
   4.136 +    unmap_walk(v, &gw);
   4.137 +    shadow_unlock(v->domain);
   4.138 +
   4.139 +    return pl1e;
   4.140 +}
   4.141 +
   4.142 +void
   4.143 +sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
   4.144 +{
   4.145 +    walk_t gw;
   4.146 +
   4.147 +    ASSERT(shadow_mode_translate(v->domain));
   4.148 +        
   4.149 +    // XXX -- this is expensive, but it's easy to cobble together...
   4.150 +    // FIXME!
   4.151 +
   4.152 +    shadow_lock(v->domain);
   4.153 +    guest_walk_tables(v, addr, &gw, 1);
   4.154 +    *(guest_l1e_t *)eff_l1e = gw.eff_l1e;
   4.155 +    unmap_walk(v, &gw);
   4.156 +    shadow_unlock(v->domain);
   4.157 +}
   4.158 +#endif /* CONFIG==SHADOW==GUEST */
   4.159 +
   4.160  /**************************************************************************/
   4.161  /* Functions to compute the correct index into a shadow page, given an
   4.162   * index into the guest page (as returned by guest_get_index()).
   4.163 @@ -709,17 +700,6 @@ shadow_l4_index(mfn_t *smfn, u32 guest_i
   4.164   * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
   4.165   * into the respective demand_fault functions.
   4.166   */
   4.167 -
   4.168 -#define CHECK(_cond)                                    \
   4.169 -do {                                                    \
   4.170 -    if (unlikely(!(_cond)))                             \
   4.171 -    {                                                   \
   4.172 -        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
   4.173 -               __func__, __FILE__, __LINE__, #_cond);   \
   4.174 -        return -1;                                      \
   4.175 -    }                                                   \
   4.176 -} while (0);
   4.177 -
   4.178  // The function below tries to capture all of the flag manipulation for the
   4.179  // demand and propagate functions into one place.
   4.180  //
   4.181 @@ -728,6 +708,16 @@ sh_propagate_flags(struct vcpu *v, mfn_t
   4.182                      u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, 
   4.183                      int mmio, int level, fetch_type_t ft)
   4.184  {
   4.185 +#define CHECK(_cond)                                    \
   4.186 +do {                                                    \
   4.187 +    if (unlikely(!(_cond)))                             \
   4.188 +    {                                                   \
   4.189 +        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
   4.190 +               __func__, __FILE__, __LINE__, #_cond);   \
   4.191 +        domain_crash(d);                                \
   4.192 +    }                                                   \
   4.193 +} while (0);
   4.194 +
   4.195      struct domain *d = v->domain;
   4.196      u32 pass_thru_flags;
   4.197      u32 sflags;
   4.198 @@ -763,6 +753,10 @@ sh_propagate_flags(struct vcpu *v, mfn_t
   4.199              return 0;
   4.200      }
   4.201  
   4.202 +    // Set the A and D bits in the guest entry, if we need to.
   4.203 +    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
   4.204 +        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
   4.205 +    
   4.206      // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
   4.207      //
   4.208      if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
   4.209 @@ -797,17 +791,12 @@ sh_propagate_flags(struct vcpu *v, mfn_t
   4.210      // Higher level entries do not, strictly speaking, have dirty bits, but
   4.211      // since we use shadow linear tables, each of these entries may, at some
   4.212      // point in time, also serve as a shadow L1 entry.
   4.213 -    // By setting both the  A&D bits in each of these, we eliminate the burden
   4.214 +    // By setting both the A&D bits in each of these, we eliminate the burden
   4.215      // on the hardware to update these bits on initial accesses.
   4.216      //
   4.217      if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
   4.218          sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
   4.219  
   4.220 -
   4.221 -    // Set the A and D bits in the guest entry, if we need to.
   4.222 -    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
   4.223 -        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
   4.224 -    
   4.225      // If the A or D bit has not yet been set in the guest, then we must
   4.226      // prevent the corresponding kind of access.
   4.227      //
   4.228 @@ -815,12 +804,12 @@ sh_propagate_flags(struct vcpu *v, mfn_t
   4.229                    !(gflags & _PAGE_ACCESSED)) )
   4.230          sflags &= ~_PAGE_PRESENT;
   4.231  
   4.232 -    /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */
   4.233 -    if ( unlikely( ((level == 1) 
   4.234 -                    || ((level == 2) && (GUEST_PAGING_LEVELS < 4) 
   4.235 -                        && guest_supports_superpages(v) &&
   4.236 -                        (gflags & _PAGE_PSE)))
   4.237 -                   && !(gflags & _PAGE_DIRTY)) )
   4.238 +    /* D bits exist in L1es and PSE L2es */
   4.239 +    if ( unlikely(((level == 1) ||
   4.240 +                   ((level == 2) &&
   4.241 +                    (gflags & _PAGE_PSE) &&
   4.242 +                    guest_supports_superpages(v)))
   4.243 +                  && !(gflags & _PAGE_DIRTY)) )
   4.244          sflags &= ~_PAGE_RW;
   4.245  
   4.246      // MMIO caching
   4.247 @@ -869,11 +858,18 @@ sh_propagate_flags(struct vcpu *v, mfn_t
   4.248          }
   4.249      }
   4.250  
   4.251 +    // PV guests in 64-bit mode use two different page tables for user vs
   4.252 +    // supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
   4.253 +    // It is always shadowed as present...
   4.254 +    if ( (GUEST_PAGING_LEVELS == 4) && !hvm_guest(v) )
   4.255 +    {
   4.256 +        sflags |= _PAGE_USER;
   4.257 +    }
   4.258 +
   4.259      return sflags;
   4.260 +#undef CHECK
   4.261  }
   4.262  
   4.263 -#undef CHECK
   4.264 -
   4.265  #if GUEST_PAGING_LEVELS >= 4
   4.266  static void
   4.267  l4e_propagate_from_guest(struct vcpu *v, 
   4.268 @@ -1732,11 +1728,21 @@ void sh_install_xen_entries_in_l4(struct
   4.269                              __PAGE_HYPERVISOR);
   4.270  
   4.271      /* Linear mapping */
   4.272 -    sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
   4.273 -        shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
   4.274      sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
   4.275          shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
   4.276  
   4.277 +    if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
   4.278 +    {
   4.279 +        // linear tables may not be used with translated PV guests
   4.280 +        sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
   4.281 +            shadow_l4e_empty();
   4.282 +    }
   4.283 +    else
   4.284 +    {
   4.285 +        sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
   4.286 +            shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
   4.287 +    }
   4.288 +
   4.289      if ( shadow_mode_translate(v->domain) )
   4.290      {
   4.291          /* install domain-specific P2M table */
   4.292 @@ -1779,7 +1785,15 @@ void sh_install_xen_entries_in_l2h(struc
   4.293      
   4.294      /* We don't set up a linear mapping here because we can't until this
   4.295       * l2h is installed in an l3e.  sh_update_linear_entries() handles
   4.296 -     * the linear mappings when the l3 is loaded. */
   4.297 +     * the linear mappings when the l3 is loaded.  We zero them here, just as
   4.298 +     * a safety measure.
   4.299 +     */
   4.300 +    for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
   4.301 +        sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] =
   4.302 +            shadow_l2e_empty();
   4.303 +    for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
   4.304 +        sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
   4.305 +            shadow_l2e_empty();
   4.306  
   4.307      if ( shadow_mode_translate(d) )
   4.308      {
   4.309 @@ -1817,6 +1831,12 @@ void sh_install_xen_entries_in_l3(struct
   4.310      l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow);
   4.311      if ( !valid_mfn(l2smfn) )
   4.312      {
   4.313 +        /* must remove write access to this page before shadowing it */
   4.314 +        // XXX -- should check to see whether this is better with level==0 or
   4.315 +        // level==2...
   4.316 +        if ( shadow_remove_write_access(v, l2gmfn, 2, 0xc0000000ul) != 0 )
   4.317 +            flush_tlb_mask(v->domain->domain_dirty_cpumask);
   4.318 + 
   4.319          l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow);
   4.320      }
   4.321      l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
   4.322 @@ -1852,11 +1872,21 @@ void sh_install_xen_entries_in_l2(struct
   4.323                  __PAGE_HYPERVISOR);
   4.324  
   4.325      /* Linear mapping */
   4.326 -    sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
   4.327 -        shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
   4.328      sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
   4.329          shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
   4.330  
   4.331 +    if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) )
   4.332 +    {
   4.333 +        // linear tables may not be used with translated PV guests
   4.334 +        sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
   4.335 +            shadow_l2e_empty();
   4.336 +    }
   4.337 +    else
   4.338 +    {
   4.339 +        sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
   4.340 +            shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
   4.341 +    }
   4.342 +
   4.343      if ( shadow_mode_translate(d) )
   4.344      {
   4.345          /* install domain-specific P2M table */
   4.346 @@ -2527,6 +2557,32 @@ static int validate_gl4e(struct vcpu *v,
   4.347      }
   4.348      l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
   4.349                               sl3mfn, &new_sl4e, ft_prefetch);
   4.350 +
   4.351 +    // check for updates to xen reserved slots
   4.352 +    if ( !shadow_mode_external(v->domain) )
   4.353 +    {
   4.354 +        int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) /
   4.355 +                            sizeof(shadow_l4e_t));
   4.356 +        int reserved_xen_slot = !is_guest_l4_slot(shadow_index);
   4.357 +
   4.358 +        if ( unlikely(reserved_xen_slot) )
   4.359 +        {
   4.360 +            // attempt by the guest to write to a xen reserved slot
   4.361 +            //
   4.362 +            SHADOW_PRINTK("%s out-of-range update "
   4.363 +                           "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
   4.364 +                           __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4);
   4.365 +            if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
   4.366 +            {
   4.367 +                SHADOW_ERROR("out-of-range l4e update\n");
   4.368 +                result |= SHADOW_SET_ERROR;
   4.369 +            }
   4.370 +
   4.371 +            // do not call shadow_set_l4e...
   4.372 +            return result;
   4.373 +        }
   4.374 +    }
   4.375 +
   4.376      result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
   4.377      return result;
   4.378  }
   4.379 @@ -2616,6 +2672,48 @@ static int validate_gl2e(struct vcpu *v,
   4.380      }
   4.381      l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
   4.382                               sl1mfn, &new_sl2e, ft_prefetch);
   4.383 +
   4.384 +    // check for updates to xen reserved slots in PV guests...
   4.385 +    // XXX -- need to revisit this for PV 3-on-4 guests.
   4.386 +    //
   4.387 +#if SHADOW_PAGING_LEVELS < 4
   4.388 +#if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS
   4.389 +    if ( !shadow_mode_external(v->domain) )
   4.390 +    {
   4.391 +        int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) /
   4.392 +                            sizeof(shadow_l2e_t));
   4.393 +        int reserved_xen_slot;
   4.394 +
   4.395 +#if SHADOW_PAGING_LEVELS == 3
   4.396 +        reserved_xen_slot = 
   4.397 +            (((mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask)
   4.398 +              == PGC_SH_l2h_pae_shadow) &&
   4.399 +             (shadow_index 
   4.400 +              >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
   4.401 +#else /* SHADOW_PAGING_LEVELS == 2 */
   4.402 +        reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT);
   4.403 +#endif
   4.404 +
   4.405 +        if ( unlikely(reserved_xen_slot) )
   4.406 +        {
   4.407 +            // attempt by the guest to write to a xen reserved slot
   4.408 +            //
   4.409 +            SHADOW_PRINTK("%s out-of-range update "
   4.410 +                           "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n",
   4.411 +                           __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2);
   4.412 +            if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
   4.413 +            {
   4.414 +                SHADOW_ERROR("out-of-range l2e update\n");
   4.415 +                result |= SHADOW_SET_ERROR;
   4.416 +            }
   4.417 +
   4.418 +            // do not call shadow_set_l2e...
   4.419 +            return result;
   4.420 +        }
   4.421 +    }
   4.422 +#endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */
   4.423 +#endif /* SHADOW_PAGING_LEVELS < 4 */
   4.424 +
   4.425      result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
   4.426  
   4.427      return result;
   4.428 @@ -2897,7 +2995,7 @@ static int sh_page_fault(struct vcpu *v,
   4.429      }
   4.430  
   4.431      // All levels of the guest page table are now known to be present.
   4.432 -    accumulated_gflags = accumulate_guest_flags(&gw);
   4.433 +    accumulated_gflags = accumulate_guest_flags(v, &gw);
   4.434  
   4.435      // Check for attempts to access supervisor-only pages from user mode,
   4.436      // i.e. ring 3.  Such errors are not caused or dealt with by the shadow
   4.437 @@ -3348,6 +3446,7 @@ sh_update_linear_entries(struct vcpu *v)
   4.438          l2_pgentry_t *l2e, new_l2e;
   4.439          shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
   4.440          int i;
   4.441 +        int unmap_l2e = 0;
   4.442  
   4.443  #if GUEST_PAGING_LEVELS == 2
   4.444          /* Shadow l3 tables were built by update_cr3 */
   4.445 @@ -3365,39 +3464,45 @@ sh_update_linear_entries(struct vcpu *v)
   4.446  #endif /* GUEST_PAGING_LEVELS */
   4.447          
   4.448          /* Choose where to write the entries, using linear maps if possible */
   4.449 -        if ( v == current && shadow_mode_external(d) ) 
   4.450 -        {
   4.451 -            /* From the monitor tables, it's safe to use linear maps to update
   4.452 -             * monitor l2s */
   4.453 -            l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
   4.454 -        }
   4.455 -        else if ( shadow_mode_external(d) ) 
   4.456 +        if ( shadow_mode_external(d) )
   4.457          {
   4.458 -            /* Map the monitor table's high l2 */
   4.459 -            l3_pgentry_t *l3e;
   4.460 -            l3e = sh_map_domain_page(
   4.461 -                pagetable_get_mfn(v->arch.monitor_table));
   4.462 -            ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
   4.463 -            l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
   4.464 -            sh_unmap_domain_page(l3e);
   4.465 -        } 
   4.466 +            if ( v == current )
   4.467 +            {
   4.468 +                /* From the monitor tables, it's safe to use linear maps
   4.469 +                 * to update monitor l2s */
   4.470 +                l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
   4.471 +            }
   4.472 +            else
   4.473 +            {
   4.474 +                /* Map the monitor table's high l2 */
   4.475 +                l3_pgentry_t *l3e;
   4.476 +                l3e = sh_map_domain_page(
   4.477 +                    pagetable_get_mfn(v->arch.monitor_table));
   4.478 +                ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
   4.479 +                l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
   4.480 +                unmap_l2e = 1;
   4.481 +                sh_unmap_domain_page(l3e);
   4.482 +            }
   4.483 +        }
   4.484          else 
   4.485          {
   4.486              /* Map the shadow table's high l2 */
   4.487              ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
   4.488              l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
   4.489 +            unmap_l2e = 1;
   4.490          }
   4.491          
   4.492 -        
   4.493 -        if ( !shadow_mode_external(d) )
   4.494 +        /* Write linear mapping of guest (only in PV, and only when 
   4.495 +         * not translated). */
   4.496 +        if ( !shadow_mode_translate(d) )
   4.497          {
   4.498 -            /* Write linear mapping of guest. */
   4.499              for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
   4.500 -            { 
   4.501 -                new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) 
   4.502 -                    ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
   4.503 -                                   __PAGE_HYPERVISOR) 
   4.504 -                    : l2e_empty();
   4.505 +            {
   4.506 +                new_l2e = 
   4.507 +                    ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT)
   4.508 +                     ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
   4.509 +                                    __PAGE_HYPERVISOR) 
   4.510 +                     : l2e_empty());
   4.511                  safe_write_entry(
   4.512                      &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
   4.513                      &new_l2e);
   4.514 @@ -3416,9 +3521,8 @@ sh_update_linear_entries(struct vcpu *v)
   4.515                  &new_l2e);
   4.516          }
   4.517          
   4.518 -        if ( v != current || !shadow_mode_external(d) )
   4.519 +        if ( unmap_l2e )
   4.520              sh_unmap_domain_page(l2e);
   4.521 -        
   4.522      }
   4.523  
   4.524  #elif CONFIG_PAGING_LEVELS == 2
   4.525 @@ -3521,16 +3625,24 @@ void sh_pae_recopy(struct domain *d)
   4.526  static void
   4.527  sh_detach_old_tables(struct vcpu *v)
   4.528  {
   4.529 +    struct domain *d = v->domain;
   4.530      mfn_t smfn;
   4.531  
   4.532      ////
   4.533      //// vcpu->arch.guest_vtable
   4.534      ////
   4.535 -    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
   4.536 -         v->arch.guest_vtable )
   4.537 +    if ( v->arch.guest_vtable )
   4.538      {
   4.539 -        // Q: why does this need to use (un)map_domain_page_*global* ?
   4.540 -        sh_unmap_domain_page_global(v->arch.guest_vtable);
   4.541 +#if GUEST_PAGING_LEVELS == 4
   4.542 +        if ( shadow_mode_external(d) || shadow_mode_translate(d) )
   4.543 +            sh_unmap_domain_page_global(v->arch.guest_vtable);
   4.544 +#elif GUEST_PAGING_LEVELS == 3
   4.545 +        if ( 1 || shadow_mode_external(d) || shadow_mode_translate(d) )
   4.546 +            sh_unmap_domain_page_global(v->arch.guest_vtable);
   4.547 +#elif GUEST_PAGING_LEVELS == 2
   4.548 +        if ( shadow_mode_external(d) || shadow_mode_translate(d) )
   4.549 +            sh_unmap_domain_page_global(v->arch.guest_vtable);
   4.550 +#endif
   4.551          v->arch.guest_vtable = NULL;
   4.552      }
   4.553  
   4.554 @@ -3645,9 +3757,14 @@ sh_update_cr3(struct vcpu *v)
   4.555      ////
   4.556      //// vcpu->arch.guest_vtable
   4.557      ////
   4.558 +#if GUEST_PAGING_LEVELS == 4
   4.559 +    if ( shadow_mode_external(d) || shadow_mode_translate(d) )
   4.560 +        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
   4.561 +    else
   4.562 +        v->arch.guest_vtable = __linear_l4_table;
   4.563 +#elif GUEST_PAGING_LEVELS == 3
   4.564      if ( shadow_mode_external(d) )
   4.565      {
   4.566 -#if GUEST_PAGING_LEVELS == 3
   4.567          if ( shadow_vcpu_mode_translate(v) ) 
   4.568              /* Paging enabled: find where in the page the l3 table is */
   4.569              guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
   4.570 @@ -3658,25 +3775,21 @@ sh_update_cr3(struct vcpu *v)
   4.571          // Ignore the low 2 bits of guest_idx -- they are really just
   4.572          // cache control.
   4.573          guest_idx &= ~3;
   4.574 +
   4.575          // XXX - why does this need a global map?
   4.576          v->arch.guest_vtable =
   4.577              (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
   4.578 -#else
   4.579 -        // XXX - why does this need a global map?
   4.580 -        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
   4.581 -#endif
   4.582      }
   4.583      else
   4.584 -    {
   4.585 -#ifdef __x86_64__
   4.586 -        v->arch.guest_vtable = __linear_l4_table;
   4.587 -#elif GUEST_PAGING_LEVELS == 3
   4.588 -        // XXX - why does this need a global map?
   4.589 +        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
   4.590 +#elif GUEST_PAGING_LEVELS == 2
   4.591 +    if ( shadow_mode_external(d) || shadow_mode_translate(d) )
   4.592          v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
   4.593 -#else
   4.594 +    else
   4.595          v->arch.guest_vtable = __linear_l2_table;
   4.596 +#else
   4.597 +#error this should never happen
   4.598  #endif
   4.599 -    }
   4.600  
   4.601  #if 0
   4.602      printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
   4.603 @@ -3744,6 +3857,17 @@ sh_update_cr3(struct vcpu *v)
   4.604  #endif
   4.605      }
   4.606  
   4.607 +#if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
   4.608 +    // Now that shadow_vtable is in place, check that the sl3e[3] is properly
   4.609 +    // shadowed and installed in PAE PV guests...
   4.610 +    if ( !shadow_mode_external(d) &&
   4.611 +         !(shadow_l3e_get_flags(((shadow_l3e_t *)v->arch.shadow_vtable)[3]) &
   4.612 +           _PAGE_PRESENT) )
   4.613 +    {
   4.614 +        sh_install_xen_entries_in_l3(v, gmfn, smfn);
   4.615 +    }
   4.616 +#endif
   4.617 +
   4.618      ////
   4.619      //// Take a ref to the new shadow table, and pin it.
   4.620      ////
   4.621 @@ -4049,7 +4173,7 @@ static inline void * emulate_map_dest(st
   4.622      mfn_t mfn;
   4.623  
   4.624      guest_walk_tables(v, vaddr, &gw, 1);
   4.625 -    flags = accumulate_guest_flags(&gw);
   4.626 +    flags = accumulate_guest_flags(v, &gw);
   4.627      gfn = guest_l1e_get_gfn(gw.eff_l1e);
   4.628      mfn = vcpu_gfn_to_mfn(v, gfn);
   4.629      sh_audit_gw(v, &gw);
   4.630 @@ -4453,6 +4577,8 @@ struct shadow_paging_mode sh_paging_mode
   4.631      .x86_emulate_cmpxchg8b  = sh_x86_emulate_cmpxchg8b,
   4.632      .make_monitor_table     = sh_make_monitor_table,
   4.633      .destroy_monitor_table  = sh_destroy_monitor_table,
   4.634 +    .guest_map_l1e          = sh_guest_map_l1e,
   4.635 +    .guest_get_eff_l1e      = sh_guest_get_eff_l1e,
   4.636  #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
   4.637      .guess_wrmap            = sh_guess_wrmap,
   4.638  #endif
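
The sh_update_linear_entries() hunk at the top of this file replaces the old "v != current || !shadow_mode_external(d)" unmap test with an explicit unmap_l2e flag: the l2 pointer is released only if it was obtained through sh_map_domain_page(), never when it is the linear-map alias of the current vcpu's monitor table. A minimal sketch of that pattern, not taken verbatim from the patch (l2mfn is an illustrative name):

    /* Sketch: remember whether the pointer came from a real mapping.
     * Only pointers from sh_map_domain_page() may be unmapped; the
     * linear-map alias of the current monitor table must be left alone. */
    l2_pgentry_t *l2e;
    int unmap_l2e = 0;

    if ( v == current )
        l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);  /* borrowed */
    else
    {
        l2e = sh_map_domain_page(l2mfn);                       /* owned */
        unmap_l2e = 1;
    }

    /* ... rewrite the linear-map entries through l2e ... */

    if ( unmap_l2e )
        sh_unmap_domain_page(l2e);
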
     5.1 --- a/xen/arch/x86/mm/shadow/multi.h	Thu Sep 28 17:09:11 2006 +0100
     5.2 +++ b/xen/arch/x86/mm/shadow/multi.h	Thu Sep 28 17:10:54 2006 +0100
     5.3 @@ -103,6 +103,13 @@ SHADOW_INTERNAL_NAME(sh_audit_l4_table, 
     5.4      (struct vcpu *v, mfn_t sl4mfn, mfn_t x);
     5.5  #endif
     5.6  
     5.7 +extern void *
     5.8 +SHADOW_INTERNAL_NAME(sh_guest_map_l1e, CONFIG_PAGING_LEVELS, CONFIG_PAGING_LEVELS)
     5.9 +    (struct vcpu *v, unsigned long va, unsigned long *gl1mfn);
    5.10 +extern void
    5.11 +SHADOW_INTERNAL_NAME(sh_guest_get_eff_l1e, CONFIG_PAGING_LEVELS, CONFIG_PAGING_LEVELS)
    5.12 +    (struct vcpu *v, unsigned long va, void *eff_l1e);
    5.13 +
    5.14  #if SHADOW_LEVELS == GUEST_LEVELS
    5.15  extern mfn_t
    5.16  SHADOW_INTERNAL_NAME(sh_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
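
The new multi.h declarations instantiate sh_guest_map_l1e and sh_guest_get_eff_l1e with CONFIG_PAGING_LEVELS for both level arguments, because these helpers walk only the hypervisor's own pagetable format, not the guest's. For readers unfamiliar with the naming scheme, SHADOW_INTERNAL_NAME is a token-pasting macro that produces one symbol per (shadow, guest) level pair; the sketch below shows the general technique and is illustrative only, not necessarily the exact macro body in this tree:

    /* Illustrative: per-level symbols via token pasting.  The two-step
     * definition forces macro arguments (e.g. CONFIG_PAGING_LEVELS) to be
     * expanded before pasting. */
    #define SHADOW_NAME_PASTE(name, s, g) name ## __shadow_ ## s ## _guest_ ## g
    #define SHADOW_NAME(name, s, g)       SHADOW_NAME_PASTE(name, s, g)

    /* SHADOW_NAME(sh_guest_map_l1e, 3, 3)
     *     expands to   sh_guest_map_l1e__shadow_3_guest_3          */
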
     6.1 --- a/xen/arch/x86/mm/shadow/private.h	Thu Sep 28 17:09:11 2006 +0100
     6.2 +++ b/xen/arch/x86/mm/shadow/private.h	Thu Sep 28 17:10:54 2006 +0100
     6.3 @@ -532,55 +532,6 @@ static inline void sh_unpin(struct vcpu 
     6.4      }
     6.5  }
     6.6  
     6.7 -/**************************************************************************/
     6.8 -/* Guest physmap (p2m) support */
     6.9 -
    6.10 -/* Read our own P2M table, checking in the linear pagetables first to be
    6.11 - * sure that we will succeed.  Call this function if you expect it to
    6.12 - * fail often, as it avoids page faults.  If you expect to succeed, use
    6.13 - * vcpu_gfn_to_mfn, which copy_from_user()s the entry */
    6.14 -static inline mfn_t
    6.15 -vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn)
    6.16 -{
    6.17 -    unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn];
    6.18 -#if CONFIG_PAGING_LEVELS >= 4
    6.19 -    l4_pgentry_t *l4e;
    6.20 -    l3_pgentry_t *l3e;
    6.21 -#endif
    6.22 -    l2_pgentry_t *l2e;
    6.23 -    l1_pgentry_t *l1e;
    6.24 -
    6.25 -    ASSERT(current == v);
    6.26 -    if ( !shadow_vcpu_mode_translate(v) )
    6.27 -        return _mfn(gfn);
    6.28 -
    6.29 -#if CONFIG_PAGING_LEVELS > 2
    6.30 -    if ( gfn >= (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
    6.31 -        /* This pfn is higher than the p2m map can hold */
    6.32 -        return _mfn(INVALID_MFN);
    6.33 -#endif
    6.34 -    
    6.35 -    /* Walk the linear pagetables.  Note that this is *not* the same as 
    6.36 -     * the walk in sh_gfn_to_mfn_foreign, which is walking the p2m map */
    6.37 -#if CONFIG_PAGING_LEVELS >= 4
    6.38 -    l4e = __linear_l4_table + l4_linear_offset(entry_addr);
    6.39 -    if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
    6.40 -    l3e = __linear_l3_table + l3_linear_offset(entry_addr);
    6.41 -    if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
    6.42 -#endif
    6.43 -    l2e = __linear_l2_table + l2_linear_offset(entry_addr);
    6.44 -    if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
    6.45 -    l1e = __linear_l1_table + l1_linear_offset(entry_addr);
    6.46 -    if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
    6.47 -
    6.48 -    /* Safe to look at this part of the table */
    6.49 -    if ( l1e_get_flags(phys_to_machine_mapping[gfn])  & _PAGE_PRESENT )
    6.50 -        return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn]));
    6.51 -    
    6.52 -    return _mfn(INVALID_MFN);
    6.53 -}
    6.54 -
    6.55 -
    6.56  #endif /* _XEN_SHADOW_PRIVATE_H */
    6.57  
    6.58  /*
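
With vcpu_gfn_to_mfn_nofault() removed, callers that tolerate a page fault are expected to read the p2m entry directly, as the deleted comment describes for vcpu_gfn_to_mfn: fetch the entry with a fault-safe copy and treat a failed copy or a non-present entry as "no translation". A hypothetical sketch of that style of lookup, assuming only identifiers that appear elsewhere in this patch (this is not the in-tree vcpu_gfn_to_mfn implementation):

    /* Sketch: translate gfn -> mfn by reading our own p2m table with a
     * fault-tolerant copy, so an unmapped p2m slot is reported as
     * INVALID_MFN instead of crashing the hypervisor. */
    static inline mfn_t sketch_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
    {
        l1_pgentry_t entry;

        if ( !shadow_vcpu_mode_translate(v) )
            return _mfn(gfn);                 /* non-translated: gfn == mfn */

        if ( __copy_from_user(&entry, &phys_to_machine_mapping[gfn],
                              sizeof(entry)) != 0 )
            return _mfn(INVALID_MFN);         /* p2m slot not mapped */

        if ( !(l1e_get_flags(entry) & _PAGE_PRESENT) )
            return _mfn(INVALID_MFN);

        return _mfn(l1e_get_pfn(entry));
    }
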
     7.1 --- a/xen/arch/x86/mm/shadow/types.h	Thu Sep 28 17:09:11 2006 +0100
     7.2 +++ b/xen/arch/x86/mm/shadow/types.h	Thu Sep 28 17:10:54 2006 +0100
     7.3 @@ -205,6 +205,9 @@ static inline shadow_l4e_t shadow_l4e_fr
     7.4      __sh_linear_l1_table; \
     7.5  })
     7.6  
     7.7 +// XXX -- these should not be conditional on hvm_guest(v), but rather on
     7.8 +//        shadow_mode_external(d)...
     7.9 +//
    7.10  #define sh_linear_l2_table(v) ({ \
    7.11      ASSERT(current == (v)); \
    7.12      ((shadow_l2e_t *) \
    7.13 @@ -507,10 +510,22 @@ struct shadow_walk_t
    7.14  #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
    7.15  #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
    7.16  
    7.17 +/* The sh_guest_(map|get)_* functions only depend on the number of
    7.18 + * configured paging levels.
    7.19 + */
    7.20 +#define sh_guest_map_l1e                                       \
    7.21 +        SHADOW_INTERNAL_NAME(sh_guest_map_l1e,                \
    7.22 +                              CONFIG_PAGING_LEVELS,             \
    7.23 +                              CONFIG_PAGING_LEVELS)
    7.24 +#define sh_guest_get_eff_l1e                                   \
    7.25 +        SHADOW_INTERNAL_NAME(sh_guest_get_eff_l1e,            \
    7.26 +                              CONFIG_PAGING_LEVELS,             \
    7.27 +                              CONFIG_PAGING_LEVELS)
    7.28 +
    7.29  /* sh_make_monitor_table only depends on the number of shadow levels */
    7.30 -#define sh_make_monitor_table                          \
    7.31 -        SHADOW_INTERNAL_NAME(sh_make_monitor_table,   \
    7.32 -                              SHADOW_PAGING_LEVELS,     \
    7.33 +#define sh_make_monitor_table                                  \
    7.34 +        SHADOW_INTERNAL_NAME(sh_make_monitor_table,           \
    7.35 +                              SHADOW_PAGING_LEVELS,             \
    7.36                                SHADOW_PAGING_LEVELS)
    7.37  #define sh_destroy_monitor_table                               \
    7.38          SHADOW_INTERNAL_NAME(sh_destroy_monitor_table,        \
    7.39 @@ -652,7 +667,7 @@ static inline void sh_unpin_l3_subshadow
    7.40  #endif /* GUEST_PAGING_LEVELS >= 3 */
    7.41  
    7.42  static inline u32
    7.43 -accumulate_guest_flags(walk_t *gw)
    7.44 +accumulate_guest_flags(struct vcpu *v, walk_t *gw)
    7.45  {
    7.46      u32 accumulated_flags;
    7.47  
    7.48 @@ -674,9 +689,15 @@ accumulate_guest_flags(walk_t *gw)
    7.49      accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT;
    7.50  #endif
    7.51  
    7.52 -    // Finally, revert the NX bit back to its original polarity
    7.53 +    // Revert the NX bit back to its original polarity
    7.54      accumulated_flags ^= _PAGE_NX_BIT;
    7.55  
    7.56 +    // In 64-bit PV guests, the _PAGE_USER bit is implied in all guest
    7.57 +    // entries (since even the guest kernel runs in ring 3).
    7.58 +    //
    7.59 +    if ( (GUEST_PAGING_LEVELS == 4) && !hvm_guest(v) )
    7.60 +        accumulated_flags |= _PAGE_USER;
    7.61 +
    7.62      return accumulated_flags;
    7.63  }
    7.64  
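
The XOR-before-AND-then-XOR sequence in accumulate_guest_flags() exists because most permission bits accumulate with AND (every level must grant them) while NX accumulates with OR (any level may forbid execution); inverting NX first lets a single AND handle both, and the final XOR restores its normal polarity. The new last step then forces _PAGE_USER on for 64-bit PV guests, whose kernels also run in ring 3. A compressed two-level sketch of the same logic, using the walk_t fields visible in this file:

    /* Sketch: with NX inverted, "permission granted" is 1 for every bit,
     * so ANDing across levels yields the effective rights; the final XOR
     * puts NX back to its usual "1 = no-execute" meaning. */
    u32 flags = guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT;
    flags    &= guest_l2e_get_flags(*gw->l2e)    ^ _PAGE_NX_BIT;
    flags    ^= _PAGE_NX_BIT;

    if ( (GUEST_PAGING_LEVELS == 4) && !hvm_guest(v) )
        flags |= _PAGE_USER;       /* 64-bit PV kernel runs in ring 3 */
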
     8.1 --- a/xen/arch/x86/traps.c	Thu Sep 28 17:09:11 2006 +0100
     8.2 +++ b/xen/arch/x86/traps.c	Thu Sep 28 17:10:54 2006 +0100
     8.3 @@ -886,7 +886,7 @@ static int fixup_page_fault(unsigned lon
     8.4           /* Do not check if access-protection fault since the page may 
     8.5              legitimately be not present in shadow page tables */
     8.6           ((regs->error_code & PFEC_write_access) == PFEC_write_access) &&
     8.7 -         ptwr_do_page_fault(d, addr, regs) )
     8.8 +         ptwr_do_page_fault(v, addr, regs) )
     8.9          return EXCRET_fault_fixed;
    8.10  
    8.11      if ( shadow_mode_enabled(d) )
     9.1 --- a/xen/include/asm-x86/domain.h	Thu Sep 28 17:09:11 2006 +0100
     9.2 +++ b/xen/include/asm-x86/domain.h	Thu Sep 28 17:10:54 2006 +0100
     9.3 @@ -139,7 +139,7 @@ struct shadow_vcpu {
     9.4      /* Last MFN that we emulated a write to. */
     9.5      unsigned long last_emulated_mfn;
     9.6      /* HVM guest: paging enabled (CR0.PG)?  */
     9.7 -    unsigned int hvm_paging_enabled:1;
     9.8 +    unsigned int translate_enabled:1;
     9.9      /* Emulated fault needs to be propagated to guest? */
    9.10      unsigned int propagate_fault:1;
    9.11  #if CONFIG_PAGING_LEVELS >= 3
    10.1 --- a/xen/include/asm-x86/guest_access.h	Thu Sep 28 17:09:11 2006 +0100
    10.2 +++ b/xen/include/asm-x86/guest_access.h	Thu Sep 28 17:10:54 2006 +0100
    10.3 @@ -8,6 +8,7 @@
    10.4  #define __ASM_X86_GUEST_ACCESS_H__
    10.5  
    10.6  #include <asm/uaccess.h>
    10.7 +#include <asm/shadow.h>
    10.8  #include <asm/hvm/support.h>
    10.9  #include <asm/hvm/guest_access.h>
   10.10  
   10.11 @@ -33,7 +34,7 @@
   10.12  #define copy_to_guest_offset(hnd, off, ptr, nr) ({      \
   10.13      const typeof(ptr) _x = (hnd).p;                     \
   10.14      const typeof(ptr) _y = (ptr);                       \
   10.15 -    hvm_guest(current) ?                                \
   10.16 +    shadow_mode_translate(current->domain) ?            \
   10.17      copy_to_user_hvm(_x+(off), _y, sizeof(*_x)*(nr)) :  \
   10.18      copy_to_user(_x+(off), _y, sizeof(*_x)*(nr));       \
   10.19  })
   10.20 @@ -45,7 +46,7 @@
   10.21  #define copy_from_guest_offset(ptr, hnd, off, nr) ({    \
   10.22      const typeof(ptr) _x = (hnd).p;                     \
   10.23      const typeof(ptr) _y = (ptr);                       \
   10.24 -    hvm_guest(current) ?                                \
   10.25 +    shadow_mode_translate(current->domain) ?            \
   10.26      copy_from_user_hvm(_y, _x+(off), sizeof(*_x)*(nr)) :\
   10.27      copy_from_user(_y, _x+(off), sizeof(*_x)*(nr));     \
   10.28  })
   10.29 @@ -54,7 +55,7 @@
   10.30  #define copy_field_to_guest(hnd, ptr, field) ({         \
   10.31      const typeof(&(ptr)->field) _x = &(hnd).p->field;   \
   10.32      const typeof(&(ptr)->field) _y = &(ptr)->field;     \
   10.33 -    hvm_guest(current) ?                                \
   10.34 +    shadow_mode_translate(current->domain) ?            \
   10.35      copy_to_user_hvm(_x, _y, sizeof(*_x)) :             \
   10.36      copy_to_user(_x, _y, sizeof(*_x));                  \
   10.37  })
   10.38 @@ -63,7 +64,7 @@
   10.39  #define copy_field_from_guest(ptr, hnd, field) ({       \
   10.40      const typeof(&(ptr)->field) _x = &(hnd).p->field;   \
   10.41      const typeof(&(ptr)->field) _y = &(ptr)->field;     \
   10.42 -    hvm_guest(current) ?                                \
   10.43 +    shadow_mode_translate(current->domain) ?            \
   10.44      copy_from_user_hvm(_y, _x, sizeof(*_x)) :           \
   10.45      copy_from_user(_y, _x, sizeof(*_x));                \
   10.46  })
   10.47 @@ -73,12 +74,13 @@
   10.48   * Allows use of faster __copy_* functions.
   10.49   */
   10.50  #define guest_handle_okay(hnd, nr)                      \
   10.51 -    (hvm_guest(current) || array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))
   10.52 +    (shadow_mode_external(current->domain) ||           \
   10.53 +     array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))
   10.54  
   10.55  #define __copy_to_guest_offset(hnd, off, ptr, nr) ({    \
   10.56      const typeof(ptr) _x = (hnd).p;                     \
   10.57      const typeof(ptr) _y = (ptr);                       \
   10.58 -    hvm_guest(current) ?                                \
   10.59 +    shadow_mode_translate(current->domain) ?            \
   10.60      copy_to_user_hvm(_x+(off), _y, sizeof(*_x)*(nr)) :  \
   10.61      __copy_to_user(_x+(off), _y, sizeof(*_x)*(nr));     \
   10.62  })
   10.63 @@ -86,7 +88,7 @@
   10.64  #define __copy_from_guest_offset(ptr, hnd, off, nr) ({  \
   10.65      const typeof(ptr) _x = (hnd).p;                     \
   10.66      const typeof(ptr) _y = (ptr);                       \
   10.67 -    hvm_guest(current) ?                                \
   10.68 +    shadow_mode_translate(current->domain) ?            \
   10.69      copy_from_user_hvm(_y, _x+(off),sizeof(*_x)*(nr)) : \
   10.70      __copy_from_user(_y, _x+(off), sizeof(*_x)*(nr));   \
   10.71  })
   10.72 @@ -94,7 +96,7 @@
   10.73  #define __copy_field_to_guest(hnd, ptr, field) ({       \
   10.74      const typeof(&(ptr)->field) _x = &(hnd).p->field;   \
   10.75      const typeof(&(ptr)->field) _y = &(ptr)->field;     \
   10.76 -    hvm_guest(current) ?                                \
   10.77 +    shadow_mode_translate(current->domain) ?            \
   10.78      copy_to_user_hvm(_x, _y, sizeof(*_x)) :             \
   10.79      __copy_to_user(_x, _y, sizeof(*_x));                \
   10.80  })
   10.81 @@ -102,7 +104,7 @@
   10.82  #define __copy_field_from_guest(ptr, hnd, field) ({     \
   10.83      const typeof(&(ptr)->field) _x = &(hnd).p->field;   \
   10.84      const typeof(&(ptr)->field) _y = &(ptr)->field;     \
   10.85 -    hvm_guest(current) ?                                \
   10.86 +    shadow_mode_translate(current->domain) ?            \
   10.87      copy_from_user_hvm(_x, _y, sizeof(*_x)) :           \
   10.88      __copy_from_user(_y, _x, sizeof(*_x));              \
   10.89  })
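
These guest_access.h changes switch the selector from hvm_guest(current) to shadow_mode_translate(current->domain), so translated PV guests take the copy_*_user_hvm() path (which translates the handle through the p2m) while classic PV guests keep the direct copy_*_user() path. Callers are unaffected; the choice is made inside the macro. A minimal usage sketch, where xen_foo_t and the handle variable arg are illustrative names only:

    /* Illustrative hypercall handler: the same line serves classic PV,
     * translated PV and HVM guests; the macro picks copy_from_user() or
     * copy_from_user_hvm() based on shadow_mode_translate(). */
    xen_foo_t op;                           /* hypothetical structure   */
    if ( copy_from_guest(&op, arg, 1) )     /* arg: XEN_GUEST_HANDLE    */
        return -EFAULT;
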
    11.1 --- a/xen/include/asm-x86/mm.h	Thu Sep 28 17:09:11 2006 +0100
    11.2 +++ b/xen/include/asm-x86/mm.h	Thu Sep 28 17:10:54 2006 +0100
    11.3 @@ -348,7 +348,7 @@ void memguard_unguard_range(void *p, uns
    11.4  
    11.5  void memguard_guard_stack(void *p);
    11.6  
    11.7 -int  ptwr_do_page_fault(struct domain *, unsigned long,
    11.8 +int  ptwr_do_page_fault(struct vcpu *, unsigned long,
    11.9                          struct cpu_user_regs *);
   11.10  
   11.11  int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
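
The ptwr_do_page_fault() prototype now takes the faulting vcpu rather than its domain, which lets the writable-pagetable path use the new per-vcpu, mode-aware l1e helpers instead of reading the linear pagetables directly. The sketch below is a hypothetical illustration of that calling pattern under this assumption, not the actual ptwr implementation from this changeset:

    /* Hypothetical sketch: read the effective guest l1e for the faulting
     * address through the mode-aware helper, then decide whether the
     * write should be emulated as a pagetable update. */
    int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
                           struct cpu_user_regs *regs)
    {
        l1_pgentry_t pte;

        guest_get_eff_l1e(v, addr, &pte);
        /* ... check pte points at a live guest pagetable, emulate ... */
        return 0;
    }
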
    12.1 --- a/xen/include/asm-x86/shadow.h	Thu Sep 28 17:09:11 2006 +0100
    12.2 +++ b/xen/include/asm-x86/shadow.h	Thu Sep 28 17:10:54 2006 +0100
    12.3 @@ -26,6 +26,7 @@
    12.4  #include <public/domctl.h> 
    12.5  #include <xen/sched.h>
    12.6  #include <xen/perfc.h>
    12.7 +#include <xen/domain_page.h>
    12.8  #include <asm/flushtlb.h>
    12.9  
   12.10  /* How to make sure a page is not referred to in a shadow PT */
   12.11 @@ -245,7 +246,9 @@ shadow_vcpu_mode_translate(struct vcpu *
   12.12      // enabled.  (HVM vcpu's with paging disabled are using the p2m table as
   12.13      // its paging table, so no translation occurs in this case.)
   12.14      //
   12.15 -    return v->arch.shadow.hvm_paging_enabled;
   12.16 +    // It is also true for translated PV domains.
   12.17 +    //
   12.18 +    return v->arch.shadow.translate_enabled;
   12.19  }
   12.20  
   12.21  
   12.22 @@ -287,6 +290,10 @@ struct shadow_paging_mode {
   12.23                                              struct x86_emulate_ctxt *ctxt);
   12.24      mfn_t         (*make_monitor_table    )(struct vcpu *v);
   12.25      void          (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
   12.26 +    void *        (*guest_map_l1e         )(struct vcpu *v, unsigned long va,
   12.27 +                                            unsigned long *gl1mfn);
   12.28 +    void          (*guest_get_eff_l1e     )(struct vcpu *v, unsigned long va,
   12.29 +                                            void *eff_l1e);
   12.30  #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
   12.31      int           (*guess_wrmap           )(struct vcpu *v, 
   12.32                                              unsigned long vaddr, mfn_t gmfn);
   12.33 @@ -452,9 +459,73 @@ shadow_destroy_monitor_table(struct vcpu
   12.34      v->arch.shadow.mode->destroy_monitor_table(v, mmfn);
   12.35  }
   12.36  
   12.37 +static inline void *
   12.38 +guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn)
   12.39 +{
   12.40 +    if ( likely(!shadow_mode_translate(v->domain)) )
   12.41 +    {
   12.42 +        l2_pgentry_t l2e;
   12.43 +        ASSERT(!shadow_mode_external(v->domain));
   12.44 +        /* Find this l1e and its enclosing l1mfn in the linear map */
   12.45 +        if ( __copy_from_user(&l2e, 
   12.46 +                              &__linear_l2_table[l2_linear_offset(addr)],
   12.47 +                              sizeof(l2_pgentry_t)) != 0 )
   12.48 +            return NULL;
   12.49 +        /* Check flags that it will be safe to read the l1e */
   12.50 +        if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) 
   12.51 +             != _PAGE_PRESENT )
   12.52 +            return NULL;
   12.53 +        *gl1mfn = l2e_get_pfn(l2e);
   12.54 +        return &__linear_l1_table[l1_linear_offset(addr)];
   12.55 +    }
   12.56 +
   12.57 +    return v->arch.shadow.mode->guest_map_l1e(v, addr, gl1mfn);
   12.58 +}
   12.59 +
   12.60 +static inline void
   12.61 +guest_unmap_l1e(struct vcpu *v, void *p)
   12.62 +{
   12.63 +    if ( unlikely(shadow_mode_translate(v->domain)) )
   12.64 +        unmap_domain_page(p);
   12.65 +}
   12.66 +
   12.67 +static inline void
   12.68 +guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
   12.69 +{
   12.70 +    if ( likely(!shadow_mode_translate(v->domain)) )
   12.71 +    {
   12.72 +        ASSERT(!shadow_mode_external(v->domain));
   12.73 +        if ( __copy_from_user(eff_l1e, 
   12.74 +                              &__linear_l1_table[l1_linear_offset(addr)],
   12.75 +                              sizeof(l1_pgentry_t)) != 0 )
   12.76 +            *(l1_pgentry_t *)eff_l1e = l1e_empty();
   12.77 +        return;
   12.78 +    }
   12.79 +        
   12.80 +    v->arch.shadow.mode->guest_get_eff_l1e(v, addr, eff_l1e);
   12.81 +}
   12.82 +
   12.83 +static inline void
   12.84 +guest_get_eff_kern_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
   12.85 +{
   12.86 +#if defined(__x86_64__)
   12.87 +    int user_mode = !(v->arch.flags & TF_kernel_mode);
   12.88 +#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
   12.89 +#else
   12.90 +#define TOGGLE_MODE() ((void)0)
   12.91 +#endif
   12.92 +
   12.93 +    TOGGLE_MODE();
   12.94 +    guest_get_eff_l1e(v, addr, eff_l1e);
   12.95 +    TOGGLE_MODE();
   12.96 +}
   12.97 +
   12.98 +
   12.99  /* Validate a pagetable change from the guest and update the shadows. */
  12.100  extern int shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
  12.101                                          void *new_guest_entry);
  12.102 +extern int __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
  12.103 +                                         void *entry, u32 size);
  12.104  
  12.105  /* Update the shadows in response to a pagetable write from a HVM guest */
  12.106  extern void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, 
  12.107 @@ -629,7 +700,14 @@ sh_mfn_to_gfn(struct domain *d, mfn_t mf
  12.108          return mfn_x(mfn);
  12.109  }
  12.110  
  12.111 -
  12.112 +static inline l1_pgentry_t
  12.113 +gl1e_to_ml1e(struct domain *d, l1_pgentry_t l1e)
  12.114 +{
  12.115 +    if ( unlikely(shadow_mode_translate(d)) )
  12.116 +        l1e = l1e_from_pfn(gmfn_to_mfn(d, l1e_get_pfn(l1e)),
  12.117 +                           l1e_get_flags(l1e));
  12.118 +    return l1e;
  12.119 +}
  12.120  
  12.121  #endif /* _XEN_SHADOW_H */
  12.122
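
The new shadow.h helpers are intended to be used as a set: guest_map_l1e() returns either a linear-map pointer (classic PV) or a real domain-page mapping (translated guests), gl1e_to_ml1e() converts the entry's gfn to an mfn when translation is in effect, and guest_unmap_l1e() releases the mapping, which is a no-op outside shadow_mode_translate(). A hypothetical caller, with illustrative variable names and error handling elided:

    /* Sketch of the calling convention inside some handler(v, addr);
     * not code from the patch.  In the PV case pl1e aliases the linear
     * pagetables, so a real caller would read it with a fault-tolerant
     * copy rather than a plain dereference. */
    unsigned long gl1mfn;
    l1_pgentry_t *pl1e, gl1e, ml1e;

    pl1e = guest_map_l1e(v, addr, &gl1mfn);
    if ( pl1e == NULL )
        return 0;                            /* no l1 mapped here        */

    gl1e = *pl1e;                            /* guest's view of the PTE  */
    ml1e = gl1e_to_ml1e(v->domain, gl1e);    /* gfn -> mfn if translated */

    /* ... use ml1e, or validate a write against gl1mfn ... */

    guest_unmap_l1e(v, pl1e);                /* no-op unless translated  */
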