ia64/xen-unstable

changeset 11172:0f917d63e960

Replace shadow pagetable code with shadow2.
author tdeegan@york.uk.xensource.com
date Wed Aug 16 17:02:35 2006 +0100 (2006-08-16)
parents fda70200da01
children e66352312acb
files .hgtags tools/examples/xmexample.hvm tools/libxc/xc_domain.c tools/libxc/xc_hvm_build.c tools/libxc/xc_linux_build.c tools/libxc/xc_linux_save.c tools/libxc/xenctrl.h tools/misc/xc_shadow.c tools/python/xen/lowlevel/xc/xc.c tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/image.py tools/python/xen/xm/create.py xen/arch/x86/Makefile xen/arch/x86/audit.c xen/arch/x86/dom0_ops.c xen/arch/x86/domain.c xen/arch/x86/domain_build.c xen/arch/x86/hvm/hvm.c xen/arch/x86/hvm/platform.c xen/arch/x86/hvm/svm/svm.c xen/arch/x86/hvm/svm/vmcb.c xen/arch/x86/hvm/vlapic.c xen/arch/x86/hvm/vmx/vmcs.c xen/arch/x86/hvm/vmx/vmx.c xen/arch/x86/mm.c xen/arch/x86/setup.c xen/arch/x86/shadow.c xen/arch/x86/shadow2-common.c xen/arch/x86/shadow2.c xen/arch/x86/shadow32.c xen/arch/x86/shadow_guest32.c xen/arch/x86/shadow_guest32pae.c xen/arch/x86/shadow_public.c xen/arch/x86/smpboot.c xen/arch/x86/traps.c xen/arch/x86/x86_32/domain_page.c xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_64/mm.c xen/arch/x86/x86_64/traps.c xen/common/acm_ops.c xen/common/grant_table.c xen/common/keyhandler.c xen/common/memory.c xen/drivers/char/console.c xen/include/asm-x86/bitops.h xen/include/asm-x86/config.h xen/include/asm-x86/domain.h xen/include/asm-x86/grant_table.h xen/include/asm-x86/hvm/hvm.h xen/include/asm-x86/hvm/support.h xen/include/asm-x86/hvm/vcpu.h xen/include/asm-x86/hvm/vmx/vmcs.h xen/include/asm-x86/hvm/vmx/vmx.h xen/include/asm-x86/mm.h xen/include/asm-x86/msr.h xen/include/asm-x86/page-guest32.h xen/include/asm-x86/page.h xen/include/asm-x86/perfc_defn.h xen/include/asm-x86/processor.h xen/include/asm-x86/shadow.h xen/include/asm-x86/shadow2-multi.h xen/include/asm-x86/shadow2-private.h xen/include/asm-x86/shadow2-types.h xen/include/asm-x86/shadow2.h xen/include/asm-x86/shadow_64.h xen/include/asm-x86/shadow_ops.h xen/include/asm-x86/shadow_public.h xen/include/asm-x86/x86_32/page-2level.h xen/include/asm-x86/x86_32/page-3level.h xen/include/asm-x86/x86_64/page.h xen/include/public/dom0_ops.h xen/include/xen/domain_page.h xen/include/xen/lib.h xen/include/xen/list.h xen/include/xen/sched.h
line diff
     1.1 --- a/.hgtags	Wed Aug 16 16:16:32 2006 +0100
     1.2 +++ b/.hgtags	Wed Aug 16 17:02:35 2006 +0100
     1.3 @@ -15,3 +15,13 @@ 3d330e41f41ce1bc118c02346e18949ad5d67f6b
     1.4  c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0
     1.5  af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc
     1.6  d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched
     1.7 +6e864d7de9db066f92bea505d256bfe286200fed last-code-review
     1.8 +a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline
     1.9 +bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review
    1.10 +fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable
    1.11 +8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline
    1.12 +2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline
    1.13 +0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline
    1.14 +88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline
    1.15 +5233c4b076b9aa073eff63508461b7bfa597737c mainline
    1.16 +fda70200da01b89d5339342df6c0db372369a16d mainline
     2.1 --- a/tools/examples/xmexample.hvm	Wed Aug 16 16:16:32 2006 +0100
     2.2 +++ b/tools/examples/xmexample.hvm	Wed Aug 16 17:02:35 2006 +0100
     2.3 @@ -27,6 +27,10 @@ builder='hvm'
     2.4  #          and modules. Allocating less than 32MBs is not recommended.
     2.5  memory = 128
     2.6  
     2.7 +# Shadow pagetable memory for the domain, in MB.
     2.8 +# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu.
     2.9 +shadow_memory = 8
    2.10 +
    2.11  # A name for your domain. All domains must have different names.
    2.12  name = "ExampleHVMDomain"
    2.13  
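
Reading the sizing comment in the xmexample.hvm hunk above literally: the example domain has memory = 128 and a single vcpu, so 2KB per MB of guest RAM comes to only 256KB, plus a few MB for the vcpu, and the shadow_memory = 8 default leaves comfortable headroom. A larger guest, say 4GB with 4 vcpus, would want roughly 8MB for the RAM term plus a few MB per vcpu (illustrative figures, derived only from the comment's rule of thumb).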
     3.1 --- a/tools/libxc/xc_domain.c	Wed Aug 16 16:16:32 2006 +0100
     3.2 +++ b/tools/libxc/xc_domain.c	Wed Aug 16 17:02:35 2006 +0100
     3.3 @@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle,
     3.4                        unsigned int sop,
     3.5                        unsigned long *dirty_bitmap,
     3.6                        unsigned long pages,
     3.7 -                      xc_shadow_control_stats_t *stats )
     3.8 +                      unsigned long *mb,
     3.9 +                      uint32_t mode,
    3.10 +                      xc_shadow_control_stats_t *stats)
    3.11  {
    3.12      int rc;
    3.13      DECLARE_DOM0_OP;
    3.14      op.cmd = DOM0_SHADOW_CONTROL;
    3.15      op.u.shadow_control.domain = (domid_t)domid;
    3.16      op.u.shadow_control.op     = sop;
    3.17 +    op.u.shadow_control.pages  = pages;
    3.18 +    op.u.shadow_control.mb     = mb ? *mb : 0;
    3.19 +    op.u.shadow_control.mode   = mode;
    3.20      set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
    3.21 -    op.u.shadow_control.pages  = pages;
    3.22  
    3.23      rc = do_dom0_op(xc_handle, &op);
    3.24  
    3.25      if ( stats )
    3.26          memcpy(stats, &op.u.shadow_control.stats,
    3.27                 sizeof(xc_shadow_control_stats_t));
    3.28 +    
    3.29 +    if ( mb ) 
    3.30 +        *mb = op.u.shadow_control.mb;
    3.31  
    3.32      return (rc == 0) ? op.u.shadow_control.pages : rc;
    3.33  }
    3.34 @@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(in
    3.35  
    3.36      if ( err > 0 )
    3.37      {
    3.38 -        DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n",
    3.39 +        DPRINTF("Failed allocation for dom %d: %ld pages order %d\n",
    3.40                  domid, nr_extents, extent_order);
    3.41          errno = EBUSY;
    3.42          err = -1;
     4.1 --- a/tools/libxc/xc_hvm_build.c	Wed Aug 16 16:16:32 2006 +0100
     4.2 +++ b/tools/libxc/xc_hvm_build.c	Wed Aug 16 17:02:35 2006 +0100
     4.3 @@ -396,6 +396,19 @@ static int xc_hvm_build_internal(int xc_
     4.4          goto error_out;
     4.5      }
     4.6  
     4.7 +    /* HVM domains must be put into shadow2 mode at the start of day */
     4.8 +    if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE,
     4.9 +                           NULL, 0, NULL, 
    4.10 +                           DOM0_SHADOW2_CONTROL_FLAG_ENABLE 
    4.11 +                           | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT
    4.12 +                           | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE
    4.13 +                           | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL, 
    4.14 +                           NULL) ) 
    4.15 +    {
    4.16 +        PERROR("Could not enable shadow paging for domain.\n");
    4.17 +        goto error_out;
    4.18 +    }        
    4.19 +
    4.20      memset(ctxt, 0, sizeof(*ctxt));
    4.21  
    4.22      ctxt->flags = VGCF_HVM_GUEST;
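
As the flag names suggest, the combination requested here (ENABLE | REFCOUNT | TRANSLATE | EXTERNAL) puts the new HVM domain into full shadow2 operation from the start of day: shadow-based reference counting, a Xen-maintained guest-physical-to-machine translation, and monitor page tables held outside the guest. This reading is inferred from the flag names and from how shadow mode is used elsewhere in the changeset; the hunk itself does not spell it out.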
     5.1 --- a/tools/libxc/xc_linux_build.c	Wed Aug 16 16:16:32 2006 +0100
     5.2 +++ b/tools/libxc/xc_linux_build.c	Wed Aug 16 17:02:35 2006 +0100
     5.3 @@ -972,7 +972,7 @@ static int setup_guest(int xc_handle,
     5.4          /* Enable shadow translate mode */
     5.5          if ( xc_shadow_control(xc_handle, dom,
     5.6                                 DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
     5.7 -                               NULL, 0, NULL) < 0 )
     5.8 +                               NULL, 0, NULL, 0, NULL) < 0 )
     5.9          {
    5.10              PERROR("Could not enable translation mode");
    5.11              goto error_out;
     6.1 --- a/tools/libxc/xc_linux_save.c	Wed Aug 16 16:16:32 2006 +0100
     6.2 +++ b/tools/libxc/xc_linux_save.c	Wed Aug 16 17:02:35 2006 +0100
     6.3 @@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle,
     6.4          int i;
     6.5  
     6.6          xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
     6.7 -                          arr, max_pfn, NULL);
     6.8 +                          arr, max_pfn, NULL, 0, NULL);
     6.9          DPRINTF("#Flush\n");
    6.10          for ( i = 0; i < 40; i++ ) {
    6.11              usleep(50000);
    6.12              now = llgettimeofday();
    6.13              xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
    6.14 -                              NULL, 0, &stats);
    6.15 +                              NULL, 0, NULL, 0, &stats);
    6.16  
    6.17              DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32
    6.18                      " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
    6.19 @@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_
    6.20  
    6.21          if (xc_shadow_control(xc_handle, dom,
    6.22                                DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
    6.23 -                              NULL, 0, NULL ) < 0) {
    6.24 +                              NULL, 0, NULL, 0, NULL) < 0) {
    6.25              ERR("Couldn't enable shadow mode");
    6.26              goto out;
    6.27          }
    6.28 @@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_
    6.29                 but this is fast enough for the moment. */
    6.30              if (!last_iter && xc_shadow_control(
    6.31                      xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
    6.32 -                    to_skip, max_pfn, NULL) != max_pfn) {
    6.33 +                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
    6.34                  ERR("Error peeking shadow bitmap");
    6.35                  goto out;
    6.36              }
    6.37 @@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_
    6.38                          (unsigned long)ctxt.user_regs.edx);
    6.39              }
    6.40  
    6.41 -            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
    6.42 -                                  to_send, max_pfn, &stats ) != max_pfn) {
    6.43 +            if (xc_shadow_control(xc_handle, dom, 
    6.44 +                                  DOM0_SHADOW_CONTROL_OP_CLEAN, to_send, 
    6.45 +                                  max_pfn, NULL, 0, &stats) != max_pfn) {
    6.46                  ERR("Error flushing shadow PT");
    6.47                  goto out;
    6.48              }
    6.49 @@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_
    6.50   out:
    6.51  
    6.52      if (live) {
    6.53 -        if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
    6.54 -                             NULL, 0, NULL ) < 0) {
    6.55 +        if(xc_shadow_control(xc_handle, dom, 
    6.56 +                             DOM0_SHADOW_CONTROL_OP_OFF,
    6.57 +                             NULL, 0, NULL, 0, NULL) < 0) {
    6.58              DPRINTF("Warning - couldn't disable shadow mode");
    6.59          }
    6.60      }
     7.1 --- a/tools/libxc/xenctrl.h	Wed Aug 16 16:16:32 2006 +0100
     7.2 +++ b/tools/libxc/xenctrl.h	Wed Aug 16 17:02:35 2006 +0100
     7.3 @@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle,
     7.4                        unsigned int sop,
     7.5                        unsigned long *dirty_bitmap,
     7.6                        unsigned long pages,
     7.7 +                      unsigned long *mb,
     7.8 +                      uint32_t mode,
     7.9                        xc_shadow_control_stats_t *stats);
    7.10  
    7.11  int xc_bvtsched_global_set(int xc_handle,
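
The prototype above is the user-visible libxc change: xc_shadow_control() now also takes an in/out megabyte count and a mode word for the new shadow2 operations. A minimal sketch of a caller, assuming an already-open xc_handle and a valid domid; the wrapper function, its name and the <stdint.h>/<xenctrl.h> includes are illustrative, not part of the changeset:

    #include <stdint.h>
    #include <xenctrl.h>   /* xc_shadow_control(); the DOM0_SHADOW2_* ops come
                              in via the dom0_ops definitions it pulls in */

    /* Ask Xen to give 'domid' want_mb MB of shadow memory and report what was
     * actually granted; mirrors what pyxc_shadow_mem_control() below does
     * from Python. */
    static long set_shadow_mb(int xc_handle, uint32_t domid,
                              unsigned long want_mb)
    {
        unsigned long mb = want_mb;
        if ( xc_shadow_control(xc_handle, domid,
                               DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION,
                               NULL, 0, &mb, 0, NULL) < 0 )
            return -1;
        /* Per the xc_domain.c hunk above, the hypervisor's reply is copied
         * back into 'mb', so the result may differ from want_mb. */
        return (long)mb;
    }

Passing DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION instead, with mb left at 0, reads the current allocation without changing it.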
     8.1 --- a/tools/misc/xc_shadow.c	Wed Aug 16 16:16:32 2006 +0100
     8.2 +++ b/tools/misc/xc_shadow.c	Wed Aug 16 17:02:35 2006 +0100
     8.3 @@ -60,6 +60,8 @@ int main(int argc, char *argv[])
     8.4                             mode, 
     8.5                             NULL,
     8.6                             0,
     8.7 +                           NULL,
     8.8 +                           0,
     8.9                             NULL) < 0 )
    8.10      {    
    8.11          fprintf(stderr, "Error reseting performance counters: %d (%s)\n",
     9.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Wed Aug 16 16:16:32 2006 +0100
     9.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Wed Aug 16 17:02:35 2006 +0100
     9.3 @@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(Xc
     9.4                           "weight",    weight);
     9.5  }
     9.6  
     9.7 +static PyObject *pyxc_shadow_control(PyObject *self,
     9.8 +                                     PyObject *args,
     9.9 +                                     PyObject *kwds)
    9.10 +{
    9.11 +    XcObject *xc = (XcObject *)self;
    9.12 +
    9.13 +    uint32_t dom;
    9.14 +    int op=0;
    9.15 +
    9.16 +    static char *kwd_list[] = { "dom", "op", NULL };
    9.17 +
    9.18 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
    9.19 +                                      &dom, &op) )
    9.20 +        return NULL;
    9.21 +    
    9.22 +    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL) 
    9.23 +         < 0 )
    9.24 +        return PyErr_SetFromErrno(xc_error);
    9.25 +    
    9.26 +    Py_INCREF(zero);
    9.27 +    return zero;
    9.28 +}
    9.29 +
    9.30 +static PyObject *pyxc_shadow_mem_control(PyObject *self,
    9.31 +                                         PyObject *args,
    9.32 +                                         PyObject *kwds)
    9.33 +{
    9.34 +    XcObject *xc = (XcObject *)self;
    9.35 +    int op;
    9.36 +    uint32_t dom;
    9.37 +    int mbarg = -1;
    9.38 +    unsigned long mb;
    9.39 +
    9.40 +    static char *kwd_list[] = { "dom", "mb", NULL };
    9.41 +
    9.42 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
    9.43 +                                      &dom, &mbarg) )
    9.44 +        return NULL;
    9.45 +    
    9.46 +    if ( mbarg < 0 ) 
    9.47 +        op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION;
    9.48 +    else 
    9.49 +    {
    9.50 +        mb = mbarg;
    9.51 +        op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
    9.52 +    }
    9.53 +    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 )
    9.54 +        return PyErr_SetFromErrno(xc_error);
    9.55 +    
    9.56 +    mbarg = mb;
    9.57 +    return Py_BuildValue("i", mbarg);
    9.58 +}
    9.59 +
    9.60  static PyObject *pyxc_sched_credit_domain_set(XcObject *self,
    9.61                                                PyObject *args,
    9.62                                                PyObject *kwds)
    9.63 @@ -1119,6 +1172,22 @@ static PyMethodDef pyxc_methods[] = {
    9.64        "Returns [dict]: information about Xen"
    9.65        "        [None]: on failure.\n" },
    9.66  
    9.67 +    { "shadow_control", 
    9.68 +      (PyCFunction)pyxc_shadow_control, 
    9.69 +      METH_VARARGS | METH_KEYWORDS, "\n"
    9.70 +      "Set parameter for shadow pagetable interface\n"
    9.71 +      " dom [int]:   Identifier of domain.\n"
    9.72 +      " op [int, 0]: operation\n\n"
    9.73 +      "Returns: [int] 0 on success; -1 on error.\n" },
    9.74 +
    9.75 +    { "shadow_mem_control", 
    9.76 +      (PyCFunction)pyxc_shadow_mem_control, 
    9.77 +      METH_VARARGS | METH_KEYWORDS, "\n"
    9.78 +      "Set or read shadow pagetable memory use\n"
    9.79 +      " dom [int]:   Identifier of domain.\n"
    9.80 +      " mb [int, -1]: MB of shadow memory this domain should have.\n\n"
    9.81 +      "Returns: [int] MB of shadow memory in use by this domain.\n" },
    9.82 +
    9.83      { "domain_setmaxmem", 
    9.84        (PyCFunction)pyxc_domain_setmaxmem, 
    9.85        METH_VARARGS, "\n"
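
From Python the two new bindings above are used as xc.shadow_control(dom, op) and xc.shadow_mem_control(dom, mb=...): per the docstrings, omitting mb (or passing a negative value) reads the domain's current shadow allocation, passing a non-negative mb sets it, and in either case the call returns the number of MB the domain ends up with. The XendDomainInfo.py hunk below uses exactly the mb= form when building a domain.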
    10.1 --- a/tools/python/xen/xend/XendDomain.py	Wed Aug 16 16:16:32 2006 +0100
    10.2 +++ b/tools/python/xen/xend/XendDomain.py	Wed Aug 16 17:02:35 2006 +0100
    10.3 @@ -532,6 +532,30 @@ class XendDomain:
    10.4          except Exception, ex:
    10.5              raise XendError(str(ex))
    10.6  
    10.7 +    def domain_shadow_control(self, domid, op):
    10.8 +        """Shadow page control."""
    10.9 +        dominfo = self.domain_lookup(domid)
   10.10 +        try:
   10.11 +            return xc.shadow_control(dominfo.getDomid(), op)
   10.12 +        except Exception, ex:
   10.13 +            raise XendError(str(ex))
   10.14 +
   10.15 +    def domain_shadow_mem_get(self, domid):
   10.16 +        """Get shadow pagetable memory allocation."""
   10.17 +        dominfo = self.domain_lookup(domid)
   10.18 +        try:
   10.19 +            return xc.shadow_mem_control(dominfo.getDomid())
   10.20 +        except Exception, ex:
   10.21 +            raise XendError(str(ex))
   10.22 +
   10.23 +    def domain_shadow_mem_set(self, domid, mb):
   10.24 +        """Set shadow pagetable memory allocation."""
   10.25 +        dominfo = self.domain_lookup(domid)
   10.26 +        try:
   10.27 +            return xc.shadow_mem_control(dominfo.getDomid(), mb=mb)
   10.28 +        except Exception, ex:
   10.29 +            raise XendError(str(ex))
   10.30 +
   10.31      def domain_sched_credit_get(self, domid):
   10.32          """Get credit scheduler parameters for a domain.
   10.33          """
    11.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Wed Aug 16 16:16:32 2006 +0100
    11.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Wed Aug 16 17:02:35 2006 +0100
    11.3 @@ -30,6 +30,7 @@ import string
    11.4  import time
    11.5  import threading
    11.6  import os
    11.7 +import math
    11.8  
    11.9  import xen.lowlevel.xc
   11.10  from xen.util import asserts
   11.11 @@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [
   11.12  # don't come out of xc in the same form as they are specified in the config
   11.13  # file, so those are handled separately.
   11.14  ROUNDTRIPPING_CONFIG_ENTRIES = [
   11.15 -    ('uuid',       str),
   11.16 -    ('vcpus',      int),
   11.17 -    ('vcpu_avail', int),
   11.18 -    ('cpu_weight', float),
   11.19 -    ('memory',     int),
   11.20 -    ('maxmem',     int),
   11.21 -    ('bootloader', str),
   11.22 +    ('uuid',            str),
   11.23 +    ('vcpus',           int),
   11.24 +    ('vcpu_avail',      int),
   11.25 +    ('cpu_weight',      float),
   11.26 +    ('memory',          int),
   11.27 +    ('shadow_memory',   int),
   11.28 +    ('maxmem',          int),
   11.29 +    ('bootloader',      str),
   11.30      ('bootloader_args', str),
   11.31 -    ('features', str),
   11.32 -    ('localtime', int),
   11.33 +    ('features',        str),
   11.34 +    ('localtime',       int),
   11.35      ]
   11.36  
   11.37  ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
   11.38 @@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFI
   11.39  # entries written to the store that cannot be reconfigured on-the-fly.
   11.40  #
   11.41  VM_STORE_ENTRIES = [
   11.42 -    ('uuid',       str),
   11.43 -    ('vcpus',      int),
   11.44 -    ('vcpu_avail', int),
   11.45 -    ('memory',     int),
   11.46 -    ('maxmem',     int),
   11.47 -    ('start_time', float),
   11.48 +    ('uuid',          str),
   11.49 +    ('vcpus',         int),
   11.50 +    ('vcpu_avail',    int),
   11.51 +    ('memory',        int),
   11.52 +    ('shadow_memory', int),
   11.53 +    ('maxmem',        int),
   11.54 +    ('start_time',    float),
   11.55      ]
   11.56  
   11.57  VM_STORE_ENTRIES += VM_CONFIG_PARAMS
   11.58 @@ -572,6 +575,7 @@ class XendDomainInfo:
   11.59              defaultInfo('vcpu_avail',   lambda: (1 << self.info['vcpus']) - 1)
   11.60  
   11.61              defaultInfo('memory',       lambda: 0)
   11.62 +            defaultInfo('shadow_memory', lambda: 0)
   11.63              defaultInfo('maxmem',       lambda: 0)
   11.64              defaultInfo('bootloader',   lambda: None)
   11.65              defaultInfo('bootloader_args', lambda: None)            
   11.66 @@ -1280,7 +1284,18 @@ class XendDomainInfo:
   11.67              xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024)
   11.68  
   11.69              m = self.image.getDomainMemory(self.info['memory'] * 1024)
   11.70 -            balloon.free(m)
   11.71 +
   11.72 +            # get the domain's shadow memory requirement
   11.73 +            sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0))
   11.74 +            if self.info['shadow_memory'] > sm:
   11.75 +                sm = self.info['shadow_memory']
   11.76 +
   11.77 +            # Make sure there's enough RAM available for the domain
   11.78 +            balloon.free(m + sm * 1024)
   11.79 +
   11.80 +            # Set up the shadow memory
   11.81 +            sm = xc.shadow_mem_control(self.domid, mb=sm)
   11.82 +            self.info['shadow_memory'] = sm
   11.83  
   11.84              init_reservation = self.info['memory'] * 1024
   11.85              if os.uname()[4] in ('ia64', 'ppc64'):
    12.1 --- a/tools/python/xen/xend/image.py	Wed Aug 16 16:16:32 2006 +0100
    12.2 +++ b/tools/python/xen/xend/image.py	Wed Aug 16 17:02:35 2006 +0100
    12.3 @@ -153,6 +153,12 @@ class ImageHandler:
    12.4                  mem_kb += 4*1024;
    12.5          return mem_kb
    12.6  
    12.7 +    def getDomainShadowMemory(self, mem_kb):
    12.8 +        """@return The minimum shadow memory required, in KiB, for a domain 
    12.9 +        with mem_kb KiB of RAM."""
   12.10 +        # PV domains don't need any shadow memory
   12.11 +        return 0
   12.12 +
   12.13      def buildDomain(self):
   12.14          """Build the domain. Define in subclass."""
   12.15          raise NotImplementedError()
   12.16 @@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler):
   12.17              extra_pages = int( math.ceil( extra_mb*1024 / page_kb ))
   12.18          return mem_kb + extra_pages * page_kb
   12.19  
   12.20 +    def getDomainShadowMemory(self, mem_kb):
   12.21 +        """@return The minimum shadow memory required, in KiB, for a domain 
   12.22 +        with mem_kb KiB of RAM."""
   12.23 +        if os.uname()[4] in ('ia64', 'ppc64'):
   12.24 +            # Explicit shadow memory is not a concept 
   12.25 +            return 0
   12.26 +        else:
   12.27 +            # 1MB per vcpu plus 4Kib/Mib of RAM.  This is higher than 
   12.28 +            # the minimum that Xen would allocate if no value were given.
   12.29 +            return 1024 * self.vm.getVCpuCount() + mem_kb / 256
   12.30 +
   12.31      def register_shutdown_watch(self):
   12.32          """ add xen store watch on control/shutdown """
   12.33          self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \
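
Worked through with illustrative numbers: for an HVM guest with 2 vcpus and 1GB (1048576 KiB) of RAM, getDomainShadowMemory() above returns 1024*2 + 1048576/256 = 2048 + 4096 = 6144 KiB. The XendDomainInfo.py hunk earlier then rounds this up with math.ceil(6144/1024.0) to 6MB, takes the larger of that and any configured shadow_memory, balloons out an extra sm*1024 KiB of dom0 memory, and finally records whatever xc.shadow_mem_control() reports as actually allocated.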
    13.1 --- a/tools/python/xen/xm/create.py	Wed Aug 16 16:16:32 2006 +0100
    13.2 +++ b/tools/python/xen/xm/create.py	Wed Aug 16 17:02:35 2006 +0100
    13.3 @@ -158,6 +158,10 @@ gopts.var('maxmem', val='MEMORY',
    13.4            fn=set_int, default=None,
    13.5            use="Maximum domain memory in MB.")
    13.6  
    13.7 +gopts.var('shadow_memory', val='MEMORY',
    13.8 +          fn=set_int, default=0,
    13.9 +          use="Domain shadow memory in MB.")
   13.10 +
   13.11  gopts.var('cpu', val='CPU',
   13.12            fn=set_int, default=None,
   13.13            use="CPU to run the VCPU0 on.")
   13.14 @@ -666,8 +670,9 @@ def make_config(vals):
   13.15              if v:
   13.16                  config.append([n, v])
   13.17  
   13.18 -    map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff',
   13.19 -                   'on_reboot', 'on_crash', 'vcpus', 'features'])
   13.20 +    map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
   13.21 +                   'restart', 'on_poweroff', 'on_reboot', 'on_crash',
   13.22 +                   'vcpus', 'features'])
   13.23  
   13.24      if vals.uuid is not None:
   13.25          config.append(['uuid', vals.uuid])
    14.1 --- a/xen/arch/x86/Makefile	Wed Aug 16 16:16:32 2006 +0100
    14.2 +++ b/xen/arch/x86/Makefile	Wed Aug 16 17:02:35 2006 +0100
    14.3 @@ -8,7 +8,6 @@ subdir-$(x86_32) += x86_32
    14.4  subdir-$(x86_64) += x86_64
    14.5  
    14.6  obj-y += apic.o
    14.7 -obj-y += audit.o
    14.8  obj-y += bitops.o
    14.9  obj-y += compat.o
   14.10  obj-y += delay.o
   14.11 @@ -41,12 +40,21 @@ obj-y += usercopy.o
   14.12  obj-y += x86_emulate.o
   14.13  
   14.14  ifneq ($(pae),n)
   14.15 -obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
   14.16 +obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
   14.17  else
   14.18 -obj-$(x86_32) += shadow32.o
   14.19 +obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
   14.20  endif
   14.21  
   14.22 -obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
   14.23 +obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
   14.24 +                 shadow2_g2_on_s3.o
   14.25 +
   14.26 +guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
   14.27 +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
   14.28 +shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
   14.29 +                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
   14.30 +
   14.31 +shadow2_%.o: shadow2.c $(HDRS) Makefile
   14.32 +	$(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
   14.33  
   14.34  obj-$(crash_debug) += gdbstub.o
   14.35  
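To see how the pattern rule above works, take the target shadow2_g2_on_s3.o: stripping the shadow2_ prefix and splitting on underscores and dots gives the words g2, on, s3 and o; filtering for g% and s% and dropping the letters yields guest_levels = 2 and shadow_levels = 3, so shadow2.c is compiled with -DGUEST_PAGING_LEVELS=2 -DSHADOW_PAGING_LEVELS=3 to produce that object. Each object listed in the obj-$(x86_32)/obj-$(x86_64) lines is therefore the same shadow2.c built for a different guest/shadow paging-level pair.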
    15.1 --- a/xen/arch/x86/audit.c	Wed Aug 16 16:16:32 2006 +0100
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,984 +0,0 @@
    15.4 -/******************************************************************************
    15.5 - * arch/x86/audit.c
    15.6 - * 
    15.7 - * Copyright (c) 2002-2005 K A Fraser
    15.8 - * Copyright (c) 2004 Christian Limpach
    15.9 - * Copyright (c) 2005 Michael A Fetterman
   15.10 - * 
   15.11 - * This program is free software; you can redistribute it and/or modify
   15.12 - * it under the terms of the GNU General Public License as published by
   15.13 - * the Free Software Foundation; either version 2 of the License, or
   15.14 - * (at your option) any later version.
   15.15 - * 
   15.16 - * This program is distributed in the hope that it will be useful,
   15.17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
   15.18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   15.19 - * GNU General Public License for more details.
   15.20 - * 
   15.21 - * You should have received a copy of the GNU General Public License
   15.22 - * along with this program; if not, write to the Free Software
   15.23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   15.24 - */
   15.25 -
   15.26 -#include <xen/config.h>
   15.27 -#include <xen/init.h>
   15.28 -#include <xen/kernel.h>
   15.29 -#include <xen/lib.h>
   15.30 -#include <xen/mm.h>
   15.31 -#include <xen/perfc.h>
   15.32 -#include <asm/shadow.h>
   15.33 -#include <asm/page.h>
   15.34 -#include <asm/flushtlb.h>
   15.35 -
   15.36 -/* XXX SMP bug -- these should not be statics... */
   15.37 -static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
   15.38 -static int l1, l2, oos_count, page_count;
   15.39 -
   15.40 -#define FILE_AND_LINE 0
   15.41 -
   15.42 -#if FILE_AND_LINE
   15.43 -#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__)
   15.44 -#define ADJUST_EXTRA_ARGS ,const char *file, int line
   15.45 -#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line)
   15.46 -#else
   15.47 -#define adjust _adjust
   15.48 -#define ADJUST_EXTRA_ARGS
   15.49 -#define APRINTK(_f, _a...) printk(_f "\n", ##_a)
   15.50 -#endif
   15.51 -
   15.52 -int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
   15.53 -{
   15.54 -    int errors = 0;
   15.55 -    int shadow_refcounts = !!shadow_mode_refcounts(d);
   15.56 -    int shadow_enabled = !!shadow_mode_enabled(d);
   15.57 -
   15.58 -    int l2limit( unsigned long mfn )
   15.59 -    {
   15.60 -
   15.61 -        if ( shadow_mode_external(d) )
   15.62 -            return L2_PAGETABLE_ENTRIES;
   15.63 -
   15.64 -#ifdef __i386__
   15.65 -#ifdef CONFIG_X86_PAE
   15.66 -        /* 32b PAE */
   15.67 -        if ( (( mfn_to_page(mfn)->u.inuse.type_info & PGT_va_mask ) 
   15.68 -	    >> PGT_va_shift) == 3 )
   15.69 -            return l2_table_offset(HYPERVISOR_VIRT_START);
   15.70 -        else
   15.71 -            return L2_PAGETABLE_ENTRIES;
   15.72 -#else
   15.73 -        /* 32b non-PAE */
   15.74 -        return DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   15.75 -#endif
   15.76 -#else
   15.77 -        /* 64b */
   15.78 -        return 0; /* XXX x86/64 XXX */
   15.79 -#endif
   15.80 -    }
   15.81 -
   15.82 -    void _adjust(struct page_info *page, int adjtype ADJUST_EXTRA_ARGS)
   15.83 -    {
   15.84 -        int count;
   15.85 -
   15.86 -        if ( adjtype )
   15.87 -        {
   15.88 -            /* adjust the type count */
   15.89 -            int tcount = page->u.inuse.type_info & PGT_count_mask;
   15.90 -            tcount += dir;
   15.91 -            ttot++;
   15.92 -
   15.93 -            if ( page_get_owner(page) == NULL )
   15.94 -            {
   15.95 -                APRINTK("adjust(mfn=%lx, dir=%d, adjtype=%d) owner=NULL",
   15.96 -                        page_to_mfn(page), dir, adjtype);
   15.97 -                errors++;
   15.98 -            }
   15.99 -
  15.100 -            if ( tcount < 0 )
  15.101 -            {
  15.102 -                APRINTK("Audit %d: type count went below zero "
  15.103 -                        "mfn=%lx t=%" PRtype_info " ot=%x",
  15.104 -                        d->domain_id, page_to_mfn(page),
  15.105 -                        page->u.inuse.type_info,
  15.106 -                        page->tlbflush_timestamp);
  15.107 -                errors++;
  15.108 -            }
  15.109 -            else if ( (tcount & ~PGT_count_mask) != 0 )
  15.110 -            {
  15.111 -                APRINTK("Audit %d: type count overflowed "
  15.112 -                        "mfn=%lx t=%" PRtype_info " ot=%x",
  15.113 -                        d->domain_id, page_to_mfn(page),
  15.114 -                        page->u.inuse.type_info,
  15.115 -                        page->tlbflush_timestamp);
  15.116 -                errors++;
  15.117 -            }
  15.118 -            else
  15.119 -                page->u.inuse.type_info += dir;
  15.120 -        }
  15.121 -
  15.122 -        /* adjust the general count */
  15.123 -        count = (page->count_info & PGC_count_mask) + dir;
  15.124 -        ctot++;
  15.125 -
  15.126 -        if ( count < 0 )
  15.127 -        {
  15.128 -            APRINTK("Audit %d: general count went below zero "
  15.129 -                    "mfn=%lx t=%" PRtype_info " ot=%x",
  15.130 -                    d->domain_id, page_to_mfn(page),
  15.131 -                    page->u.inuse.type_info,
  15.132 -                    page->tlbflush_timestamp);
  15.133 -            errors++;
  15.134 -        }
  15.135 -        else if ( (count & ~PGT_count_mask) != 0 )
  15.136 -        {
  15.137 -            APRINTK("Audit %d: general count overflowed "
  15.138 -                    "mfn=%lx t=%" PRtype_info " ot=%x",
  15.139 -                    d->domain_id, page_to_mfn(page),
  15.140 -                    page->u.inuse.type_info,
  15.141 -                    page->tlbflush_timestamp);
  15.142 -            errors++;
  15.143 -        }
  15.144 -        else
  15.145 -            page->count_info += dir;
  15.146 -    }
  15.147 -
  15.148 -    void adjust_l2_page(unsigned long mfn, int shadow)
  15.149 -    {
  15.150 -        l2_pgentry_t *pt = map_domain_page(mfn);
  15.151 -        int i;
  15.152 -        u32 page_type;
  15.153 -
  15.154 -        for ( i = 0; i < l2limit(mfn); i++ )
  15.155 -        {
  15.156 -            if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT )
  15.157 -            {
  15.158 -	        unsigned long l1mfn = l2e_get_pfn(pt[i]);
  15.159 -                struct page_info *l1page = mfn_to_page(l1mfn);
  15.160 -
  15.161 -                if ( noisy )
  15.162 -                {
  15.163 -                    if ( shadow )
  15.164 -                    {
  15.165 -                        if ( page_get_owner(l1page) != NULL )
  15.166 -                        {
  15.167 -                            printk("L2: Bizarre shadow L1 page mfn=%lx "
  15.168 -                                   "belonging to a domain %p (id=%d)\n",
  15.169 -                                   l1mfn,
  15.170 -                                   page_get_owner(l1page),
  15.171 -                                   page_get_owner(l1page)->domain_id);
  15.172 -                            errors++;
  15.173 -                            continue;
  15.174 -                        }
  15.175 -
  15.176 -                        page_type = l1page->u.inuse.type_info & PGT_type_mask;
  15.177 -                        if ( page_type != PGT_l1_shadow )
  15.178 -                        {
  15.179 -                            printk("Audit %d: [Shadow L2 mfn=%lx i=%x] "
  15.180 -                                   "Expected Shadow L1 t=%" PRtype_info 
  15.181 -				   " mfn=%lx\n",
  15.182 -                                   d->domain_id, mfn, i,
  15.183 -                                   l1page->u.inuse.type_info, l1mfn);
  15.184 -                            errors++;
  15.185 -                        }
  15.186 -                    }
  15.187 -                    else
  15.188 -                    {
  15.189 -                        if ( page_get_owner(l1page) != d )
  15.190 -                        {
  15.191 -                            printk("L2: Skip bizarre L1 page mfn=%lx "
  15.192 -                                   "belonging to other dom %p (id=%d)\n",
  15.193 -                                   l1mfn,
  15.194 -                                   page_get_owner(l1page),
  15.195 -                                   (page_get_owner(l1page)
  15.196 -                                    ? page_get_owner(l1page)->domain_id
  15.197 -                                    : -1));
  15.198 -                            errors++;
  15.199 -                            continue;
  15.200 -                        }
  15.201 -
  15.202 -                        page_type = l1page->u.inuse.type_info & PGT_type_mask;
  15.203 -                        if ( page_type == PGT_l2_page_table )
  15.204 -                        {
  15.205 -                            printk("Audit %d: [%x] Found %s Linear PT "
  15.206 -                                   "t=%" PRtype_info " mfn=%lx\n",
  15.207 -                                   d->domain_id, i, (l1mfn==mfn) ? "Self" : "Other",
  15.208 -                                   l1page->u.inuse.type_info, l1mfn);
  15.209 -                        }
  15.210 -                        else if ( page_type != PGT_l1_page_table )
  15.211 -                        {
  15.212 -                            printk("Audit %d: [L2 mfn=%lx i=%x] "
  15.213 -                                   "Expected L1 t=%" PRtype_info " mfn=%lx\n",
  15.214 -                                   d->domain_id, mfn, i,
  15.215 -                                   l1page->u.inuse.type_info, l1mfn);
  15.216 -                            errors++;
  15.217 -                        }
  15.218 -                    }
  15.219 -                }
  15.220 -
  15.221 -                adjust(l1page, !shadow);
  15.222 -            }
  15.223 -        }
  15.224 -
  15.225 -        if ( shadow_mode_translate(d) && !shadow_mode_external(d) )
  15.226 -        {
  15.227 -            unsigned long hl2mfn =
  15.228 -                l2e_get_pfn(pt[l2_table_offset(LINEAR_PT_VIRT_START)]);
  15.229 -            struct page_info *hl2page = mfn_to_page(hl2mfn);
  15.230 -            adjust(hl2page, 0);
  15.231 -        }
  15.232 -
  15.233 -        unmap_domain_page(pt);
  15.234 -    }
  15.235 -
  15.236 -    void adjust_hl2_page(unsigned long hl2mfn)
  15.237 -    {
  15.238 -        l2_pgentry_t *pt = map_domain_page(hl2mfn);
  15.239 -        int i;
  15.240 -
  15.241 -        for ( i = 0; i < l2limit(hl2mfn); i++ )
  15.242 -        {
  15.243 -            if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT )
  15.244 -            {
  15.245 -                unsigned long mfn = l2e_get_pfn(pt[i]);
  15.246 -                struct page_info *gpage = mfn_to_page(mfn);
  15.247 -
  15.248 -                if ( mfn < 0x100 )
  15.249 -                {
  15.250 -                    lowmem_mappings++;
  15.251 -                    continue;
  15.252 -                }
  15.253 -
  15.254 -                if ( !mfn_valid(mfn) )
  15.255 -                {
  15.256 -                    io_mappings++;
  15.257 -                    continue;
  15.258 -                }
  15.259 -
  15.260 -                if ( noisy )
  15.261 -                {
  15.262 -                    if ( page_get_owner(gpage) != d )
  15.263 -                    {
  15.264 -                        printk("Audit %d: [hl2mfn=%lx,i=%x] Skip foreign page "
  15.265 -                               "dom=%p (id=%d) mfn=%lx c=%08x t=%"
  15.266 -			       PRtype_info "\n",
  15.267 -                               d->domain_id, hl2mfn, i,
  15.268 -                               page_get_owner(gpage),
  15.269 -                               page_get_owner(gpage)->domain_id,
  15.270 -                               mfn,
  15.271 -                               gpage->count_info,
  15.272 -                               gpage->u.inuse.type_info);
  15.273 -                        continue;
  15.274 -                    }
  15.275 -                }
  15.276 -                adjust(gpage, 0);
  15.277 -            }
  15.278 -        }
  15.279 -
  15.280 -        unmap_domain_page(pt);
  15.281 -    }
  15.282 -
  15.283 -    void adjust_l1_page(unsigned long l1mfn)
  15.284 -    {
  15.285 -        l1_pgentry_t *pt = map_domain_page(l1mfn);
  15.286 -        int i;
  15.287 -
  15.288 -        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  15.289 -        {
  15.290 -            if ( l1e_get_flags(pt[i]) & _PAGE_PRESENT )
  15.291 -            {
  15.292 -                unsigned long mfn = l1e_get_pfn(pt[i]);
  15.293 -                struct page_info *gpage = mfn_to_page(mfn);
  15.294 -
  15.295 -                if ( mfn < 0x100 )
  15.296 -                {
  15.297 -                    lowmem_mappings++;
  15.298 -                    continue;
  15.299 -                }
  15.300 -
  15.301 -                if ( !mfn_valid(mfn) )
  15.302 -                {
  15.303 -                    io_mappings++;
  15.304 -                    continue;
  15.305 -                }
  15.306 -
  15.307 -                if ( noisy )
  15.308 -                {
  15.309 -                    if ( l1e_get_flags(pt[i]) & _PAGE_RW )
  15.310 -                    {
  15.311 -                        // If it's not a writable page, complain.
  15.312 -                        //
  15.313 -                        if ( !((gpage->u.inuse.type_info & PGT_type_mask) ==
  15.314 -                               PGT_writable_page) )
  15.315 -                        {
  15.316 -                            printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW "
  15.317 -                                   "t=%" PRtype_info " mfn=%lx\n",
  15.318 -                                   d->domain_id, l1mfn, i,
  15.319 -                                   gpage->u.inuse.type_info, mfn);
  15.320 -                            errors++;
  15.321 -                        }
  15.322 -
  15.323 -                        if ( shadow_refcounts &&
  15.324 -                             page_is_page_table(gpage) &&
  15.325 -                             ! page_out_of_sync(gpage) )
  15.326 -                        {
  15.327 -                            printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW of "
  15.328 -                                   "page table mfn=%lx\n",
  15.329 -                                   d->domain_id, l1mfn, i, mfn);
  15.330 -                            errors++;
  15.331 -                        }
  15.332 -                    }		   
  15.333 -
  15.334 -                    if ( page_get_owner(gpage) != d )
  15.335 -                    {
  15.336 -                        printk("Audit %d: [l1mfn=%lx,i=%x] Skip foreign page "
  15.337 -                               "dom=%p (id=%d) mfn=%lx c=%08x t=%" 
  15.338 -			       PRtype_info "\n",
  15.339 -                               d->domain_id, l1mfn, i,
  15.340 -                               page_get_owner(gpage),
  15.341 -                               page_get_owner(gpage)->domain_id,
  15.342 -                               mfn,
  15.343 -                               gpage->count_info,
  15.344 -                               gpage->u.inuse.type_info);
  15.345 -                        continue;
  15.346 -                    }
  15.347 -                }
  15.348 -
  15.349 -                adjust(gpage, (l1e_get_flags(pt[i]) & _PAGE_RW) ? 1 : 0);
  15.350 -            }
  15.351 -        }
  15.352 -
  15.353 -        unmap_domain_page(pt);
  15.354 -    }
  15.355 -
  15.356 -    void adjust_shadow_tables(void)
  15.357 -    {
  15.358 -        struct shadow_status *a;
  15.359 -        unsigned long smfn, gmfn;
  15.360 -        struct page_info *page;
  15.361 -        int i;
  15.362 -
  15.363 -        for ( i = 0; i < shadow_ht_buckets; i++ )
  15.364 -        {
  15.365 -            a = &d->arch.shadow_ht[i];
  15.366 -            while ( a && a->gpfn_and_flags )
  15.367 -            {
  15.368 -                gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
  15.369 -                smfn = a->smfn;
  15.370 -                page = mfn_to_page(smfn);
  15.371 -
  15.372 -                switch ( a->gpfn_and_flags & PGT_type_mask ) {
  15.373 -                case PGT_writable_pred:
  15.374 -                    break;
  15.375 -                case PGT_snapshot:
  15.376 -                    adjust(mfn_to_page(gmfn), 0);
  15.377 -                    break;
  15.378 -                case PGT_l1_shadow:
  15.379 -                    adjust(mfn_to_page(gmfn), 0);
  15.380 -                    if ( shadow_refcounts )
  15.381 -                        adjust_l1_page(smfn);
  15.382 -                    if ( page->u.inuse.type_info & PGT_pinned )
  15.383 -                        adjust(page, 0);
  15.384 -                    break;
  15.385 -                case PGT_hl2_shadow:
  15.386 -                    adjust(mfn_to_page(gmfn), 0);
  15.387 -                    if ( shadow_refcounts )
  15.388 -                        adjust_hl2_page(smfn);
  15.389 -                    if ( page->u.inuse.type_info & PGT_pinned )
  15.390 -                        adjust(page, 0);
  15.391 -                    break;
  15.392 -                case PGT_l2_shadow:
  15.393 -                    adjust(mfn_to_page(gmfn), 0);
  15.394 -                    adjust_l2_page(smfn, 1);
  15.395 -                    if ( page->u.inuse.type_info & PGT_pinned )
  15.396 -                        adjust(page, 0);
  15.397 -                    break;
  15.398 -                default:
  15.399 -                    BUG();
  15.400 -                    break;
  15.401 -                }
  15.402 -
  15.403 -                a = a->next;
  15.404 -            }
  15.405 -        }
  15.406 -    }
  15.407 -
  15.408 -    void adjust_oos_list(void)
  15.409 -    {
  15.410 -        struct out_of_sync_entry *oos;
  15.411 -
  15.412 -        if ( (oos = d->arch.out_of_sync) )
  15.413 -            ASSERT(shadow_enabled);
  15.414 -
  15.415 -        while ( oos )
  15.416 -        {
  15.417 -            adjust(mfn_to_page(oos->gmfn), 0);
  15.418 -
  15.419 -            // Only use entries that have low bits clear...
  15.420 -            //
  15.421 -            if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
  15.422 -                adjust(mfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0);
  15.423 -
  15.424 -            if ( oos->snapshot_mfn != SHADOW_SNAPSHOT_ELSEWHERE )
  15.425 -                adjust(mfn_to_page(oos->snapshot_mfn), 0);
  15.426 -
  15.427 -            oos = oos->next;
  15.428 -            oos_count++;
  15.429 -        }
  15.430 -    }
  15.431 -
  15.432 -    void adjust_for_pgtbase(void)
  15.433 -    {
  15.434 -        struct vcpu *v;
  15.435 -
  15.436 -        for_each_vcpu(d, v)
  15.437 -        {
  15.438 -            if ( !pagetable_is_null(v->arch.guest_table) )
  15.439 -                adjust(mfn_to_page(pagetable_get_pfn(v->arch.guest_table)),
  15.440 -                       !shadow_mode_refcounts(d));
  15.441 -            if ( !pagetable_is_null(v->arch.shadow_table) )
  15.442 -                adjust(mfn_to_page(pagetable_get_pfn(v->arch.shadow_table)),
  15.443 -                       0);
  15.444 -            if ( v->arch.monitor_shadow_ref )
  15.445 -                adjust(mfn_to_page(v->arch.monitor_shadow_ref), 0);
  15.446 -        }
  15.447 -    }
  15.448 -
  15.449 -    void adjust_guest_pages(void)
  15.450 -    {
  15.451 -        struct list_head *list_ent = d->page_list.next;
  15.452 -        struct page_info *page;
  15.453 -        unsigned long mfn, snapshot_mfn;
  15.454 -
  15.455 -        while ( list_ent != &d->page_list )
  15.456 -        {
  15.457 -            u32 page_type;
  15.458 -
  15.459 -            page = list_entry(list_ent, struct page_info, list);
  15.460 -            snapshot_mfn = mfn = page_to_mfn(page);
  15.461 -            page_type = page->u.inuse.type_info & PGT_type_mask;
  15.462 -
  15.463 -            BUG_ON(page_get_owner(page) != d);
  15.464 -
  15.465 -            page_count++;
  15.466 -
  15.467 -            if ( shadow_enabled && !shadow_refcounts &&
  15.468 -                 page_out_of_sync(page) )
  15.469 -            {
  15.470 -                unsigned long gpfn = mfn_to_gmfn(d, mfn);
  15.471 -                ASSERT( VALID_M2P(gpfn) );
  15.472 -                snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
  15.473 -                ASSERT( snapshot_mfn );
  15.474 -            }
  15.475 -
  15.476 -            switch ( page_type )
  15.477 -            {
  15.478 -            case PGT_l2_page_table:
  15.479 -                l2++;
  15.480 -
  15.481 -                if ( noisy )
  15.482 -                {
  15.483 -                    if ( shadow_refcounts )
  15.484 -                    {
  15.485 -                        printk("Audit %d: found an L2 guest page "
  15.486 -                               "mfn=%lx t=%" PRtype_info " c=%08x while in shadow mode\n",
  15.487 -                               d->domain_id, mfn, page->u.inuse.type_info,
  15.488 -                               page->count_info);
  15.489 -                        errors++;
  15.490 -                    }
  15.491 -
  15.492 -                    if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
  15.493 -                    {
  15.494 -                        if ( (page->u.inuse.type_info & PGT_validated) !=
  15.495 -                             PGT_validated )
  15.496 -                        {
  15.497 -                            printk("Audit %d: L2 mfn=%lx not validated %"
  15.498 -				   PRtype_info "\n",
  15.499 -                                   d->domain_id, mfn, page->u.inuse.type_info);
  15.500 -                            errors++;
  15.501 -                        }
  15.502 -
  15.503 -                    }
  15.504 -                }
  15.505 -
  15.506 -                if ( page->u.inuse.type_info & PGT_pinned )
  15.507 -                    adjust(page, 1);
  15.508 -
  15.509 -                if ( page->u.inuse.type_info & PGT_validated )
  15.510 -                    adjust_l2_page(snapshot_mfn, 0);
  15.511 -
  15.512 -                break;
  15.513 -
  15.514 -            case PGT_l1_page_table:
  15.515 -                l1++;
  15.516 -
  15.517 -                if ( noisy )
  15.518 -                {
  15.519 -                    if ( shadow_refcounts )
  15.520 -                    {
  15.521 -                        printk("found an L1 guest page mfn=%lx t=%" 
  15.522 -			       PRtype_info " c=%08x "
  15.523 -                               "while in shadow mode\n",
  15.524 -                               mfn, page->u.inuse.type_info, page->count_info);
  15.525 -                        errors++;
  15.526 -                    }
  15.527 -
  15.528 -                    if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
  15.529 -                    {
  15.530 -                        if ( (page->u.inuse.type_info & PGT_validated) !=
  15.531 -                             PGT_validated )
  15.532 -                        {
  15.533 -                            printk("Audit %d: L1 not validated mfn=%lx t=%"
  15.534 -				   PRtype_info "\n",
  15.535 -                                   d->domain_id, mfn, page->u.inuse.type_info);
  15.536 -                            errors++;
  15.537 -                        }
  15.538 -                    }
  15.539 -                }
  15.540 -                
  15.541 -                if ( page->u.inuse.type_info & PGT_pinned )
  15.542 -                    adjust(page, 1);
  15.543 -
  15.544 -                if ( page->u.inuse.type_info & PGT_validated )
  15.545 -                    adjust_l1_page(snapshot_mfn);
  15.546 -
  15.547 -                break;
  15.548 -
  15.549 -            case PGT_gdt_page:
  15.550 -                ASSERT( !page_out_of_sync(page) );
  15.551 -                adjust(page, 1);
  15.552 -                break;
  15.553 -
  15.554 -            case PGT_ldt_page:
  15.555 -                ASSERT( !page_out_of_sync(page) );
  15.556 -                adjust(page, 1);
  15.557 -                break;
  15.558 -
  15.559 -            case PGT_writable_page:
  15.560 -                if ( shadow_refcounts )
  15.561 -                {
  15.562 -                    // In shadow mode, writable pages can get pinned by
  15.563 -                    // paravirtualized guests that think they are pinning
  15.564 -                    // their L1s and/or L2s.
  15.565 -                    //
  15.566 -                    if ( page->u.inuse.type_info & PGT_pinned )
  15.567 -                        adjust(page, 1);
  15.568 -                }
  15.569 -            }
  15.570 -
  15.571 -            list_ent = page->list.next;
  15.572 -        }
  15.573 -    }
  15.574 -
  15.575 -    adjust_for_pgtbase();
  15.576 -
  15.577 -    adjust_guest_pages();
  15.578 -
  15.579 -    if ( shadow_enabled )
  15.580 -    {
  15.581 -        adjust_oos_list();
  15.582 -        adjust_shadow_tables();
  15.583 -    }
  15.584 -
  15.585 -    adjust(virt_to_page(d->shared_info), 1);
  15.586 -
  15.587 -    return errors;
  15.588 -}
  15.589 -
  15.590 -
  15.591 -#ifndef NDEBUG
  15.592 -
  15.593 -void audit_pagelist(struct domain *d)
  15.594 -{
  15.595 -    struct list_head *list_ent;
  15.596 -    int xenpages, totpages;
  15.597 -
  15.598 -    list_ent = d->xenpage_list.next;
  15.599 -    for ( xenpages = 0; (list_ent != &d->xenpage_list); xenpages++ )
  15.600 -    {
  15.601 -        list_ent = list_ent->next;
  15.602 -    }
  15.603 -    list_ent = d->page_list.next;
  15.604 -    for ( totpages = 0; (list_ent != &d->page_list); totpages++ )
  15.605 -    {
  15.606 -        list_ent = list_ent->next;
  15.607 -    }
  15.608 -
  15.609 -    if ( xenpages != d->xenheap_pages ||
  15.610 -         totpages != d->tot_pages )
  15.611 -    {
  15.612 -        printk("ARGH! dom %d: xen=%d %d, pages=%d %d\n", d->domain_id,
  15.613 -               xenpages, d->xenheap_pages, 
  15.614 -               totpages, d->tot_pages );
  15.615 -    }
  15.616 -}
  15.617 -
  15.618 -void _audit_domain(struct domain *d, int flags)
  15.619 -{
  15.620 -    int shadow_refcounts = !!shadow_mode_refcounts(d);
  15.621 -
  15.622 -    void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
  15.623 -                             unsigned long mfn)
  15.624 -    {
  15.625 -        struct page_info *page = mfn_to_page(mfn);
  15.626 -        l1_pgentry_t *pt = map_domain_page(mfn);
  15.627 -        int i;
  15.628 -
  15.629 -        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  15.630 -        {
  15.631 -            if ( (l1e_get_flags(pt[i]) & _PAGE_PRESENT) && 
  15.632 -                 (l1e_get_pfn(pt[i]) == xmfn) )
  15.633 -                printk("     found dom=%d mfn=%lx t=%" PRtype_info " c=%08x "
  15.634 -                       "pt[i=%x]=%" PRIpte "\n",
  15.635 -                       d->domain_id, mfn, page->u.inuse.type_info,
  15.636 -                       page->count_info, i, l1e_get_intpte(pt[i]));
  15.637 -        }
  15.638 -
  15.639 -        unmap_domain_page(pt);           
  15.640 -    }
  15.641 -
  15.642 -    void scan_for_pfn_in_grant_table(struct domain *d, unsigned xmfn)
  15.643 -    {
  15.644 -        int i;
  15.645 -        struct active_grant_entry *act = d->grant_table->active;
  15.646 -
  15.647 -        spin_lock(&d->grant_table->lock);
  15.648 -
  15.649 -        for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
  15.650 -        {
  15.651 -            if ( act[i].pin && (act[i].frame == xmfn) )
  15.652 -            {
  15.653 -                printk("     found active grant table entry i=%d dom=%d pin=%d\n",
  15.654 -                       i, act[i].domid, act[i].pin);
  15.655 -            }
  15.656 -        }
  15.657 -
  15.658 -        spin_unlock(&d->grant_table->lock);
  15.659 -    }
  15.660 -
  15.661 -    void scan_for_pfn(struct domain *d, unsigned long xmfn)
  15.662 -    {
  15.663 -        scan_for_pfn_in_grant_table(d, xmfn);
  15.664 -
  15.665 -        if ( !shadow_mode_enabled(d) )
  15.666 -        {
  15.667 -            struct list_head *list_ent = d->page_list.next;
  15.668 -            struct page_info *page;
  15.669 -
  15.670 -            while ( list_ent != &d->page_list )
  15.671 -            {
  15.672 -                page = list_entry(list_ent, struct page_info, list);
  15.673 -
  15.674 -                switch ( page->u.inuse.type_info & PGT_type_mask )
  15.675 -                {
  15.676 -                case PGT_l1_page_table:
  15.677 -                case PGT_l2_page_table:
  15.678 -                    scan_for_pfn_in_mfn(d, xmfn, page_to_mfn(page));
  15.679 -                    break;
  15.680 -                default:
  15.681 -                    break;
  15.682 -                }
  15.683 -
  15.684 -                list_ent = page->list.next;
  15.685 -            }
  15.686 -        }
  15.687 -        else
  15.688 -        {
  15.689 -            struct shadow_status *a;
  15.690 -            int i;
  15.691 -            
  15.692 -            for ( i = 0; i < shadow_ht_buckets; i++ )
  15.693 -            {
  15.694 -                a = &d->arch.shadow_ht[i];
  15.695 -                while ( a && a->gpfn_and_flags )
  15.696 -                {
  15.697 -                    switch ( a->gpfn_and_flags & PGT_type_mask )
  15.698 -                    {
  15.699 -                    case PGT_l1_shadow:
  15.700 -                    case PGT_l2_shadow:
  15.701 -                    case PGT_hl2_shadow:
  15.702 -                        scan_for_pfn_in_mfn(d, xmfn, a->smfn);
  15.703 -                        break;
  15.704 -                    case PGT_snapshot:
  15.705 -                    case PGT_writable_pred:
  15.706 -                        break;
  15.707 -                    default:
  15.708 -                        BUG();
  15.709 -                        break;
  15.710 -                    }
  15.711 -                    a = a->next;
  15.712 -                }
  15.713 -            }
  15.714 -        }
  15.715 -    }
  15.716 -
  15.717 -    void scan_for_pfn_remote(unsigned long xmfn)
  15.718 -    {
  15.719 -        struct domain *e;
  15.720 -        for_each_domain ( e )
  15.721 -            scan_for_pfn( e, xmfn );
  15.722 -    } 
  15.723 -
  15.724 -    unsigned long mfn;
  15.725 -    struct list_head *list_ent;
  15.726 -    struct page_info *page;
  15.727 -    int errors = 0;
  15.728 -
  15.729 -    if ( (d != current->domain) && shadow_mode_translate(d) )
  15.730 -    {
  15.731 -        printk("skipping audit domain of translated domain %d "
  15.732 -               "from other context\n",
  15.733 -               d->domain_id);
  15.734 -        return;
  15.735 -    }
  15.736 -
  15.737 -    if ( d != current->domain )
  15.738 -        domain_pause(d);
  15.739 -
  15.740 -    // Maybe we should just be using BIGLOCK?
  15.741 -    //
  15.742 -    if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
  15.743 -        shadow_lock(d);
  15.744 -
  15.745 -    spin_lock(&d->page_alloc_lock);
  15.746 -
  15.747 -    audit_pagelist(d);
  15.748 -
  15.749 -    /* PHASE 0 */
  15.750 -
  15.751 -    list_ent = d->page_list.next;
  15.752 -    while ( list_ent != &d->page_list )
  15.753 -    {
  15.754 -        u32 page_type;
  15.755 -        unsigned long pfn;
  15.756 -
  15.757 -        page = list_entry(list_ent, struct page_info, list);
  15.758 -        mfn = page_to_mfn(page);
  15.759 -        page_type = page->u.inuse.type_info & PGT_type_mask;
  15.760 -
  15.761 -        BUG_ON(page_get_owner(page) != d);
  15.762 -
  15.763 -        if ( (page->u.inuse.type_info & PGT_count_mask) >
  15.764 -             (page->count_info & PGC_count_mask) )
  15.765 -        {
  15.766 -            printk("taf(%" PRtype_info ") > caf(%08x) mfn=%lx\n",
  15.767 -                   page->u.inuse.type_info, page->count_info, mfn);
  15.768 -            errors++;
  15.769 -        }
  15.770 -
  15.771 -        if ( shadow_mode_refcounts(d) &&
  15.772 -             (page_type == PGT_writable_page) &&
  15.773 -             !(page->u.inuse.type_info & PGT_validated) )
  15.774 -        {
  15.775 -            printk("shadow mode writable page not validated mfn=%lx " 
  15.776 -		   "t=%" PRtype_info  " c=%08x\n",
  15.777 -                   mfn, page->u.inuse.type_info, page->count_info);
  15.778 -            errors++;
  15.779 -        }
  15.780 - 
  15.781 -#if 0   /* SYSV shared memory pages plus writeable files. */
  15.782 -        if ( page_type == PGT_writable_page && 
  15.783 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
  15.784 -        {
  15.785 -            printk("writeable page with type count >1: "
  15.786 -                   "mfn=%lx t=%" PRtype_info " c=%08x\n",
  15.787 -                  mfn,
  15.788 -                  page->u.inuse.type_info,
  15.789 -                  page->count_info );
  15.790 -            errors++;
  15.791 -            scan_for_pfn_remote(mfn);
  15.792 -        }
  15.793 -#endif
  15.794 -
  15.795 -        if ( page_type == PGT_none && 
  15.796 -             (page->u.inuse.type_info & PGT_count_mask) > 0 )
  15.797 -        {
  15.798 -            printk("normal page with type count >0: mfn=%lx t=%" PRtype_info " c=%08x\n",
  15.799 -                  mfn,
  15.800 -                  page->u.inuse.type_info,
  15.801 -                  page->count_info );
  15.802 -            errors++;
  15.803 -        }
  15.804 -
  15.805 -        if ( page_out_of_sync(page) )
  15.806 -        {
  15.807 -            if ( !page_is_page_table(page) )
  15.808 -            {
  15.809 -                printk("out of sync page mfn=%lx is not a page table\n", mfn);
  15.810 -                errors++;
  15.811 -            }
  15.812 -            pfn = mfn_to_gmfn(d, mfn);
  15.813 -            if ( !__shadow_status(d, pfn, PGT_snapshot) )
  15.814 -            {
  15.815 -                printk("out of sync page mfn=%lx doesn't have a snapshot\n",
  15.816 -                       mfn);
  15.817 -                errors++;
  15.818 -            }
  15.819 -            if ( shadow_refcounts
  15.820 -                 ? (page_type != PGT_writable_page)
  15.821 -                 : !(page_type && (page_type <= PGT_l4_page_table)) )
  15.822 -            {
  15.823 -                printk("out of sync page mfn=%lx has strange type "
  15.824 -                       "t=%" PRtype_info  " c=%08x\n",
  15.825 -                       mfn, page->u.inuse.type_info, page->count_info);
  15.826 -                errors++;
  15.827 -            }
  15.828 -        }
  15.829 -
  15.830 -        /* Use tlbflush_timestamp to store original type_info. */
  15.831 -        page->tlbflush_timestamp = page->u.inuse.type_info;
  15.832 -
  15.833 -        list_ent = page->list.next;
  15.834 -    }
  15.835 -
  15.836 -    /* PHASE 1 */
  15.837 -    io_mappings = lowmem_mappings = 0;
  15.838 -
  15.839 -    errors += audit_adjust_pgtables(d, -1, 1);
  15.840 -
  15.841 -    if ( !(flags & AUDIT_QUIET) &&
  15.842 -         ((io_mappings > 0) || (lowmem_mappings > 0)) )
  15.843 -        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
  15.844 -               d->domain_id, lowmem_mappings, io_mappings);
  15.845 -
  15.846 -    /* PHASE 2 */
  15.847 -
  15.848 -    list_ent = d->page_list.next;
  15.849 -    while ( list_ent != &d->page_list )
  15.850 -    {
  15.851 -        page = list_entry(list_ent, struct page_info, list);
  15.852 -        mfn = page_to_mfn(page);
  15.853 -
  15.854 -        switch ( page->u.inuse.type_info & PGT_type_mask)
  15.855 -        {
  15.856 -        case PGT_l1_page_table:
  15.857 -        case PGT_l2_page_table:
  15.858 -        case PGT_l3_page_table:
  15.859 -        case PGT_l4_page_table:
  15.860 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
  15.861 -            {
  15.862 -                printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n",
  15.863 -                       d->domain_id, page->u.inuse.type_info, 
  15.864 -                       page->tlbflush_timestamp,
  15.865 -                       page->count_info, mfn);
  15.866 -                errors++;
  15.867 -                scan_for_pfn_remote(mfn);
  15.868 -            }
  15.869 -            break;
  15.870 -        case PGT_none:
  15.871 -        case PGT_writable_page:
  15.872 -        case PGT_gdt_page:
  15.873 -        case PGT_ldt_page:
  15.874 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
  15.875 -            {
  15.876 -                printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n",
  15.877 -                       d->domain_id, page->u.inuse.type_info, 
  15.878 -                       page->tlbflush_timestamp,
  15.879 -                       page->count_info, mfn);
  15.880 -                //errors++;
  15.881 -            }
  15.882 -            break;
  15.883 -        default:
  15.884 -            BUG(); // XXX fix me...
  15.885 -        }
  15.886 -        
  15.887 -        if ( (page->count_info & PGC_count_mask) != 1 )
  15.888 -        {
  15.889 -            printk("Audit %d: gen count!=1 (c=%x) t=%" PRtype_info " ot=%x mfn=%lx\n",
  15.890 -                   d->domain_id,
  15.891 -                   page->count_info,
  15.892 -                   page->u.inuse.type_info, 
  15.893 -                   page->tlbflush_timestamp, mfn );
  15.894 -            //errors++;
  15.895 -            scan_for_pfn_remote(mfn);
  15.896 -        }
  15.897 -
  15.898 -        list_ent = page->list.next;
  15.899 -    }
  15.900 -
  15.901 -    if ( shadow_mode_enabled(d) )
  15.902 -    {
  15.903 -        struct shadow_status *a;
  15.904 -        struct page_info *page;
  15.905 -        u32 page_type;
  15.906 -        int i;
  15.907 -
  15.908 -        for ( i = 0; i < shadow_ht_buckets; i++ )
  15.909 -        {
  15.910 -            a = &d->arch.shadow_ht[i];
  15.911 -            while ( a && a->gpfn_and_flags )
  15.912 -            {
  15.913 -                page = mfn_to_page(a->smfn);
  15.914 -                page_type = a->gpfn_and_flags & PGT_type_mask;
  15.915 -
  15.916 -                switch ( page_type ) {
  15.917 -                case PGT_l1_shadow:
  15.918 -                case PGT_l2_shadow:
  15.919 -                case PGT_hl2_shadow:
  15.920 -                case PGT_snapshot:
  15.921 -                    if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) ||
  15.922 -                         (page->count_info != 0) )
  15.923 -                    {
  15.924 -                        printk("Audit %d: shadow page counts wrong "
  15.925 -                               "mfn=%lx t=%" PRtype_info " c=%08x\n",
  15.926 -                               d->domain_id, page_to_mfn(page),
  15.927 -                               page->u.inuse.type_info,
  15.928 -                               page->count_info);
  15.929 -                        printk("a->gpfn_and_flags=%"PRIx64"\n",
  15.930 -                               (u64)a->gpfn_and_flags);
  15.931 -                        errors++;
  15.932 -                    }
  15.933 -                    break;
  15.934 -                case PGT_writable_pred:
  15.935 -                    // XXX - nothing to check?
  15.936 -                    break;
  15.937 -
  15.938 -                default:
  15.939 -                    BUG();
  15.940 -                    break;
  15.941 -                }
  15.942 -
  15.943 -                a = a->next;
  15.944 -            }
  15.945 -        }
  15.946 -    }
  15.947 -
  15.948 -    /* PHASE 3 */
  15.949 -    ctot = ttot = page_count = l1 = l2 = oos_count = 0;
  15.950 -
  15.951 -    audit_adjust_pgtables(d, 1, 0);
  15.952 -
  15.953 -#if 0
  15.954 -    // This covers our sins of trashing the tlbflush_timestamps...
  15.955 -    //
  15.956 -    local_flush_tlb();
  15.957 -#endif
  15.958 -
  15.959 -    spin_unlock(&d->page_alloc_lock);
  15.960 -
  15.961 -    if ( !(flags & AUDIT_QUIET) )
  15.962 -        printk("Audit dom%d Done. "
  15.963 -               "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
  15.964 -               d->domain_id, page_count, oos_count, l1, l2, ctot, ttot);
  15.965 -
  15.966 -    if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
  15.967 -        shadow_unlock(d);
  15.968 -
  15.969 -    if ( d != current->domain )
  15.970 -        domain_unpause(d);
  15.971 -
  15.972 -    if ( errors && !(flags & AUDIT_ERRORS_OK) )
  15.973 -        BUG();
  15.974 -}
  15.975 -
  15.976 -void audit_domains(void)
  15.977 -{
  15.978 -    struct domain *d;
  15.979 -    for_each_domain ( d )
  15.980 -        audit_domain(d);
  15.981 -}
  15.982 -
  15.983 -void audit_domains_key(unsigned char key)
  15.984 -{
  15.985 -    audit_domains();
  15.986 -}
  15.987 -#endif
    16.1 --- a/xen/arch/x86/dom0_ops.c	Wed Aug 16 16:16:32 2006 +0100
    16.2 +++ b/xen/arch/x86/dom0_ops.c	Wed Aug 16 17:02:35 2006 +0100
    16.3 @@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op,
    16.4          d = find_domain_by_id(op->u.shadow_control.domain);
    16.5          if ( d != NULL )
    16.6          {
    16.7 -            ret = shadow_mode_control(d, &op->u.shadow_control);
    16.8 +            ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op);
    16.9              put_domain(d);
   16.10              copy_to_guest(u_dom0_op, op, 1);
   16.11          } 
    17.1 --- a/xen/arch/x86/domain.c	Wed Aug 16 16:16:32 2006 +0100
    17.2 +++ b/xen/arch/x86/domain.c	Wed Aug 16 17:02:35 2006 +0100
    17.3 @@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct do
    17.4      v->arch.perdomain_ptes =
    17.5          d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
    17.6  
    17.7 -    v->arch.guest_vtable  = __linear_l2_table;
    17.8 -    v->arch.shadow_vtable = __shadow_linear_l2_table;
    17.9 -#if defined(__x86_64__)
   17.10 -    v->arch.guest_vl3table = __linear_l3_table;
   17.11 -    v->arch.guest_vl4table = __linear_l4_table;
   17.12 -#endif
   17.13 -
   17.14      pae_l3_cache_init(&v->arch.pae_l3_cache);
   17.15  
   17.16      return v;
   17.17 @@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d)
   17.18  {
   17.19      l1_pgentry_t gdt_l1e;
   17.20      int vcpuid, pdpt_order;
   17.21 -#ifdef __x86_64__
   17.22      int i;
   17.23 -#endif
   17.24  
   17.25      pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
   17.26      d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
   17.27 @@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d)
   17.28  
   17.29  #endif /* __x86_64__ */
   17.30  
   17.31 -    shadow_lock_init(d);
   17.32 -    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
   17.33 +    shadow2_lock_init(d);
   17.34 +    for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
   17.35 +        INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]);
   17.36 +    INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist);
   17.37 +    INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse);
   17.38 +    INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows);
   17.39  
   17.40      if ( !is_idle_domain(d) )
   17.41      {
   17.42 @@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d)
   17.43  
   17.44  void arch_domain_destroy(struct domain *d)
   17.45  {
   17.46 +    shadow2_final_teardown(d);
   17.47 +
   17.48      free_xenheap_pages(
   17.49          d->arch.mm_perdomain_pt,
   17.50          get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
   17.51 @@ -328,14 +325,6 @@ int arch_set_info_guest(
   17.52          if ( !hvm_initialize_guest_resources(v) )
   17.53              return -EINVAL;
   17.54      }
   17.55 -    else if ( shadow_mode_refcounts(d) )
   17.56 -    {
   17.57 -        if ( !get_page(mfn_to_page(cr3_pfn), d) )
   17.58 -        {
   17.59 -            destroy_gdt(v);
   17.60 -            return -EINVAL;
   17.61 -        }
   17.62 -    }
   17.63      else
   17.64      {
   17.65          if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
   17.66 @@ -344,9 +333,16 @@ int arch_set_info_guest(
   17.67              destroy_gdt(v);
   17.68              return -EINVAL;
   17.69          }
   17.70 -    }
   17.71 +    }    
   17.72  
   17.73 -    update_pagetables(v);
   17.74 +    /* Shadow2: make sure the domain has enough shadow memory to
   17.75 +     * boot another vcpu */
   17.76 +    if ( shadow2_mode_enabled(d) 
   17.77 +         && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) )
   17.78 +    {
   17.79 +        destroy_gdt(v);
   17.80 +        return -ENOMEM;
   17.81 +    }
   17.82  
   17.83      if ( v->vcpu_id == 0 )
   17.84          update_domain_wallclock_time(d);
   17.85 @@ -354,6 +350,11 @@ int arch_set_info_guest(
   17.86      /* Don't redo final setup */
   17.87      set_bit(_VCPUF_initialised, &v->vcpu_flags);
   17.88  
   17.89 +    if ( shadow2_mode_enabled(d) )
   17.90 +        shadow2_update_paging_modes(v);
   17.91 +
   17.92 +    update_cr3(v);
   17.93 +
   17.94      return 0;
   17.95  }
   17.96  
   17.97 @@ -669,7 +670,6 @@ static void __context_switch(void)
   17.98              loaddebug(&n->arch.guest_context, 6);
   17.99              loaddebug(&n->arch.guest_context, 7);
  17.100          }
  17.101 -
  17.102          n->arch.ctxt_switch_to(n);
  17.103      }
  17.104  
  17.105 @@ -927,29 +927,34 @@ void domain_relinquish_resources(struct 
  17.106      /* Drop the in-use references to page-table bases. */
  17.107      for_each_vcpu ( d, v )
  17.108      {
  17.109 -        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
  17.110 +        /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
  17.111 +         * or sh2_update_paging_modes()) */
  17.112 +        pfn = pagetable_get_pfn(v->arch.guest_table);
  17.113 +        if ( pfn != 0 )
  17.114          {
  17.115 -            if ( !shadow_mode_refcounts(d) )
  17.116 -                put_page_type(mfn_to_page(pfn));
  17.117 -            put_page(mfn_to_page(pfn));
  17.118 -
  17.119 +            if ( shadow2_mode_refcounts(d) )
  17.120 +                put_page(mfn_to_page(pfn));
  17.121 +            else
  17.122 +                put_page_and_type(mfn_to_page(pfn));
  17.123              v->arch.guest_table = pagetable_null();
  17.124          }
  17.125  
  17.126 -        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
  17.127 +#ifdef __x86_64__
  17.128 +        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
  17.129 +        pfn = pagetable_get_pfn(v->arch.guest_table_user);
  17.130 +        if ( pfn != 0 )
  17.131          {
  17.132 -            if ( !shadow_mode_refcounts(d) )
  17.133 -                put_page_type(mfn_to_page(pfn));
  17.134 -            put_page(mfn_to_page(pfn));
  17.135 -
  17.136 +            put_page_and_type(mfn_to_page(pfn));
  17.137              v->arch.guest_table_user = pagetable_null();
  17.138          }
  17.139 +#endif
  17.140      }
  17.141  
  17.142      if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
  17.143          hvm_relinquish_guest_resources(d);
  17.144  
  17.145 -    shadow_mode_disable(d);
  17.146 +    /* Tear down shadow mode stuff. */
  17.147 +    shadow2_teardown(d);
  17.148  
  17.149      /*
  17.150       * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
  17.151 @@ -964,26 +969,23 @@ void domain_relinquish_resources(struct 
  17.152  
  17.153      /* Free page used by xen oprofile buffer */
  17.154      free_xenoprof_pages(d);
  17.155 -
  17.156  }
  17.157  
  17.158  void arch_dump_domain_info(struct domain *d)
  17.159  {
  17.160 -    if ( shadow_mode_enabled(d) )
  17.161 +    if ( shadow2_mode_enabled(d) )
  17.162      {
  17.163 -        printk("    shadow mode: ");
  17.164 -        if ( shadow_mode_refcounts(d) )
  17.165 +        printk("    shadow2 mode: ");
  17.166 +        if ( d->arch.shadow2_mode & SHM2_enable )
  17.167 +            printk("enabled ");
  17.168 +        if ( shadow2_mode_refcounts(d) )
  17.169              printk("refcounts ");
  17.170 -        if ( shadow_mode_write_all(d) )
  17.171 -            printk("write_all ");
  17.172 -        if ( shadow_mode_log_dirty(d) )
  17.173 +        if ( shadow2_mode_log_dirty(d) )
  17.174              printk("log_dirty ");
  17.175 -        if ( shadow_mode_translate(d) )
  17.176 +        if ( shadow2_mode_translate(d) )
  17.177              printk("translate ");
  17.178 -        if ( shadow_mode_external(d) )
  17.179 +        if ( shadow2_mode_external(d) )
  17.180              printk("external ");
  17.181 -        if ( shadow_mode_wr_pt_pte(d) )
  17.182 -            printk("wr_pt_pte ");
  17.183          printk("\n");
  17.184      }
  17.185  }
    18.1 --- a/xen/arch/x86/domain_build.c	Wed Aug 16 16:16:32 2006 +0100
    18.2 +++ b/xen/arch/x86/domain_build.c	Wed Aug 16 17:02:35 2006 +0100
    18.3 @@ -683,8 +683,11 @@ int construct_dom0(struct domain *d,
    18.4      for ( i = 1; i < opt_dom0_max_vcpus; i++ )
    18.5          (void)alloc_vcpu(d, i, i);
    18.6  
    18.7 -    /* Set up monitor table */
    18.8 -    update_pagetables(v);
    18.9 +    /* Set up CR3 value for write_ptbase */
   18.10 +    if ( shadow2_mode_enabled(v->domain) )
   18.11 +        shadow2_update_paging_modes(v);
   18.12 +    else
   18.13 +        update_cr3(v);
   18.14  
   18.15      /* Install the new page tables. */
   18.16      local_irq_disable();
   18.17 @@ -796,10 +799,8 @@ int construct_dom0(struct domain *d,
   18.18      new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
   18.19  
   18.20      if ( opt_dom0_shadow )
   18.21 -    {
   18.22 -        shadow_mode_enable(d, SHM_enable);
   18.23 -        update_pagetables(v);
   18.24 -    }
   18.25 +        if ( shadow2_test_enable(d) == 0 ) 
   18.26 +            shadow2_update_paging_modes(v);
   18.27  
   18.28      if ( supervisor_mode_kernel )
   18.29      {
    19.1 --- a/xen/arch/x86/hvm/hvm.c	Wed Aug 16 16:16:32 2006 +0100
    19.2 +++ b/xen/arch/x86/hvm/hvm.c	Wed Aug 16 17:02:35 2006 +0100
    19.3 @@ -30,6 +30,7 @@
    19.4  #include <xen/hypercall.h>
    19.5  #include <xen/guest_access.h>
    19.6  #include <xen/event.h>
    19.7 +#include <xen/shadow.h>
    19.8  #include <asm/current.h>
    19.9  #include <asm/e820.h>
   19.10  #include <asm/io.h>
   19.11 @@ -42,10 +43,6 @@
   19.12  #include <asm/spinlock.h>
   19.13  #include <asm/hvm/hvm.h>
   19.14  #include <asm/hvm/support.h>
   19.15 -#include <asm/shadow.h>
   19.16 -#if CONFIG_PAGING_LEVELS >= 3
   19.17 -#include <asm/shadow_64.h>
   19.18 -#endif
   19.19  #include <public/sched.h>
   19.20  #include <public/hvm/ioreq.h>
   19.21  #include <public/version.h>
   19.22 @@ -61,7 +58,7 @@ struct hvm_function_table hvm_funcs;
   19.23  static void hvm_zap_mmio_range(
   19.24      struct domain *d, unsigned long pfn, unsigned long nr_pfn)
   19.25  {
   19.26 -    unsigned long i, val = INVALID_MFN;
   19.27 +    unsigned long i;
   19.28  
   19.29      ASSERT(d == current->domain);
   19.30  
   19.31 @@ -70,7 +67,8 @@ static void hvm_zap_mmio_range(
   19.32          if ( pfn + i >= 0xfffff )
   19.33              break;
   19.34  
   19.35 -        __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
   19.36 +        if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) )
   19.37 +            guest_remove_page(d, pfn + i);
   19.38      }
   19.39  }
   19.40  
   19.41 @@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d
   19.42      if ( !hvm_guest(v) || (v->vcpu_id != 0) )
   19.43          return;
   19.44  
   19.45 +#if 0 /* SHADOW2 does not have this */
   19.46      if ( shadow_direct_map_init(d) == 0 )
   19.47      {
   19.48          printk("Can not allocate shadow direct map for HVM domain.\n");
   19.49          domain_crash_synchronous();
   19.50      }
   19.51 +#endif
   19.52  
   19.53      hvm_zap_iommu_pages(d);
   19.54  
   19.55 @@ -380,6 +380,8 @@ void hvm_hlt(unsigned long rflags)
   19.56   */
   19.57  int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
   19.58  {
   19.59 +    struct vcpu *v = current;
   19.60 +    unsigned long gfn;
   19.61      unsigned long mfn;
   19.62      char *addr;
   19.63      int count;
   19.64 @@ -389,10 +391,9 @@ int hvm_copy(void *buf, unsigned long va
   19.65          if (count > size)
   19.66              count = size;
   19.67  
   19.68 -        if (hvm_paging_enabled(current))
   19.69 -            mfn = gva_to_mfn(vaddr);
   19.70 -        else
   19.71 -            mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT);
   19.72 +        gfn = shadow2_gva_to_gfn(v, vaddr);
   19.73 +        mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
   19.74 +
   19.75          if (mfn == INVALID_MFN)
   19.76              return 0;
   19.77  
   19.78 @@ -545,7 +546,7 @@ void hvm_do_hypercall(struct cpu_user_re
   19.79          return;
   19.80      }
   19.81  
   19.82 -    if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 )
   19.83 +    if ( current->arch.shadow2->guest_levels == 4 )
   19.84      {
   19.85          pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
   19.86                                                         pregs->rsi,
    20.1 --- a/xen/arch/x86/hvm/platform.c	Wed Aug 16 16:16:32 2006 +0100
    20.2 +++ b/xen/arch/x86/hvm/platform.c	Wed Aug 16 17:02:35 2006 +0100
    20.3 @@ -21,7 +21,7 @@
    20.4  #include <xen/config.h>
    20.5  #include <xen/types.h>
    20.6  #include <xen/mm.h>
    20.7 -#include <asm/shadow.h>
    20.8 +#include <xen/shadow.h>
    20.9  #include <xen/domain_page.h>
   20.10  #include <asm/page.h>
   20.11  #include <xen/event.h>
   20.12 @@ -35,9 +35,6 @@
   20.13  #include <xen/lib.h>
   20.14  #include <xen/sched.h>
   20.15  #include <asm/current.h>
   20.16 -#if CONFIG_PAGING_LEVELS >= 3
   20.17 -#include <asm/shadow_64.h>
   20.18 -#endif
   20.19  
   20.20  #define DECODE_success  1
   20.21  #define DECODE_failure  0
   20.22 @@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
   20.23  
   20.24      if (pvalid) {
   20.25          if (hvm_paging_enabled(current))
   20.26 -            p->u.pdata = (void *) gva_to_gpa(value);
   20.27 +            p->u.data = shadow2_gva_to_gpa(current, value);
   20.28          else
   20.29              p->u.pdata = (void *) value; /* guest VA == guest PA */
   20.30      } else
   20.31 @@ -774,7 +771,7 @@ void send_mmio_req(
   20.32  
   20.33      if (pvalid) {
   20.34          if (hvm_paging_enabled(v))
   20.35 -            p->u.pdata = (void *) gva_to_gpa(value);
   20.36 +            p->u.data = shadow2_gva_to_gpa(v, value);
   20.37          else
   20.38              p->u.pdata = (void *) value; /* guest VA == guest PA */
   20.39      } else
    21.1 --- a/xen/arch/x86/hvm/svm/svm.c	Wed Aug 16 16:16:32 2006 +0100
    21.2 +++ b/xen/arch/x86/hvm/svm/svm.c	Wed Aug 16 17:02:35 2006 +0100
    21.3 @@ -26,9 +26,10 @@
    21.4  #include <xen/irq.h>
    21.5  #include <xen/softirq.h>
    21.6  #include <xen/hypercall.h>
    21.7 +#include <xen/domain_page.h>
    21.8  #include <asm/current.h>
    21.9  #include <asm/io.h>
   21.10 -#include <asm/shadow.h>
   21.11 +#include <asm/shadow2.h>
   21.12  #include <asm/regs.h>
   21.13  #include <asm/cpufeature.h>
   21.14  #include <asm/processor.h>
   21.15 @@ -43,10 +44,6 @@
   21.16  #include <asm/hvm/svm/emulate.h>
   21.17  #include <asm/hvm/svm/vmmcall.h>
   21.18  #include <asm/hvm/svm/intr.h>
   21.19 -#include <asm/shadow.h>
   21.20 -#if CONFIG_PAGING_LEVELS >= 3
   21.21 -#include <asm/shadow_64.h>
   21.22 -#endif
   21.23  #include <public/sched.h>
   21.24  
   21.25  #define SVM_EXTRA_DEBUG
   21.26 @@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v)
   21.27      return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
   21.28  }
   21.29  
   21.30 -static int svm_instruction_length(struct vcpu *v)
   21.31 +int svm_guest_x86_mode(struct vcpu *v)
   21.32  {
   21.33      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
   21.34      unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
   21.35 @@ -423,10 +420,20 @@ static int svm_instruction_length(struct
   21.36          mode = vmcb->cs.attributes.fields.l ? 8 : 4;
   21.37      else
   21.38          mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
   21.39 -    return svm_instrlen(guest_cpu_user_regs(), mode);
   21.40 +    return mode;
   21.41 +}
   21.42 +
   21.43 +int svm_instruction_length(struct vcpu *v)
   21.44 +{
   21.45 +    return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
   21.46  }
   21.47  
   21.48 -static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
   21.49 +void svm_update_host_cr3(struct vcpu *v)
   21.50 +{
   21.51 +    /* SVM doesn't have a HOST_CR3 equivalent to update. */
   21.52 +}
   21.53 +
   21.54 +unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
   21.55  {
   21.56      switch ( num )
   21.57      {
   21.58 @@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(st
   21.59          return v->arch.hvm_svm.cpu_cr2;
   21.60      case 3:
   21.61          return v->arch.hvm_svm.cpu_cr3;
   21.62 +    case 4:
   21.63 +        return v->arch.hvm_svm.cpu_shadow_cr4;
   21.64      default:
   21.65          BUG();
   21.66      }
   21.67 @@ -526,8 +535,6 @@ static void svm_init_hypercall_page(stru
   21.68  }
   21.69  
   21.70  
   21.71 -
   21.72 -
   21.73  int svm_dbg_on = 0;
   21.74  
   21.75  static inline int svm_do_debugout(unsigned long exit_code)
   21.76 @@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs(
   21.77      svm_load_cpu_user_regs(v, regs);
   21.78  }
   21.79  
   21.80 +int svm_long_mode_enabled(struct vcpu *v)
   21.81 +{
   21.82 +    return SVM_LONG_GUEST(v);
   21.83 +}
   21.84 +
   21.85  
   21.86  
   21.87  static void arch_svm_do_launch(struct vcpu *v) 
   21.88 @@ -726,7 +738,6 @@ static void svm_ctxt_switch_to(struct vc
   21.89  static void svm_final_setup_guest(struct vcpu *v)
   21.90  {
   21.91      struct domain *d = v->domain;
   21.92 -    struct vcpu *vc;
   21.93  
   21.94      v->arch.schedule_tail    = arch_svm_do_launch;
   21.95      v->arch.ctxt_switch_from = svm_ctxt_switch_from;
   21.96 @@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct
   21.97      if ( v != d->vcpu[0] )
   21.98          return;
   21.99  
  21.100 -    /* Initialize monitor page table */
  21.101 -    for_each_vcpu( d, vc )
  21.102 -        vc->arch.monitor_table = pagetable_null();
  21.103 +    if ( !shadow2_mode_external(d) )
  21.104 +    {
  21.105 +        DPRINTK("Can't init HVM for dom %u vcpu %u: "
  21.106 +                "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
  21.107 +        domain_crash(d);
  21.108 +    }
  21.109  
  21.110      /* 
  21.111       * Required to do this once per domain
  21.112 @@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct
  21.113       */
  21.114      memset(&d->shared_info->evtchn_mask[0], 0xff, 
  21.115             sizeof(d->shared_info->evtchn_mask));       
  21.116 -
  21.117 -    /* 
  21.118 -     * Put the domain in shadow mode even though we're going to be using
  21.119 -     * the shared 1:1 page table initially. It shouldn't hurt 
  21.120 -     */
  21.121 -    shadow_mode_enable(d, SHM_enable|SHM_refcounts|
  21.122 -                       SHM_translate|SHM_external|SHM_wr_pt_pte);
  21.123  }
  21.124  
  21.125  
  21.126 @@ -809,9 +816,13 @@ int start_svm(void)
  21.127  
  21.128      hvm_funcs.realmode = svm_realmode;
  21.129      hvm_funcs.paging_enabled = svm_paging_enabled;
  21.130 +    hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
  21.131 +    hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
  21.132      hvm_funcs.instruction_length = svm_instruction_length;
  21.133      hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
  21.134  
  21.135 +    hvm_funcs.update_host_cr3 = svm_update_host_cr3;
  21.136 +    
  21.137      hvm_funcs.stts = svm_stts;
  21.138      hvm_funcs.set_tsc_offset = svm_set_tsc_offset;
  21.139  
  21.140 @@ -834,7 +845,6 @@ static void svm_relinquish_guest_resourc
  21.141              continue;
  21.142  
  21.143          destroy_vmcb(&v->arch.hvm_svm);
  21.144 -        free_monitor_pagetable(v);
  21.145          kill_timer(&v->arch.hvm_vcpu.hlt_timer);
  21.146          if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
  21.147          {
  21.148 @@ -851,8 +861,6 @@ static void svm_relinquish_guest_resourc
  21.149  
  21.150      if ( d->arch.hvm_domain.buffered_io_va )
  21.151          unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
  21.152 -
  21.153 -    shadow_direct_map_clean(d);
  21.154  }
  21.155  
  21.156  
  21.157 @@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned lo
  21.158  {
  21.159      struct vcpu *v = current;
  21.160      unsigned long eip;
  21.161 -    unsigned long gpa; /* FIXME: PAE */
  21.162      int result;
  21.163      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
  21.164  
  21.165 @@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned lo
  21.166              va, eip, (unsigned long)regs->error_code);
  21.167  //#endif
  21.168  
  21.169 -    if ( !svm_paging_enabled(v) )
  21.170 -    {
  21.171 -        if ( shadow_direct_map_fault(va, regs) ) 
  21.172 -            return 1;
  21.173 -
  21.174 -        handle_mmio(va, va);
  21.175 -        return 1;
  21.176 -    }
  21.177 -
  21.178 -
  21.179 -    gpa = gva_to_gpa(va);
  21.180 -
  21.181 -    /* Use 1:1 page table to identify MMIO address space */
  21.182 -    if (mmio_space(gpa))
  21.183 -    {
  21.184 -        /* No support for APIC */
  21.185 -        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
  21.186 -        { 
  21.187 -            int inst_len;
  21.188 -            inst_len = svm_instruction_length(v);
  21.189 -            if (inst_len == -1)
  21.190 -            {
  21.191 -                printf("%s: INST_LEN - Unable to decode properly\n", __func__);
  21.192 -                domain_crash_synchronous();
  21.193 -            }
  21.194 -
  21.195 -            __update_guest_eip(vmcb, inst_len);
  21.196 -
  21.197 -            return 1;
  21.198 -        }
  21.199 -
  21.200 -        handle_mmio(va, gpa);
  21.201 -
  21.202 -        return 1;
  21.203 -    }
  21.204 -    
  21.205 -    result = shadow_fault(va, regs);
  21.206 +    result = shadow2_fault(va, regs); 
  21.207  
  21.208      if( result ) {
  21.209          /* Let's make sure that the Guest TLB is flushed */
  21.210 @@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct v
  21.211              clear_bit(X86_FEATURE_APIC, &edx);
  21.212          }
  21.213  
  21.214 -#if CONFIG_PAGING_LEVELS < 3
  21.215 -        clear_bit(X86_FEATURE_PAE, &edx);
  21.216 -        clear_bit(X86_FEATURE_PSE, &edx);
  21.217 +#if CONFIG_PAGING_LEVELS >= 3
  21.218 +        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
  21.219 +#endif
  21.220 +            clear_bit(X86_FEATURE_PAE, &edx);
  21.221          clear_bit(X86_FEATURE_PSE36, &edx);
  21.222 -#else
  21.223 -        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
  21.224 -        {
  21.225 -            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
  21.226 -                clear_bit(X86_FEATURE_PAE, &edx);
  21.227 -            clear_bit(X86_FEATURE_PSE, &edx);
  21.228 -            clear_bit(X86_FEATURE_PSE36, &edx);
  21.229 -        }
  21.230 -#endif
  21.231 +
  21.232          /* Clear out reserved bits. */
  21.233          ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
  21.234          edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
  21.235 @@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct v
  21.236          clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
  21.237  #endif
  21.238  
  21.239 -#if CONFIG_PAGING_LEVELS < 3
  21.240 -        clear_bit(X86_FEATURE_NX & 31, &edx);
  21.241 -        clear_bit(X86_FEATURE_PAE, &edx);
  21.242 -        clear_bit(X86_FEATURE_PSE, &edx);
  21.243 +
  21.244 +#if CONFIG_PAGING_LEVELS >= 3
  21.245 +        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
  21.246 +#endif
  21.247 +            clear_bit(X86_FEATURE_PAE, &edx);
  21.248          clear_bit(X86_FEATURE_PSE36, &edx);
  21.249 -#else
  21.250 -        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
  21.251 -        {
  21.252 -            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
  21.253 -            {
  21.254 -                clear_bit(X86_FEATURE_NX & 31, &edx);
  21.255 -                clear_bit(X86_FEATURE_PAE, &edx);
  21.256 -            }
  21.257 -            clear_bit(X86_FEATURE_PSE, &edx);
  21.258 -            clear_bit(X86_FEATURE_PSE36, &edx);
  21.259 -        }
  21.260 -#endif
  21.261  
  21.262          /* Make SVM feature invisible to the guest. */
  21.263          clear_bit(X86_FEATURE_SVME & 31, &ecx);
  21.264 @@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long val
  21.265      unsigned long mfn;
  21.266      int paging_enabled;
  21.267      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
  21.268 +    unsigned long old_base_mfn;
  21.269    
  21.270      ASSERT(vmcb);
  21.271  
  21.272 @@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long val
  21.273              set_bit(SVM_CPU_STATE_LMA_ENABLED,
  21.274                      &v->arch.hvm_svm.cpu_state);
  21.275              vmcb->efer |= (EFER_LMA | EFER_LME);
  21.276 -            if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
  21.277 -            {
  21.278 -                printk("Unsupported guest paging levels\n");
  21.279 -                domain_crash_synchronous(); /* need to take a clean path */
  21.280 -            }
  21.281          }
  21.282 -        else
  21.283  #endif  /* __x86_64__ */
  21.284 -        {
  21.285 -#if CONFIG_PAGING_LEVELS >= 3
  21.286 -            /* seems it's a 32-bit or 32-bit PAE guest */
  21.287 -            if ( test_bit(SVM_CPU_STATE_PAE_ENABLED,
  21.288 -                        &v->arch.hvm_svm.cpu_state) )
  21.289 -            {
  21.290 -                /* The guest enables PAE first and then it enables PG, it is
  21.291 -                 * really a PAE guest */
  21.292 -                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
  21.293 -                {
  21.294 -                    printk("Unsupported guest paging levels\n");
  21.295 -                    domain_crash_synchronous();
  21.296 -                }
  21.297 -            }
  21.298 -            else
  21.299 -            {
  21.300 -                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
  21.301 -                {
  21.302 -                    printk("Unsupported guest paging levels\n");
  21.303 -                    domain_crash_synchronous(); /* need to take a clean path */
  21.304 -                }
  21.305 -            }
  21.306 -#endif
  21.307 -        }
  21.308  
  21.309          /* Now arch.guest_table points to machine physical. */
  21.310 +        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  21.311          v->arch.guest_table = pagetable_from_pfn(mfn);
  21.312 -        update_pagetables(v);
  21.313 +        if ( old_base_mfn )
  21.314 +            put_page(mfn_to_page(old_base_mfn));
  21.315 +        shadow2_update_paging_modes(v);
  21.316  
  21.317          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
  21.318                  (unsigned long) (mfn << PAGE_SHIFT));
  21.319  
  21.320 +        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
  21.321          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  21.322 -        vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
  21.323 -
  21.324 -        /* arch->shadow_table should hold the next CR3 for shadow */
  21.325 -        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n", 
  21.326 -                    v->arch.hvm_svm.cpu_cr3, mfn);
  21.327 -
  21.328 -        return 1;
  21.329      }
  21.330  
  21.331      if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
  21.332 @@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long val
  21.333              svm_inject_exception(v, TRAP_gp_fault, 1, 0);
  21.334              return 0;
  21.335          }
  21.336 -
  21.337 -        clear_all_shadow_status( v->domain );
  21.338 +        shadow2_update_paging_modes(v);
  21.339 +        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
  21.340          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  21.341 -        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
  21.342      }
  21.343      else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
  21.344      {
  21.345          /* we should take care of this kind of situation */
  21.346 -        clear_all_shadow_status(v->domain);
  21.347 +        shadow2_update_paging_modes(v);
  21.348 +        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
  21.349          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  21.350 -        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
  21.351      }
  21.352  
  21.353      return 1;
  21.354 @@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, 
  21.355              mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
  21.356              if (mfn != pagetable_get_pfn(v->arch.guest_table))
  21.357                  __hvm_bug(regs);
  21.358 -            shadow_sync_all(v->domain);
  21.359 +            shadow2_update_cr3(v);
  21.360          }
  21.361          else 
  21.362          {
  21.363 @@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, 
  21.364              /*
  21.365               * arch.shadow_table should now hold the next CR3 for shadow
  21.366               */
  21.367 -#if CONFIG_PAGING_LEVELS >= 3
  21.368 -            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
  21.369 -                shadow_sync_all(v->domain);
  21.370 -#endif
  21.371              v->arch.hvm_svm.cpu_cr3 = value;
  21.372 -            update_pagetables(v);
  21.373 +            update_cr3(v);
  21.374 +            vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
  21.375              HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
  21.376 -            vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
  21.377          }
  21.378          break;
  21.379      }
  21.380 @@ -1839,12 +1755,6 @@ static int mov_to_cr(int gpreg, int cr, 
  21.381  #if CONFIG_PAGING_LEVELS >= 3
  21.382                  unsigned long mfn, old_base_mfn;
  21.383  
  21.384 -                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
  21.385 -                {
  21.386 -                    printk("Unsupported guest paging levels\n");
  21.387 -                    domain_crash_synchronous(); /* need to take a clean path */
  21.388 -                }
  21.389 -
  21.390                  if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
  21.391                                      v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) ||
  21.392                       !get_page(mfn_to_page(mfn), v->domain) )
  21.393 @@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, 
  21.394                      domain_crash_synchronous(); /* need to take a clean path */
  21.395                  }
  21.396  
  21.397 -                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  21.398 -                if ( old_base_mfn )
  21.399 -                    put_page(mfn_to_page(old_base_mfn));
  21.400 -
  21.401                  /*
  21.402                   * Now arch.guest_table points to machine physical.
  21.403                   */
  21.404  
  21.405 +                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  21.406                  v->arch.guest_table = pagetable_from_pfn(mfn);
  21.407 -                update_pagetables(v);
  21.408 +                if ( old_base_mfn )
  21.409 +                    put_page(mfn_to_page(old_base_mfn));
  21.410 +                shadow2_update_paging_modes(v);
  21.411  
  21.412                  HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
  21.413                              (unsigned long) (mfn << PAGE_SHIFT));
  21.414  
  21.415 -                vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
  21.416 +                vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
  21.417  
  21.418                  /*
  21.419                   * arch->shadow_table should hold the next CR3 for shadow
  21.420 @@ -1878,33 +1787,6 @@ static int mov_to_cr(int gpreg, int cr, 
  21.421                              v->arch.hvm_svm.cpu_cr3, mfn);
  21.422  #endif
  21.423              }
  21.424 -            else
  21.425 -            {
  21.426 -                /*  The guest is a 64 bit or 32-bit PAE guest. */
  21.427 -#if CONFIG_PAGING_LEVELS >= 3
  21.428 -                if ( (v->domain->arch.ops != NULL) &&
  21.429 -                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
  21.430 -                {
  21.431 -                    /* Seems the guest first enables PAE without enabling PG,
  21.432 -                     * it must enable PG after that, and it is a 32-bit PAE
  21.433 -                     * guest */
  21.434 -
  21.435 -                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3))
  21.436 -                    {
  21.437 -                        printk("Unsupported guest paging levels\n");
  21.438 -                        domain_crash_synchronous();
  21.439 -                    }                   
  21.440 -                }
  21.441 -                else
  21.442 -                {
  21.443 -                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4))
  21.444 -                    {
  21.445 -                        printk("Unsupported guest paging levels\n");
  21.446 -                        domain_crash_synchronous();
  21.447 -                    }
  21.448 -                }
  21.449 -#endif
  21.450 -            }
  21.451          }
  21.452          else if (value & X86_CR4_PAE) {
  21.453              set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
  21.454 @@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, 
  21.455          if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
  21.456          {
  21.457              set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  21.458 -            shadow_sync_all(v->domain);
  21.459 +            shadow2_update_paging_modes(v);
  21.460          }
  21.461          break;
  21.462      }
  21.463 @@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlp
  21.464  
  21.465      /* Overkill, we may not this */
  21.466      set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  21.467 -    shadow_invlpg(v, g_vaddr);
  21.468 +    shadow2_invlpg(v, g_vaddr);
  21.469  }
  21.470  
  21.471  
  21.472 @@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned l
  21.473      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
  21.474      unsigned long gpa;
  21.475  
  21.476 -    gpa = gva_to_gpa( gva );
  21.477 +    gpa = shadow2_gva_to_gpa(current, gva);
  21.478      printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
  21.479      if( !svm_paging_enabled(v) || mmio_space(gpa) )
  21.480         return;
  21.481 @@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned l
  21.482      __copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ],
  21.483                       sizeof(gpte) );
  21.484      printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
  21.485 -    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
  21.486 +
  21.487 +    BUG(); // need to think about this, and convert usage of
  21.488 +           // phys_to_machine_mapping to use pagetable format...
  21.489 +    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], 
  21.490                        sizeof(spte) );
  21.491 +
  21.492      printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
  21.493  }
  21.494  #endif /* SVM_WALK_GUEST_PAGES */
  21.495 @@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struc
  21.496  
  21.497      if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) 
  21.498      {
  21.499 -        if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
  21.500 +        if (svm_paging_enabled(v) && 
  21.501 +            !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
  21.502          {
  21.503              printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, "
  21.504                     "gpa=%llx\n", intercepts_counter,
  21.505 @@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struc
  21.506  		    (unsigned long long) vmcb->exitinfo1,
  21.507  		    (unsigned long long) vmcb->exitinfo2,
  21.508  		    (unsigned long long) vmcb->exitintinfo.bytes,
  21.509 -            (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) );
  21.510 +            (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
  21.511          }
  21.512          else 
  21.513          {
  21.514 @@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struc
  21.515          && ( ( vmcb->exitinfo2 == vmcb->rip )
  21.516          || vmcb->exitintinfo.bytes) )
  21.517      {
  21.518 -       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))     
  21.519 +       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
  21.520             walk_shadow_and_guest_pt( vmcb->exitinfo2 );
  21.521      }
  21.522  #endif
    22.1 --- a/xen/arch/x86/hvm/svm/vmcb.c	Wed Aug 16 16:16:32 2006 +0100
    22.2 +++ b/xen/arch/x86/hvm/svm/vmcb.c	Wed Aug 16 17:02:35 2006 +0100
    22.3 @@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v)
    22.4          printk("%s: phys_table   = %lx\n", __func__, pt);
    22.5      }
    22.6  
    22.7 -    /* At launch we always use the phys_table */
    22.8 -    vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
    22.9 +    /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */
   22.10 +    vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
   22.11  
   22.12      if (svm_dbg_on) 
   22.13      {
    23.1 --- a/xen/arch/x86/hvm/vlapic.c	Wed Aug 16 16:16:32 2006 +0100
    23.2 +++ b/xen/arch/x86/hvm/vlapic.c	Wed Aug 16 17:02:35 2006 +0100
    23.3 @@ -21,7 +21,8 @@
    23.4  #include <xen/types.h>
    23.5  #include <xen/mm.h>
    23.6  #include <xen/xmalloc.h>
    23.7 -#include <asm/shadow.h>
    23.8 +#include <xen/shadow.h>
    23.9 +#include <xen/domain_page.h>
   23.10  #include <asm/page.h>
   23.11  #include <xen/event.h>
   23.12  #include <xen/trace.h>
    24.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c	Wed Aug 16 16:16:32 2006 +0100
    24.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c	Wed Aug 16 17:02:35 2006 +0100
    24.3 @@ -34,12 +34,8 @@
    24.4  #include <asm/flushtlb.h>
    24.5  #include <xen/event.h>
    24.6  #include <xen/kernel.h>
    24.7 -#include <asm/shadow.h>
    24.8  #include <xen/keyhandler.h>
    24.9 -
   24.10 -#if CONFIG_PAGING_LEVELS >= 3
   24.11 -#include <asm/shadow_64.h>
   24.12 -#endif
   24.13 +#include <asm/shadow2.h>
   24.14  
   24.15  static int vmcs_size;
   24.16  static int vmcs_order;
   24.17 @@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu
   24.18  
   24.19  static void vmx_do_launch(struct vcpu *v)
   24.20  {
   24.21 -/* Update CR3, GDT, LDT, TR */
   24.22 +/* Update CR3, CR0, CR4, GDT, LDT, TR */
   24.23      unsigned int  error = 0;
   24.24      unsigned long cr0, cr4;
   24.25  
   24.26 @@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v
   24.27      error |= __vmwrite(GUEST_TR_BASE, 0);
   24.28      error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
   24.29  
   24.30 -    __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
   24.31 -    __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table));
   24.32 +    shadow2_update_paging_modes(v);
   24.33 +    printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
   24.34 +           __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
   24.35 +    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
   24.36 +    __vmwrite(HOST_CR3, v->arch.cr3);
   24.37  
   24.38      v->arch.schedule_tail = arch_vmx_do_resume;
   24.39  
    25.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Wed Aug 16 16:16:32 2006 +0100
    25.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Wed Aug 16 17:02:35 2006 +0100
    25.3 @@ -26,9 +26,9 @@
    25.4  #include <xen/softirq.h>
    25.5  #include <xen/domain_page.h>
    25.6  #include <xen/hypercall.h>
    25.7 +#include <xen/perfc.h>
    25.8  #include <asm/current.h>
    25.9  #include <asm/io.h>
   25.10 -#include <asm/shadow.h>
   25.11  #include <asm/regs.h>
   25.12  #include <asm/cpufeature.h>
   25.13  #include <asm/processor.h>
   25.14 @@ -40,10 +40,7 @@
   25.15  #include <asm/hvm/vmx/vmx.h>
   25.16  #include <asm/hvm/vmx/vmcs.h>
   25.17  #include <asm/hvm/vmx/cpu.h>
   25.18 -#include <asm/shadow.h>
   25.19 -#if CONFIG_PAGING_LEVELS >= 3
   25.20 -#include <asm/shadow_64.h>
   25.21 -#endif
   25.22 +#include <asm/shadow2.h>
   25.23  #include <public/sched.h>
   25.24  #include <public/hvm/ioreq.h>
   25.25  #include <asm/hvm/vpic.h>
   25.26 @@ -69,11 +66,16 @@ static int vmx_initialize_guest_resource
   25.27      if ( v->vcpu_id != 0 )
   25.28          return 1;
   25.29  
   25.30 +    if ( !shadow2_mode_external(d) )
   25.31 +    {
   25.32 +        DPRINTK("Can't init HVM for dom %u vcpu %u: "
   25.33 +                "not in shadow2 external mode\n", 
   25.34 +                d->domain_id, v->vcpu_id);
   25.35 +        domain_crash(d);
   25.36 +    }
   25.37 +
   25.38      for_each_vcpu ( d, vc )
   25.39      {
   25.40 -        /* Initialize monitor page table */
   25.41 -        vc->arch.monitor_table = pagetable_null();
   25.42 -
   25.43          memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
   25.44  
   25.45          if ( (rc = vmx_create_vmcs(vc)) != 0 )
   25.46 @@ -107,6 +109,7 @@ static int vmx_initialize_guest_resource
   25.47  
   25.48          vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
   25.49          vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
   25.50 +
   25.51      }
   25.52  
   25.53      /*
   25.54 @@ -116,11 +119,6 @@ static int vmx_initialize_guest_resource
   25.55      memset(&d->shared_info->evtchn_mask[0], 0xff,
   25.56             sizeof(d->shared_info->evtchn_mask));
   25.57  
   25.58 -    /* Put the domain in shadow mode even though we're going to be using
   25.59 -     * the shared 1:1 page table initially. It shouldn't hurt */
   25.60 -    shadow_mode_enable(
   25.61 -        d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
   25.62 -
   25.63      return 1;
   25.64  }
   25.65  
   25.66 @@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resourc
   25.67          vmx_destroy_vmcs(v);
   25.68          if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
   25.69              continue;
   25.70 -        free_monitor_pagetable(v);
   25.71          kill_timer(&v->arch.hvm_vcpu.hlt_timer);
   25.72          if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
   25.73          {
   25.74 @@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resourc
   25.75  
   25.76      if ( d->arch.hvm_domain.buffered_io_va )
   25.77          unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
   25.78 -
   25.79 -    shadow_direct_map_clean(d);
   25.80  }
   25.81  
   25.82  #ifdef __x86_64__
   25.83 @@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(stru
   25.84      vmx_vmcs_exit(v);
   25.85  }
   25.86  
   25.87 -static int vmx_realmode(struct vcpu *v)
   25.88 -{
   25.89 -    unsigned long rflags;
   25.90 -
   25.91 -    __vmread(GUEST_RFLAGS, &rflags);
   25.92 -    return rflags & X86_EFLAGS_VM;
   25.93 -}
   25.94 -
   25.95  static int vmx_instruction_length(struct vcpu *v)
   25.96  {
   25.97      unsigned long inst_len;
   25.98 @@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(st
   25.99          return v->arch.hvm_vmx.cpu_cr2;
  25.100      case 3:
  25.101          return v->arch.hvm_vmx.cpu_cr3;
  25.102 +    case 4:
  25.103 +        return v->arch.hvm_vmx.cpu_shadow_cr4;
  25.104      default:
  25.105          BUG();
  25.106      }
  25.107 @@ -753,9 +742,13 @@ static void vmx_setup_hvm_funcs(void)
  25.108  
  25.109      hvm_funcs.realmode = vmx_realmode;
  25.110      hvm_funcs.paging_enabled = vmx_paging_enabled;
  25.111 +    hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
  25.112 +    hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
  25.113      hvm_funcs.instruction_length = vmx_instruction_length;
  25.114      hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
  25.115  
  25.116 +    hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
  25.117 +
  25.118      hvm_funcs.stts = vmx_stts;
  25.119      hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
  25.120  
  25.121 @@ -855,53 +848,25 @@ static void inline __update_guest_eip(un
  25.122      __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
  25.123  }
  25.124  
  25.125 -
  25.126  static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
  25.127  {
  25.128 -    unsigned long gpa; /* FIXME: PAE */
  25.129      int result;
  25.130  
  25.131  #if 0 /* keep for debugging */
  25.132      {
  25.133 -        unsigned long eip;
  25.134 +        unsigned long eip, cs;
  25.135  
  25.136 +        __vmread(GUEST_CS_BASE, &cs);
  25.137          __vmread(GUEST_RIP, &eip);
  25.138          HVM_DBG_LOG(DBG_LEVEL_VMMU,
  25.139 -                    "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
  25.140 -                    va, eip, (unsigned long)regs->error_code);
  25.141 +                    "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
  25.142 +                    "eip = %lx, error_code = %lx\n",
  25.143 +                    va, cs, eip, (unsigned long)regs->error_code);
  25.144      }
  25.145  #endif
  25.146  
  25.147 -    if ( !vmx_paging_enabled(current) )
  25.148 -    {
  25.149 -        /* construct 1-to-1 direct mapping */
  25.150 -        if ( shadow_direct_map_fault(va, regs) ) 
  25.151 -            return 1;
  25.152 -
  25.153 -        handle_mmio(va, va);
  25.154 -        TRACE_VMEXIT (2,2);
  25.155 -        return 1;
  25.156 -    }
  25.157 -    gpa = gva_to_gpa(va);
  25.158 +    result = shadow2_fault(va, regs);
  25.159  
  25.160 -    /* Use 1:1 page table to identify MMIO address space */
  25.161 -    if ( mmio_space(gpa) ){
  25.162 -        struct vcpu *v = current;
  25.163 -        /* No support for APIC */
  25.164 -        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { 
  25.165 -            u32 inst_len;
  25.166 -            __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
  25.167 -            __update_guest_eip(inst_len);
  25.168 -            return 1;
  25.169 -        }
  25.170 -        TRACE_VMEXIT (2,2);
  25.171 -        /* in the case of MMIO, we are more interested in gpa than in va */
  25.172 -        TRACE_VMEXIT (4,gpa);
  25.173 -        handle_mmio(va, gpa);
  25.174 -        return 1;
  25.175 -    }
  25.176 -
  25.177 -    result = shadow_fault(va, regs);
  25.178      TRACE_VMEXIT (2,result);
  25.179  #if 0
  25.180      if ( !result )
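
With shadow2, vmx_do_page_fault() no longer carves out the real-mode (1:1 direct map) and MMIO cases itself; every fault goes to shadow2_fault(), which either fixes the shadow (or starts MMIO emulation) and returns non-zero, or returns zero meaning the guest should take a real #PF. A rough sketch of what the caller is then expected to do (the injection helper name is hypothetical):

    /* Sketch: reacting to the simplified fault handler's return value. */
    if ( vmx_do_page_fault(va, regs) )
        return;                        /* shadow fixed it up; resume guest */
    vmx_inject_guest_pf(va, regs);     /* hypothetical: reflect #PF to guest */
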
  25.181 @@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct c
  25.182                  clear_bit(X86_FEATURE_APIC, &edx);
  25.183              }
  25.184      
  25.185 -#if CONFIG_PAGING_LEVELS < 3
  25.186 -            edx &= ~(bitmaskof(X86_FEATURE_PAE)  |
  25.187 -                     bitmaskof(X86_FEATURE_PSE)  |
  25.188 -                     bitmaskof(X86_FEATURE_PSE36));
  25.189 -#else
  25.190 -            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
  25.191 -            {
  25.192 -                if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
  25.193 -                    clear_bit(X86_FEATURE_PSE36, &edx);
  25.194 -                else
  25.195 -                {
  25.196 -                    clear_bit(X86_FEATURE_PAE, &edx);
  25.197 -                    clear_bit(X86_FEATURE_PSE, &edx);
  25.198 -                    clear_bit(X86_FEATURE_PSE36, &edx);
  25.199 -                }
  25.200 -            }
  25.201 +#if CONFIG_PAGING_LEVELS >= 3
  25.202 +            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
  25.203  #endif
  25.204 +                clear_bit(X86_FEATURE_PAE, &edx);
  25.205 +            clear_bit(X86_FEATURE_PSE36, &edx);
  25.206  
  25.207              ebx &= NUM_THREADS_RESET_MASK;  
  25.208  
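
The rewritten CPUID filtering is deliberately blunter than what it replaces: PSE36 is always hidden from HVM guests, and PAE is advertised only when the domain was created with HVM_PARAM_PAE_ENABLED (and the hypervisor itself has at least three paging levels). Written out without the preprocessor interleaving, the effective logic is roughly:

    /* Sketch: effective feature masking applied to the guest's CPUID leaf 1. */
    int pae_ok = 0;
    #if CONFIG_PAGING_LEVELS >= 3
        pae_ok = v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED];
    #endif
    if ( !pae_ok )
        clear_bit(X86_FEATURE_PAE, &edx);
    clear_bit(X86_FEATURE_PSE36, &edx);     /* never exposed to HVM guests */
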
  25.209 @@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
  25.210       * We do the safest things first, then try to update the shadow
  25.211       * copying from guest
  25.212       */
  25.213 -    shadow_invlpg(v, va);
  25.214 +    shadow2_invlpg(v, va);
  25.215  }
  25.216  
  25.217  
  25.218 @@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct
  25.219  
  25.220      error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
  25.221  
  25.222 -    if (!vmx_paging_enabled(v)) {
  25.223 -        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
  25.224 -        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
  25.225 +    if (!vmx_paging_enabled(v))
  25.226          goto skip_cr3;
  25.227 -    }
  25.228  
  25.229      if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
  25.230          /*
  25.231 @@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct
  25.232              domain_crash_synchronous();
  25.233              return 0;
  25.234          }
  25.235 -        shadow_sync_all(v->domain);
  25.236      } else {
  25.237          /*
  25.238           * If different, make a shadow. Check if the PDBR is valid
  25.239 @@ -1348,13 +1297,17 @@ vmx_world_restore(struct vcpu *v, struct
  25.240           * arch.shadow_table should now hold the next CR3 for shadow
  25.241           */
  25.242          v->arch.hvm_vmx.cpu_cr3 = c->cr3;
  25.243 -        update_pagetables(v);
  25.244 -        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
  25.245 -        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
  25.246      }
  25.247  
  25.248   skip_cr3:
  25.249  
  25.250 +    shadow2_update_paging_modes(v);
  25.251 +    if (!vmx_paging_enabled(v))
  25.252 +        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
  25.253 +    else
  25.254 +        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
  25.255 +    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
  25.256 +
  25.257      error |= __vmread(CR4_READ_SHADOW, &old_cr4);
  25.258      error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
  25.259      error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
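
vmx_world_restore() no longer computes and loads a shadow pagetable address itself; it records the guest's CR3, asks shadow2_update_paging_modes() to rebuild whatever shadow the new state needs, and then loads the resulting v->arch.hvm_vcpu.hw_cr3 (which covers both the paged and the vmxassist/paging-off cases). A condensed sketch of the new flow, with validation and error handling elided:

    /* Sketch: restore path after the change. */
    if ( vmx_paging_enabled(v) && c->cr3 != v->arch.hvm_vmx.cpu_cr3 )
    {
        /* ...validate c->cr3 and take a reference on its frame... */
        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
    }
    shadow2_update_paging_modes(v);                /* recomputes hw_cr3 */
    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
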
  25.260 @@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long val
  25.261      int paging_enabled;
  25.262      unsigned long vm_entry_value;
  25.263      unsigned long old_cr0;
  25.264 +    unsigned long old_base_mfn;
  25.265  
  25.266      /*
  25.267       * CR0: We don't want to lose PE and PG.
  25.268 @@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long val
  25.269              v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
  25.270               !get_page(mfn_to_page(mfn), v->domain) )
  25.271          {
  25.272 -            printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
  25.273 +            printk("Invalid CR3 value = %lx (mfn=%lx)\n", 
  25.274 +                   v->arch.hvm_vmx.cpu_cr3, mfn);
  25.275              domain_crash_synchronous(); /* need to take a clean path */
  25.276          }
  25.277  
  25.278 @@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long val
  25.279              __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
  25.280              vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
  25.281              __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
  25.282 -
  25.283 -            if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
  25.284 -            {
  25.285 -                printk("Unsupported guest paging levels\n");
  25.286 -                domain_crash_synchronous(); /* need to take a clean path */
  25.287 -            }
  25.288          }
  25.289 -        else
  25.290 -#endif  /* __x86_64__ */
  25.291 -        {
  25.292 -#if CONFIG_PAGING_LEVELS >= 3
  25.293 -            /* seems it's a 32-bit or 32-bit PAE guest */
  25.294 -
  25.295 -            if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
  25.296 -                        &v->arch.hvm_vmx.cpu_state) )
  25.297 -            {
  25.298 -                /* The guest enables PAE first and then it enables PG, it is
  25.299 -                 * really a PAE guest */
  25.300 -                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
  25.301 -                {
  25.302 -                    printk("Unsupported guest paging levels\n");
  25.303 -                    domain_crash_synchronous();
  25.304 -                }
  25.305 -            }
  25.306 -            else
  25.307 -            {
  25.308 -                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
  25.309 -                {
  25.310 -                    printk("Unsupported guest paging levels\n");
  25.311 -                    domain_crash_synchronous(); /* need to take a clean path */
  25.312 -                }
  25.313 -            }
  25.314  #endif
  25.315 -        }
  25.316  
  25.317          /*
  25.318           * Now arch.guest_table points to machine physical.
  25.319           */
  25.320 +        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  25.321          v->arch.guest_table = pagetable_from_pfn(mfn);
  25.322 -        update_pagetables(v);
  25.323 +        if (old_base_mfn)
  25.324 +            put_page(mfn_to_page(old_base_mfn));
  25.325 +        shadow2_update_paging_modes(v);
  25.326  
  25.327          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
  25.328                      (unsigned long) (mfn << PAGE_SHIFT));
  25.329  
  25.330 -        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
  25.331 +        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
  25.332          /*
  25.333           * arch->shadow_table should hold the next CR3 for shadow
  25.334           */
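
Worth noting is the reference-counting idiom this hunk adds around the guest_table switch: the new frame is referenced earlier in the function, the old base mfn is read before guest_table is repointed, and the old reference is dropped only afterwards, so the old table can never be freed while still installed. A minimal sketch of the pattern:

    /* Sketch: safely swap the guest's top-level pagetable frame.
     * 'mfn' has already been validated and get_page()'d above. */
    old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
    v->arch.guest_table = pagetable_from_pfn(mfn);
    if ( old_base_mfn )
        put_page(mfn_to_page(old_base_mfn));     /* drop the old ref last */
    shadow2_update_paging_modes(v);              /* rebuild shadows / hw_cr3 */
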
  25.335 @@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long val
  25.336              }
  25.337          }
  25.338  
  25.339 -        clear_all_shadow_status(v->domain);
  25.340          if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
  25.341              set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
  25.342              __vmread(GUEST_RIP, &eip);
  25.343 @@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long val
  25.344      }
  25.345      else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
  25.346      {
  25.347 -        /* we should take care of this kind of situation */
  25.348 -        clear_all_shadow_status(v->domain);
  25.349 -        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
  25.350 +        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
  25.351 +        shadow2_update_paging_modes(v);
  25.352      }
  25.353  
  25.354      return 1;
  25.355 @@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
  25.356              mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
  25.357              if (mfn != pagetable_get_pfn(v->arch.guest_table))
  25.358                  __hvm_bug(regs);
  25.359 -            shadow_sync_all(v->domain);
  25.360 +            shadow2_update_cr3(v);
  25.361          } else {
  25.362              /*
  25.363               * If different, make a shadow. Check if the PDBR is valid
  25.364 @@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, str
  25.365              /*
  25.366               * arch.shadow_table should now hold the next CR3 for shadow
  25.367               */
  25.368 -#if CONFIG_PAGING_LEVELS >= 3
  25.369 -            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
  25.370 -                shadow_sync_all(v->domain);
  25.371 -#endif
  25.372 -
  25.373              v->arch.hvm_vmx.cpu_cr3 = value;
  25.374 -            update_pagetables(v);
  25.375 +            update_cr3(v);
  25.376              HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
  25.377                          value);
  25.378 -            __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
  25.379 +            __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
  25.380          }
  25.381          break;
  25.382      }
  25.383 @@ -1786,12 +1705,6 @@ static int mov_to_cr(int gp, int cr, str
  25.384  #if CONFIG_PAGING_LEVELS >= 3
  25.385                  unsigned long mfn, old_base_mfn;
  25.386  
  25.387 -                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
  25.388 -                {
  25.389 -                    printk("Unsupported guest paging levels\n");
  25.390 -                    domain_crash_synchronous(); /* need to take a clean path */
  25.391 -                }
  25.392 -
  25.393                  if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
  25.394                                      v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
  25.395                       !get_page(mfn_to_page(mfn), v->domain) )
  25.396 @@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, str
  25.397                      domain_crash_synchronous(); /* need to take a clean path */
  25.398                  }
  25.399  
  25.400 -                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  25.401 -                if ( old_base_mfn )
  25.402 -                    put_page(mfn_to_page(old_base_mfn));
  25.403  
  25.404                  /*
  25.405                   * Now arch.guest_table points to machine physical.
  25.406                   */
  25.407  
  25.408 +                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  25.409                  v->arch.guest_table = pagetable_from_pfn(mfn);
  25.410 -                update_pagetables(v);
  25.411 +                if ( old_base_mfn )
  25.412 +                    put_page(mfn_to_page(old_base_mfn));
  25.413  
  25.414                  HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
  25.415                              (unsigned long) (mfn << PAGE_SHIFT));
  25.416  
  25.417 -                __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
  25.418 +                __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
  25.419  
  25.420                  /*
  25.421                   * arch->shadow_table should hold the next CR3 for shadow
  25.422 @@ -1824,27 +1736,6 @@ static int mov_to_cr(int gp, int cr, str
  25.423                              v->arch.hvm_vmx.cpu_cr3, mfn);
  25.424  #endif
  25.425              }
  25.426 -            else
  25.427 -            {
  25.428 -                /*  The guest is a 64 bit or 32-bit PAE guest. */
  25.429 -#if CONFIG_PAGING_LEVELS >= 3
  25.430 -                if ( (v->domain->arch.ops != NULL) &&
  25.431 -                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
  25.432 -                {
  25.433 -                    /* Seems the guest first enables PAE without enabling PG,
  25.434 -                     * it must enable PG after that, and it is a 32-bit PAE
  25.435 -                     * guest */
  25.436 -
  25.437 -                    if ( !shadow_set_guest_paging_levels(v->domain,
  25.438 -                                                            PAGING_L3) )
  25.439 -                    {
  25.440 -                        printk("Unsupported guest paging levels\n");
  25.441 -                        /* need to take a clean path */
  25.442 -                        domain_crash_synchronous();
  25.443 -                    }
  25.444 -                }
  25.445 -#endif
  25.446 -            }
  25.447          }
  25.448          else if ( value & X86_CR4_PAE )
  25.449              set_bit(VMX_CPU_STATE_PAE_ENABLED, &v->arch.hvm_vmx.cpu_state);
  25.450 @@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
  25.451           * all TLB entries except global entries.
  25.452           */
  25.453          if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
  25.454 -            shadow_sync_all(v->domain);
  25.455 -
  25.456 +            shadow2_update_paging_modes(v);
  25.457          break;
  25.458      }
  25.459      default:
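
Where the old shadow code reacted to a CR4 change by resyncing all out-of-sync shadows, shadow2 re-derives the vcpu's whole paging mode, because toggling PAE can change the shadow pagetable format itself, not just its contents. The guard keeps the same shape:

    /* Sketch: only involve the shadow code when a paging-affecting bit flips. */
    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
        shadow2_update_paging_modes(v);    /* may switch shadow formats */
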
    26.1 --- a/xen/arch/x86/mm.c	Wed Aug 16 16:16:32 2006 +0100
    26.2 +++ b/xen/arch/x86/mm.c	Wed Aug 16 17:02:35 2006 +0100
    26.3 @@ -137,7 +137,7 @@ static void free_l1_table(struct page_in
    26.4  
    26.5  static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
    26.6                          unsigned long type);
    26.7 -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
    26.8 +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
    26.9  
   26.10  /* Used to defer flushing of memory structures. */
   26.11  struct percpu_mm_info {
   26.12 @@ -274,9 +274,9 @@ void share_xen_page_with_privileged_gues
   26.13  #else
   26.14  /*
   26.15   * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
   26.16 - * We cannot safely shadow the idle page table, nor shadow-mode page tables
   26.17 + * We cannot safely shadow the idle page table, nor shadow (v1) page tables
   26.18   * (detected by lack of an owning domain). As required for correctness, we
   26.19 - * always shadow PDPTs aboive 4GB.
   26.20 + * always shadow PDPTs above 4GB.
   26.21   */
   26.22  #define l3tab_needs_shadow(mfn)                         \
   26.23      (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
   26.24 @@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_addre
   26.25  }
   26.26  __initcall(cache_pae_fixmap_address);
   26.27  
   26.28 -static void __write_ptbase(unsigned long mfn)
   26.29 +static DEFINE_PER_CPU(u32, make_cr3_timestamp);
   26.30 +
   26.31 +void make_cr3(struct vcpu *v, unsigned long mfn)
   26.32 +/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
   26.33 + * necessary, and sets v->arch.cr3 to the value to load in CR3. */
   26.34  {
   26.35      l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
   26.36 -    struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
   26.37 +    struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
   26.38      unsigned int cpu = smp_processor_id();
   26.39  
   26.40 -    /* Fast path 1: does this mfn need a shadow at all? */
   26.41 +    /* Fast path: does this mfn need a shadow at all? */
   26.42      if ( !l3tab_needs_shadow(mfn) )
   26.43      {
   26.44 -        write_cr3(mfn << PAGE_SHIFT);
   26.45 -        /* Cache is no longer in use or valid (/after/ write to %cr3). */
   26.46 +        v->arch.cr3 = mfn << PAGE_SHIFT;
   26.47 +        /* Cache is no longer in use or valid */
   26.48          cache->high_mfn = 0;
   26.49          return;
   26.50      }
   26.51 @@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long
   26.52      /* Caching logic is not interrupt safe. */
   26.53      ASSERT(!in_irq());
   26.54  
   26.55 -    /* Fast path 2: is this mfn already cached? */
   26.56 -    if ( cache->high_mfn == mfn )
   26.57 -    {
   26.58 -        write_cr3(__pa(cache->table[cache->inuse_idx]));
   26.59 -        return;
   26.60 -    }
   26.61 -
   26.62      /* Protects against pae_flush_pgd(). */
   26.63      spin_lock(&cache->lock);
   26.64  
   26.65 @@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long
   26.66  
   26.67      /* Map the guest L3 table and copy to the chosen low-memory cache. */
   26.68      *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
   26.69 +    /* First check the previous high mapping can't be in the TLB. 
   26.70 +     * (i.e. have we loaded CR3 since we last did this?) */
   26.71 +    if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
   26.72 +        local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
   26.73      highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
   26.74      lowmem_l3tab  = cache->table[cache->inuse_idx];
   26.75      memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
   26.76      *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
   26.77 -
   26.78 -    /* Install the low-memory L3 table in CR3. */
   26.79 -    write_cr3(__pa(lowmem_l3tab));
   26.80 +    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
   26.81 +
   26.82 +    v->arch.cr3 = __pa(lowmem_l3tab);
   26.83  
   26.84      spin_unlock(&cache->lock);
   26.85  }
   26.86  
   26.87  #else /* !CONFIG_X86_PAE */
   26.88  
   26.89 -static void __write_ptbase(unsigned long mfn)
   26.90 +void make_cr3(struct vcpu *v, unsigned long mfn)
   26.91  {
   26.92 -    write_cr3(mfn << PAGE_SHIFT);
   26.93 +    v->arch.cr3 = mfn << PAGE_SHIFT;
   26.94  }
   26.95  
   26.96  #endif /* !CONFIG_X86_PAE */
   26.97  
   26.98  void write_ptbase(struct vcpu *v)
   26.99  {
  26.100 -    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
  26.101 +    write_cr3(v->arch.cr3);
  26.102  }
  26.103  
  26.104  void invalidate_shadow_ldt(struct vcpu *v)
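
The PAE "copy the L3 below 4GB" logic that used to run inside every __write_ptbase() is now split: make_cr3() does the (possibly copying) computation once, whenever the pagetable base changes, and caches the result in v->arch.cr3, while write_ptbase() on the context-switch path just loads it. The per-cpu make_cr3_timestamp check ensures a stale TLB entry for the fixmap slot cannot alias the copy. A minimal sketch of the division of labour, with the low-memory copy folded into a hypothetical helper:

    /* Sketch: make_cr3() runs when the base changes; write_ptbase() runs
     * on every switch to this vcpu and must stay cheap. */
    void make_cr3(struct vcpu *v, unsigned long mfn)
    {
        if ( !l3tab_needs_shadow(mfn) )
            v->arch.cr3 = mfn << PAGE_SHIFT;          /* use guest L3 as-is */
        else
            v->arch.cr3 = copy_l3_below_4gb(v, mfn);  /* hypothetical helper */
    }

    void write_ptbase(struct vcpu *v)
    {
        write_cr3(v->arch.cr3);                       /* no copying here */
    }
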
  26.105 @@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off
  26.106  
  26.107      BUG_ON(unlikely(in_irq()));
  26.108  
  26.109 -    shadow_sync_va(v, gva);
  26.110 -
  26.111      TOGGLE_MODE();
  26.112      __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
  26.113                       sizeof(l1e));
  26.114 @@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off
  26.115  
  26.116      res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
  26.117  
  26.118 -    if ( !res && unlikely(shadow_mode_refcounts(d)) )
  26.119 +    if ( !res && unlikely(shadow2_mode_refcounts(d)) )
  26.120      {
  26.121 -        shadow_lock(d);
  26.122 -        shadow_remove_all_write_access(d, gmfn, mfn);
  26.123 +        shadow2_lock(d);
  26.124 +        shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
  26.125          res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
  26.126 -        shadow_unlock(d);
  26.127 +        shadow2_unlock(d);
  26.128      }
  26.129  
  26.130      if ( unlikely(!res) )
  26.131 @@ -513,7 +512,7 @@ get_linear_pagetable(
  26.132      struct page_info *page;
  26.133      unsigned long pfn;
  26.134  
  26.135 -    ASSERT( !shadow_mode_refcounts(d) );
  26.136 +    ASSERT( !shadow2_mode_refcounts(d) );
  26.137  
  26.138      if ( (root_get_flags(re) & _PAGE_RW) )
  26.139      {
  26.140 @@ -576,7 +575,8 @@ get_page_from_l1e(
  26.141  
  26.142          if ( !iomem_access_permitted(d, mfn, mfn) )
  26.143          {
  26.144 -            MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
  26.145 +            MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", 
  26.146 +                    d->domain_id, mfn);
  26.147              return 0;
  26.148          }
  26.149  
  26.150 @@ -587,9 +587,14 @@ get_page_from_l1e(
  26.151          d = dom_io;
  26.152      }
  26.153  
  26.154 -    okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
  26.155 -            get_page_and_type(page, d, PGT_writable_page) :
  26.156 -            get_page(page, d));
  26.157 +    /* Foreign mappings into guests in shadow2 external mode don't
  26.158 +     * contribute to writeable mapping refcounts.  (This allows the
  26.159 +     * qemu-dm helper process in dom0 to map the domain's memory without
  26.160 +     * messing up the count of "real" writable mappings.) */
  26.161 +    okay = (((l1e_get_flags(l1e) & _PAGE_RW) && 
  26.162 +             !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
  26.163 +            ? get_page_and_type(page, d, PGT_writable_page)
  26.164 +            : get_page(page, d));
  26.165      if ( !okay )
  26.166      {
  26.167          MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
  26.168 @@ -610,8 +615,6 @@ get_page_from_l2e(
  26.169  {
  26.170      int rc;
  26.171  
  26.172 -    ASSERT(!shadow_mode_refcounts(d));
  26.173 -
  26.174      if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
  26.175          return 1;
  26.176  
  26.177 @@ -641,8 +644,6 @@ get_page_from_l3e(
  26.178  {
  26.179      int rc;
  26.180  
  26.181 -    ASSERT(!shadow_mode_refcounts(d));
  26.182 -
  26.183      if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
  26.184          return 1;
  26.185  
  26.186 @@ -669,8 +670,6 @@ get_page_from_l4e(
  26.187  {
  26.188      int rc;
  26.189  
  26.190 -    ASSERT( !shadow_mode_refcounts(d) );
  26.191 -
  26.192      if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
  26.193          return 1;
  26.194  
  26.195 @@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e,
  26.196          domain_crash(d);
  26.197      }
  26.198  
  26.199 -    if ( l1e_get_flags(l1e) & _PAGE_RW )
  26.200 +    /* Remember we didn't take a type-count of foreign writable mappings
  26.201 +     * to shadow2 external domains */
  26.202 +    if ( (l1e_get_flags(l1e) & _PAGE_RW) && 
  26.203 +         !(unlikely((e != d) && shadow2_mode_external(e))) )
  26.204      {
  26.205          put_page_and_type(page);
  26.206      }
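
The get_page_from_l1e() and put_page_from_l1e() hunks above are what let the qemu-dm helper in dom0 map an HVM guest's memory read/write without skewing that guest's writable-mapping accounting: a foreign writable mapping into a shadow2-external domain takes only a plain page reference, never a PGT_writable_page type reference, and the put side must make exactly the same decision. A sketch of the paired logic, with the flag tests abbreviated and local names invented for clarity:

    /* Sketch: 'owner' owns the frame, 'mapper' installs the PTE. */
    int skip_type_count = shadow2_mode_external(owner) && (mapper != owner);

    /* get side (get_page_from_l1e) */
    okay = (writable && !skip_type_count)
            ? get_page_and_type(page, owner, PGT_writable_page)
            : get_page(page, owner);

    /* put side (put_page_from_l1e) must mirror the decision exactly,
     * or the writable type count drifts. */
    if ( writable && !skip_type_count )
        put_page_and_type(page);
    else
        put_page(page);
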
  26.207 @@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_in
  26.208      l1_pgentry_t  *pl1e;
  26.209      int            i;
  26.210  
  26.211 -    ASSERT(!shadow_mode_refcounts(d));
  26.212 +    ASSERT(!shadow2_mode_refcounts(d));
  26.213  
  26.214      pl1e = map_domain_page(pfn);
  26.215  
  26.216 @@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pg
  26.217       *  2. Cannot appear in another page table's L3:
  26.218       *     a. alloc_l3_table() calls this function and this check will fail
  26.219       *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
  26.220 +     *
  26.221 +     * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
  26.222       */
  26.223      page = l3e_get_page(l3e3);
  26.224      BUG_ON(page->u.inuse.type_info & PGT_pinned);
  26.225 @@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_in
  26.226      l2_pgentry_t  *pl2e;
  26.227      int            i;
  26.228  
  26.229 -    /* See the code in shadow_promote() to understand why this is here. */
  26.230 -    if ( (PGT_base_page_table == PGT_l2_page_table) &&
  26.231 -         unlikely(shadow_mode_refcounts(d)) )
  26.232 -        return 1;
  26.233 -    ASSERT(!shadow_mode_refcounts(d));
  26.234 +    ASSERT(!shadow2_mode_refcounts(d));
  26.235      
  26.236      pl2e = map_domain_page(pfn);
  26.237  
  26.238 @@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_in
  26.239      l3_pgentry_t  *pl3e;
  26.240      int            i;
  26.241  
  26.242 -    /* See the code in shadow_promote() to understand why this is here. */
  26.243 -    if ( (PGT_base_page_table == PGT_l3_page_table) &&
  26.244 -         shadow_mode_refcounts(d) )
  26.245 -        return 1;
  26.246 -    ASSERT(!shadow_mode_refcounts(d));
  26.247 +    ASSERT(!shadow2_mode_refcounts(d));
  26.248  
  26.249  #ifdef CONFIG_X86_PAE
  26.250      /*
  26.251 @@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_in
  26.252      unsigned long vaddr;
  26.253      int            i;
  26.254  
  26.255 -    /* See the code in shadow_promote() to understand why this is here. */
  26.256 -    if ( (PGT_base_page_table == PGT_l4_page_table) &&
  26.257 -         shadow_mode_refcounts(d) )
  26.258 -        return 1;
  26.259 -    ASSERT(!shadow_mode_refcounts(d));
  26.260 +    ASSERT(!shadow2_mode_refcounts(d));
  26.261  
  26.262      for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
  26.263      {
  26.264 @@ -1183,42 +1175,55 @@ static void free_l4_table(struct page_in
  26.265  
  26.266  static inline int update_l1e(l1_pgentry_t *pl1e, 
  26.267                               l1_pgentry_t  ol1e, 
  26.268 -                             l1_pgentry_t  nl1e)
  26.269 +                             l1_pgentry_t  nl1e,
  26.270 +                             unsigned long gl1mfn,
  26.271 +                             struct vcpu *v)
  26.272  {
  26.273 +    int rv = 1;
  26.274 +    if ( unlikely(shadow2_mode_enabled(v->domain)) )
  26.275 +        shadow2_lock(v->domain);
  26.276  #ifndef PTE_UPDATE_WITH_CMPXCHG
  26.277 -    return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
  26.278 +    rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
  26.279  #else
  26.280 -    intpte_t o = l1e_get_intpte(ol1e);
  26.281 -    intpte_t n = l1e_get_intpte(nl1e);
  26.282 -
  26.283 -    for ( ; ; )
  26.284      {
  26.285 -        if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
  26.286 +        intpte_t o = l1e_get_intpte(ol1e);
  26.287 +        intpte_t n = l1e_get_intpte(nl1e);
  26.288 +        
  26.289 +        for ( ; ; )
  26.290          {
  26.291 -            MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
  26.292 -                    ": saw %" PRIpte,
  26.293 -                    l1e_get_intpte(ol1e),
  26.294 -                    l1e_get_intpte(nl1e),
  26.295 -                    o);
  26.296 -            return 0;
  26.297 +            if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
  26.298 +            {
  26.299 +                MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
  26.300 +                        ": saw %" PRIpte,
  26.301 +                        l1e_get_intpte(ol1e),
  26.302 +                        l1e_get_intpte(nl1e),
  26.303 +                        o);
  26.304 +                rv = 0;
  26.305 +                break;
  26.306 +            }
  26.307 +
  26.308 +            if ( o == l1e_get_intpte(ol1e) )
  26.309 +                break;
  26.310 +
  26.311 +            /* Allowed to change in Accessed/Dirty flags only. */
  26.312 +            BUG_ON((o ^ l1e_get_intpte(ol1e)) &
  26.313 +                   ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
  26.314 +            ol1e = l1e_from_intpte(o);
  26.315          }
  26.316 -
  26.317 -        if ( o == l1e_get_intpte(ol1e) )
  26.318 -            break;
  26.319 -
  26.320 -        /* Allowed to change in Accessed/Dirty flags only. */
  26.321 -        BUG_ON((o ^ l1e_get_intpte(ol1e)) &
  26.322 -               ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
  26.323 -        ol1e = l1e_from_intpte(o);
  26.324      }
  26.325 -
  26.326 -    return 1;
  26.327  #endif
  26.328 +    if ( unlikely(shadow2_mode_enabled(v->domain)) )
  26.329 +    {
  26.330 +        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
  26.331 +        shadow2_unlock(v->domain);    
  26.332 +    }
  26.333 +    return rv;
  26.334  }
  26.335  
  26.336  
  26.337  /* Update the L1 entry at pl1e to new value nl1e. */
  26.338 -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  26.339 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, 
  26.340 +                        unsigned long gl1mfn)
  26.341  {
  26.342      l1_pgentry_t ol1e;
  26.343      struct domain *d = current->domain;
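
All the explicit shadow_l?_normal_pt_update() calls that used to follow guest PTE writes disappear in this patch. Instead, every write is bracketed by the shadow2 lock and followed by shadow2_validate_guest_entry(), which is told exactly which entry of which guest frame changed; that is why update_l1e() and the UPDATE_ENTRY() macro now carry the mfn of the containing guest pagetable. A minimal sketch of the bracket (the actual write is either the __copy_to_user or the cmpxchg loop shown above):

    /* Sketch: write-and-revalidate bracket used for guest PTE updates. */
    if ( unlikely(shadow2_mode_enabled(v->domain)) )
        shadow2_lock(v->domain);

    ok = do_the_pte_write(pl1e, nl1e);          /* hypothetical stand-in */

    if ( unlikely(shadow2_mode_enabled(v->domain)) )
    {
        /* Re-shadow just this entry of the guest table at gl1mfn. */
        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
        shadow2_unlock(v->domain);
    }
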
  26.344 @@ -1226,9 +1231,6 @@ static int mod_l1_entry(l1_pgentry_t *pl
  26.345      if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
  26.346          return 0;
  26.347  
  26.348 -    if ( unlikely(shadow_mode_refcounts(d)) )
  26.349 -        return update_l1e(pl1e, ol1e, nl1e);
  26.350 -
  26.351      if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
  26.352      {
  26.353          if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
  26.354 @@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
  26.355          }
  26.356  
  26.357          /* Fast path for identical mapping, r/w and presence. */
  26.358 -        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
  26.359 -            return update_l1e(pl1e, ol1e, nl1e);
  26.360 +        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
  26.361 +            return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
  26.362  
  26.363          if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
  26.364              return 0;
  26.365          
  26.366 -        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  26.367 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
  26.368          {
  26.369              put_page_from_l1e(nl1e, d);
  26.370              return 0;
  26.371 @@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
  26.372      }
  26.373      else
  26.374      {
  26.375 -        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  26.376 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
  26.377              return 0;
  26.378      }
  26.379  
  26.380 @@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl
  26.381  }
  26.382  
  26.383  #ifndef PTE_UPDATE_WITH_CMPXCHG
  26.384 -#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
  26.385 +#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
  26.386  #else
  26.387 -#define UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
  26.388 +#define _UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
  26.389      for ( ; ; )                                                 \
  26.390      {                                                           \
  26.391          intpte_t __o = cmpxchg((intpte_t *)(_p),                \
  26.392 @@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl
  26.393      }                                                           \
  26.394      1; })
  26.395  #endif
  26.396 +#define UPDATE_ENTRY(_t,_p,_o,_n,_m)  ({                            \
  26.397 +    int rv;                                                         \
  26.398 +    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
  26.399 +        shadow2_lock(current->domain);                              \
  26.400 +    rv = _UPDATE_ENTRY(_t, _p, _o, _n);                             \
  26.401 +    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
  26.402 +    {                                                               \
  26.403 +        shadow2_validate_guest_entry(current, _mfn(_m), (_p));      \
  26.404 +        shadow2_unlock(current->domain);                            \
  26.405 +    }                                                               \
  26.406 +    rv;                                                             \
  26.407 +})
  26.408  
  26.409  /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  26.410  static int mod_l2_entry(l2_pgentry_t *pl2e, 
  26.411 @@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl
  26.412  
  26.413          /* Fast path for identical mapping and presence. */
  26.414          if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
  26.415 -            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
  26.416 +            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
  26.417  
  26.418          if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
  26.419               unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
  26.420              return 0;
  26.421  
  26.422 -        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
  26.423 +        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
  26.424          {
  26.425              put_page_from_l2e(nl2e, pfn);
  26.426              return 0;
  26.427          }
  26.428      }
  26.429 -    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
  26.430 +    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
  26.431      {
  26.432          return 0;
  26.433      }
  26.434 @@ -1330,7 +1344,6 @@ static int mod_l2_entry(l2_pgentry_t *pl
  26.435      return 1;
  26.436  }
  26.437  
  26.438 -
  26.439  #if CONFIG_PAGING_LEVELS >= 3
  26.440  
  26.441  /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
  26.442 @@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  26.443       */
  26.444      if ( pgentry_ptr_to_slot(pl3e) >= 3 )
  26.445          return 0;
  26.446 -#endif
  26.447 +#endif 
  26.448  
  26.449      if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
  26.450          return 0;
  26.451 @@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  26.452  
  26.453          /* Fast path for identical mapping and presence. */
  26.454          if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
  26.455 -            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
  26.456 +            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
  26.457  
  26.458  #if CONFIG_PAGING_LEVELS >= 4
  26.459          if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
  26.460 @@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl
  26.461              << L3_PAGETABLE_SHIFT;
  26.462          if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
  26.463              return 0;
  26.464 -#endif
  26.465 -
  26.466 -        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
  26.467 +#endif 
  26.468 +
  26.469 +        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
  26.470          {
  26.471              put_page_from_l3e(nl3e, pfn);
  26.472              return 0;
  26.473          }
  26.474      }
  26.475 -    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
  26.476 +    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
  26.477      {
  26.478          return 0;
  26.479      }
  26.480 @@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl
  26.481  
  26.482          /* Fast path for identical mapping and presence. */
  26.483          if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
  26.484 -            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
  26.485 +            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
  26.486  
  26.487          if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
  26.488               unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
  26.489              return 0;
  26.490  
  26.491 -        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
  26.492 +        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
  26.493          {
  26.494              put_page_from_l4e(nl4e, pfn);
  26.495              return 0;
  26.496          }
  26.497      }
  26.498 -    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
  26.499 +    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
  26.500      {
  26.501          return 0;
  26.502      }
  26.503 @@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *pa
  26.504           */
  26.505          this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
  26.506  
  26.507 -        if ( unlikely(shadow_mode_enabled(owner)) )
  26.508 +        if ( unlikely(shadow2_mode_enabled(owner)
  26.509 +                 && !shadow2_lock_is_acquired(owner)) )
  26.510          {
  26.511              /* Raw page tables are rewritten during save/restore. */
  26.512 -            if ( !shadow_mode_translate(owner) )
  26.513 +            if ( !shadow2_mode_translate(owner) )
  26.514                  mark_dirty(owner, page_to_mfn(page));
  26.515  
  26.516 -            if ( shadow_mode_refcounts(owner) )
  26.517 +            if ( shadow2_mode_refcounts(owner) )
  26.518                  return;
  26.519  
  26.520              gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
  26.521              ASSERT(VALID_M2P(gmfn));
  26.522 -            remove_shadow(owner, gmfn, type & PGT_type_mask);
  26.523 +            shadow2_lock(owner);
  26.524 +            shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
  26.525 +            shadow2_unlock(owner);
  26.526          }
  26.527      }
  26.528  
  26.529 @@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *pag
  26.530  
  26.531          if ( unlikely((nx & PGT_count_mask) == 0) )
  26.532          {
  26.533 -            /* Record TLB information for flush later. Races are harmless. */
  26.534 -            page->tlbflush_timestamp = tlbflush_current_time();
  26.535 -            
  26.536              if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  26.537                   likely(nx & PGT_validated) )
  26.538              {
  26.539 @@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *pag
  26.540                  x  &= ~PGT_validated;
  26.541                  nx &= ~PGT_validated;
  26.542              }
  26.543 +
  26.544 +            /* Record TLB information for flush later. */
  26.545 +            page->tlbflush_timestamp = tlbflush_current_time();
  26.546          }
  26.547          else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) == 
  26.548                             (PGT_pinned|PGT_l1_page_table|1)) )
  26.549 @@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page
  26.550  #endif
  26.551                      /* Fixme: add code to propagate va_unknown to subtables. */
  26.552                      if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
  26.553 -                         !shadow_mode_refcounts(page_get_owner(page)) )
  26.554 +                         !shadow2_mode_refcounts(page_get_owner(page)) )
  26.555                          return 0;
  26.556                      /* This table is possibly mapped at multiple locations. */
  26.557                      nx &= ~PGT_va_mask;
  26.558 @@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn)
  26.559      int okay;
  26.560      unsigned long old_base_mfn;
  26.561  
  26.562 -    if ( shadow_mode_refcounts(d) )
  26.563 +    if ( hvm_guest(v) && !hvm_paging_enabled(v) )
  26.564 +        domain_crash_synchronous();
  26.565 +
  26.566 +    if ( shadow2_mode_refcounts(d) )
  26.567      {
  26.568          okay = get_page_from_pagenr(mfn, d);
  26.569          if ( unlikely(!okay) )
  26.570 @@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn)
  26.571              MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
  26.572              old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  26.573              v->arch.guest_table = pagetable_null();
  26.574 -            update_pagetables(v);
  26.575 +            update_cr3(v);
  26.576              write_cr3(__pa(idle_pg_table));
  26.577              if ( old_base_mfn != 0 )
  26.578                  put_page_and_type(mfn_to_page(old_base_mfn));
  26.579 @@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn)
  26.580      invalidate_shadow_ldt(v);
  26.581  
  26.582      old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  26.583 +
  26.584      v->arch.guest_table = pagetable_from_pfn(mfn);
  26.585 -    update_pagetables(v); /* update shadow_table and monitor_table */
  26.586 +    update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
  26.587  
  26.588      write_ptbase(v);
  26.589  
  26.590      if ( likely(old_base_mfn != 0) )
  26.591      {
  26.592 -        if ( shadow_mode_refcounts(d) )
  26.593 +        if ( shadow2_mode_refcounts(d) )
  26.594              put_page(mfn_to_page(old_base_mfn));
  26.595          else
  26.596              put_page_and_type(mfn_to_page(old_base_mfn));
  26.597      }
  26.598  
  26.599 -    /* CR3 also holds a ref to its shadow... */
  26.600 -    if ( shadow_mode_enabled(d) )
  26.601 -    {
  26.602 -        if ( v->arch.monitor_shadow_ref )
  26.603 -            put_shadow_ref(v->arch.monitor_shadow_ref);
  26.604 -        v->arch.monitor_shadow_ref =
  26.605 -            pagetable_get_pfn(v->arch.monitor_table);
  26.606 -        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
  26.607 -        get_shadow_ref(v->arch.monitor_shadow_ref);
  26.608 -    }
  26.609 -
  26.610      return 1;
  26.611  }
  26.612  
  26.613 @@ -1807,8 +1816,6 @@ static void process_deferred_ops(void)
  26.614  
  26.615      if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
  26.616      {
  26.617 -        if ( shadow_mode_enabled(d) )
  26.618 -            shadow_sync_all(d);
  26.619          if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
  26.620              flush_tlb_mask(d->domain_dirty_cpumask);
  26.621          else
  26.622 @@ -1974,7 +1981,7 @@ int do_mmuext_op(
  26.623              type = PGT_root_page_table;
  26.624  
  26.625          pin_page:
  26.626 -            if ( shadow_mode_refcounts(FOREIGNDOM) )
  26.627 +            if ( shadow2_mode_refcounts(FOREIGNDOM) )
  26.628                  break;
  26.629  
  26.630              okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
  26.631 @@ -1996,7 +2003,7 @@ int do_mmuext_op(
  26.632              break;
  26.633  
  26.634          case MMUEXT_UNPIN_TABLE:
  26.635 -            if ( shadow_mode_refcounts(d) )
  26.636 +            if ( shadow2_mode_refcounts(d) )
  26.637                  break;
  26.638  
  26.639              if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
  26.640 @@ -2009,6 +2016,12 @@ int do_mmuext_op(
  26.641              {
  26.642                  put_page_and_type(page);
  26.643                  put_page(page);
  26.644 +                if ( shadow2_mode_enabled(d) )
  26.645 +                {
  26.646 +                    shadow2_lock(d);
  26.647 +                    shadow2_remove_all_shadows(v, _mfn(mfn));
  26.648 +                    shadow2_unlock(d);
  26.649 +                }
  26.650              }
  26.651              else
  26.652              {
  26.653 @@ -2050,9 +2063,9 @@ int do_mmuext_op(
  26.654              break;
  26.655      
  26.656          case MMUEXT_INVLPG_LOCAL:
  26.657 -            if ( shadow_mode_enabled(d) )
  26.658 -                shadow_invlpg(v, op.arg1.linear_addr);
  26.659 -            local_flush_tlb_one(op.arg1.linear_addr);
  26.660 +            if ( !shadow2_mode_enabled(d) 
  26.661 +                 || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
  26.662 +                local_flush_tlb_one(op.arg1.linear_addr);
  26.663              break;
  26.664  
  26.665          case MMUEXT_TLB_FLUSH_MULTI:
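
shadow2_invlpg() has a different contract from the old shadow_invlpg(): its return value says whether a hardware TLB entry might still exist for the address, so the caller only issues local_flush_tlb_one() when that is (or might be) the case. The same idiom reappears in do_update_va_mapping() further down. In sketch form:

    /* Sketch: flush the hardware TLB only when the shadow code asks for it. */
    if ( !shadow2_mode_enabled(d) || shadow2_invlpg(v, addr) != 0 )
        local_flush_tlb_one(addr);
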
  26.666 @@ -2098,7 +2111,7 @@ int do_mmuext_op(
  26.667              unsigned long ptr  = op.arg1.linear_addr;
  26.668              unsigned long ents = op.arg2.nr_ents;
  26.669  
  26.670 -            if ( shadow_mode_external(d) )
  26.671 +            if ( shadow2_mode_external(d) )
  26.672              {
  26.673                  MEM_LOG("ignoring SET_LDT hypercall from external "
  26.674                          "domain %u", d->domain_id);
  26.675 @@ -2171,9 +2184,6 @@ int do_mmu_update(
  26.676  
  26.677      LOCK_BIGLOCK(d);
  26.678  
  26.679 -    if ( unlikely(shadow_mode_enabled(d)) )
  26.680 -        check_pagetable(v, "pre-mmu"); /* debug */
  26.681 -
  26.682      if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
  26.683      {
  26.684          count &= ~MMU_UPDATE_PREEMPTED;
  26.685 @@ -2248,7 +2258,12 @@ int do_mmu_update(
  26.686              case PGT_l3_page_table:
  26.687              case PGT_l4_page_table:
  26.688              {
  26.689 -                ASSERT(!shadow_mode_refcounts(d));
  26.690 +                if ( shadow2_mode_refcounts(d) )
  26.691 +                {
  26.692 +                    DPRINTK("mmu update on shadow-refcounted domain!");
  26.693 +                    break;
  26.694 +                }
  26.695 +
  26.696                  if ( unlikely(!get_page_type(
  26.697                      page, type_info & (PGT_type_mask|PGT_va_mask))) )
  26.698                      goto not_a_pt;
  26.699 @@ -2258,10 +2273,7 @@ int do_mmu_update(
  26.700                  case PGT_l1_page_table:
  26.701                  {
  26.702                      l1_pgentry_t l1e = l1e_from_intpte(req.val);
  26.703 -                    okay = mod_l1_entry(va, l1e);
  26.704 -                    if ( okay && unlikely(shadow_mode_enabled(d)) )
  26.705 -                        shadow_l1_normal_pt_update(
  26.706 -                            d, req.ptr, l1e, &sh_mapcache);
  26.707 +                    okay = mod_l1_entry(va, l1e, mfn);
  26.708                  }
  26.709                  break;
  26.710                  case PGT_l2_page_table:
  26.711 @@ -2269,9 +2281,6 @@ int do_mmu_update(
  26.712                      l2_pgentry_t l2e = l2e_from_intpte(req.val);
  26.713                      okay = mod_l2_entry(
  26.714                          (l2_pgentry_t *)va, l2e, mfn, type_info);
  26.715 -                    if ( okay && unlikely(shadow_mode_enabled(d)) )
  26.716 -                        shadow_l2_normal_pt_update(
  26.717 -                            d, req.ptr, l2e, &sh_mapcache);
  26.718                  }
  26.719                  break;
  26.720  #if CONFIG_PAGING_LEVELS >= 3
  26.721 @@ -2279,9 +2288,6 @@ int do_mmu_update(
  26.722                  {
  26.723                      l3_pgentry_t l3e = l3e_from_intpte(req.val);
  26.724                      okay = mod_l3_entry(va, l3e, mfn, type_info);
  26.725 -                    if ( okay && unlikely(shadow_mode_enabled(d)) )
  26.726 -                        shadow_l3_normal_pt_update(
  26.727 -                            d, req.ptr, l3e, &sh_mapcache);
  26.728                  }
  26.729                  break;
  26.730  #endif
  26.731 @@ -2290,9 +2296,6 @@ int do_mmu_update(
  26.732                  {
  26.733                      l4_pgentry_t l4e = l4e_from_intpte(req.val);
  26.734                      okay = mod_l4_entry(va, l4e, mfn, type_info);
  26.735 -                    if ( okay && unlikely(shadow_mode_enabled(d)) )
  26.736 -                        shadow_l4_normal_pt_update(
  26.737 -                            d, req.ptr, l4e, &sh_mapcache);
  26.738                  }
  26.739                  break;
  26.740  #endif
  26.741 @@ -2308,19 +2311,17 @@ int do_mmu_update(
  26.742                  if ( unlikely(!get_page_type(page, PGT_writable_page)) )
  26.743                      break;
  26.744  
  26.745 -                if ( shadow_mode_enabled(d) )
  26.746 -                {
  26.747 -                    shadow_lock(d);
  26.748 -                    __mark_dirty(d, mfn);
  26.749 -                    if ( page_is_page_table(page) && !page_out_of_sync(page) )
  26.750 -                        shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
  26.751 -                }
  26.752 +                if ( unlikely(shadow2_mode_enabled(d)) )
  26.753 +                    shadow2_lock(d);
  26.754  
  26.755                  *(intpte_t *)va = req.val;
  26.756                  okay = 1;
  26.757  
  26.758 -                if ( shadow_mode_enabled(d) )
  26.759 -                    shadow_unlock(d);
  26.760 +                if ( unlikely(shadow2_mode_enabled(d)) )
  26.761 +                {
  26.762 +                    shadow2_validate_guest_entry(v, _mfn(mfn), va);
  26.763 +                    shadow2_unlock(d);
  26.764 +                }
  26.765  
  26.766                  put_page_type(page);
  26.767              }
  26.768 @@ -2334,12 +2335,6 @@ int do_mmu_update(
  26.769  
  26.770          case MMU_MACHPHYS_UPDATE:
  26.771  
  26.772 -            if ( shadow_mode_translate(FOREIGNDOM) )
  26.773 -            {
  26.774 -                MEM_LOG("can't mutate m2p table of translate mode guest");
  26.775 -                break;
  26.776 -            }
  26.777 -
  26.778              mfn = req.ptr >> PAGE_SHIFT;
  26.779              gpfn = req.val;
  26.780  
  26.781 @@ -2349,9 +2344,13 @@ int do_mmu_update(
  26.782                  break;
  26.783              }
  26.784  
  26.785 -            set_gpfn_from_mfn(mfn, gpfn);
  26.786 +            if ( shadow2_mode_translate(FOREIGNDOM) )
  26.787 +                shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
  26.788 +            else 
  26.789 +                set_gpfn_from_mfn(mfn, gpfn);
  26.790              okay = 1;
  26.791  
  26.792 +            // Mark the new gfn dirty...
  26.793              mark_dirty(FOREIGNDOM, mfn);
  26.794  
  26.795              put_page(mfn_to_page(mfn));
  26.796 @@ -2382,9 +2381,6 @@ int do_mmu_update(
  26.797      if ( unlikely(!guest_handle_is_null(pdone)) )
  26.798          copy_to_guest(pdone, &done, 1);
  26.799  
  26.800 -    if ( unlikely(shadow_mode_enabled(d)) )
  26.801 -        check_pagetable(v, "post-mmu"); /* debug */
  26.802 -
  26.803      UNLOCK_BIGLOCK(d);
  26.804      return rc;
  26.805  }
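
Two behavioural changes in do_mmu_update() deserve a closer look: direct writes into pagetable frames (the PGT_writable_page case) are now bracketed with the shadow2 lock and revalidated entry-by-entry rather than marked out-of-sync, and MMU_MACHPHYS_UPDATE on a translated domain is routed through shadow2_guest_physmap_add_page() instead of being rejected, so the p2m (and any shadows of it) stays consistent. A condensed sketch of the machphys branch:

    /* Sketch: MMU_MACHPHYS_UPDATE after the change. */
    if ( shadow2_mode_translate(FOREIGNDOM) )
        /* translated guest: update its p2m, and any shadows of the p2m */
        shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
    else
        /* direct-mode guest: just record the mapping in the M2P table */
        set_gpfn_from_mfn(mfn, gpfn);
    mark_dirty(FOREIGNDOM, mfn);               /* keep log-dirty bitmaps honest */
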
  26.806 @@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping(
  26.807      struct domain *d = v->domain;
  26.808  
  26.809      ASSERT(spin_is_locked(&d->big_lock));
  26.810 -    ASSERT(!shadow_mode_refcounts(d));
  26.811  
  26.812      gmfn = pte_addr >> PAGE_SHIFT;
  26.813      mfn = gmfn_to_mfn(d, gmfn);
  26.814 @@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping(
  26.815      page = mfn_to_page(mfn);
  26.816  
  26.817      type_info = page->u.inuse.type_info;
  26.818 -    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
  26.819 +    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||         
  26.820           !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
  26.821      {
  26.822          MEM_LOG("Grant map attempted to update a non-L1 page");
  26.823 @@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping(
  26.824      }
  26.825  
  26.826      ol1e = *(l1_pgentry_t *)va;
  26.827 -    if ( !update_l1e(va, ol1e, _nl1e) )
  26.828 +    if ( !update_l1e(va, ol1e, _nl1e, mfn, v) )
  26.829      {
  26.830          put_page_type(page);
  26.831          rc = GNTST_general_error;
  26.832          goto failed;
  26.833      } 
  26.834  
  26.835 -    put_page_from_l1e(ol1e, d);
  26.836 -
  26.837 -    if ( unlikely(shadow_mode_enabled(d)) )
  26.838 -    {
  26.839 -        struct domain_mmap_cache sh_mapcache;
  26.840 -        domain_mmap_cache_init(&sh_mapcache);
  26.841 -        shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
  26.842 -        domain_mmap_cache_destroy(&sh_mapcache);
  26.843 -    }
  26.844 +    if ( !shadow2_mode_refcounts(d) )
  26.845 +        put_page_from_l1e(ol1e, d);
  26.846  
  26.847      put_page_type(page);
  26.848   
  26.849   failed:
  26.850      unmap_domain_page(va);
  26.851      put_page(page);
  26.852 +
  26.853      return rc;
  26.854  }
  26.855  
  26.856 @@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping(
  26.857      u32 type_info;
  26.858      l1_pgentry_t ol1e;
  26.859  
  26.860 -    ASSERT(!shadow_mode_refcounts(d));
  26.861 -
  26.862      gmfn = addr >> PAGE_SHIFT;
  26.863      mfn = gmfn_to_mfn(d, gmfn);
  26.864  
  26.865 @@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping(
  26.866      }
  26.867  
  26.868      /* Delete pagetable entry. */
  26.869 -    if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
  26.870 +    if ( unlikely(!update_l1e(
  26.871 +                      (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, 
  26.872 +                      d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
  26.873      {
  26.874          MEM_LOG("Cannot delete PTE entry at %p", va);
  26.875          put_page_type(page);
  26.876 @@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping(
  26.877          goto failed;
  26.878      }
  26.879  
  26.880 -    if ( unlikely(shadow_mode_enabled(d)) )
  26.881 -    {
  26.882 -        struct domain_mmap_cache sh_mapcache;
  26.883 -        domain_mmap_cache_init(&sh_mapcache);
  26.884 -        shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
  26.885 -        domain_mmap_cache_destroy(&sh_mapcache);
  26.886 -    }
  26.887 -
  26.888      put_page_type(page);
  26.889  
  26.890   failed:
  26.891 @@ -2536,31 +2517,22 @@ static int create_grant_va_mapping(
  26.892      struct domain *d = v->domain;
  26.893      
  26.894      ASSERT(spin_is_locked(&d->big_lock));
  26.895 -    ASSERT(!shadow_mode_refcounts(d));
  26.896 -
  26.897 -    /*
  26.898 -     * This is actually overkill - we don't need to sync the L1 itself,
  26.899 -     * just everything involved in getting to this L1 (i.e. we need
  26.900 -     * linear_pg_table[l1_linear_offset(va)] to be in sync)...
  26.901 -     */
  26.902 -    __shadow_sync_va(v, va);
  26.903  
  26.904      pl1e = &linear_pg_table[l1_linear_offset(va)];
  26.905  
  26.906      if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
  26.907 -         !update_l1e(pl1e, ol1e, _nl1e) )
  26.908 +         !update_l1e(pl1e, ol1e, _nl1e, 
  26.909 +                    l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
  26.910          return GNTST_general_error;
  26.911  
  26.912 -    put_page_from_l1e(ol1e, d);
  26.913 -
  26.914 -    if ( unlikely(shadow_mode_enabled(d)) )
  26.915 -        shadow_do_update_va_mapping(va, _nl1e, v);
  26.916 +    if ( !shadow2_mode_refcounts(d) )
  26.917 +        put_page_from_l1e(ol1e, d);
  26.918  
  26.919      return GNTST_okay;
  26.920  }
  26.921  
  26.922  static int destroy_grant_va_mapping(
  26.923 -    unsigned long addr, unsigned long frame)
  26.924 +    unsigned long addr, unsigned long frame, struct domain *d)
  26.925  {
  26.926      l1_pgentry_t *pl1e, ol1e;
  26.927      
  26.928 @@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping(
  26.929      }
  26.930  
  26.931      /* Delete pagetable entry. */
  26.932 -    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
  26.933 +    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), 
  26.934 +                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
  26.935 +                      d->vcpu[0] /* Change for per-vcpu shadows */)) )
  26.936      {
  26.937          MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
  26.938          return GNTST_general_error;
  26.939      }
  26.940 -    
  26.941 +
  26.942      return 0;
  26.943  }
  26.944  
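
The grant-mapping helpers now have to tell update_l1e() which guest L1 frame they are writing into. For mappings addressed by virtual address, that frame is read out of the linear pagetable's L2 entry for the address; the earlier __copy_from_user of the old L1 entry has already faulted if that L2 entry is not present. A sketch of the derivation:

    /* Sketch: the mfn of the guest L1 table currently mapping 'va'. */
    unsigned long gl1mfn =
        l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]);

    /* ...passed down so the shadow of that particular L1 can be revalidated: */
    update_l1e(pl1e, ol1e, nl1e, gl1mfn, v);
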
  26.945 @@ -2597,7 +2571,7 @@ int create_grant_host_mapping(
  26.946      unsigned long addr, unsigned long frame, unsigned int flags)
  26.947  {
  26.948      l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
  26.949 -        
  26.950 +
  26.951      if ( (flags & GNTMAP_application_map) )
  26.952          l1e_add_flags(pte,_PAGE_USER);
  26.953      if ( !(flags & GNTMAP_readonly) )
  26.954 @@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping(
  26.955  {
  26.956      if ( flags & GNTMAP_contains_pte )
  26.957          return destroy_grant_pte_mapping(addr, frame, current->domain);
  26.958 -    return destroy_grant_va_mapping(addr, frame);
  26.959 +    return destroy_grant_va_mapping(addr, frame, current->domain);
  26.960  }
  26.961  
  26.962  int steal_page(
  26.963 @@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long v
  26.964  
  26.965      perfc_incrc(calls_to_update_va);
  26.966  
  26.967 -    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
  26.968 +    if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
  26.969          return -EINVAL;
  26.970  
  26.971 +    if ( unlikely(shadow2_mode_refcounts(d)) )
  26.972 +    {
  26.973 +        DPRINTK("Grant op on a shadow-refcounted domain\n");
  26.974 +        return -EINVAL; 
  26.975 +    }
  26.976 +
  26.977      LOCK_BIGLOCK(d);
  26.978  
  26.979 -    if ( unlikely(shadow_mode_enabled(d)) )
  26.980 -        check_pagetable(v, "pre-va"); /* debug */
  26.981 -
  26.982 -    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
  26.983 -                                val)) )
  26.984 -        rc = -EINVAL;
  26.985 -
  26.986 -    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
  26.987 +    if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
  26.988      {
  26.989          if ( unlikely(this_cpu(percpu_mm_info).foreign &&
  26.990 -                      (shadow_mode_translate(d) ||
  26.991 -                       shadow_mode_translate(
  26.992 +                      (shadow2_mode_translate(d) ||
  26.993 +                       shadow2_mode_translate(
  26.994                             this_cpu(percpu_mm_info).foreign))) )
  26.995          {
  26.996              /*
  26.997               * The foreign domain's pfn's are in a different namespace. There's
  26.998 -             * not enough information in just a gpte to figure out how to
  26.999 +             * not enough information in just a gpte to figure out how to   
 26.1000               * (re-)shadow this entry.
 26.1001               */
 26.1002              domain_crash(d);
 26.1003          }
 26.1004 +    }
 26.1005 +
 26.1006 +    if ( unlikely(!mod_l1_entry(
 26.1007 +                      &linear_pg_table[l1_linear_offset(va)], val,
 26.1008 +                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
 26.1009 +        rc = -EINVAL;
 26.1010      
 26.1011 -        rc = shadow_do_update_va_mapping(va, val, v);
 26.1012 -
 26.1013 -        check_pagetable(v, "post-va"); /* debug */
 26.1014 -    }
 26.1015 -
 26.1016      switch ( flags & UVMF_FLUSHTYPE_MASK )
 26.1017      {
 26.1018      case UVMF_TLB_FLUSH:
 26.1019          switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
 26.1020          {
 26.1021          case UVMF_LOCAL:
 26.1022 -            if ( unlikely(shadow_mode_enabled(d)) )
 26.1023 -                shadow_sync_all(d);
 26.1024              local_flush_tlb();
 26.1025              break;
 26.1026          case UVMF_ALL:
 26.1027 @@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long v
 26.1028          switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
 26.1029          {
 26.1030          case UVMF_LOCAL:
 26.1031 -            if ( unlikely(shadow_mode_enabled(d)) )
 26.1032 -                shadow_invlpg(current, va);
 26.1033 -            local_flush_tlb_one(va);
 26.1034 +            if ( !shadow2_mode_enabled(d) 
 26.1035 +                 || (shadow2_invlpg(current, va) != 0) ) 
 26.1036 +                local_flush_tlb_one(va);
 26.1037              break;
 26.1038          case UVMF_ALL:
 26.1039              flush_tlb_one_mask(d->domain_dirty_cpumask, va);
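The UVMF_INVLPG/UVMF_LOCAL hunk just above inverts the old flush policy: rather than always issuing a local INVLPG and separately fixing up the shadow, the hardware flush is now skipped whenever shadow2_invlpg() reports that it has already dealt with the mapping (a nonzero return means the caller still needs the flush). A tiny sketch of that control flow, using stand-in functions rather than the real Xen interfaces:

#include <stdbool.h>
#include <stdio.h>

static bool shadow_enabled = true;

/* Stand-in for shadow2_invlpg(): returns 0 when the shadow code has
 * already handled the stale translation, nonzero when the caller
 * still needs a hardware flush. */
static int fake_shadow_invlpg(unsigned long va)
{
    printf("shadow: fixed up %#lx, no hardware flush needed\n", va);
    return 0;
}

static void local_flush_tlb_one(unsigned long va)
{
    printf("hw: invlpg %#lx\n", va);
}

static void flush_one(unsigned long va)
{
    if ( !shadow_enabled || fake_shadow_invlpg(va) != 0 )
        local_flush_tlb_one(va);
}

int main(void)
{
    flush_one(0xb8000);                 /* shadow mode: shadow handles it */
    shadow_enabled = false;
    flush_one(0xb8000);                 /* no shadows: plain INVLPG */
    return 0;
}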
 26.1040 @@ -2808,8 +2780,6 @@ long set_gdt(struct vcpu *v,
 26.1041      if ( entries > FIRST_RESERVED_GDT_ENTRY )
 26.1042          return -EINVAL;
 26.1043  
 26.1044 -    shadow_sync_all(d);
 26.1045 -
 26.1046      /* Check the pages in the new GDT. */
 26.1047      for ( i = 0; i < nr_pages; i++ ) {
 26.1048          mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
 26.1049 @@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 de
 26.1050          break;
 26.1051      }
 26.1052  
 26.1053 -    if ( shadow_mode_enabled(dom) )
 26.1054 -    {
 26.1055 -        shadow_lock(dom);
 26.1056 -
 26.1057 -        __mark_dirty(dom, mfn);
 26.1058 -
 26.1059 -        if ( page_is_page_table(page) && !page_out_of_sync(page) )
 26.1060 -            shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
 26.1061 -    }
 26.1062 +    mark_dirty(dom, mfn);
 26.1063  
 26.1064      /* All is good so make the update. */
 26.1065      gdt_pent = map_domain_page(mfn);
 26.1066      memcpy(&gdt_pent[offset], &d, 8);
 26.1067      unmap_domain_page(gdt_pent);
 26.1068  
 26.1069 -    if ( shadow_mode_enabled(dom) )
 26.1070 -        shadow_unlock(dom);
 26.1071 -
 26.1072      put_page_type(page);
 26.1073  
 26.1074      ret = 0; /* success */
 26.1075 @@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
 26.1076          default:
 26.1077              break;
 26.1078          }
 26.1079 -        
 26.1080 -        if ( !shadow_mode_translate(d) || (mfn == 0) )
 26.1081 +
 26.1082 +        if ( !shadow2_mode_translate(d) || (mfn == 0) )
 26.1083          {
 26.1084              put_domain(d);
 26.1085              return -EINVAL;
 26.1086 @@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
 26.1087          guest_physmap_add_page(d, xatp.gpfn, mfn);
 26.1088  
 26.1089          UNLOCK_BIGLOCK(d);
 26.1090 -
 26.1091 +        
 26.1092          put_domain(d);
 26.1093  
 26.1094          break;
 26.1095 @@ -3136,7 +3095,8 @@ static int ptwr_emulated_update(
 26.1096      unsigned long pfn;
 26.1097      struct page_info *page;
 26.1098      l1_pgentry_t pte, ol1e, nl1e, *pl1e;
 26.1099 -    struct domain *d = current->domain;
 26.1100 +    struct vcpu *v = current;
 26.1101 +    struct domain *d = v->domain;
 26.1102  
 26.1103      /* Aligned access only, thank you. */
 26.1104      if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
 26.1105 @@ -3196,25 +3156,36 @@ static int ptwr_emulated_update(
 26.1106          return X86EMUL_UNHANDLEABLE;
 26.1107      }
 26.1108  
 26.1109 +
 26.1110      /* Checked successfully: do the update (write or cmpxchg). */
 26.1111      pl1e = map_domain_page(page_to_mfn(page));
 26.1112      pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
 26.1113      if ( do_cmpxchg )
 26.1114      {
 26.1115 +        if ( shadow2_mode_enabled(d) )
 26.1116 +            shadow2_lock(d);
 26.1117          ol1e = l1e_from_intpte(old);
 26.1118          if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
 26.1119          {
 26.1120 +            if ( shadow2_mode_enabled(d) )
 26.1121 +                shadow2_unlock(d);
 26.1122              unmap_domain_page(pl1e);
 26.1123              put_page_from_l1e(nl1e, d);
 26.1124              return X86EMUL_CMPXCHG_FAILED;
 26.1125          }
 26.1126 +        if ( unlikely(shadow2_mode_enabled(v->domain)) )
 26.1127 +        {
 26.1128 +            shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
 26.1129 +            shadow2_unlock(v->domain);    
 26.1130 +        }
 26.1131      }
 26.1132      else
 26.1133      {
 26.1134          ol1e = *pl1e;
 26.1135 -        if ( !update_l1e(pl1e, ol1e, nl1e) )
 26.1136 +        if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
 26.1137              BUG();
 26.1138      }
 26.1139 +
 26.1140      unmap_domain_page(pl1e);
 26.1141  
 26.1142      /* Finally, drop the old PTE. */
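In the cmpxchg branch of ptwr_emulated_update() above, the new code takes the shadow2 lock around the guest PTE update, drops it again on the failure path, and calls shadow2_validate_guest_entry() only after the cmpxchg has succeeded and while the lock is still held. The sketch below models just that ordering with a spin flag and C11 atomics; the names are stubs, not the real shadow2 interfaces.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_flag shadow_lock_flag = ATOMIC_FLAG_INIT;

static void shadow_lock(void)
{
    while ( atomic_flag_test_and_set(&shadow_lock_flag) )
        ;                               /* spin: stand-in for shadow2_lock() */
}

static void shadow_unlock(void)
{
    atomic_flag_clear(&shadow_lock_flag);
}

static void shadow_validate(_Atomic uint64_t *pte)
{
    printf("shadow: revalidating entry, now %#llx\n",
           (unsigned long long)atomic_load(pte));
}

/* Returns 0 on success, -1 when the compare failed and emulation must retry. */
static int emulated_cmpxchg_pte(_Atomic uint64_t *pte,
                                uint64_t old, uint64_t new_val)
{
    shadow_lock();
    if ( !atomic_compare_exchange_strong(pte, &old, new_val) )
    {
        shadow_unlock();                /* failure path still drops the lock */
        return -1;
    }
    shadow_validate(pte);               /* validate while the lock is held */
    shadow_unlock();
    return 0;
}

int main(void)
{
    _Atomic uint64_t pte = 0x1027;
    printf("first cmpxchg:  %d\n", emulated_cmpxchg_pte(&pte, 0x1027, 0x2027));
    printf("second cmpxchg: %d\n", emulated_cmpxchg_pte(&pte, 0x1027, 0x3027));
    return 0;
}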
    27.1 --- a/xen/arch/x86/setup.c	Wed Aug 16 16:16:32 2006 +0100
    27.2 +++ b/xen/arch/x86/setup.c	Wed Aug 16 17:02:35 2006 +0100
    27.3 @@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t
    27.4      if ( opt_watchdog ) 
    27.5          watchdog_enable();
    27.6  
    27.7 -    shadow_mode_init();
    27.8 -
    27.9      /* initialize access control security module */
   27.10      acm_init(&initrdidx, mbi, initial_images_start);
   27.11  
    28.1 --- a/xen/arch/x86/shadow.c	Wed Aug 16 16:16:32 2006 +0100
    28.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.3 @@ -1,4150 +0,0 @@
    28.4 -/******************************************************************************
    28.5 - * arch/x86/shadow.c
    28.6 - *
    28.7 - * Copyright (c) 2005 Michael A Fetterman
    28.8 - * Based on an earlier implementation by Ian Pratt et al
    28.9 - *
   28.10 - * This program is free software; you can redistribute it and/or modify
   28.11 - * it under the terms of the GNU General Public License as published by
   28.12 - * the Free Software Foundation; either version 2 of the License, or
   28.13 - * (at your option) any later version.
   28.14 - *
   28.15 - * This program is distributed in the hope that it will be useful,
   28.16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
   28.17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   28.18 - * GNU General Public License for more details.
   28.19 - *
   28.20 - * You should have received a copy of the GNU General Public License
   28.21 - * along with this program; if not, write to the Free Software
   28.22 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   28.23 - */
   28.24 -/*
   28.25 - * Jun Nakajima <jun.nakajima@intel.com>
   28.26 - * Chengyuan Li <chengyuan.li@intel.com>
   28.27 - *
   28.28 - * Extended to support 32-bit PAE and 64-bit guests.
   28.29 - */
   28.30 -
   28.31 -#include <xen/config.h>
   28.32 -#include <xen/types.h>
   28.33 -#include <xen/mm.h>
   28.34 -#include <xen/domain_page.h>
   28.35 -#include <asm/shadow.h>
   28.36 -#include <asm/page.h>
   28.37 -#include <xen/event.h>
   28.38 -#include <xen/sched.h>
   28.39 -#include <xen/trace.h>
   28.40 -#include <asm/shadow_64.h>
   28.41 -
   28.42 -/* Use this to have the compiler remove unnecessary branches */
   28.43 -#define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES)
   28.44 -
   28.45 -extern void free_shadow_pages(struct domain *d);
   28.46 -
   28.47 -#if 0 // this code has not been updated for 32pae & 64 bit modes
   28.48 -#if SHADOW_DEBUG
   28.49 -static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
   28.50 -#endif
   28.51 -#endif
   28.52 -
   28.53 -#if CONFIG_PAGING_LEVELS == 3
   28.54 -static unsigned long shadow_l3_table(
   28.55 -    struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
   28.56 -#endif
   28.57 -
   28.58 -#if CONFIG_PAGING_LEVELS == 4
   28.59 -static unsigned long shadow_l4_table(
   28.60 -    struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
   28.61 -#endif
   28.62 -
   28.63 -#if CONFIG_PAGING_LEVELS >= 3
   28.64 -static void shadow_map_into_current(struct vcpu *v,
   28.65 -    unsigned long va, unsigned int from, unsigned int to);
   28.66 -static inline void validate_bl2e_change( struct domain *d,
   28.67 -    guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index);
   28.68 -static void update_top_level_shadow(struct vcpu *v, unsigned long smfn);
   28.69 -#endif
   28.70 -
   28.71 -/********
   28.72 -
   28.73 -There's a per-domain shadow table spin lock which works fine for SMP
   28.74 -hosts. We don't have to worry about interrupts as no shadow operations
   28.75 -happen in an interrupt context. It's probably not quite ready for SMP
   28.76 -guest operation as we have to worry about synchronisation between gpte
   28.77 -and spte updates. It's possible that this might only happen in a
   28.78 -hypercall context, in which case we'll probably have a per-domain
   28.79 -hypercall lock anyhow (at least initially).
   28.80 -
   28.81 -********/
   28.82 -
   28.83 -static inline int
   28.84 -shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
   28.85 -               unsigned long new_type)
   28.86 -{
   28.87 -    struct page_info *page = mfn_to_page(gmfn);
   28.88 -    int pinned = 0, okay = 1;
   28.89 -
   28.90 -    if ( page_out_of_sync(page) )
   28.91 -    {
   28.92 -        // Don't know how long ago this snapshot was taken.
   28.93 -        // Can't trust it to be recent enough.
   28.94 -        //
   28.95 -        __shadow_sync_mfn(d, gmfn);
   28.96 -    }
   28.97 -
   28.98 -    if ( !shadow_mode_refcounts(d) )
   28.99 -        return 1;
  28.100 -
  28.101 -    if ( unlikely(page_is_page_table(page)) )
  28.102 -        return 1;
  28.103 -
  28.104 -    FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
  28.105 -
  28.106 -    if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
  28.107 -    {
  28.108 -        FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
  28.109 -                __func__, gpfn, gmfn);
  28.110 -#if 1 || defined(LIVE_DANGEROUSLY)
  28.111 -        set_bit(_PGC_page_table, &page->count_info);
  28.112 -        return 1;
  28.113 -#endif
  28.114 -        return 0;
  28.115 -    }
  28.116 -
  28.117 -    // To convert this page for use as a page table, the writable count
  28.118 -    // should now be zero.  Test this by grabbing the page as a page table,
  28.119 -    // and then immediately releasing.  This will also deal with any
  28.120 -    // necessary TLB flushing issues for us.
  28.121 -    //
  28.122 -    // The cruft here about pinning doesn't really work right.  This
  28.123 -    // needs rethinking/rewriting...  Need to gracefully deal with the
  28.124 -    // TLB flushes required when promoting a writable page, and also deal
  28.125 -    // with any outstanding (external) writable refs to this page (by
  28.126 -    // refusing to promote it).  The pinning headache complicates this
  28.127 -    // code -- it would all get much simpler if we stop using
  28.128 -    // shadow_lock() and move the shadow code to BIGLOCK().
  28.129 -    //
  28.130 -    if ( unlikely(!get_page(page, d)) )
  28.131 -        BUG(); // XXX -- needs more thought for a graceful failure
  28.132 -    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
  28.133 -    {
  28.134 -        pinned = 1;
  28.135 -        put_page_and_type(page);
  28.136 -    }
  28.137 -    if ( get_page_type(page, PGT_base_page_table) )
  28.138 -    {
  28.139 -        set_bit(_PGC_page_table, &page->count_info);
  28.140 -        put_page_type(page);
  28.141 -    }
  28.142 -    else
  28.143 -    {
  28.144 -        printk("shadow_promote: get_page_type failed "
  28.145 -               "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
  28.146 -               d->domain_id, gpfn, gmfn, new_type);
  28.147 -        okay = 0;
  28.148 -    }
  28.149 -
  28.150 -    // Now put the type back to writable...
  28.151 -    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
  28.152 -        BUG(); // XXX -- needs more thought for a graceful failure
  28.153 -    if ( unlikely(pinned) )
  28.154 -    {
  28.155 -        if ( unlikely(test_and_set_bit(_PGT_pinned,
  28.156 -                                       &page->u.inuse.type_info)) )
  28.157 -            BUG(); // hmm... someone pinned this again?
  28.158 -    }
  28.159 -    else
  28.160 -        put_page_and_type(page);
  28.161 -
  28.162 -    return okay;
  28.163 -}
  28.164 -
  28.165 -
  28.166 -/*
  28.167 - * Things in shadow mode that collect get_page() refs to the domain's
  28.168 - * pages are:
  28.169 - * - PGC_allocated takes a gen count, just like normal.
  28.170 - * - A writable page can be pinned (paravirtualized guests may consider
  28.171 - *   these pages to be L1s or L2s, and don't know the difference).
  28.172 - *   Pinning a page takes a gen count (but, for domains in shadow mode,
  28.173 - *   it *doesn't* take a type count)
  28.174 - * - CR3 grabs a ref to whatever it points at, just like normal.
  28.175 - * - Shadow mode grabs an initial gen count for itself, as a placeholder
  28.176 - *   for whatever references will exist.
  28.177 - * - Shadow PTEs that point to a page take a gen count, just like regular
  28.178 - *   PTEs.  However, they don't get a type count, as get_page_type() is
  28.179 - *   hardwired to keep writable pages' counts at 1 for domains in shadow
  28.180 - *   mode.
  28.181 - * - Whenever we shadow a page, the entry in the shadow hash grabs a
  28.182 - *   general ref to the page.
  28.183 - * - Whenever a page goes out of sync, the out of sync entry grabs a
  28.184 - *   general ref to the page.
  28.185 - */
  28.186 -/*
  28.187 - * page_info fields for pages allocated as shadow pages:
  28.188 - *
  28.189 - * All 32 bits of count_info are a simple count of refs to this shadow
  28.190 - * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
  28.191 - * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
  28.192 - * references.
  28.193 - *
  28.194 - * u.inuse._domain is left NULL, to prevent accidentally allowing some random
  28.195 - * domain to gain permissions to map this page.
  28.196 - *
  28.197 - * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
  28.198 - * shadowed.
  28.199 - * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
  28.200 - * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
  28.201 - * currently exists because this is a shadow of a root page, and we
  28.202 - * don't want to let those disappear just because no CR3 is currently pointing
  28.203 - * at it.
  28.204 - *
  28.205 - * tlbflush_timestamp holds a min & max index of valid page table entries
  28.206 - * within the shadow page.
  28.207 - */
  28.208 -static inline void
  28.209 -shadow_page_info_init(struct page_info *page,
  28.210 -                      unsigned long gmfn,
  28.211 -                      u32 psh_type)
  28.212 -{
  28.213 -    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
  28.214 -    page->u.inuse.type_info = psh_type | gmfn;
  28.215 -    page->count_info = 0;
  28.216 -    page->tlbflush_timestamp = 0;
  28.217 -}
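shadow_page_info_init() above implements the layout described in the preceding comment block: a shadow page's type and the guest mfn it shadows share the single type_info word, split by PGT_type_mask and PGT_mfn_mask. A toy model of that packing, with mask values invented for illustration rather than taken from Xen's headers:

#include <stdint.h>
#include <stdio.h>

#define FAKE_PGT_TYPE_MASK  0xf0000000UL    /* "what kind of shadow" bits */
#define FAKE_PGT_MFN_MASK   0x0fffffffUL    /* "which guest frame" bits   */
#define FAKE_PGT_L1_SHADOW  0x10000000UL

static unsigned long pack_type_info(unsigned long type, unsigned long gmfn)
{
    /* Mirrors the psh_type | gmfn assignment above; the assertion there
     * guarantees gmfn never overlaps the type bits. */
    return type | (gmfn & FAKE_PGT_MFN_MASK);
}

int main(void)
{
    unsigned long ti = pack_type_info(FAKE_PGT_L1_SHADOW, 0x1a2b3);
    printf("type=%#lx shadowed mfn=%#lx\n",
           ti & FAKE_PGT_TYPE_MASK, ti & FAKE_PGT_MFN_MASK);
    return 0;
}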
  28.218 -
  28.219 -static inline unsigned long
  28.220 -alloc_shadow_page(struct domain *d,
  28.221 -                  unsigned long gpfn, unsigned long gmfn,
  28.222 -                  u32 psh_type)
  28.223 -{
  28.224 -    struct page_info *page;
  28.225 -    unsigned long smfn, real_gpfn;
  28.226 -    int pin = 0;
  28.227 -    void *l1, *lp;
  28.228 -    u64 index = 0;
  28.229 -
  28.230 -    // Currently, we only keep pre-zero'ed pages around for use as L1's...
  28.231 -    // This will change.  Soon.
  28.232 -    //
  28.233 -    if ( psh_type == PGT_l1_shadow )
  28.234 -    {
  28.235 -        if ( !list_empty(&d->arch.free_shadow_frames) )
  28.236 -        {
  28.237 -            struct list_head *entry = d->arch.free_shadow_frames.next;
  28.238 -            page = list_entry(entry, struct page_info, list);
  28.239 -            list_del(entry);
  28.240 -            perfc_decr(free_l1_pages);
  28.241 -        }
  28.242 -        else
  28.243 -        {
  28.244 -            if ( SH_L1_HAS_NEXT_PAGE &&
  28.245 -                 d->arch.ops->guest_paging_levels == PAGING_L2)
  28.246 -            {
  28.247 -#if CONFIG_PAGING_LEVELS >= 3
  28.248 -                /* 
  28.249 -                 * For 32-bit HVM guest, 2 shadow L1s are required to
  28.250 -                 * simulate 1 guest L1, so we need to allocate 2 shadow L1
  28.251 -                 * pages each time.
  28.252 -                 *
  28.253 -                 * --> Need to avoid alloc_domheap_pages.
  28.254 -                 */
  28.255 -                page = alloc_domheap_pages(NULL, SL1_ORDER, 0);
  28.256 -                if (!page)
  28.257 -                    goto no_shadow_page;
  28.258 -
  28.259 -                l1 = map_domain_page(page_to_mfn(page));
  28.260 -                memset(l1, 0, PAGE_SIZE);
  28.261 -                unmap_domain_page(l1);
  28.262 -
  28.263 -                l1 = map_domain_page(page_to_mfn(page + 1));
  28.264 -                memset(l1, 0, PAGE_SIZE);
  28.265 -                unmap_domain_page(l1);
  28.266 -
  28.267 -                /* we'd like to initialize the second contiguous page here
  28.268 -                 * and leave the first page's initialization until later */
  28.269 -
  28.270 -                shadow_page_info_init(page+1, gmfn, psh_type);
  28.271 -#else
  28.272 -                page = alloc_domheap_page(NULL);
  28.273 -                if (!page)
  28.274 -                    goto no_shadow_page;
  28.275 -
  28.276 -                l1 = map_domain_page(page_to_mfn(page));
  28.277 -                memset(l1, 0, PAGE_SIZE);
  28.278 -                unmap_domain_page(l1);
  28.279 -#endif
  28.280 -            }
  28.281 -            else
  28.282 -            {
  28.283 -                page = alloc_domheap_page(NULL);
  28.284 -                if (!page)
  28.285 -                    goto no_shadow_page;
  28.286 -
  28.287 -                l1 = map_domain_page(page_to_mfn(page));
  28.288 -                memset(l1, 0, PAGE_SIZE);
  28.289 -                unmap_domain_page(l1);
  28.290 -            }
  28.291 -        }
  28.292 -    }
  28.293 -    else {
  28.294 -#if CONFIG_PAGING_LEVELS == 2
  28.295 -        page = alloc_domheap_page(NULL);
  28.296 -#elif CONFIG_PAGING_LEVELS >= 3
  28.297 -        if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
  28.298 -             psh_type == PGT_l4_shadow )      /* allocated for PAE PDP page */
  28.299 -            page = alloc_domheap_pages(NULL, 0, MEMF_dma);
  28.300 -        else if ( d->arch.ops->guest_paging_levels == PAGING_L3 &&
  28.301 -                  (psh_type == PGT_l3_shadow || psh_type == PGT_l4_shadow) )
  28.302 -            page = alloc_domheap_pages(NULL, 0, MEMF_dma); /* allocated for PAE PDP page */
  28.303 -        else
  28.304 -            page = alloc_domheap_page(NULL);
  28.305 -#endif
  28.306 -        if (!page)
  28.307 -            goto no_shadow_page;
  28.308 -
  28.309 -        lp = map_domain_page(page_to_mfn(page));
  28.310 -        memset(lp, 0, PAGE_SIZE);
  28.311 -        unmap_domain_page(lp);
  28.312 -    }
  28.313 -
  28.314 -    smfn = page_to_mfn(page);
  28.315 -
  28.316 -    shadow_page_info_init(page, gmfn, psh_type);
  28.317 -
  28.318 -    switch ( psh_type )
  28.319 -    {
  28.320 -    case PGT_l1_shadow:
  28.321 -        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
  28.322 -            goto fail;
  28.323 -        perfc_incr(shadow_l1_pages);
  28.324 -        d->arch.shadow_page_count++;
  28.325 -        break;
  28.326 -
  28.327 -    case PGT_l2_shadow:
  28.328 -        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
  28.329 -            goto fail;
  28.330 -        perfc_incr(shadow_l2_pages);
  28.331 -        d->arch.shadow_page_count++;
  28.332 -        if ( PGT_l2_page_table == PGT_root_page_table )
  28.333 -            pin = 1;
  28.334 -
  28.335 -        break;
  28.336 -
  28.337 -    case PGT_l3_shadow:
  28.338 -        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
  28.339 -            goto fail;
  28.340 -        perfc_incr(shadow_l3_pages);
  28.341 -        d->arch.shadow_page_count++;
  28.342 -        if ( PGT_l3_page_table == PGT_root_page_table )
  28.343 -            pin = 1;
  28.344 -        break;
  28.345 -
  28.346 -    case PGT_l4_shadow:
  28.347 -        real_gpfn = gpfn & PGT_mfn_mask;
  28.348 -        if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) )
  28.349 -            goto fail;
  28.350 -        perfc_incr(shadow_l4_pages);
  28.351 -        d->arch.shadow_page_count++;
  28.352 -        if ( PGT_l4_page_table == PGT_root_page_table )
  28.353 -            pin = 1;
  28.354 -#if CONFIG_PAGING_LEVELS == 3 & defined (GUEST_PGENTRY_32)
  28.355 -        /*
  28.356 -         * We use PGT_l4_shadow for 2-level paging guests on PAE
  28.357 -         */
  28.358 -        if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
  28.359 -            pin = 1;
  28.360 -#endif
  28.361 -
  28.362 -#if CONFIG_PAGING_LEVELS == 3 & defined ( GUEST_32PAE )
  28.363 -        /*
  28.364 -         * We use PGT_l4_shadow for 2-level paging guests on PAE
  28.365 -         */
  28.366 -        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
  28.367 -            pin = 1;
  28.368 -#endif
  28.369 -        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
  28.370 -            index = get_cr3_idxval(current);
  28.371 -        break;
  28.372 -
  28.373 -#if CONFIG_PAGING_LEVELS >= 3
  28.374 -    case PGT_fl1_shadow:
  28.375 -        perfc_incr(shadow_l1_pages);
  28.376 -        d->arch.shadow_page_count++;
  28.377 -        break;
  28.378 -#else
  28.379 -
  28.380 -    case PGT_hl2_shadow:
  28.381 -        // Treat an hl2 as an L1 for purposes of promotion.
  28.382 -        // For external mode domains, treat them as an L2 for purposes of
  28.383 -        // pinning.
  28.384 -        //
  28.385 -        if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
  28.386 -            goto fail;
  28.387 -        perfc_incr(hl2_table_pages);
  28.388 -        d->arch.hl2_page_count++;
  28.389 -        if ( shadow_mode_external(d) &&
  28.390 -             (PGT_l2_page_table == PGT_root_page_table) )
  28.391 -            pin = 1;
  28.392 -
  28.393 -        break;
  28.394 -#endif
  28.395 -    case PGT_snapshot:
  28.396 -        perfc_incr(snapshot_pages);
  28.397 -        d->arch.snapshot_page_count++;
  28.398 -        break;
  28.399 -
  28.400 -    default:
  28.401 -        printk("Alloc shadow weird page type type=%08x\n", psh_type);
  28.402 -        BUG();
  28.403 -        break;
  28.404 -    }
  28.405 -
  28.406 -    // Don't add a new shadow of something that already has a snapshot.
  28.407 -    //
  28.408 -    ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
  28.409 -
  28.410 -    set_shadow_status(d, gpfn, gmfn, smfn, psh_type, index);
  28.411 -
  28.412 -    if ( pin )
  28.413 -        shadow_pin(smfn);
  28.414 -
  28.415 -    return smfn;
  28.416 -
  28.417 -fail:
  28.418 -    FSH_LOG("promotion of pfn=%lx mfn=%lx failed!  external gnttab refs?",
  28.419 -            gpfn, gmfn);
  28.420 -    if (psh_type == PGT_l1_shadow)
  28.421 -    {
  28.422 -        if (d->arch.ops->guest_paging_levels == PAGING_L2)
  28.423 -        {
  28.424 -#if CONFIG_PAGING_LEVELS >=3
  28.425 -            free_domheap_pages(page, SL1_ORDER);
  28.426 -#else
  28.427 -            free_domheap_page(page);
  28.428 -#endif
  28.429 -        }
  28.430 -        else
  28.431 -            free_domheap_page(page);
  28.432 -    }
  28.433 -    else
  28.434 -        free_domheap_page(page);
  28.435 -
  28.436 -    return 0;
  28.437 -
  28.438 -no_shadow_page:
  28.439 -    ASSERT(page == NULL);
  28.440 -    printk("Couldn't alloc shadow page! dom%d count=%d\n",
  28.441 -           d->domain_id, d->arch.shadow_page_count);
  28.442 -    printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
  28.443 -           perfc_value(shadow_l1_pages),
  28.444 -           perfc_value(shadow_l2_pages),
  28.445 -           perfc_value(hl2_table_pages),
  28.446 -           perfc_value(snapshot_pages));
  28.447 -    /* XXX FIXME: try a shadow flush to free up some memory. */
  28.448 -    domain_crash_synchronous();
  28.449 -
  28.450 -    return 0;
  28.451 -}
  28.452 -
  28.453 -#if CONFIG_PAGING_LEVELS == 2
  28.454 -static unsigned long
  28.455 -shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
  28.456 -                unsigned long smfn)
  28.457 -{
  28.458 -    unsigned long hl2mfn;
  28.459 -    l1_pgentry_t *hl2;
  28.460 -    int limit;
  28.461 -
  28.462 -    ASSERT(PGT_base_page_table == PGT_l2_page_table);
  28.463 -
  28.464 -    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
  28.465 -    {
  28.466 -        printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
  28.467 -               gpfn, gmfn);
  28.468 -        BUG(); /* XXX Deal gracefully with failure. */
  28.469 -    }
  28.470 -
  28.471 -    SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
  28.472 -             gpfn, gmfn, smfn, hl2mfn);
  28.473 -    perfc_incrc(shadow_hl2_table_count);
  28.474 -
  28.475 -    hl2 = map_domain_page(hl2mfn);
  28.476 -
  28.477 -    if ( shadow_mode_external(d) )
  28.478 -        limit = L2_PAGETABLE_ENTRIES;
  28.479 -    else
  28.480 -        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  28.481 -
  28.482 -    memset(hl2, 0, limit * sizeof(l1_pgentry_t));
  28.483 -
  28.484 -    if ( !shadow_mode_external(d) )
  28.485 -    {
  28.486 -        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
  28.487 -               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  28.488 -
  28.489 -        // Set up easy access to the GL2, SL2, and HL2 frames.
  28.490 -        //
  28.491 -        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
  28.492 -            l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
  28.493 -        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  28.494 -            l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
  28.495 -        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
  28.496 -            l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
  28.497 -    }
  28.498 -
  28.499 -    unmap_domain_page(hl2);
  28.500 -
  28.501 -    return hl2mfn;
  28.502 -}
  28.503 -
  28.504 -/*
  28.505 - * This could take and use a snapshot, and validate the entire page at
  28.506 - * once, or it could continue to fault in entries one at a time...
  28.507 - * Might be worth investigating...
  28.508 - */
  28.509 -static unsigned long shadow_l2_table(
  28.510 -    struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
  28.511 -{
  28.512 -    unsigned long smfn;
  28.513 -    l2_pgentry_t *spl2e;
  28.514 -    struct domain *d = v->domain;
  28.515 -    int i;
  28.516 -
  28.517 -    SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
  28.518 -
  28.519 -    perfc_incrc(shadow_l2_table_count);
  28.520 -
  28.521 -    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
  28.522 -    {
  28.523 -        printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
  28.524 -               gpfn, gmfn);
  28.525 -        BUG(); /* XXX Deal gracefully with failure. */
  28.526 -    }
  28.527 -
  28.528 -    spl2e = (l2_pgentry_t *)map_domain_page(smfn);
  28.529 -
  28.530 -    /* Install hypervisor and 2x linear p.t. mappings. */
  28.531 -    if ( (PGT_base_page_table == PGT_l2_page_table) &&
  28.532 -         !shadow_mode_external(d) )
  28.533 -    {
  28.534 -        /*
  28.535 -         * We could proactively fill in PDEs for pages that are already
  28.536 -         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
  28.537 -         * (restriction required for coherence of the accessed bit). However,
  28.538 -         * we tried it and it didn't help performance. This is simpler.
  28.539 -         */
  28.540 -        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
  28.541 -
  28.542 -        /* Install hypervisor and 2x linear p.t. mappings. */
  28.543 -        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  28.544 -               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  28.545 -               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  28.546 -
  28.547 -        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  28.548 -            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
  28.549 -
  28.550 -        for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
  28.551 -            spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
  28.552 -                l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
  28.553 -                                           arch.mm_perdomain_pt) + i,
  28.554 -                              __PAGE_HYPERVISOR);
  28.555 -
  28.556 -        if ( shadow_mode_translate(d) ) // NB: not external
  28.557 -        {
  28.558 -            unsigned long hl2mfn;
  28.559 -
  28.560 -            spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
  28.561 -                l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
  28.562 -                                __PAGE_HYPERVISOR);
  28.563 -
  28.564 -            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  28.565 -                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  28.566 -
  28.567 -            // shadow_mode_translate (but not external) sl2 tables hold a
  28.568 -            // ref to their hl2.
  28.569 -            //
  28.570 -            if ( !get_shadow_ref(hl2mfn) )
  28.571 -                BUG();
  28.572 -
  28.573 -            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  28.574 -                l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
  28.575 -        }
  28.576 -        else
  28.577 -            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  28.578 -                l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
  28.579 -    }
  28.580 -    else
  28.581 -    {
  28.582 -        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
  28.583 -    }
  28.584 -
  28.585 -    unmap_domain_page(spl2e);
  28.586 -
  28.587 -    SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
  28.588 -    return smfn;
  28.589 -}
  28.590 -#endif /* CONFIG_PAGING_LEVELS == 2 */
  28.591 -
  28.592 -static void shadow_map_l1_into_current_l2(unsigned long va)
  28.593 -{
  28.594 -    struct vcpu *v = current;
  28.595 -    struct domain *d = v->domain;
  28.596 -    l1_pgentry_t *spl1e, *spl1e_next = 0;
  28.597 -    l2_pgentry_t sl2e;
  28.598 -    guest_l1_pgentry_t *gpl1e;
  28.599 -    guest_l2_pgentry_t gl2e = {0};
  28.600 -    unsigned long gl1pfn, gl1mfn, sl1mfn;
  28.601 -    int i, init_table = 0;
  28.602 -
  28.603 -    __guest_get_l2e(v, va, &gl2e);
  28.604 -    ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT);
  28.605 -    gl1pfn = l2e_get_pfn(gl2e);
  28.606 -
  28.607 -    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
  28.608 -    {
  28.609 -        /* This L1 is NOT already shadowed so we need to shadow it. */
  28.610 -        SH_VVLOG("4a: l1 not shadowed");
  28.611 -
  28.612 -        gl1mfn = gmfn_to_mfn(d, gl1pfn);
  28.613 -        if ( unlikely(!VALID_MFN(gl1mfn)) )
  28.614 -        {
  28.615 -            // Attempt to use an invalid pfn as an L1 page.
  28.616 -            // XXX this needs to be more graceful!
  28.617 -            BUG();
  28.618 -        }
  28.619 -
  28.620 -        if ( unlikely(!(sl1mfn =
  28.621 -                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
  28.622 -        {
  28.623 -            printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
  28.624 -                   gl1pfn, gl1mfn);
  28.625 -            BUG(); /* XXX Need to deal gracefully with failure. */
  28.626 -        }
  28.627 -
  28.628 -        perfc_incrc(shadow_l1_table_count);
  28.629 -        init_table = 1;
  28.630 -    }
  28.631 -    else
  28.632 -    {
  28.633 -        /* This L1 is shadowed already, but the L2 entry is missing. */
  28.634 -        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
  28.635 -    }
  28.636 -
  28.637 -#ifndef NDEBUG
  28.638 -    {
  28.639 -        l2_pgentry_t old_sl2e;
  28.640 -        __shadow_get_l2e(v, va, &old_sl2e);
  28.641 -        ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT));
  28.642 -    }
  28.643 -#endif
  28.644 -
  28.645 -#if CONFIG_PAGING_LEVELS >= 3
  28.646 -    if ( SH_L1_HAS_NEXT_PAGE && 
  28.647 -         d->arch.ops->guest_paging_levels == PAGING_L2 )
  28.648 -    {
  28.649 -        /* for 32-bit HVM guest on 64-bit or PAE host,
  28.650 -         * we need to update two L2 entries each time
  28.651 -         */
  28.652 -        if ( !get_shadow_ref(sl1mfn))
  28.653 -            BUG();
  28.654 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  28.655 -        __guest_set_l2e(v, va, &gl2e);
  28.656 -        __shadow_set_l2e(v, va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1), &sl2e);
  28.657 -        if ( !get_shadow_ref(sl1mfn+1))
  28.658 -            BUG();
  28.659 -        sl2e = l2e_empty();
  28.660 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn+1);
  28.661 -        __shadow_set_l2e(v,((va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1)) + (1 << L2_PAGETABLE_SHIFT)) , &sl2e);
  28.662 -    } else
  28.663 -#endif
  28.664 -    {
  28.665 -        if ( !get_shadow_ref(sl1mfn) )
  28.666 -            BUG();
  28.667 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  28.668 -        __guest_set_l2e(v, va, &gl2e);
  28.669 -        __shadow_set_l2e(v, va , &sl2e);
  28.670 -    }
  28.671 -
  28.672 -    if ( init_table )
  28.673 -    {
  28.674 -        l1_pgentry_t sl1e;
  28.675 -        int index = guest_l1_table_offset(va);
  28.676 -        int min = 1, max = 0;
  28.677 -
  28.678 -        unsigned long tmp_gmfn;
  28.679 -        l2_pgentry_t tmp_sl2e = {0};
  28.680 -        guest_l2_pgentry_t tmp_gl2e = {0};
  28.681 -
  28.682 -        __guest_get_l2e(v, va, &tmp_gl2e);
  28.683 -        tmp_gmfn = gmfn_to_mfn(d, l2e_get_pfn(tmp_gl2e));
  28.684 -        gpl1e = (guest_l1_pgentry_t *) map_domain_page(tmp_gmfn);
  28.685 -
  28.686 -        /* If the PGT_l1_shadow has two contiguous pages */
  28.687 -#if CONFIG_PAGING_LEVELS >= 3
  28.688 -        if ( SH_L1_HAS_NEXT_PAGE &&
  28.689 -             d->arch.ops->guest_paging_levels == PAGING_L2 )
  28.690 -            __shadow_get_l2e(v,  va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e);
  28.691 -        else
  28.692 -#endif
  28.693 -        __shadow_get_l2e(v, va, &tmp_sl2e);
  28.694 -
  28.695 -        spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e));
  28.696 -
  28.697 -        if ( SH_L1_HAS_NEXT_PAGE )
  28.698 -            spl1e_next = (l1_pgentry_t *) map_domain_page(
  28.699 -                (l2e_get_pfn(tmp_sl2e) + 1UL));
  28.700 -
  28.701 -        for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
  28.702 -        {
  28.703 -            l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
  28.704 -            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
  28.705 -                 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
  28.706 -                sl1e = l1e_empty();
  28.707 -            if ( l1e_get_flags(sl1e) == 0 )
  28.708 -            {
  28.709 -                // First copy entries from 0 until first invalid.
  28.710 -                // Then copy entries from index until first invalid.
  28.711 -                //
  28.712 -                if ( i < index ) {
  28.713 -                    i = index - 1;
  28.714 -                    continue;
  28.715 -                }
  28.716 -                break;
  28.717 -            }
  28.718 -
  28.719 -            if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
  28.720 -                spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e;
  28.721 -            else 
  28.722 -                spl1e[i] = sl1e;
  28.723 -
  28.724 -            if ( unlikely(i < min) )
  28.725 -                min = i;
  28.726 -            if ( likely(i > max) )
  28.727 -                max = i;
  28.728 -            set_guest_back_ptr(d, sl1e, sl1mfn, i);
  28.729 -        }
  28.730 -
  28.731 -        mfn_to_page(sl1mfn)->tlbflush_timestamp =
  28.732 -            SHADOW_ENCODE_MIN_MAX(min, max);
  28.733 -
  28.734 -        unmap_domain_page(gpl1e);
  28.735 -        unmap_domain_page(spl1e);
  28.736 -
  28.737 -        if ( SH_L1_HAS_NEXT_PAGE )
  28.738 -            unmap_domain_page(spl1e_next);
  28.739 -    }
  28.740 -}
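The SHADOW_ENCODE_MIN_MAX() store at the end of shadow_map_l1_into_current_l2() records the lowest and highest slot indices that were actually filled in, so later sync and snapshot work can walk only that range of the shadow L1 instead of every entry. A small sketch of the packing idea; the bit layout here is assumed for illustration, not copied from Xen's macros.

#include <stdint.h>
#include <stdio.h>

/* Pack two small indices into one 32-bit word: high half = min, low = max. */
static uint32_t encode_min_max(unsigned int min, unsigned int max)
{
    return ((uint32_t)min << 16) | ((uint32_t)max & 0xffff);
}

static unsigned int decode_min(uint32_t mm) { return mm >> 16; }
static unsigned int decode_max(uint32_t mm) { return mm & 0xffff; }

int main(void)
{
    uint32_t mm = encode_min_max(3, 511);
    printf("sync entries %u..%u (%u of them)\n",
           decode_min(mm), decode_max(mm),
           decode_max(mm) - decode_min(mm) + 1);
    return 0;
}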
  28.741 -
  28.742 -#if CONFIG_PAGING_LEVELS == 2
  28.743 -static void
  28.744 -shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
  28.745 -{
  28.746 -    struct vcpu *v = current;
  28.747 -    struct domain *d = v->domain;
  28.748 -    l2_pgentry_t sl2e = {0};
  28.749 -
  28.750 -    __shadow_get_l2e(v, va, &sl2e);
  28.751 -    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
  28.752 -    {
  28.753 -        /*
  28.754 -         * Either the L1 is not shadowed, or the shadow isn't linked into
  28.755 -         * the current shadow L2.
  28.756 -         */
  28.757 -        if ( create_l1_shadow )
  28.758 -        {
  28.759 -            perfc_incrc(shadow_set_l1e_force_map);
  28.760 -            shadow_map_l1_into_current_l2(va);
  28.761 -        }
  28.762 -        else /* check to see if it exists; if so, link it in */
  28.763 -        {
  28.764 -            l2_pgentry_t gpde = {0};
  28.765 -            unsigned long gl1pfn;
  28.766 -            unsigned long sl1mfn;
  28.767 -
  28.768 -            __guest_get_l2e(v, va, &gpde);
  28.769 -
  28.770 -            if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
  28.771 -            {
  28.772 -                gl1pfn = l2e_get_pfn(gpde);
  28.773 -                sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
  28.774 -            }
  28.775 -            else
  28.776 -            {
  28.777 -                // no shadow exists, so there's nothing to do.
  28.778 -                perfc_incrc(shadow_set_l1e_fail);
  28.779 -                return;
  28.780 -            }
  28.781 -
  28.782 -            if ( sl1mfn )
  28.783 -            {
  28.784 -                perfc_incrc(shadow_set_l1e_unlinked);
  28.785 -                if ( !get_shadow_ref(sl1mfn) )
  28.786 -                    BUG();
  28.787 -                l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn);
  28.788 -                __guest_set_l2e(v, va, &gpde);
  28.789 -                __shadow_set_l2e(v, va, &sl2e);
  28.790 -            }
  28.791 -            else
  28.792 -            {
  28.793 -                // no shadow exists, so there's nothing to do.
  28.794 -                perfc_incrc(shadow_set_l1e_fail);
  28.795 -                return;
  28.796 -            }
  28.797 -        }
  28.798 -    }
  28.799 -
  28.800 -    __shadow_get_l2e(v, va, &sl2e);
  28.801 -
  28.802 -    if ( shadow_mode_refcounts(d) )
  28.803 -    {
  28.804 -        l1_pgentry_t old_spte;
  28.805 -        __shadow_get_l1e(v, va, &old_spte);
  28.806 -
  28.807 -        // only do the ref counting if something important changed.
  28.808 -        //
  28.809 -        if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
  28.810 -        {
  28.811 -            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
  28.812 -                 !shadow_get_page_from_l1e(new_spte, d) )
  28.813 -                new_spte = l1e_empty();
  28.814 -            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
  28.815 -                shadow_put_page_from_l1e(old_spte, d);
  28.816 -        }
  28.817 -    }
  28.818 -
  28.819 -    set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
  28.820 -    __shadow_set_l1e(v, va, &new_spte);
  28.821 -    shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
  28.822 -}
  28.823 -
  28.824 -static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
  28.825 -{
  28.826 -    struct domain *d = v->domain;
  28.827 -    l1_pgentry_t gpte, spte;
  28.828 -
  28.829 -    ASSERT(shadow_mode_enabled(d));
  28.830 -
  28.831 -    shadow_lock(d);
  28.832 -
  28.833 -    __shadow_sync_va(v, va);
  28.834 -
  28.835 -    // XXX mafetter: will need to think about 4MB pages...
  28.836 -
  28.837 -    // It's not strictly necessary to update the shadow here,
  28.838 -    // but it might save a fault later.
  28.839 -    //
  28.840 -    /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
  28.841 -                         sizeof(gpte))) {*/
  28.842 -    if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
  28.843 -        perfc_incrc(shadow_invlpg_faults);
  28.844 -        shadow_unlock(d);
  28.845 -        return;
  28.846 -    }
  28.847 -    l1pte_propagate_from_guest(d, gpte, &spte);
  28.848 -    shadow_set_l1e(va, spte, 1);
  28.849 -
  28.850 -    shadow_unlock(d);
  28.851 -}
  28.852 -#endif /* CONFIG_PAGING_LEVELS == 2 */
  28.853 -
  28.854 -#if CONFIG_PAGING_LEVELS >= 3
  28.855 -static void shadow_set_l1e_64(
  28.856 -    unsigned long va, pgentry_64_t *sl1e_p,
  28.857 -    int create_l1_shadow)
  28.858 -{
  28.859 -    struct vcpu *v = current;
  28.860 -    struct domain *d = v->domain;
  28.861 -    pgentry_64_t sle = { 0 };
  28.862 -    pgentry_64_t sle_up = {0};
  28.863 -    l1_pgentry_t old_spte;
  28.864 -    l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
  28.865 -    int i;
  28.866 -    unsigned long orig_va = 0;
  28.867 -
  28.868 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) 
  28.869 -    {
  28.870 -        /* This is for 32-bit VMX guest on 64-bit host */
  28.871 -        orig_va = va;
  28.872 -        va = va & (~((1<<L2_PAGETABLE_SHIFT_32)-1));
  28.873 -    }
  28.874 -
  28.875 -    for ( i = PAGING_L4; i >= PAGING_L2; i-- )
  28.876 -    {
  28.877 -        if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) )
  28.878 -        {
  28.879 -            sl1e = l1e_empty();
  28.880 -            goto out;
  28.881 -        }
  28.882 -        if ( !(entry_get_flags(sle) & _PAGE_PRESENT) )
  28.883 -        {
  28.884 -            if ( create_l1_shadow )
  28.885 -            {
  28.886 -                perfc_incrc(shadow_set_l3e_force_map);
  28.887 -                shadow_map_into_current(v, va, i-1, i);
  28.888 -                __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
  28.889 -            }
  28.890 -        }
  28.891 -        if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) 
  28.892 -        {
  28.893 -            if ( i < PAGING_L3 )
  28.894 -                shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
  28.895 -        }
  28.896 -        else 
  28.897 -        {
  28.898 -            if ( i < PAGING_L4 )
  28.899 -                shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
  28.900 -        }
  28.901 -
  28.902 -        sle_up = sle;
  28.903 -    }
  28.904 -
  28.905 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
  28.906 -    {
  28.907 -        va = orig_va;
  28.908 -    }
  28.909 -
  28.910 -    if ( shadow_mode_refcounts(d) )
  28.911 -    {
  28.912 -        __shadow_get_l1e(v, va, &old_spte);
  28.913 -        if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
  28.914 -        {
  28.915 -            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
  28.916 -                 !shadow_get_page_from_l1e(sl1e, d) )
  28.917 -                sl1e = l1e_empty();
  28.918 -            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
  28.919 -                put_page_from_l1e(old_spte, d);
  28.920 -        }
  28.921 -    }
  28.922 -
  28.923 -out:
  28.924 -    __shadow_set_l1e(v, va, &sl1e);
  28.925 -
  28.926 -    shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va));
  28.927 -}
  28.928 -#endif /* CONFIG_PAGING_LEVELS >= 3 */
  28.929 -
  28.930 -static struct out_of_sync_entry *
  28.931 -shadow_alloc_oos_entry(struct domain *d)
  28.932 -{
  28.933 -    struct out_of_sync_entry *f, *extra;
  28.934 -    unsigned size, i;
  28.935 -
  28.936 -    if ( unlikely(d->arch.out_of_sync_free == NULL) )
  28.937 -    {
  28.938 -        FSH_LOG("Allocate more fullshadow tuple blocks.");
  28.939 -
  28.940 -        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
  28.941 -        extra = xmalloc_bytes(size);
  28.942 -
  28.943 -        /* XXX Should be more graceful here. */
  28.944 -        if ( extra == NULL )
  28.945 -            BUG();
  28.946 -
  28.947 -        memset(extra, 0, size);
  28.948 -
  28.949 -        /* Record the allocation block so it can be correctly freed later. */
  28.950 -        d->arch.out_of_sync_extras_count++;
  28.951 -        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
  28.952 -            d->arch.out_of_sync_extras;
  28.953 -        d->arch.out_of_sync_extras = &extra[0];
  28.954 -
  28.955 -        /* Thread a free chain through the newly-allocated nodes. */
  28.956 -        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
  28.957 -            extra[i].next = &extra[i+1];
  28.958 -        extra[i].next = NULL;
  28.959 -
  28.960 -        /* Add the new nodes to the free list. */
  28.961 -        d->arch.out_of_sync_free = &extra[0];
  28.962 -    }
  28.963 -
  28.964 -    /* Allocate a new node from the quicklist. */
  28.965 -    f = d->arch.out_of_sync_free;
  28.966 -    d->arch.out_of_sync_free = f->next;
  28.967 -
  28.968 -    return f;
  28.969 -}
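shadow_alloc_oos_entry() above is a block allocator: when the per-domain free list is empty it xmallocs a batch of out-of-sync entries, threads a next chain through them, and from then on hands entries out from the head of the list. The standalone sketch below reproduces that scheme; it drops the extra bookkeeping the real code keeps (the out_of_sync_extras chain) so the blocks can be freed at domain teardown.

#include <stdio.h>
#include <stdlib.h>

struct oos_entry {
    struct oos_entry *next;
    unsigned long gpfn;
};

static struct oos_entry *free_list;

static void refill(unsigned int n)
{
    struct oos_entry *block = calloc(n, sizeof(*block));
    if ( block == NULL )
        abort();                        /* the real code BUG()s here too */

    /* Thread a free chain through the newly-allocated nodes. */
    for ( unsigned int i = 0; i + 1 < n; i++ )
        block[i].next = &block[i + 1];
    block[n - 1].next = free_list;
    free_list = &block[0];
}

static struct oos_entry *alloc_entry(void)
{
    if ( free_list == NULL )
        refill(32);
    struct oos_entry *e = free_list;
    free_list = e->next;
    return e;
}

int main(void)
{
    struct oos_entry *e = alloc_entry();
    e->gpfn = 0x1234;
    printf("got entry for gpfn %#lx\n", e->gpfn);
    return 0;
}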
  28.970 -
  28.971 -static inline unsigned long
  28.972 -shadow_make_snapshot(
  28.973 -    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  28.974 -{
  28.975 -    unsigned long smfn, sl1mfn = 0;
  28.976 -    void *original, *snapshot;
  28.977 -    u32 min_max = 0;
  28.978 -    int min, max, length;
  28.979 -
  28.980 -    if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
  28.981 -    {
  28.982 -        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
  28.983 -        return SHADOW_SNAPSHOT_ELSEWHERE;
  28.984 -    }
  28.985 -
  28.986 -    perfc_incrc(shadow_make_snapshot);
  28.987 -
  28.988 -    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
  28.989 -    {
  28.990 -        printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
  28.991 -               "Dom%d snapshot_count_count=%d\n",
  28.992 -               gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
  28.993 -        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
  28.994 -    }
  28.995 -
  28.996 -    if ( !get_shadow_ref(smfn) )
  28.997 -        BUG();
  28.998 -
  28.999 -    if ( shadow_mode_refcounts(d) &&
 28.1000 -         (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
 28.1001 -        min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
 28.1002 -    mfn_to_page(smfn)->tlbflush_timestamp = min_max;
 28.1003 -
 28.1004 -    min = SHADOW_MIN(min_max);
 28.1005 -    max = SHADOW_MAX(min_max);
 28.1006 -    length = max - min + 1;
 28.1007 -    perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
 28.1008 -
 28.1009 -    min *= sizeof(guest_l1_pgentry_t);
 28.1010 -    length *= sizeof(guest_l1_pgentry_t);
 28.1011 -
 28.1012 -    original = map_domain_page(gmfn);
 28.1013 -    snapshot = map_domain_page(smfn);
 28.1014 -    memcpy(snapshot + min, original + min, length);
 28.1015 -    unmap_domain_page(original);
 28.1016 -    unmap_domain_page(snapshot);
 28.1017 -
 28.1018 -    return smfn;
 28.1019 -}
 28.1020 -
 28.1021 -static struct out_of_sync_entry *
 28.1022 -__mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
 28.1023 -                             unsigned long mfn)
 28.1024 -{
 28.1025 -    struct domain *d = v->domain;
 28.1026 -    struct page_info *page = mfn_to_page(mfn);
 28.1027 -    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
 28.1028 -
 28.1029 -    ASSERT(shadow_lock_is_acquired(d));
 28.1030 -    ASSERT(mfn_valid(mfn));
 28.1031 -
 28.1032 -#ifndef NDEBUG
 28.1033 -    {
 28.1034 -        u32 type = page->u.inuse.type_info & PGT_type_mask;
 28.1035 -        if ( shadow_mode_refcounts(d) )
 28.1036 -        {
 28.1037 -            ASSERT(type == PGT_writable_page);
 28.1038 -        }
 28.1039 -        else
 28.1040 -        {
 28.1041 -            ASSERT(type && (type < PGT_l4_page_table));
 28.1042 -        }
 28.1043 -    }
 28.1044 -#endif
 28.1045 -
 28.1046 -    FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
 28.1047 -            gpfn, mfn, page->count_info, page->u.inuse.type_info);
 28.1048 -
 28.1049 -    // XXX this will require some more thought...  Cross-domain sharing and
 28.1050 -    //     modification of page tables?  Hmm...
 28.1051 -    //
 28.1052 -    if ( d != page_get_owner(page) )
 28.1053 -        BUG();
 28.1054 -
 28.1055 -    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
 28.1056 -
 28.1057 -    entry->v = v;
 28.1058 -    entry->gpfn = gpfn;
 28.1059 -    entry->gmfn = mfn;
 28.1060 -    entry->writable_pl1e = -1;
 28.1061 -
 28.1062 -#if 0 // this code has not been updated for 32pae & 64 bit modes
 28.1063 -#if SHADOW_DEBUG
 28.1064 -    mark_shadows_as_reflecting_snapshot(d, gpfn);
 28.1065 -#endif
 28.1066 -#endif
 28.1067 -
 28.1068 -    // increment guest's ref count to represent the entry in the
 28.1069 -    // full shadow out-of-sync list.
 28.1070 -    //
 28.1071 -    get_page(page, d);
 28.1072 -
 28.1073 -    return entry;
 28.1074 -}
 28.1075 -
 28.1076 -static struct out_of_sync_entry *
 28.1077 -mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
 28.1078 -                             unsigned long mfn)
 28.1079 -{
 28.1080 -    struct out_of_sync_entry *entry =
 28.1081 -        __mark_mfn_out_of_sync(v, gpfn, mfn);
 28.1082 -    struct domain *d = v->domain;
 28.1083 -
 28.1084 -    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
 28.1085 -    // Add to the out-of-sync list
 28.1086 -    //
 28.1087 -    entry->next = d->arch.out_of_sync;
 28.1088 -    d->arch.out_of_sync = entry;
 28.1089 -
 28.1090 -    return entry;
 28.1091 -
 28.1092 -}
 28.1093 -
 28.1094 -static void shadow_mark_va_out_of_sync(
 28.1095 -    struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
 28.1096 -{
 28.1097 -    struct out_of_sync_entry *entry =
 28.1098 -        __mark_mfn_out_of_sync(v, gpfn, mfn);
 28.1099 -    l2_pgentry_t sl2e;
 28.1100 -    struct domain *d = v->domain;
 28.1101 -
 28.1102 -#if CONFIG_PAGING_LEVELS >= 3
 28.1103 -    {
 28.1104 -        l4_pgentry_t sl4e;
 28.1105 -        l3_pgentry_t sl3e;
 28.1106 -
 28.1107 -        __shadow_get_l4e(v, va, &sl4e);
 28.1108 -        if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
 28.1109 -            shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
 28.1110 -        }
 28.1111 -
 28.1112 -        if (!__shadow_get_l3e(v, va, &sl3e)) {
 28.1113 -            BUG();
 28.1114 -        }
 28.1115 -
 28.1116 -        if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
 28.1117 -            shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
 28.1118 -        }
 28.1119 -    }
 28.1120 -#endif
 28.1121 -
 28.1122 -    // We need the address of the shadow PTE that maps @va.
 28.1123 -    // It might not exist yet.  Make sure it's there.
 28.1124 -    //
 28.1125 -    __shadow_get_l2e(v, va, &sl2e);
 28.1126 -    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
 28.1127 -    {
 28.1128 -        // either this L1 isn't shadowed yet, or the shadow isn't linked into
 28.1129 -        // the current L2.
 28.1130 -        shadow_map_l1_into_current_l2(va);
 28.1131 -        __shadow_get_l2e(v, va, &sl2e);
 28.1132 -    }
 28.1133 -    ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
 28.1134 -
 28.1135 -    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
 28.1136 -    // NB: this is stored as a machine address.
 28.1137 -    entry->writable_pl1e =
 28.1138 -        l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
 28.1139 -    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
 28.1140 -    entry->va = va;
 28.1141 -
 28.1142 -    // Increment shadow's page count to represent the reference
 28.1143 -    // inherent in entry->writable_pl1e
 28.1144 -    //
 28.1145 -    if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
 28.1146 -        BUG();
 28.1147 -
 28.1148 -    // Add to the out-of-sync list
 28.1149 -    //
 28.1150 -    entry->next = d->arch.out_of_sync;
 28.1151 -    d->arch.out_of_sync = entry;
 28.1152 -
 28.1153 -    FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
 28.1154 -            __func__, va, entry->writable_pl1e);
 28.1155 -}
 28.1156 -
 28.1157 -/*
 28.1158 - * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
 28.1159 - * Returns 0 otherwise.
 28.1160 - */
 28.1161 -static int snapshot_entry_matches(
 28.1162 -    struct domain *d, guest_l1_pgentry_t *guest_pt,
 28.1163 -    unsigned long gpfn, unsigned index)
 28.1164 -{
 28.1165 -    unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
 28.1166 -    guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
 28.1167 -    int entries_match;
 28.1168 -
 28.1169 -    perfc_incrc(snapshot_entry_matches_calls);
 28.1170 -
 28.1171 -    if ( !smfn )
 28.1172 -        return 0;
 28.1173 -
 28.1174 -    snapshot = map_domain_page(smfn);
 28.1175 -
 28.1176 -    if (__copy_from_user(&gpte, &guest_pt[index],
 28.1177 -                         sizeof(gpte)))
 28.1178 -    {
 28.1179 -        unmap_domain_page(snapshot);
 28.1180 -        return 0;
 28.1181 -    }
 28.1182 -
 28.1183 -    // This could probably be smarter, but this is sufficient for
 28.1184 -    // our current needs.
 28.1185 -    //
 28.1186 -    entries_match = !guest_l1e_has_changed(gpte, snapshot[index],
 28.1187 -                                     PAGE_FLAG_MASK);
 28.1188 -
 28.1189 -    unmap_domain_page(snapshot);
 28.1190 -
 28.1191 -#ifdef PERF_COUNTERS
 28.1192 -    if ( entries_match )
 28.1193 -        perfc_incrc(snapshot_entry_matches_true);
 28.1194 -#endif
 28.1195 -
 28.1196 -    return entries_match;
 28.1197 -}
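snapshot_entry_matches() above treats a guest entry as unchanged when it differs from its snapshot copy only in bits outside the supplied mask. The sketch below mirrors that XOR-and-mask style of comparison; the mask constants, and the choice of which flag bits to ignore, are made up and do not match Xen's PAGE_FLAG_MASK or PADDR_MASK definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_FLAG_BITS  0xfffULL        /* low flag bits of a toy PTE */

/* Changed if the frame bits differ, or any flag bit named in flag_mask does. */
static bool entry_has_changed(uint64_t old, uint64_t new_e, uint64_t flag_mask)
{
    return ((old ^ new_e) & (~FAKE_FLAG_BITS | flag_mask)) != 0;
}

int main(void)
{
    uint64_t snap = 0x1000 | 0x067;     /* frame 1, a few flag bits set */

    printf("ignored-flag-only change counts? %d\n",
           (int)entry_has_changed(snap, snap | 0x20, 0x1));
    printf("frame change counts? %d\n",
           (int)entry_has_changed(snap, 0x2000 | 0x067, 0x1));
    return 0;
}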
 28.1198 -
 28.1199 -/*
 28.1200 - * Returns 1 if va's shadow mapping is out-of-sync.
 28.1201 - * Returns 0 otherwise.
 28.1202 - */
 28.1203 -static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
 28.1204 -{
 28.1205 -    struct domain *d = v->domain;
 28.1206 -#if CONFIG_PAGING_LEVELS == 4
 28.1207 -    unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
 28.1208 -                          pagetable_get_pfn(v->arch.guest_table) :
 28.1209 -                          pagetable_get_pfn(v->arch.guest_table_user));
 28.1210 -#else
 28.1211 -    unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
 28.1212 -#endif
 28.1213 -    unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
 28.1214 -    guest_l2_pgentry_t l2e;
 28.1215 -    unsigned long l1pfn, l1mfn;
 28.1216 -    guest_l1_pgentry_t *guest_pt;
 28.1217 -
 28.1218 -    ASSERT(shadow_lock_is_acquired(d));
 28.1219 -    ASSERT(VALID_M2P(l2pfn));
 28.1220 -
 28.1221 -    perfc_incrc(shadow_out_of_sync_calls);
 28.1222 -
 28.1223 -#if CONFIG_PAGING_LEVELS >= 3
 28.1224 -
 28.1225 -#define unmap_and_return(x)                                         \
 28.1226 -    if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable )  \
 28.1227 -        unmap_domain_page(guest_pt);                                \
 28.1228 -    return (x);
 28.1229 -
 28.1230 -    if (d->arch.ops->guest_paging_levels >= PAGING_L3) 
 28.1231 -    { 
 28.1232 -        pgentry_64_t le;
 28.1233 -        unsigned long gmfn;
 28.1234 -        unsigned long gpfn;
 28.1235 -        int i;
 28.1236 -        unsigned int base_idx = 0;
 28.1237 -        base_idx = get_cr3_idxval(v);
 28.1238 -
 28.1239 -        gmfn = l2mfn;
 28.1240 -        gpfn = l2pfn;
 28.1241 -        guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable;
 28.1242 -
 28.1243 -        for ( i = PAGING_L4; i >= PAGING_L3; i-- ) 
 28.1244 -        {
 28.1245 -            if (d->arch.ops->guest_paging_levels == PAGING_L3 
 28.1246 -                && i == PAGING_L4)
 28.1247 -                continue;       /* skip the top-level for 3-level */
 28.1248 -
 28.1249 -            if ( page_out_of_sync(mfn_to_page(gmfn)) &&
 28.1250 -                 !snapshot_entry_matches(
 28.1251 -                     d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) )
 28.1252 -            {
 28.1253 -                unmap_and_return (1);
 28.1254 -            }
 28.1255 -
 28.1256 -            le = entry_empty();
 28.1257 -            __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
 28.1258 -
 28.1259 -            if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
 28.1260 -            {
 28.1261 -                unmap_and_return (0);
 28.1262 -            }
 28.1263 -            gpfn = entry_get_pfn(le);
 28.1264 -            gmfn = gmfn_to_mfn(d, gpfn);
 28.1265 -            if ( !VALID_MFN(gmfn) )
 28.1266 -            {
 28.1267 -                unmap_and_return (0);
 28.1268 -            }
 28.1269 -            if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
 28.1270 -                unmap_domain_page(guest_pt);
 28.1271 -            guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn);
 28.1272 -        }
 28.1273 -
 28.1274 -        /* L2 */
 28.1275 -        if ( page_out_of_sync(mfn_to_page(gmfn)) &&
 28.1276 -             !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
 28.1277 -        {
 28.1278 -            unmap_and_return (1);
 28.1279 -        }
 28.1280 -
 28.1281 -        if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
 28.1282 -            unmap_domain_page(guest_pt);
 28.1283 -
 28.1284 -    } 
 28.1285 -    else
 28.1286 -#undef unmap_and_return
 28.1287 -#endif /* CONFIG_PAGING_LEVELS >= 3 */
 28.1288 -    {
 28.1289 -        if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
 28.1290 -             !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable,
 28.1291 -                                     l2pfn, guest_l2_table_offset(va)) )
 28.1292 -            return 1;
 28.1293 -    }
 28.1294 -
 28.1295 -    __guest_get_l2e(v, va, &l2e);
 28.1296 -    if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) ||
 28.1297 -         (guest_l2e_get_flags(l2e) & _PAGE_PSE))
 28.1298 -        return 0;
 28.1299 -
 28.1300 -    l1pfn = l2e_get_pfn(l2e);
 28.1301 -    l1mfn = gmfn_to_mfn(d, l1pfn);
 28.1302 -
 28.1303 -    // If the l1 pfn is invalid, it can't be out of sync...
 28.1304 -    if ( !VALID_MFN(l1mfn) )
 28.1305 -        return 0;
 28.1306 -
 28.1307 -    guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn);
 28.1308 -
 28.1309 -    if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
 28.1310 -         !snapshot_entry_matches(
 28.1311 -             d, guest_pt, l1pfn, guest_l1_table_offset(va)) ) 
 28.1312 -    {
 28.1313 -        unmap_domain_page(guest_pt);
 28.1314 -        return 1;
 28.1315 -    }
 28.1316 -
 28.1317 -    unmap_domain_page(guest_pt);
 28.1318 -    return 0;
 28.1319 -}
 28.1320 -
 28.1321 -static int fix_entry(
 28.1322 -    struct domain *d,
 28.1323 -    l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
 28.1324 -{
 28.1325 -    l1_pgentry_t old = *pt;
 28.1326 -    l1_pgentry_t new = old;
 28.1327 -
 28.1328 -    l1e_remove_flags(new,_PAGE_RW);
 28.1329 -    if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
 28.1330 -        BUG();
 28.1331 -    (*found)++;
 28.1332 -    *pt = new;
 28.1333 -    if ( is_l1_shadow )
 28.1334 -        shadow_put_page_from_l1e(old, d);
 28.1335 -
 28.1336 -    return (*found == max_refs_to_find);
 28.1337 -}
 28.1338 -
 28.1339 -static u32 remove_all_write_access_in_ptpage(
 28.1340 -    struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
 28.1341 -    unsigned long readonly_gpfn, unsigned long readonly_gmfn,
 28.1342 -    u32 max_refs_to_find, unsigned long prediction)
 28.1343 -{
 28.1344 -    l1_pgentry_t *pt = map_domain_page(pt_mfn);
 28.1345 -    l1_pgentry_t *pt_next = 0, *sl1e_p;
 28.1346 -    l1_pgentry_t match;
 28.1347 -    unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
 28.1348 -    int i;
 28.1349 -    u32 found = 0;
 28.1350 -    int is_l1_shadow =
 28.1351 -        ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
 28.1352 -         PGT_l1_shadow);
 28.1353 -#if CONFIG_PAGING_LEVELS >= 3
 28.1354 -    is_l1_shadow |=
 28.1355 -      ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
 28.1356 -                PGT_fl1_shadow);
 28.1357 -#endif
 28.1358 -
 28.1359 -    if ( SH_L1_HAS_NEXT_PAGE )
 28.1360 -        pt_next = map_domain_page(pt_mfn + 1);
 28.1361 -
 28.1362 -    match = l1e_from_pfn(readonly_gmfn, flags);
 28.1363 -
 28.1364 -    if ( shadow_mode_external(d) ) 
 28.1365 -    {
 28.1366 -        i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
 28.1367 -            >> PGT_va_shift;
 28.1368 -
 28.1369 -        if ( SH_L1_HAS_NEXT_PAGE &&
 28.1370 -             i >= L1_PAGETABLE_ENTRIES )
 28.1371 -            sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
 28.1372 -        else
 28.1373 -            sl1e_p = &pt[i];
 28.1374 -
 28.1375 -        if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) &&
 28.1376 -             !l1e_has_changed(*sl1e_p, match, flags) &&
 28.1377 -             fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) &&
 28.1378 -             !prediction )
 28.1379 -            goto out;
 28.1380 -    }
 28.1381 -
 28.1382 -    for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
 28.1383 -    {
 28.1384 -        if ( SH_L1_HAS_NEXT_PAGE &&
 28.1385 -             i >= L1_PAGETABLE_ENTRIES )
 28.1386 -            sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
 28.1387 -        else
 28.1388 -            sl1e_p = &pt[i];
 28.1389 -
 28.1390 -        if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) &&
 28.1391 -             fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) )
 28.1392 -            break;
 28.1393 -    }
 28.1394 -
 28.1395 -out:
 28.1396 -    unmap_domain_page(pt);
 28.1397 -    if ( SH_L1_HAS_NEXT_PAGE )
 28.1398 -        unmap_domain_page(pt_next);
 28.1399 -
 28.1400 -    return found;
 28.1401 -}
 28.1402 -
 28.1403 -static int remove_all_write_access(
 28.1404 -    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
 28.1405 -{
 28.1406 -    int i;
 28.1407 -    struct shadow_status *a;
 28.1408 -    u32 found = 0, write_refs;
 28.1409 -    unsigned long predicted_smfn;
 28.1410 -
 28.1411 -    ASSERT(shadow_lock_is_acquired(d));
 28.1412 -    ASSERT(VALID_MFN(readonly_gmfn));
 28.1413 -
 28.1414 -    perfc_incrc(remove_write_access);
 28.1415 -
 28.1416 -    // If it's not a writable page, then no writable refs can be outstanding.
 28.1417 -    //
 28.1418 -    if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
 28.1419 -         PGT_writable_page )
 28.1420 -    {
 28.1421 -        perfc_incrc(remove_write_not_writable);
 28.1422 -        return 1;
 28.1423 -    }
 28.1424 -
 28.1425 -    // How many outstanding writable PTEs for this page are there?
 28.1426 -    //
 28.1427 -    write_refs =
 28.1428 -        (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
 28.1429 -    if ( write_refs && MFN_PINNED(readonly_gmfn) )
 28.1430 -    {
 28.1431 -        write_refs--;
 28.1432 -    }
 28.1433 -
 28.1434 -    if ( write_refs == 0 )
 28.1435 -    {
 28.1436 -        perfc_incrc(remove_write_no_work);
 28.1437 -        return 1;
 28.1438 -    }
 28.1439 -
 28.1440 -    if ( shadow_mode_external(d) ) {
 28.1441 -        if (--write_refs == 0)
 28.1442 -            return 0;
 28.1443 -
 28.1444 -         // Use the back pointer to locate the shadow page that can contain
 28.1445 -         // the PTE of interest
 28.1446 -         if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
 28.1447 -             found += remove_all_write_access_in_ptpage(
 28.1448 -                 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
 28.1449 -             if ( found == write_refs )
 28.1450 -                 return 0;
 28.1451 -         }
 28.1452 -    }
 28.1453 -
 28.1454 -    // Search all the shadow L1 page tables...
 28.1455 -    //
 28.1456 -    for (i = 0; i < shadow_ht_buckets; i++)
 28.1457 -    {
 28.1458 -        a = &d->arch.shadow_ht[i];
 28.1459 -        while ( a && a->gpfn_and_flags )
 28.1460 -        {
 28.1461 -            if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
 28.1462 -#if CONFIG_PAGING_LEVELS >= 3
 28.1463 -              || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
 28.1464 -#endif
 28.1465 -              )
 28.1466 -
 28.1467 -            {
 28.1468 -                found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
 28.1469 -                if ( found == write_refs )
 28.1470 -                    return 0;
 28.1471 -            }
 28.1472 -
 28.1473 -            a = a->next;
 28.1474 -        }
 28.1475 -    }
 28.1476 -
 28.1477 -    FSH_LOG("%s: looking for %d refs, found %d refs",
 28.1478 -            __func__, write_refs, found);
 28.1479 -
 28.1480 -    return 0;
 28.1481 -}
 28.1482 -
 28.1483 -static void resync_pae_guest_l3(struct domain *d)
 28.1484 -{
 28.1485 -    struct out_of_sync_entry *entry;
 28.1486 -    unsigned long i, idx;
 28.1487 -    unsigned long smfn, gmfn;
 28.1488 -    pgentry_64_t *guest, *shadow_l3, *snapshot;
 28.1489 -    struct vcpu *v = current;
 28.1490 -    int max = -1;
 28.1491 -    int unshadow = 0;
 28.1492 -
 28.1493 -    
 28.1494 -    ASSERT( shadow_mode_external(d) );
 28.1495 -
 28.1496 -    gmfn = pagetable_get_pfn(v->arch.guest_table);
 28.1497 -           
 28.1498 -    for ( entry = d->arch.out_of_sync; entry; entry = entry->next ) 
 28.1499 -    {
 28.1500 -        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
 28.1501 -            continue;
 28.1502 -        if ( entry->gmfn != gmfn )
 28.1503 -            continue;
 28.1504 -
 28.1505 -        idx = get_cr3_idxval(v);
 28.1506 -
 28.1507 -        smfn = __shadow_status(d, entry->gpfn, PGT_l4_shadow);
 28.1508 -
 28.1509 -        if ( !smfn ) 
 28.1510 -            continue;
 28.1511 -
 28.1512 -        guest    = (pgentry_64_t *)map_domain_page(entry->gmfn);
 28.1513 -        snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn);
 28.1514 -        shadow_l3 = (pgentry_64_t *)map_domain_page(smfn);
 28.1515 -
 28.1516 -        for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ ) 
 28.1517 -        {
 28.1518 -            int index = i + idx * PAE_L3_PAGETABLE_ENTRIES;
 28.1519 -            if ( entry_has_changed(
 28.1520 -                    guest[index], snapshot[index], PAGE_FLAG_MASK) ) 
 28.1521 -            {
 28.1522 -                unsigned long gpfn;
 28.1523 -
 28.1524 -                /*
 28.1525 -                 * Looks like it's no longer a page table. 
 28.1526 -                 */
 28.1527 -                if ( unlikely(entry_get_value(guest[index]) & PAE_PDPT_RESERVED) )
 28.1528 -                {
 28.1529 -                    if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
 28.1530 -                        put_shadow_ref(entry_get_pfn(shadow_l3[i]));
 28.1531 -
 28.1532 -                    shadow_l3[i] = entry_empty();
 28.1533 -                    continue;
 28.1534 -                }
 28.1535 -
 28.1536 -                gpfn = entry_get_pfn(guest[index]);
 28.1537 -
 28.1538 -                if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
 28.1539 -                {
 28.1540 -                    if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
 28.1541 -                        put_shadow_ref(entry_get_pfn(shadow_l3[i]));
 28.1542 -
 28.1543 -                    shadow_l3[i] = entry_empty();
 28.1544 -                    continue;
 28.1545 -                }
 28.1546 -
 28.1547 -                validate_entry_change(d, &guest[index],
 28.1548 -                                      &shadow_l3[i], PAGING_L3);
 28.1549 -            }
 28.1550 -
 28.1551 -            if ( entry_get_value(guest[index]) != 0 )
 28.1552 -                max = i;
 28.1553 -
 28.1554 -            if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) &&
 28.1555 -                 unlikely(entry_get_value(guest[index]) != 0) &&
 28.1556 -                 !unshadow &&
 28.1557 -                 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
 28.1558 -                unshadow = 1;
 28.1559 -
 28.1560 -        }
 28.1561 -        if ( max == -1 )
 28.1562 -            unshadow = 1;
 28.1563 -
 28.1564 -        unmap_domain_page(guest);
 28.1565 -        unmap_domain_page(snapshot);
 28.1566 -        unmap_domain_page(shadow_l3);
 28.1567 -
 28.1568 -        if ( unlikely(unshadow) )
 28.1569 -            shadow_unpin(smfn);
 28.1570 -        break;
 28.1571 -    }
 28.1572 -}
 28.1573 -
 28.1574 -static int resync_all(struct domain *d, u32 stype)
 28.1575 -{
 28.1576 -    struct out_of_sync_entry *entry;
 28.1577 -    unsigned i;
 28.1578 -    unsigned long smfn;
 28.1579 -    void *guest, *shadow, *snapshot;
 28.1580 -    int need_flush = 0, external = shadow_mode_external(d);
 28.1581 -    int unshadow;
 28.1582 -    int changed;
 28.1583 -    u32 min_max_shadow, min_max_snapshot;
 28.1584 -    int min_shadow, max_shadow, min_snapshot, max_snapshot;
 28.1585 -    struct vcpu *v;
 28.1586 -
 28.1587 -    ASSERT(shadow_lock_is_acquired(d));
 28.1588 -
 28.1589 -    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
 28.1590 -    {
 28.1591 -        int max = -1;
 28.1592 -
 28.1593 -        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
 28.1594 -            continue;
 28.1595 -
 28.1596 -        smfn = __shadow_status(d, entry->gpfn, stype);
 28.1597 -
 28.1598 -        if ( !smfn )
 28.1599 -        {
 28.1600 -            // For heavy weight shadows: no need to update refcounts if
 28.1601 -            // there's no shadow page.
 28.1602 -            //
 28.1603 -            if ( shadow_mode_refcounts(d) )
 28.1604 -                continue;
 28.1605 -
 28.1606 -            // For light weight shadows: we only need to resync the refcounts to
 28.1607 -            // the new contents of the guest page iff it has the right
 28.1608 -            // page type.
 28.1609 -            //
 28.1610 -            if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
 28.1611 -                continue;
 28.1612 -        }
 28.1613 -
 28.1614 -        FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
 28.1615 -                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
 28.1616 -
 28.1617 -        // Compare guest's new contents to its snapshot, validating
 28.1618 -        // and updating its shadow as appropriate.
 28.1619 -        //
 28.1620 -        guest    = map_domain_page(entry->gmfn);
 28.1621 -        snapshot = map_domain_page(entry->snapshot_mfn);
 28.1622 -
 28.1623 -        if ( smfn )
 28.1624 -            shadow = map_domain_page(smfn);
 28.1625 -        else
 28.1626 -            shadow = NULL;
 28.1627 -
 28.1628 -        unshadow = 0;
 28.1629 -
 28.1630 -        min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
 28.1631 -        min_shadow     = SHADOW_MIN(min_max_shadow);
 28.1632 -        max_shadow     = SHADOW_MAX(min_max_shadow);
 28.1633 -
 28.1634 -        min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
 28.1635 -        min_snapshot    = SHADOW_MIN(min_max_snapshot);
 28.1636 -        max_snapshot    = SHADOW_MAX(min_max_snapshot);
 28.1637 -
 28.1638 -        switch ( stype )
 28.1639 -        {
 28.1640 -        case PGT_l1_shadow:
 28.1641 -        {
 28.1642 -            guest_l1_pgentry_t *guest1 = guest;
 28.1643 -            l1_pgentry_t *shadow1 = shadow;
 28.1644 -            l1_pgentry_t *shadow1_next = 0, *sl1e_p;
 28.1645 -            guest_l1_pgentry_t *snapshot1 = snapshot;
 28.1646 -            int unshadow_l1 = 0;
 28.1647 -
 28.1648 -            ASSERT(shadow_mode_write_l1(d) ||
 28.1649 -                   shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
 28.1650 -
 28.1651 -            if ( !shadow_mode_refcounts(d) )
 28.1652 -                revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1);
 28.1653 -            if ( !smfn )
 28.1654 -                break;
 28.1655 -
 28.1656 -            changed = 0;
 28.1657 -
 28.1658 -            if ( SH_L1_HAS_NEXT_PAGE && shadow1 )
 28.1659 -                shadow1_next = map_domain_page(smfn + 1);
 28.1660 -
 28.1661 -            for ( i = min_shadow; i <= max_shadow; i++ )
 28.1662 -            {
 28.1663 -
 28.1664 -                if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
 28.1665 -                    sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES];
 28.1666 -                else
 28.1667 -                    sl1e_p = &shadow1[i];
 28.1668 -
 28.1669 -                if ( (i < min_snapshot) || (i > max_snapshot) ||
 28.1670 -                     guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
 28.1671 -                {
 28.1672 -                    int error;
 28.1673 -
 28.1674 -#if CONFIG_PAGING_LEVELS >= 3
 28.1675 -                    unsigned long gpfn;
 28.1676 -
 28.1677 -                    gpfn = guest_l1e_get_paddr(guest1[i]) >> PAGE_SHIFT;
 28.1678 -
 28.1679 -                    if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
 28.1680 -                    {
 28.1681 -                        guest_l1_pgentry_t tmp_gl1e = guest_l1e_empty();
 28.1682 -                        validate_pte_change(d, tmp_gl1e, sl1e_p);
 28.1683 -                        unshadow_l1 = 1;
 28.1684 -                        continue;
 28.1685 -                    }
 28.1686 -#endif
 28.1687 -
 28.1688 -                    error = validate_pte_change(d, guest1[i], sl1e_p);
 28.1689 -                    if ( error ==  -1 )
 28.1690 -                        unshadow_l1 = 1;
 28.1691 -                    else {
 28.1692 -                        need_flush |= error;
 28.1693 -                        if ( l1e_get_flags(*sl1e_p) & _PAGE_PRESENT )
 28.1694 -                            set_guest_back_ptr(d, *sl1e_p, smfn, i);
 28.1695 -                    }
 28.1696 -                    // can't update snapshots of linear page tables -- they
 28.1697 -                    // are used multiple times...
 28.1698 -                    //
 28.1699 -                    // snapshot[i] = new_pte;
 28.1700 -
 28.1701 -                    changed++;
 28.1702 -                }
 28.1703 -            }
 28.1704 -
 28.1705 -            if ( shadow1_next )
 28.1706 -                unmap_domain_page(shadow1_next);
 28.1707 -
 28.1708 -            perfc_incrc(resync_l1);
 28.1709 -            perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
 28.1710 -            perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
 28.1711 -
 28.1712 -            if ( d->arch.ops->guest_paging_levels >= PAGING_L3 &&
 28.1713 -                 unshadow_l1 ) {
 28.1714 -                pgentry_64_t l2e = { 0 };
 28.1715 -
 28.1716 -                __shadow_get_l2e(entry->v, entry->va, &l2e);
 28.1717 -
 28.1718 -                if ( entry_get_flags(l2e) & _PAGE_PRESENT ) {
 28.1719 -                    put_shadow_ref(entry_get_pfn(l2e));
 28.1720 -                    l2e = entry_empty();
 28.1721 -                    __shadow_set_l2e(entry->v, entry->va, &l2e);
 28.1722 -
 28.1723 -                    if (entry->v == current)
 28.1724 -                        need_flush = 1;
 28.1725 -                }
 28.1726 -            }
 28.1727 -
 28.1728 -            break;
 28.1729 -        }
 28.1730 -#if CONFIG_PAGING_LEVELS == 2
 28.1731 -        case PGT_l2_shadow:
 28.1732 -        {
 28.1733 -            l2_pgentry_t *guest2 = guest;
 28.1734 -            l2_pgentry_t *shadow2 = shadow;
 28.1735 -            l2_pgentry_t *snapshot2 = snapshot;
 28.1736 -
 28.1737 -            ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
 28.1738 -            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
 28.1739 -
 28.1740 -            changed = 0;
 28.1741 -            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
 28.1742 -            {
 28.1743 -                if ( !is_guest_l2_slot(0,i) && !external )
 28.1744 -                    continue;
 28.1745 -
 28.1746 -                l2_pgentry_t new_pde = guest2[i];
 28.1747 -                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
 28.1748 -                {
 28.1749 -                    need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
 28.1750 -
 28.1751 -                    // can't update snapshots of linear page tables -- they
 28.1752 -                    // are used multiple times...
 28.1753 -                    //
 28.1754 -                    // snapshot[i] = new_pde;
 28.1755 -
 28.1756 -                    changed++;
 28.1757 -                }
 28.1758 -                if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
 28.1759 -                    max = i;
 28.1760 -
 28.1761 -                // XXX - This hack works for linux guests.
 28.1762 -                //       Need a better solution long term.
 28.1763 -                if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
 28.1764 -                     unlikely(l2e_get_intpte(new_pde) != 0) &&
 28.1765 -                     !unshadow && MFN_PINNED(smfn) )
 28.1766 -                    unshadow = 1;
 28.1767 -            }
 28.1768 -            if ( max == -1 )
 28.1769 -                unshadow = 1;
 28.1770 -            perfc_incrc(resync_l2);
 28.1771 -            perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
 28.1772 -            break;
 28.1773 -        }
 28.1774 -        case PGT_hl2_shadow:
 28.1775 -        {
 28.1776 -            l2_pgentry_t *guest2 = guest;
 28.1777 -            l2_pgentry_t *snapshot2 = snapshot;
 28.1778 -            l1_pgentry_t *shadow2 = shadow;
 28.1779 -
 28.1780 -            ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
 28.1781 -            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
 28.1782 -
 28.1783 -            changed = 0;
 28.1784 -            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
 28.1785 -            {
 28.1786 -                if ( !is_guest_l2_slot(0, i) && !external )
 28.1787 -                    continue;
 28.1788 -
 28.1789 -                l2_pgentry_t new_pde = guest2[i];
 28.1790 -                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
 28.1791 -                {
 28.1792 -                    need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
 28.1793 -
 28.1794 -                    // can't update snapshots of linear page tables -- they
 28.1795 -                    // are used multiple times...
 28.1796 -                    //
 28.1797 -                    // snapshot[i] = new_pde;
 28.1798 -
 28.1799 -                    changed++;
 28.1800 -                }
 28.1801 -            }
 28.1802 -            perfc_incrc(resync_hl2);
 28.1803 -            perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
 28.1804 -            break;
 28.1805 -        }
 28.1806 -#elif CONFIG_PAGING_LEVELS >= 3
 28.1807 -        case PGT_l2_shadow:
 28.1808 -        case PGT_l3_shadow:
 28.1809 -        {
 28.1810 -            pgentry_64_t *guest_pt = guest;
 28.1811 -            pgentry_64_t *shadow_pt = shadow;
 28.1812 -            pgentry_64_t *snapshot_pt = snapshot;
 28.1813 -
 28.1814 -            changed = 0;
 28.1815 -            for ( i = min_shadow; i <= max_shadow; i++ )
 28.1816 -            {
 28.1817 -                if ( (i < min_snapshot) || (i > max_snapshot) ||
 28.1818 -                    entry_has_changed(
 28.1819 -                        guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
 28.1820 -                {
 28.1821 -                    unsigned long gpfn;
 28.1822 -
 28.1823 -                    gpfn = entry_get_pfn(guest_pt[i]);
 28.1824 -                    /*
 28.1825 -                     * Looks like it's no longer a page table.
 28.1826 -                     */
 28.1827 -                    if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
 28.1828 -                    {
 28.1829 -                        if ( entry_get_flags(shadow_pt[i]) & _PAGE_PRESENT )
 28.1830 -                            put_shadow_ref(entry_get_pfn(shadow_pt[i]));
 28.1831 -                         shadow_pt[i] = entry_empty();
 28.1832 -                        continue;
 28.1833 -                    }
 28.1834 -
 28.1835 -                    need_flush |= validate_entry_change(
 28.1836 -                        d, &guest_pt[i], &shadow_pt[i],
 28.1837 -                        shadow_type_to_level(stype));
 28.1838 -                    changed++;
 28.1839 -                }
 28.1840 -#if CONFIG_PAGING_LEVELS == 3
 28.1841 -                if ( stype == PGT_l3_shadow ) 
 28.1842 -                {
 28.1843 -                    if ( entry_get_value(guest_pt[i]) != 0 ) 
 28.1844 -                        max = i;
 28.1845 -
 28.1846 -                    if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) &&
 28.1847 -                         unlikely(entry_get_value(guest_pt[i]) != 0) &&
 28.1848 -                         !unshadow &&
 28.1849 -                         (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
 28.1850 -                        unshadow = 1;
 28.1851 -                }
 28.1852 -#endif
 28.1853 -            }
 28.1854 -
 28.1855 -            if ( d->arch.ops->guest_paging_levels == PAGING_L3
 28.1856 -                 && max == -1 && stype == PGT_l3_shadow )
 28.1857 -                unshadow = 1;
 28.1858 -
 28.1859 -            perfc_incrc(resync_l3);
 28.1860 -            perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES);
 28.1861 -            break;
 28.1862 -        }
 28.1863 -        case PGT_l4_shadow:
 28.1864 -        {
 28.1865 -            guest_root_pgentry_t *guest_root = guest;
 28.1866 -            guest_root_pgentry_t *snapshot_root = snapshot;
 28.1867 -
 28.1868 -            changed = 0;
 28.1869 -            for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ )
 28.1870 -            {
 28.1871 -                guest_root_pgentry_t new_root_e = guest_root[i];
 28.1872 -                if ( !is_guest_l4_slot(i) && !external )
 28.1873 -                    continue;
 28.1874 -                if ( root_entry_has_changed(
 28.1875 -                        new_root_e, snapshot_root[i], PAGE_FLAG_MASK))
 28.1876 -                {
 28.1877 -#ifndef GUEST_PGENTRY_32
 28.1878 -                    l4_pgentry_t *shadow4 = shadow;
 28.1879 -                    unsigned long gpfn;
 28.1880 -
 28.1881 -                    gpfn = l4e_get_pfn(new_root_e);
 28.1882 -
 28.1883 -                    /*
 28.1884 -                     * Looks like it's no longer a page table.
 28.1885 -                     */
 28.1886 -                    if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
 28.1887 -                    {
 28.1888 -                        if ( l4e_get_flags(shadow4[i]) & _PAGE_PRESENT )
 28.1889 -                            put_shadow_ref(l4e_get_pfn(shadow4[i]));
 28.1890 -                        shadow4[i] = l4e_empty();
 28.1891 -                        continue;
 28.1892 -                    }
 28.1893 -
 28.1894 -                    if ( d->arch.ops->guest_paging_levels == PAGING_L4 ) 
 28.1895 -                    {
 28.1896 -                        need_flush |= validate_entry_change(
 28.1897 -                          d, (pgentry_64_t *)&new_root_e,
 28.1898 -                          (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
 28.1899 -                    }
 28.1900 -                    else
 28.1901 -#endif
 28.1902 -                    {
 28.1903 -                        validate_bl2e_change(d, &new_root_e, shadow, i);
 28.1904 -                    }
 28.1905 -                    changed++;
 28.1906 -                    ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
 28.1907 -                      smfn, pagetable_get_paddr(current->arch.shadow_table));
 28.1908 -                }
 28.1909 -                if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */
 28.1910 -                    max = i;
 28.1911 -
 28.1912 -                //  Need a better solution in the long term.
 28.1913 -                if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) &&
 28.1914 -                     unlikely(guest_root_get_intpte(new_root_e) != 0) &&
 28.1915 -                     !unshadow &&
 28.1916 -                     (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
 28.1917 -                    unshadow = 1;
 28.1918 -            }
 28.1919 -            if ( max == -1 )
 28.1920 -                unshadow = 1;
 28.1921 -            perfc_incrc(resync_l4);
 28.1922 -            perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
 28.1923 -            break;
 28.1924 -        }
 28.1925 -
 28.1926 -#endif /* CONFIG_PAGING_LEVELS >= 3 */
 28.1927 -        default:
 28.1928 -            BUG();
 28.1929 -        }
 28.1930 -
 28.1931 -        if ( smfn )
 28.1932 -            unmap_domain_page(shadow);
 28.1933 -        unmap_domain_page(snapshot);
 28.1934 -        unmap_domain_page(guest);
 28.1935 -
 28.1936 -        if ( unlikely(unshadow && stype == PGT_root_page_table) )
 28.1937 -        {
 28.1938 -            for_each_vcpu(d, v)
 28.1939 -                if(smfn == pagetable_get_pfn(v->arch.shadow_table))
 28.1940 -                    return need_flush;
 28.1941 -            perfc_incrc(unshadow_l2_count);
 28.1942 -            shadow_unpin(smfn);
 28.1943 -#if CONFIG_PAGING_LEVELS == 2
 28.1944 -            if ( unlikely(shadow_mode_external(d)) )
 28.1945 -            {
 28.1946 -                unsigned long hl2mfn;
 28.1947 -
 28.1948 -                if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
 28.1949 -                     MFN_PINNED(hl2mfn) )
 28.1950 -                    shadow_unpin(hl2mfn);
 28.1951 -            }
 28.1952 -#endif
 28.1953 -        }
 28.1954 -    }
 28.1955 -
 28.1956 -    return need_flush;
 28.1957 -}
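
The resync loop above only walks the shadow and snapshot entries between a recorded minimum and maximum index (the SHADOW_MIN/SHADOW_MAX values unpacked from the page's tlbflush_timestamp field). One plausible way to pack such a min/max pair into a single 32-bit field is sketched below; the 16/16 bit split and the names are assumptions for illustration, not the actual encoding.

#include <assert.h>
#include <stdint.h>

/* Pack the lowest and highest possibly-dirty table index into one word.
 * Assumed layout for illustration: min in the top 16 bits, max in the low 16. */
static inline uint32_t minmax_pack(unsigned int min, unsigned int max)
{
    return ((uint32_t)min << 16) | (uint32_t)(max & 0xffffu);
}

static inline unsigned int minmax_min(uint32_t mm) { return mm >> 16; }
static inline unsigned int minmax_max(uint32_t mm) { return mm & 0xffffu; }

int main(void)
{
    uint32_t mm = minmax_pack(12, 345);

    /* A resync-style loop would then scan only indices 12..345. */
    assert(minmax_min(mm) == 12);
    assert(minmax_max(mm) == 345);
    return 0;
}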
 28.1958 -
 28.1959 -#if CONFIG_PAGING_LEVELS == 2
 28.1960 -static int resync_all_levels_guest_page(struct domain *d)
 28.1961 -{
 28.1962 -    int need_flush = 0;
 28.1963 -
 28.1964 -    need_flush |= resync_all(d, PGT_l1_shadow);
 28.1965 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
 28.1966 -         shadow_mode_translate(d) )
 28.1967 -    {
 28.1968 -        need_flush |= resync_all(d, PGT_hl2_shadow);
 28.1969 -    }
 28.1970 -    return need_flush;
 28.1971 -}
 28.1972 -#elif CONFIG_PAGING_LEVELS == 3
 28.1973 -static int resync_all_levels_guest_page(struct domain *d)
 28.1974 -{
 28.1975 -    int need_flush = 0;
 28.1976 -
 28.1977 -    need_flush |= resync_all(d, PGT_l1_shadow);
 28.1978 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) 
 28.1979 -        need_flush |= resync_all(d, PGT_l4_shadow);
 28.1980 -    else
 28.1981 -    {
 28.1982 -        need_flush |= resync_all(d, PGT_l2_shadow);
 28.1983 -        if ( shadow_mode_log_dirty(d) )
 28.1984 -        {
 28.1985 -            need_flush |= resync_all(d, PGT_l3_shadow);
 28.1986 -            need_flush |= resync_all(d, PGT_l4_shadow);
 28.1987 -        }
 28.1988 -        else
 28.1989 -            resync_pae_guest_l3(d);
 28.1990 -    }
 28.1991 -    
 28.1992 -    return need_flush;
 28.1993 -}
 28.1994 -#elif CONFIG_PAGING_LEVELS == 4
 28.1995 -static int resync_all_levels_guest_page(struct domain *d)
 28.1996 -{
 28.1997 -    int need_flush = 0;
 28.1998 -
 28.1999 -    need_flush |= resync_all(d, PGT_l1_shadow);
 28.2000 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
 28.2001 -        need_flush |= resync_all(d, PGT_l4_shadow);
 28.2002 -    else
 28.2003 -    {
 28.2004 -        need_flush |= resync_all(d, PGT_l2_shadow);
 28.2005 -        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
 28.2006 -            resync_pae_guest_l3(d);
 28.2007 -        else
 28.2008 -        {
 28.2009 -            need_flush |= resync_all(d, PGT_l3_shadow);
 28.2010 -            need_flush |= resync_all(d, PGT_l4_shadow);
 28.2011 -        }
 28.2012 -    }
 28.2013 -    return need_flush;
 28.2014 -}
 28.2015 -#endif
 28.2016 -
 28.2017 -static void sync_all(struct domain *d)
 28.2018 -{
 28.2019 -    struct out_of_sync_entry *entry;
 28.2020 -    int need_flush = 0;
 28.2021 -    l1_pgentry_t *ppte, opte, npte;
 28.2022 -    cpumask_t other_vcpus_mask;
 28.2023 -
 28.2024 -    perfc_incrc(shadow_sync_all);
 28.2025 -
 28.2026 -    ASSERT(shadow_lock_is_acquired(d));
 28.2027 -
 28.2028 -    // First, remove all write permissions to the page tables
 28.2029 -    //
 28.2030 -    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
 28.2031 -    {
 28.2032 -        // Skip entries that have low bits set...  Those aren't
 28.2033 -        // real PTEs.
 28.2034 -        //
 28.2035 -        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
 28.2036 -            continue;
 28.2037 -
 28.2038 -        ppte = (l1_pgentry_t *)(
 28.2039 -            (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
 28.2040 -            (entry->writable_pl1e & ~PAGE_MASK));
 28.2041 -        opte = npte = *ppte;
 28.2042 -        l1e_remove_flags(npte, _PAGE_RW);
 28.2043 -
 28.2044 -        if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
 28.2045 -             !shadow_get_page_from_l1e(npte, d) )
 28.2046 -            BUG();
 28.2047 -        *ppte = npte;
 28.2048 -        set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
 28.2049 -                           (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
 28.2050 -        shadow_put_page_from_l1e(opte, d);
 28.2051 -
 28.2052 -        unmap_domain_page(ppte);
 28.2053 -    }
 28.2054 -
 28.2055 -    /* Other VCPUs mustn't use the revoked writable mappings. */
 28.2056 -    other_vcpus_mask = d->domain_dirty_cpumask;
 28.2057 -    cpu_clear(smp_processor_id(), other_vcpus_mask);
 28.2058 -    flush_tlb_mask(other_vcpus_mask);
 28.2059 -
 28.2060 -    /* Flush ourself later. */
 28.2061 -    need_flush = 1;
 28.2062 -
 28.2063 -    need_flush |= resync_all_levels_guest_page(d);
 28.2064 -
 28.2065 -    if ( need_flush && !unlikely(shadow_mode_external(d)) )
 28.2066 -        local_flush_tlb();
 28.2067 -
 28.2068 -    free_out_of_sync_state(d);
 28.2069 -}
 28.2070 -
 28.2071 -static inline int l1pte_write_fault(
 28.2072 -    struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
 28.2073 -    unsigned long va)
 28.2074 -{
 28.2075 -    struct domain *d = v->domain;
 28.2076 -    guest_l1_pgentry_t gpte = *gpte_p;
 28.2077 -    l1_pgentry_t spte;
 28.2078 -    unsigned long gpfn = l1e_get_pfn(gpte);
 28.2079 -    unsigned long gmfn = gmfn_to_mfn(d, gpfn);
 28.2080 -
 28.2081 -    //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
 28.2082 -
 28.2083 -    if ( unlikely(!VALID_MFN(gmfn)) )
 28.2084 -    {
 28.2085 -        SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
 28.2086 -        *spte_p = l1e_empty();
 28.2087 -        return 0;
 28.2088 -    }
 28.2089 -
 28.2090 -    ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW);
 28.2091 -    guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
 28.2092 -    spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
 28.2093 -
 28.2094 -    SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
 28.2095 -             l1e_get_intpte(spte), l1e_get_intpte(gpte));
 28.2096 -
 28.2097 -    __mark_dirty(d, gmfn);
 28.2098 -
 28.2099 -    if ( mfn_is_page_table(gmfn) )
 28.2100 -        shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
 28.2101 -
 28.2102 -    *gpte_p = gpte;
 28.2103 -    *spte_p = spte;
 28.2104 -
 28.2105 -    return 1;
 28.2106 -}
 28.2107 -
 28.2108 -static inline int l1pte_read_fault(
 28.2109 -    struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
 28.2110 -{
 28.2111 -    guest_l1_pgentry_t gpte = *gpte_p;
 28.2112 -    l1_pgentry_t spte = *spte_p;
 28.2113 -    unsigned long pfn = l1e_get_pfn(gpte);
 28.2114 -    unsigned long mfn = gmfn_to_mfn(d, pfn);
 28.2115 -
 28.2116 -    if ( unlikely(!VALID_MFN(mfn)) )
 28.2117 -    {
 28.2118 -        SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
 28.2119 -        *spte_p = l1e_empty();
 28.2120 -        return 0;
 28.2121 -    }
 28.2122 -
 28.2123 -    guest_l1e_add_flags(gpte, _PAGE_ACCESSED);
 28.2124 -    spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
 28.2125 -
 28.2126 -    if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
 28.2127 -         mfn_is_page_table(mfn) )
 28.2128 -    {
 28.2129 -        l1e_remove_flags(spte, _PAGE_RW);
 28.2130 -    }
 28.2131 -
 28.2132 -    SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
 28.2133 -             l1e_get_intpte(spte), l1e_get_intpte(gpte));
 28.2134 -    *gpte_p = gpte;
 28.2135 -    *spte_p = spte;
 28.2136 -
 28.2137 -    return 1;
 28.2138 -}
 28.2139 -#if CONFIG_PAGING_LEVELS == 2
 28.2140 -static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
 28.2141 -{
 28.2142 -    l1_pgentry_t gpte, spte, orig_gpte;
 28.2143 -    struct vcpu *v = current;
 28.2144 -    struct domain *d = v->domain;
 28.2145 -    l2_pgentry_t gpde;
 28.2146 -
 28.2147 -    spte = l1e_empty();
 28.2148 -
 28.2149 -    SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
 28.2150 -             va, (unsigned long)regs->error_code);
 28.2151 -    perfc_incrc(shadow_fault_calls);
 28.2152 -
 28.2153 -    check_pagetable(v, "pre-sf");
 28.2154 -
 28.2155 -    /*
 28.2156 -     * Don't let someone else take the guest's table pages out-of-sync.
 28.2157 -     */
 28.2158 -    shadow_lock(d);
 28.2159 -
 28.2160 -    /* XXX - FIX THIS COMMENT!!!
 28.2161 -     * STEP 1. Check to see if this fault might have been caused by an
 28.2162 -     *         out-of-sync table page entry, or if we should pass this
 28.2163 -     *         fault onto the guest.
 28.2164 -     */
 28.2165 -    __shadow_sync_va(v, va);
 28.2166 -
 28.2167 -    /*
 28.2168 -     * STEP 2. Check the guest PTE.
 28.2169 -     */
 28.2170 -    __guest_get_l2e(v, va, &gpde);
 28.2171 -    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
 28.2172 -    {
 28.2173 -        SH_VVLOG("shadow_fault - EXIT: L1 not present");
 28.2174 -        perfc_incrc(shadow_fault_bail_pde_not_present);
 28.2175 -        goto fail;
 28.2176 -    }
 28.2177 -
 28.2178 -    // This can't fault because we hold the shadow lock and we've ensured that
 28.2179 -    // the mapping is in-sync, so the check of the PDE's present bit, above,
 28.2180 -    // covers this access.
 28.2181 -    //
 28.2182 -    //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
 28.2183 -    __guest_get_l1e(v, va, &gpte);
 28.2184 -    orig_gpte = gpte;
 28.2185 -
 28.2186 -    if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
 28.2187 -    {
 28.2188 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
 28.2189 -                 l1e_get_intpte(gpte));
 28.2190 -        perfc_incrc(shadow_fault_bail_pte_not_present);
 28.2191 -        goto fail;
 28.2192 -    }
 28.2193 -
 28.2194 -    /* Write fault? */
 28.2195 -    if ( regs->error_code & 2 )
 28.2196 -    {
 28.2197 -        int allow_writes = 0;
 28.2198 -
 28.2199 -        if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
 28.2200 -        {
 28.2201 -            if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
 28.2202 -            {
 28.2203 -                allow_writes = 1;
 28.2204 -                l1e_add_flags(gpte, _PAGE_RW);
 28.2205 -            }
 28.2206 -            else
 28.2207 -            {
 28.2208 -                /* Write fault on a read-only mapping. */
 28.2209 -                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
 28.2210 -                         l1e_get_intpte(gpte));
 28.2211 -                perfc_incrc(shadow_fault_bail_ro_mapping);
 28.2212 -                goto fail;
 28.2213 -            }
 28.2214 -        }
 28.2215 -        else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
 28.2216 -        {
 28.2217 -            SH_LOG("l1pte_write_fault: no write access to page table page");
 28.2218 -            domain_crash_synchronous();
 28.2219 -        }
 28.2220 -
 28.2221 -        if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
 28.2222 -        {
 28.2223 -            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
 28.2224 -            perfc_incrc(write_fault_bail);
 28.2225 -            shadow_unlock(d);
 28.2226 -            return 0;
 28.2227 -        }
 28.2228 -
 28.2229 -        if ( allow_writes )
 28.2230 -            l1e_remove_flags(gpte, _PAGE_RW);
 28.2231 -    }
 28.2232 -    else
 28.2233 -    {
 28.2234 -        if ( !l1pte_read_fault(d, &gpte, &spte) )
 28.2235 -        {
 28.2236 -            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
 28.2237 -            perfc_incrc(read_fault_bail);
 28.2238 -            shadow_unlock(d);
 28.2239 -            return 0;
 28.2240 -        }
 28.2241 -    }
 28.2242 -
 28.2243 -    /*
 28.2244 -     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
 28.2245 -     */
 28.2246 -    if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
 28.2247 -    {
 28.2248 -        /* XXX Watch out for read-only L2 entries! (not used in Linux). */
 28.2249 -        /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
 28.2250 -                                     &gpte, sizeof(gpte))) )*/
 28.2251 -        if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
 28.2252 -        {
 28.2253 -            printk("%s() failed, crashing domain %d "
 28.2254 -                   "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
 28.2255 -                   __func__,d->domain_id, l2e_get_intpte(gpde), va);
 28.2256 -            domain_crash_synchronous();
 28.2257 -        }
 28.2258 -
 28.2259 -        __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
 28.2260 -    }
 28.2261 -
 28.2262 -    shadow_set_l1e(va, spte, 1);
 28.2263 -
 28.2264 -    perfc_incrc(shadow_fault_fixed);
 28.2265 -    d->arch.shadow_fault_count++;
 28.2266 -
 28.2267 -    shadow_unlock(d);
 28.2268 -
 28.2269 -    check_pagetable(v, "post-sf");
 28.2270 -    return EXCRET_fault_fixed;
 28.2271 -
 28.2272 -fail:
 28.2273 -    shadow_unlock(d);
 28.2274 -    return 0;
 28.2275 -}
 28.2276 -#endif /* CONFIG_PAGING_LEVELS == 2 */
 28.2277 -
 28.2278 -static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va)
 28.2279 -{
 28.2280 -    struct domain *d = v->domain;
 28.2281 -    guest_l2_pgentry_t gl2e = {0};
 28.2282 -
 28.2283 -    __guest_get_l2e(v, va, &gl2e);
 28.2284 -    
 28.2285 -    if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) )
 28.2286 -        return INVALID_MFN;
 28.2287 -
 28.2288 -    return gmfn_to_mfn(d, l2e_get_pfn(gl2e));
 28.2289 -}
 28.2290 -
 28.2291 -static int do_update_va_mapping(unsigned long va,
 28.2292 -                                l1_pgentry_t val,
 28.2293 -                                struct vcpu *v)
 28.2294 -{
 28.2295 -    struct domain *d = v->domain;
 28.2296 -    l1_pgentry_t spte;
 28.2297 -    int rc = 0;
 28.2298 -
 28.2299 -    shadow_lock(d);
 28.2300 -
 28.2301 -    // This is actually overkill - we don't need to sync the L1 itself,
 28.2302 -    // just everything involved in getting to this L1 (i.e. we need
 28.2303 -    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
 28.2304 -    //
 28.2305 -    __shadow_sync_va(v, va);
 28.2306 -
 28.2307 -    l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte);
 28.2308 -#if CONFIG_PAGING_LEVELS == 2
 28.2309 -    shadow_set_l1e(va, spte, 0);
 28.2310 -#elif CONFIG_PAGING_LEVELS >= 3
 28.2311 -    shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0);
 28.2312 -#endif
 28.2313 -    /*
 28.2314 -     * If we're in log-dirty mode then we need to note that we've updated
 28.2315 -     * the PTE in the PT-holding page. We need the machine frame number
 28.2316 -     * for this.
 28.2317 -     */
 28.2318 -    __mark_dirty(d, va_to_l1mfn(v, va));
 28.2319 -
 28.2320 -    shadow_unlock(d);
 28.2321 -
 28.2322 -    return rc;
 28.2323 -}
 28.2324 -
 28.2325 -
 28.2326 -/*
 28.2327 - * What lives where in the 32-bit address space in the various shadow modes,
 28.2328 - * and what it uses to get/maintain that mapping.
 28.2329 - *
 28.2330 - * SHADOW MODE:      none         enable         translate         external
 28.2331 - *
 28.2332 - * 4KB things:
 28.2333 - * guest_vtable    lin_l2     mapped per gl2   lin_l2 via hl2   mapped per gl2
 28.2334 - * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gl2
 28.2335 - * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gl2
 28.2336 - * monitor_vtable    n/a            n/a             n/a           mapped once
 28.2337 - *
 28.2338 - * 4MB things:
 28.2339 - * guest_linear  lin via gl2    lin via gl2      lin via hl2      lin via hl2
 28.2340 - * shadow_linear     n/a      sh_lin via sl2   sh_lin via sl2   sh_lin via sl2
 28.2341 - * monitor_linear    n/a            n/a             n/a              ???
 28.2342 - * perdomain      perdomain      perdomain       perdomain        perdomain
 28.2343 - * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
 28.2344 - * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
 28.2345 - * P2M               n/a            n/a           R/O M2P          R/O M2P
 28.2346 - *
 28.2347 - * NB:
 28.2348 - * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
 28.2349 - * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
 28.2350 - * all play a part in maintaining these mappings.
 28.2351 - */
 28.2352 -static void shadow_update_pagetables(struct vcpu *v)
 28.2353 -{
 28.2354 -    struct domain *d = v->domain;
 28.2355 -#if CONFIG_PAGING_LEVELS == 4
 28.2356 -    unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
 28.2357 -                          pagetable_get_pfn(v->arch.guest_table) :
 28.2358 -                          pagetable_get_pfn(v->arch.guest_table_user));
 28.2359 -#else
 28.2360 -    unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
 28.2361 -#endif
 28.2362 -
 28.2363 -    unsigned long gpfn = mfn_to_gmfn(d, gmfn);
 28.2364 -    unsigned long smfn, old_smfn;
 28.2365 -
 28.2366 -#if CONFIG_PAGING_LEVELS == 2
 28.2367 -    unsigned long hl2mfn;
 28.2368 -#endif
 28.2369 -    int need_sync = 0;
 28.2370 -
 28.2371 -    int max_mode = ( shadow_mode_external(d) ? SHM_external
 28.2372 -                     : shadow_mode_translate(d) ? SHM_translate
 28.2373 -                     : shadow_mode_enabled(d) ? SHM_enable
 28.2374 -                     : 0 );
 28.2375 -
 28.2376 -    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
 28.2377 -    ASSERT( max_mode );
 28.2378 -
 28.2379 -    /*
 28.2380 -     *  arch.guest_vtable
 28.2381 -     */
 28.2382 -    if ( max_mode & (SHM_enable | SHM_external) )
 28.2383 -    {
 28.2384 -        if ( likely(v->arch.guest_vtable != NULL) )
 28.2385 -            unmap_domain_page_global(v->arch.guest_vtable);
 28.2386 -        v->arch.guest_vtable = map_domain_page_global(gmfn);
 28.2387 -    }
 28.2388 -
 28.2389 -    /*
 28.2390 -     *  arch.shadow_table
 28.2391 -     */
 28.2392 -#if CONFIG_PAGING_LEVELS == 3 && defined (GUEST_PGENTRY_32)
 28.2393 -    /*
 28.2394 -     * We use PGT_l4_shadow for 2-level paging guests on PAE
 28.2395 -     */
 28.2396 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
 28.2397 -    { 
 28.2398 -        if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
 28.2399 -            smfn = shadow_l3_table(v, gpfn, gmfn);
 28.2400 -    } 
 28.2401 -    else
 28.2402 -#endif
 28.2403 -
 28.2404 -#if CONFIG_PAGING_LEVELS == 3 && defined ( GUEST_32PAE )
 28.2405 -    /*
 28.2406 -     * We also use PGT_l4_shadow for 3-level (PAE) paging guests
 28.2407 -     */
 28.2408 -    if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
 28.2409 -    {
 28.2410 -        if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
 28.2411 -            smfn = shadow_l3_table(v, gpfn, gmfn);
 28.2412 -        else
 28.2413 -        {
 28.2414 -            update_top_level_shadow(v, smfn);
 28.2415 -            need_sync = 1;
 28.2416 -        }
 28.2417 -    }
 28.2418 -    else
 28.2419 -#endif
 28.2420 -    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) 
 28.2421 -    {
 28.2422 -#if CONFIG_PAGING_LEVELS == 2
 28.2423 -        smfn = shadow_l2_table(v, gpfn, gmfn);
 28.2424 -#elif CONFIG_PAGING_LEVELS == 3
 28.2425 -        smfn = shadow_l3_table(v, gpfn, gmfn);
 28.2426 -#elif CONFIG_PAGING_LEVELS == 4
 28.2427 -        smfn = shadow_l4_table(v, gpfn, gmfn);
 28.2428 -#endif
 28.2429 -    }
 28.2430 -    else
 28.2431 -    {
 28.2432 -#if CONFIG_PAGING_LEVELS >= 3
 28.2433 -        if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
 28.2434 -            update_top_level_shadow(v, smfn);
 28.2435 -#endif
 28.2436 -        /*
 28.2437 -         *  defer the sync until later, to avoid this smfn occasionally
 28.2438 -         *  being unshadowed
 28.2439 -         */
 28.2440 -        need_sync = 1;
 28.2441 -    }
 28.2442 -
 28.2443 -
 28.2444 -    if ( !get_shadow_ref(smfn) )
 28.2445 -        BUG();
 28.2446 -    old_smfn = pagetable_get_pfn(v->arch.shadow_table);
 28.2447 -    v->arch.shadow_table = pagetable_from_pfn(smfn);
 28.2448 -    if ( old_smfn )
 28.2449 -        put_shadow_ref(old_smfn);
 28.2450 -
 28.2451 -    SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
 28.2452 -
 28.2453 -    /*
 28.2454 -     * arch.shadow_vtable
 28.2455 -     */
 28.2456 -    if ( max_mode == SHM_external
 28.2457 -#if CONFIG_PAGING_LEVELS >=3
 28.2458 -         || max_mode & SHM_enable
 28.2459 -#endif
 28.2460 -        )
 28.2461 -    {
 28.2462 -        if ( v->arch.shadow_vtable )
 28.2463 -            unmap_domain_page_global(v->arch.shadow_vtable);
 28.2464 -        v->arch.shadow_vtable = map_domain_page_global(smfn);
 28.2465 -    }
 28.2466 -
 28.2467 -#if CONFIG_PAGING_LEVELS == 2
 28.2468 -    /*
 28.2469 -     * arch.hl2_vtable
 28.2470 -     */
 28.2471 -
 28.2472 -    // if max_mode == SHM_translate, then the hl2 is already installed
 28.2473 -    // correctly in its smfn, and there's nothing to do.
 28.2474 -    //
 28.2475 -    if ( max_mode == SHM_external )
 28.2476 -    {
 28.2477 -        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
 28.2478 -            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
 28.2479 -        if ( v->arch.hl2_vtable )
 28.2480 -            unmap_domain_page_global(v->arch.hl2_vtable);
 28.2481 -        v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
 28.2482 -    }
 28.2483 -
 28.2484 -    /*
 28.2485 -     * fixup pointers in monitor table, as necessary
 28.2486 -     */
 28.2487 -    if ( max_mode == SHM_external )
 28.2488 -    {
 28.2489 -        l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
 28.2490 -        l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
 28.2491 -        l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
 28.2492 -
 28.2493 -        ASSERT( shadow_mode_translate(d) );
 28.2494 -
 28.2495 -        if ( !get_shadow_ref(hl2mfn) )
 28.2496 -            BUG();
 28.2497 -        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
 28.2498 -            l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
 28.2499 -        if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
 28.2500 -            put_shadow_ref(l2e_get_pfn(old_hl2e));
 28.2501 -
 28.2502 -        if ( !get_shadow_ref(smfn) )
 28.2503 -            BUG();
 28.2504 -        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
 28.2505 -            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
 28.2506 -        if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
 28.2507 -            put_shadow_ref(l2e_get_pfn(old_sl2e));
 28.2508 -
 28.2509 -        // XXX - maybe this can be optimized somewhat??
 28.2510 -        local_flush_tlb();
 28.2511 -    }
 28.2512 -#endif /* CONFIG_PAGING_LEVELS == 2 */
 28.2513 -
 28.2514 -#if CONFIG_PAGING_LEVELS == 3
 28.2515 -    /*
 28.2516 -     * fixup pointers in monitor table, as necessary
 28.2517 -     */
 28.2518 -    if ( max_mode == SHM_external )
 28.2519 -    {
 28.2520 -        l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable;
 28.2521 -        l2_pgentry_t *spl2e;
 28.2522 -        unsigned long s2mfn;
 28.2523 -        int i;
 28.2524 - 
 28.2525 -        ASSERT( shadow_mode_translate(d) );
 28.2526 -        s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
 28.2527 - 
 28.2528 -        ASSERT( s2mfn);
 28.2529 -        spl2e = map_domain_page(s2mfn);
 28.2530 - 
 28.2531 -        for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
 28.2532 -            spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
 28.2533 -                (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
 28.2534 -                l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
 28.2535 -                l2e_empty();
 28.2536 - 
 28.2537 -        unmap_domain_page(spl2e);
 28.2538 -        local_flush_tlb();
 28.2539 -    }
 28.2540 -#endif
 28.2541 -
 28.2542 -    if ( likely(need_sync) )
 28.2543 -        shadow_sync_all(d);
 28.2544 -}
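
When shadow_update_pagetables() installs the new shadow root above, it takes a reference on the new table before dropping the reference held via the old one, so neither table's count can dip to zero while a pointer to it still exists. A small standalone sketch of that acquire-before-release swap pattern follows; the refcounted type and helpers are illustrative assumptions, not the hypervisor's shadow refcounting API.

#include <assert.h>
#include <stdlib.h>

/* Illustrative refcounted object, standing in for a shadow table. */
struct table {
    int refs;
};

static void table_get(struct table *t) { t->refs++; }

static void table_put(struct table *t)
{
    assert(t->refs > 0);
    if ( --t->refs == 0 )
        free(t);
}

/* Point *slot at new_t: take the new reference first, then release the
 * old one, so neither table is ever briefly unreferenced. */
static void install_table(struct table **slot, struct table *new_t)
{
    struct table *old = *slot;

    table_get(new_t);
    *slot = new_t;
    if ( old != NULL )
        table_put(old);
}

int main(void)
{
    struct table *a = calloc(1, sizeof(*a));
    struct table *b = calloc(1, sizeof(*b));
    struct table *slot = NULL;

    assert(a != NULL && b != NULL);
    table_get(a);               /* the caller's own references          */
    table_get(b);

    install_table(&slot, a);    /* slot takes its own ref on a          */
    install_table(&slot, b);    /* ref on b taken, then ref on a dropped */
    assert(a->refs == 1 && b->refs == 2);

    table_put(a);               /* drop the caller's references         */
    table_put(b);
    table_put(slot);            /* finally drop the slot's reference    */
    return 0;
}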
 28.2545 -
 28.2546 -
 28.2547 -/************************************************************************/
 28.2548 -/************************************************************************/
 28.2549 -/************************************************************************/
 28.2550 -
 28.2551 -#if 0 // this code has not been updated for 32pae & 64 bit modes
 28.2552 -#if SHADOW_DEBUG
 28.2553 -
 28.2554 -// The following is entirely for _check_pagetable()'s benefit.
 28.2555 -// _check_pagetable() wants to know whether a given entry in a
 28.2556 -// shadow page table is supposed to be the shadow of the guest's
 28.2557 -// current entry, or the shadow of the entry held in the snapshot
 28.2558 -// taken above.
 28.2559 -//
 28.2560 -// Here, we mark all currently existing entries as reflecting
 28.2561 -// the snapshot, above.  All other places in xen that update
 28.2562 -// the shadow will keep the shadow in sync with the guest's
 28.2563 -// entries (via l1pte_propagate_from_guest and friends), which clear
 28.2564 -// the SHADOW_REFLECTS_SNAPSHOT bit.
 28.2565 -//
 28.2566 -static void
 28.2567 -mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
 28.2568 -{
 28.2569 -    unsigned long smfn;
 28.2570 -    l1_pgentry_t *l1e;
 28.2571 -    l2_pgentry_t *l2e;
 28.2572 -    unsigned i;
 28.2573 -
 28.2574 -    if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
 28.2575 -    {
 28.2576 -        l1e = map_domain_page(smfn);
 28.2577 -        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
 28.2578 -            if ( is_guest_l1_slot(i) &&
 28.2579 -                 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
 28.2580 -                l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
 28.2581 -        unmap_domain_page(l1e);
 28.2582 -    }
 28.2583 -
 28.2584 -    if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
 28.2585 -    {
 28.2586 -        l2e = map_domain_page(smfn);
 28.2587 -        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
 28.2588 -            if ( is_guest_l2_slot(0, i) &&
 28.2589 -                 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
 28.2590 -                l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
 28.2591 -        unmap_domain_page(l2e);
 28.2592 -    }
 28.2593 -}
 28.2594 -
 28.2595 -// BUG: these are not SMP safe...
 28.2596 -static int sh_l2_present;
 28.2597 -static int sh_l1_present;
 28.2598 -static char *sh_check_name;
 28.2599 -// int shadow_status_noswap; // declared in shadow32.c
 28.2600 -
 28.2601 -#define v2m(_v, _adr) ({                                                     \
 28.2602 -    unsigned long _a  = (unsigned long)(_adr);                               \
 28.2603 -    l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)];     \
 28.2604 -    unsigned long _pa = -1;                                                  \
 28.2605 -    if ( l2e_get_flags(_pde) & _PAGE_PRESENT )                               \
 28.2606 -    {                                                                        \
 28.2607 -        l1_pgentry_t _pte;                                                   \
 28.2608 -        _pte = shadow_linear_pg_table[l1_linear_offset(_a)];                 \
 28.2609 -        if ( l1e_get_flags(_pte) & _PAGE_PRESENT )                           \
 28.2610 -            _pa = l1e_get_paddr(_pte);                                       \
 28.2611 -    }                                                                        \
 28.2612 -    _pa | (_a & ~PAGE_MASK);                                                 \
 28.2613 -})
 28.2614 -
 28.2615 -#define FAIL(_f, _a...)                                                      \
 28.2616 -    do {                                                                     \
 28.2617 -        printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n",                  \
 28.2618 -               sh_check_name, level, l2_idx, l1_idx, ## _a,                  \
 28.2619 -               __FILE__, __LINE__);                                          \
 28.2620 -        printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte                \
 28.2621 -               " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte               \
 28.2622 -               " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p"               \
 28.2623 -               " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n",                   \
 28.2624 -               l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte),     \
 28.2625 -               l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte),     \
 28.2626 -               p_guest_pte, p_shadow_pte, p_snapshot_pte,                    \
 28.2627 -               (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte),    \
 28.2628 -               (void *)v2m(v, p_snapshot_pte),                               \
 28.2629 -               (l2_idx << L2_PAGETABLE_SHIFT) |                              \
 28.2630 -               (l1_idx << L1_PAGETABLE_SHIFT));                              \
 28.2631 -        errors++;                                                            \
 28.2632 -    } while ( 0 )
 28.2633 -
 28.2634 -static int check_pte(
 28.2635 -    struct vcpu *v,
 28.2636 -    l1_pgentry_t *p_guest_pte,
 28.2637 -    l1_pgentry_t *p_shadow_pte,
 28.2638 -    l1_pgentry_t *p_snapshot_pte,
 28.2639 -    int level, int l2_idx, int l1_idx)
 28.2640 -{
 28.2641 -    struct domain *d = v->domain;
 28.2642 -    l1_pgentry_t guest_pte = *p_guest_pte;
 28.2643 -    l1_pgentry_t shadow_pte = *p_shadow_pte;
 28.2644 -    l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
 28.2645 -    l1_pgentry_t eff_guest_pte;
 28.2646 -    unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
 28.2647 -    int errors = 0, guest_writable;
 28.2648 -    int page_table_page;
 28.2649 -
 28.2650 -    if ( (l1e_get_intpte(shadow_pte) == 0) ||
 28.2651 -         (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
 28.2652 -         (l1e_get_intpte(shadow_pte) == 0x00000E00) )
 28.2653 -        return errors;  /* always safe */
 28.2654 -
 28.2655 -    if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
 28.2656 -        FAIL("Non-zero but not-present shadow_pte");
 28.2657 -
 28.2658 -    if ( level == 2 ) sh_l2_present++;
 28.2659 -    if ( level == 1 ) sh_l1_present++;
 28.2660 -
 28.2661 -    if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
 28.2662 -        eff_guest_pte = snapshot_pte;
 28.2663 -    else
 28.2664 -        eff_guest_pte = guest_pte;
 28.2665 -
 28.2666 -    if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
 28.2667 -        FAIL("Guest not present yet shadow is");
 28.2668 -
 28.2669 -    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
 28.2670 -
 28.2671 -    if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
 28.2672 -        FAIL("Corrupt?");
 28.2673 -
 28.2674 -    if ( (level == 1) &&
 28.2675 -         (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
 28.2676 -         !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
 28.2677 -        FAIL("Dirty coherence");
 28.2678 -
 28.2679 -    if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
 28.2680 -         !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
 28.2681 -        FAIL("Accessed coherence");
 28.2682 -
 28.2683 -    if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
 28.2684 -        FAIL("global bit set in shadow");
 28.2685 -
 28.2686 -    eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
 28.2687 -    eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
 28.2688 -    shadow_mfn = l1e_get_pfn(shadow_pte);
 28.2689 -
 28.2690 -    if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
 28.2691 -        FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
 28.2692 -             __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
 28.2693 -
 28.2694 -    page_table_page = mfn_is_page_table(eff_guest_mfn);
 28.2695 -
 28.2696 -    guest_writable =
 28.2697 -        (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
 28.2698 -        (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
 28.2699 -
 28.2700 -    if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
 28.2701 -    {
 28.2702 -        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
 28.2703 -               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
 28.2704 -               mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
 28.2705 -               page_table_page);
 28.2706 -        FAIL("RW coherence");
 28.2707 -    }
 28.2708 -
 28.2709 -    if ( (level == 1) &&
 28.2710 -         (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
 28.2711 -         !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
 28.2712 -    {
 28.2713 -        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
 28.2714 -               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
 28.2715 -               mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
 28.2716 -               page_table_page);
 28.2717 -        FAIL("RW2 coherence");
 28.2718 -    }
 28.2719 -
 28.2720 -    if ( eff_guest_mfn == shadow_mfn )
 28.2721 -    {
 28.2722 -        if ( level > 1 )
 28.2723 -            FAIL("Linear map ???");    /* XXX this will fail on BSD */
 28.2724 -    }
 28.2725 -    else
 28.2726 -    {
 28.2727 -        if ( level < 2 )
 28.2728 -            FAIL("Shadow in L1 entry?");
 28.2729 -
 28.2730 -        if ( level == 2 )
 28.2731 -        {
 28.2732 -            if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
 28.2733 -                FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
 28.2734 -                     __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
 28.2735 -        }
 28.2736 -        else
 28.2737 -            BUG(); // XXX -- not handled yet.
 28.2738 -    }
 28.2739 -
 28.2740 -    return errors;
 28.2741 -}
 28.2742 -#undef FAIL
 28.2743 -#undef v2m
 28.2744 -
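
The coherence test at the heart of check_pte() above compares the shadow PTE against the effective guest PTE after masking out everything the shadow is allowed to change: the frame address plus the ACCESSED, DIRTY, RW, GLOBAL and AVAIL bits. Below is a minimal standalone sketch of that comparison, using the standard x86 flag values as local constants rather than Xen's headers (the names here are illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    /* Standard x86 PTE flag values, written out locally for illustration. */
    #define F_PRESENT   0x001ULL
    #define F_RW        0x002ULL
    #define F_USER      0x004ULL
    #define F_ACCESSED  0x020ULL
    #define F_DIRTY     0x040ULL
    #define F_GLOBAL    0x100ULL
    #define F_AVAIL     0xe00ULL
    #define FRAME_MASK  (~0xfffULL)   /* the PAGE_MASK part: frame address bits */

    /* Keep only the bits the shadow must hold coherent with the guest. */
    static int pte_coherent(uint64_t guest_pte, uint64_t shadow_pte)
    {
        uint64_t ignore = F_GLOBAL | F_DIRTY | F_ACCESSED | F_RW | F_AVAIL | FRAME_MASK;
        return (guest_pte & ~ignore) == (shadow_pte & ~ignore);
    }

    int main(void)
    {
        uint64_t g = 0xabcde000ULL | F_PRESENT | F_RW | F_USER | F_ACCESSED;
        uint64_t s = 0x12345000ULL | F_PRESENT | F_USER;  /* RW stripped, new frame */
        printf("coherent: %d\n", pte_coherent(g, s));     /* prints 1 */
        return 0;
    }

Anything that differs outside the ignored bits (for example PRESENT, USER or the caching attributes) is what the audit reports as "Corrupt?".
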
 28.2745 -static int check_l1_table(
 28.2746 -    struct vcpu *v, unsigned long gpfn,
 28.2747 -    unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
 28.2748 -{
 28.2749 -    struct domain *d = v->domain;
 28.2750 -    int i;
 28.2751 -    unsigned long snapshot_mfn;
 28.2752 -    l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
 28.2753 -    int errors = 0;
 28.2754 -
 28.2755 -    if ( page_out_of_sync(mfn_to_page(gmfn)) )
 28.2756 -    {
 28.2757 -        snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
 28.2758 -        ASSERT(snapshot_mfn);
 28.2759 -        p_snapshot = map_domain_page(snapshot_mfn);
 28.2760 -    }
 28.2761 -
 28.2762 -    p_guest  = map_domain_page(gmfn);
 28.2763 -    p_shadow = map_domain_page(smfn);
 28.2764 -
 28.2765 -    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
 28.2766 -        errors += check_pte(v, p_guest+i, p_shadow+i,
 28.2767 -                            p_snapshot ? p_snapshot+i : NULL,
 28.2768 -                            1, l2_idx, i);
 28.2769 -
 28.2770 -    unmap_domain_page(p_shadow);
 28.2771 -    unmap_domain_page(p_guest);
 28.2772 -    if ( p_snapshot )
 28.2773 -        unmap_domain_page(p_snapshot);
 28.2774 -
 28.2775 -    return errors;
 28.2776 -}
 28.2777 -
 28.2778 -#define FAILPT(_f, _a...)                                         \
 28.2779 -    do {                                                          \
 28.2780 -        printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
 28.2781 -        errors++;                                                 \
 28.2782 -    } while ( 0 )
 28.2783 -
 28.2784 -static int check_l2_table(
 28.2785 -    struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
 28.2786 -{
 28.2787 -    struct domain *d = v->domain;
 28.2788 -    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
 28.2789 -    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
 28.2790 -    l2_pgentry_t match;
 28.2791 -    int i;
 28.2792 -    int errors = 0;
 28.2793 -    int limit;
 28.2794 -
 28.2795 -    if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
 28.2796 -        FAILPT("domain doesn't own page");
 28.2797 -    if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
 28.2798 -        FAILPT("bogus owner for snapshot page");
 28.2799 -    if ( page_get_owner(mfn_to_page(smfn)) != NULL )
 28.2800 -        FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
 28.2801 -               smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
 28.2802 -
 28.2803 -#if 0
 28.2804 -    if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
 28.2805 -                &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
 28.2806 -                ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
 28.2807 -                 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
 28.2808 -    {
 28.2809 -        for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
 28.2810 -              i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
 28.2811 -              i++ )
 28.2812 -            printk("+++ (%d) %lx %lx\n",i,
 28.2813 -                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
 28.2814 -        FAILPT("hypervisor entries inconsistent");
 28.2815 -    }
 28.2816 -
 28.2817 -    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
 28.2818 -          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
 28.2819 -        FAILPT("hypervisor linear map inconsistent");
 28.2820 -#endif
 28.2821 -
 28.2822 -    match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
 28.2823 -    if ( !shadow_mode_external(d) &&
 28.2824 -         l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
 28.2825 -                         match, PAGE_FLAG_MASK))
 28.2826 -    {
 28.2827 -        FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
 28.2828 -               l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
 28.2829 -                                   L2_PAGETABLE_SHIFT]),
 28.2830 -               l2e_get_intpte(match));
 28.2831 -    }
 28.2832 -
 28.2833 -    match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
 28.2834 -    if ( !shadow_mode_external(d) &&
 28.2835 -         l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
 28.2836 -                         match, PAGE_FLAG_MASK))
 28.2837 -    {
 28.2838 -        FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
 28.2839 -               l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
 28.2840 -               d->arch.mm_perdomain_pt,
 28.2841 -               l2e_get_intpte(match));
 28.2842 -    }
 28.2843 -
 28.2844 -#if CONFIG_PAGING_LEVELS == 2
 28.2845 -    if ( shadow_mode_external(d) )
 28.2846 -        limit = L2_PAGETABLE_ENTRIES;
 28.2847 -    else
 28.2848 -        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
 28.2849 -#else
 28.2850 -    limit = 0; /* XXX x86/64 XXX */
 28.2851 -#endif
 28.2852 -
 28.2853 -    /* Check the whole L2. */
 28.2854 -    for ( i = 0; i < limit; i++ )
 28.2855 -        errors += check_pte(v,
 28.2856 -                            (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
 28.2857 -                            (l1_pgentry_t*)(&spl2e[i]),
 28.2858 -                            NULL,
 28.2859 -                            2, i, 0);
 28.2860 -
 28.2861 -    unmap_domain_page(spl2e);
 28.2862 -    unmap_domain_page(gpl2e);
 28.2863 -
 28.2864 -#if 1
 28.2865 -    if ( errors )
 28.2866 -        printk("check_l2_table returning %d errors\n", errors);
 28.2867 -#endif
 28.2868 -
 28.2869 -    return errors;
 28.2870 -}
 28.2871 -#undef FAILPT
 28.2872 -
 28.2873 -int _check_pagetable(struct vcpu *v, char *s)
 28.2874 -{
 28.2875 -    struct domain *d = v->domain;
 28.2876 -#if CONFIG_PAGING_LEVELS == 4
 28.2877 -    pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
 28.2878 -                      v->arch.guest_table : v->arch.guest_table_user);
 28.2879 -#else
 28.2880 -    pagetable_t pt = v->arch.guest_table;
 28.2881 -#endif
 28.2882 -    unsigned long gptbase = pagetable_get_paddr(pt);
 28.2883 -    unsigned long ptbase_pfn, smfn;
 28.2884 -    unsigned long i;
 28.2885 -    l2_pgentry_t *gpl2e, *spl2e;
 28.2886 -    unsigned long ptbase_mfn = 0;
 28.2887 -    int errors = 0, limit, oos_pdes = 0;
 28.2888 -
 28.2889 -    //_audit_domain(d, AUDIT_QUIET);
 28.2890 -    shadow_lock(d);
 28.2891 -
 28.2892 -    sh_check_name = s;
 28.2893 -    //SH_VVLOG("%s-PT Audit", s);
 28.2894 -    sh_l2_present = sh_l1_present = 0;
 28.2895 -    perfc_incrc(check_pagetable);
 28.2896 -
 28.2897 -    ptbase_mfn = gptbase >> PAGE_SHIFT;
 28.2898 -    ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
 28.2899 -
 28.2900 -    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
 28.2901 -    {
 28.2902 -        printk("%s-PT %lx not shadowed\n", s, gptbase);
 28.2903 -        goto out;
 28.2904 -    }
 28.2905 -    if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
 28.2906 -    {
 28.2907 -        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
 28.2908 -        oos_pdes = 1;
 28.2909 -        ASSERT(ptbase_mfn);
 28.2910 -    }
 28.2911 -
 28.2912 -    errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
 28.2913 -
 28.2914 -    gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
 28.2915 -    spl2e = (l2_pgentry_t *) map_domain_page(smfn);
 28.2916 -
 28.2917 -    /* Go back and recurse. */
 28.2918 -#if CONFIG_PAGING_LEVELS == 2
 28.2919 -    if ( shadow_mode_external(d) )
 28.2920 -        limit = L2_PAGETABLE_ENTRIES;
 28.2921 -    else
 28.2922 -        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
 28.2923 -#else
 28.2924 -    limit = 0; /* XXX x86/64 XXX */
 28.2925 -#endif
 28.2926 -
 28.2927 -    for ( i = 0; i < limit; i++ )
 28.2928 -    {
 28.2929 -        unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
 28.2930 -        unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
 28.2931 -        unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
 28.2932 -
 28.2933 -        if ( l2e_get_intpte(spl2e[i]) != 0 )  /* FIXME: check flags? */
 28.2934 -        {
 28.2935 -            errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
 28.2936 -        }
 28.2937 -    }
 28.2938 -
 28.2939 -    unmap_domain_page(spl2e);
 28.2940 -    unmap_domain_page(gpl2e);
 28.2941 -
 28.2942 -#if 0
 28.2943 -    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
 28.2944 -             sh_l2_present, sh_l1_present);
 28.2945 -#endif
 28.2946 -
 28.2947 - out:
 28.2948 -    if ( errors )
 28.2949 -        BUG();
 28.2950 -
 28.2951 -    shadow_unlock(d);
 28.2952 -
 28.2953 -    return errors;
 28.2954 -}
 28.2955 -
 28.2956 -int _check_all_pagetables(struct vcpu *v, char *s)
 28.2957 -{
 28.2958 -    struct domain *d = v->domain;
 28.2959 -    int i;
 28.2960 -    struct shadow_status *a;
 28.2961 -    unsigned long gmfn;
 28.2962 -    int errors = 0;
 28.2963 -
 28.2964 -    shadow_status_noswap = 1;
 28.2965 -
 28.2966 -    sh_check_name = s;
 28.2967 -    SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
 28.2968 -    sh_l2_present = sh_l1_present = 0;
 28.2969 -    perfc_incrc(check_all_pagetables);
 28.2970 -
 28.2971 -    for (i = 0; i < shadow_ht_buckets; i++)
 28.2972 -    {
 28.2973 -        a = &d->arch.shadow_ht[i];
 28.2974 -        while ( a && a->gpfn_and_flags )
 28.2975 -        {
 28.2976 -            gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
 28.2977 -
 28.2978 -            switch ( a->gpfn_and_flags & PGT_type_mask )
 28.2979 -            {
 28.2980 -            case PGT_l1_shadow:
 28.2981 -                errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
 28.2982 -                                         gmfn, a->smfn, 0);
 28.2983 -                break;
 28.2984 -            case PGT_l2_shadow:
 28.2985 -                errors += check_l2_table(v, gmfn, a->smfn,
 28.2986 -                                         page_out_of_sync(mfn_to_page(gmfn)));
 28.2987 -                break;
 28.2988 -            case PGT_l3_shadow:
 28.2989 -            case PGT_l4_shadow:
 28.2990 -            case PGT_hl2_shadow:
 28.2991 -                BUG(); // XXX - ought to fix this...
 28.2992 -                break;
 28.2993 -            case PGT_snapshot:
 28.2994 -            case PGT_writable_pred:
 28.2995 -                break;
 28.2996 -            default:
 28.2997 -                errors++;
 28.2998 -                printk("unexpected shadow type %lx, gpfn=%lx, "
 28.2999 -                       "gmfn=%lx smfn=%lx\n",
 28.3000 -                       a->gpfn_and_flags & PGT_type_mask,
 28.3001 -                       a->gpfn_and_flags & PGT_mfn_mask,
 28.3002 -                       gmfn, a->smfn);
 28.3003 -                BUG();
 28.3004 -            }
 28.3005 -            a = a->next;
 28.3006 -        }
 28.3007 -    }
 28.3008 -
 28.3009 -    shadow_status_noswap = 0;
 28.3010 -
 28.3011 -    if ( errors )
 28.3012 -        BUG();
 28.3013 -
 28.3014 -    return errors;
 28.3015 -}
 28.3016 -
 28.3017 -#endif // SHADOW_DEBUG
 28.3018 -#endif // this code has not been updated for 32pae & 64 bit modes
 28.3019 -
 28.3020 -#if CONFIG_PAGING_LEVELS >= 3
 28.3021 -/****************************************************************************/
 28.3022 -/* 64-bit shadow-mode code testing */
 28.3023 -/****************************************************************************/
 28.3024 -/*
 28.3025 - * init_bl2() is for a 32-bit VMX guest on a 64-bit host.
 28.3026 - * It uses 1 shadow L4 (shared with the L3) and 4 shadow L2s to simulate the guest L2.
 28.3027 - */
 28.3028 -static inline unsigned long init_bl2(
 28.3029 -    struct domain *d, unsigned long gpfn, unsigned long gmfn)
 28.3030 -{
 28.3031 -    unsigned int count;
 28.3032 -    unsigned long sl2mfn;
 28.3033 -    unsigned long smfn;
 28.3034 -    struct page_info *page;
 28.3035 -    l4_pgentry_t *spl4e;
 28.3036 -    void *l2;
 28.3037 -
 28.3038 -    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
 28.3039 -    {
 28.3040 -        printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
 28.3041 -        /* XXX Deal gracefully with failure. */
 28.3042 -        domain_crash_synchronous();
 28.3043 -    }
 28.3044 -
 28.3045 -    spl4e = (l4_pgentry_t *)map_domain_page(smfn);
 28.3046 -
 28.3047 -    /* Map the self entry, L4&L3 share the same page */
 28.3048 -    spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
 28.3049 -
 28.3050 -    /* Allocate 4 shadow L2s */
 28.3051 -    page = alloc_domheap_pages(NULL, SL2_ORDER, 0);
 28.3052 -    if ( !page )
 28.3053 -        domain_crash_synchronous();
 28.3054 -
 28.3055 -    for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ )
 28.3056 -    {
 28.3057 -        sl2mfn = page_to_mfn(page+count);
 28.3058 -        l2 = map_domain_page(sl2mfn);
 28.3059 -        memset(l2, 0, PAGE_SIZE);
 28.3060 -        unmap_domain_page(l2);
 28.3061 -        spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT);
 28.3062 -    }
 28.3063 -
 28.3064 -    unmap_domain_page(spl4e);
 28.3065 -
 28.3066 -    return smfn;
 28.3067 -}
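
init_bl2() above gives a 32-bit non-PAE guest one top-level shadow page (the L4 and L3 share it via the self entry) whose first four slots point at four zeroed shadow L2 pages: 4 x 512 x 2MB of shadow coverage matches the guest's single 1024 x 4MB L2. A toy model of that layout follows, using calloc'd arrays in place of domheap pages and plain pointers in place of MFNs; all names here are illustrative, not Xen's:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ENTRIES_PER_TABLE 512   /* entries per 64-bit page-table page */
    #define BL2_SHADOW_L2S    4     /* 4 x 512 x 2MB = 4GB = 1024 x 4MB guest slots */

    typedef struct { uint64_t e[ENTRIES_PER_TABLE]; } pt_page_t;

    int main(void)
    {
        /* One top-level page stands in for the shared L4/L3; its first four
         * slots point at four zeroed shadow L2 pages, as in init_bl2(). */
        pt_page_t *top = calloc(1, sizeof(*top));
        pt_page_t *l2[BL2_SHADOW_L2S];

        for (int i = 0; i < BL2_SHADOW_L2S; i++) {
            l2[i] = calloc(1, sizeof(*l2[i]));
            top->e[i] = (uint64_t)(uintptr_t)l2[i] | 1;   /* "present" */
        }

        printf("guest L2 coverage: %d MB, shadow coverage: %d MB\n",
               1024 * 4, BL2_SHADOW_L2S * ENTRIES_PER_TABLE * 2);

        for (int i = 0; i < BL2_SHADOW_L2S; i++)
            free(l2[i]);
        free(top);
        return 0;
    }
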
 28.3068 -
 28.3069 -static inline unsigned long init_l3(
 28.3070 -    struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
 28.3071 -{
 28.3072 -    unsigned long smfn;
 28.3073 -    l4_pgentry_t *spl4e;
 28.3074 -    unsigned long index;
 28.3075 -
 28.3076 -    if ( unlikely(!(smfn = alloc_shadow_page(v->domain, gpfn, gmfn, PGT_l4_shadow))) )
 28.3077 -    {
 28.3078 -        printk("Couldn't alloc an L4 shadow for pfn= %lx mfn= %lx\n", gpfn, gmfn);
 28.3079 -        BUG(); /* XXX Deal gracefully with failure. */
 28.3080 -    }
 28.3081 -
 28.3082 -    /* Map the self entry, L4&L3 share the same page */
 28.3083 -    spl4e = (l4_pgentry_t *)map_domain_page(smfn);
 28.3084 -
 28.3085 -    /*
 28.3086 -     * Shadow L4's pfn_info->tlbflush_timestamp
 28.3087 -     * should also save its own index.
 28.3088 -     */
 28.3089 -
 28.3090 -    index = get_cr3_idxval(v);
 28.3091 -    frame_table[smfn].tlbflush_timestamp = index;
 28.3092 -
 28.3093 -    memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
 28.3094 -    spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
 28.3095 -    unmap_domain_page(spl4e);
 28.3096 -    return smfn;
 28.3097 -}
 28.3098 -#endif
 28.3099 -
 28.3100 -#if CONFIG_PAGING_LEVELS == 3
 28.3101 -static unsigned long shadow_l3_table(
 28.3102 -    struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
 28.3103 -{
 28.3104 -    unsigned long smfn;
 28.3105 -    l3_pgentry_t *spl3e;
 28.3106 -    struct domain *d = v->domain;
 28.3107 -
 28.3108 -    perfc_incrc(shadow_l3_table_count);
 28.3109 -
 28.3110 -    SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
 28.3111 -
 28.3112 -    if ( SH_L1_HAS_NEXT_PAGE &&
 28.3113 -         d->arch.ops->guest_paging_levels == PAGING_L2 )
 28.3114 -    {
 28.3115 -        return init_bl2(d, gpfn, gmfn);
 28.3116 -    }
 28.3117 -
 28.3118 -    if ( SH_GUEST_32PAE &&
 28.3119 -         d->arch.ops->guest_paging_levels == PAGING_L3 )
 28.3120 -    {
 28.3121 -        return init_l3(v, gpfn, gmfn);
 28.3122 -    }
 28.3123 -
 28.3124 -    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) )
 28.3125 -    {
 28.3126 -            printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
 28.3127 -            BUG(); /* XXX Deal gracefully with failure. */
 28.3128 -    }
 28.3129 -
 28.3130 -    spl3e = (l3_pgentry_t *)map_domain_page(smfn);
 28.3131 -
 28.3132 -    /* Make the self entry */
 28.3133 -    spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR);
 28.3134 -
 28.3135 -    if ( (PGT_base_page_table == PGT_l3_page_table) &&
 28.3136 -         !shadow_mode_external(d) ) {
 28.3137 -        int i;
 28.3138 -        unsigned long g2mfn, s2mfn;
 28.3139 -        l2_pgentry_t *spl2e;
 28.3140 -        l3_pgentry_t *gpl3e;
 28.3141 -
 28.3142 -        /* Get the top entry */
 28.3143 -        gpl3e = (l3_pgentry_t *)map_domain_page(gmfn);
 28.3144 -
 28.3145 -        if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) )
 28.3146 -        {
 28.3147 -            BUG();
 28.3148 -        }
 28.3149 -
 28.3150 -        g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]);
 28.3151 -
 28.3152 -        /* NB. g2mfn should be the same as g2pfn */
 28.3153 -        if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) {
 28.3154 -            if ( unlikely(!(s2mfn =
 28.3155 -                    alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) {
 28.3156 -                printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
 28.3157 -                    g2mfn, g2mfn);
 28.3158 -                BUG(); /* XXX Deal gracefully with failure. */
 28.3159 -            }
 28.3160 -        } 
 28.3161 -
 28.3162 -        if (!get_shadow_ref(s2mfn))
 28.3163 -            BUG();
 28.3164 -            
 28.3165 -        /* Map shadow L2 into shadow L3 */
 28.3166 -        spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT);
 28.3167 -        shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1);
 28.3168 -
 28.3169 -        /*  
 28.3170 -         * Xen private mappings. Do similar things to
 28.3171 -         * create_pae_xen_mappings().
 28.3172 -         */
 28.3173 -        spl2e = (l2_pgentry_t *)map_domain_page(s2mfn);
 28.3174 -
 28.3175 -        /*
 28.3176 -         * When we free L2 pages, we need to tell if the page contains
 28.3177 -         * Xen private mappings. Use the va_mask part.
 28.3178 -         */
 28.3179 -        mfn_to_page(s2mfn)->u.inuse.type_info |= 
 28.3180 -            (unsigned long) 3 << PGT_score_shift;
 28.3181 -
 28.3182 -        memset(spl2e, 0, 
 28.3183 -               (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t));
 28.3184 -
 28.3185 -        memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
 28.3186 -           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
 28.3187 -           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));       
 28.3188 -
 28.3189 -        for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
 28.3190 -            spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
 28.3191 -                l2e_from_page(
 28.3192 -                    virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i, 
 28.3193 -                    __PAGE_HYPERVISOR);
 28.3194 -        for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
 28.3195 -            spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
 28.3196 -                (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ?
 28.3197 -                l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) :
 28.3198 -                l2e_empty();
 28.3199 -       
 28.3200 -        unmap_domain_page(spl2e);
 28.3201 -        unmap_domain_page(gpl3e);
 28.3202 -    }
 28.3203 -    unmap_domain_page(spl3e);
 28.3204 -
 28.3205 -    return smfn;
 28.3206 -}
 28.3207 -#endif /* CONFIG_PAGING_LEVELS == 3 */
 28.3208 -
 28.3209 -#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
 28.3210 -static unsigned long gva_to_gpa_pae(unsigned long gva)
 28.3211 -{
 28.3212 -    BUG();
 28.3213 -    return 43;
 28.3214 -}
 28.3215 -#endif
 28.3216 -
 28.3217 -#if CONFIG_PAGING_LEVELS == 4
 28.3218 -static unsigned long shadow_l4_table(
 28.3219 -  struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
 28.3220 -{
 28.3221 -    unsigned long smfn;
 28.3222 -    l4_pgentry_t *spl4e;
 28.3223 -    struct domain *d = v->domain;
 28.3224 -
 28.3225 -    SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
 28.3226 -
 28.3227 -    perfc_incrc(shadow_l4_table_count);
 28.3228 -
 28.3229 -    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
 28.3230 -    {
 28.3231 -        return init_bl2(d, gpfn, gmfn);
 28.3232 -    }
 28.3233 -
 28.3234 -    if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
 28.3235 -    {
 28.3236 -        return init_l3(v, gpfn, gmfn);
 28.3237 -    }
 28.3238 -
 28.3239 -    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
 28.3240 -    {
 28.3241 -        printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
 28.3242 -        BUG(); /* XXX Deal gracefully with failure. */
 28.3243 -    }
 28.3244 -
 28.3245 -    spl4e = (l4_pgentry_t *)map_domain_page(smfn);
 28.3246 -
 28.3247 -    /* Install hypervisor and 4x linear p.t. mappings. */
 28.3248 -    if ( (PGT_base_page_table == PGT_l4_page_table) &&
 28.3249 -      !shadow_mode_external(d) )
 28.3250 -    {
 28.3251 -        /*
 28.3252 -         * We could proactively fill in PDEs for pages that are already
 28.3253 -         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
 28.3254 -         * (restriction required for coherence of the accessed bit). However,
 28.3255 -         * we tried it and it didn't help performance. This is simpler.
 28.3256 -         */
 28.3257 -        memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
 28.3258 -
 28.3259 -        /* Install hypervisor and 2x linear p.t. mappings. */
 28.3260 -        memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
 28.3261 -           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
 28.3262 -           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
 28.3263 -
 28.3264 -        spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
 28.3265 -            l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3),
 28.3266 -                            __PAGE_HYPERVISOR);
 28.3267 -
 28.3268 -        if ( shadow_mode_translate(d) ) // NB: not external
 28.3269 -        {
 28.3270 -            spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
 28.3271 -                l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
 28.3272 -                                __PAGE_HYPERVISOR);
 28.3273 -        }
 28.3274 -        else
 28.3275 -            spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
 28.3276 -                l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
 28.3277 -
 28.3278 -    } else
 28.3279 -        memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
 28.3280 -
 28.3281 -    unmap_domain_page(spl4e);
 28.3282 -
 28.3283 -    ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
 28.3284 -    return smfn;
 28.3285 -}
 28.3286 -#endif /* CONFIG_PAGING_LEVELS == 4 */
 28.3287 -
 28.3288 -#if CONFIG_PAGING_LEVELS >= 3
 28.3289 -static void 
 28.3290 -update_top_level_shadow(struct vcpu *v, unsigned long smfn)
 28.3291 -{
 28.3292 -    unsigned long index = get_cr3_idxval(v);
 28.3293 -    pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn);
 28.3294 -    pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable;
 28.3295 -    int i;
 28.3296 -
 28.3297 -    for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
 28.3298 -    {
 28.3299 -        unsigned long gpfn;
 28.3300 -
 28.3301 -        /*
 28.3302 -         * Looks like it's no longer a page table. 
 28.3303 -         */
 28.3304 -        if ( unlikely(entry_get_value(gple[index*4+i]) & PAE_PDPT_RESERVED) )
 28.3305 -        {
 28.3306 -            if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
 28.3307 -                put_shadow_ref(entry_get_pfn(sple[i]));
 28.3308 -
 28.3309 -            sple[i] = entry_empty();
 28.3310 -            continue;
 28.3311 -        }
 28.3312 -
 28.3313 -        gpfn = entry_get_pfn(gple[index*4+i]);
 28.3314 -
 28.3315 -        if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
 28.3316 -        {
 28.3317 -            if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
 28.3318 -                put_shadow_ref(entry_get_pfn(sple[i]));
 28.3319 -
 28.3320 -            sple[i] = entry_empty();
 28.3321 -            continue;
 28.3322 -        }
 28.3323 -
 28.3324 -        validate_entry_change(
 28.3325 -            v->domain, &gple[index*4+i], &sple[i], PAGING_L3);
 28.3326 -    }
 28.3327 -
 28.3328 -    unmap_domain_page(sple);
 28.3329 -}
 28.3330 -
 28.3331 -/*
 28.3332 - * validate_bl2e_change()
 28.3333 - * The code is for a 32-bit HVM guest on a 64-bit host.
 28.3334 - * It is used to sync the guest L2.
 28.3335 - */
 28.3336 -
 28.3337 -static inline void
 28.3338 -validate_bl2e_change(
 28.3339 -    struct domain *d,
 28.3340 -    guest_root_pgentry_t *new_gle_p,
 28.3341 -    pgentry_64_t *shadow_l3,
 28.3342 -    int index)
 28.3343 -{
 28.3344 -    int sl3_idx, sl2_idx;
 28.3345 -    unsigned long sl2mfn, sl1mfn;
 28.3346 -    pgentry_64_t *sl2_p;
 28.3347 -
 28.3348 -    /* Use the guest L2 PTE index to get the shadow L3 & L2 indices.
 28.3349 -     * index: 0 ~ 1023, PAGETABLE_ENTRIES: 512
 28.3350 -     */
 28.3351 -    sl3_idx = index / (PAGETABLE_ENTRIES / 2);
 28.3352 -    sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2;
 28.3353 -
 28.3354 -    sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]);
 28.3355 -    sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn);
 28.3356 -
 28.3357 -    validate_pde_change(
 28.3358 -        d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]);
 28.3359 -
 28.3360 -    /* Mapping the second l1 shadow page */
 28.3361 -    if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) {
 28.3362 -       sl1mfn = entry_get_pfn(sl2_p[sl2_idx]);
 28.3363 -       sl2_p[sl2_idx + 1] =
 28.3364 -            entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx]));
 28.3365 -    }
 28.3366 -    else
 28.3367 -        sl2_p[sl2_idx + 1] = (pgentry_64_t){0};
 28.3368 -    unmap_domain_page(sl2_p);
 28.3369 -
 28.3370 -}
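
The index arithmetic above folds the guest's 1024 L2 slots into the 4 x 512-entry shadow L2s set up by init_bl2(): each guest slot owns an even shadow slot, and the following odd slot holds the second half of the guest L1 (a 1024-entry guest L1 page is shadowed by two consecutive 512-entry shadow L1 pages). A small sketch of just that mapping, mirroring the sl3_idx/sl2_idx computation:

    #include <stdio.h>

    #define PAGETABLE_ENTRIES 512   /* entries per 64-bit page-table page */

    /* Map a 32-bit guest L2 index (0..1023) to the shadow L3 slot that holds
     * the relevant shadow L2 page, and to the even shadow L2 slot within it.
     * Mirrors the arithmetic in validate_bl2e_change(). */
    static void bl2_index(int guest_l2_idx, int *sl3_idx, int *sl2_idx)
    {
        *sl3_idx = guest_l2_idx / (PAGETABLE_ENTRIES / 2);        /* 0..3   */
        *sl2_idx = (guest_l2_idx % (PAGETABLE_ENTRIES / 2)) * 2;  /* even, 0..510 */
    }

    int main(void)
    {
        int sl3, sl2;
        for (int g = 0; g < 1024; g += 300) {
            bl2_index(g, &sl3, &sl2);
            printf("guest L2[%4d] -> shadow L3[%d], shadow L2[%d] and [%d]\n",
                   g, sl3, sl2, sl2 + 1);
        }
        return 0;
    }
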
 28.3371 -
 28.3372 -/*
 28.3373 - * This variant of shadow_mark_va_out_of_sync() is for 2M page shadows.
 28.3374 - */
 28.3375 -static void shadow_mark_va_out_of_sync_2mp(
 28.3376 -  struct vcpu *v, unsigned long gpfn, unsigned long mfn, paddr_t writable_pl1e)
 28.3377 -{
 28.3378 -    struct out_of_sync_entry *entry =
 28.3379 -      shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
 28.3380 -
 28.3381 -    entry->writable_pl1e = writable_pl1e;
 28.3382 -    ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
 28.3383 -    if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
 28.3384 -        BUG();
 28.3385 -}
 28.3386 -
 28.3387 -static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
 28.3388 -{
 28.3389 -    unsigned long gmfn;
 28.3390 -    if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
 28.3391 -    {
 28.3392 -        /* This is NOT already shadowed so we need to shadow it. */
 28.3393 -        SH_VVLOG("<get_shadow_mfn>: not shadowed");
 28.3394 -
 28.3395 -        gmfn = gmfn_to_mfn(d, gpfn);
 28.3396 -        if ( unlikely(!VALID_MFN(gmfn)) )
 28.3397 -        {
 28.3398 -            // Attempt to use an invalid pfn as a shadow page.
 28.3399 -            // XXX this needs to be more graceful!
 28.3400 -            BUG();
 28.3401 -        }
 28.3402 -
 28.3403 -        if ( unlikely(!(*spmfn =
 28.3404 -                  alloc_shadow_page(d, gpfn, gmfn, flag))) )
 28.3405 -        {
 28.3406 -            printk("<get_shadow_mfn> Couldn't alloc a shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
 28.3407 -            BUG(); /* XXX Need to deal gracefully with failure. */
 28.3408 -        }
 28.3409 -        switch(flag) {
 28.3410 -            case PGT_l1_shadow:
 28.3411 -                perfc_incrc(shadow_l1_table_count);
 28.3412 -                break;
 28.3413 -            case PGT_l2_shadow:
 28.3414 -                perfc_incrc(shadow_l2_table_count);
 28.3415 -                break;
 28.3416 -            case PGT_l3_shadow:
 28.3417 -                perfc_incrc(shadow_l3_table_count);
 28.3418 -                break;
 28.3419 -            case PGT_hl2_shadow:
 28.3420 -                perfc_incrc(shadow_hl2_table_count);
 28.3421 -                break;
 28.3422 -        }
 28.3423 -
 28.3424 -        return 1;
 28.3425 -    } else {
 28.3426 -        /* This L1 is shadowed already, but the L2 entry is missing. */
 28.3427 -        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
 28.3428 -        return 0;
 28.3429 -    }
 28.3430 -}
 28.3431 -
 28.3432 -static void shadow_map_into_current(struct vcpu *v,
 28.3433 -  unsigned long va, unsigned int from, unsigned int to)
 28.3434 -{
 28.3435 -    pgentry_64_t gle = {0}, sle;
 28.3436 -    unsigned long gpfn, smfn;
 28.3437 -
 28.3438 -    if (from == PAGING_L1 && to == PAGING_L2) {
 28.3439 -        shadow_map_l1_into_current_l2(va);
 28.3440 -        return;
 28.3441 -    }
 28.3442 -
 28.3443 -    __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
 28.3444 -    ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
 28.3445 -    gpfn = entry_get_pfn(gle);
 28.3446 -
 28.3447 -    get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
 28.3448 -
 28.3449 -    if ( !get_shadow_ref(smfn) )
 28.3450 -        BUG();
 28.3451 -    entry_general(v->domain, &gle, &sle, smfn, to);
 28.3452 -    __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
 28.3453 -    __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
 28.3454 -}
 28.3455 -
 28.3456 -/*
 28.3457 - * shadow_set_lxe should be put in shadow.h
 28.3458 - */
 28.3459 -static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
 28.3460 -  int create_l2_shadow, int put_ref_check)
 28.3461 -{
 28.3462 -    struct vcpu *v = current;
 28.3463 -    l4_pgentry_t sl4e;
 28.3464 -    l3_pgentry_t sl3e;
 28.3465 -
 28.3466 -    __shadow_get_l4e(v, va, &sl4e);
 28.3467 -    if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
 28.3468 -        if (create_l2_shadow) {
 28.3469 -            perfc_incrc(shadow_set_l3e_force_map);
 28.3470 -            shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
 28.3471 -            __shadow_get_l4e(v, va, &sl4e);
 28.3472 -        } else {
 28.3473 -            printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
 28.3474 -        }
 28.3475 -    }
 28.3476 -
 28.3477 -    __shadow_get_l3e(v, va, &sl3e);
 28.3478 -    if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
 28.3479 -        if (create_l2_shadow) {
 28.3480 -            perfc_incrc(shadow_set_l2e_force_map);
 28.3481 -            shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
 28.3482 -            __shadow_get_l3e(v, va, &sl3e);
 28.3483 -        } else {
 28.3484 -            printk("For non-HVM shadow, create_l2_shadow:%d\n", create_l2_shadow);
 28.3485 -        }
 28.3486 -
 28.3487 -        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L4 )
 28.3488 -            shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
 28.3489 -    }
 28.3490 -
 28.3491 -    if ( put_ref_check ) {
 28.3492 -        l2_pgentry_t tmp_sl2e;
 28.3493 -        if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
 28.3494 -            if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
 28.3495 -                if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
 28.3496 -                    put_shadow_ref(l2e_get_pfn(sl2e));
 28.3497 -                }
 28.3498 -        }
 28.3499 -
 28.3500 -    }
 28.3501 -
 28.3502 -    if ( !__shadow_set_l2e(v, va, &sl2e) )
 28.3503 -        BUG();
 28.3504 -    shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
 28.3505 -}
 28.3506 -
 28.3507 -
 28.3508 -/* As 32-bit guests don't support 4M pages yet,
 28.3509 - * we don't bother compiling this function separately for them.
 28.3510 - */
 28.3511 -static inline int l2e_rw_fault(
 28.3512 -    struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
 28.3513 -{
 28.3514 -    struct domain *d = v->domain;
 28.3515 -    l2_pgentry_t gl2e = *gl2e_p;
 28.3516 -    l2_pgentry_t tmp_l2e = gl2e;
 28.3517 -    unsigned long start_gpfn = l2e_get_pfn(gl2e);
 28.3518 -    unsigned long gpfn, mfn;
 28.3519 -    unsigned long l1_mfn, gmfn;
 28.3520 -    l1_pgentry_t *l1_p;
 28.3521 -    l1_pgentry_t sl1e;
 28.3522 -    l1_pgentry_t old_sl1e;
 28.3523 -    l2_pgentry_t sl2e;
 28.3524 -#ifdef __x86_64__
 28.3525 -    u64 nx = 0;
 28.3526 -#endif
 28.3527 -    int put_ref_check = 0;
 28.3528 -    /* Check if gpfn is 2M aligned */
 28.3529 -
 28.3530 -    /* Update guest l2e */
 28.3531 -    if (rw) {
 28.3532 -        ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
 28.3533 -        l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
 28.3534 -    } else {
 28.3535 -        l2e_add_flags(gl2e, _PAGE_ACCESSED);
 28.3536 -    }
 28.3537 -
 28.3538 -    l2e_remove_flags(tmp_l2e, _PAGE_PSE);
 28.3539 -    if (l2e_get_flags(gl2e) & _PAGE_NX) {
 28.3540 -        l2e_remove_flags(tmp_l2e, _PAGE_NX);
 28.3541 -#ifdef __x86_64__
 28.3542 -        nx = PGT_high_mfn_nx;
 28.3543 -#endif
 28.3544 -    }
 28.3545 -
 28.3546 -
 28.3547 -    /* Get the shadow l2 first */
 28.3548 -    if ( !__shadow_get_l2e(v, va, &sl2e) )
 28.3549 -        sl2e = l2e_empty();
 28.3550 -
 28.3551 -#ifdef __x86_64__
 28.3552 -    l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
 28.3553 -#else
 28.3554 -    l1_mfn = __shadow_status(d, start_gpfn, PGT_fl1_shadow);
 28.3555 -#endif
 28.3556 -
 28.3557 -    /* Check the corresponding l2e */
 28.3558 -    if (l1_mfn) {
 28.3559 -        /* Why is it PRESENT? */
 28.3560 -        if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
 28.3561 -                l2e_get_pfn(sl2e) == l1_mfn) {
 28.3562 -            ESH_LOG("sl2e PRESENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
 28.3563 -        } else {
 28.3564 -            put_ref_check = 1;
 28.3565 -            if (!get_shadow_ref(l1_mfn))
 28.3566 -                BUG();
 28.3567 -        }
 28.3568 -        l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
 28.3569 -        sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
 28.3570 -    } else {
 28.3571 -        /* Allocate a new page as shadow page table if need */
 28.3572 -        gmfn = gmfn_to_mfn(d, start_gpfn);
 28.3573 -#ifdef __x86_64__
 28.3574 -        l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
 28.3575 -#else
 28.3576 -        l1_mfn = alloc_shadow_page(d, start_gpfn, gmfn, PGT_fl1_shadow);
 28.3577 -#endif
 28.3578 -        if (unlikely(!l1_mfn)) {
 28.3579 -            BUG();
 28.3580 -        }
 28.3581 -
 28.3582 -        if (!get_shadow_ref(l1_mfn))
 28.3583 -            BUG();
 28.3584 -        l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn );
 28.3585 -        sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
 28.3586 -        memset(l1_p, 0, PAGE_SIZE);
 28.3587 -        ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
 28.3588 -    }
 28.3589 -
 28.3590 -    ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
 28.3591 -    /* Map the page into the L2 */
 28.3592 -    shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
 28.3593 -
 28.3594 -    if (l2e_get_flags(gl2e) & _PAGE_NX)
 28.3595 -        l2e_add_flags(tmp_l2e, _PAGE_NX);
 28.3596 -
 28.3597 -    /* Propagate the shadow page table, i.e. set each sl1e */
 28.3598 -    for (gpfn = start_gpfn;
 28.3599 -      gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
 28.3600 -
 28.3601 -        mfn = gmfn_to_mfn(d, gpfn);
 28.3602 -
 28.3603 -        if ( unlikely(!VALID_MFN(mfn)) )
 28.3604 -        {
 28.3605 -            continue;
 28.3606 -        }
 28.3607 -
 28.3608 -        sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
 28.3609 -
 28.3610 -        if (!rw) {
 28.3611 -            if ( shadow_mode_log_dirty(d) ||
 28.3612 -              !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
 28.3613 -            {
 28.3614 -                l1e_remove_flags(sl1e, _PAGE_RW);
 28.3615 -            }
 28.3616 -        } else {
 28.3617 -            /* __mark_dirty(d, gmfn); */
 28.3618 -        }
 28.3619 -       // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
 28.3620 -        /* The shadow entries need to be set up before shadow_mark_va_out_of_sync() */
 28.3621 -        old_sl1e = l1_p[gpfn - start_gpfn];
 28.3622 -
 28.3623 -        if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
 28.3624 -        {
 28.3625 -            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
 28.3626 -              !shadow_get_page_from_l1e(sl1e, d) ) {
 28.3627 -                ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn);
 28.3628 -                sl1e = l1e_empty();
 28.3629 -            }
 28.3630 -            if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
 28.3631 -                put_page_from_l1e(old_sl1e, d);
 28.3632 -        }
 28.3633 -
 28.3634 -        if (rw) {
 28.3635 -            /* shadow_mark_va_out_of_sync() needs modification for 2M pages */
 28.3636 -            if ( mfn_is_page_table(mfn) )
 28.3637 -                shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
 28.3638 -                  l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
 28.3639 -        }
 28.3640 -
 28.3641 -        l1_p[gpfn - start_gpfn] = sl1e;
 28.3642 -    }
 28.3643 -
 28.3644 -    unmap_domain_page(l1_p);
 28.3645 -    *gl2e_p = gl2e;
 28.3646 -    return 1;
 28.3647 -}
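
l2e_rw_fault() above splinters a guest superpage PDE into a flat shadow L1: one 4KB entry per guest frame, with RW withheld on the read-fault path whenever the domain is in log-dirty mode, the guest PDE is not yet dirty, or the frame backs a guest page table, so the first real write still traps. A rough standalone sketch of that propagation loop follows; it skips the pfn-to-mfn translation and the out-of-sync bookkeeping, and the helper names are made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define PTES_PER_L1   512
    #define F_RW          0x002ULL
    #define F_DIRTY       0x040ULL
    #define F_PSE         0x080ULL

    /* Splinter one guest superpage PDE into per-4KB shadow L1 entries, as on a
     * read fault: every frame gets an entry, but RW is withheld when the domain
     * is in log-dirty mode, the guest PDE is not yet dirty, or the frame itself
     * backs a guest page table. */
    static void splinter_superpage(uint64_t gl2e_flags, unsigned long gpfn0,
                                   int log_dirty, int (*is_pt)(unsigned long),
                                   uint64_t sl1[PTES_PER_L1])
    {
        uint64_t flags = (gl2e_flags & 0xfffULL) & ~F_PSE;  /* 4K entries: no PSE */

        for (int i = 0; i < PTES_PER_L1; i++) {
            unsigned long gpfn = gpfn0 + i;
            uint64_t e = ((uint64_t)gpfn << 12) | flags;

            if (log_dirty || !(gl2e_flags & F_DIRTY) || is_pt(gpfn))
                e &= ~F_RW;                 /* force read-only: first write re-faults */

            sl1[i] = e;
        }
    }

    static int never_a_page_table(unsigned long pfn) { (void)pfn; return 0; }

    int main(void)
    {
        static uint64_t sl1[PTES_PER_L1];
        splinter_superpage(0x1ULL | F_RW | F_DIRTY | F_PSE, 0x1000, 0,
                           never_a_page_table, sl1);
        printf("sl1[0] = %#llx, sl1[511] = %#llx\n",
               (unsigned long long)sl1[0], (unsigned long long)sl1[511]);
        return 0;
    }
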
 28.3648 -
 28.3649 -/*
 28.3650 - * Check P, R/W, U/S bits in the guest page table.
 28.3651 - * If the fault belongs to guest return 1,
 28.3652 - * else return 0.
 28.3653 - */
 28.3654 -#if defined( GUEST_PGENTRY_32 )
 28.3655 -static inline int guest_page_fault(
 28.3656 -    struct vcpu *v,
 28.3657 -    unsigned long va, unsigned int error_code,
 28.3658 -    guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
 28.3659 -{
 28.3660 -    /* The following checks are for a 32-bit guest on a 64-bit host */
 28.3661 -
 28.3662 -    __guest_get_l2e(v, va, gpl2e);
 28.3663 -
 28.3664 -    /* Check the guest L2 page-table entry first*/
 28.3665 -    if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) )
 28.3666 -        return 1;
 28.3667 -
 28.3668 -    if ( error_code & ERROR_W ) 
 28.3669 -    {
 28.3670 -        if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) )
 28.3671 -            return 1;
 28.3672 -    }
 28.3673 -
 28.3674 -    if ( error_code & ERROR_U ) 
 28.3675 -    {
 28.3676 -        if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) )
 28.3677 -            return 1;
 28.3678 -    }
 28.3679 -
 28.3680 -    if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE )
 28.3681 -    {
 28.3682 -        printk("Non-PAE HVM guests cannot use PSE, "
 28.3683 -               "because we don't support 4MByte PSE pages.\n");
 28.3684 -        printk("remove pae=1 from your config file.\n");
 28.3685 -        domain_crash_synchronous();
 28.3686 -        return 0;
 28.3687 -    }
 28.3688 -
 28.3689 -    __guest_get_l1e(v, va, gpl1e);
 28.3690 -
 28.3691 -    /* Then check the guest L1 page-table entry */
 28.3692 -    if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) )
 28.3693 -        return 1;
 28.3694 -
 28.3695 -    if ( error_code & ERROR_W ) 
 28.3696 -    {
 28.3697 -        if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) )
 28.3698 -            return 1;
 28.3699 -    }
 28.3700 -
 28.3701 -    if ( error_code & ERROR_U ) 
 28.3702 -    {
 28.3703 -        if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) )
 28.3704 -            return 1;
 28.3705 -    }
 28.3706 -
 28.3707 -    return 0;
 28.3708 -}
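
This guest walk only decides whether the fault should be reflected to the guest: a not-present entry, a write through a non-writable entry, or a user access through a supervisor-only entry at either level means the guest owns the fault. The hardware encodes the access in the page-fault error code (bit 0 present/protection, bit 1 write, bit 2 user). A compact sketch of the per-entry test, with the flag and error-code bit values written out locally rather than taken from Xen's headers:

    #include <stdint.h>
    #include <stdio.h>

    #define F_PRESENT 0x001U
    #define F_RW      0x002U
    #define F_USER    0x004U

    #define ERR_W     0x2U   /* error code bit 1: fault was a write */
    #define ERR_U     0x4U   /* error code bit 2: fault came from user mode */

    /* Return 1 if a fault with this error code must be handed to the guest,
     * given the flags of one guest page-table entry on the walk. */
    static int entry_faults_guest(uint32_t flags, uint32_t error_code)
    {
        if (!(flags & F_PRESENT))
            return 1;
        if ((error_code & ERR_W) && !(flags & F_RW))
            return 1;
        if ((error_code & ERR_U) && !(flags & F_USER))
            return 1;
        return 0;
    }

    int main(void)
    {
        /* user write through a supervisor-only entry -> guest owns the fault */
        printf("%d\n", entry_faults_guest(F_PRESENT | F_RW, ERR_W | ERR_U));
        /* kernel read through a present, writable entry -> shadow's problem */
        printf("%d\n", entry_faults_guest(F_PRESENT | F_RW, 0));
        return 0;
    }
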
 28.3709 -#else
 28.3710 -static inline int guest_page_fault(
 28.3711 -    struct vcpu *v,
 28.3712 -    unsigned long va, unsigned int error_code,
 28.3713 -    guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
 28.3714 -{
 28.3715 -    struct domain *d = v->domain;
 28.3716 -    pgentry_64_t gle = { 0 };
 28.3717 -    unsigned long gpfn = 0, mfn;
 28.3718 -    int i;
 28.3719 -    unsigned int base_idx = 0;
 28.3720 -    base_idx = get_cr3_idxval(v);
 28.3721 -
 28.3722 -    ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 );
 28.3723 -
 28.3724 -#if CONFIG_PAGING_LEVELS >= 3
 28.3725 -    if ( (error_code & (ERROR_I | ERROR_P)) == (ERROR_I | ERROR_P) )
 28.3726 -        return 1;
 28.3727 -#endif
 28.3728 -
 28.3729 -#if CONFIG_PAGING_LEVELS == 4
 28.3730 -    if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
 28.3731 -    {
 28.3732 -        __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4);
 28.3733 -        if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
 28.3734 -            return 1;
 28.3735 -
 28.3736 -        if ( error_code & ERROR_W )
 28.3737 -        {
 28.3738 -            if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
 28.3739 -                return 1;
 28.3740 -        }
 28.3741 -
 28.3742 -        if ( error_code & ERROR_U )
 28.3743 -        {
 28.3744 -            if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
 28.3745 -                return 1;
 28.3746 -        }
 28.3747 -        gpfn = entry_get_pfn(gle);
 28.3748 -    }
 28.3749 -#endif
 28.3750 -
 28.3751 -#if CONFIG_PAGING_LEVELS >= 3
 28.3752 -    if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
 28.3753 -    {
 28.3754 -        if ( SH_GUEST_32PAE )
 28.3755 -            gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT;
 28.3756 -        else
 28.3757 -            gpfn = pagetable_get_pfn(v->arch.guest_table);
 28.3758 -    }
 28.3759 -#endif
 28.3760 -
 28.3761 -    for ( i = PAGING_L3; i >= PAGING_L1; i-- )
 28.3762 -    {
 28.3763 -        pgentry_64_t *lva;
 28.3764 -        /*
 28.3765 -         * If it's not external mode, then mfn should be machine physical.
 28.3766 -         */
 28.3767 -        mfn = gmfn_to_mfn(d, gpfn);
 28.3768 -
 28.3769 -        lva = (pgentry_64_t *) map_domain_page(mfn);
 28.3770 -        gle = lva[guest_table_offset_64(va, i, base_idx)];
 28.3771 -
 28.3772 -        unmap_domain_page(lva);
 28.3773 -
 28.3774 -        gpfn = entry_get_pfn(gle);
 28.3775 -
 28.3776 -        if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
 28.3777 -            return 1;
 28.3778 -
 28.3779 -        if ( i < PAGING_L3 ||
 28.3780 -             d->arch.ops->guest_paging_levels == PAGING_L4 )
 28.3781 -        {
 28.3782 -            if ( error_code & ERROR_W )
 28.3783 -            {
 28.3784 -                if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
 28.3785 -                {
 28.3786 -                    if ( i == PAGING_L1 )
 28.3787 -                        if ( gpl1e )
 28.3788 -                            gpl1e->l1 = gle.lo;
 28.3789 -                    return 1;
 28.3790 -                }
 28.3791 -            }
 28.3792 -            if ( error_code & ERROR_U )
 28.3793 -            {
 28.3794 -                if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
 28.3795 -                    return 1;
 28.3796 -            }
 28.3797 -        }
 28.3798 -
 28.3799 -        if ( i == PAGING_L2 )
 28.3800 -        {
 28.3801 -            if ( gpl2e )
 28.3802 -                gpl2e->l2 = gle.lo;
 28.3803 -            if ( likely(entry_get_flags(gle) & _PAGE_PSE) )
 28.3804 -                return 0;
 28.3805 -        }
 28.3806 -
 28.3807 -        if ( i == PAGING_L1 )
 28.3808 -            if ( gpl1e )
 28.3809 -                gpl1e->l1 = gle.lo;
 28.3810 -    }
 28.3811 -
 28.3812 -    return 0;
 28.3813 -
 28.3814 -}
 28.3815 -#endif
 28.3816 -
 28.3817 -static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
 28.3818 -{
 28.3819 -    struct vcpu *v = current;
 28.3820 -    struct domain *d = v->domain;
 28.3821 -    guest_l2_pgentry_t gl2e;
 28.3822 -    guest_l1_pgentry_t gl1e, orig_gl1e;
 28.3823 -    l1_pgentry_t sl1e;
 28.3824 -
 28.3825 -    gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty();
 28.3826 -
 28.3827 -    sl1e = l1e_empty();
 28.3828 -
 28.3829 -    perfc_incrc(shadow_fault_calls);
 28.3830 -
 28.3831 -    ESH_LOG("<shadow_fault_64> va=%lx,  rip = %lx, error code = %x\n",
 28.3832 -            va, regs->eip, regs->error_code);
 28.3833 -
 28.3834 -    /*
 28.3835 -     * Don't let someone else take the guest's table pages out-of-sync.
 28.3836 -     */
 28.3837 -    shadow_lock(d);
 28.3838 -
 28.3839 -    /*
 28.3840 -     * STEP 1. Check to see if this fault might have been caused by an
 28.3841 -     *         out-of-sync table page entry, or if we should pass this
 28.3842 -     *         fault onto the guest.
 28.3843 -     */
 28.3844 -    __shadow_sync_va(v, va);
 28.3845 -
 28.3846 -    /*
 28.3847 -     * STEP 2. Check if the fault belongs to guest
 28.3848 -     */
 28.3849 -    if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) ) 
 28.3850 -    {
 28.3851 -        if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 )
 28.3852 -            goto check_writeable;
 28.3853 -        
 28.3854 -        goto fail;
 28.3855 -    }
 28.3856 -
 28.3857 -    if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) ) 
 28.3858 -        goto pse;
 28.3859 -
 28.3860 -    /*
 28.3861 -     * Handle 4K pages here
 28.3862 -     */
 28.3863 -check_writeable:
 28.3864 -    orig_gl1e = gl1e;
 28.3865 -    
 28.3866 -    /* Write fault? */
 28.3867 -    if ( regs->error_code & 2 ) 
 28.3868 -    {
 28.3869 -        int allow_writes = 0;
 28.3870 -
 28.3871 -        if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) )
 28.3872 -        {
 28.3873 -            if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) )
 28.3874 -            {
 28.3875 -                allow_writes = 1;
 28.3876 -                l1e_add_flags(gl1e, _PAGE_RW);
 28.3877 -            }
 28.3878 -            else
 28.3879 -            {
 28.3880 -                /* Write fault on a read-only mapping. */
 28.3881 -                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", 
 28.3882 -                         l1e_get_intpte(gl1e));
 28.3883 -                perfc_incrc(shadow_fault_bail_ro_mapping);
 28.3884 -                goto fail;
 28.3885 -            }
 28.3886 -        }
 28.3887 -
 28.3888 -        if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) ) 
 28.3889 -        {
 28.3890 -            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
 28.3891 -            perfc_incrc(write_fault_bail);
 28.3892 -            shadow_unlock(d);
 28.3893 -            return 0;
 28.3894 -        }
 28.3895 - 
 28.3896 -        if (allow_writes)
 28.3897 -            l1e_remove_flags(gl1e, _PAGE_RW);
 28.3898 -    }
 28.3899 -    else 
 28.3900 -    {
 28.3901 -        if ( !l1pte_read_fault(d, &gl1e, &sl1e) )
 28.3902 -        {
 28.3903 -            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
 28.3904 -            perfc_incrc(read_fault_bail);
 28.3905 -            shadow_unlock(d);
 28.3906 -            return 0;
 28.3907 -        }
 28.3908 -    }
 28.3909 -
 28.3910 -    /*
 28.3911 -     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables
 28.3912 -     */
 28.3913 -    if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) )
 28.3914 -    {
 28.3915 -        if (unlikely(!__guest_set_l1e(v, va, &gl1e))) 
 28.3916 -            domain_crash_synchronous();
 28.3917 -
 28.3918 -        __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e)));
 28.3919 -    }
 28.3920 -
 28.3921 -    shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
 28.3922 -
 28.3923 -    perfc_incrc(shadow_fault_fixed);
 28.3924 -    d->arch.shadow_fault_count++;
 28.3925 -
 28.3926 -    shadow_unlock(d);
 28.3927 -
 28.3928 -    return EXCRET_fault_fixed;
 28.3929 -
 28.3930 -pse:
 28.3931 -    /*
 28.3932 -     * Handle 2M pages here
 28.3933 -     */
 28.3934 -    if ( unlikely(!shadow_mode_external(d)) )
 28.3935 -        BUG();
 28.3936 -
 28.3937 -    /* Write fault? */
 28.3938 -    if ( regs->error_code & 2 ) 
 28.3939 -    {
 28.3940 -        if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) ) 
 28.3941 -        {
 28.3942 -            goto fail;
 28.3943 -        }
 28.3944 -    } 
 28.3945 -    else 
 28.3946 -    {
 28.3947 -        l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT);
 28.3948 -    }
 28.3949 -
 28.3950 -    /*
 28.3951 -     * STEP 3. Write guest/shadow l2e back
 28.3952 -     */
 28.3953 -
 28.3954 -    if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) ) 
 28.3955 -    {
 28.3956 -        domain_crash_synchronous();
 28.3957 -    }
 28.3958 -
 28.3959 -    /*
 28.3960 -     * Todo: if necessary, record the page table page as dirty
 28.3961 -     */
 28.3962 -
 28.3963 -    perfc_incrc(shadow_fault_fixed);
 28.3964 -    d->arch.shadow_fault_count++;
 28.3965 -
 28.3966 -    shadow_unlock(d);
 28.3967 -
 28.3968 -    return EXCRET_fault_fixed;
 28.3969 -fail:
 28.3970 -    shadow_unlock(d);
 28.3971 -    ESH_LOG("Guest fault~~~\n");
 28.3972 -    return 0;
 28.3973 -}
 28.3974 -
 28.3975 -static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
 28.3976 -{
 28.3977 -    struct domain *d = v->domain;
 28.3978 -    l1_pgentry_t  sl1e, old_sl1e;
 28.3979 -
 28.3980 -    shadow_lock(d);
 28.3981 -
 28.3982 -    __shadow_sync_va(v, va);
 28.3983 -
 28.3984 -    if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) )
 28.3985 -        if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
 28.3986 -            put_page_from_l1e(old_sl1e, d);
 28.3987 -
 28.3988 -    sl1e = l1e_empty();
 28.3989 -    __shadow_set_l1e(v, va, &sl1e);
 28.3990 -
 28.3991 -    shadow_unlock(d);
 28.3992 -}
 28.3993 -
 28.3994 -static unsigned long gva_to_gpa_64(unsigned long gva)
 28.3995 -{
 28.3996 -    struct vcpu *v = current;
 28.3997 -    guest_l1_pgentry_t gl1e = {0};
 28.3998 -    guest_l2_pgentry_t gl2e = {0};
 28.3999 -    unsigned long gpa;
 28.4000 -
 28.4001 -    if (guest_page_fault(v, gva, 0, &gl2e, &gl1e))
 28.4002 -        return 0;
 28.4003 -
 28.4004 -    if (guest_l2e_get_flags(gl2e) & _PAGE_PSE)
 28.4005 -        gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1));
 28.4006 -    else
 28.4007 -        gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK);
 28.4008 -
 28.4009 -    return gpa;
 28.4010 -}
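The gva_to_gpa_64() path above builds a guest physical address from either the L2 entry (PSE superpage) or the L1 entry (4K page), plus the untranslated low bits of the virtual address. The following standalone sketch shows just that arithmetic; the constants and the gpa_from_entry() helper are illustrative stand-ins, not the Xen definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define SUPERPAGE_SHIFT 21   /* 2MB superpages in long mode (assumed) */

/* Combine a mapping entry's physical base with the offset bits of the VA. */
static uint64_t gpa_from_entry(uint64_t entry_paddr, uint64_t gva, int superpage)
{
    uint64_t offset_mask = superpage ? (((uint64_t)1 << SUPERPAGE_SHIFT) - 1)
                                     : (((uint64_t)1 << PAGE_SHIFT) - 1);
    return entry_paddr + (gva & offset_mask);
}

int main(void)
{
    printf("4K:  %#llx\n",
           (unsigned long long)gpa_from_entry(0x1234000ULL, 0x7fff0123abcULL, 0));
    printf("2MB: %#llx\n",
           (unsigned long long)gpa_from_entry(0x40000000ULL, 0x7fff0123abcULL, 1));
    return 0;
}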
 28.4011 -
 28.4012 -/*
 28.4013 - * The naming convention of the shadow_ops:
 28.4014 - * MODE_<pgentry size>_<guest paging levels>_HANDLER
 28.4015 - */
 28.4016 -#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
 28.4017 -struct shadow_ops MODE_64_3_HANDLER = {
 28.4018 -    .guest_paging_levels        = 3,
 28.4019 -    .invlpg                     = shadow_invlpg_64,
 28.4020 -    .fault                      = shadow_fault_64,
 28.4021 -    .update_pagetables          = shadow_update_pagetables,
 28.4022 -    .sync_all                   = sync_all,
 28.4023 -    .remove_all_write_access    = remove_all_write_access,
 28.4024 -    .do_update_va_mapping       = do_update_va_mapping,
 28.4025 -    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
 28.4026 -    .is_out_of_sync             = is_out_of_sync,
 28.4027 -    .gva_to_gpa                 = gva_to_gpa_pae,
 28.4028 -};
 28.4029 -
 28.4030 -struct shadow_ops MODE_64_4_HANDLER = {
 28.4031 -    .guest_paging_levels        = 4,
 28.4032 -    .invlpg                     = shadow_invlpg_64,
 28.4033 -    .fault                      = shadow_fault_64,
 28.4034 -    .update_pagetables          = shadow_update_pagetables,
 28.4035 -    .sync_all                   = sync_all,
 28.4036 -    .remove_all_write_access    = remove_all_write_access,
 28.4037 -    .do_update_va_mapping       = do_update_va_mapping,
 28.4038 -    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
 28.4039 -    .is_out_of_sync             = is_out_of_sync,
 28.4040 -    .gva_to_gpa                 = gva_to_gpa_64,
 28.4041 -};
 28.4042 -#endif /* GUEST_PGENTRY_32 */
 28.4043 -#endif /* CONFIG_PAGING_LEVELS >= 3 */
 28.4044 -
 28.4045 -
 28.4046 -#if CONFIG_PAGING_LEVELS == 2
 28.4047 -struct shadow_ops MODE_32_2_HANDLER = {
 28.4048 -    .guest_paging_levels        = 2,
 28.4049 -    .invlpg                     = shadow_invlpg_32,
 28.4050 -    .fault                      = shadow_fault_32,
 28.4051 -    .update_pagetables          = shadow_update_pagetables,
 28.4052 -    .sync_all                   = sync_all,
 28.4053 -    .remove_all_write_access    = remove_all_write_access,
 28.4054 -    .do_update_va_mapping       = do_update_va_mapping,
 28.4055 -    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
 28.4056 -    .is_out_of_sync             = is_out_of_sync,
 28.4057 -    .gva_to_gpa                 = gva_to_gpa_64,
 28.4058 -};
 28.4059 -#endif
 28.4060 -
 28.4061 -#if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) && !defined (GUEST_32PAE) ) ||  \
 28.4062 -    ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) ) 
 28.4063 -
 28.4064 -
 28.4065 -/* 
 28.4066 - * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4.
 28.4067 - *
 28.4068 - * Very simple shadow code to handle 1:1 direct mapping for guest 
 28.4069 - * non-paging code, which actually is running in PAE/vm86 mode with 
 28.4070 - * paging-enabled.
 28.4071 - *
 28.4072 - * We expect that the top level (L3) page has been allocated and initialized.
 28.4073 - */
 28.4074 -int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
 28.4075 -{
 28.4076 -    struct vcpu *v = current;
 28.4077 -    struct domain *d = v->domain;
 28.4078 -    l3_pgentry_t sl3e, *sl3e_p;
 28.4079 -    l2_pgentry_t sl2e, *sl2e_p;
 28.4080 -    l1_pgentry_t sl1e;
 28.4081 -    unsigned long mfn, smfn;
 28.4082 -    struct page_info *page;
 28.4083 -
 28.4084 -    /*
 28.4085 -     * If the faulting address is within the MMIO range, we continue
 28.4086 -     * on handling the #PF as such.
 28.4087 -     */
 28.4088 -    if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
 28.4089 -        return 0;
 28.4090 -
 28.4091 -    shadow_lock(d);
 28.4092 -
 28.4093 -    __direct_get_l3e(v, vpa, &sl3e);
 28.4094 -
 28.4095 -    if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) )
 28.4096 -    {
 28.4097 -        page = alloc_domheap_page(NULL);
 28.4098 -        if ( !page )
 28.4099 -            goto nomem;
 28.4100 -
 28.4101 -        smfn = page_to_mfn(page);
 28.4102 -        sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT);
 28.4103 -
 28.4104 -        sl3e_p = (l3_pgentry_t *)map_domain_page(smfn);
 28.4105 -        memset(sl3e_p, 0, PAGE_SIZE);
 28.4106 -        unmap_domain_page(sl3e_p);
 28.4107 -
 28.4108 -        __direct_set_l3e(v, vpa, &sl3e);
 28.4109 -    }
 28.4110 -
 28.4111 -    __direct_get_l2e(v, vpa, &sl2e);
 28.4112 -
 28.4113 -    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
 28.4114 -    {
 28.4115 -        page = alloc_domheap_page(NULL);
 28.4116 -        if ( !page )
 28.4117 -            goto nomem;
 28.4118 -
 28.4119 -        smfn = page_to_mfn(page);
 28.4120 -        sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
 28.4121 -        sl2e_p = (l2_pgentry_t *)map_domain_page(smfn);
 28.4122 -        memset(sl2e_p, 0, PAGE_SIZE);
 28.4123 -        unmap_domain_page(sl2e_p);
 28.4124 -
 28.4125 -        __direct_set_l2e(v, vpa, &sl2e);
 28.4126 -    }
 28.4127 -
 28.4128 -    __direct_get_l1e(v, vpa, &sl1e);
 28.4129 -
 28.4130 -    if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
 28.4131 -    {
 28.4132 -        sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
 28.4133 -        __direct_set_l1e(v, vpa, &sl1e);
 28.4134 -    }
 28.4135 -
 28.4136 -    shadow_unlock(d);
 28.4137 -    return EXCRET_fault_fixed;
 28.4138 -
 28.4139 -nomem:
 28.4140 -    shadow_direct_map_clean(d);
 28.4141 -    domain_crash_synchronous();
 28.4142 -}
 28.4143 -#endif
 28.4144 -
 28.4145 -/*
 28.4146 - * Local variables:
 28.4147 - * mode: C
 28.4148 - * c-set-style: "BSD"
 28.4149 - * c-basic-offset: 4
 28.4150 - * tab-width: 4
 28.4151 - * indent-tabs-mode: nil
 28.4152 - * End:
 28.4153 - */
    29.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.2 +++ b/xen/arch/x86/shadow2-common.c	Wed Aug 16 17:02:35 2006 +0100
    29.3 @@ -0,0 +1,3394 @@
    29.4 +/******************************************************************************
    29.5 + * arch/x86/shadow2-common.c
    29.6 + *
    29.7 + * Shadow2 code that does not need to be multiply compiled.
    29.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    29.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   29.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   29.11 + * 
   29.12 + * This program is free software; you can redistribute it and/or modify
   29.13 + * it under the terms of the GNU General Public License as published by
   29.14 + * the Free Software Foundation; either version 2 of the License, or
   29.15 + * (at your option) any later version.
   29.16 + *
   29.17 + * This program is distributed in the hope that it will be useful,
   29.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   29.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   29.20 + * GNU General Public License for more details.
   29.21 + *
   29.22 + * You should have received a copy of the GNU General Public License
   29.23 + * along with this program; if not, write to the Free Software
   29.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   29.25 + */
   29.26 +
   29.27 +#define SHADOW2 1
   29.28 +
   29.29 +#include <xen/config.h>
   29.30 +#include <xen/types.h>
   29.31 +#include <xen/mm.h>
   29.32 +#include <xen/trace.h>
   29.33 +#include <xen/sched.h>
   29.34 +#include <xen/perfc.h>
   29.35 +#include <xen/irq.h>
   29.36 +#include <xen/domain_page.h>
   29.37 +#include <xen/guest_access.h>
   29.38 +#include <asm/event.h>
   29.39 +#include <asm/page.h>
   29.40 +#include <asm/current.h>
   29.41 +#include <asm/flushtlb.h>
   29.42 +#include <asm/shadow2.h>
   29.43 +#include <asm/shadow2-private.h>
   29.44 +
   29.45 +#if SHADOW2_AUDIT
   29.46 +int shadow2_audit_enable = 0;
   29.47 +#endif
   29.48 +
   29.49 +static void sh2_free_log_dirty_bitmap(struct domain *d);
   29.50 +
   29.51 +int _shadow2_mode_refcounts(struct domain *d)
   29.52 +{
   29.53 +    return shadow2_mode_refcounts(d);
   29.54 +}
   29.55 +
   29.56 +
   29.57 +/**************************************************************************/
   29.58 +/* x86 emulator support for the shadow2 code
   29.59 + */
   29.60 +
   29.61 +static int
   29.62 +sh2_x86_emulate_read_std(unsigned long addr,
   29.63 +                         unsigned long *val,
   29.64 +                         unsigned int bytes,
   29.65 +                         struct x86_emulate_ctxt *ctxt)
   29.66 +{
   29.67 +    struct vcpu *v = current;
   29.68 +    if ( hvm_guest(v) )
   29.69 +    {
   29.70 +        *val = 0;
   29.71 +        // XXX -- this is WRONG.
   29.72 +        //        It entirely ignores the permissions in the page tables.
   29.73 +        //        In this case, that is only a user vs supervisor access check.
   29.74 +        //
   29.75 +        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
   29.76 +        {
   29.77 +#if 0
   29.78 +            SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
   29.79 +                           v->domain->domain_id, v->vcpu_id, 
   29.80 +                           addr, *val, bytes);
   29.81 +#endif
   29.82 +            return X86EMUL_CONTINUE;
   29.83 +        }
   29.84 +
   29.85 +        /* If we got here, there was nothing mapped here, or a bad GFN 
   29.86 +         * was mapped here.  This should never happen: we're here because
   29.87 +         * of a write fault at the end of the instruction we're emulating. */ 
   29.88 +        SHADOW2_PRINTK("read failed to va %#lx\n", addr);
   29.89 +        return X86EMUL_PROPAGATE_FAULT;
   29.90 +    }
   29.91 +    else 
   29.92 +    {
   29.93 +        SHADOW2_PRINTK("this operation is not emulated yet\n");
   29.94 +        return X86EMUL_UNHANDLEABLE;
   29.95 +    }
   29.96 +}
   29.97 +
   29.98 +static int
   29.99 +sh2_x86_emulate_write_std(unsigned long addr,
  29.100 +                          unsigned long val,
  29.101 +                          unsigned int bytes,
  29.102 +                          struct x86_emulate_ctxt *ctxt)
  29.103 +{
  29.104 +    struct vcpu *v = current;
  29.105 +#if 0
  29.106 +    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
  29.107 +                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
  29.108 +#endif
  29.109 +    if ( hvm_guest(v) )
  29.110 +    {
  29.111 +        // XXX -- this is WRONG.
  29.112 +        //        It entirely ignores the permissions in the page tables.
  29.113 +        //        In this case, that includes user vs supervisor, and
  29.114 +        //        write access.
  29.115 +        //
  29.116 +        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
  29.117 +            return X86EMUL_CONTINUE;
  29.118 +
  29.119 +        /* If we got here, there was nothing mapped here, or a bad GFN 
  29.120 +         * was mapped here.  This should never happen: we're here because
  29.121 +         * of a write fault at the end of the instruction we're emulating,
  29.122 +         * which should be handled by sh2_x86_emulate_write_emulated. */ 
  29.123 +        SHADOW2_PRINTK("write failed to va %#lx\n", addr);
  29.124 +        return X86EMUL_PROPAGATE_FAULT;
  29.125 +    }
  29.126 +    else 
  29.127 +    {
  29.128 +        SHADOW2_PRINTK("this operation is not emulated yet\n");
  29.129 +        return X86EMUL_UNHANDLEABLE;
  29.130 +    }
  29.131 +}
  29.132 +
  29.133 +static int
  29.134 +sh2_x86_emulate_write_emulated(unsigned long addr,
  29.135 +                               unsigned long val,
  29.136 +                               unsigned int bytes,
  29.137 +                               struct x86_emulate_ctxt *ctxt)
  29.138 +{
  29.139 +    struct vcpu *v = current;
  29.140 +#if 0
  29.141 +    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
  29.142 +                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
  29.143 +#endif
  29.144 +    if ( hvm_guest(v) )
  29.145 +    {
  29.146 +        return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
  29.147 +    }
  29.148 +    else 
  29.149 +    {
  29.150 +        SHADOW2_PRINTK("this operation is not emulated yet\n");
  29.151 +        return X86EMUL_UNHANDLEABLE;
  29.152 +    }
  29.153 +}
  29.154 +
  29.155 +static int 
  29.156 +sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
  29.157 +                                 unsigned long old,
  29.158 +                                 unsigned long new,
  29.159 +                                 unsigned int bytes,
  29.160 +                                 struct x86_emulate_ctxt *ctxt)
  29.161 +{
  29.162 +    struct vcpu *v = current;
  29.163 +#if 0
  29.164 +    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
  29.165 +                   v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
  29.166 +#endif
  29.167 +    if ( hvm_guest(v) )
  29.168 +    {
  29.169 +        return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new, 
  29.170 +                                                    bytes, ctxt);
  29.171 +    }
  29.172 +    else 
  29.173 +    {
  29.174 +        SHADOW2_PRINTK("this operation is not emulated yet\n");
  29.175 +        return X86EMUL_UNHANDLEABLE;
  29.176 +    }
  29.177 +}
  29.178 +
  29.179 +static int 
  29.180 +sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
  29.181 +                                   unsigned long old_lo,
  29.182 +                                   unsigned long old_hi,
  29.183 +                                   unsigned long new_lo,
  29.184 +                                   unsigned long new_hi,
  29.185 +                                   struct x86_emulate_ctxt *ctxt)
  29.186 +{
  29.187 +    struct vcpu *v = current;
  29.188 +#if 0
  29.189 +    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
  29.190 +                   v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
  29.191 +                   new_hi, new_lo, ctxt);
  29.192 +#endif
  29.193 +    if ( hvm_guest(v) )
  29.194 +    {
  29.195 +        return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
  29.196 +                                                      new_lo, new_hi, ctxt);
  29.197 +    }
  29.198 +    else 
  29.199 +    {
  29.200 +        SHADOW2_PRINTK("this operation is not emulated yet\n");
  29.201 +        return X86EMUL_UNHANDLEABLE;
  29.202 +    }
  29.203 +}
  29.204 +
  29.205 +
  29.206 +struct x86_emulate_ops shadow2_emulator_ops = {
  29.207 +    .read_std           = sh2_x86_emulate_read_std,
  29.208 +    .write_std          = sh2_x86_emulate_write_std,
  29.209 +    .read_emulated      = sh2_x86_emulate_read_std,
  29.210 +    .write_emulated     = sh2_x86_emulate_write_emulated,
  29.211 +    .cmpxchg_emulated   = sh2_x86_emulate_cmpxchg_emulated,
  29.212 +    .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
  29.213 +};
  29.214 +
  29.215 +
  29.216 +/**************************************************************************/
  29.217 +/* Code for "promoting" a guest page to the point where the shadow code is
  29.218 + * willing to let it be treated as a guest page table.  This generally
  29.219 + * involves making sure there are no writable mappings available to the guest
  29.220 + * for this page.
  29.221 + */
  29.222 +void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
  29.223 +{
  29.224 +    struct page_info *page = mfn_to_page(gmfn);
  29.225 +    unsigned long type_info;
  29.226 +
  29.227 +    ASSERT(valid_mfn(gmfn));
  29.228 +
  29.229 +    /* We should never try to promote a gmfn that has writeable mappings */
  29.230 +    ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
  29.231 +
  29.232 +    // Is the page already shadowed?
  29.233 +    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
  29.234 +    {
  29.235 +        // No prior shadow exists...
  29.236 +
  29.237 +        // Grab a type-ref.  We don't really care if we are racing with another
  29.238 +        // vcpu or not, or even what kind of type we get; we just want the type
  29.239 +        // count to be > 0.
  29.240 +        //
  29.241 +        do {
  29.242 +            type_info =
  29.243 +                page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
  29.244 +        } while ( !get_page_type(page, type_info) );
  29.245 +
  29.246 +        // Now that the type ref is non-zero, we can safely use the
  29.247 +        // shadow2_flags.
  29.248 +        //
  29.249 +        page->shadow2_flags = 0;
  29.250 +    }
  29.251 +
  29.252 +    ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
  29.253 +    set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
  29.254 +}
  29.255 +
  29.256 +void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
  29.257 +{
  29.258 +    struct page_info *page = mfn_to_page(gmfn);
  29.259 +
  29.260 +    ASSERT(test_bit(_PGC_page_table, &page->count_info));
  29.261 +    ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
  29.262 +
  29.263 +    clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
  29.264 +
  29.265 +    if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
  29.266 +    {
  29.267 +        // release the extra type ref
  29.268 +        put_page_type(page);
  29.269 +
  29.270 +        // clear the is-a-page-table bit.
  29.271 +        clear_bit(_PGC_page_table, &page->count_info);
  29.272 +    }
  29.273 +}
  29.274 +
  29.275 +/**************************************************************************/
  29.276 +/* Validate a pagetable change from the guest and update the shadows.
  29.277 + * Returns a bitmask of SHADOW2_SET_* flags. */
  29.278 +
  29.279 +static int
  29.280 +__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
  29.281 +                               void *entry, u32 size)
  29.282 +{
  29.283 +    int result = 0;
  29.284 +    struct page_info *page = mfn_to_page(gmfn);
  29.285 +
  29.286 +    sh2_mark_dirty(v->domain, gmfn);
  29.287 +    
  29.288 +    // Determine which types of shadows are affected, and update each.
  29.289 +    //
  29.290 +    // Always validate L1s before L2s to prevent another cpu with a linear
  29.291 +    // mapping of this gmfn from seeing a walk that results from 
  29.292 +    // using the new L2 value and the old L1 value.  (It is OK for such a
  29.293 +    // guest to see a walk that uses the old L2 value with the new L1 value,
  29.294 +    // as hardware could behave this way if one level of the pagewalk occurs
  29.295 +    // before the store, and the next level of the pagewalk occurs after the
   29.296 +    // store.)
  29.297 +    //
  29.298 +    // Ditto for L2s before L3s, etc.
  29.299 +    //
  29.300 +
  29.301 +    if ( !(page->count_info & PGC_page_table) )
  29.302 +        return 0;  /* Not shadowed at all */
  29.303 +
  29.304 +#if CONFIG_PAGING_LEVELS == 2
  29.305 +    if ( page->shadow2_flags & SH2F_L1_32 ) 
  29.306 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
  29.307 +            (v, gmfn, entry, size);
  29.308 +#else 
  29.309 +    if ( page->shadow2_flags & SH2F_L1_32 ) 
  29.310 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
  29.311 +            (v, gmfn, entry, size);
  29.312 +#endif
  29.313 +
  29.314 +#if CONFIG_PAGING_LEVELS == 2
  29.315 +    if ( page->shadow2_flags & SH2F_L2_32 ) 
  29.316 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
  29.317 +            (v, gmfn, entry, size);
  29.318 +#else 
  29.319 +    if ( page->shadow2_flags & SH2F_L2_32 ) 
  29.320 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
  29.321 +            (v, gmfn, entry, size);
  29.322 +#endif
  29.323 +
  29.324 +#if CONFIG_PAGING_LEVELS >= 3 
  29.325 +    if ( page->shadow2_flags & SH2F_L1_PAE ) 
  29.326 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
  29.327 +            (v, gmfn, entry, size);
  29.328 +    if ( page->shadow2_flags & SH2F_L2_PAE ) 
  29.329 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
  29.330 +            (v, gmfn, entry, size);
  29.331 +    if ( page->shadow2_flags & SH2F_L2H_PAE ) 
  29.332 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
  29.333 +            (v, gmfn, entry, size);
  29.334 +    if ( page->shadow2_flags & SH2F_L3_PAE ) 
  29.335 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
  29.336 +            (v, gmfn, entry, size);
  29.337 +#else /* 32-bit non-PAE hypervisor does not support PAE guests */
  29.338 +    ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
  29.339 +#endif
  29.340 +
  29.341 +#if CONFIG_PAGING_LEVELS >= 4 
  29.342 +    if ( page->shadow2_flags & SH2F_L1_64 ) 
  29.343 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
  29.344 +            (v, gmfn, entry, size);
  29.345 +    if ( page->shadow2_flags & SH2F_L2_64 ) 
  29.346 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
  29.347 +            (v, gmfn, entry, size);
  29.348 +    if ( page->shadow2_flags & SH2F_L3_64 ) 
  29.349 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
  29.350 +            (v, gmfn, entry, size);
  29.351 +    if ( page->shadow2_flags & SH2F_L4_64 ) 
  29.352 +        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
  29.353 +            (v, gmfn, entry, size);
  29.354 +#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
  29.355 +    ASSERT((page->shadow2_flags 
  29.356 +            & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
  29.357 +#endif
  29.358 +
  29.359 +    return result;
  29.360 +}
  29.361 +
  29.362 +
  29.363 +int
  29.364 +shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
  29.365 +/* This is the entry point from hypercalls. It returns a bitmask of all the 
  29.366 + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
  29.367 +{
  29.368 +    int rc;
  29.369 +
  29.370 +    ASSERT(shadow2_lock_is_acquired(v->domain));
  29.371 +    rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
  29.372 +    shadow2_audit_tables(v);
  29.373 +    return rc;
  29.374 +}
  29.375 +
  29.376 +void
  29.377 +shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
  29.378 +                                void *entry, u32 size)
  29.379 +/* This is the entry point for emulated writes to pagetables in HVM guests */
  29.380 +{
  29.381 +    struct domain *d = v->domain;
  29.382 +    int rc;
  29.383 +
  29.384 +    ASSERT(shadow2_lock_is_acquired(v->domain));
  29.385 +    rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
  29.386 +    if ( rc & SHADOW2_SET_FLUSH )
  29.387 +    {
  29.388 +        // Flush everyone except the local processor, which will flush when it
  29.389 +        // re-enters the HVM guest.
  29.390 +        //
  29.391 +        cpumask_t mask = d->domain_dirty_cpumask;
  29.392 +        cpu_clear(v->processor, mask);
  29.393 +        flush_tlb_mask(mask);
  29.394 +    }
  29.395 +    if ( rc & SHADOW2_SET_ERROR ) 
  29.396 +    {
  29.397 +        /* This page is probably not a pagetable any more: tear it out of the 
  29.398 +         * shadows, along with any tables that reference it */
  29.399 +        shadow2_remove_all_shadows_and_parents(v, gmfn);
  29.400 +    }
  29.401 +    /* We ignore the other bits: since we are about to change CR3 on
  29.402 +     * VMENTER we don't need to do any extra TLB flushes. */ 
  29.403 +}
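shadow2_validate_guest_pt_write() above flushes every processor that may hold stale translations for this domain except the local one, which will flush anyway on the next VMENTER. A minimal sketch of that mask manipulation, using a plain word in place of cpumask_t (illustrative only, not the Xen cpumask API):

#include <stdio.h>

int main(void)
{
    unsigned long domain_dirty_cpumask = 0x0b;  /* cpus 0, 1 and 3 ran this domain */
    int local_cpu = 1;                          /* v->processor */

    /* Equivalent of cpu_clear(v->processor, mask) before flush_tlb_mask(mask) */
    unsigned long mask = domain_dirty_cpumask & ~(1UL << local_cpu);
    printf("flush TLBs on cpus in mask %#lx\n", mask);   /* prints 0x9 */
    return 0;
}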
  29.404 +
  29.405 +
  29.406 +/**************************************************************************/
  29.407 +/* Memory management for shadow pages. */ 
  29.408 +
  29.409 +/* Meaning of the count_info field in shadow pages
  29.410 + * ----------------------------------------------
  29.411 + * 
  29.412 + * A count of all references to this page from other shadow pages and
  29.413 + * guest CR3s (a.k.a. v->arch.shadow_table).  
  29.414 + *
  29.415 + * The top bits hold the shadow type and the pinned bit.  Top-level
  29.416 + * shadows are pinned so that they don't disappear when not in a CR3
  29.417 + * somewhere.
  29.418 + *
  29.419 + * We don't need to use get|put_page for this as the updates are all
  29.420 + * protected by the shadow lock.  We can't use get|put_page for this
  29.421 + * as the size of the count on shadow pages is different from that on
  29.422 + * normal guest pages.
  29.423 + */
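The count_info description above relies on packing a reference count together with a shadow type and a pinned flag in one machine word. The sketch below shows that style of packing with made-up field positions; the real PGC_SH2_* masks and shifts live in the shadow2 headers and are not reproduced here.

#include <assert.h>

/* Hypothetical layout: low 24 bits = refcount, bits 24-27 = type, bit 28 = pinned. */
#define CNT_MASK    0x00ffffffUL
#define TYPE_SHIFT  24
#define TYPE_MASK   (0xfUL << TYPE_SHIFT)
#define PINNED_BIT  (1UL << 28)

int main(void)
{
    unsigned long info = (3UL << TYPE_SHIFT) | PINNED_BIT | 2;  /* type 3, pinned, 2 refs */

    assert((info & CNT_MASK) == 2);
    assert(((info & TYPE_MASK) >> TYPE_SHIFT) == 3);
    assert(info & PINNED_BIT);

    /* Taking a reference is a plain increment; the shadow lock serialises it. */
    info++;
    assert((info & CNT_MASK) == 3);
    return 0;
}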
  29.424 +
  29.425 +/* Meaning of the type_info field in shadow pages
  29.426 + * ----------------------------------------------
  29.427 + * 
  29.428 + * type_info use depends on the shadow type (from count_info)
  29.429 + * 
  29.430 + * PGC_SH2_none : This page is in the shadow2 free pool.  type_info holds
  29.431 + *                the chunk order for our freelist allocator.
  29.432 + *
  29.433 + * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info 
  29.434 + *                     holds the mfn of the guest page being shadowed,
   29.435 + *                     holds the mfn of the guest page being shadowed.
  29.436 + * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
  29.437 + *                        type_info holds the gfn being shattered.
  29.438 + *
  29.439 + * PGC_SH2_monitor_table : This page is part of a monitor table.
  29.440 + *                         type_info is not used.
  29.441 + */
  29.442 +
  29.443 +/* Meaning of the _domain field in shadow pages
  29.444 + * --------------------------------------------
  29.445 + *
  29.446 + * In shadow pages, this field will always have its least significant bit
  29.447 + * set.  This ensures that all attempts to get_page() will fail (as all
  29.448 + * valid pickled domain pointers have a zero for their least significant bit).
  29.449 + * Instead, the remaining upper bits are used to record the shadow generation
  29.450 + * counter when the shadow was created.
  29.451 + */
  29.452 +
  29.453 +/* Meaning of the shadow2_flags field
  29.454 + * ----------------------------------
  29.455 + * 
  29.456 + * In guest pages that are shadowed, one bit for each kind of shadow they have.
  29.457 + * 
  29.458 + * In shadow pages, will be used for holding a representation of the populated
  29.459 + * entries in this shadow (either a min/max, or a bitmap, or ...)
  29.460 + *
  29.461 + * In monitor-table pages, holds the level of the particular page (to save
  29.462 + * spilling the shadow types into an extra bit by having three types of monitor
  29.463 + * page).
  29.464 + */
  29.465 +
  29.466 +/* Meaning of the list_head struct in shadow pages
  29.467 + * -----------------------------------------------
  29.468 + *
  29.469 + * In free shadow pages, this is used to hold the free-lists of chunks.
  29.470 + *
  29.471 + * In top-level shadow tables, this holds a linked-list of all top-level
  29.472 + * shadows (used for recovering memory and destroying shadows). 
  29.473 + *
  29.474 + * In lower-level shadows, this holds the physical address of a higher-level
  29.475 + * shadow entry that holds a reference to this shadow (or zero).
  29.476 + */
  29.477 +
  29.478 +/* Allocating shadow pages
  29.479 + * -----------------------
  29.480 + *
  29.481 + * Most shadow pages are allocated singly, but there are two cases where we 
  29.482 + * need to allocate multiple pages together.
  29.483 + * 
  29.484 + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
  29.485 + *    A 32-bit guest l1 table covers 4MB of virtuial address space,
  29.486 + *    and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
  29.487 + *    of virtual address space each).  Similarly, a 32-bit guest l2 table 
  29.488 + *    (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va 
  29.489 + *    each).  These multi-page shadows are contiguous and aligned; 
  29.490 + *    functions for handling offsets into them are defined in shadow2.c 
  29.491 + *    (shadow_l1_index() etc.)
  29.492 + *    
  29.493 + * 2: Shadowing PAE top-level pages.  Each guest page that contains
  29.494 + *    any PAE top-level pages requires two shadow pages to shadow it.
  29.495 + *    They contain alternating l3 tables and pae_l3_bookkeeping structs.
  29.496 + *
  29.497 + * This table shows the allocation behaviour of the different modes:
  29.498 + *
  29.499 + * Xen paging      32b  pae  pae  64b  64b  64b
  29.500 + * Guest paging    32b  32b  pae  32b  pae  64b
  29.501 + * PV or HVM        *   HVM   *   HVM  HVM   * 
  29.502 + * Shadow paging   32b  pae  pae  pae  pae  64b
  29.503 + *
  29.504 + * sl1 size         4k   8k   4k   8k   4k   4k
  29.505 + * sl2 size         4k  16k   4k  16k   4k   4k
  29.506 + * sl3 size         -    -    8k   -    8k   4k
  29.507 + * sl4 size         -    -    -    -    -    4k
  29.508 + *
  29.509 + * We allocate memory from xen in four-page units and break them down
  29.510 + * with a simple buddy allocator.  Can't use the xen allocator to handle
  29.511 + * this as it only works for contiguous zones, and a domain's shadow
  29.512 + * pool is made of fragments.
  29.513 + *
  29.514 + * In HVM guests, the p2m table is built out of shadow pages, and we provide 
  29.515 + * a function for the p2m management to steal pages, in max-order chunks, from 
  29.516 + * the free pool.  We don't provide for giving them back, yet.
  29.517 + */
  29.518 +
  29.519 +/* Figure out the least acceptable quantity of shadow memory.
  29.520 + * The minimum memory requirement for always being able to free up a
  29.521 + * chunk of memory is very small -- only three max-order chunks per
  29.522 + * vcpu to hold the top level shadows and pages with Xen mappings in them.  
  29.523 + *
  29.524 + * But for a guest to be guaranteed to successfully execute a single
   29.525 + * instruction, we must be able to map a large number (about thirty) of VAs
  29.526 + * at the same time, which means that to guarantee progress, we must
  29.527 + * allow for more than ninety allocated pages per vcpu.  We round that
  29.528 + * up to 128 pages, or half a megabyte per vcpu. */
  29.529 +unsigned int shadow2_min_acceptable_pages(struct domain *d) 
  29.530 +{
  29.531 +    u32 vcpu_count = 0;
  29.532 +    struct vcpu *v;
  29.533 +
  29.534 +    for_each_vcpu(d, v)
  29.535 +        vcpu_count++;
  29.536 +
  29.537 +    return (vcpu_count * 128);
  29.538 +}
  29.539 +
  29.540 +/* Using the type_info field to store freelist order */
  29.541 +#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
  29.542 +#define SH2_SET_PFN_ORDER(_p, _o)                       \
  29.543 + do { (_p)->u.inuse.type_info = (_o); } while (0)
  29.544 + 
  29.545 +
  29.546 +/* Figure out the order of allocation needed for a given shadow type */
  29.547 +static inline u32
  29.548 +shadow_order(u32 shadow_type) 
  29.549 +{
  29.550 +#if CONFIG_PAGING_LEVELS > 2
  29.551 +    static const u32 type_to_order[16] = {
  29.552 +        0, /* PGC_SH2_none           */
  29.553 +        1, /* PGC_SH2_l1_32_shadow   */
  29.554 +        1, /* PGC_SH2_fl1_32_shadow  */
  29.555 +        2, /* PGC_SH2_l2_32_shadow   */
  29.556 +        0, /* PGC_SH2_l1_pae_shadow  */
  29.557 +        0, /* PGC_SH2_fl1_pae_shadow */
  29.558 +        0, /* PGC_SH2_l2_pae_shadow  */
  29.559 +        0, /* PGC_SH2_l2h_pae_shadow */
  29.560 +        1, /* PGC_SH2_l3_pae_shadow  */
  29.561 +        0, /* PGC_SH2_l1_64_shadow   */
  29.562 +        0, /* PGC_SH2_fl1_64_shadow  */
  29.563 +        0, /* PGC_SH2_l2_64_shadow   */
  29.564 +        0, /* PGC_SH2_l3_64_shadow   */
  29.565 +        0, /* PGC_SH2_l4_64_shadow   */
  29.566 +        2, /* PGC_SH2_p2m_table      */
  29.567 +        0  /* PGC_SH2_monitor_table  */
  29.568 +        };
  29.569 +    u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
  29.570 +    return type_to_order[type];
  29.571 +#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
  29.572 +    return 0;
  29.573 +#endif
  29.574 +}
  29.575 +
  29.576 +
  29.577 +/* Do we have a free chunk of at least this order? */
  29.578 +static inline int chunk_is_available(struct domain *d, int order)
  29.579 +{
  29.580 +    int i;
  29.581 +    
  29.582 +    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
  29.583 +        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
  29.584 +            return 1;
  29.585 +    return 0;
  29.586 +}
  29.587 +
  29.588 +/* Dispatcher function: call the per-mode function that will unhook the
  29.589 + * non-Xen mappings in this top-level shadow mfn */
  29.590 +void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
  29.591 +{
  29.592 +    struct page_info *pg = mfn_to_page(smfn);
  29.593 +    switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
  29.594 +    {
  29.595 +    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
  29.596 +#if CONFIG_PAGING_LEVELS == 2
  29.597 +        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
  29.598 +#else
  29.599 +        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
  29.600 +#endif
  29.601 +        break;
  29.602 +#if CONFIG_PAGING_LEVELS >= 3
  29.603 +    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
  29.604 +        SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
  29.605 +        break;
  29.606 +#endif
  29.607 +#if CONFIG_PAGING_LEVELS >= 4
  29.608 +    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
  29.609 +        SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
  29.610 +        break;
  29.611 +#endif
  29.612 +    default:
  29.613 +        SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", 
  29.614 +                       (unsigned long)((pg->count_info & PGC_SH2_type_mask)
  29.615 +                                       >> PGC_SH2_type_shift));
  29.616 +        BUG();
  29.617 +    }
  29.618 +}
  29.619 +
  29.620 +
  29.621 +/* Make sure there is at least one chunk of the required order available
  29.622 + * in the shadow page pool. This must be called before any calls to
  29.623 + * shadow2_alloc().  Since this will free existing shadows to make room,
  29.624 + * it must be called early enough to avoid freeing shadows that the
  29.625 + * caller is currently working on. */
  29.626 +void shadow2_prealloc(struct domain *d, unsigned int order)
  29.627 +{
   29.628 +    /* Need a vcpu for calling unpins; for now, since we don't have
  29.629 +     * per-vcpu shadows, any will do */
  29.630 +    struct vcpu *v = d->vcpu[0];
  29.631 +    struct list_head *l, *t;
  29.632 +    struct page_info *pg;
  29.633 +    mfn_t smfn;
  29.634 +
  29.635 +    if ( chunk_is_available(d, order) ) return; 
  29.636 +    
  29.637 +    /* Stage one: walk the list of top-level pages, unpinning them */
  29.638 +    perfc_incrc(shadow2_prealloc_1);
  29.639 +    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
  29.640 +    {
  29.641 +        pg = list_entry(l, struct page_info, list);
  29.642 +        smfn = page_to_mfn(pg);
  29.643 +
  29.644 +#if CONFIG_PAGING_LEVELS >= 3
  29.645 +        if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
  29.646 +        {
  29.647 +            /* For PAE, we need to unpin each subshadow on this shadow */
  29.648 +            SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
  29.649 +        } 
  29.650 +        else 
  29.651 +#endif /* 32-bit code always takes this branch */
  29.652 +        {
  29.653 +            /* Unpin this top-level shadow */
  29.654 +            sh2_unpin(v, smfn);
  29.655 +        }
  29.656 +
  29.657 +        /* See if that freed up a chunk of appropriate size */
  29.658 +        if ( chunk_is_available(d, order) ) return;
  29.659 +    }
  29.660 +
  29.661 +    /* Stage two: all shadow pages are in use in hierarchies that are
  29.662 +     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
  29.663 +     * mappings. */
  29.664 +    perfc_incrc(shadow2_prealloc_2);
  29.665 +    v = current;
  29.666 +    if ( v->domain != d )
  29.667 +        v = d->vcpu[0];
  29.668 +    /* Walk the list from the tail: recently used toplevels have been pulled
  29.669 +     * to the head */
  29.670 +    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
  29.671 +    {
  29.672 +        pg = list_entry(l, struct page_info, list);
  29.673 +        smfn = page_to_mfn(pg);
  29.674 +        shadow2_unhook_mappings(v, smfn);
  29.675 +
  29.676 +        /* Need to flush TLB if we've altered our own tables */
  29.677 +        if ( !shadow2_mode_external(d) 
  29.678 +             && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
  29.679 +            local_flush_tlb();
  29.680 +        
  29.681 +        /* See if that freed up a chunk of appropriate size */
  29.682 +        if ( chunk_is_available(d, order) ) return;
  29.683 +    }
  29.684 +    
  29.685 +    /* Nothing more we can do: all remaining shadows are of pages that
   29.686 +     * hold Xen mappings for some vcpu.  This should never happen. */
  29.687 +    SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
  29.688 +                   "  shadow pages total = %u, free = %u, p2m=%u\n",
  29.689 +                   1 << order, 
  29.690 +                   d->arch.shadow2_total_pages, 
  29.691 +                   d->arch.shadow2_free_pages, 
  29.692 +                   d->arch.shadow2_p2m_pages);
  29.693 +    BUG();
  29.694 +}
  29.695 +
  29.696 +
  29.697 +/* Allocate another shadow's worth of (contiguous, aligned) pages,
  29.698 + * and fill in the type and backpointer fields of their page_infos. 
  29.699 + * Never fails to allocate. */
  29.700 +mfn_t shadow2_alloc(struct domain *d,  
  29.701 +                    u32 shadow_type,
  29.702 +                    unsigned long backpointer)
  29.703 +{
  29.704 +    struct page_info *pg = NULL;
  29.705 +    unsigned int order = shadow_order(shadow_type);
  29.706 +    cpumask_t mask;
  29.707 +    void *p;
  29.708 +    int i;
  29.709 +
  29.710 +    ASSERT(shadow2_lock_is_acquired(d));
  29.711 +    ASSERT(order <= SHADOW2_MAX_ORDER);
  29.712 +    ASSERT(shadow_type != PGC_SH2_none);
  29.713 +    perfc_incrc(shadow2_alloc);
  29.714 +
  29.715 +    /* Find smallest order which can satisfy the request. */
  29.716 +    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
  29.717 +        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
  29.718 +        {
  29.719 +            pg = list_entry(d->arch.shadow2_freelists[i].next, 
  29.720 +                            struct page_info, list);
  29.721 +            list_del(&pg->list);
  29.722 +            
  29.723 +            /* We may have to halve the chunk a number of times. */
  29.724 +            while ( i != order )
  29.725 +            {
  29.726 +                i--;
  29.727 +                SH2_SET_PFN_ORDER(pg, i);
  29.728 +                list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
  29.729 +                pg += 1 << i;
  29.730 +            }
  29.731 +            d->arch.shadow2_free_pages -= 1 << order;
  29.732 +
  29.733 +            /* Init page info fields and clear the pages */
  29.734 +            for ( i = 0; i < 1<<order ; i++ ) 
  29.735 +            {
  29.736 +                pg[i].u.inuse.type_info = backpointer;
  29.737 +                pg[i].count_info = shadow_type;
  29.738 +                pg[i].shadow2_flags = 0;
  29.739 +                INIT_LIST_HEAD(&pg[i].list);
  29.740 +                /* Before we overwrite the old contents of this page, 
  29.741 +                 * we need to be sure that no TLB holds a pointer to it. */
  29.742 +                mask = d->domain_dirty_cpumask;
  29.743 +                tlbflush_filter(mask, pg[i].tlbflush_timestamp);
  29.744 +                if ( unlikely(!cpus_empty(mask)) )
  29.745 +                {
  29.746 +                    perfc_incrc(shadow2_alloc_tlbflush);
  29.747 +                    flush_tlb_mask(mask);
  29.748 +                }
  29.749 +                /* Now safe to clear the page for reuse */
  29.750 +                p = sh2_map_domain_page(page_to_mfn(pg+i));
  29.751 +                ASSERT(p != NULL);
  29.752 +                clear_page(p);
  29.753 +                sh2_unmap_domain_page(p);
  29.754 +                perfc_incr(shadow2_alloc_count);
  29.755 +            }
  29.756 +            return page_to_mfn(pg);
  29.757 +        }
  29.758 +    
  29.759 +    /* If we get here, we failed to allocate. This should never happen.
  29.760 +     * It means that we didn't call shadow2_prealloc() correctly before
  29.761 +     * we allocated.  We can't recover by calling prealloc here, because
  29.762 +     * we might free up higher-level pages that the caller is working on. */
  29.763 +    SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
  29.764 +    BUG();
  29.765 +}
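shadow2_alloc() above repeatedly halves a larger free chunk until it matches the requested order, returning the lower half to the freelist each time and keeping the upper half. A standalone sketch of that split step, tracking plain frame numbers instead of page_info pointers (illustrative only):

#include <stdio.h>

#define SHADOW_MAX_ORDER 2   /* four-page chunks, as in the pool above */

int main(void)
{
    unsigned long pg = 64;                 /* first frame of a free order-2 chunk */
    int i = SHADOW_MAX_ORDER, order = 0;   /* order found vs. order requested */

    while (i != order)
    {
        i--;
        /* Lower half goes back on the freelist for order i... */
        printf("re-free frames %lu..%lu at order %d\n", pg, pg + (1UL << i) - 1, i);
        /* ...and we keep the upper half, as the loop in shadow2_alloc() does. */
        pg += 1UL << i;
    }
    printf("hand out frames %lu..%lu at order %d\n", pg, pg + (1UL << order) - 1, order);
    return 0;
}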
  29.766 +
  29.767 +
  29.768 +/* Return some shadow pages to the pool. */
  29.769 +void shadow2_free(struct domain *d, mfn_t smfn)
  29.770 +{
  29.771 +    struct page_info *pg = mfn_to_page(smfn); 
  29.772 +    u32 shadow_type;
  29.773 +    unsigned long order;
  29.774 +    unsigned long mask;
  29.775 +    int i;
  29.776 +
  29.777 +    ASSERT(shadow2_lock_is_acquired(d));
  29.778 +    perfc_incrc(shadow2_free);
  29.779 +
  29.780 +    shadow_type = pg->count_info & PGC_SH2_type_mask;
  29.781 +    ASSERT(shadow_type != PGC_SH2_none);
  29.782 +    ASSERT(shadow_type != PGC_SH2_p2m_table);
  29.783 +    order = shadow_order(shadow_type);
  29.784 +
  29.785 +    d->arch.shadow2_free_pages += 1 << order;
  29.786 +
  29.787 +    for ( i = 0; i < 1<<order; i++ ) 
  29.788 +    {
  29.789 +        /* Strip out the type: this is now a free shadow page */
  29.790 +        pg[i].count_info = 0;
  29.791 +        /* Remember the TLB timestamp so we will know whether to flush 
  29.792 +         * TLBs when we reuse the page.  Because the destructors leave the
  29.793 +         * contents of the pages in place, we can delay TLB flushes until
  29.794 +         * just before the allocator hands the page out again. */
  29.795 +        pg[i].tlbflush_timestamp = tlbflush_current_time();
  29.796 +        perfc_decr(shadow2_alloc_count);
  29.797 +    }
  29.798 +
  29.799 +    /* Merge chunks as far as possible. */
  29.800 +    while ( order < SHADOW2_MAX_ORDER )
  29.801 +    {
  29.802 +        mask = 1 << order;
  29.803 +        if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
  29.804 +            /* Merge with predecessor block? */
  29.805 +            if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGT_none) 
  29.806 +                 || (SH2_PFN_ORDER(pg-mask) != order) )
  29.807 +                break;
  29.808 +            list_del(&(pg-mask)->list);
  29.809 +            pg -= mask;
  29.810 +        } else {
  29.811 +            /* Merge with successor block? */
  29.812 +            if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none)
  29.813 +                 || (SH2_PFN_ORDER(pg+mask) != order) )
  29.814 +                break;
  29.815 +            list_del(&(pg+mask)->list);
  29.816 +        }
  29.817 +        order++;
  29.818 +    }
  29.819 +
  29.820 +    SH2_SET_PFN_ORDER(pg, order);
  29.821 +    list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
  29.822 +}
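The merge loop in shadow2_free() uses the bit of the frame number at the current order to decide whether a chunk's buddy sits immediately below it or immediately above it. A standalone illustration of that address test, on bare frame numbers with no freelist bookkeeping:

#include <stdio.h>

/* The buddy of a chunk at 'pfn' of a given order differs only in that order's bit. */
static unsigned long buddy_of(unsigned long pfn, int order)
{
    return pfn ^ (1UL << order);
}

int main(void)
{
    unsigned long pfn = 0x64;
    int order;

    for (order = 0; order < 3; order++)
        printf("order %d: buddy of %#lx is %#lx (%s)\n",
               order, pfn, buddy_of(pfn, order),
               (pfn & (1UL << order)) ? "predecessor" : "successor");
    return 0;
}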
  29.823 +
  29.824 +/* Divert some memory from the pool to be used by the p2m mapping.
  29.825 + * This action is irreversible: the p2m mapping only ever grows.
  29.826 + * That's OK because the p2m table only exists for external domains,
  29.827 + * and those domains can't ever turn off shadow mode.
  29.828 + * Also, we only ever allocate a max-order chunk, so as to preserve
  29.829 + * the invariant that shadow2_prealloc() always works.
  29.830 + * Returns 0 iff it can't get a chunk (the caller should then
  29.831 + * free up some pages in domheap and call set_sh2_allocation);
  29.832 + * returns non-zero on success.
  29.833 + */
  29.834 +static int
  29.835 +shadow2_alloc_p2m_pages(struct domain *d)
  29.836 +{
  29.837 +    struct page_info *pg;
  29.838 +    u32 i;
  29.839 +    ASSERT(shadow2_lock_is_acquired(d));
  29.840 +    
  29.841 +    if ( d->arch.shadow2_total_pages 
  29.842 +         < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
  29.843 +        return 0; /* Not enough shadow memory: need to increase it first */
  29.844 +    
  29.845 +    pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
  29.846 +    d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
  29.847 +    d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
  29.848 +    for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
  29.849 +    {
  29.850 +        /* Unlike shadow pages, mark p2m pages as owned by the domain */
  29.851 +        page_set_owner(&pg[i], d);
  29.852 +        list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
  29.853 +    }
  29.854 +    return 1;
  29.855 +}
  29.856 +
  29.857 +// Returns 0 if no memory is available...
  29.858 +mfn_t
  29.859 +shadow2_alloc_p2m_page(struct domain *d)
  29.860 +{
  29.861 +    struct list_head *entry;
  29.862 +    mfn_t mfn;
  29.863 +    void *p;
  29.864 +
  29.865 +    if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
  29.866 +         !shadow2_alloc_p2m_pages(d) )
  29.867 +        return _mfn(0);
  29.868 +    entry = d->arch.shadow2_p2m_freelist.next;
  29.869 +    list_del(entry);
  29.870 +    list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
  29.871 +    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
  29.872 +    sh2_get_ref(mfn, 0);
  29.873 +    p = sh2_map_domain_page(mfn);
  29.874 +    clear_page(p);
  29.875 +    sh2_unmap_domain_page(p);
  29.876 +
  29.877 +    return mfn;
  29.878 +}
  29.879 +
  29.880 +#if CONFIG_PAGING_LEVELS == 3
  29.881 +static void p2m_install_entry_in_monitors(struct domain *d, 
  29.882 +                                          l3_pgentry_t *l3e) 
  29.883 +/* Special case, only used for external-mode domains on PAE hosts:
  29.884 + * update the mapping of the p2m table.  Once again, this is trivial in
  29.885 + * other paging modes (one top-level entry points to the top-level p2m,
  29.886 + * no maintenance needed), but PAE makes life difficult by needing a
   29.887 + * no maintenance needed), but PAE makes life difficult by needing to
   29.888 + * copy the eight l3es of the p2m table into eight l2h slots in the
  29.889 + * changes. */
  29.890 +{
  29.891 +    l2_pgentry_t *ml2e;
  29.892 +    struct vcpu *v;
  29.893 +    unsigned int index;
  29.894 +
  29.895 +    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
  29.896 +    ASSERT(index < MACHPHYS_MBYTES>>1);
  29.897 +
  29.898 +    for_each_vcpu(d, v) 
  29.899 +    {
  29.900 +        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
  29.901 +            continue;
  29.902 +        ASSERT(shadow2_mode_external(v->domain));
  29.903 +
  29.904 +        SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
  29.905 +                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
  29.906 +
  29.907 +        if ( v == current ) /* OK to use linear map of monitor_table */
  29.908 +            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
  29.909 +        else 
  29.910 +        {
  29.911 +            l3_pgentry_t *ml3e;
  29.912 +            ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
  29.913 +            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
  29.914 +            ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
  29.915 +            ml2e += l2_table_offset(RO_MPT_VIRT_START);
  29.916 +            sh2_unmap_domain_page(ml3e);
  29.917 +        }
  29.918 +        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
  29.919 +        if ( v != current )
  29.920 +            sh2_unmap_domain_page(ml2e);
  29.921 +    }
  29.922 +}
  29.923 +#endif
  29.924 +
   29.925 +// Find the next level's P2M entry, checking for out-of-range gfns...
  29.926 +// Returns NULL on error.
  29.927 +//
  29.928 +static l1_pgentry_t *
  29.929 +p2m_find_entry(void *table, unsigned long *gfn_remainder,
  29.930 +                   unsigned long gfn, u32 shift, u32 max)
  29.931 +{
  29.932 +    u32 index;
  29.933 +
  29.934 +    index = *gfn_remainder >> shift;
  29.935 +    if ( index >= max )
  29.936 +    {
  29.937 +        SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
  29.938 +                      "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
  29.939 +                       gfn, *gfn_remainder, shift, index, max);
  29.940 +        return NULL;
  29.941 +    }
  29.942 +    *gfn_remainder &= (1 << shift) - 1;
  29.943 +    return (l1_pgentry_t *)table + index;
  29.944 +}
  29.945 +
  29.946 +// Walk one level of the P2M table, allocating a new table if required.
  29.947 +// Returns 0 on error.
  29.948 +//
  29.949 +static int
  29.950 +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, 
  29.951 +               unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
  29.952 +               u32 max, unsigned long type)
  29.953 +{
  29.954 +    l1_pgentry_t *p2m_entry;
  29.955 +    void *next;
  29.956 +
  29.957 +    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
  29.958 +                                      shift, max)) )
  29.959 +        return 0;
  29.960 +
  29.961 +    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
  29.962 +    {
  29.963 +        mfn_t mfn = shadow2_alloc_p2m_page(d);
  29.964 +        if ( mfn_x(mfn) == 0 )
  29.965 +            return 0;
  29.966 +        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
  29.967 +        mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
  29.968 +        mfn_to_page(mfn)->count_info = 1;
  29.969 +#if CONFIG_PAGING_LEVELS == 3
  29.970 +        if (type == PGT_l2_page_table)
  29.971 +        {
  29.972 +            /* We have written to the p2m l3: need to sync the per-vcpu
  29.973 +             * copies of it in the monitor tables */
  29.974 +            p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
  29.975 +        }
  29.976 +#endif
  29.977 +        /* The P2M can be shadowed: keep the shadows synced */
  29.978 +        if ( d->vcpu[0] )
  29.979 +            (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
  29.980 +                                                 p2m_entry, sizeof *p2m_entry);
  29.981 +    }
  29.982 +    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
  29.983 +    next = sh2_map_domain_page(*table_mfn);
  29.984 +    sh2_unmap_domain_page(*table);
  29.985 +    *table = next;
  29.986 +
  29.987 +    return 1;
  29.988 +}
  29.989 +
  29.990 +// Returns 0 on error (out of memory)
  29.991 +int
  29.992 +shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
  29.993 +{
  29.994 +    // XXX -- this might be able to be faster iff current->domain == d
  29.995 +    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
  29.996 +    void *table = sh2_map_domain_page(table_mfn);
  29.997 +    unsigned long gfn_remainder = gfn;
  29.998 +    l1_pgentry_t *p2m_entry;
  29.999 +
 29.1000 +#if CONFIG_PAGING_LEVELS >= 4
 29.1001 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 29.1002 +                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
 29.1003 +                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
 29.1004 +        return 0;
 29.1005 +#endif
 29.1006 +#if CONFIG_PAGING_LEVELS >= 3
 29.1007 +    // When using PAE Xen, we only allow 33 bits of pseudo-physical
 29.1008 +    // address in translated guests (i.e. 8 GBytes).  This restriction
 29.1009 +    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
 29.1010 +    // in Xen's address space for translated PV guests.
 29.1011 +    //
 29.1012 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 29.1013 +                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
 29.1014 +                         (CONFIG_PAGING_LEVELS == 3
 29.1015 +                          ? 8
 29.1016 +                          : L3_PAGETABLE_ENTRIES),
 29.1017 +                         PGT_l2_page_table) )
 29.1018 +        return 0;
 29.1019 +#endif
 29.1020 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 29.1021 +                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
 29.1022 +                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
 29.1023 +        return 0;
 29.1024 +
 29.1025 +    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
 29.1026 +                               0, L1_PAGETABLE_ENTRIES);
 29.1027 +    ASSERT(p2m_entry);
 29.1028 +    if ( valid_mfn(mfn) )
 29.1029 +        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
 29.1030 +    else
 29.1031 +        *p2m_entry = l1e_empty();
 29.1032 +
 29.1033 +    /* The P2M can be shadowed: keep the shadows synced */
 29.1034 +    (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, 
 29.1035 +                                          p2m_entry, sizeof *p2m_entry);
 29.1036 +
 29.1037 +    sh2_unmap_domain_page(table);
 29.1038 +
 29.1039 +    return 1;
 29.1040 +}
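shadow2_set_p2m_entry() above peels one level of indexing off the gfn at each step: p2m_find_entry() takes the index from gfn_remainder >> shift and then keeps only the bits below the shift. The sketch below runs that decomposition for a 4-level table; the shift values are the usual x86-64 9-bits-per-level ones, stated here as an assumption rather than taken from the Xen headers.

#include <stdio.h>

/* L?_PAGETABLE_SHIFT - PAGE_SHIFT for a 4-level, 4K-page layout (assumed). */
static const int shifts[4] = { 27, 18, 9, 0 };

int main(void)
{
    unsigned long gfn = 0x12345678UL, remainder = gfn;
    int i;

    for (i = 0; i < 4; i++)
    {
        unsigned long index = remainder >> shifts[i];   /* as in p2m_find_entry() */
        remainder &= (1UL << shifts[i]) - 1;
        printf("level %d index %#lx\n", 4 - i, index);
    }
    return 0;
}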
 29.1041 +
 29.1042 +// Allocate a new p2m table for a domain.
 29.1043 +//
 29.1044 +// The structure of the p2m table is that of a pagetable for xen (i.e. it is
 29.1045 +// controlled by CONFIG_PAGING_LEVELS).
 29.1046 +//
 29.1047 +// Returns 0 if p2m table could not be initialized
 29.1048 +//
 29.1049 +static int
 29.1050 +shadow2_alloc_p2m_table(struct domain *d)
 29.1051 +{
 29.1052 +    mfn_t p2m_top;
 29.1053 +    struct list_head *entry;
 29.1054 +    unsigned int page_count = 0;
 29.1055 +    
 29.1056 +    SHADOW2_PRINTK("allocating p2m table\n");
 29.1057 +    ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
 29.1058 +
  29.1059 +    p2m_top = shadow2_alloc_p2m_page(d);
  29.1060 +    if ( mfn_x(p2m_top) == 0 )
  29.1061 +        return 0;
  29.1062 +
  29.1063 +    mfn_to_page(p2m_top)->count_info = 1;
  29.1064 +    mfn_to_page(p2m_top)->u.inuse.type_info = 
  29.1065 +#if CONFIG_PAGING_LEVELS == 4
  29.1066 +        PGT_l4_page_table
  29.1067 +#elif CONFIG_PAGING_LEVELS == 3
  29.1068 +        PGT_l3_page_table
  29.1069 +#elif CONFIG_PAGING_LEVELS == 2
  29.1070 +        PGT_l2_page_table
  29.1071 +#endif
  29.1072 +        | 1 | PGT_validated;
 29.1073 +
 29.1074 +    d->arch.phys_table = pagetable_from_mfn(p2m_top);
 29.1075 +
 29.1076 +    SHADOW2_PRINTK("populating p2m table\n");
 29.1077 + 
 29.1078 +    for ( entry = d->page_list.next;
 29.1079 +          entry != &d->page_list;
 29.1080 +          entry = entry->next )
 29.1081 +    {
 29.1082 +        struct page_info *page = list_entry(entry, struct page_info, list);
 29.1083 +        mfn_t mfn = page_to_mfn(page);
 29.1084 +        unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
 29.1085 +        page_count++;
 29.1086 +        if (
 29.1087 +#ifdef __x86_64__
 29.1088 +            (gfn != 0x5555555555555555L)
 29.1089 +#else
 29.1090 +            (gfn != 0x55555555L)
 29.1091 +#endif
 29.1092 +             && gfn != INVALID_M2P_ENTRY
 29.1093 +             && !shadow2_set_p2m_entry(d, gfn, mfn) )
 29.1094 +        {
 29.1095 +            SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n",
 29.1096 +                           gfn, mfn_x(mfn));
 29.1097 +            return 0;
 29.1098 +        }
 29.1099 +    }
 29.1100 +
 29.1101 +    SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
 29.1102 +    return 1;
 29.1103 +}
 29.1104 +
 29.1105 +mfn_t
 29.1106 +sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
 29.1107 +/* Read another domain's p2m entries */
 29.1108 +{
 29.1109 +    mfn_t mfn;
 29.1110 +    unsigned long long addr = (unsigned long long)gpfn << PAGE_SHIFT;
 29.1111 +    l2_pgentry_t *l2e;
 29.1112 +    l1_pgentry_t *l1e;
 29.1113 +    
 29.1114 +    ASSERT(shadow2_mode_translate(d));
 29.1115 +    mfn = pagetable_get_mfn(d->arch.phys_table);
 29.1116 +
 29.1117 +
 29.1118 +#if CONFIG_PAGING_LEVELS > 2
 29.1119 +    if ( gpfn >= (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
 29.1120 +        /* This pfn is higher than the p2m map can hold */
 29.1121 +        return _mfn(INVALID_MFN);
 29.1122 +#endif
 29.1123 +
 29.1124 +
 29.1125 +#if CONFIG_PAGING_LEVELS >= 4
 29.1126 +    { 
 29.1127 +        l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
 29.1128 +        l4e += l4_table_offset(addr);
 29.1129 +        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
 29.1130 +        {
 29.1131 +            sh2_unmap_domain_page(l4e);
 29.1132 +            return _mfn(INVALID_MFN);
 29.1133 +        }
 29.1134 +        mfn = _mfn(l4e_get_pfn(*l4e));
 29.1135 +        sh2_unmap_domain_page(l4e);
 29.1136 +    }
 29.1137 +#endif
 29.1138 +#if CONFIG_PAGING_LEVELS >= 3
 29.1139 +    {
 29.1140 +        l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
 29.1141 +        l3e += l3_table_offset(addr);
 29.1142 +        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
 29.1143 +        {
 29.1144 +            sh2_unmap_domain_page(l3e);
 29.1145 +            return _mfn(INVALID_MFN);
 29.1146 +        }
 29.1147 +        mfn = _mfn(l3e_get_pfn(*l3e));
 29.1148 +        sh2_unmap_domain_page(l3e);
 29.1149 +    }
 29.1150 +#endif
 29.1151 +
 29.1152 +    l2e = sh2_map_domain_page(mfn);
 29.1153 +    l2e += l2_table_offset(addr);
 29.1154 +    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
 29.1155 +    {
 29.1156 +        sh2_unmap_domain_page(l2e);
 29.1157 +        return _mfn(INVALID_MFN);
 29.1158 +    }
 29.1159 +    mfn = _mfn(l2e_get_pfn(*l2e));
 29.1160 +    sh2_unmap_domain_page(l2e);
 29.1161 +
 29.1162 +    l1e = sh2_map_domain_page(mfn);
 29.1163 +    l1e += l1_table_offset(addr);
 29.1164 +    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
 29.1165 +    {
 29.1166 +        sh2_unmap_domain_page(l1e);
 29.1167 +        return _mfn(INVALID_MFN);
 29.1168 +    }
 29.1169 +    mfn = _mfn(l1e_get_pfn(*l1e));
 29.1170 +    sh2_unmap_domain_page(l1e);
 29.1171 +
 29.1172 +    return mfn;
 29.1173 +}
 29.1174 +
 29.1175 +unsigned long
 29.1176 +shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
 29.1177 +{
 29.1178 +    return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
 29.1179 +}
 29.1180 +
 29.1181 +
 29.1182 +static void shadow2_p2m_teardown(struct domain *d)
 29.1183 +/* Return all the p2m pages to Xen.
 29.1184 + * We know we don't have any extra mappings to these pages */
 29.1185 +{
 29.1186 +    struct list_head *entry, *n;
 29.1187 +    struct page_info *pg;
 29.1188 +
 29.1189 +    d->arch.phys_table = pagetable_null();
 29.1190 +
 29.1191 +    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
 29.1192 +    {
 29.1193 +        pg = list_entry(entry, struct page_info, list);
 29.1194 +        list_del(entry);
 29.1195 +        /* Should have just the one ref we gave it in alloc_p2m_page() */
 29.1196 +        if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
 29.1197 +        {
 29.1198 +            SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
 29.1199 +                           pg->count_info, pg->u.inuse.type_info);
 29.1200 +        }
 29.1201 +        ASSERT(page_get_owner(pg) == d);
 29.1202 +        /* Free should not decrement domain's total allocation, since 
 29.1203 +         * these pages were allocated without an owner. */
 29.1204 +        page_set_owner(pg, NULL); 
 29.1205 +        free_domheap_pages(pg, 0);
 29.1206 +        d->arch.shadow2_p2m_pages--;
 29.1207 +        perfc_decr(shadow2_alloc_count);
 29.1208 +    }
 29.1209 +    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
 29.1210 +    {
 29.1211 +        list_del(entry);
 29.1212 +        pg = list_entry(entry, struct page_info, list);
 29.1213 +        ASSERT(page_get_owner(pg) == d);
 29.1214 +        /* Free should not decrement domain's total allocation. */
 29.1215 +        page_set_owner(pg, NULL); 
 29.1216 +        free_domheap_pages(pg, 0);
 29.1217 +        d->arch.shadow2_p2m_pages--;
 29.1218 +        perfc_decr(shadow2_alloc_count);
 29.1219 +    }
 29.1220 +    ASSERT(d->arch.shadow2_p2m_pages == 0);
 29.1221 +}
 29.1222 +
 29.1223 +/* Set the pool of shadow pages to the required number of pages.
 29.1224 + * Input will be rounded up to at least shadow2_min_acceptable_pages(),
 29.1225 + * plus space for the p2m table.
 29.1226 + * Returns 0 for success, non-zero for failure. */
 29.1227 +static unsigned int set_sh2_allocation(struct domain *d, 
 29.1228 +                                       unsigned int pages,
 29.1229 +                                       int *preempted)
 29.1230 +{
 29.1231 +    struct page_info *pg;
 29.1232 +    unsigned int lower_bound;
 29.1233 +    int j;
 29.1234 +
 29.1235 +    ASSERT(shadow2_lock_is_acquired(d));
 29.1236 +    
 29.1237 +    /* Don't allocate less than the minimum acceptable, plus one page per
 29.1238 +     * megabyte of RAM (for the p2m table) */
 29.1239 +    lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
 29.1240 +    if ( pages > 0 && pages < lower_bound )
 29.1241 +        pages = lower_bound;
 29.1242 +    /* Round up to largest block size */
 29.1243 +    pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
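    /* (tot_pages / 256 above is one page per MB of guest RAM, since 256
     * 4kB pages make up a megabyte; the rounding here is the usual
     * add-then-mask idiom, so the total is a whole number of
     * 2^SHADOW2_MAX_ORDER-page blocks, matching the allocation size
     * used below.) */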
 29.1244 +
 29.1245 +    SHADOW2_PRINTK("current %i target %i\n", 
 29.1246 +                   d->arch.shadow2_total_pages, pages);
 29.1247 +
 29.1248 +    while ( d->arch.shadow2_total_pages != pages ) 
 29.1249 +    {
 29.1250 +        if ( d->arch.shadow2_total_pages < pages ) 
 29.1251 +        {
 29.1252 +            /* Need to allocate more memory from domheap */
 29.1253 +            pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); 
 29.1254 +            if ( pg == NULL ) 
 29.1255 +            { 
 29.1256 +                SHADOW2_PRINTK("failed to allocate shadow pages.\n");
 29.1257 +                return -ENOMEM;
 29.1258 +            }
 29.1259 +            d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
 29.1260 +            d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
 29.1261 +            for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ ) 
 29.1262 +            {
 29.1263 +                pg[j].u.inuse.type_info = 0;  /* Free page */
 29.1264 +                pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
 29.1265 +            }
 29.1266 +            SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
 29.1267 +            list_add_tail(&pg->list, 
 29.1268 +                          &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
 29.1269 +        } 
 29.1270 +        else if ( d->arch.shadow2_total_pages > pages ) 
 29.1271 +        {
 29.1272 +            /* Need to return memory to domheap */
 29.1273 +            shadow2_prealloc(d, SHADOW2_MAX_ORDER);
 29.1274 +            ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
 29.1275 +            pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next, 
 29.1276 +                            struct page_info, list);
 29.1277 +            list_del(&pg->list);
 29.1278 +            d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
 29.1279 +            d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
 29.1280 +            free_domheap_pages(pg, SHADOW2_MAX_ORDER);
 29.1281 +        }
 29.1282 +
 29.1283 +        /* Check to see if we need to yield and try again */
 29.1284 +        if ( preempted && hypercall_preempt_check() )
 29.1285 +        {
 29.1286 +            *preempted = 1;
 29.1287 +            return 0;
 29.1288 +        }
 29.1289 +    }
 29.1290 +
 29.1291 +    return 0;
 29.1292 +}
 29.1293 +
 29.1294 +unsigned int shadow2_set_allocation(struct domain *d, 
 29.1295 +                                    unsigned int megabytes,
 29.1296 +                                    int *preempted)
 29.1297 +/* Hypercall interface to set the shadow memory allocation */
 29.1298 +{
 29.1299 +    unsigned int rv;
 29.1300 +    shadow2_lock(d);
 29.1301 +    rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); 
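    /* (megabytes << (20 - PAGE_SHIFT) converts MB to pages: with 4kB
     * pages this is megabytes << 8, i.e. 256 pages per MB.) */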
 29.1302 +    SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
 29.1303 +                   d->domain_id,
 29.1304 +                   d->arch.shadow2_total_pages,
 29.1305 +                   shadow2_get_allocation(d));
 29.1306 +    shadow2_unlock(d);
 29.1307 +    return rv;
 29.1308 +}
 29.1309 +
 29.1310 +/**************************************************************************/
 29.1311 +/* Hash table for storing the guest->shadow mappings */
 29.1312 +
 29.1313 +/* Hash function that takes a gfn or mfn, plus another byte of type info */
 29.1314 +typedef u32 key_t;
 29.1315 +static inline key_t sh2_hash(unsigned long n, u8 t) 
 29.1316 +{
 29.1317 +    unsigned char *p = (unsigned char *)&n;
 29.1318 +    key_t k = t;
 29.1319 +    int i;
 29.1320 +    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
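    /* Each step computes k = (k<<6) + (k<<16) - k + p[i], i.e.
     * k = k*65599 + p[i]: the same multiplier as the classic sdbm
     * string hash. */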
 29.1321 +    return k;
 29.1322 +}
 29.1323 +
 29.1324 +#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
 29.1325 +
 29.1326 +/* Before we get to the mechanism, define a pair of audit functions
 29.1327 + * that sanity-check the contents of the hash table. */
 29.1328 +static void sh2_hash_audit_bucket(struct domain *d, int bucket)
 29.1329 +/* Audit one bucket of the hash table */
 29.1330 +{
 29.1331 +    struct shadow2_hash_entry *e, *x;
 29.1332 +    struct page_info *pg;
 29.1333 +
 29.1334 +    if ( !(SHADOW2_AUDIT_ENABLE) )
 29.1335 +        return;
 29.1336 +
 29.1337 +    e = &d->arch.shadow2_hash_table[bucket];
 29.1338 +    if ( e->t == 0 ) return; /* Bucket is empty */ 
 29.1339 +    while ( e )
 29.1340 +    {
 29.1341 +        /* Empty link? */
 29.1342 +        BUG_ON( e->t == 0 ); 
 29.1343 +        /* Bogus type? */
 29.1344 +        BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
 29.1345 +        /* Wrong bucket? */
 29.1346 +        BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); 
 29.1347 +        /* Duplicate entry? */
 29.1348 +        for ( x = e->next; x; x = x->next )
 29.1349 +            BUG_ON( x->n == e->n && x->t == e->t );
 29.1350 +        /* Bogus MFN? */
 29.1351 +        BUG_ON( !valid_mfn(e->smfn) );
 29.1352 +        pg = mfn_to_page(e->smfn);
 29.1353 +        /* Not a shadow? */
 29.1354 +        BUG_ON( page_get_owner(pg) != 0 );
 29.1355 +        /* Wrong kind of shadow? */
 29.1356 +        BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift 
 29.1357 +                != e->t ); 
 29.1358 +        /* Bad backlink? */
 29.1359 +        BUG_ON( pg->u.inuse.type_info != e->n );
 29.1360 +        if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
 29.1361 +             && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
 29.1362 +             && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
 29.1363 +        {
 29.1364 +            /* Bad shadow flags on guest page? */
 29.1365 +            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
 29.1366 +        }
 29.1367 +        /* That entry was OK; on we go */
 29.1368 +        e = e->next;
 29.1369 +    }
 29.1370 +}
 29.1371 +
 29.1372 +#else
 29.1373 +#define sh2_hash_audit_bucket(_d, _b)
 29.1374 +#endif /* Hashtable bucket audit */
 29.1375 +
 29.1376 +
 29.1377 +#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
 29.1378 +
 29.1379 +static void sh2_hash_audit(struct domain *d)
 29.1380 +/* Full audit: audit every bucket in the table */
 29.1381 +{
 29.1382 +    int i;
 29.1383 +
 29.1384 +    if ( !(SHADOW2_AUDIT_ENABLE) )
 29.1385 +        return;
 29.1386 +
 29.1387 +    for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) 
 29.1388 +    {
 29.1389 +        sh2_hash_audit_bucket(d, i);
 29.1390 +    }
 29.1391 +}
 29.1392 +
 29.1393 +#else
 29.1394 +#define sh2_hash_audit(_d)
 29.1395 +#endif /* Hashtable bucket audit */
 29.1396 +
 29.1397 +/* Memory management interface for bucket allocation.
 29.1398 + * These ought to come out of shadow memory, but at least on 32-bit
 29.1399 + * machines we are forced to allocate them from xenheap so that we can
 29.1400 + * address them. */
 29.1401 +static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
 29.1402 +{
 29.1403 +    struct shadow2_hash_entry *extra, *x;
 29.1404 +    int i;
 29.1405 +
 29.1406 +    /* We need to allocate a new node. Ensure the free list is not empty. 
 29.1407 +     * Allocate new entries in units the same size as the original table. */
 29.1408 +    if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
 29.1409 +    {
 29.1410 +        size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
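        /* (Block layout: SHADOW2_HASH_BUCKETS hash entries followed by a
         * single pointer slot, used below to chain this block onto
         * d->arch.shadow2_hash_allocations so it can be freed in
         * shadow2_hash_teardown().) */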
 29.1411 +        extra = xmalloc_bytes(sz);
 29.1412 +
 29.1413 +        if ( extra == NULL )
 29.1414 +        {
 29.1415 +            /* No memory left! */
 29.1416 +            SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
 29.1417 +            domain_crash_synchronous();
 29.1418 +        }
 29.1419 +        memset(extra, 0, sz);
 29.1420 +
 29.1421 +        /* Record the allocation block so it can be correctly freed later. */
 29.1422 +        *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = 
 29.1423 +            d->arch.shadow2_hash_allocations;
 29.1424 +        d->arch.shadow2_hash_allocations = &extra[0];
 29.1425 +
 29.1426 +        /* Thread a free chain through the newly-allocated nodes. */
 29.1427 +        for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
 29.1428 +            extra[i].next = &extra[i+1];
 29.1429 +        extra[i].next = NULL;
 29.1430 +
 29.1431 +        /* Add the new nodes to the free list. */
 29.1432 +        d->arch.shadow2_hash_freelist = &extra[0];
 29.1433 +    }
 29.1434 +
 29.1435 +    /* Allocate a new node from the free list. */
 29.1436 +    x = d->arch.shadow2_hash_freelist;
 29.1437 +    d->arch.shadow2_hash_freelist = x->next;
 29.1438 +    return x;
 29.1439 +}
 29.1440 +
 29.1441 +static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
 29.1442 +{
 29.1443 +    /* Mark the bucket as empty and return it to the free list */
 29.1444 +    e->t = 0; 
 29.1445 +    e->next = d->arch.shadow2_hash_freelist;
 29.1446 +    d->arch.shadow2_hash_freelist = e;
 29.1447 +}
 29.1448 +
 29.1449 +
 29.1450 +/* Allocate and initialise the table itself.  
 29.1451 + * Returns 0 for success, 1 for error. */
 29.1452 +static int shadow2_hash_alloc(struct domain *d)
 29.1453 +{
 29.1454 +    struct shadow2_hash_entry *table;
 29.1455 +
 29.1456 +    ASSERT(shadow2_lock_is_acquired(d));
 29.1457 +    ASSERT(!d->arch.shadow2_hash_table);
 29.1458 +
 29.1459 +    table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
 29.1460 +    if ( !table ) return 1;
 29.1461 +    memset(table, 0, 
 29.1462 +           SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
 29.1463 +    d->arch.shadow2_hash_table = table;
 29.1464 +    return 0;
 29.1465 +}
 29.1466 +
 29.1467 +/* Tear down the hash table and return all memory to Xen.
 29.1468 + * This function does not care whether the table is populated. */
 29.1469 +static void shadow2_hash_teardown(struct domain *d)
 29.1470 +{
 29.1471 +    struct shadow2_hash_entry *a, *n;
 29.1472 +
 29.1473 +    ASSERT(shadow2_lock_is_acquired(d));
 29.1474 +    ASSERT(d->arch.shadow2_hash_table);
 29.1475 +
 29.1476 +    /* Return the table itself */
 29.1477 +    xfree(d->arch.shadow2_hash_table);
 29.1478 +    d->arch.shadow2_hash_table = NULL;
 29.1479 +
 29.1480 +    /* Return any extra allocations */
 29.1481 +    a = d->arch.shadow2_hash_allocations;
 29.1482 +    while ( a ) 
 29.1483 +    {
 29.1484 +        /* We stored a linked-list pointer at the end of each allocation */
 29.1485 +        n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
 29.1486 +        xfree(a);
 29.1487 +        a = n;
 29.1488 +    }
 29.1489 +    d->arch.shadow2_hash_allocations = NULL;
 29.1490 +    d->arch.shadow2_hash_freelist = NULL;
 29.1491 +}
 29.1492 +
 29.1493 +
 29.1494 +mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
 29.1495 +/* Find an entry in the hash table.  Returns the MFN of the shadow,
 29.1496 + * or INVALID_MFN if it doesn't exist */
 29.1497 +{
 29.1498 +    struct domain *d = v->domain;
 29.1499 +    struct shadow2_hash_entry *p, *x, *head;
 29.1500 +    key_t key;
 29.1501 +
 29.1502 +    ASSERT(shadow2_lock_is_acquired(d));
 29.1503 +    ASSERT(d->arch.shadow2_hash_table);
 29.1504 +    ASSERT(t);
 29.1505 +
 29.1506 +    sh2_hash_audit(d);
 29.1507 +
 29.1508 +    perfc_incrc(shadow2_hash_lookups);
 29.1509 +    key = sh2_hash(n, t);
 29.1510 +
 29.1511 +    x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
 29.1512 +    p = NULL;
 29.1513 +
 29.1514 +    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
 29.1515 +
 29.1516 +    do
 29.1517 +    {
 29.1518 +        ASSERT(x->t || ((x == head) && (x->next == NULL)));
 29.1519 +
 29.1520 +        if ( x->n == n && x->t == t )
 29.1521 +        {
 29.1522 +            /* Pull-to-front if 'x' isn't already the head item */
 29.1523 +            if ( unlikely(x != head) )
 29.1524 +            {
 29.1525 +                if ( unlikely(d->arch.shadow2_hash_walking != 0) )
 29.1526 +                    /* Can't reorder: someone is walking the hash chains */
 29.1527 +                    return x->smfn;
 29.1528 +                else 
 29.1529 +                {
 29.1530 +                    /* Delete 'x' from list and reinsert after head. */
 29.1531 +                    p->next = x->next;
 29.1532 +                    x->next = head->next;
 29.1533 +                    head->next = x;
 29.1534 +                    
 29.1535 +                    /* Swap 'x' contents with head contents. */
 29.1536 +                    SWAP(head->n, x->n);
 29.1537 +                    SWAP(head->t, x->t);
 29.1538 +                    SWAP(head->smfn, x->smfn);
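                    /* (The head entry is embedded in the hash array
                     * itself and cannot be relinked, so swapping
                     * contents gives the same move-to-front effect.) */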
 29.1539 +                }
 29.1540 +            }
 29.1541 +            else
 29.1542 +            {
 29.1543 +                perfc_incrc(shadow2_hash_lookup_head);
 29.1544 +            }
 29.1545 +            return head->smfn;
 29.1546 +        }
 29.1547 +
 29.1548 +        p = x;
 29.1549 +        x = x->next;
 29.1550 +    }
 29.1551 +    while ( x != NULL );
 29.1552 +
 29.1553 +    perfc_incrc(shadow2_hash_lookup_miss);
 29.1554 +    return _mfn(INVALID_MFN);
 29.1555 +}
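/* A minimal usage sketch of the lookup/insert pair (hypothetical caller;
 * sh2_make_shadow() is an invented placeholder, not a function defined
 * here):
 *
 *     mfn_t smfn = shadow2_hash_lookup(v, gfn, shadow_type);
 *     if ( mfn_x(smfn) == INVALID_MFN )
 *     {
 *         smfn = sh2_make_shadow(v, gfn, shadow_type);
 *         shadow2_hash_insert(v, gfn, shadow_type, smfn);
 *     }
 */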
 29.1556 +
 29.1557 +void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
 29.1558 +/* Put a mapping (n,t)->smfn into the hash table */
 29.1559 +{
 29.1560 +    struct domain *d = v->domain;
 29.1561 +    struct shadow2_hash_entry *x, *head;
 29.1562 +    key_t key;
 2