changeset: 11260:fab84f9c0ce6

[XEN] Rename shadow2 to shadow and move the various source
files into a sensible directory hierarchy.
Signed-off-by: Keir Fraser <keir@xensource.com>

author:   kaf24@localhost.localdomain
date:     Mon Aug 28 12:09:36 2006 +0100 (2006-08-28)
parents:  5b9ff5e8653a
children: b61b7478b324
files:
    tools/libxc/xc_hvm_build.c
    xen/arch/x86/Makefile
    xen/arch/x86/domain.c
    xen/arch/x86/domain_build.c
    xen/arch/x86/domctl.c
    xen/arch/x86/hvm/hvm.c
    xen/arch/x86/hvm/platform.c
    xen/arch/x86/hvm/svm/svm.c
    xen/arch/x86/hvm/vmx/vmcs.c
    xen/arch/x86/hvm/vmx/vmx.c
    xen/arch/x86/mm.c
    xen/arch/x86/mm/Makefile
    xen/arch/x86/mm/shadow/Makefile
    xen/arch/x86/mm/shadow/common.c
    xen/arch/x86/mm/shadow/multi.c
    xen/arch/x86/mm/shadow/multi.h
    xen/arch/x86/mm/shadow/page-guest32.h
    xen/arch/x86/mm/shadow/private.h
    xen/arch/x86/mm/shadow/types.h
    xen/arch/x86/shadow2-common.c
    xen/arch/x86/shadow2.c
    xen/arch/x86/traps.c
    xen/include/asm-x86/domain.h
    xen/include/asm-x86/mm.h
    xen/include/asm-x86/page-guest32.h
    xen/include/asm-x86/perfc_defn.h
    xen/include/asm-x86/shadow.h
    xen/include/asm-x86/shadow2-multi.h
    xen/include/asm-x86/shadow2-private.h
    xen/include/asm-x86/shadow2-types.h
    xen/include/asm-x86/shadow2.h
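
The file list implies the following source moves. Only the creation of the new files under xen/arch/x86/mm/ and the switch of every caller from the shadow2_* names to shadow_* appear in the hunks below, so the exact old-to-new mapping is inferred from the file names rather than shown directly:

    xen/arch/x86/shadow2-common.c         -> xen/arch/x86/mm/shadow/common.c
    xen/arch/x86/shadow2.c                -> xen/arch/x86/mm/shadow/multi.c
    xen/include/asm-x86/shadow2-multi.h   -> xen/arch/x86/mm/shadow/multi.h
    xen/include/asm-x86/shadow2-private.h -> xen/arch/x86/mm/shadow/private.h
    xen/include/asm-x86/shadow2-types.h   -> xen/arch/x86/mm/shadow/types.h
    xen/include/asm-x86/page-guest32.h    -> xen/arch/x86/mm/shadow/page-guest32.h
    xen/include/asm-x86/shadow2.h         -> xen/include/asm-x86/shadow.h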
line diff
     1.1 --- a/tools/libxc/xc_hvm_build.c	Sun Aug 27 06:56:01 2006 +0100
     1.2 +++ b/tools/libxc/xc_hvm_build.c	Mon Aug 28 12:09:36 2006 +0100
     1.3 @@ -441,7 +441,7 @@ static int xc_hvm_build_internal(int xc_
     1.4          goto error_out;
     1.5      }
     1.6  
     1.7 -    /* HVM domains must be put into shadow2 mode at the start of day */
     1.8 +    /* HVM domains must be put into shadow mode at the start of day */
     1.9      if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_ENABLE,
    1.10                             NULL, 0, NULL, 
    1.11                             XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT  |
     2.1 --- a/xen/arch/x86/Makefile	Sun Aug 27 06:56:01 2006 +0100
     2.2 +++ b/xen/arch/x86/Makefile	Mon Aug 28 12:09:36 2006 +0100
     2.3 @@ -2,6 +2,7 @@ subdir-y += acpi
     2.4  subdir-y += cpu
     2.5  subdir-y += genapic
     2.6  subdir-y += hvm
     2.7 +subdir-y += mm
     2.8  subdir-y += oprofile
     2.9  
    2.10  subdir-$(x86_32) += x86_32
    2.11 @@ -41,23 +42,6 @@ obj-y += traps.o
    2.12  obj-y += usercopy.o
    2.13  obj-y += x86_emulate.o
    2.14  
    2.15 -ifneq ($(pae),n)
    2.16 -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
    2.17 -else
    2.18 -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
    2.19 -endif
    2.20 -
    2.21 -obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
    2.22 -                 shadow2_g2_on_s3.o
    2.23 -
    2.24 -guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
    2.25 -shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
    2.26 -shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
    2.27 -                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
    2.28 -
    2.29 -shadow2_%.o: shadow2.c $(HDRS) Makefile
    2.30 -	$(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
    2.31 -
    2.32  obj-$(crash_debug) += gdbstub.o
    2.33  
    2.34  $(TARGET): $(TARGET)-syms boot/mkelf32
    2.35 @@ -86,9 +70,6 @@ xen.lds: $(TARGET_SUBARCH)/xen.lds.S $(H
    2.36  boot/mkelf32: boot/mkelf32.c
    2.37  	$(HOSTCC) $(HOSTCFLAGS) -o $@ $<
    2.38  
    2.39 -shadow_guest32.o: shadow.c
    2.40 -shadow_guest32pae.o: shadow.c
    2.41 -
    2.42  .PHONY: clean
    2.43  clean::
    2.44  	rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32
     3.1 --- a/xen/arch/x86/domain.c	Sun Aug 27 06:56:01 2006 +0100
     3.2 +++ b/xen/arch/x86/domain.c	Mon Aug 28 12:09:36 2006 +0100
     3.3 @@ -200,12 +200,12 @@ int arch_domain_create(struct domain *d)
     3.4  
     3.5  #endif /* __x86_64__ */
     3.6  
     3.7 -    shadow2_lock_init(d);
     3.8 -    for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
     3.9 -        INIT_LIST_HEAD(&d->arch.shadow2.freelists[i]);
    3.10 -    INIT_LIST_HEAD(&d->arch.shadow2.p2m_freelist);
    3.11 -    INIT_LIST_HEAD(&d->arch.shadow2.p2m_inuse);
    3.12 -    INIT_LIST_HEAD(&d->arch.shadow2.toplevel_shadows);
    3.13 +    shadow_lock_init(d);
    3.14 +    for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
    3.15 +        INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
    3.16 +    INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
    3.17 +    INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
    3.18 +    INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows);
    3.19  
    3.20      if ( !is_idle_domain(d) )
    3.21      {
    3.22 @@ -236,7 +236,7 @@ int arch_domain_create(struct domain *d)
    3.23  
    3.24  void arch_domain_destroy(struct domain *d)
    3.25  {
    3.26 -    shadow2_final_teardown(d);
    3.27 +    shadow_final_teardown(d);
    3.28  
    3.29      free_xenheap_pages(
    3.30          d->arch.mm_perdomain_pt,
    3.31 @@ -342,10 +342,10 @@ int arch_set_info_guest(
    3.32          }
    3.33      }    
    3.34  
    3.35 -    /* Shadow2: make sure the domain has enough shadow memory to
    3.36 +    /* Shadow: make sure the domain has enough shadow memory to
    3.37       * boot another vcpu */
    3.38 -    if ( shadow2_mode_enabled(d) 
    3.39 -         && d->arch.shadow2.total_pages < shadow2_min_acceptable_pages(d) )
    3.40 +    if ( shadow_mode_enabled(d) 
    3.41 +         && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
    3.42      {
    3.43          destroy_gdt(v);
    3.44          return -ENOMEM;
    3.45 @@ -357,8 +357,8 @@ int arch_set_info_guest(
    3.46      /* Don't redo final setup */
    3.47      set_bit(_VCPUF_initialised, &v->vcpu_flags);
    3.48  
    3.49 -    if ( shadow2_mode_enabled(d) )
    3.50 -        shadow2_update_paging_modes(v);
    3.51 +    if ( shadow_mode_enabled(d) )
    3.52 +        shadow_update_paging_modes(v);
    3.53  
    3.54      update_cr3(v);
    3.55  
    3.56 @@ -936,11 +936,11 @@ void domain_relinquish_resources(struct 
    3.57      for_each_vcpu ( d, v )
    3.58      {
    3.59          /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
    3.60 -         * or sh2_update_paging_modes()) */
    3.61 +         * or sh_update_paging_modes()) */
    3.62          pfn = pagetable_get_pfn(v->arch.guest_table);
    3.63          if ( pfn != 0 )
    3.64          {
    3.65 -            if ( shadow2_mode_refcounts(d) )
    3.66 +            if ( shadow_mode_refcounts(d) )
    3.67                  put_page(mfn_to_page(pfn));
    3.68              else
    3.69                  put_page_and_type(mfn_to_page(pfn));
    3.70 @@ -962,7 +962,7 @@ void domain_relinquish_resources(struct 
    3.71          hvm_relinquish_guest_resources(d);
    3.72  
    3.73      /* Tear down shadow mode stuff. */
    3.74 -    shadow2_teardown(d);
    3.75 +    shadow_teardown(d);
    3.76  
    3.77      /*
    3.78       * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
    3.79 @@ -981,18 +981,18 @@ void domain_relinquish_resources(struct 
    3.80  
    3.81  void arch_dump_domain_info(struct domain *d)
    3.82  {
    3.83 -    if ( shadow2_mode_enabled(d) )
    3.84 +    if ( shadow_mode_enabled(d) )
    3.85      {
    3.86 -        printk("    shadow2 mode: ");
    3.87 -        if ( d->arch.shadow2.mode & SHM2_enable )
    3.88 +        printk("    shadow mode: ");
    3.89 +        if ( d->arch.shadow.mode & SHM2_enable )
    3.90              printk("enabled ");
    3.91 -        if ( shadow2_mode_refcounts(d) )
    3.92 +        if ( shadow_mode_refcounts(d) )
    3.93              printk("refcounts ");
    3.94 -        if ( shadow2_mode_log_dirty(d) )
    3.95 +        if ( shadow_mode_log_dirty(d) )
    3.96              printk("log_dirty ");
    3.97 -        if ( shadow2_mode_translate(d) )
    3.98 +        if ( shadow_mode_translate(d) )
    3.99              printk("translate ");
   3.100 -        if ( shadow2_mode_external(d) )
   3.101 +        if ( shadow_mode_external(d) )
   3.102              printk("external ");
   3.103          printk("\n");
   3.104      }
     4.1 --- a/xen/arch/x86/domain_build.c	Sun Aug 27 06:56:01 2006 +0100
     4.2 +++ b/xen/arch/x86/domain_build.c	Mon Aug 28 12:09:36 2006 +0100
     4.3 @@ -679,8 +679,8 @@ int construct_dom0(struct domain *d,
     4.4          (void)alloc_vcpu(d, i, i);
     4.5  
     4.6      /* Set up CR3 value for write_ptbase */
     4.7 -    if ( shadow2_mode_enabled(v->domain) )
     4.8 -        shadow2_update_paging_modes(v);
     4.9 +    if ( shadow_mode_enabled(v->domain) )
    4.10 +        shadow_update_paging_modes(v);
    4.11      else
    4.12          update_cr3(v);
    4.13  
    4.14 @@ -791,8 +791,8 @@ int construct_dom0(struct domain *d,
    4.15      new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
    4.16  
    4.17      if ( opt_dom0_shadow )
    4.18 -        if ( shadow2_test_enable(d) == 0 ) 
    4.19 -            shadow2_update_paging_modes(v);
    4.20 +        if ( shadow_test_enable(d) == 0 ) 
    4.21 +            shadow_update_paging_modes(v);
    4.22  
    4.23      if ( supervisor_mode_kernel )
    4.24      {
     5.1 --- a/xen/arch/x86/domctl.c	Sun Aug 27 06:56:01 2006 +0100
     5.2 +++ b/xen/arch/x86/domctl.c	Mon Aug 28 12:09:36 2006 +0100
     5.3 @@ -39,7 +39,7 @@ long arch_do_domctl(
     5.4          d = find_domain_by_id(domctl->domain);
     5.5          if ( d != NULL )
     5.6          {
     5.7 -            ret = shadow2_domctl(d, &domctl->u.shadow_op, u_domctl);
     5.8 +            ret = shadow_domctl(d, &domctl->u.shadow_op, u_domctl);
     5.9              put_domain(d);
    5.10              copy_to_guest(u_domctl, domctl, 1);
    5.11          } 
     6.1 --- a/xen/arch/x86/hvm/hvm.c	Sun Aug 27 06:56:01 2006 +0100
     6.2 +++ b/xen/arch/x86/hvm/hvm.c	Mon Aug 28 12:09:36 2006 +0100
     6.3 @@ -384,8 +384,8 @@ int hvm_copy(void *buf, unsigned long va
     6.4          if (count > size)
     6.5              count = size;
     6.6  
     6.7 -        gfn = shadow2_gva_to_gfn(v, vaddr);
     6.8 -        mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
     6.9 +        gfn = shadow_gva_to_gfn(v, vaddr);
    6.10 +        mfn = mfn_x(sh_vcpu_gfn_to_mfn(v, gfn));
    6.11  
    6.12          if (mfn == INVALID_MFN)
    6.13              return 0;
    6.14 @@ -539,7 +539,7 @@ void hvm_do_hypercall(struct cpu_user_re
    6.15          return;
    6.16      }
    6.17  
    6.18 -    if ( current->arch.shadow2.mode->guest_levels == 4 )
    6.19 +    if ( current->arch.shadow.mode->guest_levels == 4 )
    6.20      {
    6.21          pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
    6.22                                                         pregs->rsi,
     7.1 --- a/xen/arch/x86/hvm/platform.c	Sun Aug 27 06:56:01 2006 +0100
     7.2 +++ b/xen/arch/x86/hvm/platform.c	Mon Aug 28 12:09:36 2006 +0100
     7.3 @@ -721,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
     7.4  
     7.5      if (pvalid) {
     7.6          if (hvm_paging_enabled(current))
     7.7 -            p->u.data = shadow2_gva_to_gpa(current, value);
     7.8 +            p->u.data = shadow_gva_to_gpa(current, value);
     7.9          else
    7.10              p->u.pdata = (void *) value; /* guest VA == guest PA */
    7.11      } else
    7.12 @@ -771,7 +771,7 @@ void send_mmio_req(
    7.13  
    7.14      if (pvalid) {
    7.15          if (hvm_paging_enabled(v))
    7.16 -            p->u.data = shadow2_gva_to_gpa(v, value);
    7.17 +            p->u.data = shadow_gva_to_gpa(v, value);
    7.18          else
    7.19              p->u.pdata = (void *) value; /* guest VA == guest PA */
    7.20      } else
     8.1 --- a/xen/arch/x86/hvm/svm/svm.c	Sun Aug 27 06:56:01 2006 +0100
     8.2 +++ b/xen/arch/x86/hvm/svm/svm.c	Mon Aug 28 12:09:36 2006 +0100
     8.3 @@ -29,7 +29,7 @@
     8.4  #include <xen/domain_page.h>
     8.5  #include <asm/current.h>
     8.6  #include <asm/io.h>
     8.7 -#include <asm/shadow2.h>
     8.8 +#include <asm/shadow.h>
     8.9  #include <asm/regs.h>
    8.10  #include <asm/cpufeature.h>
    8.11  #include <asm/processor.h>
    8.12 @@ -746,10 +746,10 @@ static void svm_final_setup_guest(struct
    8.13      if ( v != d->vcpu[0] )
    8.14          return;
    8.15  
    8.16 -    if ( !shadow2_mode_external(d) )
    8.17 +    if ( !shadow_mode_external(d) )
    8.18      {
    8.19          DPRINTK("Can't init HVM for dom %u vcpu %u: "
    8.20 -                "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
    8.21 +                "not in shadow external mode\n", d->domain_id, v->vcpu_id);
    8.22          domain_crash(d);
    8.23      }
    8.24  
    8.25 @@ -914,7 +914,7 @@ static int svm_do_page_fault(unsigned lo
    8.26                  va, eip, (unsigned long)regs->error_code);
    8.27  //#endif
    8.28  
    8.29 -    result = shadow2_fault(va, regs); 
    8.30 +    result = shadow_fault(va, regs); 
    8.31  
    8.32      if( result ) {
    8.33          /* Let's make sure that the Guest TLB is flushed */
    8.34 @@ -1562,7 +1562,7 @@ static int svm_set_cr0(unsigned long val
    8.35          v->arch.guest_table = pagetable_from_pfn(mfn);
    8.36          if ( old_base_mfn )
    8.37              put_page(mfn_to_page(old_base_mfn));
    8.38 -        shadow2_update_paging_modes(v);
    8.39 +        shadow_update_paging_modes(v);
    8.40  
    8.41          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
    8.42                      (unsigned long) (mfn << PAGE_SHIFT));
    8.43 @@ -1588,14 +1588,14 @@ static int svm_set_cr0(unsigned long val
    8.44              svm_inject_exception(v, TRAP_gp_fault, 1, 0);
    8.45              return 0;
    8.46          }
    8.47 -        shadow2_update_paging_modes(v);
    8.48 +        shadow_update_paging_modes(v);
    8.49          vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
    8.50          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
    8.51      }
    8.52      else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
    8.53      {
    8.54          /* we should take care of this kind of situation */
    8.55 -        shadow2_update_paging_modes(v);
    8.56 +        shadow_update_paging_modes(v);
    8.57          vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
    8.58          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
    8.59      }
    8.60 @@ -1706,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, 
    8.61              mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
    8.62              if (mfn != pagetable_get_pfn(v->arch.guest_table))
    8.63                  __hvm_bug(regs);
    8.64 -            shadow2_update_cr3(v);
    8.65 +            shadow_update_cr3(v);
    8.66          }
    8.67          else 
    8.68          {
    8.69 @@ -1771,7 +1771,7 @@ static int mov_to_cr(int gpreg, int cr, 
    8.70                  v->arch.guest_table = pagetable_from_pfn(mfn);
    8.71                  if ( old_base_mfn )
    8.72                      put_page(mfn_to_page(old_base_mfn));
    8.73 -                shadow2_update_paging_modes(v);
    8.74 +                shadow_update_paging_modes(v);
    8.75  
    8.76                  HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
    8.77                              (unsigned long) (mfn << PAGE_SHIFT));
    8.78 @@ -1808,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, 
    8.79          if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
    8.80          {
    8.81              set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
    8.82 -            shadow2_update_paging_modes(v);
    8.83 +            shadow_update_paging_modes(v);
    8.84          }
    8.85          break;
    8.86      }
    8.87 @@ -2149,7 +2149,7 @@ void svm_handle_invlpg(const short invlp
    8.88  
     8.89      /* Overkill, we may not need this */
    8.90      set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
    8.91 -    shadow2_invlpg(v, g_vaddr);
    8.92 +    shadow_invlpg(v, g_vaddr);
    8.93  }
    8.94  
    8.95  
    8.96 @@ -2520,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned l
    8.97      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    8.98      unsigned long gpa;
    8.99  
   8.100 -    gpa = shadow2_gva_to_gpa(current, gva);
   8.101 +    gpa = shadow_gva_to_gpa(current, gva);
   8.102      printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
   8.103      if( !svm_paging_enabled(v) || mmio_space(gpa) )
   8.104          return;
   8.105 @@ -2591,7 +2591,7 @@ asmlinkage void svm_vmexit_handler(struc
   8.106          if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) 
   8.107          {
   8.108              if (svm_paging_enabled(v) && 
   8.109 -                !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
   8.110 +                !mmio_space(shadow_gva_to_gpa(current, vmcb->exitinfo2)))
   8.111              {
   8.112                  printk("I%08ld,ExC=%s(%d),IP=%x:%llx,"
   8.113                         "I1=%llx,I2=%llx,INT=%llx, "
   8.114 @@ -2601,7 +2601,7 @@ asmlinkage void svm_vmexit_handler(struc
   8.115                         (unsigned long long) vmcb->exitinfo1,
   8.116                         (unsigned long long) vmcb->exitinfo2,
   8.117                         (unsigned long long) vmcb->exitintinfo.bytes,
   8.118 -                       (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
   8.119 +                       (unsigned long long) shadow_gva_to_gpa(current, vmcb->exitinfo2));
   8.120              }
   8.121              else 
   8.122              {
     9.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c	Sun Aug 27 06:56:01 2006 +0100
     9.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c	Mon Aug 28 12:09:36 2006 +0100
     9.3 @@ -35,7 +35,7 @@
     9.4  #include <xen/event.h>
     9.5  #include <xen/kernel.h>
     9.6  #include <xen/keyhandler.h>
     9.7 -#include <asm/shadow2.h>
     9.8 +#include <asm/shadow.h>
     9.9  
    9.10  static int vmcs_size;
    9.11  static int vmcs_order;
    9.12 @@ -272,7 +272,7 @@ static void vmx_do_launch(struct vcpu *v
    9.13      error |= __vmwrite(GUEST_TR_BASE, 0);
    9.14      error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
    9.15  
    9.16 -    shadow2_update_paging_modes(v);
    9.17 +    shadow_update_paging_modes(v);
    9.18      printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
    9.19             __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
    9.20      __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
    10.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Sun Aug 27 06:56:01 2006 +0100
    10.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Mon Aug 28 12:09:36 2006 +0100
    10.3 @@ -40,7 +40,7 @@
    10.4  #include <asm/hvm/vmx/vmx.h>
    10.5  #include <asm/hvm/vmx/vmcs.h>
    10.6  #include <asm/hvm/vmx/cpu.h>
    10.7 -#include <asm/shadow2.h>
    10.8 +#include <asm/shadow.h>
    10.9  #include <public/sched.h>
   10.10  #include <public/hvm/ioreq.h>
   10.11  #include <asm/hvm/vpic.h>
   10.12 @@ -66,10 +66,10 @@ static int vmx_initialize_guest_resource
   10.13      if ( v->vcpu_id != 0 )
   10.14          return 1;
   10.15  
   10.16 -    if ( !shadow2_mode_external(d) )
   10.17 +    if ( !shadow_mode_external(d) )
   10.18      {
   10.19          DPRINTK("Can't init HVM for dom %u vcpu %u: "
   10.20 -                "not in shadow2 external mode\n", 
   10.21 +                "not in shadow external mode\n", 
   10.22                  d->domain_id, v->vcpu_id);
   10.23          domain_crash(d);
   10.24      }
   10.25 @@ -865,7 +865,7 @@ static int vmx_do_page_fault(unsigned lo
   10.26      }
   10.27  #endif
   10.28  
   10.29 -    result = shadow2_fault(va, regs);
   10.30 +    result = shadow_fault(va, regs);
   10.31  
   10.32      TRACE_VMEXIT (2,result);
   10.33  #if 0
   10.34 @@ -1039,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
   10.35       * We do the safest things first, then try to update the shadow
   10.36       * copying from guest
   10.37       */
   10.38 -    shadow2_invlpg(v, va);
   10.39 +    shadow_invlpg(v, va);
   10.40  }
   10.41  
   10.42  
   10.43 @@ -1301,7 +1301,7 @@ vmx_world_restore(struct vcpu *v, struct
   10.44  
   10.45   skip_cr3:
   10.46  
   10.47 -    shadow2_update_paging_modes(v);
   10.48 +    shadow_update_paging_modes(v);
   10.49      if (!vmx_paging_enabled(v))
   10.50          HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
   10.51      else
   10.52 @@ -1504,7 +1504,7 @@ static int vmx_set_cr0(unsigned long val
   10.53          v->arch.guest_table = pagetable_from_pfn(mfn);
   10.54          if (old_base_mfn)
   10.55              put_page(mfn_to_page(old_base_mfn));
   10.56 -        shadow2_update_paging_modes(v);
   10.57 +        shadow_update_paging_modes(v);
   10.58  
   10.59          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
   10.60                      (unsigned long) (mfn << PAGE_SHIFT));
   10.61 @@ -1577,7 +1577,7 @@ static int vmx_set_cr0(unsigned long val
   10.62      else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
   10.63      {
   10.64          __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
   10.65 -        shadow2_update_paging_modes(v);
   10.66 +        shadow_update_paging_modes(v);
   10.67      }
   10.68  
   10.69      return 1;
   10.70 @@ -1662,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
   10.71              mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
   10.72              if (mfn != pagetable_get_pfn(v->arch.guest_table))
   10.73                  __hvm_bug(regs);
   10.74 -            shadow2_update_cr3(v);
   10.75 +            shadow_update_cr3(v);
   10.76          } else {
   10.77              /*
   10.78               * If different, make a shadow. Check if the PDBR is valid
   10.79 @@ -1755,7 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
   10.80           * all TLB entries except global entries.
   10.81           */
   10.82          if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
   10.83 -            shadow2_update_paging_modes(v);
   10.84 +            shadow_update_paging_modes(v);
   10.85          break;
   10.86      }
   10.87      default:
    11.1 --- a/xen/arch/x86/mm.c	Sun Aug 27 06:56:01 2006 +0100
    11.2 +++ b/xen/arch/x86/mm.c	Mon Aug 28 12:09:36 2006 +0100
    11.3 @@ -454,12 +454,12 @@ int map_ldt_shadow_page(unsigned int off
    11.4  
    11.5      res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
    11.6  
    11.7 -    if ( !res && unlikely(shadow2_mode_refcounts(d)) )
    11.8 +    if ( !res && unlikely(shadow_mode_refcounts(d)) )
    11.9      {
   11.10 -        shadow2_lock(d);
   11.11 -        shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
   11.12 +        shadow_lock(d);
   11.13 +        shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
   11.14          res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
   11.15 -        shadow2_unlock(d);
   11.16 +        shadow_unlock(d);
   11.17      }
   11.18  
   11.19      if ( unlikely(!res) )
   11.20 @@ -527,7 +527,7 @@ get_linear_pagetable(
   11.21      struct page_info *page;
   11.22      unsigned long pfn;
   11.23  
   11.24 -    ASSERT( !shadow2_mode_refcounts(d) );
   11.25 +    ASSERT( !shadow_mode_refcounts(d) );
   11.26  
   11.27      if ( (root_get_flags(re) & _PAGE_RW) )
   11.28      {
   11.29 @@ -602,12 +602,12 @@ get_page_from_l1e(
   11.30          d = dom_io;
   11.31      }
   11.32  
   11.33 -    /* Foreign mappings into guests in shadow2 external mode don't
   11.34 +    /* Foreign mappings into guests in shadow external mode don't
   11.35       * contribute to writeable mapping refcounts.  (This allows the
   11.36       * qemu-dm helper process in dom0 to map the domain's memory without
   11.37       * messing up the count of "real" writable mappings.) */
   11.38      okay = (((l1e_get_flags(l1e) & _PAGE_RW) && 
   11.39 -             !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
   11.40 +             !(unlikely(shadow_mode_external(d) && (d != current->domain))))
   11.41              ? get_page_and_type(page, d, PGT_writable_page)
   11.42              : get_page(page, d));
   11.43      if ( !okay )
   11.44 @@ -771,9 +771,9 @@ void put_page_from_l1e(l1_pgentry_t l1e,
   11.45      }
   11.46  
   11.47      /* Remember we didn't take a type-count of foreign writable mappings
   11.48 -     * to shadow2 external domains */
   11.49 +     * to shadow external domains */
   11.50      if ( (l1e_get_flags(l1e) & _PAGE_RW) && 
   11.51 -         !(unlikely((e != d) && shadow2_mode_external(e))) )
   11.52 +         !(unlikely((e != d) && shadow_mode_external(e))) )
   11.53      {
   11.54          put_page_and_type(page);
   11.55      }
   11.56 @@ -830,7 +830,7 @@ static int alloc_l1_table(struct page_in
   11.57      l1_pgentry_t  *pl1e;
   11.58      int            i;
   11.59  
   11.60 -    ASSERT(!shadow2_mode_refcounts(d));
   11.61 +    ASSERT(!shadow_mode_refcounts(d));
   11.62  
   11.63      pl1e = map_domain_page(pfn);
   11.64  
   11.65 @@ -883,7 +883,7 @@ static int create_pae_xen_mappings(l3_pg
   11.66       *     a. alloc_l3_table() calls this function and this check will fail
   11.67       *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
   11.68       *
   11.69 -     * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
   11.70 +     * XXX -- this needs revisiting for shadow_mode_refcount()==true...
   11.71       */
   11.72      page = l3e_get_page(l3e3);
   11.73      BUG_ON(page->u.inuse.type_info & PGT_pinned);
   11.74 @@ -1007,7 +1007,7 @@ static int alloc_l2_table(struct page_in
   11.75      l2_pgentry_t  *pl2e;
   11.76      int            i;
   11.77  
   11.78 -    ASSERT(!shadow2_mode_refcounts(d));
   11.79 +    ASSERT(!shadow_mode_refcounts(d));
   11.80      
   11.81      pl2e = map_domain_page(pfn);
   11.82  
   11.83 @@ -1059,7 +1059,7 @@ static int alloc_l3_table(struct page_in
   11.84      l3_pgentry_t  *pl3e;
   11.85      int            i;
   11.86  
   11.87 -    ASSERT(!shadow2_mode_refcounts(d));
   11.88 +    ASSERT(!shadow_mode_refcounts(d));
   11.89  
   11.90  #ifdef CONFIG_X86_PAE
   11.91      /*
   11.92 @@ -1120,7 +1120,7 @@ static int alloc_l4_table(struct page_in
   11.93      unsigned long vaddr;
   11.94      int            i;
   11.95  
   11.96 -    ASSERT(!shadow2_mode_refcounts(d));
   11.97 +    ASSERT(!shadow_mode_refcounts(d));
   11.98  
   11.99      for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
  11.100      {
  11.101 @@ -1234,8 +1234,8 @@ static inline int update_l1e(l1_pgentry_
  11.102                               struct vcpu *v)
  11.103  {
  11.104      int rv = 1;
  11.105 -    if ( unlikely(shadow2_mode_enabled(v->domain)) )
  11.106 -        shadow2_lock(v->domain);
  11.107 +    if ( unlikely(shadow_mode_enabled(v->domain)) )
  11.108 +        shadow_lock(v->domain);
  11.109  #ifndef PTE_UPDATE_WITH_CMPXCHG
  11.110      rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
  11.111  #else
  11.112 @@ -1266,10 +1266,10 @@ static inline int update_l1e(l1_pgentry_
  11.113          }
  11.114      }
  11.115  #endif
  11.116 -    if ( unlikely(shadow2_mode_enabled(v->domain)) )
  11.117 +    if ( unlikely(shadow_mode_enabled(v->domain)) )
  11.118      {
  11.119 -        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
  11.120 -        shadow2_unlock(v->domain);    
  11.121 +        shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
  11.122 +        shadow_unlock(v->domain);    
  11.123      }
  11.124      return rv;
  11.125  }
  11.126 @@ -1339,13 +1339,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
  11.127  #endif
  11.128  #define UPDATE_ENTRY(_t,_p,_o,_n,_m)  ({                            \
  11.129      int rv;                                                         \
  11.130 -    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
  11.131 -        shadow2_lock(current->domain);                              \
  11.132 +    if ( unlikely(shadow_mode_enabled(current->domain)) )          \
  11.133 +        shadow_lock(current->domain);                              \
  11.134      rv = _UPDATE_ENTRY(_t, _p, _o, _n);                             \
  11.135 -    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
  11.136 +    if ( unlikely(shadow_mode_enabled(current->domain)) )          \
  11.137      {                                                               \
  11.138 -        shadow2_validate_guest_entry(current, _mfn(_m), (_p));      \
  11.139 -        shadow2_unlock(current->domain);                            \
  11.140 +        shadow_validate_guest_entry(current, _mfn(_m), (_p));      \
  11.141 +        shadow_unlock(current->domain);                            \
  11.142      }                                                               \
  11.143      rv;                                                             \
  11.144  })
  11.145 @@ -1581,21 +1581,21 @@ void free_page_type(struct page_info *pa
  11.146           */
  11.147          this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
  11.148  
  11.149 -        if ( unlikely(shadow2_mode_enabled(owner)
  11.150 -                 && !shadow2_lock_is_acquired(owner)) )
  11.151 +        if ( unlikely(shadow_mode_enabled(owner)
  11.152 +                 && !shadow_lock_is_acquired(owner)) )
  11.153          {
  11.154              /* Raw page tables are rewritten during save/restore. */
  11.155 -            if ( !shadow2_mode_translate(owner) )
  11.156 +            if ( !shadow_mode_translate(owner) )
  11.157                  mark_dirty(owner, page_to_mfn(page));
  11.158  
  11.159 -            if ( shadow2_mode_refcounts(owner) )
  11.160 +            if ( shadow_mode_refcounts(owner) )
  11.161                  return;
  11.162  
  11.163              gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
  11.164              ASSERT(VALID_M2P(gmfn));
  11.165 -            shadow2_lock(owner);
  11.166 -            shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
  11.167 -            shadow2_unlock(owner);
  11.168 +            shadow_lock(owner);
  11.169 +            shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
  11.170 +            shadow_unlock(owner);
  11.171          }
  11.172      }
  11.173  
  11.174 @@ -1760,7 +1760,7 @@ int get_page_type(struct page_info *page
  11.175  #endif
  11.176                      /* Fixme: add code to propagate va_unknown to subtables. */
  11.177                      if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
  11.178 -                         !shadow2_mode_refcounts(page_get_owner(page)) )
  11.179 +                         !shadow_mode_refcounts(page_get_owner(page)) )
  11.180                          return 0;
  11.181                      /* This table is possibly mapped at multiple locations. */
  11.182                      nx &= ~PGT_va_mask;
  11.183 @@ -1810,7 +1810,7 @@ int new_guest_cr3(unsigned long mfn)
  11.184      if ( hvm_guest(v) && !hvm_paging_enabled(v) )
  11.185          domain_crash_synchronous();
  11.186  
  11.187 -    if ( shadow2_mode_refcounts(d) )
  11.188 +    if ( shadow_mode_refcounts(d) )
  11.189      {
  11.190          okay = get_page_from_pagenr(mfn, d);
  11.191          if ( unlikely(!okay) )
  11.192 @@ -1858,7 +1858,7 @@ int new_guest_cr3(unsigned long mfn)
  11.193  
  11.194      if ( likely(old_base_mfn != 0) )
  11.195      {
  11.196 -        if ( shadow2_mode_refcounts(d) )
  11.197 +        if ( shadow_mode_refcounts(d) )
  11.198              put_page(mfn_to_page(old_base_mfn));
  11.199          else
  11.200              put_page_and_type(mfn_to_page(old_base_mfn));
  11.201 @@ -2043,7 +2043,7 @@ int do_mmuext_op(
  11.202              type = PGT_root_page_table;
  11.203  
  11.204          pin_page:
  11.205 -            if ( shadow2_mode_refcounts(FOREIGNDOM) )
  11.206 +            if ( shadow_mode_refcounts(FOREIGNDOM) )
  11.207                  break;
  11.208  
  11.209              okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
  11.210 @@ -2065,7 +2065,7 @@ int do_mmuext_op(
  11.211              break;
  11.212  
  11.213          case MMUEXT_UNPIN_TABLE:
  11.214 -            if ( shadow2_mode_refcounts(d) )
  11.215 +            if ( shadow_mode_refcounts(d) )
  11.216                  break;
  11.217  
  11.218              if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
  11.219 @@ -2078,11 +2078,11 @@ int do_mmuext_op(
  11.220              {
  11.221                  put_page_and_type(page);
  11.222                  put_page(page);
  11.223 -                if ( shadow2_mode_enabled(d) )
  11.224 +                if ( shadow_mode_enabled(d) )
  11.225                  {
  11.226 -                    shadow2_lock(d);
  11.227 -                    shadow2_remove_all_shadows(v, _mfn(mfn));
  11.228 -                    shadow2_unlock(d);
  11.229 +                    shadow_lock(d);
  11.230 +                    shadow_remove_all_shadows(v, _mfn(mfn));
  11.231 +                    shadow_unlock(d);
  11.232                  }
  11.233              }
  11.234              else
  11.235 @@ -2125,8 +2125,8 @@ int do_mmuext_op(
  11.236              break;
  11.237      
  11.238          case MMUEXT_INVLPG_LOCAL:
  11.239 -            if ( !shadow2_mode_enabled(d) 
  11.240 -                 || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
  11.241 +            if ( !shadow_mode_enabled(d) 
  11.242 +                 || shadow_invlpg(v, op.arg1.linear_addr) != 0 )
  11.243                  local_flush_tlb_one(op.arg1.linear_addr);
  11.244              break;
  11.245  
  11.246 @@ -2173,7 +2173,7 @@ int do_mmuext_op(
  11.247              unsigned long ptr  = op.arg1.linear_addr;
  11.248              unsigned long ents = op.arg2.nr_ents;
  11.249  
  11.250 -            if ( shadow2_mode_external(d) )
  11.251 +            if ( shadow_mode_external(d) )
  11.252              {
  11.253                  MEM_LOG("ignoring SET_LDT hypercall from external "
  11.254                          "domain %u", d->domain_id);
  11.255 @@ -2319,7 +2319,7 @@ int do_mmu_update(
  11.256              case PGT_l3_page_table:
  11.257              case PGT_l4_page_table:
  11.258              {
  11.259 -                if ( shadow2_mode_refcounts(d) )
  11.260 +                if ( shadow_mode_refcounts(d) )
  11.261                  {
  11.262                      DPRINTK("mmu update on shadow-refcounted domain!");
  11.263                      break;
  11.264 @@ -2372,16 +2372,16 @@ int do_mmu_update(
  11.265                  if ( unlikely(!get_page_type(page, PGT_writable_page)) )
  11.266                      break;
  11.267  
  11.268 -                if ( unlikely(shadow2_mode_enabled(d)) )
  11.269 -                    shadow2_lock(d);
  11.270 +                if ( unlikely(shadow_mode_enabled(d)) )
  11.271 +                    shadow_lock(d);
  11.272  
  11.273                  *(intpte_t *)va = req.val;
  11.274                  okay = 1;
  11.275  
  11.276 -                if ( unlikely(shadow2_mode_enabled(d)) )
  11.277 +                if ( unlikely(shadow_mode_enabled(d)) )
  11.278                  {
  11.279 -                    shadow2_validate_guest_entry(v, _mfn(mfn), va);
  11.280 -                    shadow2_unlock(d);
  11.281 +                    shadow_validate_guest_entry(v, _mfn(mfn), va);
  11.282 +                    shadow_unlock(d);
  11.283                  }
  11.284  
  11.285                  put_page_type(page);
  11.286 @@ -2405,8 +2405,8 @@ int do_mmu_update(
  11.287                  break;
  11.288              }
  11.289  
  11.290 -            if ( shadow2_mode_translate(FOREIGNDOM) )
  11.291 -                shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
  11.292 +            if ( shadow_mode_translate(FOREIGNDOM) )
  11.293 +                shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
  11.294              else 
  11.295                  set_gpfn_from_mfn(mfn, gpfn);
  11.296              okay = 1;
  11.297 @@ -2492,7 +2492,7 @@ static int create_grant_pte_mapping(
  11.298          goto failed;
  11.299      } 
  11.300  
  11.301 -    if ( !shadow2_mode_refcounts(d) )
  11.302 +    if ( !shadow_mode_refcounts(d) )
  11.303          put_page_from_l1e(ol1e, d);
  11.304  
  11.305      put_page_type(page);
  11.306 @@ -2590,7 +2590,7 @@ static int create_grant_va_mapping(
  11.307                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
  11.308          return GNTST_general_error;
  11.309  
  11.310 -    if ( !shadow2_mode_refcounts(d) )
  11.311 +    if ( !shadow_mode_refcounts(d) )
  11.312          put_page_from_l1e(ol1e, d);
  11.313  
  11.314      return GNTST_okay;
  11.315 @@ -2714,10 +2714,10 @@ int do_update_va_mapping(unsigned long v
  11.316  
  11.317      perfc_incrc(calls_to_update_va);
  11.318  
  11.319 -    if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
  11.320 +    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
  11.321          return -EINVAL;
  11.322  
  11.323 -    if ( unlikely(shadow2_mode_refcounts(d)) )
  11.324 +    if ( unlikely(shadow_mode_refcounts(d)) )
  11.325      {
  11.326          DPRINTK("Grant op on a shadow-refcounted domain\n");
  11.327          return -EINVAL; 
  11.328 @@ -2725,11 +2725,11 @@ int do_update_va_mapping(unsigned long v
  11.329  
  11.330      LOCK_BIGLOCK(d);
  11.331  
  11.332 -    if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
  11.333 +    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
  11.334      {
  11.335          if ( unlikely(this_cpu(percpu_mm_info).foreign &&
  11.336 -                      (shadow2_mode_translate(d) ||
  11.337 -                       shadow2_mode_translate(
  11.338 +                      (shadow_mode_translate(d) ||
  11.339 +                       shadow_mode_translate(
  11.340                             this_cpu(percpu_mm_info).foreign))) )
  11.341          {
  11.342              /*
  11.343 @@ -2770,8 +2770,8 @@ int do_update_va_mapping(unsigned long v
  11.344          switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
  11.345          {
  11.346          case UVMF_LOCAL:
  11.347 -            if ( !shadow2_mode_enabled(d) 
  11.348 -                 || (shadow2_invlpg(current, va) != 0) ) 
  11.349 +            if ( !shadow_mode_enabled(d) 
  11.350 +                 || (shadow_invlpg(current, va) != 0) ) 
  11.351                  local_flush_tlb_one(va);
  11.352              break;
  11.353          case UVMF_ALL:
  11.354 @@ -3006,7 +3006,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
  11.355              break;
  11.356          }
  11.357  
  11.358 -        if ( !shadow2_mode_translate(d) || (mfn == 0) )
  11.359 +        if ( !shadow_mode_translate(d) || (mfn == 0) )
  11.360          {
  11.361              put_domain(d);
  11.362              return -EINVAL;
  11.363 @@ -3196,21 +3196,21 @@ static int ptwr_emulated_update(
  11.364      pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
  11.365      if ( do_cmpxchg )
  11.366      {
  11.367 -        if ( shadow2_mode_enabled(d) )
  11.368 -            shadow2_lock(d);
  11.369 +        if ( shadow_mode_enabled(d) )
  11.370 +            shadow_lock(d);
  11.371          ol1e = l1e_from_intpte(old);
  11.372          if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
  11.373          {
  11.374 -            if ( shadow2_mode_enabled(d) )
  11.375 -                shadow2_unlock(d);
  11.376 +            if ( shadow_mode_enabled(d) )
  11.377 +                shadow_unlock(d);
  11.378              unmap_domain_page(pl1e);
  11.379              put_page_from_l1e(nl1e, d);
  11.380              return X86EMUL_CMPXCHG_FAILED;
  11.381          }
  11.382 -        if ( unlikely(shadow2_mode_enabled(v->domain)) )
  11.383 +        if ( unlikely(shadow_mode_enabled(v->domain)) )
  11.384          {
  11.385 -            shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
  11.386 -            shadow2_unlock(v->domain);    
  11.387 +            shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
  11.388 +            shadow_unlock(v->domain);    
  11.389          }
  11.390      }
  11.391      else
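
The mm.c hunks above all follow the same shape: guest page-table writes are bracketed by the (renamed) shadow lock, and the shadow code is asked to revalidate the written entry before the lock is dropped. A minimal, self-contained C sketch of that call sequence follows; the struct layouts and function bodies are stand-ins invented for illustration, and only the order of calls mirrors the patched update_l1e(), UPDATE_ENTRY() and do_mmu_update() paths.

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t intpte_t;                        /* stand-in for Xen's intpte_t */
    typedef struct { unsigned long m; } mfn_t;        /* stand-in for Xen's mfn_t    */
    struct domain { bool shadow_enabled; };           /* stand-in, not the real one  */
    struct vcpu   { struct domain *domain; };         /* stand-in, not the real one  */

    /* Stand-ins for the shadow interfaces named in the hunks above. */
    static bool shadow_mode_enabled(struct domain *d) { return d->shadow_enabled; }
    static void shadow_lock(struct domain *d)   { (void)d; /* take the shadow lock */ }
    static void shadow_unlock(struct domain *d) { (void)d; /* drop the shadow lock */ }
    static void shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
    { (void)v; (void)gmfn; (void)entry; /* resync any shadows of this page-table page */ }

    /* The common sequence: lock, perform the guest-visible write, revalidate, unlock. */
    static void guest_pte_write(struct vcpu *v, mfn_t gmfn, intpte_t *pte, intpte_t val)
    {
        if ( shadow_mode_enabled(v->domain) )
            shadow_lock(v->domain);

        *pte = val;                                   /* the page-table write itself */

        if ( shadow_mode_enabled(v->domain) )
        {
            shadow_validate_guest_entry(v, gmfn, pte); /* propagate into the shadows */
            shadow_unlock(v->domain);
        }
    }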
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/xen/arch/x86/mm/Makefile	Mon Aug 28 12:09:36 2006 +0100
    12.3 @@ -0,0 +1,1 @@
    12.4 +subdir-y += shadow
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/xen/arch/x86/mm/shadow/Makefile	Mon Aug 28 12:09:36 2006 +0100
    13.3 @@ -0,0 +1,15 @@
    13.4 +ifneq ($(pae),n)
    13.5 +obj-$(x86_32) += common.o g2_on_s3.o g3_on_s3.o
    13.6 +else
    13.7 +obj-$(x86_32) += common.o g2_on_s2.o
    13.8 +endif
    13.9 +
   13.10 +obj-$(x86_64) += common.o g4_on_s4.o g3_on_s3.o g2_on_s3.o
   13.11 +
   13.12 +guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(1)))))
   13.13 +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(1)))))
   13.14 +shadow_defns  = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
   13.15 +                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
   13.16 +
   13.17 +g%.o: multi.c $(HDRS) Makefile
   13.18 +	$(CC) $(CFLAGS) $(call shadow_defns,$(@F)) -c $< -o $@
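
The new Makefile keeps the old multi-compilation scheme but drops the shadow2_ prefix: multi.c is compiled once per (guest paging levels, shadow paging levels) pair, with the two -D values derived from the object file's own name. For example, when building g2_on_s3.o, guest_levels expands to 2 and shadow_levels to 3, so the pattern rule runs roughly:

    $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=2 -DSHADOW_PAGING_LEVELS=3 -c multi.c -o g2_on_s3.o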
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/xen/arch/x86/mm/shadow/common.c	Mon Aug 28 12:09:36 2006 +0100
    14.3 @@ -0,0 +1,3407 @@
    14.4 +/******************************************************************************
    14.5 + * arch/x86/mm/shadow/common.c
    14.6 + *
    14.7 + * Shadow code that does not need to be multiply compiled.
    14.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    14.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   14.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   14.11 + * 
   14.12 + * This program is free software; you can redistribute it and/or modify
   14.13 + * it under the terms of the GNU General Public License as published by
   14.14 + * the Free Software Foundation; either version 2 of the License, or
   14.15 + * (at your option) any later version.
   14.16 + *
   14.17 + * This program is distributed in the hope that it will be useful,
   14.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   14.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14.20 + * GNU General Public License for more details.
   14.21 + *
   14.22 + * You should have received a copy of the GNU General Public License
   14.23 + * along with this program; if not, write to the Free Software
   14.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   14.25 + */
   14.26 +
   14.27 +#define SHADOW 1
   14.28 +
   14.29 +#include <xen/config.h>
   14.30 +#include <xen/types.h>
   14.31 +#include <xen/mm.h>
   14.32 +#include <xen/trace.h>
   14.33 +#include <xen/sched.h>
   14.34 +#include <xen/perfc.h>
   14.35 +#include <xen/irq.h>
   14.36 +#include <xen/domain_page.h>
   14.37 +#include <xen/guest_access.h>
   14.38 +#include <xen/keyhandler.h>
   14.39 +#include <asm/event.h>
   14.40 +#include <asm/page.h>
   14.41 +#include <asm/current.h>
   14.42 +#include <asm/flushtlb.h>
   14.43 +#include <asm/shadow.h>
   14.44 +#include "private.h"
   14.45 +
   14.46 +#if SHADOW_AUDIT
   14.47 +int shadow_audit_enable = 0;
   14.48 +
   14.49 +static void shadow_audit_key(unsigned char key)
   14.50 +{
   14.51 +    shadow_audit_enable = !shadow_audit_enable;
   14.52 +    printk("%s shadow_audit_enable=%d\n",
   14.53 +           __func__, shadow_audit_enable);
   14.54 +}
   14.55 +
   14.56 +static int __init shadow_audit_key_init(void)
   14.57 +{
   14.58 +    register_keyhandler(
   14.59 +        'O', shadow_audit_key,  "toggle shadow audits");
   14.60 +    return 0;
   14.61 +}
   14.62 +__initcall(shadow_audit_key_init);
   14.63 +#endif /* SHADOW_AUDIT */
   14.64 +
   14.65 +static void sh_free_log_dirty_bitmap(struct domain *d);
   14.66 +
   14.67 +int _shadow_mode_refcounts(struct domain *d)
   14.68 +{
   14.69 +    return shadow_mode_refcounts(d);
   14.70 +}
   14.71 +
   14.72 +
   14.73 +/**************************************************************************/
   14.74 +/* x86 emulator support for the shadow code
   14.75 + */
   14.76 +
   14.77 +static int
   14.78 +sh_x86_emulate_read_std(unsigned long addr,
   14.79 +                         unsigned long *val,
   14.80 +                         unsigned int bytes,
   14.81 +                         struct x86_emulate_ctxt *ctxt)
   14.82 +{
   14.83 +    struct vcpu *v = current;
   14.84 +    if ( hvm_guest(v) )
   14.85 +    {
   14.86 +        *val = 0;
   14.87 +        // XXX -- this is WRONG.
   14.88 +        //        It entirely ignores the permissions in the page tables.
   14.89 +        //        In this case, that is only a user vs supervisor access check.
   14.90 +        //
   14.91 +        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
   14.92 +        {
   14.93 +#if 0
   14.94 +            SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
   14.95 +                           v->domain->domain_id, v->vcpu_id, 
   14.96 +                           addr, *val, bytes);
   14.97 +#endif
   14.98 +            return X86EMUL_CONTINUE;
   14.99 +        }
  14.100 +
  14.101 +        /* If we got here, there was nothing mapped here, or a bad GFN 
  14.102 +         * was mapped here.  This should never happen: we're here because
  14.103 +         * of a write fault at the end of the instruction we're emulating. */ 
  14.104 +        SHADOW_PRINTK("read failed to va %#lx\n", addr);
  14.105 +        return X86EMUL_PROPAGATE_FAULT;
  14.106 +    }
  14.107 +    else 
  14.108 +    {
  14.109 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  14.110 +        return X86EMUL_UNHANDLEABLE;
  14.111 +    }
  14.112 +}
  14.113 +
  14.114 +static int
  14.115 +sh_x86_emulate_write_std(unsigned long addr,
  14.116 +                          unsigned long val,
  14.117 +                          unsigned int bytes,
  14.118 +                          struct x86_emulate_ctxt *ctxt)
  14.119 +{
  14.120 +    struct vcpu *v = current;
  14.121 +#if 0
  14.122 +    SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
  14.123 +                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
  14.124 +#endif
  14.125 +    if ( hvm_guest(v) )
  14.126 +    {
  14.127 +        // XXX -- this is WRONG.
  14.128 +        //        It entirely ignores the permissions in the page tables.
  14.129 +        //        In this case, that includes user vs supervisor, and
  14.130 +        //        write access.
  14.131 +        //
  14.132 +        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
  14.133 +            return X86EMUL_CONTINUE;
  14.134 +
  14.135 +        /* If we got here, there was nothing mapped here, or a bad GFN 
  14.136 +         * was mapped here.  This should never happen: we're here because
  14.137 +         * of a write fault at the end of the instruction we're emulating,
  14.138 +         * which should be handled by sh_x86_emulate_write_emulated. */ 
  14.139 +        SHADOW_PRINTK("write failed to va %#lx\n", addr);
  14.140 +        return X86EMUL_PROPAGATE_FAULT;
  14.141 +    }
  14.142 +    else 
  14.143 +    {
  14.144 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  14.145 +        return X86EMUL_UNHANDLEABLE;
  14.146 +    }
  14.147 +}
  14.148 +
  14.149 +static int
  14.150 +sh_x86_emulate_write_emulated(unsigned long addr,
  14.151 +                               unsigned long val,
  14.152 +                               unsigned int bytes,
  14.153 +                               struct x86_emulate_ctxt *ctxt)
  14.154 +{
  14.155 +    struct vcpu *v = current;
  14.156 +#if 0
  14.157 +    SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
  14.158 +                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
  14.159 +#endif
  14.160 +    if ( hvm_guest(v) )
  14.161 +    {
  14.162 +        return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
  14.163 +    }
  14.164 +    else 
  14.165 +    {
  14.166 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  14.167 +        return X86EMUL_UNHANDLEABLE;
  14.168 +    }
  14.169 +}
  14.170 +
  14.171 +static int 
  14.172 +sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
  14.173 +                                 unsigned long old,
  14.174 +                                 unsigned long new,
  14.175 +                                 unsigned int bytes,
  14.176 +                                 struct x86_emulate_ctxt *ctxt)
  14.177 +{
  14.178 +    struct vcpu *v = current;
  14.179 +#if 0
  14.180 +    SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
  14.181 +                   v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
  14.182 +#endif
  14.183 +    if ( hvm_guest(v) )
  14.184 +    {
  14.185 +        return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, 
  14.186 +                                                    bytes, ctxt);
  14.187 +    }
  14.188 +    else 
  14.189 +    {
  14.190 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  14.191 +        return X86EMUL_UNHANDLEABLE;
  14.192 +    }
  14.193 +}
  14.194 +
  14.195 +static int 
  14.196 +sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
  14.197 +                                   unsigned long old_lo,
  14.198 +                                   unsigned long old_hi,
  14.199 +                                   unsigned long new_lo,
  14.200 +                                   unsigned long new_hi,
  14.201 +                                   struct x86_emulate_ctxt *ctxt)
  14.202 +{
  14.203 +    struct vcpu *v = current;
  14.204 +#if 0
  14.205 +    SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
  14.206 +                   v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
  14.207 +                   new_hi, new_lo, ctxt);
  14.208 +#endif
  14.209 +    if ( hvm_guest(v) )
  14.210 +    {
  14.211 +        return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
  14.212 +                                                      new_lo, new_hi, ctxt);
  14.213 +    }
  14.214 +    else 
  14.215 +    {
  14.216 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  14.217 +        return X86EMUL_UNHANDLEABLE;
  14.218 +    }
  14.219 +}
  14.220 +
  14.221 +
  14.222 +struct x86_emulate_ops shadow_emulator_ops = {
  14.223 +    .read_std           = sh_x86_emulate_read_std,
  14.224 +    .write_std          = sh_x86_emulate_write_std,
  14.225 +    .read_emulated      = sh_x86_emulate_read_std,
  14.226 +    .write_emulated     = sh_x86_emulate_write_emulated,
  14.227 +    .cmpxchg_emulated   = sh_x86_emulate_cmpxchg_emulated,
  14.228 +    .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
  14.229 +};
  14.230 +
  14.231 +
  14.232 +/**************************************************************************/
  14.233 +/* Code for "promoting" a guest page to the point where the shadow code is
  14.234 + * willing to let it be treated as a guest page table.  This generally
  14.235 + * involves making sure there are no writable mappings available to the guest
  14.236 + * for this page.
  14.237 + */
  14.238 +void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type)
  14.239 +{
  14.240 +    struct page_info *page = mfn_to_page(gmfn);
  14.241 +    unsigned long type_info;
  14.242 +
  14.243 +    ASSERT(valid_mfn(gmfn));
  14.244 +
  14.245 +    /* We should never try to promote a gmfn that has writeable mappings */
  14.246 +    ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
  14.247 +
  14.248 +    // Is the page already shadowed?
  14.249 +    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
  14.250 +    {
  14.251 +        // No prior shadow exists...
  14.252 +
  14.253 +        // Grab a type-ref.  We don't really care if we are racing with another
  14.254 +        // vcpu or not, or even what kind of type we get; we just want the type
  14.255 +        // count to be > 0.
  14.256 +        //
  14.257 +        do {
  14.258 +            type_info =
  14.259 +                page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
  14.260 +        } while ( !get_page_type(page, type_info) );
  14.261 +
  14.262 +        // Now that the type ref is non-zero, we can safely use the
  14.263 +        // shadow_flags.
  14.264 +        //
  14.265 +        page->shadow_flags = 0;
  14.266 +    }
  14.267 +
  14.268 +    ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
  14.269 +    set_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
  14.270 +}
  14.271 +
  14.272 +void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
  14.273 +{
  14.274 +    struct page_info *page = mfn_to_page(gmfn);
  14.275 +
  14.276 +    ASSERT(test_bit(_PGC_page_table, &page->count_info));
  14.277 +    ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
  14.278 +
  14.279 +    clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
  14.280 +
  14.281 +    if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
  14.282 +    {
  14.283 +        // release the extra type ref
  14.284 +        put_page_type(page);
  14.285 +
  14.286 +        // clear the is-a-page-table bit.
  14.287 +        clear_bit(_PGC_page_table, &page->count_info);
  14.288 +    }
  14.289 +}
  14.290 +
  14.291 +/**************************************************************************/
  14.292 +/* Validate a pagetable change from the guest and update the shadows.
  14.293 + * Returns a bitmask of SHADOW_SET_* flags. */
  14.294 +
  14.295 +static int
  14.296 +__shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
  14.297 +                               void *entry, u32 size)
  14.298 +{
  14.299 +    int result = 0;
  14.300 +    struct page_info *page = mfn_to_page(gmfn);
  14.301 +
  14.302 +    sh_mark_dirty(v->domain, gmfn);
  14.303 +    
  14.304 +    // Determine which types of shadows are affected, and update each.
  14.305 +    //
  14.306 +    // Always validate L1s before L2s to prevent another cpu with a linear
  14.307 +    // mapping of this gmfn from seeing a walk that results from 
  14.308 +    // using the new L2 value and the old L1 value.  (It is OK for such a
  14.309 +    // guest to see a walk that uses the old L2 value with the new L1 value,
  14.310 +    // as hardware could behave this way if one level of the pagewalk occurs
  14.311 +    // before the store, and the next level of the pagewalk occurs after the
  14.312 +    // store.
  14.313 +    //
  14.314 +    // Ditto for L2s before L3s, etc.
  14.315 +    //
  14.316 +
  14.317 +    if ( !(page->count_info & PGC_page_table) )
  14.318 +        return 0;  /* Not shadowed at all */
  14.319 +
  14.320 +#if CONFIG_PAGING_LEVELS == 2
  14.321 +    if ( page->shadow_flags & SHF_L1_32 ) 
  14.322 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
  14.323 +            (v, gmfn, entry, size);
  14.324 +#else 
  14.325 +    if ( page->shadow_flags & SHF_L1_32 ) 
  14.326 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
  14.327 +            (v, gmfn, entry, size);
  14.328 +#endif
  14.329 +
  14.330 +#if CONFIG_PAGING_LEVELS == 2
  14.331 +    if ( page->shadow_flags & SHF_L2_32 ) 
  14.332 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
  14.333 +            (v, gmfn, entry, size);
  14.334 +#else 
  14.335 +    if ( page->shadow_flags & SHF_L2_32 ) 
  14.336 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
  14.337 +            (v, gmfn, entry, size);
  14.338 +#endif
  14.339 +
  14.340 +#if CONFIG_PAGING_LEVELS >= 3 
  14.341 +    if ( page->shadow_flags & SHF_L1_PAE ) 
  14.342 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
  14.343 +            (v, gmfn, entry, size);
  14.344 +    if ( page->shadow_flags & SHF_L2_PAE ) 
  14.345 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
  14.346 +            (v, gmfn, entry, size);
  14.347 +    if ( page->shadow_flags & SHF_L2H_PAE ) 
  14.348 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
  14.349 +            (v, gmfn, entry, size);
  14.350 +    if ( page->shadow_flags & SHF_L3_PAE ) 
  14.351 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3)
  14.352 +            (v, gmfn, entry, size);
  14.353 +#else /* 32-bit non-PAE hypervisor does not support PAE guests */
  14.354 +    ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
  14.355 +#endif
  14.356 +
  14.357 +#if CONFIG_PAGING_LEVELS >= 4 
  14.358 +    if ( page->shadow_flags & SHF_L1_64 ) 
  14.359 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
  14.360 +            (v, gmfn, entry, size);
  14.361 +    if ( page->shadow_flags & SHF_L2_64 ) 
  14.362 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
  14.363 +            (v, gmfn, entry, size);
  14.364 +    if ( page->shadow_flags & SHF_L3_64 ) 
  14.365 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
  14.366 +            (v, gmfn, entry, size);
  14.367 +    if ( page->shadow_flags & SHF_L4_64 ) 
  14.368 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
  14.369 +            (v, gmfn, entry, size);
  14.370 +#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
  14.371 +    ASSERT((page->shadow_flags 
  14.372 +            & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
  14.373 +#endif
  14.374 +
  14.375 +    return result;
  14.376 +}
  14.377 +
  14.378 +
  14.379 +int
  14.380 +shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
  14.381 +/* This is the entry point from hypercalls. It returns a bitmask of all the 
  14.382 + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
  14.383 +{
  14.384 +    int rc;
  14.385 +
  14.386 +    ASSERT(shadow_lock_is_acquired(v->domain));
  14.387 +    rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
  14.388 +    shadow_audit_tables(v);
  14.389 +    return rc;
  14.390 +}
  14.391 +
  14.392 +void
  14.393 +shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
  14.394 +                                void *entry, u32 size)
  14.395 +/* This is the entry point for emulated writes to pagetables in HVM guests */
  14.396 +{
  14.397 +    struct domain *d = v->domain;
  14.398 +    int rc;
  14.399 +
  14.400 +    ASSERT(shadow_lock_is_acquired(v->domain));
  14.401 +    rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
  14.402 +    if ( rc & SHADOW_SET_FLUSH )
  14.403 +    {
  14.404 +        // Flush everyone except the local processor, which will flush when it
  14.405 +        // re-enters the HVM guest.
  14.406 +        //
  14.407 +        cpumask_t mask = d->domain_dirty_cpumask;
  14.408 +        cpu_clear(v->processor, mask);
  14.409 +        flush_tlb_mask(mask);
  14.410 +    }
  14.411 +    if ( rc & SHADOW_SET_ERROR ) 
  14.412 +    {
  14.413 +        /* This page is probably not a pagetable any more: tear it out of the 
  14.414 +         * shadows, along with any tables that reference it */
  14.415 +        shadow_remove_all_shadows_and_parents(v, gmfn);
  14.416 +    }
  14.417 +    /* We ignore the other bits: since we are about to change CR3 on
  14.418 +     * VMENTER we don't need to do any extra TLB flushes. */ 
  14.419 +}
  14.420 +
  14.421 +
  14.422 +/**************************************************************************/
  14.423 +/* Memory management for shadow pages. */ 
  14.424 +
  14.425 +/* Meaning of the count_info field in shadow pages
  14.426 + * ----------------------------------------------
  14.427 + * 
  14.428 + * A count of all references to this page from other shadow pages and
  14.429 + * guest CR3s (a.k.a. v->arch.shadow.table).  
  14.430 + *
  14.431 + * The top bits hold the shadow type and the pinned bit.  Top-level
  14.432 + * shadows are pinned so that they don't disappear when not in a CR3
  14.433 + * somewhere.
  14.434 + *
  14.435 + * We don't need to use get|put_page for this as the updates are all
  14.436 + * protected by the shadow lock.  We can't use get|put_page for this
  14.437 + * as the size of the count on shadow pages is different from that on
  14.438 + * normal guest pages.
  14.439 + */
  14.440 +
  14.441 +/* Meaning of the type_info field in shadow pages
  14.442 + * ----------------------------------------------
  14.443 + * 
  14.444 + * type_info use depends on the shadow type (from count_info)
  14.445 + * 
  14.446 + * PGC_SH_none : This page is in the shadow free pool.  type_info holds
  14.447 + *                the chunk order for our freelist allocator.
  14.448 + *
  14.449 + * PGC_SH_l*_shadow : This page is in use as a shadow. type_info 
  14.450 + *                     holds the mfn of the guest page being shadowed,
   14.451 + *                     holds the mfn of the guest page being shadowed.
  14.452 + * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage.
  14.453 + *                        type_info holds the gfn being shattered.
  14.454 + *
  14.455 + * PGC_SH_monitor_table : This page is part of a monitor table.
  14.456 + *                         type_info is not used.
  14.457 + */
  14.458 +
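/* Editor's note -- an illustrative sketch, not part of this changeset: how
 * the two fields described above are typically unpicked elsewhere in this
 * file ('smfn' is a placeholder for some shadow page's mfn):
 *
 *     struct page_info *sp = mfn_to_page(smfn);
 *     u32 type = (sp->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift;
 *     u32 refs = sp->count_info & PGC_SH_count_mask;
 *     unsigned long backpointer = sp->u.inuse.type_info;
 *
 * For an in-use shadow, 'backpointer' is the guest mfn (or the gfn, for
 * fl1 shadows); for a page in the free pool it is the freelist order, via
 * SH_PFN_ORDER() below. */
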
  14.459 +/* Meaning of the _domain field in shadow pages
  14.460 + * --------------------------------------------
  14.461 + *
  14.462 + * In shadow pages, this field will always have its least significant bit
  14.463 + * set.  This ensures that all attempts to get_page() will fail (as all
  14.464 + * valid pickled domain pointers have a zero for their least significant bit).
  14.465 + * Instead, the remaining upper bits are used to record the shadow generation
  14.466 + * counter when the shadow was created.
  14.467 + */
  14.468 +
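/* Editor's note -- illustrative only: one encoding consistent with the
 * description above would be
 *
 *     pg->u.inuse._domain = (shadow_generation << 1) | 1;
 *
 * ('shadow_generation' is a placeholder name.)  Because no valid pickled
 * domain pointer has bit 0 set, get_page()'s owner comparison can never
 * match, so stray references to shadow pages always fail.  The real
 * layout is defined in the shadow headers, not here. */
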
  14.469 +/* Meaning of the shadow_flags field
  14.470 + * ----------------------------------
  14.471 + * 
  14.472 + * In guest pages that are shadowed, one bit for each kind of shadow they have.
  14.473 + * 
  14.474 + * In shadow pages, will be used for holding a representation of the populated
  14.475 + * entries in this shadow (either a min/max, or a bitmap, or ...)
  14.476 + *
  14.477 + * In monitor-table pages, holds the level of the particular page (to save
  14.478 + * spilling the shadow types into an extra bit by having three types of monitor
  14.479 + * page).
  14.480 + */
  14.481 +
  14.482 +/* Meaning of the list_head struct in shadow pages
  14.483 + * -----------------------------------------------
  14.484 + *
  14.485 + * In free shadow pages, this is used to hold the free-lists of chunks.
  14.486 + *
  14.487 + * In top-level shadow tables, this holds a linked-list of all top-level
  14.488 + * shadows (used for recovering memory and destroying shadows). 
  14.489 + *
  14.490 + * In lower-level shadows, this holds the physical address of a higher-level
  14.491 + * shadow entry that holds a reference to this shadow (or zero).
  14.492 + */
  14.493 +
  14.494 +/* Allocating shadow pages
  14.495 + * -----------------------
  14.496 + *
  14.497 + * Most shadow pages are allocated singly, but there are two cases where we 
  14.498 + * need to allocate multiple pages together.
  14.499 + * 
  14.500 + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
   14.501 + *    A 32-bit guest l1 table covers 4MB of virtual address space,
  14.502 + *    and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
  14.503 + *    of virtual address space each).  Similarly, a 32-bit guest l2 table 
  14.504 + *    (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va 
  14.505 + *    each).  These multi-page shadows are contiguous and aligned; 
  14.506 + *    functions for handling offsets into them are defined in shadow.c 
  14.507 + *    (shadow_l1_index() etc.)
  14.508 + *    
  14.509 + * 2: Shadowing PAE top-level pages.  Each guest page that contains
  14.510 + *    any PAE top-level pages requires two shadow pages to shadow it.
  14.511 + *    They contain alternating l3 tables and pae_l3_bookkeeping structs.
  14.512 + *
  14.513 + * This table shows the allocation behaviour of the different modes:
  14.514 + *
  14.515 + * Xen paging      32b  pae  pae  64b  64b  64b
  14.516 + * Guest paging    32b  32b  pae  32b  pae  64b
  14.517 + * PV or HVM        *   HVM   *   HVM  HVM   * 
  14.518 + * Shadow paging   32b  pae  pae  pae  pae  64b
  14.519 + *
  14.520 + * sl1 size         4k   8k   4k   8k   4k   4k
  14.521 + * sl2 size         4k  16k   4k  16k   4k   4k
  14.522 + * sl3 size         -    -    8k   -    8k   4k
  14.523 + * sl4 size         -    -    -    -    -    4k
  14.524 + *
  14.525 + * We allocate memory from xen in four-page units and break them down
  14.526 + * with a simple buddy allocator.  Can't use the xen allocator to handle
  14.527 + * this as it only works for contiguous zones, and a domain's shadow
  14.528 + * pool is made of fragments.
  14.529 + *
  14.530 + * In HVM guests, the p2m table is built out of shadow pages, and we provide 
  14.531 + * a function for the p2m management to steal pages, in max-order chunks, from 
  14.532 + * the free pool.  We don't provide for giving them back, yet.
  14.533 + */
  14.534 +
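/* Editor's note -- an illustrative sketch of the buddy arithmetic used by
 * shadow_alloc()/shadow_free() below, assuming SHADOW_MAX_ORDER == 2 (the
 * "four-page units" mentioned above).  Carving an order-0 page out of an
 * order-2 chunk covering pages [0 1 2 3]:
 *
 *     put [0 1] back on freelists[1], put [2] back on freelists[0],
 *     hand out page [3]
 *
 * On free, the buddy of a block is picked from the address parity bit, as
 * in shadow_free():  (mfn & (1 << order)) ? merge with pg - (1 << order)
 *                                         : merge with pg + (1 << order)
 */
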
  14.535 +/* Figure out the least acceptable quantity of shadow memory.
  14.536 + * The minimum memory requirement for always being able to free up a
  14.537 + * chunk of memory is very small -- only three max-order chunks per
  14.538 + * vcpu to hold the top level shadows and pages with Xen mappings in them.  
  14.539 + *
  14.540 + * But for a guest to be guaranteed to successfully execute a single
   14.541 + * instruction, we must be able to map a large number (about thirty) of VAs
  14.542 + * at the same time, which means that to guarantee progress, we must
  14.543 + * allow for more than ninety allocated pages per vcpu.  We round that
  14.544 + * up to 128 pages, or half a megabyte per vcpu. */
  14.545 +unsigned int shadow_min_acceptable_pages(struct domain *d) 
  14.546 +{
  14.547 +    u32 vcpu_count = 0;
  14.548 +    struct vcpu *v;
  14.549 +
  14.550 +    for_each_vcpu(d, v)
  14.551 +        vcpu_count++;
  14.552 +
  14.553 +    return (vcpu_count * 128);
  14.554 +}
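/* Editor's note: 128 pages/vcpu * 4KB = 512KB, i.e. the "half a megabyte
 * per vcpu" above; a 4-vcpu guest therefore starts from a 512-page (2MB)
 * floor, before the per-megabyte p2m allowance added by
 * set_sh_allocation() below. */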
  14.555 +
  14.556 +/* Using the type_info field to store freelist order */
  14.557 +#define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
  14.558 +#define SH_SET_PFN_ORDER(_p, _o)                       \
  14.559 + do { (_p)->u.inuse.type_info = (_o); } while (0)
  14.560 + 
  14.561 +
  14.562 +/* Figure out the order of allocation needed for a given shadow type */
  14.563 +static inline u32
  14.564 +shadow_order(u32 shadow_type) 
  14.565 +{
  14.566 +#if CONFIG_PAGING_LEVELS > 2
  14.567 +    static const u32 type_to_order[16] = {
  14.568 +        0, /* PGC_SH_none           */
  14.569 +        1, /* PGC_SH_l1_32_shadow   */
  14.570 +        1, /* PGC_SH_fl1_32_shadow  */
  14.571 +        2, /* PGC_SH_l2_32_shadow   */
  14.572 +        0, /* PGC_SH_l1_pae_shadow  */
  14.573 +        0, /* PGC_SH_fl1_pae_shadow */
  14.574 +        0, /* PGC_SH_l2_pae_shadow  */
  14.575 +        0, /* PGC_SH_l2h_pae_shadow */
  14.576 +        1, /* PGC_SH_l3_pae_shadow  */
  14.577 +        0, /* PGC_SH_l1_64_shadow   */
  14.578 +        0, /* PGC_SH_fl1_64_shadow  */
  14.579 +        0, /* PGC_SH_l2_64_shadow   */
  14.580 +        0, /* PGC_SH_l3_64_shadow   */
  14.581 +        0, /* PGC_SH_l4_64_shadow   */
  14.582 +        2, /* PGC_SH_p2m_table      */
  14.583 +        0  /* PGC_SH_monitor_table  */
  14.584 +        };
  14.585 +    u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift;
  14.586 +    return type_to_order[type];
  14.587 +#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
  14.588 +    return 0;
  14.589 +#endif
  14.590 +}
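/* Editor's note: the orders above line up with the size table earlier in
 * this file -- e.g. order 1 for PGC_SH_l1_32_shadow is two pages (the 8k
 * sl1 when shadowing a 32-bit guest with PAE shadows), order 2 for
 * PGC_SH_l2_32_shadow is four pages (the 16k sl2), and order 1 for
 * PGC_SH_l3_pae_shadow is the 8k sl3.  On a non-PAE 32-bit build every
 * shadow is a single 4k page, hence the unconditional 0. */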
  14.591 +
  14.592 +
  14.593 +/* Do we have a free chunk of at least this order? */
  14.594 +static inline int chunk_is_available(struct domain *d, int order)
  14.595 +{
  14.596 +    int i;
  14.597 +    
  14.598 +    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
  14.599 +        if ( !list_empty(&d->arch.shadow.freelists[i]) )
  14.600 +            return 1;
  14.601 +    return 0;
  14.602 +}
  14.603 +
  14.604 +/* Dispatcher function: call the per-mode function that will unhook the
  14.605 + * non-Xen mappings in this top-level shadow mfn */
  14.606 +void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
  14.607 +{
  14.608 +    struct page_info *pg = mfn_to_page(smfn);
  14.609 +    switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift )
  14.610 +    {
  14.611 +    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
  14.612 +#if CONFIG_PAGING_LEVELS == 2
  14.613 +        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
  14.614 +#else
  14.615 +        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
  14.616 +#endif
  14.617 +        break;
  14.618 +#if CONFIG_PAGING_LEVELS >= 3
  14.619 +    case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
  14.620 +        SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
  14.621 +        break;
  14.622 +#endif
  14.623 +#if CONFIG_PAGING_LEVELS >= 4
  14.624 +    case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
  14.625 +        SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
  14.626 +        break;
  14.627 +#endif
  14.628 +    default:
  14.629 +        SHADOW_PRINTK("top-level shadow has bad type %08lx\n", 
  14.630 +                       (unsigned long)((pg->count_info & PGC_SH_type_mask)
  14.631 +                                       >> PGC_SH_type_shift));
  14.632 +        BUG();
  14.633 +    }
  14.634 +}
  14.635 +
  14.636 +
  14.637 +/* Make sure there is at least one chunk of the required order available
  14.638 + * in the shadow page pool. This must be called before any calls to
  14.639 + * shadow_alloc().  Since this will free existing shadows to make room,
  14.640 + * it must be called early enough to avoid freeing shadows that the
  14.641 + * caller is currently working on. */
  14.642 +void shadow_prealloc(struct domain *d, unsigned int order)
  14.643 +{
   14.644 +    /* Need a vcpu for calling unpins; for now, since we don't have
  14.645 +     * per-vcpu shadows, any will do */
  14.646 +    struct vcpu *v = d->vcpu[0];
  14.647 +    struct list_head *l, *t;
  14.648 +    struct page_info *pg;
  14.649 +    mfn_t smfn;
  14.650 +
  14.651 +    if ( chunk_is_available(d, order) ) return; 
  14.652 +    
  14.653 +    /* Stage one: walk the list of top-level pages, unpinning them */
  14.654 +    perfc_incrc(shadow_prealloc_1);
  14.655 +    list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
  14.656 +    {
  14.657 +        pg = list_entry(l, struct page_info, list);
  14.658 +        smfn = page_to_mfn(pg);
  14.659 +
  14.660 +#if CONFIG_PAGING_LEVELS >= 3
  14.661 +        if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow )
  14.662 +        {
  14.663 +            /* For PAE, we need to unpin each subshadow on this shadow */
  14.664 +            SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);
  14.665 +        } 
  14.666 +        else 
  14.667 +#endif /* 32-bit code always takes this branch */
  14.668 +        {
  14.669 +            /* Unpin this top-level shadow */
  14.670 +            sh_unpin(v, smfn);
  14.671 +        }
  14.672 +
  14.673 +        /* See if that freed up a chunk of appropriate size */
  14.674 +        if ( chunk_is_available(d, order) ) return;
  14.675 +    }
  14.676 +
  14.677 +    /* Stage two: all shadow pages are in use in hierarchies that are
  14.678 +     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
  14.679 +     * mappings. */
  14.680 +    perfc_incrc(shadow_prealloc_2);
  14.681 +    v = current;
  14.682 +    if ( v->domain != d )
  14.683 +        v = d->vcpu[0];
  14.684 +    /* Walk the list from the tail: recently used toplevels have been pulled
  14.685 +     * to the head */
  14.686 +    list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
  14.687 +    {
  14.688 +        pg = list_entry(l, struct page_info, list);
  14.689 +        smfn = page_to_mfn(pg);
  14.690 +        shadow_unhook_mappings(v, smfn);
  14.691 +
  14.692 +        /* Need to flush TLB if we've altered our own tables */
  14.693 +        if ( !shadow_mode_external(d) 
  14.694 +             && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
  14.695 +            local_flush_tlb();
  14.696 +        
  14.697 +        /* See if that freed up a chunk of appropriate size */
  14.698 +        if ( chunk_is_available(d, order) ) return;
  14.699 +    }
  14.700 +    
  14.701 +    /* Nothing more we can do: all remaining shadows are of pages that
   14.702 +     * hold Xen mappings for some vcpu.  This should never happen. */
  14.703 +    SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
  14.704 +                   "  shadow pages total = %u, free = %u, p2m=%u\n",
  14.705 +                   1 << order, 
  14.706 +                   d->arch.shadow.total_pages, 
  14.707 +                   d->arch.shadow.free_pages, 
  14.708 +                   d->arch.shadow.p2m_pages);
  14.709 +    BUG();
  14.710 +}
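/* Editor's note -- illustrative usage, not part of this changeset: a
 * caller that needs a new shadow reserves space first, then allocates,
 * all under the shadow lock ('gmfn' is a placeholder guest mfn):
 *
 *     shadow_prealloc(d, shadow_order(PGC_SH_l2_32_shadow));
 *     smfn = shadow_alloc(d, PGC_SH_l2_32_shadow, mfn_x(gmfn));
 */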
  14.711 +
  14.712 +
  14.713 +/* Allocate another shadow's worth of (contiguous, aligned) pages,
  14.714 + * and fill in the type and backpointer fields of their page_infos. 
  14.715 + * Never fails to allocate. */
  14.716 +mfn_t shadow_alloc(struct domain *d,  
  14.717 +                    u32 shadow_type,
  14.718 +                    unsigned long backpointer)
  14.719 +{
  14.720 +    struct page_info *pg = NULL;
  14.721 +    unsigned int order = shadow_order(shadow_type);
  14.722 +    cpumask_t mask;
  14.723 +    void *p;
  14.724 +    int i;
  14.725 +
  14.726 +    ASSERT(shadow_lock_is_acquired(d));
  14.727 +    ASSERT(order <= SHADOW_MAX_ORDER);
  14.728 +    ASSERT(shadow_type != PGC_SH_none);
  14.729 +    perfc_incrc(shadow_alloc);
  14.730 +
  14.731 +    /* Find smallest order which can satisfy the request. */
  14.732 +    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
  14.733 +        if ( !list_empty(&d->arch.shadow.freelists[i]) )
  14.734 +        {
  14.735 +            pg = list_entry(d->arch.shadow.freelists[i].next, 
  14.736 +                            struct page_info, list);
  14.737 +            list_del(&pg->list);
  14.738 +            
  14.739 +            /* We may have to halve the chunk a number of times. */
  14.740 +            while ( i != order )
  14.741 +            {
  14.742 +                i--;
  14.743 +                SH_SET_PFN_ORDER(pg, i);
  14.744 +                list_add_tail(&pg->list, &d->arch.shadow.freelists[i]);
  14.745 +                pg += 1 << i;
  14.746 +            }
  14.747 +            d->arch.shadow.free_pages -= 1 << order;
  14.748 +
  14.749 +            /* Init page info fields and clear the pages */
  14.750 +            for ( i = 0; i < 1<<order ; i++ ) 
  14.751 +            {
  14.752 +                pg[i].u.inuse.type_info = backpointer;
  14.753 +                pg[i].count_info = shadow_type;
  14.754 +                pg[i].shadow_flags = 0;
  14.755 +                INIT_LIST_HEAD(&pg[i].list);
  14.756 +                /* Before we overwrite the old contents of this page, 
  14.757 +                 * we need to be sure that no TLB holds a pointer to it. */
  14.758 +                mask = d->domain_dirty_cpumask;
  14.759 +                tlbflush_filter(mask, pg[i].tlbflush_timestamp);
  14.760 +                if ( unlikely(!cpus_empty(mask)) )
  14.761 +                {
  14.762 +                    perfc_incrc(shadow_alloc_tlbflush);
  14.763 +                    flush_tlb_mask(mask);
  14.764 +                }
  14.765 +                /* Now safe to clear the page for reuse */
  14.766 +                p = sh_map_domain_page(page_to_mfn(pg+i));
  14.767 +                ASSERT(p != NULL);
  14.768 +                clear_page(p);
  14.769 +                sh_unmap_domain_page(p);
  14.770 +                perfc_incr(shadow_alloc_count);
  14.771 +            }
  14.772 +            return page_to_mfn(pg);
  14.773 +        }
  14.774 +    
  14.775 +    /* If we get here, we failed to allocate. This should never happen.
  14.776 +     * It means that we didn't call shadow_prealloc() correctly before
  14.777 +     * we allocated.  We can't recover by calling prealloc here, because
  14.778 +     * we might free up higher-level pages that the caller is working on. */
  14.779 +    SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
  14.780 +    BUG();
  14.781 +}
  14.782 +
  14.783 +
  14.784 +/* Return some shadow pages to the pool. */
  14.785 +void shadow_free(struct domain *d, mfn_t smfn)
  14.786 +{
  14.787 +    struct page_info *pg = mfn_to_page(smfn); 
  14.788 +    u32 shadow_type;
  14.789 +    unsigned long order;
  14.790 +    unsigned long mask;
  14.791 +    int i;
  14.792 +
  14.793 +    ASSERT(shadow_lock_is_acquired(d));
  14.794 +    perfc_incrc(shadow_free);
  14.795 +
  14.796 +    shadow_type = pg->count_info & PGC_SH_type_mask;
  14.797 +    ASSERT(shadow_type != PGC_SH_none);
  14.798 +    ASSERT(shadow_type != PGC_SH_p2m_table);
  14.799 +    order = shadow_order(shadow_type);
  14.800 +
  14.801 +    d->arch.shadow.free_pages += 1 << order;
  14.802 +
  14.803 +    for ( i = 0; i < 1<<order; i++ ) 
  14.804 +    {
  14.805 +        /* Strip out the type: this is now a free shadow page */
  14.806 +        pg[i].count_info = 0;
  14.807 +        /* Remember the TLB timestamp so we will know whether to flush 
  14.808 +         * TLBs when we reuse the page.  Because the destructors leave the
  14.809 +         * contents of the pages in place, we can delay TLB flushes until
  14.810 +         * just before the allocator hands the page out again. */
  14.811 +        pg[i].tlbflush_timestamp = tlbflush_current_time();
  14.812 +        perfc_decr(shadow_alloc_count);
  14.813 +    }
  14.814 +
  14.815 +    /* Merge chunks as far as possible. */
  14.816 +    while ( order < SHADOW_MAX_ORDER )
  14.817 +    {
  14.818 +        mask = 1 << order;
  14.819 +        if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
  14.820 +            /* Merge with predecessor block? */
   14.821 +            if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGC_SH_none) 
  14.822 +                 || (SH_PFN_ORDER(pg-mask) != order) )
  14.823 +                break;
  14.824 +            list_del(&(pg-mask)->list);
  14.825 +            pg -= mask;
  14.826 +        } else {
  14.827 +            /* Merge with successor block? */
   14.828 +            if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGC_SH_none)
  14.829 +                 || (SH_PFN_ORDER(pg+mask) != order) )
  14.830 +                break;
  14.831 +            list_del(&(pg+mask)->list);
  14.832 +        }
  14.833 +        order++;
  14.834 +    }
  14.835 +
  14.836 +    SH_SET_PFN_ORDER(pg, order);
  14.837 +    list_add_tail(&pg->list, &d->arch.shadow.freelists[order]);
  14.838 +}
  14.839 +
  14.840 +/* Divert some memory from the pool to be used by the p2m mapping.
  14.841 + * This action is irreversible: the p2m mapping only ever grows.
  14.842 + * That's OK because the p2m table only exists for external domains,
  14.843 + * and those domains can't ever turn off shadow mode.
  14.844 + * Also, we only ever allocate a max-order chunk, so as to preserve
  14.845 + * the invariant that shadow_prealloc() always works.
  14.846 + * Returns 0 iff it can't get a chunk (the caller should then
  14.847 + * free up some pages in domheap and call set_sh_allocation);
  14.848 + * returns non-zero on success.
  14.849 + */
  14.850 +static int
  14.851 +shadow_alloc_p2m_pages(struct domain *d)
  14.852 +{
  14.853 +    struct page_info *pg;
  14.854 +    u32 i;
  14.855 +    ASSERT(shadow_lock_is_acquired(d));
  14.856 +    
  14.857 +    if ( d->arch.shadow.total_pages 
  14.858 +         < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
  14.859 +        return 0; /* Not enough shadow memory: need to increase it first */
  14.860 +    
  14.861 +    pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0));
  14.862 +    d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
  14.863 +    d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
  14.864 +    for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
  14.865 +    {
  14.866 +        /* Unlike shadow pages, mark p2m pages as owned by the domain */
  14.867 +        page_set_owner(&pg[i], d);
  14.868 +        list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
  14.869 +    }
  14.870 +    return 1;
  14.871 +}
  14.872 +
  14.873 +// Returns 0 if no memory is available...
  14.874 +mfn_t
  14.875 +shadow_alloc_p2m_page(struct domain *d)
  14.876 +{
  14.877 +    struct list_head *entry;
  14.878 +    mfn_t mfn;
  14.879 +    void *p;
  14.880 +
  14.881 +    if ( list_empty(&d->arch.shadow.p2m_freelist) &&
  14.882 +         !shadow_alloc_p2m_pages(d) )
  14.883 +        return _mfn(0);
  14.884 +    entry = d->arch.shadow.p2m_freelist.next;
  14.885 +    list_del(entry);
  14.886 +    list_add_tail(entry, &d->arch.shadow.p2m_inuse);
  14.887 +    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
  14.888 +    sh_get_ref(mfn, 0);
  14.889 +    p = sh_map_domain_page(mfn);
  14.890 +    clear_page(p);
  14.891 +    sh_unmap_domain_page(p);
  14.892 +
  14.893 +    return mfn;
  14.894 +}
  14.895 +
  14.896 +#if CONFIG_PAGING_LEVELS == 3
  14.897 +static void p2m_install_entry_in_monitors(struct domain *d, 
  14.898 +                                          l3_pgentry_t *l3e) 
  14.899 +/* Special case, only used for external-mode domains on PAE hosts:
  14.900 + * update the mapping of the p2m table.  Once again, this is trivial in
  14.901 + * other paging modes (one top-level entry points to the top-level p2m,
  14.902 + * no maintenance needed), but PAE makes life difficult by needing a
   14.903 + * copy of the eight l3es of the p2m table in eight l2h slots in the
  14.904 + * monitor table.  This function makes fresh copies when a p2m l3e
  14.905 + * changes. */
  14.906 +{
  14.907 +    l2_pgentry_t *ml2e;
  14.908 +    struct vcpu *v;
  14.909 +    unsigned int index;
  14.910 +
  14.911 +    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
  14.912 +    ASSERT(index < MACHPHYS_MBYTES>>1);
  14.913 +
  14.914 +    for_each_vcpu(d, v) 
  14.915 +    {
  14.916 +        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
  14.917 +            continue;
  14.918 +        ASSERT(shadow_mode_external(v->domain));
  14.919 +
  14.920 +        SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
  14.921 +                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
  14.922 +
  14.923 +        if ( v == current ) /* OK to use linear map of monitor_table */
  14.924 +            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
  14.925 +        else 
  14.926 +        {
  14.927 +            l3_pgentry_t *ml3e;
  14.928 +            ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
  14.929 +            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
  14.930 +            ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
  14.931 +            ml2e += l2_table_offset(RO_MPT_VIRT_START);
  14.932 +            sh_unmap_domain_page(ml3e);
  14.933 +        }
  14.934 +        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
  14.935 +        if ( v != current )
  14.936 +            sh_unmap_domain_page(ml2e);
  14.937 +    }
  14.938 +}
  14.939 +#endif
  14.940 +
  14.941 +// Find the next level's P2M entry, checking for out-of-range gfn's...
  14.942 +// Returns NULL on error.
  14.943 +//
  14.944 +static l1_pgentry_t *
  14.945 +p2m_find_entry(void *table, unsigned long *gfn_remainder,
  14.946 +                   unsigned long gfn, u32 shift, u32 max)
  14.947 +{
  14.948 +    u32 index;
  14.949 +
  14.950 +    index = *gfn_remainder >> shift;
  14.951 +    if ( index >= max )
  14.952 +    {
  14.953 +        SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
  14.954 +                      "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
  14.955 +                       gfn, *gfn_remainder, shift, index, max);
  14.956 +        return NULL;
  14.957 +    }
  14.958 +    *gfn_remainder &= (1 << shift) - 1;
  14.959 +    return (l1_pgentry_t *)table + index;
  14.960 +}
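/* Editor's note -- a worked example of the arithmetic above, using the L2
 * call made from shadow_set_p2m_entry() below (shift = L2_PAGETABLE_SHIFT
 * - PAGE_SHIFT, i.e. 9 with PAE/64-bit page tables): for gfn_remainder
 * 0x12345, index = 0x12345 >> 9 = 0x91, and the remainder left for the L1
 * level is 0x12345 & 0x1ff = 0x145. */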
  14.961 +
  14.962 +// Walk one level of the P2M table, allocating a new table if required.
  14.963 +// Returns 0 on error.
  14.964 +//
  14.965 +static int
  14.966 +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, 
  14.967 +               unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
  14.968 +               u32 max, unsigned long type)
  14.969 +{
  14.970 +    l1_pgentry_t *p2m_entry;
  14.971 +    void *next;
  14.972 +
  14.973 +    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
  14.974 +                                      shift, max)) )
  14.975 +        return 0;
  14.976 +
  14.977 +    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
  14.978 +    {
  14.979 +        mfn_t mfn = shadow_alloc_p2m_page(d);
  14.980 +        if ( mfn_x(mfn) == 0 )
  14.981 +            return 0;
  14.982 +        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
  14.983 +        mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
  14.984 +        mfn_to_page(mfn)->count_info = 1;
  14.985 +#if CONFIG_PAGING_LEVELS == 3
  14.986 +        if (type == PGT_l2_page_table)
  14.987 +        {
  14.988 +            /* We have written to the p2m l3: need to sync the per-vcpu
  14.989 +             * copies of it in the monitor tables */
  14.990 +            p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
  14.991 +        }
  14.992 +#endif
  14.993 +        /* The P2M can be shadowed: keep the shadows synced */
  14.994 +        if ( d->vcpu[0] )
  14.995 +            (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
  14.996 +                                                 p2m_entry, sizeof *p2m_entry);
  14.997 +    }
  14.998 +    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
  14.999 +    next = sh_map_domain_page(*table_mfn);
 14.1000 +    sh_unmap_domain_page(*table);
 14.1001 +    *table = next;
 14.1002 +
 14.1003 +    return 1;
 14.1004 +}
 14.1005 +
 14.1006 +// Returns 0 on error (out of memory)
 14.1007 +int
 14.1008 +shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
 14.1009 +{
  14.1010 +    // XXX -- this could be made faster if current->domain == d
 14.1011 +    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
 14.1012 +    void *table = sh_map_domain_page(table_mfn);
 14.1013 +    unsigned long gfn_remainder = gfn;
 14.1014 +    l1_pgentry_t *p2m_entry;
 14.1015 +
 14.1016 +#if CONFIG_PAGING_LEVELS >= 4
 14.1017 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 14.1018 +                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
 14.1019 +                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
 14.1020 +        return 0;
 14.1021 +#endif
 14.1022 +#if CONFIG_PAGING_LEVELS >= 3
 14.1023 +    // When using PAE Xen, we only allow 33 bits of pseudo-physical
 14.1024 +    // address in translated guests (i.e. 8 GBytes).  This restriction
 14.1025 +    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
 14.1026 +    // in Xen's address space for translated PV guests.
 14.1027 +    //
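    // (Editor's note: 16MB of RO_MPT space / 8-byte l1 entries = 2^21
    //  p2m entries, each mapping a 4KB frame: 2^21 * 2^12 = 2^33 bytes,
    //  i.e. the 8 GBytes / 33 bits quoted above.)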
 14.1028 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 14.1029 +                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
 14.1030 +                         (CONFIG_PAGING_LEVELS == 3
 14.1031 +                          ? 8
 14.1032 +                          : L3_PAGETABLE_ENTRIES),
 14.1033 +                         PGT_l2_page_table) )
 14.1034 +        return 0;
 14.1035 +#endif
 14.1036 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 14.1037 +                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
 14.1038 +                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
 14.1039 +        return 0;
 14.1040 +
 14.1041 +    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
 14.1042 +                               0, L1_PAGETABLE_ENTRIES);
 14.1043 +    ASSERT(p2m_entry);
 14.1044 +    if ( valid_mfn(mfn) )
 14.1045 +        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
 14.1046 +    else
 14.1047 +        *p2m_entry = l1e_empty();
 14.1048 +
 14.1049 +    /* The P2M can be shadowed: keep the shadows synced */
 14.1050 +    (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn, 
 14.1051 +                                          p2m_entry, sizeof *p2m_entry);
 14.1052 +
 14.1053 +    sh_unmap_domain_page(table);
 14.1054 +
 14.1055 +    return 1;
 14.1056 +}
 14.1057 +
 14.1058 +// Allocate a new p2m table for a domain.
 14.1059 +//
 14.1060 +// The structure of the p2m table is that of a pagetable for xen (i.e. it is
 14.1061 +// controlled by CONFIG_PAGING_LEVELS).
 14.1062 +//
 14.1063 +// Returns 0 if p2m table could not be initialized
 14.1064 +//
 14.1065 +static int
 14.1066 +shadow_alloc_p2m_table(struct domain *d)
 14.1067 +{
 14.1068 +    mfn_t p2m_top;
 14.1069 +    struct list_head *entry;
 14.1070 +    unsigned int page_count = 0;
 14.1071 +    
 14.1072 +    SHADOW_PRINTK("allocating p2m table\n");
 14.1073 +    ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
 14.1074 +
  14.1075 +    p2m_top = shadow_alloc_p2m_page(d);
  14.1076 +    if ( mfn_x(p2m_top) == 0 )
  14.1077 +        return 0;
  14.1078 +
  14.1079 +    mfn_to_page(p2m_top)->count_info = 1;
  14.1080 +    mfn_to_page(p2m_top)->u.inuse.type_info =
  14.1081 +#if CONFIG_PAGING_LEVELS == 4
  14.1082 +        PGT_l4_page_table
  14.1083 +#elif CONFIG_PAGING_LEVELS == 3
  14.1084 +        PGT_l3_page_table
  14.1085 +#elif CONFIG_PAGING_LEVELS == 2
  14.1086 +        PGT_l2_page_table
  14.1087 +#endif
  14.1088 +        | 1 | PGT_validated;
 14.1089 +
 14.1090 +    d->arch.phys_table = pagetable_from_mfn(p2m_top);
 14.1091 +
 14.1092 +    SHADOW_PRINTK("populating p2m table\n");
 14.1093 + 
 14.1094 +    for ( entry = d->page_list.next;
 14.1095 +          entry != &d->page_list;
 14.1096 +          entry = entry->next )
 14.1097 +    {
 14.1098 +        struct page_info *page = list_entry(entry, struct page_info, list);
 14.1099 +        mfn_t mfn = page_to_mfn(page);
 14.1100 +        unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
 14.1101 +        page_count++;
 14.1102 +        if (
 14.1103 +#ifdef __x86_64__
 14.1104 +            (gfn != 0x5555555555555555L)
 14.1105 +#else
 14.1106 +            (gfn != 0x55555555L)
 14.1107 +#endif
 14.1108 +             && gfn != INVALID_M2P_ENTRY
 14.1109 +             && !shadow_set_p2m_entry(d, gfn, mfn) )
 14.1110 +        {
 14.1111 +            SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n",
 14.1112 +                           gfn, mfn_x(mfn));
 14.1113 +            return 0;
 14.1114 +        }
 14.1115 +    }
 14.1116 +
 14.1117 +    SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
 14.1118 +    return 1;
 14.1119 +}
 14.1120 +
 14.1121 +mfn_t
 14.1122 +sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
 14.1123 +/* Read another domain's p2m entries */
 14.1124 +{
 14.1125 +    mfn_t mfn;
 14.1126 +    unsigned long addr = gpfn << PAGE_SHIFT;
 14.1127 +    l2_pgentry_t *l2e;
 14.1128 +    l1_pgentry_t *l1e;
 14.1129 +    
 14.1130 +    ASSERT(shadow_mode_translate(d));
 14.1131 +    mfn = pagetable_get_mfn(d->arch.phys_table);
 14.1132 +
 14.1133 +
 14.1134 +#if CONFIG_PAGING_LEVELS > 2
 14.1135 +    if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
 14.1136 +        /* This pfn is higher than the p2m map can hold */
 14.1137 +        return _mfn(INVALID_MFN);
 14.1138 +#endif
 14.1139 +
 14.1140 +
 14.1141 +#if CONFIG_PAGING_LEVELS >= 4
 14.1142 +    { 
 14.1143 +        l4_pgentry_t *l4e = sh_map_domain_page(mfn);
 14.1144 +        l4e += l4_table_offset(addr);
 14.1145 +        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
 14.1146 +        {
 14.1147 +            sh_unmap_domain_page(l4e);
 14.1148 +            return _mfn(INVALID_MFN);
 14.1149 +        }
 14.1150 +        mfn = _mfn(l4e_get_pfn(*l4e));
 14.1151 +        sh_unmap_domain_page(l4e);
 14.1152 +    }
 14.1153 +#endif
 14.1154 +#if CONFIG_PAGING_LEVELS >= 3
 14.1155 +    {
 14.1156 +        l3_pgentry_t *l3e = sh_map_domain_page(mfn);
 14.1157 +        l3e += l3_table_offset(addr);
 14.1158 +        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
 14.1159 +        {
 14.1160 +            sh_unmap_domain_page(l3e);
 14.1161 +            return _mfn(INVALID_MFN);
 14.1162 +        }
 14.1163 +        mfn = _mfn(l3e_get_pfn(*l3e));
 14.1164 +        sh_unmap_domain_page(l3e);
 14.1165 +    }
 14.1166 +#endif
 14.1167 +
 14.1168 +    l2e = sh_map_domain_page(mfn);
 14.1169 +    l2e += l2_table_offset(addr);
 14.1170 +    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
 14.1171 +    {
 14.1172 +        sh_unmap_domain_page(l2e);
 14.1173 +        return _mfn(INVALID_MFN);
 14.1174 +    }
 14.1175 +    mfn = _mfn(l2e_get_pfn(*l2e));
 14.1176 +    sh_unmap_domain_page(l2e);
 14.1177 +
 14.1178 +    l1e = sh_map_domain_page(mfn);
 14.1179 +    l1e += l1_table_offset(addr);
 14.1180 +    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
 14.1181 +    {
 14.1182 +        sh_unmap_domain_page(l1e);
 14.1183 +        return _mfn(INVALID_MFN);
 14.1184 +    }
 14.1185 +    mfn = _mfn(l1e_get_pfn(*l1e));
 14.1186 +    sh_unmap_domain_page(l1e);
 14.1187 +
 14.1188 +    return mfn;
 14.1189 +}
 14.1190 +
 14.1191 +unsigned long
 14.1192 +shadow_gfn_to_mfn_foreign(unsigned long gpfn)
 14.1193 +{
 14.1194 +    return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
 14.1195 +}
 14.1196 +
 14.1197 +
 14.1198 +static void shadow_p2m_teardown(struct domain *d)
 14.1199 +/* Return all the p2m pages to Xen.
 14.1200 + * We know we don't have any extra mappings to these pages */
 14.1201 +{
 14.1202 +    struct list_head *entry, *n;
 14.1203 +    struct page_info *pg;
 14.1204 +
 14.1205 +    d->arch.phys_table = pagetable_null();
 14.1206 +
 14.1207 +    list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
 14.1208 +    {
 14.1209 +        pg = list_entry(entry, struct page_info, list);
 14.1210 +        list_del(entry);
 14.1211 +        /* Should have just the one ref we gave it in alloc_p2m_page() */
 14.1212 +        if ( (pg->count_info & PGC_SH_count_mask) != 1 )
 14.1213 +        {
 14.1214 +            SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
 14.1215 +                           pg->count_info, pg->u.inuse.type_info);
 14.1216 +        }
 14.1217 +        ASSERT(page_get_owner(pg) == d);
 14.1218 +        /* Free should not decrement domain's total allocation, since 
 14.1219 +         * these pages were allocated without an owner. */
 14.1220 +        page_set_owner(pg, NULL); 
 14.1221 +        free_domheap_pages(pg, 0);
 14.1222 +        d->arch.shadow.p2m_pages--;
 14.1223 +        perfc_decr(shadow_alloc_count);
 14.1224 +    }
 14.1225 +    list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
 14.1226 +    {
 14.1227 +        list_del(entry);
 14.1228 +        pg = list_entry(entry, struct page_info, list);
 14.1229 +        ASSERT(page_get_owner(pg) == d);
 14.1230 +        /* Free should not decrement domain's total allocation. */
 14.1231 +        page_set_owner(pg, NULL); 
 14.1232 +        free_domheap_pages(pg, 0);
 14.1233 +        d->arch.shadow.p2m_pages--;
 14.1234 +        perfc_decr(shadow_alloc_count);
 14.1235 +    }
 14.1236 +    ASSERT(d->arch.shadow.p2m_pages == 0);
 14.1237 +}
 14.1238 +
 14.1239 +/* Set the pool of shadow pages to the required number of pages.
 14.1240 + * Input will be rounded up to at least shadow_min_acceptable_pages(),
 14.1241 + * plus space for the p2m table.
 14.1242 + * Returns 0 for success, non-zero for failure. */
 14.1243 +static unsigned int set_sh_allocation(struct domain *d, 
 14.1244 +                                       unsigned int pages,
 14.1245 +                                       int *preempted)
 14.1246 +{
 14.1247 +    struct page_info *pg;
 14.1248 +    unsigned int lower_bound;
 14.1249 +    int j;
 14.1250 +
 14.1251 +    ASSERT(shadow_lock_is_acquired(d));
 14.1252 +    
 14.1253 +    /* Don't allocate less than the minimum acceptable, plus one page per
 14.1254 +     * megabyte of RAM (for the p2m table) */
 14.1255 +    lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
 14.1256 +    if ( pages > 0 && pages < lower_bound )
 14.1257 +        pages = lower_bound;
 14.1258 +    /* Round up to largest block size */
 14.1259 +    pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
 14.1260 +
 14.1261 +    SHADOW_PRINTK("current %i target %i\n", 
 14.1262 +                   d->arch.shadow.total_pages, pages);
 14.1263 +
 14.1264 +    while ( d->arch.shadow.total_pages != pages ) 
 14.1265 +    {
 14.1266 +        if ( d->arch.shadow.total_pages < pages ) 
 14.1267 +        {
 14.1268 +            /* Need to allocate more memory from domheap */
 14.1269 +            pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); 
 14.1270 +            if ( pg == NULL ) 
 14.1271 +            { 
 14.1272 +                SHADOW_PRINTK("failed to allocate shadow pages.\n");
 14.1273 +                return -ENOMEM;
 14.1274 +            }
 14.1275 +            d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
 14.1276 +            d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
 14.1277 +            for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ ) 
 14.1278 +            {
 14.1279 +                pg[j].u.inuse.type_info = 0;  /* Free page */
 14.1280 +                pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
 14.1281 +            }
 14.1282 +            SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER);
 14.1283 +            list_add_tail(&pg->list, 
 14.1284 +                          &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
 14.1285 +        } 
 14.1286 +        else if ( d->arch.shadow.total_pages > pages ) 
 14.1287 +        {
 14.1288 +            /* Need to return memory to domheap */
 14.1289 +            shadow_prealloc(d, SHADOW_MAX_ORDER);
 14.1290 +            ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
 14.1291 +            pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next, 
 14.1292 +                            struct page_info, list);
 14.1293 +            list_del(&pg->list);
 14.1294 +            d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
 14.1295 +            d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
 14.1296 +            free_domheap_pages(pg, SHADOW_MAX_ORDER);
 14.1297 +        }
 14.1298 +
 14.1299 +        /* Check to see if we need to yield and try again */
 14.1300 +        if ( preempted && hypercall_preempt_check() )
 14.1301 +        {
 14.1302 +            *preempted = 1;
 14.1303 +            return 0;
 14.1304 +        }
 14.1305 +    }
 14.1306 +
 14.1307 +    return 0;
 14.1308 +}
 14.1309 +
 14.1310 +unsigned int shadow_set_allocation(struct domain *d, 
 14.1311 +                                    unsigned int megabytes,
 14.1312 +                                    int *preempted)
 14.1313 +/* Hypercall interface to set the shadow memory allocation */
 14.1314 +{
 14.1315 +    unsigned int rv;
 14.1316 +    shadow_lock(d);
 14.1317 +    rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); 
 14.1318 +    SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
 14.1319 +                   d->domain_id,
 14.1320 +                   d->arch.shadow.total_pages,
 14.1321 +                   shadow_get_allocation(d));
 14.1322 +    shadow_unlock(d);
 14.1323 +    return rv;
 14.1324 +}
 14.1325 +
 14.1326 +/**************************************************************************/
 14.1327 +/* Hash table for storing the guest->shadow mappings */
 14.1328 +
 14.1329 +/* Hash function that takes a gfn or mfn, plus another byte of type info */
 14.1330 +typedef u32 key_t;
 14.1331 +static inline key_t sh_hash(unsigned long n, u8 t) 
 14.1332 +{
 14.1333 +    unsigned char *p = (unsigned char *)&n;
 14.1334 +    key_t k = t;
 14.1335 +    int i;
 14.1336 +    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
 14.1337 +    return k;
 14.1338 +}
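/* Editor's note: this is essentially the sdbm string hash, folded over
 * the bytes of 'n' and seeded with the type byte 't'.  As in the lookup,
 * insert and delete paths below, the bucket is then chosen with
 *
 *     key_t key = sh_hash(n, t);
 *     head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 */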
 14.1339 +
 14.1340 +#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
 14.1341 +
 14.1342 +/* Before we get to the mechanism, define a pair of audit functions
 14.1343 + * that sanity-check the contents of the hash table. */
 14.1344 +static void sh_hash_audit_bucket(struct domain *d, int bucket)
 14.1345 +/* Audit one bucket of the hash table */
 14.1346 +{
 14.1347 +    struct shadow_hash_entry *e, *x;
 14.1348 +    struct page_info *pg;
 14.1349 +
 14.1350 +    if ( !(SHADOW_AUDIT_ENABLE) )
 14.1351 +        return;
 14.1352 +
 14.1353 +    e = &d->arch.shadow.hash_table[bucket];
 14.1354 +    if ( e->t == 0 ) return; /* Bucket is empty */ 
 14.1355 +    while ( e )
 14.1356 +    {
 14.1357 +        /* Empty link? */
 14.1358 +        BUG_ON( e->t == 0 ); 
 14.1359 +        /* Bogus type? */
 14.1360 +        BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) );
 14.1361 +        /* Wrong bucket? */
 14.1362 +        BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket ); 
 14.1363 +        /* Duplicate entry? */
 14.1364 +        for ( x = e->next; x; x = x->next )
 14.1365 +            BUG_ON( x->n == e->n && x->t == e->t );
 14.1366 +        /* Bogus MFN? */
 14.1367 +        BUG_ON( !valid_mfn(e->smfn) );
 14.1368 +        pg = mfn_to_page(e->smfn);
 14.1369 +        /* Not a shadow? */
 14.1370 +        BUG_ON( page_get_owner(pg) != 0 );
 14.1371 +        /* Wrong kind of shadow? */
 14.1372 +        BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift 
 14.1373 +                != e->t ); 
 14.1374 +        /* Bad backlink? */
 14.1375 +        BUG_ON( pg->u.inuse.type_info != e->n );
 14.1376 +        if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
 14.1377 +             && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
 14.1378 +             && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
 14.1379 +        {
 14.1380 +            /* Bad shadow flags on guest page? */
 14.1381 +            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
 14.1382 +        }
 14.1383 +        /* That entry was OK; on we go */
 14.1384 +        e = e->next;
 14.1385 +    }
 14.1386 +}
 14.1387 +
 14.1388 +#else
 14.1389 +#define sh_hash_audit_bucket(_d, _b)
 14.1390 +#endif /* Hashtable bucket audit */
 14.1391 +
 14.1392 +
 14.1393 +#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
 14.1394 +
 14.1395 +static void sh_hash_audit(struct domain *d)
 14.1396 +/* Full audit: audit every bucket in the table */
 14.1397 +{
 14.1398 +    int i;
 14.1399 +
 14.1400 +    if ( !(SHADOW_AUDIT_ENABLE) )
 14.1401 +        return;
 14.1402 +
 14.1403 +    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
 14.1404 +    {
 14.1405 +        sh_hash_audit_bucket(d, i);
 14.1406 +    }
 14.1407 +}
 14.1408 +
 14.1409 +#else
 14.1410 +#define sh_hash_audit(_d)
 14.1411 +#endif /* Hashtable bucket audit */
 14.1412 +
 14.1413 +/* Memory management interface for bucket allocation.
 14.1414 + * These ought to come out of shadow memory, but at least on 32-bit
 14.1415 + * machines we are forced to allocate them from xenheap so that we can
 14.1416 + * address them. */
 14.1417 +static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d)
 14.1418 +{
 14.1419 +    struct shadow_hash_entry *extra, *x;
 14.1420 +    int i;
 14.1421 +
 14.1422 +    /* We need to allocate a new node. Ensure the free list is not empty. 
 14.1423 +     * Allocate new entries in units the same size as the original table. */
 14.1424 +    if ( unlikely(d->arch.shadow.hash_freelist == NULL) )
 14.1425 +    {
 14.1426 +        size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x));
 14.1427 +        extra = xmalloc_bytes(sz);
 14.1428 +
 14.1429 +        if ( extra == NULL )
 14.1430 +        {
 14.1431 +            /* No memory left! */
 14.1432 +            SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n");
 14.1433 +            domain_crash_synchronous();
 14.1434 +        }
 14.1435 +        memset(extra, 0, sz);
 14.1436 +
 14.1437 +        /* Record the allocation block so it can be correctly freed later. */
 14.1438 +        *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) = 
 14.1439 +            d->arch.shadow.hash_allocations;
 14.1440 +        d->arch.shadow.hash_allocations = &extra[0];
 14.1441 +
 14.1442 +        /* Thread a free chain through the newly-allocated nodes. */
 14.1443 +        for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ )
 14.1444 +            extra[i].next = &extra[i+1];
 14.1445 +        extra[i].next = NULL;
 14.1446 +
 14.1447 +        /* Add the new nodes to the free list. */
 14.1448 +        d->arch.shadow.hash_freelist = &extra[0];
 14.1449 +    }
 14.1450 +
 14.1451 +    /* Allocate a new node from the free list. */
 14.1452 +    x = d->arch.shadow.hash_freelist;
 14.1453 +    d->arch.shadow.hash_freelist = x->next;
 14.1454 +    return x;
 14.1455 +}
 14.1456 +
 14.1457 +static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e)
 14.1458 +{
 14.1459 +    /* Mark the bucket as empty and return it to the free list */
 14.1460 +    e->t = 0; 
 14.1461 +    e->next = d->arch.shadow.hash_freelist;
 14.1462 +    d->arch.shadow.hash_freelist = e;
 14.1463 +}
 14.1464 +
 14.1465 +
 14.1466 +/* Allocate and initialise the table itself.  
 14.1467 + * Returns 0 for success, 1 for error. */
 14.1468 +static int shadow_hash_alloc(struct domain *d)
 14.1469 +{
 14.1470 +    struct shadow_hash_entry *table;
 14.1471 +
 14.1472 +    ASSERT(shadow_lock_is_acquired(d));
 14.1473 +    ASSERT(!d->arch.shadow.hash_table);
 14.1474 +
 14.1475 +    table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS);
 14.1476 +    if ( !table ) return 1;
 14.1477 +    memset(table, 0, 
 14.1478 +           SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry));
 14.1479 +    d->arch.shadow.hash_table = table;
 14.1480 +    return 0;
 14.1481 +}
 14.1482 +
 14.1483 +/* Tear down the hash table and return all memory to Xen.
 14.1484 + * This function does not care whether the table is populated. */
 14.1485 +static void shadow_hash_teardown(struct domain *d)
 14.1486 +{
 14.1487 +    struct shadow_hash_entry *a, *n;
 14.1488 +
 14.1489 +    ASSERT(shadow_lock_is_acquired(d));
 14.1490 +    ASSERT(d->arch.shadow.hash_table);
 14.1491 +
 14.1492 +    /* Return the table itself */
 14.1493 +    xfree(d->arch.shadow.hash_table);
 14.1494 +    d->arch.shadow.hash_table = NULL;
 14.1495 +
 14.1496 +    /* Return any extra allocations */
 14.1497 +    a = d->arch.shadow.hash_allocations;
 14.1498 +    while ( a ) 
 14.1499 +    {
 14.1500 +        /* We stored a linked-list pointer at the end of each allocation */
 14.1501 +        n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS]));
 14.1502 +        xfree(a);
 14.1503 +        a = n;
 14.1504 +    }
 14.1505 +    d->arch.shadow.hash_allocations = NULL;
 14.1506 +    d->arch.shadow.hash_freelist = NULL;
 14.1507 +}
 14.1508 +
 14.1509 +
 14.1510 +mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
 14.1511 +/* Find an entry in the hash table.  Returns the MFN of the shadow,
 14.1512 + * or INVALID_MFN if it doesn't exist */
 14.1513 +{
 14.1514 +    struct domain *d = v->domain;
 14.1515 +    struct shadow_hash_entry *p, *x, *head;
 14.1516 +    key_t key;
 14.1517 +
 14.1518 +    ASSERT(shadow_lock_is_acquired(d));
 14.1519 +    ASSERT(d->arch.shadow.hash_table);
 14.1520 +    ASSERT(t);
 14.1521 +
 14.1522 +    sh_hash_audit(d);
 14.1523 +
 14.1524 +    perfc_incrc(shadow_hash_lookups);
 14.1525 +    key = sh_hash(n, t);
 14.1526 +
 14.1527 +    x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 14.1528 +    p = NULL;
 14.1529 +
 14.1530 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 14.1531 +
 14.1532 +    do
 14.1533 +    {
 14.1534 +        ASSERT(x->t || ((x == head) && (x->next == NULL)));
 14.1535 +
 14.1536 +        if ( x->n == n && x->t == t )
 14.1537 +        {
 14.1538 +            /* Pull-to-front if 'x' isn't already the head item */
 14.1539 +            if ( unlikely(x != head) )
 14.1540 +            {
 14.1541 +                if ( unlikely(d->arch.shadow.hash_walking != 0) )
 14.1542 +                    /* Can't reorder: someone is walking the hash chains */
 14.1543 +                    return x->smfn;
 14.1544 +                else 
 14.1545 +                {
 14.1546 +                    /* Delete 'x' from list and reinsert after head. */
 14.1547 +                    p->next = x->next;
 14.1548 +                    x->next = head->next;
 14.1549 +                    head->next = x;
 14.1550 +                    
 14.1551 +                    /* Swap 'x' contents with head contents. */
 14.1552 +                    SWAP(head->n, x->n);
 14.1553 +                    SWAP(head->t, x->t);
 14.1554 +                    SWAP(head->smfn, x->smfn);
 14.1555 +                }
 14.1556 +            }
 14.1557 +            else
 14.1558 +            {
 14.1559 +                perfc_incrc(shadow_hash_lookup_head);
 14.1560 +            }
 14.1561 +            return head->smfn;
 14.1562 +        }
 14.1563 +
 14.1564 +        p = x;
 14.1565 +        x = x->next;
 14.1566 +    }
 14.1567 +    while ( x != NULL );
 14.1568 +
 14.1569 +    perfc_incrc(shadow_hash_lookup_miss);
 14.1570 +    return _mfn(INVALID_MFN);
 14.1571 +}
 14.1572 +
 14.1573 +void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
 14.1574 +/* Put a mapping (n,t)->smfn into the hash table */
 14.1575 +{
 14.1576 +    struct domain *d = v->domain;
 14.1577 +    struct shadow_hash_entry *x, *head;
 14.1578 +    key_t key;
 14.1579 +    
 14.1580 +    ASSERT(shadow_lock_is_acquired(d));
 14.1581 +    ASSERT(d->arch.shadow.hash_table);
 14.1582 +    ASSERT(t);
 14.1583 +
 14.1584 +    sh_hash_audit(d);
 14.1585 +
 14.1586 +    perfc_incrc(shadow_hash_inserts);
 14.1587 +    key = sh_hash(n, t);
 14.1588 +
 14.1589 +    head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 14.1590 +
 14.1591 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 14.1592 +
 14.1593 +    /* If the bucket is empty then insert the new page as the head item. */
 14.1594 +    if ( head->t == 0 )
 14.1595 +    {
 14.1596 +        head->n = n;
 14.1597 +        head->t = t;
 14.1598 +        head->smfn = smfn;
 14.1599 +        ASSERT(head->next == NULL);
 14.1600 +    }
 14.1601 +    else 
 14.1602 +    {
 14.1603 +        /* Insert a new entry directly after the head item. */
 14.1604 +        x = sh_alloc_hash_entry(d);
 14.1605 +        x->n = n; 
 14.1606 +        x->t = t;
 14.1607 +        x->smfn = smfn;
 14.1608 +        x->next = head->next;
 14.1609 +        head->next = x;
 14.1610 +    }
 14.1611 +    
 14.1612 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 14.1613 +}
 14.1614 +
 14.1615 +void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
 14.1616 +/* Excise the mapping (n,t)->smfn from the hash table */
 14.1617 +{
 14.1618 +    struct domain *d = v->domain;
 14.1619 +    struct shadow_hash_entry *p, *x, *head;
 14.1620 +    key_t key;
 14.1621 +
 14.1622 +    ASSERT(shadow_lock_is_acquired(d));
 14.1623 +    ASSERT(d->arch.shadow.hash_table);
 14.1624 +    ASSERT(t);
 14.1625 +
 14.1626 +    sh_hash_audit(d);
 14.1627 +
 14.1628 +    perfc_incrc(shadow_hash_deletes);
 14.1629 +    key = sh_hash(n, t);
 14.1630 +
 14.1631 +    head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 14.1632 +
 14.1633 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 14.1634 +
 14.1635 +    /* Match on head item? */
 14.1636 +    if ( head->n == n && head->t == t )
 14.1637 +    {
 14.1638 +        if ( (x = head->next) != NULL )
 14.1639 +        {
 14.1640 +            /* Overwrite head with contents of following node. */
 14.1641 +            head->n = x->n;
 14.1642 +            head->t = x->t;
 14.1643 +            head->smfn = x->smfn;
 14.1644 +
 14.1645 +            /* Delete following node. */
 14.1646 +            head->next = x->next;
 14.1647 +            sh_free_hash_entry(d, x);
 14.1648 +        }
 14.1649 +        else
 14.1650 +        {
 14.1651 +            /* This bucket is now empty. Initialise the head node. */
 14.1652 +            head->t = 0;
 14.1653 +        }
 14.1654 +    }
 14.1655 +    else 
 14.1656 +    {
 14.1657 +        /* Not at the head; need to walk the chain */
 14.1658 +        p = head;
 14.1659 +        x = head->next; 
 14.1660 +        
 14.1661 +        while(1)
 14.1662 +        {
 14.1663 +            ASSERT(x); /* We can't have hit the end, since our target is
 14.1664 +                        * still in the chain somewhere... */
 14.1665 +            if ( x->n == n && x->t == t )
 14.1666 +            {
 14.1667 +                /* Delete matching node. */
 14.1668 +                p->next = x->next;
 14.1669 +                sh_free_hash_entry(d, x);
 14.1670 +                break;
 14.1671 +            }
 14.1672 +            p = x;
 14.1673 +            x = x->next;
 14.1674 +        }
 14.1675 +    }
 14.1676 +
 14.1677 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 14.1678 +}
 14.1679 +
 14.1680 +typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
 14.1681 +
 14.1682 +static void hash_foreach(struct vcpu *v, 
 14.1683 +                         unsigned int callback_mask, 
 14.1684 +                         hash_callback_t callbacks[], 
 14.1685 +                         mfn_t callback_mfn)
 14.1686 +/* Walk the hash table looking at the types of the entries and 
 14.1687 + * calling the appropriate callback function for each entry. 
 14.1688 + * The mask determines which shadow types we call back for, and the array
 14.1689 + * of callbacks tells us which function to call.
 14.1690 + * Any callback may return non-zero to let us skip the rest of the scan. 
 14.1691 + *
 14.1692 + * WARNING: Callbacks MUST NOT add or remove hash entries unless they 
 14.1693 + * then return non-zero to terminate the scan. */
 14.1694 +{
 14.1695 +    int i, done = 0;
 14.1696 +    struct domain *d = v->domain;
 14.1697 +    struct shadow_hash_entry *x;
 14.1698 +
 14.1699 +    /* Say we're here, to stop hash-lookups reordering the chains */
 14.1700 +    ASSERT(shadow_lock_is_acquired(d));
 14.1701 +    ASSERT(d->arch.shadow.hash_walking == 0);
 14.1702 +    d->arch.shadow.hash_walking = 1;
 14.1703 +
 14.1704 +    callback_mask &= ~1; /* Never attempt to call back on empty buckets */
 14.1705 +    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
 14.1706 +    {
 14.1707 +        /* WARNING: This is not safe against changes to the hash table.
 14.1708 +         * The callback *must* return non-zero if it has inserted or
 14.1709 +         * deleted anything from the hash (lookups are OK, though). */
 14.1710 +        for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next )
 14.1711 +        {
 14.1712 +            if ( callback_mask & (1 << x->t) ) 
 14.1713 +            {
 14.1714 +                ASSERT(x->t <= 15);
 14.1715 +                ASSERT(callbacks[x->t] != NULL);
 14.1716 +                if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
 14.1717 +                    break;
 14.1718 +            }
 14.1719 +        }
 14.1720 +        if ( done ) break; 
 14.1721 +    }
 14.1722 +    d->arch.shadow.hash_walking = 0; 
 14.1723 +}
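
hash_foreach() dispatches on each entry's shadow type through a bitmask of interesting types and a 16-slot array of callbacks indexed by type, stopping as soon as a callback returns non-zero. A minimal sketch of that dispatch pattern, with made-up type numbers and callbacks standing in for the PGC_SH_* types and per-level functions:

    #include <stdio.h>

    /* Hypothetical shadow-type numbers; the real ones come from PGC_SH_*. */
    enum { T_NONE = 0, T_L1 = 1, T_L2 = 2, T_MAX = 16 };

    typedef int (*callback_t)(unsigned long smfn, unsigned long arg);

    static int on_l1(unsigned long smfn, unsigned long arg)
    {
        printf("l1 shadow %#lx (arg %#lx)\n", smfn, arg);
        return 0;                           /* 0 == keep scanning */
    }

    struct entry { unsigned char t; unsigned long smfn; };

    /* Walk a flat array of entries, calling back only for types set in the
     * mask; any non-zero return terminates the scan early. */
    static void foreach(struct entry *tbl, int nr, unsigned int mask,
                        callback_t cbs[T_MAX], unsigned long arg)
    {
        int i, done = 0;
        mask &= ~1u;                        /* never call back on empties */
        for ( i = 0; i < nr && !done; i++ )
            if ( mask & (1u << tbl[i].t) )
                done = cbs[tbl[i].t](tbl[i].smfn, arg);
    }

    int main(void)
    {
        callback_t cbs[T_MAX] = { NULL };
        struct entry tbl[] = { { T_L2, 0x10 }, { T_L1, 0x20 }, { T_NONE, 0 } };
        cbs[T_L1] = on_l1;
        foreach(tbl, 3, 1u << T_L1, cbs, 0x99);
        return 0;
    }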
 14.1724 +
 14.1725 +
 14.1726 +/**************************************************************************/
 14.1727 +/* Destroy a shadow page: simple dispatcher to call the per-type destructor
 14.1728 + * which will decrement refcounts appropriately and return memory to the 
 14.1729 + * free pool. */
 14.1730 +
 14.1731 +void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
 14.1732 +{
 14.1733 +    struct page_info *pg = mfn_to_page(smfn);
 14.1734 +    u32 t = pg->count_info & PGC_SH_type_mask;
 14.1735 +
 14.1736 +
 14.1737 +    SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
 14.1738 +
 14.1739 +    /* Double-check, if we can, that the shadowed page belongs to this
 14.1740 +     * domain (by following the back-pointer). */
 14.1741 +    ASSERT(t == PGC_SH_fl1_32_shadow  ||  
 14.1742 +           t == PGC_SH_fl1_pae_shadow ||  
 14.1743 +           t == PGC_SH_fl1_64_shadow  || 
 14.1744 +           t == PGC_SH_monitor_table  || 
 14.1745 +           (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) 
 14.1746 +            == v->domain)); 
 14.1747 +
 14.1748 +    /* The down-shifts here are so that the switch statement is on nice
 14.1749 +     * small numbers that the compiler will enjoy */
 14.1750 +    switch ( t >> PGC_SH_type_shift )
 14.1751 +    {
 14.1752 +#if CONFIG_PAGING_LEVELS == 2
 14.1753 +    case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
 14.1754 +    case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
 14.1755 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn); 
 14.1756 +        break;
 14.1757 +    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
 14.1758 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
 14.1759 +        break;
 14.1760 +#else /* PAE or 64bit */
 14.1761 +    case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
 14.1762 +    case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
 14.1763 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
 14.1764 +        break;
 14.1765 +    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
 14.1766 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
 14.1767 +        break;
 14.1768 +#endif
 14.1769 +
 14.1770 +#if CONFIG_PAGING_LEVELS >= 3
 14.1771 +    case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift:
 14.1772 +    case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift:
 14.1773 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
 14.1774 +        break;
 14.1775 +    case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift:
 14.1776 +    case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift:
 14.1777 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
 14.1778 +        break;
 14.1779 +    case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
 14.1780 +        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn);
 14.1781 +        break;
 14.1782 +#endif
 14.1783 +
 14.1784 +#if CONFIG_PAGING_LEVELS >= 4
 14.1785 +    case PGC_SH_l1_64_shadow >> PGC_SH_type_shift:
 14.1786 +    case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift:
 14.1787 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
 14.1788 +        break;
 14.1789 +    case PGC_SH_l2_64_shadow >> PGC_SH_type_shift:
 14.1790 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
 14.1791 +        break;
 14.1792 +    case PGC_SH_l3_64_shadow >> PGC_SH_type_shift:
 14.1793 +        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
 14.1794 +        break;
 14.1795 +    case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
 14.1796 +        SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
 14.1797 +        break;
 14.1798 +#endif
 14.1799 +    default:
 14.1800 +        SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n", 
 14.1801 +                       (unsigned long)t);
 14.1802 +        BUG();
 14.1803 +    }    
 14.1804 +}
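
The destructor is chosen via SHADOW_INTERNAL_NAME, which decorates a base function name with the paging-level pair so that the same source can be compiled once per (guest levels, shadow levels) combination and still yield distinct symbols. The exact Xen macro is not shown in this hunk; the following is only an illustrative analogue of the token-pasting pattern, with a hypothetical naming scheme:

    #include <stdio.h>

    /* Hypothetical analogue of SHADOW_INTERNAL_NAME: paste the level numbers
     * onto a base name to get a distinct symbol per compilation flavour of
     * the same source file. */
    #define INTERNAL_NAME(name, g, s) name ## _g ## g ## _s ## s

    static void INTERNAL_NAME(destroy_l1, 2, 2)(unsigned long smfn)
    {
        printf("2-level-on-2-level destructor for %#lx\n", smfn);
    }

    static void INTERNAL_NAME(destroy_l1, 3, 3)(unsigned long smfn)
    {
        printf("PAE-on-PAE destructor for %#lx\n", smfn);
    }

    int main(void)
    {
        INTERNAL_NAME(destroy_l1, 2, 2)(0x1000);  /* expands to destroy_l1_g2_s2 */
        INTERNAL_NAME(destroy_l1, 3, 3)(0x2000);  /* expands to destroy_l1_g3_s3 */
        return 0;
    }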
 14.1805 +
 14.1806 +/**************************************************************************/
 14.1807 +/* Remove all writeable mappings of a guest frame from the shadow tables 
 14.1808 + * Returns non-zero if we need to flush TLBs. 
 14.1809 + * level and fault_addr describe how we found this to be a pagetable;
 14.1810 + * level==0 means we have some other reason for revoking write access.*/
 14.1811 +
 14.1812 +int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn, 
 14.1813 +                                unsigned int level,
 14.1814 +                                unsigned long fault_addr)
 14.1815 +{
 14.1816 +    /* Dispatch table for getting per-type functions */
 14.1817 +    static hash_callback_t callbacks[16] = {
 14.1818 +        NULL, /* none    */
 14.1819 +#if CONFIG_PAGING_LEVELS == 2
 14.1820 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32   */
 14.1821 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32  */
 14.1822 +#else 
 14.1823 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32   */
 14.1824 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32  */
 14.1825 +#endif
 14.1826 +        NULL, /* l2_32   */
 14.1827 +#if CONFIG_PAGING_LEVELS >= 3
 14.1828 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae  */
 14.1829 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
 14.1830 +#else 
 14.1831 +        NULL, /* l1_pae  */
 14.1832 +        NULL, /* fl1_pae */
 14.1833 +#endif
 14.1834 +        NULL, /* l2_pae  */
 14.1835 +        NULL, /* l2h_pae */
 14.1836 +        NULL, /* l3_pae  */
 14.1837 +#if CONFIG_PAGING_LEVELS >= 4
 14.1838 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64   */
 14.1839 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64  */
 14.1840 +#else
 14.1841 +        NULL, /* l1_64   */
 14.1842 +        NULL, /* fl1_64  */
 14.1843 +#endif
 14.1844 +        NULL, /* l2_64   */
 14.1845 +        NULL, /* l3_64   */
 14.1846 +        NULL, /* l4_64   */
 14.1847 +        NULL, /* p2m     */
 14.1848 +        NULL  /* unused  */
 14.1849 +    };
 14.1850 +
 14.1851 +    static unsigned int callback_mask = 
 14.1852 +          1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
 14.1853 +        | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
 14.1854 +        | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
 14.1855 +        | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
 14.1856 +        | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
 14.1857 +        | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
 14.1858 +        ;
 14.1859 +    struct page_info *pg = mfn_to_page(gmfn);
 14.1860 +
 14.1861 +    ASSERT(shadow_lock_is_acquired(v->domain));
 14.1862 +
 14.1863 +    /* Only remove writable mappings if we are doing shadow refcounts.
 14.1864 +     * In guest refcounting, we trust Xen to already be restricting
 14.1865 +     * all the writes to the guest page tables, so we do not need to
 14.1866 +     * do more. */
 14.1867 +    if ( !shadow_mode_refcounts(v->domain) )
 14.1868 +        return 0;
 14.1869 +
 14.1870 +    /* Early exit if it's already a pagetable, or otherwise not writeable */
 14.1871 +    if ( sh_mfn_is_a_page_table(gmfn) 
 14.1872 +         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
 14.1873 +        return 0;
 14.1874 +
 14.1875 +    perfc_incrc(shadow_writeable);
 14.1876 +
 14.1877 +    /* If this isn't a "normal" writeable page, the domain is trying to 
 14.1878 +     * put pagetables in special memory of some kind.  We can't allow that. */
 14.1879 +    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
 14.1880 +    {
 14.1881 +        SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" 
 14.1882 +                      PRtype_info "\n",
 14.1883 +                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
 14.1884 +        domain_crash(v->domain);
 14.1885 +    }
 14.1886 +
 14.1887 +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 14.1888 +    if ( v == current && level != 0 )
 14.1889 +    {
 14.1890 +        unsigned long gfn;
 14.1891 +        /* Heuristic: there is likely to be only one writeable mapping,
 14.1892 +         * and that mapping is likely to be in the current pagetable,
 14.1893 +         * either in the guest's linear map (linux, windows) or in a
 14.1894 +         * magic slot used to map high memory regions (linux HIGHPTE) */
 14.1895 +
 14.1896 +#define GUESS(_a, _h) do {                                              \
 14.1897 +            if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) )          \
 14.1898 +                perfc_incrc(shadow_writeable_h_ ## _h);                \
 14.1899 +            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
 14.1900 +                return 1;                                               \
 14.1901 +        } while (0)
 14.1902 +
 14.1903 +        
 14.1904 +        /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
 14.1905 +        if ( v == current 
 14.1906 +             && (gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
 14.1907 +            GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
 14.1908 +
 14.1909 +        if ( v->arch.shadow.mode->guest_levels == 2 )
 14.1910 +        {
 14.1911 +            if ( level == 1 )
 14.1912 +                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
 14.1913 +                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
 14.1914 +        }
 14.1915 +#if CONFIG_PAGING_LEVELS >= 3
 14.1916 +        else if ( v->arch.shadow.mode->guest_levels == 3 )
 14.1917 +        {
 14.1918 +            /* 32bit PAE w2k3: linear map at 0xC0000000 */
 14.1919 +            switch ( level ) 
 14.1920 +            {
 14.1921 +            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
 14.1922 +            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
 14.1923 +            }
 14.1924 +        }
 14.1925 +#if CONFIG_PAGING_LEVELS >= 4
 14.1926 +        else if ( v->arch.shadow.mode->guest_levels == 4 )
 14.1927 +        {
 14.1928 +            /* 64bit w2k3: linear map at 0x0000070000000000 */
 14.1929 +            switch ( level ) 
 14.1930 +            {
 14.1931 +            case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
 14.1932 +            case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
 14.1933 +            case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
 14.1934 +            }
 14.1935 +        }
 14.1936 +#endif /* CONFIG_PAGING_LEVELS >= 4 */
 14.1937 +#endif /* CONFIG_PAGING_LEVELS >= 3 */
 14.1938 +
 14.1939 +#undef GUESS
 14.1940 +
 14.1941 +    }
 14.1942 +#endif
 14.1943 +    
 14.1944 +    /* Brute-force search of all the shadows, by walking the hash */
 14.1945 +    perfc_incrc(shadow_writeable_bf);
 14.1946 +    hash_foreach(v, callback_mask, callbacks, gmfn);
 14.1947 +
 14.1948 +    /* If that didn't catch the mapping, something is very wrong */
 14.1949 +    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
 14.1950 +    {
 14.1951 +        SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
 14.1952 +                      "%lu left\n", mfn_x(gmfn),
 14.1953 +                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
 14.1954 +        domain_crash(v->domain);
 14.1955 +    }
 14.1956 +    
 14.1957 +    /* We killed at least one writeable mapping, so must flush TLBs. */
 14.1958 +    return 1;
 14.1959 +}
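
The shift amounts in the GUESS calls fall out of the entry size and the span of address space each entry covers: an 8-byte PAE/64-bit L1 entry maps 4kB, so the entry for fault_addr lives roughly fault_addr >> 9 bytes into the linear map; L2 and L3 entries cover 2MB and 1GB, giving >> 18 and >> 27, and 4-byte non-PAE entries give >> 10. A small stand-alone sketch of that arithmetic, with an illustrative address only:

    #include <stdio.h>

    /* Where, inside a linear (self-referential) mapping of the guest page
     * tables, does the entry that maps 'va' live?
     *   index  = va >> bits_covered_by_one_entry
     *   offset = index * entry_size
     * With 8-byte entries covering 4kB this is (va >> 12) * 8, which the
     * guesses above abbreviate to va >> 9 -- the same entry, just without
     * masking off the low bits (good enough for probing a mapping). */
    static unsigned long pte_offset(unsigned long va,
                                    unsigned int covered_shift,
                                    unsigned int entry_size)
    {
        return (va >> covered_shift) * entry_size;
    }

    int main(void)
    {
        unsigned long va = 0x00345678UL;    /* example faulting address */

        printf("PAE l1: exact %#lx, shortcut va>>9  = %#lx\n",
               pte_offset(va, 12, 8), va >> 9);
        printf("PAE l2: exact %#lx, shortcut va>>18 = %#lx\n",
               pte_offset(va, 21, 8), va >> 18);
        printf("2-level l1 (4-byte entries): exact %#lx, shortcut va>>10 = %#lx\n",
               pte_offset(va, 12, 4), va >> 10);
        return 0;
    }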
 14.1960 +
 14.1961 +
 14.1962 +
 14.1963 +/**************************************************************************/
 14.1964 +/* Remove all mappings of a guest frame from the shadow tables.
 14.1965 + * Returns non-zero if we need to flush TLBs. */
 14.1966 +
 14.1967 +int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
 14.1968 +{
 14.1969 +    struct page_info *page = mfn_to_page(gmfn);
 14.1970 +    int expected_count;
 14.1971 +
 14.1972 +    /* Dispatch table for getting per-type functions */
 14.1973 +    static hash_callback_t callbacks[16] = {
 14.1974 +        NULL, /* none    */
 14.1975 +#if CONFIG_PAGING_LEVELS == 2
 14.1976 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32   */
 14.1977 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32  */
 14.1978 +#else 
 14.1979 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32   */
 14.1980 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32  */
 14.1981 +#endif
 14.1982 +        NULL, /* l2_32   */
 14.1983 +#if CONFIG_PAGING_LEVELS >= 3
 14.1984 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae  */
 14.1985 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
 14.1986 +#else 
 14.1987 +        NULL, /* l1_pae  */
 14.1988 +        NULL, /* fl1_pae */
 14.1989 +#endif
 14.1990 +        NULL, /* l2_pae  */
 14.1991 +        NULL, /* l2h_pae */
 14.1992 +        NULL, /* l3_pae  */
 14.1993 +#if CONFIG_PAGING_LEVELS >= 4
 14.1994 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64   */
 14.1995 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64  */
 14.1996 +#else
 14.1997 +        NULL, /* l1_64   */
 14.1998 +        NULL, /* fl1_64  */
 14.1999 +#endif
 14.2000 +        NULL, /* l2_64   */
 14.2001 +        NULL, /* l3_64   */
 14.2002 +        NULL, /* l4_64   */
 14.2003 +        NULL, /* p2m     */
 14.2004 +        NULL  /* unused  */
 14.2005 +    };
 14.2006 +
 14.2007 +    static unsigned int callback_mask = 
 14.2008 +          1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
 14.2009 +        | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
 14.2010 +        | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
 14.2011 +        | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
 14.2012 +        | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
 14.2013 +        | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
 14.2014 +        ;
 14.2015 +
 14.2016 +    perfc_incrc(shadow_mappings);
 14.2017 +    if ( (page->count_info & PGC_count_mask) == 0 )
 14.2018 +        return 0;
 14.2019 +
 14.2020 +    ASSERT(shadow_lock_is_acquired(v->domain));
 14.2021 +
 14.2022 +    /* XXX TODO: 
 14.2023 +     * Heuristics for finding the (probably) single mapping of this gmfn */
 14.2024 +    
 14.2025 +    /* Brute-force search of all the shadows, by walking the hash */
 14.2026 +    perfc_incrc(shadow_mappings_bf);
 14.2027 +    hash_foreach(v, callback_mask, callbacks, gmfn);
 14.2028 +
 14.2029 +    /* If that didn't catch the mapping, something is very wrong */
 14.2030 +    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
 14.2031 +    if ( (page->count_info & PGC_count_mask) != expected_count )
 14.2032 +    {
 14.2033 +        /* Don't complain if we're in HVM and there's one extra mapping: 
 14.2034 +         * The qemu helper process has an untyped mapping of this dom's RAM */
 14.2035 +        if ( !(shadow_mode_external(v->domain)
 14.2036 +               && (page->count_info & PGC_count_mask) <= 2
 14.2037 +               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
 14.2038 +        {
 14.2039 +            SHADOW_ERROR("can't find all mappings of mfn %lx: "
 14.2040 +                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
 14.2041 +                          page->count_info, page->u.inuse.type_info);
 14.2042 +        }
 14.2043 +    }
 14.2044 +
 14.2045 +    /* We killed at least one mapping, so must flush TLBs. */
 14.2046 +    return 1;
 14.2047 +}
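
The post-condition here tolerates one remaining reference for PGC_allocated and, on external (HVM) domains, one further untyped reference from the device model's mapping of guest RAM. A minimal sketch of that expected-count test, with hypothetical names for the inputs:

    #include <stdio.h>

    /* Hypothetical, simplified view of the check above: after the brute-force
     * sweep, how many general references may legitimately remain? */
    static int mappings_ok(unsigned int count, int page_is_allocated,
                           int domain_is_external, unsigned int type_count)
    {
        unsigned int expected = page_is_allocated ? 1 : 0;
        if ( count == expected )
            return 1;
        /* External (HVM) domains: allow one extra untyped ref for the device
         * model's mapping of the guest's RAM. */
        return domain_is_external && count <= 2 && type_count == 0;
    }

    int main(void)
    {
        printf("%d\n", mappings_ok(1, 1, 0, 0));  /* PV, just the allocation ref: OK */
        printf("%d\n", mappings_ok(2, 1, 1, 0));  /* HVM, qemu still has a map:   OK */
        printf("%d\n", mappings_ok(3, 1, 1, 0));  /* something else still maps it: 0 */
        return 0;
    }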
 14.2048 +
 14.2049 +
 14.2050 +/**************************************************************************/
 14.2051 +/* Remove all shadows of a guest frame from the shadow tables */
 14.2052 +
 14.2053 +static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
 14.2054 +/* Follow this shadow's up-pointer, if it has one, and remove the reference
 14.2055 + * found there.  Returns 1 if that was the only reference to this shadow */
 14.2056 +{
 14.2057 +    struct page_info *pg = mfn_to_page(smfn);
 14.2058 +    mfn_t pmfn;
 14.2059 +    void *vaddr;
 14.2060 +    int rc;
 14.2061 +
 14.2062 +    ASSERT((pg->count_info & PGC_SH_type_mask) > 0);
 14.2063 +    ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow);
 14.2064 +    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow);
 14.2065 +    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow);
 14.2066 +    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow);
 14.2067 +    
 14.2068 +    if (pg->up == 0) return 0;
 14.2069 +    pmfn = _mfn(pg->up >> PAGE_SHIFT);
 14.2070 +    ASSERT(valid_mfn(pmfn));
 14.2071 +    vaddr = sh_map_domain_page(pmfn);
 14.2072 +    ASSERT(vaddr);
 14.2073 +    vaddr += pg->up & (PAGE_SIZE-1);
 14.2074 +    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
 14.2075 +    
 14.2076 +    /* Is this the only reference to this shadow? */
 14.2077 +    rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0;
 14.2078 +
 14.2079 +    /* Blank the offending entry */
 14.2080 +    switch ((pg->count_info & PGC_SH_type_mask)) 
 14.2081 +    {
 14.2082 +    case PGC_SH_l1_32_shadow:
 14.2083 +    case PGC_SH_l2_32_shadow:
 14.2084 +#if CONFIG_PAGING_LEVELS == 2
 14.2085 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
 14.2086 +#else
 14.2087 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
 14.2088 +#endif
 14.2089 +        break;
 14.2090 +#if CONFIG_PAGING_LEVELS >=3
 14.2091 +    case PGC_SH_l1_pae_shadow:
 14.2092 +    case PGC_SH_l2_pae_shadow:
 14.2093 +    case PGC_SH_l2h_pae_shadow:
 14.2094 +    case PGC_SH_l3_pae_shadow:
 14.2095 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
 14.2096 +        break;
 14.2097 +#if CONFIG_PAGING_LEVELS >= 4
 14.2098 +    case PGC_SH_l1_64_shadow:
 14.2099 +    case PGC_SH_l2_64_shadow:
 14.2100 +    case PGC_SH_l3_64_shadow:
 14.2101 +    case PGC_SH_l4_64_shadow:
 14.2102 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
 14.2103 +        break;
 14.2104 +#endif
 14.2105 +#endif
 14.2106 +    default: BUG(); /* Some weird unknown shadow type */
 14.2107 +    }
 14.2108 +    
 14.2109 +    sh_unmap_domain_page(vaddr);
 14.2110 +    if ( rc )
 14.2111 +        perfc_incrc(shadow_up_pointer);
 14.2112 +    else
 14.2113 +        perfc_incrc(shadow_unshadow_bf);
 14.2114 +
 14.2115 +    return rc;
 14.2116 +}
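
The up-pointer consulted above packs the parent shadow's mfn and the byte offset of the referencing entry into a single word (mfn << PAGE_SHIFT | offset), which is then unpacked with a shift and a mask. A tiny sketch of that packing scheme, with hypothetical helper names:

    #include <stdio.h>
    #include <assert.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Hypothetical helpers: pack a (parent mfn, byte offset) pair into one
     * word the way pg->up is used above, and unpack it again. */
    static unsigned long up_pack(unsigned long parent_mfn, unsigned long offset)
    {
        assert(offset < PAGE_SIZE);
        return (parent_mfn << PAGE_SHIFT) | offset;
    }

    static void up_unpack(unsigned long up,
                          unsigned long *parent_mfn, unsigned long *offset)
    {
        *parent_mfn = up >> PAGE_SHIFT;      /* pmfn = _mfn(pg->up >> PAGE_SHIFT) */
        *offset     = up & (PAGE_SIZE - 1);  /* vaddr += pg->up & (PAGE_SIZE-1)   */
    }

    int main(void)
    {
        unsigned long up = up_pack(0x1a2b3, 0x7f8), mfn, off;
        up_unpack(up, &mfn, &off);
        printf("up=%#lx -> parent mfn %#lx, entry offset %#lx\n", up, mfn, off);
        return 0;
    }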
 14.2117 +
 14.2118 +void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
 14.2119 +/* Remove the shadows of this guest page.  
 14.2120 + * If all != 0, find all shadows, if necessary by walking the tables.
 14.2121 + * Otherwise, just try the (much faster) heuristics, which will remove 
 14.2122 + * at most one reference to each shadow of the page. */
 14.2123 +{
 14.2124 +    struct page_info *pg;
 14.2125 +    mfn_t smfn;
 14.2126 +    u32 sh_flags;
 14.2127 +    unsigned char t;
 14.2128 +
 14.2129 +    /* Dispatch table for getting per-type functions: each level must
 14.2130 +     * be called with the function to remove a lower-level shadow. */
 14.2131 +    static hash_callback_t callbacks[16] = {
 14.2132 +        NULL, /* none    */
 14.2133 +        NULL, /* l1_32   */
 14.2134 +        NULL, /* fl1_32  */
 14.2135 +#if CONFIG_PAGING_LEVELS == 2
 14.2136 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32   */
 14.2137 +#else 
 14.2138 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32   */
 14.2139 +#endif
 14.2140 +        NULL, /* l1_pae  */
 14.2141 +        NULL, /* fl1_pae */
 14.2142 +#if CONFIG_PAGING_LEVELS >= 3
 14.2143 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae  */
 14.2144 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
 14.2145 +        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae  */
 14.2146 +#else 
 14.2147 +        NULL, /* l2_pae  */
 14.2148 +        NULL, /* l2h_pae */
 14.2149 +        NULL, /* l3_pae  */
 14.2150 +#endif
 14.2151 +        NULL, /* l1_64   */
 14.2152 +        NULL, /* fl1_64  */
 14.2153 +#if CONFIG_PAGING_LEVELS >= 4
 14.2154 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64   */
 14.2155 +        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64   */
 14.2156 +        SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64   */
 14.2157 +#else
 14.2158 +        NULL, /* l2_64   */
 14.2159 +        NULL, /* l3_64   */
 14.2160 +        NULL, /* l4_64   */
 14.2161 +#endif
 14.2162 +        NULL, /* p2m     */
 14.2163 +        NULL  /* unused  */
 14.2164 +    };
 14.2165 +
 14.2166 +    /* Another lookup table, for choosing which mask to use */
 14.2167 +    static unsigned int masks[16] = {
 14.2168 +        0, /* none    */
 14.2169 +        1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32   */
 14.2170 +        0, /* fl1_32  */
 14.2171 +        0, /* l2_32   */
 14.2172 +        ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift))
 14.2173 +         | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae  */
 14.2174 +        0, /* fl1_pae */
 14.2175 +        1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae  */
 14.2176 +        1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae  */
 14.2177 +        0, /* l3_pae  */
 14.2178 +        1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64   */
 14.2179 +        0, /* fl1_64  */
 14.2180 +        1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64   */
 14.2181 +        1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64   */
 14.2182 +        0, /* l4_64   */
 14.2183 +        0, /* p2m     */
 14.2184 +        0  /* unused  */
 14.2185 +    };
 14.2186 +
 14.2187 +    ASSERT(shadow_lock_is_acquired(v->domain));
 14.2188 +
 14.2189 +    pg = mfn_to_page(gmfn);
 14.2190 +
 14.2191 +    /* Bale out now if the page is not shadowed */
 14.2192 +    if ( (pg->count_info & PGC_page_table) == 0 )
 14.2193 +        return;
 14.2194 +
 14.2195 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
 14.2196 +                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
 14.2197 +
 14.2198 +    /* Search for this shadow in all appropriate shadows */
 14.2199 +    perfc_incrc(shadow_unshadow);
 14.2200 +    sh_flags = pg->shadow_flags;
 14.2201 +
 14.2202 +    /* Lower-level shadows need to be excised from upper-level shadows.
 14.2203 +     * This call to hash_foreach() looks dangerous but is in fact OK: each
 14.2204 +     * call will remove at most one shadow, and terminate immediately when
 14.2205 +     * it does remove it, so we never walk the hash after doing a deletion.  */
 14.2206 +#define DO_UNSHADOW(_type) do {                                 \
 14.2207 +    t = (_type) >> PGC_SH_type_shift;                          \
 14.2208 +    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);              \
 14.2209 +    if ( !sh_remove_shadow_via_pointer(v, smfn) && all )       \
 14.2210 +        hash_foreach(v, masks[t], callbacks, smfn);             \
 14.2211 +} while (0)
 14.2212 +
 14.2213 +    /* Top-level shadows need to be unpinned */
 14.2214 +#define DO_UNPIN(_type) do {                                             \
 14.2215 +    t = (_type) >> PGC_SH_type_shift;                                   \
 14.2216 +    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);                       \
 14.2217 +    if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned )                \
 14.2218 +        sh_unpin(v, smfn);                                              \
 14.2219 +    if ( (_type) == PGC_SH_l3_pae_shadow )                              \
 14.2220 +        SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \
 14.2221 +} while (0)
 14.2222 +
 14.2223 +    if ( sh_flags & SHF_L1_32 )   DO_UNSHADOW(PGC_SH_l1_32_shadow);
 14.2224 +    if ( sh_flags & SHF_L2_32 )   DO_UNPIN(PGC_SH_l2_32_shadow);
 14.2225 +#if CONFIG_PAGING_LEVELS >= 3
 14.2226 +    if ( sh_flags & SHF_L1_PAE )  DO_UNSHADOW(PGC_SH_l1_pae_shadow);
 14.2227 +    if ( sh_flags & SHF_L2_PAE )  DO_UNSHADOW(PGC_SH_l2_pae_shadow);
 14.2228 +    if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow);
 14.2229 +    if ( sh_flags & SHF_L3_PAE )  DO_UNPIN(PGC_SH_l3_pae_shadow);
 14.2230 +#if CONFIG_PAGING_LEVELS >= 4
 14.2231 +    if ( sh_flags & SHF_L1_64 )   DO_UNSHADOW(PGC_SH_l1_64_shadow);
 14.2232 +    if ( sh_flags & SHF_L2_64 )   DO_UNSHADOW(PGC_SH_l2_64_shadow);
 14.2233 +    if ( sh_flags & SHF_L3_64 )   DO_UNSHADOW(PGC_SH_l3_64_shadow);
 14.2234 +    if ( sh_flags & SHF_L4_64 )   DO_UNPIN(PGC_SH_l4_64_shadow);
 14.2235 +#endif
 14.2236 +#endif
 14.2237 +
 14.2238 +#undef DO_UNSHADOW
 14.2239 +#undef DO_UNPIN
 14.2240 +
 14.2241 +
 14.2242 +#if CONFIG_PAGING_LEVELS > 2
 14.2243 +    /* We may have caused some PAE l3 entries to change: need to 
 14.2244 +     * fix up the copies of them in various places */
 14.2245 +    if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) )
 14.2246 +        sh_pae_recopy(v->domain);
 14.2247 +#endif
 14.2248 +
 14.2249 +    /* If that didn't catch the shadows, something is wrong */
 14.2250 +    if ( all && (pg->count_info & PGC_page_table) )
 14.2251 +    {
 14.2252 +        SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n",
 14.2253 +                      mfn_x(gmfn), pg->shadow_flags);
 14.2254 +        domain_crash(v->domain);
 14.2255 +    }
 14.2256 +}
 14.2257 +
 14.2258 +void
 14.2259 +shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
 14.2260 +/* Even harsher: this is an HVM page that we think is no longer a pagetable.
 14.2261 + * Unshadow it, and recursively unshadow pages that reference it. */
 14.2262 +{
 14.2263 +    shadow_remove_all_shadows(v, gmfn);
 14.2264 +    /* XXX TODO:
 14.2265 +     * Rework this hashtable walker to return a linked-list of all 
 14.2266 +     * the shadows it modified, then do breadth-first recursion 
 14.2267 +     * to find the way up to higher-level tables and unshadow them too. 
 14.2268 +     *
 14.2269 +     * The current code (just tearing down each page's shadows as we
 14.2270 +     * detect that it is not a pagetable) is correct, but very slow. 
 14.2271 +     * It means extra emulated writes and slows down removal of mappings. */
 14.2272 +}
 14.2273 +
 14.2274 +/**************************************************************************/
 14.2275 +
 14.2276 +void sh_update_paging_modes(struct vcpu *v)
 14.2277 +{
 14.2278 +    struct domain *d = v->domain;
 14.2279 +    struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
 14.2280 +    mfn_t old_guest_table;
 14.2281 +
 14.2282 +    ASSERT(shadow_lock_is_acquired(d));
 14.2283 +
 14.2284 +    // Valid transitions handled by this function:
 14.2285 +    // - For PV guests:
 14.2286 +    //     - after a shadow mode has been changed
 14.2287 +    // - For HVM guests:
 14.2288 +    //     - after a shadow mode has been changed
 14.2289 +    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
 14.2290 +    //
 14.2291 +
 14.2292 +    // Avoid determining the current shadow mode for uninitialized CPUs, as
 14.2293 +// we cannot yet determine whether it is an HVM or PV domain.
 14.2294 +    //
 14.2295 +    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
 14.2296 +    {
 14.2297 +        printk("%s: postponing determination of shadow mode\n", __func__);
 14.2298 +        return;
 14.2299 +    }
 14.2300 +
 14.2301 +    // First, tear down any old shadow tables held by this vcpu.
 14.2302 +    //
 14.2303 +    shadow_detach_old_tables(v);
 14.2304 +
 14.2305 +    if ( !hvm_guest(v) )
 14.2306 +    {
 14.2307 +        ///
 14.2308 +        /// PV guest
 14.2309 +        ///
 14.2310 +#if CONFIG_PAGING_LEVELS == 4
 14.2311 +        if ( pv_32bit_guest(v) )
 14.2312 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
 14.2313 +        else
 14.2314 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
 14.2315 +#elif CONFIG_PAGING_LEVELS == 3
 14.2316 +        v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
 14.2317 +#elif CONFIG_PAGING_LEVELS == 2
 14.2318 +        v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
 14.2319 +#else
 14.2320 +#error unexpected paging mode
 14.2321 +#endif
 14.2322 +    }
 14.2323 +    else
 14.2324 +    {
 14.2325 +        ///
 14.2326 +        /// HVM guest
 14.2327 +        ///
 14.2328 +        ASSERT(shadow_mode_translate(d));
 14.2329 +        ASSERT(shadow_mode_external(d));
 14.2330 +
 14.2331 +        v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v);
 14.2332 +        if ( !v->arch.shadow.hvm_paging_enabled )
 14.2333 +        {
 14.2334 +            
 14.2335 +            /* Set v->arch.guest_table to use the p2m map, and choose
 14.2336 +             * the appropriate shadow mode */
 14.2337 +            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
 14.2338 +#if CONFIG_PAGING_LEVELS == 2
 14.2339 +            v->arch.guest_table =
 14.2340 +                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
 14.2341 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
 14.2342 +#elif CONFIG_PAGING_LEVELS == 3 
 14.2343 +            v->arch.guest_table =
 14.2344 +                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
 14.2345 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
 14.2346 +#else /* CONFIG_PAGING_LEVELS == 4 */
 14.2347 +            { 
 14.2348 +                l4_pgentry_t *l4e; 
 14.2349 +                /* Use the start of the first l3 table as a PAE l3 */
 14.2350 +                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
 14.2351 +                l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 14.2352 +                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
 14.2353 +                v->arch.guest_table =
 14.2354 +                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
 14.2355 +                sh_unmap_domain_page(l4e);
 14.2356 +            }
 14.2357 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
 14.2358 +#endif
 14.2359 +            /* Fix up refcounts on guest_table */
 14.2360 +            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
 14.2361 +            if ( mfn_x(old_guest_table) != 0 )
 14.2362 +                put_page(mfn_to_page(old_guest_table));
 14.2363 +        }
 14.2364 +        else
 14.2365 +        {
 14.2366 +#ifdef __x86_64__
 14.2367 +            if ( hvm_long_mode_enabled(v) )
 14.2368 +            {
 14.2369 +                // long mode guest...
 14.2370 +                v->arch.shadow.mode =
 14.2371 +                    &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
 14.2372 +            }
 14.2373 +            else
 14.2374 +#endif
 14.2375 +                if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
 14.2376 +                {
 14.2377 +#if CONFIG_PAGING_LEVELS >= 3
 14.2378 +                    // 32-bit PAE mode guest...
 14.2379 +                    v->arch.shadow.mode =
 14.2380 +                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
 14.2381 +#else
 14.2382 +                    SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
 14.2383 +                    domain_crash(d);
 14.2384 +                    return;
 14.2385 +#endif
 14.2386 +                }
 14.2387 +                else
 14.2388 +                {
 14.2389 +                    // 32-bit 2 level guest...
 14.2390 +#if CONFIG_PAGING_LEVELS >= 3
 14.2391 +                    v->arch.shadow.mode =
 14.2392 +                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
 14.2393 +#else
 14.2394 +                    v->arch.shadow.mode =
 14.2395 +                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
 14.2396 +#endif
 14.2397 +                }
 14.2398 +        }
 14.2399 +
 14.2400 +        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
 14.2401 +        {
 14.2402 +            mfn_t mmfn = shadow_make_monitor_table(v);
 14.2403 +            v->arch.monitor_table = pagetable_from_mfn(mmfn);
 14.2404 +            v->arch.monitor_vtable = sh_map_domain_page(mmfn);
 14.2405 +        } 
 14.2406 +
 14.2407 +        if ( v->arch.shadow.mode != old_mode )
 14.2408 +        {
 14.2409 +            SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
 14.2410 +                           "(was g=%u s=%u)\n",
 14.2411 +                           d->domain_id, v->vcpu_id, 
 14.2412 +                           v->arch.shadow.mode->guest_levels,
 14.2413 +                           v->arch.shadow.mode->shadow_levels,
 14.2414 +                           old_mode ? old_mode->guest_levels : 0,
 14.2415 +                           old_mode ? old_mode->shadow_levels : 0);
 14.2416 +            if ( old_mode &&
 14.2417 +                 (v->arch.shadow.mode->shadow_levels !=
 14.2418 +                  old_mode->shadow_levels) )
 14.2419 +            {
 14.2420 +                /* Need to make a new monitor table for the new mode */
 14.2421 +                mfn_t new_mfn, old_mfn;
 14.2422 +
 14.2423 +                if ( v != current ) 
 14.2424 +                {
 14.2425 +                    SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
 14.2426 +                                  "this HVM vcpu's (d=%u v=%u) paging mode!\n",
 14.2427 +                                  current->domain->domain_id, current->vcpu_id,
 14.2428 +                                  v->domain->domain_id, v->vcpu_id);
 14.2429 +                    domain_crash(v->domain);
 14.2430 +                    return;
 14.2431 +                }
 14.2432 +
 14.2433 +                sh_unmap_domain_page(v->arch.monitor_vtable);
 14.2434 +                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
 14.2435 +                v->arch.monitor_table = pagetable_null();
 14.2436 +                new_mfn = v->arch.shadow.mode->make_monitor_table(v);            
 14.2437 +                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
 14.2438 +                v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
 14.2439 +                SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
 14.2440 +                               mfn_x(new_mfn));
 14.2441 +
 14.2442 +                /* Don't be running on the old monitor table when we 
 14.2443 +                 * pull it down!  Switch CR3, and warn the HVM code that
 14.2444 +                 * its host cr3 has changed. */
 14.2445 +                make_cr3(v, mfn_x(new_mfn));
 14.2446 +                write_ptbase(v);
 14.2447 +                hvm_update_host_cr3(v);
 14.2448 +                old_mode->destroy_monitor_table(v, old_mfn);
 14.2449 +            }
 14.2450 +        }
 14.2451 +
 14.2452 +        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
 14.2453 +        //        These are HARD: think about the case where two CPU's have
 14.2454 +        //        different values for CR4.PSE and CR4.PGE at the same time.
 14.2455 +        //        This *does* happen, at least for CR4.PGE...
 14.2456 +    }
 14.2457 +
 14.2458 +    v->arch.shadow.mode->update_cr3(v);
 14.2459 +}
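
For HVM guests the mode choice follows the guest's own paging state: paging disabled runs on the p2m-backed table, long mode gets 4-level-on-4-level shadows, CR4.PAE gets 3-on-3, and a plain 2-level guest is shadowed with PAE shadows. A condensed sketch of that decision for a 64-bit hypervisor, with hypothetical predicates standing in for the hvm_* accessors used above:

    #include <stdio.h>

    /* Hypothetical guest paging state; the real code reads CR0/CR4/EFER via
     * hvm_* accessors. */
    struct guest_state { int paging_enabled, long_mode, cr4_pae; };

    /* Mirror the HVM branch of sh_update_paging_modes() above. */
    static const char *choose_mode(const struct guest_state *g)
    {
        if ( !g->paging_enabled )
            return "p2m-backed guest table, PAE shadows";
        if ( g->long_mode )
            return "4-level guest, 4-level shadows";
        if ( g->cr4_pae )
            return "3-level (PAE) guest, 3-level shadows";
        return "2-level guest, 3-level (PAE) shadows";
    }

    int main(void)
    {
        struct guest_state real_mode = { 0, 0, 0 };
        struct guest_state pae32     = { 1, 0, 1 };
        struct guest_state longmode  = { 1, 1, 1 };
        printf("%s\n%s\n%s\n", choose_mode(&real_mode),
               choose_mode(&pae32), choose_mode(&longmode));
        return 0;
    }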
 14.2460 +
 14.2461 +/**************************************************************************/
 14.2462 +/* Turning on and off shadow features */
 14.2463 +
 14.2464 +static void sh_new_mode(struct domain *d, u32 new_mode)
 14.2465 +/* Inform all the vcpus that the shadow mode has been changed */
 14.2466 +{
 14.2467 +    struct vcpu *v;
 14.2468 +
 14.2469 +    ASSERT(shadow_lock_is_acquired(d));
 14.2470 +    ASSERT(d != current->domain);
 14.2471 +    d->arch.shadow.mode = new_mode;
 14.2472 +    if ( new_mode & SHM2_translate ) 
 14.2473 +        shadow_audit_p2m(d);
 14.2474 +    for_each_vcpu(d, v)
 14.2475 +        sh_update_paging_modes(v);
 14.2476 +}
 14.2477 +
 14.2478 +static int shadow_enable(struct domain *d, u32 mode)
 14.2479 +/* Turn on "permanent" shadow features: external, translate, refcount.
 14.2480 + * Can only be called once on a domain, and these features cannot be
 14.2481 + * disabled. 
 14.2482 + * Returns 0 for success, -errno for failure. */
 14.2483 +{    
 14.2484 +    unsigned int old_pages;
 14.2485 +    int rv = 0;
 14.2486 +
 14.2487 +    mode |= SHM2_enable;
 14.2488 +
 14.2489 +    domain_pause(d);
 14.2490 +    shadow_lock(d);
 14.2491 +
 14.2492 +    /* Sanity check the arguments */
 14.2493 +    if ( (d == current->domain) ||
 14.2494 +         shadow_mode_enabled(d) ||
 14.2495 +         ((mode & SHM2_external) && !(mode & SHM2_translate)) )
 14.2496 +    {
 14.2497 +        rv = -EINVAL;
 14.2498 +        goto out;
 14.2499 +    }
 14.2500 +
 14.2501 +    // XXX -- eventually would like to require that all memory be allocated
 14.2502 +    // *after* shadow_enabled() is called...  So here, we would test to make
 14.2503 +    // sure that d->page_list is empty.
 14.2504 +#if 0
 14.2505 +    spin_lock(&d->page_alloc_lock);
 14.2506 +    if ( !list_empty(&d->page_list) )
 14.2507 +    {
 14.2508 +        spin_unlock(&d->page_alloc_lock);
 14.2509 +        rv = -EINVAL;
 14.2510 +        goto out;
 14.2511 +    }
 14.2512 +    spin_unlock(&d->page_alloc_lock);
 14.2513 +#endif
 14.2514 +
 14.2515 +    /* Init the shadow memory allocation if the user hasn't done so */
 14.2516 +    old_pages = d->arch.shadow.total_pages;
 14.2517 +    if ( old_pages == 0 )
 14.2518 +        if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
 14.2519 +        {
 14.2520 +            set_sh_allocation(d, 0, NULL);
 14.2521 +            rv = -ENOMEM;
 14.2522 +            goto out;
 14.2523 +        }
 14.2524 +
 14.2525 +    /* Init the hash table */
 14.2526 +    if ( shadow_hash_alloc(d) != 0 )
 14.2527 +    {
 14.2528 +        set_sh_allocation(d, old_pages, NULL);            
 14.2529 +        rv = -ENOMEM;
 14.2530 +        goto out;
 14.2531 +    }
 14.2532 +
 14.2533 +    /* Init the P2M table */
 14.2534 +    if ( mode & SHM2_translate )
 14.2535 +        if ( !shadow_alloc_p2m_table(d) )
 14.2536 +        {
 14.2537 +            shadow_hash_teardown(d);
 14.2538 +            set_sh_allocation(d, old_pages, NULL);
 14.2539 +            shadow_p2m_teardown(d);
 14.2540 +            rv = -ENOMEM;
 14.2541 +            goto out;
 14.2542 +        }
 14.2543 +
 14.2544 +    /* Update the bits */
 14.2545 +    sh_new_mode(d, mode);
 14.2546 +    shadow_audit_p2m(d);
 14.2547 + out:
 14.2548 +    shadow_unlock(d);
 14.2549 +    domain_unpause(d);
 14.2550 +    return rv;
 14.2551 +}
 14.2552 +
 14.2553 +void shadow_teardown(struct domain *d)
 14.2554 +/* Destroy the shadow pagetables of this domain and free its shadow memory.
 14.2555 + * Should only be called for dying domains. */
 14.2556 +{
 14.2557 +    struct vcpu *v;
 14.2558 +    mfn_t mfn;
 14.2559 +
 14.2560 +    ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
 14.2561 +    ASSERT(d != current->domain);
 14.2562 +
 14.2563 +    if ( !shadow_lock_is_acquired(d) )
 14.2564 +        shadow_lock(d); /* Keep various asserts happy */
 14.2565 +
 14.2566 +    if ( shadow_mode_enabled(d) )
 14.2567 +    {
 14.2568 +        /* Release the shadow and monitor tables held by each vcpu */
 14.2569 +        for_each_vcpu(d, v)
 14.2570 +        {
 14.2571 +            shadow_detach_old_tables(v);
 14.2572 +            if ( shadow_mode_external(d) )
 14.2573 +            {
 14.2574 +                mfn = pagetable_get_mfn(v->arch.monitor_table);
 14.2575 +                if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
 14.2576 +                    shadow_destroy_monitor_table(v, mfn);
 14.2577 +                v->arch.monitor_table = pagetable_null();
 14.2578 +            }
 14.2579 +        }
 14.2580 +    }
 14.2581 +
 14.2582 +    if ( d->arch.shadow.total_pages != 0 )
 14.2583 +    {
 14.2584 +        SHADOW_PRINTK("teardown of domain %u starts."
 14.2585 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 14.2586 +                       d->domain_id,
 14.2587 +                       d->arch.shadow.total_pages, 
 14.2588 +                       d->arch.shadow.free_pages, 
 14.2589 +                       d->arch.shadow.p2m_pages);
 14.2590 +        /* Destroy all the shadows and release memory to domheap */
 14.2591 +        set_sh_allocation(d, 0, NULL);
 14.2592 +        /* Release the hash table back to xenheap */
 14.2593 +        if (d->arch.shadow.hash_table) 
 14.2594 +            shadow_hash_teardown(d);
 14.2595 +        /* Release the log-dirty bitmap of dirtied pages */
 14.2596 +        sh_free_log_dirty_bitmap(d);
 14.2597 +        /* Should not have any more memory held */
 14.2598 +        SHADOW_PRINTK("teardown done."
 14.2599 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 14.2600 +                       d->arch.shadow.total_pages, 
 14.2601 +                       d->arch.shadow.free_pages, 
 14.2602 +                       d->arch.shadow.p2m_pages);
 14.2603 +        ASSERT(d->arch.shadow.total_pages == 0);
 14.2604 +    }
 14.2605 +
 14.2606 +    /* We leave the "permanent" shadow modes enabled, but clear the
 14.2607 +     * log-dirty mode bit.  We don't want any more mark_dirty()
 14.2608 +     * calls now that we've torn down the bitmap */
 14.2609 +    d->arch.shadow.mode &= ~SHM2_log_dirty;
 14.2610 +
 14.2611 +    shadow_unlock(d);
 14.2612 +}
 14.2613 +
 14.2614 +void shadow_final_teardown(struct domain *d)
 14.2615 +/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
 14.2616 +{
 14.2617 +
 14.2618 +    SHADOW_PRINTK("dom %u final teardown starts."
 14.2619 +                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
 14.2620 +                   d->domain_id,
 14.2621 +                   d->arch.shadow.total_pages, 
 14.2622 +                   d->arch.shadow.free_pages, 
 14.2623 +                   d->arch.shadow.p2m_pages);
 14.2624 +
 14.2625 +    /* Double-check that the domain didn't have any shadow memory.  
 14.2626 +     * It is possible for a domain that never got domain_kill()ed
 14.2627 +     * to get here with its shadow allocation intact. */
 14.2628 +    if ( d->arch.shadow.total_pages != 0 )
 14.2629 +        shadow_teardown(d);
 14.2630 +
 14.2631 +    /* It is now safe to pull down the p2m map. */
 14.2632 +    if ( d->arch.shadow.p2m_pages != 0 )
 14.2633 +        shadow_p2m_teardown(d);
 14.2634 +
 14.2635 +    SHADOW_PRINTK("dom %u final teardown done."
 14.2636 +                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
 14.2637 +                   d->domain_id,
 14.2638 +                   d->arch.shadow.total_pages, 
 14.2639 +                   d->arch.shadow.free_pages, 
 14.2640 +                   d->arch.shadow.p2m_pages);
 14.2641 +}
 14.2642 +
 14.2643 +static int shadow_one_bit_enable(struct domain *d, u32 mode)
 14.2644 +/* Turn on a single shadow mode feature */
 14.2645 +{
 14.2646 +    ASSERT(shadow_lock_is_acquired(d));
 14.2647 +
 14.2648 +    /* Sanity check the call */
 14.2649 +    if ( d == current->domain || (d->arch.shadow.mode & mode) )
 14.2650 +    {
 14.2651 +        return -EINVAL;
 14.2652 +    }
 14.2653 +
 14.2654 +    if ( d->arch.shadow.mode == 0 )
 14.2655 +    {
 14.2656 +        /* Init the shadow memory allocation and the hash table */
 14.2657 +        if ( set_sh_allocation(d, 1, NULL) != 0 
 14.2658 +             || shadow_hash_alloc(d) != 0 )
 14.2659 +        {
 14.2660 +            set_sh_allocation(d, 0, NULL);
 14.2661 +            return -ENOMEM;
 14.2662 +        }
 14.2663 +    }
 14.2664 +
 14.2665 +    /* Update the bits */
 14.2666 +    sh_new_mode(d, d->arch.shadow.mode | mode);
 14.2667 +
 14.2668 +    return 0;
 14.2669 +}
 14.2670 +
 14.2671 +static int shadow_one_bit_disable(struct domain *d, u32 mode) 
 14.2672 +/* Turn off a single shadow mode feature */
 14.2673 +{
 14.2674 +    struct vcpu *v;
 14.2675 +    ASSERT(shadow_lock_is_acquired(d));
 14.2676 +
 14.2677 +    /* Sanity check the call */
 14.2678 +    if ( d == current->domain || !(d->arch.shadow.mode & mode) )
 14.2679 +    {
 14.2680 +        return -EINVAL;
 14.2681 +    }
 14.2682 +
 14.2683 +    /* Update the bits */
 14.2684 +    sh_new_mode(d, d->arch.shadow.mode & ~mode);
 14.2685 +    if ( d->arch.shadow.mode == 0 )
 14.2686 +    {
 14.2687 +        /* Get this domain off shadows */
 14.2688 +        SHADOW_PRINTK("un-shadowing of domain %u starts."
 14.2689 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 14.2690 +                       d->domain_id,
 14.2691 +                       d->arch.shadow.total_pages, 
 14.2692 +                       d->arch.shadow.free_pages, 
 14.2693 +                       d->arch.shadow.p2m_pages);
 14.2694 +        for_each_vcpu(d, v)
 14.2695 +        {
 14.2696 +            shadow_detach_old_tables(v);
 14.2697 +#if CONFIG_PAGING_LEVELS == 4
 14.2698 +            if ( !(v->arch.flags & TF_kernel_mode) )
 14.2699 +                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
 14.2700 +            else
 14.2701 +#endif
 14.2702 +                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
 14.2703 +
 14.2704 +        }
 14.2705 +
 14.2706 +        /* Pull down the memory allocation */
 14.2707 +        if ( set_sh_allocation(d, 0, NULL) != 0 )
 14.2708 +        {
 14.2709 +            // XXX - How can this occur?
 14.2710 +            //       Seems like a bug to return an error now that we've
 14.2711 +            //       disabled the relevant shadow mode.
 14.2712 +            //
 14.2713 +            return -ENOMEM;
 14.2714 +        }
 14.2715 +        shadow_hash_teardown(d);
 14.2716 +        SHADOW_PRINTK("un-shadowing of domain %u done."
 14.2717 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 14.2718 +                       d->domain_id,
 14.2719 +                       d->arch.shadow.total_pages, 
 14.2720 +                       d->arch.shadow.free_pages, 
 14.2721 +                       d->arch.shadow.p2m_pages);
 14.2722 +    }
 14.2723 +
 14.2724 +    return 0;
 14.2725 +}
 14.2726 +
 14.2727 +/* Enable/disable ops for the "test" and "log-dirty" modes */
 14.2728 +int shadow_test_enable(struct domain *d)
 14.2729 +{
 14.2730 +    int ret;
 14.2731 +
 14.2732 +    domain_pause(d);
 14.2733 +    shadow_lock(d);
 14.2734 +
 14.2735 +    if ( shadow_mode_enabled(d) )
 14.2736 +    {
 14.2737 +        SHADOW_ERROR("Don't support enabling test mode "
 14.2738 +                      "on already shadowed doms\n");
 14.2739 +        ret = -EINVAL;
 14.2740 +        goto out;
 14.2741 +    }
 14.2742 +
 14.2743 +    ret = shadow_one_bit_enable(d, SHM2_enable);
 14.2744 + out:
 14.2745 +    shadow_unlock(d);
 14.2746 +    domain_unpause(d);
 14.2747 +
 14.2748 +    return ret;
 14.2749 +}
 14.2750 +
 14.2751 +int shadow_test_disable(struct domain *d)
 14.2752 +{
 14.2753 +    int ret;
 14.2754 +
 14.2755 +    domain_pause(d);
 14.2756 +    shadow_lock(d);
 14.2757 +    ret = shadow_one_bit_disable(d, SHM2_enable);
 14.2758 +    shadow_unlock(d);
 14.2759 +    domain_unpause(d);
 14.2760 +
 14.2761 +    return ret;
 14.2762 +}
 14.2763 +
 14.2764 +static int
 14.2765 +sh_alloc_log_dirty_bitmap(struct domain *d)
 14.2766 +{
 14.2767 +    ASSERT(d->arch.shadow.dirty_bitmap == NULL);
 14.2768 +    d->arch.shadow.dirty_bitmap_size =
 14.2769 +        (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
 14.2770 +        ~(BITS_PER_LONG - 1);
 14.2771 +    d->arch.shadow.dirty_bitmap =
 14.2772 +        xmalloc_array(unsigned long,
 14.2773 +                      d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
 14.2774 +    if ( d->arch.shadow.dirty_bitmap == NULL )
 14.2775 +    {
 14.2776 +        d->arch.shadow.dirty_bitmap_size = 0;
 14.2777 +        return -ENOMEM;
 14.2778 +    }
 14.2779 +    memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
 14.2780 +
 14.2781 +    return 0;
 14.2782 +}
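
The bitmap is sized by rounding max_pfn up to a whole number of unsigned longs' worth of bits, so the later memset of dirty_bitmap_size/8 bytes covers exactly the allocated array. A quick stand-alone check of that arithmetic, with an example max_pfn that is not taken from the source:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <limits.h>

    #define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

    int main(void)
    {
        unsigned long max_pfn = 100000;     /* example: a guest of roughly 390MB */

        /* Round the bit count up to a whole number of unsigned longs. */
        unsigned long bits  = (max_pfn + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
        unsigned long words = bits / BITS_PER_LONG;
        unsigned long *bitmap = calloc(words, sizeof(unsigned long));

        if ( bitmap == NULL )
            return 1;
        /* memset(bitmap, 0, bits/8) in the original covers exactly 'words'
         * longs, since bits is a multiple of BITS_PER_LONG. */
        memset(bitmap, 0, bits / 8);

        printf("max_pfn=%lu -> %lu bits, %lu longs, %lu bytes\n",
               max_pfn, bits, words, bits / 8);
        free(bitmap);
        return 0;
    }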
 14.2783 +
 14.2784 +static void
 14.2785 +sh_free_log_dirty_bitmap(struct domain *d)
 14.2786 +{
 14.2787 +    d->arch.shadow.dirty_bitmap_size = 0;
 14.2788 +    if ( d->arch.shadow.dirty_bitmap )
 14.2789 +    {
 14.2790 +        xfree(d->arch.shadow.dirty_bitmap);
 14.2791 +        d->arch.shadow.dirty_bitmap = NULL;
 14.2792 +    }
 14.2793 +}
 14.2794 +
 14.2795 +static int shadow_log_dirty_enable(struct domain *d)
 14.2796 +{
 14.2797 +    int ret;
 14.2798 +
 14.2799 +    domain_pause(d);
 14.2800 +    shadow_lock(d);
 14.2801 +
 14.2802 +    if ( shadow_mode_log_dirty(d) )
 14.2803 +    {
 14.2804 +        ret = -EINVAL;
 14.2805 +        goto out;
 14.2806 +    }
 14.2807 +
 14.2808 +    if ( shadow_mode_enabled(d) )
 14.2809 +    {
 14.2810 +        SHADOW_ERROR("Don't (yet) support enabling log-dirty "
 14.2811 +                      "on already shadowed doms\n");
 14.2812 +        ret = -EINVAL;
 14.2813 +        goto out;
 14.2814 +    }
 14.2815 +
 14.2816 +    ret = sh_alloc_log_dirty_bitmap(d);
 14.2817 +    if ( ret != 0 )
 14.2818 +    {
 14.2819 +        sh_free_log_dirty_bitmap(d);
 14.2820 +        goto out;
 14.2821 +    }
 14.2822 +
 14.2823 +    ret = shadow_one_bit_enable(d, SHM2_log_dirty);
 14.2824 +    if ( ret != 0 )
 14.2825 +        sh_free_log_dirty_bitmap(d);
 14.2826 +
 14.2827 + out:
 14.2828 +    shadow_unlock(d);
 14.2829 +    domain_unpause(d);
 14.2830 +    return ret;
 14.2831 +}
 14.2832 +
 14.2833 +static int shadow_log_dirty_disable(struct domain *d)
 14.2834 +{
 14.2835 +    int ret;
 14.2836 +
 14.2837 +    domain_pause(d);
 14.2838 +    shadow_lock(d);
 14.2839 +    ret = shadow_one_bit_disable(d, SHM2_log_dirty);
 14.2840 +    if ( !shadow_mode_log_dirty(d) )
 14.2841 +        sh_free_log_dirty_bitmap(d);
 14.2842 +    shadow_unlock(d);
 14.2843 +    domain_unpause(d);
 14.2844 +
 14.2845 +    return ret;
 14.2846 +}
 14.2847 +
 14.2848 +/**************************************************************************/
 14.2849 +/* P2M map manipulations */
 14.2850 +
 14.2851 +static void
 14.2852 +sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
 14.2853 +{
 14.2854 +    struct vcpu *v;
 14.2855 +
 14.2856 +    if ( !shadow_mode_translate(d) )
 14.2857 +        return;
 14.2858 +
 14.2859 +    v = current;
 14.2860 +    if ( v->domain != d )
 14.2861 +        v = d->vcpu[0];
 14.2862 +
 14.2863 +
 14.2864 +    SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 14.2865 +
 14.2866 +    ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
 14.2867 +    //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
 14.2868 +
 14.2869 +    shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
 14.2870 +    if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
 14.2871 +        flush_tlb_mask(d->domain_dirty_cpumask);
 14.2872 +    shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
 14.2873 +    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
 14.2874 +}
 14.2875 +
 14.2876 +void
 14.2877 +shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
 14.2878 +                                  unsigned long mfn)
 14.2879 +{
 14.2880 +    shadow_lock(d);
 14.2881 +    shadow_audit_p2m(d);
 14.2882 +    sh_p2m_remove_page(d, gfn, mfn);
 14.2883 +    shadow_audit_p2m(d);
 14.2884 +    shadow_unlock(d);    
 14.2885 +}
 14.2886 +
 14.2887 +void
 14.2888 +shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
 14.2889 +                               unsigned long mfn)
 14.2890 +{
 14.2891 +    struct vcpu *v;
 14.2892 +    unsigned long ogfn;
 14.2893 +    mfn_t omfn;
 14.2894 +
 14.2895 +    if ( !shadow_mode_translate(d) )
 14.2896 +        return;
 14.2897 +
 14.2898 +    v = current;
 14.2899 +    if ( v->domain != d )
 14.2900 +        v = d->vcpu[0];
 14.2901 +
 14.2902 +    shadow_lock(d);
 14.2903 +    shadow_audit_p2m(d);
 14.2904 +
 14.2905 +    SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
 14.2906 +
 14.2907 +    omfn = sh_gfn_to_mfn(d, gfn);
 14.2908 +    if ( valid_mfn(omfn) )
 14.2909 +    {
 14.2910 +        /* Get rid of the old mapping, especially any shadows */
 14.2911 +        shadow_remove_all_shadows_and_parents(v, omfn);
 14.2912 +        if ( shadow_remove_all_mappings(v, omfn) )
 14.2913 +            flush_tlb_mask(d->domain_dirty_cpumask);
 14.2914 +        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
 14.2915 +    }        
 14.2916 +
 14.2917 +    ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
 14.2918 +    if (
 14.2919 +#ifdef __x86_64__
 14.2920 +        (ogfn != 0x5555555555555555L)
 14.2921 +#else
 14.2922 +        (ogfn != 0x55555555L)
 14.2923 +#endif
 14.2924 +        && (ogfn != INVALID_M2P_ENTRY)
 14.2925 +        && (ogfn != gfn) )
 14.2926 +    {
 14.2927 +        /* This machine frame is already mapped at another physical address */
 14.2928 +        SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
 14.2929 +                       mfn, ogfn, gfn);
 14.2930 +        if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) ) 
 14.2931 +        {
 14.2932 +            SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", 
 14.2933 +                           ogfn , mfn_x(omfn));
 14.2934 +            if ( mfn_x(omfn) == mfn ) 
 14.2935 +                sh_p2m_remove_page(d, ogfn, mfn);
 14.2936 +        }
 14.2937 +    }
 14.2938 +
 14.2939 +    shadow_set_p2m_entry(d, gfn, _mfn(mfn));
 14.2940 +    set_gpfn_from_mfn(mfn, gfn);
 14.2941 +    shadow_audit_p2m(d);
 14.2942 +    shadow_unlock(d);
 14.2943 +}
 14.2944 +
 14.2945 +/**************************************************************************/
 14.2946 +/* Log-dirty mode support */
 14.2947 +
 14.2948 +/* Convert a shadow to log-dirty mode. */
 14.2949 +void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
 14.2950 +{
 14.2951 +    BUG();
 14.2952 +}
 14.2953 +
 14.2954 +
 14.2955 +/* Read a domain's log-dirty bitmap and stats.  
 14.2956 + * If the operation is a CLEAN, clear the bitmap and stats as well. */
 14.2957 +static int shadow_log_dirty_op(
 14.2958 +    struct domain *d, struct xen_domctl_shadow_op *sc)
 14.2959 +{
 14.2960 +    int i, rv = 0, clean = 0;
 14.2961 +
 14.2962 +    domain_pause(d);
 14.2963 +    shadow_lock(d);
 14.2964 +
 14.2965 +    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
 14.2966 +
 14.2967 +    SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", 
 14.2968 +                  (clean) ? "clean" : "peek",
 14.2969 +                  d->domain_id,
 14.2970 +                  d->arch.shadow.fault_count, 
 14.2971 +                  d->arch.shadow.dirty_count);
 14.2972 +
 14.2973 +    sc->stats.fault_count = d->arch.shadow.fault_count;
 14.2974 +    sc->stats.dirty_count = d->arch.shadow.dirty_count;    
 14.2975 +        
 14.2976 +    if ( clean ) 
 14.2977 +    {
 14.2978 +        struct list_head *l, *t;
 14.2979 +        struct page_info *pg;
 14.2980 +
 14.2981 +        /* Need to revoke write access to the domain's pages again. 
 14.2982 +         * In future, we'll have a less heavy-handed approach to this, 
 14.2983 +         * but for now, we just unshadow everything except Xen. */
 14.2984 +        list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
 14.2985 +        {
 14.2986 +            pg = list_entry(l, struct page_info, list);
 14.2987 +            shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
 14.2988 +        }
 14.2989 +
 14.2990 +        d->arch.shadow.fault_count = 0;
 14.2991 +        d->arch.shadow.dirty_count = 0;
 14.2992 +    }
 14.2993 +
 14.2994 +    if ( guest_handle_is_null(sc->dirty_bitmap) ||
 14.2995 +         (d->arch.shadow.dirty_bitmap == NULL) )
 14.2996 +    {
 14.2997 +        rv = -EINVAL;
 14.2998 +        goto out;
 14.2999 +    }
 14.3000 + 
 14.3001 +    if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
 14.3002 +        sc->pages = d->arch.shadow.dirty_bitmap_size; 
 14.3003 +
 14.3004 +#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
 14.3005 +    for ( i = 0; i < sc->pages; i += CHUNK )
 14.3006 +    {
 14.3007 +        int bytes = ((((sc->pages - i) > CHUNK) 
 14.3008 +                      ? CHUNK 
 14.3009 +                      : (sc->pages - i)) + 7) / 8;
 14.3010 +     
 14.3011 +        if ( copy_to_guest_offset(
 14.3012 +                 sc->dirty_bitmap, 
 14.3013 +                 i/(8*sizeof(unsigned long)),
 14.3014 +                 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
 14.3015 +                 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
 14.3016 +        {
 14.3017 +            rv = -EINVAL;
 14.3018 +            goto out;
 14.3019 +        }
 14.3020 +
 14.3021 +        if ( clean )
 14.3022 +            memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
 14.3023 +                   0, bytes);
 14.3024 +    }
 14.3025 +#undef CHUNK
 14.3026 +
 14.3027 + out:
 14.3028 +    shadow_unlock(d);
 14.3029 +    domain_unpause(d);
 14.3030 +    return rv;
 14.3031 +}
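As a cross-check of the CHUNK arithmetic in the loop above, a minimal sketch; the helper name is invented for illustration and only the arithmetic is taken from the code:

    /* Bytes copied per iteration, mirroring the expression in the loop. */
    static inline int log_dirty_chunk_bytes(unsigned int pages, unsigned int i)
    {
        unsigned int remaining = pages - i;
        return ((remaining > 8192 ? 8192 : remaining) + 7) / 8;
    }
    /* e.g. pages = 20000: iterations at i = 0, 8192 and 16384 copy 1024,
     * 1024 and 452 bytes; the guest-handle offset at i = 8192 is
     * 8192/(8*sizeof(unsigned long)) = 128 longs on x86_64, i.e. exactly
     * the 1024 bytes already transferred. */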
 14.3032 +
 14.3033 +
 14.3034 +/* Mark a page as dirty */
 14.3035 +void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
 14.3036 +{
 14.3037 +    unsigned long pfn;
 14.3038 +
 14.3039 +    ASSERT(shadow_lock_is_acquired(d));
 14.3040 +    ASSERT(shadow_mode_log_dirty(d));
 14.3041 +
 14.3042 +    if ( !valid_mfn(gmfn) )
 14.3043 +        return;
 14.3044 +
 14.3045 +    ASSERT(d->arch.shadow.dirty_bitmap != NULL);
 14.3046 +
 14.3047 +    /* We /really/ mean PFN here, even for non-translated guests. */
 14.3048 +    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
 14.3049 +
 14.3050 +    /*
 14.3051 +     * Values with the MSB set denote MFNs that aren't really part of the 
 14.3052 +     * domain's pseudo-physical memory map (e.g., the shared info frame).
 14.3053 +     * Nothing to do here...
 14.3054 +     */
 14.3055 +    if ( unlikely(!VALID_M2P(pfn)) )
 14.3056 +        return;
 14.3057 +
 14.3058 +    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
 14.3059 +    if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) ) 
 14.3060 +    { 
 14.3061 +        if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
 14.3062 +        {
 14.3063 +            SHADOW_DEBUG(LOGDIRTY, 
 14.3064 +                          "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
 14.3065 +                          mfn_x(gmfn), pfn, d->domain_id);
 14.3066 +            d->arch.shadow.dirty_count++;
 14.3067 +        }
 14.3068 +    }
 14.3069 +    else
 14.3070 +    {
 14.3071 +        SHADOW_PRINTK("mark_dirty OOR! "
 14.3072 +                       "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
 14.3073 +                       "owner=%d c=%08x t=%" PRtype_info "\n",
 14.3074 +                       mfn_x(gmfn), 
 14.3075 +                       pfn, 
 14.3076 +                       d->arch.shadow.dirty_bitmap_size,
 14.3077 +                       d->domain_id,
 14.3078 +                       (page_get_owner(mfn_to_page(gmfn))
 14.3079 +                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
 14.3080 +                        : -1),
 14.3081 +                       mfn_to_page(gmfn)->count_info, 
 14.3082 +                       mfn_to_page(gmfn)->u.inuse.type_info);
 14.3083 +    }
 14.3084 +}
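A quick worked example of the bitmap indexing above, assuming 64-bit unsigned long (numbers are illustrative only):

    /* pfn 4660 (0x1234) sets bit 4660 of dirty_bitmap: __test_and_set_bit
     * touches word 4660/64 = 72, bit 4660%64 = 52.  A pfn at or beyond
     * dirty_bitmap_size falls through to the OOR warning instead. */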
 14.3085 +
 14.3086 +
 14.3087 +/**************************************************************************/
 14.3088 +/* Shadow-control XEN_DOMCTL dispatcher */
 14.3089 +
 14.3090 +int shadow_domctl(struct domain *d, 
 14.3091 +                   xen_domctl_shadow_op_t *sc,
 14.3092 +                   XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
 14.3093 +{
 14.3094 +    int rc, preempted = 0;
 14.3095 +
 14.3096 +    if ( unlikely(d == current->domain) )
 14.3097 +    {
 14.3098 +        DPRINTK("Don't try to do a shadow op on yourself!\n");
 14.3099 +        return -EINVAL;
 14.3100 +    }
 14.3101 +
 14.3102 +    switch ( sc->op )
 14.3103 +    {
 14.3104 +    case XEN_DOMCTL_SHADOW_OP_OFF:
 14.3105 +        if ( shadow_mode_log_dirty(d) )
 14.3106 +            if ( (rc = shadow_log_dirty_disable(d)) != 0 ) 
 14.3107 +                return rc;
 14.3108 +        if ( d->arch.shadow.mode & SHM2_enable )
 14.3109 +            if ( (rc = shadow_test_disable(d)) != 0 ) 
 14.3110 +                return rc;
 14.3111 +        return 0;
 14.3112 +
 14.3113 +    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
 14.3114 +        return shadow_test_enable(d);
 14.3115 +
 14.3116 +    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
 14.3117 +        return shadow_log_dirty_enable(d);
 14.3118 +
 14.3119 +    case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
 14.3120 +        return shadow_enable(d, SHM2_refcounts|SHM2_translate);
 14.3121 +
 14.3122 +    case XEN_DOMCTL_SHADOW_OP_CLEAN:
 14.3123 +    case XEN_DOMCTL_SHADOW_OP_PEEK:
 14.3124 +        return shadow_log_dirty_op(d, sc);
 14.3125 +
 14.3126 +    case XEN_DOMCTL_SHADOW_OP_ENABLE:
 14.3127 +        if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
 14.3128 +            return shadow_log_dirty_enable(d);
 14.3129 +        return shadow_enable(d, sc->mode << SHM2_shift);
 14.3130 +
 14.3131 +    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
 14.3132 +        sc->mb = shadow_get_allocation(d);
 14.3133 +        return 0;
 14.3134 +
 14.3135 +    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
 14.3136 +        rc = shadow_set_allocation(d, sc->mb, &preempted);
 14.3137 +        if ( preempted )
 14.3138 +            /* Not finished.  Set up to re-run the call. */
 14.3139 +            rc = hypercall_create_continuation(
 14.3140 +                __HYPERVISOR_domctl, "h", u_domctl);
 14.3141 +        else 
 14.3142 +            /* Finished.  Return the new allocation */
 14.3143 +            sc->mb = shadow_get_allocation(d);
 14.3144 +        return rc;
 14.3145 +
 14.3146 +    default:
 14.3147 +        SHADOW_ERROR("Bad shadow op %u\n", sc->op);
 14.3148 +        return -EINVAL;
 14.3149 +    }
 14.3150 +}
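For context, the CLEAN/PEEK ops dispatched above are what a log-dirty consumer (such as a live-migration tool) drives in rounds.  A rough caller-side sketch only; do_shadow_domctl() and send_dirty_pages() are hypothetical stand-ins for the caller's domctl plumbing and page transport:

    /* Drive CLEAN rounds until the dirty set is small enough to stop. */
    static void sketch_send_dirty_rounds(int dom, unsigned long *bitmap,
                                         uint64_t max_pfn, uint32_t threshold)
    {
        for ( ; ; )
        {
            xen_domctl_shadow_op_t sc = { .op = XEN_DOMCTL_SHADOW_OP_CLEAN,
                                          .pages = max_pfn };
            /* sc.dirty_bitmap must be set up as a guest handle on 'bitmap' */
            if ( do_shadow_domctl(dom, &sc, bitmap) != 0 )  /* hypothetical */
                break;
            send_dirty_pages(bitmap, sc.pages);             /* hypothetical */
            if ( sc.stats.dirty_count < threshold )
                break;                                      /* converged */
        }
    }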
 14.3151 +
 14.3152 +
 14.3153 +/**************************************************************************/
 14.3154 +/* Auditing shadow tables */
 14.3155 +
 14.3156 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
 14.3157 +
 14.3158 +void shadow_audit_tables(struct vcpu *v) 
 14.3159 +{
 14.3160 +    /* Dispatch table for getting per-type functions */
 14.3161 +    static hash_callback_t callbacks[16] = {
 14.3162 +        NULL, /* none    */
 14.3163 +#if CONFIG_PAGING_LEVELS == 2
 14.3164 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2),  /* l1_32   */
 14.3165 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32  */
 14.3166 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2),  /* l2_32   */
 14.3167 +#else 
 14.3168 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2),  /* l1_32   */
 14.3169 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32  */
 14.3170 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2),  /* l2_32   */
 14.3171 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3),  /* l1_pae  */
 14.3172 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
 14.3173 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2_pae  */
 14.3174 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2h_pae */
 14.3175 +        SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3),  /* l3_pae  */
 14.3176 +#if CONFIG_PAGING_LEVELS >= 4
 14.3177 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4),  /* l1_64   */
 14.3178 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64  */
 14.3179 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4),  /* l2_64   */
 14.3180 +        SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4),  /* l3_64   */
 14.3181 +        SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4),  /* l4_64   */
 14.3182 +#endif /* CONFIG_PAGING_LEVELS >= 4 */
 14.3183 +#endif /* CONFIG_PAGING_LEVELS > 2 */
 14.3184 +        NULL  /* All the rest */
 14.3185 +    };
 14.3186 +    unsigned int mask; 
 14.3187 +
 14.3188 +    if ( !(SHADOW_AUDIT_ENABLE) )
 14.3189 +        return;
 14.3190 +    
 14.3191 +    if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
 14.3192 +        mask = ~1; /* Audit every table in the system */
 14.3193 +    else 
 14.3194 +    {
 14.3195 +        /* Audit only the current mode's tables */
 14.3196 +        switch ( v->arch.shadow.mode->guest_levels )
 14.3197 +        {
 14.3198 +        case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
 14.3199 +        case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
 14.3200 +                        |SHF_L2H_PAE|SHF_L3_PAE); break;
 14.3201 +        case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64  
 14.3202 +                        |SHF_L3_64|SHF_L4_64); break;
 14.3203 +        default: BUG();
 14.3204 +        }
 14.3205 +    }
 14.3206 +
 14.3207 +    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
 14.3208 +}
 14.3209 +
 14.3210 +#endif /* Shadow audit */
 14.3211 +
 14.3212 +
 14.3213 +/**************************************************************************/
 14.3214 +/* Auditing p2m tables */
 14.3215 +
 14.3216 +#if SHADOW_AUDIT & SHADOW_AUDIT_P2M
 14.3217 +
 14.3218 +void shadow_audit_p2m(struct domain *d)
 14.3219 +{
 14.3220 +    struct list_head *entry;
 14.3221 +    struct page_info *page;
 14.3222 +    struct domain *od;
 14.3223 +    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
 14.3224 +    mfn_t p2mfn;
 14.3225 +    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
 14.3226 +    int test_linear;
 14.3227 +    
 14.3228 +    if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
 14.3229 +        return;
 14.3230 +
 14.3231 +    //SHADOW_PRINTK("p2m audit starts\n");
 14.3232 +
 14.3233 +    test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
 14.3234 +    if ( test_linear )
 14.3235 +        local_flush_tlb(); 
 14.3236 +
 14.3237 +    /* Audit part one: walk the domain's page allocation list, checking 
 14.3238 +     * the m2p entries. */
 14.3239 +    for ( entry = d->page_list.next;
 14.3240 +          entry != &d->page_list;
 14.3241 +          entry = entry->next )
 14.3242 +    {
 14.3243 +        page = list_entry(entry, struct page_info, list);
 14.3244 +        mfn = mfn_x(page_to_mfn(page));
 14.3245 +
 14.3246 +        // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn); 
 14.3247 +
 14.3248 +        od = page_get_owner(page);
 14.3249 +
 14.3250 +        if ( od != d ) 
 14.3251 +        {
 14.3252 +            SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
 14.3253 +                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
 14.3254 +            continue;
 14.3255 +        }
 14.3256 +
 14.3257 +        gfn = get_gpfn_from_mfn(mfn);
 14.3258 +        if ( gfn == INVALID_M2P_ENTRY ) 
 14.3259 +        {
 14.3260 +            orphans_i++;
 14.3261 +            //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
 14.3262 +            //               mfn); 
 14.3263 +            continue;
 14.3264 +        }
 14.3265 +
 14.3266 +        if ( gfn == 0x55555555 ) 
 14.3267 +        {
 14.3268 +            orphans_d++;
 14.3269 +            //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", 
 14.3270 +            //               mfn); 
 14.3271 +            continue;
 14.3272 +        }
 14.3273 +
 14.3274 +        p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
 14.3275 +        if ( mfn_x(p2mfn) != mfn )
 14.3276 +        {
 14.3277 +            mpbad++;
 14.3278 +            SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
 14.3279 +                           " (-> gfn %#lx)\n",
 14.3280 +                           mfn, gfn, mfn_x(p2mfn),
 14.3281 +                           (mfn_valid(p2mfn)
 14.3282 +                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
 14.3283 +                            : -1u));
 14.3284 +            /* This m2p entry is stale: the domain has another frame in
 14.3285 +             * this physical slot.  No great disaster, but for neatness,
 14.3286 +             * blow away the m2p entry. */ 
 14.3287 +            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
 14.3288 +        }
 14.3289 +
 14.3290 +        if ( test_linear )
 14.3291 +        {
 14.3292 +            lp2mfn = get_mfn_from_gpfn(gfn);
 14.3293 +            if ( lp2mfn != mfn_x(p2mfn) )
 14.3294 +            {
 14.3295 +                SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
 14.3296 +                               "(!= mfn %#lx)\n", gfn, lp2mfn,
 14.3296 +                               mfn_x(p2mfn));
 14.3297 +            }
 14.3298 +        }
 14.3299 +
 14.3300 +        // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", 
 14.3301 +        //                mfn, gfn, p2mfn, lp2mfn); 
 14.3302 +    }   
 14.3303 +
 14.3304 +    /* Audit part two: walk the domain's p2m table, checking the entries. */
 14.3305 +    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
 14.3306 +    {
 14.3307 +        l2_pgentry_t *l2e;
 14.3308 +        l1_pgentry_t *l1e;
 14.3309 +        int i1, i2;
 14.3310 +        
 14.3311 +#if CONFIG_PAGING_LEVELS == 4
 14.3312 +        l4_pgentry_t *l4e;
 14.3313 +        l3_pgentry_t *l3e;
 14.3314 +        int i3, i4;
 14.3315 +        l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 14.3316 +#elif CONFIG_PAGING_LEVELS == 3
 14.3317 +        l3_pgentry_t *l3e;
 14.3318 +        int i3;
 14.3319 +        l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 14.3320 +#else /* CONFIG_PAGING_LEVELS == 2 */
 14.3321 +        l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 14.3322 +#endif
 14.3323 +
 14.3324 +        gfn = 0;
 14.3325 +#if CONFIG_PAGING_LEVELS >= 3
 14.3326 +#if CONFIG_PAGING_LEVELS >= 4
 14.3327 +        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
 14.3328 +        {
 14.3329 +            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
 14.3330 +            {
 14.3331 +                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
 14.3332 +                continue;
 14.3333 +            }
 14.3334 +            l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
 14.3335 +#endif /* now at levels 3 or 4... */
 14.3336 +            for ( i3 = 0; 
 14.3337 +                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); 
 14.3338 +                  i3++ )
 14.3339 +            {
 14.3340 +                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
 14.3341 +                {
 14.3342 +                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
 14.3343 +                    continue;
 14.3344 +                }
 14.3345 +                l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
 14.3346 +#endif /* all levels... */
 14.3347 +                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
 14.3348 +                {
 14.3349 +                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
 14.3350 +                    {
 14.3351 +                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
 14.3352 +                        continue;
 14.3353 +                    }
 14.3354 +                    l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
 14.3355 +                    
 14.3356 +                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
 14.3357 +                    {
 14.3358 +                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
 14.3359 +                            continue;
 14.3360 +                        mfn = l1e_get_pfn(l1e[i1]);
 14.3361 +                        ASSERT(valid_mfn(_mfn(mfn)));
 14.3362 +                        m2pfn = get_gpfn_from_mfn(mfn);
 14.3363 +                        if ( m2pfn != gfn )
 14.3364 +                        {
 14.3365 +                            pmbad++;
 14.3366 +                            SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
 14.3367 +                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
 14.3368 +                            BUG();
 14.3369 +                        }
 14.3370 +                    }
 14.3371 +                    sh_unmap_domain_page(l1e);
 14.3372 +                }
 14.3373 +#if CONFIG_PAGING_LEVELS >= 3
 14.3374 +                sh_unmap_domain_page(l2e);
 14.3375 +            }
 14.3376 +#if CONFIG_PAGING_LEVELS >= 4
 14.3377 +            sh_unmap_domain_page(l3e);
 14.3378 +        }
 14.3379 +#endif
 14.3380 +#endif
 14.3381 +
 14.3382 +#if CONFIG_PAGING_LEVELS == 4
 14.3383 +        sh_unmap_domain_page(l4e);
 14.3384 +#elif CONFIG_PAGING_LEVELS == 3
 14.3385 +        sh_unmap_domain_page(l3e);
 14.3386 +#else /* CONFIG_PAGING_LEVELS == 2 */
 14.3387 +        sh_unmap_domain_page(l2e);
 14.3388 +#endif
 14.3389 +
 14.3390 +    }
 14.3391 +
 14.3392 +    //SHADOW_PRINTK("p2m audit complete\n");
 14.3393 +    //if ( orphans_i | orphans_d | mpbad | pmbad ) 
 14.3394 +    //    SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
 14.3395 +    //                   orphans_i + orphans_d, orphans_i, orphans_d,
 14.3396 +    if ( mpbad | pmbad ) 
 14.3397 +        SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
 14.3398 +                       pmbad, mpbad);
 14.3399 +}
 14.3400 +
 14.3401 +#endif /* p2m audit */
 14.3402 +
 14.3403 +/*
 14.3404 + * Local variables:
 14.3405 + * mode: C
 14.3406 + * c-set-style: "BSD"
 14.3407 + * c-basic-offset: 4
 14.3408 + * indent-tabs-mode: nil
 14.3409 + * End: 
 14.3410 + */
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Mon Aug 28 12:09:36 2006 +0100
    15.3 @@ -0,0 +1,4492 @@
    15.4 +/******************************************************************************
    15.5 + * arch/x86/mm/shadow/multi.c
    15.6 + *
    15.7 + * Simple, mostly-synchronous shadow page tables. 
    15.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    15.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   15.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   15.11 + *
   15.12 + * This program is free software; you can redistribute it and/or modify
   15.13 + * it under the terms of the GNU General Public License as published by
   15.14 + * the Free Software Foundation; either version 2 of the License, or
   15.15 + * (at your option) any later version.
   15.16 + *
   15.17 + * This program is distributed in the hope that it will be useful,
   15.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   15.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   15.20 + * GNU General Public License for more details.
   15.21 + *
   15.22 + * You should have received a copy of the GNU General Public License
   15.23 + * along with this program; if not, write to the Free Software
   15.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   15.25 + */
   15.26 +
   15.27 +// DESIGN QUESTIONS:
   15.28 +// Why use subshadows for PAE guests?
   15.29 +// - reduces pressure in the hash table
   15.30 +// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
   15.31 +// - would need to find space in the page_info to store 7 more bits of
   15.32 +//   backpointer
   15.33 +// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
   15.34 +//   figure out when to demote the guest page from l3 status
   15.35 +//
   15.36 +// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
   15.37 +// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
   15.38 +//   space for both PV and HVM guests.
   15.39 +//
   15.40 +
   15.41 +#define SHADOW 1
   15.42 +
   15.43 +#include <xen/config.h>
   15.44 +#include <xen/types.h>
   15.45 +#include <xen/mm.h>
   15.46 +#include <xen/trace.h>
   15.47 +#include <xen/sched.h>
   15.48 +#include <xen/perfc.h>
   15.49 +#include <xen/domain_page.h>
   15.50 +#include <asm/page.h>
   15.51 +#include <asm/current.h>
   15.52 +#include <asm/shadow.h>
   15.53 +#include <asm/flushtlb.h>
   15.54 +#include <asm/hvm/hvm.h>
   15.55 +#include "private.h"
   15.56 +#include "types.h"
   15.57 +
   15.58 +/* The first cut: an absolutely synchronous, trap-and-emulate version,
   15.59 + * supporting only HVM guests (and so only "external" shadow mode). 
   15.60 + *
   15.61 + * THINGS TO DO LATER:
   15.62 + * 
   15.63 + * FIX GVA_TO_GPA
   15.64 + * The current interface returns an unsigned long, which is not big enough
   15.65 + * to hold a physical address in PAE.  Should return a gfn instead.
   15.66 + * 
   15.67 + * TEARDOWN HEURISTICS
   15.68 + * Also: have a heuristic for when to destroy a previous paging-mode's 
   15.69 + * shadows.  When a guest is done with its start-of-day 32-bit tables
   15.70 + * and reuses the memory, we want to drop those shadows.  Start with 
   15.71 + * shadows in a page in two modes as a hint, but beware of clever tricks 
   15.72 + * like reusing a pagetable for both PAE and 64-bit during boot...
   15.73 + *
   15.74 + * PAE LINEAR MAPS
   15.75 + * Rework shadow_get_l*e() to have the option of using map_domain_page()
   15.76 + * instead of linear maps.  Add appropriate unmap_l*e calls in the users. 
   15.77 + * Then we can test the speed difference made by linear maps.  If the 
   15.78 + * map_domain_page() version is OK on PAE, we could maybe allow a lightweight 
   15.79 + * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
   15.80 + * to share l2h pages again. 
   15.81 + *
   15.82 + * PAE L3 COPYING
   15.83 + * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
   15.84 + * entry in it, and every time we change CR3.  We copy it for the linear 
   15.85 + * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
   15.86 + * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
   15.87 + * by using the shadow directly in some places. 
   15.88 + * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending.
   15.89 + *
   15.90 + * GUEST_WALK_TABLES TLB FLUSH COALESCE
   15.91 + * guest_walk_tables can do up to three remote TLB flushes as it walks to
   15.92 + * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
   15.93 + * and if we do flush, re-do the walk.  If anything has changed, then 
   15.94 + * pause all the other vcpus and do the walk *again*.
   15.95 + *
   15.96 + * WP DISABLED
   15.97 + * Consider how to implement having the WP bit of CR0 set to 0.  
   15.98 + * Since we need to be able to cause write faults to pagetables, this might
   15.99 + * end up looking like not having the (guest) pagetables present at all in 
  15.100 + * HVM guests...
  15.101 + *
  15.102 + * PSE disabled / PSE36
  15.103 + * We don't support any modes other than PSE enabled, PSE36 disabled.
  15.104 + * Neither of those would be hard to change, but we'd need to be able to 
  15.105 + * deal with shadows made in one mode and used in another.
  15.106 + */
  15.107 +
  15.108 +#define FETCH_TYPE_PREFETCH 1
  15.109 +#define FETCH_TYPE_DEMAND   2
  15.110 +#define FETCH_TYPE_WRITE    4
  15.111 +typedef enum {
  15.112 +    ft_prefetch     = FETCH_TYPE_PREFETCH,
  15.113 +    ft_demand_read  = FETCH_TYPE_DEMAND,
  15.114 +    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
  15.115 +} fetch_type_t;
  15.116 +
  15.117 +#ifdef DEBUG_TRACE_DUMP
  15.118 +static char *fetch_type_names[] = {
  15.119 +    [ft_prefetch]     = "prefetch",
  15.120 +    [ft_demand_read]  = "demand read",
  15.121 +    [ft_demand_write] = "demand write",
  15.122 +};
  15.123 +#endif
  15.124 +
  15.125 +/* XXX forward declarations */
  15.126 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
  15.127 +static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
  15.128 +#endif
  15.129 +static inline void sh_update_linear_entries(struct vcpu *v);
  15.130 +
  15.131 +/**************************************************************************/
  15.132 +/* Hash table mapping from guest pagetables to shadows
  15.133 + *
  15.134 + * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
  15.135 + * FL1's:       maps the *gfn* of the start of a superpage to the mfn of a
  15.136 + *              shadow L1 which maps its "splinters".
  15.137 + * PAE CR3s:    maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
  15.138 + *              PAE L3 info page for that CR3 value.
  15.139 + */
  15.140 +
  15.141 +static inline mfn_t 
  15.142 +get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
  15.143 +/* Look for FL1 shadows in the hash table */
  15.144 +{
  15.145 +    mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn),
  15.146 +                                     PGC_SH_fl1_shadow >> PGC_SH_type_shift);
  15.147 +
  15.148 +    if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
  15.149 +    {
  15.150 +        struct page_info *page = mfn_to_page(smfn);
  15.151 +        if ( !(page->count_info & PGC_SH_log_dirty) )
  15.152 +            shadow_convert_to_log_dirty(v, smfn);
  15.153 +    }
  15.154 +
  15.155 +    return smfn;
  15.156 +}
  15.157 +
  15.158 +static inline mfn_t 
  15.159 +get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
  15.160 +/* Look for shadows in the hash table */
  15.161 +{
  15.162 +    mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn),
  15.163 +                                     shadow_type >> PGC_SH_type_shift);
  15.164 +    perfc_incrc(shadow_get_shadow_status);
  15.165 +
  15.166 +    if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
  15.167 +    {
  15.168 +        struct page_info *page = mfn_to_page(smfn);
  15.169 +        if ( !(page->count_info & PGC_SH_log_dirty) )
  15.170 +            shadow_convert_to_log_dirty(v, smfn);
  15.171 +    }
  15.172 +
  15.173 +    return smfn;
  15.174 +}
  15.175 +
  15.176 +static inline void 
  15.177 +set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
  15.178 +/* Put an FL1 shadow into the hash table */
  15.179 +{
  15.180 +    SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
  15.181 +                   gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
  15.182 +
  15.183 +    if ( unlikely(shadow_mode_log_dirty(v->domain)) )
  15.184 +        // mark this shadow as a log dirty shadow...
  15.185 +        set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  15.186 +    else
  15.187 +        clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  15.188 +
  15.189 +    shadow_hash_insert(v, gfn_x(gfn),
  15.190 +                        PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
  15.191 +}
  15.192 +
  15.193 +static inline void 
  15.194 +set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
  15.195 +/* Put a shadow into the hash table */
  15.196 +{
  15.197 +    struct domain *d = v->domain;
  15.198 +    int res;
  15.199 +
  15.200 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
  15.201 +                   d->domain_id, v->vcpu_id, mfn_x(gmfn),
  15.202 +                   shadow_type, mfn_x(smfn));
  15.203 +
  15.204 +    if ( unlikely(shadow_mode_log_dirty(d)) )
  15.205 +        // mark this shadow as a log dirty shadow...
  15.206 +        set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  15.207 +    else
  15.208 +        clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  15.209 +
  15.210 +    res = get_page(mfn_to_page(gmfn), d);
  15.211 +    ASSERT(res == 1);
  15.212 +
  15.213 +    shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift,
  15.214 +                        smfn);
  15.215 +}
  15.216 +
  15.217 +static inline void 
  15.218 +delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
  15.219 +/* Remove a shadow from the hash table */
  15.220 +{
  15.221 +    SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
  15.222 +                   gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
  15.223 +
  15.224 +    shadow_hash_delete(v, gfn_x(gfn),
  15.225 +                        PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
  15.226 +}
  15.227 +
  15.228 +static inline void 
  15.229 +delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
  15.230 +/* Remove a shadow from the hash table */
  15.231 +{
  15.232 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
  15.233 +                   v->domain->domain_id, v->vcpu_id,
  15.234 +                   mfn_x(gmfn), shadow_type, mfn_x(smfn));
  15.235 +    shadow_hash_delete(v, mfn_x(gmfn),
  15.236 +                        shadow_type >> PGC_SH_type_shift, smfn);
  15.237 +    put_page(mfn_to_page(gmfn));
  15.238 +}
  15.239 +
  15.240 +/**************************************************************************/
  15.241 +/* CPU feature support querying */
  15.242 +
  15.243 +static inline int
  15.244 +guest_supports_superpages(struct vcpu *v)
  15.245 +{
  15.246 +    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
  15.247 +     * CR4.PSE is set or the guest is in PAE or long mode */
  15.248 +    return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2 
  15.249 +                             || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
  15.250 +}
  15.251 +
  15.252 +static inline int
  15.253 +guest_supports_nx(struct vcpu *v)
  15.254 +{
  15.255 +    if ( !hvm_guest(v) )
  15.256 +        return cpu_has_nx;
  15.257 +
  15.258 +    // XXX - fix this!
  15.259 +    return 1;
  15.260 +}
  15.261 +
  15.262 +
  15.263 +/**************************************************************************/
  15.264 +/* Functions for walking the guest page tables */
  15.265 +
  15.266 +
  15.267 +/* Walk the guest pagetables, filling the walk_t with what we see. 
  15.268 + * Takes an uninitialised walk_t.  The caller must call unmap_walk() 
  15.269 + * on the walk_t before discarding it or calling guest_walk_tables again. 
  15.270 + * If "guest_op" is non-zero, we are serving a genuine guest memory access, 
  15.271 + * and must (a) be under the shadow lock, and (b) remove write access
  15.272 + * from any guest PT pages we see, as we will be using their contents to 
  15.273 + * perform shadow updates.
  15.274 + * Returns 0 for success or non-zero if the guest pagetables are malformed.
  15.275 + * N.B. Finding a not-present entry does not cause a non-zero return code. */
  15.276 +static inline int 
  15.277 +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
  15.278 +{
  15.279 +    ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
  15.280 +
  15.281 +    perfc_incrc(shadow_guest_walk);
  15.282 +    memset(gw, 0, sizeof(*gw));
  15.283 +    gw->va = va;
  15.284 +
  15.285 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  15.286 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  15.287 +    /* Get l4e from the top level table */
  15.288 +    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
  15.289 +    gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
  15.290 +    /* Walk down to the l3e */
  15.291 +    if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
  15.292 +    gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
  15.293 +    if ( !valid_mfn(gw->l3mfn) ) return 1;
  15.294 +    /* This mfn is a pagetable: make sure the guest can't write to it. */
  15.295 +    if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
  15.296 +        flush_tlb_mask(v->domain->domain_dirty_cpumask); 
  15.297 +    gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
  15.298 +        + guest_l3_table_offset(va);
  15.299 +#else /* PAE only... */
  15.300 +    /* Get l3e from the top level table */
  15.301 +    gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
  15.302 +    gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
  15.303 +#endif /* PAE or 64... */
  15.304 +    /* Walk down to the l2e */
  15.305 +    if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
  15.306 +    gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
  15.307 +    if ( !valid_mfn(gw->l2mfn) ) return 1;
  15.308 +    /* This mfn is a pagetable: make sure the guest can't write to it. */
  15.309 +    if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
  15.310 +        flush_tlb_mask(v->domain->domain_dirty_cpumask); 
  15.311 +    gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
  15.312 +        + guest_l2_table_offset(va);
  15.313 +#else /* 32-bit only... */
  15.314 +    /* Get l2e from the top level table */
  15.315 +    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
  15.316 +    gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
  15.317 +#endif /* All levels... */
  15.318 +    
  15.319 +    if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
  15.320 +    if ( guest_supports_superpages(v) &&
  15.321 +         (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) 
  15.322 +    {
  15.323 +        /* Special case: this guest VA is in a PSE superpage, so there's
  15.324 +         * no guest l1e.  We make one up so that the propagation code
  15.325 +         * can generate a shadow l1 table.  Start with the gfn of the 
  15.326 +         * first 4k-page of the superpage. */
  15.327 +        gfn_t start = guest_l2e_get_gfn(*gw->l2e);
  15.328 +        /* Grant full access in the l1e, since all the guest entry's 
  15.329 +         * access controls are enforced in the shadow l2e.  This lets 
  15.330 +         * us reflect l2 changes later without touching the l1s. */
  15.331 +        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
  15.332 +                     _PAGE_ACCESSED|_PAGE_DIRTY);
  15.333 +        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
  15.334 +         * of the level 1 */
  15.335 +        if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) 
  15.336 +            flags |= _PAGE_PAT; 
  15.337 +        /* Increment the pfn by the right number of 4k pages.  
  15.338 +         * The ~0x1 is to mask out the PAT bit mentioned above. */
  15.339 +        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
  15.340 +        gw->eff_l1e = guest_l1e_from_gfn(start, flags);
  15.341 +        gw->l1e = NULL;
  15.342 +        gw->l1mfn = _mfn(INVALID_MFN);
  15.343 +    } 
  15.344 +    else 
  15.345 +    {
  15.346 +        /* Not a superpage: carry on and find the l1e. */
  15.347 +        gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
  15.348 +        if ( !valid_mfn(gw->l1mfn) ) return 1;
  15.349 +        /* This mfn is a pagetable: make sure the guest can't write to it. */
  15.350 +        if ( guest_op 
  15.351 +             && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
  15.352 +            flush_tlb_mask(v->domain->domain_dirty_cpumask); 
  15.353 +        gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
  15.354 +            + guest_l1_table_offset(va);
  15.355 +        gw->eff_l1e = *gw->l1e;
  15.356 +    }
  15.357 +
  15.358 +    return 0;
  15.359 +}
  15.360 +
  15.361 +/* Given a walk_t, translate the gw->va into the guest's notion of the
  15.362 + * corresponding frame number. */
  15.363 +static inline gfn_t
  15.364 +guest_walk_to_gfn(walk_t *gw)
  15.365 +{
  15.366 +    if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
  15.367 +        return _gfn(INVALID_GFN);
  15.368 +    return guest_l1e_get_gfn(gw->eff_l1e);
  15.369 +}
  15.370 +
  15.371 +/* Given a walk_t, translate the gw->va into the guest's notion of the
  15.372 + * corresponding physical address. */
  15.373 +static inline paddr_t
  15.374 +guest_walk_to_gpa(walk_t *gw)
  15.375 +{
  15.376 +    if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
  15.377 +        return 0;
  15.378 +    return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
  15.379 +}
  15.380 +
  15.381 +
  15.382 +/* Unmap (and reinitialise) a guest walk.  
  15.383 + * Call this to dispose of any walk filled in by guest_walk_tables() */
  15.384 +static void unmap_walk(struct vcpu *v, walk_t *gw)
  15.385 +{
  15.386 +#if GUEST_PAGING_LEVELS >= 3
  15.387 +#if GUEST_PAGING_LEVELS >= 4
  15.388 +    if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
  15.389 +#endif
  15.390 +    if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
  15.391 +#endif
  15.392 +    if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
  15.393 +#ifdef DEBUG
  15.394 +    memset(gw, 0, sizeof(*gw));
  15.395 +#endif
  15.396 +}
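Taken together, the walker and its helpers above are meant to be used in the pattern below; a minimal sketch of a hypothetical caller (real callers typically do more work, e.g. shadow propagation, between the walk and the unmap):

    static gfn_t sketch_va_to_gfn(struct vcpu *v, unsigned long va)
    {
        walk_t gw;
        gfn_t gfn = _gfn(INVALID_GFN);

        if ( guest_walk_tables(v, va, &gw, 0 /* not a guest op */) == 0 )
            gfn = guest_walk_to_gfn(&gw);  /* INVALID_GFN if not present */
        unmap_walk(v, &gw);                /* always dispose of the walk */
        return gfn;
    }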
  15.397 +
  15.398 +
  15.399 +/* Pretty-print the contents of a guest-walk */
  15.400 +static inline void print_gw(walk_t *gw)
  15.401 +{
  15.402 +    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
  15.403 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  15.404 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  15.405 +    SHADOW_PRINTK("   l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
  15.406 +    SHADOW_PRINTK("   l4e=%p\n", gw->l4e);
  15.407 +    if ( gw->l4e )
  15.408 +        SHADOW_PRINTK("   *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
  15.409 +#endif /* PAE or 64... */
  15.410 +    SHADOW_PRINTK("   l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
  15.411 +    SHADOW_PRINTK("   l3e=%p\n", gw->l3e);
  15.412 +    if ( gw->l3e )
  15.413 +        SHADOW_PRINTK("   *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
  15.414 +#endif /* All levels... */
  15.415 +    SHADOW_PRINTK("   l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
  15.416 +    SHADOW_PRINTK("   l2e=%p\n", gw->l2e);
  15.417 +    if ( gw->l2e )
  15.418 +        SHADOW_PRINTK("   *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
  15.419 +    SHADOW_PRINTK("   l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
  15.420 +    SHADOW_PRINTK("   l1e=%p\n", gw->l1e);
  15.421 +    if ( gw->l1e )
  15.422 +        SHADOW_PRINTK("   *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
  15.423 +    SHADOW_PRINTK("   eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
  15.424 +}
  15.425 +
  15.426 +
  15.427 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
  15.428 +/* Lightweight audit: pass all the shadows associated with this guest walk
  15.429 + * through the audit mechanisms */
  15.430 +static void sh_audit_gw(struct vcpu *v, walk_t *gw) 
  15.431 +{
  15.432 +    mfn_t smfn;
  15.433 +
  15.434 +    if ( !(SHADOW_AUDIT_ENABLE) )
  15.435 +        return;
  15.436 +
  15.437 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  15.438 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  15.439 +    if ( valid_mfn(gw->l4mfn)
  15.440 +         && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, 
  15.441 +                                                PGC_SH_l4_shadow))) )
  15.442 +        (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
  15.443 +#endif /* PAE or 64... */
  15.444 +    if ( valid_mfn(gw->l3mfn)
  15.445 +         && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, 
  15.446 +                                                PGC_SH_l3_shadow))) )
  15.447 +        (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
  15.448 +#endif /* All levels... */
  15.449 +    if ( valid_mfn(gw->l2mfn) )
  15.450 +    {
  15.451 +        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, 
  15.452 +                                                 PGC_SH_l2_shadow))) )
  15.453 +            (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
  15.454 +#if GUEST_PAGING_LEVELS == 3
  15.455 +        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, 
  15.456 +                                                 PGC_SH_l2h_shadow))) )
  15.457 +            (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
  15.458 +#endif
  15.459 +    }
  15.460 +    if ( valid_mfn(gw->l1mfn)
  15.461 +         && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, 
  15.462 +                                                PGC_SH_l1_shadow))) )
  15.463 +        (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
  15.464 +    else if ( gw->l2e
  15.465 +              && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
  15.466 +              && valid_mfn( 
  15.467 +              (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
  15.468 +        (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
  15.469 +}
  15.470 +
  15.471 +#else
  15.472 +#define sh_audit_gw(_v, _gw) do {} while(0)
  15.473 +#endif /* audit code */
  15.474 +
  15.475 +
  15.476 +
  15.477 +/**************************************************************************/
  15.478 +/* Function to write to the guest tables, for propagating accessed and 
  15.479 + * dirty bits from the shadow to the guest.
  15.480 + * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
  15.481 + * and an operation type.  The guest entry is always passed as an l1e: 
  15.482 + * since we only ever write flags, that's OK.
  15.483 + * Returns the new flag bits of the guest entry. */
  15.484 +
  15.485 +static u32 guest_set_ad_bits(struct vcpu *v,
  15.486 +                             mfn_t gmfn, 
  15.487 +                             guest_l1e_t *ep,
  15.488 +                             unsigned int level, 
  15.489 +                             fetch_type_t ft)
  15.490 +{
  15.491 +    u32 flags, shflags, bit;
  15.492 +    struct page_info *pg;
  15.493 +    int res = 0;
  15.494 +
  15.495 +    ASSERT(valid_mfn(gmfn)
  15.496 +           && (sh_mfn_is_a_page_table(gmfn)
  15.497 +               || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) 
  15.498 +                   == 0)));
  15.499 +    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
  15.500 +    ASSERT(level <= GUEST_PAGING_LEVELS);
  15.501 +    ASSERT(ft == ft_demand_read || ft == ft_demand_write);
  15.502 +    ASSERT(shadow_lock_is_acquired(v->domain));
  15.503 +
  15.504 +    flags = guest_l1e_get_flags(*ep);
  15.505 +
  15.506 +    /* PAE l3s do not have A and D bits */
  15.507 +    if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
  15.508 +        return flags;
  15.509 +
  15.510 +    /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */
  15.511 +    if ( ft == ft_demand_write  
  15.512 +         && (level == 1 || 
  15.513 +             (level == 2 && GUEST_PAGING_LEVELS < 4 
  15.514 +              && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
  15.515 +    {
  15.516 +        if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) 
  15.517 +             == (_PAGE_DIRTY | _PAGE_ACCESSED) )
  15.518 +            return flags;  /* Guest already has A and D bits set */
  15.519 +        flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
  15.520 +        perfc_incrc(shadow_ad_update);
  15.521 +    }
  15.522 +    else 
  15.523 +    {
  15.524 +        if ( flags & _PAGE_ACCESSED )
  15.525 +            return flags;  /* Guest already has A bit set */
  15.526 +        flags |= _PAGE_ACCESSED;
  15.527 +        perfc_incrc(shadow_a_update);
  15.528 +    }
  15.529 +
  15.530 +    /* Set the bit(s) */
  15.531 +    sh_mark_dirty(v->domain, gmfn);
  15.532 +    SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", "
  15.533 +                  "old flags = %#x, new flags = %#x\n", 
  15.534 +                  guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
  15.535 +    *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
  15.536 +    
  15.537 +    /* May need to propagate this change forward to other kinds of shadow */
  15.538 +    pg = mfn_to_page(gmfn);
  15.539 +    if ( !sh_mfn_is_a_page_table(gmfn) ) 
  15.540 +    {
  15.541 +        /* This guest pagetable is not yet shadowed at all. */
  15.542 +        // MAF: I think this assert is busted...  If this gmfn has not yet
  15.543 +        // been promoted, then it seems perfectly reasonable for there to be
  15.544 +        // outstanding type refs to it...
  15.545 +        /* TJD: No. If the gmfn has not been promoted, we must at least 
  15.546 +         * have recognised that it is a pagetable, and pulled write access.
  15.547 +         * The type count should only be non-zero if it is actually a page 
  15.548 +         * table.  The test above was incorrect, though, so I've fixed it. */
  15.549 +        ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
  15.550 +        return flags;  
  15.551 +    }
  15.552 +
  15.553 +    shflags = pg->shadow_flags & SHF_page_type_mask;
  15.554 +    while ( shflags )
  15.555 +    {
  15.556 +        bit = find_first_set_bit(shflags);
  15.557 +        ASSERT(shflags & (1u << bit));
  15.558 +        shflags &= ~(1u << bit);
  15.559 +        if ( !(pg->shadow_flags & (1u << bit)) )
  15.560 +            continue;
  15.561 +        switch ( bit )
  15.562 +        {
  15.563 +        case PGC_SH_type_to_index(PGC_SH_l1_shadow):
  15.564 +            if (level != 1) 
  15.565 +                res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
  15.566 +            break;
  15.567 +        case PGC_SH_type_to_index(PGC_SH_l2_shadow):
  15.568 +            if (level != 2) 
  15.569 +                res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
  15.570 +            break;
  15.571 +#if GUEST_PAGING_LEVELS == 3 /* PAE only */
  15.572 +        case PGC_SH_type_to_index(PGC_SH_l2h_shadow):
  15.573 +            if (level != 2) 
  15.574 +                res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
  15.575 +            break;
  15.576 +#endif
  15.577 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  15.578 +        case PGC_SH_type_to_index(PGC_SH_l3_shadow):
  15.579 +            if (level != 3) 
  15.580 +                res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
  15.581 +            break;
  15.582 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  15.583 +        case PGC_SH_type_to_index(PGC_SH_l4_shadow):
  15.584 +            if (level != 4) 
  15.585 +                res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
  15.586 +            break;
  15.587 +#endif 
  15.588 +#endif
  15.589 +        default:
  15.590 +            SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple "
  15.591 +                          "modes: A&D bits may be out of sync (flags=%#x).\n", 
  15.592 +                          mfn_x(gmfn), pg->shadow_flags); 
  15.593 +            /* XXX Shadows in other modes will not be updated, so will
  15.594 +             * have their A and D bits out of sync. */
  15.595 +        }
  15.596 +    }
  15.597 +    
  15.598 +    /* We should never need to flush the TLB or recopy PAE entries */
  15.599 +    ASSERT( res == 0 || res == SHADOW_SET_CHANGED );
  15.600 +    return flags;
  15.601 +}
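Concretely, the A/D update above behaves as follows (worked example only, no new code):

    /* A demand write (ft_demand_write) to an l1e whose flags are
     * PRESENT|USER|RW|ACCESSED returns those flags with DIRTY added,
     * writes the updated pte back through *ep and marks gmfn dirty for
     * log-dirty mode; a demand read of the same entry returns early with
     * the flags unchanged, since ACCESSED is already set and reads never
     * need the D bit. */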
  15.602 +
  15.603 +/**************************************************************************/
  15.604 +/* Functions to compute the correct index into a shadow page, given an
  15.605 + * index into the guest page (as returned by guest_get_index()).
  15.606 + * This is trivial when the shadow and guest use the same sized PTEs, but
  15.607 + * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
  15.608 + * PAE- or 64-bit shadows).
  15.609 + *
  15.610 + * These functions also increment the shadow mfn, when necessary.  When PTE
  15.611 + * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
  15.612 + * page.  In this case, we allocate 2 contiguous pages for the shadow L1, and
  15.613 + * use simple pointer arithmetic on a pointer to the guest L1e to figure out
  15.614 + * which shadow page we really want.  Similarly, when PTE sizes are
  15.615 + * mismatched, we shadow a guest L2 page with 4 shadow L2 pages.  (The easiest
  15.616 + * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
  15.617 + * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
  15.618 + * space.)
  15.619 + *
  15.620 + * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
  15.621 + * of shadow (to store both the shadow, and the info that would normally be
  15.622 + * stored in page_info fields).  This arrangement allows the shadow and the
  15.623 + * "page_info" fields to always be stored in the same page (in fact, in
  15.624 + * the same cache line), avoiding an extra call to map_domain_page().
  15.625 + */
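A few concrete index computations implied by the comment above and implemented by the functions below (32-bit guest on PAE/64-bit shadows, so guest L1/L2 pages have 1024 four-byte entries and shadow pages have 512 eight-byte entries):

    /* Guest L1 index 700: shadow_l1_index() advances *smfn by 700/512 = 1
     * page and returns 700%512 = 188.
     * Guest L2 index 700: shadow_l2_index() advances *smfn by 700/256 = 2
     * pages and returns (700%256)*2 = 376, the first of the two shadow
     * l2es for that guest entry.
     * PAE guest L3 index 5: shadow_l3_index() returns group 1, slot 1,
     * i.e. 1*8 + 1 = 9; the second half of each 64-byte group holds the
     * pae_l3_bookkeeping data rather than shadow entries. */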
  15.626 +
  15.627 +static inline u32
  15.628 +guest_index(void *ptr)
  15.629 +{
  15.630 +    return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
  15.631 +}
  15.632 +
  15.633 +static inline u32
  15.634 +shadow_l1_index(mfn_t *smfn, u32 guest_index)
  15.635 +{
  15.636 +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
  15.637 +    *smfn = _mfn(mfn_x(*smfn) +
  15.638 +                 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
  15.639 +    return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
  15.640 +#else
  15.641 +    return guest_index;
  15.642 +#endif
  15.643 +}
  15.644 +
  15.645 +static inline u32
  15.646 +shadow_l2_index(mfn_t *smfn, u32 guest_index)
  15.647 +{
  15.648 +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
  15.649 +    // Because we use 2 shadow l2 entries for each guest entry, the number of
  15.650 +    // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
  15.651 +    //
  15.652 +    *smfn = _mfn(mfn_x(*smfn) +
  15.653 +                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
  15.654 +
  15.655 +    // We multiply by two to get the index of the first of the two entries
  15.656 +    // used to shadow the specified guest entry.
  15.657 +    return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
  15.658 +#else
  15.659 +    return guest_index;
  15.660 +#endif
  15.661 +}
  15.662 +
  15.663 +#if GUEST_PAGING_LEVELS >= 3
  15.664 +
  15.665 +static inline u32
  15.666 +shadow_l3_index(mfn_t *smfn, u32 guest_index)
  15.667 +{
  15.668 +#if GUEST_PAGING_LEVELS == 3
  15.669 +    u32 group_id;
  15.670 +
  15.671 +    // Because we use twice the space in L3 shadows as was consumed in guest
  15.672 +    // L3s, the number of guest entries per shadow page is
  15.673 +    // SHADOW_L2_PAGETABLE_ENTRIES/2.  (Note this is *not*
  15.674 +    // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
  15.675 +    //
  15.676 +    *smfn = _mfn(mfn_x(*smfn) +
  15.677 +                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
  15.678 +
  15.679 +    // We store PAE L3 shadows in groups of 4, alternating shadows and
  15.680 +    // pae_l3_bookkeeping structs.  So the effective shadow index is
  15.681 +    // the group_id * 8 + the offset within the group.
  15.682 +    //
  15.683 +    guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
  15.684 +    group_id = guest_index / 4;
  15.685 +    return (group_id * 8) + (guest_index % 4);
  15.686 +#else
  15.687 +    return guest_index;
  15.688 +#endif
  15.689 +}
  15.690 +
  15.691 +#endif // GUEST_PAGING_LEVELS >= 3
  15.692 +
  15.693 +#if GUEST_PAGING_LEVELS >= 4
  15.694 +
  15.695 +static inline u32
  15.696 +shadow_l4_index(mfn_t *smfn, u32 guest_index)
  15.697 +{
  15.698 +    return guest_index;
  15.699 +}
  15.700 +
  15.701 +#endif // GUEST_PAGING_LEVELS >= 4
  15.702 +
  15.703 +
  15.704 +/**************************************************************************/
  15.705 +/* Functions which compute shadow entries from their corresponding guest
  15.706 + * entries.
  15.707 + *
  15.708 + * These are the "heart" of the shadow code.
  15.709 + *
  15.710 + * There are two sets of these: those that are called on demand faults (read
  15.711 + * faults and write faults), and those that are essentially called to
  15.712 + * "prefetch" (or propagate) entries from the guest into the shadow.  The read
  15.713 + * fault and write fault are handled as two separate cases for L1 entries (due
  15.714 + * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
  15.715 + * into the respective demand_fault functions.
  15.716 + */
  15.717 +
  15.718 +#define CHECK(_cond)                                    \
  15.719 +do {                                                    \
  15.720 +    if (unlikely(!(_cond)))                             \
  15.721 +    {                                                   \
  15.722 +        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
  15.723 +               __func__, __FILE__, __LINE__, #_cond);   \
  15.724 +        return -1;                                      \
  15.725 +    }                                                   \
  15.726 +} while (0)
  15.727 +
  15.728 +// The function below tries to capture all of the flag manipulation for the
  15.729 +// demand and propagate functions into one place.
  15.730 +//
  15.731 +static always_inline u32
  15.732 +sh_propagate_flags(struct vcpu *v, mfn_t target_mfn, 
  15.733 +                    u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, 
  15.734 +                    int mmio, int level, fetch_type_t ft)
  15.735 +{
  15.736 +    struct domain *d = v->domain;
  15.737 +    u32 pass_thru_flags;
  15.738 +    u32 sflags;
  15.739 +
  15.740 +    // XXX -- might want to think about PAT support for HVM guests...
  15.741 +
  15.742 +#ifndef NDEBUG
  15.743 +    // MMIO can only occur from L1e's
  15.744 +    //
  15.745 +    if ( mmio )
  15.746 +        CHECK(level == 1);
  15.747 +
  15.748 +    // We should always have a pointer to the guest entry if it's a non-PSE
  15.749 +    // non-MMIO demand access.
  15.750 +    if ( ft & FETCH_TYPE_DEMAND )
  15.751 +        CHECK(guest_entry_ptr || level == 1);
  15.752 +#endif
  15.753 +
  15.754 +    // A not-present guest entry has a special signature in the shadow table,
  15.755 +    // so that we do not have to consult the guest tables multiple times...
  15.756 +    //
  15.757 +    if ( unlikely(!(gflags & _PAGE_PRESENT)) )
  15.758 +        return _PAGE_SHADOW_GUEST_NOT_PRESENT;
  15.759 +
  15.760 +    // Must have a valid target_mfn, unless this is mmio, or unless this is a
  15.761 +    // prefetch.  In the case of a prefetch, an invalid mfn means that we can
  15.762 +    // not usefully shadow anything, and so we return early.
  15.763 +    //
  15.764 +    if ( !valid_mfn(target_mfn) )
  15.765 +    {
  15.766 +        CHECK((ft == ft_prefetch) || mmio);
  15.767 +        if ( !mmio )
  15.768 +            return 0;
  15.769 +    }
  15.770 +
  15.771 +    // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
  15.772 +    //
  15.773 +    if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
  15.774 +        pass_thru_flags = _PAGE_PRESENT;
  15.775 +    else
  15.776 +    {
  15.777 +        pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
  15.778 +                           _PAGE_RW | _PAGE_PRESENT);
  15.779 +        if ( guest_supports_nx(v) )
  15.780 +            pass_thru_flags |= _PAGE_NX_BIT;
  15.781 +    }
  15.782 +
  15.783 +    // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
  15.784 +    // L3e's; they are all implied.  So we emulate them here.
  15.785 +    //
  15.786 +    if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
  15.787 +        gflags = pass_thru_flags;
  15.788 +
  15.789 +    // Propagate bits from the guest to the shadow.
  15.790 +    // Some of these may be overwritten, below.
  15.791 +    // Since we know the guest's PRESENT bit is set, we also set the shadow's
  15.792 +    // SHADOW_PRESENT bit.
  15.793 +    //
  15.794 +    sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
  15.795 +
  15.796 +    // Copy the guest's RW bit into the SHADOW_RW bit.
  15.797 +    //
  15.798 +    if ( gflags & _PAGE_RW )
  15.799 +        sflags |= _PAGE_SHADOW_RW;
  15.800 +
  15.801 +    // Set the A&D bits for higher level shadows.
  15.802 +    // Higher level entries do not, strictly speaking, have dirty bits, but
  15.803 +    // since we use shadow linear tables, each of these entries may, at some
  15.804 +    // point in time, also serve as a shadow L1 entry.
  15.806 +    // By setting both the A and D bits in each of these, we eliminate the burden
  15.806 +    // on the hardware to update these bits on initial accesses.
  15.807 +    //
  15.808 +    if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
  15.809 +        sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
  15.810 +
  15.811 +
  15.812 +    // Set the A and D bits in the guest entry, if we need to.
  15.813 +    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
  15.814 +        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
  15.815 +    
  15.816 +    // If the A or D bit has not yet been set in the guest, then we must
  15.817 +    // prevent the corresponding kind of access.
  15.818 +    //
  15.819 +    if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
  15.820 +                  !(gflags & _PAGE_ACCESSED)) )
  15.821 +        sflags &= ~_PAGE_PRESENT;
  15.822 +
  15.823 +    /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */
  15.824 +    if ( unlikely( ((level == 1) 
  15.825 +                    || ((level == 2) && (GUEST_PAGING_LEVELS < 4) 
  15.826 +                        && guest_supports_superpages(v) &&
  15.827 +                        (gflags & _PAGE_PSE)))
  15.828 +                   && !(gflags & _PAGE_DIRTY)) )
  15.829 +        sflags &= ~_PAGE_RW;
  15.830 +
  15.831 +    // MMIO caching
  15.832 +    //
  15.833 +    // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
  15.834 +    // to cache the fact that this entry  is in MMIO space.
  15.835 +    //
  15.836 +    if ( (level == 1) && mmio )
  15.837 +    {
  15.838 +        sflags &= ~(_PAGE_PRESENT);
  15.839 +        sflags |= _PAGE_SHADOW_MMIO;
  15.840 +    }
  15.841 +    else 
  15.842 +    {
  15.843 +        // shadow_mode_log_dirty support
  15.844 +        //
  15.845 +        // Only allow the guest write access to a page a) on a demand fault,
  15.846 +        // or b) if the page is already marked as dirty.
  15.847 +        //
  15.848 +        if ( unlikely((level == 1) &&
  15.849 +                      !(ft & FETCH_TYPE_WRITE) &&
  15.850 +                      shadow_mode_log_dirty(d) &&
  15.851 +                      !sh_mfn_is_dirty(d, target_mfn)) )
  15.852 +        {
  15.853 +            sflags &= ~_PAGE_RW;
  15.854 +        }
  15.855 +        
  15.856 +        // protect guest page tables
  15.857 +        //
  15.858 +        if ( unlikely((level == 1) &&
  15.859 +                      sh_mfn_is_a_page_table(target_mfn)) )
  15.860 +        {
  15.861 +            if ( shadow_mode_trap_reads(d) )
  15.862 +            {
  15.863 +                // if we are trapping both reads & writes, then mark this page
  15.864 +                // as not present...
  15.865 +                //
  15.866 +                sflags &= ~_PAGE_PRESENT;
  15.867 +            }
  15.868 +            else
  15.869 +            {
  15.870 +                // otherwise, just prevent any writes...
  15.871 +                //
  15.872 +                sflags &= ~_PAGE_RW;
  15.873 +            }
  15.874 +        }
  15.875 +    }
  15.876 +
  15.877 +    return sflags;
  15.878 +}
  15.879 +
  15.880 +#undef CHECK
  15.881 +
  15.882 +#if GUEST_PAGING_LEVELS >= 4
  15.883 +static void
  15.884 +l4e_propagate_from_guest(struct vcpu *v, 
  15.885 +                         guest_l4e_t *gl4e,
  15.886 +                         mfn_t gl4mfn,
  15.887 +                         mfn_t sl3mfn,
  15.888 +                         shadow_l4e_t *sl4p,
  15.889 +                         fetch_type_t ft)
  15.890 +{
  15.891 +    u32 gflags = guest_l4e_get_flags(*gl4e);
  15.892 +    u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
  15.893 +                                     gl4mfn, 0, 4, ft);
  15.894 +
  15.895 +    *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
  15.896 +
  15.897 +    SHADOW_DEBUG(PROPAGATE,
  15.898 +                  "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n",
  15.899 +                  fetch_type_names[ft], gl4e->l4, sl4p->l4);
  15.900 +    ASSERT(sflags != -1);
  15.901 +}
  15.902 +#endif // GUEST_PAGING_LEVELS >= 4
  15.903 +
  15.904 +#if GUEST_PAGING_LEVELS >= 3
  15.905 +static void
  15.906 +l3e_propagate_from_guest(struct vcpu *v,
  15.907 +                         guest_l3e_t *gl3e,
  15.908 +                         mfn_t gl3mfn, 
  15.909 +                         mfn_t sl2mfn, 
  15.910 +                         shadow_l3e_t *sl3p,
  15.911 +                         fetch_type_t ft)
  15.912 +{
  15.913 +    u32 gflags = guest_l3e_get_flags(*gl3e);
  15.914 +    u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
  15.915 +                                     gl3mfn, 0, 3, ft);
  15.916 +
  15.917 +    *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
  15.918 +
  15.919 +    SHADOW_DEBUG(PROPAGATE,
  15.920 +                  "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n",
  15.921 +                  fetch_type_names[ft], gl3e->l3, sl3p->l3);
  15.922 +    ASSERT(sflags != -1);
  15.923 +}
  15.924 +#endif // GUEST_PAGING_LEVELS >= 3
  15.925 +
  15.926 +static void
  15.927 +l2e_propagate_from_guest(struct vcpu *v, 
  15.928 +                         guest_l2e_t *gl2e,
  15.929 +                         mfn_t gl2mfn,
  15.930 +                         mfn_t sl1mfn, 
  15.931 +                         shadow_l2e_t *sl2p,
  15.932 +                         fetch_type_t ft)
  15.933 +{
  15.934 +    u32 gflags = guest_l2e_get_flags(*gl2e);
  15.935 +    u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, 
  15.936 +                                     gl2mfn, 0, 2, ft);
  15.937 +
  15.938 +    *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
  15.939 +
  15.940 +    SHADOW_DEBUG(PROPAGATE,
  15.941 +                  "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n",
  15.942 +                  fetch_type_names[ft], gl2e->l2, sl2p->l2);
  15.943 +    ASSERT(sflags != -1);
  15.944 +}
  15.945 +
  15.946 +static inline int
  15.947 +l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
  15.948 +               int mmio)
  15.949 +/* returns 1 if emulation is required, and 0 otherwise */
  15.950 +{
  15.951 +    struct domain *d = v->domain;
  15.952 +    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
  15.953 +    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
  15.954 +                                     mmio, 1, ft_demand_read);
  15.955 +
  15.956 +    if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) )
  15.957 +    {
  15.958 +        // emulation required!
  15.959 +        *sl1p = shadow_l1e_empty();
  15.960 +        return 1;
  15.961 +    }
  15.962 +
  15.963 +    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
  15.964 +
  15.965 +    SHADOW_DEBUG(PROPAGATE,
  15.966 +                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
  15.967 +                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
  15.968 +
  15.969 +    ASSERT(sflags != -1);
  15.970 +    return 0;
  15.971 +}
  15.972 +
  15.973 +static inline int
  15.974 +l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
  15.975 +                int mmio)
  15.976 +/* returns 1 if emulation is required, and 0 otherwise */
  15.977 +{
  15.978 +    struct domain *d = v->domain;
  15.979 +    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
  15.980 +    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
  15.981 +                                     mmio, 1, ft_demand_write);
  15.982 +
  15.983 +    sh_mark_dirty(d, gmfn);
  15.984 +
  15.985 +    if ( !mmio && sh_mfn_is_a_page_table(gmfn) )
  15.986 +    {
  15.987 +        // emulation required!
  15.988 +        *sl1p = shadow_l1e_empty();
  15.989 +        return 1;
  15.990 +    }
  15.991 +
  15.992 +    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
  15.993 +
  15.994 +    SHADOW_DEBUG(PROPAGATE,
  15.995 +                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
  15.996 +                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
  15.997 +
  15.998 +    ASSERT(sflags != -1);
  15.999 +    return 0;
 15.1000 +}
 15.1001 +
 15.1002 +static inline void
 15.1003 +l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
 15.1004 +                         int mmio)
 15.1005 +{
 15.1006 +    gfn_t gfn = guest_l1e_get_gfn(gl1e);
 15.1007 +    mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
 15.1008 +    u32 gflags = guest_l1e_get_flags(gl1e);
 15.1009 +    u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), 
 15.1010 +                                     mmio, 1, ft_prefetch);
 15.1011 +
 15.1012 +    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
 15.1013 +
 15.1014 +    SHADOW_DEBUG(PROPAGATE,
 15.1015 +                  "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
 15.1016 +                  gl1e.l1, sl1p->l1);
 15.1017 +
 15.1018 +    ASSERT(sflags != -1);
 15.1019 +}
 15.1020 +
 15.1021 +
 15.1022 +/**************************************************************************/
 15.1023 +/* These functions update shadow entries (and do bookkeeping on the shadow
 15.1024 + * tables they are in).  It is intended that they are the only
 15.1025 + * functions which ever write (non-zero) data onto a shadow page.
 15.1026 + *
 15.1027 + * They return a set of flags: 
 15.1028 + * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow.
 15.1029 + * SHADOW_SET_FLUSH   -- the caller must cause a TLB flush.
 15.1030 + * SHADOW_SET_ERROR   -- the input is not a valid entry (for example, if
 15.1031 + *                        shadow_get_page_from_l1e() fails).
 15.1032 + * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
 15.1033 + *                             copies of their PAE L3 entries re-copied.
 15.1034 + */
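
As a usage illustration only (nothing below is from the patch, and the flag values are placeholders), callers typically OR together the results of several set operations and act on the combined mask once:

    #include <stdio.h>

    #define SHADOW_SET_CHANGED  0x1     /* placeholder values, for illustration only */
    #define SHADOW_SET_FLUSH    0x2
    #define SHADOW_SET_ERROR    0x4

    /* Stand-ins for the results of two shadow_set_l*e() calls. */
    static int fake_set_l2e(void) { return SHADOW_SET_CHANGED; }
    static int fake_set_l1e(void) { return SHADOW_SET_CHANGED | SHADOW_SET_FLUSH; }

    int main(void)
    {
        int result = 0;
        result |= fake_set_l2e();
        result |= fake_set_l1e();

        if ( result & SHADOW_SET_ERROR )
            printf("an entry was rejected and the shadow slot left empty\n");
        if ( result & SHADOW_SET_FLUSH )
            printf("the caller must flush the TLB before resuming the guest\n");
        return 0;
    }
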
 15.1035 +
 15.1036 +static inline void safe_write_entry(void *dst, void *src) 
 15.1037 +/* Copy one PTE safely when processors might be running on the
 15.1038 + * destination pagetable.   This does *not* give safety against
 15.1039 + * concurrent writes (that's what the shadow lock is for), just 
 15.1040 + * stops the hardware picking up partially written entries. */
 15.1041 +{
 15.1042 +    volatile unsigned long *d = dst;
 15.1043 +    unsigned long *s = src;
 15.1044 +    ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
 15.1045 +#if CONFIG_PAGING_LEVELS == 3
 15.1046 +    /* In PAE mode, pagetable entries are larger
 15.1047 +     * than machine words, so won't get written atomically.  We need to make
 15.1048 +     * sure any other cpu running on these shadows doesn't see a
 15.1049 +     * half-written entry.  Do this by marking the entry not-present first,
 15.1050 +     * then writing the high word before the low word. */
 15.1051 +    BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
 15.1052 +    d[0] = 0;
 15.1053 +    d[1] = s[1];
 15.1054 +    d[0] = s[0];
 15.1055 +#else
 15.1056 +    /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
 15.1057 +     * which will be an atomic write, since the entry is aligned. */
 15.1058 +    BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
 15.1059 +    *d = *s;
 15.1060 +#endif
 15.1061 +}
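
To make the ordering concrete, here is a standalone sketch (not part of the patch) of the PAE branch above, modelling the 64-bit entry as two 32-bit words; on x86 the present bit lives in the low word, which is why it is cleared first and written last:

    #include <stdint.h>

    /* Mirrors the write ordering used by safe_write_entry() under PAE. */
    static void safe_write_64(volatile uint32_t d[2], const uint32_t s[2])
    {
        d[0] = 0;       /* 1: clear the low word, so the entry reads as not-present */
        d[1] = s[1];    /* 2: install the high word while the entry is invisible    */
        d[0] = s[0];    /* 3: writing the low word publishes the complete new entry */
    }

    int main(void)
    {
        uint32_t entry[2] = { 0, 0 };
        const uint32_t new_entry[2] = { 0x00000027u, 0x00000001u };  /* made-up PTE */
        safe_write_64(entry, new_entry);
        return 0;
    }
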
 15.1062 +
 15.1063 +
 15.1064 +static inline void 
 15.1065 +shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
 15.1066 +/* This function does the actual writes to shadow pages.
 15.1067 + * It should only be called through the shadow_set_l*e() wrappers, since
 15.1068 + * it does none of the bookkeeping that those functions do. */
 15.1069 +{
 15.1070 +    shadow_l1e_t *dst = d;
 15.1071 +    shadow_l1e_t *src = s;
 15.1072 +    void *map = NULL;
 15.1073 +    int i;
 15.1074 +
 15.1075 +    /* Because we mirror access rights at all levels in the shadow, an
 15.1076 +     * l2 (or higher) entry with the RW bit cleared will leave us with
 15.1077 +     * no write access through the linear map.
 15.1078 +     * We detect that with a dummy __copy_to_user() write to the shadow, and
 15.1079 +     * fall back to sh_map_domain_page() for a writable mapping if it fails. */
 15.1080 +    if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) 
 15.1081 +    {
 15.1082 +        perfc_incrc(shadow_linear_map_failed);
 15.1083 +        map = sh_map_domain_page(mfn);
 15.1084 +        ASSERT(map != NULL);
 15.1085 +        dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
 15.1086 +    }
 15.1087 +
 15.1088 +
 15.1089 +    for ( i = 0; i < entries; i++ )
 15.1090 +        safe_write_entry(dst++, src++);
 15.1091 +
 15.1092 +    if ( map != NULL ) sh_unmap_domain_page(map);
 15.1093 +
 15.1094 +    /* XXX TODO:
 15.1095 +     * Update min/max field in page_info struct of this mfn */
 15.1096 +}
 15.1097 +
 15.1098 +static inline int
 15.1099 +perms_strictly_increased(u32 old_flags, u32 new_flags) 
 15.1100 +/* Given the flags of two entries, are the new flags a strict
 15.1101 + * increase in rights over the old ones? */
 15.1102 +{
 15.1103 +    u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
 15.1104 +    u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
 15.1105 +    /* Flip the NX bit, since it's the only one that decreases rights;
 15.1106 +     * we calculate as if it were an "X" bit. */
 15.1107 +    of ^= _PAGE_NX_BIT;
 15.1108 +    nf ^= _PAGE_NX_BIT;
 15.1109 +    /* If the changed bits are all set in the new flags, then rights strictly 
 15.1110 +     * increased between old and new. */
 15.1111 +    return ((of | (of ^ nf)) == nf);
 15.1112 +}
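
A standalone illustration (not part of the patch), using made-up bit positions: because clearing NX grants a right (execute), the NX bit is flipped into an "X" sense before the subset test.

    #include <assert.h>
    #include <stdint.h>

    #define F_PRESENT  0x01u    /* illustrative bit positions only */
    #define F_RW       0x02u
    #define F_USER     0x04u
    #define F_NX       0x08u

    /* Same test as perms_strictly_increased() above, on pre-masked flags. */
    static int strictly_increased(uint32_t of, uint32_t nf)
    {
        of ^= F_NX;
        nf ^= F_NX;
        return (of | (of ^ nf)) == nf;   /* every old right kept, changes only add */
    }

    int main(void)
    {
        /* Gaining RW is a pure increase: no TLB flush needed. */
        assert(strictly_increased(F_PRESENT, F_PRESENT | F_RW));
        /* Losing RW is not: the caller must flush. */
        assert(!strictly_increased(F_PRESENT | F_RW, F_PRESENT));
        /* Clearing NX grants execute, which also counts as an increase. */
        assert(strictly_increased(F_PRESENT | F_NX, F_PRESENT));
        return 0;
    }
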
 15.1113 +
 15.1114 +static int inline
 15.1115 +shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 15.1116 +{
 15.1117 +    int res;
 15.1118 +    mfn_t mfn;
 15.1119 +    struct domain *owner;
 15.1120 +    shadow_l1e_t sanitized_sl1e =
 15.1121 +        shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
 15.1122 +
 15.1123 +    //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
 15.1124 +    //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
 15.1125 +
 15.1126 +    if ( !shadow_mode_refcounts(d) )
 15.1127 +        return 1;
 15.1128 +
 15.1129 +    res = get_page_from_l1e(sanitized_sl1e, d);
 15.1130 +
 15.1131 +    // If a privileged domain is attempting to install a map of a page it does
 15.1132 +    // not own, we let it succeed anyway.
 15.1133 +    //
 15.1134 +    if ( unlikely(!res) &&
 15.1135 +         IS_PRIV(d) &&
 15.1136 +         !shadow_mode_translate(d) &&
 15.1137 +         valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
 15.1138 +         (owner = page_get_owner(mfn_to_page(mfn))) &&
 15.1139 +         (d != owner) )
 15.1140 +    {
 15.1141 +        res = get_page_from_l1e(sanitized_sl1e, owner);
 15.1142 +        SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
 15.1143 +                       "which is owned by domain %d: %s\n",
 15.1144 +                       d->domain_id, mfn_x(mfn), owner->domain_id,
 15.1145 +                       res ? "success" : "failed");
 15.1146 +    }
 15.1147 +
 15.1148 +    if ( unlikely(!res) )
 15.1149 +    {
 15.1150 +        perfc_incrc(shadow_get_page_fail);
 15.1151 +        SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
 15.1152 +    }
 15.1153 +
 15.1154 +    return res;
 15.1155 +}
 15.1156 +
 15.1157 +static void inline
 15.1158 +shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 15.1159 +{ 
 15.1160 +    if ( !shadow_mode_refcounts(d) )
 15.1161 +        return;
 15.1162 +
 15.1163 +    put_page_from_l1e(sl1e, d);
 15.1164 +}
 15.1165 +
 15.1166 +#if GUEST_PAGING_LEVELS >= 4
 15.1167 +static int shadow_set_l4e(struct vcpu *v, 
 15.1168 +                          shadow_l4e_t *sl4e, 
 15.1169 +                          shadow_l4e_t new_sl4e, 
 15.1170 +                          mfn_t sl4mfn)
 15.1171 +{
 15.1172 +    int flags = 0;
 15.1173 +    shadow_l4e_t old_sl4e;
 15.1174 +    paddr_t paddr;
 15.1175 +    ASSERT(sl4e != NULL);
 15.1176 +    old_sl4e = *sl4e;
 15.1177 +
 15.1178 +    if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
 15.1179 +    
 15.1180 +    paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
 15.1181 +             | (((unsigned long)sl4e) & ~PAGE_MASK));
 15.1182 +
 15.1183 +    if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) 
 15.1184 +    {
 15.1185 +        /* About to install a new reference */        
 15.1186 +        sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
 15.1187 +    } 
 15.1188 +
 15.1189 +    /* Write the new entry */
 15.1190 +    shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
 15.1191 +    flags |= SHADOW_SET_CHANGED;
 15.1192 +
 15.1193 +    if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) 
 15.1194 +    {
 15.1195 +        /* We lost a reference to an old mfn. */
 15.1196 +        mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
 15.1197 +        if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
 15.1198 +             || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), 
 15.1199 +                                          shadow_l4e_get_flags(new_sl4e)) )
 15.1200 +        {
 15.1201 +            flags |= SHADOW_SET_FLUSH;
 15.1202 +        }
 15.1203 +        sh_put_ref(v, osl3mfn, paddr);
 15.1204 +    }
 15.1205 +    return flags;
 15.1206 +}
 15.1207 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 15.1208 +
 15.1209 +#if GUEST_PAGING_LEVELS >= 3
 15.1210 +static int shadow_set_l3e(struct vcpu *v, 
 15.1211 +                          shadow_l3e_t *sl3e, 
 15.1212 +                          shadow_l3e_t new_sl3e, 
 15.1213 +                          mfn_t sl3mfn)
 15.1214 +{
 15.1215 +    int flags = 0;
 15.1216 +    shadow_l3e_t old_sl3e;
 15.1217 +    paddr_t paddr;
 15.1218 +    ASSERT(sl3e != NULL);
 15.1219 +    old_sl3e = *sl3e;
 15.1220 +
 15.1221 +    if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
 15.1222 +
 15.1223 +    paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) 
 15.1224 +             | (((unsigned long)sl3e) & ~PAGE_MASK));
 15.1225 +    
 15.1226 +    if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) 
 15.1227 +    {
 15.1228 +        /* About to install a new reference */        
 15.1229 +        sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
 15.1230 +    } 
 15.1231 +
 15.1232 +    /* Write the new entry */
 15.1233 +    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
 15.1234 +    flags |= SHADOW_SET_CHANGED;
 15.1235 +
 15.1236 +#if GUEST_PAGING_LEVELS == 3 
 15.1237 +    /* We wrote a guest l3e in a PAE pagetable.  This table is copied in
 15.1238 +     * the linear pagetable entries of its l2s, and may also be copied
 15.1239 +     * to a low memory location to make it fit in CR3.  Report that we
 15.1240 +     * need to resync those copies (we can't wait for the guest to flush
 15.1241 +     * the TLB because it might be an increase in rights). */
 15.1242 +    {
 15.1243 +        struct vcpu *vcpu;
 15.1244 +
 15.1245 +        struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
 15.1246 +        for_each_vcpu(v->domain, vcpu)
 15.1247 +        {
 15.1248 +            if (info->vcpus & (1 << vcpu->vcpu_id))
 15.1249 +            {
 15.1250 +                // Remember that this flip/update needs to occur.
 15.1251 +                vcpu->arch.shadow.pae_flip_pending = 1;
 15.1252 +                flags |= SHADOW_SET_L3PAE_RECOPY;
 15.1253 +            }
 15.1254 +        }
 15.1255 +    }
 15.1256 +#endif
 15.1257 +
 15.1258 +    if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) 
 15.1259 +    {
 15.1260 +        /* We lost a reference to an old mfn. */
 15.1261 +        mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
 15.1262 +        if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
 15.1263 +             !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e), 
 15.1264 +                                       shadow_l3e_get_flags(new_sl3e)) ) 
 15.1265 +        {
 15.1266 +            flags |= SHADOW_SET_FLUSH;
 15.1267 +        }
 15.1268 +        sh_put_ref(v, osl2mfn, paddr);
 15.1269 +    }
 15.1270 +    return flags;
 15.1271 +}
 15.1272 +#endif /* GUEST_PAGING_LEVELS >= 3 */ 
 15.1273 +
 15.1274 +static int shadow_set_l2e(struct vcpu *v, 
 15.1275 +                          shadow_l2e_t *sl2e, 
 15.1276 +                          shadow_l2e_t new_sl2e, 
 15.1277 +                          mfn_t sl2mfn)
 15.1278 +{
 15.1279 +    int flags = 0;
 15.1280 +    shadow_l2e_t old_sl2e;
 15.1281 +    paddr_t paddr;
 15.1282 +
 15.1283 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 15.1284 +    /* In 2-on-3 we work with pairs of l2es pointing at two-page
 15.1285 +     * shadows.  Reference counting and up-pointers track from the first
 15.1286 +     * page of the shadow to the first l2e, so make sure that we're 
 15.1287 +     * working with those:     
 15.1288 +     * Align the pointer down so it's pointing at the first of the pair */
 15.1289 +    sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
 15.1290 +    /* Align the mfn of the shadow entry too */
 15.1291 +    new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
 15.1292 +#endif
 15.1293 +
 15.1294 +    ASSERT(sl2e != NULL);
 15.1295 +    old_sl2e = *sl2e;
 15.1296 +    
 15.1297 +    if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
 15.1298 +    
 15.1299 +    paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
 15.1300 +             | (((unsigned long)sl2e) & ~PAGE_MASK));
 15.1301 +
 15.1302 +    if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
 15.1303 +    {
 15.1304 +        /* About to install a new reference */
 15.1305 +        sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
 15.1306 +    } 
 15.1307 +
 15.1308 +    /* Write the new entry */
 15.1309 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 15.1310 +    {
 15.1311 +        shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
 15.1312 +        /* The l1 shadow is two pages long and needs to be pointed to by
 15.1313 +         * two adjacent l2es.  The pair have the same flags, but point
 15.1314 +         * at the even and odd MFNs respectively */
 15.1315 +        ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
 15.1316 +        pair[1].l2 |= (1<<PAGE_SHIFT);
 15.1317 +        shadow_write_entries(sl2e, &pair, 2, sl2mfn);
 15.1318 +    }
 15.1319 +#else /* normal case */
 15.1320 +    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
 15.1321 +#endif
 15.1322 +    flags |= SHADOW_SET_CHANGED;
 15.1323 +
 15.1324 +    if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) 
 15.1325 +    {
 15.1326 +        /* We lost a reference to an old mfn. */
 15.1327 +        mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
 15.1328 +        if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
 15.1329 +             !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e), 
 15.1330 +                                       shadow_l2e_get_flags(new_sl2e)) ) 
 15.1331 +        {
 15.1332 +            flags |= SHADOW_SET_FLUSH;
 15.1333 +        }
 15.1334 +        sh_put_ref(v, osl1mfn, paddr);
 15.1335 +    }
 15.1336 +    return flags;
 15.1337 +}
 15.1338 +
 15.1339 +static int shadow_set_l1e(struct vcpu *v, 
 15.1340 +                          shadow_l1e_t *sl1e, 
 15.1341 +                          shadow_l1e_t new_sl1e,
 15.1342 +                          mfn_t sl1mfn)
 15.1343 +{
 15.1344 +    int flags = 0;
 15.1345 +    struct domain *d = v->domain;
 15.1346 +    shadow_l1e_t old_sl1e;
 15.1347 +    ASSERT(sl1e != NULL);
 15.1348 +    
 15.1349 +    old_sl1e = *sl1e;
 15.1350 +
 15.1351 +    if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
 15.1352 +    
 15.1353 +    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) 
 15.1354 +    {
 15.1355 +        /* About to install a new reference */        
 15.1356 +        if ( shadow_mode_refcounts(d) ) {
 15.1357 +            if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 ) 
 15.1358 +            {
 15.1359 +                /* Doesn't look like a pagetable. */
 15.1360 +                flags |= SHADOW_SET_ERROR;
 15.1361 +                new_sl1e = shadow_l1e_empty();
 15.1362 +            }
 15.1363 +        }
 15.1364 +    } 
 15.1365 +
 15.1366 +    /* Write the new entry */
 15.1367 +    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
 15.1368 +    flags |= SHADOW_SET_CHANGED;
 15.1369 +
 15.1370 +    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) 
 15.1371 +    {
 15.1372 +        /* We lost a reference to an old mfn. */
 15.1373 +        /* N.B. Unlike higher-level sets, we never need an extra flush
 15.1374 +         * when writing an l1e.  Because it points to the same guest frame
 15.1375 +         * as the guest l1e did, it is the guest's responsibility to
 15.1376 +         * trigger a flush later. */
 15.1377 +        if ( shadow_mode_refcounts(d) ) 
 15.1378 +        {
 15.1379 +            shadow_put_page_from_l1e(old_sl1e, d);
 15.1380 +        } 
 15.1381 +    }
 15.1382 +    return flags;
 15.1383 +}
 15.1384 +
 15.1385 +
 15.1386 +/**************************************************************************/
 15.1387 +/* These functions take a vcpu and a virtual address, and return a pointer
 15.1388 + * to the appropriate level N entry from the shadow tables.  
 15.1389 + * If the necessary tables are not present in the shadow, they return NULL. */
 15.1390 +
 15.1391 +/* N.B. The use of GUEST_PAGING_LEVELS here is correct.  If the shadow has
 15.1392 + * more levels than the guest, the upper levels are always fixed and do not 
 15.1393 + * reflect any information from the guest, so we do not use these functions 
 15.1394 + * to access them. */
 15.1395 +
 15.1396 +#if GUEST_PAGING_LEVELS >= 4
 15.1397 +static shadow_l4e_t *
 15.1398 +shadow_get_l4e(struct vcpu *v, unsigned long va)
 15.1399 +{
 15.1400 +    /* Reading the top level table is always valid. */
 15.1401 +    return sh_linear_l4_table(v) + shadow_l4_linear_offset(va);
 15.1402 +}
 15.1403 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 15.1404 +
 15.1405 +
 15.1406 +#if GUEST_PAGING_LEVELS >= 3
 15.1407 +static shadow_l3e_t *
 15.1408 +shadow_get_l3e(struct vcpu *v, unsigned long va)
 15.1409 +{
 15.1410 +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
 15.1411 +    /* Get the l4 */
 15.1412 +    shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
 15.1413 +    ASSERT(sl4e != NULL);
 15.1414 +    if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
 15.1415 +        return NULL;
 15.1416 +    ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
 15.1417 +    /* l4 was present; OK to get the l3 */
 15.1418 +    return sh_linear_l3_table(v) + shadow_l3_linear_offset(va);
 15.1419 +#else /* PAE... */
 15.1420 +    /* Top level is always mapped */
 15.1421 +    ASSERT(v->arch.shadow_vtable);
 15.1422 +    return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
 15.1423 +#endif 
 15.1424 +}
 15.1425 +#endif /* GUEST_PAGING_LEVELS >= 3 */
 15.1426 +
 15.1427 +
 15.1428 +static shadow_l2e_t *
 15.1429 +shadow_get_l2e(struct vcpu *v, unsigned long va)
 15.1430 +{
 15.1431 +#if GUEST_PAGING_LEVELS >= 3  /* 64bit/PAE... */
 15.1432 +    /* Get the l3 */
 15.1433 +    shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
 15.1434 +    if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
 15.1435 +        return NULL;
 15.1436 +    ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
 15.1437 +    /* l3 was present; OK to get the l2 */
 15.1438 +#endif
 15.1439 +    return sh_linear_l2_table(v) + shadow_l2_linear_offset(va);
 15.1440 +}
 15.1441 +
 15.1442 +
 15.1443 +#if 0 // avoid the compiler warning for now...
 15.1444 +
 15.1445 +static shadow_l1e_t *
 15.1446 +shadow_get_l1e(struct vcpu *v, unsigned long va)
 15.1447 +{
 15.1448 +    /* Get the l2 */
 15.1449 +    shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
 15.1450 +    if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
 15.1451 +        return NULL;
 15.1452 +    ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
 15.1453 +    /* l2 was present; OK to get the l1 */
 15.1454 +    return sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
 15.1455 +}
 15.1456 +
 15.1457 +#endif
 15.1458 +
 15.1459 +
 15.1460 +/**************************************************************************/
 15.1461 +/* Macros to walk pagetables.  These take the shadow of a pagetable and 
 15.1462 + * walk every "interesting" entry.  That is, they don't touch Xen mappings, 
 15.1463 + * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every 
 15.1464 + * second entry (since pairs of entries are managed together). For multi-page
 15.1465 + * shadows they walk all pages.
 15.1466 + * 
 15.1467 + * Arguments are an MFN, the variable to point to each entry, a variable 
 15.1468 + * to indicate that we are done (we will shortcut to the end of the scan 
 15.1469 + * when _done != 0), a variable to indicate that we should avoid Xen mappings,
 15.1470 + * and the code. 
 15.1471 + *
 15.1472 + * WARNING: These macros have side-effects.  They change the values of both 
 15.1473 + * the pointer and the MFN. */ 
 15.1474 +
 15.1475 +static inline void increment_ptr_to_guest_entry(void *ptr)
 15.1476 +{
 15.1477 +    if ( ptr )
 15.1478 +    {
 15.1479 +        guest_l1e_t **entry = ptr;
 15.1480 +        (*entry)++;
 15.1481 +    }
 15.1482 +}
 15.1483 +
 15.1484 +/* All kinds of l1: touch all entries */
 15.1485 +#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)       \
 15.1486 +do {                                                                    \
 15.1487 +    int _i;                                                             \
 15.1488 +    shadow_l1e_t *_sp = map_shadow_page((_sl1mfn));                     \
 15.1489 +    ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask)       \
 15.1490 +           == PGC_SH_l1_shadow                                         \
 15.1491 +           || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask)    \
 15.1492 +           == PGC_SH_fl1_shadow);                                      \
 15.1493 +    for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ )              \
 15.1494 +    {                                                                   \
 15.1495 +        (_sl1e) = _sp + _i;                                             \
 15.1496 +        if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT )           \
 15.1497 +            {_code}                                                     \
 15.1498 +        if ( _done ) break;                                             \
 15.1499 +        increment_ptr_to_guest_entry(_gl1p);                            \
 15.1500 +    }                                                                   \
 15.1501 +    unmap_shadow_page(_sp);                                             \
 15.1502 +} while (0)
 15.1503 +
 15.1504 +/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
 15.1505 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 15.1506 +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done,  _code)       \
 15.1507 +do {                                                                    \
 15.1508 +    int __done = 0;                                                     \
 15.1509 +    _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                         \
 15.1510 +                         ({ (__done = _done); }), _code);               \
 15.1511 +    _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1);                                 \
 15.1512 +    if ( !__done )                                                      \
 15.1513 +        _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                     \
 15.1514 +                             ({ (__done = _done); }), _code);           \
 15.1515 +} while (0)
 15.1516 +#else /* Everything else; l1 shadows are only one page */
 15.1517 +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)        \
 15.1518 +       _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
 15.1519 +#endif
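
As a hedged usage sketch (not taken from the patch, and not compilable on its own since it relies on the surrounding shadow definitions), a caller counting the present entries of one l1 shadow might look like this:

    /* Hypothetical caller: walk one l1 shadow and count its present entries.
     * Passing 0 for the guest-entry pointer means no guest table is tracked,
     * and 'done' is never raised, so the whole shadow is walked. */
    {
        shadow_l1e_t *sl1e;
        int done = 0, present = 0;

        SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, { present++; });
        /* 'present' now counts the present entries; note that 'sl1mfn' may
         * have been advanced to the second page for 2-on-3 shadows. */
    }
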
 15.1520 +    
 15.1521 +
 15.1522 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 15.1523 +
 15.1524 +/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
 15.1525 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)    \
 15.1526 +do {                                                                      \
 15.1527 +    int _i, _j, __done = 0;                                               \
 15.1528 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)         \
 15.1529 +           == PGC_SH_l2_32_shadow);                                      \
 15.1530 +    for ( _j = 0; _j < 4 && !__done; _j++ )                               \
 15.1531 +    {                                                                     \
 15.1532 +        shadow_l2e_t *_sp = map_shadow_page(_sl2mfn);                     \
 15.1533 +        for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 )         \
 15.1534 +            if ( (!(_xen))                                                \
 15.1535 +                 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i)             \
 15.1536 +                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
 15.1537 +            {                                                             \
 15.1538 +                (_sl2e) = _sp + _i;                                       \
 15.1539 +                if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )     \
 15.1540 +                    {_code}                                               \
 15.1541 +                if ( (__done = (_done)) ) break;                          \
 15.1542 +                increment_ptr_to_guest_entry(_gl2p);                      \
 15.1543 +            }                                                             \
 15.1544 +        unmap_shadow_page(_sp);                                           \
 15.1545 +        _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1);                               \
 15.1546 +    }                                                                     \
 15.1547 +} while (0)
 15.1548 +
 15.1549 +#elif GUEST_PAGING_LEVELS == 2
 15.1550 +
 15.1551 +/* 32-bit on 32-bit: avoid Xen entries */
 15.1552 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)     \
 15.1553 +do {                                                                       \
 15.1554 +    int _i;                                                                \
 15.1555 +    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                        \
 15.1556 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)          \
 15.1557 +           == PGC_SH_l2_32_shadow);                                       \
 15.1558 +    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
 15.1559 +        if ( (!(_xen))                                                     \
 15.1560 +             ||                                                            \
 15.1561 +             (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
 15.1562 +        {                                                                  \
 15.1563 +            (_sl2e) = _sp + _i;                                            \
 15.1564 +            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )          \
 15.1565 +                {_code}                                                    \
 15.1566 +            if ( _done ) break;                                            \
 15.1567 +            increment_ptr_to_guest_entry(_gl2p);                           \
 15.1568 +        }                                                                  \
 15.1569 +    unmap_shadow_page(_sp);                                                \
 15.1570 +} while (0)
 15.1571 +
 15.1572 +#elif GUEST_PAGING_LEVELS == 3
 15.1573 +
 15.1574 +/* PAE: if it's an l2h, don't touch Xen mappings */
 15.1575 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)     \
 15.1576 +do {                                                                       \
 15.1577 +    int _i;                                                                \
 15.1578 +    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                        \
 15.1579 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)          \
 15.1580 +           == PGC_SH_l2_pae_shadow                                        \
 15.1581 +           || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)       \
 15.1582 +           == PGC_SH_l2h_pae_shadow);                                     \
 15.1583 +    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
 15.1584 +        if ( (!(_xen))                                                     \
 15.1585 +             || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)    \
 15.1586 +                 != PGC_SH_l2h_pae_shadow)                                \
 15.1587 +             || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES))                  \
 15.1588 +                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
 15.1589 +        {                                                                  \
 15.1590 +            (_sl2e) = _sp + _i;                                            \
 15.1591 +            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )          \
 15.1592 +                {_code}                                                    \
 15.1593 +            if ( _done ) break;                                            \
 15.1594 +            increment_ptr_to_guest_entry(_gl2p);                           \
 15.1595 +        }                                                                  \
 15.1596 +    unmap_shadow_page(_sp);                                                \
 15.1597 +} while (0)
 15.1598 +
 15.1599 +#else 
 15.1600 +
 15.1601 +/* 64-bit l2: touch all entries */
 15.1602 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)  \
 15.1603 +do {                                                                    \
 15.1604 +    int _i;                                                             \
 15.1605 +    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                     \
 15.1606 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)       \
 15.1607 +           == PGC_SH_l2_64_shadow);                                    \
 15.1608 +    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )              \
 15.1609 +    {                                                                   \
 15.1610 +        (_sl2e) = _sp + _i;                                             \
 15.1611 +        if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )           \
 15.1612 +            {_code}                                                     \
 15.1613 +        if ( _done ) break;                                             \
 15.1614 +        increment_ptr_to_guest_entry(_gl2p);                            \
 15.1615 +    }                                                                   \
 15.1616 +    unmap_shadow_page(_sp);                                             \
 15.1617 +} while (0)
 15.1618 +
 15.1619 +#endif /* different kinds of l2 */
 15.1620 +
 15.1621 +#if GUEST_PAGING_LEVELS == 3
 15.1622 +
 15.1623 +/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
 15.1624 +#define SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code)             \
 15.1625 +do {                                                                    \
 15.1626 +    int _i;                                                             \
 15.1627 +    for ( _i = 0; _i < 4; _i++ )                                        \
 15.1628 +    {                                                                   \
 15.1629 +        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
 15.1630 +            {_code}                                                     \
 15.1631 +        if ( _done ) break;                                             \
 15.1632 +        _sl3e++;                                                        \
 15.1633 +        increment_ptr_to_guest_entry(_gl3p);                            \
 15.1634 +    }                                                                   \
 15.1635 +} while (0)
 15.1636 +
 15.1637 +/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
 15.1638 +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)        \
 15.1639 +do {                                                                    \
 15.1640 +    int _i, _j, _k, __done = 0;                                         \
 15.1641 +    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask)       \
 15.1642 +           == PGC_SH_l3_pae_shadow);                                   \
 15.1643 +    /* The subshadows are split, 64 on each page of the shadow */       \
 15.1644 +    for ( _j = 0; _j < 2 && !__done; _j++ )                             \
 15.1645 +    {                                                                   \
 15.1646 +        void *_sp = sh_map_domain_page(_sl3mfn);                       \
 15.1647 +        for ( _i = 0; _i < 64; _i++ )                                   \
 15.1648 +        {                                                               \
 15.1649 +            /* Every second 32-byte region is a bookkeeping entry */    \
 15.1650 +            _sl3e = (shadow_l3e_t *)(_sp + (64 * _i));                  \
 15.1651 +            if ( (sl3p_to_info(_sl3e))->refcount > 0 )                  \
 15.1652 +                SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p,                   \
 15.1653 +                                        ({ __done = (_done); __done; }), \
 15.1654 +                                        _code);                         \
 15.1655 +            else                                                        \
 15.1656 +                for ( _k = 0 ; _k < 4 ; _k++ )                          \
 15.1657 +                    increment_ptr_to_guest_entry(_gl3p);                \
 15.1658 +            if ( __done ) break;                                        \
 15.1659 +        }                                                               \
 15.1660 +        sh_unmap_domain_page(_sp);                                     \
 15.1661 +        _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1);                             \
 15.1662 +    }                                                                   \
 15.1663 +} while (0)
 15.1664 +
 15.1665 +#elif GUEST_PAGING_LEVELS == 4
 15.1666 +
 15.1667 +/* 64-bit l3: touch all entries */
 15.1668 +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)        \
 15.1669 +do {                                                                    \
 15.1670 +    int _i;                                                             \
 15.1671 +    shadow_l3e_t *_sp = map_shadow_page((_sl3mfn));                     \
 15.1672 +    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask)       \
 15.1673 +           == PGC_SH_l3_64_shadow);                                    \
 15.1674 +    for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ )              \
 15.1675 +    {                                                                   \
 15.1676 +        (_sl3e) = _sp + _i;                                             \
 15.1677 +        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
 15.1678 +            {_code}                                                     \
 15.1679 +        if ( _done ) break;                                             \
 15.1680 +        increment_ptr_to_guest_entry(_gl3p);                            \
 15.1681 +    }                                                                   \
 15.1682 +    unmap_shadow_page(_sp);                                             \
 15.1683 +} while (0)
 15.1684 +
 15.1685 +/* 64-bit l4: avoid Xen mappings */
 15.1686 +#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code)  \
 15.1687 +do {                                                                    \
 15.1688 +    int _i;                                                             \
 15.1689 +    shadow_l4e_t *_sp = map_shadow_page((_sl4mfn));                     \
 15.1690 +    ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask)       \
 15.1691 +           == PGC_SH_l4_64_shadow);                                    \
 15.1692 +    for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ )              \
 15.1693 +    {                                                                   \
 15.1694 +        if ( (!(_xen)) || is_guest_l4_slot(_i) )                        \
 15.1695 +        {                                                               \
 15.1696 +            (_sl4e) = _sp + _i;                                         \
 15.1697 +            if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT )       \
 15.1698 +                {_code}                                                 \
 15.1699 +            if ( _done ) break;                                         \
 15.1700 +        }                                                               \
 15.1701 +        increment_ptr_to_guest_entry(_gl4p);                            \
 15.1702 +    }                                                                   \
 15.1703 +    unmap_shadow_page(_sp);                                             \
 15.1704 +} while (0)
 15.1705 +
 15.1706 +#endif
 15.1707 +
 15.1708 +
 15.1709 +
 15.1710 +/**************************************************************************/
 15.1711 +/* Functions to install Xen mappings and linear mappings in shadow pages */
 15.1712 +
 15.1713 +static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
 15.1714 +
 15.1715 +// XXX -- this function should probably be moved to shadow-common.c, but that
 15.1716 +//        probably wants to wait until the shadow types have been moved from
 15.1717 +//        shadow-types.h to shadow-private.h
 15.1718 +//
 15.1719 +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
 15.1720 +void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
 15.1721 +{
 15.1722 +    struct domain *d = v->domain;
 15.1723 +    shadow_l4e_t *sl4e;
 15.1724 +
 15.1725 +    sl4e = sh_map_domain_page(sl4mfn);
 15.1726 +    ASSERT(sl4e != NULL);
 15.1727 +    ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
 15.1728 +    
 15.1729 +    /* Copy the common Xen mappings from the idle domain */
 15.1730 +    memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
 15.1731 +           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
 15.1732 +           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
 15.1733 +
 15.1734 +    /* Install the per-domain mappings for this domain */
 15.1735 +    sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
 15.1736 +        shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
 15.1737 +                            __PAGE_HYPERVISOR);
 15.1738 +
 15.1739 +    /* Linear mapping */
 15.1740 +    sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
 15.1741 +        shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
 15.1742 +    sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
 15.1743 +        shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
 15.1744 +
 15.1745 +    if ( shadow_mode_translate(v->domain) )
 15.1746 +    {
 15.1747 +        /* install domain-specific P2M table */
 15.1748 +        sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
 15.1749 +            shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
 15.1750 +                                __PAGE_HYPERVISOR);
 15.1751 +    }
 15.1752 +
 15.1753 +    sh_unmap_domain_page(sl4e);    
 15.1754 +}
 15.1755 +#endif
 15.1756 +
 15.1757 +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
 15.1758 +// For 3-on-3 PV guests, we need to make sure the Xen mappings are in
 15.1759 +// place, which means that we need to populate the l2h entry in the l3
 15.1760 +// table.
 15.1761 +
 15.1762 +void sh_install_xen_entries_in_l2h(struct vcpu *v, 
 15.1763 +                                    mfn_t sl2hmfn)
 15.1764 +{
 15.1765 +    struct domain *d = v->domain;
 15.1766 +    shadow_l2e_t *sl2e;
 15.1767 +    int i;
 15.1768 +
 15.1769 +    sl2e = sh_map_domain_page(sl2hmfn);
 15.1770 +    ASSERT(sl2e != NULL);
 15.1771 +    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
 15.1772 +    
 15.1773 +    /* Copy the common Xen mappings from the idle domain */
 15.1774 +    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
 15.1775 +           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
 15.1776 +           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
 15.1777 +
 15.1778 +    /* Install the per-domain mappings for this domain */
 15.1779 +    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
 15.1780 +        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
 15.1781 +            shadow_l2e_from_mfn(
 15.1782 +                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
 15.1783 +                __PAGE_HYPERVISOR);
 15.1784 +    
 15.1785 +    /* We don't set up a linear mapping here because we can't until this
 15.1786 +     * l2h is installed in an l3e.  sh_update_linear_entries() handles
 15.1787 +     * the linear mappings when the l3 is loaded. */
 15.1788 +
 15.1789 +    if ( shadow_mode_translate(d) )
 15.1790 +    {
 15.1791 +        /* Install the domain-specific p2m table */
 15.1792 +        l3_pgentry_t *p2m;
 15.1793 +        ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
 15.1794 +        p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 15.1795 +        for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
 15.1796 +        {
 15.1797 +            sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
 15.1798 +                shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
 15.1799 +                                    __PAGE_HYPERVISOR);
 15.1800 +        }
 15.1801 +        sh_unmap_domain_page(p2m);
 15.1802 +    }
 15.1803 +    
 15.1804 +    sh_unmap_domain_page(sl2e);
 15.1805 +}
 15.1806 +
 15.1807 +void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
 15.1808 +{
 15.1809 +    shadow_l3e_t *sl3e;
 15.1810 +    guest_l3e_t *gl3e = v->arch.guest_vtable;
 15.1811 +    shadow_l3e_t new_sl3e;
 15.1812 +    gfn_t l2gfn;
 15.1813 +    mfn_t l2gmfn, l2smfn;
 15.1814 +    int r;
 15.1815 +
 15.1816 +    ASSERT(!shadow_mode_external(v->domain));
 15.1817 +    ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
 15.1818 +    l2gfn = guest_l3e_get_gfn(gl3e[3]);
 15.1819 +    l2gmfn = sh_gfn_to_mfn(v->domain, gfn_x(l2gfn));
 15.1820 +    l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow);
 15.1821 +    if ( !valid_mfn(l2smfn) )
 15.1822 +    {
 15.1823 +        l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow);
 15.1824 +    }
 15.1825 +    l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
 15.1826 +                             ft_prefetch);
 15.1827 +    sl3e = sh_map_domain_page(sl3mfn);
 15.1828 +    r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
 15.1829 +    sh_unmap_domain_page(sl3e);
 15.1830 +}
 15.1831 +#endif
 15.1832 +
 15.1833 +
 15.1834 +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
 15.1835 +void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
 15.1836 +{
 15.1837 +    struct domain *d = v->domain;
 15.1838 +    shadow_l2e_t *sl2e;
 15.1839 +    int i;
 15.1840 +
 15.1841 +    sl2e = sh_map_domain_page(sl2mfn);
 15.1842 +    ASSERT(sl2e != NULL);
 15.1843 +    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
 15.1844 +    
 15.1845 +    /* Copy the common Xen mappings from the idle domain */
 15.1846 +    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
 15.1847 +           &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
 15.1848 +           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
 15.1849 +
 15.1850 +    /* Install the per-domain mappings for this domain */
 15.1851 +    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
 15.1852 +        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
 15.1853 +            shadow_l2e_from_mfn(
 15.1854 +                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
 15.1855 +                __PAGE_HYPERVISOR);
 15.1856 +
 15.1857 +    /* Linear mapping */
 15.1858 +    sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
 15.1859 +        shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
 15.1860 +    sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
 15.1861 +        shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
 15.1862 +
 15.1863 +    if ( shadow_mode_translate(d) )
 15.1864 +    {
 15.1865 +        /* install domain-specific P2M table */
 15.1866 +        sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
 15.1867 +            shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
 15.1868 +                                __PAGE_HYPERVISOR);
 15.1869 +    }
 15.1870 +
 15.1871 +    sh_unmap_domain_page(sl2e);
 15.1872 +}
 15.1873 +#endif
 15.1874 +
 15.1875 +
 15.1876 +
 15.1877 +
 15.1878 +
 15.1879 +/**************************************************************************/
 15.1880 +/* Create a shadow of a given guest page.
 15.1881 + */
 15.1882 +static mfn_t
 15.1883 +sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
 15.1884 +{
 15.1885 +    mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
 15.1886 +    SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
 15.1887 +                  mfn_x(gmfn), shadow_type, mfn_x(smfn));
 15.1888 +
 15.1889 +    if ( shadow_type != PGC_SH_guest_root_type )
 15.1890 +        /* Lower-level shadow, not yet linked from a higher level */
 15.1891 +        mfn_to_page(smfn)->up = 0;
 15.1892 +
 15.1893 +    // Create the Xen mappings...
 15.1894 +    if ( !shadow_mode_external(v->domain) )
 15.1895 +    {
 15.1896 +        switch (shadow_type) 
 15.1897 +        {
 15.1898 +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
 15.1899 +        case PGC_SH_l4_shadow:
 15.1900 +            sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
 15.1901 +#endif
 15.1902 +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
 15.1903 +        case PGC_SH_l3_shadow:
 15.1904 +            sh_install_xen_entries_in_l3(v, gmfn, smfn); break;
 15.1905 +        case PGC_SH_l2h_shadow:
 15.1906 +            sh_install_xen_entries_in_l2h(v, smfn); break;
 15.1907 +#endif
 15.1908 +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
 15.1909 +        case PGC_SH_l2_shadow:
 15.1910 +            sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
 15.1911 +#endif
 15.1912 +        default: /* Do nothing */ break;
 15.1913 +        }
 15.1914 +    }
 15.1915 +    
 15.1916 +    shadow_promote(v, gmfn, shadow_type);
 15.1917 +    set_shadow_status(v, gmfn, shadow_type, smfn);
 15.1918 +
 15.1919 +    return smfn;
 15.1920 +}
 15.1921 +
 15.1922 +/* Make a splintered superpage shadow */
 15.1923 +static mfn_t
 15.1924 +make_fl1_shadow(struct vcpu *v, gfn_t gfn)
 15.1925 +{
 15.1926 +    mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow,
 15.1927 +                               (unsigned long) gfn_x(gfn));
 15.1928 +
 15.1929 +    SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
 15.1930 +                  gfn_x(gfn), mfn_x(smfn));
 15.1931 +
 15.1932 +    set_fl1_shadow_status(v, gfn, smfn);
 15.1933 +    return smfn;
 15.1934 +}
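/* A note on "splintering": a guest superpage mapping has no guest l1 to
 * shadow, so it is shadowed by an fl1 -- an ordinary l1's worth of 4k
 * entries.  Because there is no guest l1 mfn to key on, the fl1 is looked
 * up by the superpage's gfn (set_fl1_shadow_status() above,
 * get_fl1_shadow_status() in shadow_get_and_create_l1e() below). */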
 15.1935 +
 15.1936 +
 15.1937 +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
 15.1938 +mfn_t
 15.1939 +sh_make_monitor_table(struct vcpu *v)
 15.1940 +{
 15.1941 +
 15.1942 +    ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
 15.1943 +    
 15.1944 +#if CONFIG_PAGING_LEVELS == 4    
 15.1945 +    {
 15.1946 +        struct domain *d = v->domain;
 15.1947 +        mfn_t m4mfn;
 15.1948 +        m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 15.1949 +        sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
 15.1950 +        /* Remember the level of this table */
 15.1951 +        mfn_to_page(m4mfn)->shadow_flags = 4;
 15.1952 +#if SHADOW_PAGING_LEVELS < 4
 15.1953 +        // Install a monitor l3 table in slot 0 of the l4 table.
 15.1954 +        // This is used for shadow linear maps.
 15.1955 +        {
 15.1956 +            mfn_t m3mfn; 
 15.1957 +            l4_pgentry_t *l4e;
 15.1958 +            m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 15.1959 +            mfn_to_page(m3mfn)->shadow_flags = 3;
 15.1960 +            l4e = sh_map_domain_page(m4mfn);
 15.1961 +            l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
 15.1962 +            sh_unmap_domain_page(l4e);
 15.1963 +        }
 15.1964 +#endif /* SHADOW_PAGING_LEVELS < 4 */
 15.1965 +        return m4mfn;
 15.1966 +    }
 15.1967 +
 15.1968 +#elif CONFIG_PAGING_LEVELS == 3
 15.1969 +
 15.1970 +    {
 15.1971 +        struct domain *d = v->domain;
 15.1972 +        mfn_t m3mfn, m2mfn; 
 15.1973 +        l3_pgentry_t *l3e;
 15.1974 +        l2_pgentry_t *l2e;
 15.1975 +        int i;
 15.1976 +
 15.1977 +        m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 15.1978 +        /* Remember the level of this table */
 15.1979 +        mfn_to_page(m3mfn)->shadow_flags = 3;
 15.1980 +
 15.1981 +        // Install a monitor l2 table in slot 3 of the l3 table.
 15.1982 +        // This is used for all Xen entries, including linear maps
 15.1983 +        m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 15.1984 +        mfn_to_page(m2mfn)->shadow_flags = 2;
 15.1985 +        l3e = sh_map_domain_page(m3mfn);
 15.1986 +        l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
 15.1987 +        sh_install_xen_entries_in_l2h(v, m2mfn);
 15.1988 +        /* Install the monitor's own linear map */
 15.1989 +        l2e = sh_map_domain_page(m2mfn);
 15.1990 +        for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
 15.1991 +            l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
 15.1992 +                (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) 
 15.1993 +                ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) 
 15.1994 +                : l2e_empty();
 15.1995 +        sh_unmap_domain_page(l2e);
 15.1996 +        sh_unmap_domain_page(l3e);
 15.1997 +
 15.1998 +        SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
 15.1999 +        return m3mfn;
 15.2000 +    }
 15.2001 +
 15.2002 +#elif CONFIG_PAGING_LEVELS == 2
 15.2003 +
 15.2004 +    {
 15.2005 +        struct domain *d = v->domain;
 15.2006 +        mfn_t m2mfn;
 15.2007 +        m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 15.2008 +        sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
 15.2009 +        /* Remember the level of this table */
 15.2010 +        mfn_to_page(m2mfn)->shadow_flags = 2;
 15.2011 +        return m2mfn;
 15.2012 +    }
 15.2013 +
 15.2014 +#else
 15.2015 +#error this should not happen
 15.2016 +#endif /* CONFIG_PAGING_LEVELS */
 15.2017 +}
 15.2018 +#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
 15.2019 +
 15.2020 +/**************************************************************************/
 15.2021 +/* These functions also take a virtual address and return the level-N
 15.2022 + * shadow table mfn and entry, but they create the shadow pagetables if
 15.2023 + * they are needed.  The fetch_type_t argument says whether we are
 15.2024 + * handling a demand fault (so we know what to do about accessed bits &c).
 15.2025 + * If the necessary tables are not present in the guest, they return NULL. */
 15.2026 +#if GUEST_PAGING_LEVELS >= 4
 15.2027 +static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, 
 15.2028 +                                                walk_t *gw, 
 15.2029 +                                                mfn_t *sl4mfn)
 15.2030 +{
 15.2031 +    /* There is always a shadow of the top level table.  Get it. */
 15.2032 +    *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
 15.2033 +    /* Reading the top level table is always valid. */
 15.2034 +    return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
 15.2035 +}
 15.2036 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 15.2037 +
 15.2038 +
 15.2039 +#if GUEST_PAGING_LEVELS >= 3
 15.2040 +static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, 
 15.2041 +                                                walk_t *gw, 
 15.2042 +                                                mfn_t *sl3mfn,
 15.2043 +                                                fetch_type_t ft)
 15.2044 +{
 15.2045 +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
 15.2046 +    mfn_t sl4mfn;
 15.2047 +    shadow_l4e_t *sl4e;
 15.2048 +    if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
 15.2049 +    /* Get the l4e */
 15.2050 +    sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
 15.2051 +    ASSERT(sl4e != NULL);
 15.2052 +    if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
 15.2053 +    {
 15.2054 +        *sl3mfn = shadow_l4e_get_mfn(*sl4e);
 15.2055 +        ASSERT(valid_mfn(*sl3mfn));
 15.2056 +    } 
 15.2057 +    else 
 15.2058 +    {
 15.2059 +        int r;
 15.2060 +        shadow_l4e_t new_sl4e;
 15.2061 +        /* No l3 shadow installed: find and install it. */
 15.2062 +        *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow);
 15.2063 +        if ( !valid_mfn(*sl3mfn) ) 
 15.2064 +        {
 15.2065 +            /* No l3 shadow of this page exists at all: make one. */
 15.2066 +            *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow);
 15.2067 +        }
 15.2068 +        /* Install the new sl3 table in the sl4e */
 15.2069 +        l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, 
 15.2070 +                                 *sl3mfn, &new_sl4e, ft);
 15.2071 +        r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
 15.2072 +        ASSERT((r & SHADOW_SET_FLUSH) == 0);
 15.2073 +    }
 15.2074 +    /* Now follow it down a level.  Guaranteed to succeed. */
 15.2075 +    return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
 15.2076 +#else /* PAE... */
 15.2077 +    /* There is always a shadow of the top level table.  Get it. */
 15.2078 +    *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
 15.2079 +    /* This next line is important: the shadow l3 table is in an 8k
 15.2080 +     * shadow and we need to return the right mfn of the pair. This call
 15.2081 +     * will set it for us as a side-effect. */
 15.2082 +    (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
 15.2083 +    ASSERT(v->arch.shadow_vtable);
 15.2084 +    return ((shadow_l3e_t *)v->arch.shadow_vtable) 
 15.2085 +        + shadow_l3_table_offset(gw->va);
 15.2086 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 15.2087 +}
 15.2088 +#endif /* GUEST_PAGING_LEVELS >= 3 */
 15.2089 +
 15.2090 +
 15.2091 +static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, 
 15.2092 +                                                walk_t *gw, 
 15.2093 +                                                mfn_t *sl2mfn,
 15.2094 +                                                fetch_type_t ft)
 15.2095 +{
 15.2096 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
 15.2097 +    mfn_t sl3mfn = _mfn(INVALID_MFN);
 15.2098 +    shadow_l3e_t *sl3e;
 15.2099 +    if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
 15.2100 +    /* Get the l3e */
 15.2101 +    sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
 15.2102 +    ASSERT(sl3e != NULL);  /* Since we know guest PT is valid this far */
 15.2103 +    if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
 15.2104 +    {
 15.2105 +        *sl2mfn = shadow_l3e_get_mfn(*sl3e);
 15.2106 +        ASSERT(valid_mfn(*sl2mfn));
 15.2107 +    } 
 15.2108 +    else 
 15.2109 +    {
 15.2110 +        int r;
 15.2111 +        shadow_l3e_t new_sl3e;
 15.2112 +        /* No l2 shadow installed: find and install it. */
 15.2113 +        *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow);
 15.2114 +        if ( !valid_mfn(*sl2mfn) ) 
 15.2115 +        {
 15.2116 +            /* No l2 shadow of this page exists at all: make one. */
 15.2117 +            *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow);
 15.2118 +        }
 15.2119 +        /* Install the new sl2 table in the sl3e */
 15.2120 +        l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, 
 15.2121 +                                 *sl2mfn, &new_sl3e, ft);
 15.2122 +        r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
 15.2123 +        ASSERT((r & SHADOW_SET_FLUSH) == 0);
 15.2124 +#if GUEST_PAGING_LEVELS == 3 
 15.2125 +        /* Need to sync up the linear maps, as we are about to use them */
 15.2126 +        ASSERT( r & SHADOW_SET_L3PAE_RECOPY );
 15.2127 +        sh_pae_recopy(v->domain);
 15.2128 +#endif
 15.2129 +    }
 15.2130 +    /* Now follow it down a level.  Guaranteed to succeed. */
 15.2131 +    return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
 15.2132 +#else /* 32bit... */
 15.2133 +    /* There is always a shadow of the top level table.  Get it. */
 15.2134 +    *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
 15.2135 +    /* This next line is important: the guest l2 has a 16k
 15.2136 +     * shadow, and we need to return the right mfn of the four. This
 15.2137 +     * call will set it for us as a side-effect. */
 15.2138 +    (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
 15.2139 +    /* Reading the top level table is always valid. */
 15.2140 +    return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
 15.2141 +#endif 
 15.2142 +}
 15.2143 +
 15.2144 +
 15.2145 +static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, 
 15.2146 +                                                walk_t *gw, 
 15.2147 +                                                mfn_t *sl1mfn,
 15.2148 +                                                fetch_type_t ft)
 15.2149 +{
 15.2150 +    mfn_t sl2mfn;
 15.2151 +    shadow_l2e_t *sl2e;
 15.2152 +
 15.2153 +    /* Get the l2e */
 15.2154 +    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
 15.2155 +    if ( sl2e == NULL ) return NULL;
 15.2156 +    if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) 
 15.2157 +    {
 15.2158 +        *sl1mfn = shadow_l2e_get_mfn(*sl2e);
 15.2159 +        ASSERT(valid_mfn(*sl1mfn));
 15.2160 +    } 
 15.2161 +    else 
 15.2162 +    {
 15.2163 +        shadow_l2e_t new_sl2e;
 15.2164 +        int r, flags = guest_l2e_get_flags(*gw->l2e);
 15.2165 +        /* No l1 shadow installed: find and install it. */
 15.2166 +        if ( !(flags & _PAGE_PRESENT) )
 15.2167 +            return NULL; /* No guest page. */
 15.2168 +        if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) 
 15.2169 +        {
 15.2170 +            /* Splintering a superpage */
 15.2171 +            gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
 15.2172 +            *sl1mfn = get_fl1_shadow_status(v, l2gfn);
 15.2173 +            if ( !valid_mfn(*sl1mfn) ) 
 15.2174 +            {
 15.2175 +                /* No fl1 shadow of this superpage exists at all: make one. */
 15.2176 +                *sl1mfn = make_fl1_shadow(v, l2gfn);
 15.2177 +            }
 15.2178 +        } 
 15.2179 +        else 
 15.2180 +        {
 15.2181 +            /* Shadowing an actual guest l1 table */
 15.2182 +            if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
 15.2183 +            *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow);
 15.2184 +            if ( !valid_mfn(*sl1mfn) ) 
 15.2185 +            {
 15.2186 +                /* No l1 shadow of this page exists at all: make one. */
 15.2187 +                *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow);
 15.2188 +            }
 15.2189 +        }
 15.2190 +        /* Install the new sl1 table in the sl2e */
 15.2191 +        l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, 
 15.2192 +                                 *sl1mfn, &new_sl2e, ft);
 15.2193 +        r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
 15.2194 +        ASSERT((r & SHADOW_SET_FLUSH) == 0);        
 15.2195 +        /* This next line is important: in 32-on-PAE and 32-on-64 modes,
 15.2196 +         * the guest l1 table has an 8k shadow, and we need to return
 15.2197 +         * the right mfn of the pair. This call will set it for us as a
 15.2198 +         * side-effect.  (In all other cases, it's a no-op and will be
 15.2199 +         * compiled out.) */
 15.2200 +        (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
 15.2201 +    }
 15.2202 +    /* Now follow it down a level.  Guaranteed to succeed. */
 15.2203 +    return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
 15.2204 +}
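/* For orientation, a minimal sketch of the descent performed by the
 * functions above on a demand fault, assuming a 4-level guest (the PAE
 * and 32-bit branches differ as noted in their #else sections):
 *
 *   shadow_get_and_create_l1e()          - may build an l1 (or fl1) shadow
 *     -> shadow_get_and_create_l2e()     - may build an l2 shadow
 *       -> shadow_get_and_create_l3e()   - may build an l3 shadow
 *         -> shadow_get_and_create_l4e() - top-level shadow always exists
 *
 * At each level the shadow is either found via its shadow-status lookup
 * or freshly created, then linked into the level above with
 * shadow_set_lXe(). */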
 15.2205 +
 15.2206 +
 15.2207 +
 15.2208 +/**************************************************************************/
 15.2209 +/* Destructors for shadow tables: 
 15.2210 + * Unregister the shadow, decrement refcounts of any entries present in it,
 15.2211 + * and release the memory.
 15.2212 + *
 15.2213 + * N.B. These destructors do not clear the contents of the shadows.
 15.2214 + *      This allows us to delay TLB shootdowns until the page is being reused.
 15.2215 + *      See shadow_alloc() and shadow_free() for how this is handled.
 15.2216 + */
 15.2217 +
 15.2218 +#if GUEST_PAGING_LEVELS >= 4
 15.2219 +void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
 15.2220 +{
 15.2221 +    shadow_l4e_t *sl4e;
 15.2222 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 15.2223 +    mfn_t gmfn, sl4mfn;
 15.2224 +    int xen_mappings;
 15.2225 +
 15.2226 +    SHADOW_DEBUG(DESTROY_SHADOW,
 15.2227 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 15.2228 +    ASSERT(t == PGC_SH_l4_shadow);
 15.2229 +
 15.2230 +    /* Record that the guest page isn't shadowed any more (in this type) */
 15.2231 +    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 15.2232 +    delete_shadow_status(v, gmfn, t, smfn);
 15.2233 +    shadow_demote(v, gmfn, t);
 15.2234 +    /* Take this shadow off the list of root shadows */
 15.2235 +    list_del_init(&mfn_to_page(smfn)->list);
 15.2236 +
 15.2237 +    /* Decrement refcounts of all the old entries */
 15.2238 +    xen_mappings = (!shadow_mode_external(v->domain));
 15.2239 +    sl4mfn = smfn; 
 15.2240 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
 15.2241 +        if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
 15.2242 +        {
 15.2243 +            sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
 15.2244 +                        (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
 15.2245 +                        | ((unsigned long)sl4e & ~PAGE_MASK));
 15.2246 +        }
 15.2247 +    });
 15.2248 +    
 15.2249 +    /* Put the memory back in the pool */
 15.2250 +    shadow_free(v->domain, smfn);
 15.2251 +}
 15.2252 +#endif    
 15.2253 +
 15.2254 +#if GUEST_PAGING_LEVELS >= 3
 15.2255 +void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
 15.2256 +{
 15.2257 +    shadow_l3e_t *sl3e;
 15.2258 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 15.2259 +    mfn_t gmfn, sl3mfn;
 15.2260 +
 15.2261 +    SHADOW_DEBUG(DESTROY_SHADOW,
 15.2262 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 15.2263 +    ASSERT(t == PGC_SH_l3_shadow);
 15.2264 +
 15.2265 +    /* Record that the guest page isn't shadowed any more (in this type) */
 15.2266 +    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 15.2267 +    delete_shadow_status(v, gmfn, t, smfn);
 15.2268 +    shadow_demote(v, gmfn, t);
 15.2269 +#if GUEST_PAGING_LEVELS == 3
 15.2270 +    /* Take this shadow off the list of root shadows */
 15.2271 +    list_del_init(&mfn_to_page(smfn)->list);
 15.2272 +#endif
 15.2273 +
 15.2274 +    /* Decrement refcounts of all the old entries */
 15.2275 +    sl3mfn = smfn; 
 15.2276 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
 15.2277 +        if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
 15.2278 +            sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
 15.2279 +                        (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) 
 15.2280 +                        | ((unsigned long)sl3e & ~PAGE_MASK));
 15.2281 +    });
 15.2282 +
 15.2283 +    /* Put the memory back in the pool */
 15.2284 +    shadow_free(v->domain, smfn);
 15.2285 +}
 15.2286 +#endif    
 15.2287 +
 15.2288 +
 15.2289 +#if GUEST_PAGING_LEVELS == 3
 15.2290 +static void sh_destroy_l3_subshadow(struct vcpu *v, 
 15.2291 +                                     shadow_l3e_t *sl3e)
 15.2292 +/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
 15.2293 +{
 15.2294 +    int i;
 15.2295 +    ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); 
 15.2296 +    for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) 
 15.2297 +        if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) 
 15.2298 +            sh_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
 15.2299 +                        maddr_from_mapped_domain_page(sl3e));
 15.2300 +}
 15.2301 +#endif
 15.2302 +
 15.2303 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 15.2304 +void sh_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
 15.2305 +/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
 15.2306 +{
 15.2307 +    int i, j;
 15.2308 +    struct pae_l3_bookkeeping *bk;
 15.2309 +    
 15.2310 +    ASSERT((mfn_to_page(smfn)->count_info & PGC_SH_type_mask) 
 15.2311 +           == PGC_SH_l3_pae_shadow);
 15.2312 +    /* The subshadows are split, 64 on each page of the shadow */
 15.2313 +    for ( i = 0; i < 2; i++ ) 
 15.2314 +    {
 15.2315 +        void *p = sh_map_domain_page(_mfn(mfn_x(smfn) + i));
 15.2316 +        for ( j = 0; j < 64; j++ )
 15.2317 +        {
 15.2318 +            /* Every second 32-byte region is a bookkeeping entry */
 15.2319 +            bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
 15.2320 +            if ( bk->pinned )
 15.2321 +                sh_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
 15.2322 +            /* Check whether we've just freed the whole shadow */
 15.2323 +            if ( (mfn_to_page(smfn)->count_info & PGC_SH_count_mask) == 0 ) 
 15.2324 +            {
 15.2325 +                sh_unmap_domain_page(p);
 15.2326 +                return;
 15.2327 +            }
 15.2328 +        }
 15.2329 +        sh_unmap_domain_page(p);
 15.2330 +    }
 15.2331 +}
 15.2332 +#endif
 15.2333 +
 15.2334 +void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
 15.2335 +{
 15.2336 +    shadow_l2e_t *sl2e;
 15.2337 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 15.2338 +    mfn_t gmfn, sl2mfn;
 15.2339 +    int xen_mappings;
 15.2340 +
 15.2341 +    SHADOW_DEBUG(DESTROY_SHADOW,
 15.2342 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 15.2343 +    ASSERT(t == PGC_SH_l2_shadow 
 15.2344 +           || t == PGC_SH_l2h_pae_shadow);
 15.2345 +
 15.2346 +    /* Record that the guest page isn't shadowed any more (in this type) */
 15.2347 +    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 15.2348 +    delete_shadow_status(v, gmfn, t, smfn);
 15.2349 +    shadow_demote(v, gmfn, t);
 15.2350 +#if GUEST_PAGING_LEVELS == 2
 15.2351 +    /* Take this shadow off the list of root shadows */
 15.2352 +    list_del_init(&mfn_to_page(smfn)->list);
 15.2353 +#endif
 15.2354 +
 15.2355 +    /* Decrement refcounts of all the old entries */
 15.2356 +    sl2mfn = smfn;
 15.2357 +    xen_mappings = (!shadow_mode_external(v->domain) &&
 15.2358 +                    ((GUEST_PAGING_LEVELS == 2) ||
 15.2359 +                     ((GUEST_PAGING_LEVELS == 3) &&
 15.2360 +                      (t == PGC_SH_l2h_pae_shadow))));
 15.2361 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
 15.2362 +        if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) 
 15.2363 +            sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
 15.2364 +                        (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) 
 15.2365 +                        | ((unsigned long)sl2e & ~PAGE_MASK));
 15.2366 +    });
 15.2367 +
 15.2368 +    /* Put the memory back in the pool */
 15.2369 +    shadow_free(v->domain, smfn);
 15.2370 +}
 15.2371 +
 15.2372 +void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
 15.2373 +{
 15.2374 +    struct domain *d = v->domain;
 15.2375 +    shadow_l1e_t *sl1e;
 15.2376 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 15.2377 +
 15.2378 +    SHADOW_DEBUG(DESTROY_SHADOW,
 15.2379 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 15.2380 +    ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow);
 15.2381 +
 15.2382 +    /* Record that the guest page isn't shadowed any more (in this type) */
 15.2383 +    if ( t == PGC_SH_fl1_shadow )
 15.2384 +    {
 15.2385 +        gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
 15.2386 +        delete_fl1_shadow_status(v, gfn, smfn);
 15.2387 +    }
 15.2388 +    else 
 15.2389 +    {
 15.2390 +        mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 15.2391 +        delete_shadow_status(v, gmfn, t, smfn);
 15.2392 +        shadow_demote(v, gmfn, t);
 15.2393 +    }
 15.2394 +    
 15.2395 +    if ( shadow_mode_refcounts(d) )
 15.2396 +    {
 15.2397 +        /* Decrement refcounts of all the old entries */
 15.2398 +        mfn_t sl1mfn = smfn; 
 15.2399 +        SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
 15.2400 +            if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) 
 15.2401 +                shadow_put_page_from_l1e(*sl1e, d);
 15.2402 +        });
 15.2403 +    }
 15.2404 +    
 15.2405 +    /* Put the memory back in the pool */
 15.2406 +    shadow_free(v->domain, smfn);
 15.2407 +}
 15.2408 +
 15.2409 +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
 15.2410 +void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
 15.2411 +{
 15.2412 +    struct domain *d = v->domain;
 15.2413 +    ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask)
 15.2414 +           == PGC_SH_monitor_table);
 15.2415 +
 15.2416 +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
 15.2417 +    /* Need to destroy the l3 monitor page in slot 0 too */
 15.2418 +    {
 15.2419 +        l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
 15.2420 +        ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
 15.2421 +        shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
 15.2422 +        sh_unmap_domain_page(l4e);
 15.2423 +    }
 15.2424 +#elif CONFIG_PAGING_LEVELS == 3
 15.2425 +    /* Need to destroy the l2 monitor page in slot 3 too */
 15.2426 +    {
 15.2427 +        l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
 15.2428 +        ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
 15.2429 +        shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
 15.2430 +        sh_unmap_domain_page(l3e);
 15.2431 +    }
 15.2432 +#endif
 15.2433 +
 15.2434 +    /* Put the memory back in the pool */
 15.2435 +    shadow_free(d, mmfn);
 15.2436 +}
 15.2437 +#endif
 15.2438 +
 15.2439 +/**************************************************************************/
 15.2440 +/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
 15.2441 + * These are called from common code when we are running out of shadow
 15.2442 + * memory, and unpinning all the top-level shadows hasn't worked. 
 15.2443 + *
 15.2444 + * This implementation is pretty crude and slow, but we hope that it won't 
 15.2445 + * be called very often. */
 15.2446 +
 15.2447 +#if GUEST_PAGING_LEVELS == 2
 15.2448 +
 15.2449 +void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
 15.2450 +{    
 15.2451 +    shadow_l2e_t *sl2e;
 15.2452 +    int xen_mappings = !shadow_mode_external(v->domain);
 15.2453 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
 15.2454 +        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
 15.2455 +    });
 15.2456 +}
 15.2457 +
 15.2458 +#elif GUEST_PAGING_LEVELS == 3
 15.2459 +
 15.2460 +void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
 15.2461 +/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
 15.2462 +{
 15.2463 +    shadow_l3e_t *sl3e;
 15.2464 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
 15.2465 +        if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
 15.2466 +            mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
 15.2467 +            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) 
 15.2468 +                 == PGC_SH_l2h_pae_shadow ) 
 15.2469 +            {
 15.2470 +                /* High l2: need to pick particular l2es to unhook */
 15.2471 +                shadow_l2e_t *sl2e;
 15.2472 +                SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
 15.2473 +                    (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
 15.2474 +                });
 15.2475 +            }
 15.2476 +            else
 15.2477 +            {
 15.2478 +                /* Normal l2: can safely unhook the whole l3e */
 15.2479 +                (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
 15.2480 +            }
 15.2481 +        }
 15.2482 +    });
 15.2483 +    /* We've changed PAE L3 entries: must sync up various copies of them */
 15.2484 +    sh_pae_recopy(v->domain);
 15.2485 +}
 15.2486 +
 15.2487 +#elif GUEST_PAGING_LEVELS == 4
 15.2488 +
 15.2489 +void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
 15.2490 +{
 15.2491 +    shadow_l4e_t *sl4e;
 15.2492 +    int xen_mappings = !shadow_mode_external(v->domain);
 15.2493 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
 15.2494 +        (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
 15.2495 +    });
 15.2496 +}
 15.2497 +
 15.2498 +#endif
 15.2499 +
 15.2500 +/**************************************************************************/
 15.2501 +/* Internal translation functions.
 15.2502 + * These functions require a pointer to the shadow entry that will be updated.
 15.2503 + */
 15.2504 +
 15.2505 +/* These functions take a new guest entry, translate it to shadow and write 
 15.2506 + * the shadow entry.
 15.2507 + *
 15.2508 + * They return the same bitmaps as the shadow_set_lXe() functions.
 15.2509 + */
 15.2510 +
 15.2511 +#if GUEST_PAGING_LEVELS >= 4
 15.2512 +static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
 15.2513 +{
 15.2514 +    shadow_l4e_t new_sl4e;
 15.2515 +    guest_l4e_t *new_gl4e = new_ge;
 15.2516 +    shadow_l4e_t *sl4p = se;
 15.2517 +    mfn_t sl3mfn = _mfn(INVALID_MFN);
 15.2518 +    int result = 0;
 15.2519 +
 15.2520 +    perfc_incrc(shadow_validate_gl4e_calls);
 15.2521 +
 15.2522 +    if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
 15.2523 +    {
 15.2524 +        gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
 15.2525 +        mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
 15.2526 +        if ( valid_mfn(gl3mfn) )
 15.2527 +            sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow);
 15.2528 +        else
 15.2529 +            result |= SHADOW_SET_ERROR;
 15.2530 +    }
 15.2531 +    l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
 15.2532 +                             sl3mfn, &new_sl4e, ft_prefetch);
 15.2533 +    result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
 15.2534 +    return result;
 15.2535 +}
 15.2536 +#endif // GUEST_PAGING_LEVELS >= 4
 15.2537 +
 15.2538 +#if GUEST_PAGING_LEVELS >= 3
 15.2539 +static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
 15.2540 +{
 15.2541 +    shadow_l3e_t new_sl3e;
 15.2542 +    guest_l3e_t *new_gl3e = new_ge;
 15.2543 +    shadow_l3e_t *sl3p = se;
 15.2544 +    mfn_t sl2mfn = _mfn(INVALID_MFN);
 15.2545 +    int result = 0;
 15.2546 +
 15.2547 +    perfc_incrc(shadow_validate_gl3e_calls);
 15.2548 +
 15.2549 +    if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
 15.2550 +    {
 15.2551 +        gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
 15.2552 +        mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
 15.2553 +        if ( valid_mfn(gl2mfn) )
 15.2554 +            sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow);
 15.2555 +        else
 15.2556 +            result |= SHADOW_SET_ERROR;
 15.2557 +    }
 15.2558 +    l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), 
 15.2559 +                             sl2mfn, &new_sl3e, ft_prefetch);
 15.2560 +    result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
 15.2561 +
 15.2562 +#if GUEST_PAGING_LEVELS == 3
 15.2563 +    /* We have changed a PAE l3 entry: need to sync up the possible copies 
 15.2564 +     * of it */
 15.2565 +    if ( result & SHADOW_SET_L3PAE_RECOPY )
 15.2566 +        sh_pae_recopy(v->domain);
 15.2567 +#endif
 15.2568 +
 15.2569 +    return result;
 15.2570 +}
 15.2571 +#endif // GUEST_PAGING_LEVELS >= 3
 15.2572 +
 15.2573 +static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
 15.2574 +{
 15.2575 +    shadow_l2e_t new_sl2e;
 15.2576 +    guest_l2e_t *new_gl2e = new_ge;
 15.2577 +    shadow_l2e_t *sl2p = se;
 15.2578 +    mfn_t sl1mfn = _mfn(INVALID_MFN);
 15.2579 +    int result = 0;
 15.2580 +
 15.2581 +    perfc_incrc(shadow_validate_gl2e_calls);
 15.2582 +
 15.2583 +    if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
 15.2584 +    {
 15.2585 +        gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
 15.2586 +        if ( guest_supports_superpages(v) &&
 15.2587 +             (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
 15.2588 +        {
 15.2589 +            // superpage -- need to look up the shadow L1 which holds the
 15.2590 +            // splinters...
 15.2591 +            sl1mfn = get_fl1_shadow_status(v, gl1gfn);
 15.2592 +#if 0
 15.2593 +            // XXX - it's possible that we want to do some kind of prefetch
 15.2594 +            // for superpage fl1's here, but this is *not* on the demand path,
 15.2595 +            // so we'll hold off trying that for now...
 15.2596 +            //
 15.2597 +            if ( !valid_mfn(sl1mfn) )
 15.2598 +                sl1mfn = make_fl1_shadow(v, gl1gfn);
 15.2599 +#endif
 15.2600 +        }
 15.2601 +        else
 15.2602 +        {
 15.2603 +            mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
 15.2604 +            if ( valid_mfn(gl1mfn) )
 15.2605 +                sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow);
 15.2606 +            else
 15.2607 +                result |= SHADOW_SET_ERROR;
 15.2608 +        }
 15.2609 +    }
 15.2610 +    l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
 15.2611 +                             sl1mfn, &new_sl2e, ft_prefetch);
 15.2612 +    result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
 15.2613 +
 15.2614 +    return result;
 15.2615 +}
 15.2616 +
 15.2617 +static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
 15.2618 +{
 15.2619 +    shadow_l1e_t new_sl1e;
 15.2620 +    guest_l1e_t *new_gl1e = new_ge;
 15.2621 +    shadow_l1e_t *sl1p = se;
 15.2622 +    gfn_t gfn;
 15.2623 +    mfn_t mfn;
 15.2624 +    int result = 0;
 15.2625 +
 15.2626 +    perfc_incrc(shadow_validate_gl1e_calls);
 15.2627 +
 15.2628 +    gfn = guest_l1e_get_gfn(*new_gl1e);
 15.2629 +    mfn = vcpu_gfn_to_mfn(v, gfn);
 15.2630 +
 15.2631 +    l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, 
 15.2632 +                             /* mmio? */ !valid_mfn(mfn));
 15.2633 +    
 15.2634 +    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
 15.2635 +    return result;
 15.2636 +}
 15.2637 +
 15.2638 +
 15.2639 +/**************************************************************************/
 15.2640 +/* Functions which translate and install the shadows of arbitrary guest 
 15.2641 + * entries that we have just seen the guest write. */
 15.2642 +
 15.2643 +
 15.2644 +static inline int 
 15.2645 +sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
 15.2646 +                     void *new_gp, u32 size, u32 sh_type, 
 15.2647 +                     u32 (*shadow_index)(mfn_t *smfn, u32 idx),
 15.2648 +                     int (*validate_ge)(struct vcpu *v, void *ge, 
 15.2649 +                                        mfn_t smfn, void *se))
 15.2650 +/* Generic function for mapping and validating. */
 15.2651 +{
 15.2652 +    mfn_t smfn, smfn2, map_mfn;
 15.2653 +    shadow_l1e_t *sl1p;
 15.2654 +    u32 shadow_idx, guest_idx;
 15.2655 +    int result = 0;
 15.2656 +
 15.2657 +    /* Align address and size to guest entry boundaries */
 15.2658 +    size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
 15.2659 +    new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
 15.2660 +    size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
 15.2661 +    ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
 15.2662 +
 15.2663 +    /* Map the shadow page */
 15.2664 +    smfn = get_shadow_status(v, gmfn, sh_type);
 15.2665 +    ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
 15.2666 +    guest_idx = guest_index(new_gp);
 15.2667 +    map_mfn = smfn;
 15.2668 +    shadow_idx = shadow_index(&map_mfn, guest_idx);
 15.2669 +    sl1p = map_shadow_page(map_mfn);
 15.2670 +
 15.2671 +    /* Validate one entry at a time */
 15.2672 +    while ( size )
 15.2673 +    {
 15.2674 +        smfn2 = smfn;
 15.2675 +        guest_idx = guest_index(new_gp);
 15.2676 +        shadow_idx = shadow_index(&smfn2, guest_idx);
 15.2677 +        if ( mfn_x(smfn2) != mfn_x(map_mfn) )
 15.2678 +        {
 15.2679 +            /* We have moved to another page of the shadow */
 15.2680 +            map_mfn = smfn2;
 15.2681 +            unmap_shadow_page(sl1p);
 15.2682 +            sl1p = map_shadow_page(map_mfn);
 15.2683 +        }
 15.2684 +        result |= validate_ge(v,
 15.2685 +                              new_gp,
 15.2686 +                              map_mfn,
 15.2687 +                              &sl1p[shadow_idx]);
 15.2688 +        size -= sizeof(guest_l1e_t);
 15.2689 +        new_gp += sizeof(guest_l1e_t);
 15.2690 +    }
 15.2691 +    unmap_shadow_page(sl1p);
 15.2692 +    return result;
 15.2693 +}
 15.2694 +
 15.2695 +
 15.2696 +int
 15.2697 +sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
 15.2698 +                          void *new_gl4p, u32 size)
 15.2699 +{
 15.2700 +#if GUEST_PAGING_LEVELS >= 4
 15.2701 +    return sh_map_and_validate(v, gl4mfn, new_gl4p, size, 
 15.2702 +                                PGC_SH_l4_shadow, 
 15.2703 +                                shadow_l4_index, 
 15.2704 +                                validate_gl4e);
 15.2705 +#else // ! GUEST_PAGING_LEVELS >= 4
 15.2706 +    SHADOW_PRINTK("called in wrong paging mode!\n");
 15.2707 +    BUG();
 15.2708 +    return 0;
 15.2709 +#endif 
 15.2710 +}
 15.2711 +    
 15.2712 +int
 15.2713 +sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
 15.2714 +                          void *new_gl3p, u32 size)
 15.2715 +{
 15.2716 +#if GUEST_PAGING_LEVELS >= 3
 15.2717 +    return sh_map_and_validate(v, gl3mfn, new_gl3p, size, 
 15.2718 +                                PGC_SH_l3_shadow, 
 15.2719 +                                shadow_l3_index, 
 15.2720 +                                validate_gl3e);
 15.2721 +#else // ! GUEST_PAGING_LEVELS >= 3
 15.2722 +    SHADOW_PRINTK("called in wrong paging mode!\n");
 15.2723 +    BUG();
 15.2724 +    return 0;
 15.2725 +#endif
 15.2726 +}
 15.2727 +
 15.2728 +int
 15.2729 +sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
 15.2730 +                          void *new_gl2p, u32 size)
 15.2731 +{
 15.2732 +    return sh_map_and_validate(v, gl2mfn, new_gl2p, size, 
 15.2733 +                                PGC_SH_l2_shadow, 
 15.2734 +                                shadow_l2_index, 
 15.2735 +                                validate_gl2e);
 15.2736 +}
 15.2737 +
 15.2738 +int
 15.2739 +sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
 15.2740 +                           void *new_gl2p, u32 size)
 15.2741 +{
 15.2742 +#if GUEST_PAGING_LEVELS == 3
 15.2743 +    return sh_map_and_validate(v, gl2mfn, new_gl2p, size, 
 15.2744 +                                PGC_SH_l2h_shadow, 
 15.2745 +                                shadow_l2_index, 
 15.2746 +                                validate_gl2e);
 15.2747 +#else /* Non-PAE guests don't have different kinds of l2 table */
 15.2748 +    SHADOW_PRINTK("called in wrong paging mode!\n");
 15.2749 +    BUG();
 15.2750 +    return 0;
 15.2751 +#endif
 15.2752 +}
 15.2753 +
 15.2754 +int
 15.2755 +sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
 15.2756 +                          void *new_gl1p, u32 size)
 15.2757 +{
 15.2758 +    return sh_map_and_validate(v, gl1mfn, new_gl1p, size, 
 15.2759 +                                PGC_SH_l1_shadow, 
 15.2760 +                                shadow_l1_index, 
 15.2761 +                                validate_gl1e);
 15.2762 +}
 15.2763 +
 15.2764 +
 15.2765 +/**************************************************************************/
 15.2766 +/* Optimization: If we see two emulated writes of zeros to the same
 15.2767 + * page-table without another kind of page fault in between, we guess
 15.2768 + * that this is a batch of changes (for process destruction) and
 15.2769 + * unshadow the page so we don't take a pagefault on every entry.  This
 15.2770 + * should also make finding writeable mappings of pagetables much
 15.2771 + * easier. */
 15.2772 +
 15.2773 +/* Look to see if this is the second emulated write in a row to this
 15.2774 + * page, and unshadow/unhook if it is */
 15.2775 +static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
 15.2776 +{
 15.2777 +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
 15.2778 +    if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
 15.2779 +         sh_mfn_is_a_page_table(gmfn) )
 15.2780 +    {
 15.2781 +        u32 flags = mfn_to_page(gmfn)->shadow_flags;
 15.2782 +        mfn_t smfn;
 15.2783 +        if ( !(flags & (SHF_L2_32|SHF_L3_PAE|SHF_L4_64)) )
 15.2784 +        {
 15.2785 +            perfc_incrc(shadow_early_unshadow);
 15.2786 +            sh_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
 15.2787 +            return;
 15.2788 +        }
 15.2789 +        /* SHF_unhooked_mappings is set to make sure we only unhook
 15.2790 +         * once in a single batch of updates. It is reset when this
 15.2791 +         * top-level page is loaded into CR3 again */
 15.2792 +        if ( !(flags & SHF_unhooked_mappings) ) 
 15.2793 +        {
 15.2794 +            perfc_incrc(shadow_early_unshadow_top);
 15.2795 +            mfn_to_page(gmfn)->shadow_flags |= SHF_unhooked_mappings;
 15.2796 +            if ( flags & SHF_L2_32 )
 15.2797 +            {
 15.2798 +                smfn = get_shadow_status(v, gmfn, PGC_SH_l2_32_shadow);
 15.2799 +                shadow_unhook_mappings(v, smfn);
 15.2800 +            }
 15.2801 +            if ( flags & SHF_L3_PAE ) 
 15.2802 +            {
 15.2803 +                smfn = get_shadow_status(v, gmfn, PGC_SH_l3_pae_shadow);
 15.2804 +                shadow_unhook_mappings(v, smfn);
 15.2805 +            }
 15.2806 +            if ( flags & SHF_L4_64 ) 
 15.2807 +            {
 15.2808 +                smfn = get_shadow_status(v, gmfn, PGC_SH_l4_64_shadow);
 15.2809 +                shadow_unhook_mappings(v, smfn);
 15.2810 +            }
 15.2811 +        }
 15.2812 +    }
 15.2813 +    v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
 15.2814 +#endif
 15.2815 +}
 15.2816 +
 15.2817 +/* Stop counting towards early unshadows, as we've seen a real page fault */
 15.2818 +static inline void reset_early_unshadow(struct vcpu *v)
 15.2819 +{
 15.2820 +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
 15.2821 +    v->arch.shadow.last_emulated_mfn = INVALID_MFN;
 15.2822 +#endif
 15.2823 +}
 15.2824 +
 15.2825 +
 15.2826 +
 15.2827 +/**************************************************************************/
 15.2828 +/* Entry points into the shadow code */
 15.2829 +
 15.2830 +/* Called from pagefault handler in Xen, and from the HVM trap handlers
 15.2831 + * for pagefaults.  Returns 1 if this fault was an artefact of the
 15.2832 + * shadow code (and the guest should retry) or 0 if it is not (and the
 15.2833 + * fault should be handled elsewhere or passed to the guest). */
 15.2834 +
 15.2835 +static int sh_page_fault(struct vcpu *v, 
 15.2836 +                          unsigned long va, 
 15.2837 +                          struct cpu_user_regs *regs)
 15.2838 +{
 15.2839 +    struct domain *d = v->domain;
 15.2840 +    walk_t gw;
 15.2841 +    u32 accumulated_gflags;
 15.2842 +    gfn_t gfn;
 15.2843 +    mfn_t gmfn, sl1mfn=_mfn(0);
 15.2844 +    shadow_l1e_t sl1e, *ptr_sl1e;
 15.2845 +    paddr_t gpa;
 15.2846 +    struct cpu_user_regs emul_regs;
 15.2847 +    struct x86_emulate_ctxt emul_ctxt;
 15.2848 +    int r, mmio;
 15.2849 +    fetch_type_t ft = 0;
 15.2850 +
 15.2851 +    //
 15.2852 +    // XXX: Need to think about eventually mapping superpages directly in the
 15.2853 +    //      shadow (when possible), as opposed to splintering them into a
 15.2854 +    //      bunch of 4K maps.
 15.2855 +    //
 15.2856 +
 15.2857 +    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
 15.2858 +                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
 15.2859 +    
 15.2860 +    shadow_lock(d);
 15.2861 +
 15.2862 +    shadow_audit_tables(v);
 15.2863 +                   
 15.2864 +    if ( guest_walk_tables(v, va, &gw, 1) != 0 )
 15.2865 +    {
 15.2866 +        SHADOW_PRINTK("malformed guest pagetable!");
 15.2867 +        print_gw(&gw);
 15.2868 +    }
 15.2869 +
 15.2870 +    sh_audit_gw(v, &gw);
 15.2871 +
 15.2872 +    // We do not look at the gw->l1e, as that will not exist for superpages.
 15.2873 +    // Instead, we use the gw->eff_l1e...
 15.2874 +    //
 15.2875 +    // We need not check all the levels of the guest page table entries for
 15.2876 +    // present vs not-present, as the eff_l1e will always be not present if
 15.2877 +    // one of the higher level entries is not present.
 15.2878 +    //
 15.2879 +    if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
 15.2880 +    {
 15.2881 +        if ( hvm_guest(v) && !shadow_vcpu_mode_translate(v) )
 15.2882 +        {
 15.2883 +            /* Not present in the p2m map means this is mmio */
 15.2884 +            gpa = va;
 15.2885 +            goto mmio;
 15.2886 +        }
 15.2887 +
 15.2888 +        perfc_incrc(shadow_fault_bail_not_present);
 15.2889 +        goto not_a_shadow_fault;
 15.2890 +    }
 15.2891 +
 15.2892 +    // All levels of the guest page table are now known to be present.
 15.2893 +    accumulated_gflags = accumulate_guest_flags(&gw);
 15.2894 +
 15.2895 +    // Check for attempts to access supervisor-only pages from user mode,
 15.2896 +    // i.e. ring 3.  Such errors are not caused or dealt with by the shadow
 15.2897 +    // code.
 15.2898 +    //
 15.2899 +    if ( (regs->error_code & PFEC_user_mode) &&
 15.2900 +         !(accumulated_gflags & _PAGE_USER) )
 15.2901 +    {
 15.2902 +        /* illegal user-mode access to supervisor-only page */
 15.2903 +        perfc_incrc(shadow_fault_bail_user_supervisor);
 15.2904 +        goto not_a_shadow_fault;
 15.2905 +    }
 15.2906 +
 15.2907 +    // Was it a write fault?
 15.2908 +    //
 15.2909 +    if ( regs->error_code & PFEC_write_access )
 15.2910 +    {
 15.2911 +        if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
 15.2912 +        {
 15.2913 +            perfc_incrc(shadow_fault_bail_ro_mapping);
 15.2914 +            goto not_a_shadow_fault;
 15.2915 +        }
 15.2916 +    }
 15.2917 +    else // must have been either an insn fetch or read fault
 15.2918 +    {
 15.2919 +        // Check for NX bit violations: attempts to execute code that is
 15.2920 +        // marked "do not execute".  Such errors are not caused or dealt with
 15.2921 +        // by the shadow code.
 15.2922 +        //
 15.2923 +        if ( regs->error_code & PFEC_insn_fetch )
 15.2924 +        {
 15.2925 +            if ( accumulated_gflags & _PAGE_NX_BIT )
 15.2926 +            {
 15.2927 +                /* NX prevented this code fetch */
 15.2928 +                perfc_incrc(shadow_fault_bail_nx);
 15.2929 +                goto not_a_shadow_fault;
 15.2930 +            }
 15.2931 +        }
 15.2932 +    }
 15.2933 +
 15.2934 +    /* Is this an MMIO access? */
 15.2935 +    gfn = guest_l1e_get_gfn(gw.eff_l1e);
 15.2936 +    mmio = ( hvm_guest(v) 
 15.2937 +             && shadow_vcpu_mode_translate(v) 
 15.2938 +             && mmio_space(gfn_to_paddr(gfn)) );
 15.2939 +
 15.2940 +    /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds 
 15.2941 +     * the equivalent mfn. */
 15.2942 +    if ( mmio ) 
 15.2943 +        gmfn = _mfn(gfn_x(gfn));
 15.2944 +    else
 15.2945 +    {
 15.2946 +        gmfn = vcpu_gfn_to_mfn(v, gfn);
 15.2947 +        if ( !valid_mfn(gmfn) )
 15.2948 +        {
 15.2949 +            perfc_incrc(shadow_fault_bail_bad_gfn);
 15.2950 +            SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", 
 15.2951 +                           gfn_x(gfn), mfn_x(gmfn));
 15.2952 +            goto not_a_shadow_fault;
 15.2953 +        }
 15.2954 +    }
 15.2955 +
 15.2956 +    /* Make sure there is enough free shadow memory to build a chain of
 15.2957 +     * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
 15.2958 +     * to allocate all we need.  (We never allocate a top-level shadow
 15.2959 +     * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
 15.2960 +    shadow_prealloc(d, SHADOW_MAX_ORDER);
 15.2961 +
 15.2962 +    /* Acquire the shadow.  This must happen before we figure out the rights 
 15.2963 +     * for the shadow entry, since we might promote a page here. */
 15.2964 +    // XXX -- this code will need to change somewhat if/when the shadow code
 15.2965 +    // can directly map superpages...
 15.2966 +    ft = ((regs->error_code & PFEC_write_access) ?
 15.2967 +          ft_demand_write : ft_demand_read);
 15.2968 +    ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
 15.2969 +    ASSERT(ptr_sl1e);
 15.2970 +
 15.2971 +    /* Calculate the shadow entry */
 15.2972 +    if ( ft == ft_demand_write )
 15.2973 +    {
 15.2974 +        if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
 15.2975 +        {
 15.2976 +            perfc_incrc(shadow_fault_emulate_write);
 15.2977 +            goto emulate;
 15.2978 +        }
 15.2979 +    }
 15.2980 +    else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
 15.2981 +    {
 15.2982 +        perfc_incrc(shadow_fault_emulate_read);
 15.2983 +        goto emulate;
 15.2984 +    }
 15.2985 +
 15.2986 +    /* Quick sanity check: we never make an MMIO entry that's got the 
 15.2987 +     * _PAGE_PRESENT flag set in it. */
 15.2988 +    ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
 15.2989 +
 15.2990 +    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
 15.2991 +
 15.2992 +    if ( mmio ) 
 15.2993 +    {
 15.2994 +        gpa = guest_walk_to_gpa(&gw);
 15.2995 +        goto mmio;
 15.2996 +    }
 15.2997 +
 15.2998 +#if 0
 15.2999 +    if ( !(r & SHADOW_SET_CHANGED) )
 15.3000 +        debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte
 15.3001 +                          ") did not change anything\n",
 15.3002 +                          __func__, gw.va, l1e_get_intpte(sl1e));
 15.3003 +#endif
 15.3004 +
 15.3005 +    perfc_incrc(shadow_fault_fixed);
 15.3006 +    d->arch.shadow.fault_count++;
 15.3007 +    reset_early_unshadow(v);
 15.3008 +
 15.3009 + done:
 15.3010 +    sh_audit_gw(v, &gw);
 15.3011 +    unmap_walk(v, &gw);
 15.3012 +    SHADOW_PRINTK("fixed\n");
 15.3013 +    shadow_audit_tables(v);
 15.3014 +    shadow_unlock(d);
 15.3015 +    return EXCRET_fault_fixed;
 15.3016 +
 15.3017 + emulate:
 15.3018 +
 15.3019 +    /* Take the register set we were called with */
 15.3020 +    emul_regs = *regs;
 15.3021 +    if ( hvm_guest(v) )
 15.3022 +    {
 15.3023 +        /* Add the guest's segment selectors, rip, rsp, rflags */ 
 15.3024 +        hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
 15.3025 +    }
 15.3026 +    emul_ctxt.regs = &emul_regs;
 15.3027 +    emul_ctxt.cr2 = va;
 15.3028 +    emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
 15.3029 +
 15.3030 +    SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
 15.3031 +
 15.3032 +    v->arch.shadow.propagate_fault = 0;
 15.3033 +    if ( x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) )
 15.3034 +    {
 15.3035 +        SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n", 
 15.3036 +                       mfn_x(gmfn));
 15.3037 +        perfc_incrc(shadow_fault_emulate_failed);
 15.3038 +        /* If this is actually a page table, then we have a bug, and need 
 15.3039 +         * to support more operations in the emulator.  More likely, 
 15.3040 +         * though, this is a hint that this page should not be shadowed. */
 15.3041 +        shadow_remove_all_shadows(v, gmfn);
 15.3042 +        /* This means that actual missing operations will cause the 
 15.3043 +         * guest to loop on the same page fault. */
 15.3044 +        goto done;
 15.3045 +    }
 15.3046 +    if ( v->arch.shadow.propagate_fault )
 15.3047 +    {
 15.3048 +        /* Emulation triggered another page fault */
 15.3049 +        goto not_a_shadow_fault;
 15.3050 +    }
 15.3051 +
 15.3052 +    /* Emulator has changed the user registers: write back */
 15.3053 +    if ( hvm_guest(v) )
 15.3054 +    {
 15.3055 +        /* Write back the guest's segment selectors, rip, rsp, rflags */ 
 15.3056 +        hvm_load_cpu_guest_regs(v, &emul_regs);
 15.3057 +        /* And don't overwrite those in the caller's regs. */
 15.3058 +        emul_regs.eip = regs->eip;
 15.3059 +        emul_regs.cs = regs->cs;
 15.3060 +        emul_regs.eflags = regs->eflags;
 15.3061 +        emul_regs.esp = regs->esp;
 15.3062 +        emul_regs.ss = regs->ss;
 15.3063 +        emul_regs.es = regs->es;
 15.3064 +        emul_regs.ds = regs->ds;
 15.3065 +        emul_regs.fs = regs->fs;
 15.3066 +        emul_regs.gs = regs->gs;
 15.3067 +    }
 15.3068 +    *regs = emul_regs;
 15.3069 +
 15.3070 +    goto done;
 15.3071 +
 15.3072 + mmio:
 15.3073 +    perfc_incrc(shadow_fault_mmio);
 15.3074 +    if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
 15.3075 +    {
 15.3076 +        /* Need to deal with these disabled-APIC accesses, as
 15.3077 +         * handle_mmio() apparently does not currently do that. */
 15.3078 +        /* TJD: What about it, then?   For now, I'm turning this BUG() 
 15.3079 +         * into a domain_crash() since we don't want to kill Xen. */
 15.3080 +        SHADOW_ERROR("disabled-APIC access: not supported.\n");
 15.3081 +        domain_crash(d); 
 15.3082 +    }
 15.3083 +    sh_audit_gw(v, &gw);
 15.3084 +    unmap_walk(v, &gw);
 15.3085 +    SHADOW_PRINTK("mmio\n");
 15.3086 +    shadow_audit_tables(v);
 15.3087 +    reset_early_unshadow(v);
 15.3088 +    shadow_unlock(d);
 15.3089 +    sh_log_mmio(v, gpa);
 15.3090 +    handle_mmio(va, gpa);
 15.3091 +    return EXCRET_fault_fixed;
 15.3092 +
 15.3093 + not_a_shadow_fault:
 15.3094 +    sh_audit_gw(v, &gw);
 15.3095 +    unmap_walk(v, &gw);
 15.3096 +    SHADOW_PRINTK("not a shadow fault\n");
 15.3097 +    shadow_audit_tables(v);
 15.3098 +    reset_early_unshadow(v);
 15.3099 +    shadow_unlock(d);
 15.3100 +    return 0;
 15.3101 +}
 15.3102 +
 15.3103 +
 15.3104 +static int
 15.3105 +sh_invlpg(struct vcpu *v, unsigned long va)
 15.3106 +/* Called when the guest requests an invlpg.  Returns 1 if the invlpg
 15.3107 + * instruction should be issued on the hardware, or 0 if it's safe not
 15.3108 + * to do so. */
 15.3109 +{
 15.3110 +    shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
 15.3111 +
 15.3112 +    // XXX -- might be a good thing to prefetch the va into the shadow
 15.3113 +
 15.3114 +    // no need to flush anything if there's no SL2...
 15.3115 +    //
 15.3116 +    if ( !ptr_sl2e )
 15.3117 +        return 0;
 15.3118 +
 15.3119 +    // If there's nothing shadowed for this particular sl2e, then
 15.3120 +    // there is no need to do an invlpg, either...
 15.3121 +    //
 15.3122 +    if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
 15.3123 +        return 0;
 15.3124 +
 15.3125 +    // Check to see if the SL2 is a splintered superpage...
 15.3126 +    // If so, then we'll need to flush the entire TLB (because that's
 15.3127 +    // easier than invalidating all of the individual 4K pages).
 15.3128 +    //
 15.3129 +    if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
 15.3130 +          PGC_SH_type_mask) == PGC_SH_fl1_shadow )
 15.3131 +    {
 15.3132 +        local_flush_tlb();
 15.3133 +        return 0;
 15.3134 +    }
 15.3135 +
 15.3136 +    return 1;
 15.3137 +}
 15.3138 +
 15.3139 +static unsigned long
 15.3140 +sh_gva_to_gfn(struct vcpu *v, unsigned long va)
 15.3141 +/* Called to translate a guest virtual address to what the *guest*
 15.3142 + * pagetables would map it to. */
 15.3143 +{
 15.3144 +    walk_t gw;
 15.3145 +    gfn_t gfn;
 15.3146 +
 15.3147 +    guest_walk_tables(v, va, &gw, 0);
 15.3148 +    gfn = guest_walk_to_gfn(&gw);
 15.3149 +    unmap_walk(v, &gw);
 15.3150 +
 15.3151 +    return gfn_x(gfn);
 15.3152 +}
 15.3153 +
 15.3154 +
 15.3155 +static unsigned long
 15.3156 +sh_gva_to_gpa(struct vcpu *v, unsigned long va)
 15.3157 +/* Called to translate a guest virtual address to the guest physical
 15.3158 + * address that the *guest* pagetables would map it to. */
 15.3159 +{
 15.3160 +    unsigned long gfn = sh_gva_to_gfn(v, va);
 15.3161 +    if ( gfn == INVALID_GFN )
 15.3162 +        return 0;
 15.3163 +    else
 15.3164 +        return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
 15.3165 +}
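/* A worked example with purely illustrative numbers: with 4k pages
 * (PAGE_SHIFT == 12), va = 0x00403abc and a guest walk yielding
 * gfn 0x1234 produce
 *     gpa = (0x1234 << 12) | (0x00403abc & 0xfff) = 0x01234abc.
 * If the walk fails (gfn == INVALID_GFN), 0 is returned instead. */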
 15.3166 +
 15.3167 +
 15.3168 +// XXX -- should this be in this file?
 15.3169 +//        Or should it be moved to shadow-common.c?
 15.3170 +//
 15.3171 +/* Returns a lowmem machine address of the copied HVM L3 root table.
 15.3172 + * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
 15.3173 + * otherwise blank out any entries with reserved bits in them.  */
 15.3174 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 15.3175 +static unsigned long
 15.3176 +hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
 15.3177 +{
 15.3178 +    int i, f;
 15.3179 +    int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
 15.3180 +    l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 15.3181 +    memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
 15.3182 +    for ( i = 0; i < 4; i++ )
 15.3183 +    {
 15.3184 +        f = l3e_get_flags(l3tab[i]);
 15.3185 +        if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
 15.3186 +            new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
 15.3187 +        else
 15.3188 +            new_l3e = l3e_empty();
 15.3189 +        safe_write_entry(&copy[i], &new_l3e);
 15.3190 +    }
 15.3191 +    return __pa(copy);
 15.3192 +}
 15.3193 +#endif
 15.3194 +
 15.3195 +
 15.3196 +static inline void
 15.3197 +sh_update_linear_entries(struct vcpu *v)
 15.3198 +/* Sync up all the linear mappings for this vcpu's pagetables */
 15.3199 +{
 15.3200 +    struct domain *d = v->domain;
 15.3201 +
 15.3202 +    /* Linear pagetables in PV guests
 15.3203 +     * ------------------------------
 15.3204 +     *
 15.3205 +     * Guest linear pagetables, which map the guest pages, are at
 15.3206 +     * LINEAR_PT_VIRT_START.  Shadow linear pagetables, which map the
 15.3207 +     * shadows, are at SH_LINEAR_PT_VIRT_START.  Most of the time these
 15.3208 +     * are set up at shadow creation time, but (of course!) the PAE case
 15.3209 +     * is subtler.  Normal linear mappings are made by having an entry
 15.3210 +     * in the top-level table that points to itself (shadow linear) or
 15.3211 +     * to the guest top-level table (guest linear).  For PAE, setting up
 15.3212 +     * a linear map requires us to copy the four top-level entries into
 15.3213 +     * level-2 entries.  That means that every time we change a PAE l3e,
 15.3214 +     * we need to reflect the change into the copy.
 15.3215 +     *
 15.3216 +     * Linear pagetables in HVM guests
 15.3217 +     * -------------------------------
 15.3218 +     *
 15.3219 +     * For HVM guests, the linear pagetables are installed in the monitor
 15.3220 +     * tables (since we can't put them in the shadow).  Shadow linear
 15.3221 +     * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
 15.3222 +     * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for 
 15.3223 +     * a linear pagetable of the monitor tables themselves.  We have 
 15.3224 +     * the same issue of having to re-copy PAE l3 entries whenever we use
 15.3225 +     * PAE shadows. 
 15.3226 +     *
 15.3227 +     * Because HVM guests run on the same monitor tables regardless of the 
 15.3228 +     * shadow tables in use, the linear mapping of the shadow tables has to 
 15.3229 +     * be updated every time v->arch.shadow_table changes. 
 15.3230 +     */
 15.3231 +
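    /* A sketch of the usual self-referencing ("linear") mapping arithmetic
     * the comment above relies on, with a generic top-level slot S standing
     * in for Xen's actual LINEAR_PT_VIRT_START / SH_LINEAR_PT_VIRT_START
     * constants (4-level case, canonical-address sign extension ignored):
     * if the top-level entry in slot S points back at the top-level table
     * itself, the hardware walk for any address with L4 index S resolves
     * one level short, so the 512GB region starting at
     *     (unsigned long)S << 39
     * exposes the pagetables as a flat array, and the L1 entry covering a
     * virtual address va sits at
     *     (S << 39) + ((va >> 12) << 3)
     * i.e. eight bytes of pagetable per 4K page mapped.  The PAE subtlety
     * described above is that the top level has only four entries, so the
     * same trick has to be emulated by copying those four l3es into l2
     * slots. */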
 15.3232 +    /* Don't try to update the monitor table if it doesn't exist */
 15.3233 +    if ( shadow_mode_external(d) 
 15.3234 +         && pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
 15.3235 +        return;
 15.3236 +
 15.3237 +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
 15.3238 +    
 15.3239 +    /* For PV, one l4e points at the guest l4, one points at the shadow
 15.3240 +     * l4.  No maintenance required. 
 15.3241 +     * For HVM, just need to update the l4e that points to the shadow l4. */
 15.3242 +
 15.3243 +    if ( shadow_mode_external(d) )
 15.3244 +    {
 15.3245 +        /* Use the linear map if we can; otherwise make a new mapping */
 15.3246 +        if ( v == current ) 
 15.3247 +        {
 15.3248 +            __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = 
 15.3249 +                l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 15.3250 +                             __PAGE_HYPERVISOR);
 15.3251 +        } 
 15.3252 +        else
 15.3253 +        { 
 15.3254 +            l4_pgentry_t *ml4e;
 15.3255 +            ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
 15.3256 +            ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = 
 15.3257 +                l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 15.3258 +                             __PAGE_HYPERVISOR);
 15.3259 +            sh_unmap_domain_page(ml4e);
 15.3260 +        }
 15.3261 +    }
 15.3262 +
 15.3263 +#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
 15.3264 +
 15.3265 +    /* This case only exists in HVM.  To give ourselves a linear map of the 
 15.3266 +     * shadows, we need to extend a PAE shadow to 4 levels.  We do this by 
 15.3267 +     * having a monitor l3 in slot 0 of the monitor l4 table, and 
 15.3268 +     * copying the PAE l3 entries into it.  Then, by having the monitor l4e
 15.3269 +     * for shadow pagetables also point to the monitor l4, we can use it
 15.3270 +     * to access the shadows. */
 15.3271 +
 15.3272 +    if ( shadow_mode_external(d) )
 15.3273 +    {
 15.3274 +        /* Install copies of the shadow l3es into the monitor l3 table.
 15.3275 +         * The monitor l3 table is hooked into slot 0 of the monitor
 15.3276 +         * l4 table, so we use l3 linear indices 0 to 3 */
 15.3277 +        shadow_l3e_t *sl3e;
 15.3278 +        l3_pgentry_t *ml3e;
 15.3279 +        mfn_t l3mfn;
 15.3280 +        int i;
 15.3281 +
 15.3282 +        /* Use linear mappings if we can; otherwise make new mappings */
 15.3283 +        if ( v == current ) 
 15.3284 +        {
 15.3285 +            ml3e = __linear_l3_table;
 15.3286 +            l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
 15.3287 +#if GUEST_PAGING_LEVELS == 2
 15.3288 +            /* Shadow l3 tables are built by update_cr3 */
 15.3289 +            sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 15.3290 +#else
 15.3291 +            sl3e = v->arch.shadow_vtable;
 15.3292 +#endif
 15.3293 +        }
 15.3294 +        else 
 15.3295 +        {   
 15.3296 +            l4_pgentry_t *ml4e;
 15.3297 +            ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
 15.3298 +            ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
 15.3299 +            l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
 15.3300 +            ml3e = sh_map_domain_page(l3mfn);
 15.3301 +            sh_unmap_domain_page(ml4e);
 15.3302 +#if GUEST_PAGING_LEVELS == 2
 15.3303 +            /* Shadow l3 tables are built by update_cr3 */
 15.3304 +            sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 15.3305 +#else
 15.3306 +            sl3e = sh_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
 15.3307 +#endif
 15.3308 +        }
 15.3309 +
 15.3310 +        for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
 15.3311 +        {
 15.3312 +            ml3e[i] = 
 15.3313 +                (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) 
 15.3314 +                ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), 
 15.3315 +                               __PAGE_HYPERVISOR) 
 15.3316 +                : l3e_empty();
 15.3317 +        }
 15.3318 +
 15.3319 +        if ( v != current ) 
 15.3320 +        {
 15.3321 +            sh_unmap_domain_page(ml3e);
 15.3322 +#if GUEST_PAGING_LEVELS != 2
 15.3323 +            sh_unmap_domain_page(sl3e);
 15.3324 +#endif
 15.3325 +        }
 15.3326 +    }
 15.3327 +
 15.3328 +#elif CONFIG_PAGING_LEVELS == 3
 15.3329 +
 15.3330 +    /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
 15.3331 +     * entries in the shadow, and the shadow's l3 entries into the 
 15.3332 +     * shadow-linear-map l2 entries in the shadow.  This is safe to do 
 15.3333 +     * because Xen does not let guests share high-slot l2 tables between l3s,
 15.3334 +     * so we know we're not treading on anyone's toes. 
 15.3335 +     *
 15.3336 +     * HVM: need to copy the shadow's l3 entries into the
 15.3337 +     * shadow-linear-map l2 entries in the monitor table.  This is safe
 15.3338 +     * because we have one monitor table for each vcpu.  The monitor's
 15.3339 +     * own l3es don't need to be copied because they never change.  
 15.3340 +     * XXX That might change if we start stuffing things into the rest
 15.3341 +     * of the monitor's virtual address space. 
 15.3342 +     */ 
 15.3343 +    {
 15.3344 +        l2_pgentry_t *l2e, new_l2e;
 15.3345 +        shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
 15.3346 +        int i;
 15.3347 +
 15.3348 +#if GUEST_PAGING_LEVELS == 2
 15.3349 +        /* Shadow l3 tables were built by update_cr3 */
 15.3350 +        if ( shadow_mode_external(d) )
 15.3351 +            shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 15.3352 +        else
 15.3353 +            BUG(); /* PV 2-on-3 is not supported yet */
 15.3354 +        
 15.3355 +#else /* GUEST_PAGING_LEVELS == 3 */
 15.3356 +        
 15.3357 +        /* Use local vcpu's mappings if we can; otherwise make new mappings */
 15.3358 +        if ( v == current ) 
 15.3359 +        {
 15.3360 +            shadow_l3e = v->arch.shadow_vtable;
 15.3361 +            if ( !shadow_mode_external(d) )
 15.3362 +                guest_l3e = v->arch.guest_vtable;
 15.3363 +        }
 15.3364 +        else 
 15.3365 +        {
 15.3366 +            mfn_t smfn;
 15.3367 +            int idx;
 15.3368 +            
 15.3369 +            /* Map the shadow l3 */
 15.3370 +            smfn = pagetable_get_mfn(v->arch.shadow_table);
 15.3371 +            idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
 15.3372 +            shadow_l3e = sh_map_domain_page(smfn);
 15.3373 +            shadow_l3e += idx;
 15.3374 +            if ( !shadow_mode_external(d) )
 15.3375 +            {
 15.3376 +                /* Also the guest l3 */
 15.3377 +                mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); 
 15.3378 +                guest_l3e = sh_map_domain_page(gmfn);
 15.3379 +                guest_l3e += guest_index(v->arch.guest_vtable);
 15.3380 +            }
 15.3381 +        }
 15.3382 +#endif /* GUEST_PAGING_LEVELS */
 15.3383 +        
 15.3384 +        /* Choose where to write the entries, using linear maps if possible */
 15.3385 +        if ( v == current && shadow_mode_external(d) ) 
 15.3386 +        {
 15.3387 +            /* From the monitor tables, it's safe to use linear maps to update
 15.3388 +             * monitor l2s */
 15.3389 +            l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
 15.3390 +        }
 15.3391 +        else if ( shadow_mode_external(d) ) 
 15.3392 +        {
 15.3393 +            /* Map the monitor table's high l2 */
 15.3394 +            l3_pgentry_t *l3e;
 15.3395 +            l3e = sh_map_domain_page(
 15.3396 +                pagetable_get_mfn(v->arch.monitor_table));
 15.3397 +            ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
 15.3398 +            l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
 15.3399 +            sh_unmap_domain_page(l3e);
 15.3400 +        } 
 15.3401 +        else 
 15.3402 +        {
 15.3403 +            /* Map the shadow table's high l2 */
 15.3404 +            ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
 15.3405 +            l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
 15.3406 +        }
 15.3407 +        
 15.3408 +        
 15.3409 +        if ( !shadow_mode_external(d) )
 15.3410 +        {
 15.3411 +            /* Write linear mapping of guest. */
 15.3412 +            for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
 15.3413 +            { 
 15.3414 +                new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) 
 15.3415 +                    ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
 15.3416 +                                   __PAGE_HYPERVISOR) 
 15.3417 +                    : l2e_empty();
 15.3418 +                safe_write_entry(
 15.3419 +                    &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
 15.3420 +                    &new_l2e);
 15.3421 +            }
 15.3422 +        }
 15.3423 +        
 15.3424 +        /* Write linear mapping of shadow. */
 15.3425 +        for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
 15.3426 +        {
 15.3427 +            new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) 
 15.3428 +                ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
 15.3429 +                               __PAGE_HYPERVISOR) 
 15.3430 +                : l2e_empty();
 15.3431 +            safe_write_entry(
 15.3432 +                &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
 15.3433 +                &new_l2e);
 15.3434 +        }
 15.3435 +        
 15.3436 +        if ( v != current || !shadow_mode_external(d) )
 15.3437 +            sh_unmap_domain_page(l2e);
 15.3438 +        
 15.3439 +#if GUEST_PAGING_LEVELS == 3
 15.3440 +        if ( v != current) 
 15.3441 +        {
 15.3442 +            sh_unmap_domain_page(shadow_l3e);
 15.3443 +            if ( !shadow_mode_external(d) )
 15.3444 +                sh_unmap_domain_page(guest_l3e);
 15.3445 +        }
 15.3446 +#endif
 15.3447 +    }
 15.3448 +
 15.3449 +#elif CONFIG_PAGING_LEVELS == 2
 15.3450 +
 15.3451 +    /* For PV, one l2e points at the guest l2, one points at the shadow
 15.3452 +     * l2. No maintenance required. 
 15.3453 +     * For HVM, just need to update the l2e that points to the shadow l2. */
 15.3454 +
 15.3455 +    if ( shadow_mode_external(d) )
 15.3456 +    {
 15.3457 +        /* Use the linear map if we can; otherwise make a new mapping */
 15.3458 +        if ( v == current ) 
 15.3459 +        {
 15.3460 +            __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = 
 15.3461 +                l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 15.3462 +                             __PAGE_HYPERVISOR);
 15.3463 +        } 
 15.3464 +        else
 15.3465 +        { 
 15.3466 +            l2_pgentry_t *ml2e;
 15.3467 +            ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
 15.3468 +            ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = 
 15.3469 +                l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 15.3470 +                             __PAGE_HYPERVISOR);
 15.3471 +            sh_unmap_domain_page(ml2e);
 15.3472 +        }
 15.3473 +    }
 15.3474 +
 15.3475 +#else
 15.3476 +#error this should not happen
 15.3477 +#endif
 15.3478 +}
 15.3479 +
 15.3480 +
 15.3481 +// XXX -- should this be in this file?
 15.3482 +//        Or should it be moved to shadow-common.c?
 15.3483 +//
 15.3484 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 15.3485 +void sh_pae_recopy(struct domain *d)
 15.3486 +/* Called whenever we write to the l3 entries of a PAE pagetable which 
 15.3487 + * is currently in use.  Each vcpu that is using the table needs to 
 15.3488 + * resync its copies of the l3s in linear maps and any low-memory
 15.3489 + * copies it might have made for fitting into 32-bit CR3.
 15.3490 + * Since linear maps are also resynced when we change CR3, we don't
 15.3491 + * need to worry about changes to PAE l3es that are not currently in use.*/
 15.3492 +{
 15.3493 +    struct vcpu *v;
 15.3494 +    cpumask_t flush_mask = CPU_MASK_NONE;
 15.3495 +    ASSERT(shadow_lock_is_acquired(d));
 15.3496 +    
 15.3497 +    for_each_vcpu(d, v)
 15.3498 +    {
 15.3499 +        if ( !v->arch.shadow.pae_flip_pending ) 
 15.3500 +            continue;
 15.3501 +
 15.3502 +        cpu_set(v->processor, flush_mask);
 15.3503 +        
 15.3504 +        SHADOW_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
 15.3505 +
 15.3506 +        /* This vcpu has a copy in its linear maps */
 15.3507 +        sh_update_linear_entries(v);
 15.3508 +        if ( hvm_guest(v) )
 15.3509 +        {
 15.3510 +            /* This vcpu has a copy in its HVM PAE l3 */
 15.3511 +            v->arch.hvm_vcpu.hw_cr3 = 
 15.3512 +                hvm_pae_copy_root(v, v->arch.shadow_vtable,
 15.3513 +                                  !shadow_vcpu_mode_translate(v));
 15.3514 +        }
 15.3515 +#if CONFIG_PAGING_LEVELS == 3
 15.3516 +        else 
 15.3517 +        {
 15.3518 +            /* This vcpu might have copied the l3 to below 4GB */
 15.3519 +            if ( v->arch.cr3 >> PAGE_SHIFT 
 15.3520 +                 != pagetable_get_pfn(v->arch.shadow_table) )
 15.3521 +            {
 15.3522 +                /* Recopy to where that copy is. */
 15.3523 +                int i;
 15.3524 +                l3_pgentry_t *dst, *src;
 15.3525 +                dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
 15.3526 +                src = v->arch.shadow_vtable;
 15.3527 +                for ( i = 0 ; i < 4 ; i++ ) 
 15.3528 +                    safe_write_entry(dst + i, src + i);
 15.3529 +            }
 15.3530 +        }
 15.3531 +#endif
 15.3532 +        v->arch.shadow.pae_flip_pending = 0;        
 15.3533 +    }
 15.3534 +
 15.3535 +    flush_tlb_mask(flush_mask);
 15.3536 +}
 15.3537 +#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
 15.3538 +
 15.3539 +
 15.3540 +/* removes:
 15.3541 + *     vcpu->arch.guest_vtable
 15.3542 + *     vcpu->arch.shadow_table
 15.3543 + *     vcpu->arch.shadow_vtable
 15.3544 + * Does all appropriate management/bookkeeping/refcounting/etc...
 15.3545 + */
 15.3546 +static void
 15.3547 +sh_detach_old_tables(struct vcpu *v)
 15.3548 +{
 15.3549 +    mfn_t smfn;
 15.3550 +
 15.3551 +    ////
 15.3552 +    //// vcpu->arch.guest_vtable
 15.3553 +    ////
 15.3554 +    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
 15.3555 +         v->arch.guest_vtable )
 15.3556 +    {
 15.3557 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 15.3558 +        sh_unmap_domain_page_global(v->arch.guest_vtable);
 15.3559 +        v->arch.guest_vtable = NULL;
 15.3560 +    }
 15.3561 +
 15.3562 +    ////
 15.3563 +    //// vcpu->arch.shadow_table
 15.3564 +    ////
 15.3565 +    smfn = pagetable_get_mfn(v->arch.shadow_table);
 15.3566 +    if ( mfn_x(smfn) )
 15.3567 +    {
 15.3568 +        ASSERT(v->arch.shadow_vtable);
 15.3569 +
 15.3570 +#if GUEST_PAGING_LEVELS == 3
 15.3571 +        // PAE guests do not (necessarily) use an entire page for their
 15.3572 +        // 4-entry L3s, so we have to deal with them specially.
 15.3573 +        //
 15.3574 +        sh_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
 15.3575 +#else
 15.3576 +        sh_put_ref(v, smfn, 0);
 15.3577 +#endif
 15.3578 +
 15.3579 +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
 15.3580 +        {
 15.3581 +            struct pae_l3_bookkeeping *info =
 15.3582 +                sl3p_to_info(v->arch.shadow_vtable);
 15.3583 +            ASSERT(test_bit(v->vcpu_id, &info->vcpus));
 15.3584 +            clear_bit(v->vcpu_id, &info->vcpus);
 15.3585 +        }
 15.3586 +#endif
 15.3587 +        v->arch.shadow_table = pagetable_null();
 15.3588 +    }
 15.3589 +
 15.3590 +    ////
 15.3591 +    //// vcpu->arch.shadow_vtable
 15.3592 +    ////
 15.3593 +    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
 15.3594 +         v->arch.shadow_vtable )
 15.3595 +    {
 15.3596 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 15.3597 +        //
 15.3598 +        sh_unmap_domain_page_global(v->arch.shadow_vtable);
 15.3599 +        v->arch.shadow_vtable = NULL;
 15.3600 +    }
 15.3601 +}
 15.3602 +
 15.3603 +static void
 15.3604 +sh_update_cr3(struct vcpu *v)
 15.3605 +/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
 15.3606 + * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
 15.3607 + * if appropriate).
 15.3608 + * HVM guests should also set hvm_get_guest_ctrl_reg(v, 3)...
 15.3609 + */
 15.3610 +{
 15.3611 +    struct domain *d = v->domain;
 15.3612 +    mfn_t gmfn, smfn;
 15.3613 +#if GUEST_PAGING_LEVELS == 3
 15.3614 +    u32 guest_idx=0;
 15.3615 +#endif
 15.3616 +
 15.3617 +    ASSERT(shadow_lock_is_acquired(v->domain));
 15.3618 +    ASSERT(v->arch.shadow.mode);
 15.3619 +
 15.3620 +    ////
 15.3621 +    //// vcpu->arch.guest_table is already set
 15.3622 +    ////
 15.3623 +    
 15.3624 +#ifndef NDEBUG 
 15.3625 +    /* Double-check that the HVM code has sent us a sane guest_table */
 15.3626 +    if ( hvm_guest(v) )
 15.3627 +    {
 15.3628 +        gfn_t gfn;
 15.3629 +
 15.3630 +        ASSERT(shadow_mode_external(d));
 15.3631 +
 15.3632 +        // Is paging enabled on this vcpu?
 15.3633 +        if ( shadow_vcpu_mode_translate(v) )
 15.3634 +        {
 15.3635 +            gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
 15.3636 +            gmfn = vcpu_gfn_to_mfn(v, gfn);
 15.3637 +            ASSERT(valid_mfn(gmfn));
 15.3638 +            ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
 15.3639 +        } 
 15.3640 +        else 
 15.3641 +        {
 15.3642 +            /* Paging disabled: guest_table points at (part of) p2m */
 15.3643 +#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
 15.3644 +            /* For everything else, they should be the same */
 15.3645 +            ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
 15.3646 +#endif
 15.3647 +        }
 15.3648 +    }
 15.3649 +#endif
 15.3650 +
 15.3651 +    SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
 15.3652 +                   d->domain_id, v->vcpu_id, 
 15.3653 +                   (unsigned long)pagetable_get_pfn(v->arch.guest_table));
 15.3654 +
 15.3655 +#if GUEST_PAGING_LEVELS == 4
 15.3656 +    if ( !(v->arch.flags & TF_kernel_mode) )
 15.3657 +        gmfn = pagetable_get_mfn(v->arch.guest_table_user);
 15.3658 +    else
 15.3659 +#endif
 15.3660 +        gmfn = pagetable_get_mfn(v->arch.guest_table);
 15.3661 +
 15.3662 +    sh_detach_old_tables(v);
 15.3663 +
 15.3664 +    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
 15.3665 +    {
 15.3666 +        ASSERT(v->arch.cr3 == 0);
 15.3667 +        return;
 15.3668 +    }
 15.3669 +
 15.3670 +    ////
 15.3671 +    //// vcpu->arch.guest_vtable
 15.3672 +    ////
 15.3673 +    if ( shadow_mode_external(d) )
 15.3674 +    {
 15.3675 +#if GUEST_PAGING_LEVELS == 3
 15.3676 +        if ( shadow_vcpu_mode_translate(v) ) 
 15.3677 +            /* Paging enabled: find where in the page the l3 table is */
 15.3678 +            guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
 15.3679 +        else
 15.3680 +            /* Paging disabled: l3 is at the start of a page (in the p2m) */ 
 15.3681 +            guest_idx = 0; 
 15.3682 +
 15.3683 +        // Ignore the low 2 bits of guest_idx -- they are really just
 15.3684 +        // cache control.
 15.3685 +        guest_idx &= ~3;
 15.3686 +        // XXX - why does this need a global map?
 15.3687 +        v->arch.guest_vtable =
 15.3688 +            (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
 15.3689 +#else
 15.3690 +        // XXX - why does this need a global map?
 15.3691 +        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
 15.3692 +#endif
 15.3693 +    }
 15.3694 +    else
 15.3695 +    {
 15.3696 +#ifdef __x86_64__
 15.3697 +        v->arch.guest_vtable = __linear_l4_table;
 15.3698 +#elif GUEST_PAGING_LEVELS == 3
 15.3699 +        // XXX - why does this need a global map?
 15.3700 +        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
 15.3701 +#else
 15.3702 +        v->arch.guest_vtable = __linear_l2_table;
 15.3703 +#endif
 15.3704 +    }
 15.3705 +
 15.3706 +#if 0
 15.3707 +    printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
 15.3708 +           __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
 15.3709 +#endif
 15.3710 +
 15.3711 +    ////
 15.3712 +    //// vcpu->arch.shadow_table
 15.3713 +    ////
 15.3714 +    smfn = get_shadow_status(v, gmfn, PGC_SH_guest_root_type);
 15.3715 +    if ( valid_mfn(smfn) )
 15.3716 +    {
 15.3717 +        /* Pull this root shadow to the front of the list of roots. */
 15.3718 +        list_del(&mfn_to_page(smfn)->list);
 15.3719 +        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
 15.3720 +    }
 15.3721 +    else
 15.3722 +    {
 15.3723 +        /* This guest MFN is a pagetable.  Must revoke write access. */
 15.3724 +        if ( shadow_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) 
 15.3725 +             != 0 )
 15.3726 +            flush_tlb_mask(d->domain_dirty_cpumask); 
 15.3727 +        /* Make sure there's enough free shadow memory. */
 15.3728 +        shadow_prealloc(d, SHADOW_MAX_ORDER); 
 15.3729 +        /* Shadow the page. */
 15.3730 +        smfn = sh_make_shadow(v, gmfn, PGC_SH_guest_root_type);
 15.3731 +        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
 15.3732 +    }
 15.3733 +    ASSERT(valid_mfn(smfn));
 15.3734 +    v->arch.shadow_table = pagetable_from_mfn(smfn);
 15.3735 +
 15.3736 +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
 15.3737 +    /* Once again OK to unhook entries from this table if we see fork/exit */
 15.3738 +    ASSERT(sh_mfn_is_a_page_table(gmfn));
 15.3739 +    mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
 15.3740 +#endif
 15.3741 +
 15.3742 +
 15.3743 +    ////
 15.3744 +    //// vcpu->arch.shadow_vtable
 15.3745 +    ////
 15.3746 +    if ( shadow_mode_external(d) )
 15.3747 +    {
 15.3748 +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
 15.3749 +        mfn_t adjusted_smfn = smfn;
 15.3750 +        u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
 15.3751 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 15.3752 +        v->arch.shadow_vtable =
 15.3753 +            (shadow_l3e_t *)sh_map_domain_page_global(adjusted_smfn) +
 15.3754 +            shadow_idx;
 15.3755 +#else
 15.3756 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 15.3757 +        v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
 15.3758 +#endif
 15.3759 +    }
 15.3760 +    else
 15.3761 +    {
 15.3762 +#if SHADOW_PAGING_LEVELS == 4
 15.3763 +        v->arch.shadow_vtable = __sh_linear_l4_table;
 15.3764 +#elif GUEST_PAGING_LEVELS == 3
 15.3765 +        // XXX - why does this need a global map?
 15.3766 +        v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
 15.3767 +#else
 15.3768 +        v->arch.shadow_vtable = __sh_linear_l2_table;
 15.3769 +#endif
 15.3770 +    }
 15.3771 +
 15.3772 +    ////
 15.3773 +    //// Take a ref to the new shadow table, and pin it.
 15.3774 +    ////
 15.3775 +    //
 15.3776 +    // This ref is logically "held" by v->arch.shadow_table entry itself.
 15.3777 +    // Release the old ref.
 15.3778 +    //
 15.3779 +#if GUEST_PAGING_LEVELS == 3
 15.3780 +    // PAE guests do not (necessarily) use an entire page for their
 15.3781 +    // 4-entry L3s, so we have to deal with them specially.
 15.3782 +    //
 15.3783 +    // XXX - might want to revisit this if/when we do multiple compilation for
 15.3784 +    //       HVM-vs-PV guests, as PAE PV guests could get away without doing
 15.3785 +    //       subshadows.
 15.3786 +    //
 15.3787 +    sh_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
 15.3788 +    sh_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
 15.3789 +#else
 15.3790 +    sh_get_ref(smfn, 0);
 15.3791 +    sh_pin(smfn);
 15.3792 +#endif
 15.3793 +
 15.3794 +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
 15.3795 +    // PAE 3-on-3 shadows have to keep track of which vcpus are using
 15.3796 +    // which l3 subshadow, in order to handle the SHADOW_SET_L3PAE_RECOPY
 15.3797 +    // case from validate_gl3e().  Search for SHADOW_SET_L3PAE_RECOPY
 15.3798 +    // in the code for more info.
 15.3799 +    //
 15.3800 +    {
 15.3801 +        struct pae_l3_bookkeeping *info =
 15.3802 +            sl3p_to_info(v->arch.shadow_vtable);
 15.3803 +        ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
 15.3804 +        set_bit(v->vcpu_id, &info->vcpus);
 15.3805 +    }
 15.3806 +#endif
 15.3807 +
 15.3808 +    debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
 15.3809 +                      __func__, gmfn, smfn);
 15.3810 +
 15.3811 +    ///
 15.3812 +    /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
 15.3813 +    ///
 15.3814 +    if ( shadow_mode_external(d) )
 15.3815 +    {
 15.3816 +        ASSERT(hvm_guest(v));
 15.3817 +        make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
 15.3818 +
 15.3819 +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
 15.3820 +#if SHADOW_PAGING_LEVELS != 3
 15.3821 +#error unexpected combination of GUEST and SHADOW paging levels
 15.3822 +#endif
 15.3823 +        /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
 15.3824 +        {
 15.3825 +            mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
 15.3826 +            int i;
 15.3827 +
 15.3828 +            ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
 15.3829 +                   virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
 15.3830 +            for (i = 0; i < 4; i++)
 15.3831 +            {
 15.3832 +                v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
 15.3833 +                    shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
 15.3834 +            }
 15.3835 +        }
 15.3836 +#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 15.3837 +        /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
 15.3838 +         * If paging is disabled, clear l3e reserved bits; otherwise 
 15.3839 +         * remove entries that have reserved bits set. */
 15.3840 +        v->arch.hvm_vcpu.hw_cr3 =
 15.3841 +            hvm_pae_copy_root(v, v->arch.shadow_vtable, 
 15.3842 +                              !shadow_vcpu_mode_translate(v));
 15.3843 +#else
 15.3844 +        /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
 15.3845 +        v->arch.hvm_vcpu.hw_cr3 =
 15.3846 +            pagetable_get_paddr(v->arch.shadow_table);
 15.3847 +#endif
 15.3848 +    }
 15.3849 +    else // not shadow_mode_external...
 15.3850 +    {
 15.3851 +        /* We don't support PV except guest == shadow == config levels */
 15.3852 +        BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
 15.3853 +        make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
 15.3854 +    }
 15.3855 +
 15.3856 +    /* Fix up the linear pagetable mappings */
 15.3857 +    sh_update_linear_entries(v);
 15.3858 +}
 15.3859 +
 15.3860 +
 15.3861 +/**************************************************************************/
 15.3862 +/* Functions to revoke guest rights */
 15.3863 +
 15.3864 +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 15.3865 +static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
 15.3866 +/* Look up this vaddr in the current shadow and see if it's a writeable
 15.3867 + * mapping of this gmfn.  If so, remove it.  Returns 1 if it worked. */
 15.3868 +{
 15.3869 +    shadow_l1e_t sl1e, *sl1p;
 15.3870 +    shadow_l2e_t *sl2p;
 15.3871 +#if GUEST_PAGING_LEVELS >= 3
 15.3872 +    shadow_l3e_t *sl3p;
 15.3873 +#if GUEST_PAGING_LEVELS >= 4
 15.3874 +    shadow_l4e_t *sl4p;
 15.3875 +#endif
 15.3876 +#endif
 15.3877 +    mfn_t sl1mfn;
 15.3878 +
 15.3879 +
 15.3880 +    /* Carefully look in the shadow linear map for the l1e we expect */
 15.3881 +    if ( v->arch.shadow_vtable == NULL ) return 0;
 15.3882 +#if GUEST_PAGING_LEVELS >= 4
 15.3883 +    sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
 15.3884 +    if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
 15.3885 +        return 0;
 15.3886 +    sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
 15.3887 +    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
 15.3888 +        return 0;
 15.3889 +#elif GUEST_PAGING_LEVELS == 3
 15.3890 +    sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) 
 15.3891 +        + shadow_l3_linear_offset(vaddr);
 15.3892 +    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
 15.3893 +        return 0;
 15.3894 +#endif
 15.3895 +    sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
 15.3896 +    if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
 15.3897 +        return 0;
 15.3898 +    sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
 15.3899 +    sl1e = *sl1p;
 15.3900 +    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
 15.3901 +          != (_PAGE_PRESENT|_PAGE_RW))
 15.3902 +         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
 15.3903 +        return 0;
 15.3904 +
 15.3905 +    /* Found it!  Need to remove its write permissions. */
 15.3906 +    sl1mfn = shadow_l2e_get_mfn(*sl2p);
 15.3907 +    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
 15.3908 +    shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
 15.3909 +    return 1;
 15.3910 +}
 15.3911 +#endif
 15.3912 +
 15.3913 +int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
 15.3914 +/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
 15.3915 +{
 15.3916 +    shadow_l1e_t *sl1e;
 15.3917 +    int done = 0;
 15.3918 +    int flags;
 15.3919 +    
 15.3920 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, 
 15.3921 +    {
 15.3922 +        flags = shadow_l1e_get_flags(*sl1e);
 15.3923 +        if ( (flags & _PAGE_PRESENT) 
 15.3924 +             && (flags & _PAGE_RW) 
 15.3925 +             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
 15.3926 +        {
 15.3927 +            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 15.3928 +            if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
 15.3929 +                  & PGT_count_mask) == 0 )
 15.3930 +                /* This breaks us cleanly out of the FOREACH macro */
 15.3931 +                done = 1;
 15.3932 +        }
 15.3933 +    });
 15.3934 +    return done;
 15.3935 +}
 15.3936 +
 15.3937 +
 15.3938 +int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
 15.3939 +/* Excises all mappings to guest frame from this shadow l1 table */
 15.3940 +{
 15.3941 +    shadow_l1e_t *sl1e;
 15.3942 +    int done = 0;
 15.3943 +    int flags;
 15.3944 +    
 15.3945 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, 
 15.3946 +    {
 15.3947 +        flags = shadow_l1e_get_flags(*sl1e);
 15.3948 +        if ( (flags & _PAGE_PRESENT) 
 15.3949 +             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
 15.3950 +        {
 15.3951 +            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 15.3952 +            if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
 15.3953 +                /* This breaks us cleanly out of the FOREACH macro */
 15.3954 +                done = 1;
 15.3955 +        }
 15.3956 +    });
 15.3957 +    return done;
 15.3958 +}
 15.3959 +
 15.3960 +/**************************************************************************/
 15.3961 +/* Functions to excise all pointers to shadows from higher-level shadows. */
 15.3962 +
 15.3963 +void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
 15.3964 +/* Blank out a single shadow entry */
 15.3965 +{
 15.3966 +    switch (mfn_to_page(smfn)->count_info & PGC_SH_type_mask) 
 15.3967 +    {
 15.3968 +    case PGC_SH_l1_shadow:
 15.3969 +        shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
 15.3970 +    case PGC_SH_l2_shadow:
 15.3971 +#if GUEST_PAGING_LEVELS == 3
 15.3972 +    case PGC_SH_l2h_shadow:
 15.3973 +#endif
 15.3974 +        shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
 15.3975 +#if GUEST_PAGING_LEVELS >= 3
 15.3976 +    case PGC_SH_l3_shadow:
 15.3977 +        shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
 15.3978 +#if GUEST_PAGING_LEVELS >= 4
 15.3979 +    case PGC_SH_l4_shadow:
 15.3980 +        shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
 15.3981 +#endif
 15.3982 +#endif
 15.3983 +    default: BUG(); /* Called with the wrong kind of shadow. */
 15.3984 +    }
 15.3985 +}
 15.3986 +
 15.3987 +int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
 15.3988 +/* Remove all mappings of this l1 shadow from this l2 shadow */
 15.3989 +{
 15.3990 +    shadow_l2e_t *sl2e;
 15.3991 +    int done = 0;
 15.3992 +    int flags;
 15.3993 +#if GUEST_PAGING_LEVELS != 4
 15.3994 +    int xen_mappings = !shadow_mode_external(v->domain);
 15.3995 +#endif
 15.3996 +    
 15.3997 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings, 
 15.3998 +    {
 15.3999 +        flags = shadow_l2e_get_flags(*sl2e);
 15.4000 +        if ( (flags & _PAGE_PRESENT) 
 15.4001 +             && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
 15.4002 +        {
 15.4003 +            shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
 15.4004 +            if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH_type_mask) == 0 )
 15.4005 +                /* This breaks us cleanly out of the FOREACH macro */
 15.4006 +                done = 1;
 15.4007 +        }
 15.4008 +    });
 15.4009 +    return done;
 15.4010 +}
 15.4011 +
 15.4012 +#if GUEST_PAGING_LEVELS >= 3
 15.4013 +int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
 15.4014 +/* Remove all mappings of this l2 shadow from this l3 shadow */
 15.4015 +{
 15.4016 +    shadow_l3e_t *sl3e;
 15.4017 +    int done = 0;
 15.4018 +    int flags;
 15.4019 +    
 15.4020 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done, 
 15.4021 +    {
 15.4022 +        flags = shadow_l3e_get_flags(*sl3e);
 15.4023 +        if ( (flags & _PAGE_PRESENT) 
 15.4024 +             && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
 15.4025 +        {
 15.4026 +            shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
 15.4027 +            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) == 0 )
 15.4028 +                /* This breaks us cleanly out of the FOREACH macro */
 15.4029 +                done = 1;
 15.4030 +        }
 15.4031 +    });
 15.4032 +    return done;
 15.4033 +}
 15.4034 +
 15.4035 +#if GUEST_PAGING_LEVELS >= 4
 15.4036 +int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
 15.4037 +/* Remove all mappings of this l3 shadow from this l4 shadow */
 15.4038 +{
 15.4039 +    shadow_l4e_t *sl4e;
 15.4040 +    int done = 0;
 15.4041 +    int flags, xen_mappings = !shadow_mode_external(v->domain);
 15.4042 +    
 15.4043 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
 15.4044 +    {
 15.4045 +        flags = shadow_l4e_get_flags(*sl4e);
 15.4046 +        if ( (flags & _PAGE_PRESENT) 
 15.4047 +             && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
 15.4048 +        {
 15.4049 +            shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
 15.4050 +            if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH_type_mask) == 0 )
 15.4051 +                /* This breaks us cleanly out of the FOREACH macro */
 15.4052 +                done = 1;
 15.4053 +        }
 15.4054 +    });
 15.4055 +    return done;
 15.4056 +}
 15.4057 +#endif /* 64bit guest */ 
 15.4058 +#endif /* PAE guest */
 15.4059 +
 15.4060 +/**************************************************************************/
 15.4061 +/* Handling HVM guest writes to pagetables  */
 15.4062 +
 15.4063 +/* Check that the user is allowed to perform this write. 
 15.4064 + * Returns a mapped pointer to write to, and the mfn it's on,
 15.4065 + * or NULL for error. */
 15.4066 +static inline void * emulate_map_dest(struct vcpu *v,
 15.4067 +                                      unsigned long vaddr,
 15.4068 +                                      struct x86_emulate_ctxt *ctxt,
 15.4069 +                                      mfn_t *mfnp)
 15.4070 +{
 15.4071 +    walk_t gw;
 15.4072 +    u32 flags;
 15.4073 +    gfn_t gfn;
 15.4074 +    mfn_t mfn;
 15.4075 +
 15.4076 +    guest_walk_tables(v, vaddr, &gw, 1);
 15.4077 +    flags = accumulate_guest_flags(&gw);
 15.4078 +    gfn = guest_l1e_get_gfn(gw.eff_l1e);
 15.4079 +    mfn = vcpu_gfn_to_mfn(v, gfn);
 15.4080 +    sh_audit_gw(v, &gw);
 15.4081 +    unmap_walk(v, &gw);
 15.4082 +
 15.4083 +    if ( !(flags & _PAGE_PRESENT) 
 15.4084 +         || !(flags & _PAGE_RW) 
 15.4085 +         || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
 15.4086 +    {
 15.4087 +        /* This write would have faulted even on bare metal */
 15.4088 +        v->arch.shadow.propagate_fault = 1;
 15.4089 +        return NULL;
 15.4090 +    }
 15.4091 +    
 15.4092 +    if ( !valid_mfn(mfn) )
 15.4093 +    {
 15.4094 +        /* Attempted a write to a bad gfn.  This should never happen:
 15.4095 +         * after all, we're here because this write is to a page table. */
 15.4096 +        BUG();
 15.4097 +    }
 15.4098 +
 15.4099 +    ASSERT(sh_mfn_is_a_page_table(mfn));
 15.4100 +    *mfnp = mfn;
 15.4101 +    return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
 15.4102 +}
 15.4103 +
 15.4104 +int
 15.4105 +sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
 15.4106 +                      u32 bytes, struct x86_emulate_ctxt *ctxt)
 15.4107 +{
 15.4108 +    ASSERT(shadow_lock_is_acquired(v->domain));
 15.4109 +    while ( bytes > 0 )
 15.4110 +    {
 15.4111 +        mfn_t mfn;
 15.4112 +        int bytes_on_page;
 15.4113 +        void *addr;
 15.4114 +
 15.4115 +        bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
 15.4116 +        if ( bytes_on_page > bytes )
 15.4117 +            bytes_on_page = bytes;
 15.4118 +
 15.4119 +        if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
 15.4120 +            return X86EMUL_PROPAGATE_FAULT;
 15.4121 +        memcpy(addr, src, bytes_on_page);
 15.4122 +        shadow_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
 15.4123 +        bytes -= bytes_on_page;
 15.4124 +        /* If we are writing zeros to this page, might want to unshadow */
 15.4125 +        if ( *(u8 *)addr == 0 )
 15.4126 +            check_for_early_unshadow(v, mfn);
 15.4127 +        sh_unmap_domain_page(addr);
 15.4128 +    }
 15.4129 +    shadow_audit_tables(v);
 15.4130 +    return X86EMUL_CONTINUE;
 15.4131 +}
 15.4132 +
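The loop above carves an emulated write into per-page chunks via bytes_on_page; a minimal standalone sketch of that chunking arithmetic follows. The chunk() helper, the EX_PAGE_* constants and the sample call are assumptions for illustration, and the sketch advances the address explicitly between chunks.

#include <stdio.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

/* Hypothetical helper: show how a write of 'bytes' starting at 'vaddr'
 * splits at page boundaries, mirroring the bytes_on_page computation. */
static void chunk(unsigned long vaddr, unsigned long bytes)
{
    while ( bytes > 0 )
    {
        unsigned long on_page = EX_PAGE_SIZE - (vaddr & ~EX_PAGE_MASK);
        if ( on_page > bytes )
            on_page = bytes;
        printf("write %lu byte(s) at %#lx\n", on_page, vaddr);
        vaddr += on_page;    /* step to the start of the next page */
        bytes -= on_page;
    }
}

int main(void)
{
    /* An 8-byte write starting 6 bytes before a page boundary splits 6 + 2. */
    chunk(0xffa, 8);
    return 0;
}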
 15.4133 +int
 15.4134 +sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, 
 15.4135 +                        unsigned long old, unsigned long new,
 15.4136 +                        unsigned int bytes, struct x86_emulate_ctxt *ctxt)
 15.4137 +{
 15.4138 +    mfn_t mfn;
 15.4139 +    void *addr;
 15.4140 +    unsigned long prev;
 15.4141 +    int rv = X86EMUL_CONTINUE;
 15.4142 +
 15.4143 +    ASSERT(shadow_lock_is_acquired(v->domain));
 15.4144 +    ASSERT(bytes <= sizeof (unsigned long));
 15.4145 +
 15.4146 +    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
 15.4147 +        return X86EMUL_PROPAGATE_FAULT;
 15.4148 +
 15.4149 +    switch (bytes) 
 15.4150 +    {
 15.4151 +    case 1: prev = cmpxchg(((u8 *)addr), old, new);  break;
 15.4152 +    case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
 15.4153 +    case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
 15.4154 +    case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
 15.4155 +    default:
 15.4156 +        SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes);
 15.4157 +        prev = ~old;
 15.4158 +    }
 15.4159 +
 15.4160 +    if ( (prev == old)  )
 15.4161 +        shadow_validate_guest_pt_write(v, mfn, addr, bytes);
 15.4162 +    else
 15.4163 +        rv = X86EMUL_CMPXCHG_FAILED;
 15.4164 +
 15.4165 +    SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
 15.4166 +                  " wanted %#lx now %#lx bytes %u\n",
 15.4167 +                  vaddr, prev, old, new, *(unsigned long *)addr, bytes);
 15.4168 +
 15.4169 +    /* If we are writing zeros to this page, might want to unshadow */
 15.4170 +    if ( *(u8 *)addr == 0 )
 15.4171 +        check_for_early_unshadow(v, mfn);
 15.4172 +
 15.4173 +    sh_unmap_domain_page(addr);
 15.4174 +    shadow_audit_tables(v);
 15.4175 +    check_for_early_unshadow(v, mfn);
 15.4176 +    return rv;
 15.4177 +}
 15.4178 +
 15.4179 +int
 15.4180 +sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr, 
 15.4181 +                          unsigned long old_lo, unsigned long old_hi,
 15.4182 +                          unsigned long new_lo, unsigned long new_hi,
 15.4183 +                          struct x86_emulate_ctxt *ctxt)
 15.4184 +{
 15.4185 +    mfn_t mfn;
 15.4186 +    void *addr;
 15.4187 +    u64 old, new, prev;
 15.4188 +    int rv = X86EMUL_CONTINUE;
 15.4189 +
 15.4190 +    ASSERT(shadow_lock_is_acquired(v->domain));
 15.4191 +
 15.4192 +    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
 15.4193 +        return X86EMUL_PROPAGATE_FAULT;
 15.4194 +
 15.4195 +    old = (((u64) old_hi) << 32) | (u64) old_lo;
 15.4196 +    new = (((u64) new_hi) << 32) | (u64) new_lo;
 15.4197 +    prev = cmpxchg(((u64 *)addr), old, new);
 15.4198 +
 15.4199 +    if ( (prev == old)  )
 15.4200 +        shadow_validate_guest_pt_write(v, mfn, addr, 8);
 15.4201 +    else
 15.4202 +        rv = X86EMUL_CMPXCHG_FAILED;
 15.4203 +
 15.4204 +    /* If we are writing zeros to this page, might want to unshadow */
 15.4205 +    if ( *(u8 *)addr == 0 )
 15.4206 +        check_for_early_unshadow(v, mfn);
 15.4207 +
 15.4208 +    sh_unmap_domain_page(addr);
 15.4209 +    shadow_audit_tables(v);
 15.4210 +    check_for_early_unshadow(v, mfn);
 15.4211 +    return rv;
 15.4212 +}
 15.4213 +
 15.4214 +
 15.4215 +/**************************************************************************/
 15.4216 +/* Audit tools */
 15.4217 +
 15.4218 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 15.4219 +
 15.4220 +#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
 15.4221 +    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
 15.4222 +           "gl" #_level "mfn = %" SH_PRI_mfn                              \
 15.4223 +           " sl" #_level "mfn = %" SH_PRI_mfn                             \
 15.4224 +           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
 15.4225 +           " gl" #_level "e = %" SH_PRI_gpte                              \
 15.4226 +           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
 15.4227 +           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
 15.4228 +           _level, guest_index(gl ## _level ## e),                         \
 15.4229 +           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
 15.4230 +           gl ## _level ## e, sl ## _level ## e,                           \
 15.4231 +           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
 15.4232 +           ##_a);                                                          \
 15.4233 +    BUG();                                                                 \
 15.4234 +    done = 1;                                                              \
 15.4235 +} while (0)
 15.4236 +
 15.4237 +
 15.4238 +static char * sh_audit_flags(struct vcpu *v, int level,
 15.4239 +                              int gflags, int sflags) 
 15.4240 +/* Common code for auditing flag bits */
 15.4241 +{
 15.4242 +    if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
 15.4243 +        return "shadow is present but guest is not present";
 15.4244 +    if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) ) 
 15.4245 +        return "global bit set in PV shadow";
 15.4246 +    if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
 15.4247 +         && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) ) 
 15.4248 +        return "dirty bit not propagated";
 15.4249 +    if ( level == 2 && (sflags & _PAGE_PSE) )
 15.4250 +        return "PS bit set in shadow";
 15.4251 +#if SHADOW_PAGING_LEVELS == 3
 15.4252 +    if ( level == 3 ) return NULL; /* All the other bits are blank in PAE l3 */
 15.4253 +#endif
 15.4254 +    if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) 
 15.4255 +        return "user/supervisor bit does not match";
 15.4256 +    if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) 
 15.4257 +        return "NX bit does not match";
 15.4258 +    if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) 
 15.4259 +        return "shadow grants write access but guest does not";
 15.4260 +    if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) ) 
 15.4261 +        return "accessed bit not propagated";
 15.4262 +    return NULL;
 15.4263 +}
 15.4264 +
 15.4265 +static inline mfn_t
 15.4266 +audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
 15.4267 +/* Convert this gfn to an mfn in the manner appropriate for the
 15.4268 + * guest pagetable it's used in (gmfn) */ 
 15.4269 +{
 15.4270 +    if ( !shadow_mode_translate(v->domain) )
 15.4271 +        return _mfn(gfn_x(gfn));
 15.4272 +    
 15.4273 +    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
 15.4274 +         != PGT_writable_page ) 
 15.4275 +        return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
 15.4276 +    else 
 15.4277 +        return sh_gfn_to_mfn(v->domain, gfn_x(gfn));
 15.4278 +} 
 15.4279 +
 15.4280 +
 15.4281 +int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
 15.4282 +{
 15.4283 +    guest_l1e_t *gl1e, *gp;
 15.4284 +    shadow_l1e_t *sl1e;
 15.4285 +    mfn_t mfn, gmfn, gl1mfn;
 15.4286 +    gfn_t gfn;
 15.4287 +    char *s;
 15.4288 +    int done = 0;
 15.4289 +
 15.4290 +    /* Follow the backpointer */
 15.4291 +    gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
 15.4292 +    gl1e = gp = sh_map_domain_page(gl1mfn);
 15.4293 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 15.4294 +
 15.4295 +        s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
 15.4296 +                            shadow_l1e_get_flags(*sl1e));
 15.4297 +        if ( s ) AUDIT_FAIL(1, "%s", s);
 15.4298 +
 15.4299 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 15.4300 +        {
 15.4301 +            gfn = guest_l1e_get_gfn(*gl1e);
 15.4302 +            mfn = shadow_l1e_get_mfn(*sl1e);
 15.4303 +            gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
 15.4304 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 15.4305 +                AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
 15.4306 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 15.4307 +                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
 15.4308 +        }
 15.4309 +    });
 15.4310 +    sh_unmap_domain_page(gp);
 15.4311 +    return done;
 15.4312 +}
 15.4313 +
 15.4314 +int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
 15.4315 +{
 15.4316 +    guest_l1e_t *gl1e, e;
 15.4317 +    shadow_l1e_t *sl1e;
 15.4318 +    mfn_t gl1mfn = _mfn(INVALID_MFN);
 15.4319 +    int f;
 15.4320 +    int done = 0;
 15.4321 +
 15.4322 +    /* fl1 has no useful backpointer: all we can check are flags */
 15.4323 +    e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
 15.4324 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
 15.4325 +        f = shadow_l1e_get_flags(*sl1e);
 15.4326 +        f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
 15.4327 +        if ( !(f == 0 
 15.4328 +               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
 15.4329 +                        _PAGE_ACCESSED|_PAGE_DIRTY) 
 15.4330 +               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
 15.4331 +            AUDIT_FAIL(1, "fl1e has bad flags");
 15.4332 +    });
 15.4333 +    return 0;
 15.4334 +}
 15.4335 +
 15.4336 +int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
 15.4337 +{
 15.4338 +    guest_l2e_t *gl2e, *gp;
 15.4339 +    shadow_l2e_t *sl2e;
 15.4340 +    mfn_t mfn, gmfn, gl2mfn;
 15.4341 +    gfn_t gfn;
 15.4342 +    char *s;
 15.4343 +    int done = 0;
 15.4344 +#if GUEST_PAGING_LEVELS != 4
 15.4345 +    int xen_mappings = !shadow_mode_external(v->domain);
 15.4346 +#endif
 15.4347 +
 15.4348 +    /* Follow the backpointer */
 15.4349 +    gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
 15.4350 +    gl2e = gp = sh_map_domain_page(gl2mfn);
 15.4351 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
 15.4352 +
 15.4353 +        s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
 15.4354 +                            shadow_l2e_get_flags(*sl2e));
 15.4355 +        if ( s ) AUDIT_FAIL(2, "%s", s);
 15.4356 +
 15.4357 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 15.4358 +        {
 15.4359 +            gfn = guest_l2e_get_gfn(*gl2e);
 15.4360 +            mfn = shadow_l2e_get_mfn(*sl2e);
 15.4361 +            gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)  
 15.4362 +                ? get_fl1_shadow_status(v, gfn)
 15.4363 +                : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), 
 15.4364 +                                    PGC_SH_l1_shadow);
 15.4365 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 15.4366 +                AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
 15.4367 +                           " (--> %" SH_PRI_mfn ")"
 15.4368 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 15.4369 +                           gfn_x(gfn), 
 15.4370 +                           (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
 15.4371 +                           : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
 15.4372 +                           mfn_x(gmfn), mfn_x(mfn));
 15.4373 +        }
 15.4374 +    });
 15.4375 +    sh_unmap_domain_page(gp);
 15.4376 +    return 0;
 15.4377 +}
 15.4378 +
 15.4379 +#if GUEST_PAGING_LEVELS >= 3
 15.4380 +int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
 15.4381 +{
 15.4382 +    guest_l3e_t *gl3e, *gp;
 15.4383 +    shadow_l3e_t *sl3e;
 15.4384 +    mfn_t mfn, gmfn, gl3mfn;
 15.4385 +    gfn_t gfn;
 15.4386 +    char *s;
 15.4387 +    int done = 0;
 15.4388 +
 15.4389 +    /* Follow the backpointer */
 15.4390 +    gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
 15.4391 +    gl3e = gp = sh_map_domain_page(gl3mfn);
 15.4392 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
 15.4393 +
 15.4394 +        s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
 15.4395 +                            shadow_l3e_get_flags(*sl3e));
 15.4396 +        if ( s ) AUDIT_FAIL(3, "%s", s);
 15.4397 +
 15.4398 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 15.4399 +        {
 15.4400 +            gfn = guest_l3e_get_gfn(*gl3e);
 15.4401 +            mfn = shadow_l3e_get_mfn(*sl3e);
 15.4402 +            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), 
 15.4403 +                                     (GUEST_PAGING_LEVELS == 3 
 15.4404 +                                      && !shadow_mode_external(v->domain)
 15.4405 +                                      && (guest_index(gl3e) % 4) == 3)
 15.4406 +                                     ? PGC_SH_l2h_pae_shadow
 15.4407 +                                     : PGC_SH_l2_shadow);
 15.4408 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 15.4409 +                AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
 15.4410 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 15.4411 +                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
 15.4412 +        }
 15.4413 +    });
 15.4414 +    sh_unmap_domain_page(gp);
 15.4415 +    return 0;
 15.4416 +}
 15.4417 +#endif /* GUEST_PAGING_LEVELS >= 3 */
 15.4418 +
 15.4419 +#if GUEST_PAGING_LEVELS >= 4
 15.4420 +int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
 15.4421 +{
 15.4422 +    guest_l4e_t *gl4e, *gp;
 15.4423 +    shadow_l4e_t *sl4e;
 15.4424 +    mfn_t mfn, gmfn, gl4mfn;
 15.4425 +    gfn_t gfn;
 15.4426 +    char *s;
 15.4427 +    int done = 0;
 15.4428 +    int xen_mappings = !shadow_mode_external(v->domain);
 15.4429 +
 15.4430 +    /* Follow the backpointer */
 15.4431 +    gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
 15.4432 +    gl4e = gp = sh_map_domain_page(gl4mfn);
 15.4433 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
 15.4434 +    {
 15.4435 +        s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
 15.4436 +                            shadow_l4e_get_flags(*sl4e));
 15.4437 +        if ( s ) AUDIT_FAIL(4, "%s", s);
 15.4438 +
 15.4439 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 15.4440 +        {
 15.4441 +            gfn = guest_l4e_get_gfn(*gl4e);
 15.4442 +            mfn = shadow_l4e_get_mfn(*sl4e);
 15.4443 +            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), 
 15.4444 +                                     PGC_SH_l3_shadow);
 15.4445 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 15.4446 +                AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
 15.4447 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 15.4448 +                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
 15.4449 +        }
 15.4450 +    });
 15.4451 +    sh_unmap_domain_page(gp);
 15.4452 +    return 0;
 15.4453 +}
 15.4454 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 15.4455 +
 15.4456 +
 15.4457 +#undef AUDIT_FAIL
 15.4458 +
 15.4459 +#endif /* Audit code */
 15.4460 +
 15.4461 +/**************************************************************************/
 15.4462 +/* Entry points into this mode of the shadow code.
 15.4463 + * This will all be mangled by the preprocessor to uniquify everything. */
 15.4464 +struct shadow_paging_mode sh_paging_mode = {
 15.4465 +    .page_fault             = sh_page_fault, 
 15.4466 +    .invlpg                 = sh_invlpg,
 15.4467 +    .gva_to_gpa             = sh_gva_to_gpa,
 15.4468 +    .gva_to_gfn             = sh_gva_to_gfn,
 15.4469 +    .update_cr3             = sh_update_cr3,
 15.4470 +    .map_and_validate_gl1e  = sh_map_and_validate_gl1e,
 15.4471 +    .map_and_validate_gl2e  = sh_map_and_validate_gl2e,
 15.4472 +    .map_and_validate_gl2he = sh_map_and_validate_gl2he,
 15.4473 +    .map_and_validate_gl3e  = sh_map_and_validate_gl3e,
 15.4474 +    .map_and_validate_gl4e  = sh_map_and_validate_gl4e,
 15.4475 +    .detach_old_tables      = sh_detach_old_tables,
 15.4476 +    .x86_emulate_write      = sh_x86_emulate_write,
 15.4477 +    .x86_emulate_cmpxchg    = sh_x86_emulate_cmpxchg,
 15.4478 +    .x86_emulate_cmpxchg8b  = sh_x86_emulate_cmpxchg8b,
 15.4479 +    .make_monitor_table     = sh_make_monitor_table,
 15.4480 +    .destroy_monitor_table  = sh_destroy_monitor_table,
 15.4481 +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 15.4482 +    .guess_wrmap            = sh_guess_wrmap,
 15.4483 +#endif
 15.4484 +    .guest_levels           = GUEST_PAGING_LEVELS,
 15.4485 +    .shadow_levels          = SHADOW_PAGING_LEVELS,
 15.4486 +};
 15.4487 +
 15.4488 +/*
 15.4489 + * Local variables:
 15.4490 + * mode: C
 15.4491 + * c-set-style: "BSD"
 15.4492 + * c-basic-offset: 4
 15.4493 + * indent-tabs-mode: nil
 15.4494 + * End: 
 15.4495 + */
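
The sh_paging_mode table above is the whole external interface of one compiled instance of this file. It is built once per (shadow levels, guest levels) combination, and the SHADOW_INTERNAL_NAME token-pasting rule defined in private.h (later in this changeset) renames each instance so they can all be linked into one hypervisor. A minimal illustration of what the declaration in multi.h becomes for the 3-on-3 build (no new symbols here, just the macro expansion written out):

    /* multi.h declares, for every instantiation of this source file:        */
    /*                                                                        */
    /*   extern struct shadow_paging_mode                                     */
    /*       SHADOW_INTERNAL_NAME(sh_paging_mode, SHADOW_LEVELS, GUEST_LEVELS); */
    /*                                                                        */
    /* With SHADOW_LEVELS == 3 and GUEST_LEVELS == 3 the name-mangling rule   */
    /* in private.h turns that into:                                          */

    extern struct shadow_paging_mode sh_paging_mode__shadow_3_guest_3;
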
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/xen/arch/x86/mm/shadow/multi.h	Mon Aug 28 12:09:36 2006 +0100
    16.3 @@ -0,0 +1,116 @@
    16.4 +/******************************************************************************
    16.5 + * arch/x86/mm/shadow/multi.h
    16.6 + *
    16.7 + * Shadow declarations which will be multiply compiled.
    16.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    16.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   16.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   16.11 + *
   16.12 + * This program is free software; you can redistribute it and/or modify
   16.13 + * it under the terms of the GNU General Public License as published by
   16.14 + * the Free Software Foundation; either version 2 of the License, or
   16.15 + * (at your option) any later version.
   16.16 + *
   16.17 + * This program is distributed in the hope that it will be useful,
   16.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   16.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   16.20 + * GNU General Public License for more details.
   16.21 + *
   16.22 + * You should have received a copy of the GNU General Public License
   16.23 + * along with this program; if not, write to the Free Software
   16.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   16.25 + */
   16.26 +
   16.27 +extern int 
   16.28 +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)(
   16.29 +    struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size);
   16.30 +extern int 
   16.31 +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)(
   16.32 +    struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
   16.33 +extern int 
   16.34 +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)(
   16.35 +    struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
   16.36 +extern int 
   16.37 +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)(
   16.38 +    struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size);
   16.39 +extern int 
   16.40 +SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)(
   16.41 +    struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size);
   16.42 +
   16.43 +extern void 
   16.44 +SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
   16.45 +    struct vcpu *v, mfn_t smfn);
   16.46 +extern void 
   16.47 +SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
   16.48 +    struct vcpu *v, mfn_t smfn);
   16.49 +extern void 
   16.50 +SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
   16.51 +    struct vcpu *v, mfn_t smfn);
   16.52 +extern void 
   16.53 +SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
   16.54 +    struct vcpu *v, mfn_t smfn);
   16.55 +
   16.56 +extern void
   16.57 +SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows, 3, 3)
   16.58 +    (struct vcpu *v, mfn_t smfn);
   16.59 +
   16.60 +extern void 
   16.61 +SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
   16.62 +    (struct vcpu *v, mfn_t sl2mfn);
   16.63 +extern void 
   16.64 +SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS)
   16.65 +    (struct vcpu *v, mfn_t sl3mfn);
   16.66 +extern void 
   16.67 +SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
   16.68 +    (struct vcpu *v, mfn_t sl4mfn);
   16.69 +
   16.70 +extern int
   16.71 +SHADOW_INTERNAL_NAME(sh_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS)
   16.72 +    (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn);
   16.73 +extern int
   16.74 +SHADOW_INTERNAL_NAME(sh_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS)
   16.75 +    (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn);
   16.76 +
   16.77 +extern void
   16.78 +SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS)
   16.79 +    (struct vcpu *v, void *ep, mfn_t smfn);
   16.80 +
   16.81 +extern int
   16.82 +SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)
   16.83 +    (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn);
   16.84 +extern int
   16.85 +SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)
   16.86 +    (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn);
   16.87 +extern int
   16.88 +SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)
   16.89 +    (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn);
   16.90 +
   16.91 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
   16.92 +int 
   16.93 +SHADOW_INTERNAL_NAME(sh_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS)
   16.94 +    (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
   16.95 +int 
   16.96 +SHADOW_INTERNAL_NAME(sh_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS)
   16.97 +    (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
   16.98 +int 
   16.99 +SHADOW_INTERNAL_NAME(sh_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS)
  16.100 +    (struct vcpu *v, mfn_t sl2mfn, mfn_t x);
  16.101 +int 
  16.102 +SHADOW_INTERNAL_NAME(sh_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS)
  16.103 +    (struct vcpu *v, mfn_t sl3mfn, mfn_t x);
  16.104 +int 
  16.105 +SHADOW_INTERNAL_NAME(sh_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS)
  16.106 +    (struct vcpu *v, mfn_t sl4mfn, mfn_t x);
  16.107 +#endif
  16.108 +
  16.109 +#if SHADOW_LEVELS == GUEST_LEVELS
  16.110 +extern mfn_t
  16.111 +SHADOW_INTERNAL_NAME(sh_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
  16.112 +    (struct vcpu *v);
  16.113 +extern void
  16.114 +SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
  16.115 +    (struct vcpu *v, mfn_t mmfn);
  16.116 +#endif
  16.117 +
  16.118 +extern struct shadow_paging_mode 
  16.119 +SHADOW_INTERNAL_NAME(sh_paging_mode, SHADOW_LEVELS, GUEST_LEVELS);
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/xen/arch/x86/mm/shadow/page-guest32.h	Mon Aug 28 12:09:36 2006 +0100
    17.3 @@ -0,0 +1,105 @@
    17.4 +
    17.5 +#ifndef __X86_PAGE_GUEST_H__
    17.6 +#define __X86_PAGE_GUEST_H__
    17.7 +
    17.8 +#ifndef __ASSEMBLY__
    17.9 +# include <asm/types.h>
   17.10 +#endif
   17.11 +
   17.12 +#define PAGETABLE_ORDER_32         10
   17.13 +#define L1_PAGETABLE_ENTRIES_32    (1<<PAGETABLE_ORDER_32)
   17.14 +#define L2_PAGETABLE_ENTRIES_32    (1<<PAGETABLE_ORDER_32)
   17.15 +#define ROOT_PAGETABLE_ENTRIES_32  L2_PAGETABLE_ENTRIES_32
   17.16 +
   17.17 +
   17.18 +#define L1_PAGETABLE_SHIFT_32 12
   17.19 +#define L2_PAGETABLE_SHIFT_32 22
   17.20 +
   17.21 +/* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
   17.22 +
   17.23 +#ifndef __ASSEMBLY__
   17.24 +
   17.25 +typedef u32 intpte_32_t;
   17.26 +
   17.27 +typedef struct { intpte_32_t l1; } l1_pgentry_32_t;
   17.28 +typedef struct { intpte_32_t l2; } l2_pgentry_32_t;
   17.29 +typedef l2_pgentry_t root_pgentry_32_t;
   17.30 +#endif
   17.31 +
   17.32 +#define get_pte_flags_32(x) ((u32)(x) & 0xFFF)
   17.33 +#define put_pte_flags_32(x) ((intpte_32_t)(x))
   17.34 +
   17.35 +/* Get pte access flags (unsigned int). */
   17.36 +#define l1e_get_flags_32(x)           (get_pte_flags_32((x).l1))
   17.37 +#define l2e_get_flags_32(x)           (get_pte_flags_32((x).l2))
   17.38 +
   17.39 +#define l1e_get_paddr_32(x)           \
   17.40 +    ((paddr_t)(((x).l1 & (PADDR_MASK&PAGE_MASK))))
   17.41 +#define l2e_get_paddr_32(x)           \
   17.42 +    ((paddr_t)(((x).l2 & (PADDR_MASK&PAGE_MASK))))
   17.43 +
   17.44 +/* Construct an empty pte. */
   17.45 +#define l1e_empty_32()                ((l1_pgentry_32_t) { 0 })
   17.46 +#define l2e_empty_32()                ((l2_pgentry_32_t) { 0 })
   17.47 +
   17.48 +/* Construct a pte from a pfn and access flags. */
   17.49 +#define l1e_from_pfn_32(pfn, flags)   \
   17.50 +    ((l1_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
   17.51 +#define l2e_from_pfn_32(pfn, flags)   \
   17.52 +    ((l2_pgentry_32_t) { ((intpte_32_t)(pfn) << PAGE_SHIFT) | put_pte_flags_32(flags) })
   17.53 +
   17.54 +/* Construct a pte from a physical address and access flags. */
   17.55 +#ifndef __ASSEMBLY__
   17.56 +static inline l1_pgentry_32_t l1e_from_paddr_32(paddr_t pa, unsigned int flags)
   17.57 +{
   17.58 +    ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
   17.59 +    return (l1_pgentry_32_t) { pa | put_pte_flags_32(flags) };
   17.60 +}
   17.61 +static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags)
   17.62 +{
   17.63 +    ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0);
   17.64 +    return (l2_pgentry_32_t) { pa | put_pte_flags_32(flags) };
   17.65 +}
   17.66 +#endif /* !__ASSEMBLY__ */
   17.67 +
   17.68 +
   17.69 +/* Construct a pte from a page pointer and access flags. */
   17.70 +#define l1e_from_page_32(page, flags) (l1e_from_pfn_32(page_to_mfn(page),(flags)))
   17.71 +#define l2e_from_page_32(page, flags) (l2e_from_pfn_32(page_to_mfn(page),(flags)))
   17.72 +
   17.73 +/* Add extra flags to an existing pte. */
   17.74 +#define l1e_add_flags_32(x, flags)    ((x).l1 |= put_pte_flags_32(flags))
   17.75 +#define l2e_add_flags_32(x, flags)    ((x).l2 |= put_pte_flags_32(flags))
   17.76 +
   17.77 +/* Remove flags from an existing pte. */
   17.78 +#define l1e_remove_flags_32(x, flags) ((x).l1 &= ~put_pte_flags_32(flags))
   17.79 +#define l2e_remove_flags_32(x, flags) ((x).l2 &= ~put_pte_flags_32(flags))
   17.80 +
   17.81 +/* Check if a pte's page mapping or significant access flags have changed. */
   17.82 +#define l1e_has_changed_32(x,y,flags) \
   17.83 +    ( !!(((x).l1 ^ (y).l1) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
   17.84 +#define l2e_has_changed_32(x,y,flags) \
   17.85 +    ( !!(((x).l2 ^ (y).l2) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags_32(flags))) )
   17.86 +
   17.87 +/* Given a virtual address, get an entry offset into a page table. */
   17.88 +#define l1_table_offset_32(a)         \
   17.89 +    (((a) >> L1_PAGETABLE_SHIFT_32) & (L1_PAGETABLE_ENTRIES_32 - 1))
   17.90 +#define l2_table_offset_32(a)         \
   17.91 +    (((a) >> L2_PAGETABLE_SHIFT_32) & (L2_PAGETABLE_ENTRIES_32 - 1))
   17.92 +
   17.93 +#define linear_l1_table_32                                                 \
   17.94 +    ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
   17.95 +
   17.96 +#define linear_pg_table_32 linear_l1_table_32
   17.97 +
   17.98 +#endif /* __X86_PAGE_GUEST_H__ */
   17.99 +
  17.100 +/*
  17.101 + * Local variables:
  17.102 + * mode: C
  17.103 + * c-set-style: "BSD"
  17.104 + * c-basic-offset: 4
  17.105 + * tab-width: 4
  17.106 + * indent-tabs-mode: nil
  17.107 + * End:
  17.108 + */
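
As a quick worked example of the table-offset macros defined in this header: the constants below mirror L1_PAGETABLE_SHIFT_32, L2_PAGETABLE_SHIFT_32 and the 1024-entry tables, and the sample address is arbitrary (a standalone sketch, not hypervisor code):

    #include <stdio.h>
    #include <stdint.h>

    #define L1_SHIFT_32 12      /* L1_PAGETABLE_SHIFT_32           */
    #define L2_SHIFT_32 22      /* L2_PAGETABLE_SHIFT_32           */
    #define ENTRIES_32  1024    /* 1 << PAGETABLE_ORDER_32         */

    int main(void)
    {
        uint32_t va = 0xC0100000u;  /* arbitrary sample guest virtual address */

        /* Same computation as l2_table_offset_32() / l1_table_offset_32(). */
        uint32_t l2 = (va >> L2_SHIFT_32) & (ENTRIES_32 - 1);
        uint32_t l1 = (va >> L1_SHIFT_32) & (ENTRIES_32 - 1);

        /* Prints: va 0xc0100000 -> L2 index 0x300, L1 index 0x100 */
        printf("va 0x%08x -> L2 index 0x%03x, L1 index 0x%03x\n", va, l2, l1);
        return 0;
    }
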
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/xen/arch/x86/mm/shadow/private.h	Mon Aug 28 12:09:36 2006 +0100
    18.3 @@ -0,0 +1,593 @@
    18.4 +/******************************************************************************
    18.5 + * arch/x86/mm/shadow/private.h
    18.6 + *
    18.7 + * Shadow code that is private, and does not need to be multiply compiled.
    18.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    18.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   18.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   18.11 + *
   18.12 + * This program is free software; you can redistribute it and/or modify
   18.13 + * it under the terms of the GNU General Public License as published by
   18.14 + * the Free Software Foundation; either version 2 of the License, or
   18.15 + * (at your option) any later version.
   18.16 + *
   18.17 + * This program is distributed in the hope that it will be useful,
   18.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   18.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   18.20 + * GNU General Public License for more details.
   18.21 + *
   18.22 + * You should have received a copy of the GNU General Public License
   18.23 + * along with this program; if not, write to the Free Software
   18.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   18.25 + */
   18.26 +
   18.27 +#ifndef _XEN_SHADOW_PRIVATE_H
   18.28 +#define _XEN_SHADOW_PRIVATE_H
   18.29 +
   18.30 +// In order to override the definition of mfn_to_page, we make sure page.h has
   18.31 +// been included...
   18.32 +#include <asm/page.h>
   18.33 +#include <xen/domain_page.h>
   18.34 +#include <asm/x86_emulate.h>
   18.35 +#include <asm/hvm/support.h>
   18.36 +
   18.37 +
   18.38 +/******************************************************************************
   18.39 + * Definitions for the use of the "available" bits in the shadow PTEs.
   18.40 + *
   18.41 + * Review of the low 12 bits of a shadow page table entry:
   18.42 + *
   18.43 + *         in a guest:                      in a shadow:
   18.44 + * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
   18.45 + * Bit 10: _PAGE_AVAIL1                     _PAGE_SHADOW_RW ("SW" below)
   18.46 + * Bit  9: _PAGE_AVAIL0                     _PAGE_SHADOW_PRESENT ("SP" below)
   18.47 + * Bit  8: _PAGE_GLOBAL                     _PAGE_SHADOW_MMIO ("MMIO" below),
   18.48 + *                                          aka _PAGE_SHADOW_GUEST_NOT_PRESENT
   18.49 + * Bit  7: _PAGE_PSE, aka _PAGE_PAT
   18.50 + * Bit  6: _PAGE_DIRTY
   18.51 + * Bit  5: _PAGE_ACCESSED
   18.52 + * Bit  4: _PAGE_PCD
   18.53 + * Bit  3: _PAGE_PWT
   18.54 + * Bit  2: _PAGE_USER
   18.55 + * Bit  1: _PAGE_RW ("GW" below)
   18.56 + * Bit  0: _PAGE_PRESENT ("GP" below)
   18.57 + *
   18.58 + * Given a guest entry, as shown below, we can expect the following in the
   18.59 + * corresponding shadow entry:
   18.60 + *
   18.61 + * Guest entry  Shadow entry      Commentary
   18.62 + * -----------  ----------------  ---------------------------------------------
   18.63 + *       Maps     
   18.64 + * GP GW  IO    GP SP GW SW MMIO 
   18.65 + * -- -- ----   -- -- -- -- ----
   18.66 + *  -  -   -     0  0  0  0   0   The guest entry has not yet been shadowed.
   18.67 + *  0  -   -     0  0  0  0   1   The guest entry is marked not-present.
   18.68 + *  1  1  no     ?  1  ?  1   0   Writable entry in the guest.
   18.69 + *  1  0  no     ?  1  0  0   0   Read-only entry in the guest.
   18.70 + *  1  1  yes    0  1  ?  1   1   Writable MMIO mapping in the guest.
   18.71 + *  1  0  yes    0  1  0  0   1   Read-only MMIO mapping in the guest.
   18.72 + *
   18.73 + * Normally, we would expect GP=1 in the guest to imply GP=1 in the
   18.74 + * shadow, and similarly for GW=1.  However, various functionality that may be
   18.75 + * implemented via the shadow can cause GP or GW to be cleared in such cases.
   18.76 + * A & D bit emulation is a prime example of such functionality.
   18.77 + *
   18.78 + * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
   18.79 + * entry will always be zero, too.
   18.80 + *
   18.81 + * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests.  It is
   18.82 + * currently available for random (ab)use in shadow entries.
   18.83 + *
   18.84 + * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
   18.85 + * but currently there is no benefit, as the guest's TLB is flushed on every
   18.86 + * transition of CR3 anyway due to the HVM exit/re-entry.
   18.87 + *
   18.88 + * In shadow entries in which _PAGE_SHADOW_PRESENT is set, bit 8 is used
   18.89 + * as the _PAGE_SHADOW_MMIO bit.  In such entries, if _PAGE_SHADOW_MMIO is
   18.90 + * set, then the entry contains the *gfn* directly from the corresponding
   18.91 + * guest entry (not an mfn!!).
   18.92 + *
   18.93 + * Bit 7 is set in a guest L2 to signify a superpage entry.  The current
   18.94 + * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
   18.95 + * resulting shadow L1 table is called an FL1.  Note that there is no guest
   18.96 + * page that corresponds to an FL1.
   18.97 + *
   18.98 + * Bit 7 in a guest L1 is the PAT2 bit.  Currently we do not support PAT in
   18.99 + * this shadow code.
  18.100 + *
  18.101 + * Bit 6 is the dirty bit.
  18.102 + *
  18.103 + * Bit 5 is the accessed bit.
  18.104 + *
  18.105 + * Bit 4 is the cache disable bit.  If set in a guest, the hardware is
  18.106 + * supposed to refuse to cache anything found via this entry.  It can be set
  18.107 + * in an L4e, L3e, L2e, or L1e.  This shadow code currently does not support
  18.108 + * cache disable bits.  They are silently ignored.
  18.109 + *
  18.110 + * Bit 4 in a guest L1 is also the PAT1 bit.  Currently we do not support PAT
  18.111 + * in this shadow code.
  18.112 + *
  18.113 + * Bit 3 is the cache write-thru bit.  If set in a guest, the hardware is
  18.114 + * supposed to use write-thru instead of write-back caching for anything found
  18.115 + * via this entry.  It can be set in an L4e, L3e, L2e, or L1e.  This shadow
  18.116 + * code currently does not support cache write-thru bits.  They are silently
  18.117 + * ignored.
  18.118 + *
  18.119 + * Bit 3 in a guest L1 is also the PAT0 bit.  Currently we do not support PAT
  18.120 + * in this shadow code.
  18.121 + *
  18.122 + * Bit 2 is the user bit.
  18.123 + *
  18.124 + * Bit 1 is the read-write bit.
  18.125 + *
  18.126 + * Bit 0 is the present bit.
  18.127 + */
  18.128 +
  18.129 +// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by
  18.130 +// the appropriate shadow rules.
  18.131 +#define _PAGE_SHADOW_RW                 _PAGE_AVAIL1
  18.132 +
  18.133 +// Copy of the _PAGE_PRESENT bit from the guest's PTE
  18.134 +#define _PAGE_SHADOW_PRESENT            _PAGE_AVAIL0
  18.135 +
  18.136 +// The matching guest entry maps MMIO space
  18.137 +#define _PAGE_SHADOW_MMIO               _PAGE_GLOBAL
  18.138 +
  18.139 +// Shadow flags value used when the guest is not present
  18.140 +#define _PAGE_SHADOW_GUEST_NOT_PRESENT  _PAGE_GLOBAL
  18.141 +
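
To make the table above concrete, here is a hedged sketch of how the documented bit assignments could be read back out of a shadow entry's flag word; the SHF_* copies of the bit positions and the helper names are illustrative only and are not part of this patch:

    /* Bit positions as documented above (illustrative local copies). */
    #define SHF_PRESENT         (1u << 0)   /* _PAGE_PRESENT        */
    #define SHF_RW              (1u << 1)   /* _PAGE_RW             */
    #define SHF_SHADOW_MMIO     (1u << 8)   /* _PAGE_SHADOW_MMIO    */
    #define SHF_SHADOW_PRESENT  (1u << 9)   /* _PAGE_SHADOW_PRESENT */
    #define SHF_SHADOW_RW       (1u << 10)  /* _PAGE_SHADOW_RW      */

    /* All-zero flags: the guest entry has not been shadowed yet (first row). */
    static inline int sh_flags_unshadowed(unsigned int flags)
    {
        return flags == 0;
    }

    /* Shadow-present plus MMIO bit: one of the two MMIO rows of the table. */
    static inline int sh_flags_mmio(unsigned int flags)
    {
        return (flags & SHF_SHADOW_PRESENT) && (flags & SHF_SHADOW_MMIO);
    }

    /* The guest asked for a writable mapping, even if _PAGE_RW has been
     * cleared in the shadow (e.g. while emulating the dirty bit).        */
    static inline int sh_flags_guest_writable(unsigned int flags)
    {
        return (flags & SHF_SHADOW_PRESENT) && (flags & SHF_SHADOW_RW);
    }
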
  18.142 +
  18.143 +/******************************************************************************
  18.144 + * Debug and error-message output
  18.145 + */
  18.146 +#define SHADOW_PRINTK(_f, _a...)                                     \
  18.147 +    debugtrace_printk("sh: %s(): " _f, __func__, ##_a)
  18.148 +#define SHADOW_ERROR(_f, _a...)                                      \
  18.149 +    printk("sh error: %s(): " _f, __func__, ##_a)
  18.150 +#define SHADOW_DEBUG(flag, _f, _a...)                                \
  18.151 +    do {                                                              \
  18.152 +        if (SHADOW_DEBUG_ ## flag)                                   \
  18.153 +            debugtrace_printk("shdebug: %s(): " _f, __func__, ##_a); \
  18.154 +    } while (0)
  18.155 +
  18.156 +// The flags for use with SHADOW_DEBUG:
  18.157 +#define SHADOW_DEBUG_PROPAGATE         0
  18.158 +#define SHADOW_DEBUG_MAKE_SHADOW       0
  18.159 +#define SHADOW_DEBUG_DESTROY_SHADOW    0
  18.160 +#define SHADOW_DEBUG_P2M               0
  18.161 +#define SHADOW_DEBUG_A_AND_D           0
  18.162 +#define SHADOW_DEBUG_EMULATE           0
  18.163 +#define SHADOW_DEBUG_LOGDIRTY          1
  18.164 +
  18.165 +
  18.166 +/******************************************************************************
  18.167 + * Auditing routines 
  18.168 + */
  18.169 +
  18.170 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
  18.171 +extern void shadow_audit_tables(struct vcpu *v);
  18.172 +#else
  18.173 +#define shadow_audit_tables(_v) do {} while(0)
  18.174 +#endif
  18.175 +
  18.176 +#if SHADOW_AUDIT & SHADOW_AUDIT_P2M
  18.177 +extern void shadow_audit_p2m(struct domain *d);
  18.178 +#else
  18.179 +#define shadow_audit_p2m(_d) do {} while(0)
  18.180 +#endif
  18.181 +
  18.182 +
  18.183 +/******************************************************************************
  18.184 + * Mechanism for double-checking the optimized pagefault path: this
  18.185 + * structure contains a record of actions taken by the fault handling
  18.186 + * code.  In paranoid mode, the fast-path code fills out one of these
  18.187 + * structures (but doesn't take any actual action) and then the normal 
  18.188 + * path fills in another.  When the fault handler finishes, the 
  18.189 + * two are compared */
  18.190 +
  18.191 +#ifdef SHADOW_OPTIMIZATION_PARANOIA
  18.192 +
  18.193 +typedef struct shadow_action_log sh_log_t;
  18.194 +struct shadow_action_log {
  18.195 +    paddr_t ad[CONFIG_PAGING_LEVELS];  /* A & D bits propagated here */
  18.196 +    paddr_t mmio;                      /* Address of an mmio operation */
  18.197 +    int rv;                            /* Result of the fault handler */
  18.198 +};
  18.199 +
  18.200 +/* There are two logs, one for the fast path, one for the normal path */
  18.201 +enum sh_log_type { log_slow = 0, log_fast = 1 };
  18.202 +
  18.203 +/* Alloc and zero the logs */
  18.204 +static inline void sh_init_log(struct vcpu *v) 
  18.205 +{
  18.206 +    if ( unlikely(!v->arch.shadow.action_log) ) 
  18.207 +        v->arch.shadow.action_log = xmalloc_array(sh_log_t, 2);
  18.208 +    ASSERT(v->arch.shadow.action_log);
  18.209 +    memset(v->arch.shadow.action_log, 0, 2 * sizeof (sh_log_t));
  18.210 +}
  18.211 +
  18.212 +/* Log an A&D-bit update */
  18.213 +static inline void sh_log_ad(struct vcpu *v, paddr_t e, unsigned int level)
  18.214 +{
  18.215 +    v->arch.shadow.action_log[v->arch.shadow.action_index].ad[level] = e;
  18.216 +}
  18.217 +
  18.218 +/* Log an MMIO address */
  18.219 +static inline void sh_log_mmio(struct vcpu *v, paddr_t m)
  18.220 +{
  18.221 +    v->arch.shadow.action_log[v->arch.shadow.action_index].mmio = m;
  18.222 +}
  18.223 +
  18.224 +/* Log the result */
  18.225 +static inline void sh_log_rv(struct vcpu *v, int rv)
  18.226 +{
  18.227 +    v->arch.shadow.action_log[v->arch.shadow.action_index].rv = rv;
  18.228 +}
  18.229 +
  18.230 +/* Set which mode we're in */
  18.231 +static inline void sh_set_log_mode(struct vcpu *v, enum sh_log_type t) 
  18.232 +{
  18.233 +    v->arch.shadow.action_index = t;
  18.234 +}
  18.235 +
  18.236 +/* Let callers know not to take action: we're only checking the mechanism */
  18.237 +static inline int sh_take_no_action(struct vcpu *v) 
  18.238 +{
  18.239 +    return (v->arch.shadow.action_index == log_fast);
  18.240 +}
  18.241 +
  18.242 +#else /* Non-paranoid mode: these logs do not exist */
  18.243 +
  18.244 +#define sh_init_log(_v) do { (void)(_v); } while(0)
  18.245 +#define sh_set_log_mode(_v,_t) do { (void)(_v); } while(0)
  18.246 +#define sh_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0)
  18.247 +#define sh_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0)
  18.248 +#define sh_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0)
  18.249 +#define sh_take_no_action(_v) (((void)(_v)), 0)
  18.250 +
  18.251 +#endif /* SHADOW_OPTIMIZATION_PARANOIA */
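
Putting the pieces above together, the intended flow is roughly the one sketched below: the fast path only records what it would have done, the normal path then does the real work, and the two logs are compared afterwards. The fast_path()/slow_path() helpers and the final comparison are placeholders, since the actual callers are not part of this hunk:

    /* Illustrative only: fast_path()/slow_path() stand in for the real
     * page-fault code, which calls sh_log_ad()/sh_log_mmio()/sh_log_rv()
     * and consults sh_take_no_action() while in log_fast mode.           */
    static int illustrative_double_check(struct vcpu *v)
    {
        int rv;

        sh_init_log(v);                 /* allocate and zero both logs    */

        sh_set_log_mode(v, log_fast);   /* record only, take no action    */
        fast_path(v);

        sh_set_log_mode(v, log_slow);   /* the authoritative path         */
        rv = slow_path(v);

        /* Any mismatch means the optimized path would have diverged.     */
        ASSERT(memcmp(&v->arch.shadow.action_log[log_fast],
                      &v->arch.shadow.action_log[log_slow],
                      sizeof(sh_log_t)) == 0);
        return rv;
    }
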
  18.252 +
  18.253 +
  18.254 +/******************************************************************************
  18.255 + * Macro for dealing with the naming of the internal names of the
  18.256 + * shadow code's external entry points.
  18.257 + */
  18.258 +#define SHADOW_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \
  18.259 +    name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels
  18.260 +#define SHADOW_INTERNAL_NAME(name, shadow_levels, guest_levels) \
  18.261 +    SHADOW_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels)
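
For example (this is just the token-pasting rule above written out, plus the usual reason for the two-level indirection):

    /* SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
     * expands, via SHADOW_INTERNAL_NAME_HIDDEN, to
     *     sh_map_and_validate_gl1e__shadow_3_guest_2
     * The extra _HIDDEN level ensures SHADOW_LEVELS and GUEST_LEVELS are
     * macro-expanded to their numeric values before ## pastes them.      */
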
  18.262 +
  18.263 +#if CONFIG_PAGING_LEVELS == 2
  18.264 +#define GUEST_LEVELS  2
  18.265 +#define SHADOW_LEVELS 2
  18.266 +#include "multi.h"
  18.267 +#undef GUEST_LEVELS
  18.268 +#undef SHADOW_LEVELS
  18.269 +#endif /* CONFIG_PAGING_LEVELS == 2 */
  18.270 +
  18.271 +#if CONFIG_PAGING_LEVELS == 3
  18.272 +#define GUEST_LEVELS  2
  18.273 +#define SHADOW_LEVELS 3
  18.274 +#include "multi.h"
  18.275 +#undef GUEST_LEVELS
  18.276 +#undef SHADOW_LEVELS
  18.277 +
  18.278 +#define GUEST_LEVELS  3
  18.279 +#define SHADOW_LEVELS 3
  18.280 +#include "multi.h"
  18.281 +#undef GUEST_LEVELS
  18.282 +#undef SHADOW_LEVELS
  18.283 +#endif /* CONFIG_PAGING_LEVELS == 3 */
  18.284 +
  18.285 +#if CONFIG_PAGING_LEVELS == 4
  18.286 +#define GUEST_LEVELS  2
  18.287 +#define SHADOW_LEVELS 3
  18.288 +#include "multi.h"
  18.289 +#undef GUEST_LEVELS
  18.290 +#undef SHADOW_LEVELS
  18.291 +
  18.292 +#define GUEST_LEVELS  3
  18.293 +#define SHADOW_LEVELS 3
  18.294 +#include "multi.h"
  18.295 +#undef GUEST_LEVELS
  18.296 +#undef SHADOW_LEVELS
  18.297 +
  18.298 +#define GUEST_LEVELS  3
  18.299 +#define SHADOW_LEVELS 4
  18.300 +#include "multi.h"
  18.301 +#undef GUEST_LEVELS
  18.302