direct-io.hg

changeset 12533:2fd223c64fc6

[XEN] Pin l3 shadows of older x86_64 linux guests.
Older x86_64 linux kernels use one l4 table per cpu and context switch by
changing an l4 entry to point at a different l3 table. If we're shadowing
such guests we need to pin their l3 shadows to stop them being torn down
on every context switch. (But don't do this for normal 64bit guests.)
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Thu Nov 23 17:46:52 2006 +0000 (2006-11-23)
parents 47a8bb3cd123
children cd40792968cb
files xen/arch/x86/domain.c xen/arch/x86/mm/shadow/common.c xen/arch/x86/mm/shadow/multi.c xen/arch/x86/mm/shadow/private.h xen/include/asm-x86/domain.h xen/include/asm-x86/shadow.h
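
To see why the pin is needed, here is a minimal sketch of the refcounting
involved, under simplifying assumptions: the struct is reduced to a bare
count plus a pinned flag, the pinned-shadows list is only noted in comments,
and the names merely echo sh_get_ref/sh_put_ref/sh_pin/sh_unpin from the
hunks below; this is not the Xen code itself.

    /* Simplified model of shadow refcounting and pinning (assumed types). */
    #include <assert.h>

    struct shadow {
        unsigned int count;   /* refs from shadow pagetable entries etc.    */
        int pinned;           /* extra ref held via the pinned-shadows list */
    };

    int sh_get_ref(struct shadow *s)
    {
        s->count++;           /* the real code also guards against overflow */
        return 1;
    }

    void sh_put_ref(struct shadow *s)
    {
        assert(s->count > 0);
        s->count--;
        /* when the count hits zero the shadow page is torn down and freed */
    }

    int sh_pin(struct shadow *s)
    {
        if ( !s->pinned )
        {
            if ( !sh_get_ref(s) )
                return 0;
            s->pinned = 1;    /* real code: also put the shadow at the head
                               * of d->arch.shadow.pinned_shadows */
        }
        return 1;
    }

    void sh_unpin(struct shadow *s)
    {
        if ( s->pinned )
        {
            s->pinned = 0;
            sh_put_ref(s);
        }
    }

    int main(void)
    {
        struct shadow l3 = { 0, 0 };

        sh_get_ref(&l3);   /* the guest's per-cpu l4 slot references the l3 */
        sh_pin(&l3);       /* this changeset: pin the l3 shadow as well     */

        sh_put_ref(&l3);   /* context switch: the guest rewrites the l4 slot.
                            * Without the pin this was the last reference,
                            * so the l3 shadow would be freed and rebuilt on
                            * every switch. */

        sh_unpin(&l3);     /* dropped if the guest turns out not to be an
                            * old linux kernel, or at teardown */
        return 0;
    }

An l3 shadow referenced only by the single l4 slot the guest keeps rewriting
would hit refcount zero on every switch; the pin is the extra reference that
the diff below threads through shadow_set_l4e(), sh_type_is_pinnable() and
the pinned_shadows list.
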
line diff
     1.1 --- a/xen/arch/x86/domain.c	Thu Nov 23 17:44:12 2006 +0000
     1.2 +++ b/xen/arch/x86/domain.c	Thu Nov 23 17:46:52 2006 +0000
     1.3 @@ -219,7 +219,7 @@ int arch_domain_create(struct domain *d)
     1.4          INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
     1.5      INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
     1.6      INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
     1.7 -    INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows);
     1.8 +    INIT_LIST_HEAD(&d->arch.shadow.pinned_shadows);
     1.9  
    1.10      if ( !is_idle_domain(d) )
    1.11      {
     2.1 --- a/xen/arch/x86/mm/shadow/common.c	Thu Nov 23 17:44:12 2006 +0000
     2.2 +++ b/xen/arch/x86/mm/shadow/common.c	Thu Nov 23 17:46:52 2006 +0000
     2.3 @@ -495,6 +495,7 @@ void shadow_prealloc(struct domain *d, u
     2.4      struct shadow_page_info *sp;
     2.5      cpumask_t flushmask = CPU_MASK_NONE;
     2.6      mfn_t smfn;
     2.7 +    int i;
     2.8  
     2.9      if ( chunk_is_available(d, order) ) return; 
    2.10      
    2.11 @@ -503,9 +504,9 @@ void shadow_prealloc(struct domain *d, u
    2.12          v = d->vcpu[0];
    2.13      ASSERT(v != NULL);
    2.14  
    2.15 -    /* Stage one: walk the list of top-level pages, unpinning them */
    2.16 +    /* Stage one: walk the list of pinned pages, unpinning them */
    2.17      perfc_incrc(shadow_prealloc_1);
    2.18 -    list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
    2.19 +    list_for_each_backwards_safe(l, t, &d->arch.shadow.pinned_shadows)
    2.20      {
    2.21          sp = list_entry(l, struct shadow_page_info, list);
    2.22          smfn = shadow_page_to_mfn(sp);
    2.23 @@ -521,31 +522,24 @@ void shadow_prealloc(struct domain *d, u
    2.24       * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
    2.25       * mappings. */
    2.26      perfc_incrc(shadow_prealloc_2);
    2.27 -    list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
    2.28 -    {
    2.29 -        sp = list_entry(l, struct shadow_page_info, list);
    2.30 -        smfn = shadow_page_to_mfn(sp);
    2.31 -        shadow_unhook_mappings(v, smfn);
    2.32 -
    2.33 -        /* Remember to flush TLBs: we have removed shadow entries that 
    2.34 -         * were in use by some vcpu(s). */
    2.35 -        for_each_vcpu(d, v2) 
    2.36 +
    2.37 +    for_each_vcpu(d, v2) 
    2.38 +        for ( i = 0 ; i < 4 ; i++ )
    2.39          {
    2.40 -            if ( pagetable_get_pfn(v2->arch.shadow_table[0]) == mfn_x(smfn)
    2.41 -                 || pagetable_get_pfn(v2->arch.shadow_table[1]) == mfn_x(smfn)
    2.42 -                 || pagetable_get_pfn(v2->arch.shadow_table[2]) == mfn_x(smfn) 
    2.43 -                 || pagetable_get_pfn(v2->arch.shadow_table[3]) == mfn_x(smfn)
    2.44 -                )
    2.45 +            if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
    2.46 +            {
    2.47 +                shadow_unhook_mappings(v, 
    2.48 +                               pagetable_get_mfn(v2->arch.shadow_table[i]));
    2.49                  cpus_or(flushmask, v2->vcpu_dirty_cpumask, flushmask);
    2.50 +
    2.51 +                /* See if that freed up a chunk of appropriate size */
    2.52 +                if ( chunk_is_available(d, order) ) 
    2.53 +                {
    2.54 +                    flush_tlb_mask(flushmask);
    2.55 +                    return;
    2.56 +                }
    2.57 +            }
    2.58          }
    2.59 -
    2.60 -        /* See if that freed up a chunk of appropriate size */
    2.61 -        if ( chunk_is_available(d, order) ) 
    2.62 -        {
    2.63 -            flush_tlb_mask(flushmask);
    2.64 -            return;
    2.65 -        }
    2.66 -    }
    2.67      
    2.68      /* Nothing more we can do: all remaining shadows are of pages that
    2.69       * hold Xen mappings for some vcpu.  This can never happen. */
    2.70 @@ -558,52 +552,57 @@ void shadow_prealloc(struct domain *d, u
    2.71      BUG();
    2.72  }
    2.73  
    2.74 -#ifndef NDEBUG
    2.75 -/* Deliberately free all the memory we can: this can be used to cause the
    2.76 - * guest's pagetables to be re-shadowed if we suspect that the shadows
    2.77 - * have somehow got out of sync */
    2.78 -static void shadow_blow_tables(unsigned char c)
    2.79 +/* Deliberately free all the memory we can: this will tear down all of
    2.80 + * this domain's shadows */
    2.81 +static void shadow_blow_tables(struct domain *d) 
    2.82  {
    2.83      struct list_head *l, *t;
    2.84      struct shadow_page_info *sp;
    2.85 -    struct domain *d;
    2.86 -    struct vcpu *v;
    2.87 +    struct vcpu *v = d->vcpu[0];
    2.88      mfn_t smfn;
    2.89 -
    2.90 +    int i;
    2.91 +    
    2.92 +    /* Pass one: unpin all pinned pages */
    2.93 +    list_for_each_backwards_safe(l,t, &d->arch.shadow.pinned_shadows)
    2.94 +    {
    2.95 +        sp = list_entry(l, struct shadow_page_info, list);
    2.96 +        smfn = shadow_page_to_mfn(sp);
    2.97 +        sh_unpin(v, smfn);
    2.98 +    }
    2.99 +        
   2.100 +    /* Second pass: unhook entries of in-use shadows */
   2.101 +    for_each_vcpu(d, v) 
   2.102 +        for ( i = 0 ; i < 4 ; i++ )
   2.103 +            if ( !pagetable_is_null(v->arch.shadow_table[i]) )
   2.104 +                shadow_unhook_mappings(v, 
   2.105 +                               pagetable_get_mfn(v->arch.shadow_table[i]));
   2.106 +
   2.107 +    /* Make sure everyone sees the unshadowings */
   2.108 +    flush_tlb_mask(d->domain_dirty_cpumask);
   2.109 +}
   2.110 +
   2.111 +
   2.112 +#ifndef NDEBUG
   2.113 +/* Blow all shadows of all shadowed domains: this can be used to cause the
   2.114 + * guest's pagetables to be re-shadowed if we suspect that the shadows
   2.115 + * have somehow got out of sync */
   2.116 +static void shadow_blow_all_tables(unsigned char c)
   2.117 +{
   2.118 +    struct domain *d;
   2.119 +    printk("'%c' pressed -> blowing all shadow tables\n", c);
   2.120      for_each_domain(d)
   2.121 -    {
   2.122 -        if ( shadow_mode_enabled(d) && (v = d->vcpu[0]) != NULL)
   2.123 +        if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
   2.124          {
   2.125              shadow_lock(d);
   2.126 -            printk("Blowing shadow tables for domain %u\n", d->domain_id);
   2.127 -
   2.128 -            /* Pass one: unpin all top-level pages */
   2.129 -            list_for_each_backwards_safe(l,t, &d->arch.shadow.toplevel_shadows)
   2.130 -            {
   2.131 -                sp = list_entry(l, struct shadow_page_info, list);
   2.132 -                smfn = shadow_page_to_mfn(sp);
   2.133 -                sh_unpin(v, smfn);
   2.134 -            }
   2.135 -
   2.136 -            /* Second pass: unhook entries of in-use shadows */
   2.137 -            list_for_each_backwards_safe(l,t, &d->arch.shadow.toplevel_shadows)
   2.138 -            {
   2.139 -                sp = list_entry(l, struct shadow_page_info, list);
   2.140 -                smfn = shadow_page_to_mfn(sp);
   2.141 -                shadow_unhook_mappings(v, smfn);
   2.142 -            }
   2.143 -            
   2.144 -            /* Make sure everyone sees the unshadowings */
   2.145 -            flush_tlb_mask(d->domain_dirty_cpumask);
   2.146 +            shadow_blow_tables(d);
   2.147              shadow_unlock(d);
   2.148          }
   2.149 -    }
   2.150  }
   2.151  
   2.152  /* Register this function in the Xen console keypress table */
   2.153  static __init int shadow_blow_tables_keyhandler_init(void)
   2.154  {
   2.155 -    register_keyhandler('S', shadow_blow_tables, "reset shadow pagetables");
   2.156 +    register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
   2.157      return 0;
   2.158  }
   2.159  __initcall(shadow_blow_tables_keyhandler_init);
   2.160 @@ -789,9 +788,9 @@ mfn_t
   2.161  shadow_alloc_p2m_page(struct domain *d)
   2.162  {
   2.163      struct list_head *entry;
   2.164 +    struct page_info *pg;
   2.165      mfn_t mfn;
   2.166      void *p;
   2.167 -    int ok;
   2.168  
   2.169      if ( list_empty(&d->arch.shadow.p2m_freelist) &&
   2.170           !shadow_alloc_p2m_pages(d) )
   2.171 @@ -799,9 +798,9 @@ shadow_alloc_p2m_page(struct domain *d)
   2.172      entry = d->arch.shadow.p2m_freelist.next;
   2.173      list_del(entry);
   2.174      list_add_tail(entry, &d->arch.shadow.p2m_inuse);
   2.175 -    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
   2.176 -    ok = sh_get_ref(mfn, 0);
   2.177 -    ASSERT(ok); /* First sh_get_ref() can't possibly overflow */
   2.178 +    pg = list_entry(entry, struct page_info, list);
   2.179 +    pg->count_info = 1;
   2.180 +    mfn = page_to_mfn(pg);
   2.181      p = sh_map_domain_page(mfn);
   2.182      clear_page(p);
   2.183      sh_unmap_domain_page(p);
   2.184 @@ -2067,37 +2066,32 @@ void sh_remove_shadows(struct vcpu *v, m
   2.185       * This call to hash_foreach() looks dangerous but is in fact OK: each
   2.186       * call will remove at most one shadow, and terminate immediately when
   2.187       * it does remove it, so we never walk the hash after doing a deletion.  */
   2.188 -#define DO_UNSHADOW(_type) do {                                 \
   2.189 -    t = (_type);                                                \
   2.190 -    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);               \
   2.191 -    if ( !sh_remove_shadow_via_pointer(v, smfn) && !fast )      \
   2.192 -        hash_foreach(v, masks[t], callbacks, smfn);             \
   2.193 -} while (0)
   2.194 -
   2.195 -    /* Top-level shadows need to be unpinned */
   2.196 -#define DO_UNPIN(_type) do {                            \
   2.197 +#define DO_UNSHADOW(_type) do {                         \
   2.198      t = (_type);                                        \
   2.199      smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);       \
   2.200 -    if ( mfn_to_shadow_page(smfn)->pinned )             \
   2.201 +    if ( sh_type_is_pinnable(v, t) )                    \
   2.202          sh_unpin(v, smfn);                              \
   2.203 +    else                                                \
   2.204 +        sh_remove_shadow_via_pointer(v, smfn);          \
   2.205 +    if ( (pg->count_info & PGC_page_table) && !fast )   \
   2.206 +        hash_foreach(v, masks[t], callbacks, smfn);     \
   2.207  } while (0)
   2.208  
   2.209      if ( sh_flags & SHF_L1_32 )   DO_UNSHADOW(SH_type_l1_32_shadow);
   2.210 -    if ( sh_flags & SHF_L2_32 )   DO_UNPIN(SH_type_l2_32_shadow);
   2.211 +    if ( sh_flags & SHF_L2_32 )   DO_UNSHADOW(SH_type_l2_32_shadow);
   2.212  #if CONFIG_PAGING_LEVELS >= 3
   2.213      if ( sh_flags & SHF_L1_PAE )  DO_UNSHADOW(SH_type_l1_pae_shadow);
   2.214 -    if ( sh_flags & SHF_L2_PAE )  DO_UNPIN(SH_type_l2_pae_shadow);
   2.215 -    if ( sh_flags & SHF_L2H_PAE ) DO_UNPIN(SH_type_l2h_pae_shadow);
   2.216 +    if ( sh_flags & SHF_L2_PAE )  DO_UNSHADOW(SH_type_l2_pae_shadow);
   2.217 +    if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(SH_type_l2h_pae_shadow);
   2.218  #if CONFIG_PAGING_LEVELS >= 4
   2.219      if ( sh_flags & SHF_L1_64 )   DO_UNSHADOW(SH_type_l1_64_shadow);
   2.220      if ( sh_flags & SHF_L2_64 )   DO_UNSHADOW(SH_type_l2_64_shadow);
   2.221      if ( sh_flags & SHF_L3_64 )   DO_UNSHADOW(SH_type_l3_64_shadow);
   2.222 -    if ( sh_flags & SHF_L4_64 )   DO_UNPIN(SH_type_l4_64_shadow);
   2.223 +    if ( sh_flags & SHF_L4_64 )   DO_UNSHADOW(SH_type_l4_64_shadow);
   2.224  #endif
   2.225  #endif
   2.226  
   2.227  #undef DO_UNSHADOW
   2.228 -#undef DO_UNPIN
   2.229  
   2.230      /* If that didn't catch the shadows, something is wrong */
   2.231      if ( !fast && (pg->count_info & PGC_page_table) )
   2.232 @@ -2393,6 +2387,12 @@ int shadow_enable(struct domain *d, u32 
   2.233              goto out;
   2.234          }
   2.235  
   2.236 +#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) 
   2.237 +    /* We assume we're dealing with an older 64bit linux guest until we 
   2.238 +     * see the guest use more than one l4 per vcpu. */
   2.239 +    d->arch.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
   2.240 +#endif
   2.241 +
   2.242      /* Update the bits */
   2.243      sh_new_mode(d, mode);
   2.244      shadow_audit_p2m(d);
   2.245 @@ -2831,18 +2831,10 @@ static int shadow_log_dirty_op(
   2.246          
   2.247      if ( clean ) 
   2.248      {
   2.249 -        struct list_head *l, *t;
   2.250 -        struct shadow_page_info *sp;
   2.251 -
   2.252          /* Need to revoke write access to the domain's pages again. 
   2.253           * In future, we'll have a less heavy-handed approach to this, 
   2.254           * but for now, we just unshadow everything except Xen. */
   2.255 -        list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
   2.256 -        {
   2.257 -            sp = list_entry(l, struct shadow_page_info, list);
   2.258 -            if ( d->vcpu[0] != NULL )
   2.259 -                shadow_unhook_mappings(d->vcpu[0], shadow_page_to_mfn(sp));
   2.260 -        }
   2.261 +        shadow_blow_tables(d);
   2.262  
   2.263          d->arch.shadow.fault_count = 0;
   2.264          d->arch.shadow.dirty_count = 0;
     3.1 --- a/xen/arch/x86/mm/shadow/multi.c	Thu Nov 23 17:44:12 2006 +0000
     3.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Thu Nov 23 17:46:52 2006 +0000
     3.3 @@ -964,7 +964,7 @@ static int shadow_set_l4e(struct vcpu *v
     3.4                            shadow_l4e_t new_sl4e, 
     3.5                            mfn_t sl4mfn)
     3.6  {
     3.7 -    int flags = 0;
     3.8 +    int flags = 0, ok;
     3.9      shadow_l4e_t old_sl4e;
    3.10      paddr_t paddr;
    3.11      ASSERT(sl4e != NULL);
    3.12 @@ -976,12 +976,19 @@ static int shadow_set_l4e(struct vcpu *v
    3.13               | (((unsigned long)sl4e) & ~PAGE_MASK));
    3.14  
    3.15      if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) 
    3.16 +    {
    3.17          /* About to install a new reference */        
    3.18 -        if ( !sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr) )
    3.19 +        mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
    3.20 +        ok = sh_get_ref(v, sl3mfn, paddr);
    3.21 +        /* Are we pinning l3 shadows to handle weird linux behaviour? */
    3.22 +        if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) )
    3.23 +            ok |= sh_pin(v, sl3mfn);
    3.24 +        if ( !ok )
    3.25          {
    3.26              domain_crash(v->domain);
    3.27              return SHADOW_SET_ERROR;
    3.28          }
    3.29 +    }
    3.30  
    3.31      /* Write the new entry */
    3.32      shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
    3.33 @@ -1020,7 +1027,7 @@ static int shadow_set_l3e(struct vcpu *v
    3.34      
    3.35      if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
    3.36          /* About to install a new reference */        
    3.37 -        if ( !sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr) )
    3.38 +        if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
    3.39          {
    3.40              domain_crash(v->domain);
    3.41              return SHADOW_SET_ERROR;
    3.42 @@ -1076,7 +1083,7 @@ static int shadow_set_l2e(struct vcpu *v
    3.43  
    3.44      if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
    3.45          /* About to install a new reference */
    3.46 -        if ( !sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr) )
    3.47 +        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
    3.48          {
    3.49              domain_crash(v->domain);
    3.50              return SHADOW_SET_ERROR;
    3.51 @@ -1361,8 +1368,6 @@ do {                                    
    3.52  /**************************************************************************/
    3.53  /* Functions to install Xen mappings and linear mappings in shadow pages */
    3.54  
    3.55 -static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
    3.56 -
    3.57  // XXX -- this function should probably be moved to shadow-common.c, but that
    3.58  //        probably wants to wait until the shadow types have been moved from
    3.59  //        shadow-types.h to shadow-private.h
    3.60 @@ -1547,6 +1552,44 @@ sh_make_shadow(struct vcpu *v, mfn_t gmf
    3.61          /* Lower-level shadow, not yet linked from a higher level */
    3.62          mfn_to_shadow_page(smfn)->up = 0;
    3.63  
    3.64 +#if GUEST_PAGING_LEVELS == 4
    3.65 +#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) 
    3.66 +    if ( shadow_type == SH_type_l4_64_shadow &&
    3.67 +         unlikely(v->domain->arch.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) )
    3.68 +    {
    3.69 +        /* We're shadowing a new l4, but we've been assuming the guest uses
    3.70 +         * only one l4 per vcpu and context switches using an l4 entry. 
    3.71 +         * Count the number of active l4 shadows.  If there are enough
    3.72 +         * of them, decide that this isn't an old linux guest, and stop
    3.73 +         * pinning l3es.  This is not very quick but it doesn't happen
    3.74 +         * very often. */
    3.75 +        struct list_head *l, *t;
    3.76 +        struct shadow_page_info *sp;
    3.77 +        struct vcpu *v2;
    3.78 +        int l4count = 0, vcpus = 0;
    3.79 +        list_for_each(l, &v->domain->arch.shadow.pinned_shadows)
    3.80 +        {
    3.81 +            sp = list_entry(l, struct shadow_page_info, list);
    3.82 +            if ( sp->type == SH_type_l4_64_shadow )
    3.83 +                l4count++;
    3.84 +        }
    3.85 +        for_each_vcpu ( v->domain, v2 ) 
    3.86 +            vcpus++;
    3.87 +        if ( l4count > 2 * vcpus ) 
    3.88 +        {
    3.89 +            /* Unpin all the pinned l3 tables, and don't pin any more. */
    3.90 +            list_for_each_safe(l, t, &v->domain->arch.shadow.pinned_shadows)
    3.91 +            {
    3.92 +                sp = list_entry(l, struct shadow_page_info, list);
    3.93 +                if ( sp->type == SH_type_l3_64_shadow )
    3.94 +                    sh_unpin(v, shadow_page_to_mfn(sp));
    3.95 +            }
    3.96 +            v->domain->arch.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
    3.97 +        }
    3.98 +    }
    3.99 +#endif
   3.100 +#endif
   3.101 +
   3.102      // Create the Xen mappings...
   3.103      if ( !shadow_mode_external(v->domain) )
   3.104      {
   3.105 @@ -1893,9 +1936,6 @@ void sh_destroy_l4_shadow(struct vcpu *v
   3.106      gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
   3.107      delete_shadow_status(v, gmfn, t, smfn);
   3.108      shadow_demote(v, gmfn, t);
   3.109 -    /* Take this shadow off the list of root shadows */
   3.110 -    list_del_init(&mfn_to_shadow_page(smfn)->list);
   3.111 -
   3.112      /* Decrement refcounts of all the old entries */
   3.113      xen_mappings = (!shadow_mode_external(v->domain));
   3.114      sl4mfn = smfn; 
   3.115 @@ -1903,8 +1943,8 @@ void sh_destroy_l4_shadow(struct vcpu *v
   3.116          if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
   3.117          {
   3.118              sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
   3.119 -                        (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
   3.120 -                        | ((unsigned long)sl4e & ~PAGE_MASK));
   3.121 +                       (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
   3.122 +                       | ((unsigned long)sl4e & ~PAGE_MASK));
   3.123          }
   3.124      });
   3.125      
   3.126 @@ -1958,10 +1998,6 @@ void sh_destroy_l2_shadow(struct vcpu *v
   3.127      gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
   3.128      delete_shadow_status(v, gmfn, t, smfn);
   3.129      shadow_demote(v, gmfn, t);
   3.130 -#if (GUEST_PAGING_LEVELS == 2) || (GUEST_PAGING_LEVELS == 3)
   3.131 -    /* Take this shadow off the list of root shadows */
   3.132 -    list_del_init(&mfn_to_shadow_page(smfn)->list);
   3.133 -#endif
   3.134  
   3.135      /* Decrement refcounts of all the old entries */
   3.136      sl2mfn = smfn;
   3.137 @@ -3276,13 +3312,7 @@ sh_set_toplevel_shadow(struct vcpu *v,
   3.138  
   3.139      /* Guest mfn is valid: shadow it and install the shadow */
   3.140      smfn = get_shadow_status(v, gmfn, root_type);
   3.141 -    if ( valid_mfn(smfn) )
   3.142 -    {
   3.143 -        /* Pull this root shadow out of the list of roots (we will put
   3.144 -         * it back in at the head). */
   3.145 -        list_del(&mfn_to_shadow_page(smfn)->list);
   3.146 -    }
   3.147 -    else
   3.148 +    if ( !valid_mfn(smfn) )
   3.149      {
   3.150          /* Make sure there's enough free shadow memory. */
   3.151          shadow_prealloc(d, SHADOW_MAX_ORDER); 
   3.152 @@ -3298,17 +3328,15 @@ sh_set_toplevel_shadow(struct vcpu *v,
   3.153  #endif
   3.154  
   3.155      /* Pin the shadow and put it (back) on the list of top-level shadows */
   3.156 -    if ( sh_pin(smfn) )
   3.157 -        list_add(&mfn_to_shadow_page(smfn)->list, 
   3.158 -                 &d->arch.shadow.toplevel_shadows);
   3.159 -    else 
   3.160 +    if ( sh_pin(v, smfn) == 0 )
   3.161      {
   3.162          SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn));
   3.163          domain_crash(v->domain);
   3.164 -    }        
   3.165 -
   3.166 -    /* Take a ref to this page: it will be released in sh_detach_old_tables. */
   3.167 -    if ( !sh_get_ref(smfn, 0) )
   3.168 +    }
   3.169 +
   3.170 +    /* Take a ref to this page: it will be released in sh_detach_old_tables()
   3.171 +     * or the next call to set_toplevel_shadow() */
   3.172 +    if ( !sh_get_ref(v, smfn, 0) )
   3.173      {
   3.174          SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn));
   3.175          domain_crash(v->domain);
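
The l4-counting heuristic added to sh_make_shadow() above can be read in
isolation roughly as follows. This is a condensed sketch with hypothetical
names (check_linux_l3_heuristic and struct dom are not in the patch); the
real code walks d->arch.shadow.pinned_shadows counting SH_type_l4_64_shadow
entries, but the threshold of twice the vcpu count is the same.

    #include <stdbool.h>

    struct dom {
        bool linux_l3_toplevel;          /* SHOPT_LINUX_L3_TOPLEVEL still set? */
        unsigned int nr_vcpus;
        unsigned int nr_pinned_l4_shadows;
    };

    /* Called whenever a new l4 shadow is created for the guest. */
    void check_linux_l3_heuristic(struct dom *d)
    {
        if ( !d->linux_l3_toplevel )
            return;

        /* Old 64-bit linux keeps one l4 per (v)cpu and switches tasks by
         * rewriting an l4 entry, so it never needs many l4 shadows.  Far
         * more l4 shadows than vcpus means the guest switches cr3 instead,
         * and pinning l3 shadows only wastes memory: turn the optimisation
         * off.  (The real code also unpins every pinned l3 shadow here.) */
        if ( d->nr_pinned_l4_shadows > 2 * d->nr_vcpus )
            d->linux_l3_toplevel = false;
    }
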
     4.1 --- a/xen/arch/x86/mm/shadow/private.h	Thu Nov 23 17:44:12 2006 +0000
     4.2 +++ b/xen/arch/x86/mm/shadow/private.h	Thu Nov 23 17:46:52 2006 +0000
     4.3 @@ -157,9 +157,11 @@ struct shadow_page_info
     4.4      } __attribute__((packed));
     4.5      union {
     4.6          /* For unused shadow pages, a list of pages of this order; 
     4.7 -         * for top-level shadows, a list of other top-level shadows */
     4.8 +         * for pinnable shadows, if pinned, a list of other pinned shadows
     4.9 +         * (see sh_type_is_pinnable() below for the definition of 
    4.10 +         * "pinnable" shadow types). */
    4.11          struct list_head list;
    4.12 -        /* For lower-level shadows, a higher entry that points at us */
    4.13 +        /* For non-pinnable shadows, a higher entry that points at us */
    4.14          paddr_t up;
    4.15      };
    4.16  };
    4.17 @@ -195,6 +197,36 @@ static inline void shadow_check_page_str
    4.18  #define SH_type_monitor_table (14U) /* in use as a monitor table */
    4.19  #define SH_type_unused        (15U)
    4.20  
    4.21 +/* 
    4.22 + * What counts as a pinnable shadow?
    4.23 + */
    4.24 +
    4.25 +static inline int sh_type_is_pinnable(struct vcpu *v, unsigned int t) 
    4.26 +{
    4.27 +    /* Top-level shadow types in each mode can be pinned, so that they 
    4.28 +     * persist even when not currently in use in a guest CR3 */
    4.29 +    if ( t == SH_type_l2_32_shadow
    4.30 +         || t == SH_type_l2_pae_shadow
    4.31 +         || t == SH_type_l2h_pae_shadow 
    4.32 +         || t == SH_type_l4_64_shadow )
    4.33 +        return 1;
    4.34 +
    4.35 +#if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) 
    4.36 +    /* Early 64-bit linux used three levels of pagetables for the guest
    4.37 +     * and context switched by changing one l4 entry in a per-cpu l4
    4.38 +     * page.  When we're shadowing those kernels, we have to pin l3
    4.39 +     * shadows so they don't just evaporate on every context switch.
    4.40 +     * For all other guests, we'd rather use the up-pointer field in l3s. */ 
    4.41 +    if ( unlikely((v->domain->arch.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) 
    4.42 +                  && CONFIG_PAGING_LEVELS >= 4
    4.43 +                  && t == SH_type_l3_64_shadow) )
    4.44 +        return 1;
    4.45 +#endif
    4.46 +
    4.47 +    /* Everything else is not pinnable, and can use the "up" pointer */
    4.48 +    return 0;
    4.49 +}
    4.50 +
    4.51  /*
    4.52   * Definitions for the shadow_flags field in page_info.
    4.53   * These flags are stored on *guest* pages...
    4.54 @@ -364,7 +396,7 @@ void sh_destroy_shadow(struct vcpu *v, m
    4.55   * and the physical address of the shadow entry that holds the ref (or zero
    4.56   * if the ref is held by something else).  
    4.57   * Returns 0 for failure, 1 for success. */
    4.58 -static inline int sh_get_ref(mfn_t smfn, paddr_t entry_pa)
    4.59 +static inline int sh_get_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
    4.60  {
    4.61      u32 x, nx;
    4.62      struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
    4.63 @@ -385,7 +417,9 @@ static inline int sh_get_ref(mfn_t smfn,
    4.64      sp->count = nx;
    4.65  
    4.66      /* We remember the first shadow entry that points to each shadow. */
    4.67 -    if ( entry_pa != 0 && sp->up == 0 ) 
    4.68 +    if ( entry_pa != 0 
    4.69 +         && sh_type_is_pinnable(v, sp->type) 
    4.70 +         && sp->up == 0 ) 
    4.71          sp->up = entry_pa;
    4.72      
    4.73      return 1;
    4.74 @@ -403,7 +437,9 @@ static inline void sh_put_ref(struct vcp
    4.75      ASSERT(sp->mbz == 0);
    4.76  
    4.77      /* If this is the entry in the up-pointer, remove it */
    4.78 -    if ( entry_pa != 0 && sp->up == entry_pa ) 
    4.79 +    if ( entry_pa != 0 
    4.80 +         && sh_type_is_pinnable(v, sp->type) 
    4.81 +         && sp->up == entry_pa ) 
    4.82          sp->up = 0;
    4.83  
    4.84      x = sp->count;
    4.85 @@ -424,33 +460,48 @@ static inline void sh_put_ref(struct vcp
    4.86  }
    4.87  
    4.88  
    4.89 -/* Pin a shadow page: take an extra refcount and set the pin bit.
    4.90 +/* Pin a shadow page: take an extra refcount, set the pin bit,
    4.91 + * and put the shadow at the head of the list of pinned shadows.
    4.92   * Returns 0 for failure, 1 for success. */
    4.93 -static inline int sh_pin(mfn_t smfn)
    4.94 +static inline int sh_pin(struct vcpu *v, mfn_t smfn)
    4.95  {
    4.96      struct shadow_page_info *sp;
    4.97      
    4.98      ASSERT(mfn_valid(smfn));
    4.99      sp = mfn_to_shadow_page(smfn);
   4.100 -    if ( !(sp->pinned) ) 
   4.101 +    ASSERT(sh_type_is_pinnable(v, sp->type));
   4.102 +    if ( sp->pinned ) 
   4.103      {
   4.104 -        if ( !sh_get_ref(smfn, 0) )
   4.105 +        /* Already pinned: take it out of the pinned-list so it can go 
   4.106 +         * at the front */
   4.107 +        list_del(&sp->list);
   4.108 +    }
   4.109 +    else
   4.110 +    {
   4.111 +        /* Not pinned: pin it! */
   4.112 +        if ( !sh_get_ref(v, smfn, 0) )
   4.113              return 0;
   4.114          sp->pinned = 1;
   4.115      }
   4.116 +    /* Put it at the head of the list of pinned shadows */
   4.117 +    list_add(&sp->list, &v->domain->arch.shadow.pinned_shadows);
   4.118      return 1;
   4.119  }
   4.120  
   4.121 -/* Unpin a shadow page: unset the pin bit and release the extra ref. */
   4.122 +/* Unpin a shadow page: unset the pin bit, take the shadow off the list
   4.123 + * of pinned shadows, and release the extra ref. */
   4.124  static inline void sh_unpin(struct vcpu *v, mfn_t smfn)
   4.125  {
   4.126      struct shadow_page_info *sp;
   4.127      
   4.128      ASSERT(mfn_valid(smfn));
   4.129      sp = mfn_to_shadow_page(smfn);
   4.130 +    ASSERT(sh_type_is_pinnable(v, sp->type));
   4.131      if ( sp->pinned )
   4.132      {
   4.133          sp->pinned = 0;
   4.134 +        list_del(&sp->list);
   4.135 +        sp->up = 0; /* in case this stops being a pinnable type in future */
   4.136          sh_put_ref(v, smfn, 0);
   4.137      }
   4.138  }
     5.1 --- a/xen/include/asm-x86/domain.h	Thu Nov 23 17:44:12 2006 +0000
     5.2 +++ b/xen/include/asm-x86/domain.h	Thu Nov 23 17:46:52 2006 +0000
     5.3 @@ -65,10 +65,11 @@ struct shadow_domain {
     5.4      struct list_head  freelists[SHADOW_MAX_ORDER + 1]; 
     5.5      struct list_head  p2m_freelist;
     5.6      struct list_head  p2m_inuse;
     5.7 -    struct list_head  toplevel_shadows;
     5.8 +    struct list_head  pinned_shadows;
     5.9      unsigned int      total_pages;  /* number of pages allocated */
    5.10      unsigned int      free_pages;   /* number of pages on freelists */
    5.11      unsigned int      p2m_pages;    /* number of pages in p2m map */
    5.12 +    unsigned int      opt_flags;    /* runtime tunable optimizations on/off */
    5.13  
    5.14      /* Shadow hashtable */
    5.15      struct shadow_page_info **hash_table;
     6.1 --- a/xen/include/asm-x86/shadow.h	Thu Nov 23 17:44:12 2006 +0000
     6.2 +++ b/xen/include/asm-x86/shadow.h	Thu Nov 23 17:46:52 2006 +0000
     6.3 @@ -158,8 +158,9 @@ extern int shadow_audit_enable;
     6.4  #define SHOPT_EARLY_UNSHADOW      0x02  /* Unshadow l1s on fork or exit */
     6.5  #define SHOPT_FAST_FAULT_PATH     0x04  /* Fast-path MMIO and not-present */
     6.6  #define SHOPT_PREFETCH            0x08  /* Shadow multiple entries per fault */
     6.7 +#define SHOPT_LINUX_L3_TOPLEVEL   0x10  /* Pin l3es on early 64bit linux */
     6.8  
     6.9 -#define SHADOW_OPTIMIZATIONS      0x0f
    6.10 +#define SHADOW_OPTIMIZATIONS      0x1f
    6.11  
    6.12  
    6.13  /* With shadow pagetables, the different kinds of address start 
    6.14 @@ -594,24 +595,6 @@ static inline unsigned int shadow_get_al
    6.15              + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
    6.16  }
    6.17  
    6.18 -#if SHADOW_OPTIMIZATIONS & SHOPT_CACHE_WALKS
    6.19 -/* Optimization: cache the results of guest walks.  This helps with MMIO
    6.20 - * and emulated writes, which tend to issue very similar walk requests
    6.21 - * repeatedly.  We keep the results of the last few walks, and blow
    6.22 - * away the cache on guest cr3 write, mode change, or page fault. */
    6.23 -
    6.24 -#define SH_WALK_CACHE_ENTRIES 4
    6.25 -
    6.26 -/* Rather than cache a guest walk, which would include mapped pointers 
    6.27 - * to pages, we cache what a TLB would remember about the walk: the 
    6.28 - * permissions and the l1 gfn */
    6.29 -struct shadow_walk_cache {
    6.30 -    unsigned long va;           /* The virtual address (or 0 == unused) */
    6.31 -    unsigned long gfn;          /* The gfn from the effective l1e   */
    6.32 -    u32 permissions;            /* The aggregated permission bits   */
    6.33 -};
    6.34 -#endif
    6.35 -
    6.36  
    6.37  /**************************************************************************/
    6.38  /* Guest physmap (p2m) support