
changeset 18794:7fb33d15dc9b

x86: Move the guest pagetable walker out of shadow/multi.c

Move the guest PT walker into its own file, and purge it of references
to the rest of the shadow code.

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Nov 13 13:02:08 2008 +0000 (2008-11-13)
parents b87cc4de3ca6
children 48879ca58848
files xen/arch/x86/mm/Makefile xen/arch/x86/mm/guest_walk.c xen/arch/x86/mm/shadow/multi.c xen/include/asm-x86/guest_pt.h xen/include/asm-x86/perfc_defn.h
line diff
     1.1 --- a/xen/arch/x86/mm/Makefile	Thu Nov 13 13:01:22 2008 +0000
     1.2 +++ b/xen/arch/x86/mm/Makefile	Thu Nov 13 13:02:08 2008 +0000
     1.3 @@ -3,3 +3,9 @@ subdir-y += hap
     1.4  
     1.5  obj-y += paging.o
     1.6  obj-y += p2m.o
     1.7 +obj-y += guest_walk_2.o
     1.8 +obj-y += guest_walk_3.o
     1.9 +obj-$(x86_64) += guest_walk_4.o
    1.10 +
    1.11 +guest_walk_%.o: guest_walk.c $(HDRS) Makefile
    1.12 +	$(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
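The pattern rule above compiles guest_walk.c three times, once per value of
GUEST_PAGING_LEVELS (the x86_64-only build adds the 4-level variant); the
GPT_RENAME macro added to guest_pt.h (section 4 below) gives each object a
distinctly named walker so all three can be linked into one hypervisor image.
A minimal standalone sketch of the same build technique, with hypothetical
names (walk_demo.c, LEVELS):

    /* walk_demo.c -- build it twice:
     *   cc -DLEVELS=2 -c walk_demo.c -o walk_2.o
     *   cc -DLEVELS=3 -c walk_demo.c -o walk_3.o
     * Token pasting gives each object a uniquely named function,
     * so both objects can be linked into the same binary. */
    #include <stdio.h>

    #define RENAME2(n, l) n##_##l##_levels
    #define RENAME(n, l)  RENAME2(n, l)   /* extra hop so LEVELS expands */
    #define walk RENAME(walk, LEVELS)

    int walk(void)
    {
        printf("walking with %d levels\n", LEVELS);
        return LEVELS;
    }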
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/mm/guest_walk.c	Thu Nov 13 13:02:08 2008 +0000
     2.3 @@ -0,0 +1,260 @@
     2.4 +/******************************************************************************
     2.5 + * arch/x86/mm/guest_walk.c
     2.6 + *
     2.7 + * Pagetable walker for guest memory accesses.
     2.8 + *
     2.9 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    2.10 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
    2.11 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
    2.12 + *
    2.13 + * This program is free software; you can redistribute it and/or modify
    2.14 + * it under the terms of the GNU General Public License as published by
    2.15 + * the Free Software Foundation; either version 2 of the License, or
    2.16 + * (at your option) any later version.
    2.17 + *
    2.18 + * This program is distributed in the hope that it will be useful,
    2.19 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    2.20 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    2.21 + * GNU General Public License for more details.
    2.22 + *
    2.23 + * You should have received a copy of the GNU General Public License
    2.24 + * along with this program; if not, write to the Free Software
    2.25 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    2.26 + */
    2.27 +
    2.28 +#include <xen/types.h>
    2.29 +#include <xen/mm.h>
    2.30 +#include <xen/paging.h>
    2.31 +#include <xen/domain_page.h>
    2.32 +#include <xen/sched.h>
    2.33 +#include <asm/page.h>
    2.34 +#include <asm/guest_pt.h>
    2.35 +
    2.36 +
    2.37 +/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
    2.38 +static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
    2.39 +{
    2.40 +    static uint32_t flags[] = {
    2.41 +        /* I/F -  Usr Wr */
    2.42 +        /* 0   0   0   0 */ _PAGE_PRESENT, 
    2.43 +        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
    2.44 +        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
    2.45 +        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
    2.46 +        /* 0   1   0   0 */ _PAGE_PRESENT, 
    2.47 +        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
    2.48 +        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
    2.49 +        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
    2.50 +        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
    2.51 +        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
    2.52 +        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
    2.53 +        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
    2.54 +        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
    2.55 +        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
    2.56 +        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
    2.57 +        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
    2.58 +    };
    2.59 +
    2.60 +    /* Don't demand not-NX if the CPU wouldn't enforce it. */
    2.61 +    if ( !guest_supports_nx(v) )
    2.62 +        pfec &= ~PFEC_insn_fetch;
    2.63 +
    2.64 +    /* Don't demand R/W if the CPU wouldn't enforce it. */
    2.65 +    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
    2.66 +         && !(pfec & PFEC_user_mode) )
    2.67 +        pfec &= ~PFEC_write_access;
    2.68 +
    2.69 +    return flags[(pfec & 0x1f) >> 1];
    2.70 +}
    2.71 +
    2.72 +/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
    2.73 + * Returns non-zero if it actually writes to guest memory. */
    2.74 +static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
    2.75 +{
    2.76 +    guest_intpte_t old, new;
    2.77 +
    2.78 +    old = *(guest_intpte_t *)walk_p;
    2.79 +    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    2.80 +    if ( old != new ) 
    2.81 +    {
    2.82 +        /* Write the new entry into the walk, and try to write it back
    2.83 +         * into the guest table as well.  If the guest table has changed
     2.84 +         * under our feet then leave it alone. */
    2.85 +        *(guest_intpte_t *)walk_p = new;
    2.86 +        if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
    2.87 +            return 1;
    2.88 +    }
    2.89 +    return 0;
    2.90 +}
    2.91 +
    2.92 +
    2.93 +/* Walk the guest pagetables, after the manner of a hardware walker. */
    2.94 +uint32_t
    2.95 +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
    2.96 +                  uint32_t pfec, mfn_t top_mfn, void *top_map)
    2.97 +{
    2.98 +    struct domain *d = v->domain;
    2.99 +    p2m_type_t p2mt;
   2.100 +    guest_l1e_t *l1p = NULL;
   2.101 +    guest_l2e_t *l2p = NULL;
   2.102 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   2.103 +    guest_l3e_t *l3p = NULL;
   2.104 +    guest_l4e_t *l4p;
   2.105 +#endif
   2.106 +    uint32_t gflags, mflags, rc = 0;
   2.107 +    int pse;
   2.108 +
   2.109 +    perfc_incr(guest_walk);
   2.110 +    memset(gw, 0, sizeof(*gw));
   2.111 +    gw->va = va;
   2.112 +
   2.113 +    /* Mandatory bits that must be set in every entry.  We invert NX, to
   2.114 +     * calculate as if there were an "X" bit that allowed access. 
   2.115 +     * We will accumulate, in rc, the set of flags that are missing. */
   2.116 +    mflags = mandatory_flags(v, pfec);
   2.117 +
   2.118 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
   2.119 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   2.120 +
    2.121 +    /* Get the l4e from the top level table and check its flags */
   2.122 +    gw->l4mfn = top_mfn;
   2.123 +    l4p = (guest_l4e_t *) top_map;
   2.124 +    gw->l4e = l4p[guest_l4_table_offset(va)];
   2.125 +    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
   2.126 +    rc |= ((gflags & mflags) ^ mflags);
   2.127 +    if ( rc & _PAGE_PRESENT ) goto out;
   2.128 +
   2.129 +    /* Map the l3 table */
   2.130 +    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
   2.131 +    if ( !p2m_is_ram(p2mt) ) 
   2.132 +    {
   2.133 +        rc |= _PAGE_PRESENT;
   2.134 +        goto out;
   2.135 +    }
   2.136 +    ASSERT(mfn_valid(mfn_x(gw->l3mfn)));
   2.137 +
    2.138 +    /* Get the l3e and check its flags */
   2.139 +    l3p = map_domain_page(mfn_x(gw->l3mfn));
   2.140 +    gw->l3e = l3p[guest_l3_table_offset(va)];
   2.141 +    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
   2.142 +    rc |= ((gflags & mflags) ^ mflags);
   2.143 +    if ( rc & _PAGE_PRESENT )
   2.144 +        goto out;
   2.145 +
   2.146 +#else /* PAE only... */
   2.147 +
   2.148 +    /* Get the l3e and check its flag */
   2.149 +    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
   2.150 +    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
   2.151 +    {
   2.152 +        rc |= _PAGE_PRESENT;
   2.153 +        goto out;
   2.154 +    }
   2.155 +
   2.156 +#endif /* PAE or 64... */
   2.157 +
   2.158 +    /* Map the l2 table */
   2.159 +    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
   2.160 +    if ( !p2m_is_ram(p2mt) )
   2.161 +    {
   2.162 +        rc |= _PAGE_PRESENT;
   2.163 +        goto out;
   2.164 +    }
   2.165 +    ASSERT(mfn_valid(mfn_x(gw->l2mfn)));
   2.166 +
   2.167 +    /* Get the l2e */
   2.168 +    l2p = map_domain_page(mfn_x(gw->l2mfn));
   2.169 +    gw->l2e = l2p[guest_l2_table_offset(va)];
   2.170 +
   2.171 +#else /* 32-bit only... */
   2.172 +
   2.173 +    /* Get l2e from the top level table */
   2.174 +    gw->l2mfn = top_mfn;
   2.175 +    l2p = (guest_l2e_t *) top_map;
   2.176 +    gw->l2e = l2p[guest_l2_table_offset(va)];
   2.177 +
   2.178 +#endif /* All levels... */
   2.179 +
   2.180 +    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
   2.181 +    rc |= ((gflags & mflags) ^ mflags);
   2.182 +    if ( rc & _PAGE_PRESENT )
   2.183 +        goto out;
   2.184 +
   2.185 +    pse = (guest_supports_superpages(v) && 
   2.186 +           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
   2.187 +
   2.188 +    if ( pse )
   2.189 +    {
   2.190 +        /* Special case: this guest VA is in a PSE superpage, so there's
   2.191 +         * no guest l1e.  We make one up so that the propagation code
   2.192 +         * can generate a shadow l1 table.  Start with the gfn of the 
   2.193 +         * first 4k-page of the superpage. */
   2.194 +        gfn_t start = guest_l2e_get_gfn(gw->l2e);
   2.195 +        /* Grant full access in the l1e, since all the guest entry's 
   2.196 +         * access controls are enforced in the shadow l2e. */
   2.197 +        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
   2.198 +                     _PAGE_ACCESSED|_PAGE_DIRTY);
   2.199 +        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
   2.200 +         * of the level 1. */
   2.201 +        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
   2.202 +            flags |= _PAGE_PAT;
   2.203 +        /* Copy the cache-control bits to the l1 as well, because we
   2.204 +         * can't represent PAT in the (non-PSE) shadow l2e. :(
   2.205 +         * This could cause problems if a guest ever maps an area of
   2.206 +         * memory with superpages using more than one caching mode. */
   2.207 +        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
   2.208 +        /* Increment the pfn by the right number of 4k pages.  
   2.209 +         * The ~0x1 is to mask out the PAT bit mentioned above. */
   2.210 +        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
   2.211 +        gw->l1e = guest_l1e_from_gfn(start, flags);
   2.212 +        gw->l1mfn = _mfn(INVALID_MFN);
   2.213 +    } 
   2.214 +    else 
   2.215 +    {
   2.216 +        /* Not a superpage: carry on and find the l1e. */
   2.217 +        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
   2.218 +        if ( !p2m_is_ram(p2mt) )
   2.219 +        {
   2.220 +            rc |= _PAGE_PRESENT;
   2.221 +            goto out;
   2.222 +        }
   2.223 +        ASSERT(mfn_valid(mfn_x(gw->l1mfn)));
   2.224 +        l1p = map_domain_page(mfn_x(gw->l1mfn));
   2.225 +        gw->l1e = l1p[guest_l1_table_offset(va)];
   2.226 +        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
   2.227 +        rc |= ((gflags & mflags) ^ mflags);
   2.228 +    }
   2.229 +
   2.230 +    /* Go back and set accessed and dirty bits only if the walk was a
   2.231 +     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
   2.232 +     * get set whenever a lower-level PT is used, at least some hardware
   2.233 +     * walkers behave this way. */
   2.234 +    if ( rc == 0 ) 
   2.235 +    {
   2.236 +#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
   2.237 +        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
   2.238 +            paging_mark_dirty(d, mfn_x(gw->l4mfn));
   2.239 +        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
   2.240 +            paging_mark_dirty(d, mfn_x(gw->l3mfn));
   2.241 +#endif
   2.242 +        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
   2.243 +                         (pse && (pfec & PFEC_write_access))) )
   2.244 +            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
   2.245 +        if ( !pse ) 
   2.246 +        {
   2.247 +            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
   2.248 +                             (pfec & PFEC_write_access)) )
   2.249 +                paging_mark_dirty(d, mfn_x(gw->l1mfn));
   2.250 +        }
   2.251 +    }
   2.252 +
   2.253 + out:
   2.254 +#if GUEST_PAGING_LEVELS == 4
   2.255 +    if ( l3p ) unmap_domain_page(l3p);
   2.256 +#endif
   2.257 +#if GUEST_PAGING_LEVELS >= 3
   2.258 +    if ( l2p ) unmap_domain_page(l2p);
   2.259 +#endif
   2.260 +    if ( l1p ) unmap_domain_page(l1p);
   2.261 +
   2.262 +    return rc;
   2.263 +}
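A note on the flags[] lookup in mandatory_flags() above: the x86 page-fault
error code packs Present into bit 0, Write into bit 1, User into bit 2,
Reserved into bit 3 and Instruction-fetch into bit 4, so (pfec & 0x1f) >> 1
discards the Present bit and the remaining four bits select one of the
sixteen rows of the table.  A standalone worked example (PFEC_* values per
the standard x86 layout):

    #include <stdint.h>
    #include <stdio.h>

    #define PFEC_page_present  (1u << 0)
    #define PFEC_write_access  (1u << 1)
    #define PFEC_user_mode     (1u << 2)
    #define PFEC_reserved_bit  (1u << 3)
    #define PFEC_insn_fetch    (1u << 4)

    int main(void)
    {
        /* A user-mode write fault: W and U are set, so the index is
         * (0b00110 & 0x1f) >> 1 == 3, selecting the row that demands
         * _PAGE_PRESENT|_PAGE_RW|_PAGE_USER in every entry. */
        uint32_t pfec = PFEC_write_access | PFEC_user_mode;
        printf("index = %u\n", (pfec & 0x1f) >> 1);
        return 0;
    }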
     3.1 --- a/xen/arch/x86/mm/shadow/multi.c	Thu Nov 13 13:01:22 2008 +0000
     3.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Thu Nov 13 13:02:08 2008 +0000
     3.3 @@ -157,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn
     3.4          put_page(mfn_to_page(gmfn));
     3.5  }
     3.6  
     3.7 -/**************************************************************************/
     3.8 -/* CPU feature support querying */
     3.9 -
    3.10 -static inline int
    3.11 -guest_supports_superpages(struct vcpu *v)
    3.12 -{
    3.13 -    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
    3.14 -     * CR4.PSE is set or the guest is in PAE or long mode. 
    3.15 -     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
    3.16 -    return (is_hvm_vcpu(v) && 
    3.17 -            (GUEST_PAGING_LEVELS != 2 
    3.18 -             || !hvm_paging_enabled(v)
    3.19 -             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
    3.20 -}
    3.21 -
    3.22 -static inline int
    3.23 -guest_supports_nx(struct vcpu *v)
    3.24 -{
    3.25 -    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
    3.26 -        return 0;
    3.27 -    if ( !is_hvm_vcpu(v) )
    3.28 -        return cpu_has_nx;
    3.29 -    return hvm_nx_enabled(v);
    3.30 -}
    3.31 -
    3.32  
    3.33  /**************************************************************************/
    3.34  /* Functions for walking the guest page tables */
    3.35  
    3.36 -/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
    3.37 -static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
    3.38 +static inline uint32_t
    3.39 +sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
    3.40 +                     uint32_t pfec)
    3.41  {
    3.42 -    static uint32_t flags[] = {
    3.43 -        /* I/F -  Usr Wr */
    3.44 -        /* 0   0   0   0 */ _PAGE_PRESENT, 
    3.45 -        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
    3.46 -        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
    3.47 -        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
    3.48 -        /* 0   1   0   0 */ _PAGE_PRESENT, 
    3.49 -        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
    3.50 -        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
    3.51 -        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
    3.52 -        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
    3.53 -        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
    3.54 -        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
    3.55 -        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
    3.56 -        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
    3.57 -        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
    3.58 -        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
    3.59 -        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
    3.60 -    };
    3.61 -
    3.62 -    /* Don't demand not-NX if the CPU wouldn't enforce it. */
    3.63 -    if ( !guest_supports_nx(v) )
    3.64 -        pfec &= ~PFEC_insn_fetch;
    3.65 -
    3.66 -    /* Don't demand R/W if the CPU wouldn't enforce it. */
    3.67 -    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
    3.68 -         && !(pfec & PFEC_user_mode) )
    3.69 -        pfec &= ~PFEC_write_access;
    3.70 -
    3.71 -    return flags[(pfec & 0x1f) >> 1];
    3.72 -}
    3.73 -
    3.74 -/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
    3.75 - * Returns non-zero if it actually writes to guest memory. */
    3.76 -static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
    3.77 -{
    3.78 -    guest_intpte_t old, new;
    3.79 -    int ret = 0;
    3.80 -
    3.81 -    old = *(guest_intpte_t *)walk_p;
    3.82 -    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    3.83 -    if ( old != new ) 
    3.84 -    {
    3.85 -        /* Write the new entry into the walk, and try to write it back
    3.86 -         * into the guest table as well.  If the guest table has changed
    3.87 -         * under out feet then leave it alone. */
    3.88 -        *(guest_intpte_t *)walk_p = new;
    3.89 -        if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
    3.90 -            ret = 1;
    3.91 -
    3.92 -        /* FIXME -- this code is longer than necessary */
    3.93 -        if(set_dirty)
    3.94 -            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
    3.95 -        else
    3.96 -            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
    3.97 -    }
    3.98 -    return ret;
    3.99 +    return guest_walk_tables(v, va, gw, pfec, 
   3.100 +#if GUEST_PAGING_LEVELS == 3 /* PAE */
   3.101 +                             _mfn(INVALID_MFN),
   3.102 +                             v->arch.paging.shadow.gl3e
   3.103 +#else /* 32 or 64 */
   3.104 +                             pagetable_get_mfn(v->arch.guest_table),
   3.105 +                             v->arch.paging.shadow.guest_vtable
   3.106 +#endif
   3.107 +                             );
   3.108  }
   3.109  
   3.110  /* This validation is called with lock held, and after write permission
   3.111 @@ -364,236 +292,6 @@ gw_remove_write_accesses(struct vcpu *v,
   3.112      return rc;
   3.113  }
   3.114  
   3.115 -/* Walk the guest pagetables, after the manner of a hardware walker. 
   3.116 - *
   3.117 - * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
   3.118 - *         pointer to a pagefault code
   3.119 - * 
   3.120 - * We walk the vcpu's guest pagetables, filling the walk_t with what we
   3.121 - * see and adding any Accessed and Dirty bits that are needed in the
   3.122 - * guest entries.  Using the pagefault code, we check the permissions as
   3.123 - * we go.  For the purposes of reading pagetables we treat all non-RAM
   3.124 - * memory as contining zeroes.
   3.125 - * 
   3.126 - * The walk is done in a lock-free style, with some sanity check postponed
   3.127 - * after grabbing shadow lock later. Those delayed checks will make sure
   3.128 - * no inconsistent mapping being translated into shadow page table.
   3.129 - * 
   3.130 - * Returns 0 for success, or the set of permission bits that we failed on 
   3.131 - * if the walk did not complete.
   3.132 - * N.B. This is different from the old return code but almost no callers
   3.133 - * checked the old return code anyway.
   3.134 - */
   3.135 -static uint32_t
   3.136 -guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
   3.137 -{
   3.138 -    struct domain *d = v->domain;
   3.139 -    p2m_type_t p2mt;
   3.140 -    guest_l1e_t *l1p = NULL;
   3.141 -    guest_l2e_t *l2p = NULL;
   3.142 -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   3.143 -    guest_l3e_t *l3p = NULL;
   3.144 -    guest_l4e_t *l4p;
   3.145 -#endif
   3.146 -    uint32_t gflags, mflags, rc = 0;
   3.147 -    int pse;
   3.148 -
   3.149 -    perfc_incr(shadow_guest_walk);
   3.150 -    memset(gw, 0, sizeof(*gw));
   3.151 -    gw->va = va;
   3.152 -
   3.153 -    /* Mandatory bits that must be set in every entry.  We invert NX, to
   3.154 -     * calculate as if there were an "X" bit that allowed access. 
   3.155 -     * We will accumulate, in rc, the set of flags that are missing. */
   3.156 -    mflags = mandatory_flags(v, pfec);
   3.157 -
   3.158 -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
   3.159 -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   3.160 -
   3.161 -    /* Get the l4e from the top level table and check its flags*/
   3.162 -    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
   3.163 -    l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
   3.164 -    gw->l4e = l4p[guest_l4_table_offset(va)];
   3.165 -    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
   3.166 -    rc |= ((gflags & mflags) ^ mflags);
   3.167 -    if ( rc & _PAGE_PRESENT ) goto out;
   3.168 -
   3.169 -    /* Map the l3 table */
   3.170 -    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
   3.171 -    if ( !p2m_is_ram(p2mt) ) 
   3.172 -    {
   3.173 -        rc |= _PAGE_PRESENT;
   3.174 -        goto out;
   3.175 -    }
   3.176 -    ASSERT(mfn_valid(gw->l3mfn));
   3.177 -
   3.178 -    /* Get the l3e and check its flags*/
   3.179 -    l3p = sh_map_domain_page(gw->l3mfn);
   3.180 -    gw->l3e = l3p[guest_l3_table_offset(va)];
   3.181 -    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
   3.182 -    rc |= ((gflags & mflags) ^ mflags);
   3.183 -    if ( rc & _PAGE_PRESENT )
   3.184 -        goto out;
   3.185 -
   3.186 -#else /* PAE only... */
   3.187 -
   3.188 -    /* Get l3e from the cache of the top level table and check its flag */
   3.189 -    gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
   3.190 -    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
   3.191 -    {
   3.192 -        rc |= _PAGE_PRESENT;
   3.193 -        goto out;
   3.194 -    }
   3.195 -
   3.196 -#endif /* PAE or 64... */
   3.197 -
   3.198 -    /* Map the l2 table */
   3.199 -    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
   3.200 -    if ( !p2m_is_ram(p2mt) )
   3.201 -    {
   3.202 -        rc |= _PAGE_PRESENT;
   3.203 -        goto out;
   3.204 -    }
   3.205 -    ASSERT(mfn_valid(gw->l2mfn));
   3.206 -
   3.207 -    /* Get the l2e */
   3.208 -    l2p = sh_map_domain_page(gw->l2mfn);
   3.209 -    gw->l2e = l2p[guest_l2_table_offset(va)];
   3.210 -
   3.211 -#else /* 32-bit only... */
   3.212 -
   3.213 -    /* Get l2e from the top level table */
   3.214 -    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
   3.215 -    l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
   3.216 -    gw->l2e = l2p[guest_l2_table_offset(va)];
   3.217 -
   3.218 -#endif /* All levels... */
   3.219 -
   3.220 -    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
   3.221 -    rc |= ((gflags & mflags) ^ mflags);
   3.222 -    if ( rc & _PAGE_PRESENT )
   3.223 -        goto out;
   3.224 -
   3.225 -    pse = (guest_supports_superpages(v) && 
   3.226 -           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
   3.227 -
   3.228 -    if ( pse )
   3.229 -    {
   3.230 -        /* Special case: this guest VA is in a PSE superpage, so there's
   3.231 -         * no guest l1e.  We make one up so that the propagation code
   3.232 -         * can generate a shadow l1 table.  Start with the gfn of the 
   3.233 -         * first 4k-page of the superpage. */
   3.234 -        gfn_t start = guest_l2e_get_gfn(gw->l2e);
   3.235 -        /* Grant full access in the l1e, since all the guest entry's 
   3.236 -         * access controls are enforced in the shadow l2e. */
   3.237 -        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
   3.238 -                     _PAGE_ACCESSED|_PAGE_DIRTY);
   3.239 -        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
   3.240 -         * of the level 1. */
   3.241 -        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
   3.242 -            flags |= _PAGE_PAT;
   3.243 -        /* Copy the cache-control bits to the l1 as well, because we
   3.244 -         * can't represent PAT in the (non-PSE) shadow l2e. :(
   3.245 -         * This could cause problems if a guest ever maps an area of
   3.246 -         * memory with superpages using more than one caching mode. */
   3.247 -        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
   3.248 -        /* Increment the pfn by the right number of 4k pages.  
   3.249 -         * The ~0x1 is to mask out the PAT bit mentioned above. */
   3.250 -        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
   3.251 -        gw->l1e = guest_l1e_from_gfn(start, flags);
   3.252 -        gw->l1mfn = _mfn(INVALID_MFN);
   3.253 -    } 
   3.254 -    else 
   3.255 -    {
   3.256 -        /* Not a superpage: carry on and find the l1e. */
   3.257 -        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
   3.258 -        if ( !p2m_is_ram(p2mt) )
   3.259 -        {
   3.260 -            rc |= _PAGE_PRESENT;
   3.261 -            goto out;
   3.262 -        }
   3.263 -        ASSERT(mfn_valid(gw->l1mfn));
   3.264 -        l1p = sh_map_domain_page(gw->l1mfn);
   3.265 -        gw->l1e = l1p[guest_l1_table_offset(va)];
   3.266 -        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
   3.267 -        rc |= ((gflags & mflags) ^ mflags);
   3.268 -    }
   3.269 -
   3.270 -    /* Go back and set accessed and dirty bits only if the walk was a
   3.271 -     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
   3.272 -     * get set whenever a lower-level PT is used, at least some hardware
   3.273 -     * walkers behave this way. */
   3.274 -    if ( rc == 0 ) 
   3.275 -    {
   3.276 -#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
   3.277 -        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
   3.278 -            paging_mark_dirty(d, mfn_x(gw->l4mfn));
   3.279 -        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
   3.280 -            paging_mark_dirty(d, mfn_x(gw->l3mfn));
   3.281 -#endif
   3.282 -        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
   3.283 -                         (pse && (pfec & PFEC_write_access))) )
   3.284 -            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
   3.285 -        if ( !pse ) 
   3.286 -        {
   3.287 -            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
   3.288 -                             (pfec & PFEC_write_access)) )
   3.289 -                paging_mark_dirty(d, mfn_x(gw->l1mfn));
   3.290 -        }
   3.291 -    }
   3.292 -
   3.293 - out:
   3.294 -#if GUEST_PAGING_LEVELS == 4
   3.295 -    if ( l3p ) sh_unmap_domain_page(l3p);
   3.296 -#endif
   3.297 -#if GUEST_PAGING_LEVELS >= 3
   3.298 -    if ( l2p ) sh_unmap_domain_page(l2p);
   3.299 -#endif
   3.300 -    if ( l1p ) sh_unmap_domain_page(l1p);
   3.301 -
   3.302 -    return rc;
   3.303 -}
   3.304 -
   3.305 -/* Given a walk_t, translate the gw->va into the guest's notion of the
   3.306 - * corresponding frame number. */
   3.307 -static inline gfn_t
   3.308 -guest_walk_to_gfn(walk_t *gw)
   3.309 -{
   3.310 -    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
   3.311 -        return _gfn(INVALID_GFN);
   3.312 -    return guest_l1e_get_gfn(gw->l1e);
   3.313 -}
   3.314 -
   3.315 -/* Given a walk_t, translate the gw->va into the guest's notion of the
   3.316 - * corresponding physical address. */
   3.317 -static inline paddr_t
   3.318 -guest_walk_to_gpa(walk_t *gw)
   3.319 -{
   3.320 -    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
   3.321 -        return 0;
   3.322 -    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
   3.323 -}
   3.324 -
   3.325 -#if 0 /* Keep for debugging */
   3.326 -/* Pretty-print the contents of a guest-walk */
   3.327 -static inline void print_gw(walk_t *gw)
   3.328 -{
   3.329 -    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
   3.330 -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
   3.331 -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   3.332 -    SHADOW_PRINTK("   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
   3.333 -    SHADOW_PRINTK("   l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
   3.334 -    SHADOW_PRINTK("   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
   3.335 -#endif /* PAE or 64... */
   3.336 -    SHADOW_PRINTK("   l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
   3.337 -#endif /* All levels... */
   3.338 -    SHADOW_PRINTK("   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
   3.339 -    SHADOW_PRINTK("   l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
   3.340 -    SHADOW_PRINTK("   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
   3.341 -    SHADOW_PRINTK("   l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
   3.342 -}
   3.343 -#endif /* 0 */
   3.344 -
   3.345  #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
   3.346  /* Lightweight audit: pass all the shadows associated with this guest walk
   3.347   * through the audit mechanisms */
   3.348 @@ -654,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
   3.349      // XXX -- this is expensive, but it's easy to cobble together...
   3.350      // FIXME!
   3.351  
   3.352 -    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
   3.353 +    if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 
   3.354           && mfn_valid(gw.l1mfn) )
   3.355      {
   3.356          if ( gl1mfn )
   3.357 @@ -676,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
   3.358      // XXX -- this is expensive, but it's easy to cobble together...
   3.359      // FIXME!
   3.360  
   3.361 -    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
   3.362 +    (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
   3.363      *(guest_l1e_t *)eff_l1e = gw.l1e;
   3.364  }
   3.365  #endif /* CONFIG == GUEST (== SHADOW) */
   3.366 @@ -3314,9 +3012,14 @@ static int sh_page_fault(struct vcpu *v,
   3.367      }
   3.368  
   3.369   rewalk:
   3.370 +
    3.371 +    /* The walk is done in a lock-free style, with some sanity checks
    3.372 +     * postponed until after the shadow lock is taken later.  Those
    3.373 +     * delayed checks make sure that no inconsistent mapping is
    3.374 +     * translated into the shadow page table. */
   3.375      version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
   3.376      rmb();
   3.377 -    rc = guest_walk_tables(v, va, &gw, regs->error_code);
   3.378 +    rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
   3.379  
   3.380  #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
   3.381      regs->error_code &= ~PFEC_page_present;
   3.382 @@ -3869,7 +3572,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
   3.383          return vtlb_gfn;
   3.384  #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
   3.385  
   3.386 -    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
   3.387 +    if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
   3.388      {
   3.389          if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
   3.390              pfec[0] &= ~PFEC_page_present;
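The set_ad_bits() helper that this hunk removes (it now lives in
guest_walk.c above) updates the guest's Accessed and Dirty bits with
cmpxchg(), so that if the guest rewrites the entry concurrently the guest's
write wins and the walker backs off.  A standalone sketch of that lock-free
pattern, using the GCC __sync builtin in place of Xen's cmpxchg() (bit
positions per the x86 PTE format):

    #include <stdint.h>

    #define PG_ACCESSED (UINT64_C(1) << 5)   /* x86 PTE Accessed bit */
    #define PG_DIRTY    (UINT64_C(1) << 6)   /* x86 PTE Dirty bit */

    /* Returns 1 if the in-memory guest entry was actually updated. */
    static int set_ad(uint64_t *guest_pte, uint64_t *walk_pte, int set_dirty)
    {
        uint64_t old = *walk_pte;
        uint64_t new = old | PG_ACCESSED | (set_dirty ? PG_DIRTY : 0);

        if ( old == new )
            return 0;
        /* Record the new value in the walk, then try to write it back;
         * if the guest changed the entry under our feet, leave it alone. */
        *walk_pte = new;
        return __sync_val_compare_and_swap(guest_pte, old, new) == old;
    }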
     4.1 --- a/xen/include/asm-x86/guest_pt.h	Thu Nov 13 13:01:22 2008 +0000
     4.2 +++ b/xen/include/asm-x86/guest_pt.h	Thu Nov 13 13:02:08 2008 +0000
     4.3 @@ -174,6 +174,32 @@ static inline guest_l4e_t guest_l4e_from
     4.4  #endif /* GUEST_PAGING_LEVELS != 2 */
     4.5  
     4.6  
     4.7 +/* Which pagetable features are supported on this vcpu? */
     4.8 +
     4.9 +static inline int
    4.10 +guest_supports_superpages(struct vcpu *v)
    4.11 +{
    4.12 +    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
    4.13 +     * CR4.PSE is set or the guest is in PAE or long mode. 
    4.14 +     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
    4.15 +    return (is_hvm_vcpu(v) && 
    4.16 +            (GUEST_PAGING_LEVELS != 2 
    4.17 +             || !hvm_paging_enabled(v)
    4.18 +             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
    4.19 +}
    4.20 +
    4.21 +static inline int
    4.22 +guest_supports_nx(struct vcpu *v)
    4.23 +{
    4.24 +    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
    4.25 +        return 0;
    4.26 +    if ( !is_hvm_vcpu(v) )
    4.27 +        return cpu_has_nx;
    4.28 +    return hvm_nx_enabled(v);
    4.29 +}
    4.30 +
    4.31 +
    4.32 +
    4.33  /* Type used for recording a walk through guest pagetables.  It is
    4.34   * filled in by the pagetable walk function, and also used as a cache
    4.35   * for later walks.  When we encounter a superpage l2e, we fabricate an
    4.36 @@ -199,4 +225,67 @@ struct guest_pagetable_walk
    4.37      mfn_t l1mfn;                /* MFN that the level 1 entry was in */
    4.38  };
    4.39  
    4.40 +/* Given a walk_t, translate the gw->va into the guest's notion of the
    4.41 + * corresponding frame number. */
    4.42 +static inline gfn_t
    4.43 +guest_walk_to_gfn(walk_t *gw)
    4.44 +{
    4.45 +    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
    4.46 +        return _gfn(INVALID_GFN);
    4.47 +    return guest_l1e_get_gfn(gw->l1e);
    4.48 +}
    4.49 +
    4.50 +/* Given a walk_t, translate the gw->va into the guest's notion of the
    4.51 + * corresponding physical address. */
    4.52 +static inline paddr_t
    4.53 +guest_walk_to_gpa(walk_t *gw)
    4.54 +{
    4.55 +    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
    4.56 +        return 0;
    4.57 +    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
    4.58 +}
    4.59 +
    4.60 +/* Walk the guest pagetables, after the manner of a hardware walker. 
    4.61 + *
    4.62 + * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
    4.63 + *         pointer to a pagefault code, the MFN of the guest's 
    4.64 + *         top-level pagetable, and a mapping of the 
    4.65 + *         guest's top-level pagetable.
    4.66 + * 
    4.67 + * We walk the vcpu's guest pagetables, filling the walk_t with what we
    4.68 + * see and adding any Accessed and Dirty bits that are needed in the
    4.69 + * guest entries.  Using the pagefault code, we check the permissions as
    4.70 + * we go.  For the purposes of reading pagetables we treat all non-RAM
     4.71 + * memory as containing zeroes.
    4.72 + * 
    4.73 + * Returns 0 for success, or the set of permission bits that we failed on 
    4.74 + * if the walk did not complete. */
    4.75 +
    4.76 +/* Macro-fu so you can call guest_walk_tables() and get the right one. */
    4.77 +#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
    4.78 +#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
    4.79 +#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
    4.80 +
    4.81 +extern uint32_t 
    4.82 +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
    4.83 +                  uint32_t pfec, mfn_t top_mfn, void *top_map);
    4.84 +
    4.85 +/* Pretty-print the contents of a guest-walk */
    4.86 +static inline void print_gw(walk_t *gw)
    4.87 +{
    4.88 +    gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va);
    4.89 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    4.90 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    4.91 +    gdprintk(XENLOG_INFO, "   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
    4.92 +    gdprintk(XENLOG_INFO, "   l4e=%" PRI_gpte "\n", gw->l4e.l4);
    4.93 +    gdprintk(XENLOG_INFO, "   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
    4.94 +#endif /* PAE or 64... */
    4.95 +    gdprintk(XENLOG_INFO, "   l3e=%" PRI_gpte "\n", gw->l3e.l3);
    4.96 +#endif /* All levels... */
    4.97 +    gdprintk(XENLOG_INFO, "   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
    4.98 +    gdprintk(XENLOG_INFO, "   l2e=%" PRI_gpte "\n", gw->l2e.l2);
    4.99 +    gdprintk(XENLOG_INFO, "   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
   4.100 +    gdprintk(XENLOG_INFO, "   l1e=%" PRI_gpte "\n", gw->l1e.l1);
   4.101 +}
   4.102 +
   4.103  #endif /* _XEN_ASM_GUEST_PT_H */
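guest_walk_to_gpa() above splices the page offset of the walked virtual
address onto the frame address taken from the final l1e.  A standalone
arithmetic sketch, assuming 4k pages and made-up addresses:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (UINT64_C(1) << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
        uint64_t l1e_paddr = UINT64_C(0x12345000);    /* frame address from the l1e */
        uint64_t va        = UINT64_C(0x7f00dead1abc); /* walked virtual address */

        /* gpa = frame base + offset within the 4k page */
        printf("gpa = %#llx\n",
               (unsigned long long)(l1e_paddr + (va & ~PAGE_MASK)));
        /* prints: gpa = 0x12345abc */
        return 0;
    }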
     5.1 --- a/xen/include/asm-x86/perfc_defn.h	Thu Nov 13 13:01:22 2008 +0000
     5.2 +++ b/xen/include/asm-x86/perfc_defn.h	Thu Nov 13 13:02:08 2008 +0000
     5.3 @@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations,        "wri
     5.4  
     5.5  PERFCOUNTER(exception_fixed,        "pre-exception fixed")
     5.6  
     5.7 +PERFCOUNTER(guest_walk,            "guest pagetable walks")
     5.8  
     5.9  /* Shadow counters */
    5.10  PERFCOUNTER(shadow_alloc,          "calls to shadow_alloc")
    5.11 @@ -92,7 +93,6 @@ PERFCOUNTER(shadow_unshadow,       "shad
    5.12  PERFCOUNTER(shadow_up_pointer,     "shadow unshadow by up-pointer")
    5.13  PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
    5.14  PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
    5.15 -PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
    5.16  PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
    5.17  PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
    5.18  PERFCOUNTER(shadow_rm_write_flush_tlb,