direct-io.hg

changeset 5724:0bcfd66a431e

Check in files I missed from shadow64 checkin.
author kaf24@firebug.cl.cam.ac.uk
date Mon Jul 11 09:57:38 2005 +0000 (2005-07-11)
parents d332d4df452e
children e4272b361053
files xen/arch/x86/shadow32.c xen/arch/x86/shadow_public.c
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/xen/arch/x86/shadow32.c	Mon Jul 11 09:57:38 2005 +0000
     1.3 @@ -0,0 +1,3388 @@
     1.4 +/******************************************************************************
     1.5 + * arch/x86/shadow.c
     1.6 + * 
     1.7 + * Copyright (c) 2005 Michael A Fetterman
     1.8 + * Based on an earlier implementation by Ian Pratt et al
     1.9 + * 
    1.10 + * This program is free software; you can redistribute it and/or modify
    1.11 + * it under the terms of the GNU General Public License as published by
    1.12 + * the Free Software Foundation; either version 2 of the License, or
    1.13 + * (at your option) any later version.
    1.14 + * 
    1.15 + * This program is distributed in the hope that it will be useful,
    1.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    1.18 + * GNU General Public License for more details.
    1.19 + * 
    1.20 + * You should have received a copy of the GNU General Public License
    1.21 + * along with this program; if not, write to the Free Software
    1.22 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    1.23 + */
    1.24 +
    1.25 +
    1.26 +#include <xen/config.h>
    1.27 +#include <xen/types.h>
    1.28 +#include <xen/mm.h>
    1.29 +#include <xen/domain_page.h>
    1.30 +#include <asm/shadow.h>
    1.31 +#include <asm/page.h>
    1.32 +#include <xen/event.h>
    1.33 +#include <xen/sched.h>
    1.34 +#include <xen/trace.h>
    1.35 +
    1.36 +#define MFN_PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
    1.37 +
    1.38 +static void shadow_free_snapshot(struct domain *d,
    1.39 +                                 struct out_of_sync_entry *entry);
    1.40 +static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
    1.41 +static void free_writable_pte_predictions(struct domain *d);
    1.42 +
    1.43 +#if SHADOW_DEBUG
    1.44 +static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
    1.45 +#endif
    1.46 +
    1.47 +/********
    1.48 +
    1.49 +There's a per-domain shadow table spin lock which works fine for SMP
    1.50 +hosts. We don't have to worry about interrupts as no shadow operations
    1.51 +happen in an interrupt context. It's probably not quite ready for SMP
     1.52 +guest operation as we have to worry about synchronisation between gpte
     1.53 +and spte updates. It's possible that this might only happen in a
     1.54 +hypercall context, in which case we'll probably have a per-domain
    1.55 +hypercall lock anyhow (at least initially).
    1.56 +
    1.57 +********/
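
/*
 * A minimal sketch of the lock discipline described above.  The wrapper
 * name example_shadow_op is illustrative only; real callers such as
 * remove_shadow() and shadow_mode_enable() below follow the same pattern.
 */
static void example_shadow_op(struct domain *d)
{
    shadow_lock(d);    /* serialise all shadow-table updates for this domain */
    /* ... inspect or update d's shadow state here ... */
    shadow_unlock(d);
}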
    1.58 +
    1.59 +static inline int
    1.60 +shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
    1.61 +               unsigned long new_type)
    1.62 +{
    1.63 +    struct pfn_info *page = pfn_to_page(gmfn);
    1.64 +    int pinned = 0, okay = 1;
    1.65 +
    1.66 +    if ( page_out_of_sync(page) )
    1.67 +    {
    1.68 +        // Don't know how long ago this snapshot was taken.
    1.69 +        // Can't trust it to be recent enough.
    1.70 +        //
    1.71 +        __shadow_sync_mfn(d, gmfn);
    1.72 +    }
    1.73 +
    1.74 +    if ( !shadow_mode_refcounts(d) )
    1.75 +        return 1;
    1.76 +
    1.77 +    if ( unlikely(page_is_page_table(page)) )
    1.78 +        return 1;
    1.79 +
    1.80 +    FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
    1.81 +
    1.82 +    if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
    1.83 +    {
    1.84 +        FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
    1.85 +                __func__, gpfn, gmfn);
    1.86 +#if 1 || defined(LIVE_DANGEROUSLY)
    1.87 +        set_bit(_PGC_page_table, &page->count_info);
    1.88 +        return 1;
    1.89 +#endif
    1.90 +        return 0;
    1.91 +        
    1.92 +    }
    1.93 +
    1.94 +    // To convert this page to use as a page table, the writable count
     1.95 +    // should now be zero.  Test this by grabbing the page as a page table,
    1.96 +    // and then immediately releasing.  This will also deal with any
    1.97 +    // necessary TLB flushing issues for us.
    1.98 +    //
    1.99 +    // The cruft here about pinning doesn't really work right.  This
   1.100 +    // needs rethinking/rewriting...  Need to gracefully deal with the
   1.101 +    // TLB flushes required when promoting a writable page, and also deal
   1.102 +    // with any outstanding (external) writable refs to this page (by
   1.103 +    // refusing to promote it).  The pinning headache complicates this
   1.104 +    // code -- it would all get much simpler if we stop using
   1.105 +    // shadow_lock() and move the shadow code to BIGLOCK().
   1.106 +    //
   1.107 +    if ( unlikely(!get_page(page, d)) )
   1.108 +        BUG(); // XXX -- needs more thought for a graceful failure
   1.109 +    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
   1.110 +    {
   1.111 +        pinned = 1;
   1.112 +        put_page_and_type(page);
   1.113 +    }
   1.114 +    if ( get_page_type(page, PGT_base_page_table) )
   1.115 +    {
   1.116 +        set_bit(_PGC_page_table, &page->count_info);
   1.117 +        put_page_type(page);
   1.118 +    }
   1.119 +    else
   1.120 +    {
   1.121 +        printk("shadow_promote: get_page_type failed "
   1.122 +               "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
   1.123 +               d->domain_id, gpfn, gmfn, new_type);
   1.124 +        okay = 0;
   1.125 +    }
   1.126 +
   1.127 +    // Now put the type back to writable...
   1.128 +    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
   1.129 +        BUG(); // XXX -- needs more thought for a graceful failure
   1.130 +    if ( unlikely(pinned) )
   1.131 +    {
   1.132 +        if ( unlikely(test_and_set_bit(_PGT_pinned,
   1.133 +                                       &page->u.inuse.type_info)) )
   1.134 +            BUG(); // hmm... someone pinned this again?
   1.135 +    }
   1.136 +    else
   1.137 +        put_page_and_type(page);
   1.138 +
   1.139 +    return okay;
   1.140 +}
   1.141 +
   1.142 +static inline void
   1.143 +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
   1.144 +{
   1.145 +    if ( !shadow_mode_refcounts(d) )
   1.146 +        return;
   1.147 +
   1.148 +    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
   1.149 +
   1.150 +    if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
   1.151 +    {
   1.152 +        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
   1.153 +
   1.154 +        if ( page_out_of_sync(pfn_to_page(gmfn)) )
   1.155 +        {
   1.156 +            remove_out_of_sync_entries(d, gmfn);
   1.157 +        }
   1.158 +    }
   1.159 +}
   1.160 +
   1.161 +/*
   1.162 + * Things in shadow mode that collect get_page() refs to the domain's
   1.163 + * pages are:
   1.164 + * - PGC_allocated takes a gen count, just like normal.
   1.165 + * - A writable page can be pinned (paravirtualized guests may consider
   1.166 + *   these pages to be L1s or L2s, and don't know the difference).
   1.167 + *   Pinning a page takes a gen count (but, for domains in shadow mode,
   1.168 + *   it *doesn't* take a type count)
   1.169 + * - CR3 grabs a ref to whatever it points at, just like normal.
    1.170 + * - Shadow mode grabs an initial gen count for itself, as a placeholder
   1.171 + *   for whatever references will exist.
   1.172 + * - Shadow PTEs that point to a page take a gen count, just like regular
   1.173 + *   PTEs.  However, they don't get a type count, as get_page_type() is
   1.174 + *   hardwired to keep writable pages' counts at 1 for domains in shadow
   1.175 + *   mode.
   1.176 + * - Whenever we shadow a page, the entry in the shadow hash grabs a
   1.177 + *   general ref to the page.
   1.178 + * - Whenever a page goes out of sync, the out of sync entry grabs a
   1.179 + *   general ref to the page.
   1.180 + */
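
/*
 * A minimal sketch of one of the "general refs" listed above: an
 * out-of-sync entry holds a get_page() ref on the guest frame while the
 * entry exists and drops it when the entry is released (compare
 * release_out_of_sync_entry() below).  The names oos_hold_frame and
 * oos_drop_frame are illustrative only.
 */
static inline void oos_hold_frame(struct domain *d, struct pfn_info *page)
{
    if ( unlikely(!get_page(page, d)) )   /* take a general (gen) ref only */
        BUG();
}

static inline void oos_drop_frame(struct pfn_info *page)
{
    put_page(page);                       /* balance the gen ref */
}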
   1.181 +/*
   1.182 + * pfn_info fields for pages allocated as shadow pages:
   1.183 + *
   1.184 + * All 32 bits of count_info are a simple count of refs to this shadow
   1.185 + * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
    1.186 + * c) the pin ref, if it's a pinned shadow root pgtable, and d) outstanding out-of-sync
   1.187 + * references.
   1.188 + *
    1.189 + * u.inuse._domain is left NULL, to prevent accidentally allowing some random
   1.190 + * domain from gaining permissions to map this page.
   1.191 + *
   1.192 + * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
   1.193 + * shadowed.
   1.194 + * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
   1.195 + * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
    1.196 + * currently exists because this is a shadow of a root page, and we
   1.197 + * don't want to let those disappear just because no CR3 is currently pointing
   1.198 + * at it.
   1.199 + *
   1.200 + * tlbflush_timestamp holds a min & max index of valid page table entries
   1.201 + * within the shadow page.
   1.202 + */
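
/*
 * A minimal sketch of decoding the shadow-page fields described above;
 * it mirrors what MFN_PINNED() and free_shadow_page() below already do.
 * The function name example_shadow_is_pinned_root is illustrative only.
 */
static inline int example_shadow_is_pinned_root(struct pfn_info *page)
{
    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;  /* frame being shadowed */
    unsigned long type = page->u.inuse.type_info & PGT_type_mask; /* kind of shadow page  */

    SH_VVLOG("shadow smfn=%lx: gmfn=%lx type=%08lx count=%08x",
             page_to_pfn(page), gmfn, type, page->count_info);

    return (page->u.inuse.type_info & PGT_pinned) != 0;  /* extra ref for a root shadow */
}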
   1.203 +
   1.204 +static inline unsigned long
   1.205 +alloc_shadow_page(struct domain *d,
   1.206 +                  unsigned long gpfn, unsigned long gmfn,
   1.207 +                  u32 psh_type)
   1.208 +{
   1.209 +    struct pfn_info *page;
   1.210 +    unsigned long smfn;
   1.211 +    int pin = 0;
   1.212 +
   1.213 +    // Currently, we only keep pre-zero'ed pages around for use as L1's...
   1.214 +    // This will change.  Soon.
   1.215 +    //
   1.216 +    if ( psh_type == PGT_l1_shadow )
   1.217 +    {
   1.218 +        if ( !list_empty(&d->arch.free_shadow_frames) )
   1.219 +        {
   1.220 +            struct list_head *entry = d->arch.free_shadow_frames.next;
   1.221 +            page = list_entry(entry, struct pfn_info, list);
   1.222 +            list_del(entry);
   1.223 +            perfc_decr(free_l1_pages);
   1.224 +        }
   1.225 +        else
   1.226 +        {
   1.227 +            page = alloc_domheap_page(NULL);
   1.228 +            void *l1 = map_domain_page(page_to_pfn(page));
   1.229 +            memset(l1, 0, PAGE_SIZE);
   1.230 +            unmap_domain_page(l1);
   1.231 +        }
   1.232 +    }
   1.233 +    else
   1.234 +        page = alloc_domheap_page(NULL);
   1.235 +
   1.236 +    if ( unlikely(page == NULL) )
   1.237 +    {
   1.238 +        printk("Couldn't alloc shadow page! dom%d count=%d\n",
   1.239 +               d->domain_id, d->arch.shadow_page_count);
   1.240 +        printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
   1.241 +               perfc_value(shadow_l1_pages), 
   1.242 +               perfc_value(shadow_l2_pages),
   1.243 +               perfc_value(hl2_table_pages),
   1.244 +               perfc_value(snapshot_pages));
   1.245 +        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
   1.246 +    }
   1.247 +
   1.248 +    smfn = page_to_pfn(page);
   1.249 +
   1.250 +    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
   1.251 +    page->u.inuse.type_info = psh_type | gmfn;
   1.252 +    page->count_info = 0;
   1.253 +    page->tlbflush_timestamp = 0;
   1.254 +
   1.255 +    switch ( psh_type )
   1.256 +    {
   1.257 +    case PGT_l1_shadow:
   1.258 +        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
   1.259 +            goto fail;
   1.260 +        perfc_incr(shadow_l1_pages);
   1.261 +        d->arch.shadow_page_count++;
   1.262 +        break;
   1.263 +
   1.264 +    case PGT_l2_shadow:
   1.265 +        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
   1.266 +            goto fail;
   1.267 +        perfc_incr(shadow_l2_pages);
   1.268 +        d->arch.shadow_page_count++;
   1.269 +        if ( PGT_l2_page_table == PGT_root_page_table )
   1.270 +            pin = 1;
   1.271 +
   1.272 +        break;
   1.273 +
   1.274 +    case PGT_hl2_shadow:
   1.275 +        // Treat an hl2 as an L1 for purposes of promotion.
   1.276 +        // For external mode domains, treat them as an L2 for purposes of
   1.277 +        // pinning.
   1.278 +        //
   1.279 +        if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
   1.280 +            goto fail;
   1.281 +        perfc_incr(hl2_table_pages);
   1.282 +        d->arch.hl2_page_count++;
   1.283 +        if ( shadow_mode_external(d) &&
   1.284 +             (PGT_l2_page_table == PGT_root_page_table) )
   1.285 +            pin = 1;
   1.286 +
   1.287 +        break;
   1.288 +
   1.289 +    case PGT_snapshot:
   1.290 +        perfc_incr(snapshot_pages);
   1.291 +        d->arch.snapshot_page_count++;
   1.292 +        break;
   1.293 +
   1.294 +    default:
   1.295 +        printk("Alloc shadow weird page type type=%08x\n", psh_type);
   1.296 +        BUG();
   1.297 +        break;
   1.298 +    }
   1.299 +
   1.300 +    // Don't add a new shadow of something that already has a snapshot.
   1.301 +    //
   1.302 +    ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
   1.303 +
   1.304 +    set_shadow_status(d, gpfn, gmfn, smfn, psh_type);
   1.305 +
   1.306 +    if ( pin )
   1.307 +        shadow_pin(smfn);
   1.308 +
   1.309 +    return smfn;
   1.310 +
   1.311 +  fail:
   1.312 +    FSH_LOG("promotion of pfn=%lx mfn=%lx failed!  external gnttab refs?",
   1.313 +            gpfn, gmfn);
   1.314 +    free_domheap_page(page);
   1.315 +    return 0;
   1.316 +}
   1.317 +
   1.318 +static void inline
   1.319 +free_shadow_l1_table(struct domain *d, unsigned long smfn)
   1.320 +{
   1.321 +    l1_pgentry_t *pl1e = map_domain_page(smfn);
   1.322 +    int i;
   1.323 +    struct pfn_info *spage = pfn_to_page(smfn);
   1.324 +    u32 min_max = spage->tlbflush_timestamp;
   1.325 +    int min = SHADOW_MIN(min_max);
   1.326 +    int max = SHADOW_MAX(min_max);
   1.327 +
   1.328 +    for ( i = min; i <= max; i++ )
   1.329 +    {
   1.330 +        shadow_put_page_from_l1e(pl1e[i], d);
   1.331 +        pl1e[i] = l1e_empty();
   1.332 +    }
   1.333 +
   1.334 +    unmap_domain_page(pl1e);
   1.335 +}
   1.336 +
   1.337 +static void inline
   1.338 +free_shadow_hl2_table(struct domain *d, unsigned long smfn)
   1.339 +{
   1.340 +    l1_pgentry_t *hl2 = map_domain_page(smfn);
   1.341 +    int i, limit;
   1.342 +
   1.343 +    SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
   1.344 +
   1.345 +#ifdef __i386__
   1.346 +    if ( shadow_mode_external(d) )
   1.347 +        limit = L2_PAGETABLE_ENTRIES;
   1.348 +    else
   1.349 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   1.350 +#else
   1.351 +    limit = 0; /* XXX x86/64 XXX */
   1.352 +#endif
   1.353 +
   1.354 +    for ( i = 0; i < limit; i++ )
   1.355 +    {
   1.356 +        if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
   1.357 +            put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
   1.358 +    }
   1.359 +
   1.360 +    unmap_domain_page(hl2);
   1.361 +}
   1.362 +
   1.363 +static void inline
   1.364 +free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
   1.365 +{
   1.366 +    l2_pgentry_t *pl2e = map_domain_page(smfn);
   1.367 +    int i, external = shadow_mode_external(d);
   1.368 +
   1.369 +    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
   1.370 +        if ( external || is_guest_l2_slot(type, i) )
   1.371 +            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
   1.372 +                put_shadow_ref(l2e_get_pfn(pl2e[i]));
   1.373 +
   1.374 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
   1.375 +         shadow_mode_translate(d) && !external )
   1.376 +    {
   1.377 +        // free the ref to the hl2
   1.378 +        //
   1.379 +        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
   1.380 +    }
   1.381 +
   1.382 +    unmap_domain_page(pl2e);
   1.383 +}
   1.384 +
   1.385 +void free_shadow_page(unsigned long smfn)
   1.386 +{
   1.387 +    struct pfn_info *page = &frame_table[smfn];
   1.388 +    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
   1.389 +    struct domain *d = page_get_owner(pfn_to_page(gmfn));
   1.390 +    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
   1.391 +    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
   1.392 +
   1.393 +    SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
   1.394 +
   1.395 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
   1.396 +
   1.397 +    delete_shadow_status(d, gpfn, gmfn, type);
   1.398 +
   1.399 +    switch ( type )
   1.400 +    {
   1.401 +    case PGT_l1_shadow:
   1.402 +        perfc_decr(shadow_l1_pages);
   1.403 +        shadow_demote(d, gpfn, gmfn);
   1.404 +        free_shadow_l1_table(d, smfn);
   1.405 +        break;
   1.406 +
   1.407 +    case PGT_l2_shadow:
   1.408 +        perfc_decr(shadow_l2_pages);
   1.409 +        shadow_demote(d, gpfn, gmfn);
   1.410 +        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
   1.411 +        break;
   1.412 +
   1.413 +    case PGT_hl2_shadow:
   1.414 +        perfc_decr(hl2_table_pages);
   1.415 +        shadow_demote(d, gpfn, gmfn);
   1.416 +        free_shadow_hl2_table(d, smfn);
   1.417 +        break;
   1.418 +
   1.419 +    case PGT_snapshot:
   1.420 +        perfc_decr(snapshot_pages);
   1.421 +        break;
   1.422 +
   1.423 +    default:
   1.424 +        printk("Free shadow weird page type mfn=%lx type=%08x\n",
   1.425 +               page_to_pfn(page), page->u.inuse.type_info);
   1.426 +        break;
   1.427 +    }
   1.428 +
   1.429 +    d->arch.shadow_page_count--;
   1.430 +
   1.431 +    // No TLB flushes are needed the next time this page gets allocated.
   1.432 +    //
   1.433 +    page->tlbflush_timestamp = 0;
   1.434 +    page->u.free.cpumask     = CPU_MASK_NONE;
   1.435 +
   1.436 +    if ( type == PGT_l1_shadow )
   1.437 +    {
   1.438 +        list_add(&page->list, &d->arch.free_shadow_frames);
   1.439 +        perfc_incr(free_l1_pages);
   1.440 +    }
   1.441 +    else
   1.442 +        free_domheap_page(page);
   1.443 +}
   1.444 +
   1.445 +void
   1.446 +remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
   1.447 +{
   1.448 +    unsigned long smfn;
   1.449 +
   1.450 +    //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
   1.451 +
   1.452 +    shadow_lock(d);
   1.453 +
   1.454 +    while ( stype >= PGT_l1_shadow )
   1.455 +    {
   1.456 +        smfn = __shadow_status(d, gpfn, stype);
   1.457 +        if ( smfn && MFN_PINNED(smfn) )
   1.458 +            shadow_unpin(smfn);
   1.459 +        stype -= PGT_l1_shadow;
   1.460 +    }
   1.461 +
   1.462 +    shadow_unlock(d);
   1.463 +}
   1.464 +
   1.465 +static void inline
   1.466 +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
   1.467 +{
   1.468 +    struct pfn_info *page;
   1.469 +
   1.470 +    page = &frame_table[entry->gmfn];
   1.471 +        
   1.472 +    // Decrement ref count of guest & shadow pages
   1.473 +    //
   1.474 +    put_page(page);
   1.475 +
   1.476 +    // Only use entries that have low bits clear...
   1.477 +    //
   1.478 +    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
   1.479 +    {
   1.480 +        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
   1.481 +        entry->writable_pl1e = -2;
   1.482 +    }
   1.483 +    else
   1.484 +        ASSERT( entry->writable_pl1e == -1 );
   1.485 +
   1.486 +    // Free the snapshot
   1.487 +    //
   1.488 +    shadow_free_snapshot(d, entry);
   1.489 +}
   1.490 +
   1.491 +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
   1.492 +{
   1.493 +    struct out_of_sync_entry *entry = d->arch.out_of_sync;
   1.494 +    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
   1.495 +    struct out_of_sync_entry *found = NULL;
   1.496 +
   1.497 +    // NB: Be careful not to call something that manipulates this list
   1.498 +    //     while walking it.  Collect the results into a separate list
   1.499 +    //     first, then walk that list.
   1.500 +    //
   1.501 +    while ( entry )
   1.502 +    {
   1.503 +        if ( entry->gmfn == gmfn )
   1.504 +        {
   1.505 +            // remove from out of sync list
   1.506 +            *prev = entry->next;
   1.507 +
   1.508 +            // add to found list
   1.509 +            entry->next = found;
   1.510 +            found = entry;
   1.511 +
   1.512 +            entry = *prev;
   1.513 +            continue;
   1.514 +        }
   1.515 +        prev = &entry->next;
   1.516 +        entry = entry->next;
   1.517 +    }
   1.518 +
   1.519 +    prev = NULL;
   1.520 +    entry = found;
   1.521 +    while ( entry )
   1.522 +    {
   1.523 +        release_out_of_sync_entry(d, entry);
   1.524 +
   1.525 +        prev = &entry->next;
   1.526 +        entry = entry->next;
   1.527 +    }
   1.528 +
   1.529 +    // Add found list to free list
   1.530 +    if ( prev )
   1.531 +    {
   1.532 +        *prev = d->arch.out_of_sync_free;
   1.533 +        d->arch.out_of_sync_free = found;
   1.534 +    }
   1.535 +}
   1.536 +
   1.537 +static void free_out_of_sync_state(struct domain *d)
   1.538 +{
   1.539 +    struct out_of_sync_entry *entry;
   1.540 +
   1.541 +    // NB: Be careful not to call something that manipulates this list
   1.542 +    //     while walking it.  Remove one item at a time, and always
   1.543 +    //     restart from start of list.
   1.544 +    //
   1.545 +    while ( (entry = d->arch.out_of_sync) )
   1.546 +    {
   1.547 +        d->arch.out_of_sync = entry->next;
   1.548 +        release_out_of_sync_entry(d, entry);
   1.549 +
   1.550 +        entry->next = d->arch.out_of_sync_free;
   1.551 +        d->arch.out_of_sync_free = entry;
   1.552 +    }
   1.553 +}
   1.554 +
   1.555 +static void free_shadow_pages(struct domain *d)
   1.556 +{
   1.557 +    int                   i;
   1.558 +    struct shadow_status *x;
   1.559 +    struct vcpu          *v;
   1.560 + 
   1.561 +    /*
   1.562 +     * WARNING! The shadow page table must not currently be in use!
    1.563 +     * e.g., you are expected to have paused the domain and synchronized CR3.
   1.564 +     */
   1.565 +
   1.566 +    if( !d->arch.shadow_ht ) return;
   1.567 +
   1.568 +    shadow_audit(d, 1);
   1.569 +
   1.570 +    // first, remove any outstanding refs from out_of_sync entries...
   1.571 +    //
   1.572 +    free_out_of_sync_state(d);
   1.573 +
   1.574 +    // second, remove any outstanding refs from v->arch.shadow_table
   1.575 +    // and CR3.
   1.576 +    //
   1.577 +    for_each_vcpu(d, v)
   1.578 +    {
   1.579 +        if ( pagetable_get_paddr(v->arch.shadow_table) )
   1.580 +        {
   1.581 +            put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
   1.582 +            v->arch.shadow_table = mk_pagetable(0);
   1.583 +        }
   1.584 +
   1.585 +        if ( v->arch.monitor_shadow_ref )
   1.586 +        {
   1.587 +            put_shadow_ref(v->arch.monitor_shadow_ref);
   1.588 +            v->arch.monitor_shadow_ref = 0;
   1.589 +        }
   1.590 +    }
   1.591 +
   1.592 +    // For external shadows, remove the monitor table's refs
   1.593 +    //
   1.594 +    if ( shadow_mode_external(d) )
   1.595 +    {
   1.596 +        for_each_vcpu(d, v)
   1.597 +        {
   1.598 +            l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
   1.599 +
   1.600 +            if ( mpl2e )
   1.601 +            {
   1.602 +                l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
   1.603 +                l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
   1.604 +
   1.605 +                if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
   1.606 +                {
   1.607 +                    put_shadow_ref(l2e_get_pfn(hl2e));
   1.608 +                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
   1.609 +                }
   1.610 +                if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
   1.611 +                {
   1.612 +                    put_shadow_ref(l2e_get_pfn(smfn));
   1.613 +                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
   1.614 +                }
   1.615 +            }
   1.616 +        }
   1.617 +    }
   1.618 +
   1.619 +    // Now, the only refs to shadow pages that are left are from the shadow
   1.620 +    // pages themselves.  We just unpin the pinned pages, and the rest
   1.621 +    // should automatically disappear.
   1.622 +    //
   1.623 +    // NB: Beware: each explicitly or implicit call to free_shadow_page
   1.624 +    // can/will result in the hash bucket getting rewritten out from
   1.625 +    // under us...  First, collect the list of pinned pages, then
   1.626 +    // free them.
   1.627 +    //
   1.628 +    for ( i = 0; i < shadow_ht_buckets; i++ )
   1.629 +    {
   1.630 +        u32 count;
   1.631 +        unsigned long *mfn_list;
   1.632 +
   1.633 +        /* Skip empty buckets. */
   1.634 +        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
   1.635 +            continue;
   1.636 +
   1.637 +        count = 0;
   1.638 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
   1.639 +            if ( MFN_PINNED(x->smfn) )
   1.640 +                count++;
   1.641 +        if ( !count )
   1.642 +            continue;
   1.643 +
   1.644 +        mfn_list = xmalloc_array(unsigned long, count);
   1.645 +        count = 0;
   1.646 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
   1.647 +            if ( MFN_PINNED(x->smfn) )
   1.648 +                mfn_list[count++] = x->smfn;
   1.649 +
   1.650 +        while ( count )
   1.651 +        {
   1.652 +            shadow_unpin(mfn_list[--count]);
   1.653 +        }
   1.654 +        xfree(mfn_list);
   1.655 +    }
   1.656 +
   1.657 +    // Now free the pre-zero'ed pages from the domain
   1.658 +    //
   1.659 +    struct list_head *list_ent, *tmp;
   1.660 +    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
   1.661 +    {
   1.662 +        list_del(list_ent);
   1.663 +        perfc_decr(free_l1_pages);
   1.664 +
   1.665 +        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
   1.666 +        free_domheap_page(page);
   1.667 +    }
   1.668 +
   1.669 +    shadow_audit(d, 0);
   1.670 +
   1.671 +    SH_LOG("Free shadow table.");
   1.672 +}
   1.673 +
   1.674 +void shadow_mode_init(void)
   1.675 +{
   1.676 +}
   1.677 +
   1.678 +int _shadow_mode_refcounts(struct domain *d)
   1.679 +{
   1.680 +    return shadow_mode_refcounts(d);
   1.681 +}
   1.682 +
   1.683 +void alloc_monitor_pagetable(struct vcpu *v)
   1.684 +{
   1.685 +    unsigned long mmfn;
   1.686 +    l2_pgentry_t *mpl2e;
   1.687 +    struct pfn_info *mmfn_info;
   1.688 +    struct domain *d = v->domain;
   1.689 +
   1.690 +    ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
   1.691 +
   1.692 +    mmfn_info = alloc_domheap_page(NULL);
   1.693 +    ASSERT(mmfn_info != NULL);
   1.694 +
   1.695 +    mmfn = page_to_pfn(mmfn_info);
   1.696 +    mpl2e = (l2_pgentry_t *)map_domain_page(mmfn);
   1.697 +    memset(mpl2e, 0, PAGE_SIZE);
   1.698 +
   1.699 +#ifdef __i386__ /* XXX screws x86/64 build */
   1.700 +    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   1.701 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   1.702 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   1.703 +#endif
   1.704 +
   1.705 +    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
   1.706 +        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt),
   1.707 +                        __PAGE_HYPERVISOR);
   1.708 +
   1.709 +    // map the phys_to_machine map into the Read-Only MPT space for this domain
   1.710 +    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
   1.711 +        l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
   1.712 +                        __PAGE_HYPERVISOR);
   1.713 +
   1.714 +    // Don't (yet) have mappings for these...
   1.715 +    // Don't want to accidentally see the idle_pg_table's linear mapping.
   1.716 +    //
   1.717 +    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
   1.718 +    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
   1.719 +
   1.720 +    v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
   1.721 +    v->arch.monitor_vtable = mpl2e;
   1.722 +}
   1.723 +
   1.724 +/*
   1.725 + * Free the pages for monitor_table and hl2_table
   1.726 + */
   1.727 +void free_monitor_pagetable(struct vcpu *v)
   1.728 +{
   1.729 +    l2_pgentry_t *mpl2e, hl2e, sl2e;
   1.730 +    unsigned long mfn;
   1.731 +
   1.732 +    ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
   1.733 +    
   1.734 +    mpl2e = v->arch.monitor_vtable;
   1.735 +
   1.736 +    /*
   1.737 +     * First get the mfn for hl2_table by looking at monitor_table
   1.738 +     */
   1.739 +    hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
   1.740 +    if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
   1.741 +    {
   1.742 +        mfn = l2e_get_pfn(hl2e);
   1.743 +        ASSERT(mfn);
   1.744 +        put_shadow_ref(mfn);
   1.745 +    }
   1.746 +
   1.747 +    sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
   1.748 +    if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
   1.749 +    {
   1.750 +        mfn = l2e_get_pfn(sl2e);
   1.751 +        ASSERT(mfn);
   1.752 +        put_shadow_ref(mfn);
   1.753 +    }
   1.754 +
   1.755 +    unmap_domain_page(mpl2e);
   1.756 +
   1.757 +    /*
   1.758 +     * Then free monitor_table.
   1.759 +     */
   1.760 +    mfn = pagetable_get_pfn(v->arch.monitor_table);
   1.761 +    free_domheap_page(&frame_table[mfn]);
   1.762 +
   1.763 +    v->arch.monitor_table = mk_pagetable(0);
   1.764 +    v->arch.monitor_vtable = 0;
   1.765 +}
   1.766 +
   1.767 +int
   1.768 +set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
   1.769 +              struct domain_mmap_cache *l2cache,
   1.770 +              struct domain_mmap_cache *l1cache)
   1.771 +{
   1.772 +    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
   1.773 +    l2_pgentry_t *l2, l2e;
   1.774 +    l1_pgentry_t *l1;
   1.775 +    struct pfn_info *l1page;
   1.776 +    unsigned long va = pfn << PAGE_SHIFT;
   1.777 +
   1.778 +    ASSERT(tabpfn != 0);
   1.779 +
   1.780 +    l2 = map_domain_page_with_cache(tabpfn, l2cache);
   1.781 +    l2e = l2[l2_table_offset(va)];
   1.782 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
   1.783 +    {
   1.784 +        l1page = alloc_domheap_page(NULL);
   1.785 +        if ( !l1page )
   1.786 +        {
   1.787 +            unmap_domain_page_with_cache(l2, l2cache);
   1.788 +            return 0;
   1.789 +        }
   1.790 +
   1.791 +        l1 = map_domain_page_with_cache(page_to_pfn(l1page), l1cache);
   1.792 +        memset(l1, 0, PAGE_SIZE);
   1.793 +        unmap_domain_page_with_cache(l1, l1cache);
   1.794 +
   1.795 +        l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
   1.796 +        l2[l2_table_offset(va)] = l2e;
   1.797 +    }
   1.798 +    unmap_domain_page_with_cache(l2, l2cache);
   1.799 +
   1.800 +    l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
   1.801 +    l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
   1.802 +    unmap_domain_page_with_cache(l1, l1cache);
   1.803 +
   1.804 +    return 1;
   1.805 +}
   1.806 +
   1.807 +static int
   1.808 +alloc_p2m_table(struct domain *d)
   1.809 +{
   1.810 +    struct list_head *list_ent;
   1.811 +    struct pfn_info *page, *l2page;
   1.812 +    l2_pgentry_t *l2;
   1.813 +    unsigned long mfn, pfn;
   1.814 +    struct domain_mmap_cache l1cache, l2cache;
   1.815 +
   1.816 +    l2page = alloc_domheap_page(NULL);
   1.817 +    if ( l2page == NULL )
   1.818 +        return 0;
   1.819 +
   1.820 +    domain_mmap_cache_init(&l1cache);
   1.821 +    domain_mmap_cache_init(&l2cache);
   1.822 +
   1.823 +    d->arch.phys_table = mk_pagetable(page_to_phys(l2page));
   1.824 +    l2 = map_domain_page_with_cache(page_to_pfn(l2page), &l2cache);
   1.825 +    memset(l2, 0, PAGE_SIZE);
   1.826 +    unmap_domain_page_with_cache(l2, &l2cache);
   1.827 +
   1.828 +    list_ent = d->page_list.next;
   1.829 +    while ( list_ent != &d->page_list )
   1.830 +    {
   1.831 +        page = list_entry(list_ent, struct pfn_info, list);
   1.832 +        mfn = page_to_pfn(page);
   1.833 +        pfn = machine_to_phys_mapping[mfn];
   1.834 +        ASSERT(pfn != INVALID_M2P_ENTRY);
   1.835 +        ASSERT(pfn < (1u<<20));
   1.836 +
   1.837 +        set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
   1.838 +
   1.839 +        list_ent = page->list.next;
   1.840 +    }
   1.841 +
   1.842 +    list_ent = d->xenpage_list.next;
   1.843 +    while ( list_ent != &d->xenpage_list )
   1.844 +    {
   1.845 +        page = list_entry(list_ent, struct pfn_info, list);
   1.846 +        mfn = page_to_pfn(page);
   1.847 +        pfn = machine_to_phys_mapping[mfn];
   1.848 +        if ( (pfn != INVALID_M2P_ENTRY) &&
   1.849 +             (pfn < (1u<<20)) )
   1.850 +        {
   1.851 +            set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
   1.852 +        }
   1.853 +
   1.854 +        list_ent = page->list.next;
   1.855 +    }
   1.856 +
   1.857 +    domain_mmap_cache_destroy(&l2cache);
   1.858 +    domain_mmap_cache_destroy(&l1cache);
   1.859 +
   1.860 +    return 1;
   1.861 +}
   1.862 +
   1.863 +static void
   1.864 +free_p2m_table(struct domain *d)
   1.865 +{
   1.866 +    // uh, this needs some work...  :)
   1.867 +    BUG();
   1.868 +}
   1.869 +
   1.870 +int __shadow_mode_enable(struct domain *d, unsigned int mode)
   1.871 +{
   1.872 +    struct vcpu *v;
   1.873 +    int new_modes = (mode & ~d->arch.shadow_mode);
   1.874 +
   1.875 +    // Gotta be adding something to call this function.
   1.876 +    ASSERT(new_modes);
   1.877 +
    1.878 +    // Can't take anything away by calling this function.
   1.879 +    ASSERT(!(d->arch.shadow_mode & ~mode));
   1.880 +
   1.881 +    for_each_vcpu(d, v)
   1.882 +    {
   1.883 +        invalidate_shadow_ldt(v);
   1.884 +
   1.885 +        // We need to set these up for __update_pagetables().
   1.886 +        // See the comment there.
   1.887 +
   1.888 +        /*
   1.889 +         * arch.guest_vtable
   1.890 +         */
   1.891 +        if ( v->arch.guest_vtable &&
   1.892 +             (v->arch.guest_vtable != __linear_l2_table) )
   1.893 +        {
   1.894 +            unmap_domain_page(v->arch.guest_vtable);
   1.895 +        }
   1.896 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   1.897 +            v->arch.guest_vtable = __linear_l2_table;
   1.898 +        else
   1.899 +            v->arch.guest_vtable = NULL;
   1.900 +
   1.901 +        /*
   1.902 +         * arch.shadow_vtable
   1.903 +         */
   1.904 +        if ( v->arch.shadow_vtable &&
   1.905 +             (v->arch.shadow_vtable != __shadow_linear_l2_table) )
   1.906 +        {
   1.907 +            unmap_domain_page(v->arch.shadow_vtable);
   1.908 +        }
   1.909 +        if ( !(mode & SHM_external) )
   1.910 +            v->arch.shadow_vtable = __shadow_linear_l2_table;
   1.911 +        else
   1.912 +            v->arch.shadow_vtable = NULL;
   1.913 +
   1.914 +        /*
   1.915 +         * arch.hl2_vtable
   1.916 +         */
   1.917 +        if ( v->arch.hl2_vtable &&
   1.918 +             (v->arch.hl2_vtable != __linear_hl2_table) )
   1.919 +        {
   1.920 +            unmap_domain_page(v->arch.hl2_vtable);
   1.921 +        }
   1.922 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   1.923 +            v->arch.hl2_vtable = __linear_hl2_table;
   1.924 +        else
   1.925 +            v->arch.hl2_vtable = NULL;
   1.926 +
   1.927 +        /*
   1.928 +         * arch.monitor_table & arch.monitor_vtable
   1.929 +         */
   1.930 +        if ( v->arch.monitor_vtable )
   1.931 +        {
   1.932 +            free_monitor_pagetable(v);
   1.933 +        }
   1.934 +        if ( mode & SHM_external )
   1.935 +        {
   1.936 +            alloc_monitor_pagetable(v);
   1.937 +        }
   1.938 +    }
   1.939 +
   1.940 +    if ( new_modes & SHM_enable )
   1.941 +    {
   1.942 +        ASSERT( !d->arch.shadow_ht );
   1.943 +        d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
   1.944 +        if ( d->arch.shadow_ht == NULL )
   1.945 +            goto nomem;
   1.946 +
   1.947 +        memset(d->arch.shadow_ht, 0,
   1.948 +           shadow_ht_buckets * sizeof(struct shadow_status));
   1.949 +    }
   1.950 +
   1.951 +    if ( new_modes & SHM_log_dirty )
   1.952 +    {
   1.953 +        ASSERT( !d->arch.shadow_dirty_bitmap );
   1.954 +        d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
   1.955 +        d->arch.shadow_dirty_bitmap = 
   1.956 +            xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
   1.957 +                                         (8 * sizeof(unsigned long)));
   1.958 +        if ( d->arch.shadow_dirty_bitmap == NULL )
   1.959 +        {
   1.960 +            d->arch.shadow_dirty_bitmap_size = 0;
   1.961 +            goto nomem;
   1.962 +        }
   1.963 +        memset(d->arch.shadow_dirty_bitmap, 0, 
   1.964 +               d->arch.shadow_dirty_bitmap_size/8);
   1.965 +    }
   1.966 +
   1.967 +    if ( new_modes & SHM_translate )
   1.968 +    {
   1.969 +        if ( !(new_modes & SHM_external) )
   1.970 +        {
   1.971 +            ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
   1.972 +            if ( !alloc_p2m_table(d) )
   1.973 +            {
   1.974 +                printk("alloc_p2m_table failed (out-of-memory?)\n");
   1.975 +                goto nomem;
   1.976 +            }
   1.977 +        }
   1.978 +        else
   1.979 +        {
   1.980 +            // external guests provide their own memory for their P2M maps.
   1.981 +            //
   1.982 +            ASSERT( d == page_get_owner(
   1.983 +                        &frame_table[pagetable_get_pfn(d->arch.phys_table)]) );
   1.984 +        }
   1.985 +    }
   1.986 +
   1.987 +    printk("audit1\n");
   1.988 +    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
   1.989 +    printk("audit1 done\n");
   1.990 +
   1.991 +    // Get rid of any shadow pages from any previous shadow mode.
   1.992 +    //
   1.993 +    free_shadow_pages(d);
   1.994 +
   1.995 +    printk("audit2\n");
   1.996 +    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
   1.997 +    printk("audit2 done\n");
   1.998 +
   1.999 +    /*
   1.1000 +     * Tear down its counts by disassembling its page-table-based ref counts.
  1.1001 +     * Also remove CR3's gcount/tcount.
   1.1002 +     * That leaves things like GDTs and LDTs and external refs intact.
  1.1003 +     *
  1.1004 +     * Most pages will be writable tcount=0.
  1.1005 +     * Some will still be L1 tcount=0 or L2 tcount=0.
  1.1006 +     * Maybe some pages will be type none tcount=0.
  1.1007 +     * Pages granted external writable refs (via grant tables?) will
  1.1008 +     * still have a non-zero tcount.  That's OK.
  1.1009 +     *
  1.1010 +     * gcounts will generally be 1 for PGC_allocated.
  1.1011 +     * GDTs and LDTs will have additional gcounts.
  1.1012 +     * Any grant-table based refs will still be in the gcount.
  1.1013 +     *
  1.1014 +     * We attempt to grab writable refs to each page (thus setting its type).
  1.1015 +     * Immediately put back those type refs.
  1.1016 +     *
  1.1017 +     * Assert that no pages are left with L1/L2/L3/L4 type.
  1.1018 +     */
  1.1019 +    audit_adjust_pgtables(d, -1, 1);
  1.1020 +
  1.1021 +    d->arch.shadow_mode = mode;
  1.1022 +
  1.1023 +    if ( shadow_mode_refcounts(d) )
  1.1024 +    {
  1.1025 +        struct list_head *list_ent = d->page_list.next;
  1.1026 +        while ( list_ent != &d->page_list )
  1.1027 +        {
  1.1028 +            struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
  1.1029 +            if ( !get_page_type(page, PGT_writable_page) )
  1.1030 +                BUG();
  1.1031 +            put_page_type(page);
  1.1032 +
  1.1033 +            list_ent = page->list.next;
  1.1034 +        }
  1.1035 +    }
  1.1036 +
  1.1037 +    audit_adjust_pgtables(d, 1, 1);
  1.1038 +
  1.1039 +    printk("audit3\n");
  1.1040 +    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
  1.1041 +    printk("audit3 done\n");
  1.1042 +
  1.1043 +    return 0;
  1.1044 +
  1.1045 + nomem:
  1.1046 +    if ( (new_modes & SHM_enable) )
  1.1047 +    {
  1.1048 +        xfree(d->arch.shadow_ht);
  1.1049 +        d->arch.shadow_ht = NULL;
  1.1050 +    }
  1.1051 +    if ( (new_modes & SHM_log_dirty) )
  1.1052 +    {
  1.1053 +        xfree(d->arch.shadow_dirty_bitmap);
  1.1054 +        d->arch.shadow_dirty_bitmap = NULL;
  1.1055 +    }
  1.1056 +    if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
  1.1057 +         pagetable_get_paddr(d->arch.phys_table) )
  1.1058 +    {
  1.1059 +        free_p2m_table(d);
  1.1060 +    }
  1.1061 +    return -ENOMEM;
  1.1062 +}
  1.1063 +
  1.1064 +int shadow_mode_enable(struct domain *d, unsigned int mode)
  1.1065 +{
  1.1066 +    int rc;
  1.1067 +    shadow_lock(d);
  1.1068 +    rc = __shadow_mode_enable(d, mode);
  1.1069 +    shadow_unlock(d);
  1.1070 +    return rc;
  1.1071 +}
  1.1072 +
  1.1073 +static void
  1.1074 +translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
  1.1075 +{
  1.1076 +    int i;
  1.1077 +    l1_pgentry_t *l1;
  1.1078 +
  1.1079 +    l1 = map_domain_page(l1mfn);
  1.1080 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  1.1081 +    {
  1.1082 +        if ( is_guest_l1_slot(i) &&
  1.1083 +             (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
  1.1084 +        {
  1.1085 +            unsigned long mfn = l1e_get_pfn(l1[i]);
  1.1086 +            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
  1.1087 +            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
  1.1088 +            l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
  1.1089 +        }
  1.1090 +    }
  1.1091 +    unmap_domain_page(l1);
  1.1092 +}
  1.1093 +
  1.1094 +// This is not general enough to handle arbitrary pagetables
  1.1095 +// with shared L1 pages, etc., but it is sufficient for bringing
  1.1096 +// up dom0.
  1.1097 +//
  1.1098 +void
  1.1099 +translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
  1.1100 +                    unsigned int type)
  1.1101 +{
  1.1102 +    int i;
  1.1103 +    l2_pgentry_t *l2;
  1.1104 +
  1.1105 +    ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
  1.1106 +
  1.1107 +    l2 = map_domain_page(l2mfn);
  1.1108 +    for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
  1.1109 +    {
  1.1110 +        if ( is_guest_l2_slot(type, i) &&
  1.1111 +             (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
  1.1112 +        {
  1.1113 +            unsigned long mfn = l2e_get_pfn(l2[i]);
  1.1114 +            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
  1.1115 +            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
  1.1116 +            l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
  1.1117 +            translate_l1pgtable(d, p2m, mfn);
  1.1118 +        }
  1.1119 +    }
  1.1120 +    unmap_domain_page(l2);
  1.1121 +}
  1.1122 +
  1.1123 +static void free_shadow_ht_entries(struct domain *d)
  1.1124 +{
  1.1125 +    struct shadow_status *x, *n;
  1.1126 +
  1.1127 +    SH_VLOG("freed tables count=%d l1=%d l2=%d",
  1.1128 +            d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
  1.1129 +            perfc_value(shadow_l2_pages));
  1.1130 +
  1.1131 +    n = d->arch.shadow_ht_extras;
  1.1132 +    while ( (x = n) != NULL )
  1.1133 +    {
  1.1134 +        d->arch.shadow_extras_count--;
  1.1135 +        n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
  1.1136 +        xfree(x);
  1.1137 +    }
  1.1138 +
  1.1139 +    d->arch.shadow_ht_extras = NULL;
  1.1140 +    d->arch.shadow_ht_free = NULL;
  1.1141 +
  1.1142 +    ASSERT(d->arch.shadow_extras_count == 0);
  1.1143 +    SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
  1.1144 +
  1.1145 +    if ( d->arch.shadow_dirty_bitmap != NULL )
  1.1146 +    {
  1.1147 +        xfree(d->arch.shadow_dirty_bitmap);
  1.1148 +        d->arch.shadow_dirty_bitmap = 0;
  1.1149 +        d->arch.shadow_dirty_bitmap_size = 0;
  1.1150 +    }
  1.1151 +
  1.1152 +    xfree(d->arch.shadow_ht);
  1.1153 +    d->arch.shadow_ht = NULL;
  1.1154 +}
  1.1155 +
  1.1156 +static void free_out_of_sync_entries(struct domain *d)
  1.1157 +{
  1.1158 +    struct out_of_sync_entry *x, *n;
  1.1159 +
  1.1160 +    n = d->arch.out_of_sync_extras;
  1.1161 +    while ( (x = n) != NULL )
  1.1162 +    {
  1.1163 +        d->arch.out_of_sync_extras_count--;
  1.1164 +        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
  1.1165 +        xfree(x);
  1.1166 +    }
  1.1167 +
  1.1168 +    d->arch.out_of_sync_extras = NULL;
  1.1169 +    d->arch.out_of_sync_free = NULL;
  1.1170 +    d->arch.out_of_sync = NULL;
  1.1171 +
  1.1172 +    ASSERT(d->arch.out_of_sync_extras_count == 0);
  1.1173 +    FSH_LOG("freed extra out_of_sync entries, now %d",
  1.1174 +            d->arch.out_of_sync_extras_count);
  1.1175 +}
  1.1176 +
  1.1177 +void __shadow_mode_disable(struct domain *d)
  1.1178 +{
  1.1179 +    if ( unlikely(!shadow_mode_enabled(d)) )
  1.1180 +        return;
  1.1181 +
  1.1182 +    /*
  1.1183 +     * Currently this does not fix up page ref counts, so it is valid to call
  1.1184 +     * only when a domain is being destroyed.
  1.1185 +     */
  1.1186 +    BUG_ON(!test_bit(_DOMF_dying, &d->domain_flags) &&
  1.1187 +           shadow_mode_refcounts(d));
  1.1188 +    d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d);
  1.1189 +
  1.1190 +    free_shadow_pages(d);
  1.1191 +    free_writable_pte_predictions(d);
  1.1192 +
  1.1193 +#ifndef NDEBUG
  1.1194 +    int i;
  1.1195 +    for ( i = 0; i < shadow_ht_buckets; i++ )
  1.1196 +    {
  1.1197 +        if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
  1.1198 +        {
  1.1199 +            printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
  1.1200 +                   __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
  1.1201 +            BUG();
  1.1202 +        }
  1.1203 +    }
  1.1204 +#endif
  1.1205 +
  1.1206 +    d->arch.shadow_mode = 0;
  1.1207 +
  1.1208 +    free_shadow_ht_entries(d);
  1.1209 +    free_out_of_sync_entries(d);
  1.1210 +
  1.1211 +    struct vcpu *v;
  1.1212 +    for_each_vcpu(d, v)
  1.1213 +    {
  1.1214 +        update_pagetables(v);
  1.1215 +    }
  1.1216 +}
  1.1217 +
  1.1218 +static int shadow_mode_table_op(
  1.1219 +    struct domain *d, dom0_shadow_control_t *sc)
  1.1220 +{
  1.1221 +    unsigned int      op = sc->op;
  1.1222 +    int               i, rc = 0;
  1.1223 +    struct vcpu *v;
  1.1224 +
  1.1225 +    ASSERT(shadow_lock_is_acquired(d));
  1.1226 +
  1.1227 +    SH_VLOG("shadow mode table op %lx %lx count %d",
  1.1228 +            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table),  /* XXX SMP */
  1.1229 +            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
  1.1230 +            d->arch.shadow_page_count);
  1.1231 +
  1.1232 +    shadow_audit(d, 1);
  1.1233 +
  1.1234 +    switch ( op )
  1.1235 +    {
  1.1236 +    case DOM0_SHADOW_CONTROL_OP_FLUSH:
  1.1237 +        free_shadow_pages(d);
  1.1238 +
  1.1239 +        d->arch.shadow_fault_count       = 0;
  1.1240 +        d->arch.shadow_dirty_count       = 0;
  1.1241 +        d->arch.shadow_dirty_net_count   = 0;
  1.1242 +        d->arch.shadow_dirty_block_count = 0;
  1.1243 +
  1.1244 +        break;
  1.1245 +   
  1.1246 +    case DOM0_SHADOW_CONTROL_OP_CLEAN:
  1.1247 +        free_shadow_pages(d);
  1.1248 +
  1.1249 +        sc->stats.fault_count       = d->arch.shadow_fault_count;
  1.1250 +        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
  1.1251 +        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
  1.1252 +        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
  1.1253 +
  1.1254 +        d->arch.shadow_fault_count       = 0;
  1.1255 +        d->arch.shadow_dirty_count       = 0;
  1.1256 +        d->arch.shadow_dirty_net_count   = 0;
  1.1257 +        d->arch.shadow_dirty_block_count = 0;
  1.1258 + 
  1.1259 +        if ( (d->max_pages > sc->pages) || 
  1.1260 +             (sc->dirty_bitmap == NULL) || 
  1.1261 +             (d->arch.shadow_dirty_bitmap == NULL) )
  1.1262 +        {
  1.1263 +            rc = -EINVAL;
  1.1264 +            break;
  1.1265 +        }
  1.1266 + 
  1.1267 +        sc->pages = d->max_pages;
  1.1268 +
  1.1269 +#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
  1.1270 +        for ( i = 0; i < d->max_pages; i += chunk )
  1.1271 +        {
  1.1272 +            int bytes = ((((d->max_pages - i) > chunk) ?
  1.1273 +                          chunk : (d->max_pages - i)) + 7) / 8;
  1.1274 +     
  1.1275 +            if (copy_to_user(
  1.1276 +                    sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
  1.1277 +                    d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
  1.1278 +                    bytes))
  1.1279 +            {
  1.1280 +                // copy_to_user can fail when copying to guest app memory.
   1.1281 +                // the app should zero the buffer after mallocing, and pin it
  1.1282 +                rc = -EINVAL;
  1.1283 +                memset(
  1.1284 +                    d->arch.shadow_dirty_bitmap + 
  1.1285 +                    (i/(8*sizeof(unsigned long))),
  1.1286 +                    0, (d->max_pages/8) - (i/(8*sizeof(unsigned long))));
  1.1287 +                break;
  1.1288 +            }
  1.1289 +
  1.1290 +            memset(
  1.1291 +                d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
  1.1292 +                0, bytes);
  1.1293 +        }
  1.1294 +
  1.1295 +        break;
  1.1296 +
  1.1297 +    case DOM0_SHADOW_CONTROL_OP_PEEK:
  1.1298 +        sc->stats.fault_count       = d->arch.shadow_fault_count;
  1.1299 +        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
  1.1300 +        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
  1.1301 +        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
  1.1302 + 
  1.1303 +        if ( (d->max_pages > sc->pages) || 
  1.1304 +             (sc->dirty_bitmap == NULL) || 
  1.1305 +             (d->arch.shadow_dirty_bitmap == NULL) )
  1.1306 +        {
  1.1307 +            rc = -EINVAL;
  1.1308 +            break;
  1.1309 +        }
  1.1310 + 
  1.1311 +        sc->pages = d->max_pages;
  1.1312 +        if (copy_to_user(
  1.1313 +            sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
  1.1314 +        {
  1.1315 +            rc = -EINVAL;
  1.1316 +            break;
  1.1317 +        }
  1.1318 +
  1.1319 +        break;
  1.1320 +
  1.1321 +    default:
  1.1322 +        rc = -EINVAL;
  1.1323 +        break;
  1.1324 +    }
  1.1325 +
  1.1326 +    SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
  1.1327 +    shadow_audit(d, 1);
  1.1328 +
  1.1329 +    for_each_vcpu(d,v)
  1.1330 +        __update_pagetables(v);
  1.1331 +
  1.1332 +    return rc;
  1.1333 +}
  1.1334 +
  1.1335 +int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
  1.1336 +{
  1.1337 +    unsigned int op = sc->op;
  1.1338 +    int          rc = 0;
  1.1339 +    struct vcpu *v;
  1.1340 +
  1.1341 +    if ( unlikely(d == current->domain) )
  1.1342 +    {
  1.1343 +        DPRINTK("Don't try to do a shadow op on yourself!\n");
  1.1344 +        return -EINVAL;
  1.1345 +    }   
  1.1346 +
  1.1347 +    domain_pause(d);
  1.1348 +
  1.1349 +    shadow_lock(d);
  1.1350 +
  1.1351 +    switch ( op )
  1.1352 +    {
  1.1353 +    case DOM0_SHADOW_CONTROL_OP_OFF:
  1.1354 +        __shadow_sync_all(d);
  1.1355 +        __shadow_mode_disable(d);
  1.1356 +        break;
  1.1357 +
  1.1358 +    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
  1.1359 +        free_shadow_pages(d);
  1.1360 +        rc = __shadow_mode_enable(d, SHM_enable);
  1.1361 +        break;
  1.1362 +
  1.1363 +    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
  1.1364 +        free_shadow_pages(d);
  1.1365 +        rc = __shadow_mode_enable(
  1.1366 +            d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
  1.1367 +        break;
  1.1368 +
  1.1369 +    case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
  1.1370 +        free_shadow_pages(d);
  1.1371 +        rc = __shadow_mode_enable(
  1.1372 +            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
  1.1373 +        break;
  1.1374 +
  1.1375 +    default:
  1.1376 +        rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
  1.1377 +        break;
  1.1378 +    }
  1.1379 +
  1.1380 +    shadow_unlock(d);
  1.1381 +
  1.1382 +    for_each_vcpu(d,v)
  1.1383 +        update_pagetables(v);
  1.1384 +
  1.1385 +    domain_unpause(d);
  1.1386 +
  1.1387 +    return rc;
  1.1388 +}
  1.1389 +
  1.1390 +/*
  1.1391 + * XXX KAF: Why is this VMX specific?
  1.1392 + */
  1.1393 +void vmx_shadow_clear_state(struct domain *d)
  1.1394 +{
  1.1395 +    SH_VVLOG("%s:", __func__);
  1.1396 +    shadow_lock(d);
  1.1397 +    free_shadow_pages(d);
  1.1398 +    shadow_unlock(d);
  1.1399 +    update_pagetables(d->vcpu[0]);
  1.1400 +}
  1.1401 +
  1.1402 +unsigned long
  1.1403 +gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
  1.1404 +{
  1.1405 +    ASSERT( shadow_mode_translate(d) );
  1.1406 +
  1.1407 +    perfc_incrc(gpfn_to_mfn_foreign);
  1.1408 +
  1.1409 +    unsigned long va = gpfn << PAGE_SHIFT;
  1.1410 +    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
  1.1411 +    l2_pgentry_t *l2 = map_domain_page(tabpfn);
  1.1412 +    l2_pgentry_t l2e = l2[l2_table_offset(va)];
  1.1413 +    unmap_domain_page(l2);
  1.1414 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
  1.1415 +    {
  1.1416 +        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n",
  1.1417 +               d->domain_id, gpfn, l2e_get_intpte(l2e));
  1.1418 +        return INVALID_MFN;
  1.1419 +    }
  1.1420 +    l1_pgentry_t *l1 = map_domain_page(l2e_get_pfn(l2e));
  1.1421 +    l1_pgentry_t l1e = l1[l1_table_offset(va)];
  1.1422 +    unmap_domain_page(l1);
  1.1423 +
  1.1424 +#if 0
  1.1425 +    printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n",
  1.1426 +           d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e);
  1.1427 +#endif
  1.1428 +
  1.1429 +    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
  1.1430 +    {
  1.1431 +        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte "\n",
  1.1432 +               d->domain_id, gpfn, l1e_get_intpte(l1e));
  1.1433 +        return INVALID_MFN;
  1.1434 +    }
  1.1435 +
  1.1436 +    return l1e_get_pfn(l1e);
  1.1437 +}
  1.1438 +
  1.1439 +static unsigned long
  1.1440 +shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
  1.1441 +                unsigned long smfn)
  1.1442 +{
  1.1443 +    unsigned long hl2mfn;
  1.1444 +    l1_pgentry_t *hl2;
  1.1445 +    int limit;
  1.1446 +
  1.1447 +    ASSERT(PGT_base_page_table == PGT_l2_page_table);
  1.1448 +
  1.1449 +    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
  1.1450 +    {
  1.1451 +        printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
  1.1452 +               gpfn, gmfn);
  1.1453 +        BUG(); /* XXX Deal gracefully with failure. */
  1.1454 +    }
  1.1455 +
  1.1456 +    SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
  1.1457 +             gpfn, gmfn, smfn, hl2mfn);
  1.1458 +    perfc_incrc(shadow_hl2_table_count);
  1.1459 +
  1.1460 +    hl2 = map_domain_page(hl2mfn);
  1.1461 +
  1.1462 +#ifdef __i386__
  1.1463 +    if ( shadow_mode_external(d) )
  1.1464 +        limit = L2_PAGETABLE_ENTRIES;
  1.1465 +    else
  1.1466 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  1.1467 +#else
  1.1468 +    limit = 0; /* XXX x86/64 XXX */
  1.1469 +#endif
  1.1470 +
  1.1471 +    memset(hl2, 0, limit * sizeof(l1_pgentry_t));
  1.1472 +
  1.1473 +    if ( !shadow_mode_external(d) )
  1.1474 +    {
  1.1475 +        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
  1.1476 +               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  1.1477 +
  1.1478 +        // Set up easy access to the GL2, SL2, and HL2 frames.
  1.1479 +        //
  1.1480 +        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
  1.1481 +            l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
  1.1482 +        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  1.1483 +            l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
  1.1484 +        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
  1.1485 +            l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
  1.1486 +    }
  1.1487 +
  1.1488 +    unmap_domain_page(hl2);
  1.1489 +
  1.1490 +    return hl2mfn;
  1.1491 +}
  1.1492 +
  1.1493 +/*
  1.1494 + * This could take and use a snapshot, and validate the entire page at
  1.1495 + * once, or it could continue to fault in entries one at a time...
  1.1496 + * Might be worth investigating...
  1.1497 + */
  1.1498 +static unsigned long shadow_l2_table(
  1.1499 +    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  1.1500 +{
  1.1501 +    unsigned long smfn;
  1.1502 +    l2_pgentry_t *spl2e;
  1.1503 +
  1.1504 +    SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
  1.1505 +
  1.1506 +    perfc_incrc(shadow_l2_table_count);
  1.1507 +
  1.1508 +    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
  1.1509 +    {
  1.1510 +        printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
  1.1511 +               gpfn, gmfn);
  1.1512 +        BUG(); /* XXX Deal gracefully with failure. */
  1.1513 +    }
  1.1514 +
  1.1515 +    spl2e = (l2_pgentry_t *)map_domain_page(smfn);
  1.1516 +
  1.1517 +    /* Install hypervisor and 2x linear p.t. mappings. */
  1.1518 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
  1.1519 +         !shadow_mode_external(d) )
  1.1520 +    {
  1.1521 +        /*
  1.1522 +         * We could proactively fill in PDEs for pages that are already
  1.1523 +         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
  1.1524 +         * (restriction required for coherence of the accessed bit). However,
  1.1525 +         * we tried it and it didn't help performance. This is simpler. 
  1.1526 +         */
  1.1527 +        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
  1.1528 +
  1.1529 +        /* Install hypervisor and 2x linear p.t. mappings. */
  1.1530 +        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  1.1531 +               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  1.1532 +               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  1.1533 +
  1.1534 +        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  1.1535 +            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
  1.1536 +
  1.1537 +        spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
  1.1538 +            l2e_from_paddr(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt),
  1.1539 +                            __PAGE_HYPERVISOR);
  1.1540 +
  1.1541 +        if ( shadow_mode_translate(d) ) // NB: not external
  1.1542 +        {
  1.1543 +            unsigned long hl2mfn;
  1.1544 +
  1.1545 +            spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
  1.1546 +                l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
  1.1547 +                                __PAGE_HYPERVISOR);
  1.1548 +
  1.1549 +            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  1.1550 +                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  1.1551 +
  1.1552 +            // shadow_mode_translate (but not external) sl2 tables hold a
  1.1553 +            // ref to their hl2.
  1.1554 +            //
  1.1555 +            if ( !get_shadow_ref(hl2mfn) )
  1.1556 +                BUG();
  1.1557 +            
  1.1558 +            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  1.1559 +                l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
  1.1560 +        }
  1.1561 +        else
  1.1562 +            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  1.1563 +                l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
  1.1564 +    }
  1.1565 +    else
  1.1566 +    {
  1.1567 +        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));        
  1.1568 +    }
  1.1569 +
  1.1570 +    unmap_domain_page(spl2e);
  1.1571 +
  1.1572 +    SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
  1.1573 +    return smfn;
  1.1574 +}
  1.1575 +
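         +/*
         + * The special slots installed above are what make the usual linear
         + * mapping tricks work: SH_LINEAR_PT_VIRT_START points the shadow L2 at
         + * itself (backing shadow_linear_pg_table[]), LINEAR_PT_VIRT_START
         + * points at the guest L2 -- or at the hl2 in translate mode -- backing
         + * linear_pg_table[], and PERDOMAIN_VIRT_START maps the per-domain
         + * page.  The mode-by-mode layout table further down in this file
         + * summarises the full picture.
         + */
         +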
  1.1576 +void shadow_map_l1_into_current_l2(unsigned long va)
  1.1577 +{ 
  1.1578 +    struct vcpu *v = current;
  1.1579 +    struct domain *d = v->domain;
  1.1580 +    l1_pgentry_t *gpl1e, *spl1e;
  1.1581 +    l2_pgentry_t gl2e, sl2e;
  1.1582 +    unsigned long gl1pfn, gl1mfn, sl1mfn;
  1.1583 +    int i, init_table = 0;
  1.1584 +
  1.1585 +    __guest_get_l2e(v, va, &gl2e);
  1.1586 +    ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
  1.1587 +    gl1pfn = l2e_get_pfn(gl2e);
  1.1588 +
  1.1589 +    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
  1.1590 +    {
  1.1591 +        /* This L1 is NOT already shadowed so we need to shadow it. */
  1.1592 +        SH_VVLOG("4a: l1 not shadowed");
  1.1593 +
  1.1594 +        gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  1.1595 +        if ( unlikely(!VALID_MFN(gl1mfn)) )
  1.1596 +        {
  1.1597 +            // Attempt to use an invalid pfn as an L1 page.
  1.1598 +            // XXX this needs to be more graceful!
  1.1599 +            BUG();
  1.1600 +        }
  1.1601 +
  1.1602 +        if ( unlikely(!(sl1mfn =
  1.1603 +                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
  1.1604 +        {
  1.1605 +            printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
  1.1606 +                   gl1pfn, gl1mfn);
  1.1607 +            BUG(); /* XXX Need to deal gracefully with failure. */
  1.1608 +        }
  1.1609 +
  1.1610 +        perfc_incrc(shadow_l1_table_count);
  1.1611 +        init_table = 1;
  1.1612 +    }
  1.1613 +    else
  1.1614 +    {
  1.1615 +        /* This L1 is shadowed already, but the L2 entry is missing. */
  1.1616 +        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
  1.1617 +    }
  1.1618 +
  1.1619 +#ifndef NDEBUG
  1.1620 +    l2_pgentry_t old_sl2e;
  1.1621 +    __shadow_get_l2e(v, va, &old_sl2e);
  1.1622 +    ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
  1.1623 +#endif
  1.1624 +
  1.1625 +    if ( !get_shadow_ref(sl1mfn) )
  1.1626 +        BUG();
  1.1627 +    l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  1.1628 +    __guest_set_l2e(v, va, gl2e);
  1.1629 +    __shadow_set_l2e(v, va, sl2e);
  1.1630 +
  1.1631 +    if ( init_table )
  1.1632 +    {
  1.1633 +        l1_pgentry_t sl1e;
  1.1634 +        int index = l1_table_offset(va);
  1.1635 +        int min = 1, max = 0;
  1.1636 +
  1.1637 +        gpl1e = &(linear_pg_table[l1_linear_offset(va) &
  1.1638 +                              ~(L1_PAGETABLE_ENTRIES-1)]);
  1.1639 +
  1.1640 +        spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
  1.1641 +                                     ~(L1_PAGETABLE_ENTRIES-1)]);
  1.1642 +
  1.1643 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  1.1644 +        {
  1.1645 +            l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
  1.1646 +            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
  1.1647 +                 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
  1.1648 +                sl1e = l1e_empty();
  1.1649 +            if ( l1e_get_flags(sl1e) == 0 )
  1.1650 +            {
  1.1651 +                // First copy entries from 0 until first invalid.
  1.1652 +                // Then copy entries from index until first invalid.
  1.1653 +                //
  1.1654 +                if ( i < index ) {
  1.1655 +                    i = index - 1;
  1.1656 +                    continue;
  1.1657 +                }
  1.1658 +                break;
  1.1659 +            }
  1.1660 +            spl1e[i] = sl1e;
  1.1661 +            if ( unlikely(i < min) )
  1.1662 +                min = i;
  1.1663 +            if ( likely(i > max) )
  1.1664 +                max = i;
  1.1665 +        }
  1.1666 +
  1.1667 +        frame_table[sl1mfn].tlbflush_timestamp =
  1.1668 +            SHADOW_ENCODE_MIN_MAX(min, max);
  1.1669 +    }
  1.1670 +}
  1.1671 +
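         +/*
         + * The min/max window stashed in the shadow L1's tlbflush_timestamp
         + * records which slice of the table was actually populated, so that
         + * shadow_make_snapshot() and resync_all() only have to touch entries
         + * [min, max].  Below is a minimal sketch of one plausible packing; it
         + * is illustrative only -- the real SHADOW_ENCODE_MIN_MAX, SHADOW_MIN
         + * and SHADOW_MAX definitions live in asm/shadow.h and may differ.
         + */
         +#if 0 /* illustrative sketch, not built */
         +#define EX_ENCODE_MIN_MAX(_min, _max) \
         +    ((((L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min))
         +#define EX_MIN(_e) ((_e) & 0xffff)
         +#define EX_MAX(_e) ((L1_PAGETABLE_ENTRIES - 1) - ((_e) >> 16))
         +/* Slots 7..42 in use encode to 0x03d50007; the all-zeroes default
         + * conveniently decodes to the full window [0, 1023]. */
         +#endif
         +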
  1.1672 +void shadow_invlpg(struct vcpu *v, unsigned long va)
  1.1673 +{
  1.1674 +    struct domain *d = v->domain;
  1.1675 +    l1_pgentry_t gpte, spte;
  1.1676 +
  1.1677 +    ASSERT(shadow_mode_enabled(d));
  1.1678 +
  1.1679 +    shadow_lock(d);
  1.1680 +
  1.1681 +    __shadow_sync_va(v, va);
  1.1682 +
  1.1683 +    // XXX mafetter: will need to think about 4MB pages...
  1.1684 +
  1.1685 +    // It's not strictly necessary to update the shadow here,
  1.1686 +    // but it might save a fault later.
  1.1687 +    //
  1.1688 +    if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
  1.1689 +                         sizeof(gpte))) {
  1.1690 +        perfc_incrc(shadow_invlpg_faults);
  1.1691 +        shadow_unlock(d); /* don't return with the shadow lock held */
         +        return;
  1.1692 +    }
  1.1693 +    l1pte_propagate_from_guest(d, gpte, &spte);
  1.1694 +    shadow_set_l1e(va, spte, 1);
  1.1695 +
  1.1696 +    shadow_unlock(d);
  1.1697 +}
  1.1698 +
  1.1699 +struct out_of_sync_entry *
  1.1700 +shadow_alloc_oos_entry(struct domain *d)
  1.1701 +{
  1.1702 +    struct out_of_sync_entry *f, *extra;
  1.1703 +    unsigned size, i;
  1.1704 +
  1.1705 +    if ( unlikely(d->arch.out_of_sync_free == NULL) )
  1.1706 +    {
  1.1707 +        FSH_LOG("Allocate more fullshadow tuple blocks.");
  1.1708 +
  1.1709 +        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
  1.1710 +        extra = xmalloc_bytes(size);
  1.1711 +
  1.1712 +        /* XXX Should be more graceful here. */
  1.1713 +        if ( extra == NULL )
  1.1714 +            BUG();
  1.1715 +
  1.1716 +        memset(extra, 0, size);
  1.1717 +
  1.1718 +        /* Record the allocation block so it can be correctly freed later. */
  1.1719 +        d->arch.out_of_sync_extras_count++;
  1.1720 +        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = 
  1.1721 +            d->arch.out_of_sync_extras;
  1.1722 +        d->arch.out_of_sync_extras = &extra[0];
  1.1723 +
  1.1724 +        /* Thread a free chain through the newly-allocated nodes. */
  1.1725 +        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
  1.1726 +            extra[i].next = &extra[i+1];
  1.1727 +        extra[i].next = NULL;
  1.1728 +
  1.1729 +        /* Add the new nodes to the free list. */
  1.1730 +        d->arch.out_of_sync_free = &extra[0];
  1.1731 +    }
  1.1732 +
  1.1733 +    /* Allocate a new node from the quicklist. */
  1.1734 +    f = d->arch.out_of_sync_free;
  1.1735 +    d->arch.out_of_sync_free = f->next;
  1.1736 +
  1.1737 +    return f;
  1.1738 +}
  1.1739 +
  1.1740 +static inline unsigned long
  1.1741 +shadow_make_snapshot(
  1.1742 +    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  1.1743 +{
  1.1744 +    unsigned long smfn, sl1mfn = 0;
  1.1745 +    void *original, *snapshot;
  1.1746 +    u32 min_max = 0;
  1.1747 +    int min, max, length;
  1.1748 +
  1.1749 +    if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
  1.1750 +    {
  1.1751 +        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
  1.1752 +        return SHADOW_SNAPSHOT_ELSEWHERE;
  1.1753 +    }
  1.1754 +
  1.1755 +    perfc_incrc(shadow_make_snapshot);
  1.1756 +
  1.1757 +    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
  1.1758 +    {
  1.1759 +        printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
  1.1760 +               "Dom%d snapshot_page_count=%d\n",
  1.1761 +               gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
  1.1762 +        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
  1.1763 +    }
  1.1764 +
  1.1765 +    if ( !get_shadow_ref(smfn) )
  1.1766 +        BUG();
  1.1767 +
  1.1768 +    if ( shadow_mode_refcounts(d) &&
  1.1769 +         (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
  1.1770 +        min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
  1.1771 +    pfn_to_page(smfn)->tlbflush_timestamp = min_max;
  1.1772 +
  1.1773 +    min = SHADOW_MIN(min_max);
  1.1774 +    max = SHADOW_MAX(min_max);
  1.1775 +    length = max - min + 1;
  1.1776 +    perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
  1.1777 +
  1.1778 +    min *= sizeof(l1_pgentry_t);
  1.1779 +    length *= sizeof(l1_pgentry_t);
  1.1780 +
  1.1781 +    original = map_domain_page(gmfn);
  1.1782 +    snapshot = map_domain_page(smfn);
  1.1783 +    memcpy(snapshot + min, original + min, length);
  1.1784 +    unmap_domain_page(original);
  1.1785 +    unmap_domain_page(snapshot);
  1.1786 +
  1.1787 +    return smfn;
  1.1788 +}
  1.1789 +
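         +/*
         + * A quick example of the bounded copy above, assuming 4-byte non-PAE
         + * l1_pgentry_t: if the shadow L1 recorded min=7, max=42 then
         + * length = 42 - 7 + 1 = 36 entries, so only 144 bytes starting at byte
         + * offset 28 are copied into the snapshot rather than the whole 4KB
         + * frame.
         + */
         +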
  1.1790 +static void
  1.1791 +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
  1.1792 +{
  1.1793 +    void *snapshot;
  1.1794 +
  1.1795 +    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
  1.1796 +        return;
  1.1797 +
  1.1798 +    // Clear the out_of_sync bit.
  1.1799 +    //
  1.1800 +    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
  1.1801 +
  1.1802 +    // XXX Need to think about how to protect the domain's
  1.1803 +    // information less expensively.
  1.1804 +    //
  1.1805 +    snapshot = map_domain_page(entry->snapshot_mfn);
  1.1806 +    memset(snapshot, 0, PAGE_SIZE);
  1.1807 +    unmap_domain_page(snapshot);
  1.1808 +
  1.1809 +    put_shadow_ref(entry->snapshot_mfn);
  1.1810 +}
  1.1811 +
  1.1812 +struct out_of_sync_entry *
  1.1813 +shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
  1.1814 +                             unsigned long mfn)
  1.1815 +{
  1.1816 +    struct domain *d = v->domain;
  1.1817 +    struct pfn_info *page = &frame_table[mfn];
  1.1818 +    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
  1.1819 +
  1.1820 +    ASSERT(shadow_lock_is_acquired(d));
  1.1821 +    ASSERT(pfn_valid(mfn));
  1.1822 +
  1.1823 +#ifndef NDEBUG
  1.1824 +    u32 type = page->u.inuse.type_info & PGT_type_mask;
  1.1825 +    if ( shadow_mode_refcounts(d) )
  1.1826 +    {
  1.1827 +        ASSERT(type == PGT_writable_page);
  1.1828 +    }
  1.1829 +    else
  1.1830 +    {
  1.1831 +        ASSERT(type && (type < PGT_l4_page_table));
  1.1832 +    }
  1.1833 +#endif
  1.1834 +
  1.1835 +    FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
  1.1836 +            gpfn, mfn, page->count_info, page->u.inuse.type_info);
  1.1837 +
  1.1838 +    // XXX this will require some more thought...  Cross-domain sharing and
  1.1839 +    //     modification of page tables?  Hmm...
  1.1840 +    //
  1.1841 +    if ( d != page_get_owner(page) )
  1.1842 +        BUG();
  1.1843 +
  1.1844 +    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
  1.1845 +
  1.1846 +    entry->gpfn = gpfn;
  1.1847 +    entry->gmfn = mfn;
  1.1848 +    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
  1.1849 +    entry->writable_pl1e = -1;
  1.1850 +
  1.1851 +#if SHADOW_DEBUG
  1.1852 +    mark_shadows_as_reflecting_snapshot(d, gpfn);
  1.1853 +#endif
  1.1854 +
  1.1855 +    // increment guest's ref count to represent the entry in the
  1.1856 +    // full shadow out-of-sync list.
  1.1857 +    //
  1.1858 +    get_page(page, d);
  1.1859 +
  1.1860 +    // Add to the out-of-sync list
  1.1861 +    //
  1.1862 +    entry->next = d->arch.out_of_sync;
  1.1863 +    d->arch.out_of_sync = entry;
  1.1864 +
  1.1865 +    return entry;
  1.1866 +}
  1.1867 +
  1.1868 +void shadow_mark_va_out_of_sync(
  1.1869 +    struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
  1.1870 +{
  1.1871 +    struct out_of_sync_entry *entry =
  1.1872 +        shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
  1.1873 +    l2_pgentry_t sl2e;
  1.1874 +
  1.1875 +    // We need the address of the shadow PTE that maps @va.
  1.1876 +    // It might not exist yet.  Make sure it's there.
  1.1877 +    //
  1.1878 +    __shadow_get_l2e(v, va, &sl2e);
  1.1879 +    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
  1.1880 +    {
  1.1881 +        // either this L1 isn't shadowed yet, or the shadow isn't linked into
  1.1882 +        // the current L2.
  1.1883 +        shadow_map_l1_into_current_l2(va);
  1.1884 +        __shadow_get_l2e(v, va, &sl2e);
  1.1885 +    }
  1.1886 +    ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
  1.1887 +
  1.1888 +    // NB: this is stored as a machine address.
  1.1889 +    entry->writable_pl1e =
  1.1890 +        l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
  1.1891 +    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
  1.1892 +
  1.1893 +    // Increment shadow's page count to represent the reference
  1.1894 +    // inherent in entry->writable_pl1e
  1.1895 +    //
  1.1896 +    if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
  1.1897 +        BUG();
  1.1898 +
  1.1899 +    FSH_LOG("mark_out_of_sync(va=%lx -> writable_pl1e=%lx)",
  1.1900 +            va, entry->writable_pl1e);
  1.1901 +}
  1.1902 +
  1.1903 +/*
  1.1904 + * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
  1.1905 + * Returns 0 otherwise.
  1.1906 + */
  1.1907 +static int snapshot_entry_matches(
  1.1908 +    struct domain *d, l1_pgentry_t *guest_pt,
  1.1909 +    unsigned long gpfn, unsigned index)
  1.1910 +{
  1.1911 +    unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
  1.1912 +    l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
  1.1913 +    int entries_match;
  1.1914 +
  1.1915 +    perfc_incrc(snapshot_entry_matches_calls);
  1.1916 +
  1.1917 +    if ( !smfn )
  1.1918 +        return 0;
  1.1919 +
  1.1920 +    snapshot = map_domain_page(smfn);
  1.1921 +
  1.1922 +    if (__copy_from_user(&gpte, &guest_pt[index],
  1.1923 +                         sizeof(gpte)))
  1.1924 +    {
         +        unmap_domain_page(snapshot); /* don't leak the mapping */
         +        return 0;
         +    }
  1.1925 +
  1.1926 +    // This could probably be smarter, but this is sufficient for
  1.1927 +    // our current needs.
  1.1928 +    //
  1.1929 +    entries_match = !l1e_has_changed(gpte, snapshot[index],
  1.1930 +                                     PAGE_FLAG_MASK);
  1.1931 +
  1.1932 +    unmap_domain_page(snapshot);
  1.1933 +
  1.1934 +#ifdef PERF_COUNTERS
  1.1935 +    if ( entries_match )
  1.1936 +        perfc_incrc(snapshot_entry_matches_true);
  1.1937 +#endif
  1.1938 +
  1.1939 +    return entries_match;
  1.1940 +}
  1.1941 +
  1.1942 +/*
  1.1943 + * Returns 1 if va's shadow mapping is out-of-sync.
  1.1944 + * Returns 0 otherwise.
  1.1945 + */
  1.1946 +int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
  1.1947 +{
  1.1948 +    struct domain *d = v->domain;
  1.1949 +    unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
  1.1950 +    unsigned long l2pfn = __mfn_to_gpfn(d, l2mfn);
  1.1951 +    l2_pgentry_t l2e;
  1.1952 +    unsigned long l1pfn, l1mfn;
  1.1953 +
  1.1954 +    ASSERT(shadow_lock_is_acquired(d));
  1.1955 +    ASSERT(VALID_M2P(l2pfn));
  1.1956 +
  1.1957 +    perfc_incrc(shadow_out_of_sync_calls);
  1.1958 +
  1.1959 +    if ( page_out_of_sync(&frame_table[l2mfn]) &&
  1.1960 +         !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
  1.1961 +                                 l2pfn, l2_table_offset(va)) )
  1.1962 +        return 1;
  1.1963 +
  1.1964 +    __guest_get_l2e(v, va, &l2e);
  1.1965 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
  1.1966 +        return 0;
  1.1967 +
  1.1968 +    l1pfn = l2e_get_pfn(l2e);
  1.1969 +    l1mfn = __gpfn_to_mfn(d, l1pfn);
  1.1970 +
  1.1971 +    // If the l1 pfn is invalid, it can't be out of sync...
  1.1972 +    if ( !VALID_MFN(l1mfn) )
  1.1973 +        return 0;
  1.1974 +
  1.1975 +    if ( page_out_of_sync(&frame_table[l1mfn]) &&
  1.1976 +         !snapshot_entry_matches(
  1.1977 +             d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)],
  1.1978 +             l1pfn, l1_table_offset(va)) )
  1.1979 +        return 1;
  1.1980 +
  1.1981 +    return 0;
  1.1982 +}
  1.1983 +
  1.1984 +#define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
  1.1985 +static inline unsigned long
  1.1986 +predict_writable_pte_page(struct domain *d, unsigned long gpfn)
  1.1987 +{
  1.1988 +    return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
  1.1989 +}
  1.1990 +
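         +/*
         + * GPFN_TO_GPTEPAGE() buckets gpfns by the number of PTEs that fit in
         + * one page (1024 with 4KB pages and 4-byte non-PAE entries), and the
         + * writable-PTE prediction is keyed on that bucket.  For example, gpfn
         + * 0x12345 falls in bucket 0x48, so every frame in gpfns
         + * 0x12000-0x123ff shares that one PGT_writable_pred entry.
         + */
         +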
  1.1991 +static inline void
  1.1992 +increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
  1.1993 +{
  1.1994 +    unsigned long score = prediction & PGT_score_mask;
  1.1995 +    int create = (score == 0);
  1.1996 +
  1.1997 +    // saturating addition
  1.1998 +    score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
  1.1999 +    score = score ? score : PGT_score_mask;
  1.2000 +
  1.2001 +    prediction = (prediction & PGT_mfn_mask) | score;
  1.2002 +
  1.2003 +    //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
  1.2004 +    set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
  1.2005 +
  1.2006 +    if ( create )
  1.2007 +        perfc_incr(writable_pte_predictions);
  1.2008 +}
  1.2009 +
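         +/*
         + * The update above is a saturating add confined to the PGT_score_mask
         + * field: the increment is 1 << PGT_score_shift, and if the masked sum
         + * wraps to zero the "score ? score : PGT_score_mask" line clamps it
         + * back to the maximum.  For a hypothetical 2-bit field the successive
         + * scores would be 1, 2, 3, 3, 3, ... -- never wrapping back to 0,
         + * which would look like "no prediction" to the code below.
         + */
         +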
  1.2010 +static inline void
  1.2011 +decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
  1.2012 +{
  1.2013 +    unsigned long score = prediction & PGT_score_mask;
  1.2014 +    ASSERT(score);
  1.2015 +
  1.2016 +    // divide score by 2...  We don't like bad predictions.
  1.2017 +    //
  1.2018 +    score = (score >> 1) & PGT_score_mask;
  1.2019 +
  1.2020 +    prediction = (prediction & PGT_mfn_mask) | score;
  1.2021 +
  1.2022 +    //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
  1.2023 +
  1.2024 +    if ( score )
  1.2025 +        set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred);
  1.2026 +    else
  1.2027 +    {
  1.2028 +        delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred);
  1.2029 +        perfc_decr(writable_pte_predictions);
  1.2030 +    }
  1.2031 +}
  1.2032 +
  1.2033 +static void
  1.2034 +free_writable_pte_predictions(struct domain *d)
  1.2035 +{
  1.2036 +    int i;
  1.2037 +    struct shadow_status *x;
  1.2038 +
  1.2039 +    for ( i = 0; i < shadow_ht_buckets; i++ )
  1.2040 +    {
  1.2041 +        u32 count;
  1.2042 +        unsigned long *gpfn_list;
  1.2043 +
  1.2044 +        /* Skip empty buckets. */
  1.2045 +        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
  1.2046 +            continue;
  1.2047 +
  1.2048 +        count = 0;
  1.2049 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
  1.2050 +            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
  1.2051 +                count++;
  1.2052 +
  1.2053 +        gpfn_list = xmalloc_array(unsigned long, count);
         +        if ( count && (gpfn_list == NULL) )
         +            BUG(); /* XXX Should be more graceful here. */
  1.2054 +        count = 0;
  1.2055 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
  1.2056 +            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
  1.2057 +                gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
  1.2058 +
  1.2059 +        while ( count )
  1.2060 +        {
  1.2061 +            count--;
  1.2062 +            delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
  1.2063 +        }
  1.2064 +
  1.2065 +        xfree(gpfn_list);
  1.2066 +    }
  1.2067 +}
  1.2068 +
  1.2069 +static u32 remove_all_write_access_in_ptpage(
  1.2070 +    struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
  1.2071 +    unsigned long readonly_gpfn, unsigned long readonly_gmfn,
  1.2072 +    u32 max_refs_to_find, unsigned long prediction)
  1.2073 +{
  1.2074 +    l1_pgentry_t *pt = map_domain_page(pt_mfn);
  1.2075 +    l1_pgentry_t match;
  1.2076 +    unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
  1.2077 +    int i;
  1.2078 +    u32 found = 0;
  1.2079 +    int is_l1_shadow =
  1.2080 +        ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
  1.2081 +         PGT_l1_shadow);
  1.2082 +
  1.2083 +    match = l1e_from_pfn(readonly_gmfn, flags);
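         +    // 'match' is a template PTE: l1e_has_changed(pt[i], match, flags)
         +    // compares the frame number plus only the flag bits named in
         +    // 'flags', so an entry that compares equal is precisely a present,
         +    // writable mapping of readonly_gmfn (other flag bits are ignored).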
  1.2084 +
  1.2085 +    // returns true if all refs have been found and fixed.
  1.2086 +    //
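         +    // (Note that fix_entry() is a nested function -- a GCC extension --
         +    //  so it can read 'pt' and update 'found' directly in the enclosing
         +    //  scope.)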
  1.2087 +    int fix_entry(int i)
  1.2088 +    {
  1.2089 +        l1_pgentry_t old = pt[i];
  1.2090 +        l1_pgentry_t new = old;
  1.2091 +
  1.2092 +        l1e_remove_flags(new,_PAGE_RW);
  1.2093 +        if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
  1.2094 +            BUG();
  1.2095 +        found++;
  1.2096 +        pt[i] = new;
  1.2097 +        if ( is_l1_shadow )
  1.2098 +            shadow_put_page_from_l1e(old, d);
  1.2099 +
  1.2100 +#if 0
  1.2101 +        printk("removed write access to pfn=%lx mfn=%lx in smfn=%lx entry %x "
  1.2102 +               "is_l1_shadow=%d\n",
  1.2103 +               readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow);
  1.2104 +#endif
  1.2105 +
  1.2106 +        return (found == max_refs_to_find);
  1.2107 +    }
  1.2108 +
  1.2109 +    i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1);
  1.2110 +    if ( !l1e_has_changed(pt[i], match, flags) && fix_entry(i) )
  1.2111 +    {
  1.2112 +        perfc_incrc(remove_write_fast_exit);
  1.2113 +        increase_writable_pte_prediction(d, readonly_gpfn, prediction);
  1.2114 +        unmap_domain_page(pt);
  1.2115 +        return found;
  1.2116 +    }
  1.2117 + 
  1.2118 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  1.2119 +    {
  1.2120 +        if ( unlikely(!l1e_has_changed(pt[i], match, flags)) && fix_entry(i) )
  1.2121 +            break;
  1.2122 +    }
  1.2123 +
  1.2124 +    unmap_domain_page(pt);
  1.2125 +
  1.2126 +    return found;
  1.2127 +#undef MATCH_ENTRY
  1.2128 +}
  1.2129 +
  1.2130 +int shadow_remove_all_write_access(
  1.2131 +    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
  1.2132 +{
  1.2133 +    int i;
  1.2134 +    struct shadow_status *a;
  1.2135 +    u32 found = 0, fixups, write_refs;
  1.2136 +    unsigned long prediction, predicted_gpfn, predicted_smfn;
  1.2137 +
  1.2138 +    ASSERT(shadow_lock_is_acquired(d));
  1.2139 +    ASSERT(VALID_MFN(readonly_gmfn));
  1.2140 +
  1.2141 +    perfc_incrc(remove_write_access);
  1.2142 +
  1.2143 +    // If it's not a writable page, then no writable refs can be outstanding.
  1.2144 +    //
  1.2145 +    if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) !=
  1.2146 +         PGT_writable_page )
  1.2147 +    {
  1.2148 +        perfc_incrc(remove_write_not_writable);
  1.2149 +        return 1;
  1.2150 +    }
  1.2151 +
  1.2152 +    // How many outstanding writable PTEs for this page are there?
  1.2153 +    //
  1.2154 +    write_refs =
  1.2155 +        (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
  1.2156 +    if ( write_refs && MFN_PINNED(readonly_gmfn) )
  1.2157 +    {
  1.2158 +        write_refs--;
  1.2159 +    }
  1.2160 +
  1.2161 +    if ( write_refs == 0 )
  1.2162 +    {
  1.2163 +        perfc_incrc(remove_write_no_work);
  1.2164 +        return 1;
  1.2165 +    }
  1.2166 +
  1.2167 +    // Before searching all the L1 page tables, check the typical culprit first
  1.2168 +    //
  1.2169 +    if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) )
  1.2170 +    {
  1.2171 +        predicted_gpfn = prediction & PGT_mfn_mask;
  1.2172 +        if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) &&
  1.2173 +             (fixups = remove_all_write_access_in_ptpage(
         +                 d, predicted_gpfn, predicted_smfn, readonly_gpfn,
         +                 readonly_gmfn, write_refs, prediction)) )
  1.2174 +        {
  1.2175 +            found += fixups;
  1.2176 +            if ( found == write_refs )
  1.2177 +            {
  1.2178 +                perfc_incrc(remove_write_predicted);
  1.2179 +                return 1;
  1.2180 +            }
  1.2181 +        }
  1.2182 +        else
  1.2183 +        {
  1.2184 +            perfc_incrc(remove_write_bad_prediction);
  1.2185 +            decrease_writable_pte_prediction(d, readonly_gpfn, prediction);
  1.2186 +        }
  1.2187 +    }
  1.2188 +
  1.2189 +    // Search all the shadow L1 page tables...
  1.2190 +    //
  1.2191 +    for (i = 0; i < shadow_ht_buckets; i++)
  1.2192 +    {
  1.2193 +        a = &d->arch.shadow_ht[i];
  1.2194 +        while ( a && a->gpfn_and_flags )
  1.2195 +        {
  1.2196 +            if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
  1.2197 +            {
  1.2198 +                found += remove_all_write_access_in_ptpage(
         +                    d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn,
         +                    readonly_gpfn, readonly_gmfn, write_refs - found,
         +                    a->gpfn_and_flags & PGT_mfn_mask);
  1.2199 +                if ( found == write_refs )
  1.2200 +                    return 1;
  1.2201 +            }
  1.2202 +
  1.2203 +            a = a->next;
  1.2204 +        }
  1.2205 +    }
  1.2206 +
  1.2207 +    FSH_LOG("%s: looking for %d refs, found %d refs",
  1.2208 +            __func__, write_refs, found);
  1.2209 +
  1.2210 +    return 0;
  1.2211 +}
  1.2212 +
  1.2213 +static u32 remove_all_access_in_page(
  1.2214 +    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
  1.2215 +{
  1.2216 +    l1_pgentry_t *pl1e = map_domain_page(l1mfn);
  1.2217 +    l1_pgentry_t match;
  1.2218 +    unsigned long flags  = _PAGE_PRESENT;
  1.2219 +    int i;
  1.2220 +    u32 count = 0;
  1.2221 +    int is_l1_shadow =
  1.2222 +        ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
  1.2223 +         PGT_l1_shadow);
  1.2224 +
  1.2225 +    match = l1e_from_pfn(forbidden_gmfn, flags);
  1.2226 +    
  1.2227 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  1.2228 +    {
  1.2229 +        /* Only zap entries that are present mappings of forbidden_gmfn. */
         +        if ( unlikely(!l1e_has_changed(pl1e[i], match, flags)) )
  1.2230 +        {
  1.2231 +            l1_pgentry_t ol2e = pl1e[i];
  1.2232 +            pl1e[i] = l1e_empty();
  1.2233 +            count++;
  1.2234 +
  1.2235 +            if ( is_l1_shadow )
  1.2236 +                shadow_put_page_from_l1e(ol2e, d);
  1.2237 +            else /* must be an hl2 page */
  1.2238 +                put_page(&frame_table[forbidden_gmfn]);
  1.2239 +        }
  1.2240 +    }
  1.2241 +
  1.2242 +    unmap_domain_page(pl1e);
  1.2243 +
  1.2244 +    return count;
  1.2245 +}
  1.2246 +
  1.2247 +u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
  1.2248 +{
  1.2249 +    int i;
  1.2250 +    struct shadow_status *a;
  1.2251 +    u32 count = 0;
  1.2252 +
  1.2253 +    if ( unlikely(!shadow_mode_enabled(d)) )
  1.2254 +        return 0;
  1.2255 +
  1.2256 +    ASSERT(shadow_lock_is_acquired(d));
  1.2257 +    perfc_incrc(remove_all_access);
  1.2258 +
  1.2259 +    for (i = 0; i < shadow_ht_buckets; i++)
  1.2260 +    {
  1.2261 +        a = &d->arch.shadow_ht[i];
  1.2262 +        while ( a && a->gpfn_and_flags )
  1.2263 +        {
  1.2264 +            switch (a->gpfn_and_flags & PGT_type_mask)
  1.2265 +            {
  1.2266 +            case PGT_l1_shadow:
  1.2267 +            case PGT_l2_shadow:
  1.2268 +            case PGT_l3_shadow:
  1.2269 +            case PGT_l4_shadow:
  1.2270 +            case PGT_hl2_shadow:
  1.2271 +                count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
  1.2272 +                break;
  1.2273 +            case PGT_snapshot:
  1.2274 +            case PGT_writable_pred:
  1.2275 +                // these can't hold refs to the forbidden page
  1.2276 +                break;
  1.2277 +            default:
  1.2278 +                BUG();
  1.2279 +            }
  1.2280 +
  1.2281 +            a = a->next;
  1.2282 +        }
  1.2283 +    }
  1.2284 +
  1.2285 +    return count;
  1.2286 +}    
  1.2287 +
  1.2288 +static int resync_all(struct domain *d, u32 stype)
  1.2289 +{
  1.2290 +    struct out_of_sync_entry *entry;
  1.2291 +    unsigned i;
  1.2292 +    unsigned long smfn;
  1.2293 +    void *guest, *shadow, *snapshot;
  1.2294 +    int need_flush = 0, external = shadow_mode_external(d);
  1.2295 +    int unshadow;
  1.2296 +    int changed;
  1.2297 +
  1.2298 +    ASSERT(shadow_lock_is_acquired(d));
  1.2299 +
  1.2300 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
  1.2301 +    {
  1.2302 +        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
  1.2303 +            continue;
  1.2304 +
  1.2305 +        smfn = __shadow_status(d, entry->gpfn, stype);
  1.2306 +
  1.2307 +        if ( !smfn )
  1.2308 +        {
  1.2309 +            if ( shadow_mode_refcounts(d) )
  1.2310 +                continue;
  1.2311 +
  1.2312 +            // For lightweight shadows, even when no shadow page exists,
  1.2313 +            // we need to resync the refcounts to the new contents of the
  1.2314 +            // guest page.
  1.2315 +            // This only applies when we have writable page tables.
  1.2316 +            //
  1.2317 +            if ( !shadow_mode_write_all(d) &&
  1.2318 +                 !((stype == PGT_l1_shadow) &&
  1.2319 +                   VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
  1.2320 +                // Page is not writable -- no resync necessary
  1.2321 +                continue;
  1.2322 +        }
  1.2323 +
  1.2324 +        FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
  1.2325 +                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
  1.2326 +
  1.2327 +        // Compare guest's new contents to its snapshot, validating
  1.2328 +        // and updating its shadow as appropriate.
  1.2329 +        //
  1.2330 +        guest    = map_domain_page(entry->gmfn);
  1.2331 +        snapshot = map_domain_page(entry->snapshot_mfn);
  1.2332 +
  1.2333 +        if ( smfn )
  1.2334 +            shadow = map_domain_page(smfn);
  1.2335 +        else
  1.2336 +            shadow = NULL;
  1.2337 +
  1.2338 +        unshadow = 0;
  1.2339 +
  1.2340 +        switch ( stype ) {
  1.2341 +        case PGT_l1_shadow:
  1.2342 +        {
  1.2343 +            l1_pgentry_t *guest1 = guest;
  1.2344 +            l1_pgentry_t *shadow1 = shadow;
  1.2345 +            l1_pgentry_t *snapshot1 = snapshot;
  1.2346 +
  1.2347 +            ASSERT(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ||
  1.2348 +                   shadow_mode_write_all(d));
  1.2349 +
  1.2350 +            if ( !shadow_mode_refcounts(d) )
  1.2351 +                revalidate_l1(d, guest1, snapshot1);
  1.2352 +
  1.2353 +            if ( !smfn )
  1.2354 +                break;
  1.2355 +
  1.2356 +            u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
  1.2357 +            int min_shadow = SHADOW_MIN(min_max_shadow);
  1.2358 +            int max_shadow = SHADOW_MAX(min_max_shadow);
  1.2359 +
  1.2360 +            u32 min_max_snapshot =
  1.2361 +                pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
  1.2362 +            int min_snapshot = SHADOW_MIN(min_max_snapshot);
  1.2363 +            int max_snapshot = SHADOW_MAX(min_max_snapshot);
  1.2364 +
  1.2365 +            changed = 0;
  1.2366 +
  1.2367 +            for ( i = min_shadow; i <= max_shadow; i++ )
  1.2368 +            {
  1.2369 +                if ( (i < min_snapshot) || (i > max_snapshot) ||
  1.2370 +                     l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
  1.2371 +                {
  1.2372 +                    need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]);
  1.2373 +
  1.2374 +                    // can't update snapshots of linear page tables -- they
  1.2375 +                    // are used multiple times...
  1.2376 +                    //
  1.2377 +                    // snapshot[i] = new_pte;
  1.2378 +
  1.2379 +                    changed++;
  1.2380 +                }
  1.2381 +            }
  1.2382 +            perfc_incrc(resync_l1);
  1.2383 +            perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
  1.2384 +            perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
  1.2385 +            break;
  1.2386 +        }
  1.2387 +        case PGT_l2_shadow:
  1.2388 +        {
  1.2389 +            int max = -1;
  1.2390 +
  1.2391 +            l2_pgentry_t *guest2 = guest;
  1.2392 +            l2_pgentry_t *shadow2 = shadow;
  1.2393 +            l2_pgentry_t *snapshot2 = snapshot;
  1.2394 +
  1.2395 +            ASSERT(shadow_mode_write_all(d));
  1.2396 +            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
  1.2397 +
  1.2398 +            changed = 0;
  1.2399 +            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  1.2400 +            {
  1.2401 +#if CONFIG_X86_PAE
  1.2402 +                BUG();  /* FIXME: need type_info */
  1.2403 +#endif
  1.2404 +                if ( !is_guest_l2_slot(0,i) && !external )
  1.2405 +                    continue;
  1.2406 +
  1.2407 +                l2_pgentry_t new_pde = guest2[i];
  1.2408 +                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
  1.2409 +                {
  1.2410 +                    need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
  1.2411 +
  1.2412 +                    // can't update snapshots of linear page tables -- they
  1.2413 +                    // are used multiple times...
  1.2414 +                    //
  1.2415 +                    // snapshot[i] = new_pde;
  1.2416 +
  1.2417 +                    changed++;
  1.2418 +                }
  1.2419 +                if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
  1.2420 +                    max = i;
  1.2421 +
  1.2422 +                // XXX - This hack works for linux guests.
  1.2423 +                //       Need a better solution long term.
  1.2424 +                if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
  1.2425 +                     unlikely(l2e_get_intpte(new_pde) != 0) &&
  1.2426 +                     !unshadow && MFN_PINNED(smfn) )
  1.2427 +                    unshadow = 1;
  1.2428 +            }
  1.2429 +            if ( max == -1 )
  1.2430 +                unshadow = 1;
  1.2431 +            perfc_incrc(resync_l2);
  1.2432 +            perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
  1.2433 +            break;
  1.2434 +        }
  1.2435 +        case PGT_hl2_shadow:
  1.2436 +        {
  1.2437 +            l2_pgentry_t *guest2 = guest;
  1.2438 +            l2_pgentry_t *snapshot2 = snapshot;
  1.2439 +            l1_pgentry_t *shadow2 = shadow;
  1.2440 +            
  1.2441 +            ASSERT(shadow_mode_write_all(d));
  1.2442 +            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
  1.2443 +
  1.2444 +            changed = 0;
  1.2445 +            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  1.2446 +            {
  1.2447 +#if CONFIG_X86_PAE
  1.2448 +                BUG();  /* FIXME: need type_info */
  1.2449 +#endif
  1.2450 +                if ( !is_guest_l2_slot(0, i) && !external )
  1.2451 +                    continue;
  1.2452 +
  1.2453 +                l2_pgentry_t new_pde = guest2[i];
  1.2454 +                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
  1.2455 +                {
  1.2456 +                    need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
  1.2457 +
  1.2458 +                    // can't update snapshots of linear page tables -- they
  1.2459 +                    // are used multiple times...
  1.2460 +                    //
  1.2461 +                    // snapshot[i] = new_pde;
  1.2462 +
  1.2463 +                    changed++;
  1.2464 +                }
  1.2465 +            }
  1.2466 +            perfc_incrc(resync_hl2);
  1.2467 +            perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
  1.2468 +            break;
  1.2469 +        }
  1.2470 +        default:
  1.2471 +            BUG();
  1.2472 +        }
  1.2473 +
  1.2474 +        if ( smfn )
  1.2475 +            unmap_domain_page(shadow);
  1.2476 +        unmap_domain_page(snapshot);
  1.2477 +        unmap_domain_page(guest);
  1.2478 +
  1.2479 +        if ( unlikely(unshadow) )
  1.2480 +        {
  1.2481 +            perfc_incrc(unshadow_l2_count);
  1.2482 +            shadow_unpin(smfn);
  1.2483 +            if ( unlikely(shadow_mode_external(d)) )
  1.2484 +            {
  1.2485 +                unsigned long hl2mfn;
  1.2486 +
  1.2487 +                if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
  1.2488 +                     MFN_PINNED(hl2mfn) )
  1.2489 +                    shadow_unpin(hl2mfn);
  1.2490 +            }
  1.2491 +        }
  1.2492 +    }
  1.2493 +
  1.2494 +    return need_flush;
  1.2495 +}
  1.2496 +
  1.2497 +void __shadow_sync_all(struct domain *d)
  1.2498 +{
  1.2499 +    struct out_of_sync_entry *entry;
  1.2500 +    int need_flush = 0;
  1.2501 +
  1.2502 +    perfc_incrc(shadow_sync_all);
  1.2503 +
  1.2504 +    ASSERT(shadow_lock_is_acquired(d));
  1.2505 +
  1.2506 +    // First, remove all write permissions to the page tables
  1.2507 +    //
  1.2508 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
  1.2509 +    {
  1.2510 +        // Skip entries that have low bits set...  Those aren't
  1.2511 +        // real PTEs.
  1.2512 +        //
  1.2513 +        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
  1.2514 +            continue;
  1.2515 +
  1.2516 +        l1_pgentry_t *ppte = (l1_pgentry_t *)(
  1.2517 +            (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
  1.2518 +            (entry->writable_pl1e & ~PAGE_MASK));
  1.2519 +        l1_pgentry_t opte = *ppte;
  1.2520 +        l1_pgentry_t npte = opte;
  1.2521 +        l1e_remove_flags(npte, _PAGE_RW);
  1.2522 +
  1.2523 +        if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
  1.2524 +             !shadow_get_page_from_l1e(npte, d) )
  1.2525 +            BUG();
  1.2526 +        *ppte = npte;
  1.2527 +        shadow_put_page_from_l1e(opte, d);
  1.2528 +
  1.2529 +        unmap_domain_page(ppte);
  1.2530 +    }
  1.2531 +
  1.2532 +    // XXX mafetter: SMP
  1.2533 +    //
  1.2534 +    // With the current algorithm, we've gotta flush all the TLBs
  1.2535 +    // before we can safely continue.  I don't think we want to
  1.2536 +    // do it this way, so I think we should consider making
  1.2537 +    // entirely private copies of the shadow for each vcpu, and/or
  1.2538 +    // possibly having a mix of private and shared shadow state
  1.2539 +    // (any path from a PTE that grants write access to an out-of-sync
  1.2540 +    // page table page needs to be vcpu private).
  1.2541 +    //
  1.2542 +#if 0 // this should be enabled for SMP guests...
  1.2543 +    flush_tlb_mask(cpu_online_map);
  1.2544 +#endif
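         +    /* Be conservative for now and force the local TLB flush below. */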
  1.2545 +    need_flush = 1;
  1.2546 +
  1.2547 +    // Second, resync all L1 pages, then L2 pages, etc...
  1.2548 +    //
  1.2549 +    need_flush |= resync_all(d, PGT_l1_shadow);
  1.2550 +    if ( shadow_mode_translate(d) )
  1.2551 +        need_flush |= resync_all(d, PGT_hl2_shadow);
  1.2552 +    need_flush |= resync_all(d, PGT_l2_shadow);
  1.2553 +
  1.2554 +    if ( need_flush && !unlikely(shadow_mode_external(d)) )
  1.2555 +        local_flush_tlb();
  1.2556 +
  1.2557 +    free_out_of_sync_state(d);
  1.2558 +}
  1.2559 +
  1.2560 +int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
  1.2561 +{
  1.2562 +    l1_pgentry_t gpte, spte, orig_gpte;
  1.2563 +    struct vcpu *v = current;
  1.2564 +    struct domain *d = v->domain;
  1.2565 +    l2_pgentry_t gpde;
  1.2566 +
  1.2567 +    spte = l1e_empty();
  1.2568 +
  1.2569 +    SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
  1.2570 +             va, (unsigned long)regs->error_code);
  1.2571 +    perfc_incrc(shadow_fault_calls);
  1.2572 +    
  1.2573 +    check_pagetable(v, "pre-sf");
  1.2574 +
  1.2575 +    /*
  1.2576 +     * Don't let someone else take the guest's table pages out-of-sync.
  1.2577 +     */
  1.2578 +    shadow_lock(d);
  1.2579 +
  1.2580 +    /* XXX - FIX THIS COMMENT!!!
  1.2581 +     * STEP 1. Check to see if this fault might have been caused by an
  1.2582 +     *         out-of-sync table page entry, or if we should pass this
  1.2583 +     *         fault onto the guest.
  1.2584 +     */
  1.2585 +    __shadow_sync_va(v, va);
  1.2586 +
  1.2587 +    /*
  1.2588 +     * STEP 2. Check the guest PTE.
  1.2589 +     */
  1.2590 +    __guest_get_l2e(v, va, &gpde);
  1.2591 +    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
  1.2592 +    {
  1.2593 +        SH_VVLOG("shadow_fault - EXIT: L1 not present");
  1.2594 +        perfc_incrc(shadow_fault_bail_pde_not_present);
  1.2595 +        goto fail;
  1.2596 +    }
  1.2597 +
  1.2598 +    // This can't fault because we hold the shadow lock and we've ensured that
  1.2599 +    // the mapping is in-sync, so the check of the PDE's present bit, above,
  1.2600 +    // covers this access.
  1.2601 +    //
  1.2602 +    orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
  1.2603 +    if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
  1.2604 +    {
  1.2605 +        SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
  1.2606 +                 l1e_get_intpte(gpte));
  1.2607 +        perfc_incrc(shadow_fault_bail_pte_not_present);
  1.2608 +        goto fail;
  1.2609 +    }
  1.2610 +
  1.2611 +    /* Write fault? */
  1.2612 +    if ( regs->error_code & 2 )  
  1.2613 +    {
  1.2614 +        int allow_writes = 0;
  1.2615 +
  1.2616 +        if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
  1.2617 +        {
  1.2618 +            if ( shadow_mode_page_writable(d, l1e_get_pfn(gpte)) )
  1.2619 +            {
  1.2620 +                allow_writes = 1;
  1.2621 +                l1e_add_flags(gpte, _PAGE_RW);
  1.2622 +            }
  1.2623 +            else
  1.2624 +            {
  1.2625 +                /* Write fault on a read-only mapping. */
  1.2626 +                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", 
  1.2627 +                         l1e_get_intpte(gpte));
  1.2628 +                perfc_incrc(shadow_fault_bail_ro_mapping);
  1.2629 +                goto fail;
  1.2630 +            }
  1.2631 +        }
  1.2632 +
  1.2633 +        if ( !l1pte_write_fault(v, &gpte, &spte, va) )
  1.2634 +        {
  1.2635 +            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
  1.2636 +            perfc_incrc(write_fault_bail);
  1.2637 +            shadow_unlock(d);
  1.2638 +            return 0;
  1.2639 +        }
  1.2640 +
  1.2641 +        if ( allow_writes )
  1.2642 +            l1e_remove_flags(gpte, _PAGE_RW);
  1.2643 +    }
  1.2644 +    else
  1.2645 +    {
  1.2646 +        if ( !l1pte_read_fault(d, &gpte, &spte) )
  1.2647 +        {
  1.2648 +            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
  1.2649 +            perfc_incrc(read_fault_bail);
  1.2650 +            shadow_unlock(d);
  1.2651 +            return 0;
  1.2652 +        }
  1.2653 +    }
  1.2654 +
  1.2655 +    /*
  1.2656 +     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
  1.2657 +     */
  1.2658 +    if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
  1.2659 +    {
  1.2660 +        /* XXX Watch out for read-only L2 entries! (not used in Linux). */
  1.2661 +        if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
  1.2662 +                                     &gpte, sizeof(gpte))) )
  1.2663 +        {
  1.2664 +            printk("%s() failed, crashing domain %d "
  1.2665 +                   "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
  1.2666 +                   __func__,d->domain_id, l2e_get_intpte(gpde), va);
  1.2667 +            domain_crash_synchronous();
  1.2668 +        }
  1.2669 +
  1.2670 +        // if necessary, record the page table page as dirty
  1.2671 +        if ( unlikely(shadow_mode_log_dirty(d)) )
  1.2672 +            __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
  1.2673 +    }
  1.2674 +
  1.2675 +    shadow_set_l1e(va, spte, 1);
  1.2676 +
  1.2677 +    perfc_incrc(shadow_fault_fixed);
  1.2678 +    d->arch.shadow_fault_count++;
  1.2679 +
  1.2680 +    shadow_unlock(d);
  1.2681 +
  1.2682 +    check_pagetable(v, "post-sf");
  1.2683 +    return EXCRET_fault_fixed;
  1.2684 +
  1.2685 + fail:
  1.2686 +    shadow_unlock(d);
  1.2687 +    return 0;
  1.2688 +}
  1.2689 +
  1.2690 +void shadow_l1_normal_pt_update(
  1.2691 +    struct domain *d,
  1.2692 +    unsigned long pa, l1_pgentry_t gpte,
  1.2693 +    struct domain_mmap_cache *cache)
  1.2694 +{
  1.2695 +    unsigned long sl1mfn;    
  1.2696 +    l1_pgentry_t *spl1e, spte;
  1.2697 +
  1.2698 +    shadow_lock(d);
  1.2699 +
  1.2700 +    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
  1.2701 +    if ( sl1mfn )
  1.2702 +    {
  1.2703 +        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
  1.2704 +                 (void *)pa, l1e_get_intpte(gpte));
  1.2705 +        l1pte_propagate_from_guest(current->domain, gpte, &spte);
  1.2706 +
  1.2707 +        spl1e = map_domain_page_with_cache(sl1mfn, cache);
  1.2708 +        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
  1.2709 +        unmap_domain_page_with_cache(spl1e, cache);
  1.2710 +    }
  1.2711 +
  1.2712 +    shadow_unlock(d);
  1.2713 +}
  1.2714 +
  1.2715 +void shadow_l2_normal_pt_update(
  1.2716 +    struct domain *d,
  1.2717 +    unsigned long pa, l2_pgentry_t gpde,
  1.2718 +    struct domain_mmap_cache *cache)
  1.2719 +{
  1.2720 +    unsigned long sl2mfn;
  1.2721 +    l2_pgentry_t *spl2e;
  1.2722 +
  1.2723 +    shadow_lock(d);
  1.2724 +
  1.2725 +    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
  1.2726 +    if ( sl2mfn )
  1.2727 +    {
  1.2728 +        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
  1.2729 +                 (void *)pa, l2e_get_intpte(gpde));
  1.2730 +        spl2e = map_domain_page_with_cache(sl2mfn, cache);
  1.2731 +        validate_pde_change(d, gpde,
  1.2732 +                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
  1.2733 +        unmap_domain_page_with_cache(spl2e, cache);
  1.2734 +    }
  1.2735 +
  1.2736 +    shadow_unlock(d);
  1.2737 +}
  1.2738 +
  1.2739 +#if CONFIG_PAGING_LEVELS >= 3
  1.2740 +void shadow_l3_normal_pt_update(
  1.2741 +    struct domain *d,
  1.2742 +    unsigned long pa, l3_pgentry_t gpde,
  1.2743 +    struct domain_mmap_cache *cache)
  1.2744 +{
  1.2745 +    BUG(); // not yet implemented
  1.2746 +}
  1.2747 +#endif
  1.2748 +
  1.2749 +#if CONFIG_PAGING_LEVELS >= 4
  1.2750 +void shadow_l4_normal_pt_update(
  1.2751 +    struct domain *d,
  1.2752 +    unsigned long pa, l4_pgentry_t gpde,
  1.2753 +    struct domain_mmap_cache *cache)
  1.2754 +{
  1.2755 +    BUG(); // not yet implemented
  1.2756 +}
  1.2757 +#endif
  1.2758 +
  1.2759 +int shadow_do_update_va_mapping(unsigned long va,
  1.2760 +                                l1_pgentry_t val,
  1.2761 +                                struct vcpu *v)
  1.2762 +{
  1.2763 +    struct domain *d = v->domain;
  1.2764 +    l1_pgentry_t spte;
  1.2765 +    int rc = 0;
  1.2766 +
  1.2767 +    shadow_lock(d);
  1.2768 +
  1.2769 +    //printk("%s(va=%p, val=%p)\n", __func__, (void *)va, (void *)l1e_get_intpte(val));
  1.2770 +        
  1.2771 +    // This is actually overkill - we don't need to sync the L1 itself,
  1.2772 +    // just everything involved in getting to this L1 (i.e. we need
  1.2773 +    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
  1.2774 +    //
  1.2775 +    __shadow_sync_va(v, va);
  1.2776 +
  1.2777 +    l1pte_propagate_from_guest(d, val, &spte);
  1.2778 +    shadow_set_l1e(va, spte, 0);
  1.2779 +
  1.2780 +    /*
  1.2781 +     * If we're in log-dirty mode then we need to note that we've updated
  1.2782 +     * the PTE in the PT-holding page. We need the machine frame number
  1.2783 +     * for this.
  1.2784 +     */
  1.2785 +    if ( shadow_mode_log_dirty(d) )
  1.2786 +        __mark_dirty(d, va_to_l1mfn(v, va));
  1.2787 +
  1.2788 +// out:
  1.2789 +    shadow_unlock(d);
  1.2790 +
  1.2791 +    return rc;
  1.2792 +}
  1.2793 +
  1.2794 +
  1.2795 +/*
  1.2796 + * What lives where in the 32-bit address space in the various shadow modes,
  1.2797 + * and what it uses to get/maintain that mapping.
  1.2798 + *
  1.2799 + * SHADOW MODE:      none         enable         translate         external
  1.2800 + * 
  1.2801 + * 4KB things:
  1.2802 + * guest_vtable    lin_l2     mapped per gl2   lin_l2 via hl2   mapped per gl2
  1.2803 + * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gl2
  1.2804 + * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gl2
  1.2805 + * monitor_vtable    n/a            n/a             n/a           mapped once
  1.2806 + *
  1.2807 + * 4MB things:
  1.2808 + * guest_linear  lin via gl2    lin via gl2      lin via hl2      lin via hl2
  1.2809 + * shadow_linear     n/a      sh_lin via sl2   sh_lin via sl2   sh_lin via sl2
  1.2810 + * monitor_linear    n/a            n/a             n/a              ???
  1.2811 + * perdomain      perdomain      perdomain       perdomain        perdomain
  1.2812 + * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
  1.2813 + * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
  1.2814 + * P2M               n/a            n/a           R/O M2P          R/O M2P
  1.2815 + *
  1.2816 + * NB:
  1.2817 + * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
  1.2818 + * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
  1.2819 + * all play a part in maintaining these mappings.
  1.2820 + */
  1.2821 +void __update_pagetables(struct vcpu *v)
  1.2822 +{
  1.2823 +    struct domain *d = v->domain;
  1.2824 +    unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
  1.2825 +    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
  1.2826 +    unsigned long smfn, hl2mfn, old_smfn;
  1.2827 +
  1.2828 +    int max_mode = ( shadow_mode_external(d) ? SHM_external
  1.2829 +                     : shadow_mode_translate(d) ? SHM_translate
  1.2830 +                     : shadow_mode_enabled(d) ? SHM_enable
  1.2831 +                     : 0 );
  1.2832 +
  1.2833 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
  1.2834 +    ASSERT( max_mode );
  1.2835 +
  1.2836 +    /*
  1.2837 +     *  arch.guest_vtable
  1.2838 +     */
  1.2839 +    if ( max_mode & (SHM_enable | SHM_external) )
  1.2840 +    {
  1.2841 +        if ( likely(v->arch.guest_vtable != NULL) )
  1.2842 +            unmap_domain_page(v->arch.guest_vtable);
  1.2843 +        v->arch.guest_vtable = map_domain_page(gmfn);
  1.2844 +    }
  1.2845 +
  1.2846 +    /*
  1.2847 +     *  arch.shadow_table
  1.2848 +     */
  1.2849 +    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
  1.2850 +        smfn = shadow_l2_table(d, gpfn, gmfn);
  1.2851 +    if ( !get_shadow_ref(smfn) )
  1.2852 +        BUG();
  1.2853 +    old_smfn = pagetable_get_pfn(v->arch.shadow_table);
  1.2854 +    v->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
  1.2855 +    if ( old_smfn )
  1.2856 +        put_shadow_ref(old_smfn);
  1.2857 +
  1.2858 +    SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
  1.2859 +
  1.2860 +    /*
  1.2861 +     * arch.shadow_vtable
  1.2862 +     */
  1.2863 +    if ( max_mode == SHM_external )
  1.2864 +    {
  1.2865 +        if ( v->arch.shadow_vtable )
  1.2866 +            unmap_domain_page(v->arch.shadow_vtable);
  1.2867 +        v->arch.shadow_vtable = map_domain_page(smfn);
  1.2868 +    }
  1.2869 +
  1.2870 +    /*
  1.2871 +     * arch.hl2_vtable
  1.2872 +     */
  1.2873 +
  1.2874 +    // if max_mode == SHM_translate, then the hl2 is already installed
  1.2875 +    // correctly in its smfn, and there's nothing to do.
  1.2876 +    //
  1.2877 +    if ( max_mode == SHM_external )
  1.2878 +    {
  1.2879 +        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  1.2880 +            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  1.2881 +        if ( v->arch.hl2_vtable )
  1.2882 +            unmap_domain_page(v->arch.hl2_vtable);
  1.2883 +        v->arch.hl2_vtable = map_domain_page(hl2mfn);
  1.2884 +    }
  1.2885 +
  1.2886 +    /*
  1.2887 +     * fixup pointers in monitor table, as necessary
  1.2888 +     */
  1.2889 +    if ( max_mode == SHM_external )
  1.2890 +    {
  1.2891 +        l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
  1.2892 +        l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
  1.2893 +        l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
  1.2894 +
  1.2895 +        ASSERT( shadow_mode_translate(d) );
  1.2896 +
  1.2897 +        if ( !get_shadow_ref(hl2mfn) )
  1.2898 +            BUG();
  1.2899 +        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  1.2900 +            l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
  1.2901 +        if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
  1.2902 +            put_shadow_ref(l2e_get_pfn(old_hl2e));
  1.2903 +
  1.2904 +        if ( !get_shadow_ref(smfn) )
  1.2905 +            BUG();
  1.2906 +        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  1.2907 +            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
  1.2908 +        if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
  1.2909 +            put_shadow_ref(l2e_get_pfn(old_sl2e));
  1.2910 +
  1.2911 +        // XXX - maybe this can be optimized somewhat??
  1.2912 +        local_flush_tlb();
  1.2913 +    }
  1.2914 +}
  1.2915 +
  1.2916 +
  1.2917 +/************************************************************************/
  1.2918 +/************************************************************************/
  1.2919 +/************************************************************************/
  1.2920 +
  1.2921 +#if SHADOW_DEBUG
  1.2922 +
  1.2923 +// The following is entirely for _check_pagetable()'s benefit.
  1.2924 +// _check_pagetable() wants to know whether a given entry in a
  1.2925 +// shadow page table is supposed to be the shadow of the guest's
  1.2926 +// current entry, or the shadow of the entry held in the snapshot
  1.2927 +// taken above.
  1.2928 +//
  1.2929 +// Here, we mark all currently existing entries as reflecting
  1.2930 +// the snapshot, above.  All other places in xen that update
  1.2931 +// the shadow will keep the shadow in sync with the guest's
  1.2932 +// entries (via l1pte_propagate_from_guest and friends), which clear
  1.2933 +// the SHADOW_REFLECTS_SNAPSHOT bit.
  1.2934 +//
  1.2935 +static void
  1.2936 +mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
  1.2937 +{
  1.2938 +    unsigned long smfn;
  1.2939 +    l1_pgentry_t *l1e;
  1.2940 +    l2_pgentry_t *l2e;
  1.2941 +    unsigned i;
  1.2942 +
  1.2943 +    if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
  1.2944 +    {
  1.2945 +        l1e = map_domain_page(smfn);
  1.2946 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  1.2947 +            if ( is_guest_l1_slot(i) &&
  1.2948 +                 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
  1.2949 +                l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
  1.2950 +        unmap_domain_page(l1e);
  1.2951 +    }
  1.2952 +
  1.2953 +    if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
  1.2954 +    {
  1.2955 +        l2e = map_domain_page(smfn);
  1.2956 +        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  1.2957 +            if ( is_guest_l2_slot(0, i) &&
  1.2958 +                 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
  1.2959 +                l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
  1.2960 +        unmap_domain_page(l2e);
  1.2961 +    }
  1.2962 +}
  1.2963 +
  1.2964 +// BUG: these are not SMP safe...
  1.2965 +static int sh_l2_present;
  1.2966 +static int sh_l1_present;
  1.2967 +char * sh_check_name;
  1.2968 +int shadow_status_noswap;
  1.2969 +
  1.2970 +#define v2m(_v, _adr) ({                                                     \
  1.2971 +    unsigned long _a  = (unsigned long)(_adr);                               \
  1.2972 +    l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)];     \
  1.2973 +    unsigned long _pa = -1;                                                  \
  1.2974 +    if ( l2e_get_flags(_pde) & _PAGE_PRESENT )                               \
  1.2975 +    {                                                                        \
  1.2976 +        l1_pgentry_t _pte;                                                   \
  1.2977 +        _pte = shadow_linear_pg_table[l1_linear_offset(_a)];                 \
  1.2978 +        if ( l1e_get_flags(_pte) & _PAGE_PRESENT )                           \
  1.2979 +            _pa = l1e_get_paddr(_pte);                                       \
  1.2980 +    }                                                                        \
  1.2981 +    _pa | (_a & ~PAGE_MASK);                                                 \
  1.2982 +})
  1.2983 +
  1.2984 +#define FAIL(_f, _a...)                                                      \
  1.2985 +    do {                                                                     \
  1.2986 +        printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n",                  \
  1.2987 +               sh_check_name, level, l2_idx, l1_idx, ## _a,                  \
  1.2988 +               __FILE__, __LINE__);                                          \
  1.2989 +        printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte                \
  1.2990 +               " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte               \
  1.2991 +               " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p"               \
  1.2992 +               " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n",                   \
  1.2993 +               l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte),     \
  1.2994 +               l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte),     \
  1.2995 +               p_guest_pte, p_shadow_pte, p_snapshot_pte,                    \
  1.2996 +               (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte),    \
  1.2997 +               (void *)v2m(v, p_snapshot_pte),                               \
  1.2998 +               (l2_idx << L2_PAGETABLE_SHIFT) |                              \
  1.2999 +               (l1_idx << L1_PAGETABLE_SHIFT));                              \
  1.3000 +        errors++;                                                            \
  1.3001 +    } while ( 0 )
  1.3002 +
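         +/*
         + * Compare one shadow PTE against the guest PTE it is supposed to shadow
         + * (or against the snapshot PTE, if the shadow entry carries
         + * SHADOW_REFLECTS_SNAPSHOT).  Returns the number of inconsistencies
         + * found; each one is also reported via FAIL().
         + */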
  1.3003 +static int check_pte(
  1.3004 +    struct vcpu *v,
  1.3005 +    l1_pgentry_t *p_guest_pte,
  1.3006 +    l1_pgentry_t *p_shadow_pte,
  1.3007 +    l1_pgentry_t *p_snapshot_pte,
  1.3008 +    int level, int l2_idx, int l1_idx)
  1.3009 +{
  1.3010 +    struct domain *d = v->domain;
  1.3011 +    l1_pgentry_t guest_pte = *p_guest_pte;
  1.3012 +    l1_pgentry_t shadow_pte = *p_shadow_pte;
  1.3013 +    l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
  1.3014 +    l1_pgentry_t eff_guest_pte;
  1.3015 +    unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
  1.3016 +    int errors = 0, guest_writable;
  1.3017 +    int page_table_page;
  1.3018 +
  1.3019 +    if ( (l1e_get_intpte(shadow_pte) == 0) ||
  1.3020 +         (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
  1.3021 +         (l1e_get_intpte(shadow_pte) == 0x00000E00) )
  1.3022 +        return errors;  /* always safe */
  1.3023 +
  1.3024 +    if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
  1.3025 +        FAIL("Non-zero but not-present shadow_pte");
  1.3026 +
  1.3027 +    if ( level == 2 ) sh_l2_present++;
  1.3028 +    if ( level == 1 ) sh_l1_present++;
  1.3029 +
  1.3030 +    if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
  1.3031 +        eff_guest_pte = snapshot_pte;
  1.3032 +    else
  1.3033 +        eff_guest_pte = guest_pte;
  1.3034 +
  1.3035 +    if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
  1.3036 +        FAIL("Guest not present yet shadow is");
  1.3037 +
  1.3038 +    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
  1.3039 +
  1.3040 +    if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
  1.3041 +        FAIL("Corrupt?");
  1.3042 +
  1.3043 +    if ( (level == 1) &&
  1.3044 +         (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
  1.3045 +         !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
  1.3046 +        FAIL("Dirty coherence");
  1.3047 +
  1.3048 +    if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
  1.3049 +         !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
  1.3050 +        FAIL("Accessed coherence");
  1.3051 +
  1.3052 +    if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
  1.3053 +        FAIL("global bit set in shadow");
  1.3054 +
  1.3055 +    eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
  1.3056 +    eff_guest_mfn = __gpfn_to_mfn(d, eff_guest_pfn);
  1.3057 +    shadow_mfn = l1e_get_pfn(shadow_pte);
  1.3058 +
  1.3059 +    if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
  1.3060 +        FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
  1.3061 +             __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
  1.3062 +
  1.3063 +    page_table_page = mfn_is_page_table(eff_guest_mfn);
  1.3064 +
  1.3065 +    guest_writable =
  1.3066 +        (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
  1.3067 +        (VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
  1.3068 +
  1.3069 +    if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
  1.3070 +    {
  1.3071 +        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n",
  1.3072 +               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
  1.3073 +               frame_table[eff_guest_mfn].u.inuse.type_info,
  1.3074 +               page_table_page);
  1.3075 +        FAIL("RW coherence");
  1.3076 +    }
  1.3077 +
  1.3078 +    if ( (level == 1) &&
  1.3079 +         (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
  1.3080 +         !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
  1.3081 +    {
  1.3082 +        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n",
  1.3083 +               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
  1.3084 +               frame_table[eff_guest_mfn].u.inuse.type_info,
  1.3085 +               page_table_page);
  1.3086 +        FAIL("RW2 coherence");
  1.3087 +    }
  1.3088 + 
  1.3089 +    if ( eff_guest_mfn == shadow_mfn )
  1.3090 +    {
  1.3091 +        if ( level > 1 )
  1.3092 +            FAIL("Linear map ???");    /* XXX this will fail on BSD */
  1.3093 +    }
  1.3094 +    else
  1.3095 +    {
  1.3096 +        if ( level < 2 )
  1.3097 +            FAIL("Shadow in L1 entry?");
  1.3098 +
  1.3099 +        if ( level == 2 )
  1.3100 +        {
  1.3101 +            if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
  1.3102 +                FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
  1.3103 +                     __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
  1.3104 +        }
  1.3105 +        else
  1.3106 +            BUG(); // XXX -- not handled yet.
  1.3107 +    }
  1.3108 +
  1.3109 +    return errors;
  1.3110 +}
  1.3111 +#undef FAIL
  1.3112 +#undef v2m
  1.3113 +
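         +/*
         + * Audit every entry of one shadow L1 table against the corresponding
         + * guest L1 table (and against the snapshot, if the guest page is
         + * currently out of sync).  Returns the number of errors found.
         + */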
  1.3114 +static int check_l1_table(
  1.3115 +    struct vcpu *v, unsigned long gpfn,
  1.3116 +    unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
  1.3117 +{
  1.3118 +    struct domain *d = v->domain;
  1.3119 +    int i;
  1.3120 +    unsigned long snapshot_mfn;
  1.3121 +    l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
  1.3122 +    int errors = 0;
  1.3123 +
  1.3124 +    if ( page_out_of_sync(pfn_to_page(gmfn)) )
  1.3125 +    {
  1.3126 +        snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
  1.3127 +        ASSERT(snapshot_mfn);
  1.3128 +        p_snapshot = map_domain_page(snapshot_mfn);
  1.3129 +    }
  1.3130 +
  1.3131 +    p_guest  = map_domain_page(gmfn);
  1.3132 +    p_shadow = map_domain_page(smfn);
  1.3133 +
  1.3134 +    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  1.3135 +        errors += check_pte(v, p_guest+i, p_shadow+i,
  1.3136 +                            p_snapshot ? p_snapshot+i : NULL,
  1.3137 +                            1, l2_idx, i);
  1.3138 + 
  1.3139 +    unmap_domain_page(p_shadow);
  1.3140 +    unmap_domain_page(p_guest);
  1.3141 +    if ( p_snapshot )
  1.3142 +        unmap_domain_page(p_snapshot);
  1.3143 +
  1.3144 +    return errors;
  1.3145 +}
  1.3146 +
  1.3147 +#define FAILPT(_f, _a...)                                         \
  1.3148 +    do {                                                          \
  1.3149 +        printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
  1.3150 +        errors++;                                                 \
  1.3151 +    } while ( 0 )
  1.3152 +
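         +/*
         + * Audit a shadow L2 table: verify page ownership and the hypervisor
         + * shadow-linear and per-domain slots, then check every guest-visible
         + * L2 entry via check_pte().  Returns the number of errors found.
         + */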
  1.3153 +int check_l2_table(
  1.3154 +    struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
  1.3155 +{
  1.3156 +    struct domain *d = v->domain;
  1.3157 +    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
  1.3158 +    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
  1.3159 +    l2_pgentry_t match;
  1.3160 +    int i;
  1.3161 +    int errors = 0;
  1.3162 +    int limit;
  1.3163 +
  1.3164 +    if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
  1.3165 +        FAILPT("domain doesn't own page");
  1.3166 +    if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
  1.3167 +        FAILPT("bogus owner for snapshot page");
  1.3168 +    if ( page_get_owner(pfn_to_page(smfn)) != NULL )
  1.3169 +        FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
  1.3170 +               smfn, page_get_owner(pfn_to_page(smfn))->domain_id);
  1.3171 +
  1.3172 +#if 0
  1.3173 +    if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  1.3174 +                &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  1.3175 +                ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
  1.3176 +                 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
  1.3177 +    {
  1.3178 +        for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
  1.3179 +              i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
  1.3180 +              i++ )
  1.3181 +            printk("+++ (%d) %lx %lx\n",i,
  1.3182 +                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
  1.3183 +        FAILPT("hypervisor entries inconsistent");
  1.3184 +    }
  1.3185 +
  1.3186 +    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  1.3187 +          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
  1.3188 +        FAILPT("hypervisor linear map inconsistent");
  1.3189 +#endif
  1.3190 +
  1.3191 +    match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
  1.3192 +    if ( !shadow_mode_external(d) &&
  1.3193 +         l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
  1.3194 +                         match, PAGE_FLAG_MASK))
  1.3195 +    {
  1.3196 +        FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
  1.3197 +               l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
  1.3198 +                                   L2_PAGETABLE_SHIFT]),
  1.3199 +               l2e_get_intpte(match));
  1.3200 +    }
  1.3201 +
  1.3202 +    match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
  1.3203 +    if ( !shadow_mode_external(d) &&
  1.3204 +         l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
  1.3205 +                         match, PAGE_FLAG_MASK))
  1.3206 +    {
  1.3207 +        FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
  1.3208 +               l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
  1.3209 +               d->arch.mm_perdomain_pt,
  1.3210 +               l2e_get_intpte(match));
  1.3211 +    }
  1.3212 +
  1.3213 +#ifdef __i386__
  1.3214 +    if ( shadow_mode_external(d) )
  1.3215 +        limit = L2_PAGETABLE_ENTRIES;
  1.3216 +    else
  1.3217 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  1.3218 +#else
  1.3219 +    limit = 0; /* XXX x86/64 XXX */
  1.3220 +#endif
  1.3221 +
  1.3222 +    /* Check the whole L2. */
  1.3223 +    for ( i = 0; i < limit; i++ )
  1.3224 +        errors += check_pte(v,
  1.3225 +                            (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
  1.3226 +                            (l1_pgentry_t*)(&spl2e[i]),
  1.3227 +                            NULL,
  1.3228 +                            2, i, 0);
  1.3229 +
  1.3230 +    unmap_domain_page(spl2e);
  1.3231 +    unmap_domain_page(gpl2e);
  1.3232 +
  1.3233 +#if 1
  1.3234 +    if ( errors )
  1.3235 +        printk("check_l2_table returning %d errors\n", errors);
  1.3236 +#endif
  1.3237 +
  1.3238 +    return errors;
  1.3239 +}
  1.3240 +#undef FAILPT
  1.3241 +
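         +/*
         + * Audit the vcpu's current page tables: verify the shadow L2 against
         + * the guest L2 (or its snapshot, if the guest page is out of sync),
         + * then recurse into every present shadow L1.  BUG()s if any
         + * inconsistency is found.
         + */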
  1.3242 +int _check_pagetable(struct vcpu *v, char *s)
  1.3243 +{
  1.3244 +    struct domain *d = v->domain;
  1.3245 +    pagetable_t pt = v->arch.guest_table;
  1.3246 +    unsigned long gptbase = pagetable_get_paddr(pt);
  1.3247 +    unsigned long ptbase_pfn, smfn;
  1.3248 +    unsigned long i;
  1.3249 +    l2_pgentry_t *gpl2e, *spl2e;
  1.3250 +    unsigned long ptbase_mfn = 0;
  1.3251 +    int errors = 0, limit, oos_pdes = 0;
  1.3252 +
  1.3253 +    //_audit_domain(d, AUDIT_QUIET);
  1.3254 +    shadow_lock(d);
  1.3255 +
  1.3256 +    sh_check_name = s;
  1.3257 +    //SH_VVLOG("%s-PT Audit", s);
  1.3258 +    sh_l2_present = sh_l1_present = 0;
  1.3259 +    perfc_incrc(check_pagetable);
  1.3260 +
  1.3261 +    ptbase_mfn = gptbase >> PAGE_SHIFT;
  1.3262 +    ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn);
  1.3263 +
  1.3264 +    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
  1.3265 +    {
  1.3266 +        printk("%s-PT %lx not shadowed\n", s, gptbase);
  1.3267 +        goto out;
  1.3268 +    }
  1.3269 +    if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
  1.3270 +    {
  1.3271 +        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
  1.3272 +        oos_pdes = 1;
  1.3273 +        ASSERT(ptbase_mfn);
  1.3274 +    }
  1.3275 + 
  1.3276 +    errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
  1.3277 +
  1.3278 +    gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
  1.3279 +    spl2e = (l2_pgentry_t *) map_domain_page(smfn);
  1.3280 +
  1.3281 +    /* Go back and recurse. */
  1.3282 +#ifdef __i386__
  1.3283 +    if ( shadow_mode_external(d) )
  1.3284 +        limit = L2_PAGETABLE_ENTRIES;
  1.3285 +    else
  1.3286 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  1.3287 +#else
  1.3288 +    limit = 0; /* XXX x86/64 XXX */
  1.3289 +#endif
  1.3290 +
  1.3291 +    for ( i = 0; i < limit; i++ )
  1.3292 +    {
  1.3293 +        unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
  1.3294 +        unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  1.3295 +        unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
  1.3296 +
  1.3297 +        if ( l2e_get_intpte(spl2e[i]) != 0 )  /* FIXME: check flags? */
  1.3298 +        {
  1.3299 +            errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
  1.3300 +        }
  1.3301 +    }
  1.3302 +
  1.3303 +    unmap_domain_page(spl2e);
  1.3304 +    unmap_domain_page(gpl2e);
  1.3305 +
  1.3306 +#if 0
  1.3307 +    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
  1.3308 +             sh_l2_present, sh_l1_present);
  1.3309 +#endif
  1.3310 +
  1.3311 + out:
  1.3312 +    if ( errors )
  1.3313 +        BUG();
  1.3314 +
  1.3315 +    shadow_unlock(d);
  1.3316 +
  1.3317 +    return errors;
  1.3318 +}
  1.3319 +
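         +/*
         + * Walk the entire shadow hash table and audit every L1 and L2 shadow
         + * it contains, not just those belonging to the current page table.
         + * BUG()s if any error is found.
         + */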
  1.3320 +int _check_all_pagetables(struct vcpu *v, char *s)
  1.3321 +{
  1.3322 +    struct domain *d = v->domain;
  1.3323 +    int i;
  1.3324 +    struct shadow_status *a;
  1.3325 +    unsigned long gmfn;
  1.3326 +    int errors = 0;
  1.3327 +
  1.3328 +    shadow_status_noswap = 1;
  1.3329 +
  1.3330 +    sh_check_name = s;
  1.3331 +    SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
  1.3332 +    sh_l2_present = sh_l1_present = 0;
  1.3333 +    perfc_incrc(check_all_pagetables);
  1.3334 +
  1.3335 +    for (i = 0; i < shadow_ht_buckets; i++)
  1.3336 +    {
  1.3337 +        a = &d->arch.shadow_ht[i];
  1.3338 +        while ( a && a->gpfn_and_flags )
  1.3339 +        {
  1.3340 +            gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
  1.3341 +
  1.3342 +            switch ( a->gpfn_and_flags & PGT_type_mask )
  1.3343 +            {
  1.3344 +            case PGT_l1_shadow:
  1.3345 +                errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
  1.3346 +                                         gmfn, a->smfn, 0);
  1.3347 +                break;
  1.3348 +            case PGT_l2_shadow:
  1.3349 +                errors += check_l2_table(v, gmfn, a->smfn,
  1.3350 +                                         page_out_of_sync(pfn_to_page(gmfn)));
  1.3351 +                break;
  1.3352 +            case PGT_l3_shadow:
  1.3353 +            case PGT_l4_shadow:
  1.3354 +            case PGT_hl2_shadow:
  1.3355 +                BUG(); // XXX - ought to fix this...
  1.3356 +                break;
  1.3357 +            case PGT_snapshot:
  1.3358 +            case PGT_writable_pred:
  1.3359 +                break;
  1.3360 +            default:
  1.3361 +                errors++;
  1.3362 +                printk("unexpected shadow type %lx, gpfn=%lx, "
  1.3363 +                       "gmfn=%lx smfn=%lx\n",
  1.3364 +                       a->gpfn_and_flags & PGT_type_mask,
  1.3365 +                       a->gpfn_and_flags & PGT_mfn_mask,
  1.3366 +                       gmfn, a->smfn);
  1.3367 +                BUG();
  1.3368 +            }
  1.3369 +            a = a->next;
  1.3370 +        }
  1.3371 +    }
  1.3372 +
  1.3373 +    shadow_status_noswap = 0;
  1.3374 +
  1.3375 +    if ( errors )
  1.3376 +        BUG();
  1.3377 +
  1.3378 +    return errors;
  1.3379 +}
  1.3380 +
  1.3381 +#endif // SHADOW_DEBUG
  1.3382 +
  1.3383 +/*
  1.3384 + * Local variables:
  1.3385 + * mode: C
  1.3386 + * c-set-style: "BSD"
  1.3387 + * c-basic-offset: 4
  1.3388 + * tab-width: 4
  1.3389 + * indent-tabs-mode: nil
  1.3390 + * End:
  1.3391 + */
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/shadow_public.c	Mon Jul 11 09:57:38 2005 +0000
     2.3 @@ -0,0 +1,1654 @@
     2.4 +/******************************************************************************
     2.5 + * arch/x86/shadow_public.c
     2.6 + * 
     2.7 + * Copyright (c) 2005 Michael A Fetterman
     2.8 + * Based on an earlier implementation by Ian Pratt et al
     2.9 + * 
    2.10 + * This program is free software; you can redistribute it and/or modify
    2.11 + * it under the terms of the GNU General Public License as published by
    2.12 + * the Free Software Foundation; either version 2 of the License, or
    2.13 + * (at your option) any later version.
    2.14 + * 
    2.15 + * This program is distributed in the hope that it will be useful,
    2.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    2.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    2.18 + * GNU General Public License for more details.
    2.19 + * 
    2.20 + * You should have received a copy of the GNU General Public License
    2.21 + * along with this program; if not, write to the Free Software
    2.22 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    2.23 + */
    2.24 +
    2.25 +
    2.26 +#include <xen/config.h>
    2.27 +#include <xen/types.h>
    2.28 +#include <xen/mm.h>
    2.29 +#include <xen/domain_page.h>
    2.30 +#include <asm/shadow.h>
    2.31 +#include <asm/page.h>
    2.32 +#include <xen/event.h>
    2.33 +#include <xen/sched.h>
    2.34 +#include <xen/trace.h>
    2.35 +
    2.36 +#if CONFIG_PAGING_LEVELS >= 4 
    2.37 +#include <asm/shadow_64.h>
    2.38 +
    2.39 +extern struct shadow_ops MODE_F_HANDLER;
    2.40 +#endif
    2.41 +
    2.42 +extern struct shadow_ops MODE_A_HANDLER;
    2.43 +
    2.44 +/****************************************************************************/
    2.45 +/************* export interface functions ***********************************/
    2.46 +/****************************************************************************/
    2.47 +
    2.48 +
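         +/*
         + * Select the shadow ops that match the guest's number of paging levels.
         + * Returns 1 on success, 0 if this build cannot shadow the requested
         + * level.
         + */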
    2.49 +int shadow_set_guest_paging_levels(struct domain *d, int levels)
    2.50 +{
    2.51 +    shadow_lock(d);
    2.52 +
    2.53 +    switch ( levels ) {
    2.54 +#if CONFIG_PAGING_LEVELS >= 4
    2.55 +    case 4:
    2.56 +        if ( d->arch.ops != &MODE_F_HANDLER )
    2.57 +            d->arch.ops = &MODE_F_HANDLER;
    2.58 +        shadow_unlock(d);
    2.59 +        return 1;
    2.60 +#endif
    2.61 +    case 3:
    2.62 +    case 2:
    2.63 +        if ( d->arch.ops != &MODE_A_HANDLER )
    2.64 +            d->arch.ops = &MODE_A_HANDLER;
    2.65 +        shadow_unlock(d);
    2.66 +        return 1;
    2.67 +    default:
    2.68 +        shadow_unlock(d);
    2.69 +        return 0;
    2.70 +    }
    2.71 +}
    2.72 +
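         +/*
         + * The following entry points simply dispatch through the per-domain
         + * shadow_ops table selected above.
         + */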
    2.73 +void shadow_invlpg(struct vcpu *v, unsigned long va)
    2.74 +{
    2.75 +    struct domain *d = current->domain;
    2.76 +    d->arch.ops->invlpg(v, va);
    2.77 +}
    2.78 +
    2.79 +int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
    2.80 +{
    2.81 +    struct domain *d = current->domain;
    2.82 +    return d->arch.ops->fault(va, regs);
    2.83 +}
    2.84 +
    2.85 +void __update_pagetables(struct vcpu *v)
    2.86 +{
    2.87 +    struct domain *d = v->domain;
    2.88 +    d->arch.ops->update_pagetables(v);
    2.89 +}
    2.90 +
    2.91 +void __shadow_sync_all(struct domain *d)
    2.92 +{
    2.93 +    d->arch.ops->sync_all(d);
    2.94 +}
    2.95 +    
    2.96 +int shadow_remove_all_write_access(
    2.97 +    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
    2.98 +{
    2.99 +    return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn);
   2.100 +}
   2.101 +
   2.102 +int shadow_do_update_va_mapping(unsigned long va,
   2.103 +                                l1_pgentry_t val,
   2.104 +                                struct vcpu *v)
   2.105 +{
   2.106 +    struct domain *d = v->domain;
   2.107 +    return d->arch.ops->do_update_va_mapping(va, val, v);
   2.108 +}
   2.109 +
   2.110 +struct out_of_sync_entry *
   2.111 +shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
   2.112 +                             unsigned long mfn)
   2.113 +{
   2.114 +   struct domain *d = v->domain;
   2.115 +   return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn);
   2.116 +}
   2.117 +
   2.118 +/*
   2.119 + * Returns 1 if va's shadow mapping is out-of-sync.
   2.120 + * Returns 0 otherwise.
   2.121 + */
   2.122 +int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
   2.123 +{
   2.124 +    struct domain *d = v->domain;
   2.125 +    return d->arch.ops->is_out_of_sync(v, va);
   2.126 +}
   2.127 +
   2.128 +/****************************************************************************/
   2.129 +/****************************************************************************/
   2.130 +#if CONFIG_PAGING_LEVELS >= 4
   2.131 +/*
   2.132 + * Convert PAE 3-level page-table to 4-level page-table
   2.133 + */
   2.134 +#define PDP_ENTRIES   4
   2.135 +static pagetable_t page_table_convert(struct domain *d)
   2.136 +{
   2.137 +    struct pfn_info *l4page, *l3page;
   2.138 +    l4_pgentry_t *l4;
   2.139 +    l3_pgentry_t *l3, *pae_l3;
   2.140 +    int i;
   2.141 +    
   2.142 +    l4page = alloc_domheap_page(NULL);
   2.143 +    if (l4page == NULL)
   2.144 +        domain_crash();
   2.145 +    l4 = map_domain_page(page_to_pfn(l4page));
   2.146 +    memset(l4, 0, PAGE_SIZE);
   2.147 +
   2.148 +    l3page = alloc_domheap_page(NULL);
   2.149 +    if (l3page == NULL)
   2.150 +        domain_crash();
   2.151 +    l3 =  map_domain_page(page_to_pfn(l3page));
   2.152 +    memset(l3, 0, PAGE_SIZE);
   2.153 +
   2.154 +    l4[0] = l4e_from_page(l3page, __PAGE_HYPERVISOR);
   2.155 +    pae_l3 = map_domain_page(pagetable_get_pfn(d->arch.phys_table));
   2.156 +
   2.157 +    for (i = 0; i < PDP_ENTRIES; i++) {
   2.158 +        l3[i] = pae_l3[i];
   2.159 +        l3e_add_flags(l3[i], 0x67);
   2.160 +    }
   2.161 +
   2.162 +    unmap_domain_page(l4);
   2.163 +    unmap_domain_page(l3);
         +    unmap_domain_page(pae_l3);
   2.164 +
   2.165 +    return mk_pagetable(page_to_phys(l4page));
   2.166 +}
   2.167 +
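         +/*
         + * Build the 4-level monitor page table: start from the idle page table,
         + * install the per-domain L3, and map the domain's phys_to_machine table
         + * (converted from its PAE layout) into the read-only MPT slot.
         + */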
   2.168 +void alloc_monitor_pagetable(struct vcpu *v)
   2.169 +{
   2.170 +    unsigned long mmfn;
   2.171 +    l4_pgentry_t *mpl4e;
   2.172 +    struct pfn_info *mmfn_info;
   2.173 +    struct domain *d = v->domain;
   2.174 +    pagetable_t phys_table;
   2.175 +
   2.176 +    ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
   2.177 +
   2.178 +    mmfn_info = alloc_domheap_page(NULL);
   2.179 +    ASSERT( mmfn_info );
   2.180 +
   2.181 +    mmfn = (unsigned long) (mmfn_info - frame_table);
   2.182 +    mpl4e = (l4_pgentry_t *) map_domain_page(mmfn);
   2.183 +    memcpy(mpl4e, &idle_pg_table[0], PAGE_SIZE);
   2.184 +    mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
   2.185 +      l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
   2.186 +    /* map the phys_to_machine map into the per domain Read-Only MPT space */
   2.187 +    phys_table = page_table_convert(d);
   2.188 +
   2.189 +    mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
   2.190 +       l4e_from_paddr(pagetable_get_paddr(phys_table),
   2.191 +         __PAGE_HYPERVISOR);
   2.192 +    v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
   2.193 +    v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e;
   2.194 +}
   2.195 +
   2.196 +static void inline
   2.197 +free_shadow_fl1_table(struct domain *d, unsigned long smfn)
   2.198 +{
   2.199 +    l1_pgentry_t *pl1e = map_domain_page(smfn);
   2.200 +    int i;
   2.201 +
   2.202 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
   2.203 +        put_page_from_l1e(pl1e[i], d);
         +
         +    unmap_domain_page(pl1e);
   2.204 +}
   2.205 +
   2.206 +/*
   2.207 + * Free l2, l3, l4 shadow tables
   2.208 + */
   2.209 +static void inline
   2.210 +free_shadow_tables(struct domain *d, unsigned long smfn, u32 level)
   2.211 +{
   2.212 +    pgentry_64_t *ple = map_domain_page(smfn);
   2.213 +    int i, external = shadow_mode_external(d);
   2.214 +
   2.215 +    for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
   2.216 +        if ( external || is_guest_l4_slot(i) )
   2.217 +            if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
   2.218 +                put_shadow_ref(entry_get_pfn(ple[i]));
   2.219 +
   2.220 +    unmap_domain_page(ple);
   2.221 +}
   2.222 +
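         +/*
         + * Free the monitor table page allocated above and clear the vcpu's
         + * monitor pointers.
         + */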
   2.223 +void free_monitor_pagetable(struct vcpu *v)
   2.224 +{
   2.225 +    unsigned long mfn;
   2.226 +
   2.227 +//    ASSERT( pagetable_val(v->arch.monitor_table) );
   2.228 +    /*
   2.229 +     * free monitor_table.
   2.230 +     */
   2.231 +    //mfn = (pagetable_val(v->arch.monitor_table)) >> PAGE_SHIFT;
   2.232 +    mfn = pagetable_get_pfn(v->arch.monitor_table);
   2.233 +    unmap_domain_page(v->arch.monitor_vtable);
   2.234 +    free_domheap_page(&frame_table[mfn]);
   2.235 +    v->arch.monitor_table = mk_pagetable(0);
   2.236 +    v->arch.monitor_vtable = 0;
   2.237 +}
   2.238 +
   2.239 +#elif CONFIG_PAGING_LEVELS == 2
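         +/*
         + * Build the 2-level monitor page table: copy the hypervisor entries
         + * from idle_pg_table, install the per-domain mapping and the domain's
         + * phys_to_machine table, and leave the linear-map slots empty for now.
         + */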
   2.240 +static void alloc_monitor_pagetable(struct vcpu *v)
   2.241 +{
   2.242 +    unsigned long mmfn;
   2.243 +    l2_pgentry_t *mpl2e;
   2.244 +    struct pfn_info *mmfn_info;
   2.245 +    struct domain *d = v->domain;
   2.246 +
   2.247 +    ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
   2.248 +
   2.249 +    mmfn_info = alloc_domheap_page(NULL);
   2.250 +    ASSERT(mmfn_info != NULL);
   2.251 +
   2.252 +    mmfn = page_to_pfn(mmfn_info);
   2.253 +    mpl2e = (l2_pgentry_t *)map_domain_page(mmfn);
   2.254 +    memset(mpl2e, 0, PAGE_SIZE);
   2.255 +
   2.256 +#ifdef __i386__ /* XXX screws x86/64 build */
   2.257 +    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   2.258 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   2.259 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   2.260 +#endif
   2.261 +
   2.262 +    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
   2.263 +        l2e_from_paddr(__pa(d->arch.mm_perdomain_pt),
   2.264 +                        __PAGE_HYPERVISOR);
   2.265 +
   2.266 +    // map the phys_to_machine map into the Read-Only MPT space for this domain
   2.267 +    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
   2.268 +        l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
   2.269 +                        __PAGE_HYPERVISOR);
   2.270 +
   2.271 +    // Don't (yet) have mappings for these...
   2.272 +    // Don't want to accidentally see the idle_pg_table's linear mapping.
   2.273 +    //
   2.274 +    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
   2.275 +    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
   2.276 +
   2.277 +    v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
   2.278 +    v->arch.monitor_vtable = mpl2e;
   2.279 +}
   2.280 +
   2.281 +/*
   2.282 + * Free the pages for monitor_table and hl2_table
   2.283 + */
   2.284 +void free_monitor_pagetable(struct vcpu *v)
   2.285 +{
   2.286 +    l2_pgentry_t *mpl2e, hl2e, sl2e;
   2.287 +    unsigned long mfn;
   2.288 +
   2.289 +    ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
   2.290 +    
   2.291 +    mpl2e = v->arch.monitor_vtable;
   2.292 +
   2.293 +    /*
   2.294 +     * First get the mfn for hl2_table by looking at monitor_table
   2.295 +     */
   2.296 +    hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
   2.297 +    if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
   2.298 +    {
   2.299 +        mfn = l2e_get_pfn(hl2e);
   2.300 +        ASSERT(mfn);
   2.301 +        put_shadow_ref(mfn);
   2.302 +    }
   2.303 +
   2.304 +    sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
   2.305 +    if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
   2.306 +    {
   2.307 +        mfn = l2e_get_pfn(sl2e);
   2.308 +        ASSERT(mfn);
   2.309 +        put_shadow_ref(mfn);
   2.310 +    }
   2.311 +
   2.312 +    unmap_domain_page(mpl2e);
   2.313 +
   2.314 +    /*
   2.315 +     * Then free monitor_table.
   2.316 +     */
   2.317 +    mfn = pagetable_get_pfn(v->arch.monitor_table);
   2.318 +    free_domheap_page(&frame_table[mfn]);
   2.319 +
   2.320 +    v->arch.monitor_table = mk_pagetable(0);
   2.321 +    v->arch.monitor_vtable = 0;
   2.322 +}
   2.323 +#endif	
   2.324 +
   2.325 +static void
   2.326 +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
   2.327 +{
   2.328 +    void *snapshot;
   2.329 +
   2.330 +    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
   2.331 +        return;
   2.332 +
   2.333 +    // Clear the out_of_sync bit.
   2.334 +    //
   2.335 +    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
   2.336 +
   2.337 +    // XXX Need to think about how to protect the domain's
   2.338 +    // information less expensively.
   2.339 +    //
   2.340 +    snapshot = map_domain_page(entry->snapshot_mfn);
   2.341 +    memset(snapshot, 0, PAGE_SIZE);
   2.342 +    unmap_domain_page(snapshot);
   2.343 +
   2.344 +    put_shadow_ref(entry->snapshot_mfn);
   2.345 +}
   2.346 +
   2.347 +void
   2.348 +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
   2.349 +{
   2.350 +    struct pfn_info *page;
   2.351 +
   2.352 +    page = &frame_table[entry->gmfn];
   2.353 +        
   2.354 +    // Decrement ref count of guest & shadow pages
   2.355 +    //
   2.356 +    put_page(page);
   2.357 +
   2.358 +    // Only use entries that have low bits clear...
   2.359 +    //
   2.360 +    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
   2.361 +    {
   2.362 +        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
   2.363 +        entry->writable_pl1e = -2;
   2.364 +    }
   2.365 +    else
   2.366 +        ASSERT( entry->writable_pl1e == -1 );
   2.367 +
   2.368 +    // Free the snapshot
   2.369 +    //
   2.370 +    shadow_free_snapshot(d, entry);
   2.371 +}
   2.372 +
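         +/*
         + * Unlink every out-of-sync entry that refers to gmfn, release the
         + * references each entry holds, and return the entries to the domain's
         + * free list.
         + */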
   2.373 +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
   2.374 +{
   2.375 +    struct out_of_sync_entry *entry = d->arch.out_of_sync;
   2.376 +    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
   2.377 +    struct out_of_sync_entry *found = NULL;
   2.378 +
   2.379 +    // NB: Be careful not to call something that manipulates this list
   2.380 +    //     while walking it.  Collect the results into a separate list
   2.381 +    //     first, then walk that list.
   2.382 +    //
   2.383 +    while ( entry )
   2.384 +    {
   2.385 +        if ( entry->gmfn == gmfn )
   2.386 +        {
   2.387 +            // remove from out of sync list
   2.388 +            *prev = entry->next;
   2.389 +
   2.390 +            // add to found list
   2.391 +            entry->next = found;
   2.392 +            found = entry;
   2.393 +
   2.394 +            entry = *prev;
   2.395 +            continue;
   2.396 +        }
   2.397 +        prev = &entry->next;
   2.398 +        entry = entry->next;
   2.399 +    }
   2.400 +
   2.401 +    prev = NULL;
   2.402 +    entry = found;
   2.403 +    while ( entry )
   2.404 +    {
   2.405 +        release_out_of_sync_entry(d, entry);
   2.406 +
   2.407 +        prev = &entry->next;
   2.408 +        entry = entry->next;
   2.409 +    }
   2.410 +
   2.411 +    // Add found list to free list
   2.412 +    if ( prev )
   2.413 +    {
   2.414 +        *prev = d->arch.out_of_sync_free;
   2.415 +        d->arch.out_of_sync_free = found;
   2.416 +    }
   2.417 +}
   2.418 +
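         +/*
         + * When gpfn no longer has any shadows of any type, clear its
         + * PGC_page_table bit and drop any out-of-sync state still attached to
         + * it.  No-op unless the domain uses refcounted shadow mode.
         + */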
   2.419 +static inline void
   2.420 +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
   2.421 +{
   2.422 +    if ( !shadow_mode_refcounts(d) )
   2.423 +        return;
   2.424 +
   2.425 +    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
   2.426 +
   2.427 +    if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
   2.428 +    {
   2.429 +        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
   2.430 +
   2.431 +        if ( page_out_of_sync(pfn_to_page(gmfn)) )
   2.432 +        {
   2.433 +            remove_out_of_sync_entries(d, gmfn);
   2.434 +        }
   2.435 +    }
   2.436 +}
   2.437 +
   2.438 +static void inline
   2.439 +free_shadow_l1_table(struct domain *d, unsigned long smfn)
   2.440 +{
   2.441 +    l1_pgentry_t *pl1e = map_domain_page(smfn);
   2.442 +    int i;
   2.443 +    struct pfn_info *spage = pfn_to_page(smfn);
   2.444 +    u32 min_max = spage->tlbflush_timestamp;
   2.445 +    int min = SHADOW_MIN(min_max);
   2.446 +    int max = SHADOW_MAX(min_max);
   2.447 +
   2.448 +    for ( i = min; i <= max; i++ )
   2.449 +    {
   2.450 +        shadow_put_page_from_l1e(pl1e[i], d);
   2.451 +        pl1e[i] = l1e_empty();
   2.452 +    }
   2.453 +
   2.454 +    unmap_domain_page(pl1e);
   2.455 +}
   2.456 +
   2.457 +static void inline
   2.458 +free_shadow_hl2_table(struct domain *d, unsigned long smfn)
   2.459 +{
   2.460 +    l1_pgentry_t *hl2 = map_domain_page(smfn);
   2.461 +    int i, limit;
   2.462 +
   2.463 +    SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
   2.464 +
   2.465 +#ifdef __i386__
   2.466 +    if ( shadow_mode_external(d) )
   2.467 +        limit = L2_PAGETABLE_ENTRIES;
   2.468 +    else
   2.469 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   2.470 +#else
   2.471 +    limit = 0; /* XXX x86/64 XXX */
   2.472 +#endif
   2.473 +
   2.474 +    for ( i = 0; i < limit; i++ )
   2.475 +    {
   2.476 +        if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
   2.477 +            put_page(pfn_to_page(l1e_get_pfn(hl2[i])));
   2.478 +    }
   2.479 +
   2.480 +    unmap_domain_page(hl2);
   2.481 +}
   2.482 +
   2.483 +static void inline
   2.484 +free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
   2.485 +{
   2.486 +    l2_pgentry_t *pl2e = map_domain_page(smfn);
   2.487 +    int i, external = shadow_mode_external(d);
   2.488 +
   2.489 +    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
   2.490 +        if ( external || is_guest_l2_slot(type, i) )
   2.491 +            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
   2.492 +                put_shadow_ref(l2e_get_pfn(pl2e[i]));
   2.493 +
   2.494 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
   2.495 +         shadow_mode_translate(d) && !external )
   2.496 +    {
   2.497 +        // free the ref to the hl2
   2.498 +        //
   2.499 +        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
   2.500 +    }
   2.501 +
   2.502 +    unmap_domain_page(pl2e);
   2.503 +}
   2.504 +
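         +/*
         + * Release a single shadow page: remove it from the shadow hash table,
         + * drop the references held by its entries (type-dependent), and either
         + * recycle it on the domain's free-shadow-frame list (L1 shadows) or
         + * return it to the domheap.
         + */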
   2.505 +void free_shadow_page(unsigned long smfn)
   2.506 +{
   2.507 +    struct pfn_info *page = &frame_table[smfn];
   2.508 +    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
   2.509 +    struct domain *d = page_get_owner(pfn_to_page(gmfn));
   2.510 +    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
   2.511 +    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
   2.512 +
   2.513 +    SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
   2.514 +
   2.515 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
   2.516 +#if CONFIG_PAGING_LEVELS >=4
   2.517 +    if (type == PGT_fl1_shadow) {
   2.518 +        unsigned long mfn;
   2.519 +        mfn = __shadow_status(d, gpfn, PGT_fl1_shadow);
   2.520 +        if (!mfn)
   2.521 +            gpfn |= (1UL << 63);
   2.522 +    }
   2.523 +#endif
   2.524 +    delete_shadow_status(d, gpfn, gmfn, type);
   2.525 +
   2.526 +    switch ( type )
   2.527 +    {
   2.528 +    case PGT_l1_shadow:
   2.529 +        perfc_decr(shadow_l1_pages);
   2.530 +        shadow_demote(d, gpfn, gmfn);
   2.531 +        free_shadow_l1_table(d, smfn);
   2.532 +        break;
   2.533 +#if defined (__i386__)
   2.534 +    case PGT_l2_shadow:
   2.535 +        perfc_decr(shadow_l2_pages);
   2.536 +        shadow_demote(d, gpfn, gmfn);
   2.537 +        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
   2.538 +        break;
   2.539 +
   2.540 +    case PGT_hl2_shadow:
   2.541 +        perfc_decr(hl2_table_pages);
   2.542 +        shadow_demote(d, gpfn, gmfn);
   2.543 +        free_shadow_hl2_table(d, smfn);
   2.544 +        break;
   2.545 +#else
   2.546 +    case PGT_l2_shadow:
   2.547 +    case PGT_l3_shadow:
   2.548 +    case PGT_l4_shadow:
   2.549 +        shadow_demote(d, gpfn, gmfn);
   2.550 +        free_shadow_tables(d, smfn, shadow_type_to_level(type));
   2.551 +        break;
   2.552 +
   2.553 +    case PGT_fl1_shadow:
   2.554 +        free_shadow_fl1_table(d, smfn);
   2.555 +        break;
   2.556 +
   2.557 +#endif
   2.558 +
   2.559 +    case PGT_snapshot:
   2.560 +        perfc_decr(snapshot_pages);
   2.561 +        break;
   2.562 +
   2.563 +    default:
   2.564 +        printk("Free shadow weird page type mfn=%lx type=%08x\n",
   2.565 +               page_to_pfn(page), page->u.inuse.type_info);
   2.566 +        break;
   2.567 +    }
   2.568 +
   2.569 +    d->arch.shadow_page_count--;
   2.570 +
   2.571 +    // No TLB flushes are needed the next time this page gets allocated.
   2.572 +    //
   2.573 +    page->tlbflush_timestamp = 0;
   2.574 +    page->u.free.cpumask     = CPU_MASK_NONE;
   2.575 +
   2.576 +    if ( type == PGT_l1_shadow )
   2.577 +    {
   2.578 +        list_add(&page->list, &d->arch.free_shadow_frames);
   2.579 +        perfc_incr(free_l1_pages);
   2.580 +    }
   2.581 +    else
   2.582 +        free_domheap_page(page);
   2.583 +}
   2.584 +
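         +/*
         + * Remove every PGT_writable_pred prediction from the shadow hash table.
         + * The gpfns are collected into a list first, for the same reason as in
         + * free_shadow_pages(): deleting entries can rewrite the hash bucket
         + * being walked.
         + */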
   2.585 +static void
   2.586 +free_writable_pte_predictions(struct domain *d)
   2.587 +{
   2.588 +    int i;
   2.589 +    struct shadow_status *x;
   2.590 +
   2.591 +    for ( i = 0; i < shadow_ht_buckets; i++ )
   2.592 +    {
   2.593 +        u32 count;
   2.594 +        unsigned long *gpfn_list;
   2.595 +
   2.596 +        /* Skip empty buckets. */
   2.597 +        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
   2.598 +            continue;
   2.599 +
   2.600 +        count = 0;
   2.601 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
   2.602 +            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
   2.603 +                count++;
   2.604 +
   2.605 +        gpfn_list = xmalloc_array(unsigned long, count);
   2.606 +        count = 0;
   2.607 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
   2.608 +            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
   2.609 +                gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
   2.610 +
   2.611 +        while ( count )
   2.612 +        {
   2.613 +            count--;
   2.614 +            delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred);
   2.615 +        }
   2.616 +
   2.617 +        xfree(gpfn_list);
   2.618 +    }
   2.619 +}
   2.620 +
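         +/*
         + * Free the shadow hash table itself, any overflow ("extra") entry
         + * blocks, and the log-dirty bitmap.
         + */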
   2.621 +static void free_shadow_ht_entries(struct domain *d)
   2.622 +{
   2.623 +    struct shadow_status *x, *n;
   2.624 +
   2.625 +    SH_VLOG("freed tables count=%d l1=%d l2=%d",
   2.626 +            d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
   2.627 +            perfc_value(shadow_l2_pages));
   2.628 +
   2.629 +    n = d->arch.shadow_ht_extras;
   2.630 +    while ( (x = n) != NULL )
   2.631 +    {
   2.632 +        d->arch.shadow_extras_count--;
   2.633 +        n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
   2.634 +        xfree(x);
   2.635 +    }
   2.636 +
   2.637 +    d->arch.shadow_ht_extras = NULL;
   2.638 +    d->arch.shadow_ht_free = NULL;
   2.639 +
   2.640 +    ASSERT(d->arch.shadow_extras_count == 0);
   2.641 +    SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
   2.642 +
   2.643 +    if ( d->arch.shadow_dirty_bitmap != NULL )
   2.644 +    {
   2.645 +        xfree(d->arch.shadow_dirty_bitmap);
   2.646 +        d->arch.shadow_dirty_bitmap = 0;
   2.647 +        d->arch.shadow_dirty_bitmap_size = 0;
   2.648 +    }
   2.649 +
   2.650 +    xfree(d->arch.shadow_ht);
   2.651 +    d->arch.shadow_ht = NULL;
   2.652 +}
   2.653 +
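         +/*
         + * Free the overflow blocks backing the out-of-sync entry pool and
         + * reset the domain's out-of-sync lists.
         + */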
   2.654 +static void free_out_of_sync_entries(struct domain *d)
   2.655 +{
   2.656 +    struct out_of_sync_entry *x, *n;
   2.657 +
   2.658 +    n = d->arch.out_of_sync_extras;
   2.659 +    while ( (x = n) != NULL )
   2.660 +    {
   2.661 +        d->arch.out_of_sync_extras_count--;
   2.662 +        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
   2.663 +        xfree(x);
   2.664 +    }
   2.665 +
   2.666 +    d->arch.out_of_sync_extras = NULL;
   2.667 +    d->arch.out_of_sync_free = NULL;
   2.668 +    d->arch.out_of_sync = NULL;
   2.669 +
   2.670 +    ASSERT(d->arch.out_of_sync_extras_count == 0);
   2.671 +    FSH_LOG("freed extra out_of_sync entries, now %d",
   2.672 +            d->arch.out_of_sync_extras_count);
   2.673 +}
   2.674 +
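         +/*
         + * Drop all references to the domain's shadow pages: out-of-sync state,
         + * per-vcpu shadow/monitor references, and pinned shadows, so that every
         + * shadow page is freed; finally release the pre-zeroed L1 frames.
         + */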
   2.675 +void free_shadow_pages(struct domain *d)
   2.676 +{
   2.677 +    int                   i;
   2.678 +    struct shadow_status *x;
   2.679 +    struct vcpu          *v;
   2.680 + 
   2.681 +    /*
   2.682 +     * WARNING! The shadow page table must not currently be in use!
   2.683 +     * e.g., You are expected to have paused the domain and synchronized CR3.
   2.684 +     */
   2.685 +
   2.686 +    if( !d->arch.shadow_ht ) return;
   2.687 +
   2.688 +    shadow_audit(d, 1);
   2.689 +
   2.690 +    // first, remove any outstanding refs from out_of_sync entries...
   2.691 +    //
   2.692 +    free_out_of_sync_state(d);
   2.693 +
   2.694 +    // second, remove any outstanding refs from v->arch.shadow_table
   2.695 +    // and CR3.
   2.696 +    //
   2.697 +    for_each_vcpu(d, v)
   2.698 +    {
   2.699 +        if ( pagetable_get_paddr(v->arch.shadow_table) )
   2.700 +        {
   2.701 +            put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
   2.702 +            v->arch.shadow_table = mk_pagetable(0);
   2.703 +        }
   2.704 +
   2.705 +        if ( v->arch.monitor_shadow_ref )
   2.706 +        {
   2.707 +            put_shadow_ref(v->arch.monitor_shadow_ref);
   2.708 +            v->arch.monitor_shadow_ref = 0;
   2.709 +        }
   2.710 +    }
   2.711 +
   2.712 +#if defined (__i386__)
   2.713 +    // For external shadows, remove the monitor table's refs
   2.714 +    //
   2.715 +    if ( shadow_mode_external(d) )
   2.716 +    {
   2.717 +        for_each_vcpu(d, v)
   2.718 +        {
   2.719 +            l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
   2.720 +
   2.721 +            if ( mpl2e )
   2.722 +            {
   2.723 +                l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
   2.724 +                l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
   2.725 +
   2.726 +                if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
   2.727 +                {
   2.728 +                    put_shadow_ref(l2e_get_pfn(hl2e));
   2.729 +                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
   2.730 +                }
   2.731 +                if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
   2.732 +                {
   2.733 +                    put_shadow_ref(l2e_get_pfn(smfn));
   2.734 +                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
   2.735 +                }
   2.736 +            }
   2.737 +        }
   2.738 +    }
   2.739 +#endif
   2.740 +    // Now, the only refs to shadow pages that are left are from the shadow
   2.741 +    // pages themselves.  We just unpin the pinned pages, and the rest
   2.742 +    // should automatically disappear.
   2.743 +    //
   2.744 +    // NB: Beware: each explicit or implicit call to free_shadow_page
   2.745 +    // can/will result in the hash bucket getting rewritten out from
   2.746 +    // under us...  First, collect the list of pinned pages, then
   2.747 +    // free them.
   2.748 +    //
   2.749 +    for ( i = 0; i < shadow_ht_buckets; i++ )
   2.750 +    {
   2.751 +        u32 count;
   2.752 +        unsigned long *mfn_list;
   2.753 +
   2.754 +        /* Skip empty buckets. */
   2.755 +        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
   2.756 +            continue;
   2.757 +
   2.758 +        count = 0;
   2.759 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
   2.760 +            if ( MFN_PINNED(x->smfn) )
   2.761 +                count++;
   2.762 +        if ( !count )
   2.763 +            continue;
   2.764 +
   2.765 +        mfn_list = xmalloc_array(unsigned long, count);
   2.766 +        count = 0;
   2.767 +        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
   2.768 +            if ( MFN_PINNED(x->smfn) )
   2.769 +                mfn_list[count++] = x->smfn;
   2.770 +
   2.771 +        while ( count )
   2.772 +        {
   2.773 +            shadow_unpin(mfn_list[--count]);
   2.774 +        }
   2.775 +        xfree(mfn_list);
   2.776 +    }
   2.777 +
   2.778 +    // Now free the pre-zero'ed pages from the domain
   2.779 +    //
   2.780 +    struct list_head *list_ent, *tmp;
   2.781 +    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
   2.782 +    {
   2.783 +        list_del(list_ent);
   2.784 +        perfc_decr(free_l1_pages);
   2.785 +
   2.786 +        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
   2.787 +        free_domheap_page(page);
   2.788 +    }
   2.789 +
   2.790 +    shadow_audit(d, 0);
   2.791 +
   2.792 +    SH_LOG("Free shadow table.");
   2.793 +}
   2.794 +
   2.795 +void __shadow_mode_disable(struct domain *d)
   2.796 +{
   2.797 +    if ( unlikely(!shadow_mode_enabled(d)) )
   2.798 +        return;
   2.799 +
   2.800 +    /*
   2.801 +     * Currently this does not fix up page ref counts, so it is valid to call
   2.802 +     * only when a domain is being destroyed.
   2.803 +     */
   2.804 +    BUG_ON(!test_bit(_DOMF_dying, &d->domain_flags) &&
   2.805 +           shadow_mode_refcounts(d));
   2.806 +    d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d);
   2.807 +
   2.808 +    free_shadow_pages(d);
   2.809 +    free_writable_pte_predictions(d);
   2.810 +
   2.811 +#ifndef NDEBUG
   2.812 +    int i;
   2.813 +    for ( i = 0; i < shadow_ht_buckets; i++ )
   2.814 +    {
   2.815 +        if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
   2.816 +        {
   2.817 +            printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
   2.818 +                   __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
   2.819 +            BUG();
   2.820 +        }
   2.821 +    }
   2.822 +#endif
   2.823 +
   2.824 +    d->arch.shadow_mode = 0;
   2.825 +
   2.826 +    free_shadow_ht_entries(d);
   2.827 +    free_out_of_sync_entries(d);
   2.828 +
   2.829 +    struct vcpu *v;
   2.830 +    for_each_vcpu(d, v)
   2.831 +    {
   2.832 +        update_pagetables(v);
   2.833 +    }
   2.834 +}
   2.835 +
   2.836 +
   2.837 +static void
   2.838 +free_p2m_table(struct domain *d)
   2.839 +{
   2.840 +    // uh, this needs some work...  :)
   2.841 +    BUG();
   2.842 +}
   2.843 +
   2.844 +
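         +/*
         + * Enable the shadow modes named in 'mode', which must include every mode
         + * already enabled plus at least one new one.  Allocates the shadow hash
         + * table, log-dirty bitmap, P2M table and monitor tables as required.
         + */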
   2.845 +int __shadow_mode_enable(struct domain *d, unsigned int mode)
   2.846 +{
   2.847 +    struct vcpu *v;
   2.848 +    int new_modes = (mode & ~d->arch.shadow_mode);
   2.849 +
   2.850 +    // Gotta be adding something to call this function.
   2.851 +    ASSERT(new_modes);
   2.852 +
   2.853 +    // can't take anything away by calling this function.
   2.854 +    ASSERT(!(d->arch.shadow_mode & ~mode));
   2.855 +
   2.856 +#if defined(CONFIG_PAGING_LEVELS)
   2.857 +    if ( !shadow_set_guest_paging_levels(d, CONFIG_PAGING_LEVELS) )
   2.858 +    {
   2.859 +        printk("Unsupported guest paging levels\n");
   2.860 +        domain_crash_synchronous(); /* need to take a clean path */
   2.861 +    }
   2.862 +#endif
   2.863 +
   2.864 +    for_each_vcpu(d, v)
   2.865 +    {
   2.866 +        invalidate_shadow_ldt(v);
   2.867 +
   2.868 +        // We need to set these up for __update_pagetables().
   2.869 +        // See the comment there.
   2.870 +
   2.871 +        /*
   2.872 +         * arch.guest_vtable
   2.873 +         */
   2.874 +        if ( v->arch.guest_vtable &&
   2.875 +             (v->arch.guest_vtable != __linear_l2_table) )
   2.876 +        {
   2.877 +            unmap_domain_page(v->arch.guest_vtable);
   2.878 +        }
   2.879 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   2.880 +            v->arch.guest_vtable = __linear_l2_table;
   2.881 +        else
   2.882 +            v->arch.guest_vtable = NULL;
   2.883 +
   2.884 +        /*
   2.885 +         * arch.shadow_vtable
   2.886 +         */
   2.887 +        if ( v->arch.shadow_vtable &&
   2.888 +             (v->arch.shadow_vtable != __shadow_linear_l2_table) )
   2.889 +        {
   2.890 +            unmap_domain_page(v->arch.shadow_vtable);
   2.891 +        }
   2.892 +        if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
   2.893 +            v->arch.shadow_vtable = __shadow_linear_l2_table;
   2.894 +        else
   2.895 +            v->arch.shadow_vtable = NULL;
   2.896 +        
   2.897 +#if defined (__i386__)
   2.898 +        /*
   2.899 +         * arch.hl2_vtable
   2.900 +         */
   2.901 +        if ( v->arch.hl2_vtable &&
   2.902 +             (v->arch.hl2_vtable != __linear_hl2_table) )
   2.903 +        {
   2.904 +            unmap_domain_page(v->arch.hl2_vtable);
   2.905 +        }
   2.906 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   2.907 +            v->arch.hl2_vtable = __linear_hl2_table;
   2.908 +        else
   2.909 +            v->arch.hl2_vtable = NULL;
   2.910 +#endif
   2.911 +        /*
   2.912 +         * arch.monitor_table & arch.monitor_vtable
   2.913 +         */
   2.914 +        if ( v->arch.monitor_vtable )
   2.915 +        {
   2.916 +            free_monitor_pagetable(v);
   2.917 +        }
   2.918 +        if ( mode & SHM_external )
   2.919 +        {
   2.920 +            alloc_monitor_pagetable(v);
   2.921 +        }
   2.922 +    }
   2.923 +
   2.924 +    if ( new_modes & SHM_enable )
   2.925 +    {
   2.926 +        ASSERT( !d->arch.shadow_ht );
   2.927 +        d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
   2.928 +        if ( d->arch.shadow_ht == NULL )
   2.929 +            goto nomem;
   2.930 +
   2.931 +        memset(d->arch.shadow_ht, 0,
   2.932 +           shadow_ht_buckets * sizeof(struct shadow_status));
   2.933 +    }
   2.934 +
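    // The dirty bitmap holds one bit per guest page.  Rounding the bit count
    // up to a multiple of 64 below means the division by 8*sizeof(unsigned
    // long) is exact on both 32-bit and 64-bit builds.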
   2.935 +    if ( new_modes & SHM_log_dirty )
   2.936 +    {
   2.937 +        ASSERT( !d->arch.shadow_dirty_bitmap );
   2.938 +        d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
   2.939 +        d->arch.shadow_dirty_bitmap = 
   2.940 +            xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
   2.941 +                                         (8 * sizeof(unsigned long)));
   2.942 +        if ( d->arch.shadow_dirty_bitmap == NULL )
   2.943 +        {
   2.944 +            d->arch.shadow_dirty_bitmap_size = 0;
   2.945 +            goto nomem;
   2.946 +        }
   2.947 +        memset(d->arch.shadow_dirty_bitmap, 0, 
   2.948 +               d->arch.shadow_dirty_bitmap_size/8);
   2.949 +    }
   2.950 +
   2.951 +    if ( new_modes & SHM_translate )
   2.952 +    {
   2.953 +        if ( !(new_modes & SHM_external) )
   2.954 +        {
   2.955 +            ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
   2.956 +            if ( !alloc_p2m_table(d) )
   2.957 +            {
   2.958 +                printk("alloc_p2m_table failed (out-of-memory?)\n");
   2.959 +                goto nomem;
   2.960 +            }
   2.961 +        }
   2.962 +        else
   2.963 +        {
   2.964 +            // external guests provide their own memory for their P2M maps.
   2.965 +            //
   2.966 +            ASSERT( d == page_get_owner(
   2.967 +                        &frame_table[pagetable_get_pfn(d->arch.phys_table)]) );
   2.968 +        }
   2.969 +    }
   2.970 +
   2.971 +    printk("audit1\n");
   2.972 +    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
   2.973 +    printk("audit1 done\n");
   2.974 +
   2.975 +    // Get rid of any shadow pages from any previous shadow mode.
   2.976 +    //
   2.977 +    free_shadow_pages(d);
   2.978 +
   2.979 +    printk("audit2\n");
   2.980 +    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
   2.981 +    printk("audit2 done\n");
   2.982 +
   2.983 +    /*
   2.984 +     * Tear down the domain's counts by disassembling its page-table-based
   2.985 +     * ref counts.  Also remove CR3's gcount/tcount.
   2.986 +     * That leaves things like GDTs and LDTs and external refs intact.
   2.987 +     *
   2.988 +     * Most pages will be writable with tcount=0.
   2.989 +     * Some will still be L1 or L2 with tcount=0.
   2.990 +     * Some pages may be left with type none and tcount=0.
   2.991 +     * Pages granted external writable refs (via grant tables?) will
   2.992 +     * still have a non-zero tcount.  That's OK.
   2.993 +     *
   2.994 +     * gcounts will generally be 1 for PGC_allocated.
   2.995 +     * GDTs and LDTs will have additional gcounts.
   2.996 +     * Any grant-table based refs will still be in the gcount.
   2.997 +     *
   2.998 +     * We then attempt to grab a writable ref to each page (thus setting its
   2.999 +     * type) and immediately put back those type refs.
  2.1000 +     *
  2.1001 +     * Assert that no pages are left with L1/L2/L3/L4 type.
  2.1002 +     */
  2.1003 +    audit_adjust_pgtables(d, -1, 1);
  2.1004 +
  2.1005 +    d->arch.shadow_mode = mode;
  2.1006 +
  2.1007 +    if ( shadow_mode_refcounts(d) )
  2.1008 +    {
  2.1009 +        struct list_head *list_ent = d->page_list.next;
  2.1010 +        while ( list_ent != &d->page_list )
  2.1011 +        {
  2.1012 +            struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
  2.1013 +            if ( !get_page_type(page, PGT_writable_page) )
  2.1014 +                BUG();
  2.1015 +            put_page_type(page);
  2.1016 +
  2.1017 +            list_ent = page->list.next;
  2.1018 +        }
  2.1019 +    }
  2.1020 +
  2.1021 +    audit_adjust_pgtables(d, 1, 1);
  2.1022 +
  2.1023 +    printk("audit3\n");
  2.1024 +    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
  2.1025 +    printk("audit3 done\n");
  2.1026 +
  2.1027 +    return 0;
  2.1028 +
  2.1029 + nomem:
  2.1030 +    if ( (new_modes & SHM_enable) )
  2.1031 +    {
  2.1032 +        xfree(d->arch.shadow_ht);
  2.1033 +        d->arch.shadow_ht = NULL;
  2.1034 +    }
  2.1035 +    if ( (new_modes & SHM_log_dirty) )
  2.1036 +    {
  2.1037 +        xfree(d->arch.shadow_dirty_bitmap);
  2.1038 +        d->arch.shadow_dirty_bitmap = NULL;
  2.1039 +    }
  2.1040 +    if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) &&
  2.1041 +         pagetable_get_paddr(d->arch.phys_table) )
  2.1042 +    {
  2.1043 +        free_p2m_table(d);
  2.1044 +    }
  2.1045 +    return -ENOMEM;
  2.1046 +}
  2.1047 +
  2.1048 +
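// shadow_mode_enable() is just __shadow_mode_enable() wrapped in the shadow
// lock; callers compose the SHM_* flags themselves.  For example, the
// DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY handler further below boils down to
//
//     __shadow_mode_enable(d, d->arch.shadow_mode | SHM_enable | SHM_log_dirty)
//
// taken under the shadow lock, after freeing any existing shadow pages.
//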
  2.1049 +int shadow_mode_enable(struct domain *d, unsigned int mode)
  2.1050 +{
  2.1051 +    int rc;
  2.1052 +    shadow_lock(d);
  2.1053 +    rc = __shadow_mode_enable(d, mode);
  2.1054 +    shadow_unlock(d);
  2.1055 +    return rc;
  2.1056 +}
  2.1057 +
  2.1058 +static int shadow_mode_table_op(
  2.1059 +    struct domain *d, dom0_shadow_control_t *sc)
  2.1060 +{
  2.1061 +    unsigned int      op = sc->op;
  2.1062 +    int               i, rc = 0;
  2.1063 +    struct vcpu *v;
  2.1064 +
  2.1065 +    ASSERT(shadow_lock_is_acquired(d));
  2.1066 +
  2.1067 +    SH_VLOG("shadow mode table op %lx %lx count %d",
  2.1068 +            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table),  /* XXX SMP */
  2.1069 +            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
  2.1070 +            d->arch.shadow_page_count);
  2.1071 +
  2.1072 +    shadow_audit(d, 1);
  2.1073 +
  2.1074 +    switch ( op )
  2.1075 +    {
  2.1076 +    case DOM0_SHADOW_CONTROL_OP_FLUSH:
  2.1077 +        free_shadow_pages(d);
  2.1078 +
  2.1079 +        d->arch.shadow_fault_count       = 0;
  2.1080 +        d->arch.shadow_dirty_count       = 0;
  2.1081 +        d->arch.shadow_dirty_net_count   = 0;
  2.1082 +        d->arch.shadow_dirty_block_count = 0;
  2.1083 +
  2.1084 +        break;
  2.1085 +   
  2.1086 +    case DOM0_SHADOW_CONTROL_OP_CLEAN:
  2.1087 +        free_shadow_pages(d);
  2.1088 +
  2.1089 +        sc->stats.fault_count       = d->arch.shadow_fault_count;
  2.1090 +        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
  2.1091 +        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
  2.1092 +        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
  2.1093 +
  2.1094 +        d->arch.shadow_fault_count       = 0;
  2.1095 +        d->arch.shadow_dirty_count       = 0;
  2.1096 +        d->arch.shadow_dirty_net_count   = 0;
  2.1097 +        d->arch.shadow_dirty_block_count = 0;
  2.1098 + 
  2.1099 +        if ( (d->max_pages > sc->pages) || 
  2.1100 +             (sc->dirty_bitmap == NULL) || 
  2.1101 +             (d->arch.shadow_dirty_bitmap == NULL) )
  2.1102 +        {
  2.1103 +            rc = -EINVAL;
  2.1104 +            break;
  2.1105 +        }
  2.1106 + 
  2.1107 +        sc->pages = d->max_pages;
  2.1108 +
  2.1109 +#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
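        // 'chunk' is measured in guest pages, i.e. dirty-bitmap bits, per
        // iteration: 8*1024 bits is 1kB of bitmap.  i/(8*sizeof(unsigned long))
        // converts a page number into an index into the unsigned-long array;
        // on a 32-bit build, for example, one chunk spans 8192/32 = 256 words.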
  2.1110 +        for ( i = 0; i < d->max_pages; i += chunk )
  2.1111 +        {
  2.1112 +            int bytes = ((((d->max_pages - i) > chunk) ?
  2.1113 +                          chunk : (d->max_pages - i)) + 7) / 8;
  2.1114 +
  2.1115 +            if (copy_to_user(
  2.1116 +                    sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
  2.1117 +                    d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
  2.1118 +                    bytes))
  2.1119 +            {
  2.1120 +                // copy_to_user() can fail when copying to guest app memory:
  2.1121 +                // the app should zero the buffer after allocating it, and pin it.
  2.1122 +                rc = -EINVAL;
  2.1123 +                memset(
  2.1124 +                    d->arch.shadow_dirty_bitmap + 
  2.1125 +                    (i/(8*sizeof(unsigned long))),
  2.1126 +                    0, ((d->max_pages + 7) / 8) - (i / 8));
  2.1127 +                break;
  2.1128 +            }
  2.1129 +            memset(
  2.1130 +                d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
  2.1131 +                0, bytes);
  2.1132 +        }
  2.1133 +
  2.1134 +        break;
  2.1135 +
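    // PEEK is the non-destructive variant of CLEAN: it copies out the same
    // stats and dirty bitmap, but neither zeroes the bitmap nor resets the
    // fault/dirty counters.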
  2.1136 +    case DOM0_SHADOW_CONTROL_OP_PEEK:
  2.1137 +        sc->stats.fault_count       = d->arch.shadow_fault_count;
  2.1138 +        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
  2.1139 +        sc->stats.dirty_net_count   = d->arch.shadow_dirty_net_count;
  2.1140 +        sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count;
  2.1141 + 
  2.1142 +        if ( (d->max_pages > sc->pages) || 
  2.1143 +             (sc->dirty_bitmap == NULL) || 
  2.1144 +             (d->arch.shadow_dirty_bitmap == NULL) )
  2.1145 +        {
  2.1146 +            rc = -EINVAL;
  2.1147 +            break;
  2.1148 +        }
  2.1149 + 
  2.1150 +        sc->pages = d->max_pages;
  2.1151 +        if (copy_to_user(
  2.1152 +            sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8))
  2.1153 +        {
  2.1154 +            rc = -EINVAL;
  2.1155 +            break;
  2.1156 +        }
  2.1157 +
  2.1158 +        break;
  2.1159 +
  2.1160 +    default:
  2.1161 +        rc = -EINVAL;
  2.1162 +        break;
  2.1163 +    }
  2.1164 +
  2.1165 +    SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
  2.1166 +    shadow_audit(d, 1);
  2.1167 +
  2.1168 +    for_each_vcpu(d,v)
  2.1169 +        __update_pagetables(v);
  2.1170 +
  2.1171 +    return rc;
  2.1172 +}
  2.1173 +
  2.1174 +int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
  2.1175 +{
  2.1176 +    unsigned int op = sc->op;
  2.1177 +    int          rc = 0;
  2.1178 +    struct vcpu *v;
  2.1179 +
  2.1180 +    if ( unlikely(d == current->domain) )
  2.1181 +    {
  2.1182 +        DPRINTK("Don't try to do a shadow op on yourself!\n");
  2.1183 +        return -EINVAL;
  2.1184 +    }   
  2.1185 +
  2.1186 +    domain_pause(d);
  2.1187 +
  2.1188 +    shadow_lock(d);
  2.1189 +
  2.1190 +    switch ( op )
  2.1191 +    {
  2.1192 +    case DOM0_SHADOW_CONTROL_OP_OFF:
  2.1193 +        __shadow_sync_all(d);
  2.1194 +        __shadow_mode_disable(d);
  2.1195 +        break;
  2.1196 +
  2.1197 +    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
  2.1198 +        free_shadow_pages(d);
  2.1199 +        rc = __shadow_mode_enable(d, SHM_enable);
  2.1200 +        break;
  2.1201 +
  2.1202 +    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
  2.1203 +        free_shadow_pages(d);
  2.1204 +        rc = __shadow_mode_enable(
  2.1205 +            d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
  2.1206 +        break;
  2.1207 +
  2.1208 +    case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
  2.1209 +        free_shadow_pages(d);
  2.1210 +        rc = __shadow_mode_enable(
  2.1211 +            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
  2.1212 +        break;
  2.1213 +
  2.1214 +    default:
  2.1215 +        rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
  2.1216 +        break;
  2.1217 +    }
  2.1218 +
  2.1219 +    shadow_unlock(d);
  2.1220 +
  2.1221 +    for_each_vcpu(d,v)
  2.1222 +        update_pagetables(v);
  2.1223 +
  2.1224 +    domain_unpause(d);
  2.1225 +
  2.1226 +    return rc;
  2.1227 +}
  2.1228 +
  2.1229 +void shadow_mode_init(void)
  2.1230 +{
  2.1231 +}
  2.1232 +
  2.1233 +int _shadow_mode_refcounts(struct domain *d)
  2.1234 +{
  2.1235 +    return shadow_mode_refcounts(d);
  2.1236 +}
  2.1237 +
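// Install a single pfn -> mfn translation in the domain's p2m table
// (d->arch.phys_table).  The table is walked like an ordinary two-level
// pagetable, using (pfn << PAGE_SHIFT) as the "virtual address"; a missing
// L1 page is allocated and zeroed on demand.  Returns 1 on success, 0 if
// that allocation fails.
//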
  2.1238 +int
  2.1239 +set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
  2.1240 +              struct domain_mmap_cache *l2cache,
  2.1241 +              struct domain_mmap_cache *l1cache)
  2.1242 +{
  2.1243 +    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
  2.1244 +    l2_pgentry_t *l2, l2e;
  2.1245 +    l1_pgentry_t *l1;
  2.1246 +    struct pfn_info *l1page;
  2.1247 +    unsigned long va = pfn << PAGE_SHIFT;
  2.1248 +
  2.1249 +    ASSERT(tabpfn != 0);
  2.1250 +
  2.1251 +    l2 = map_domain_page_with_cache(tabpfn, l2cache);
  2.1252 +    l2e = l2[l2_table_offset(va)];
  2.1253 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
  2.1254 +    {
  2.1255 +        l1page = alloc_domheap_page(NULL);
  2.1256 +        if ( !l1page )
  2.1257 +        {
  2.1258 +            unmap_domain_page_with_cache(l2, l2cache);
  2.1259 +            return 0;
  2.1260 +        }
  2.1261 +
  2.1262 +        l1 = map_domain_page_with_cache(page_to_pfn(l1page), l1cache);
  2.1263 +        memset(l1, 0, PAGE_SIZE);
  2.1264 +        unmap_domain_page_with_cache(l1, l1cache);
  2.1265 +
  2.1266 +        l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
  2.1267 +        l2[l2_table_offset(va)] = l2e;
  2.1268 +    }
  2.1269 +    unmap_domain_page_with_cache(l2, l2cache);
  2.1270 +
  2.1271 +    l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
  2.1272 +    l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
  2.1273 +    unmap_domain_page_with_cache(l1, l1cache);
  2.1274 +
  2.1275 +    return 1;
  2.1276 +}
  2.1277 +
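// Build the p2m table for a translated guest: allocate and zero the top-level
// page, then enter every page on the domain's page list, plus any xenheap
// pages that have a valid machine-to-phys entry.
//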
  2.1278 +int
  2.1279 +alloc_p2m_table(struct domain *d)
  2.1280 +{
  2.1281 +    struct list_head *list_ent;
  2.1282 +    struct pfn_info *page, *l2page;
  2.1283 +    l2_pgentry_t *l2;
  2.1284 +    unsigned long mfn, pfn;
  2.1285 +    struct domain_mmap_cache l1cache, l2cache;
  2.1286 +
  2.1287 +    l2page = alloc_domheap_page(NULL);
  2.1288 +    if ( l2page == NULL )
  2.1289 +        return 0;
  2.1290 +
  2.1291 +    domain_mmap_cache_init(&l1cache);
  2.1292 +    domain_mmap_cache_init(&l2cache);
  2.1293 +
  2.1294 +    d->arch.phys_table = mk_pagetable(page_to_phys(l2page));
  2.1295 +    l2 = map_domain_page_with_cache(page_to_pfn(l2page), &l2cache);
  2.1296 +    memset(l2, 0, PAGE_SIZE);
  2.1297 +    unmap_domain_page_with_cache(l2, &l2cache);
  2.1298 +
  2.1299 +    list_ent = d->page_list.next;
  2.1300 +    while ( list_ent != &d->page_list )
  2.1301 +    {
  2.1302 +        page = list_entry(list_ent, struct pfn_info, list);
  2.1303 +        mfn = page_to_pfn(page);
  2.1304 +        pfn = machine_to_phys_mapping[mfn];
  2.1305 +        ASSERT(pfn != INVALID_M2P_ENTRY);
  2.1306 +        ASSERT(pfn < (1u<<20));
  2.1307 +
  2.1308 +        set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
  2.1309 +
  2.1310 +        list_ent = page->list.next;
  2.1311 +    }
  2.1312 +
  2.1313 +    list_ent = d->xenpage_list.next;
  2.1314 +    while ( list_ent != &d->xenpage_list )
  2.1315 +    {
  2.1316 +        page = list_entry(list_ent, struct pfn_info, list);
  2.1317 +        mfn = page_to_pfn(page);
  2.1318 +        pfn = machine_to_phys_mapping[mfn];
  2.1319 +        if ( (pfn != INVALID_M2P_ENTRY) &&
  2.1320 +             (pfn < (1u<<20)) )
  2.1321 +        {
  2.1322 +            set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
  2.1323 +        }
  2.1324 +
  2.1325 +        list_ent = page->list.next;
  2.1326 +    }
  2.1327 +
  2.1328 +    domain_mmap_cache_destroy(&l2cache);
  2.1329 +    domain_mmap_cache_destroy(&l1cache);
  2.1330 +
  2.1331 +    return 1;
  2.1332 +}
  2.1333 +
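// shadow_l[1234]_normal_pt_update() are called when the guest explicitly
// updates a pagetable entry at physical address 'pa' (e.g. via an MMU-update
// hypercall).  If that page currently has a shadow of the corresponding
// level, the new guest entry is validated and propagated into the matching
// slot of the shadow.
//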
  2.1334 +void shadow_l1_normal_pt_update(
  2.1335 +    struct domain *d,
  2.1336 +    unsigned long pa, l1_pgentry_t gpte,
  2.1337 +    struct domain_mmap_cache *cache)
  2.1338 +{
  2.1339 +    unsigned long sl1mfn;    
  2.1340 +    l1_pgentry_t *spl1e, spte;
  2.1341 +
  2.1342 +    shadow_lock(d);
  2.1343 +
  2.1344 +    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
  2.1345 +    if ( sl1mfn )
  2.1346 +    {
  2.1347 +        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
  2.1348 +                 (void *)pa, l1e_get_intpte(gpte));
  2.1349 +        l1pte_propagate_from_guest(current->domain, gpte, &spte);
  2.1350 +
  2.1351 +        spl1e = map_domain_page_with_cache(sl1mfn, cache);
  2.1352 +        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
  2.1353 +        unmap_domain_page_with_cache(spl1e, cache);
  2.1354 +    }
  2.1355 +
  2.1356 +    shadow_unlock(d);
  2.1357 +}
  2.1358 +
  2.1359 +void shadow_l2_normal_pt_update(
  2.1360 +    struct domain *d,
  2.1361 +    unsigned long pa, l2_pgentry_t gpde,
  2.1362 +    struct domain_mmap_cache *cache)
  2.1363 +{
  2.1364 +    unsigned long sl2mfn;
  2.1365 +    l2_pgentry_t *spl2e;
  2.1366 +
  2.1367 +    shadow_lock(d);
  2.1368 +
  2.1369 +    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
  2.1370 +    if ( sl2mfn )
  2.1371 +    {
  2.1372 +        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
  2.1373 +                 (void *)pa, l2e_get_intpte(gpde));
  2.1374 +        spl2e = map_domain_page_with_cache(sl2mfn, cache);
  2.1375 +        validate_pde_change(d, gpde,
  2.1376 +                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
  2.1377 +        unmap_domain_page_with_cache(spl2e, cache);
  2.1378 +    }
  2.1379 +
  2.1380 +    shadow_unlock(d);
  2.1381 +}
  2.1382 +
  2.1383 +#if CONFIG_PAGING_LEVELS >= 3
  2.1384 +void shadow_l3_normal_pt_update(
  2.1385 +    struct domain *d,
  2.1386 +    unsigned long pa, l3_pgentry_t gpde,
  2.1387 +    struct domain_mmap_cache *cache)
  2.1388 +{
  2.1389 +    unsigned long sl3mfn;
  2.1390 +    pgentry_64_t *spl3e;
  2.1391 +
  2.1392 +    shadow_lock(d);
  2.1393 +
  2.1394 +    sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow);
  2.1395 +    if ( sl3mfn )
  2.1396 +    {
  2.1397 +        SH_VVLOG("shadow_l3_normal_pt_update pa=%p, gpde=%" PRIpte,
  2.1398 +                 (void *)pa, l3e_get_intpte(gpde));
  2.1399 +
  2.1400 +        spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache);
  2.1401 +        validate_entry_change(d, (pgentry_64_t *)&gpde,
  2.1402 +                              &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)],
  2.1403 +                              shadow_type_to_level(PGT_l3_shadow));
  2.1404 +        unmap_domain_page_with_cache(spl3e, cache);
  2.1405 +    }
  2.1406 +
  2.1407 +    shadow_unlock(d);
  2.1408 +}
  2.1409 +#endif
  2.1410 +
  2.1411 +#if CONFIG_PAGING_LEVELS >= 4
  2.1412 +void shadow_l4_normal_pt_update(
  2.1413 +    struct domain *d,
  2.1414 +    unsigned long pa, l4_pgentry_t gpde,
  2.1415 +    struct domain_mmap_cache *cache)
  2.1416 +{
  2.1417 +    unsigned long sl4mfn;
  2.1418 +    pgentry_64_t *spl4e;
  2.1419 +
  2.1420 +    shadow_lock(d);
  2.1421 +
  2.1422 +    sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow);
  2.1423 +    if ( sl4mfn )
  2.1424 +    {
  2.1425 +        SH_VVLOG("shadow_l4_normal_pt_update pa=%p, gpde=%" PRIpte,
  2.1426 +                 (void *)pa, l4e_get_intpte(gpde));
  2.1427 +
  2.1428 +        spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache);
  2.1429 +        validate_entry_change(d, (pgentry_64_t *)&gpde,
  2.1430 +                              &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)],
  2.1431 +                              shadow_type_to_level(PGT_l4_shadow));
  2.1432 +        unmap_domain_page_with_cache(spl4e, cache);
  2.1433 +    }
  2.1434 +
  2.1435 +    shadow_unlock(d);
  2.1436 +}
  2.1437 +#endif
  2.1438 +
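// Rewrite every present entry in the guest L1 page at l1mfn from machine
// frame numbers to guest pfns, sanity-checking each one against the supplied
// p2m table.  Used by translate_l2pgtable() below.
//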
  2.1439 +static void
  2.1440 +translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
  2.1441 +{
  2.1442 +    int i;
  2.1443 +    l1_pgentry_t *l1;
  2.1444 +
  2.1445 +    l1 = map_domain_page(l1mfn);
  2.1446 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  2.1447 +    {
  2.1448 +        if ( is_guest_l1_slot(i) &&
  2.1449 +             (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
  2.1450 +        {
  2.1451 +            unsigned long mfn = l1e_get_pfn(l1[i]);
  2.1452 +            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
  2.1453 +            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
  2.1454 +            l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
  2.1455 +        }
  2.1456 +    }
  2.1457 +    unmap_domain_page(l1);
  2.1458 +}
  2.1459 +
  2.1460 +// This is not general enough to handle arbitrary pagetables
  2.1461 +// with shared L1 pages, etc., but it is sufficient for bringing
  2.1462 +// up dom0.
  2.1463 +//
  2.1464 +void
  2.1465 +translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
  2.1466 +                    unsigned int type)
  2.1467 +{
  2.1468 +    int i;
  2.1469 +    l2_pgentry_t *l2;
  2.1470 +
  2.1471 +    ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
  2.1472 +
  2.1473 +    l2 = map_domain_page(l2mfn);
  2.1474 +    for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
  2.1475 +    {
  2.1476 +        if ( is_guest_l2_slot(type, i) &&
  2.1477 +             (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
  2.1478 +        {
  2.1479 +            unsigned long mfn = l2e_get_pfn(l2[i]);
  2.1480 +            unsigned long gpfn = __mfn_to_gpfn(d, mfn);
  2.1481 +            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
  2.1482 +            l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
  2.1483 +            translate_l1pgtable(d, p2m, mfn);
  2.1484 +        }
  2.1485 +    }
  2.1486 +    unmap_domain_page(l2);
  2.1487 +}
  2.1488 +
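// Unpin any pinned shadows of gpfn, walking stype down from the given shadow
// type (in steps of PGT_l1_shadow) until it drops below PGT_l1_shadow, so
// that those shadows can be reclaimed.
//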
  2.1489 +void
  2.1490 +remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
  2.1491 +{
  2.1492 +    unsigned long smfn;
  2.1493 +
  2.1494 +    //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
  2.1495 +
  2.1496 +    shadow_lock(d);
  2.1497 +
  2.1498 +    while ( stype >= PGT_l1_shadow )
  2.1499 +    {
  2.1500 +        smfn = __shadow_status(d, gpfn, stype);
  2.1501 +        if ( smfn && MFN_PINNED(smfn) )
  2.1502 +            shadow_unpin(smfn);
  2.1503 +        stype -= PGT_l1_shadow;
  2.1504 +    }
  2.1505 +
  2.1506 +    shadow_unlock(d);
  2.1507 +}
  2.1508 +
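// Translate a guest pfn belonging to a translated-mode domain into a machine
// frame by walking the domain's p2m table (a two-level walk of
// d->arch.phys_table, indexed by gpfn << PAGE_SHIFT).  Returns INVALID_MFN if
// either level of the walk finds a non-present entry.
//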
  2.1509 +unsigned long
  2.1510 +gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
  2.1511 +{
  2.1512 +    ASSERT( shadow_mode_translate(d) );
  2.1513 +
  2.1514 +    perfc_incrc(gpfn_to_mfn_foreign);
  2.1515 +
  2.1516 +    unsigned long va = gpfn << PAGE_SHIFT;
  2.1517 +    unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table);
  2.1518 +    l2_pgentry_t *l2 = map_domain_page(tabpfn);
  2.1519 +    l2_pgentry_t l2e = l2[l2_table_offset(va)];
  2.1520 +    unmap_domain_page(l2);
  2.1521 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
  2.1522 +    {
  2.1523 +        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n",
  2.1524 +               d->domain_id, gpfn, l2e_get_intpte(l2e));
  2.1525 +        return INVALID_MFN;
  2.1526 +    }
  2.1527 +    l1_pgentry_t *l1 = map_domain_page(l2e_get_pfn(l2e));
  2.1528 +    l1_pgentry_t l1e = l1[l1_table_offset(va)];
  2.1529 +    unmap_domain_page(l1);
  2.1530 +
  2.1531 +#if 0
  2.1532 +    printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%" PRIpte " l1e=%" PRIpte "\n",
  2.1533 +           d->domain_id, gpfn, l1e_get_pfn(l1e), tabpfn, l2e_get_intpte(l2e), l1e_get_intpte(l1e));
  2.1534 +#endif
  2.1535 +
  2.1536 +    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
  2.1537 +    {
  2.1538 +        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte "\n",
  2.1539 +               d->domain_id, gpfn, l1e_get_intpte(l1e));
  2.1540 +        return INVALID_MFN;
  2.1541 +    }
  2.1542 +
  2.1543 +    return l1e_get_pfn(l1e);
  2.1544 +}
  2.1545 +
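// Scan a single shadow L1 (or hl2) page and clear every present entry that
// still maps forbidden_gmfn, dropping the reference that entry held.  Returns
// the number of entries cleared.
//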
  2.1546 +static u32 remove_all_access_in_page(
  2.1547 +  struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
  2.1548 +{
  2.1549 +    l1_pgentry_t *pl1e = map_domain_page(l1mfn);
  2.1550 +    l1_pgentry_t match;
  2.1551 +    unsigned long flags  = _PAGE_PRESENT;
  2.1552 +    int i;
  2.1553 +    u32 count = 0;
  2.1554 +    int is_l1_shadow =
  2.1555 +      ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
  2.1556 +       PGT_l1_shadow);
  2.1557 +
  2.1558 +    match = l1e_from_pfn(forbidden_gmfn, flags);
  2.1559 +
  2.1560 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  2.1561 +    {
  2.1562 +        if ( unlikely(!l1e_has_changed(pl1e[i], match, flags)) )
  2.1563 +        {
  2.1564 +            l1_pgentry_t ol2e = pl1e[i];
  2.1565 +            pl1e[i] = l1e_empty();
  2.1566 +            count++;
  2.1567 +
  2.1568 +            if ( is_l1_shadow )
  2.1569 +                shadow_put_page_from_l1e(ol2e, d);
  2.1570 +            else /* must be an hl2 page */
  2.1571 +                put_page(&frame_table[forbidden_gmfn]);
  2.1572 +        }
  2.1573 +    }
  2.1574 +
  2.1575 +    unmap_domain_page(pl1e);
  2.1576 +
  2.1577 +    return count;
  2.1578 +}
  2.1579 +
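// Walk the entire shadow hash table and strip every mapping of forbidden_gmfn
// out of each L1/L2/L3/L4/hl2 shadow page.  Snapshots and writable predictions
// are skipped, since they cannot hold references to the page.
//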
  2.1580 +static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
  2.1581 +{
  2.1582 +    int i;
  2.1583 +    struct shadow_status *a;
  2.1584 +    u32 count = 0;
  2.1585 +
  2.1586 +    if ( unlikely(!shadow_mode_enabled(d)) )
  2.1587 +        return 0;
  2.1588 +
  2.1589 +    ASSERT(shadow_lock_is_acquired(d));
  2.1590 +    perfc_incrc(remove_all_access);
  2.1591 +
  2.1592 +    for (i = 0; i < shadow_ht_buckets; i++)
  2.1593 +    {
  2.1594 +        a = &d->arch.shadow_ht[i];
  2.1595 +        while ( a && a->gpfn_and_flags )
  2.1596 +        {
  2.1597 +            switch (a->gpfn_and_flags & PGT_type_mask)
  2.1598 +            {
  2.1599 +                case PGT_l1_shadow:
  2.1600 +                case PGT_l2_shadow:
  2.1601 +                case PGT_l3_shadow:
  2.1602 +                case PGT_l4_shadow:
  2.1603 +                case PGT_hl2_shadow:
  2.1604 +                    count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
  2.1605 +                    break;
  2.1606 +                case PGT_snapshot:
  2.1607 +                case PGT_writable_pred:
  2.1608 +                    // these can't hold refs to the forbidden page
  2.1609 +                    break;
  2.1610 +                default:
  2.1611 +                    BUG();
  2.1612 +            }
  2.1613 +
  2.1614 +            a = a->next;
  2.1615 +        }
  2.1616 +    }
  2.1617 +
  2.1618 +    return count;
  2.1619 +}
  2.1620 +
  2.1621 +void shadow_drop_references(
  2.1622 +  struct domain *d, struct pfn_info *page)
  2.1623 +{
  2.1624 +    if ( likely(!shadow_mode_refcounts(d)) ||
  2.1625 +      ((page->u.inuse.type_info & PGT_count_mask) == 0) )
  2.1626 +        return;
  2.1627 +
  2.1628 +    /* XXX This needs more thought... */
  2.1629 +    printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n",
  2.1630 +      __func__, page_to_pfn(page));
  2.1631 +    printk("Before: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page),
  2.1632 +      page->count_info, page->u.inuse.type_info);
  2.1633 +
  2.1634 +    shadow_lock(d);
  2.1635 +    __shadow_remove_all_access(d, page_to_pfn(page));
  2.1636 +    shadow_unlock(d);
  2.1637 +
  2.1638 +    printk("After:  mfn=%lx c=%08x t=%08x\n", page_to_pfn(page),
  2.1639 +      page->count_info, page->u.inuse.type_info);
  2.1640 +}
  2.1641 +
  2.1642 +/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
  2.1643 +void shadow_sync_and_drop_references(
  2.1644 +  struct domain *d, struct pfn_info *page)
  2.1645 +{
  2.1646 +    if ( likely(!shadow_mode_refcounts(d)) )
  2.1647 +        return;
  2.1648 +
  2.1649 +    shadow_lock(d);
  2.1650 +
  2.1651 +    if ( page_out_of_sync(page) )
  2.1652 +        __shadow_sync_mfn(d, page_to_pfn(page));
  2.1653 +
  2.1654 +    __shadow_remove_all_access(d, page_to_pfn(page));
  2.1655 +
  2.1656 +    shadow_unlock(d);
  2.1657 +}