ia64/xen-unstable

changeset 4141:67c3463e70f1

bitkeeper revision 1.1236.32.2 (42360b33-HudAOddVBt3ez4shMiyOw)

Initial fullshadow checkin.

Things still to do:
- reuse snapshots intelligently.
- minimize TLB flushes during resync.
- figure out when to free no-longer-used L2 shadows and, more
generally, how to deal with out-of-memory conditions.

Some basic guidelines:
- With fullshadow on, you cannot trust linear_pg_table unless you
have first checked whether the VA you are interested in is out of
sync (see the sketch after this list).
- Significant new functions/macros include:
page_out_of_sync(mfn): returns true if the page is out of sync.
shadow_mark_out_of_sync(): marks a page as out of sync (allocating
any necessary snapshots, etc.).
shadow_out_of_sync(va): returns true if the current mappings
involved in va are out of sync.
shadow_sync_va(): brings the pages involved in mapping a particular
va back into sync. Currently this just calls shadow_sync_all().
shadow_sync_all(): brings all pages back into sync.
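
A minimal usage sketch (illustration only, not part of this changeset;
read_guest_l1e is a hypothetical helper, and it reuses calls that
appear in the diff below, e.g. shadow_sync_va(ed, va) as used in
map_ldt_shadow_page):

    /* Hypothetical helper: read the guest L1 entry mapping 'va' when
     * fullshadow may be enabled.  Resync the mappings for 'va' before
     * trusting linear_pg_table (currently this resyncs everything). */
    static unsigned long read_guest_l1e(struct exec_domain *ed,
                                        unsigned long va)
    {
        unsigned long l1e = 0;

        if ( unlikely(shadow_mode_enabled(ed->domain)) )
            shadow_sync_va(ed, va);          /* bring va back in sync */

        /* linear_pg_table can now be trusted for this va. */
        __get_user(l1e, (unsigned long *)
                   &linear_pg_table[l1_linear_offset(va)]);
        return l1e;
    }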

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author mafetter@fleming.research
date Mon Mar 14 22:07:47 2005 +0000 (2005-03-14)
parents d4b85d775de2
children dff04529e881 2d50ee7a068d
files .rootkeys xen/arch/x86/audit.c xen/arch/x86/domain.c xen/arch/x86/mm.c xen/arch/x86/shadow.c xen/arch/x86/traps.c xen/arch/x86/vmx.c xen/arch/x86/x86_32/domain_build.c xen/arch/x86/x86_32/domain_page.c xen/common/dom_mem_ops.c xen/common/keyhandler.c xen/common/page_alloc.c xen/common/schedule.c xen/include/asm-x86/domain.h xen/include/asm-x86/mm.h xen/include/asm-x86/page.h xen/include/asm-x86/shadow.h xen/include/asm-x86/x86_32/page.h xen/include/xen/domain.h xen/include/xen/perfc_defn.h
line diff
     1.1 --- a/.rootkeys	Mon Mar 14 18:44:10 2005 +0000
     1.2 +++ b/.rootkeys	Mon Mar 14 22:07:47 2005 +0000
     1.3 @@ -939,6 +939,7 @@ 3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/
     1.4  3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/x86/Rules.mk
     1.5  3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/x86/acpi.c
     1.6  3ddb79bcsjinG9k1KcvbVBuas1R2dA xen/arch/x86/apic.c
     1.7 +42360b3244-Q6BpEKhR_A1YtG1wPNQ xen/arch/x86/audit.c
     1.8  3ddb79c4yGZ7_22QAFFwPzqP4NSHwA xen/arch/x86/boot/mkelf32.c
     1.9  3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/x86/boot/x86_32.S
    1.10  40e42bdbNu4MjI750THP_8J1S-Sa0g xen/arch/x86/boot/x86_64.S
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/audit.c	Mon Mar 14 22:07:47 2005 +0000
     2.3 @@ -0,0 +1,817 @@
     2.4 +/******************************************************************************
     2.5 + * arch/x86/audit.c
     2.6 + * 
     2.7 + * Copyright (c) 2002-2005 K A Fraser
     2.8 + * Copyright (c) 2004 Christian Limpach
     2.9 + * Copyright (c) 2005 Michael A Fetterman
    2.10 + * 
    2.11 + * This program is free software; you can redistribute it and/or modify
    2.12 + * it under the terms of the GNU General Public License as published by
    2.13 + * the Free Software Foundation; either version 2 of the License, or
    2.14 + * (at your option) any later version.
    2.15 + * 
    2.16 + * This program is distributed in the hope that it will be useful,
    2.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    2.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    2.19 + * GNU General Public License for more details.
    2.20 + * 
    2.21 + * You should have received a copy of the GNU General Public License
    2.22 + * along with this program; if not, write to the Free Software
    2.23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    2.24 + */
    2.25 +
    2.26 +#include <xen/config.h>
    2.27 +#include <xen/init.h>
    2.28 +#include <xen/kernel.h>
    2.29 +#include <xen/lib.h>
    2.30 +#include <xen/mm.h>
    2.31 +//#include <xen/sched.h>
    2.32 +//#include <xen/errno.h>
    2.33 +#include <xen/perfc.h>
    2.34 +//#include <xen/irq.h>
    2.35 +//#include <xen/softirq.h>
    2.36 +#include <asm/shadow.h>
    2.37 +#include <asm/page.h>
    2.38 +#include <asm/flushtlb.h>
    2.39 +//#include <asm/io.h>
    2.40 +//#include <asm/uaccess.h>
    2.41 +//#include <asm/domain_page.h>
    2.42 +//#include <asm/ldt.h>
    2.43 +
    2.44 +// XXX SMP bug -- these should not be statics...
    2.45 +//
    2.46 +static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
    2.47 +static int l1, l2, oos_count, page_count;
    2.48 +
    2.49 +#define FILE_AND_LINE 1
    2.50 +
    2.51 +#if FILE_AND_LINE
    2.52 +#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__)
    2.53 +#define ADJUST_EXTRA_ARGS ,const char *file, int line
    2.54 +#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line)
    2.55 +#else
    2.56 +#define adjust _adjust
    2.57 +#define ADJUST_EXTRA_ARGS
    2.58 +#define APRINTK(_f, _a...) printk(_f "\n", ##_a)
    2.59 +#endif
    2.60 +
    2.61 +int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
    2.62 +{
    2.63 +    int errors = 0;
    2.64 +    int shadow_enabled = shadow_mode_enabled(d) ? 1 : 0;
    2.65 +
    2.66 +    void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS)
    2.67 +    {
    2.68 +        if ( adjtype )
    2.69 +        {
    2.70 +            // adjust the type count
    2.71 +            //
    2.72 +            int tcount = page->u.inuse.type_info & PGT_count_mask;
    2.73 +            tcount += dir;
    2.74 +            ttot++;
    2.75 +
    2.76 +            if ( page_get_owner(page) == NULL )
    2.77 +            {
    2.78 +                APRINTK("adjust(mfn=%p, dir=%d, adjtype=%d) owner=NULL",
    2.79 +                        page_to_pfn(page), dir, adjtype, file, line);
    2.80 +                errors++;
    2.81 +            }
    2.82 +
    2.83 +            if ( tcount < 0 )
    2.84 +            {
    2.85 +                APRINTK("Audit %d: type count went below zero mfn=%x t=%x ot=%x",
    2.86 +                        d->id, page-frame_table,
    2.87 +                        page->u.inuse.type_info,
    2.88 +                        page->tlbflush_timestamp);
    2.89 +                errors++;
    2.90 +            }
    2.91 +            else if ( (tcount & ~PGT_count_mask) != 0 )
    2.92 +            {
    2.93 +                APRINTK("Audit %d: type count overflowed mfn=%x t=%x ot=%x",
    2.94 +                        d->id, page-frame_table,
    2.95 +                        page->u.inuse.type_info,
    2.96 +                        page->tlbflush_timestamp);
    2.97 +                errors++;
    2.98 +            }
    2.99 +            else
   2.100 +                page->u.inuse.type_info += dir;
   2.101 +        }
   2.102 +
   2.103 +        // adjust the general count
   2.104 +        //
   2.105 +        int count = page->count_info & PGC_count_mask;
   2.106 +        count += dir;
   2.107 +        ctot++;
   2.108 +
   2.109 +        if ( count < 0 )
   2.110 +        {
   2.111 +            APRINTK("Audit %d: general count went below zero pfn=%x t=%x ot=%x",
   2.112 +                    d->id, page-frame_table,
   2.113 +                    page->u.inuse.type_info,
   2.114 +                    page->tlbflush_timestamp);
   2.115 +            errors++;
   2.116 +        }
   2.117 +        else if ( (count & ~PGT_count_mask) != 0 )
   2.118 +        {
   2.119 +            APRINTK("Audit %d: general count overflowed pfn=%x t=%x ot=%x",
   2.120 +                    d->id, page-frame_table,
   2.121 +                    page->u.inuse.type_info,
   2.122 +                    page->tlbflush_timestamp);
   2.123 +            errors++;
   2.124 +        }
   2.125 +        else
   2.126 +            page->count_info += dir;
   2.127 +    }
   2.128 +
   2.129 +    void adjust_l2_page(unsigned long mfn, int adjtype)
   2.130 +    {
   2.131 +        unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT);
   2.132 +        int i, limit;
   2.133 +
   2.134 +        if ( shadow_mode_external(d) )
   2.135 +            limit = L2_PAGETABLE_ENTRIES;
   2.136 +        else
   2.137 +            limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   2.138 +
   2.139 +        for ( i = 0; i < limit; i++ )
   2.140 +        {
   2.141 +            if ( pt[i] & _PAGE_PRESENT )
   2.142 +            {
   2.143 +                unsigned long l1mfn = pt[i] >> PAGE_SHIFT;
   2.144 +                struct pfn_info *l1page = pfn_to_page(l1mfn);
   2.145 +
   2.146 +                if ( noisy )
   2.147 +                {
   2.148 +                    if ( shadow_enabled )
   2.149 +                    {
   2.150 +                        if ( page_get_owner(l1page) != NULL )
   2.151 +                        {
   2.152 +                            printk("L2: Bizarre shadow L1 page mfn=%p "
   2.153 +                                   "belonging to a domain %p (id=%d)\n",
   2.154 +                                   l1mfn,
   2.155 +                                   page_get_owner(l1page),
   2.156 +                                   page_get_owner(l1page)->id);
   2.157 +                            errors++;
   2.158 +                            continue;
   2.159 +                        }
   2.160 +                    }
   2.161 +                    else
   2.162 +                    {
   2.163 +                        if ( page_get_owner(l1page) != d )
   2.164 +                        {
   2.165 +                            printk("L2: Skip bizarre L1 page mfn=%p "
   2.166 +                                   "belonging to other dom %p (id=%d)\n",
   2.167 +                                   l1mfn,
   2.168 +                                   page_get_owner(l1page),
   2.169 +                                   page_get_owner(l1page)->id);
   2.170 +                            errors++;
   2.171 +                            continue;
   2.172 +                        }
   2.173 +
   2.174 +                        u32 page_type = l1page->u.inuse.type_info & PGT_type_mask;
   2.175 +
   2.176 +                        if ( page_type == PGT_l2_page_table )
   2.177 +                        {
   2.178 +                            printk("Audit %d: [%x] Found %s Linear PT "
   2.179 +                                   "t=%x mfn=%p\n",
   2.180 +                                   d->id, i, (l1mfn==mfn) ? "Self" : "Other",
   2.181 +                                   l1page->u.inuse.type_info, l1mfn);
   2.182 +                        }
   2.183 +                        else if ( page_type != PGT_l1_page_table )
   2.184 +                        {
   2.185 +                            printk("Audit %d: [L2 mfn=%p i=%x] "
   2.186 +                                   "Expected L1 t=%x mfn=%p\n",
   2.187 +                                   d->id, mfn, i,
   2.188 +                                   l1page->u.inuse.type_info, l1mfn);
   2.189 +                            errors++;
   2.190 +                        }
   2.191 +                    }
   2.192 +                }
   2.193 +
   2.194 +                adjust(l1page, adjtype);
   2.195 +            }
   2.196 +        }
   2.197 +
   2.198 +        unmap_domain_mem(pt);
   2.199 +    }
   2.200 +
   2.201 +    void adjust_l1_page(unsigned long l1mfn)
   2.202 +    {
   2.203 +        unsigned long *pt = map_domain_mem(l1mfn << PAGE_SHIFT);
   2.204 +        int i;
   2.205 +
   2.206 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   2.207 +        {
   2.208 +            if ( pt[i] & _PAGE_PRESENT )
   2.209 +            {
   2.210 +                unsigned long gmfn = pt[i] >> PAGE_SHIFT;
   2.211 +                struct pfn_info *gpage = pfn_to_page(gmfn);
   2.212 +
   2.213 +                if ( gmfn < 0x100 )
   2.214 +                {
   2.215 +                    lowmem_mappings++;
   2.216 +                    continue;
   2.217 +                }
   2.218 +
   2.219 +                if ( gmfn > max_page )
   2.220 +                {
   2.221 +                    io_mappings++;
   2.222 +                    continue;
   2.223 +                }
   2.224 +
   2.225 +                if ( noisy )
   2.226 +                {
   2.227 +                    if ( pt[i] & _PAGE_RW )
   2.228 +                    {
   2.229 +                        // If it's not a writable page, complain.
   2.230 +                        //
   2.231 +                        if ( !((gpage->u.inuse.type_info & PGT_type_mask) ==
   2.232 +                               PGT_writable_page) )
   2.233 +                        {
   2.234 +                            printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW "
   2.235 +                                   "t=%x mfn=%p\n",
   2.236 +                                   d->id, l1mfn, i,
   2.237 +                                   gpage->u.inuse.type_info, gmfn);
   2.238 +                            errors++;
   2.239 +                        }
   2.240 +
   2.241 +                        if ( shadow_enabled &&
   2.242 +                             page_is_page_table(gpage) &&
   2.243 +                             ! page_out_of_sync(gpage) )
   2.244 +                        {
   2.245 +                            printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW of "
   2.246 +                                   "page table gmfn=%p\n",
   2.247 +                                   d->id, l1mfn, i, gmfn);
   2.248 +                            errors++;
   2.249 +                        }
   2.250 +                    }
   2.251 +
   2.252 +                    if ( page_get_owner(gpage) != d )
   2.253 +                    {
   2.254 +                        printk("Audit %d: [l1mfn=%p,i=%x] Skip foreign page "
   2.255 +                               "dom=%p (id=%d) mfn=%p c=%08x t=%08x\n",
   2.256 +                               d->id, l1mfn, i,
   2.257 +                               page_get_owner(gpage),
   2.258 +                               page_get_owner(gpage)->id,
   2.259 +                               gmfn,
   2.260 +                               gpage->count_info,
   2.261 +                               gpage->u.inuse.type_info);
   2.262 +                        continue;
   2.263 +                    }
   2.264 +                }
   2.265 +
   2.266 +                adjust(gpage, (pt[i] & _PAGE_RW) ? 1 : 0);
   2.267 +            }
   2.268 +        }
   2.269 +
   2.270 +        unmap_domain_mem(pt);
   2.271 +    }
   2.272 +
   2.273 +    void adjust_shadow_tables()
   2.274 +    {
   2.275 +        struct shadow_status *a;
   2.276 +        unsigned long smfn, gmfn;
   2.277 +        struct pfn_info *page;
   2.278 +        int i;
   2.279 +
   2.280 +        for ( i = 0; i < shadow_ht_buckets; i++ )
   2.281 +        {
   2.282 +            a = &d->arch.shadow_ht[i];
   2.283 +            while ( a && a->gpfn_and_flags )
   2.284 +            {
   2.285 +                gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
   2.286 +                smfn = a->smfn;
   2.287 +                page = &frame_table[smfn];
   2.288 +
   2.289 +                adjust(pfn_to_page(gmfn), 0);
   2.290 +
   2.291 +                switch ( a->gpfn_and_flags & PGT_type_mask ) {
   2.292 +                case PGT_snapshot:
   2.293 +                    break;
   2.294 +                case PGT_l1_shadow:
   2.295 +                case PGT_hl2_shadow:
   2.296 +                    adjust_l1_page(smfn);
   2.297 +                    if ( page->u.inuse.type_info & PGT_pinned )
   2.298 +                        adjust(page, 0);
   2.299 +                    break;
   2.300 +                case PGT_l2_shadow:
   2.301 +                    adjust_l2_page(smfn, 0);
   2.302 +                    if ( page->u.inuse.type_info & PGT_pinned )
   2.303 +                        adjust(page, 0);
   2.304 +                    break;
   2.305 +                default:
   2.306 +                    BUG();
   2.307 +                    break;
   2.308 +                }
   2.309 +
   2.310 +                a = a->next;
   2.311 +            }
   2.312 +        }
   2.313 +    }
   2.314 +
   2.315 +    void adjust_oos_list()
   2.316 +    {
   2.317 +        struct out_of_sync_entry *oos;
   2.318 +
   2.319 +        if ( (oos = d->arch.out_of_sync) )
   2.320 +            ASSERT(shadow_enabled);
   2.321 +
   2.322 +        while ( oos )
   2.323 +        {
   2.324 +            adjust(pfn_to_page(oos->gmfn), 0);
   2.325 +
   2.326 +            // Only use entries that have low bits clear...
   2.327 +            //
   2.328 +            if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
   2.329 +                adjust(pfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0);
   2.330 +
   2.331 +            oos = oos->next;
   2.332 +            oos_count++;
   2.333 +        }
   2.334 +    }
   2.335 +
   2.336 +    void adjust_for_pgtbase()
   2.337 +    {
   2.338 +        struct exec_domain *ed;
   2.339 +
   2.340 +        for_each_exec_domain(d, ed)
   2.341 +            {
   2.342 +                if ( !shadow_enabled )
   2.343 +                {
   2.344 +                    if ( pagetable_val(ed->arch.guest_table) )
   2.345 +                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
   2.346 +                                            >> PAGE_SHIFT], 1);
   2.347 +                }
   2.348 +                else
   2.349 +                {
   2.350 +                    if ( pagetable_val(ed->arch.guest_table) )
   2.351 +                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
   2.352 +                                            >> PAGE_SHIFT], 0);
   2.353 +                    if ( pagetable_val(ed->arch.shadow_table) )
   2.354 +                        adjust(&frame_table[pagetable_val(ed->arch.shadow_table)
   2.355 +                                            >> PAGE_SHIFT], 0);
   2.356 +                }
   2.357 +            }
   2.358 +    }
   2.359 +
   2.360 +    void adjust_guest_pages()
   2.361 +    {
   2.362 +        struct list_head *list_ent = d->page_list.next;
   2.363 +        struct pfn_info *page;
   2.364 +        unsigned long mfn;
   2.365 +
   2.366 +        while ( list_ent != &d->page_list )
   2.367 +        {
   2.368 +            u32 page_type;
   2.369 +
   2.370 +            page = list_entry(list_ent, struct pfn_info, list);
   2.371 +            mfn = page_to_pfn(page);
   2.372 +            page_type = page->u.inuse.type_info & PGT_type_mask;
   2.373 +
   2.374 +            if ( page_get_owner(page) != d )
   2.375 +                BUG();
   2.376 +
   2.377 +            page_count++;
   2.378 +
   2.379 +            switch ( page_type )
   2.380 +            {
   2.381 +            case PGT_l2_page_table:
   2.382 +                l2++;
   2.383 +
   2.384 +                if ( noisy )
   2.385 +                {
   2.386 +                    if ( shadow_enabled )
   2.387 +                    {
   2.388 +                        printk("Audit %d: found an L2 guest page "
   2.389 +                               "mfn=%p t=%08x c=%08x while in shadow mode\n",
    2.390 +                               d->id, mfn, page->u.inuse.type_info, page->count_info);
   2.391 +                        errors++;
   2.392 +                    }
   2.393 +
   2.394 +                    if ( (page->u.inuse.type_info & PGT_validated) !=
   2.395 +                         PGT_validated )
   2.396 +                    {
   2.397 +                        printk("Audit %d: L2 mfn=%p not validated %p\n",
   2.398 +                               d->id, mfn, page->u.inuse.type_info);
   2.399 +                        errors++;
   2.400 +                    }
   2.401 +
   2.402 +                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
   2.403 +                    {
   2.404 +                        printk("Audit %d: L2 mfn=%p not pinned t=%p\n",
   2.405 +                               d->id, mfn, page->u.inuse.type_info);
   2.406 +                        errors++;
   2.407 +                    }
   2.408 +                }
   2.409 +
   2.410 +                if ( page->u.inuse.type_info & PGT_pinned )
   2.411 +                    adjust(page, 1);
   2.412 +
   2.413 +                if ( page->u.inuse.type_info & PGT_validated )
   2.414 +                    adjust_l2_page(mfn, 1);
   2.415 +
   2.416 +                break;
   2.417 +
   2.418 +            case PGT_l1_page_table:
   2.419 +                l1++;
   2.420 +
   2.421 +                if ( noisy )
   2.422 +                {
   2.423 +                    if ( shadow_enabled )
   2.424 +                    {
   2.425 +                        printk("found an L1 guest page mfn=%p t=%08x c=%08x while in shadow mode\n",
   2.426 +                               mfn, page->u.inuse.type_info, page->count_info);
   2.427 +                        errors++;
   2.428 +                    }
   2.429 +
   2.430 +                    if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
   2.431 +                    {
   2.432 +                        printk("Audit %d: L1 not validated mfn=%p t=%p\n",
   2.433 +                               d->id, mfn, page->u.inuse.type_info);
   2.434 +                        errors++;
   2.435 +                    }
   2.436 +
   2.437 +                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
   2.438 +                    {
   2.439 +                        if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
   2.440 +                        {
   2.441 +                            printk("Audit %d: L1 mfn=%p not pinned t=%p\n",
   2.442 +                                   d->id, mfn, page->u.inuse.type_info);
   2.443 +                            errors++;
   2.444 +                        }
   2.445 +                    }
   2.446 +                }
   2.447 +                
   2.448 +                if ( page->u.inuse.type_info & PGT_pinned )
   2.449 +                    adjust(page, 1);
   2.450 +
   2.451 +                if ( page->u.inuse.type_info & PGT_validated )
   2.452 +                    adjust_l1_page(mfn);
   2.453 +
   2.454 +                break;
   2.455 +
   2.456 +            case PGT_gdt_page:
   2.457 +                ASSERT( !page_out_of_sync(page) );
   2.458 +                adjust(page, 1);
   2.459 +                break;
   2.460 +
   2.461 +            case PGT_ldt_page:
   2.462 +                ASSERT( !page_out_of_sync(page) );
   2.463 +                adjust(page, 1);
   2.464 +                break;
   2.465 +
   2.466 +            case PGT_writable_page:
   2.467 +                if ( shadow_enabled )
   2.468 +                {
   2.469 +                    // In shadow mode, writable pages can get pinned by
   2.470 +                    // paravirtualized guests that think they are pinning
   2.471 +                    // their L1s and/or L2s.
   2.472 +                    //
   2.473 +                    if ( page->u.inuse.type_info & PGT_pinned )
   2.474 +                        adjust(page, 1);
   2.475 +                }
   2.476 +            }
   2.477 +
   2.478 +            list_ent = page->list.next;
   2.479 +        }
   2.480 +    }
   2.481 +
   2.482 +    adjust_for_pgtbase();
   2.483 +
   2.484 +    adjust_guest_pages();
   2.485 +
   2.486 +    if ( shadow_enabled )
   2.487 +    {
   2.488 +        adjust_oos_list();
   2.489 +        adjust_shadow_tables();
   2.490 +    }
   2.491 +
   2.492 +    return errors;
   2.493 +}
   2.494 +
   2.495 +
   2.496 +#ifndef NDEBUG
   2.497 +
   2.498 +void _audit_domain(struct domain *d, int flags, const char *file, int line)
   2.499 +{
   2.500 +    void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
   2.501 +                             unsigned long mfn)
   2.502 +    {
   2.503 +        struct pfn_info *page = &frame_table[mfn];
    2.504 +        unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT);
   2.505 +        int i;
   2.506 +
   2.507 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   2.508 +        {
   2.509 +            if ( (pt[i] & _PAGE_PRESENT) && ((pt[i] >> PAGE_SHIFT) == xmfn) )
   2.510 +                printk("     found dom=%d mfn=%p t=%x c=%x pt[i=%x]=%p\n",
   2.511 +                       d->id, mfn, page->u.inuse.type_info,
   2.512 +                       page->count_info, i, pt[i]);
   2.513 +        }
   2.514 +
   2.515 +        unmap_domain_mem(pt);           
   2.516 +    }
   2.517 +
   2.518 +    void scan_for_pfn(struct domain *d, unsigned long xmfn)
   2.519 +    {
   2.520 +        if ( !shadow_mode_enabled(d) )
   2.521 +        {
   2.522 +            struct list_head *list_ent = d->page_list.next;
   2.523 +            struct pfn_info *page;
   2.524 +
   2.525 +            while ( list_ent != &d->page_list )
   2.526 +            {
   2.527 +                page = list_entry(list_ent, struct pfn_info, list);
   2.528 +
   2.529 +                switch ( page->u.inuse.type_info & PGT_type_mask )
   2.530 +                {
   2.531 +                case PGT_l1_page_table:
   2.532 +                case PGT_l2_page_table:
   2.533 +                    scan_for_pfn_in_mfn(d, xmfn, page_to_pfn(page));
   2.534 +                    break;
   2.535 +                default:
   2.536 +                    break;
   2.537 +                }
   2.538 +
   2.539 +                list_ent = page->list.next;
   2.540 +            }
   2.541 +        }
   2.542 +        else
   2.543 +        {
   2.544 +            struct shadow_status *a;
   2.545 +            int i;
   2.546 +            
   2.547 +            for ( i = 0; i < shadow_ht_buckets; i++ )
   2.548 +            {
   2.549 +                a = &d->arch.shadow_ht[i];
   2.550 +                while ( a && a->gpfn_and_flags )
   2.551 +                {
   2.552 +                    switch ( a->gpfn_and_flags & PGT_type_mask )
   2.553 +                    {
   2.554 +                    case PGT_l1_shadow:
   2.555 +                    case PGT_l2_shadow:
   2.556 +                    case PGT_hl2_shadow:
   2.557 +                        scan_for_pfn_in_mfn(d, xmfn, a->smfn);
   2.558 +                        break;
   2.559 +                    case PGT_snapshot:
   2.560 +                        break;
   2.561 +                    default:
   2.562 +                        BUG();
   2.563 +                        break;
   2.564 +                    }
   2.565 +                    a = a->next;
   2.566 +                }
   2.567 +            }
   2.568 +        }
   2.569 +    }
   2.570 +
   2.571 +    void scan_for_pfn_remote(unsigned long xmfn)
   2.572 +    {
   2.573 +        struct domain *e;
   2.574 +        for_each_domain ( e )
   2.575 +            scan_for_pfn( e, xmfn );
   2.576 +    } 
   2.577 +
   2.578 +    unsigned long mfn;
   2.579 +    struct list_head *list_ent;
   2.580 +    struct pfn_info *page;
   2.581 +    int errors = 0;
   2.582 +
   2.583 +    if ( d != current->domain )
   2.584 +        domain_pause(d);
   2.585 +    synchronise_pagetables(~0UL);
   2.586 +
   2.587 +    // Maybe we should just be using BIGLOCK?
   2.588 +    //
   2.589 +    if ( !(flags & AUDIT_ALREADY_LOCKED) )
   2.590 +        shadow_lock(d);
   2.591 +
   2.592 +    spin_lock(&d->page_alloc_lock);
   2.593 +
   2.594 +    /* PHASE 0 */
   2.595 +
   2.596 +    list_ent = d->page_list.next;
   2.597 +    while ( list_ent != &d->page_list )
   2.598 +    {
   2.599 +        u32 page_type;
   2.600 +
   2.601 +        page = list_entry(list_ent, struct pfn_info, list);
   2.602 +        mfn = page_to_pfn(page);
   2.603 +        page_type = page->u.inuse.type_info & PGT_type_mask;
   2.604 +
   2.605 +        if ( page_get_owner(page) != d )
   2.606 +            BUG();
   2.607 +
   2.608 +        if ( (page->u.inuse.type_info & PGT_count_mask) >
   2.609 +             (page->count_info & PGC_count_mask) )
   2.610 +        {
   2.611 +            printk("taf(%08x) > caf(%08x) mfn=%p\n",
   2.612 +                   page->u.inuse.type_info, page->count_info, mfn);
   2.613 +            errors++;
   2.614 +        }
   2.615 +
   2.616 +        if ( shadow_mode_enabled(d) &&
   2.617 +             (page_type == PGT_writable_page) &&
   2.618 +             !(page->u.inuse.type_info & PGT_validated) )
   2.619 +        {
   2.620 +            printk("shadow mode writable page not validated mfn=%p t=%08x c=%08x\n",
   2.621 +                   mfn, page->u.inuse.type_info, page->count_info);
   2.622 +            errors++;
   2.623 +        }
   2.624 + 
   2.625 +#if 0   /* SYSV shared memory pages plus writeable files. */
   2.626 +        if ( page_type == PGT_writable_page && 
   2.627 +             (page->u.inuse.type_info & PGT_count_mask) > 1 )
   2.628 +        {
   2.629 +            printk("writeable page with type count >1: mfn=%lx t=%x c=%x\n",
   2.630 +                  mfn,
   2.631 +                  page->u.inuse.type_info,
   2.632 +                  page->count_info );
   2.633 +            errors++;
   2.634 +            scan_for_pfn_remote(mfn);
   2.635 +        }
   2.636 +#endif
   2.637 +
   2.638 +        if ( page_type == PGT_none && 
   2.639 +             (page->u.inuse.type_info & PGT_count_mask) > 0 )
   2.640 +        {
   2.641 +            printk("normal page with type count >0: mfn=%lx t=%x c=%x\n",
   2.642 +                  mfn,
   2.643 +                  page->u.inuse.type_info,
   2.644 +                  page->count_info );
   2.645 +            errors++;
   2.646 +        }
   2.647 +
   2.648 +        if ( page_out_of_sync(page) )
   2.649 +        {
   2.650 +            if ( !page_is_page_table(page) )
   2.651 +            {
   2.652 +                printk("out of sync page mfn=%p is not a page table\n", mfn);
   2.653 +                errors++;
   2.654 +            }
   2.655 +            unsigned long pfn = __mfn_to_gpfn(d, mfn);
   2.656 +            if ( !__shadow_status(d, pfn, PGT_snapshot) )
   2.657 +            {
    2.658 +                printk("out of sync page mfn=%p doesn't have a snapshot\n", mfn);
   2.659 +                errors++;
   2.660 +            }
   2.661 +            if ( page_type != PGT_writable_page )
   2.662 +            {
   2.663 +                printk("out of sync page mfn=%p has strange type t=%08x c=%08x\n",
   2.664 +                       mfn, page->u.inuse.type_info, page->count_info);
   2.665 +                errors++;
   2.666 +            }
   2.667 +        }
   2.668 +
   2.669 +        /* Use tlbflush_timestamp to store original type_info. */
   2.670 +        page->tlbflush_timestamp = page->u.inuse.type_info;
   2.671 +
   2.672 +        list_ent = page->list.next;
   2.673 +    }
   2.674 +
   2.675 +    /* PHASE 1 */
   2.676 +    io_mappings = lowmem_mappings = 0;
   2.677 +
   2.678 +    errors += audit_adjust_pgtables(d, -1, 1);
   2.679 +
   2.680 +    if ( !(flags & AUDIT_QUIET) &&
   2.681 +         ((io_mappings > 0) || (lowmem_mappings > 0)) )
   2.682 +        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
   2.683 +               d->id, lowmem_mappings, io_mappings);
   2.684 +
   2.685 +    /* PHASE 2 */
   2.686 +
   2.687 +    list_ent = d->page_list.next;
   2.688 +    while ( list_ent != &d->page_list )
   2.689 +    {
   2.690 +        page = list_entry(list_ent, struct pfn_info, list);
   2.691 +        mfn = page_to_pfn(page);
   2.692 +
   2.693 +        switch ( page->u.inuse.type_info & PGT_type_mask)
   2.694 +        {
   2.695 +        case PGT_l1_page_table:
   2.696 +        case PGT_l2_page_table:
   2.697 +            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
   2.698 +            {
   2.699 +                printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n",
   2.700 +                       d->id, page->u.inuse.type_info, 
   2.701 +                       page->tlbflush_timestamp,
   2.702 +                       page->count_info, mfn);
   2.703 +                errors++;
   2.704 +                scan_for_pfn_remote(mfn);
   2.705 +            }
   2.706 +            break;
   2.707 +        case PGT_none:
   2.708 +        case PGT_writable_page:
   2.709 +        case PGT_gdt_page:
   2.710 +        case PGT_ldt_page:
   2.711 +            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
   2.712 +            {
   2.713 +                printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n",
   2.714 +                       d->id, page->u.inuse.type_info, 
   2.715 +                       page->tlbflush_timestamp,
   2.716 +                       page->count_info, mfn);
   2.717 +                errors++;
   2.718 +            }
   2.719 +            break;
   2.720 +        default:
   2.721 +            BUG(); // XXX fix me...
   2.722 +        }
   2.723 +        
   2.724 +        if ( (page->count_info & PGC_count_mask) != 1 )
   2.725 +        {
   2.726 +            printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x mfn=%lx\n",
   2.727 +                   d->id,
   2.728 +                   page->count_info,
   2.729 +                   page->u.inuse.type_info, 
   2.730 +                   page->tlbflush_timestamp, mfn );
   2.731 +            errors++;
   2.732 +            scan_for_pfn_remote(mfn);
   2.733 +        }
   2.734 +
   2.735 +        list_ent = page->list.next;
   2.736 +    }
   2.737 +
   2.738 +    if ( shadow_mode_enabled(d) )
   2.739 +    {
   2.740 +        struct shadow_status *a;
   2.741 +        struct pfn_info *page;
   2.742 +        u32 page_type;
   2.743 +        int i;
   2.744 +
   2.745 +        for ( i = 0; i < shadow_ht_buckets; i++ )
   2.746 +        {
   2.747 +            a = &d->arch.shadow_ht[i];
   2.748 +            while ( a && a->gpfn_and_flags )
   2.749 +            {
   2.750 +                page = pfn_to_page(a->smfn);
   2.751 +                page_type = a->gpfn_and_flags & PGT_type_mask;
   2.752 +
   2.753 +                switch ( page_type ) {
   2.754 +                case PGT_snapshot:
   2.755 +                    // XXX -- what should we check here?
   2.756 +                    break;
   2.757 +                case PGT_l1_shadow:
   2.758 +                case PGT_l2_shadow:
   2.759 +                    if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) ||
   2.760 +                         (page->count_info != 0) )
   2.761 +                    {
   2.762 +                        printk("Audit %d: shadow page counts wrong mfn=%p t=%x c=%x\n",
   2.763 +                               d->id, page_to_pfn(page),
   2.764 +                               page->u.inuse.type_info,
   2.765 +                               page->count_info);
   2.766 +                        errors++;
   2.767 +                    }
   2.768 +                    break;
   2.769 +
   2.770 +                case PGT_hl2_shadow: // haven't thought about this case yet.
   2.771 +                default:
   2.772 +                    BUG();
   2.773 +                    break;
   2.774 +                }
   2.775 +
   2.776 +                a = a->next;
   2.777 +            }
   2.778 +        }
   2.779 +    }
   2.780 +
   2.781 +    /* PHASE 3 */
   2.782 +    ctot = ttot = page_count = l1 = l2 = oos_count = 0;
   2.783 +
   2.784 +    audit_adjust_pgtables(d, 1, 0);
   2.785 +
   2.786 +#if 0
   2.787 +    // This covers our sins of trashing the tlbflush_timestamps...
   2.788 +    //
   2.789 +    local_flush_tlb();
   2.790 +#endif
   2.791 +
   2.792 +    spin_unlock(&d->page_alloc_lock);
   2.793 +
   2.794 +    if ( !(flags & AUDIT_QUIET) )
   2.795 +        printk("Audit dom%d (%s:%d) Done. "
   2.796 +               "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
   2.797 +               d->id, file, line, page_count, oos_count, l1, l2, ctot, ttot );
   2.798 +
   2.799 +    if ( !(flags & AUDIT_ALREADY_LOCKED) )
   2.800 +        shadow_unlock(d);
   2.801 +
   2.802 +    if ( d != current->domain )
   2.803 +        domain_unpause(d);
   2.804 +
   2.805 +    if ( errors && !(flags & AUDIT_ERRORS_OK) )
   2.806 +        BUG();
   2.807 +}
   2.808 +
   2.809 +void audit_domains(void)
   2.810 +{
   2.811 +    struct domain *d;
   2.812 +    for_each_domain ( d )
   2.813 +        audit_domain(d);
   2.814 +}
   2.815 +
   2.816 +void audit_domains_key(unsigned char key)
   2.817 +{
   2.818 +    audit_domains();
   2.819 +}
   2.820 +#endif
     3.1 --- a/xen/arch/x86/domain.c	Mon Mar 14 18:44:10 2005 +0000
     3.2 +++ b/xen/arch/x86/domain.c	Mon Mar 14 22:07:47 2005 +0000
     3.3 @@ -247,10 +247,9 @@ void arch_do_createdomain(struct exec_do
     3.4          machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >> 
     3.5                                 PAGE_SHIFT] = INVALID_M2P_ENTRY;
     3.6          ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
     3.7 -#if 0 /* don't need this yet, but maybe soon! */
     3.8 -        ed->arch.guest_vtable = linear_l2_table;
     3.9 -        ed->arch.shadow_vtable = shadow_linear_l2_table;
    3.10 -#endif
    3.11 +
    3.12 +        ed->arch.guest_vtable  = __linear_l2_table;
    3.13 +        ed->arch.shadow_vtable = __shadow_linear_l2_table;
    3.14  
    3.15  #ifdef __x86_64__
    3.16          d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
    3.17 @@ -295,70 +294,6 @@ void arch_vmx_do_launch(struct exec_doma
    3.18      reset_stack_and_jump(vmx_asm_do_launch);
    3.19  }
    3.20  
    3.21 -unsigned long alloc_monitor_pagetable(struct exec_domain *ed)
    3.22 -{
    3.23 -    unsigned long mmfn;
    3.24 -    l2_pgentry_t *mpl2e;
    3.25 -    struct pfn_info *mmfn_info;
    3.26 -    struct domain *d = ed->domain;
    3.27 -
    3.28 -    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
    3.29 -
    3.30 -    mmfn_info = alloc_domheap_page(NULL);
    3.31 -    ASSERT( mmfn_info ); 
    3.32 -
    3.33 -    mmfn = (unsigned long) (mmfn_info - frame_table);
    3.34 -    mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
    3.35 -    memset(mpl2e, 0, PAGE_SIZE);
    3.36 -
    3.37 -    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
    3.38 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
    3.39 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
    3.40 -
    3.41 -    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
    3.42 -        mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
    3.43 -                      | __PAGE_HYPERVISOR);
    3.44 -
    3.45 -    ed->arch.monitor_vtable = mpl2e;
    3.46 -
    3.47 -    // map the phys_to_machine map into the Read-Only MPT space for this domain
    3.48 -    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
    3.49 -        mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
    3.50 -
    3.51 -    return mmfn;
    3.52 -}
    3.53 -
    3.54 -/*
    3.55 - * Free the pages for monitor_table and hl2_table
    3.56 - */
    3.57 -static void free_monitor_pagetable(struct exec_domain *ed)
    3.58 -{
    3.59 -    l2_pgentry_t *mpl2e;
    3.60 -    unsigned long mfn;
    3.61 -
    3.62 -    ASSERT( pagetable_val(ed->arch.monitor_table) );
    3.63 -    
    3.64 -    mpl2e = ed->arch.monitor_vtable;
    3.65 -
    3.66 -    /*
    3.67 -     * First get the mfn for hl2_table by looking at monitor_table
    3.68 -     */
    3.69 -    mfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])
    3.70 -        >> PAGE_SHIFT;
    3.71 -
    3.72 -    free_domheap_page(&frame_table[mfn]);
    3.73 -    unmap_domain_mem(mpl2e);
    3.74 -
    3.75 -    /*
    3.76 -     * Then free monitor_table.
    3.77 -     */
    3.78 -    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
    3.79 -    free_domheap_page(&frame_table[mfn]);
    3.80 -
    3.81 -    ed->arch.monitor_table = mk_pagetable(0);
    3.82 -    ed->arch.monitor_vtable = 0;
    3.83 -}
    3.84 -
    3.85  static int vmx_final_setup_guest(struct exec_domain *ed,
    3.86                                     full_execution_context_t *full_context)
    3.87  {
     4.1 --- a/xen/arch/x86/mm.c	Mon Mar 14 18:44:10 2005 +0000
     4.2 +++ b/xen/arch/x86/mm.c	Mon Mar 14 22:07:47 2005 +0000
     4.3 @@ -104,19 +104,12 @@
     4.4  
     4.5  #ifdef VERBOSE
     4.6  #define MEM_LOG(_f, _a...)                           \
     4.7 -  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
     4.8 +  printk("DOM%u: MEM_LOG(line=%d) " _f "\n", \
     4.9           current->domain->id , __LINE__ , ## _a )
    4.10  #else
    4.11  #define MEM_LOG(_f, _a...) ((void)0)
    4.12  #endif
    4.13  
    4.14 -static int alloc_l2_table(struct pfn_info *page);
    4.15 -static int alloc_l1_table(struct pfn_info *page);
    4.16 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
    4.17 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
    4.18 -                                         u32 type,
    4.19 -                                         struct domain *d);
    4.20 -
    4.21  static void free_l2_table(struct pfn_info *page);
    4.22  static void free_l1_table(struct pfn_info *page);
    4.23  
    4.24 @@ -222,7 +215,7 @@ static void __invalidate_shadow_ldt(stru
    4.25  }
    4.26  
    4.27  
    4.28 -static inline void invalidate_shadow_ldt(struct exec_domain *d)
    4.29 +void invalidate_shadow_ldt(struct exec_domain *d)
    4.30  {
    4.31      if ( d->arch.shadow_ldt_mapcnt != 0 )
    4.32          __invalidate_shadow_ldt(d);
    4.33 @@ -254,21 +247,41 @@ int map_ldt_shadow_page(unsigned int off
    4.34  {
    4.35      struct exec_domain *ed = current;
    4.36      struct domain *d = ed->domain;
    4.37 -    unsigned long l1e;
    4.38 +    unsigned long l1e, nl1e, gpfn, gmfn;
    4.39 +    unsigned gva = ed->arch.ldt_base + (off << PAGE_SHIFT);
    4.40 +    int res;
    4.41  
    4.42      if ( unlikely(in_irq()) )
    4.43          BUG();
    4.44  
    4.45 -    __get_user(l1e, (unsigned long *)
    4.46 -               &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
    4.47 -
    4.48 -    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
    4.49 -         unlikely(!get_page_and_type(
    4.50 -             &frame_table[l1_pgentry_to_pfn(mk_l1_pgentry(l1e))],
    4.51 -             d, PGT_ldt_page)) )
    4.52 +    shadow_sync_va(ed, gva);
    4.53 +    __get_user(l1e, (unsigned long *)&linear_pg_table[l1_linear_offset(gva)]);
    4.54 +
    4.55 +    if ( unlikely(!(l1e & _PAGE_PRESENT)) )
    4.56 +        return 0;
    4.57 +
    4.58 +    gpfn = l1_pgentry_to_pfn(mk_l1_pgentry(l1e));
    4.59 +    gmfn = __gpfn_to_mfn(d, gpfn);
    4.60 +    if ( unlikely(!gmfn) )
    4.61          return 0;
    4.62  
    4.63 -    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
    4.64 +    if ( unlikely(shadow_mode_enabled(d)) )
    4.65 +    {
    4.66 +        shadow_lock(d);
    4.67 +        shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn);
    4.68 +    }
    4.69 +
    4.70 +    res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
    4.71 +
    4.72 +    if ( unlikely(shadow_mode_enabled(d)) )
    4.73 +        shadow_unlock(d);
    4.74 +
    4.75 +    if ( unlikely(!res) )
    4.76 +        return 0;
    4.77 +
    4.78 +    nl1e = (l1e & ~PAGE_MASK) | (gmfn << PAGE_SHIFT) | _PAGE_RW;
    4.79 +
    4.80 +    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(nl1e);
    4.81      ed->arch.shadow_ldt_mapcnt++;
    4.82  
    4.83      return 1;
    4.84 @@ -337,6 +350,8 @@ get_linear_pagetable(
    4.85      struct pfn_info *page;
    4.86      unsigned long pfn;
    4.87  
    4.88 +    ASSERT( !shadow_mode_enabled(d) );
    4.89 +
    4.90      if ( (root_pgentry_val(re) & _PAGE_RW) )
    4.91      {
    4.92          MEM_LOG("Attempt to create linear p.t. with write perms");
    4.93 @@ -372,13 +387,13 @@ get_linear_pagetable(
    4.94  }
    4.95  
    4.96  
    4.97 -static int
    4.98 +int
    4.99  get_page_from_l1e(
   4.100      l1_pgentry_t l1e, struct domain *d)
   4.101  {
   4.102      unsigned long l1v = l1_pgentry_val(l1e);
   4.103 -    unsigned long pfn = l1_pgentry_to_pfn(l1e);
   4.104 -    struct pfn_info *page = &frame_table[pfn];
   4.105 +    unsigned long mfn = l1_pgentry_to_pfn(l1e);
   4.106 +    struct pfn_info *page = &frame_table[mfn];
   4.107      extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
   4.108  
   4.109      if ( !(l1v & _PAGE_PRESENT) )
   4.110 @@ -386,11 +401,11 @@ get_page_from_l1e(
   4.111  
   4.112      if ( unlikely(l1v & L1_DISALLOW_MASK) )
   4.113      {
   4.114 -        MEM_LOG("Bad L1 type settings %p", l1v & L1_DISALLOW_MASK);
   4.115 +        MEM_LOG("Bad L1 type settings %p %p", l1v, l1v & L1_DISALLOW_MASK);
   4.116          return 0;
   4.117      }
   4.118  
   4.119 -    if ( unlikely(!pfn_is_ram(pfn)) )
   4.120 +    if ( unlikely(!pfn_is_ram(mfn)) )
   4.121      {
   4.122          /* Revert to caller privileges if FD == DOMID_IO. */
   4.123          if ( d == dom_io )
   4.124 @@ -400,9 +415,9 @@ get_page_from_l1e(
   4.125              return 1;
   4.126  
   4.127          if ( IS_CAPABLE_PHYSDEV(d) )
   4.128 -            return domain_iomem_in_pfn(d, pfn);
   4.129 -
   4.130 -        MEM_LOG("Non-privileged attempt to map I/O space %p", pfn);
   4.131 +            return domain_iomem_in_pfn(d, mfn);
   4.132 +
   4.133 +        MEM_LOG("Non-privileged attempt to map I/O space %p", mfn);
   4.134          return 0;
   4.135      }
   4.136  
   4.137 @@ -420,6 +435,8 @@ get_page_from_l2e(
   4.138  {
   4.139      int rc;
   4.140  
   4.141 +    ASSERT( !shadow_mode_enabled(d) );
   4.142 +
   4.143      if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
   4.144          return 1;
   4.145  
   4.146 @@ -491,7 +508,7 @@ get_page_from_l4e(
   4.147  #endif /* __x86_64__ */
   4.148  
   4.149  
   4.150 -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
   4.151 +void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
   4.152  {
   4.153      unsigned long    l1v  = l1_pgentry_val(l1e);
   4.154      unsigned long    pfn  = l1_pgentry_to_pfn(l1e);
   4.155 @@ -530,6 +547,8 @@ static void put_page_from_l1e(l1_pgentry
   4.156          if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
   4.157                         PGT_ldt_page)) &&
   4.158               unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
   4.159 +
   4.160 +            // XXX SMP BUG?
   4.161              invalidate_shadow_ldt(e->exec_domain[0]);
   4.162          put_page(page);
   4.163      }
   4.164 @@ -575,6 +594,8 @@ static int alloc_l1_table(struct pfn_inf
   4.165      l1_pgentry_t  *pl1e;
   4.166      int            i;
   4.167  
   4.168 +    ASSERT( !shadow_mode_enabled(d) );
   4.169 +
   4.170      pl1e = map_domain_mem(pfn << PAGE_SHIFT);
   4.171  
   4.172      for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   4.173 @@ -601,6 +622,11 @@ static int alloc_l2_table(struct pfn_inf
   4.174      unsigned long  pfn = page_to_pfn(page);
   4.175      l2_pgentry_t  *pl2e;
   4.176      int            i;
   4.177 +
   4.178 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
   4.179 +         shadow_mode_enabled(d) )
   4.180 +        return 1;
   4.181 +    ASSERT( !shadow_mode_enabled(d) );
   4.182     
   4.183      pl2e = map_domain_mem(pfn << PAGE_SHIFT);
   4.184  
   4.185 @@ -643,6 +669,8 @@ static int alloc_l3_table(struct pfn_inf
   4.186      l3_pgentry_t  *pl3e = page_to_virt(page);
   4.187      int            i;
   4.188  
   4.189 +    ASSERT( !shadow_mode_enabled(d) );
   4.190 +
   4.191      for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
   4.192          if ( is_guest_l3_slot(i) &&
   4.193               unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
   4.194 @@ -666,6 +694,11 @@ static int alloc_l4_table(struct pfn_inf
   4.195      l4_pgentry_t  *pl4e = page_to_virt(page);
   4.196      int            i;
   4.197  
   4.198 +    if ( (PGT_base_page_table == PGT_l4_page_table) &&
   4.199 +         shadow_mode_enabled(d) )
   4.200 +        return 1;
   4.201 +    ASSERT( !shadow_mode_enabled(d) );
   4.202 +
   4.203      for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
   4.204          if ( is_guest_l4_slot(i) &&
   4.205               unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
   4.206 @@ -765,7 +798,7 @@ static inline int update_l1e(l1_pgentry_
   4.207      if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
   4.208           unlikely(o != l1_pgentry_val(ol1e)) )
   4.209      {
   4.210 -        MEM_LOG("Failed to update %p -> %p: saw %p\n",
   4.211 +        MEM_LOG("Failed to update %p -> %p: saw %p",
   4.212                  l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
   4.213          return 0;
   4.214      }
   4.215 @@ -781,6 +814,8 @@ static int mod_l1_entry(l1_pgentry_t *pl
   4.216      unsigned long _ol1e;
   4.217      struct domain *d = current->domain;
   4.218  
   4.219 +    ASSERT( !shadow_mode_enabled(d) );
   4.220 +
   4.221      if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
   4.222          return 0;
   4.223      ol1e = mk_l1_pgentry(_ol1e);
   4.224 @@ -807,13 +842,12 @@ static int mod_l1_entry(l1_pgentry_t *pl
   4.225              put_page_from_l1e(nl1e, d);
   4.226              return 0;
   4.227          }
   4.228 -        
   4.229 -        put_page_from_l1e(ol1e, d);
   4.230 -        return 1;
   4.231      }
   4.232 -
   4.233 -    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
   4.234 -        return 0;
   4.235 +    else
   4.236 +    {
   4.237 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
   4.238 +            return 0;
   4.239 +    }
   4.240      
   4.241      put_page_from_l1e(ol1e, d);
   4.242      return 1;
   4.243 @@ -825,7 +859,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
   4.244                                  _t ## _pgentry_val(_o),                 \
   4.245                                  _t ## _pgentry_val(_n));                \
   4.246      if ( __o != _t ## _pgentry_val(_o) )                                \
   4.247 -        MEM_LOG("Failed to update %p -> %p: saw %p\n",                  \
   4.248 +        MEM_LOG("Failed to update %p -> %p: saw %p",                    \
   4.249                  _t ## _pgentry_val(_o), _t ## _pgentry_val(_n), __o);   \
   4.250      (__o == _t ## _pgentry_val(_o)); })
   4.251  
   4.252 @@ -872,13 +906,12 @@ static int mod_l2_entry(l2_pgentry_t *pl
   4.253              put_page_from_l2e(nl2e, pfn);
   4.254              return 0;
   4.255          }
   4.256 -        
   4.257 -        put_page_from_l2e(ol2e, pfn);
   4.258 -        return 1;
   4.259      }
   4.260 -
   4.261 -    if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
   4.262 -        return 0;
   4.263 +    else
   4.264 +    {
   4.265 +        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
   4.266 +            return 0;
   4.267 +    }
   4.268  
   4.269      put_page_from_l2e(ol2e, pfn);
   4.270      return 1;
   4.271 @@ -1025,7 +1058,9 @@ int alloc_page_type(struct pfn_info *pag
   4.272  
   4.273  void free_page_type(struct pfn_info *page, unsigned int type)
   4.274  {
   4.275 -    struct domain *d = page_get_owner(page);
   4.276 +    struct domain *owner = page_get_owner(page);
   4.277 +    if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) )
   4.278 +        return;
   4.279  
   4.280      switch ( type )
   4.281      {
   4.282 @@ -1050,13 +1085,6 @@ void free_page_type(struct pfn_info *pag
   4.283      default:
   4.284          BUG();
   4.285      }
   4.286 -
   4.287 -    if ( unlikely(shadow_mode_enabled(d)) && 
   4.288 -         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
   4.289 -    {
   4.290 -        unshadow_table(page_to_pfn(page), type);
   4.291 -        put_shadow_status(d);
   4.292 -    }
   4.293  }
   4.294  
   4.295  
   4.296 @@ -1096,15 +1124,16 @@ void put_page_type(struct pfn_info *page
   4.297                  if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
   4.298                                             x & ~PGT_validated)) != x) )
   4.299                      goto again;
   4.300 -                /* We cleared the 'valid bit' so we do the clear up. */
   4.301 +                /* We cleared the 'valid bit' so we do the clean up. */
   4.302                  free_page_type(page, x & PGT_type_mask);
   4.303                  /* Carry on, but with the 'valid bit' now clear. */
   4.304                  x  &= ~PGT_validated;
   4.305                  nx &= ~PGT_validated;
   4.306              }
   4.307          }
   4.308 -        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
   4.309 -                           (PGT_pinned | 1)) )
   4.310 +        else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) == 
   4.311 +                            (PGT_pinned | 1)) &&
   4.312 +                           ((nx & PGT_type_mask) != PGT_writable_page)) )
   4.313          {
   4.314              /* Page is now only pinned. Make the back pointer mutable again. */
   4.315              nx |= PGT_va_mutable;
   4.316 @@ -1124,7 +1153,7 @@ int get_page_type(struct pfn_info *page,
   4.317          nx = x + 1;
   4.318          if ( unlikely((nx & PGT_count_mask) == 0) )
   4.319          {
   4.320 -            MEM_LOG("Type count overflow on pfn %p\n", page_to_pfn(page));
   4.321 +            MEM_LOG("Type count overflow on pfn %p", page_to_pfn(page));
   4.322              return 0;
   4.323          }
   4.324          else if ( unlikely((x & PGT_count_mask) == 0) )
   4.325 @@ -1137,6 +1166,8 @@ int get_page_type(struct pfn_info *page,
   4.326                   * circumstances should be very rare.
   4.327                   */
   4.328                  struct domain *d = page_get_owner(page);
   4.329 +
   4.330 +                // XXX SMP bug?
   4.331                  if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->
   4.332                                                        processor],
   4.333                                           page->tlbflush_timestamp)) )
   4.334 @@ -1155,14 +1186,24 @@ int get_page_type(struct pfn_info *page,
   4.335                      nx |= PGT_validated;
   4.336              }
   4.337          }
   4.338 +        else if ( unlikely(!(x & PGT_validated)) )
   4.339 +        {
   4.340 +            /* Someone else is updating validation of this page. Wait... */
   4.341 +            while ( (y = page->u.inuse.type_info) == x )
   4.342 +            {
   4.343 +                rep_nop();
   4.344 +                barrier();
   4.345 +            }
   4.346 +            goto again;
   4.347 +        }
   4.348          else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
   4.349          {
   4.350              if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
   4.351              {
   4.352                  if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
   4.353                       ((type & PGT_type_mask) != PGT_l1_page_table) )
   4.354 -                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p\n",
   4.355 -                            x & PGT_type_mask, type, page_to_pfn(page));
   4.356 +                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p",
   4.357 +                            x, type, page_to_pfn(page));
   4.358                  return 0;
   4.359              }
   4.360              else if ( (x & PGT_va_mask) == PGT_va_mutable )
   4.361 @@ -1178,16 +1219,6 @@ int get_page_type(struct pfn_info *page,
   4.362                  nx |= PGT_va_unknown;
   4.363              }
   4.364          }
   4.365 -        else if ( unlikely(!(x & PGT_validated)) )
   4.366 -        {
   4.367 -            /* Someone else is updating validation of this page. Wait... */
   4.368 -            while ( (y = page->u.inuse.type_info) == x )
   4.369 -            {
   4.370 -                rep_nop();
   4.371 -                barrier();
   4.372 -            }
   4.373 -            goto again;
   4.374 -        }
   4.375      }
   4.376      while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
   4.377  
   4.378 @@ -1197,7 +1228,7 @@ int get_page_type(struct pfn_info *page,
   4.379          if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
   4.380          {
   4.381              MEM_LOG("Error while validating pfn %p for type %08x."
   4.382 -                    " caf=%08x taf=%08x\n",
   4.383 +                    " caf=%08x taf=%08x",
   4.384                      page_to_pfn(page), type,
   4.385                      page->count_info,
   4.386                      page->u.inuse.type_info);
   4.387 @@ -1214,30 +1245,36 @@ int get_page_type(struct pfn_info *page,
   4.388  }
   4.389  
   4.390  
   4.391 -int new_guest_cr3(unsigned long pfn)
   4.392 +int new_guest_cr3(unsigned long mfn)
   4.393  {
   4.394      struct exec_domain *ed = current;
   4.395      struct domain *d = ed->domain;
   4.396 -    int okay, cpu = smp_processor_id();
   4.397 -    unsigned long old_base_pfn;
   4.398 -    
   4.399 -    okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
   4.400 +    int okay;
   4.401 +    unsigned long old_base_mfn;
   4.402 +
   4.403 +    if ( shadow_mode_enabled(d) )
   4.404 +        okay = get_page_from_pagenr(mfn, d);
   4.405 +    else
   4.406 +        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
   4.407 +
   4.408      if ( likely(okay) )
   4.409      {
   4.410          invalidate_shadow_ldt(ed);
   4.411  
   4.412 -        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
   4.413 -        old_base_pfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
   4.414 -        ed->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT);
   4.415 +        old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
   4.416 +        ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
   4.417          update_pagetables(ed); /* update shadow_table and monitor_table */
   4.418  
   4.419          write_ptbase(ed);
   4.420  
   4.421 -        put_page_and_type(&frame_table[old_base_pfn]);
   4.422 +        if ( shadow_mode_enabled(d) )
   4.423 +            put_page(&frame_table[old_base_mfn]);
   4.424 +        else
   4.425 +            put_page_and_type(&frame_table[old_base_mfn]);
   4.426      }
   4.427      else
   4.428      {
   4.429 -        MEM_LOG("Error while installing new baseptr %p", pfn);
   4.430 +        MEM_LOG("Error while installing new baseptr %p", mfn);
   4.431      }
   4.432  
   4.433      return okay;
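
With shadowing enabled, new_guest_cr3() above takes only a plain reference on
the new base frame (the hypervisor runs on the shadow, so the guest table no
longer needs the PGT_root_page_table type) and correspondingly drops only a
plain reference on the old one; the invariant is that acquire and release stay
paired per mode. A toy sketch making that symmetry explicit; sketch_frame and
the helpers are invented stand-ins for the real refcounting:

    #include <assert.h>
    #include <stdbool.h>

    struct sketch_frame { int refs; int type_refs; };

    /* The kind of reference taken on the new base. */
    static void sketch_take_base(struct sketch_frame *f, bool shadowed)
    {
        f->refs++;                  /* get_page_from_pagenr() analogue    */
        if ( !shadowed )
            f->type_refs++;         /* ..._and_type (PGT_root_page_table) */
    }

    /* The matching release on the old base. */
    static void sketch_drop_base(struct sketch_frame *f, bool shadowed)
    {
        if ( !shadowed )
        {
            assert(f->type_refs > 0);
            f->type_refs--;         /* put_page_and_type() analogue */
        }
        assert(f->refs > 0);
        f->refs--;                  /* put_page() analogue */
    }
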
   4.434 @@ -1247,10 +1284,11 @@ static int do_extended_command(unsigned 
   4.435  {
   4.436      int okay = 1, cpu = smp_processor_id();
   4.437      unsigned int cmd = val & MMUEXT_CMD_MASK, type;
   4.438 -    unsigned long pfn = ptr >> PAGE_SHIFT;
   4.439 -    struct pfn_info *page = &frame_table[pfn];
   4.440      struct exec_domain *ed = current;
   4.441      struct domain *d = ed->domain, *e;
   4.442 +    unsigned long gpfn = ptr >> PAGE_SHIFT;
   4.443 +    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
   4.444 +    struct pfn_info *page = &frame_table[mfn];
   4.445      u32 x, y, _d, _nd;
   4.446      domid_t domid;
   4.447      grant_ref_t gntref;
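
From here on, do_extended_command() distinguishes the guest-supplied frame
number (gpfn) from the machine frame (mfn) it maps to; the two only differ for
translated guests, and the pin/unpin paths below redo the lookup against the
foreign domain's P2M when acting on FOREIGNDOM. A self-contained sketch of the
lookup __gpfn_to_mfn() performs conceptually; the struct and field names are
invented for illustration:

    struct sketch_domain {
        int            translated;       /* shadow_mode_translate() analogue */
        unsigned long *phys_to_machine;  /* per-domain P2M, indexed by gpfn  */
        unsigned long  nr_pfns;          /* size of that table               */
    };

    static unsigned long sketch_gpfn_to_mfn(const struct sketch_domain *d,
                                            unsigned long gpfn)
    {
        if ( !d->translated )
            return gpfn;                 /* frame-number spaces coincide */
        if ( gpfn >= d->nr_pfns )
            return 0;                    /* no such guest frame          */
        return d->phys_to_machine[gpfn];
    }
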
   4.448 @@ -1266,17 +1304,29 @@ static int do_extended_command(unsigned 
   4.449          type = PGT_l1_page_table | PGT_va_mutable;
   4.450  
   4.451      pin_page:
   4.452 -        okay = get_page_and_type_from_pagenr(pfn, type, FOREIGNDOM);
   4.453 +        if ( unlikely(percpu_info[cpu].foreign &&
   4.454 +                      (shadow_mode_translate(d) ||
   4.455 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
   4.456 +        {
   4.457 +            // oops -- we should be using the foreign domain's P2M
   4.458 +            mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn);
   4.459 +            page = &frame_table[mfn];
   4.460 +        }
   4.461 +
   4.462 +        if ( shadow_mode_enabled(FOREIGNDOM) )
   4.463 +            type = PGT_writable_page;
   4.464 +
   4.465 +        okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
   4.466          if ( unlikely(!okay) )
   4.467          {
   4.468 -            MEM_LOG("Error while pinning pfn %p", pfn);
   4.469 +            MEM_LOG("Error while pinning mfn %p", mfn);
   4.470              break;
   4.471          }
   4.472  
   4.473          if ( unlikely(test_and_set_bit(_PGT_pinned,
   4.474                                         &page->u.inuse.type_info)) )
   4.475          {
   4.476 -            MEM_LOG("Pfn %p already pinned", pfn);
   4.477 +            MEM_LOG("mfn %p already pinned", mfn);
   4.478              put_page_and_type(page);
   4.479              okay = 0;
   4.480              break;
   4.481 @@ -1299,10 +1349,19 @@ static int do_extended_command(unsigned 
   4.482  #endif /* __x86_64__ */
   4.483  
   4.484      case MMUEXT_UNPIN_TABLE:
   4.485 -        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
   4.486 +        if ( unlikely(percpu_info[cpu].foreign &&
   4.487 +                      (shadow_mode_translate(d) ||
   4.488 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
   4.489          {
   4.490 -            MEM_LOG("Page %p bad domain (dom=%p)",
   4.491 -                    ptr, page_get_owner(page));
   4.492 +            // oops -- we should be using the foreign domain's P2M
   4.493 +            mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn);
   4.494 +            page = &frame_table[mfn];
   4.495 +        }
   4.496 +
   4.497 +        if ( unlikely(!(okay = get_page_from_pagenr(mfn, FOREIGNDOM))) )
   4.498 +        {
   4.499 +            MEM_LOG("mfn %p bad domain (dom=%p)",
   4.500 +                    mfn, page_get_owner(page));
   4.501          }
   4.502          else if ( likely(test_and_clear_bit(_PGT_pinned, 
   4.503                                              &page->u.inuse.type_info)) )
   4.504 @@ -1314,28 +1373,29 @@ static int do_extended_command(unsigned 
   4.505          {
   4.506              okay = 0;
   4.507              put_page(page);
   4.508 -            MEM_LOG("Pfn %p not pinned", pfn);
   4.509 +            MEM_LOG("mfn %p not pinned", mfn);
   4.510          }
   4.511          break;
   4.512  
   4.513      case MMUEXT_NEW_BASEPTR:
   4.514 -        okay = new_guest_cr3(pfn);
   4.515 +        okay = new_guest_cr3(mfn);
   4.516 +        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
   4.517          break;
   4.518          
   4.519  #ifdef __x86_64__
   4.520      case MMUEXT_NEW_USER_BASEPTR:
   4.521 -        okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
   4.522 +        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
   4.523          if ( unlikely(!okay) )
   4.524          {
   4.525 -            MEM_LOG("Error while installing new baseptr %p", pfn);
   4.526 +            MEM_LOG("Error while installing new baseptr %p", mfn);
   4.527          }
   4.528          else
   4.529          {
   4.530 -            unsigned long old_pfn =
   4.531 +            unsigned long old_mfn =
   4.532                  pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT;
   4.533 -            ed->arch.guest_table_user = mk_pagetable(pfn << PAGE_SHIFT);
   4.534 -            if ( old_pfn != 0 )
   4.535 -                put_page_and_type(&frame_table[old_pfn]);
   4.536 +            ed->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
   4.537 +            if ( old_mfn != 0 )
   4.538 +                put_page_and_type(&frame_table[old_mfn]);
   4.539          }
   4.540          break;
   4.541  #endif
   4.542 @@ -1346,12 +1406,14 @@ static int do_extended_command(unsigned 
   4.543      
   4.544      case MMUEXT_INVLPG:
   4.545          __flush_tlb_one(ptr);
   4.546 +        if ( shadow_mode_enabled(d) )
   4.547 +            shadow_invlpg(ed, ptr);
   4.548          break;
   4.549  
   4.550      case MMUEXT_FLUSH_CACHE:
   4.551          if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
   4.552          {
   4.553 -            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
   4.554 +            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
   4.555              okay = 0;
   4.556          }
   4.557          else
   4.558 @@ -1362,6 +1424,8 @@ static int do_extended_command(unsigned 
   4.559  
   4.560      case MMUEXT_SET_LDT:
   4.561      {
   4.562          unsigned long ents = val >> MMUEXT_CMD_SHIFT;
   4.563 +        ASSERT( !shadow_mode_external(d) );
   4.564 +
   4.565          if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
   4.566               (ents > 8192) ||
   4.567 @@ -1375,6 +1439,7 @@ static int do_extended_command(unsigned 
   4.568                    (ed->arch.ldt_base != ptr) )
   4.569          {
   4.570              invalidate_shadow_ldt(ed);
   4.571 +            shadow_sync_all(d);
   4.572              ed->arch.ldt_base = ptr;
   4.573              ed->arch.ldt_ents = ents;
   4.574              load_LDT(ed);
   4.575 @@ -1401,7 +1466,7 @@ static int do_extended_command(unsigned 
   4.576                  percpu_info[cpu].foreign = dom_io;
   4.577                  break;
   4.578              default:
   4.579 -                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
   4.580 +                MEM_LOG("Dom %u cannot set foreign dom", d->id);
   4.581                  okay = 0;
   4.582                  break;
   4.583              }
   4.584 @@ -1435,10 +1500,10 @@ static int do_extended_command(unsigned 
   4.585          gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
   4.586          
   4.587          if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
   4.588 -             unlikely(!pfn_is_ram(pfn)) ||
   4.589 +             unlikely(!pfn_is_ram(mfn)) ||
   4.590               unlikely((e = find_domain_by_id(domid)) == NULL) )
   4.591          {
   4.592 -            MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
   4.593 +            MEM_LOG("Bad frame (%p) or bad domid (%d).", mfn, domid);
   4.594              okay = 0;
   4.595              break;
   4.596          }
   4.597 @@ -1460,7 +1525,7 @@ static int do_extended_command(unsigned 
   4.598                   unlikely(_nd != _d) )
   4.599              {
   4.600                  MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
   4.601 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
   4.602 +                        " caf=%08x, taf=%08x", page_to_pfn(page),
   4.603                          d, d->id, unpickle_domptr(_nd), x, 
   4.604                          page->u.inuse.type_info);
   4.605                  spin_unlock(&d->page_alloc_lock);
   4.606 @@ -1496,7 +1561,7 @@ static int do_extended_command(unsigned 
   4.607               unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
   4.608          {
   4.609              MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
   4.610 -                    "provided a bad grant ref, or is dying (%p).\n",
   4.611 +                    "provided a bad grant ref, or is dying (%p).",
   4.612                      e->tot_pages, e->max_pages, e->d_flags);
   4.613              spin_unlock(&e->page_alloc_lock);
   4.614              put_domain(e);
   4.615 @@ -1513,7 +1578,7 @@ static int do_extended_command(unsigned 
   4.616          spin_unlock(&e->page_alloc_lock);
   4.617  
   4.618          /* Transfer is all done: tell the guest about its new page frame. */
   4.619 -        gnttab_notify_transfer(e, gntref, pfn);
   4.620 +        gnttab_notify_transfer(e, gntref, mfn);
   4.621          
   4.622          put_domain(e);
   4.623          break;
   4.624 @@ -1529,7 +1594,14 @@ static int do_extended_command(unsigned 
   4.625          e = percpu_info[cpu].foreign;
   4.626          if ( unlikely(e == NULL) )
   4.627          {
   4.628 -            MEM_LOG("No FOREIGNDOM to reassign pfn %p to", pfn);
   4.629 +            MEM_LOG("No FOREIGNDOM to reassign mfn %p to", mfn);
   4.630 +            okay = 0;
   4.631 +            break;
   4.632 +        }
   4.633 +
   4.634 +        if ( unlikely(!pfn_is_ram(mfn)) )
   4.635 +        {
   4.636 +            MEM_LOG("Can't reassign non-ram mfn %p", mfn);
   4.637              okay = 0;
   4.638              break;
   4.639          }
   4.640 @@ -1574,7 +1646,7 @@ static int do_extended_command(unsigned 
   4.641                   unlikely(_nd != _d) )
   4.642              {
   4.643                  MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
   4.644 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
   4.645 +                        " caf=%08x, taf=%08x", page_to_pfn(page),
   4.646                          d, d->id, unpickle_domptr(_nd), x,
   4.647                          page->u.inuse.type_info);
   4.648                  okay = 0;
   4.649 @@ -1637,12 +1709,10 @@ int do_mmu_update(
   4.650  #define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
   4.651  
   4.652      mmu_update_t req;
   4.653 -    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
   4.654 +    unsigned long va = 0, deferred_ops, gpfn, mfn, prev_mfn = 0;
   4.655      struct pfn_info *page;
   4.656      int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
   4.657      unsigned int cmd, done = 0;
   4.658 -    unsigned long prev_smfn = 0;
   4.659 -    l1_pgentry_t *prev_spl1e = 0;
   4.660      struct exec_domain *ed = current;
   4.661      struct domain *d = ed->domain;
   4.662      u32 type_info;
   4.663 @@ -1653,10 +1723,9 @@ int do_mmu_update(
   4.664      cleanup_writable_pagetable(d);
   4.665  
   4.666      if ( unlikely(shadow_mode_enabled(d)) )
   4.667 -        check_pagetable(d, ed->arch.guest_table, "pre-mmu"); /* debug */
   4.668 -
   4.669 -    if ( unlikely(shadow_mode_translate(d) ) )
   4.670 -        domain_crash();
   4.671 +    {
   4.672 +        check_pagetable(ed, "pre-mmu"); /* debug */
   4.673 +    }
   4.674  
   4.675      /*
   4.676       * If we are resuming after preemption, read how much work we have already
   4.677 @@ -1714,7 +1783,8 @@ int do_mmu_update(
   4.678          }
   4.679  
   4.680          cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
   4.681 -        pfn = req.ptr >> PAGE_SHIFT;
   4.682 +        gpfn = req.ptr >> PAGE_SHIFT;
   4.683 +        mfn = __gpfn_to_mfn(d, gpfn);
   4.684  
   4.685          okay = 0;
   4.686  
   4.687 @@ -1724,107 +1794,91 @@ int do_mmu_update(
   4.688               * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
   4.689               */
   4.690          case MMU_NORMAL_PT_UPDATE:
   4.691 -            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
   4.692 +            if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
   4.693              {
   4.694                  MEM_LOG("Could not get page for normal update");
   4.695                  break;
   4.696              }
   4.697  
   4.698 -            if ( likely(prev_pfn == pfn) )
   4.699 +            if ( likely(prev_mfn == mfn) )
   4.700              {
   4.701                  va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
   4.702              }
   4.703              else
   4.704              {
   4.705 -                if ( prev_pfn != 0 )
   4.706 +                if ( prev_mfn != 0 )
   4.707                      unmap_domain_mem((void *)va);
   4.708                  va = (unsigned long)map_domain_mem(req.ptr);
   4.709 -                prev_pfn = pfn;
   4.710 +                prev_mfn = mfn;
   4.711              }
   4.712  
   4.713 -            page = &frame_table[pfn];
   4.714 +            page = &frame_table[mfn];
   4.715              switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
   4.716              {
   4.717              case PGT_l1_page_table: 
   4.718 +                ASSERT(!shadow_mode_enabled(d));
   4.719                  if ( likely(get_page_type(
   4.720                      page, type_info & (PGT_type_mask|PGT_va_mask))) )
   4.721                  {
   4.722                      okay = mod_l1_entry((l1_pgentry_t *)va, 
   4.723 -                                        mk_l1_pgentry(req.val)); 
   4.724 -
   4.725 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   4.726 -                         (get_shadow_status(d, page-frame_table) &
   4.727 -                          PSH_shadowed) )
   4.728 -                    {
   4.729 -                        shadow_l1_normal_pt_update(
   4.730 -                            req.ptr, req.val, &prev_smfn, &prev_spl1e);
   4.731 -                        put_shadow_status(d);
   4.732 -                    }
   4.733 -
   4.734 +                                        mk_l1_pgentry(req.val));
   4.735                      put_page_type(page);
   4.736                  }
   4.737                  break;
   4.738              case PGT_l2_page_table:
   4.739 +                ASSERT(!shadow_mode_enabled(d));
   4.740                  if ( likely(get_page_type(page, PGT_l2_page_table)) )
   4.741                  {
   4.742                      okay = mod_l2_entry((l2_pgentry_t *)va, 
   4.743                                          mk_l2_pgentry(req.val),
   4.744 -                                        pfn); 
   4.745 -
   4.746 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   4.747 -                         (get_shadow_status(d, page-frame_table) & 
   4.748 -                          PSH_shadowed) )
   4.749 -                    {
   4.750 -                        shadow_l2_normal_pt_update(req.ptr, req.val);
   4.751 -                        put_shadow_status(d);
   4.752 -                    }
   4.753 -
   4.754 +                                        mfn);
   4.755                      put_page_type(page);
   4.756                  }
   4.757                  break;
   4.758  #ifdef __x86_64__
   4.759              case PGT_l3_page_table:
   4.760 +                ASSERT(!shadow_mode_enabled(d));
   4.761                  if ( likely(get_page_type(page, PGT_l3_page_table)) )
   4.762                  {
   4.763                      okay = mod_l3_entry((l3_pgentry_t *)va, 
   4.764                                          mk_l3_pgentry(req.val),
   4.765 -                                        pfn); 
   4.766 -
   4.767 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   4.768 -                         (get_shadow_status(d, page-frame_table) & 
   4.769 -                          PSH_shadowed) )
   4.770 -                    {
   4.771 -                        /*XXXshadow_l3_normal_pt_update(req.ptr, req.val);*/
   4.772 -                        put_shadow_status(d);
   4.773 -                    }
   4.774 -
   4.775 +                                        mfn);
   4.776                      put_page_type(page);
   4.777                  }
   4.778                  break;
   4.779              case PGT_l4_page_table:
   4.780 +                ASSERT(!shadow_mode_enabled(d));
   4.781                  if ( likely(get_page_type(page, PGT_l4_page_table)) )
   4.782                  {
   4.783                      okay = mod_l4_entry((l4_pgentry_t *)va, 
   4.784                                          mk_l4_pgentry(req.val),
   4.785 -                                        pfn); 
   4.786 -
   4.787 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   4.788 -                         (get_shadow_status(d, page-frame_table) & 
   4.789 -                          PSH_shadowed) )
   4.790 -                    {
   4.791 -                        /*XXXshadow_l4_normal_pt_update(req.ptr, req.val);*/
   4.792 -                        put_shadow_status(d);
   4.793 -                    }
   4.794 -
   4.795 +                                        mfn);
   4.796                      put_page_type(page);
   4.797                  }
   4.798                  break;
   4.799  #endif /* __x86_64__ */
   4.800              default:
   4.801 +                printk("do_mmu_update writable update: ma=%p val=%p\n",
   4.802 +                       req.ptr, req.val);
   4.803                  if ( likely(get_page_type(page, PGT_writable_page)) )
   4.804                  {
   4.805 +                    if ( shadow_mode_enabled(d) )
   4.806 +                    {
   4.807 +                        shadow_lock(d);
   4.808 +
   4.809 +                        if ( shadow_mode_log_dirty(d) )
   4.810 +                            __mark_dirty(d, mfn);
   4.811 +
   4.812 +                        if ( page_is_page_table(page) )
   4.813 +                            shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
   4.814 +                    }
   4.815 +
   4.816                      *(unsigned long *)va = req.val;
   4.817                      okay = 1;
   4.818 +
   4.819 +                    if ( shadow_mode_enabled(d) )
   4.820 +                        shadow_unlock(d);
   4.821 +
   4.822                      put_page_type(page);
   4.823                  }
   4.824                  break;
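
The default (writable page) case above now brackets the raw word write with
the shadow bookkeeping: take the shadow lock, record the frame as dirty when
log-dirty is active, mark page-table frames out of sync, perform the write,
then unlock. A self-contained sketch of that ordering, with a pthread mutex
standing in for shadow_lock() and invented structures in place of the real
shadow state:

    #include <pthread.h>
    #include <stdbool.h>

    struct sketch_shadow {
        pthread_mutex_t lock;          /* shadow_lock()/shadow_unlock() analogue */
        bool            log_dirty;
        bool           *dirty_bitmap;  /* indexed by mfn */
        bool           *out_of_sync;   /* indexed by mfn */
    };

    static void sketch_guest_write(struct sketch_shadow *s, unsigned long mfn,
                                   unsigned long *slot, unsigned long val,
                                   bool frame_is_page_table)
    {
        pthread_mutex_lock(&s->lock);

        if ( s->log_dirty )
            s->dirty_bitmap[mfn] = true;  /* __mark_dirty() analogue */
        if ( frame_is_page_table )
            s->out_of_sync[mfn] = true;   /* shadow_mark_mfn_out_of_sync() analogue */

        *slot = val;                      /* the guest-visible update itself */

        pthread_mutex_unlock(&s->lock);
    }
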
   4.825 @@ -1834,24 +1888,30 @@ int do_mmu_update(
   4.826              break;
   4.827  
   4.828          case MMU_MACHPHYS_UPDATE:
   4.829 -            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
   4.830 +            if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
   4.831              {
   4.832                  MEM_LOG("Could not get page for mach->phys update");
   4.833                  break;
   4.834              }
   4.835  
   4.836 -            machine_to_phys_mapping[pfn] = req.val;
   4.837 +            if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
   4.838 +            {
   4.839 +                MEM_LOG("can't mutate the m2p of translated guests");
   4.840 +                break;
   4.841 +            }
   4.842 +
   4.843 +            set_machinetophys(mfn, req.val);
   4.844              okay = 1;
   4.845  
   4.846              /*
   4.847 -             * If in log-dirty mode, mark the corresponding pseudo-physical
   4.848 +             * If in log-dirty mode, mark the corresponding
   4.849               * page as dirty.
   4.850               */
   4.851 -            if ( unlikely(shadow_mode_log_dirty(d)) && 
   4.852 -                 mark_dirty(d, pfn) )
   4.853 -                d->arch.shadow_dirty_block_count++;
   4.854 -
   4.855 -            put_page(&frame_table[pfn]);
   4.856 +            if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
   4.857 +                 mark_dirty(FOREIGNDOM, mfn) )
   4.858 +                FOREIGNDOM->arch.shadow_dirty_block_count++;
   4.859 +
   4.860 +            put_page(&frame_table[mfn]);
   4.861              break;
   4.862  
   4.863              /*
   4.864 @@ -1878,17 +1938,18 @@ int do_mmu_update(
   4.865      }
   4.866  
   4.867   out:
   4.868 -    if ( prev_pfn != 0 )
   4.869 +    if ( prev_mfn != 0 )
   4.870          unmap_domain_mem((void *)va);
   4.871  
   4.872 -    if ( unlikely(prev_spl1e != 0) ) 
   4.873 -        unmap_domain_mem((void *)prev_spl1e);
   4.874 -
   4.875      deferred_ops = percpu_info[cpu].deferred_ops;
   4.876      percpu_info[cpu].deferred_ops = 0;
   4.877  
   4.878      if ( deferred_ops & DOP_FLUSH_TLB )
   4.879 +    {
   4.880          local_flush_tlb();
   4.881 +        if ( shadow_mode_enabled(d) )
   4.882 +            shadow_sync_all(d);
   4.883 +    }
   4.884          
   4.885      if ( deferred_ops & DOP_RELOAD_LDT )
   4.886          (void)map_ldt_shadow_page(0);
   4.887 @@ -1904,7 +1965,7 @@ int do_mmu_update(
   4.888          __put_user(done + i, pdone);
   4.889  
   4.890      if ( unlikely(shadow_mode_enabled(d)) )
   4.891 -        check_pagetable(d, ed->arch.guest_table, "post-mmu"); /* debug */
   4.892 +        check_pagetable(ed, "post-mmu"); /* debug */
   4.893  
   4.894      UNLOCK_BIGLOCK(d);
   4.895      return rc;
   4.896 @@ -1923,12 +1984,9 @@ int do_update_va_mapping(unsigned long v
   4.897  
   4.898      perfc_incrc(calls_to_update_va);
   4.899  
   4.900 -    if ( unlikely(!__addr_ok(va)) )
   4.901 +    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
   4.902          return -EINVAL;
   4.903  
   4.904 -    if ( unlikely(shadow_mode_translate(d) ) )
   4.905 -        domain_crash();
   4.906 -
   4.907      LOCK_BIGLOCK(d);
   4.908  
   4.909      cleanup_writable_pagetable(d);
   4.910 @@ -1937,55 +1995,56 @@ int do_update_va_mapping(unsigned long v
   4.911       * XXX When we make this support 4MB superpages we should also deal with 
   4.912       * the case of updating L2 entries.
   4.913       */
   4.914 -
   4.915 -    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
   4.916 -                                mk_l1_pgentry(val))) )
   4.917 -        err = -EINVAL;
   4.918 -
   4.919 -    if ( unlikely(shadow_mode_enabled(d)) )
   4.920 +    if ( likely(!shadow_mode_enabled(d)) )
   4.921 +    {
   4.922 +        if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
   4.923 +                                    mk_l1_pgentry(val))) )
   4.924 +            err = -EINVAL;
   4.925 +    }
   4.926 +    else
   4.927      {
   4.928 -        unsigned long sval = 0;
   4.929 -
   4.930 -        l1pte_propagate_from_guest(d, &val, &sval);
   4.931 -
   4.932 -        if ( unlikely(__put_user(sval, ((unsigned long *)(
   4.933 -            &shadow_linear_pg_table[l1_linear_offset(va)])))) )
   4.934 +        if ( unlikely(percpu_info[cpu].foreign &&
   4.935 +                      (shadow_mode_translate(d) ||
   4.936 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
   4.937          {
   4.938 +            // The foreign domain's pfns are in a different namespace.
   4.939 +            // We wouldn't be able to figure out how to (re-)shadow our
   4.940 +            // gpte without additional context.
   4.941 +            //
   4.942 +            domain_crash();
   4.943 +        }
   4.944 +    
   4.945 +        check_pagetable(ed, "pre-va"); /* debug */
   4.946 +        shadow_lock(d);
   4.947 +        
   4.948 +        // This is actually overkill - we don't need to sync the L1 itself,
   4.949 +        // just everything involved in getting to this L1 (i.e. we need
   4.950 +        // linear_pg_table[l1_linear_offset(va)] to be in sync)...
   4.951 +        //
   4.952 +        __shadow_sync_va(ed, va);
   4.953 +
   4.954 +        if ( unlikely(__put_user(val, &l1_pgentry_val(
   4.955 +                                     linear_pg_table[l1_linear_offset(va)]))) )
   4.956 +            err = -EINVAL;
   4.957 +        else
   4.958 +        {
   4.959 +            // also need to update the shadow
   4.960 +            unsigned long spte;
   4.961 +
   4.962 +            l1pte_propagate_from_guest(d, val, &spte);
   4.963 +            shadow_set_l1e(va, spte, 0);
   4.964 +
   4.965              /*
   4.966 -             * Since L2's are guranteed RW, failure indicates either that the
   4.967 -             * page was not shadowed, or that the L2 entry has not yet been
   4.968 -             * updated to reflect the shadow.
   4.969 +             * If we're in log-dirty mode then we need to note that we've updated
   4.970 +             * the PTE in the PT-holding page. We need the machine frame number
   4.971 +             * for this.
   4.972               */
   4.973 -            if ( shadow_mode_external(current->domain) )
   4.974 -                BUG(); // can't use linear_l2_table with external tables.
   4.975 -
   4.976 -            l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)];
   4.977 -            unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
   4.978 -
   4.979 -            if (get_shadow_status(d, gpfn))
   4.980 -            {
   4.981 -                unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
   4.982 -                unsigned long *gl1e = map_domain_mem(gmfn << PAGE_SHIFT);
   4.983 -                unsigned l1_idx = l1_table_offset(va);
   4.984 -                gl1e[l1_idx] = sval;
   4.985 -                unmap_domain_mem(gl1e);
   4.986 -                put_shadow_status(d);
   4.987 -
   4.988 -                perfc_incrc(shadow_update_va_fail1);
   4.989 -            }
   4.990 -            else
   4.991 -                perfc_incrc(shadow_update_va_fail2);
   4.992 +            if ( shadow_mode_log_dirty(d) )
   4.993 +                mark_dirty(d, va_to_l1mfn(ed, va));
   4.994 +
   4.995 +            shadow_unlock(d);
   4.996 +            check_pagetable(ed, "post-va"); /* debug */
   4.997          }
   4.998 -
   4.999 -        /*
  4.1000 -         * If we're in log-dirty mode then we need to note that we've updated
  4.1001 -         * the PTE in the PT-holding page. We need the machine frame number
  4.1002 -         * for this.
  4.1003 -         */
  4.1004 -        if ( shadow_mode_log_dirty(d) )
  4.1005 -            mark_dirty(d, va_to_l1mfn(va));
  4.1006 -  
  4.1007 -        check_pagetable(d, ed->arch.guest_table, "va"); /* debug */
  4.1008      }
  4.1009  
  4.1010      deferred_ops = percpu_info[cpu].deferred_ops;
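
Under shadow mode, do_update_va_mapping() above now first brings the mapping of
va back in sync (__shadow_sync_va), writes the guest PTE through
linear_pg_table, and then installs a derived entry in the shadow via
l1pte_propagate_from_guest()/shadow_set_l1e(). A small sketch of what
"propagate" means in spirit; the flag constants and the write-protect rule
below are invented for illustration, the real policy lives in the shadow code:

    #include <stdbool.h>
    #include <stdint.h>

    #define SK_PRESENT  0x001u
    #define SK_RW       0x002u
    #define SK_ACCESSED 0x020u
    #define SK_DIRTY    0x040u

    static uint32_t sketch_propagate_l1e(uint32_t guest_pte,
                                         bool frame_is_page_table)
    {
        uint32_t spte = 0;

        if ( guest_pte & SK_PRESENT )
        {
            spte = guest_pte | SK_ACCESSED | SK_DIRTY;
            if ( frame_is_page_table )
                spte &= ~SK_RW;   /* force faults on writes to page tables */
        }
        return spte;
    }
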
  4.1011 @@ -1993,9 +2052,17 @@ int do_update_va_mapping(unsigned long v
  4.1012  
  4.1013      if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
  4.1014           unlikely(flags & UVMF_FLUSH_TLB) )
  4.1015 +    {
  4.1016          local_flush_tlb();
  4.1017 +        if ( unlikely(shadow_mode_enabled(d)) )
  4.1018 +            shadow_sync_all(d);
  4.1019 +    }
  4.1020      else if ( unlikely(flags & UVMF_INVLPG) )
  4.1021 +    {
  4.1022          __flush_tlb_one(va);
  4.1023 +        if ( unlikely(shadow_mode_enabled(d)) )
  4.1024 +            shadow_invlpg(current, va);
  4.1025 +    }
  4.1026  
  4.1027      if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
  4.1028          (void)map_ldt_shadow_page(0);
  4.1029 @@ -2066,6 +2133,8 @@ long set_gdt(struct exec_domain *ed,
  4.1030      if ( (pfn = frames[0]) >= max_page )
  4.1031          goto fail;
  4.1032  
  4.1033 +    shadow_sync_all(d);
  4.1034 +
  4.1035      /* The first page is special because Xen owns a range of entries in it. */
  4.1036      if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
  4.1037      {
  4.1038 @@ -2145,7 +2214,9 @@ long do_set_gdt(unsigned long *frame_lis
  4.1039  long do_update_descriptor(
  4.1040      unsigned long pa, unsigned long word1, unsigned long word2)
  4.1041  {
  4.1042 -    unsigned long pfn = pa >> PAGE_SHIFT;
  4.1043 +    struct domain *dom = current->domain;
  4.1044 +    unsigned long gpfn = pa >> PAGE_SHIFT;
  4.1045 +    unsigned long mfn;
  4.1046      struct desc_struct *gdt_pent, d;
  4.1047      struct pfn_info *page;
  4.1048      struct exec_domain *ed;
  4.1049 @@ -2154,16 +2225,21 @@ long do_update_descriptor(
  4.1050      d.a = (u32)word1;
  4.1051      d.b = (u32)word2;
  4.1052  
  4.1053 -    LOCK_BIGLOCK(current->domain);
  4.1054 -
  4.1055 -    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
  4.1056 -        UNLOCK_BIGLOCK(current->domain);
  4.1057 +    LOCK_BIGLOCK(dom);
  4.1058 +
  4.1059 +    if ( !(mfn = __gpfn_to_mfn(dom, gpfn)) ) {
  4.1060 +        UNLOCK_BIGLOCK(dom);
  4.1061          return -EINVAL;
  4.1062      }
  4.1063  
  4.1064 -    page = &frame_table[pfn];
  4.1065 -    if ( unlikely(!get_page(page, current->domain)) ) {
  4.1066 -        UNLOCK_BIGLOCK(current->domain);
  4.1067 +    if ( (pa & 7) || (mfn >= max_page) || !check_descriptor(&d) ) {
  4.1068 +        UNLOCK_BIGLOCK(dom);
  4.1069 +        return -EINVAL;
  4.1070 +    }
  4.1071 +
  4.1072 +    page = &frame_table[mfn];
  4.1073 +    if ( unlikely(!get_page(page, dom)) ) {
  4.1074 +        UNLOCK_BIGLOCK(dom);
  4.1075          return -EINVAL;
  4.1076      }
  4.1077  
  4.1078 @@ -2172,8 +2248,8 @@ long do_update_descriptor(
  4.1079      {
  4.1080      case PGT_gdt_page:
  4.1081          /* Disallow updates of Xen-reserved descriptors in the current GDT. */
  4.1082 -        for_each_exec_domain(current->domain, ed) {
  4.1083 -            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) &&
  4.1084 +        for_each_exec_domain(dom, ed) {
  4.1085 +            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == mfn) &&
  4.1086                   (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
  4.1087                   (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
  4.1088                  goto out;
  4.1089 @@ -2191,11 +2267,25 @@ long do_update_descriptor(
  4.1090          break;
  4.1091      }
  4.1092  
  4.1093 +    if ( shadow_mode_enabled(dom) )
  4.1094 +    {
  4.1095 +        shadow_lock(dom);
  4.1096 +
  4.1097 +        if ( shadow_mode_log_dirty(dom) )
  4.1098 +            __mark_dirty(dom, mfn);
  4.1099 +
  4.1100 +        if ( page_is_page_table(page) )
  4.1101 +            shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
  4.1102 +    }
  4.1103 +
  4.1104      /* All is good so make the update. */
  4.1105 -    gdt_pent = map_domain_mem(pa);
  4.1106 +    gdt_pent = map_domain_mem((mfn << PAGE_SHIFT) | (pa & ~PAGE_MASK));
  4.1107      memcpy(gdt_pent, &d, 8);
  4.1108      unmap_domain_mem(gdt_pent);
  4.1109  
  4.1110 +    if ( shadow_mode_enabled(dom) )
  4.1111 +        shadow_unlock(dom);
  4.1112 +
  4.1113      put_page_type(page);
  4.1114  
  4.1115      ret = 0; /* success */
  4.1116 @@ -2203,7 +2293,7 @@ long do_update_descriptor(
  4.1117   out:
  4.1118      put_page(page);
  4.1119  
  4.1120 -    UNLOCK_BIGLOCK(current->domain);
  4.1121 +    UNLOCK_BIGLOCK(dom);
  4.1122  
  4.1123      return ret;
  4.1124  }
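
Because do_update_descriptor() now receives a guest-physical address, the
descriptor's machine address has to be rebuilt before map_domain_mem(): split
pa into frame number and in-page offset, translate the frame, recombine. A tiny
self-contained sketch of that computation; the SK_* constants and the
translation callback are stand-ins:

    #define SK_PAGE_SHIFT 12
    #define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)
    #define SK_PAGE_MASK  (~(SK_PAGE_SIZE - 1))

    static unsigned long
    sketch_machine_addr(unsigned long pa,
                        unsigned long (*gpfn_to_mfn)(unsigned long gpfn))
    {
        unsigned long gpfn = pa >> SK_PAGE_SHIFT;              /* guest frame   */
        unsigned long mfn  = gpfn_to_mfn(gpfn);                /* machine frame */

        return (mfn << SK_PAGE_SHIFT) | (pa & ~SK_PAGE_MASK);  /* keep offset   */
    }
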
  4.1125 @@ -2228,13 +2318,16 @@ int ptwr_debug = 0x0;
  4.1126  /* Flush the given writable p.t. page and write-protect it again. */
  4.1127  void ptwr_flush(const int which)
  4.1128  {
  4.1129 -    unsigned long  sstat, spte, pte, *ptep, l1va;
  4.1130 -    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
  4.1131 +    unsigned long  pte, *ptep, l1va;
  4.1132 +    l1_pgentry_t  *pl1e, ol1e, nl1e;
  4.1133      l2_pgentry_t  *pl2e;
  4.1134      int            i, cpu = smp_processor_id();
  4.1135      struct exec_domain *ed = current;
  4.1136      struct domain *d = ed->domain;
  4.1137  
  4.1138 +    // not supported in combination with various shadow modes!
  4.1139 +    ASSERT( !shadow_mode_enabled(d) );
  4.1140 +    
  4.1141      l1va = ptwr_info[cpu].ptinfo[which].l1va;
  4.1142      ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
  4.1143  
  4.1144 @@ -2244,7 +2337,7 @@ void ptwr_flush(const int which)
  4.1145  
  4.1146      if ( unlikely(__get_user(pte, ptep)) )
  4.1147      {
  4.1148 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
  4.1149 +        MEM_LOG("ptwr: Could not read pte at %p", ptep);
  4.1150          /*
  4.1151           * Really a bug. We could read this PTE during the initial fault,
  4.1152           * and pagetables can't have changed meantime. XXX Multi-CPU guests?
  4.1153 @@ -2255,23 +2348,10 @@ void ptwr_flush(const int which)
  4.1154                  PTWR_PRINT_WHICH, ptep, pte);
  4.1155      pte &= ~_PAGE_RW;
  4.1156  
  4.1157 -    if ( unlikely(shadow_mode_enabled(d)) )
  4.1158 -    {
  4.1159 -        /* Write-protect the p.t. page in the shadow page table. */
  4.1160 -        l1pte_propagate_from_guest(d, &pte, &spte);
  4.1161 -        __put_user(spte, (unsigned long *)
  4.1162 -                   &shadow_linear_pg_table[l1_linear_offset(l1va)]);
  4.1163 -
  4.1164 -        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
  4.1165 -        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
  4.1166 -        if ( sstat & PSH_shadowed )
  4.1167 -            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
  4.1168 -    }
  4.1169 -
  4.1170      /* Write-protect the p.t. page in the guest page table. */
  4.1171      if ( unlikely(__put_user(pte, ptep)) )
  4.1172      {
  4.1173 -        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
  4.1174 +        MEM_LOG("ptwr: Could not update pte at %p", ptep);
  4.1175          /*
  4.1176           * Really a bug. We could write this PTE during the initial fault,
  4.1177           * and pagetables can't have changed meantime. XXX Multi-CPU guests?
  4.1178 @@ -2309,13 +2389,7 @@ void ptwr_flush(const int which)
  4.1179          if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
  4.1180          {
  4.1181              if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
  4.1182 -            {
  4.1183 -                if ( unlikely(sl1e != NULL) )
  4.1184 -                    l1pte_propagate_from_guest(
  4.1185 -                        d, &l1_pgentry_val(nl1e), 
  4.1186 -                        &l1_pgentry_val(sl1e[i]));
  4.1187                  put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]);
  4.1188 -            }
  4.1189              continue;
  4.1190          }
  4.1191  
  4.1192 @@ -2334,22 +2408,19 @@ void ptwr_flush(const int which)
  4.1193              domain_crash();
  4.1194          }
  4.1195          
  4.1196 -        if ( unlikely(sl1e != NULL) )
  4.1197 -            l1pte_propagate_from_guest(
  4.1198 -                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
  4.1199 -
  4.1200          if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
  4.1201              put_page_from_l1e(ol1e, d);
  4.1202      }
  4.1203 +
  4.1204      unmap_domain_mem(pl1e);
  4.1205  
  4.1206      /*
  4.1207       * STEP 3. Reattach the L1 p.t. page into the current address space.
  4.1208       */
  4.1209  
  4.1210 -    if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode_enabled(d)) )
  4.1211 +    if ( which == PTWR_PT_ACTIVE )
  4.1212      {
  4.1213 -        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
  4.1214 +        pl2e = &linear_l2_table(ed)[ptwr_info[cpu].ptinfo[which].l2_idx];
  4.1215          *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
  4.1216      }
  4.1217  
  4.1218 @@ -2358,23 +2429,21 @@ void ptwr_flush(const int which)
  4.1219       */
  4.1220  
  4.1221      ptwr_info[cpu].ptinfo[which].l1va = 0;
  4.1222 -
  4.1223 -    if ( unlikely(sl1e != NULL) )
  4.1224 -    {
  4.1225 -        unmap_domain_mem(sl1e);
  4.1226 -        put_shadow_status(d);
  4.1227 -    }
  4.1228  }
  4.1229  
  4.1230  /* Write page fault handler: check if guest is trying to modify a PTE. */
  4.1231  int ptwr_do_page_fault(unsigned long addr)
  4.1232  {
  4.1233 +    struct exec_domain *ed = current;
  4.1234      unsigned long    pte, pfn, l2e;
  4.1235      struct pfn_info *page;
  4.1236      l2_pgentry_t    *pl2e;
  4.1237      int              which, cpu = smp_processor_id();
  4.1238      u32              l2_idx;
  4.1239  
  4.1240 +    // not supported in combination with various shadow modes!
  4.1241 +    ASSERT( !shadow_mode_enabled(ed->domain) );
  4.1242 +    
  4.1243  #ifdef __x86_64__
  4.1244      return 0; /* Writable pagetables need fixing for x86_64. */
  4.1245  #endif
  4.1246 @@ -2383,10 +2452,7 @@ int ptwr_do_page_fault(unsigned long add
  4.1247       * Attempt to read the PTE that maps the VA being accessed. By checking for
  4.1248       * PDE validity in the L2 we avoid many expensive fixups in __get_user().
  4.1249       */
  4.1250 -    if ( shadow_mode_external(current->domain) )
  4.1251 -        BUG(); // can't use linear_l2_table with external tables.
  4.1252 -
  4.1253 -    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
  4.1254 +    if ( !(l2_pgentry_val(linear_l2_table(ed)[addr>>L2_PAGETABLE_SHIFT]) &
  4.1255             _PAGE_PRESENT) ||
  4.1256           __get_user(pte, (unsigned long *)
  4.1257                      &linear_pg_table[l1_linear_offset(addr)]) )
  4.1258 @@ -2414,7 +2480,7 @@ int ptwr_do_page_fault(unsigned long add
  4.1259  
  4.1260      if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
  4.1261      {
  4.1262 -        MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
  4.1263 +        MEM_LOG("PTWR failure! Pagetable maps itself at %p", addr);
  4.1264          domain_crash();
  4.1265      }
  4.1266  
  4.1267 @@ -2422,10 +2488,7 @@ int ptwr_do_page_fault(unsigned long add
  4.1268       * Is the L1 p.t. mapped into the current address space? If so we call it
  4.1269       * an ACTIVE p.t., otherwise it is INACTIVE.
  4.1270       */
  4.1271 -    if ( shadow_mode_external(current->domain) )
  4.1272 -        BUG(); // can't use linear_l2_table with external tables.
  4.1273 -
  4.1274 -    pl2e = &linear_l2_table[l2_idx];
  4.1275 +    pl2e = &linear_l2_table(ed)[l2_idx];
  4.1276      l2e  = l2_pgentry_val(*pl2e);
  4.1277      which = PTWR_PT_INACTIVE;
  4.1278      if ( (l2e >> PAGE_SHIFT) == pfn )
  4.1279 @@ -2461,8 +2524,7 @@ int ptwr_do_page_fault(unsigned long add
  4.1280      ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
  4.1281      
  4.1282      /* For safety, disconnect the L1 p.t. page from current space. */
  4.1283 -    if ( (which == PTWR_PT_ACTIVE) && 
  4.1284 -         likely(!shadow_mode_enabled(current->domain)) )
  4.1285 +    if ( which == PTWR_PT_ACTIVE )
  4.1286      {
  4.1287          *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
  4.1288  #if 1
  4.1289 @@ -2485,7 +2547,7 @@ int ptwr_do_page_fault(unsigned long add
  4.1290      if ( unlikely(__put_user(pte, (unsigned long *)
  4.1291                               &linear_pg_table[addr>>PAGE_SHIFT])) )
  4.1292      {
  4.1293 -        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
  4.1294 +        MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
  4.1295                  &linear_pg_table[addr>>PAGE_SHIFT]);
  4.1296          /* Toss the writable pagetable state and crash. */
  4.1297          unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
  4.1298 @@ -2531,7 +2593,7 @@ void ptwr_status(void)
  4.1299          [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
  4.1300  
  4.1301      if ( __get_user(pte, ptep) ) {
  4.1302 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
  4.1303 +        MEM_LOG("ptwr: Could not read pte at %p", ptep);
  4.1304          domain_crash();
  4.1305      }
  4.1306  
  4.1307 @@ -2547,7 +2609,7 @@ void ptwr_status(void)
  4.1308  
  4.1309      if ( __get_user(pte, (unsigned long *)
  4.1310                      ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
  4.1311 -        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
  4.1312 +        MEM_LOG("ptwr: Could not read pte at %p", (unsigned long *)
  4.1313                  ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
  4.1314          domain_crash();
  4.1315      }
  4.1316 @@ -2555,433 +2617,6 @@ void ptwr_status(void)
  4.1317      page = &frame_table[pfn];
  4.1318  }
  4.1319  
  4.1320 -void audit_domain(struct domain *d)
  4.1321 -{
  4.1322 -    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
  4.1323 -
  4.1324 -    void adjust (struct pfn_info *page, int dir, int adjtype)
  4.1325 -    {
  4.1326 -        int count = page->count_info & PGC_count_mask;
  4.1327 -
  4.1328 -        if ( adjtype )
  4.1329 -        {
  4.1330 -            int tcount = page->u.inuse.type_info & PGT_count_mask;
  4.1331 -            
  4.1332 -            ttot++;
  4.1333 -
  4.1334 -            tcount += dir;
  4.1335 -
  4.1336 -            if ( tcount < 0 )
  4.1337 -            {
  4.1338 -                /* This will only come out once. */
  4.1339 -                printk("Audit %d: type count whent below zero pfn=%x "
  4.1340 -                       "taf=%x otaf=%x\n",
  4.1341 -                       d->id, page-frame_table,
  4.1342 -                       page->u.inuse.type_info,
  4.1343 -                       page->tlbflush_timestamp);
  4.1344 -            }
  4.1345 -            
  4.1346 -            page->u.inuse.type_info =
  4.1347 -                (page->u.inuse.type_info & ~PGT_count_mask) | 
  4.1348 -                (tcount & PGT_count_mask);
  4.1349 -        }
  4.1350 -
  4.1351 -        ctot++;
  4.1352 -        count += dir;
  4.1353 -        if ( count < 0 )
  4.1354 -        {
  4.1355 -            /* This will only come out once. */
  4.1356 -            printk("Audit %d: general count whent below zero pfn=%x "
  4.1357 -                   "taf=%x otaf=%x\n",
  4.1358 -                   d->id, page-frame_table,
  4.1359 -                   page->u.inuse.type_info,
  4.1360 -                   page->tlbflush_timestamp);
  4.1361 -        }
  4.1362 -            
  4.1363 -        page->count_info =
  4.1364 -            (page->count_info & ~PGC_count_mask) | 
  4.1365 -            (count & PGC_count_mask);            
  4.1366 -
  4.1367 -    }
  4.1368 -
  4.1369 -    void scan_for_pfn(struct domain *d, unsigned long xpfn)
  4.1370 -    {
  4.1371 -        unsigned long pfn, *pt;
  4.1372 -        struct list_head *list_ent;
  4.1373 -        struct pfn_info *page;
  4.1374 -        int i;
  4.1375 -
  4.1376 -        list_ent = d->page_list.next;
  4.1377 -        for ( i = 0; (list_ent != &d->page_list); i++ )
  4.1378 -        {
  4.1379 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
  4.1380 -            page = &frame_table[pfn];
  4.1381 -            
  4.1382 -            switch ( page->u.inuse.type_info & PGT_type_mask )
  4.1383 -            {
  4.1384 -            case PGT_l1_page_table:
  4.1385 -            case PGT_l2_page_table:
  4.1386 -                pt = map_domain_mem(pfn<<PAGE_SHIFT);
  4.1387 -                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  4.1388 -                    if ( (pt[i] & _PAGE_PRESENT) &&
  4.1389 -                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
  4.1390 -                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
  4.1391 -                               d->id, i, pfn, page->u.inuse.type_info,
  4.1392 -                               page->count_info);
  4.1393 -                unmap_domain_mem(pt);           
  4.1394 -            }
  4.1395 -
  4.1396 -            list_ent = frame_table[pfn].list.next;
  4.1397 -        }
  4.1398 -
  4.1399 -    }
  4.1400 -
  4.1401 -    void scan_for_pfn_remote(unsigned long xpfn)
  4.1402 -    {
  4.1403 -        struct domain *e;
  4.1404 -        for_each_domain ( e )
  4.1405 -            scan_for_pfn( e, xpfn );            
  4.1406 -    }   
  4.1407 -
  4.1408 -    int i, l1, l2;
  4.1409 -    unsigned long pfn;
  4.1410 -    struct list_head *list_ent;
  4.1411 -    struct pfn_info *page;
  4.1412 -
  4.1413 -    if ( d != current->domain )
  4.1414 -        domain_pause(d);
  4.1415 -    synchronise_pagetables(~0UL);
  4.1416 -
  4.1417 -    printk("pt base=%lx sh_info=%x\n",
  4.1418 -           pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
  4.1419 -           virt_to_page(d->shared_info)-frame_table);
  4.1420 -           
  4.1421 -    spin_lock(&d->page_alloc_lock);
  4.1422 -
  4.1423 -    /* PHASE 0 */
  4.1424 -
  4.1425 -    list_ent = d->page_list.next;
  4.1426 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  4.1427 -    {
  4.1428 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
  4.1429 -        page = &frame_table[pfn];
  4.1430 -
  4.1431 -        if ( page_get_owner(page) != d )
  4.1432 -            BUG();
  4.1433 -
  4.1434 -        if ( (page->u.inuse.type_info & PGT_count_mask) >
  4.1435 -             (page->count_info & PGC_count_mask) )
  4.1436 -            printk("taf > caf %x %x pfn=%lx\n",
  4.1437 -                   page->u.inuse.type_info, page->count_info, pfn );
  4.1438 - 
  4.1439 -#if 0   /* SYSV shared memory pages plus writeable files. */
  4.1440 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
  4.1441 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
  4.1442 -        {
  4.1443 -            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
  4.1444 -                  pfn,
  4.1445 -                  page->u.inuse.type_info,
  4.1446 -                  page->count_info );
  4.1447 -            scan_for_pfn_remote(pfn);
  4.1448 -        }
  4.1449 -#endif
  4.1450 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
  4.1451 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
  4.1452 -        {
  4.1453 -            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
  4.1454 -                  pfn,
  4.1455 -                  page->u.inuse.type_info,
  4.1456 -                  page->count_info );
  4.1457 -        }
  4.1458 -
  4.1459 -        /* Use tlbflush_timestamp to store original type_info. */
  4.1460 -        page->tlbflush_timestamp = page->u.inuse.type_info;
  4.1461 -
  4.1462 -        list_ent = frame_table[pfn].list.next;
  4.1463 -    }
  4.1464 -
  4.1465 -
  4.1466 -    /* PHASE 1 */
  4.1467 -    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
  4.1468 -        adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.guest_table)
  4.1469 -                           >>PAGE_SHIFT], -1, 1);
  4.1470 -
  4.1471 -    list_ent = d->page_list.next;
  4.1472 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  4.1473 -    {
  4.1474 -        unsigned long *pt;
  4.1475 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
  4.1476 -        page = &frame_table[pfn];
  4.1477 -
  4.1478 -        if ( page_get_owner(page) != d )
  4.1479 -            BUG();
  4.1480 -
  4.1481 -        switch ( page->u.inuse.type_info & PGT_type_mask )
  4.1482 -        {
  4.1483 -        case PGT_l2_page_table:
  4.1484 -
  4.1485 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
  4.1486 -                printk("Audit %d: L2 not validated %x\n",
  4.1487 -                       d->id, page->u.inuse.type_info);
  4.1488 -
  4.1489 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
  4.1490 -                printk("Audit %d: L2 not pinned %x\n",
  4.1491 -                       d->id, page->u.inuse.type_info);
  4.1492 -            else
  4.1493 -                adjust( page, -1, 1 );
  4.1494 -           
  4.1495 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  4.1496 -
  4.1497 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  4.1498 -            {
  4.1499 -                if ( pt[i] & _PAGE_PRESENT )
  4.1500 -                {
  4.1501 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  4.1502 -                    struct pfn_info *l1page = &frame_table[l1pfn];
  4.1503 -
  4.1504 -                    if ( page_get_owner(l1page) != d )
  4.1505 -                    {
  4.1506 -                        printk("L2: Skip bizarre page belonging to other "
  4.1507 -                               "dom %p\n", page_get_owner(l1page));
  4.1508 -                        continue;
  4.1509 -                    }
  4.1510 -                    
  4.1511 -                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
  4.1512 -                         PGT_l2_page_table )
  4.1513 -                        printk("Audit %d: [%x] Found %s Linear PT "
  4.1514 -                               "t=%x pfn=%lx\n", d->id, i, 
  4.1515 -                               (l1pfn==pfn) ? "Self" : "Other",
  4.1516 -                               l1page->u.inuse.type_info,
  4.1517 -                               l1pfn);
  4.1518 -                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
  4.1519 -                              PGT_l1_page_table )
  4.1520 -                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
  4.1521 -                               d->id, i,
  4.1522 -                               l1page->u.inuse.type_info,
  4.1523 -                               l1pfn);
  4.1524 -
  4.1525 -                    adjust(l1page, -1, 1);
  4.1526 -                }
  4.1527 -            }
  4.1528 -
  4.1529 -            unmap_domain_mem(pt);
  4.1530 -
  4.1531 -            break;
  4.1532 -
  4.1533 -
  4.1534 -        case PGT_l1_page_table:
  4.1535 -            
  4.1536 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
  4.1537 -                adjust( page, -1, 1 );
  4.1538 -
  4.1539 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
  4.1540 -                printk("Audit %d: L1 not validated %x\n",
  4.1541 -                       d->id, page->u.inuse.type_info);
  4.1542 -#if 0
  4.1543 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
  4.1544 -                printk("Audit %d: L1 not pinned %x\n",
  4.1545 -                       d->id, page->u.inuse.type_info);
  4.1546 -#endif
  4.1547 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  4.1548 -
  4.1549 -            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  4.1550 -            {
  4.1551 -                if ( pt[i] & _PAGE_PRESENT )
  4.1552 -                {
  4.1553 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  4.1554 -                    struct pfn_info *l1page = &frame_table[l1pfn];
  4.1555 -
  4.1556 -                    if ( l1pfn < 0x100 )
  4.1557 -                    {
  4.1558 -                        lowmem_mappings++;
  4.1559 -                        continue;
  4.1560 -                    }
  4.1561 -
  4.1562 -                    if ( l1pfn > max_page )
  4.1563 -                    {
  4.1564 -                        io_mappings++;
  4.1565 -                        continue;
  4.1566 -                    }
  4.1567 -
  4.1568 -                    if ( pt[i] & _PAGE_RW )
  4.1569 -                    {
  4.1570 -
  4.1571 -                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
  4.1572 -                             PGT_l1_page_table ||
  4.1573 -                             (l1page->u.inuse.type_info & PGT_type_mask) ==
  4.1574 -                             PGT_l2_page_table )
  4.1575 -                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
  4.1576 -                                   d->id, i,
  4.1577 -                                   l1page->u.inuse.type_info,
  4.1578 -                                   l1pfn);
  4.1579 -
  4.1580 -                    }
  4.1581 -
  4.1582 -                    if ( page_get_owner(l1page) != d )
  4.1583 -                    {
  4.1584 -                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
  4.1585 -                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
  4.1586 -                               d->id, pfn, i,
  4.1587 -                               page_get_owner(l1page),
  4.1588 -                               l1pfn,
  4.1589 -                               l1page->count_info,
  4.1590 -                               l1page->u.inuse.type_info,
  4.1591 -                               machine_to_phys_mapping[l1pfn]);    
  4.1592 -                        continue;
  4.1593 -                    }
  4.1594 -
  4.1595 -                    adjust(l1page, -1, 0);
  4.1596 -                }
  4.1597 -            }
  4.1598 -
  4.1599 -            unmap_domain_mem(pt);
  4.1600 -
  4.1601 -            break;
  4.1602 -        }       
  4.1603 -
  4.1604 -        list_ent = frame_table[pfn].list.next;
  4.1605 -    }
  4.1606 -
  4.1607 -    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
  4.1608 -        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
  4.1609 -               d->id, lowmem_mappings, io_mappings);
  4.1610 -
  4.1611 -    /* PHASE 2 */
  4.1612 -
  4.1613 -    ctot = ttot = 0;
  4.1614 -    list_ent = d->page_list.next;
  4.1615 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  4.1616 -    {
  4.1617 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
  4.1618 -        page = &frame_table[pfn];
  4.1619 -
  4.1620 -        switch ( page->u.inuse.type_info & PGT_type_mask)
  4.1621 -        {
  4.1622 -        case PGT_l1_page_table:
  4.1623 -        case PGT_l2_page_table:
  4.1624 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
  4.1625 -            {
  4.1626 -                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
  4.1627 -                       d->id, page->u.inuse.type_info, 
  4.1628 -                       page->tlbflush_timestamp,
  4.1629 -                       page->count_info, pfn );
  4.1630 -                scan_for_pfn_remote(pfn);
  4.1631 -            }
  4.1632 -        default:
  4.1633 -            if ( (page->count_info & PGC_count_mask) != 1 )
  4.1634 -            {
  4.1635 -                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
  4.1636 -                       d->id, 
  4.1637 -                       page->count_info,
  4.1638 -                       page->u.inuse.type_info, 
  4.1639 -                       page->tlbflush_timestamp, pfn );
  4.1640 -                scan_for_pfn_remote(pfn);
  4.1641 -            }
  4.1642 -            break;
  4.1643 -        }
  4.1644 -
  4.1645 -        list_ent = frame_table[pfn].list.next;
  4.1646 -    }
  4.1647 -
  4.1648 -    /* PHASE 3 */
  4.1649 -    list_ent = d->page_list.next;
  4.1650 -    l1 = l2 = 0;
  4.1651 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  4.1652 -    {
  4.1653 -        unsigned long *pt;
  4.1654 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
  4.1655 -        page = &frame_table[pfn];
  4.1656 -
  4.1657 -        switch ( page->u.inuse.type_info & PGT_type_mask )
  4.1658 -        {
  4.1659 -        case PGT_l2_page_table:
  4.1660 -	    l2++;
  4.1661 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
  4.1662 -                adjust( page, 1, 1 );          
  4.1663 -
  4.1664 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  4.1665 -
  4.1666 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  4.1667 -            {
  4.1668 -                if ( pt[i] & _PAGE_PRESENT )
  4.1669 -                {
  4.1670 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  4.1671 -                    struct pfn_info *l1page;
  4.1672 -
  4.1673 -                    if (l1pfn>max_page)
  4.1674 -                        continue;
  4.1675 -
  4.1676 -                    l1page = &frame_table[l1pfn];
  4.1677 -
  4.1678 -                    if ( page_get_owner(l1page) == d )
  4.1679 -                        adjust(l1page, 1, 1);
  4.1680 -                }
  4.1681 -            }
  4.1682 -
  4.1683 -            unmap_domain_mem(pt);
  4.1684 -            break;
  4.1685 -
  4.1686 -        case PGT_l1_page_table:
  4.1687 -	    l1++;
  4.1688 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
  4.1689 -                adjust( page, 1, 1 );
  4.1690 -
  4.1691 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  4.1692 -
  4.1693 -            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  4.1694 -            {
  4.1695 -                if ( pt[i] & _PAGE_PRESENT )
  4.1696 -                {
  4.1697 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  4.1698 -                    struct pfn_info *l1page;
  4.1699 -
  4.1700 -                    if (l1pfn>max_page)
  4.1701 -                        continue;
  4.1702 -
  4.1703 -                    l1page = &frame_table[l1pfn];
  4.1704 -
  4.1705 -                    if ( (page_get_owner(l1page) != d) ||
  4.1706 -                         (l1pfn < 0x100) || (l1pfn > max_page) )
  4.1707 -                        continue;
  4.1708 -
  4.1709 -                    adjust(l1page, 1, 0);
  4.1710 -                }
  4.1711 -            }
  4.1712 -
  4.1713 -            unmap_domain_mem(pt);
  4.1714 -            break;
  4.1715 -        }
  4.1716 -
  4.1717 -
  4.1718 -        page->tlbflush_timestamp = 0;
  4.1719 -
  4.1720 -        list_ent = frame_table[pfn].list.next;
  4.1721 -    }
  4.1722 -
  4.1723 -    spin_unlock(&d->page_alloc_lock);
  4.1724 -
  4.1725 -    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
  4.1726 -        adjust(&frame_table[pagetable_val(
  4.1727 -            d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
  4.1728 -
  4.1729 -    printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot );
  4.1730 -
  4.1731 -    if ( d != current->domain )
  4.1732 -        domain_unpause(d);
  4.1733 -}
  4.1734 -
  4.1735 -void audit_domains(void)
  4.1736 -{
  4.1737 -    struct domain *d;
  4.1738 -    for_each_domain ( d )
  4.1739 -        audit_domain(d);
  4.1740 -}
  4.1741 -
  4.1742 -void audit_domains_key(unsigned char key)
  4.1743 -{
  4.1744 -    audit_domains();
  4.1745 -}
  4.1746 -
  4.1747  #endif /* NDEBUG */
  4.1748  
  4.1749  /*
     5.1 --- a/xen/arch/x86/shadow.c	Mon Mar 14 18:44:10 2005 +0000
     5.2 +++ b/xen/arch/x86/shadow.c	Mon Mar 14 22:07:47 2005 +0000
     5.3 @@ -1,3 +1,23 @@
     5.4 +/******************************************************************************
     5.5 + * arch/x86/shadow.c
     5.6 + * 
     5.7 + * Copyright (c) 2005 Michael A Fetterman
     5.8 + * 
     5.9 + * This program is free software; you can redistribute it and/or modify
    5.10 + * it under the terms of the GNU General Public License as published by
    5.11 + * the Free Software Foundation; either version 2 of the License, or
    5.12 + * (at your option) any later version.
    5.13 + * 
    5.14 + * This program is distributed in the hope that it will be useful,
    5.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    5.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    5.17 + * GNU General Public License for more details.
    5.18 + * 
    5.19 + * You should have received a copy of the GNU General Public License
    5.20 + * along with this program; if not, write to the Free Software
    5.21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    5.22 + */
    5.23 +
    5.24  
    5.25  #include <xen/config.h>
    5.26  #include <xen/types.h>
    5.27 @@ -8,6 +28,10 @@
    5.28  #include <xen/event.h>
    5.29  #include <xen/trace.h>
    5.30  
    5.31 +static void shadow_free_snapshot(struct domain *d,
    5.32 +                                 struct out_of_sync_entry *entry);
    5.33 +static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
    5.34 +
    5.35  /********
    5.36  
    5.37  There's a per-domain shadow table spin lock which works fine for SMP
    5.38 @@ -20,34 +44,401 @@ hypercall lock anyhow (at least initiall
    5.39  
    5.40  ********/
    5.41  
    5.42 -static inline void free_shadow_page(
    5.43 -    struct domain *d, struct pfn_info *page)
    5.44 +static inline int
    5.45 +shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
    5.46 +               unsigned long new_type)
    5.47  {
    5.48 -    d->arch.shadow_page_count--;
    5.49 +    unsigned long min_type, max_type;
    5.50 +    struct pfn_info *page = pfn_to_page(gmfn);
    5.51 +    int pinned = 0, okay = 1;
    5.52 +
    5.53 +    if ( page_out_of_sync(page) )
    5.54 +    {
    5.55 +        // Don't know how long ago this snapshot was taken.
    5.56 +        // Can't trust it to be recent enough.
    5.57 +        //
    5.58 +        __shadow_sync_mfn(d, gmfn);
    5.59 +    }
    5.60 +
    5.61 +    if ( unlikely(mfn_is_page_table(gmfn)) )
    5.62 +    {
    5.63 +        min_type = shadow_max_pgtable_type(d, gpfn) + PGT_l1_shadow;
    5.64 +        max_type = new_type;
    5.65 +    }
    5.66 +    else
    5.67 +    {
    5.68 +        min_type = PGT_l1_shadow;
    5.69 +        max_type = PGT_l1_shadow;
    5.70 +    }
    5.71 +    FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p min=%p max=%p\n",
     5.72 +            gpfn, gmfn, new_type, min_type, max_type);
    5.73 +
    5.74 +    if ( min_type <= max_type )
    5.75 +        shadow_remove_all_write_access(d, min_type, max_type, gpfn);
    5.76 +
     5.77 +    // To convert this page for use as a page table, the writable count
     5.78 +    // should now be zero.  Test this by grabbing the page as a page table,
     5.79 +    // and then immediately releasing it.  This will also deal with any
    5.80 +    // necessary TLB flushing issues for us.
    5.81 +    //
    5.82 +    // The cruft here about pinning doesn't really work right.  This
    5.83 +    // needs rethinking/rewriting...  Need to gracefully deal with the
    5.84 +    // TLB flushes required when promoting a writable page, and also deal
    5.85 +    // with any outstanding (external) writable refs to this page (by
    5.86 +    // refusing to promote it).  The pinning headache complicates this
     5.87 +    // code -- it would all get much simpler if we stopped using
    5.88 +    // shadow_lock() and move the shadow code to BIGLOCK().
    5.89 +    //
    5.90 +    if ( unlikely(!get_page(page, d)) )
    5.91 +        BUG();
    5.92 +    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
    5.93 +    {
    5.94 +        pinned = 1;
    5.95 +        put_page_and_type(page);
    5.96 +    }
    5.97 +    if ( get_page_type(page, PGT_base_page_table) )
    5.98 +    {
    5.99 +        put_page_type(page);
   5.100 +        set_bit(_PGC_page_table, &frame_table[gmfn].count_info);
   5.101 +    }
   5.102 +    else
   5.103 +    {
   5.104 +        printk("shadow_promote: get_page_type failed "
   5.105 +               "dom%d gpfn=%p gmfn=%p t=%x\n",
   5.106 +               d->id, gpfn, gmfn, new_type);
   5.107 +        okay = 0;
   5.108 +    }
   5.109 +
   5.110 +    // Now put the type back to writable...
   5.111 +    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
   5.112 +        BUG();
   5.113 +    if ( unlikely(pinned) )
   5.114 +    {
   5.115 +        if ( unlikely(test_and_set_bit(_PGT_pinned,
   5.116 +                                       &page->u.inuse.type_info)) )
   5.117 +            BUG(); // hmm... someone pinned this again?
   5.118 +    }
   5.119 +    else
   5.120 +        put_page_and_type(page);
   5.121 +
   5.122 +    return okay;
   5.123 +}
   5.124 +
   5.125 +static inline void
   5.126 +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
   5.127 +{
   5.128 +    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
   5.129 +
   5.130 +    if ( shadow_max_pgtable_type(d, gpfn) == PGT_none )
   5.131 +    {
   5.132 +        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
   5.133  
   5.134 -    switch ( page->u.inuse.type_info & PGT_type_mask )
   5.135 +        if ( page_out_of_sync(pfn_to_page(gmfn)) )
   5.136 +        {
   5.137 +            remove_out_of_sync_entries(d, gmfn);
   5.138 +        }
   5.139 +    }
   5.140 +}
   5.141 +
   5.142 +/*
   5.143 + * Things in shadow mode that collect get_page() refs to the domain's
   5.144 + * pages are:
   5.145 + * - PGC_allocated takes a gen count, just like normal.
   5.146 + * - A writable page can be pinned (paravirtualized guests may consider
   5.147 + *   these pages to be L1s or L2s, and don't know the difference).
   5.148 + *   Pinning a page takes a gen count (but, for domains in shadow mode,
   5.149 + *   it *doesn't* take a type count)
   5.150 + * - CR3 grabs a ref to whatever it points at, just like normal.
    5.151 + * - Shadow mode grabs an initial gen count for itself, as a placeholder
   5.152 + *   for whatever references will exist.
   5.153 + * - Shadow PTEs that point to a page take a gen count, just like regular
   5.154 + *   PTEs.  However, they don't get a type count, as get_page_type() is
   5.155 + *   hardwired to keep writable pages' counts at 1 for domains in shadow
   5.156 + *   mode.
   5.157 + * - Whenever we shadow a page, the entry in the shadow hash grabs a
   5.158 + *   general ref to the page.
   5.159 + * - Whenever a page goes out of sync, the out of sync entry grabs a
   5.160 + *   general ref to the page.
   5.161 + */
   5.162 +/*
   5.163 + * pfn_info fields for pages allocated as shadow pages:
   5.164 + *
   5.165 + * All 32 bits of count_info are a simple count of refs to this shadow
   5.166 + * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
    5.167 + * c) the pin (if it's a pinned shadow root pgtable), d) outstanding out-of-sync
   5.168 + * references.
   5.169 + *
    5.170 + * u.inuse._domain is left NULL, to prevent accidentally allowing some random
   5.171 + * domain from gaining permissions to map this page.
   5.172 + *
   5.173 + * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
   5.174 + * shadowed.
   5.175 + * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
   5.176 + * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
    5.177 + * currently exists because this is a shadow of a root page, and we
    5.178 + * don't want to let it disappear just because no CR3 is currently pointing
   5.179 + * at it.
   5.180 + *
   5.181 + * tlbflush_timestamp holds a pickled pointer to the domain.
   5.182 + */
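
The pickled domain pointer mentioned above deserves a concrete illustration.  A
minimal sketch, assuming pickle_domptr()/unpickle_domptr() simply cast between
the 32-bit tlbflush_timestamp field and a struct domain pointer (the real
definitions live in shadow.h and are not part of this hunk):

    /* Sketch only: stash the owning domain in the spare 32-bit field... */
    static inline u32 pickle_domptr(struct domain *d)
    {
        return (u32)(unsigned long)d;
    }

    /* ...and recover it later, e.g. in free_shadow_page(). */
    static inline struct domain *unpickle_domptr(u32 p)
    {
        return (struct domain *)(unsigned long)p;
    }

alloc_shadow_page() below stores the pickled pointer in
page->tlbflush_timestamp, and free_shadow_page() uses it to find the owning
domain without touching u.inuse._domain, which is deliberately left NULL.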
   5.183 +
   5.184 +static inline unsigned long
   5.185 +alloc_shadow_page(struct domain *d,
   5.186 +                  unsigned long gpfn, unsigned long gmfn,
   5.187 +                  u32 psh_type)
   5.188 +{
   5.189 +    struct pfn_info *page;
   5.190 +    unsigned long smfn;
   5.191 +    int pin = 0;
   5.192 +
   5.193 +    if ( (psh_type != PGT_snapshot) &&
   5.194 +         !shadow_promote(d, gpfn, gmfn, psh_type) )
   5.195      {
   5.196 -    case PGT_l1_page_table:
   5.197 -        perfc_decr(shadow_l1_pages);
   5.198 +        FSH_LOG("promotion of pfn=%p mfn=%p failed!  external gnttab refs?\n",
   5.199 +                gpfn, gmfn);
   5.200 +        return 0;
   5.201 +    }
   5.202 +
   5.203 +    page = alloc_domheap_page(NULL);
   5.204 +    if ( unlikely(page == NULL) )
   5.205 +    {
   5.206 +        printk("Couldn't alloc shadow page! dom%d count=%d\n",
   5.207 +               d->id, d->arch.shadow_page_count);
   5.208 +        printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
   5.209 +               perfc_value(shadow_l1_pages), 
   5.210 +               perfc_value(shadow_l2_pages),
   5.211 +               perfc_value(hl2_table_pages),
   5.212 +               perfc_value(snapshot_pages));
   5.213 +        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
   5.214 +    }
   5.215 +
   5.216 +    smfn = page_to_pfn(page);
   5.217 +
   5.218 +    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
   5.219 +    page->u.inuse.type_info = psh_type | gmfn;
   5.220 +    page->count_info = 0;
   5.221 +    page->tlbflush_timestamp = pickle_domptr(d);
   5.222 +
   5.223 +    switch ( psh_type )
   5.224 +    {
   5.225 +    case PGT_l1_shadow:
   5.226 +        perfc_incr(shadow_l1_pages);
   5.227 +        d->arch.shadow_page_count++;
   5.228          break;
   5.229  
   5.230 -    case PGT_l2_page_table:
   5.231 -        perfc_decr(shadow_l2_pages);
   5.232 +    case PGT_l2_shadow:
   5.233 +        perfc_incr(shadow_l2_pages);
   5.234 +        d->arch.shadow_page_count++;
   5.235 +        if ( PGT_l2_page_table == PGT_root_page_table )
   5.236 +            pin = 1;
   5.237 +
   5.238 +        break;
   5.239 +
   5.240 +    case PGT_hl2_shadow:
   5.241 +        perfc_incr(hl2_table_pages);
   5.242 +        d->arch.hl2_page_count++;
   5.243 +
   5.244 +        // treat an hl2 as an L1 for purposes of promotion,
   5.245 +        // and as an L2 for purposes of pinning.
   5.246 +        //
   5.247 +        if ( PGT_l2_page_table == PGT_root_page_table )
   5.248 +            pin = 1;
   5.249 +
   5.250 +        break;
   5.251 +
   5.252 +    case PGT_snapshot:
   5.253 +        perfc_incr(snapshot_pages);
   5.254 +        d->arch.snapshot_page_count++;
   5.255          break;
   5.256  
   5.257      default:
   5.258 -        printk("Free shadow weird page type pfn=%08x type=%08x\n",
   5.259 -               frame_table-page, page->u.inuse.type_info);
   5.260 +        printk("Alloc shadow weird page type type=%08x\n", psh_type);
   5.261 +        BUG();
   5.262          break;
   5.263      }
   5.264  
   5.265 +    set_shadow_status(d, gpfn, smfn, psh_type);
   5.266 +
   5.267 +    if ( pin )
   5.268 +        shadow_pin(smfn);
   5.269 +
   5.270 +    return smfn;
   5.271 +}
   5.272 +
   5.273 +static void inline
   5.274 +free_shadow_l1_table(struct domain *d, unsigned long smfn)
   5.275 +{
   5.276 +    l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
   5.277 +    int i;
   5.278 +
   5.279 +    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   5.280 +        put_page_from_l1e(pl1e[i], d);
   5.281 +
   5.282 +    unmap_domain_mem(pl1e);
   5.283 +}
   5.284 +
   5.285 +static void inline
   5.286 +free_shadow_hl2_table(struct domain *d, unsigned long smfn)
   5.287 +{
   5.288 +    printk("free_shadow_hl2_table(smfn=%p)\n", smfn);
   5.289 +
   5.290 +    l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
   5.291 +    int i, limit;
   5.292 +
   5.293 +    if ( shadow_mode_external(d) )
   5.294 +        limit = L2_PAGETABLE_ENTRIES;
   5.295 +    else
   5.296 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   5.297 +
   5.298 +    for ( i = 0; i < limit; i++ )
   5.299 +        put_page_from_l1e(pl1e[i], d);
   5.300 +
   5.301 +    unmap_domain_mem(pl1e);
   5.302 +}
   5.303 +
   5.304 +static void inline
   5.305 +free_shadow_l2_table(struct domain *d, unsigned long smfn)
   5.306 +{
   5.307 +    printk("free_shadow_l2_table(smfn=%p)\n", smfn);
   5.308 +
   5.309 +    unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
   5.310 +    int i, external = shadow_mode_external(d);
   5.311 +
   5.312 +    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
   5.313 +        if ( external || is_guest_l2_slot(i) )
   5.314 +            if ( pl2e[i] & _PAGE_PRESENT )
   5.315 +                put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
   5.316 +
   5.317 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
   5.318 +         shadow_mode_translate(d) &&
   5.319 +         !shadow_mode_external(d) )
   5.320 +    {
   5.321 +        // free the ref to the hl2
   5.322 +        //
   5.323 +        put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
   5.324 +                       >> PAGE_SHIFT);
   5.325 +    }
   5.326 +
   5.327 +    unmap_domain_mem(pl2e);
   5.328 +}
   5.329 +
   5.330 +void free_shadow_page(unsigned long smfn)
   5.331 +{
   5.332 +    struct pfn_info *page = &frame_table[smfn];
   5.333 +    struct domain *d = unpickle_domptr(page->tlbflush_timestamp);
   5.334 +    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
   5.335 +    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
   5.336 +    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
   5.337 +
   5.338 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
   5.339 +
   5.340 +    delete_shadow_status(d, gpfn, type);
   5.341 +
   5.342 +    switch ( type )
   5.343 +    {
   5.344 +    case PGT_l1_shadow:
   5.345 +        perfc_decr(shadow_l1_pages);
   5.346 +        shadow_demote(d, gpfn, gmfn);
   5.347 +        free_shadow_l1_table(d, smfn);
   5.348 +        break;
   5.349 +
   5.350 +    case PGT_l2_shadow:
   5.351 +        perfc_decr(shadow_l2_pages);
   5.352 +        shadow_demote(d, gpfn, gmfn);
   5.353 +        free_shadow_l2_table(d, smfn);
   5.354 +        break;
   5.355 +
   5.356 +    case PGT_hl2_shadow:
   5.357 +        perfc_decr(hl2_table_pages);
   5.358 +        shadow_demote(d, gpfn, gmfn);
   5.359 +        free_shadow_hl2_table(d, smfn);
   5.360 +        break;
   5.361 +
   5.362 +    case PGT_snapshot:
   5.363 +        perfc_decr(snapshot_pages);
   5.364 +        break;
   5.365 +
   5.366 +    default:
   5.367 +        printk("Free shadow weird page type mfn=%08x type=%08x\n",
   5.368 +               page-frame_table, page->u.inuse.type_info);
   5.369 +        break;
   5.370 +    }
   5.371 +
   5.372 +    d->arch.shadow_page_count--;
   5.373 +
   5.374 +    // No TLB flushes are needed the next time this page gets allocated.
   5.375 +    //
   5.376 +    page->tlbflush_timestamp = 0;
   5.377 +    page->u.free.cpu_mask = 0;
   5.378 +
   5.379      free_domheap_page(page);
   5.380  }
   5.381  
   5.382 -void free_shadow_state(struct domain *d)
   5.383 +static void inline
   5.384 +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
   5.385 +{
   5.386 +    struct pfn_info *page;
   5.387 +
   5.388 +    page = &frame_table[entry->gmfn];
   5.389 +        
   5.390 +    // Decrement ref count of guest & shadow pages
   5.391 +    //
   5.392 +    put_page(page);
   5.393 +
   5.394 +    // Only use entries that have low bits clear...
   5.395 +    //
   5.396 +    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
   5.397 +        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
   5.398 +
   5.399 +    // Free the snapshot
   5.400 +    //
   5.401 +    shadow_free_snapshot(d, entry);
   5.402 +}
   5.403 +
   5.404 +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
   5.405 +{
   5.406 +    struct out_of_sync_entry *entry = d->arch.out_of_sync;
   5.407 +    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
   5.408 +
   5.409 +    while ( entry )
   5.410 +    {
   5.411 +        if ( entry->gmfn == gmfn )
   5.412 +        {
   5.413 +            release_out_of_sync_entry(d, entry);
   5.414 +            *prev = entry = entry->next;
   5.415 +            continue;
   5.416 +        }
   5.417 +        prev = &entry->next;
   5.418 +        entry = entry->next;
   5.419 +    }
   5.420 +}
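
remove_out_of_sync_entries() uses the pointer-to-pointer idiom so that
unlinking the head of the list needs no special case.  A standalone sketch of
the same pattern, with a hypothetical node type rather than the real
out_of_sync_entry:

    struct node { unsigned long key; struct node *next; };

    static void remove_matching(struct node **head, unsigned long key)
    {
        struct node **prev = head;
        struct node  *cur  = *head;

        while ( cur )
        {
            if ( cur->key == key )
            {
                *prev = cur = cur->next;   /* splice out; prev stays where it is */
                continue;
            }
            prev = &cur->next;
            cur  = cur->next;
        }
    }

The real loop additionally calls release_out_of_sync_entry() on each match to
drop the guest, shadow, and snapshot references before splicing the entry out.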
   5.421 +
   5.422 +static void free_out_of_sync_state(struct domain *d)
   5.423 +{
   5.424 +    struct out_of_sync_entry *entry;
   5.425 +    struct out_of_sync_entry **tail = NULL;
   5.426 +
   5.427 +    // Add the list of out-of-sync entries to the free list of entries.
   5.428 +    // Not the smartest code.  But it works.
   5.429 +    //
   5.430 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
   5.431 +    {
   5.432 +        release_out_of_sync_entry(d, entry);
   5.433 +        tail = &entry->next;
   5.434 +    }
   5.435 +    if ( tail )
   5.436 +    {
   5.437 +        *tail = d->arch.out_of_sync_free;
   5.438 +        d->arch.out_of_sync_free = d->arch.out_of_sync;
   5.439 +        d->arch.out_of_sync = NULL;
   5.440 +    }
   5.441 +}
   5.442 +
   5.443 +static void free_shadow_pages(struct domain *d)
   5.444  {
   5.445      int                   i, free = 0;
   5.446      struct shadow_status *x, *n;
   5.447 +    struct exec_domain   *e;
   5.448   
   5.449      /*
   5.450       * WARNING! The shadow page table must not currently be in use!
   5.451 @@ -58,21 +449,37 @@ void free_shadow_state(struct domain *d)
   5.452  
   5.453      if( !d->arch.shadow_ht ) return;
   5.454  
   5.455 -    /* Free each hash chain in turn. */
   5.456 +    // first, remove any outstanding refs from out_of_sync entries...
   5.457 +    //
   5.458 +    free_out_of_sync_state(d);
   5.459 +
   5.460 +    // second, remove any outstanding refs from ed->arch.shadow_table...
   5.461 +    //
   5.462 +    for_each_exec_domain(d, e)
   5.463 +    {
   5.464 +        if ( pagetable_val(e->arch.shadow_table) )
   5.465 +        {
   5.466 +            put_shadow_ref(pagetable_val(e->arch.shadow_table) >> PAGE_SHIFT);
   5.467 +            e->arch.shadow_table = mk_pagetable(0);
   5.468 +        }
   5.469 +    }
   5.470 +
   5.471 +    // Now, the only refs to shadow pages that are left are from the shadow
   5.472 +    // pages themselves.  We can just free them.
   5.473 +    //
   5.474      for ( i = 0; i < shadow_ht_buckets; i++ )
   5.475      {
   5.476          /* Skip empty buckets. */
   5.477          x = &d->arch.shadow_ht[i];
   5.478 -        if ( x->pfn == 0 )
   5.479 +        if ( x->gpfn_and_flags == 0 )
   5.480              continue;
   5.481  
   5.482          /* Free the head page. */
   5.483 -        free_shadow_page(
   5.484 -            d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
   5.485 +        free_shadow_page(x->smfn);
   5.486  
   5.487          /* Reinitialise the head node. */
   5.488 -        x->pfn            = 0;
   5.489 -        x->smfn_and_flags = 0;
   5.490 +        x->gpfn_and_flags = 0;
   5.491 +        x->smfn           = 0;
   5.492          n                 = x->next;
   5.493          x->next           = NULL;
   5.494  
   5.495 @@ -82,16 +489,15 @@ void free_shadow_state(struct domain *d)
   5.496          for ( x = n; x != NULL; x = n )
   5.497          { 
   5.498              /* Free the shadow page. */
   5.499 -            free_shadow_page(
   5.500 -                d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
   5.501 +            free_shadow_page(x->smfn);
   5.502  
   5.503              /* Re-initialise the chain node. */
   5.504 -            x->pfn            = 0;
   5.505 -            x->smfn_and_flags = 0;
   5.506 +            x->gpfn_and_flags = 0;
   5.507 +            x->smfn           = 0;
   5.508  
   5.509              /* Add to the free list. */
   5.510 -            n                 = x->next;
   5.511 -            x->next           = d->arch.shadow_ht_free;
   5.512 +            n       = x->next;
   5.513 +            x->next = d->arch.shadow_ht_free;
   5.514              d->arch.shadow_ht_free = x;
   5.515  
   5.516              free++;
   5.517 @@ -103,80 +509,140 @@ void free_shadow_state(struct domain *d)
   5.518      SH_LOG("Free shadow table. Freed=%d.", free);
   5.519  }
   5.520  
   5.521 -static inline int clear_shadow_page(
   5.522 -    struct domain *d, struct shadow_status *x)
   5.523 -{
   5.524 -    unsigned long   *p;
   5.525 -    int              restart = 0;
   5.526 -    struct pfn_info *spage = &frame_table[x->smfn_and_flags & PSH_pfn_mask];
   5.527 -
   5.528 -    // We don't clear hl2_table's here.  At least not yet.
   5.529 -    if ( x->pfn & PSH_hl2 )
   5.530 -        return 0;
   5.531 -
   5.532 -    switch ( spage->u.inuse.type_info & PGT_type_mask )
   5.533 -    {
   5.534 -        /* We clear L2 pages by zeroing the guest entries. */
   5.535 -    case PGT_l2_page_table:
   5.536 -        p = map_domain_mem((spage - frame_table) << PAGE_SHIFT);
   5.537 -        if ( shadow_mode_external(d) )
   5.538 -            memset(p, 0, L2_PAGETABLE_ENTRIES * sizeof(*p));
   5.539 -        else 
   5.540 -            memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
   5.541 -        unmap_domain_mem(p);
   5.542 -        break;
   5.543 -
   5.544 -        /* We clear L1 pages by freeing them: no benefit from zeroing them. */
   5.545 -    case PGT_l1_page_table:
   5.546 -        delete_shadow_status(d, x->pfn);
   5.547 -        free_shadow_page(d, spage);
   5.548 -        restart = 1; /* We need to go to start of list again. */
   5.549 -        break;
   5.550 -    }
   5.551 -
   5.552 -    return restart;
   5.553 -}
   5.554 -
   5.555 -static void clear_shadow_state(struct domain *d)
   5.556 -{
   5.557 -    int                   i;
   5.558 -    struct shadow_status *x;
   5.559 - 
   5.560 -    shadow_audit(d, 1);
   5.561 -
   5.562 -    for ( i = 0; i < shadow_ht_buckets; i++ )
   5.563 -    {
   5.564 -    retry:
   5.565 -        /* Skip empty buckets. */
   5.566 -        x = &d->arch.shadow_ht[i];
   5.567 -        if ( x->pfn == 0 )
   5.568 -            continue;
   5.569 -
   5.570 -        if ( clear_shadow_page(d, x) )
   5.571 -            goto retry;
   5.572 -
   5.573 -        for ( x = x->next; x != NULL; x = x->next )
   5.574 -            if ( clear_shadow_page(d, x) )
   5.575 -                goto retry;
   5.576 -
   5.577 -        shadow_audit(d, 0);
   5.578 -    }
   5.579 -
   5.580 -    SH_VLOG("Scan shadow table. l1=%d l2=%d",
   5.581 -            perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
   5.582 -}
   5.583 -
   5.584 -
   5.585  void shadow_mode_init(void)
   5.586  {
   5.587  }
   5.588  
   5.589 +static void alloc_monitor_pagetable(struct exec_domain *ed)
   5.590 +{
   5.591 +    unsigned long mmfn;
   5.592 +    l2_pgentry_t *mpl2e;
   5.593 +    struct pfn_info *mmfn_info;
   5.594 +    struct domain *d = ed->domain;
   5.595 +
   5.596 +    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
   5.597 +
   5.598 +    mmfn_info = alloc_domheap_page(NULL);
   5.599 +    ASSERT( mmfn_info ); 
   5.600 +
   5.601 +    mmfn = (unsigned long) (mmfn_info - frame_table);
   5.602 +    mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
   5.603 +    memset(mpl2e, 0, PAGE_SIZE);
   5.604 +
   5.605 +    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   5.606 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   5.607 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   5.608 +
   5.609 +    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
   5.610 +        mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
   5.611 +                      | __PAGE_HYPERVISOR);
   5.612 +
   5.613 +    // map the phys_to_machine map into the Read-Only MPT space for this domain
   5.614 +    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
   5.615 +        mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
   5.616 +
   5.617 +    ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
   5.618 +    ed->arch.monitor_vtable = mpl2e;
   5.619 +}
   5.620 +
   5.621 +/*
   5.622 + * Free the pages for monitor_table and hl2_table
   5.623 + */
   5.624 +void free_monitor_pagetable(struct exec_domain *ed)
   5.625 +{
   5.626 +    l2_pgentry_t *mpl2e, hl2e;
   5.627 +    unsigned long mfn;
   5.628 +
   5.629 +    ASSERT( pagetable_val(ed->arch.monitor_table) );
   5.630 +    ASSERT( shadow_mode_external(ed->domain) );
   5.631 +    
   5.632 +    mpl2e = ed->arch.monitor_vtable;
   5.633 +
   5.634 +    /*
   5.635 +     * First get the mfn for hl2_table by looking at monitor_table
   5.636 +     */
   5.637 +    hl2e = mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT];
   5.638 +    ASSERT(l2_pgentry_val(hl2e) & _PAGE_PRESENT);
   5.639 +    mfn = l2_pgentry_val(hl2e) >> PAGE_SHIFT;
   5.640 +    ASSERT(mfn);
   5.641 +
   5.642 +    put_shadow_ref(mfn);
   5.643 +    unmap_domain_mem(mpl2e);
   5.644 +
   5.645 +    /*
   5.646 +     * Then free monitor_table.
   5.647 +     */
   5.648 +    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
   5.649 +    free_domheap_page(&frame_table[mfn]);
   5.650 +
   5.651 +    ed->arch.monitor_table = mk_pagetable(0);
   5.652 +    ed->arch.monitor_vtable = 0;
   5.653 +}
   5.654  
   5.655  int __shadow_mode_enable(struct domain *d, unsigned int mode)
   5.656  {
   5.657 -    d->arch.shadow_mode = mode;
   5.658 +    struct exec_domain *ed;
   5.659 +
   5.660 +    for_each_exec_domain(d, ed)
   5.661 +    {
   5.662 +        invalidate_shadow_ldt(ed);
   5.663 +
   5.664 +        // We need to set these up for __update_pagetables().
   5.665 +        // See the comment there.
   5.666 +
   5.667 +        /*
   5.668 +         * arch.guest_vtable
   5.669 +         */
   5.670 +        if ( ed->arch.guest_vtable &&
   5.671 +             (ed->arch.guest_vtable != __linear_l2_table) )
   5.672 +        {
   5.673 +            unmap_domain_mem(ed->arch.guest_vtable);
   5.674 +        }
   5.675 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   5.676 +            ed->arch.guest_vtable = __linear_l2_table;
   5.677 +        else
   5.678 +            ed->arch.guest_vtable = NULL;
   5.679  
   5.680 -    if (!d->arch.shadow_ht)
   5.681 +        /*
   5.682 +         * arch.shadow_vtable
   5.683 +         */
   5.684 +        if ( ed->arch.shadow_vtable &&
   5.685 +             (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
   5.686 +        {
   5.687 +            unmap_domain_mem(ed->arch.shadow_vtable);
   5.688 +        }
   5.689 +        if ( !(mode & SHM_external) )
   5.690 +            ed->arch.shadow_vtable = __shadow_linear_l2_table;
   5.691 +        else
   5.692 +            ed->arch.shadow_vtable = NULL;
   5.693 +
   5.694 +        /*
   5.695 +         * arch.hl2_vtable
   5.696 +         */
   5.697 +        if ( ed->arch.hl2_vtable &&
   5.698 +             (ed->arch.hl2_vtable != __linear_hl2_table) )
   5.699 +        {
   5.700 +            unmap_domain_mem(ed->arch.hl2_vtable);
   5.701 +        }
   5.702 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   5.703 +            ed->arch.hl2_vtable = __linear_hl2_table;
   5.704 +        else
   5.705 +            ed->arch.hl2_vtable = NULL;
   5.706 +
   5.707 +        /*
   5.708 +         * arch.monitor_table & arch.monitor_vtable
   5.709 +         */
   5.710 +        if ( ed->arch.monitor_vtable )
   5.711 +        {
   5.712 +            free_monitor_pagetable(ed);
   5.713 +        }
   5.714 +        if ( mode & SHM_external )
   5.715 +        {
   5.716 +            alloc_monitor_pagetable(ed);
   5.717 +        }
   5.718 +    }
   5.719 +
   5.720 +    if ( !d->arch.shadow_ht )
   5.721      {
   5.722          d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
   5.723          if ( d->arch.shadow_ht == NULL )
   5.724 @@ -186,7 +652,7 @@ int __shadow_mode_enable(struct domain *
   5.725             shadow_ht_buckets * sizeof(struct shadow_status));
   5.726      }
   5.727  
   5.728 -    if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap)
   5.729 +    if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap )
   5.730      {
   5.731          d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
   5.732          d->arch.shadow_dirty_bitmap = 
   5.733 @@ -201,6 +667,63 @@ int __shadow_mode_enable(struct domain *
   5.734                 d->arch.shadow_dirty_bitmap_size/8);
   5.735      }
   5.736  
   5.737 +    printk("audit1\n");
   5.738 +    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__);
   5.739 +    printk("audit1 done\n");
   5.740 +
   5.741 +    // Get rid of any shadow pages from any previous shadow mode.
   5.742 +    //
   5.743 +    free_shadow_pages(d);
   5.744 +
   5.745 +    printk("audit2\n");
   5.746 +    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__);
   5.747 +    printk("audit2 done\n");
   5.748 +
   5.749 +    // Turn off writable page tables.
   5.750 +    // It doesn't mix with shadow mode.
   5.751 +    //
   5.752 +    vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
   5.753 +
   5.754 +    /*
    5.755 +     * Tear down its counts by disassembling its page-table-based ref counts.
   5.756 +     * Also remove CR3's gcount/tcount.
    5.757 +     * That leaves things like GDTs, LDTs, and external refs intact.
   5.758 +     *
   5.759 +     * Most pages will be writable tcount=0.
   5.760 +     * Some will still be L1 tcount=0 or L2 tcount=0.
   5.761 +     * Maybe some pages will be type none tcount=0.
   5.762 +     * Pages granted external writable refs (via grant tables?) will
   5.763 +     * still have a non-zero tcount.  That's OK.
   5.764 +     *
   5.765 +     * gcounts will generally be 1 for PGC_allocated.
   5.766 +     * GDTs and LDTs will have additional gcounts.
   5.767 +     * Any grant-table based refs will still be in the gcount.
   5.768 +     *
   5.769 +     * We attempt to grab writable refs to each page (thus setting its type).
   5.770 +     * Immediately put back those type refs.
   5.771 +     *
   5.772 +     * Assert that no pages are left with L1/L2/L3/L4 type.
   5.773 +     */
   5.774 +    audit_adjust_pgtables(d, -1, 1);
   5.775 +    d->arch.shadow_mode = mode;
   5.776 +
   5.777 +    struct list_head *list_ent = d->page_list.next;
   5.778 +    while ( list_ent != &d->page_list )
   5.779 +    {
   5.780 +        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
   5.781 +        if ( !get_page_type(page, PGT_writable_page) )
   5.782 +            BUG();
   5.783 +        put_page_type(page);
   5.784 +
   5.785 +        list_ent = page->list.next;
   5.786 +    }
   5.787 +
   5.788 +    audit_adjust_pgtables(d, 1, 1);
   5.789 +
   5.790 +    printk("audit3\n");
   5.791 +    _audit_domain(d, AUDIT_ALREADY_LOCKED, __FILE__, __LINE__);
   5.792 +    printk("audit3 done\n");
   5.793 +
   5.794      return 0;
   5.795  
   5.796   nomem:
   5.797 @@ -219,13 +742,10 @@ int shadow_mode_enable(struct domain *d,
   5.798      return rc;
   5.799  }
   5.800  
   5.801 -void __shadow_mode_disable(struct domain *d)
   5.802 +static void free_shadow_ht_entries(struct domain *d)
   5.803  {
   5.804      struct shadow_status *x, *n;
   5.805  
   5.806 -    free_shadow_state(d);
   5.807 -    d->arch.shadow_mode = 0;
   5.808 -
   5.809      SH_VLOG("freed tables count=%d l1=%d l2=%d",
   5.810              d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
   5.811              perfc_value(shadow_l2_pages));
   5.812 @@ -239,6 +759,8 @@ void __shadow_mode_disable(struct domain
   5.813      }
   5.814  
   5.815      d->arch.shadow_ht_extras = NULL;
   5.816 +    d->arch.shadow_ht_free = NULL;
   5.817 +
   5.818      ASSERT(d->arch.shadow_extras_count == 0);
   5.819      SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
   5.820  
   5.821 @@ -253,6 +775,45 @@ void __shadow_mode_disable(struct domain
   5.822      d->arch.shadow_ht = NULL;
   5.823  }
   5.824  
   5.825 +static void free_out_of_sync_entries(struct domain *d)
   5.826 +{
   5.827 +    struct out_of_sync_entry *x, *n;
   5.828 +
   5.829 +    n = d->arch.out_of_sync_extras;
   5.830 +    while ( (x = n) != NULL )
   5.831 +    {
   5.832 +        d->arch.out_of_sync_extras_count--;
   5.833 +        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
   5.834 +        xfree(x);
   5.835 +    }
   5.836 +
   5.837 +    d->arch.out_of_sync_extras = NULL;
   5.838 +    d->arch.out_of_sync_free = NULL;
   5.839 +    d->arch.out_of_sync = NULL;
   5.840 +
   5.841 +    ASSERT(d->arch.out_of_sync_extras_count == 0);
   5.842 +    FSH_LOG("freed extra out_of_sync entries, now %d",
   5.843 +            d->arch.out_of_sync_extras_count);
   5.844 +}
   5.845 +
   5.846 +void __shadow_mode_disable(struct domain *d)
   5.847 +{
   5.848 +    // This needs rethinking for the full shadow mode stuff.
   5.849 +    //
   5.850 +    // Among other things, ref counts need to be restored to a sensible
   5.851 +    // state for a non-shadow-mode guest...
   5.852 +    // This is probably easiest to do by stealing code from audit_domain().
   5.853 +    //
   5.854 +    BUG();
   5.855 +
   5.856 +    free_shadow_pages(d);
   5.857 +    
   5.858 +    d->arch.shadow_mode = 0;
   5.859 +
   5.860 +    free_shadow_ht_entries(d);
   5.861 +    free_out_of_sync_entries(d);
   5.862 +}
   5.863 +
   5.864  static int shadow_mode_table_op(
   5.865      struct domain *d, dom0_shadow_control_t *sc)
   5.866  {
   5.867 @@ -272,7 +833,7 @@ static int shadow_mode_table_op(
   5.868      switch ( op )
   5.869      {
   5.870      case DOM0_SHADOW_CONTROL_OP_FLUSH:
   5.871 -        free_shadow_state(d);
   5.872 +        free_shadow_pages(d);
   5.873  
   5.874          d->arch.shadow_fault_count       = 0;
   5.875          d->arch.shadow_dirty_count       = 0;
   5.876 @@ -282,7 +843,7 @@ static int shadow_mode_table_op(
   5.877          break;
   5.878     
   5.879      case DOM0_SHADOW_CONTROL_OP_CLEAN:
   5.880 -        clear_shadow_state(d);
   5.881 +        free_shadow_pages(d);
   5.882  
   5.883          sc->stats.fault_count       = d->arch.shadow_fault_count;
   5.884          sc->stats.dirty_count       = d->arch.shadow_dirty_count;
   5.885 @@ -394,13 +955,13 @@ int shadow_mode_control(struct domain *d
   5.886          break;
   5.887  
   5.888      case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
   5.889 -        free_shadow_state(d);
   5.890 +        free_shadow_pages(d);
   5.891          rc = __shadow_mode_enable(d, SHM_enable);
   5.892          break;
   5.893  
   5.894      case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
   5.895 -        free_shadow_state(d);
   5.896 -        rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_log_dirty);
   5.897 +        free_shadow_pages(d);
   5.898 +        rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
   5.899          break;
   5.900  
   5.901      default:
   5.902 @@ -418,87 +979,108 @@ int shadow_mode_control(struct domain *d
   5.903      return rc;
   5.904  }
   5.905  
   5.906 -static inline struct pfn_info *alloc_shadow_page(struct domain *d)
   5.907 -{
   5.908 -    struct pfn_info *page = alloc_domheap_page(NULL);
   5.909 -
   5.910 -    d->arch.shadow_page_count++;
   5.911 -
   5.912 -    if ( unlikely(page == NULL) )
   5.913 -    {
   5.914 -        printk("Couldn't alloc shadow page! count=%d\n",
   5.915 -               d->arch.shadow_page_count);
   5.916 -        SH_VLOG("Shadow tables l1=%d l2=%d",
   5.917 -                perfc_value(shadow_l1_pages), 
   5.918 -                perfc_value(shadow_l2_pages));
   5.919 -        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
   5.920 -    }
   5.921 -
   5.922 -    return page;
   5.923 -}
   5.924 -
   5.925 -void unshadow_table(unsigned long gpfn, unsigned int type)
   5.926 -{
   5.927 -    unsigned long  smfn;
   5.928 -    struct domain *d = page_get_owner(&frame_table[gpfn]);
   5.929 -
   5.930 -    SH_VLOG("unshadow_table type=%08x gpfn=%p", type, gpfn);
   5.931 -
   5.932 -    perfc_incrc(unshadow_table_count);
   5.933 -
   5.934 -    /*
   5.935 -     * This function is the same for all p.t. pages. Even for multi-processor 
   5.936 -     * guests there won't be a race here as this CPU was the one that 
   5.937 -     * cmpxchg'ed the page to invalid.
   5.938 -     */
   5.939 -    smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
   5.940 -    delete_shadow_status(d, gpfn);
   5.941 -    free_shadow_page(d, &frame_table[smfn]);
   5.942 -}
   5.943 -
   5.944  /*
   5.945 - * XXX KAF:
   5.946 - *  1. Why is this VMX specific?
   5.947 - *  2. Why is VMX using clear_state() rather than free_state()?
   5.948 - *     (could we get rid of clear_state and fold into free_state?)
   5.949 + * XXX KAF: Why is this VMX specific?
   5.950   */
   5.951  void vmx_shadow_clear_state(struct domain *d)
   5.952  {
   5.953      SH_VVLOG("vmx_clear_shadow_state:");
   5.954      shadow_lock(d);
   5.955 -    clear_shadow_state(d);
   5.956 +    free_shadow_pages(d);
   5.957      shadow_unlock(d);
   5.958  }
   5.959  
   5.960 -unsigned long shadow_l2_table( 
   5.961 -    struct domain *d, unsigned long gmfn)
   5.962 +static unsigned long
   5.963 +shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
   5.964 +                unsigned long smfn)
   5.965  {
   5.966 -    struct pfn_info *spfn_info;
   5.967 -    unsigned long    spfn;
   5.968 -    unsigned long    gpfn;
   5.969 +    unsigned long hl2mfn;
   5.970 +    l1_pgentry_t *hl2;
   5.971 +    l2_pgentry_t *gl2;
   5.972 +    int i, limit;
   5.973 +
   5.974 +    ASSERT(PGT_base_page_table == PGT_l2_page_table);
   5.975 +
   5.976 +    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
   5.977 +    {
   5.978 +        printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
   5.979 +        BUG(); /* XXX Deal gracefully with failure. */
   5.980 +    }
   5.981 +
   5.982 +    perfc_incrc(shadow_hl2_table_count);
   5.983 +
   5.984 +    ASSERT( pagetable_val(current->arch.guest_table) == (gmfn << PAGE_SHIFT) );
   5.985 +    gl2 = current->arch.guest_vtable;
   5.986 +
   5.987 +    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
   5.988 +
   5.989 +    if ( shadow_mode_external(d) )
   5.990 +        limit = L2_PAGETABLE_ENTRIES;
   5.991 +    else
   5.992 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   5.993 +
   5.994 +    for ( i = 0; i < limit; i++ )
   5.995 +    {
   5.996 +        unsigned long gl2e = l2_pgentry_val(gl2[i]);
   5.997 +        unsigned long mfn;
   5.998  
   5.999 -    gpfn = __mfn_to_gpfn(d, gmfn);
  5.1000 +        if ( gl2e & _PAGE_PRESENT )
  5.1001 +        {
  5.1002 +            mfn = __gpfn_to_mfn(d, gl2e >> PAGE_SHIFT);
  5.1003 +            hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1004 +            get_page(pfn_to_page(mfn), d);
  5.1005 +        }
  5.1006 +        else
  5.1007 +            hl2[i] = mk_l1_pgentry(0);
  5.1008 +    }
  5.1009 +
  5.1010 +    if ( !shadow_mode_external(d) )
  5.1011 +    {
  5.1012 +        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
  5.1013 +               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  5.1014  
  5.1015 -    SH_VVLOG("shadow_l2_table( %p )", gmfn);
  5.1016 +        // Setup easy access to the GL2, SL2, and HL2 frames.
  5.1017 +        //
  5.1018 +        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
  5.1019 +            mk_l1_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1020 +        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  5.1021 +            mk_l1_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1022 +        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
  5.1023 +            mk_l1_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1024 +    }
  5.1025 +
  5.1026 +    unmap_domain_mem(hl2);
  5.1027 +
  5.1028 +    return hl2mfn;
  5.1029 +}
  5.1030 +
  5.1031 +/*
  5.1032 + * This could take and use a snapshot, and validate the entire page at
  5.1033 + * once, or it could continue to fault in entries one at a time...
  5.1034 + * Might be worth investigating...
  5.1035 + */
  5.1036 +static unsigned long shadow_l2_table(
  5.1037 +    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  5.1038 +{
  5.1039 +    unsigned long smfn;
  5.1040 +    l2_pgentry_t *spl2e;
  5.1041 +
  5.1042 +    SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
  5.1043  
  5.1044      perfc_incrc(shadow_l2_table_count);
  5.1045  
  5.1046 -    if ( (spfn_info = alloc_shadow_page(d)) == NULL )
  5.1047 +    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
  5.1048 +    {
  5.1049 +        printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
  5.1050          BUG(); /* XXX Deal gracefully with failure. */
  5.1051 -
  5.1052 -    spfn_info->u.inuse.type_info = PGT_l2_page_table;
  5.1053 -    perfc_incr(shadow_l2_pages);
  5.1054 +    }
  5.1055  
  5.1056 -    spfn = page_to_pfn(spfn_info);
  5.1057 -  /* Mark pfn as being shadowed; update field to point at shadow. */
  5.1058 -    set_shadow_status(d, gpfn, spfn | PSH_shadowed);
  5.1059 - 
  5.1060 -#ifdef __i386__
  5.1061 +    spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  5.1062 +
  5.1063      /* Install hypervisor and 2x linear p.t. mapings. */
  5.1064 -    if ( !shadow_mode_translate(d) )
  5.1065 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
  5.1066 +         !shadow_mode_external(d) )
  5.1067      {
  5.1068 -        l2_pgentry_t *spl2e;
  5.1069 -        spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
  5.1070          /*
  5.1071           * We could proactively fill in PDEs for pages that are already
  5.1072           * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
  5.1073 @@ -511,156 +1093,714 @@ unsigned long shadow_l2_table(
  5.1074          memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  5.1075                 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  5.1076                 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  5.1077 -        spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  5.1078 -            mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1079 -        spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  5.1080 -            mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1081 -        spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  5.1082 +
  5.1083 +        if ( shadow_mode_translate(d) ) // NB: not external
  5.1084 +        {
  5.1085 +            unsigned long hl2mfn;
   5.1086 +            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  5.1087 +                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  5.1088 +
  5.1089 +            // shadow_mode_translate (but not external) sl2 tables hold a
  5.1090 +            // ref to their hl2.
  5.1091 +            //
  5.1092 +            get_shadow_ref(hl2mfn);
  5.1093 +            
  5.1094 +            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  5.1095 +                mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1096 +        }
  5.1097 +        else
  5.1098 +            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  5.1099 +                mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1100 +
  5.1101 +        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  5.1102 +            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.1103 +
  5.1104 +        spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
  5.1105              mk_l2_pgentry(__pa(page_get_owner(
  5.1106                  &frame_table[gmfn])->arch.mm_perdomain_pt) |
  5.1107                            __PAGE_HYPERVISOR);
  5.1108 -
  5.1109 -        unmap_domain_mem(spl2e);
  5.1110 +    }
  5.1111 +    else
  5.1112 +    {
  5.1113 +        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));        
  5.1114      }
  5.1115 -#endif
  5.1116 +
  5.1117 +    unmap_domain_mem(spl2e);
  5.1118  
  5.1119 -    SH_VLOG("shadow_l2_table( %p -> %p)", gmfn, spfn);
  5.1120 -    return spfn;
  5.1121 +    SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
  5.1122 +    return smfn;
  5.1123  }
  5.1124  
  5.1125 -static void shadow_map_l1_into_current_l2(unsigned long va)
  5.1126 +void shadow_map_l1_into_current_l2(unsigned long va)
  5.1127  { 
  5.1128      struct exec_domain *ed = current;
  5.1129      struct domain *d = ed->domain;
  5.1130 -    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, sl1mfn, sl1ss;
  5.1131 -    struct pfn_info  *sl1mfn_info;
  5.1132 -    int               i;
  5.1133 +    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, gl1mfn, sl1mfn;
  5.1134 +    int i, init_table = 0;
  5.1135  
  5.1136      __guest_get_l2e(ed, va, &gl2e);
  5.1137 -
  5.1138 +    ASSERT(gl2e & _PAGE_PRESENT);
  5.1139      gl1pfn = gl2e >> PAGE_SHIFT;
  5.1140  
  5.1141 -    sl1ss = __shadow_status(d, gl1pfn);
  5.1142 -    if ( !(sl1ss & PSH_shadowed) )
  5.1143 +    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
  5.1144      {
  5.1145          /* This L1 is NOT already shadowed so we need to shadow it. */
  5.1146 -        SH_VVLOG("4a: l1 not shadowed ( %p )", sl1ss);
  5.1147 +        SH_VVLOG("4a: l1 not shadowed");
  5.1148  
  5.1149 -        sl1mfn_info = alloc_shadow_page(d);
  5.1150 -        sl1mfn_info->u.inuse.type_info = PGT_l1_page_table;
  5.1151 -   
  5.1152 -        sl1mfn = sl1mfn_info - frame_table;
  5.1153 +        gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  5.1154 +        if ( unlikely(!gl1mfn) )
  5.1155 +        {
  5.1156 +            // Attempt to use an invalid pfn as an L1 page.
  5.1157 +            // XXX this needs to be more graceful!
  5.1158 +            BUG();
  5.1159 +        }
  5.1160 +
  5.1161 +        if ( unlikely(!(sl1mfn =
  5.1162 +                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
  5.1163 +        {
  5.1164 +            printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
  5.1165 +                   gl1pfn, gl1mfn);
  5.1166 +            BUG(); /* XXX Need to deal gracefully with failure. */
  5.1167 +        }
  5.1168  
  5.1169          perfc_incrc(shadow_l1_table_count);
  5.1170 -        perfc_incr(shadow_l1_pages);
  5.1171 -
  5.1172 -        set_shadow_status(d, gl1pfn, PSH_shadowed | sl1mfn);
  5.1173 -
  5.1174 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  5.1175 -
  5.1176 -        __guest_set_l2e(ed, va, gl2e);
  5.1177 -        __shadow_set_l2e(ed, va, sl2e);
  5.1178 -
  5.1179 -        gpl1e = (unsigned long *) &(linear_pg_table[
  5.1180 -            (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]);
  5.1181 -
  5.1182 -        spl1e = (unsigned long *) &(shadow_linear_pg_table[
  5.1183 -            (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]);
  5.1184 -
  5.1185 -        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.1186 -            l1pte_propagate_from_guest(d, &gpl1e[i], &spl1e[i]);
  5.1187 +        init_table = 1;
  5.1188      }
  5.1189      else
  5.1190      {
  5.1191          /* This L1 is shadowed already, but the L2 entry is missing. */
  5.1192 -        SH_VVLOG("4b: was shadowed, l2 missing ( %p )", sl1ss);
  5.1193 +        SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
  5.1194 +    }
  5.1195 +
  5.1196 +#ifndef NDEBUG
  5.1197 +    unsigned long old_sl2e;
  5.1198 +    __shadow_get_l2e(ed, va, &old_sl2e);
  5.1199 +    ASSERT( !(old_sl2e & _PAGE_PRESENT) );
  5.1200 +#endif
  5.1201 +
  5.1202 +    get_shadow_ref(sl1mfn);
  5.1203 +    l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  5.1204 +    __guest_set_l2e(ed, va, gl2e);
  5.1205 +    __shadow_set_l2e(ed, va, sl2e);
  5.1206  
  5.1207 -        sl1mfn = sl1ss & PSH_pfn_mask;
  5.1208 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  5.1209 -        __guest_set_l2e(ed, va, gl2e);
  5.1210 -        __shadow_set_l2e(ed, va, sl2e);
  5.1211 -    }              
  5.1212 +    if ( init_table )
  5.1213 +    {
  5.1214 +        gpl1e = (unsigned long *)
  5.1215 +            &(linear_pg_table[l1_linear_offset(va) &
  5.1216 +                              ~(L1_PAGETABLE_ENTRIES-1)]);
  5.1217 +
  5.1218 +        spl1e = (unsigned long *)
  5.1219 +            &(shadow_linear_pg_table[l1_linear_offset(va) &
  5.1220 +                                     ~(L1_PAGETABLE_ENTRIES-1)]);
  5.1221 +
  5.1222 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.1223 +        {
  5.1224 +            l1pte_propagate_from_guest(d, gpl1e[i], &spl1e[i]);
  5.1225 +            if ( spl1e[i] & _PAGE_PRESENT )
  5.1226 +                get_page_from_l1e(mk_l1_pgentry(spl1e[i]), d);
  5.1227 +        }
  5.1228 +    }
  5.1229  }
  5.1230  
  5.1231  void shadow_invlpg(struct exec_domain *ed, unsigned long va)
  5.1232  {
  5.1233 +    struct domain *d = ed->domain;
  5.1234      unsigned long gpte, spte;
  5.1235  
  5.1236 -    ASSERT(shadow_mode_enabled(ed->domain));
  5.1237 +    ASSERT(shadow_mode_enabled(d));
  5.1238 +
  5.1239 +    shadow_lock(d);
  5.1240 +
  5.1241 +    __shadow_sync_va(ed, va);
  5.1242  
  5.1243 -    /*
  5.1244 -     * XXX KAF: Why is this set-to-zero required?
  5.1245 -     *          Why, on failure, must we bin all our shadow state?
  5.1246 -     */
  5.1247 -    if (__put_user(0L, (unsigned long *)
  5.1248 -                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
  5.1249 -        vmx_shadow_clear_state(ed->domain);
  5.1250 +    // XXX mafetter: will need to think about 4MB pages...
  5.1251 +
  5.1252 +    // It's not strictly necessary to update the shadow here,
  5.1253 +    // but it might save a fault later.
  5.1254 +    //
  5.1255 +    if (__get_user(gpte, (unsigned long *)
  5.1256 +                   &linear_pg_table[va >> PAGE_SHIFT])) {
  5.1257 +        perfc_incrc(shadow_invlpg_faults);
  5.1258          return;
  5.1259      }
  5.1260 +    l1pte_propagate_from_guest(d, gpte, &spte);
  5.1261 +    shadow_set_l1e(va, spte, 1);
  5.1262  
  5.1263 -    if (__get_user(gpte, (unsigned long *)
  5.1264 -                   &linear_pg_table[va >> PAGE_SHIFT])) {
  5.1265 +    shadow_unlock(d);
  5.1266 +}
  5.1267 +
  5.1268 +struct out_of_sync_entry *
  5.1269 +shadow_alloc_oos_entry(struct domain *d)
  5.1270 +{
  5.1271 +    struct out_of_sync_entry *f, *extra;
  5.1272 +    unsigned size, i;
  5.1273 +
  5.1274 +    if ( unlikely(d->arch.out_of_sync_free == NULL) )
  5.1275 +    {
  5.1276 +        FSH_LOG("Allocate more fullshadow tuple blocks.");
  5.1277 +
  5.1278 +        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
  5.1279 +        extra = xmalloc_bytes(size);
  5.1280 +
  5.1281 +        /* XXX Should be more graceful here. */
  5.1282 +        if ( extra == NULL )
  5.1283 +            BUG();
  5.1284 +
  5.1285 +        memset(extra, 0, size);
  5.1286 +
  5.1287 +        /* Record the allocation block so it can be correctly freed later. */
  5.1288 +        d->arch.out_of_sync_extras_count++;
  5.1289 +        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = 
  5.1290 +            d->arch.out_of_sync_extras;
  5.1291 +        d->arch.out_of_sync_extras = &extra[0];
  5.1292 +
  5.1293 +        /* Thread a free chain through the newly-allocated nodes. */
  5.1294 +        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
  5.1295 +            extra[i].next = &extra[i+1];
  5.1296 +        extra[i].next = NULL;
  5.1297 +
  5.1298 +        /* Add the new nodes to the free list. */
  5.1299 +        d->arch.out_of_sync_free = &extra[0];
  5.1300 +    }
  5.1301 +
  5.1302 +    /* Allocate a new node from the quicklist. */
  5.1303 +    f = d->arch.out_of_sync_free;
  5.1304 +    d->arch.out_of_sync_free = f->next;
  5.1305 +
  5.1306 +    return f;
  5.1307 +}
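
shadow_alloc_oos_entry() carves entries out of larger xmalloc'd blocks and
maintains two chains: a per-entry free list (out_of_sync_free) and a list of
the blocks themselves (out_of_sync_extras), whose link pointer sits just past
the last entry of each block.  A hedged sketch of the same block layout using
hypothetical names (NENT, struct ent, alloc_block):

    #define NENT 8   /* stands in for out_of_sync_extra_size */

    struct ent { struct ent *next; };

    static struct ent *alloc_block(struct ent **block_chain)
    {
        unsigned int size = sizeof(struct ent *) + NENT * sizeof(struct ent);
        struct ent *blk = xmalloc_bytes(size);
        int i;

        if ( blk == NULL )
            return NULL;
        memset(blk, 0, size);

        /* Record the block on the chain of allocations for later freeing. */
        *((struct ent **)&blk[NENT]) = *block_chain;
        *block_chain = blk;

        /* Thread a free chain through the new entries. */
        for ( i = 0; i < NENT - 1; i++ )
            blk[i].next = &blk[i+1];
        blk[NENT-1].next = NULL;

        return blk;
    }

free_out_of_sync_entries() earlier in this file walks the block chain the same
way, reading the pointer stored at &extra[out_of_sync_extra_size] before
xfree()ing each block.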
  5.1308 +
  5.1309 +static unsigned long
  5.1310 +shadow_make_snapshot(
  5.1311 +    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  5.1312 +{
  5.1313 +    unsigned long smfn;
  5.1314 +    void *original, *snapshot;
  5.1315 +
  5.1316 +    if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
  5.1317 +    {
  5.1318 +        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
  5.1319 +        return SHADOW_SNAPSHOT_ELSEWHERE;
  5.1320 +    }
  5.1321 +
  5.1322 +    perfc_incrc(shadow_make_snapshot);
  5.1323 +
  5.1324 +    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
  5.1325 +    {
  5.1326 +        printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
  5.1327 +               "Dom%d snapshot_page_count=%d\n",
  5.1328 +               gpfn, gmfn, d->id, d->arch.snapshot_page_count);
  5.1329 +        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
  5.1330 +    }
  5.1331 +
  5.1332 +    get_shadow_ref(smfn);
  5.1333 +
  5.1334 +    original = map_domain_mem(gmfn << PAGE_SHIFT);
  5.1335 +    snapshot = map_domain_mem(smfn << PAGE_SHIFT);
  5.1336 +    memcpy(snapshot, original, PAGE_SIZE);
  5.1337 +    unmap_domain_mem(original);
  5.1338 +    unmap_domain_mem(snapshot);
  5.1339 +
  5.1340 +    return smfn;
  5.1341 +}
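
shadow_make_snapshot() relies on test_and_set_bit(_PGC_out_of_sync, ...) so that only the first caller for a given frame pays for the page copy; later callers see the bit already set and get SHADOW_SNAPSHOT_ELSEWHERE back, meaning the snapshot lives in (and will be freed by) some other entry. A simplified sketch of that take-once idiom, with a plain flag word standing in for the atomic bit operation:

    static int first_to_mark(unsigned long *count_info)
    {
        int was_set = (*count_info & 1UL) != 0;  /* _PGC_out_of_sync stand-in */
        *count_info |= 1UL;                      /* atomic test_and_set_bit() in the real code */
        return !was_set;                         /* only the first caller copies the page */
    }
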
  5.1342 +
  5.1343 +static void
  5.1344 +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
  5.1345 +{
  5.1346 +    void *snapshot;
  5.1347 +
  5.1348 +    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
  5.1349          return;
  5.1350 +
  5.1351 +    // Clear the out_of_sync bit.
  5.1352 +    //
  5.1353 +    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
  5.1354 +
  5.1355 +    // XXX Need to think about how to protect the domain's
  5.1356 +    // information less expensively.
  5.1357 +    //
  5.1358 +    snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
  5.1359 +    memset(snapshot, 0, PAGE_SIZE);
  5.1360 +    unmap_domain_mem(snapshot);
  5.1361 +
  5.1362 +    put_shadow_ref(entry->snapshot_mfn);
  5.1363 +}
  5.1364 +
  5.1365 +struct out_of_sync_entry *
  5.1366 +shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
  5.1367 +                             unsigned long mfn)
  5.1368 +{
  5.1369 +    struct domain *d = ed->domain;
  5.1370 +    struct pfn_info *page = &frame_table[mfn];
  5.1371 +    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
  5.1372 +
  5.1373 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  5.1374 +    ASSERT(pfn_is_ram(mfn));
  5.1375 +    //ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
  5.1376 +    if (!((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page))
  5.1377 +    {
  5.1378 +        printk("assertion failed: gpfn=%p gmfn=%p t=%p\n",
  5.1379 +               gpfn, mfn, page->u.inuse.type_info);
  5.1380 +        BUG();
  5.1381      }
  5.1382  
  5.1383 -    l1pte_propagate_from_guest(ed->domain, &gpte, &spte);
  5.1384 +    FSH_LOG("mark_mfn_out_of_sync(gpfn=%p, mfn=%p) c=%p t=%p",
  5.1385 +            gpfn, mfn, page->count_info, page->u.inuse.type_info);
  5.1386 +
  5.1387 +    // XXX this will require some more thought...  Cross-domain sharing and
  5.1388 +    //     modification of page tables?  Hmm...
  5.1389 +    //
  5.1390 +    if ( d != page_get_owner(page) )
  5.1391 +        BUG();
  5.1392 +
  5.1393 +    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
  5.1394 +
  5.1395 +    entry->gpfn = gpfn;
  5.1396 +    entry->gmfn = mfn;
  5.1397 +    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
  5.1398 +    entry->writable_pl1e = -1;
  5.1399 +
  5.1400 +    // increment guest's ref count to represent the entry in the
  5.1401 +    // full shadow out-of-sync list.
  5.1402 +    //
  5.1403 +    get_page(page, d);
  5.1404 +
  5.1405 +    // Add to the out-of-sync list
  5.1406 +    //
  5.1407 +    entry->next = d->arch.out_of_sync;
  5.1408 +    d->arch.out_of_sync = entry;
  5.1409 +
  5.1410 +    return entry;
  5.1411 +}
  5.1412 +
  5.1413 +void shadow_mark_out_of_sync(
  5.1414 +    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
  5.1415 +{
  5.1416 +    struct out_of_sync_entry *entry =
  5.1417 +        shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
  5.1418 +    unsigned long sl2e;
  5.1419 +
  5.1420 +    // We need the address of the shadow PTE that maps @va.
  5.1421 +    // It might not exist yet.  Make sure it's there.
  5.1422 +    //
  5.1423 +    __shadow_get_l2e(ed, va, &sl2e);
  5.1424 +    if ( !(sl2e & _PAGE_PRESENT) )
  5.1425 +    {
  5.1426 +        // either this L1 isn't shadowed yet, or the shadow isn't linked into
  5.1427 +        // the current L2.
  5.1428 +        shadow_map_l1_into_current_l2(va);
  5.1429 +        __shadow_get_l2e(ed, va, &sl2e);
  5.1430 +    }
  5.1431 +    ASSERT(sl2e & _PAGE_PRESENT);
  5.1432 +
  5.1433 +    // NB: this is stored as a machine address.
  5.1434 +    entry->writable_pl1e =
  5.1435 +        ((sl2e & PAGE_MASK) |
  5.1436 +         (sizeof(l1_pgentry_t) * l1_table_offset(va)));
  5.1437 +    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
  5.1438 +
  5.1439 +    // Increment shadow's page count to represent the reference
  5.1440 +    // inherent in entry->writable_pl1e
  5.1441 +    //
  5.1442 +    get_shadow_ref(sl2e >> PAGE_SHIFT);
  5.1443 +
  5.1444 +    FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
  5.1445 +            va, entry->writable_pl1e);
  5.1446 +}
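
Note that entry->writable_pl1e does double duty: shadow_mark_mfn_out_of_sync() initialises it to -1, and only shadow_mark_out_of_sync() overwrites it with the machine address of the shadow PTE, which is always sizeof(l1_pgentry_t)-aligned. __shadow_sync_all() later uses that alignment to skip entries that never received a real PTE address. A short sketch of the test, assuming the 4-byte l1_pgentry_t of 32-bit x86:

    #define PL1E_SIZE 4UL   /* sizeof(l1_pgentry_t), assumed */

    static int writable_pl1e_is_real(unsigned long writable_pl1e)
    {
        /* -1 (and any other value with low bits set) is a sentinel, not a
         * PTE address; genuine shadow PTE addresses are PL1E_SIZE-aligned. */
        return (writable_pl1e & (PL1E_SIZE - 1)) == 0;
    }
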
  5.1447 +
  5.1448 +/*
  5.1449 + * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
  5.1450 + * Returns 0 otherwise.
  5.1451 + */
  5.1452 +static int snapshot_entry_matches(
  5.1453 +    struct exec_domain *ed, unsigned long gmfn, unsigned index)
  5.1454 +{
  5.1455 +    unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
  5.1456 +    unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
  5.1457 +    unsigned long *guest, *snapshot;
  5.1458 +    int compare;
  5.1459 +
  5.1460 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
  5.1461 +
  5.1462 +    perfc_incrc(snapshot_entry_matches_calls);
  5.1463 +
  5.1464 +    if ( !smfn )
  5.1465 +        return 0;
  5.1466 +
  5.1467 +    guest    = map_domain_mem(gmfn << PAGE_SHIFT);
  5.1468 +    snapshot = map_domain_mem(smfn << PAGE_SHIFT);
  5.1469 +
  5.1470 +    // This could probably be smarter, but this is sufficient for
  5.1471 +    // our current needs.
  5.1472 +    //
  5.1473 +    compare = (guest[index] == snapshot[index]);
  5.1474 +
  5.1475 +    unmap_domain_mem(guest);
  5.1476 +    unmap_domain_mem(snapshot);
  5.1477 +
  5.1478 +#ifdef PERF_COUNTERS
  5.1479 +    if ( compare )
  5.1480 +        perfc_incrc(snapshot_entry_matches_true);
  5.1481 +#endif
  5.1482 +
  5.1483 +    return compare;
  5.1484 +}
  5.1485 +
  5.1486 +/*
  5.1487 + * Returns 1 if va's shadow mapping is out-of-sync.
  5.1488 + * Returns 0 otherwise.
  5.1489 + */
  5.1490 +int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
  5.1491 +{
  5.1492 +    struct domain *d = ed->domain;
  5.1493 +    unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
  5.1494 +    unsigned long l2e;
  5.1495 +    unsigned long l1mfn;
  5.1496 +
  5.1497 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  5.1498 +
  5.1499 +    perfc_incrc(shadow_out_of_sync_calls);
  5.1500 +
  5.1501 +    if ( page_out_of_sync(&frame_table[l2mfn]) &&
  5.1502 +         !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
  5.1503 +        return 1;
  5.1504 +
  5.1505 +    __guest_get_l2e(ed, va, &l2e);
  5.1506 +    if ( !(l2e & _PAGE_PRESENT) )
  5.1507 +        return 0;
  5.1508 +
  5.1509 +    l1mfn = __gpfn_to_mfn(d, l2e >> PAGE_SHIFT);
  5.1510 +
  5.1511 +    // If the L1 mfn is invalid, it can't be out of sync...
  5.1512 +    if ( !l1mfn )
  5.1513 +        return 0;
  5.1514 +
  5.1515 +    if ( page_out_of_sync(&frame_table[l1mfn]) &&
  5.1516 +         !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
  5.1517 +        return 1;
  5.1518 +
  5.1519 +    return 0;
  5.1520 +}
  5.1521 +
  5.1522 +static u32 remove_all_write_access_in_ptpage(
  5.1523 +    struct domain *d, unsigned long pt_mfn, unsigned long readonly_mfn)
  5.1524 +{
  5.1525 +    unsigned long *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
  5.1526 +    unsigned long match =
  5.1527 +        (readonly_mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT;
  5.1528 +    unsigned long mask = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT;
  5.1529 +    int i;
  5.1530 +    u32 count = 0;
  5.1531 +    int is_l1_shadow =
  5.1532 +        ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
  5.1533 +         PGT_l1_shadow);
  5.1534 +
  5.1535 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  5.1536 +    {
  5.1537 +        if ( unlikely(((pt[i] ^ match) & mask) == 0) )
  5.1538 +        {
  5.1539 +            unsigned long old = pt[i];
  5.1540 +            unsigned long new = old & ~_PAGE_RW;
  5.1541 +
  5.1542 +            if ( is_l1_shadow )
  5.1543 +                get_page_from_l1e(mk_l1_pgentry(new), d);
  5.1544 +
  5.1545 +            count++;
  5.1546 +            pt[i] = new;
  5.1547 +
  5.1548 +            if ( is_l1_shadow )
  5.1549 +                put_page_from_l1e(mk_l1_pgentry(old), d);
  5.1550 +
  5.1551 +            FSH_LOG("removed write access to mfn=%p in smfn=%p entry %x "
  5.1552 +                    "is_l1_shadow=%d\n",
  5.1553 +                    readonly_mfn, pt_mfn, i, is_l1_shadow);
  5.1554 +        }
  5.1555 +    }
  5.1556 +
  5.1557 +    unmap_domain_mem(pt);
  5.1558 +
  5.1559 +    return count;
  5.1560 +}
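
The loop above uses a single xor-and-mask comparison to decide whether a PTE is a present, writable mapping of readonly_mfn, ignoring the accessed/dirty and other low bits. A self-contained sketch of that test, assuming the usual 32-bit x86 flag values:

    #define _PAGE_PRESENT 0x001UL
    #define _PAGE_RW      0x002UL
    #define PAGE_SHIFT    12
    #define PAGE_MASK     (~0UL << PAGE_SHIFT)

    static int maps_mfn_writably(unsigned long pte, unsigned long mfn)
    {
        unsigned long match = (mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT;
        unsigned long mask  = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT;

        /* Equal under the mask <=> present + writable + frame == mfn. */
        return ((pte ^ match) & mask) == 0;
    }
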
  5.1561 +
  5.1562 +u32 shadow_remove_all_write_access(
  5.1563 +    struct domain *d, unsigned min_type, unsigned max_type, unsigned long gpfn)
  5.1564 +{
  5.1565 +    int i;
  5.1566 +    struct shadow_status *a;
  5.1567 +    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
  5.1568 +    unsigned long sl1mfn = __shadow_status(d, gpfn, PGT_l1_shadow);
  5.1569 +    u32 count = 0;
  5.1570 +
  5.1571 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  5.1572 +    ASSERT(gmfn);
  5.1573  
  5.1574 -    if (__put_user(spte, (unsigned long *)
  5.1575 -                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
  5.1576 -        return;
  5.1577 +    for (i = 0; i < shadow_ht_buckets; i++)
  5.1578 +    {
  5.1579 +        a = &d->arch.shadow_ht[i];
  5.1580 +        while ( a && a->gpfn_and_flags )
  5.1581 +        {
  5.1582 +            if ( ((a->gpfn_and_flags & PGT_type_mask) >= min_type) &&
  5.1583 +                 ((a->gpfn_and_flags & PGT_type_mask) <= max_type) )
  5.1584 +            {
  5.1585 +                switch ( a->gpfn_and_flags & PGT_type_mask )
  5.1586 +                {
  5.1587 +                case PGT_l1_shadow:
  5.1588 +                    count +=
  5.1589 +                        remove_all_write_access_in_ptpage(d, a->smfn, gmfn);
  5.1590 +                    break;
  5.1591 +                case PGT_l2_shadow:
  5.1592 +                    if ( sl1mfn )
  5.1593 +                        count +=
  5.1594 +                            remove_all_write_access_in_ptpage(d, a->smfn,
  5.1595 +                                                              sl1mfn);
  5.1596 +                    break;
  5.1597 +                case PGT_hl2_shadow:
  5.1598 +                    // nothing to do here...
  5.1599 +                    break;
  5.1600 +                default:
  5.1601 +                    // need to flush this out for 4 level page tables.
  5.1602 +                    BUG();
  5.1603 +                }
  5.1604 +            }
  5.1605 +            a = a->next;
  5.1606 +        }
  5.1607 +    }
  5.1608 +
  5.1609 +    return count;
  5.1610 +}
  5.1611 +
  5.1612 +static u32 remove_all_access_in_page(
  5.1613 +    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
  5.1614 +{
  5.1615 +    unsigned long *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
  5.1616 +    unsigned long match = (forbidden_gmfn << PAGE_SHIFT) | _PAGE_PRESENT;
  5.1617 +    unsigned long mask  = PAGE_MASK | _PAGE_PRESENT;
  5.1618 +    int i;
  5.1619 +    u32 count = 0;
  5.1620 +    int is_l1_shadow =
  5.1621 +        ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
  5.1622 +         PGT_l1_shadow);
  5.1623 +
  5.1624 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  5.1625 +    {
  5.1626 +        if ( unlikely(((pl1e[i] ^ match) & mask) == 0) )
  5.1627 +        {
  5.1628 +            unsigned long ol2e = pl1e[i];
  5.1629 +            pl1e[i] = 0;
  5.1630 +            count++;
  5.1631 +
  5.1632 +            if ( is_l1_shadow )
  5.1633 +                put_page_from_l1e(mk_l1_pgentry(ol2e), d);
  5.1634 +            else /* must be an hl2 page */
  5.1635 +                put_page(&frame_table[forbidden_gmfn]);
  5.1636 +        }
  5.1637 +    }
  5.1638 +
  5.1639 +    unmap_domain_mem(pl1e);
  5.1640 +
  5.1641 +    return count;
  5.1642 +}
  5.1643 +
  5.1644 +u32 shadow_remove_all_access(struct domain *d, unsigned long gmfn)
  5.1645 +{
  5.1646 +    int i;
  5.1647 +    struct shadow_status *a;
  5.1648 +    u32 count = 0;
  5.1649 +
  5.1650 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  5.1651 +
  5.1652 +    for (i = 0; i < shadow_ht_buckets; i++)
  5.1653 +    {
  5.1654 +        a = &d->arch.shadow_ht[i];
  5.1655 +        while ( a && a->gpfn_and_flags )
  5.1656 +        {
  5.1657 +            if ( ((a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow) ||
  5.1658 +                 ((a->gpfn_and_flags & PGT_type_mask) == PGT_hl2_shadow) )
  5.1659 +            {
  5.1660 +                count += remove_all_access_in_page(d, a->smfn, gmfn);
  5.1661 +            }
  5.1662 +            a = a->next;
  5.1663 +        }
  5.1664      }
  5.1665 +
  5.1666 +    return count;
  5.1667 +}    
  5.1668 +
  5.1669 +static int resync_all(struct domain *d, u32 stype)
  5.1670 +{
  5.1671 +    struct out_of_sync_entry *entry;
  5.1672 +    unsigned i;
  5.1673 +    unsigned long smfn;
  5.1674 +    unsigned long *guest, *shadow, *snapshot;
  5.1675 +    int need_flush = 0, external = shadow_mode_external(d);
  5.1676 +
  5.1677 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  5.1678 +
  5.1679 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
  5.1680 +    {
  5.1681 +        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
  5.1682 +            continue;
  5.1683 +
  5.1684 +        if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
  5.1685 +            continue;
  5.1686 +
  5.1687 +        FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
  5.1688 +                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
  5.1689 +
  5.1690 +        // Compare guest's new contents to its snapshot, validating
  5.1691 +        // and updating its shadow as appropriate.
  5.1692 +        //
  5.1693 +        guest    = map_domain_mem(entry->gmfn         << PAGE_SHIFT);
  5.1694 +        snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
  5.1695 +        shadow   = map_domain_mem(smfn                << PAGE_SHIFT);
  5.1696 +
  5.1697 +        switch ( stype ) {
  5.1698 +        case PGT_l1_shadow:
  5.1699 +            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.1700 +            {
  5.1701 +                unsigned new_pte = guest[i];
  5.1702 +                if ( new_pte != snapshot[i] )
  5.1703 +                {
  5.1704 +                    need_flush |= validate_pte_change(d, new_pte, &shadow[i]);
  5.1705 +
  5.1706 +                    // can't update snapshots of linear page tables -- they
  5.1707 +                    // are used multiple times...
  5.1708 +                    //
  5.1709 +                    // snapshot[i] = new_pte;
  5.1710 +                }
  5.1711 +            }
  5.1712 +            break;
  5.1713 +        case PGT_l2_shadow:
  5.1714 +            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  5.1715 +            {
  5.1716 +                if ( !is_guest_l2_slot(i) && !external )
  5.1717 +                    continue;
  5.1718 +
  5.1719 +                unsigned new_pde = guest[i];
  5.1720 +                if ( new_pde != snapshot[i] )
  5.1721 +                {
  5.1722 +                    need_flush |= validate_pde_change(d, new_pde, &shadow[i]);
  5.1723 +
  5.1724 +                    // can't update snapshots of linear page tables -- they
  5.1725 +                    // are used multiple times...
  5.1726 +                    //
  5.1727 +                    // snapshot[i] = new_pde;
  5.1728 +                }
  5.1729 +            }
  5.1730 +            break;
  5.1731 +        default:
  5.1732 +            BUG();
  5.1733 +            break;
  5.1734 +        }
  5.1735 +
  5.1736 +        unmap_domain_mem(shadow);
  5.1737 +        unmap_domain_mem(snapshot);
  5.1738 +        unmap_domain_mem(guest);
  5.1739 +    }
  5.1740 +
  5.1741 +    return need_flush;
  5.1742 +}
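
resync_all() never rewrites whole shadow tables; it diffs the guest table against its snapshot and revalidates only the entries that changed, accumulating a single need-flush verdict. A simplified sketch of that per-table loop, with validate() standing in for validate_pte_change()/validate_pde_change():

    static int resync_one(unsigned long *guest, unsigned long *snapshot,
                          unsigned long *shadow, int nr,
                          int (*validate)(unsigned long nval, unsigned long *sl1e))
    {
        int i, need_flush = 0;

        for ( i = 0; i < nr; i++ )
            if ( guest[i] != snapshot[i] )
                need_flush |= validate(guest[i], &shadow[i]);

        return need_flush;
    }
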
  5.1743 +
  5.1744 +void __shadow_sync_all(struct domain *d)
  5.1745 +{
  5.1746 +    struct out_of_sync_entry *entry;
  5.1747 +    int need_flush = 0;
  5.1748 +
  5.1749 +    perfc_incrc(shadow_sync_all);
  5.1750 +
  5.1751 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  5.1752 +
  5.1753 +    // First, remove all write permissions to the page tables
  5.1754 +    //
  5.1755 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
  5.1756 +    {
  5.1757 +        // Skip entries that have low bits set...  Those aren't
  5.1758 +        // real PTEs.
  5.1759 +        //
  5.1760 +        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
  5.1761 +            continue;
  5.1762 +
  5.1763 +        unsigned long *ppte = map_domain_mem(entry->writable_pl1e);
  5.1764 +        unsigned long opte = *ppte;
  5.1765 +        unsigned long npte = opte & ~_PAGE_RW;
  5.1766 +
  5.1767 +        get_page_from_l1e(mk_l1_pgentry(npte), d);
  5.1768 +        *ppte = npte;
  5.1769 +        put_page_from_l1e(mk_l1_pgentry(opte), d);
  5.1770 +
  5.1771 +        unmap_domain_mem(ppte);
  5.1772 +    }
  5.1773 +
  5.1774 +    // XXX mafetter: SMP perf bug.
  5.1775 +    //
  5.1776 +    // With the current algorithm, we've gotta flush all the TLBs
  5.1777 +    // before we can safely continue.  I don't think we want to
  5.1778 +    // do it this way, so I think we should consider making
  5.1779 +    // entirely private copies of the shadow for each vcpu, and/or
  5.1780 +    // possibly having a mix of private and shared shadow state
  5.1781 +    // (any path from a PTE that grants write access to an out-of-sync
  5.1782 +    // page table page needs to be vcpu private).
  5.1783 +    //
  5.1784 +    flush_tlb_all();
  5.1785 +
  5.1786 +    // Second, resync all L1 pages, then L2 pages, etc...
  5.1787 +    //
  5.1788 +    need_flush |= resync_all(d, PGT_l1_shadow);
  5.1789 +    if ( shadow_mode_translate(d) )
  5.1790 +        need_flush |= resync_all(d, PGT_hl2_shadow);
  5.1791 +    need_flush |= resync_all(d, PGT_l2_shadow);
  5.1792 +
  5.1793 +    if ( need_flush )
  5.1794 +        local_flush_tlb();
  5.1795 +
  5.1796 +    free_out_of_sync_state(d);
  5.1797  }
  5.1798  
  5.1799  int shadow_fault(unsigned long va, struct xen_regs *regs)
  5.1800  {
  5.1801 -    unsigned long gpte, spte = 0;
  5.1802 +    unsigned long gpte, spte = 0, orig_gpte;
  5.1803      struct exec_domain *ed = current;
  5.1804      struct domain *d = ed->domain;
  5.1805 +    unsigned long gpde;
  5.1806  
  5.1807      SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
  5.1808 -
  5.1809 -    check_pagetable(d, ed->arch.guest_table, "pre-sf");
  5.1810 +    perfc_incrc(shadow_fault_calls);
  5.1811 +    
  5.1812 +    check_pagetable(ed, "pre-sf");
  5.1813  
  5.1814      /*
  5.1815 -     * STEP 1. A fast-reject set of checks with no locking.
  5.1816 +     * Don't let someone else take the guest's table pages out-of-sync.
  5.1817       */
  5.1818 -
  5.1819 -    if ( unlikely(__get_user(gpte, (unsigned long *)
  5.1820 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
  5.1821 -    {
  5.1822 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
  5.1823 -        return 0;
  5.1824 -    }
  5.1825 +    shadow_lock(d);
  5.1826  
  5.1827 -    if ( !(gpte & _PAGE_PRESENT) )
  5.1828 -    {
  5.1829 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
  5.1830 -        return 0;
  5.1831 -    }
  5.1832 -
  5.1833 -    if ( (regs->error_code & 2)  && !(gpte & _PAGE_RW) )
  5.1834 -    {
  5.1835 -        /* Write fault on a read-only mapping. */
  5.1836 -        return 0;
  5.1837 -    }
  5.1838 +    /* XXX - FIX THIS COMMENT!!!
  5.1839 +     * STEP 1. Check to see if this fault might have been caused by an
  5.1840 +     *         out-of-sync table page entry, or if we should pass this
  5.1841 +     *         fault onto the guest.
  5.1842 +     */
  5.1843 +    __shadow_sync_va(ed, va);
  5.1844  
  5.1845      /*
  5.1846 -     * STEP 2. Take the shadow lock and re-check the guest PTE.
  5.1847 +     * STEP 2. Check the guest PTE.
  5.1848       */
  5.1849 -
  5.1850 -    shadow_lock(d);
  5.1851 - 
  5.1852 -    if ( unlikely(__get_user(gpte, (unsigned long *)
  5.1853 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
  5.1854 +    __guest_get_l2e(ed, va, &gpde);
  5.1855 +    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
  5.1856      {
  5.1857 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" );
  5.1858 +        SH_VVLOG("shadow_fault - EXIT: L1 not present" );
  5.1859 +        perfc_incrc(shadow_fault_bail_pde_not_present);
  5.1860          shadow_unlock(d);
  5.1861          return 0;
  5.1862      }
  5.1863  
  5.1864 +    // This can't fault because we hold the shadow lock and we've ensured that
  5.1865 +    // the mapping is in-sync, so the check of the PDE's present bit, above,
  5.1866 +    // covers this access.
  5.1867 +    //
  5.1868 +    orig_gpte = gpte = l1_pgentry_val(linear_pg_table[l1_linear_offset(va)]);
  5.1869      if ( unlikely(!(gpte & _PAGE_PRESENT)) )
  5.1870      {
  5.1871 -        SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte );
  5.1872 +        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
  5.1873 +        perfc_incrc(shadow_fault_bail_pte_not_present);
  5.1874          shadow_unlock(d);
  5.1875          return 0;
  5.1876      }
  5.1877 @@ -672,11 +1812,12 @@ int shadow_fault(unsigned long va, struc
  5.1878          {
  5.1879              /* Write fault on a read-only mapping. */
  5.1880              SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
  5.1881 +            perfc_incrc(shadow_fault_bail_ro_mapping);
  5.1882              shadow_unlock(d);
  5.1883              return 0;
  5.1884          }
  5.1885  
  5.1886 -        l1pte_write_fault(d, &gpte, &spte);
  5.1887 +        l1pte_write_fault(ed, &gpte, &spte, va);
  5.1888      }
  5.1889      else
  5.1890      {
  5.1891 @@ -689,120 +1830,141 @@ int shadow_fault(unsigned long va, struc
  5.1892  
  5.1893      /* XXX Watch out for read-only L2 entries! (not used in Linux). */
  5.1894      if ( unlikely(__put_user(gpte, (unsigned long *)
  5.1895 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
  5.1896 +                             &linear_pg_table[l1_linear_offset(va)])) )
  5.1897 +    {
  5.1898 +        printk("shadow_fault(): crashing domain %d "
  5.1899 +               "due to a read-only L2 page table (gpde=%p), va=%p\n",
  5.1900 +               d->id, gpde, va);
  5.1901          domain_crash();
  5.1902 -
  5.1903 -    /*
  5.1904 -     * Update of shadow PTE can fail because the L1 p.t. is not shadowed,
  5.1905 -     * or because the shadow isn't linked into this shadow L2 p.t.
  5.1906 -     */
  5.1907 -    if ( unlikely(__put_user(spte, (unsigned long *)
  5.1908 -                             &shadow_linear_pg_table[va >> PAGE_SHIFT])) )
  5.1909 -    {
  5.1910 -        SH_VVLOG("3: not shadowed/mapped gpte=%p spte=%p", gpte, spte);
  5.1911 -        shadow_map_l1_into_current_l2(va);
  5.1912 -        shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte);
  5.1913      }
  5.1914  
  5.1915 -    perfc_incrc(shadow_fixup_count);
  5.1916 +    // if necessary, record the page table page as dirty
  5.1917 +    if ( unlikely(shadow_mode_log_dirty(d)) && (orig_gpte != gpte) )
  5.1918 +        mark_dirty(d, __gpfn_to_mfn(d, gpde >> PAGE_SHIFT));
  5.1919 +
  5.1920 +    shadow_set_l1e(va, spte, 1);
  5.1921 +
  5.1922 +    perfc_incrc(shadow_fault_fixed);
  5.1923      d->arch.shadow_fault_count++;
  5.1924  
  5.1925      shadow_unlock(d);
  5.1926  
  5.1927 -    check_pagetable(d, ed->arch.guest_table, "post-sf");
  5.1928 +    check_pagetable(ed, "post-sf");
  5.1929      return EXCRET_fault_fixed;
  5.1930  }
  5.1931  
  5.1932 -
  5.1933 -void shadow_l1_normal_pt_update(
  5.1934 -    unsigned long pa, unsigned long gpte,
  5.1935 -    unsigned long *prev_smfn_ptr,
  5.1936 -    l1_pgentry_t **prev_spl1e_ptr)
  5.1937 -{
  5.1938 -    unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr;    
  5.1939 -    l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr;
  5.1940 -
  5.1941 -    /* N.B. To get here, we know the l1 page *must* be shadowed. */
  5.1942 -    SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%p, "
  5.1943 -             "prev_smfn=%p, prev_spl1e=%p",
  5.1944 -             pa, gpte, prev_smfn, prev_spl1e);
  5.1945 -
  5.1946 -    smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  5.1947 -
  5.1948 -    if ( smfn == prev_smfn )
  5.1949 -    {
  5.1950 -        spl1e = prev_spl1e;
  5.1951 -    }
  5.1952 -    else
  5.1953 -    {
  5.1954 -        if ( prev_spl1e != NULL )
  5.1955 -            unmap_domain_mem( prev_spl1e );
  5.1956 -        spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  5.1957 -        *prev_smfn_ptr  = smfn;
  5.1958 -        *prev_spl1e_ptr = spl1e;
  5.1959 -    }
  5.1960 -
  5.1961 -    l1pte_propagate_from_guest(current->domain, &gpte, &spte);
  5.1962 -    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte);
  5.1963 -}
  5.1964 -
  5.1965 -void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde)
  5.1966 -{
  5.1967 -    unsigned long sl2mfn, spde = 0;
  5.1968 -    l2_pgentry_t *spl2e;
  5.1969 -    unsigned long sl1mfn;
  5.1970 -
  5.1971 -    /* N.B. To get here, we know the l2 page *must* be shadowed. */
  5.1972 -    SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%p",pa,gpde);
  5.1973 -
  5.1974 -    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  5.1975 -
  5.1976 -    /*
  5.1977 -     * Only propagate to shadow if _PAGE_ACCESSED is set in the guest.
  5.1978 -     * Otherwise, to ensure coherency, we blow away the existing shadow value.
  5.1979 -     */
  5.1980 -    if ( gpde & _PAGE_ACCESSED )
  5.1981 -    {
  5.1982 -        sl1mfn = (gpde & _PAGE_PRESENT) ?
  5.1983 -            __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0;
  5.1984 -        l2pde_general(current->domain, &gpde, &spde, sl1mfn);
  5.1985 -    }
  5.1986 -
  5.1987 -    spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT);
  5.1988 -    spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde);
  5.1989 -    unmap_domain_mem(spl2e);
  5.1990 -}
  5.1991 -
  5.1992 -unsigned long mk_hl2_table(struct exec_domain *ed)
  5.1993 +/*
  5.1994 + * What lives where in the 32-bit address space in the various shadow modes,
  5.1995 + * and what it uses to get/maintain that mapping.
  5.1996 + *
  5.1997 + * SHADOW MODE:      none         enable         translate         external
  5.1998 + * 
  5.1999 + * 4KB things:
  5.2000 + * guest_vtable    lin_l2     mapped per gpdt  lin_l2 via hl2   mapped per gpdt
  5.2001 + * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gpdt
  5.2002 + * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gpdt
  5.2003 + * monitor_vtable    n/a            n/a             n/a           mapped once
  5.2004 + *
  5.2005 + * 4MB things:
  5.2006 + * guest_linear  lin via gpdt   lin via gpdt     lin via hl2      lin via hl2
  5.2007 + * shadow_linear     n/a      sh_lin via spdt  sh_lin via spdt  sh_lin via spdt
  5.2008 + * monitor_linear    n/a            n/a             n/a              ???
  5.2009 + * perdomain      perdomain      perdomain       perdomain        perdomain
  5.2010 + * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
  5.2011 + * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
  5.2012 + * P2M               n/a            n/a           R/O M2P          R/O M2P
  5.2013 + *
  5.2014 + * NB:
  5.2015 + * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
  5.2016 + * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
  5.2017 + * all play a part in maintaining these mappings.
  5.2018 + */
  5.2019 +void __update_pagetables(struct exec_domain *ed)
  5.2020  {
  5.2021      struct domain *d = ed->domain;
  5.2022      unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
  5.2023      unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
  5.2024 -    unsigned long hl2mfn, status;
  5.2025 -    struct pfn_info *hl2_info;
  5.2026 -    l1_pgentry_t *hl2;
  5.2027 +    unsigned long smfn, hl2mfn;
  5.2028 +
  5.2029 +    int max_mode = ( shadow_mode_external(d) ? SHM_external
  5.2030 +                     : shadow_mode_translate(d) ? SHM_translate
  5.2031 +                     : shadow_mode_enabled(d) ? SHM_enable
  5.2032 +                     : 0 );
  5.2033 +
  5.2034 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
  5.2035 +    ASSERT( max_mode );
  5.2036  
  5.2037 -    perfc_incr(hl2_table_pages);
  5.2038 +    /*
  5.2039 +     *  arch.guest_vtable
  5.2040 +     */
  5.2041 +    if ( max_mode & (SHM_enable | SHM_external) )
  5.2042 +    {
  5.2043 +        if ( likely(ed->arch.guest_vtable != NULL) )
  5.2044 +            unmap_domain_mem(ed->arch.guest_vtable);
  5.2045 +        ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
  5.2046 +    }
  5.2047  
  5.2048 -    if ( (hl2_info = alloc_shadow_page(d)) == NULL )
  5.2049 -        BUG(); /* XXX Deal gracefully with failure. */
  5.2050 +    /*
  5.2051 +     *  arch.shadow_table
  5.2052 +     */
  5.2053 +    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
  5.2054 +        smfn = shadow_l2_table(d, gpfn, gmfn);
  5.2055 +    get_shadow_ref(smfn);
  5.2056 +    if ( pagetable_val(ed->arch.shadow_table) )
  5.2057 +        put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
  5.2058 +    ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
  5.2059  
  5.2060 -    hl2_info->u.inuse.type_info = PGT_l1_page_table;
  5.2061 +    SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
  5.2062  
  5.2063 -    hl2mfn = page_to_pfn(hl2_info);
  5.2064 -    status = hl2mfn | PSH_hl2;
  5.2065 -    set_shadow_status(ed->domain, gpfn | PSH_hl2, status);
  5.2066 +    /*
  5.2067 +     * arch.shadow_vtable
  5.2068 +     */
  5.2069 +    if ( max_mode == SHM_external )
  5.2070 +    {
  5.2071 +        if ( ed->arch.shadow_vtable )
  5.2072 +            unmap_domain_mem(ed->arch.shadow_vtable);
  5.2073 +        ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
  5.2074 +    }
  5.2075 +
  5.2076 +    /*
  5.2077 +     * arch.hl2_vtable
  5.2078 +     */
  5.2079 +
  5.2080 +    // if max_mode == SHM_translate, then the hl2 is already installed
  5.2081 +    // correctly in its smfn, and there's nothing to do.
  5.2082 +    //
  5.2083 +    if ( max_mode == SHM_external )
  5.2084 +    {
  5.2085 +        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  5.2086 +            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  5.2087 +        get_shadow_ref(hl2mfn);
  5.2088  
  5.2089 -    // need to optimize this...
  5.2090 -    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
  5.2091 -    memset(hl2, 0, PAGE_SIZE);
  5.2092 -    unmap_domain_mem(hl2);
  5.2093 +        if ( ed->arch.hl2_vtable )
  5.2094 +            unmap_domain_mem(ed->arch.hl2_vtable);
  5.2095 +        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
  5.2096 +    }
  5.2097 +
  5.2098 +    /*
  5.2099 +     * fixup pointers in monitor table, as necessary
  5.2100 +     */
  5.2101 +    if ( max_mode == SHM_external )
  5.2102 +    {
  5.2103 +        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
  5.2104  
  5.2105 -    return status;
  5.2106 +        ASSERT( shadow_mode_translate(d) );
  5.2107 +
  5.2108 +        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  5.2109 +            mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.2110 +
  5.2111 +        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  5.2112 +            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.2113 +
  5.2114 +        // XXX - maybe this can be optimized somewhat??
  5.2115 +        local_flush_tlb();
  5.2116 +    }
  5.2117  }
  5.2118  
  5.2119  
  5.2120 -
  5.2121  /************************************************************************/
  5.2122  /************************************************************************/
  5.2123  /************************************************************************/
  5.2124 @@ -838,12 +2000,13 @@ int shadow_status_noswap;
  5.2125  
  5.2126  static int check_pte(
  5.2127      struct domain *d, unsigned long *pgpte, unsigned long *pspte, 
  5.2128 -    int level, int l2_idx, int l1_idx)
  5.2129 +    int level, int l2_idx, int l1_idx, int oos_ptes)
  5.2130  {
  5.2131      unsigned gpte = *pgpte;
  5.2132      unsigned spte = *pspte;
  5.2133 -    unsigned long mask, gpfn, smfn;
  5.2134 +    unsigned long mask, gpfn, smfn, gmfn;
  5.2135      int errors = 0;
  5.2136 +    int page_table_page;
  5.2137  
  5.2138      if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
  5.2139          return errors;  /* always safe */
  5.2140 @@ -862,21 +2025,36 @@ static int check_pte(
  5.2141      if ( (spte & mask) != (gpte & mask) )
  5.2142          FAIL("Corrupt?");
  5.2143  
  5.2144 -    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
  5.2145 +    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes )
  5.2146          FAIL("Dirty coherence");
  5.2147  
  5.2148 -    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
  5.2149 +    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) && !oos_ptes )
  5.2150          FAIL("Accessed coherence");
  5.2151  
  5.2152 -    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
  5.2153 -        FAIL("RW coherence");
  5.2154 -
  5.2155 -    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) )
  5.2156 -        FAIL("RW2 coherence");
  5.2157 - 
  5.2158      smfn = spte >> PAGE_SHIFT;
  5.2159      gpfn = gpte >> PAGE_SHIFT;
  5.2160 +    gmfn = __gpfn_to_mfn(d, gpfn);
  5.2161  
  5.2162 +    page_table_page = mfn_is_page_table(gmfn);
  5.2163 +
  5.2164 +    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) && !oos_ptes )
  5.2165 +    {
  5.2166 +        printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n",
  5.2167 +               gpfn, gmfn, smfn,
  5.2168 +               frame_table[gmfn].u.inuse.type_info,
  5.2169 +               page_table_page, oos_ptes);
  5.2170 +        FAIL("RW coherence");
  5.2171 +    }
  5.2172 +
  5.2173 +    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) && !oos_ptes )
  5.2174 +    {
  5.2175 +        printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n",
  5.2176 +               gpfn, gmfn, smfn,
  5.2177 +               frame_table[gmfn].u.inuse.type_info,
  5.2178 +               page_table_page, oos_ptes);
  5.2179 +        FAIL("RW2 coherence");
  5.2180 +    }
  5.2181 + 
  5.2182      if ( gpfn == smfn )
  5.2183      {
  5.2184          if ( level > 1 )
  5.2185 @@ -887,23 +2065,26 @@ static int check_pte(
  5.2186          if ( level < 2 )
  5.2187              FAIL("Shadow in L1 entry?");
  5.2188  
  5.2189 -        if ( __shadow_status(d, gpfn) != (PSH_shadowed | smfn) )
  5.2190 -            FAIL("smfn problem g.sf=%p", 
  5.2191 -                 __shadow_status(d, gpfn) );
  5.2192 +        if ( level == 2 )
  5.2193 +        {
  5.2194 +            if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
  5.2195 +                FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
  5.2196 +                     __shadow_status(d, gpfn, PGT_l1_shadow));
  5.2197 +        }
  5.2198 +        else
  5.2199 +            BUG(); // XXX -- not handled yet.
  5.2200      }
  5.2201  
  5.2202      return errors;
  5.2203  }
  5.2204  
  5.2205 -
  5.2206  static int check_l1_table(
  5.2207 -    struct domain *d,
  5.2208 +    struct domain *d, unsigned long gpfn,
  5.2209      unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
  5.2210  {
  5.2211      int i;
  5.2212      unsigned long *gpl1e, *spl1e;
  5.2213 -    int cpu = current->processor;
  5.2214 -    int errors = 0;
  5.2215 +    int errors = 0, oos_ptes = 0;
  5.2216  
  5.2217      // First check to see if this guest page is currently the active
  5.2218      // PTWR page.  If so, then we compare the (old) cached copy of the
  5.2219 @@ -912,6 +2093,8 @@ static int check_l1_table(
  5.2220      //
  5.2221      if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
  5.2222      {
  5.2223 +        int cpu = current->processor;
  5.2224 +
  5.2225          for ( i = 0; i < ARRAY_SIZE(ptwr_info->ptinfo); i++)
  5.2226          {
  5.2227              if ( ptwr_info[cpu].ptinfo[i].l1va &&
  5.2228 @@ -925,11 +2108,18 @@ static int check_l1_table(
  5.2229          }
  5.2230      }
  5.2231  
  5.2232 +    if ( page_out_of_sync(pfn_to_page(gmfn)) )
  5.2233 +    {
  5.2234 +        gmfn = __shadow_status(d, gpfn, PGT_snapshot);
  5.2235 +        oos_ptes = 1;
  5.2236 +        ASSERT(gmfn);
  5.2237 +    }
  5.2238 +
  5.2239      gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
  5.2240      spl1e = map_domain_mem(smfn << PAGE_SHIFT);
  5.2241  
  5.2242      for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.2243 -        errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i);
  5.2244 +        errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
  5.2245   
  5.2246      unmap_domain_mem(spl1e);
  5.2247      unmap_domain_mem(gpl1e);
  5.2248 @@ -944,20 +2134,23 @@ static int check_l1_table(
  5.2249      } while ( 0 )
  5.2250  
  5.2251  int check_l2_table(
  5.2252 -    struct domain *d, unsigned long gpfn, unsigned long smfn)
  5.2253 +    struct domain *d, unsigned long gmfn, unsigned long smfn, int oos_pdes)
  5.2254  {
  5.2255 -    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
  5.2256 -    l2_pgentry_t *gpl2e = (l2_pgentry_t *) map_domain_mem( gmfn << PAGE_SHIFT );
  5.2257 -    l2_pgentry_t *spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
  5.2258 +    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
  5.2259 +    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  5.2260      int i;
  5.2261      int errors = 0;
  5.2262 +    int limit;
  5.2263  
  5.2264 -    if ( page_get_owner(pfn_to_page(gmfn)) != d )
  5.2265 +    if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
  5.2266          FAILPT("domain doesn't own page");
  5.2267 +    if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
  5.2268 +        FAILPT("bogus owner for snapshot page");
  5.2269      if ( page_get_owner(pfn_to_page(smfn)) != NULL )
  5.2270          FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
  5.2271                 smfn, page_get_owner(pfn_to_page(smfn))->id);
  5.2272  
  5.2273 +#if 0
  5.2274      if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  5.2275                  &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  5.2276                  ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
  5.2277 @@ -974,40 +2167,62 @@ int check_l2_table(
  5.2278      if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  5.2279            l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
  5.2280          FAILPT("hypervisor linear map inconsistent");
  5.2281 +#endif
  5.2282  
  5.2283 -    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
  5.2284 +    if ( !shadow_mode_external(d) &&
  5.2285 +         (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
  5.2286                                 L2_PAGETABLE_SHIFT]) != 
  5.2287            ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
  5.2288 +    {
  5.2289          FAILPT("hypervisor shadow linear map inconsistent %p %p",
  5.2290                 l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >>
  5.2291                                      L2_PAGETABLE_SHIFT]),
  5.2292                 (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  5.2293 -
  5.2294 -    if ( !shadow_mode_translate(d) ) {
  5.2295 -        if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  5.2296 -              ((v2m(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt) |
  5.2297 -                __PAGE_HYPERVISOR))) )
  5.2298 -            FAILPT("hypervisor per-domain map inconsistent");
  5.2299      }
  5.2300  
  5.2301 +    if ( !shadow_mode_external(d) &&
  5.2302 +         (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  5.2303 +              ((__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR))) )
  5.2304 +    {
  5.2305 +        FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p",
  5.2306 +               l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
  5.2307 +               d->arch.mm_perdomain_pt,
  5.2308 +               (__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR));
  5.2309 +    }
  5.2310 +
  5.2311 +    if ( shadow_mode_external(d) )
  5.2312 +        limit = L2_PAGETABLE_ENTRIES;
  5.2313 +    else
  5.2314 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  5.2315 +
  5.2316      /* Check the whole L2. */
  5.2317 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  5.2318 -        errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0);
  5.2319 +    for ( i = 0; i < limit; i++ )
  5.2320 +        errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0, 0);
  5.2321  
  5.2322      unmap_domain_mem(spl2e);
  5.2323      unmap_domain_mem(gpl2e);
  5.2324  
  5.2325 +#if 1
  5.2326 +    if ( errors )
  5.2327 +        printk("check_l2_table returning %d errors\n", errors);
  5.2328 +#endif
  5.2329 +
  5.2330      return errors;
  5.2331  }
  5.2332  
  5.2333 -int _check_pagetable(struct domain *d, pagetable_t pt, char *s)
  5.2334 +int _check_pagetable(struct exec_domain *ed, char *s)
  5.2335  {
  5.2336 +    struct domain *d = ed->domain;
  5.2337 +    pagetable_t pt = ed->arch.guest_table;
  5.2338      unsigned long gptbase = pagetable_val(pt);
  5.2339 -    unsigned long ptbase_pfn, smfn, ss;
  5.2340 +    unsigned long ptbase_pfn, smfn;
  5.2341      unsigned long i;
  5.2342      l2_pgentry_t *gpl2e, *spl2e;
  5.2343      unsigned long ptbase_mfn = 0;
  5.2344 -    int errors = 0;
  5.2345 +    int errors = 0, limit, oos_pdes = 0;
  5.2346 +
  5.2347 +    audit_domain(d);
  5.2348 +    shadow_lock(d);
  5.2349  
  5.2350      sh_check_name = s;
  5.2351      SH_VVLOG("%s-PT Audit", s);
  5.2352 @@ -1017,30 +2232,31 @@ int _check_pagetable(struct domain *d, p
  5.2353      ptbase_pfn = gptbase >> PAGE_SHIFT;
  5.2354      ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn);
  5.2355  
  5.2356 -    ss = __shadow_status(d, ptbase_pfn);
  5.2357 -  
  5.2358 -    if ( ! (ss & PSH_shadowed) )
  5.2359 +    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
  5.2360      {
  5.2361          printk("%s-PT %p not shadowed\n", s, gptbase);
  5.2362          errors++;
  5.2363 -
  5.2364 -        if ( ss != 0 )
  5.2365 -            BUG();
  5.2366 -        return errors;
  5.2367 -    }   
  5.2368 +        goto out;
  5.2369 +    }
  5.2370 +    if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
  5.2371 +    {
  5.2372 +        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
  5.2373 +        oos_pdes = 1;
  5.2374 +        ASSERT(ptbase_mfn);
  5.2375 +    }
  5.2376   
  5.2377 -    smfn = ss & PSH_pfn_mask;
  5.2378 -
  5.2379 -    if ( ss != (PSH_shadowed | smfn) )
  5.2380 -        FAILPT("ptbase shadow inconsistent1");
  5.2381 -
  5.2382 -    errors += check_l2_table(d, ptbase_pfn, smfn);
  5.2383 +    errors += check_l2_table(d, ptbase_mfn, smfn, oos_pdes);
  5.2384  
  5.2385      gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
  5.2386      spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
  5.2387  
  5.2388      /* Go back and recurse. */
  5.2389 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  5.2390 +    if ( shadow_mode_external(d) )
  5.2391 +        limit = L2_PAGETABLE_ENTRIES;
  5.2392 +    else
  5.2393 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  5.2394 +
  5.2395 +    for ( i = 0; i < limit; i++ )
  5.2396      {
  5.2397          unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT;
  5.2398          unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  5.2399 @@ -1048,7 +2264,7 @@ int _check_pagetable(struct domain *d, p
  5.2400  
  5.2401          if ( l2_pgentry_val(spl2e[i]) != 0 )
  5.2402          {
  5.2403 -            errors += check_l1_table(d, gl1mfn, sl1mfn, i);
  5.2404 +            errors += check_l1_table(d, gl1pfn, gl1mfn, sl1mfn, i);
  5.2405          }
  5.2406      }
  5.2407  
  5.2408 @@ -1057,22 +2273,23 @@ int _check_pagetable(struct domain *d, p
  5.2409  
  5.2410      SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
  5.2411               sh_l2_present, sh_l1_present);
  5.2412 - 
  5.2413 -#if 1
  5.2414 +
  5.2415 + out:
  5.2416      if ( errors )
  5.2417          BUG();
  5.2418 -#endif
  5.2419 +
  5.2420 +    shadow_unlock(d);
  5.2421  
  5.2422      return errors;
  5.2423  }
  5.2424  
  5.2425 -int _check_all_pagetables(struct domain *d, char *s)
  5.2426 +int _check_all_pagetables(struct exec_domain *ed, char *s)
  5.2427  {
  5.2428 -    int i, j;
  5.2429 +    struct domain *d = ed->domain;
  5.2430 +    int i;
  5.2431      struct shadow_status *a;
  5.2432      unsigned long gmfn;
  5.2433      int errors = 0;
  5.2434 -    int cpu;
  5.2435  
  5.2436      shadow_status_noswap = 1;
  5.2437  
  5.2438 @@ -1084,22 +2301,34 @@ int _check_all_pagetables(struct domain 
  5.2439      for (i = 0; i < shadow_ht_buckets; i++)
  5.2440      {
  5.2441          a = &d->arch.shadow_ht[i];
  5.2442 -        while ( a && a->pfn )
  5.2443 +        while ( a && a->gpfn_and_flags )
  5.2444          {
  5.2445 -            gmfn = __gpfn_to_mfn(d, a->pfn);
  5.2446 -            switch ( frame_table[a->pfn].u.inuse.type_info & PGT_type_mask )
  5.2447 +            gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
  5.2448 +
  5.2449 +            switch ( a->gpfn_and_flags & PGT_type_mask )
  5.2450              {
  5.2451 -            case PGT_l1_page_table:
  5.2452 -                errors += check_l1_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask, 0);
  5.2453 +            case PGT_l1_shadow:
  5.2454 +                errors += check_l1_table(d, a->gpfn_and_flags & PGT_mfn_mask,
  5.2455 +                                         gmfn, a->smfn, 0);
  5.2456                  break;
  5.2457 -            case PGT_l2_page_table:
  5.2458 -                errors += check_l2_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask);
  5.2459 +            case PGT_l2_shadow:
  5.2460 +                errors += check_l2_table(d, gmfn, a->smfn,
  5.2461 +                                         page_out_of_sync(pfn_to_page(gmfn)));
  5.2462 +                break;
  5.2463 +            case PGT_l3_shadow:
  5.2464 +            case PGT_l4_shadow:
  5.2465 +            case PGT_hl2_shadow:
  5.2466 +                BUG(); // XXX - ought to fix this...
  5.2467 +                break;
  5.2468 +            case PGT_snapshot:
  5.2469                  break;
  5.2470              default:
  5.2471                  errors++;
  5.2472 -                printk("unexpected page type 0x%08x, pfn=0x%08x, gmfn=0x%08x\n",
  5.2473 -                       frame_table[gmfn].u.inuse.type_info,
  5.2474 -                       a->pfn, gmfn);
  5.2475 +                printk("unexpected shadow type %p, gpfn=%p, "
  5.2476 +                       "gmfn=%p smfn=%p\n",
  5.2477 +                       a->gpfn_and_flags & PGT_type_mask,
  5.2478 +                       a->gpfn_and_flags & PGT_mfn_mask,
  5.2479 +                       gmfn, a->smfn);
  5.2480                  BUG();
  5.2481              }
  5.2482              a = a->next;
  5.2483 @@ -1108,52 +2337,8 @@ int _check_all_pagetables(struct domain 
  5.2484  
  5.2485      shadow_status_noswap = 0;
  5.2486  
  5.2487 -    for (i = 0; i < 1024; i++)
  5.2488 -    {
  5.2489 -        if ( l2_pgentry_val(shadow_linear_l2_table[i]) & _PAGE_PRESENT )
  5.2490 -        {
  5.2491 -            unsigned base = i << 10;
  5.2492 -            for (j = 0; j < 1024; j++)
  5.2493 -            {
  5.2494 -                if ( (l1_pgentry_val(shadow_linear_pg_table[base + j]) & PAGE_MASK) == 0x0143d000 )
  5.2495 -                {
  5.2496 -                    printk("sh_ln_pg_tb[0x%08x] => 0x%08lx ",
  5.2497 -                           base + j,
  5.2498 -                           l1_pgentry_val(shadow_linear_pg_table[base + j]));
  5.2499 -                    if ( l1_pgentry_val(shadow_linear_pg_table[base + j]) & _PAGE_PRESENT )
  5.2500 -                        printk(" first entry => 0x%08lx\n",
  5.2501 -                               *(unsigned long *)((base + j) << PAGE_SHIFT));
  5.2502 -                    else
  5.2503 -                        printk(" page not present\n");
  5.2504 -                }
  5.2505 -            }
  5.2506 -        }
  5.2507 -    }
  5.2508 -
  5.2509      if ( errors )
  5.2510 -    {
  5.2511 -        printk("VM_ASSIST(d, VMASST_TYPE_writable_pagetables) => %d\n",
  5.2512 -               VM_ASSIST(d, VMASST_TYPE_writable_pagetables));
  5.2513 -        for ( cpu = 0; cpu < smp_num_cpus; cpu++ )
  5.2514 -        {
  5.2515 -            for ( j = 0; j < ARRAY_SIZE(ptwr_info->ptinfo); j++)
  5.2516 -            {
  5.2517 -                printk("ptwr_info[%d].ptinfo[%d].l1va => 0x%08x\n",
  5.2518 -                       cpu, j, ptwr_info[cpu].ptinfo[j].l1va);
  5.2519 -                printk("ptwr_info[%d].ptinfo[%d].pl1e => 0x%08x\n",
  5.2520 -                       cpu, j, ptwr_info[cpu].ptinfo[j].pl1e);
  5.2521 -                if (cpu == smp_processor_id())
  5.2522 -                    printk("v2m(ptwr_info[%d].ptinfo[%d].pl1e) => 0x%08x\n",
  5.2523 -                           cpu, j, v2m(ptwr_info[cpu].ptinfo[j].pl1e));
  5.2524 -                printk("ptwr_info[%d].ptinfo[%d].page => 0x%08x\n",
  5.2525 -                       cpu, j, ptwr_info[cpu].ptinfo[j].page);
  5.2526 -                if (cpu == smp_processor_id())
  5.2527 -                    printk("v2m(ptwr_info[%d].ptinfo[%d].page) => 0x%08x\n",
  5.2528 -                           cpu, j, v2m(ptwr_info[cpu].ptinfo[j].page));
  5.2529 -            }
  5.2530 -        }
  5.2531          BUG();
  5.2532 -    }
  5.2533  
  5.2534      return errors;
  5.2535  }
     6.1 --- a/xen/arch/x86/traps.c	Mon Mar 14 18:44:10 2005 +0000
     6.2 +++ b/xen/arch/x86/traps.c	Mon Mar 14 22:07:47 2005 +0000
     6.3 @@ -114,7 +114,7 @@ asmlinkage void fatal_trap(int trapnr, s
     6.4      if ( trapnr == TRAP_page_fault )
     6.5      {
     6.6          __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
     6.7 -        printk("Faulting linear address might be %0lx %lx\n", cr2, cr2);
     6.8 +        printk("Faulting linear address might be %p\n", cr2);
     6.9      }
    6.10  
    6.11      printk("************************************\n");
    6.12 @@ -269,6 +269,8 @@ asmlinkage int do_page_fault(struct xen_
    6.13  
    6.14      DEBUGGER_trap_entry(TRAP_page_fault, regs);
    6.15  
    6.16 +    //printk("do_page_fault(eip=%p, va=%p, code=%d)\n", regs->eip, addr, regs->error_code);
    6.17 +
    6.18      perfc_incrc(page_faults);
    6.19  
    6.20      if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
    6.21 @@ -295,9 +297,12 @@ asmlinkage int do_page_fault(struct xen_
    6.22          UNLOCK_BIGLOCK(d);
    6.23      }
    6.24  
    6.25 -    if ( unlikely(shadow_mode_enabled(d)) && 
    6.26 -         (addr < PAGE_OFFSET) && shadow_fault(addr, regs) )
    6.27 +    if ( unlikely(shadow_mode_enabled(d)) &&
    6.28 +         ((addr < PAGE_OFFSET) || shadow_mode_external(d)) &&
    6.29 +         shadow_fault(addr, regs) )
    6.30 +    {
    6.31          return EXCRET_fault_fixed;
    6.32 +    }
    6.33  
    6.34      if ( unlikely(addr >= LDT_VIRT_START(ed)) && 
    6.35           (addr < (LDT_VIRT_START(ed) + (ed->arch.ldt_ents*LDT_ENTRY_SIZE))) )
     7.1 --- a/xen/arch/x86/vmx.c	Mon Mar 14 18:44:10 2005 +0000
     7.2 +++ b/xen/arch/x86/vmx.c	Mon Mar 14 22:07:47 2005 +0000
     7.3 @@ -106,6 +106,7 @@ static void inline __update_guest_eip(un
     7.4  
     7.5  static int vmx_do_page_fault(unsigned long va, struct xen_regs *regs) 
     7.6  {
     7.7 +    struct exec_domain *ed = current;
     7.8      unsigned long eip;
     7.9      unsigned long gpte, gpa;
    7.10      int result;
    7.11 @@ -123,9 +124,9 @@ static int vmx_do_page_fault(unsigned lo
    7.12       * If vpagetable is zero, then we are still emulating 1:1 page tables,
    7.13       * and we should have never gotten here.
    7.14       */
    7.15 -    if ( !current->arch.guest_vtable )
    7.16 +    if ( !test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state) )
    7.17      {
    7.18 -        printk("vmx_do_page_fault while still running on 1:1 page table\n");
    7.19 +        printk("vmx_do_page_fault while running on 1:1 page table\n");
    7.20          return 0;
    7.21      }
    7.22  
    7.23 @@ -269,21 +270,17 @@ static void vmx_vmexit_do_invlpg(unsigne
    7.24  {
    7.25      unsigned long eip;
    7.26      struct exec_domain *ed = current;
    7.27 -    unsigned int index;
    7.28  
    7.29      __vmread(GUEST_EIP, &eip);
    7.30  
    7.31 -    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%p, va=%p",
    7.32 -            eip, va);
    7.33 +    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%p, va=%p",
    7.34 +                eip, va);
    7.35  
    7.36      /*
    7.37       * We do the safest things first, then try to update the shadow
    7.38       * copying from guest
    7.39       */
    7.40      shadow_invlpg(ed, va);
    7.41 -    index = l2_table_offset(va);
    7.42 -    ed->arch.hl2_vtable[index] = 
    7.43 -        mk_l2_pgentry(0); /* invalidate pgd cache */
    7.44  }
    7.45  
    7.46  static void vmx_io_instruction(struct xen_regs *regs, 
    7.47 @@ -428,14 +425,6 @@ static void mov_to_cr(int gp, int cr, st
    7.48              }
    7.49              old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT;
    7.50  
    7.51 -            /* We know that none of the previous 1:1 shadow pages are
    7.52 -             * going to be used again, so might as well flush them.
    7.53 -             * XXXX wait until the last VCPU boots before doing the flush !!
    7.54 -             */
    7.55 -            shadow_lock(d->domain);
    7.56 -            free_shadow_state(d->domain); // XXX SMP
    7.57 -            shadow_unlock(d->domain);
    7.58 -
    7.59              /*
    7.60               * Now arch.guest_table points to machine physical.
    7.61               */
    7.62 @@ -469,7 +458,6 @@ static void mov_to_cr(int gp, int cr, st
    7.63              break;
    7.64          }
    7.65          
    7.66 -        hl2_table_invalidate(d);
    7.67          /*
    7.68           * We make a new one if the shadow does not exist.
    7.69           */
    7.70 @@ -482,8 +470,7 @@ static void mov_to_cr(int gp, int cr, st
    7.71              mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
    7.72              if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table))
    7.73                  __vmx_bug(regs);
    7.74 -            vmx_shadow_clear_state(d->domain);
    7.75 -            shadow_invalidate(d);
    7.76 +            shadow_sync_all(d->domain);
    7.77          } else {
    7.78              /*
    7.79               * If different, make a shadow. Check if the PDBR is valid
    7.80 @@ -525,8 +512,6 @@ static void mov_to_cr(int gp, int cr, st
    7.81           */
    7.82          if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
    7.83              vmx_shadow_clear_state(d->domain);
    7.84 -            shadow_invalidate(d);
    7.85 -            hl2_table_invalidate(d);
    7.86          }
    7.87          break;
    7.88      default:
     8.1 --- a/xen/arch/x86/x86_32/domain_build.c	Mon Mar 14 18:44:10 2005 +0000
     8.2 +++ b/xen/arch/x86/x86_32/domain_build.c	Mon Mar 14 22:07:47 2005 +0000
     8.3 @@ -49,6 +49,8 @@ int construct_dom0(struct domain *d,
     8.4      char *image_start  = (char *)_image_start;  /* use lowmem mappings */
     8.5      char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
     8.6  
     8.7 +    int shadow_dom0 = 1; // HACK ALERT !!  Force dom0 to run in shadow mode.
     8.8 +
     8.9      /*
    8.10       * This fully describes the memory layout of the initial domain. All 
    8.11       * *_start address are page-aligned, except v_start (and v_end) which are 
    8.12 @@ -260,8 +262,14 @@ int construct_dom0(struct domain *d,
    8.13      l1tab += l1_table_offset(vpt_start);
    8.14      for ( count = 0; count < nr_pt_pages; count++ ) 
    8.15      {
    8.16 -        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
    8.17          page = &frame_table[l1_pgentry_to_pfn(*l1tab)];
    8.18 +
    8.19 +        if ( !shadow_dom0 )
    8.20 +            *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
    8.21 +        else
    8.22 +            if ( !get_page_type(page, PGT_writable_page) )
    8.23 +                BUG();
    8.24 +
    8.25          if ( count == 0 )
    8.26          {
    8.27              page->u.inuse.type_info &= ~PGT_type_mask;
    8.28 @@ -380,13 +388,11 @@ int construct_dom0(struct domain *d,
    8.29  
    8.30      new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
    8.31  
    8.32 -#ifndef NDEBUG
    8.33 -    if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
    8.34 +    if ( shadow_dom0 )
    8.35      {
    8.36          shadow_mode_enable(d, SHM_enable); 
    8.37          update_pagetables(ed); /* XXX SMP */
    8.38      }
    8.39 -#endif
    8.40  
    8.41      return 0;
    8.42  }
     9.1 --- a/xen/arch/x86/x86_32/domain_page.c	Mon Mar 14 18:44:10 2005 +0000
     9.2 +++ b/xen/arch/x86/x86_32/domain_page.c	Mon Mar 14 22:07:47 2005 +0000
     9.3 @@ -91,6 +91,8 @@ void *map_domain_mem(unsigned long pa)
     9.4  void unmap_domain_mem(void *va)
     9.5  {
     9.6      unsigned int idx;
     9.7 +    ASSERT((void *)MAPCACHE_VIRT_START <= va);
     9.8 +    ASSERT(va < (void *)MAPCACHE_VIRT_END);
     9.9      idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
    9.10      mapcache[idx] |= READY_FOR_TLB_FLUSH;
    9.11  }
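
The ASSERTs added to unmap_domain_mem() above only make sense in light of how the mapcache index is derived from the virtual address on the line that follows them. Below is a minimal standalone sketch of that arithmetic; the MAPCACHE_VIRT_START value, cache size, and 4 KB page size are illustrative assumptions, not the hypervisor's actual constants.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT           12                   /* 4 KB pages (assumption) */
    #define MAPCACHE_VIRT_START  0xffc00000UL         /* illustrative value only */
    #define MAPCACHE_ENTRIES     1024                 /* illustrative size       */
    #define MAPCACHE_VIRT_END    (MAPCACHE_VIRT_START + \
                                  ((unsigned long)MAPCACHE_ENTRIES << PAGE_SHIFT))

    int main(void)
    {
        /* A va as map_domain_mem() might return for the 3rd mapcache slot. */
        unsigned long va = MAPCACHE_VIRT_START + (3UL << PAGE_SHIFT) + 0x123;

        /* These range checks mirror the new ASSERTs in unmap_domain_mem(). */
        assert(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);

        unsigned long idx = (va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
        printf("idx=%lu\n", idx);   /* prints 3: the offset within the page is ignored */
        return 0;
    }
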
    10.1 --- a/xen/common/dom_mem_ops.c	Mon Mar 14 18:44:10 2005 +0000
    10.2 +++ b/xen/common/dom_mem_ops.c	Mon Mar 14 22:07:47 2005 +0000
    10.3 @@ -14,6 +14,7 @@
    10.4  #include <xen/sched.h>
    10.5  #include <xen/event.h>
    10.6  #include <asm/domain_page.h>
    10.7 +#include <asm/shadow.h>
    10.8  
    10.9  /*
   10.10   * To allow safe resume of do_dom_mem_op() after preemption, we need to know 
   10.11 @@ -111,6 +112,27 @@ free_dom_mem(struct domain *d,
   10.12              if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
   10.13                  put_page(page);
   10.14  
   10.15 +            if ( unlikely(shadow_mode_enabled(d)) )
   10.16 +            {
   10.17 +                // XXX This needs more thought.  This isn't pretty,
    10.18 +                // and it's not fast.  But it's a placeholder.
   10.19 +                //
   10.20 +                shadow_lock(d);
   10.21 +                if ( page_out_of_sync(page) )
   10.22 +                    __shadow_sync_mfn(d, mpfn + j);
   10.23 +                shadow_remove_all_access(d, mpfn + j);
   10.24 +
   10.25 +                if (page->count_info != 1)
   10.26 +                {
   10.27 +                    printk("free_dom_mem in shadow mode didn't release page "
   10.28 +                           "mfn=%p c=%p\n", mpfn+j, page->count_info);
   10.29 +                    shadow_unlock(d);
   10.30 +                    audit_domain(d);
   10.31 +                    BUG();
   10.32 +                }
   10.33 +                shadow_unlock(d);
   10.34 +            }
   10.35 +
   10.36              put_page(page);
   10.37          }
   10.38      }
    11.1 --- a/xen/common/keyhandler.c	Mon Mar 14 18:44:10 2005 +0000
    11.2 +++ b/xen/common/keyhandler.c	Mon Mar 14 22:07:47 2005 +0000
    11.3 @@ -188,7 +188,7 @@ void initialize_keytable(void)
    11.4      register_keyhandler(
    11.5          'o', audit_domains_key,  "audit domains >0 EXPERIMENTAL");
    11.6      register_keyhandler(
    11.7 -        'T', debugtrace_key, "dump debugtrace");
    11.8 +        'T', debugtrace_key, "toggle debugtrace to console/buffer");
    11.9  #endif
   11.10  
   11.11  #ifdef PERF_COUNTERS
    12.1 --- a/xen/common/page_alloc.c	Mon Mar 14 18:44:10 2005 +0000
    12.2 +++ b/xen/common/page_alloc.c	Mon Mar 14 22:07:47 2005 +0000
    12.3 @@ -29,6 +29,7 @@
    12.4  #include <xen/slab.h>
    12.5  #include <xen/irq.h>
    12.6  #include <asm/domain_page.h>
    12.7 +#include <asm/shadow.h>
    12.8  
    12.9  /*
   12.10   * Comma-separated list of hexadecimal page numbers containing bad bytes.
   12.11 @@ -566,7 +567,23 @@ void free_domheap_pages(struct pfn_info 
   12.12  
   12.13          for ( i = 0; i < (1 << order); i++ )
   12.14          {
   12.15 -            ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
   12.16 +            if ( ((pg[i].u.inuse.type_info & PGT_count_mask) != 0) &&
   12.17 +                shadow_mode_enabled(d) )
   12.18 +            {
   12.19 +                // XXX This needs more thought...
   12.20 +                //
   12.21 +                printk("%s: needing to call shadow_remove_all_access for mfn=%p\n",
   12.22 +                       __func__, page_to_pfn(&pg[i]));
   12.23 +                printk("Amfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
   12.24 +                       pg[i].count_info, pg[i].u.inuse.type_info);
   12.25 +                shadow_lock(d);
   12.26 +                shadow_remove_all_access(d, page_to_pfn(&pg[i]));
   12.27 +                shadow_unlock(d);
   12.28 +                printk("Bmfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
   12.29 +                       pg[i].count_info, pg[i].u.inuse.type_info);
   12.30 +            }
   12.31 +
   12.32 +            ASSERT( (pg[i].u.inuse.type_info & PGT_count_mask) == 0 );
   12.33              pg[i].tlbflush_timestamp  = tlbflush_current_time();
   12.34              pg[i].u.free.cpu_mask     = cpu_mask;
   12.35              list_del(&pg[i].list);
    13.1 --- a/xen/common/schedule.c	Mon Mar 14 18:44:10 2005 +0000
    13.2 +++ b/xen/common/schedule.c	Mon Mar 14 22:07:47 2005 +0000
    13.3 @@ -423,6 +423,9 @@ void __enter_scheduler(void)
    13.4      
    13.5      perfc_incrc(sched_ctx);
    13.6  
    13.7 +    // Q: With full shadow mode, do we need to flush out-of-sync pages
    13.8 +    //    before switching domains?  Current belief is NO.
    13.9 +
   13.10      if ( !is_idle_task(prev->domain) )
   13.11      {
   13.12          LOCK_BIGLOCK(prev->domain);
    14.1 --- a/xen/include/asm-x86/domain.h	Mon Mar 14 18:44:10 2005 +0000
    14.2 +++ b/xen/include/asm-x86/domain.h	Mon Mar 14 22:07:47 2005 +0000
    14.3 @@ -35,11 +35,21 @@ struct arch_domain
    14.4      unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
    14.5  
    14.6      /* shadow mode stats */
    14.7 -    unsigned int shadow_page_count;     
    14.8 -    unsigned int shadow_fault_count;     
    14.9 -    unsigned int shadow_dirty_count;     
   14.10 -    unsigned int shadow_dirty_net_count;     
   14.11 -    unsigned int shadow_dirty_block_count;     
   14.12 +    unsigned int shadow_page_count;
   14.13 +    unsigned int hl2_page_count;
   14.14 +    unsigned int snapshot_page_count;
   14.15 +
   14.16 +    unsigned int shadow_fault_count;
   14.17 +    unsigned int shadow_dirty_count;
   14.18 +    unsigned int shadow_dirty_net_count;
   14.19 +    unsigned int shadow_dirty_block_count;
   14.20 +
   14.21 +    /* full shadow mode */
   14.22 +    struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
   14.23 +    struct out_of_sync_entry *out_of_sync_free;
   14.24 +    struct out_of_sync_entry *out_of_sync_extras;
   14.25 +    unsigned int out_of_sync_extras_count;
   14.26 +
   14.27  } __cacheline_aligned;
   14.28  
   14.29  struct arch_exec_domain
   14.30 @@ -109,8 +119,8 @@ struct arch_exec_domain
   14.31  
   14.32      l2_pgentry_t *guest_vtable;         /* virtual address of pagetable */
   14.33      l2_pgentry_t *shadow_vtable;        /* virtual address of shadow_table */
   14.34 -    l2_pgentry_t *hl2_vtable;			/* virtual address of hl2_table */
   14.35      l2_pgentry_t *monitor_vtable;		/* virtual address of monitor_table */
   14.36 +    l1_pgentry_t *hl2_vtable;			/* virtual address of hl2_table */
   14.37  
   14.38      /* Virtual CR2 value. Can be read/written by guest. */
   14.39      unsigned long guest_cr2;
    15.1 --- a/xen/include/asm-x86/mm.h	Mon Mar 14 18:44:10 2005 +0000
    15.2 +++ b/xen/include/asm-x86/mm.h	Mon Mar 14 22:07:47 2005 +0000
    15.3 @@ -69,7 +69,16 @@ struct pfn_info
    15.4  #define PGT_gdt_page        (5<<29) /* using this page in a GDT? */
    15.5  #define PGT_ldt_page        (6<<29) /* using this page in an LDT? */
    15.6  #define PGT_writable_page   (7<<29) /* has writable mappings of this page? */
    15.7 +
    15.8 +#define PGT_l1_shadow       PGT_l1_page_table
    15.9 +#define PGT_l2_shadow       PGT_l2_page_table
   15.10 +#define PGT_l3_shadow       PGT_l3_page_table
   15.11 +#define PGT_l4_shadow       PGT_l4_page_table
   15.12 +#define PGT_hl2_shadow      (5<<29)
   15.13 +#define PGT_snapshot        (6<<29)
   15.14 +
   15.15  #define PGT_type_mask       (7<<29) /* Bits 29-31. */
   15.16 +
   15.17   /* Has this page been validated for use as its current type? */
   15.18  #define _PGT_validated      28
   15.19  #define PGT_validated       (1U<<_PGT_validated)
   15.20 @@ -86,11 +95,19 @@ struct pfn_info
   15.21   /* 17-bit count of uses of this frame as its current type. */
   15.22  #define PGT_count_mask      ((1U<<17)-1)
   15.23  
   15.24 +#define PGT_mfn_mask        ((1U<<21)-1) /* mfn mask for shadow types */
   15.25 +
   15.26   /* Cleared when the owning guest 'frees' this page. */
   15.27  #define _PGC_allocated      31
   15.28  #define PGC_allocated       (1U<<_PGC_allocated)
   15.29 - /* 31-bit count of references to this frame. */
   15.30 -#define PGC_count_mask      ((1U<<31)-1)
   15.31 + /* Set when fullshadow mode marks a page out-of-sync */
   15.32 +#define _PGC_out_of_sync     30
   15.33 +#define PGC_out_of_sync     (1U<<_PGC_out_of_sync)
   15.34 + /* Set when fullshadow mode is using a page as a page table */
   15.35 +#define _PGC_page_table      29
   15.36 +#define PGC_page_table      (1U<<_PGC_page_table)
   15.37 + /* 29-bit count of references to this frame. */
   15.38 +#define PGC_count_mask      ((1U<<29)-1)
   15.39  
   15.40  /* We trust the slab allocator in slab.c, and our use of it. */
   15.41  #define PageSlab(page)	    (1)
   15.42 @@ -112,6 +129,8 @@ static inline u32 pickle_domptr(struct d
   15.43  #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
   15.44  #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
   15.45  
   15.46 +#define page_out_of_sync(_p)  ((_p)->count_info & PGC_out_of_sync)
   15.47 +
   15.48  #define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                   \
   15.49      do {                                                                    \
   15.50          page_set_owner((_pfn), (_dom));                                     \
   15.51 @@ -135,6 +154,11 @@ void init_frametable(void);
   15.52  
   15.53  int alloc_page_type(struct pfn_info *page, unsigned int type);
   15.54  void free_page_type(struct pfn_info *page, unsigned int type);
   15.55 +extern void invalidate_shadow_ldt(struct exec_domain *d);
   15.56 +extern u32 shadow_remove_all_write_access(
   15.57 +    struct domain *d, unsigned min_type, unsigned max_type,
   15.58 +    unsigned long gpfn);
   15.59 +extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
   15.60  
   15.61  static inline void put_page(struct pfn_info *page)
   15.62  {
   15.63 @@ -166,8 +190,10 @@ static inline int get_page(struct pfn_in
   15.64               unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
   15.65               unlikely(d != _domain) )                /* Wrong owner? */
   15.66          {
   15.67 -            DPRINTK("Error pfn %p: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
   15.68 -                    page_to_pfn(page), domain, unpickle_domptr(d),
   15.69 +            DPRINTK("Error pfn %p: rd=%p(%d), od=%p(%d), caf=%08x, taf=%08x\n",
   15.70 +                    page_to_pfn(page), domain, (domain ? domain->id : -1),
   15.71 +                    page_get_owner(page),
   15.72 +                    (page_get_owner(page) ? page_get_owner(page)->id : -1),
   15.73                      x, page->u.inuse.type_info);
   15.74              return 0;
   15.75          }
   15.76 @@ -184,6 +210,8 @@ static inline int get_page(struct pfn_in
   15.77  
   15.78  void put_page_type(struct pfn_info *page);
   15.79  int  get_page_type(struct pfn_info *page, u32 type);
   15.80 +int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
   15.81 +void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
   15.82  
   15.83  static inline void put_page_and_type(struct pfn_info *page)
   15.84  {
   15.85 @@ -207,6 +235,22 @@ static inline int get_page_and_type(stru
   15.86      return rc;
   15.87  }
   15.88  
   15.89 +static inline int mfn_is_page_table(unsigned long mfn)
   15.90 +{
   15.91 +    if ( !pfn_is_ram(mfn) )
   15.92 +        return 0;
   15.93 +
   15.94 +    return frame_table[mfn].count_info & PGC_page_table;
   15.95 +}
   15.96 +
   15.97 +static inline int page_is_page_table(struct pfn_info *page)
   15.98 +{
   15.99 +    if ( !pfn_is_ram(page_to_pfn(page)) )
  15.100 +        return 0;
  15.101 +
  15.102 +    return page->count_info & PGC_page_table;
  15.103 +}
  15.104 +
  15.105  #define ASSERT_PAGE_IS_TYPE(_p, _t)                            \
  15.106      ASSERT(((_p)->u.inuse.type_info & PGT_type_mask) == (_t)); \
  15.107      ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0)
  15.108 @@ -307,6 +351,7 @@ void ptwr_flush(const int);
  15.109  int ptwr_do_page_fault(unsigned long);
  15.110  
  15.111  int new_guest_cr3(unsigned long pfn);
  15.112 +void propagate_page_fault(unsigned long addr, u16 error_code);
  15.113  
  15.114  #define __cleanup_writable_pagetable(_what)                                 \
  15.115  do {                                                                        \
  15.116 @@ -326,14 +371,24 @@ do {                                    
  15.117                                       PTWR_CLEANUP_INACTIVE);              \
  15.118      } while ( 0 )
  15.119  
  15.120 +int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
  15.121 +
  15.122  #ifndef NDEBUG
  15.123 -void audit_domain(struct domain *d);
  15.124 +
  15.125 +#define AUDIT_ALREADY_LOCKED ( 1u << 0 )
  15.126 +#define AUDIT_ERRORS_OK      ( 1u << 1 )
  15.127 +#define AUDIT_QUIET          ( 1u << 2 )
  15.128 +
  15.129 +void _audit_domain(struct domain *d, int flags, const char *file, int line);
  15.130 +#define audit_domain(_d) _audit_domain((_d), 0, __FILE__, __LINE__)
  15.131  void audit_domains(void);
  15.132 +
  15.133  #else
  15.134 +
  15.135 +#define _audit_domain(_d, _f, _file, _line) ((void)0)
  15.136  #define audit_domain(_d) ((void)0)
  15.137  #define audit_domains()  ((void)0)
  15.138 +
  15.139  #endif
  15.140  
  15.141 -void propagate_page_fault(unsigned long addr, u16 error_code);
  15.142 -
  15.143  #endif /* __ASM_X86_MM_H__ */
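
Since the mm.h hunk above both narrows PGC_count_mask from a 31-bit to a 29-bit reference count and introduces PGC_out_of_sync and PGC_page_table in the freed-up bits, a standalone sketch may help show that the new flag bits and the count never interfere. This is an illustration only, using plain unsigned ints rather than the real pfn_info layout.

    #include <assert.h>
    #include <stdio.h>

    #define PGC_allocated    (1u << 31)          /* bit 31: page is allocated to a guest */
    #define PGC_out_of_sync  (1u << 30)          /* bit 30: fullshadow out-of-sync page  */
    #define PGC_page_table   (1u << 29)          /* bit 29: fullshadow page-table page   */
    #define PGC_count_mask   ((1u << 29) - 1)    /* bits 0-28: reference count           */

    int main(void)
    {
        unsigned int count_info = PGC_allocated | 2;   /* allocated, 2 references */

        count_info |= PGC_page_table;    /* shadow code starts treating it as a page table */
        count_info |= PGC_out_of_sync;   /* the guest has since written to it              */

        /* Flag updates leave the 29-bit reference count untouched. */
        assert((count_info & PGC_count_mask) == 2);

        printf("refs=%u out_of_sync=%d page_table=%d\n",
               count_info & PGC_count_mask,
               !!(count_info & PGC_out_of_sync),
               !!(count_info & PGC_page_table));
        return 0;
    }
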
    16.1 --- a/xen/include/asm-x86/page.h	Mon Mar 14 18:44:10 2005 +0000
    16.2 +++ b/xen/include/asm-x86/page.h	Mon Mar 14 22:07:47 2005 +0000
    16.3 @@ -57,9 +57,11 @@ typedef struct { unsigned long pt_lo; } 
    16.4  #include <asm/flushtlb.h>
    16.5  
    16.6  #define linear_pg_table ((l1_pgentry_t *)LINEAR_PT_VIRT_START)
    16.7 -#define linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START+(LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
    16.8 +#define __linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \
    16.9 +     (LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
   16.10 +#define linear_l2_table(_ed) ((_ed)->arch.guest_vtable)
   16.11  
   16.12 -#define va_to_l1mfn(_va) (l2_pgentry_val(linear_l2_table[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT)
   16.13 +#define va_to_l1mfn(_ed, _va) (l2_pgentry_val(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT)
   16.14  
   16.15  extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
   16.16  
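
The page.h change above replaces the single linear_l2_table mapping with a per-vcpu guest_vtable pointer, but the address arithmetic behind va_to_l1mfn() is unchanged. The following standalone model walks through that arithmetic assuming the 32-bit non-PAE layout this x86_32 code targets (L2_PAGETABLE_SHIFT of 22, PAGE_SHIFT of 12); the table contents and addresses are made up for illustration.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define L2_PAGETABLE_SHIFT  22            /* 32-bit, non-PAE: 1024 L2 entries */
    #define L2_ENTRIES          1024
    #define _PAGE_PRESENT       0x001UL

    int main(void)
    {
        static unsigned long guest_vtable[L2_ENTRIES];  /* stands in for ed->arch.guest_vtable */

        unsigned long va  = 0xc0801234UL;               /* some guest virtual address   */
        unsigned long mfn = 0x1a2b3UL;                  /* made-up L1 page-table frame  */

        /* Pretend the guest's L2 entry covering va points at that L1 frame. */
        guest_vtable[va >> L2_PAGETABLE_SHIFT] = (mfn << PAGE_SHIFT) | _PAGE_PRESENT;

        /* va_to_l1mfn(): index by the top 10 VA bits, then strip the flag bits. */
        unsigned long l1mfn = guest_vtable[va >> L2_PAGETABLE_SHIFT] >> PAGE_SHIFT;

        assert(l1mfn == mfn);
        printf("va=%#lx -> l2 index %lu -> l1 mfn %#lx\n",
               va, va >> L2_PAGETABLE_SHIFT, l1mfn);
        return 0;
    }
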
    17.1 --- a/xen/include/asm-x86/shadow.h	Mon Mar 14 18:44:10 2005 +0000
    17.2 +++ b/xen/include/asm-x86/shadow.h	Mon Mar 14 22:07:47 2005 +0000
    17.3 @@ -1,3 +1,22 @@
    17.4 +/******************************************************************************
    17.5 + * include/asm-x86/shadow.h
    17.6 + * 
    17.7 + * Copyright (c) 2005 Michael A Fetterman
    17.8 + * 
    17.9 + * This program is free software; you can redistribute it and/or modify
   17.10 + * it under the terms of the GNU General Public License as published by
   17.11 + * the Free Software Foundation; either version 2 of the License, or
   17.12 + * (at your option) any later version.
   17.13 + * 
   17.14 + * This program is distributed in the hope that it will be useful,
   17.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   17.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17.17 + * GNU General Public License for more details.
   17.18 + * 
   17.19 + * You should have received a copy of the GNU General Public License
   17.20 + * along with this program; if not, write to the Free Software
   17.21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   17.22 + */
   17.23  
   17.24  #ifndef _XEN_SHADOW_H
   17.25  #define _XEN_SHADOW_H
   17.26 @@ -8,11 +27,6 @@
   17.27  #include <asm/processor.h>
   17.28  #include <asm/domain_page.h>
   17.29  
   17.30 -/* Shadow PT flag bits in shadow_status */
   17.31 -#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
   17.32 -#define PSH_hl2         (1<<30) /* page is an hl2 */
   17.33 -#define PSH_pfn_mask    ((1<<21)-1)
   17.34 -
   17.35  /* Shadow PT operation mode : shadow-mode variable in arch_domain. */
   17.36  
   17.37  #define SHM_enable    (1<<0) /* we're in one of the shadow modes */
   17.38 @@ -26,8 +40,13 @@
   17.39  #define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
   17.40  
   17.41  #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
   17.42 -#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
   17.43 +#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
   17.44       (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
   17.45 +#define shadow_linear_l2_table(_ed) ((_ed)->arch.shadow_vtable)
   17.46 +
    17.47 +// easy access to the hl2 table (only valid in translated, non-external modes)
   17.48 +#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
   17.49 +     (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
   17.50  
   17.51  #define shadow_lock_init(_d) spin_lock_init(&(_d)->arch.shadow_lock)
   17.52  #define shadow_lock(_d)      spin_lock(&(_d)->arch.shadow_lock)
   17.53 @@ -36,18 +55,86 @@
   17.54  extern void shadow_mode_init(void);
   17.55  extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
   17.56  extern int shadow_fault(unsigned long va, struct xen_regs *regs);
   17.57 -extern void shadow_l1_normal_pt_update(
   17.58 -    unsigned long pa, unsigned long gpte, 
   17.59 -    unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr);
   17.60 -extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde);
   17.61 -extern void unshadow_table(unsigned long gpfn, unsigned int type);
   17.62  extern int shadow_mode_enable(struct domain *p, unsigned int mode);
   17.63 -extern void free_shadow_state(struct domain *d);
   17.64  extern void shadow_invlpg(struct exec_domain *, unsigned long);
   17.65 -extern unsigned long mk_hl2_table(struct exec_domain *ed);
   17.66 +extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
   17.67 +    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn);
   17.68 +extern void free_monitor_pagetable(struct exec_domain *ed);
   17.69 +extern void __shadow_sync_all(struct domain *d);
   17.70 +extern int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va);
   17.71 +
   17.72 +static inline unsigned long __shadow_status(
   17.73 +    struct domain *d, unsigned long gpfn, unsigned long stype);
   17.74  
   17.75  extern void vmx_shadow_clear_state(struct domain *);
   17.76  
   17.77 +/************************************************************************/
   17.78 +
   17.79 +static void inline
   17.80 +__shadow_sync_mfn(struct domain *d, unsigned long mfn)
   17.81 +{
   17.82 +    if ( d->arch.out_of_sync )
   17.83 +    {
   17.84 +        // XXX - could be smarter
   17.85 +        //
   17.86 +        __shadow_sync_all(d);
   17.87 +    }
   17.88 +}
   17.89 +
   17.90 +static void inline
   17.91 +__shadow_sync_va(struct exec_domain *ed, unsigned long va)
   17.92 +{
   17.93 +    struct domain *d = ed->domain;
   17.94 +
   17.95 +    if ( d->arch.out_of_sync && __shadow_out_of_sync(ed, va) )
   17.96 +    {
   17.97 +        // XXX - could be smarter
   17.98 +        //
   17.99 +        __shadow_sync_all(ed->domain);
  17.100 +    }
  17.101 +}
  17.102 +
  17.103 +static void inline
  17.104 +shadow_sync_all(struct domain *d)
  17.105 +{
  17.106 +    if ( unlikely(shadow_mode_enabled(d)) )
  17.107 +    {
  17.108 +        shadow_lock(d);
  17.109 +
  17.110 +        if ( d->arch.out_of_sync )
  17.111 +            __shadow_sync_all(d);
  17.112 +
  17.113 +        ASSERT(d->arch.out_of_sync == NULL);
  17.114 +
  17.115 +        shadow_unlock(d);
  17.116 +    }
  17.117 +}
  17.118 +
  17.119 +// SMP BUG: This routine can't ever be used properly in an SMP context.
  17.120 +//          It should be something like get_shadow_and_sync_va().
  17.121 +//          This probably shouldn't exist.
  17.122 +//
  17.123 +static void inline
  17.124 +shadow_sync_va(struct exec_domain *ed, unsigned long gva)
  17.125 +{
  17.126 +    struct domain *d = ed->domain;
  17.127 +    if ( unlikely(shadow_mode_enabled(d)) )
  17.128 +    {
  17.129 +        shadow_lock(d);
  17.130 +        __shadow_sync_va(ed, gva);
  17.131 +        shadow_unlock(d);
  17.132 +    }
  17.133 +}
  17.134 +
  17.135 +extern void __shadow_mode_disable(struct domain *d);
  17.136 +static inline void shadow_mode_disable(struct domain *d)
  17.137 +{
  17.138 +    if ( shadow_mode_enabled(d) )
  17.139 +        __shadow_mode_disable(d);
  17.140 +}
  17.141 +
  17.142 +/************************************************************************/
  17.143 +
  17.144  #define __mfn_to_gpfn(_d, mfn)                         \
  17.145      ( (shadow_mode_translate(_d))                      \
  17.146        ? machine_to_phys_mapping[(mfn)]                 \
  17.147 @@ -58,39 +145,41 @@ extern void vmx_shadow_clear_state(struc
  17.148        ? phys_to_machine_mapping(gpfn)                  \
  17.149        : (gpfn) )
  17.150  
  17.151 -extern void __shadow_mode_disable(struct domain *d);
  17.152 -static inline void shadow_mode_disable(struct domain *d)
  17.153 -{
  17.154 -    if ( shadow_mode_enabled(d) )
  17.155 -        __shadow_mode_disable(d);
  17.156 -}
  17.157 +/************************************************************************/
  17.158 +
  17.159 +struct shadow_status {
  17.160 +    unsigned long gpfn_and_flags; /* Guest pfn plus flags. */
  17.161 +    struct shadow_status *next;   /* Pull-to-front list.   */
  17.162 +    unsigned long smfn;           /* Shadow mfn.           */
  17.163 +};
  17.164 +
  17.165 +#define shadow_ht_extra_size 128
  17.166 +#define shadow_ht_buckets    256
  17.167  
  17.168 -extern unsigned long shadow_l2_table( 
  17.169 -    struct domain *d, unsigned long gmfn);
  17.170 -  
  17.171 -static inline void shadow_invalidate(struct exec_domain *ed) {
  17.172 -    if ( !VMX_DOMAIN(ed) )
  17.173 -        BUG();
  17.174 -    memset(ed->arch.shadow_vtable, 0, PAGE_SIZE);
  17.175 -}
  17.176 +struct out_of_sync_entry {
  17.177 +    struct out_of_sync_entry *next;
  17.178 +    unsigned long gpfn;    /* why is this here? */
  17.179 +    unsigned long gmfn;
  17.180 +    unsigned long snapshot_mfn;
  17.181 +    unsigned long writable_pl1e; /* NB: this is a machine address */
  17.182 +};
  17.183 +
  17.184 +#define out_of_sync_extra_size 127
  17.185 +
  17.186 +#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
  17.187 +
  17.188 +/************************************************************************/
  17.189  
  17.190  #define SHADOW_DEBUG 0
  17.191  #define SHADOW_VERBOSE_DEBUG 0
  17.192 +#define SHADOW_VVERBOSE_DEBUG 0
  17.193  #define SHADOW_HASH_DEBUG 0
  17.194 +#define FULLSHADOW_DEBUG 0
  17.195  
  17.196  #if SHADOW_DEBUG
  17.197  extern int shadow_status_noswap;
  17.198  #endif
  17.199  
  17.200 -struct shadow_status {
  17.201 -    unsigned long pfn;            /* Guest pfn.             */
  17.202 -    unsigned long smfn_and_flags; /* Shadow mfn plus flags. */
  17.203 -    struct shadow_status *next;   /* Pull-to-front list.    */
  17.204 -};
  17.205 -
  17.206 -#define shadow_ht_extra_size 128
  17.207 -#define shadow_ht_buckets    256
  17.208 -
  17.209  #ifdef VERBOSE
  17.210  #define SH_LOG(_f, _a...)                                               \
  17.211      printk("DOM%uP%u: SH_LOG(%d): " _f "\n",                            \
  17.212 @@ -99,7 +188,7 @@ struct shadow_status {
  17.213  #define SH_LOG(_f, _a...) 
  17.214  #endif
  17.215  
  17.216 -#if SHADOW_DEBUG
  17.217 +#if SHADOW_VERBOSE_DEBUG
  17.218  #define SH_VLOG(_f, _a...)                                              \
  17.219      printk("DOM%uP%u: SH_VLOG(%d): " _f "\n",                           \
  17.220             current->domain->id, current->processor, __LINE__ , ## _a )
  17.221 @@ -107,7 +196,7 @@ struct shadow_status {
  17.222  #define SH_VLOG(_f, _a...) 
  17.223  #endif
  17.224  
  17.225 -#if SHADOW_VERBOSE_DEBUG
  17.226 +#if SHADOW_VVERBOSE_DEBUG
  17.227  #define SH_VVLOG(_f, _a...)                                             \
  17.228      printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n",                          \
  17.229             current->domain->id, current->processor, __LINE__ , ## _a )
  17.230 @@ -115,60 +204,148 @@ struct shadow_status {
  17.231  #define SH_VVLOG(_f, _a...)
  17.232  #endif
  17.233  
  17.234 -// BUG: mafetter: this assumes ed == current, so why pass ed?
  17.235 -static inline void __shadow_get_l2e(
  17.236 -    struct exec_domain *ed, unsigned long va, unsigned long *sl2e)
  17.237 -{
  17.238 -    if ( !likely(shadow_mode_enabled(ed->domain)) )
  17.239 -        BUG();
  17.240 +#if FULLSHADOW_DEBUG
  17.241 +#define FSH_LOG(_f, _a...)                                              \
  17.242 +    printk("DOM%uP%u: FSH_LOG(%d): " _f "\n",                           \
  17.243 +           current->domain->id, current->processor, __LINE__ , ## _a )
  17.244 +#else
  17.245 +#define FSH_LOG(_f, _a...) 
  17.246 +#endif
  17.247 +
  17.248 +
  17.249 +/************************************************************************/
  17.250  
  17.251 -    if ( shadow_mode_translate(ed->domain) )
  17.252 -        *sl2e = l2_pgentry_val(
  17.253 -            ed->arch.shadow_vtable[l2_table_offset(va)]);       
  17.254 -    else 
  17.255 -        *sl2e = l2_pgentry_val(
  17.256 -            shadow_linear_l2_table[l2_table_offset(va)]);
  17.257 +static inline void
  17.258 +__shadow_get_l2e(
  17.259 +    struct exec_domain *ed, unsigned long va, unsigned long *psl2e)
  17.260 +{
  17.261 +    ASSERT(shadow_mode_enabled(ed->domain));
  17.262 +
  17.263 +    *psl2e = l2_pgentry_val( ed->arch.shadow_vtable[l2_table_offset(va)]);
  17.264 +}
  17.265 +
  17.266 +static inline void
  17.267 +__shadow_set_l2e(
  17.268 +    struct exec_domain *ed, unsigned long va, unsigned long value)
  17.269 +{
  17.270 +    ASSERT(shadow_mode_enabled(ed->domain));
  17.271 +
  17.272 +    ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  17.273  }
  17.274  
  17.275 -static inline void __shadow_set_l2e(
  17.276 +static inline void
  17.277 +__guest_get_l2e(
  17.278 +    struct exec_domain *ed, unsigned long va, unsigned long *pl2e)
  17.279 +{
  17.280 +    *pl2e = l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]);
  17.281 +}
  17.282 +
  17.283 +static inline void
  17.284 +__guest_set_l2e(
  17.285      struct exec_domain *ed, unsigned long va, unsigned long value)
  17.286  {
  17.287 -    if ( !likely(shadow_mode_enabled(ed->domain)) )
  17.288 -        BUG();
  17.289 +    if ( unlikely(shadow_mode_translate(ed->domain)) )
  17.290 +    {
  17.291 +        unsigned long mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
  17.292 +        unsigned long old_hl2e =
  17.293 +            l1_pgentry_val(ed->arch.hl2_vtable[l2_table_offset(va)]);
  17.294 +        unsigned long new_hl2e =
  17.295 +            (mfn ? ((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR) : 0);
  17.296  
  17.297 -    if ( shadow_mode_translate(ed->domain) ) 
  17.298 -        ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  17.299 -    else 
  17.300 -        shadow_linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
  17.301 +        // only do the ref counting if something important changed.
  17.302 +        //
  17.303 +        if ( (old_hl2e ^ new_hl2e) & (PAGE_MASK | _PAGE_PRESENT) )
  17.304 +        {
  17.305 +            if ( new_hl2e & _PAGE_PRESENT )
  17.306 +                get_page_from_l1e(mk_l1_pgentry(new_hl2e), ed->domain);
  17.307 +            if ( old_hl2e & _PAGE_PRESENT )
  17.308 +                put_page_from_l1e(mk_l1_pgentry(old_hl2e), ed->domain);
  17.309 +        }
  17.310 +
  17.311 +        ed->arch.hl2_vtable[l2_table_offset(va)] = mk_l1_pgentry(new_hl2e);
  17.312 +    }
  17.313 +
  17.314 +    ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  17.315  }
  17.316  
  17.317 -static inline void __guest_get_l2e(
  17.318 -    struct exec_domain *ed, unsigned long va, unsigned long *l2e)
  17.319 +/************************************************************************/
  17.320 +
  17.321 +/*
  17.322 + * Add another shadow reference to smfn.
  17.323 + */
  17.324 +static inline int
  17.325 +get_shadow_ref(unsigned long smfn)
  17.326  {
  17.327 -    *l2e = ( shadow_mode_translate(ed->domain) ) ?
  17.328 -        l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]) :
  17.329 -        l2_pgentry_val(linear_l2_table[l2_table_offset(va)]);
  17.330 +    u32 x, nx;
  17.331 +
  17.332 +    ASSERT(pfn_is_ram(smfn));
  17.333 +
  17.334 +    x = frame_table[smfn].count_info;
  17.335 +    nx = x + 1;
  17.336 +
  17.337 +    if ( unlikely(nx == 0) )
  17.338 +    {
  17.339 +        printk("get_shadow_ref overflow, gmfn=%p smfn=%p\n",
  17.340 +               frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn);
  17.341 +        BUG();
  17.342 +    }
  17.343 +    
  17.344 +    // Guarded by the shadow lock...
  17.345 +    //
  17.346 +    frame_table[smfn].count_info = nx;
  17.347 +
  17.348 +    return 1;
  17.349  }
  17.350  
  17.351 -static inline void __guest_set_l2e(
  17.352 -    struct exec_domain *ed, unsigned long va, unsigned long value)
  17.353 +extern void free_shadow_page(unsigned long smfn);
  17.354 +
  17.355 +/*
  17.356 + * Drop a shadow reference to smfn.
  17.357 + */
  17.358 +static inline void
  17.359 +put_shadow_ref(unsigned long smfn)
  17.360  {
  17.361 -    if ( shadow_mode_translate(ed->domain) )
  17.362 -    {
  17.363 -        unsigned long pfn;
  17.364 +    u32 x, nx;
  17.365 +
  17.366 +    ASSERT(pfn_is_ram(smfn));
  17.367 +
  17.368 +    x = frame_table[smfn].count_info;
  17.369 +    nx = x - 1;
  17.370  
  17.371 -        pfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
  17.372 -        ed->arch.hl2_vtable[l2_table_offset(va)] =
  17.373 -            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  17.374 +    if ( unlikely(x == 0) )
  17.375 +    {
  17.376 +        printk("put_shadow_ref underflow, gmfn=%p smfn=%p\n",
  17.377 +               frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn);
  17.378 +        BUG();
  17.379 +    }
  17.380  
  17.381 -        ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  17.382 -    }
  17.383 -    else
  17.384 +    // Guarded by the shadow lock...
  17.385 +    //
  17.386 +    frame_table[smfn].count_info = nx;
  17.387 +
  17.388 +    if ( unlikely(nx == 0) )
  17.389      {
  17.390 -        linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
  17.391 +        free_shadow_page(smfn);
  17.392      }
  17.393  }
  17.394  
  17.395 +static inline void
  17.396 +shadow_pin(unsigned long smfn)
  17.397 +{
  17.398 +    ASSERT( !(frame_table[smfn].u.inuse.type_info & PGT_pinned) );
  17.399 +
  17.400 +    frame_table[smfn].u.inuse.type_info |= PGT_pinned;
  17.401 +    get_shadow_ref(smfn);
  17.402 +}
  17.403 +
  17.404 +static inline void
  17.405 +shadow_unpin(unsigned long smfn)
  17.406 +{
  17.407 +    frame_table[smfn].u.inuse.type_info &= ~PGT_pinned;
  17.408 +    put_shadow_ref(smfn);
  17.409 +}
  17.410 +
  17.411 +
  17.412  /************************************************************************/
  17.413  
  17.414  static inline int __mark_dirty(struct domain *d, unsigned int mfn)
  17.415 @@ -179,7 +356,7 @@ static inline int __mark_dirty(struct do
  17.416      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  17.417      ASSERT(d->arch.shadow_dirty_bitmap != NULL);
  17.418  
  17.419 -    pfn = machine_to_phys_mapping[mfn];
  17.420 +    pfn = __mfn_to_gpfn(d, mfn);
  17.421  
  17.422      /*
  17.423       * Values with the MSB set denote MFNs that aren't really part of the 
  17.424 @@ -226,23 +403,41 @@ static inline int mark_dirty(struct doma
  17.425  
  17.426  /************************************************************************/
  17.427  
  17.428 +extern void shadow_mark_out_of_sync(
  17.429 +    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn,
  17.430 +    unsigned long va);
  17.431 +
  17.432  static inline void l1pte_write_fault(
  17.433 -    struct domain *d, unsigned long *gpte_p, unsigned long *spte_p)
  17.434 -{ 
  17.435 +    struct exec_domain *ed, unsigned long *gpte_p, unsigned long *spte_p,
  17.436 +    unsigned long va)
  17.437 +{
  17.438 +    struct domain *d = ed->domain;
  17.439      unsigned long gpte = *gpte_p;
  17.440 -    unsigned long spte = *spte_p;
  17.441 -    unsigned long pfn = gpte >> PAGE_SHIFT;
  17.442 -    unsigned long mfn = __gpfn_to_mfn(d, pfn);
  17.443 +    unsigned long spte;
  17.444 +    unsigned long gpfn = gpte >> PAGE_SHIFT;
  17.445 +    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
  17.446 +
  17.447 +    //printk("l1pte_write_fault gmfn=%p\n", mfn);
  17.448 +
  17.449 +    if ( unlikely(!mfn) )
  17.450 +    {
  17.451 +        SH_LOG("l1pte_write_fault: invalid gpfn=%p", gpfn);
  17.452 +        *spte_p = 0;
  17.453 +        return;
  17.454 +    }
  17.455  
  17.456      ASSERT(gpte & _PAGE_RW);
  17.457      gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  17.458 -
  17.459 -    if ( shadow_mode_log_dirty(d) )
  17.460 -        __mark_dirty(d, pfn);
  17.461 -
  17.462      spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  17.463  
  17.464      SH_VVLOG("l1pte_write_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
  17.465 +
  17.466 +    if ( shadow_mode_log_dirty(d) )
  17.467 +        __mark_dirty(d, mfn);
  17.468 +
  17.469 +    if ( mfn_is_page_table(mfn) )
  17.470 +        shadow_mark_out_of_sync(ed, gpfn, mfn, va);
  17.471 +
  17.472      *gpte_p = gpte;
  17.473      *spte_p = spte;
  17.474  }
  17.475 @@ -255,11 +450,21 @@ static inline void l1pte_read_fault(
  17.476      unsigned long pfn = gpte >> PAGE_SHIFT;
  17.477      unsigned long mfn = __gpfn_to_mfn(d, pfn);
  17.478  
  17.479 +    if ( unlikely(!mfn) )
  17.480 +    {
  17.481 +        SH_LOG("l1pte_read_fault: invalid gpfn=%p", pfn);
  17.482 +        *spte_p = 0;
  17.483 +        return;
  17.484 +    }
  17.485 +
  17.486      gpte |= _PAGE_ACCESSED;
  17.487      spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  17.488  
  17.489 -    if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) )
  17.490 +    if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) ||
  17.491 +         mfn_is_page_table(mfn) )
  17.492 +    {
  17.493          spte &= ~_PAGE_RW;
  17.494 +    }
  17.495  
  17.496      SH_VVLOG("l1pte_read_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
  17.497      *gpte_p = gpte;
  17.498 @@ -267,9 +472,8 @@ static inline void l1pte_read_fault(
  17.499  }
  17.500  
  17.501  static inline void l1pte_propagate_from_guest(
  17.502 -    struct domain *d, unsigned long *gpte_p, unsigned long *spte_p)
  17.503 +    struct domain *d, unsigned long gpte, unsigned long *spte_p)
  17.504  { 
  17.505 -    unsigned long gpte = *gpte_p;
  17.506      unsigned long spte = *spte_p;
  17.507      unsigned long pfn = gpte >> PAGE_SHIFT;
  17.508      unsigned long mfn = __gpfn_to_mfn(d, pfn);
  17.509 @@ -278,33 +482,36 @@ static inline void l1pte_propagate_from_
  17.510      unsigned long old_spte = spte;
  17.511  #endif
  17.512  
  17.513 -    /* Use 1:1 page table to identify MMIO address space */
  17.514 -    if ( shadow_mode_external(d) && mmio_space(gpte) ) {
  17.515 +    if ( unlikely(!mfn) )
  17.516 +    {
  17.517 +        // likely an MMIO address space mapping...
  17.518 +        //
  17.519          *spte_p = 0;
  17.520          return;
  17.521      }
  17.522 -    
  17.523 +
  17.524      spte = 0;
  17.525      if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  17.526           (_PAGE_PRESENT|_PAGE_ACCESSED) ) {
  17.527          
  17.528          spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  17.529          
  17.530 -        if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) )
  17.531 +        if ( shadow_mode_log_dirty(d) ||
  17.532 +             !(gpte & _PAGE_DIRTY) ||
  17.533 +             mfn_is_page_table(mfn) )
  17.534 +        {
  17.535              spte &= ~_PAGE_RW;
  17.536 +        }
  17.537      }
  17.538 -        
  17.539 +
  17.540  #if SHADOW_VERBOSE_DEBUG
  17.541      if ( old_spte || spte || gpte )
  17.542 -        SH_VVLOG("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p ", gpte, old_spte, spte);
  17.543 +        debugtrace_printk("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p\n", gpte, old_spte, spte);
  17.544  #endif
  17.545  
  17.546 -    *gpte_p = gpte;
  17.547      *spte_p = spte;
  17.548  }
  17.549  
  17.550 -
  17.551 -
  17.552  static inline void l2pde_general(
  17.553      struct domain *d,
  17.554      unsigned long *gpde_p,
  17.555 @@ -312,33 +519,104 @@ static inline void l2pde_general(
  17.556      unsigned long sl1mfn)
  17.557  {
  17.558      unsigned long gpde = *gpde_p;
  17.559 -    unsigned long spde = *spde_p;
  17.560 +    unsigned long spde;
  17.561  
  17.562      spde = 0;
  17.563 -
  17.564 -    if ( sl1mfn != 0 )
  17.565 +    if ( (gpde & _PAGE_PRESENT) && (sl1mfn != 0) )
  17.566      {
  17.567          spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | 
  17.568              _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
  17.569          gpde |= _PAGE_ACCESSED; /* N.B. PDEs do not have a dirty bit. */
  17.570  
  17.571 -        /* Detect linear p.t. mappings and write-protect them. */
  17.572 -        if ( (frame_table[sl1mfn].u.inuse.type_info & PGT_type_mask) ==
  17.573 -             PGT_l2_page_table ) 
  17.574 -        {
  17.575 -            if ( !shadow_mode_translate(d) )
  17.576 -                spde = gpde & ~_PAGE_RW;
  17.577 -
  17.578 -        }
  17.579 +        // XXX mafetter: Hmm...
  17.580 +        //     Shouldn't the dirty log be checked/updated here?
  17.581 +        //     Actually, it needs to be done in this function's callers.
  17.582 +        //
  17.583 +        *gpde_p = gpde;
  17.584      }
  17.585  
  17.586 -    *gpde_p = gpde;
  17.587      *spde_p = spde;
  17.588  }
  17.589  
  17.590 +static inline void l2pde_propagate_from_guest(
  17.591 +    struct domain *d, unsigned long *gpde_p, unsigned long *spde_p)
  17.592 +{
  17.593 +    unsigned long gpde = *gpde_p, sl1mfn;
  17.594 +
  17.595 +    sl1mfn =  __shadow_status(d, gpde >> PAGE_SHIFT, PGT_l1_shadow);
  17.596 +    l2pde_general(d, gpde_p, spde_p, sl1mfn);
  17.597 +}
  17.598 +    
  17.599 +/************************************************************************/
  17.600 +
  17.601 +// returns true if a tlb flush is needed
  17.602 +//
  17.603 +static int inline
  17.604 +validate_pte_change(
  17.605 +    struct domain *d,
  17.606 +    unsigned long new_pte,
  17.607 +    unsigned long *shadow_pte_p)
  17.608 +{
  17.609 +    unsigned long old_spte, new_spte;
  17.610 +
  17.611 +    perfc_incrc(validate_pte_change);
  17.612 +
  17.613 +#if 0
   17.614 +    FSH_LOG("validate_pte_change(new_pte=%p)", new_pte);
  17.615 +#endif
  17.616 +
  17.617 +    old_spte = *shadow_pte_p;
  17.618 +    l1pte_propagate_from_guest(d, new_pte, shadow_pte_p);
  17.619 +    new_spte = *shadow_pte_p;
  17.620 +
  17.621 +    // only do the ref counting if something important changed.
  17.622 +    //
  17.623 +    if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) )
  17.624 +    {
  17.625 +        if ( new_spte & _PAGE_PRESENT )
  17.626 +            get_page_from_l1e(mk_l1_pgentry(new_spte), d);
  17.627 +        if ( old_spte & _PAGE_PRESENT )
  17.628 +            put_page_from_l1e(mk_l1_pgentry(old_spte), d);
  17.629 +    }
  17.630 +
   17.631 +    // paranoia rules! -- always request a TLB flush for now.
  17.632 +    return 1;
  17.633 +}
  17.634 +
  17.635 +// returns true if a tlb flush is needed
  17.636 +//
  17.637 +static int inline
  17.638 +validate_pde_change(
  17.639 +    struct domain *d,
  17.640 +    unsigned long new_pde,
  17.641 +    unsigned long *shadow_pde_p)
  17.642 +{
  17.643 +    unsigned long old_spde = *shadow_pde_p;
  17.644 +    unsigned long new_spde;
  17.645 +
  17.646 +    perfc_incrc(validate_pde_change);
  17.647 +
  17.648 +    l2pde_propagate_from_guest(d, &new_pde, shadow_pde_p);
  17.649 +    new_spde = *shadow_pde_p;
  17.650 +
  17.651 +    // only do the ref counting if something important changed.
  17.652 +    //
  17.653 +    if ( (old_spde ^ new_spde) & (PAGE_MASK | _PAGE_PRESENT) )
  17.654 +    {
  17.655 +        if ( new_spde & _PAGE_PRESENT )
  17.656 +            get_shadow_ref(new_spde >> PAGE_SHIFT);
  17.657 +        if ( old_spde & _PAGE_PRESENT )
  17.658 +            put_shadow_ref(old_spde >> PAGE_SHIFT);
  17.659 +    }
  17.660 +
   17.661 +    // paranoia rules! -- always request a TLB flush for now.
  17.662 +    return 1;
  17.663 +}
  17.664 +
  17.665  /*********************************************************************/
  17.666  
  17.667  #if SHADOW_HASH_DEBUG
  17.668 +
  17.669  static void shadow_audit(struct domain *d, int print)
  17.670  {
  17.671      int live = 0, free = 0, j = 0, abs;
  17.672 @@ -347,26 +625,25 @@ static void shadow_audit(struct domain *
  17.673      for ( j = 0; j < shadow_ht_buckets; j++ )
  17.674      {
  17.675          a = &d->arch.shadow_ht[j];        
  17.676 -        if ( a->pfn )
  17.677 +        if ( a->gpfn_and_flags )
  17.678          {
  17.679              live++;
  17.680 -            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
  17.681 +            ASSERT(a->smfn);
  17.682          }
  17.683          else
  17.684              ASSERT(!a->next);
  17.685 -        ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL);
  17.686 +
  17.687          a = a->next;
  17.688          while ( a && (live < 9999) )
  17.689          { 
  17.690              live++; 
  17.691 -            if ( (a->pfn == 0) || (a->smfn_and_flags == 0) )
  17.692 +            if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
  17.693              {
  17.694 -                printk("XXX live=%d pfn=%p sp=%p next=%p\n",
  17.695 -                       live, a->pfn, a->smfn_and_flags, a->next);
  17.696 +                printk("XXX live=%d gpfn+flags=%p sp=%p next=%p\n",
  17.697 +                       live, a->gpfn_and_flags, a->smfn, a->next);
  17.698                  BUG();
  17.699              }
  17.700 -            ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL);
  17.701 -            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
  17.702 +            ASSERT(a->smfn);
  17.703              a = a->next; 
  17.704          }
  17.705          ASSERT(live < 9999);
  17.706 @@ -376,21 +653,26 @@ static void shadow_audit(struct domain *
  17.707          free++; 
  17.708  
  17.709      if ( print )
  17.710 -        printk("Xlive=%d free=%d\n",live,free);
  17.711 +        printk("Xlive=%d free=%d\n", live, free);
  17.712  
  17.713      // BUG: this only works if there's only a single domain which is
  17.714      //      using shadow tables.
  17.715      //
  17.716 -    abs = ( perfc_value(shadow_l1_pages) +
  17.717 -            perfc_value(shadow_l2_pages) +
  17.718 -            perfc_value(hl2_table_pages) ) - live;
  17.719 +    abs = (
  17.720 +        perfc_value(shadow_l1_pages) +
  17.721 +        perfc_value(shadow_l2_pages) +
  17.722 +        perfc_value(hl2_table_pages) +
  17.723 +        perfc_value(snapshot_pages)
  17.724 +        ) - live;
  17.725  #ifdef PERF_COUNTERS
  17.726      if ( (abs < -1) || (abs > 1) )
  17.727      {
  17.728 -        printk("live=%d free=%d l1=%d l2=%d hl2=%d\n", live, free,
  17.729 +        printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d\n",
  17.730 +               live, free,
  17.731                 perfc_value(shadow_l1_pages),
  17.732                 perfc_value(shadow_l2_pages),
  17.733 -               perfc_value(hl2_table_pages));
  17.734 +               perfc_value(hl2_table_pages),
  17.735 +               perfc_value(snapshot_pages));
  17.736          BUG();
  17.737      }
  17.738  #endif
  17.739 @@ -411,30 +693,36 @@ static inline struct shadow_status *hash
  17.740   * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
  17.741   *      which, depending on full shadow mode, may or may not equal
  17.742   *      its mfn).
  17.743 - *      The shadow status it returns is a mfn.
  17.744 + *      It returns the shadow's mfn, or zero if it doesn't exist.
  17.745   */
  17.746 +
  17.747  static inline unsigned long __shadow_status(
  17.748 -    struct domain *d, unsigned int gpfn)
  17.749 +    struct domain *d, unsigned long gpfn, unsigned long stype)
  17.750  {
  17.751      struct shadow_status *p, *x, *head;
  17.752 +    unsigned long key = gpfn | stype;
  17.753  
  17.754      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  17.755 +    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
  17.756 +    ASSERT(stype && !(stype & ~PGT_type_mask));
  17.757 +
  17.758 +    perfc_incrc(shadow_status_calls);
  17.759  
  17.760      x = head = hash_bucket(d, gpfn);
  17.761      p = NULL;
  17.762  
  17.763 -    //SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x);
  17.764 +    //SH_VVLOG("lookup gpfn=%08x type=%08x bucket=%p", gpfn, stype, x);
  17.765      shadow_audit(d, 0);
  17.766  
  17.767      do
  17.768      {
  17.769 -        ASSERT(x->pfn || ((x == head) && (x->next == NULL)));
  17.770 +        ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
  17.771  
  17.772 -        if ( x->pfn == gpfn )
  17.773 +        if ( x->gpfn_and_flags == key )
  17.774          {
  17.775  #if SHADOW_DEBUG
  17.776              if ( unlikely(shadow_status_noswap) )
  17.777 -                return x->smfn_and_flags;
  17.778 +                return x->smfn;
  17.779  #endif
  17.780              /* Pull-to-front if 'x' isn't already the head item. */
  17.781              if ( unlikely(x != head) )
  17.782 @@ -445,13 +733,16 @@ static inline unsigned long __shadow_sta
  17.783                  head->next = x;
  17.784  
  17.785                  /* Swap 'x' contents with head contents. */
  17.786 -                SWAP(head->pfn, x->pfn);
  17.787 -                SWAP(head->smfn_and_flags, x->smfn_and_flags);
  17.788 +                SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
  17.789 +                SWAP(head->smfn, x->smfn);
  17.790 +            }
  17.791 +            else
  17.792 +            {
  17.793 +                perfc_incrc(shadow_status_hit_head);
  17.794              }
  17.795  
  17.796 -            SH_VVLOG("lookup gpfn=%p => status=%p",
  17.797 -                     gpfn, head->smfn_and_flags);
  17.798 -            return head->smfn_and_flags;
  17.799 +            SH_VVLOG("lookup gpfn=%p => status=%p", key, head->smfn);
  17.800 +            return head->smfn;
  17.801          }
  17.802  
  17.803          p = x;
  17.804 @@ -459,17 +750,68 @@ static inline unsigned long __shadow_sta
  17.805      }
  17.806      while ( x != NULL );
  17.807  
  17.808 -    SH_VVLOG("lookup gpfn=%p => status=0", gpfn);
  17.809 +    SH_VVLOG("lookup gpfn=%p => status=0", key);
  17.810 +    perfc_incrc(shadow_status_miss);
  17.811      return 0;
  17.812  }
  17.813  
  17.814  /*
   17.815 + * Not clear if pull-to-front is worthwhile for this or not,
  17.816 + * as it generally needs to scan the entire bucket anyway.
  17.817 + * Much simpler without.
  17.818 + *
  17.819 + * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
  17.820 + */
  17.821 +static inline unsigned long
  17.822 +shadow_max_pgtable_type(struct domain *d, unsigned long gpfn)
  17.823 +{
  17.824 +    struct shadow_status *x;
  17.825 +    unsigned long pttype = PGT_none, type;
  17.826 +
  17.827 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  17.828 +    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
  17.829 +
  17.830 +    x = hash_bucket(d, gpfn);
  17.831 +
  17.832 +    while ( x && x->gpfn_and_flags )
  17.833 +    {
  17.834 +        if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
  17.835 +        {
  17.836 +            type = x->gpfn_and_flags & PGT_type_mask;
  17.837 +
  17.838 +            // Treat an HL2 as if it's an L1
  17.839 +            //
  17.840 +            if ( type == PGT_hl2_shadow )
  17.841 +                type = PGT_l1_shadow;
  17.842 +
  17.843 +            // Ignore snapshots -- they don't in and of themselves constitute
  17.844 +            // treating a page as a page table
  17.845 +            //
  17.846 +            if ( type == PGT_snapshot )
  17.847 +                goto next;
  17.848 +
  17.849 +            // Early exit if we found the max possible value
  17.850 +            //
  17.851 +            if ( type == PGT_base_page_table )
  17.852 +                return type;
  17.853 +
  17.854 +            if ( type > pttype )
  17.855 +                pttype = type;
  17.856 +        }
  17.857 +    next:
  17.858 +        x = x->next;
  17.859 +    }
  17.860 +
  17.861 +    return pttype;
  17.862 +}
  17.863 +
  17.864 +/*
  17.865   * N.B. We can make this locking more fine grained (e.g., per shadow page) if
  17.866   * it ever becomes a problem, but since we need a spin lock on the hash table 
  17.867   * anyway it's probably not worth being too clever.
  17.868   */
  17.869  static inline unsigned long get_shadow_status(
  17.870 -    struct domain *d, unsigned int gpfn )
  17.871 +    struct domain *d, unsigned long gpfn, unsigned long stype)
  17.872  {
  17.873      unsigned long res;
  17.874  
  17.875 @@ -481,65 +823,66 @@ static inline unsigned long get_shadow_s
  17.876       * has changed type. If we're in log dirty mode, we should set the
  17.877       * appropriate bit in the dirty bitmap.
  17.878       * N.B. The VA update path doesn't use this and is handled independently. 
  17.879 -
  17.880 -     XXX need to think this through for vmx guests, but probably OK
  17.881 +     *
  17.882 +     * XXX need to think this through for vmx guests, but probably OK
  17.883       */
  17.884  
  17.885      shadow_lock(d);
  17.886  
  17.887      if ( shadow_mode_log_dirty(d) )
  17.888 -        __mark_dirty(d, gpfn);
  17.889 +        __mark_dirty(d, __gpfn_to_mfn(d, gpfn));
  17.890  
  17.891 -    if ( !(res = __shadow_status(d, gpfn)) )
  17.892 +    if ( !(res = __shadow_status(d, gpfn, stype)) )
  17.893          shadow_unlock(d);
  17.894  
  17.895      return res;
  17.896  }
  17.897  
  17.898  
  17.899 -static inline void put_shadow_status(
  17.900 -    struct domain *d)
  17.901 +static inline void put_shadow_status(struct domain *d)
  17.902  {
  17.903      shadow_unlock(d);
  17.904  }
  17.905  
  17.906  
  17.907  static inline void delete_shadow_status( 
  17.908 -    struct domain *d, unsigned int gpfn)
  17.909 +    struct domain *d, unsigned int gpfn, unsigned int stype)
  17.910  {
  17.911      struct shadow_status *p, *x, *n, *head;
  17.912 +    unsigned long key = gpfn | stype;
  17.913  
  17.914      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  17.915 -    ASSERT(gpfn != 0);
  17.916 +    ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask));
  17.917 +    ASSERT(stype && !(stype & ~PGT_type_mask));
  17.918  
  17.919      head = hash_bucket(d, gpfn);
  17.920  
  17.921 -    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head);
  17.922 +    SH_VLOG("delete gpfn=%p t=%p bucket=%p", gpfn, stype, head);
  17.923      shadow_audit(d, 0);
  17.924  
  17.925      /* Match on head item? */
  17.926 -    if ( head->pfn == gpfn )
  17.927 +    if ( head->gpfn_and_flags == key )
  17.928      {
  17.929          if ( (n = head->next) != NULL )
  17.930          {
  17.931              /* Overwrite head with contents of following node. */
  17.932 -            head->pfn            = n->pfn;
  17.933 -            head->smfn_and_flags = n->smfn_and_flags;
  17.934 +            head->gpfn_and_flags = n->gpfn_and_flags;
  17.935 +            head->smfn           = n->smfn;
  17.936  
  17.937              /* Delete following node. */
  17.938              head->next           = n->next;
  17.939  
  17.940              /* Add deleted node to the free list. */
  17.941 -            n->pfn            = 0;
  17.942 -            n->smfn_and_flags = 0;
  17.943 +            n->gpfn_and_flags = 0;
  17.944 +            n->smfn           = 0;
  17.945              n->next           = d->arch.shadow_ht_free;
  17.946              d->arch.shadow_ht_free = n;
  17.947          }
  17.948          else
  17.949          {
  17.950              /* This bucket is now empty. Initialise the head node. */
  17.951 -            head->pfn            = 0;
  17.952 -            head->smfn_and_flags = 0;
  17.953 +            head->gpfn_and_flags = 0;
  17.954 +            head->smfn           = 0;
  17.955          }
  17.956  
  17.957          goto found;
  17.958 @@ -550,14 +893,14 @@ static inline void delete_shadow_status(
  17.959  
  17.960      do
  17.961      {
  17.962 -        if ( x->pfn == gpfn )
  17.963 +        if ( x->gpfn_and_flags == key )
  17.964          {
  17.965              /* Delete matching node. */
  17.966              p->next = x->next;
  17.967  
  17.968              /* Add deleted node to the free list. */
  17.969 -            x->pfn            = 0;
  17.970 -            x->smfn_and_flags = 0;
  17.971 +            x->gpfn_and_flags = 0;
  17.972 +            x->smfn           = 0;
  17.973              x->next           = d->arch.shadow_ht_free;
  17.974              d->arch.shadow_ht_free = x;
  17.975  
  17.976 @@ -573,34 +916,46 @@ static inline void delete_shadow_status(
  17.977      BUG();
  17.978  
  17.979   found:
  17.980 +    // release ref to page
  17.981 +    put_page(pfn_to_page(__gpfn_to_mfn(d, gpfn)));
  17.982 +
  17.983      shadow_audit(d, 0);
  17.984  }
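delete_shadow_status() keeps the existing bucket layout: the head entry of each bucket is embedded in the hash array itself, so deleting a matching head copies its successor into place, while any other match is unlinked and pushed onto a free list. The new put_page() on the found path balances the get_page() taken when the entry was installed in set_shadow_status() below. A stand-alone sketch of the deletion pattern with simplified types (the struct and free_list here are illustrative, not the Xen ones):

    struct entry {
        unsigned long key;        /* 0 marks an empty slot */
        unsigned long val;
        struct entry *next;
    };

    static struct entry *free_list;   /* singly linked recycle list */

    static void hash_delete(struct entry *head, unsigned long key)
    {
        struct entry *p, *x, *n;

        if ( head->key == key )
        {
            if ( (n = head->next) != NULL )
            {
                *head = *n;                  /* pull successor into the head */
                n->key = n->val = 0;
                n->next = free_list;
                free_list = n;
            }
            else
                head->key = head->val = 0;   /* bucket is now empty */
            return;
        }

        for ( p = head, x = head->next; x != NULL; p = x, x = x->next )
        {
            if ( x->key == key )
            {
                p->next = x->next;           /* unlink and recycle the node */
                x->key = x->val = 0;
                x->next = free_list;
                free_list = x;
                return;
            }
        }
    }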
  17.985  
  17.986 -
  17.987  static inline void set_shadow_status(
  17.988 -    struct domain *d, unsigned int gpfn, unsigned long s)
  17.989 +    struct domain *d, unsigned long gpfn,
  17.990 +    unsigned long smfn, unsigned long stype)
  17.991  {
  17.992      struct shadow_status *x, *head, *extra;
  17.993      int i;
  17.994 +    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
  17.995 +    unsigned long key = gpfn | stype;
  17.996  
  17.997      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  17.998 -    ASSERT(gpfn != 0);
  17.999 -    ASSERT(s & (PSH_shadowed | PSH_hl2));
 17.1000 +    ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask));
 17.1001 +    ASSERT(pfn_is_ram(gmfn)); // XXX need to be more graceful
 17.1002 +    ASSERT(smfn && !(smfn & ~PGT_mfn_mask));
 17.1003 +    ASSERT(stype && !(stype & ~PGT_type_mask));
 17.1004  
 17.1005      x = head = hash_bucket(d, gpfn);
 17.1006     
 17.1007 -    SH_VVLOG("set gpfn=%08x s=%p bucket=%p(%p)", gpfn, s, x, x->next);
 17.1008 +    SH_VLOG("set gpfn=%p smfn=%p t=%p bucket=%p(%p)",
 17.1009 +             gpfn, smfn, stype, x, x->next);
 17.1010      shadow_audit(d, 0);
 17.1011  
 17.1012 +    // grab a reference to the guest page to represent the entry in the shadow
 17.1013 +    // hash table
 17.1014 +    //
 17.1015 +    get_page(pfn_to_page(gmfn), d);
 17.1016 +
 17.1017      /*
 17.1018       * STEP 1. If page is already in the table, update it in place.
 17.1019       */
 17.1020 -
 17.1021      do
 17.1022      {
 17.1023 -        if ( x->pfn == gpfn )
 17.1024 +        if ( x->gpfn_and_flags == key )
 17.1025          {
 17.1026 -            x->smfn_and_flags = s;
 17.1027 +            x->smfn = smfn;
 17.1028              goto done;
 17.1029          }
 17.1030  
 17.1031 @@ -613,10 +968,10 @@ static inline void set_shadow_status(
 17.1032       */
 17.1033  
 17.1034      /* If the bucket is empty then insert the new page as the head item. */
 17.1035 -    if ( head->pfn == 0 )
 17.1036 +    if ( head->gpfn_and_flags == 0 )
 17.1037      {
 17.1038 -        head->pfn            = gpfn;
 17.1039 -        head->smfn_and_flags = s;
 17.1040 +        head->gpfn_and_flags = key;
 17.1041 +        head->smfn           = smfn;
 17.1042          ASSERT(head->next == NULL);
 17.1043          goto done;
 17.1044      }
 17.1045 @@ -655,35 +1010,107 @@ static inline void set_shadow_status(
 17.1046      d->arch.shadow_ht_free = x->next;
 17.1047  
 17.1048      /* Initialise the new node and insert directly after the head item. */
 17.1049 -    x->pfn            = gpfn;
 17.1050 -    x->smfn_and_flags = s;
 17.1051 +    x->gpfn_and_flags = key;
 17.1052 +    x->smfn           = smfn;
 17.1053      x->next           = head->next;
 17.1054      head->next        = x;
 17.1055  
 17.1056   done:
 17.1057      shadow_audit(d, 0);
 17.1058  }
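Both the insert and delete paths now hash on a single word, key = gpfn | stype, which works because the frame number and the shadow type occupy disjoint bit ranges; the ASSERTs against PGT_mfn_mask and PGT_type_mask enforce exactly that. A small sketch of the packing, with illustrative mask values rather than the real PGT_* constants:

    #define MFN_MASK   0x000fffffUL      /* low bits: frame number */
    #define TYPE_MASK  0xe0000000UL      /* high bits: shadow type */

    static inline unsigned long make_key(unsigned long gpfn, unsigned long stype)
    {
        /* Callers guarantee the two fields never overlap, as the ASSERTs above do. */
        return (gpfn & MFN_MASK) | (stype & TYPE_MASK);
    }

    static inline unsigned long key_to_gpfn(unsigned long key)
    {
        return key & MFN_MASK;
    }

    static inline unsigned long key_to_type(unsigned long key)
    {
        return key & TYPE_MASK;
    }

Packing both fields into one word means a bucket entry can be matched with a single compare, which is what the x->gpfn_and_flags == key tests above rely on.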
 17.1059 -  
 17.1060 +
 17.1061 +/************************************************************************/
 17.1062 +
 17.1063 +extern void shadow_map_l1_into_current_l2(unsigned long va);
 17.1064 +
 17.1065 +void static inline
 17.1066 +shadow_set_l1e(unsigned long va, unsigned long new_spte, int create_l1_shadow)
 17.1067 +{
 17.1068 +    struct exec_domain *ed = current;
 17.1069 +    struct domain *d = ed->domain;
 17.1070 +    unsigned long sl2e, old_spte;
 17.1071 +
 17.1072 +#if 0
 17.1073 +    printk("shadow_set_l1e(va=%p, new_spte=%p, create=%d)\n",
 17.1074 +           va, new_spte, create_l1_shadow);
 17.1075 +#endif
 17.1076 +
 17.1077 +    __shadow_get_l2e(ed, va, &sl2e);
 17.1078 +    if ( !(sl2e & _PAGE_PRESENT) )
 17.1079 +    {
 17.1080 +        /*
 17.1081 +         * Either the L1 is not shadowed, or the shadow isn't linked into
 17.1082 +         * the current shadow L2.
 17.1083 +         */
 17.1084 +        if ( create_l1_shadow )
 17.1085 +        {
 17.1086 +            perfc_incrc(shadow_set_l1e_force_map);
 17.1087 +            shadow_map_l1_into_current_l2(va);
 17.1088 +        }
 17.1089 +        else /* check to see if it exists; if so, link it in */
 17.1090 +        {
 17.1091 +            unsigned long gpde =
 17.1092 +                l2_pgentry_val(linear_l2_table(ed)[l2_table_offset(va)]);
 17.1093 +            unsigned long gl1pfn = gpde >> PAGE_SHIFT;
 17.1094 +            unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
 17.1095 +
 17.1096 +            ASSERT( gpde & _PAGE_PRESENT );
 17.1097 +
 17.1098 +            if ( sl1mfn )
 17.1099 +            {
 17.1100 +                perfc_incrc(shadow_set_l1e_unlinked);
 17.1101 +                get_shadow_ref(sl1mfn);
 17.1102 +                l2pde_general(d, &gpde, &sl2e, sl1mfn);
 17.1103 +                __guest_set_l2e(ed, va, gpde);
 17.1104 +                __shadow_set_l2e(ed, va, sl2e);
 17.1105 +            }
 17.1106 +            else
 17.1107 +            {
 17.1108 +                // no shadow exists, so there's nothing to do.
 17.1109 +                perfc_incrc(shadow_set_l1e_fail);
 17.1110 +                return;
 17.1111 +            }
 17.1112 +        }
 17.1113 +    }
 17.1114 +
 17.1115 +    old_spte = l1_pgentry_val(shadow_linear_pg_table[l1_linear_offset(va)]);
 17.1116 +    shadow_linear_pg_table[l1_linear_offset(va)] = mk_l1_pgentry(new_spte);
 17.1117 +
 17.1118 +    // only do the ref counting if something important changed.
 17.1119 +    //
 17.1120 +    if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) )
 17.1121 +    {
 17.1122 +        if ( new_spte & _PAGE_PRESENT )
 17.1123 +            get_page_from_l1e(mk_l1_pgentry(new_spte), d);
 17.1124 +        if ( old_spte & _PAGE_PRESENT )
 17.1125 +            put_page_from_l1e(mk_l1_pgentry(old_spte), d);
 17.1126 +    }
 17.1127 +}
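shadow_set_l1e() only touches reference counts when the bits that matter actually change: XORing the old and new sptes and masking with the frame, RW and present bits filters out rewrites that only churn accessed/dirty-style state. A self-contained sketch of that idiom (the bit values and the get/put callbacks are illustrative):

    #include <stdint.h>

    #define FRAME_MASK  0xfffff000u
    #define BIT_PRESENT 0x001u
    #define BIT_RW      0x002u

    static void set_entry(uint32_t *slot, uint32_t new_e,
                          void (*get_ref)(uint32_t), void (*put_ref)(uint32_t))
    {
        uint32_t old_e = *slot;

        *slot = new_e;

        /* Only adjust refcounts if the frame, writability, or presence changed. */
        if ( (old_e ^ new_e) & (FRAME_MASK | BIT_RW | BIT_PRESENT) )
        {
            if ( new_e & BIT_PRESENT )
                get_ref(new_e);
            if ( old_e & BIT_PRESENT )
                put_ref(old_e);
        }
    }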
 17.1128 +
 17.1129 +/************************************************************************/
 17.1130 +
 17.1131  static inline unsigned long gva_to_gpte(unsigned long gva)
 17.1132  {
 17.1133 -    unsigned long gpde, gpte, pfn, index;
 17.1134 +    unsigned long gpde, gpte;
 17.1135      struct exec_domain *ed = current;
 17.1136  
 17.1137 +    ASSERT( shadow_mode_translate(current->domain) );
 17.1138 +
 17.1139      __guest_get_l2e(ed, gva, &gpde);
 17.1140 -    if (!(gpde & _PAGE_PRESENT))
 17.1141 +    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
 17.1142          return 0;
 17.1143  
 17.1144 -    index = l2_table_offset(gva);
 17.1145 -
 17.1146 -    if (!l2_pgentry_val(ed->arch.hl2_vtable[index])) {
 17.1147 -        pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT);
 17.1148 -        ed->arch.hl2_vtable[index] = 
 17.1149 -            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 17.1150 -    }
 17.1151 +    // This is actually overkill - we only need to make sure the hl2
 17.1152 +    // is in-sync.
 17.1153 +    //
 17.1154 +    shadow_sync_va(ed, gva);
 17.1155  
 17.1156      if ( unlikely(__get_user(gpte, (unsigned long *)
 17.1157                               &linear_pg_table[gva >> PAGE_SHIFT])) )
 17.1158 +    {
 17.1159 +        FSH_LOG("gva_to_gpte got a fault on gva=%p\n", gva);
 17.1160          return 0;
 17.1161 +    }
 17.1162  
 17.1163      return gpte;
 17.1164  }
 17.1165 @@ -699,94 +1126,19 @@ static inline unsigned long gva_to_gpa(u
 17.1166      return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK); 
 17.1167  }
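gva_to_gpte() walks the guest L2 for the PDE, brings the relevant mappings back in sync via shadow_sync_va(), then reads the PTE through linear_pg_table; gva_to_gpa() splices the PTE's frame with the low bits of the VA. A simplified, self-contained model of that two-level walk for 32-bit, 4KB pages (the table layout here is a toy, not the Xen structures):

    #include <stdint.h>

    #define PG_PRESENT   0x1u
    #define FRAME_MASK   0xfffff000u     /* top 20 bits: frame address */
    #define OFFSET_MASK  0x00000fffu     /* low 12 bits: page offset   */

    /* l2: the 1024-entry guest page directory.
     * l1_tables: maps a frame number to the (already mapped) L1 it names. */
    static uint32_t walk_2level(const uint32_t *l2,
                                const uint32_t *const *l1_tables, uint32_t va)
    {
        uint32_t pde = l2[va >> 22];                 /* top 10 bits index the L2  */
        if ( !(pde & PG_PRESENT) )
            return 0;

        const uint32_t *l1 = l1_tables[pde >> 12];
        uint32_t pte = l1[(va >> 12) & 0x3ff];       /* next 10 bits index the L1 */
        if ( !(pte & PG_PRESENT) )
            return 0;

        return (pte & FRAME_MASK) + (va & OFFSET_MASK);  /* guest physical addr */
    }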
 17.1168  
 17.1169 -static inline void hl2_table_invalidate(struct exec_domain *ed)
 17.1170 -{
 17.1171 -    /*
 17.1172 -     * Need to optimize this
 17.1173 -     */
 17.1174 -    memset(ed->arch.hl2_vtable, 0, PAGE_SIZE);
 17.1175 -}
 17.1176 -
 17.1177 -static inline void __update_pagetables(struct exec_domain *ed)
 17.1178 -{
 17.1179 -    struct domain *d = ed->domain;
 17.1180 -    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
 17.1181 -    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
 17.1182 -    unsigned long smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
 17.1183 -
 17.1184 -    SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
 17.1185 -
 17.1186 -    if ( unlikely(smfn == 0) )
 17.1187 -        smfn = shadow_l2_table(d, gmfn);
 17.1188 -
 17.1189 -    ed->arch.shadow_table = mk_pagetable(smfn<<PAGE_SHIFT);
 17.1190 -
 17.1191 -    if ( shadow_mode_translate(d) )
 17.1192 -    {
 17.1193 -        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
 17.1194 -        l2_pgentry_t *gpl2e, *spl2e;
 17.1195 -        unsigned long hl2_status, hl2mfn, offset;
 17.1196 -        int need_flush = 0;
 17.1197 -
 17.1198 -        if ( ed->arch.guest_vtable )
 17.1199 -            unmap_domain_mem(ed->arch.guest_vtable);
 17.1200 -        if ( ed->arch.shadow_vtable )
 17.1201 -            unmap_domain_mem(ed->arch.shadow_vtable);
 17.1202 -        if ( ed->arch.hl2_vtable )
 17.1203 -            unmap_domain_mem(ed->arch.hl2_vtable);
 17.1204 +/************************************************************************/
 17.1205  
 17.1206 -        gpl2e = ed->arch.guest_vtable =
 17.1207 -            map_domain_mem(pagetable_val(ed->arch.guest_table));
 17.1208 -        spl2e = ed->arch.shadow_vtable =
 17.1209 -            map_domain_mem(pagetable_val(ed->arch.shadow_table));
 17.1210 -
 17.1211 -        hl2_status = __shadow_status(d, gpfn | PSH_hl2);
 17.1212 -        if ( unlikely(!(hl2_status & PSH_hl2)) )
 17.1213 -            hl2_status = mk_hl2_table(ed);
 17.1214 -
 17.1215 -        hl2mfn = hl2_status & PSH_pfn_mask;
 17.1216 -        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
 17.1217 -
 17.1218 -        offset = l2_table_offset(LINEAR_PT_VIRT_START);
 17.1219 -        if ( hl2mfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) )
 17.1220 -        {
 17.1221 -            mpl2e[offset] =
 17.1222 -                mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 17.1223 -            need_flush = 1;
 17.1224 -        }
 17.1225 -
 17.1226 -        if ( shadow_mode_external(d ) )
 17.1227 -        {
 17.1228 -            offset = l2_table_offset(SH_LINEAR_PT_VIRT_START);
 17.1229 -            if ( smfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) )
 17.1230 -            {
 17.1231 -                mpl2e[offset] =
 17.1232 -                    mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 17.1233 -                need_flush = 1;
 17.1234 -            }
 17.1235 -        }
 17.1236 -
 17.1237 -        if ( VMX_DOMAIN(ed) )
 17.1238 -        {
 17.1239 -            // Why is VMX mode doing this?
 17.1240 -            shadow_invalidate(ed);
 17.1241 -            hl2_table_invalidate(ed);
 17.1242 -        }
 17.1243 -
 17.1244 -        if ( need_flush )
 17.1245 -            local_flush_tlb();
 17.1246 -    }
 17.1247 -}
 17.1248 -
 17.1249 +extern void __update_pagetables(struct exec_domain *ed);
 17.1250  static inline void update_pagetables(struct exec_domain *ed)
 17.1251  {
 17.1252      struct domain *d = ed->domain;
 17.1253 +
 17.1254 +#ifdef CONFIG_VMX
 17.1255      int paging_enabled =
 17.1256 -#ifdef CONFIG_VMX
 17.1257          !VMX_DOMAIN(ed) ||
 17.1258          test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state);
 17.1259  #else
 17.1260 -        1;
 17.1261 +    const int paging_enabled = 1;
 17.1262  #endif
 17.1263  
 17.1264      /*
 17.1265 @@ -802,7 +1154,7 @@ static inline void update_pagetables(str
 17.1266          shadow_unlock(d);
 17.1267      }
 17.1268  
 17.1269 -    if ( !shadow_mode_external(d) )
 17.1270 +    if ( likely(!shadow_mode_external(d)) )
 17.1271      {
 17.1272  #ifdef __x86_64__
 17.1273          if ( !(ed->arch.flags & TF_kernel_mode) )
 17.1274 @@ -814,26 +1166,17 @@ static inline void update_pagetables(str
 17.1275          else
 17.1276              ed->arch.monitor_table = ed->arch.guest_table;
 17.1277      }
 17.1278 -    else
 17.1279 -    {
 17.1280 -        // External page tables...
 17.1281 -        // Allocate a monitor page table if we don't already have one.
 17.1282 -        //
 17.1283 -        if ( unlikely(!pagetable_val(ed->arch.monitor_table)) )
 17.1284 -            ed->arch.monitor_table =
 17.1285 -                mk_pagetable(alloc_monitor_pagetable(ed) << PAGE_SHIFT);
 17.1286 -    }
 17.1287  }
 17.1288  
 17.1289  #if SHADOW_DEBUG
 17.1290 -extern int _check_pagetable(struct domain *d, pagetable_t pt, char *s);
 17.1291 -extern int _check_all_pagetables(struct domain *d, char *s);
 17.1292 +extern int _check_pagetable(struct exec_domain *ed, char *s);
 17.1293 +extern int _check_all_pagetables(struct exec_domain *ed, char *s);
 17.1294  
 17.1295 -#define check_pagetable(_d, _pt, _s) _check_pagetable(_d, _pt, _s)
 17.1296 -//#define check_pagetable(_d, _pt, _s) _check_all_pagetables(_d, _s)
 17.1297 +#define check_pagetable(_ed, _s) _check_pagetable(_ed, _s)
 17.1298 +//#define check_pagetable(_ed, _s) _check_all_pagetables(_ed, _s)
 17.1299  
 17.1300  #else
 17.1301 -#define check_pagetable(_d, _pt, _s) ((void)0)
 17.1302 +#define check_pagetable(_ed, _s) ((void)0)
 17.1303  #endif
 17.1304  
 17.1305  #endif /* XEN_SHADOW_H */
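check_pagetable() keeps its usual shape even though it now takes an exec_domain: when SHADOW_DEBUG is off the macro expands to ((void)0), so call sites need no #ifdefs and the checks cost nothing in normal builds. A tiny stand-alone illustration of that pattern (the names here are made up):

    #include <stdio.h>

    #define MY_DEBUG 0

    #if MY_DEBUG
    extern int _check_state(const char *who);
    #define check_state(_s) _check_state(_s)
    #else
    #define check_state(_s) ((void)0)
    #endif

    int main(void)
    {
        check_state("after init");   /* compiles away entirely when MY_DEBUG == 0 */
        printf("done\n");
        return 0;
    }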
    18.1 --- a/xen/include/asm-x86/x86_32/page.h	Mon Mar 14 18:44:10 2005 +0000
    18.2 +++ b/xen/include/asm-x86/x86_32/page.h	Mon Mar 14 22:07:47 2005 +0000
    18.3 @@ -68,7 +68,7 @@ typedef l2_pgentry_t root_pgentry_t;
    18.4  #define L1_DISALLOW_MASK (3UL << 7)
    18.5  #define L2_DISALLOW_MASK (7UL << 7)
    18.6  #define L3_DISALLOW_MASK (7UL << 7)
    18.7 -#define L2_DISALLOW_MASK (7UL << 7)
    18.8 +#define L4_DISALLOW_MASK (7UL << 7)
    18.9  
   18.10  #endif /* __X86_32_PAGE_H__ */
   18.11  
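The page.h hunk fixes a copy-and-paste slip: the second L2_DISALLOW_MASK definition was meant to be L4_DISALLOW_MASK. These masks name the flag bits a guest-supplied entry may not set at each level, and a validity check simply ANDs the entry against them, roughly as sketched below (the mask value and helper are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define MY_L1_DISALLOW_MASK (3UL << 7)   /* illustrative forbidden-bit mask */

    static int l1e_ok(uint32_t l1e)
    {
        /* Reject any entry that sets a disallowed flag bit. */
        return (l1e & MY_L1_DISALLOW_MASK) == 0;
    }

    int main(void)
    {
        printf("%d %d\n", l1e_ok(0x001), l1e_ok(0x001 | (1u << 7)));
        return 0;
    }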
    19.1 --- a/xen/include/xen/domain.h	Mon Mar 14 18:44:10 2005 +0000
    19.2 +++ b/xen/include/xen/domain.h	Mon Mar 14 22:07:47 2005 +0000
    19.3 @@ -27,6 +27,4 @@ extern void domain_relinquish_memory(str
    19.4  
    19.5  extern void dump_pageframe_info(struct domain *d);
    19.6  
    19.7 -extern unsigned long alloc_monitor_pagetable(struct exec_domain *ed);
    19.8 -
    19.9  #endif /* __XEN_DOMAIN_H__ */
    20.1 --- a/xen/include/xen/perfc_defn.h	Mon Mar 14 18:44:10 2005 +0000
    20.2 +++ b/xen/include/xen/perfc_defn.h	Mon Mar 14 22:07:47 2005 +0000
    20.3 @@ -1,3 +1,7 @@
    20.4 +#define VMX_PERF_EXIT_REASON_SIZE 37
    20.5 +#define VMX_PERF_VECTOR_SIZE 0x20
    20.6 +PERFCOUNTER_ARRAY(vmexits, "vmexits", VMX_PERF_EXIT_REASON_SIZE )
    20.7 +PERFCOUNTER_ARRAY(cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE )
    20.8  
    20.9  PERFCOUNTER_CPU (seg_fixups,   "segmentation fixups" )
   20.10  
   20.11 @@ -17,26 +21,42 @@ PERFCOUNTER_CPU( need_flush_tlb_flush, "
   20.12  PERFCOUNTER_CPU( calls_to_mmu_update, "calls_to_mmu_update" )
   20.13  PERFCOUNTER_CPU( num_page_updates, "num_page_updates" )
   20.14  PERFCOUNTER_CPU( calls_to_update_va, "calls_to_update_va_map" )
   20.15 -PERFCOUNTER_CPU( page_faults, "page faults" )
   20.16 -PERFCOUNTER_CPU( copy_user_faults, "copy_user faults" )
   20.17  PERFCOUNTER_CPU( map_domain_mem_count, "map_domain_mem count" )
   20.18  
   20.19 -PERFCOUNTER_CPU( shadow_l2_table_count, "shadow_l2_table count" )
   20.20 -PERFCOUNTER_CPU( shadow_l1_table_count, "shadow_l1_table count" )
   20.21 -PERFCOUNTER_CPU( unshadow_table_count, "unshadow_table count" )
   20.22 -PERFCOUNTER_CPU( shadow_fixup_count, "shadow_fixup count" )
   20.23 -PERFCOUNTER_CPU( shadow_update_va_fail1, "shadow_update_va_fail1" )
   20.24 -PERFCOUNTER_CPU( shadow_update_va_fail2, "shadow_update_va_fail2" )
   20.25 +PERFCOUNTER_CPU( shadow_l2_table_count,    "shadow_l2_table count" )
   20.26 +PERFCOUNTER_CPU( shadow_l1_table_count,    "shadow_l1_table count" )
   20.27 +PERFCOUNTER_CPU( shadow_hl2_table_count,   "shadow_hl2_table count" )
   20.28 +PERFCOUNTER_CPU( shadow_set_l1e_force_map, "shadow_set_l1e forced to map l1" )
   20.29 +PERFCOUNTER_CPU( shadow_set_l1e_unlinked,  "shadow_set_l1e found unlinked l1" )
   20.30 +PERFCOUNTER_CPU( shadow_set_l1e_fail,      "shadow_set_l1e failed (no sl1)" )
   20.31 +PERFCOUNTER_CPU( shadow_invlpg_faults,     "shadow_invlpg's get_user faulted")
   20.32 +
   20.33  
   20.34  /* STATUS counters do not reset when 'P' is hit */
   20.35  PERFSTATUS( shadow_l2_pages, "current # shadow L2 pages" )
   20.36  PERFSTATUS( shadow_l1_pages, "current # shadow L1 pages" )
   20.37  PERFSTATUS( hl2_table_pages, "current # hl2 pages" )
   20.38 +PERFSTATUS( snapshot_pages,  "current # fshadow snapshot pages" )
   20.39  
   20.40 -PERFCOUNTER_CPU( check_pagetable, "calls to check_pagetable" )
   20.41 -PERFCOUNTER_CPU( check_all_pagetables, "calls to check_all_pagetables" )
   20.42 +PERFCOUNTER_CPU(shadow_status_calls,    "calls to __shadow_status" )
   20.43 +PERFCOUNTER_CPU(shadow_status_miss,     "missed shadow cache" )
   20.44 +PERFCOUNTER_CPU(shadow_status_hit_head, "hits on head of bucket" )
   20.45 +PERFCOUNTER_CPU(check_pagetable,        "calls to check_pagetable" )
   20.46 +PERFCOUNTER_CPU(check_all_pagetables,   "calls to check_all_pagetables" )
   20.47  
   20.48 -#define VMX_PERF_EXIT_REASON_SIZE 37
   20.49 -#define VMX_PERF_VECTOR_SIZE 0x20
   20.50 -PERFCOUNTER_ARRAY(vmexits, "vmexits", VMX_PERF_EXIT_REASON_SIZE )
   20.51 -PERFCOUNTER_ARRAY(cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE )
   20.52 +PERFCOUNTER_CPU(shadow_sync_all,                   "calls to shadow_sync_all")
   20.53 +PERFCOUNTER_CPU(shadow_make_snapshot,              "snapshots created")
   20.54 +PERFCOUNTER_CPU(shadow_mark_mfn_out_of_sync_calls, "calls to shadow_mk_out_of_sync")
   20.55 +PERFCOUNTER_CPU(shadow_out_of_sync_calls,          "calls to shadow_out_of_sync")
   20.56 +PERFCOUNTER_CPU(snapshot_entry_matches_calls,      "calls to ss_entry_matches")
   20.57 +PERFCOUNTER_CPU(snapshot_entry_matches_true,       "ss_entry_matches returns true")
   20.58 +
   20.59 +PERFCOUNTER_CPU(page_faults,                       "page faults" )
   20.60 +PERFCOUNTER_CPU(copy_user_faults,                  "copy_user faults" )
   20.61 +PERFCOUNTER_CPU(shadow_fault_calls,                "calls to shadow_fault")
   20.62 +PERFCOUNTER_CPU(shadow_fault_bail_pde_not_present, "sf bailed due to pde not present")
   20.63 +PERFCOUNTER_CPU(shadow_fault_bail_pte_not_present, "sf bailed due to pte not present")
   20.64 +PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping,      "sf bailed due to a ro mapping")
   20.65 +PERFCOUNTER_CPU(shadow_fault_fixed,                "sf fixed the pgfault")
   20.66 +PERFCOUNTER_CPU(validate_pte_change,               "calls to validate_pte_change")
   20.67 +PERFCOUNTER_CPU(validate_pde_change,               "calls to validate_pde_change")
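perfc_defn.h is a definition-only file: each PERFCOUNTER_CPU/PERFSTATUS line is expanded differently wherever the file is included, and the counters are bumped at run time with perfc_incrc(), as the new shadow_set_l1e_* counters are in shadow.h above. A self-contained sketch of that X-macro style (the counter names and macros below are stand-ins, not the real perfc machinery):

    #include <stdio.h>

    #define COUNTERS              \
        X(page_faults)            \
        X(shadow_fault_calls)     \
        X(shadow_fault_fixed)

    /* First expansion: an enum of counter indices. */
    #define X(n) PERF_##n,
    enum perf_index { COUNTERS PERF_NR };
    #undef X

    /* Second expansion: the matching name strings. */
    #define X(n) #n,
    static const char *perf_name[PERF_NR] = { COUNTERS };
    #undef X

    static unsigned long perf_count[PERF_NR];

    #define perfc_incrc(n) (perf_count[PERF_##n]++)

    int main(void)
    {
        perfc_incrc(shadow_fault_calls);
        for ( int i = 0; i < PERF_NR; i++ )
            printf("%-24s %lu\n", perf_name[i], perf_count[i]);
        return 0;
    }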