ia64/xen-unstable

changeset 4181:f3a1163f9d2b

bitkeeper revision 1.1247 (42386d3dpoPovazcjxeV5wadySvQoA)

Michael's initial shadow code

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author rneugeba@wyvis.research.intel-research.net
date Wed Mar 16 17:30:37 2005 +0000 (2005-03-16)
parents e379e05dfb91
children cf77cd925ef3
files .rootkeys xen/arch/x86/audit.c xen/arch/x86/domain.c xen/arch/x86/domain_build.c xen/arch/x86/mm.c xen/arch/x86/shadow.c xen/arch/x86/traps.c xen/arch/x86/vmx.c xen/arch/x86/x86_32/domain_page.c xen/common/dom_mem_ops.c xen/common/page_alloc.c xen/common/schedule.c xen/include/asm-x86/domain.h xen/include/asm-x86/mm.h xen/include/asm-x86/page.h xen/include/asm-x86/shadow.h xen/include/asm-x86/x86_32/page.h xen/include/xen/domain.h xen/include/xen/perfc_defn.h
line diff
     1.1 --- a/.rootkeys	Tue Mar 15 15:53:52 2005 +0000
     1.2 +++ b/.rootkeys	Wed Mar 16 17:30:37 2005 +0000
     1.3 @@ -951,6 +951,7 @@ 3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/
     1.4  3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/x86/Rules.mk
     1.5  3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/x86/acpi.c
     1.6  3ddb79bcsjinG9k1KcvbVBuas1R2dA xen/arch/x86/apic.c
     1.7 +42386d3bKw0QftYe-cDL6_4WiATRTw xen/arch/x86/audit.c
     1.8  3ddb79c4yGZ7_22QAFFwPzqP4NSHwA xen/arch/x86/boot/mkelf32.c
     1.9  3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/x86/boot/x86_32.S
    1.10  40e42bdbNu4MjI750THP_8J1S-Sa0g xen/arch/x86/boot/x86_64.S
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/audit.c	Wed Mar 16 17:30:37 2005 +0000
     2.3 @@ -0,0 +1,817 @@
     2.4 +/******************************************************************************
     2.5 + * arch/x86/audit.c
     2.6 + * 
     2.7 + * Copyright (c) 2002-2005 K A Fraser
     2.8 + * Copyright (c) 2004 Christian Limpach
     2.9 + * Copyright (c) 2005 Michael A Fetterman
    2.10 + * 
    2.11 + * This program is free software; you can redistribute it and/or modify
    2.12 + * it under the terms of the GNU General Public License as published by
    2.13 + * the Free Software Foundation; either version 2 of the License, or
    2.14 + * (at your option) any later version.
    2.15 + * 
    2.16 + * This program is distributed in the hope that it will be useful,
    2.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    2.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    2.19 + * GNU General Public License for more details.
    2.20 + * 
    2.21 + * You should have received a copy of the GNU General Public License
    2.22 + * along with this program; if not, write to the Free Software
    2.23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    2.24 + */
    2.25 +
    2.26 +#include <xen/config.h>
    2.27 +#include <xen/init.h>
    2.28 +#include <xen/kernel.h>
    2.29 +#include <xen/lib.h>
    2.30 +#include <xen/mm.h>
    2.31 +//#include <xen/sched.h>
    2.32 +//#include <xen/errno.h>
    2.33 +#include <xen/perfc.h>
    2.34 +//#include <xen/irq.h>
    2.35 +//#include <xen/softirq.h>
    2.36 +#include <asm/shadow.h>
    2.37 +#include <asm/page.h>
    2.38 +#include <asm/flushtlb.h>
    2.39 +//#include <asm/io.h>
    2.40 +//#include <asm/uaccess.h>
    2.41 +//#include <asm/domain_page.h>
    2.42 +//#include <asm/ldt.h>
    2.43 +
    2.44 +// XXX SMP bug -- these should not be statics...
    2.45 +//
    2.46 +static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
    2.47 +static int l1, l2, oos_count, page_count;
    2.48 +
    2.49 +#define FILE_AND_LINE 1
    2.50 +
    2.51 +#if FILE_AND_LINE
    2.52 +#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__)
    2.53 +#define ADJUST_EXTRA_ARGS ,const char *file, int line
    2.54 +#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line)
    2.55 +#else
    2.56 +#define adjust _adjust
    2.57 +#define ADJUST_EXTRA_ARGS
    2.58 +#define APRINTK(_f, _a...) printk(_f "\n", ##_a)
    2.59 +#endif
    2.60 +
    2.61 +int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
    2.62 +{
    2.63 +    int errors = 0;
    2.64 +    int shadow_enabled = shadow_mode_enabled(d) ? 1 : 0;
    2.65 +
    2.66 +    void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS)
    2.67 +    {
    2.68 +        if ( adjtype )
    2.69 +        {
    2.70 +            // adjust the type count
    2.71 +            //
    2.72 +            int tcount = page->u.inuse.type_info & PGT_count_mask;
    2.73 +            tcount += dir;
    2.74 +            ttot++;
    2.75 +
    2.76 +            if ( page_get_owner(page) == NULL )
    2.77 +            {
    2.78 +                APRINTK("adjust(mfn=%p, dir=%d, adjtype=%d) owner=NULL",
     2.79 +                        page_to_pfn(page), dir, adjtype);
    2.80 +                errors++;
    2.81 +            }
    2.82 +
    2.83 +            if ( tcount < 0 )
    2.84 +            {
    2.85 +                APRINTK("Audit %d: type count went below zero mfn=%x t=%x ot=%x",
    2.86 +                        d->id, page-frame_table,
    2.87 +                        page->u.inuse.type_info,
    2.88 +                        page->tlbflush_timestamp);
    2.89 +                errors++;
    2.90 +            }
    2.91 +            else if ( (tcount & ~PGT_count_mask) != 0 )
    2.92 +            {
    2.93 +                APRINTK("Audit %d: type count overflowed mfn=%x t=%x ot=%x",
    2.94 +                        d->id, page-frame_table,
    2.95 +                        page->u.inuse.type_info,
    2.96 +                        page->tlbflush_timestamp);
    2.97 +                errors++;
    2.98 +            }
    2.99 +            else
   2.100 +                page->u.inuse.type_info += dir;
   2.101 +        }
   2.102 +
   2.103 +        // adjust the general count
   2.104 +        //
   2.105 +        int count = page->count_info & PGC_count_mask;
   2.106 +        count += dir;
   2.107 +        ctot++;
   2.108 +
   2.109 +        if ( count < 0 )
   2.110 +        {
   2.111 +            APRINTK("Audit %d: general count went below zero pfn=%x t=%x ot=%x",
   2.112 +                    d->id, page-frame_table,
   2.113 +                    page->u.inuse.type_info,
   2.114 +                    page->tlbflush_timestamp);
   2.115 +            errors++;
   2.116 +        }
    2.117 +        else if ( (count & ~PGC_count_mask) != 0 )
   2.118 +        {
   2.119 +            APRINTK("Audit %d: general count overflowed pfn=%x t=%x ot=%x",
   2.120 +                    d->id, page-frame_table,
   2.121 +                    page->u.inuse.type_info,
   2.122 +                    page->tlbflush_timestamp);
   2.123 +            errors++;
   2.124 +        }
   2.125 +        else
   2.126 +            page->count_info += dir;
   2.127 +    }
   2.128 +
   2.129 +    void adjust_l2_page(unsigned long mfn, int adjtype)
   2.130 +    {
   2.131 +        unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT);
   2.132 +        int i, limit;
   2.133 +
   2.134 +        if ( shadow_mode_external(d) )
   2.135 +            limit = L2_PAGETABLE_ENTRIES;
   2.136 +        else
   2.137 +            limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   2.138 +
   2.139 +        for ( i = 0; i < limit; i++ )
   2.140 +        {
   2.141 +            if ( pt[i] & _PAGE_PRESENT )
   2.142 +            {
   2.143 +                unsigned long l1mfn = pt[i] >> PAGE_SHIFT;
   2.144 +                struct pfn_info *l1page = pfn_to_page(l1mfn);
   2.145 +
   2.146 +                if ( noisy )
   2.147 +                {
   2.148 +                    if ( shadow_enabled )
   2.149 +                    {
   2.150 +                        if ( page_get_owner(l1page) != NULL )
   2.151 +                        {
   2.152 +                            printk("L2: Bizarre shadow L1 page mfn=%p "
   2.153 +                                   "belonging to a domain %p (id=%d)\n",
   2.154 +                                   l1mfn,
   2.155 +                                   page_get_owner(l1page),
   2.156 +                                   page_get_owner(l1page)->id);
   2.157 +                            errors++;
   2.158 +                            continue;
   2.159 +                        }
   2.160 +                    }
   2.161 +                    else
   2.162 +                    {
   2.163 +                        if ( page_get_owner(l1page) != d )
   2.164 +                        {
   2.165 +                            printk("L2: Skip bizarre L1 page mfn=%p "
   2.166 +                                   "belonging to other dom %p (id=%d)\n",
   2.167 +                                   l1mfn,
   2.168 +                                   page_get_owner(l1page),
   2.169 +                                   page_get_owner(l1page)->id);
   2.170 +                            errors++;
   2.171 +                            continue;
   2.172 +                        }
   2.173 +
   2.174 +                        u32 page_type = l1page->u.inuse.type_info & PGT_type_mask;
   2.175 +
   2.176 +                        if ( page_type == PGT_l2_page_table )
   2.177 +                        {
   2.178 +                            printk("Audit %d: [%x] Found %s Linear PT "
   2.179 +                                   "t=%x mfn=%p\n",
   2.180 +                                   d->id, i, (l1mfn==mfn) ? "Self" : "Other",
   2.181 +                                   l1page->u.inuse.type_info, l1mfn);
   2.182 +                        }
   2.183 +                        else if ( page_type != PGT_l1_page_table )
   2.184 +                        {
   2.185 +                            printk("Audit %d: [L2 mfn=%p i=%x] "
   2.186 +                                   "Expected L1 t=%x mfn=%p\n",
   2.187 +                                   d->id, mfn, i,
   2.188 +                                   l1page->u.inuse.type_info, l1mfn);
   2.189 +                            errors++;
   2.190 +                        }
   2.191 +                    }
   2.192 +                }
   2.193 +
   2.194 +                adjust(l1page, adjtype);
   2.195 +            }
   2.196 +        }
   2.197 +
   2.198 +        unmap_domain_mem(pt);
   2.199 +    }
   2.200 +
   2.201 +    void adjust_l1_page(unsigned long l1mfn)
   2.202 +    {
   2.203 +        unsigned long *pt = map_domain_mem(l1mfn << PAGE_SHIFT);
   2.204 +        int i;
   2.205 +
   2.206 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   2.207 +        {
   2.208 +            if ( pt[i] & _PAGE_PRESENT )
   2.209 +            {
   2.210 +                unsigned long gmfn = pt[i] >> PAGE_SHIFT;
   2.211 +                struct pfn_info *gpage = pfn_to_page(gmfn);
   2.212 +
   2.213 +                if ( gmfn < 0x100 )
   2.214 +                {
   2.215 +                    lowmem_mappings++;
   2.216 +                    continue;
   2.217 +                }
   2.218 +
   2.219 +                if ( gmfn > max_page )
   2.220 +                {
   2.221 +                    io_mappings++;
   2.222 +                    continue;
   2.223 +                }
   2.224 +
   2.225 +                if ( noisy )
   2.226 +                {
   2.227 +                    if ( pt[i] & _PAGE_RW )
   2.228 +                    {
   2.229 +                        // If it's not a writable page, complain.
   2.230 +                        //
   2.231 +                        if ( !((gpage->u.inuse.type_info & PGT_type_mask) ==
   2.232 +                               PGT_writable_page) )
   2.233 +                        {
   2.234 +                            printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW "
   2.235 +                                   "t=%x mfn=%p\n",
   2.236 +                                   d->id, l1mfn, i,
   2.237 +                                   gpage->u.inuse.type_info, gmfn);
   2.238 +                            errors++;
   2.239 +                        }
   2.240 +
   2.241 +                        if ( shadow_enabled &&
   2.242 +                             page_is_page_table(gpage) &&
   2.243 +                             ! page_out_of_sync(gpage) )
   2.244 +                        {
   2.245 +                            printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW of "
   2.246 +                                   "page table gmfn=%p\n",
   2.247 +                                   d->id, l1mfn, i, gmfn);
   2.248 +                            errors++;
   2.249 +                        }
   2.250 +                    }
   2.251 +
   2.252 +                    if ( page_get_owner(gpage) != d )
   2.253 +                    {
   2.254 +                        printk("Audit %d: [l1mfn=%p,i=%x] Skip foreign page "
   2.255 +                               "dom=%p (id=%d) mfn=%p c=%08x t=%08x\n",
   2.256 +                               d->id, l1mfn, i,
   2.257 +                               page_get_owner(gpage),
   2.258 +                               page_get_owner(gpage)->id,
   2.259 +                               gmfn,
   2.260 +                               gpage->count_info,
   2.261 +                               gpage->u.inuse.type_info);
   2.262 +                        continue;
   2.263 +                    }
   2.264 +                }
   2.265 +
   2.266 +                adjust(gpage, (pt[i] & _PAGE_RW) ? 1 : 0);
   2.267 +            }
   2.268 +        }
   2.269 +
   2.270 +        unmap_domain_mem(pt);
   2.271 +    }
   2.272 +
   2.273 +    void adjust_shadow_tables()
   2.274 +    {
   2.275 +        struct shadow_status *a;
   2.276 +        unsigned long smfn, gmfn;
   2.277 +        struct pfn_info *page;
   2.278 +        int i;
   2.279 +
   2.280 +        for ( i = 0; i < shadow_ht_buckets; i++ )
   2.281 +        {
   2.282 +            a = &d->arch.shadow_ht[i];
   2.283 +            while ( a && a->gpfn_and_flags )
   2.284 +            {
   2.285 +                gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
   2.286 +                smfn = a->smfn;
   2.287 +                page = &frame_table[smfn];
   2.288 +
   2.289 +                adjust(pfn_to_page(gmfn), 0);
   2.290 +
   2.291 +                switch ( a->gpfn_and_flags & PGT_type_mask ) {
   2.292 +                case PGT_snapshot:
   2.293 +                    break;
   2.294 +                case PGT_l1_shadow:
   2.295 +                case PGT_hl2_shadow:
   2.296 +                    adjust_l1_page(smfn);
   2.297 +                    if ( page->u.inuse.type_info & PGT_pinned )
   2.298 +                        adjust(page, 0);
   2.299 +                    break;
   2.300 +                case PGT_l2_shadow:
   2.301 +                    adjust_l2_page(smfn, 0);
   2.302 +                    if ( page->u.inuse.type_info & PGT_pinned )
   2.303 +                        adjust(page, 0);
   2.304 +                    break;
   2.305 +                default:
   2.306 +                    BUG();
   2.307 +                    break;
   2.308 +                }
   2.309 +
   2.310 +                a = a->next;
   2.311 +            }
   2.312 +        }
   2.313 +    }
   2.314 +
   2.315 +    void adjust_oos_list()
   2.316 +    {
   2.317 +        struct out_of_sync_entry *oos;
   2.318 +
   2.319 +        if ( (oos = d->arch.out_of_sync) )
   2.320 +            ASSERT(shadow_enabled);
   2.321 +
   2.322 +        while ( oos )
   2.323 +        {
   2.324 +            adjust(pfn_to_page(oos->gmfn), 0);
   2.325 +
   2.326 +            // Only use entries that have low bits clear...
   2.327 +            //
   2.328 +            if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
   2.329 +                adjust(pfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0);
   2.330 +
   2.331 +            oos = oos->next;
   2.332 +            oos_count++;
   2.333 +        }
   2.334 +    }
   2.335 +
   2.336 +    void adjust_for_pgtbase()
   2.337 +    {
   2.338 +        struct exec_domain *ed;
   2.339 +
   2.340 +        for_each_exec_domain(d, ed)
   2.341 +            {
   2.342 +                if ( !shadow_enabled )
   2.343 +                {
   2.344 +                    if ( pagetable_val(ed->arch.guest_table) )
   2.345 +                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
   2.346 +                                            >> PAGE_SHIFT], 1);
   2.347 +                }
   2.348 +                else
   2.349 +                {
   2.350 +                    if ( pagetable_val(ed->arch.guest_table) )
   2.351 +                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
   2.352 +                                            >> PAGE_SHIFT], 0);
   2.353 +                    if ( pagetable_val(ed->arch.shadow_table) )
   2.354 +                        adjust(&frame_table[pagetable_val(ed->arch.shadow_table)
   2.355 +                                            >> PAGE_SHIFT], 0);
   2.356 +                }
   2.357 +            }
   2.358 +    }
   2.359 +
   2.360 +    void adjust_guest_pages()
   2.361 +    {
   2.362 +        struct list_head *list_ent = d->page_list.next;
   2.363 +        struct pfn_info *page;
   2.364 +        unsigned long mfn;
   2.365 +
   2.366 +        while ( list_ent != &d->page_list )
   2.367 +        {
   2.368 +            u32 page_type;
   2.369 +
   2.370 +            page = list_entry(list_ent, struct pfn_info, list);
   2.371 +            mfn = page_to_pfn(page);
   2.372 +            page_type = page->u.inuse.type_info & PGT_type_mask;
   2.373 +
   2.374 +            if ( page_get_owner(page) != d )
   2.375 +                BUG();
   2.376 +
   2.377 +            page_count++;
   2.378 +
   2.379 +            switch ( page_type )
   2.380 +            {
   2.381 +            case PGT_l2_page_table:
   2.382 +                l2++;
   2.383 +
   2.384 +                if ( noisy )
   2.385 +                {
   2.386 +                    if ( shadow_enabled )
   2.387 +                    {
   2.388 +                        printk("Audit %d: found an L2 guest page "
   2.389 +                               "mfn=%p t=%08x c=%08x while in shadow mode\n",
    2.390 +                               d->id, mfn, page->u.inuse.type_info, page->count_info);
   2.391 +                        errors++;
   2.392 +                    }
   2.393 +
   2.394 +                    if ( (page->u.inuse.type_info & PGT_validated) !=
   2.395 +                         PGT_validated )
   2.396 +                    {
   2.397 +                        printk("Audit %d: L2 mfn=%p not validated %p\n",
   2.398 +                               d->id, mfn, page->u.inuse.type_info);
   2.399 +                        errors++;
   2.400 +                    }
   2.401 +
   2.402 +                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
   2.403 +                    {
   2.404 +                        printk("Audit %d: L2 mfn=%p not pinned t=%p\n",
   2.405 +                               d->id, mfn, page->u.inuse.type_info);
   2.406 +                        errors++;
   2.407 +                    }
   2.408 +                }
   2.409 +
   2.410 +                if ( page->u.inuse.type_info & PGT_pinned )
   2.411 +                    adjust(page, 1);
   2.412 +
   2.413 +                if ( page->u.inuse.type_info & PGT_validated )
   2.414 +                    adjust_l2_page(mfn, 1);
   2.415 +
   2.416 +                break;
   2.417 +
   2.418 +            case PGT_l1_page_table:
   2.419 +                l1++;
   2.420 +
   2.421 +                if ( noisy )
   2.422 +                {
   2.423 +                    if ( shadow_enabled )
   2.424 +                    {
    2.425 +                        printk("Audit %d: found an L1 guest page mfn=%p t=%08x c=%08x while in shadow mode\n",
    2.426 +                               d->id, mfn, page->u.inuse.type_info, page->count_info);
   2.427 +                        errors++;
   2.428 +                    }
   2.429 +
   2.430 +                    if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
   2.431 +                    {
   2.432 +                        printk("Audit %d: L1 not validated mfn=%p t=%p\n",
   2.433 +                               d->id, mfn, page->u.inuse.type_info);
   2.434 +                        errors++;
   2.435 +                    }
   2.436 +
   2.437 +                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
   2.438 +                    {
   2.439 +                        if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
   2.440 +                        {
   2.441 +                            printk("Audit %d: L1 mfn=%p not pinned t=%p\n",
   2.442 +                                   d->id, mfn, page->u.inuse.type_info);
   2.443 +                            errors++;
   2.444 +                        }
   2.445 +                    }
   2.446 +                }
   2.447 +                
   2.448 +                if ( page->u.inuse.type_info & PGT_pinned )
   2.449 +                    adjust(page, 1);
   2.450 +
   2.451 +                if ( page->u.inuse.type_info & PGT_validated )
   2.452 +                    adjust_l1_page(mfn);
   2.453 +
   2.454 +                break;
   2.455 +
   2.456 +            case PGT_gdt_page:
   2.457 +                ASSERT( !page_out_of_sync(page) );
   2.458 +                adjust(page, 1);
   2.459 +                break;
   2.460 +
   2.461 +            case PGT_ldt_page:
   2.462 +                ASSERT( !page_out_of_sync(page) );
   2.463 +                adjust(page, 1);
   2.464 +                break;
   2.465 +
   2.466 +            case PGT_writable_page:
   2.467 +                if ( shadow_enabled )
   2.468 +                {
   2.469 +                    // In shadow mode, writable pages can get pinned by
   2.470 +                    // paravirtualized guests that think they are pinning
   2.471 +                    // their L1s and/or L2s.
   2.472 +                    //
   2.473 +                    if ( page->u.inuse.type_info & PGT_pinned )
   2.474 +                        adjust(page, 1);
   2.475 +                }
   2.476 +            }
   2.477 +
   2.478 +            list_ent = page->list.next;
   2.479 +        }
   2.480 +    }
   2.481 +
   2.482 +    adjust_for_pgtbase();
   2.483 +
   2.484 +    adjust_guest_pages();
   2.485 +
   2.486 +    if ( shadow_enabled )
   2.487 +    {
   2.488 +        adjust_oos_list();
   2.489 +        adjust_shadow_tables();
   2.490 +    }
   2.491 +
   2.492 +    return errors;
   2.493 +}
   2.494 +
   2.495 +
   2.496 +#ifndef NDEBUG
   2.497 +
   2.498 +void _audit_domain(struct domain *d, int flags, const char *file, int line)
   2.499 +{
   2.500 +    void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
   2.501 +                             unsigned long mfn)
   2.502 +    {
   2.503 +        struct pfn_info *page = &frame_table[mfn];
    2.504 +        unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT);
   2.505 +        int i;
   2.506 +
   2.507 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   2.508 +        {
   2.509 +            if ( (pt[i] & _PAGE_PRESENT) && ((pt[i] >> PAGE_SHIFT) == xmfn) )
   2.510 +                printk("     found dom=%d mfn=%p t=%x c=%x pt[i=%x]=%p\n",
   2.511 +                       d->id, mfn, page->u.inuse.type_info,
   2.512 +                       page->count_info, i, pt[i]);
   2.513 +        }
   2.514 +
   2.515 +        unmap_domain_mem(pt);           
   2.516 +    }
   2.517 +
   2.518 +    void scan_for_pfn(struct domain *d, unsigned long xmfn)
   2.519 +    {
   2.520 +        if ( !shadow_mode_enabled(d) )
   2.521 +        {
   2.522 +            struct list_head *list_ent = d->page_list.next;
   2.523 +            struct pfn_info *page;
   2.524 +
   2.525 +            while ( list_ent != &d->page_list )
   2.526 +            {
   2.527 +                page = list_entry(list_ent, struct pfn_info, list);
   2.528 +
   2.529 +                switch ( page->u.inuse.type_info & PGT_type_mask )
   2.530 +                {
   2.531 +                case PGT_l1_page_table:
   2.532 +                case PGT_l2_page_table:
   2.533 +                    scan_for_pfn_in_mfn(d, xmfn, page_to_pfn(page));
   2.534 +                    break;
   2.535 +                default:
   2.536 +                    break;
   2.537 +                }
   2.538 +
   2.539 +                list_ent = page->list.next;
   2.540 +            }
   2.541 +        }
   2.542 +        else
   2.543 +        {
   2.544 +            struct shadow_status *a;
   2.545 +            int i;
   2.546 +            
   2.547 +            for ( i = 0; i < shadow_ht_buckets; i++ )
   2.548 +            {
   2.549 +                a = &d->arch.shadow_ht[i];
   2.550 +                while ( a && a->gpfn_and_flags )
   2.551 +                {
   2.552 +                    switch ( a->gpfn_and_flags & PGT_type_mask )
   2.553 +                    {
   2.554 +                    case PGT_l1_shadow:
   2.555 +                    case PGT_l2_shadow:
   2.556 +                    case PGT_hl2_shadow:
   2.557 +                        scan_for_pfn_in_mfn(d, xmfn, a->smfn);
   2.558 +                        break;
   2.559 +                    case PGT_snapshot:
   2.560 +                        break;
   2.561 +                    default:
   2.562 +                        BUG();
   2.563 +                        break;
   2.564 +                    }
   2.565 +                    a = a->next;
   2.566 +                }
   2.567 +            }
   2.568 +        }
   2.569 +    }
   2.570 +
   2.571 +    void scan_for_pfn_remote(unsigned long xmfn)
   2.572 +    {
   2.573 +        struct domain *e;
   2.574 +        for_each_domain ( e )
   2.575 +            scan_for_pfn( e, xmfn );
   2.576 +    } 
   2.577 +
   2.578 +    unsigned long mfn;
   2.579 +    struct list_head *list_ent;
   2.580 +    struct pfn_info *page;
   2.581 +    int errors = 0;
   2.582 +
   2.583 +    if ( d != current->domain )
   2.584 +        domain_pause(d);
   2.585 +    synchronise_pagetables(~0UL);
   2.586 +
   2.587 +    // Maybe we should just be using BIGLOCK?
   2.588 +    //
   2.589 +    if ( !(flags & AUDIT_ALREADY_LOCKED) )
   2.590 +        shadow_lock(d);
   2.591 +
   2.592 +    spin_lock(&d->page_alloc_lock);
   2.593 +
   2.594 +    /* PHASE 0 */
   2.595 +
   2.596 +    list_ent = d->page_list.next;
   2.597 +    while ( list_ent != &d->page_list )
   2.598 +    {
   2.599 +        u32 page_type;
   2.600 +
   2.601 +        page = list_entry(list_ent, struct pfn_info, list);
   2.602 +        mfn = page_to_pfn(page);
   2.603 +        page_type = page->u.inuse.type_info & PGT_type_mask;
   2.604 +
   2.605 +        if ( page_get_owner(page) != d )
   2.606 +            BUG();
   2.607 +
   2.608 +        if ( (page->u.inuse.type_info & PGT_count_mask) >
   2.609 +             (page->count_info & PGC_count_mask) )
   2.610 +        {
   2.611 +            printk("taf(%08x) > caf(%08x) mfn=%p\n",
   2.612 +                   page->u.inuse.type_info, page->count_info, mfn);
   2.613 +            errors++;
   2.614 +        }
   2.615 +
   2.616 +        if ( shadow_mode_enabled(d) &&
   2.617 +             (page_type == PGT_writable_page) &&
   2.618 +             !(page->u.inuse.type_info & PGT_validated) )
   2.619 +        {
   2.620 +            printk("shadow mode writable page not validated mfn=%p t=%08x c=%08x\n",
   2.621 +                   mfn, page->u.inuse.type_info, page->count_info);
   2.622 +            errors++;
   2.623 +        }
   2.624 + 
   2.625 +#if 0   /* SYSV shared memory pages plus writeable files. */
   2.626 +        if ( page_type == PGT_writable_page && 
   2.627 +             (page->u.inuse.type_info & PGT_count_mask) > 1 )
   2.628 +        {
   2.629 +            printk("writeable page with type count >1: mfn=%lx t=%x c=%x\n",
   2.630 +                  mfn,
   2.631 +                  page->u.inuse.type_info,
   2.632 +                  page->count_info );
   2.633 +            errors++;
   2.634 +            scan_for_pfn_remote(mfn);
   2.635 +        }
   2.636 +#endif
   2.637 +
   2.638 +        if ( page_type == PGT_none && 
   2.639 +             (page->u.inuse.type_info & PGT_count_mask) > 0 )
   2.640 +        {
   2.641 +            printk("normal page with type count >0: mfn=%lx t=%x c=%x\n",
   2.642 +                  mfn,
   2.643 +                  page->u.inuse.type_info,
   2.644 +                  page->count_info );
   2.645 +            errors++;
   2.646 +        }
   2.647 +
   2.648 +        if ( page_out_of_sync(page) )
   2.649 +        {
   2.650 +            if ( !page_is_page_table(page) )
   2.651 +            {
   2.652 +                printk("out of sync page mfn=%p is not a page table\n", mfn);
   2.653 +                errors++;
   2.654 +            }
   2.655 +            unsigned long pfn = __mfn_to_gpfn(d, mfn);
   2.656 +            if ( !__shadow_status(d, pfn, PGT_snapshot) )
   2.657 +            {
    2.658 +                printk("out of sync page mfn=%p doesn't have a snapshot\n", mfn);
   2.659 +                errors++;
   2.660 +            }
   2.661 +            if ( page_type != PGT_writable_page )
   2.662 +            {
   2.663 +                printk("out of sync page mfn=%p has strange type t=%08x c=%08x\n",
   2.664 +                       mfn, page->u.inuse.type_info, page->count_info);
   2.665 +                errors++;
   2.666 +            }
   2.667 +        }
   2.668 +
   2.669 +        /* Use tlbflush_timestamp to store original type_info. */
   2.670 +        page->tlbflush_timestamp = page->u.inuse.type_info;
   2.671 +
   2.672 +        list_ent = page->list.next;
   2.673 +    }
   2.674 +
   2.675 +    /* PHASE 1 */
   2.676 +    io_mappings = lowmem_mappings = 0;
   2.677 +
   2.678 +    errors += audit_adjust_pgtables(d, -1, 1);
   2.679 +
   2.680 +    if ( !(flags & AUDIT_QUIET) &&
   2.681 +         ((io_mappings > 0) || (lowmem_mappings > 0)) )
   2.682 +        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
   2.683 +               d->id, lowmem_mappings, io_mappings);
   2.684 +
   2.685 +    /* PHASE 2 */
   2.686 +
   2.687 +    list_ent = d->page_list.next;
   2.688 +    while ( list_ent != &d->page_list )
   2.689 +    {
   2.690 +        page = list_entry(list_ent, struct pfn_info, list);
   2.691 +        mfn = page_to_pfn(page);
   2.692 +
   2.693 +        switch ( page->u.inuse.type_info & PGT_type_mask)
   2.694 +        {
   2.695 +        case PGT_l1_page_table:
   2.696 +        case PGT_l2_page_table:
   2.697 +            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
   2.698 +            {
   2.699 +                printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n",
   2.700 +                       d->id, page->u.inuse.type_info, 
   2.701 +                       page->tlbflush_timestamp,
   2.702 +                       page->count_info, mfn);
   2.703 +                errors++;
   2.704 +                scan_for_pfn_remote(mfn);
   2.705 +            }
   2.706 +            break;
   2.707 +        case PGT_none:
   2.708 +        case PGT_writable_page:
   2.709 +        case PGT_gdt_page:
   2.710 +        case PGT_ldt_page:
   2.711 +            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
   2.712 +            {
   2.713 +                printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n",
   2.714 +                       d->id, page->u.inuse.type_info, 
   2.715 +                       page->tlbflush_timestamp,
   2.716 +                       page->count_info, mfn);
   2.717 +                errors++;
   2.718 +            }
   2.719 +            break;
   2.720 +        default:
   2.721 +            BUG(); // XXX fix me...
   2.722 +        }
   2.723 +        
   2.724 +        if ( (page->count_info & PGC_count_mask) != 1 )
   2.725 +        {
   2.726 +            printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x mfn=%lx\n",
   2.727 +                   d->id,
   2.728 +                   page->count_info,
   2.729 +                   page->u.inuse.type_info, 
   2.730 +                   page->tlbflush_timestamp, mfn );
   2.731 +            errors++;
   2.732 +            scan_for_pfn_remote(mfn);
   2.733 +        }
   2.734 +
   2.735 +        list_ent = page->list.next;
   2.736 +    }
   2.737 +
   2.738 +    if ( shadow_mode_enabled(d) )
   2.739 +    {
   2.740 +        struct shadow_status *a;
   2.741 +        struct pfn_info *page;
   2.742 +        u32 page_type;
   2.743 +        int i;
   2.744 +
   2.745 +        for ( i = 0; i < shadow_ht_buckets; i++ )
   2.746 +        {
   2.747 +            a = &d->arch.shadow_ht[i];
   2.748 +            while ( a && a->gpfn_and_flags )
   2.749 +            {
   2.750 +                page = pfn_to_page(a->smfn);
   2.751 +                page_type = a->gpfn_and_flags & PGT_type_mask;
   2.752 +
   2.753 +                switch ( page_type ) {
   2.754 +                case PGT_snapshot:
   2.755 +                    // XXX -- what should we check here?
   2.756 +                    break;
   2.757 +                case PGT_l1_shadow:
   2.758 +                case PGT_l2_shadow:
   2.759 +                    if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) ||
   2.760 +                         (page->count_info != 0) )
   2.761 +                    {
   2.762 +                        printk("Audit %d: shadow page counts wrong mfn=%p t=%x c=%x\n",
   2.763 +                               d->id, page_to_pfn(page),
   2.764 +                               page->u.inuse.type_info,
   2.765 +                               page->count_info);
   2.766 +                        errors++;
   2.767 +                    }
   2.768 +                    break;
   2.769 +
   2.770 +                case PGT_hl2_shadow: // haven't thought about this case yet.
   2.771 +                default:
   2.772 +                    BUG();
   2.773 +                    break;
   2.774 +                }
   2.775 +
   2.776 +                a = a->next;
   2.777 +            }
   2.778 +        }
   2.779 +    }
   2.780 +
   2.781 +    /* PHASE 3 */
   2.782 +    ctot = ttot = page_count = l1 = l2 = oos_count = 0;
   2.783 +
   2.784 +    audit_adjust_pgtables(d, 1, 0);
   2.785 +
   2.786 +#if 0
   2.787 +    // This covers our sins of trashing the tlbflush_timestamps...
   2.788 +    //
   2.789 +    local_flush_tlb();
   2.790 +#endif
   2.791 +
   2.792 +    spin_unlock(&d->page_alloc_lock);
   2.793 +
   2.794 +    if ( !(flags & AUDIT_QUIET) )
   2.795 +        printk("Audit dom%d (%s:%d) Done. "
   2.796 +               "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
   2.797 +               d->id, file, line, page_count, oos_count, l1, l2, ctot, ttot );
   2.798 +
   2.799 +    if ( !(flags & AUDIT_ALREADY_LOCKED) )
   2.800 +        shadow_unlock(d);
   2.801 +
   2.802 +    if ( d != current->domain )
   2.803 +        domain_unpause(d);
   2.804 +
   2.805 +    if ( errors && !(flags & AUDIT_ERRORS_OK) )
   2.806 +        BUG();
   2.807 +}
   2.808 +
   2.809 +void audit_domains(void)
   2.810 +{
   2.811 +    struct domain *d;
   2.812 +    for_each_domain ( d )
   2.813 +        audit_domain(d);
   2.814 +}
   2.815 +
   2.816 +void audit_domains_key(unsigned char key)
   2.817 +{
   2.818 +    audit_domains();
   2.819 +}
   2.820 +#endif
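
The audit added above runs in phases: phase 0 stashes each page's original type_info in tlbflush_timestamp, phase 1 calls audit_adjust_pgtables(d, -1, 1) to subtract every page-table, GDT/LDT, shadow and out-of-sync reference it can find, phase 2 checks that the leftover type and general counts are exactly what each page type demands, and phase 3 calls audit_adjust_pgtables(d, 1, 0) to put the counts back. The following is a stand-alone toy of that subtract/check/restore pattern; the data structures (refs, refcount) are made up for illustration and are not Xen's frame_table:

    #include <assert.h>
    #include <stdio.h>

    #define NPAGES 4

    static int refcount[NPAGES];        /* stand-in for count_info           */
    static int refs[NPAGES][NPAGES];    /* refs[i][j]: "page" i references j */

    static void adjust_all(int dir)
    {
        int i, j;
        /* Walk every page and adjust the count of each page it references. */
        for ( i = 0; i < NPAGES; i++ )
            for ( j = 0; j < NPAGES; j++ )
                if ( refs[i][j] )
                    refcount[j] += dir;
    }

    int main(void)
    {
        int j;

        refs[0][1] = refs[0][2] = refs[1][2] = 1;
        refcount[1] = 1; refcount[2] = 2;   /* counts consistent with refs   */

        adjust_all(-1);                     /* phase 1: strip known refs     */
        for ( j = 0; j < NPAGES; j++ )      /* phase 2: leftovers are errors */
            assert(refcount[j] == 0);
        adjust_all(+1);                     /* phase 3: restore the counts   */

        printf("audit ok\n");
        return 0;
    }
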
     3.1 --- a/xen/arch/x86/domain.c	Tue Mar 15 15:53:52 2005 +0000
     3.2 +++ b/xen/arch/x86/domain.c	Wed Mar 16 17:30:37 2005 +0000
     3.3 @@ -247,10 +247,9 @@ void arch_do_createdomain(struct exec_do
     3.4          machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >> 
     3.5                                 PAGE_SHIFT] = INVALID_M2P_ENTRY;
     3.6          ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
     3.7 -#if 0 /* don't need this yet, but maybe soon! */
     3.8 -        ed->arch.guest_vtable = linear_l2_table;
     3.9 -        ed->arch.shadow_vtable = shadow_linear_l2_table;
    3.10 -#endif
    3.11 +
    3.12 +        ed->arch.guest_vtable  = __linear_l2_table;
    3.13 +        ed->arch.shadow_vtable = __shadow_linear_l2_table;
    3.14  
    3.15  #ifdef __x86_64__
    3.16          d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
    3.17 @@ -295,70 +294,6 @@ void arch_vmx_do_launch(struct exec_doma
    3.18      reset_stack_and_jump(vmx_asm_do_launch);
    3.19  }
    3.20  
    3.21 -unsigned long alloc_monitor_pagetable(struct exec_domain *ed)
    3.22 -{
    3.23 -    unsigned long mmfn;
    3.24 -    l2_pgentry_t *mpl2e;
    3.25 -    struct pfn_info *mmfn_info;
    3.26 -    struct domain *d = ed->domain;
    3.27 -
    3.28 -    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
    3.29 -
    3.30 -    mmfn_info = alloc_domheap_page(NULL);
    3.31 -    ASSERT( mmfn_info ); 
    3.32 -
    3.33 -    mmfn = (unsigned long) (mmfn_info - frame_table);
    3.34 -    mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
    3.35 -    memset(mpl2e, 0, PAGE_SIZE);
    3.36 -
    3.37 -    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
    3.38 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
    3.39 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
    3.40 -
    3.41 -    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
    3.42 -        mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
    3.43 -                      | __PAGE_HYPERVISOR);
    3.44 -
    3.45 -    ed->arch.monitor_vtable = mpl2e;
    3.46 -
    3.47 -    // map the phys_to_machine map into the Read-Only MPT space for this domain
    3.48 -    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
    3.49 -        mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
    3.50 -
    3.51 -    return mmfn;
    3.52 -}
    3.53 -
    3.54 -/*
    3.55 - * Free the pages for monitor_table and hl2_table
    3.56 - */
    3.57 -static void free_monitor_pagetable(struct exec_domain *ed)
    3.58 -{
    3.59 -    l2_pgentry_t *mpl2e;
    3.60 -    unsigned long mfn;
    3.61 -
    3.62 -    ASSERT( pagetable_val(ed->arch.monitor_table) );
    3.63 -    
    3.64 -    mpl2e = ed->arch.monitor_vtable;
    3.65 -
    3.66 -    /*
    3.67 -     * First get the mfn for hl2_table by looking at monitor_table
    3.68 -     */
    3.69 -    mfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])
    3.70 -        >> PAGE_SHIFT;
    3.71 -
    3.72 -    free_domheap_page(&frame_table[mfn]);
    3.73 -    unmap_domain_mem(mpl2e);
    3.74 -
    3.75 -    /*
    3.76 -     * Then free monitor_table.
    3.77 -     */
    3.78 -    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
    3.79 -    free_domheap_page(&frame_table[mfn]);
    3.80 -
    3.81 -    ed->arch.monitor_table = mk_pagetable(0);
    3.82 -    ed->arch.monitor_vtable = 0;
    3.83 -}
    3.84 -
    3.85  static int vmx_final_setup_guest(struct exec_domain *ed,
    3.86                                     full_execution_context_t *full_context)
    3.87  {
     4.1 --- a/xen/arch/x86/domain_build.c	Tue Mar 15 15:53:52 2005 +0000
     4.2 +++ b/xen/arch/x86/domain_build.c	Wed Mar 16 17:30:37 2005 +0000
     4.3 @@ -25,6 +25,9 @@
     4.4  static unsigned int opt_dom0_mem = 0;
     4.5  integer_param("dom0_mem", opt_dom0_mem);
     4.6  
     4.7 +static unsigned int opt_dom0_shadow = 0;
     4.8 +boolean_param("dom0_shadow", opt_dom0_shadow);
     4.9 +
    4.10  #if defined(__i386__)
    4.11  /* No ring-3 access in initial leaf page tables. */
    4.12  #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
    4.13 @@ -267,8 +270,13 @@ int construct_dom0(struct domain *d,
    4.14      l1tab += l1_table_offset(vpt_start);
    4.15      for ( count = 0; count < nr_pt_pages; count++ ) 
    4.16      {
    4.17 -        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
    4.18          page = &frame_table[l1_pgentry_to_pfn(*l1tab)];
    4.19 +        if ( !opt_dom0_shadow )
    4.20 +            *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
    4.21 +        else
    4.22 +            if ( !get_page_type(page, PGT_writable_page) )
    4.23 +                BUG();
    4.24 +
    4.25          if ( count == 0 )
    4.26          {
    4.27              page->u.inuse.type_info &= ~PGT_type_mask;
    4.28 @@ -512,6 +520,12 @@ int construct_dom0(struct domain *d,
    4.29  
    4.30      new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
    4.31  
    4.32 +    if ( opt_dom0_shadow )
    4.33 +    {
    4.34 +        shadow_mode_enable(d, SHM_enable); 
    4.35 +        update_pagetables(ed); /* XXX SMP */
    4.36 +    }
    4.37 +
    4.38      return 0;
    4.39  }
    4.40  
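
The domain_build.c changes add a dom0_shadow boot option: when it is set, dom0's page-table frames are typed as ordinary writable pages instead of being write-protected, and shadow mode is enabled (followed by an initial update_pagetables()) once the domain has been constructed. The sketch below only illustrates what a boolean_param()-style registration amounts to, i.e. a named flag the hypervisor command-line parser can flip before construct_dom0() runs; bool_param and parse_cmdline are invented names, not Xen's implementation:

    #include <stdio.h>
    #include <string.h>

    struct bool_param { const char *name; unsigned int *var; };

    static unsigned int opt_dom0_shadow = 0;
    static struct bool_param params[] = {
        { "dom0_shadow", &opt_dom0_shadow },
    };

    /* Crude stand-in for command-line parsing: the presence of the option
     * name on the hypervisor command line enables the flag. */
    static void parse_cmdline(const char *cmdline)
    {
        unsigned int i;
        for ( i = 0; i < sizeof(params)/sizeof(params[0]); i++ )
            if ( strstr(cmdline, params[i].name) != NULL )
                *params[i].var = 1;
    }

    int main(void)
    {
        parse_cmdline("dom0_mem=131072 dom0_shadow");
        printf("opt_dom0_shadow = %u\n", opt_dom0_shadow);  /* prints 1 */
        return 0;
    }
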
     5.1 --- a/xen/arch/x86/mm.c	Tue Mar 15 15:53:52 2005 +0000
     5.2 +++ b/xen/arch/x86/mm.c	Wed Mar 16 17:30:37 2005 +0000
     5.3 @@ -104,19 +104,12 @@
     5.4  
     5.5  #ifdef VERBOSE
     5.6  #define MEM_LOG(_f, _a...)                           \
     5.7 -  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
     5.8 +  printk("DOM%u: MEM_LOG(line=%d) " _f "\n", \
     5.9           current->domain->id , __LINE__ , ## _a )
    5.10  #else
    5.11  #define MEM_LOG(_f, _a...) ((void)0)
    5.12  #endif
    5.13  
    5.14 -static int alloc_l2_table(struct pfn_info *page);
    5.15 -static int alloc_l1_table(struct pfn_info *page);
    5.16 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
    5.17 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
    5.18 -                                         u32 type,
    5.19 -                                         struct domain *d);
    5.20 -
    5.21  static void free_l2_table(struct pfn_info *page);
    5.22  static void free_l1_table(struct pfn_info *page);
    5.23  
    5.24 @@ -222,7 +215,7 @@ static void __invalidate_shadow_ldt(stru
    5.25  }
    5.26  
    5.27  
    5.28 -static inline void invalidate_shadow_ldt(struct exec_domain *d)
    5.29 +void invalidate_shadow_ldt(struct exec_domain *d)
    5.30  {
    5.31      if ( d->arch.shadow_ldt_mapcnt != 0 )
    5.32          __invalidate_shadow_ldt(d);
    5.33 @@ -254,21 +247,41 @@ int map_ldt_shadow_page(unsigned int off
    5.34  {
    5.35      struct exec_domain *ed = current;
    5.36      struct domain *d = ed->domain;
    5.37 -    unsigned long l1e;
    5.38 +    unsigned long l1e, nl1e, gpfn, gmfn;
     5.39 +    unsigned long gva = ed->arch.ldt_base + (off << PAGE_SHIFT);
    5.40 +    int res;
    5.41  
    5.42      if ( unlikely(in_irq()) )
    5.43          BUG();
    5.44  
    5.45 -    __get_user(l1e, (unsigned long *)
    5.46 -               &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
    5.47 -
    5.48 -    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
    5.49 -         unlikely(!get_page_and_type(
    5.50 -             &frame_table[l1_pgentry_to_pfn(mk_l1_pgentry(l1e))],
    5.51 -             d, PGT_ldt_page)) )
    5.52 +    shadow_sync_va(ed, gva);
    5.53 +    __get_user(l1e, (unsigned long *)&linear_pg_table[l1_linear_offset(gva)]);
    5.54 +
    5.55 +    if ( unlikely(!(l1e & _PAGE_PRESENT)) )
    5.56 +        return 0;
    5.57 +
    5.58 +    gpfn = l1_pgentry_to_pfn(mk_l1_pgentry(l1e));
    5.59 +    gmfn = __gpfn_to_mfn(d, gpfn);
    5.60 +    if ( unlikely(!gmfn) )
    5.61          return 0;
    5.62  
    5.63 -    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
    5.64 +    if ( unlikely(shadow_mode_enabled(d)) )
    5.65 +    {
    5.66 +        shadow_lock(d);
    5.67 +        shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn);
    5.68 +    }
    5.69 +
    5.70 +    res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
    5.71 +
    5.72 +    if ( unlikely(shadow_mode_enabled(d)) )
    5.73 +        shadow_unlock(d);
    5.74 +
    5.75 +    if ( unlikely(!res) )
    5.76 +        return 0;
    5.77 +
    5.78 +    nl1e = (l1e & ~PAGE_MASK) | (gmfn << PAGE_SHIFT) | _PAGE_RW;
    5.79 +
    5.80 +    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(nl1e);
    5.81      ed->arch.shadow_ldt_mapcnt++;
    5.82  
    5.83      return 1;
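
In the hunk above, map_ldt_shadow_page() now distinguishes the frame number the guest sees (gpfn) from the machine frame backing it (mfn): it translates through __gpfn_to_mfn() and bails out if the result is zero, i.e. the guest frame has no machine backing. A toy model of that lookup, assuming the phys-to-machine mapping can be treated as a flat array and that zero means "unbacked" (names here are illustrative only):

    #include <stdio.h>

    #define MAX_GPFN 8

    /* Hypothetical per-domain phys-to-machine table; gpfn 2 is unbacked. */
    static unsigned long phys_to_machine[MAX_GPFN] = {
        [0] = 0x1234, [1] = 0x2001,
    };

    static unsigned long gpfn_to_mfn(unsigned long gpfn)
    {
        return (gpfn < MAX_GPFN) ? phys_to_machine[gpfn] : 0;
    }

    int main(void)
    {
        unsigned long gpfn;

        for ( gpfn = 0; gpfn < 3; gpfn++ )
        {
            unsigned long mfn = gpfn_to_mfn(gpfn);
            if ( mfn == 0 )
                printf("gpfn %lu: no backing frame, mapping fails\n", gpfn);
            else
                printf("gpfn %lu -> mfn %#lx\n", gpfn, mfn);
        }
        return 0;
    }
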
    5.84 @@ -337,6 +350,8 @@ get_linear_pagetable(
    5.85      struct pfn_info *page;
    5.86      unsigned long pfn;
    5.87  
    5.88 +    ASSERT( !shadow_mode_enabled(d) );
    5.89 +
    5.90      if ( (root_pgentry_val(re) & _PAGE_RW) )
    5.91      {
    5.92          MEM_LOG("Attempt to create linear p.t. with write perms");
    5.93 @@ -372,13 +387,13 @@ get_linear_pagetable(
    5.94  }
    5.95  
    5.96  
    5.97 -static int
    5.98 +int
    5.99  get_page_from_l1e(
   5.100      l1_pgentry_t l1e, struct domain *d)
   5.101  {
   5.102      unsigned long l1v = l1_pgentry_val(l1e);
   5.103 -    unsigned long pfn = l1_pgentry_to_pfn(l1e);
   5.104 -    struct pfn_info *page = &frame_table[pfn];
   5.105 +    unsigned long mfn = l1_pgentry_to_pfn(l1e);
   5.106 +    struct pfn_info *page = &frame_table[mfn];
   5.107      extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
   5.108  
   5.109      if ( !(l1v & _PAGE_PRESENT) )
   5.110 @@ -386,11 +401,11 @@ get_page_from_l1e(
   5.111  
   5.112      if ( unlikely(l1v & L1_DISALLOW_MASK) )
   5.113      {
   5.114 -        MEM_LOG("Bad L1 type settings %p", l1v & L1_DISALLOW_MASK);
   5.115 +        MEM_LOG("Bad L1 type settings %p %p", l1v, l1v & L1_DISALLOW_MASK);
   5.116          return 0;
   5.117      }
   5.118  
   5.119 -    if ( unlikely(!pfn_is_ram(pfn)) )
   5.120 +    if ( unlikely(!pfn_is_ram(mfn)) )
   5.121      {
   5.122          /* Revert to caller privileges if FD == DOMID_IO. */
   5.123          if ( d == dom_io )
   5.124 @@ -400,9 +415,9 @@ get_page_from_l1e(
   5.125              return 1;
   5.126  
   5.127          if ( IS_CAPABLE_PHYSDEV(d) )
   5.128 -            return domain_iomem_in_pfn(d, pfn);
   5.129 -
   5.130 -        MEM_LOG("Non-privileged attempt to map I/O space %p", pfn);
   5.131 +            return domain_iomem_in_pfn(d, mfn);
   5.132 +
   5.133 +        MEM_LOG("Non-privileged attempt to map I/O space %p", mfn);
   5.134          return 0;
   5.135      }
   5.136  
   5.137 @@ -420,6 +435,8 @@ get_page_from_l2e(
   5.138  {
   5.139      int rc;
   5.140  
   5.141 +    ASSERT( !shadow_mode_enabled(d) );
   5.142 +
   5.143      if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
   5.144          return 1;
   5.145  
   5.146 @@ -491,7 +508,7 @@ get_page_from_l4e(
   5.147  #endif /* __x86_64__ */
   5.148  
   5.149  
   5.150 -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
   5.151 +void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
   5.152  {
   5.153      unsigned long    l1v  = l1_pgentry_val(l1e);
   5.154      unsigned long    pfn  = l1_pgentry_to_pfn(l1e);
   5.155 @@ -530,6 +547,8 @@ static void put_page_from_l1e(l1_pgentry
   5.156          if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
   5.157                         PGT_ldt_page)) &&
   5.158               unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
   5.159 +
   5.160 +            // XXX SMP BUG?
   5.161              invalidate_shadow_ldt(e->exec_domain[0]);
   5.162          put_page(page);
   5.163      }
   5.164 @@ -575,6 +594,8 @@ static int alloc_l1_table(struct pfn_inf
   5.165      l1_pgentry_t  *pl1e;
   5.166      int            i;
   5.167  
   5.168 +    ASSERT( !shadow_mode_enabled(d) );
   5.169 +
   5.170      pl1e = map_domain_mem(pfn << PAGE_SHIFT);
   5.171  
   5.172      for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   5.173 @@ -601,6 +622,11 @@ static int alloc_l2_table(struct pfn_inf
   5.174      unsigned long  pfn = page_to_pfn(page);
   5.175      l2_pgentry_t  *pl2e;
   5.176      int            i;
   5.177 +
   5.178 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
   5.179 +         shadow_mode_enabled(d) )
   5.180 +        return 1;
   5.181 +    ASSERT( !shadow_mode_enabled(d) );
   5.182     
   5.183      pl2e = map_domain_mem(pfn << PAGE_SHIFT);
   5.184  
   5.185 @@ -643,6 +669,8 @@ static int alloc_l3_table(struct pfn_inf
   5.186      l3_pgentry_t  *pl3e = page_to_virt(page);
   5.187      int            i;
   5.188  
   5.189 +    ASSERT( !shadow_mode_enabled(d) );
   5.190 +
   5.191      for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
   5.192          if ( is_guest_l3_slot(i) &&
   5.193               unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
   5.194 @@ -666,6 +694,11 @@ static int alloc_l4_table(struct pfn_inf
   5.195      l4_pgentry_t  *pl4e = page_to_virt(page);
   5.196      int            i;
   5.197  
   5.198 +    if ( (PGT_base_page_table == PGT_l4_page_table) &&
   5.199 +         shadow_mode_enabled(d) )
   5.200 +        return 1;
   5.201 +    ASSERT( !shadow_mode_enabled(d) );
   5.202 +
   5.203      for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
   5.204          if ( is_guest_l4_slot(i) &&
   5.205               unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
   5.206 @@ -765,7 +798,7 @@ static inline int update_l1e(l1_pgentry_
   5.207      if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
   5.208           unlikely(o != l1_pgentry_val(ol1e)) )
   5.209      {
   5.210 -        MEM_LOG("Failed to update %p -> %p: saw %p\n",
   5.211 +        MEM_LOG("Failed to update %p -> %p: saw %p",
   5.212                  l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
   5.213          return 0;
   5.214      }
   5.215 @@ -781,6 +814,8 @@ static int mod_l1_entry(l1_pgentry_t *pl
   5.216      unsigned long _ol1e;
   5.217      struct domain *d = current->domain;
   5.218  
   5.219 +    ASSERT( !shadow_mode_enabled(d) );
   5.220 +
   5.221      if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
   5.222          return 0;
   5.223      ol1e = mk_l1_pgentry(_ol1e);
   5.224 @@ -807,13 +842,12 @@ static int mod_l1_entry(l1_pgentry_t *pl
   5.225              put_page_from_l1e(nl1e, d);
   5.226              return 0;
   5.227          }
   5.228 -        
   5.229 -        put_page_from_l1e(ol1e, d);
   5.230 -        return 1;
   5.231      }
   5.232 -
   5.233 -    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
   5.234 -        return 0;
   5.235 +    else
   5.236 +    {
   5.237 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
   5.238 +            return 0;
   5.239 +    }
   5.240      
   5.241      put_page_from_l1e(ol1e, d);
   5.242      return 1;
   5.243 @@ -825,7 +859,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
   5.244                                  _t ## _pgentry_val(_o),                 \
   5.245                                  _t ## _pgentry_val(_n));                \
   5.246      if ( __o != _t ## _pgentry_val(_o) )                                \
   5.247 -        MEM_LOG("Failed to update %p -> %p: saw %p\n",                  \
   5.248 +        MEM_LOG("Failed to update %p -> %p: saw %p",                    \
   5.249                  _t ## _pgentry_val(_o), _t ## _pgentry_val(_n), __o);   \
   5.250      (__o == _t ## _pgentry_val(_o)); })
   5.251  
   5.252 @@ -872,13 +906,12 @@ static int mod_l2_entry(l2_pgentry_t *pl
   5.253              put_page_from_l2e(nl2e, pfn);
   5.254              return 0;
   5.255          }
   5.256 -        
   5.257 -        put_page_from_l2e(ol2e, pfn);
   5.258 -        return 1;
   5.259      }
   5.260 -
   5.261 -    if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
   5.262 -        return 0;
   5.263 +    else
   5.264 +    {
   5.265 +        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
   5.266 +            return 0;
   5.267 +    }
   5.268  
   5.269      put_page_from_l2e(ol2e, pfn);
   5.270      return 1;
   5.271 @@ -1025,7 +1058,9 @@ int alloc_page_type(struct pfn_info *pag
   5.272  
   5.273  void free_page_type(struct pfn_info *page, unsigned int type)
   5.274  {
   5.275 -    struct domain *d = page_get_owner(page);
   5.276 +    struct domain *owner = page_get_owner(page);
   5.277 +    if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) )
   5.278 +        return;
   5.279  
   5.280      switch ( type )
   5.281      {
   5.282 @@ -1050,13 +1085,6 @@ void free_page_type(struct pfn_info *pag
   5.283      default:
   5.284          BUG();
   5.285      }
   5.286 -
   5.287 -    if ( unlikely(shadow_mode_enabled(d)) && 
   5.288 -         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
   5.289 -    {
   5.290 -        unshadow_table(page_to_pfn(page), type);
   5.291 -        put_shadow_status(d);
   5.292 -    }
   5.293  }
   5.294  
   5.295  
   5.296 @@ -1096,15 +1124,16 @@ void put_page_type(struct pfn_info *page
   5.297                  if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
   5.298                                             x & ~PGT_validated)) != x) )
   5.299                      goto again;
   5.300 -                /* We cleared the 'valid bit' so we do the clear up. */
   5.301 +                /* We cleared the 'valid bit' so we do the clean up. */
   5.302                  free_page_type(page, x & PGT_type_mask);
   5.303                  /* Carry on, but with the 'valid bit' now clear. */
   5.304                  x  &= ~PGT_validated;
   5.305                  nx &= ~PGT_validated;
   5.306              }
   5.307          }
   5.308 -        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
   5.309 -                           (PGT_pinned | 1)) )
   5.310 +        else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) == 
   5.311 +                            (PGT_pinned | 1)) &&
   5.312 +                           ((nx & PGT_type_mask) != PGT_writable_page)) )
   5.313          {
   5.314              /* Page is now only pinned. Make the back pointer mutable again. */
   5.315              nx |= PGT_va_mutable;
   5.316 @@ -1124,7 +1153,7 @@ int get_page_type(struct pfn_info *page,
   5.317          nx = x + 1;
   5.318          if ( unlikely((nx & PGT_count_mask) == 0) )
   5.319          {
   5.320 -            MEM_LOG("Type count overflow on pfn %p\n", page_to_pfn(page));
   5.321 +            MEM_LOG("Type count overflow on pfn %p", page_to_pfn(page));
   5.322              return 0;
   5.323          }
   5.324          else if ( unlikely((x & PGT_count_mask) == 0) )
   5.325 @@ -1137,6 +1166,8 @@ int get_page_type(struct pfn_info *page,
   5.326                   * circumstances should be very rare.
   5.327                   */
   5.328                  struct domain *d = page_get_owner(page);
   5.329 +
   5.330 +                // XXX SMP bug?
   5.331                  if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->
   5.332                                                        processor],
   5.333                                           page->tlbflush_timestamp)) )
   5.334 @@ -1155,14 +1186,24 @@ int get_page_type(struct pfn_info *page,
   5.335                      nx |= PGT_validated;
   5.336              }
   5.337          }
   5.338 +        else if ( unlikely(!(x & PGT_validated)) )
   5.339 +        {
   5.340 +            /* Someone else is updating validation of this page. Wait... */
   5.341 +            while ( (y = page->u.inuse.type_info) == x )
   5.342 +            {
   5.343 +                rep_nop();
   5.344 +                barrier();
   5.345 +            }
   5.346 +            goto again;
   5.347 +        }
   5.348          else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
   5.349          {
   5.350              if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
   5.351              {
   5.352                  if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
   5.353                       ((type & PGT_type_mask) != PGT_l1_page_table) )
   5.354 -                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p\n",
   5.355 -                            x & PGT_type_mask, type, page_to_pfn(page));
   5.356 +                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p",
   5.357 +                            x, type, page_to_pfn(page));
   5.358                  return 0;
   5.359              }
   5.360              else if ( (x & PGT_va_mask) == PGT_va_mutable )
   5.361 @@ -1178,16 +1219,6 @@ int get_page_type(struct pfn_info *page,
   5.362                  nx |= PGT_va_unknown;
   5.363              }
   5.364          }
   5.365 -        else if ( unlikely(!(x & PGT_validated)) )
   5.366 -        {
   5.367 -            /* Someone else is updating validation of this page. Wait... */
   5.368 -            while ( (y = page->u.inuse.type_info) == x )
   5.369 -            {
   5.370 -                rep_nop();
   5.371 -                barrier();
   5.372 -            }
   5.373 -            goto again;
   5.374 -        }
   5.375      }
   5.376      while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
   5.377  
   5.378 @@ -1197,7 +1228,7 @@ int get_page_type(struct pfn_info *page,
   5.379          if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
   5.380          {
   5.381              MEM_LOG("Error while validating pfn %p for type %08x."
   5.382 -                    " caf=%08x taf=%08x\n",
   5.383 +                    " caf=%08x taf=%08x",
   5.384                      page_to_pfn(page), type,
   5.385                      page->count_info,
   5.386                      page->u.inuse.type_info);
   5.387 @@ -1214,30 +1245,36 @@ int get_page_type(struct pfn_info *page,
   5.388  }
   5.389  
   5.390  
   5.391 -int new_guest_cr3(unsigned long pfn)
   5.392 +int new_guest_cr3(unsigned long mfn)
   5.393  {
   5.394      struct exec_domain *ed = current;
   5.395      struct domain *d = ed->domain;
   5.396 -    int okay, cpu = smp_processor_id();
   5.397 -    unsigned long old_base_pfn;
   5.398 -    
   5.399 -    okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
   5.400 +    int okay;
   5.401 +    unsigned long old_base_mfn;
   5.402 +
   5.403 +    if ( shadow_mode_enabled(d) )
   5.404 +        okay = get_page_from_pagenr(mfn, d);
   5.405 +    else
   5.406 +        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
   5.407 +
   5.408      if ( likely(okay) )
   5.409      {
   5.410          invalidate_shadow_ldt(ed);
   5.411  
   5.412 -        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
   5.413 -        old_base_pfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
   5.414 -        ed->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT);
   5.415 +        old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
   5.416 +        ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
   5.417          update_pagetables(ed); /* update shadow_table and monitor_table */
   5.418  
   5.419          write_ptbase(ed);
   5.420  
   5.421 -        put_page_and_type(&frame_table[old_base_pfn]);
   5.422 +        if ( shadow_mode_enabled(d) )
   5.423 +            put_page(&frame_table[old_base_mfn]);
   5.424 +        else
   5.425 +            put_page_and_type(&frame_table[old_base_mfn]);
   5.426      }
   5.427      else
   5.428      {
   5.429 -        MEM_LOG("Error while installing new baseptr %p", pfn);
   5.430 +        MEM_LOG("Error while installing new baseptr %p", mfn);
   5.431      }
   5.432  
   5.433      return okay;
   5.434 @@ -1247,10 +1284,11 @@ static int do_extended_command(unsigned 
   5.435  {
   5.436      int okay = 1, cpu = smp_processor_id();
   5.437      unsigned int cmd = val & MMUEXT_CMD_MASK, type;
   5.438 -    unsigned long pfn = ptr >> PAGE_SHIFT;
   5.439 -    struct pfn_info *page = &frame_table[pfn];
   5.440      struct exec_domain *ed = current;
   5.441      struct domain *d = ed->domain, *e;
   5.442 +    unsigned long gpfn = ptr >> PAGE_SHIFT;
   5.443 +    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
   5.444 +    struct pfn_info *page = &frame_table[mfn];
   5.445      u32 x, y, _d, _nd;
   5.446      domid_t domid;
   5.447      grant_ref_t gntref;
   5.448 @@ -1266,17 +1304,29 @@ static int do_extended_command(unsigned 
   5.449          type = PGT_l1_page_table | PGT_va_mutable;
   5.450  
   5.451      pin_page:
   5.452 -        okay = get_page_and_type_from_pagenr(pfn, type, FOREIGNDOM);
   5.453 +        if ( unlikely(percpu_info[cpu].foreign &&
   5.454 +                      (shadow_mode_translate(d) ||
   5.455 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
   5.456 +        {
   5.457 +            // oops -- we should be using the foreign domain's P2M
   5.458 +            mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn);
   5.459 +            page = &frame_table[mfn];
   5.460 +        }
   5.461 +
   5.462 +        if ( shadow_mode_enabled(FOREIGNDOM) )
   5.463 +            type = PGT_writable_page;
   5.464 +
   5.465 +        okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
   5.466          if ( unlikely(!okay) )
   5.467          {
   5.468 -            MEM_LOG("Error while pinning pfn %p", pfn);
   5.469 +            MEM_LOG("Error while pinning mfn %p", mfn);
   5.470              break;
   5.471          }
   5.472  
   5.473          if ( unlikely(test_and_set_bit(_PGT_pinned,
   5.474                                         &page->u.inuse.type_info)) )
   5.475          {
   5.476 -            MEM_LOG("Pfn %p already pinned", pfn);
   5.477 +            MEM_LOG("mfn %p already pinned", mfn);
   5.478              put_page_and_type(page);
   5.479              okay = 0;
   5.480              break;
   5.481 @@ -1299,10 +1349,19 @@ static int do_extended_command(unsigned 
   5.482  #endif /* __x86_64__ */
   5.483  
   5.484      case MMUEXT_UNPIN_TABLE:
   5.485 -        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
   5.486 +        if ( unlikely(percpu_info[cpu].foreign &&
   5.487 +                      (shadow_mode_translate(d) ||
   5.488 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
   5.489          {
   5.490 -            MEM_LOG("Page %p bad domain (dom=%p)",
   5.491 -                    ptr, page_get_owner(page));
   5.492 +            // oops -- we should be using the foreign domain's P2M
   5.493 +            mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn);
   5.494 +            page = &frame_table[mfn];
   5.495 +        }
   5.496 +
   5.497 +        if ( unlikely(!(okay = get_page_from_pagenr(mfn, FOREIGNDOM))) )
   5.498 +        {
   5.499 +            MEM_LOG("mfn %p bad domain (dom=%p)",
   5.500 +                    mfn, page_get_owner(page));
   5.501          }
   5.502          else if ( likely(test_and_clear_bit(_PGT_pinned, 
   5.503                                              &page->u.inuse.type_info)) )
   5.504 @@ -1314,28 +1373,29 @@ static int do_extended_command(unsigned 
   5.505          {
   5.506              okay = 0;
   5.507              put_page(page);
   5.508 -            MEM_LOG("Pfn %p not pinned", pfn);
   5.509 +            MEM_LOG("mfn %p not pinned", mfn);
   5.510          }
   5.511          break;
   5.512  
   5.513      case MMUEXT_NEW_BASEPTR:
   5.514 -        okay = new_guest_cr3(pfn);
   5.515 +        okay = new_guest_cr3(mfn);
   5.516 +        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
   5.517          break;
   5.518          
   5.519  #ifdef __x86_64__
   5.520      case MMUEXT_NEW_USER_BASEPTR:
   5.521 -        okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
   5.522 +        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
   5.523          if ( unlikely(!okay) )
   5.524          {
   5.525 -            MEM_LOG("Error while installing new baseptr %p", pfn);
   5.526 +            MEM_LOG("Error while installing new baseptr %p", mfn);
   5.527          }
   5.528          else
   5.529          {
   5.530 -            unsigned long old_pfn =
   5.531 +            unsigned long old_mfn =
   5.532                  pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT;
   5.533 -            ed->arch.guest_table_user = mk_pagetable(pfn << PAGE_SHIFT);
   5.534 -            if ( old_pfn != 0 )
   5.535 -                put_page_and_type(&frame_table[old_pfn]);
   5.536 +            ed->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
   5.537 +            if ( old_mfn != 0 )
   5.538 +                put_page_and_type(&frame_table[old_mfn]);
   5.539          }
   5.540          break;
   5.541  #endif
   5.542 @@ -1346,12 +1406,14 @@ static int do_extended_command(unsigned 
   5.543      
   5.544      case MMUEXT_INVLPG:
   5.545          __flush_tlb_one(ptr);
   5.546 +        if ( shadow_mode_enabled(d) )
   5.547 +            shadow_invlpg(ed, ptr);
   5.548          break;
   5.549  
   5.550      case MMUEXT_FLUSH_CACHE:
   5.551          if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
   5.552          {
   5.553 -            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
   5.554 +            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
   5.555              okay = 0;
   5.556          }
   5.557          else
   5.558 @@ -1362,6 +1424,8 @@ static int do_extended_command(unsigned 
   5.559  
   5.560      case MMUEXT_SET_LDT:
   5.561      {
   5.562 +        ASSERT( !shadow_mode_external(d) );
   5.563 +
   5.564          unsigned long ents = val >> MMUEXT_CMD_SHIFT;
   5.565          if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
   5.566               (ents > 8192) ||
   5.567 @@ -1375,6 +1439,7 @@ static int do_extended_command(unsigned 
   5.568                    (ed->arch.ldt_base != ptr) )
   5.569          {
   5.570              invalidate_shadow_ldt(ed);
   5.571 +            shadow_sync_all(d);
   5.572              ed->arch.ldt_base = ptr;
   5.573              ed->arch.ldt_ents = ents;
   5.574              load_LDT(ed);
   5.575 @@ -1401,7 +1466,7 @@ static int do_extended_command(unsigned 
   5.576                  percpu_info[cpu].foreign = dom_io;
   5.577                  break;
   5.578              default:
   5.579 -                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
   5.580 +                MEM_LOG("Dom %u cannot set foreign dom", d->id);
   5.581                  okay = 0;
   5.582                  break;
   5.583              }
   5.584 @@ -1435,10 +1500,10 @@ static int do_extended_command(unsigned 
   5.585          gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
   5.586          
   5.587          if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
   5.588 -             unlikely(!pfn_is_ram(pfn)) ||
   5.589 +             unlikely(!pfn_is_ram(mfn)) ||
   5.590               unlikely((e = find_domain_by_id(domid)) == NULL) )
   5.591          {
   5.592 -            MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
   5.593 +            MEM_LOG("Bad frame (%p) or bad domid (%d).", mfn, domid);
   5.594              okay = 0;
   5.595              break;
   5.596          }
   5.597 @@ -1460,7 +1525,7 @@ static int do_extended_command(unsigned 
   5.598                   unlikely(_nd != _d) )
   5.599              {
   5.600                  MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
   5.601 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
   5.602 +                        " caf=%08x, taf=%08x", page_to_pfn(page),
   5.603                          d, d->id, unpickle_domptr(_nd), x, 
   5.604                          page->u.inuse.type_info);
   5.605                  spin_unlock(&d->page_alloc_lock);
   5.606 @@ -1496,7 +1561,7 @@ static int do_extended_command(unsigned 
   5.607               unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
   5.608          {
   5.609              MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
   5.610 -                    "provided a bad grant ref, or is dying (%p).\n",
   5.611 +                    "provided a bad grant ref, or is dying (%p).",
   5.612                      e->tot_pages, e->max_pages, e->d_flags);
   5.613              spin_unlock(&e->page_alloc_lock);
   5.614              put_domain(e);
   5.615 @@ -1513,7 +1578,7 @@ static int do_extended_command(unsigned 
   5.616          spin_unlock(&e->page_alloc_lock);
   5.617  
   5.618          /* Transfer is all done: tell the guest about its new page frame. */
   5.619 -        gnttab_notify_transfer(e, gntref, pfn);
   5.620 +        gnttab_notify_transfer(e, gntref, mfn);
   5.621          
   5.622          put_domain(e);
   5.623          break;
   5.624 @@ -1529,7 +1594,14 @@ static int do_extended_command(unsigned 
   5.625          e = percpu_info[cpu].foreign;
   5.626          if ( unlikely(e == NULL) )
   5.627          {
   5.628 -            MEM_LOG("No FOREIGNDOM to reassign pfn %p to", pfn);
   5.629 +            MEM_LOG("No FOREIGNDOM to reassign mfn %p to", mfn);
   5.630 +            okay = 0;
   5.631 +            break;
   5.632 +        }
   5.633 +
   5.634 +        if ( unlikely(!pfn_is_ram(mfn)) )
   5.635 +        {
   5.636 +            MEM_LOG("Can't reassign non-ram mfn %p", mfn);
   5.637              okay = 0;
   5.638              break;
   5.639          }
   5.640 @@ -1574,7 +1646,7 @@ static int do_extended_command(unsigned 
   5.641                   unlikely(_nd != _d) )
   5.642              {
   5.643                  MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
   5.644 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
   5.645 +                        " caf=%08x, taf=%08x", page_to_pfn(page),
   5.646                          d, d->id, unpickle_domptr(_nd), x,
   5.647                          page->u.inuse.type_info);
   5.648                  okay = 0;
   5.649 @@ -1637,12 +1709,10 @@ int do_mmu_update(
   5.650  #define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
   5.651  
   5.652      mmu_update_t req;
   5.653 -    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
   5.654 +    unsigned long va = 0, deferred_ops, gpfn, mfn, prev_mfn = 0;
   5.655      struct pfn_info *page;
   5.656      int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
   5.657      unsigned int cmd, done = 0;
   5.658 -    unsigned long prev_smfn = 0;
   5.659 -    l1_pgentry_t *prev_spl1e = 0;
   5.660      struct exec_domain *ed = current;
   5.661      struct domain *d = ed->domain;
   5.662      u32 type_info;
   5.663 @@ -1653,10 +1723,9 @@ int do_mmu_update(
   5.664      cleanup_writable_pagetable(d);
   5.665  
   5.666      if ( unlikely(shadow_mode_enabled(d)) )
   5.667 -        check_pagetable(d, ed->arch.guest_table, "pre-mmu"); /* debug */
   5.668 -
   5.669 -    if ( unlikely(shadow_mode_translate(d) ) )
   5.670 -        domain_crash();
   5.671 +    {
   5.672 +        check_pagetable(ed, "pre-mmu"); /* debug */
   5.673 +    }
   5.674  
   5.675      /*
   5.676       * If we are resuming after preemption, read how much work we have already
   5.677 @@ -1715,7 +1784,8 @@ int do_mmu_update(
   5.678          }
   5.679  
   5.680          cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
   5.681 -        pfn = req.ptr >> PAGE_SHIFT;
   5.682 +        gpfn = req.ptr >> PAGE_SHIFT;
   5.683 +        mfn = __gpfn_to_mfn(d, gpfn);
   5.684  
   5.685          okay = 0;
   5.686  
   5.687 @@ -1725,107 +1795,91 @@ int do_mmu_update(
   5.688               * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
   5.689               */
   5.690          case MMU_NORMAL_PT_UPDATE:
   5.691 -            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
   5.692 +            if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
   5.693              {
   5.694                  MEM_LOG("Could not get page for normal update");
   5.695                  break;
   5.696              }
   5.697  
   5.698 -            if ( likely(prev_pfn == pfn) )
   5.699 +            if ( likely(prev_mfn == mfn) )
   5.700              {
   5.701                  va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
   5.702              }
   5.703              else
   5.704              {
   5.705 -                if ( prev_pfn != 0 )
   5.706 +                if ( prev_mfn != 0 )
   5.707                      unmap_domain_mem((void *)va);
   5.708                  va = (unsigned long)map_domain_mem(req.ptr);
   5.709 -                prev_pfn = pfn;
   5.710 +                prev_mfn = mfn;
   5.711              }
   5.712  
   5.713 -            page = &frame_table[pfn];
   5.714 +            page = &frame_table[mfn];
   5.715              switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
   5.716              {
   5.717              case PGT_l1_page_table: 
   5.718 +                ASSERT(!shadow_mode_enabled(d));
   5.719                  if ( likely(get_page_type(
   5.720                      page, type_info & (PGT_type_mask|PGT_va_mask))) )
   5.721                  {
   5.722                      okay = mod_l1_entry((l1_pgentry_t *)va, 
   5.723 -                                        mk_l1_pgentry(req.val)); 
   5.724 -
   5.725 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   5.726 -                         (get_shadow_status(d, page-frame_table) &
   5.727 -                          PSH_shadowed) )
   5.728 -                    {
   5.729 -                        shadow_l1_normal_pt_update(
   5.730 -                            req.ptr, req.val, &prev_smfn, &prev_spl1e);
   5.731 -                        put_shadow_status(d);
   5.732 -                    }
   5.733 -
   5.734 +                                        mk_l1_pgentry(req.val));
   5.735                      put_page_type(page);
   5.736                  }
   5.737                  break;
   5.738              case PGT_l2_page_table:
   5.739 +                ASSERT(!shadow_mode_enabled(d));
   5.740                  if ( likely(get_page_type(page, PGT_l2_page_table)) )
   5.741                  {
   5.742                      okay = mod_l2_entry((l2_pgentry_t *)va, 
   5.743                                          mk_l2_pgentry(req.val),
   5.744 -                                        pfn); 
   5.745 -
   5.746 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   5.747 -                         (get_shadow_status(d, page-frame_table) & 
   5.748 -                          PSH_shadowed) )
   5.749 -                    {
   5.750 -                        shadow_l2_normal_pt_update(req.ptr, req.val);
   5.751 -                        put_shadow_status(d);
   5.752 -                    }
   5.753 -
   5.754 +                                        mfn);
   5.755                      put_page_type(page);
   5.756                  }
   5.757                  break;
   5.758  #ifdef __x86_64__
   5.759              case PGT_l3_page_table:
   5.760 +                ASSERT(!shadow_mode_enabled(d));
   5.761                  if ( likely(get_page_type(page, PGT_l3_page_table)) )
   5.762                  {
   5.763                      okay = mod_l3_entry((l3_pgentry_t *)va, 
   5.764                                          mk_l3_pgentry(req.val),
   5.765 -                                        pfn); 
   5.766 -
   5.767 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   5.768 -                         (get_shadow_status(d, page-frame_table) & 
   5.769 -                          PSH_shadowed) )
   5.770 -                    {
   5.771 -                        /*XXXshadow_l3_normal_pt_update(req.ptr, req.val);*/
   5.772 -                        put_shadow_status(d);
   5.773 -                    }
   5.774 -
   5.775 +                                        mfn);
   5.776                      put_page_type(page);
   5.777                  }
   5.778                  break;
   5.779              case PGT_l4_page_table:
   5.780 +                ASSERT(!shadow_mode_enabled(d));
   5.781                  if ( likely(get_page_type(page, PGT_l4_page_table)) )
   5.782                  {
   5.783                      okay = mod_l4_entry((l4_pgentry_t *)va, 
   5.784                                          mk_l4_pgentry(req.val),
   5.785 -                                        pfn); 
   5.786 -
   5.787 -                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
   5.788 -                         (get_shadow_status(d, page-frame_table) & 
   5.789 -                          PSH_shadowed) )
   5.790 -                    {
   5.791 -                        /*XXXshadow_l4_normal_pt_update(req.ptr, req.val);*/
   5.792 -                        put_shadow_status(d);
   5.793 -                    }
   5.794 -
   5.795 +                                        mfn);
   5.796                      put_page_type(page);
   5.797                  }
   5.798                  break;
   5.799  #endif /* __x86_64__ */
   5.800              default:
   5.801 +                printk("do_mmu_update writable update: ma=%p val=%p\n",
   5.802 +                       req.ptr, req.val);
   5.803                  if ( likely(get_page_type(page, PGT_writable_page)) )
   5.804                  {
   5.805 +                    if ( shadow_mode_enabled(d) )
   5.806 +                    {
   5.807 +                        shadow_lock(d);
   5.808 +
   5.809 +                        if ( shadow_mode_log_dirty(d) )
   5.810 +                            __mark_dirty(d, mfn);
   5.811 +
   5.812 +                        if ( page_is_page_table(page) )
   5.813 +                            shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
   5.814 +                    }
   5.815 +
   5.816                      *(unsigned long *)va = req.val;
   5.817                      okay = 1;
   5.818 +
   5.819 +                    if ( shadow_mode_enabled(d) )
   5.820 +                        shadow_unlock(d);
   5.821 +
   5.822                      put_page_type(page);
   5.823                  }
   5.824                  break;
   5.825 @@ -1835,24 +1889,30 @@ int do_mmu_update(
   5.826              break;
   5.827  
   5.828          case MMU_MACHPHYS_UPDATE:
   5.829 -            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
   5.830 +            if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
   5.831              {
   5.832                  MEM_LOG("Could not get page for mach->phys update");
   5.833                  break;
   5.834              }
   5.835  
   5.836 -            machine_to_phys_mapping[pfn] = req.val;
   5.837 +            if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
   5.838 +            {
   5.839 +                MEM_LOG("can't mutate the m2p of translated guests");
   5.840 +                break;
   5.841 +            }
   5.842 +
   5.843 +            set_machinetophys(mfn, req.val);
   5.844              okay = 1;
   5.845  
   5.846              /*
   5.847 -             * If in log-dirty mode, mark the corresponding pseudo-physical
   5.848 +             * If in log-dirty mode, mark the corresponding
   5.849               * page as dirty.
   5.850               */
   5.851 -            if ( unlikely(shadow_mode_log_dirty(d)) && 
   5.852 -                 mark_dirty(d, pfn) )
   5.853 -                d->arch.shadow_dirty_block_count++;
   5.854 -
   5.855 -            put_page(&frame_table[pfn]);
   5.856 +            if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
   5.857 +                 mark_dirty(FOREIGNDOM, mfn) )
   5.858 +                FOREIGNDOM->arch.shadow_dirty_block_count++;
   5.859 +
   5.860 +            put_page(&frame_table[mfn]);
   5.861              break;
   5.862  
   5.863              /*
   5.864 @@ -1879,17 +1939,18 @@ int do_mmu_update(
   5.865      }
   5.866  
   5.867   out:
   5.868 -    if ( prev_pfn != 0 )
   5.869 +    if ( prev_mfn != 0 )
   5.870          unmap_domain_mem((void *)va);
   5.871  
   5.872 -    if ( unlikely(prev_spl1e != 0) ) 
   5.873 -        unmap_domain_mem((void *)prev_spl1e);
   5.874 -
   5.875      deferred_ops = percpu_info[cpu].deferred_ops;
   5.876      percpu_info[cpu].deferred_ops = 0;
   5.877  
   5.878      if ( deferred_ops & DOP_FLUSH_TLB )
   5.879 +    {
   5.880          local_flush_tlb();
   5.881 +        if ( shadow_mode_enabled(d) )
   5.882 +            shadow_sync_all(d);
   5.883 +    }
   5.884          
   5.885      if ( deferred_ops & DOP_RELOAD_LDT )
   5.886          (void)map_ldt_shadow_page(0);
   5.887 @@ -1905,7 +1966,7 @@ int do_mmu_update(
   5.888          __put_user(done + i, pdone);
   5.889  
   5.890      if ( unlikely(shadow_mode_enabled(d)) )
   5.891 -        check_pagetable(d, ed->arch.guest_table, "post-mmu"); /* debug */
   5.892 +        check_pagetable(ed, "post-mmu"); /* debug */
   5.893  
   5.894      UNLOCK_BIGLOCK(d);
   5.895      return rc;
   5.896 @@ -1924,12 +1985,9 @@ int do_update_va_mapping(unsigned long v
   5.897  
   5.898      perfc_incrc(calls_to_update_va);
   5.899  
   5.900 -    if ( unlikely(!__addr_ok(va)) )
   5.901 +    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
   5.902          return -EINVAL;
   5.903  
   5.904 -    if ( unlikely(shadow_mode_translate(d) ) )
   5.905 -        domain_crash();
   5.906 -
   5.907      LOCK_BIGLOCK(d);
   5.908  
   5.909      cleanup_writable_pagetable(d);
   5.910 @@ -1938,55 +1996,56 @@ int do_update_va_mapping(unsigned long v
   5.911       * XXX When we make this support 4MB superpages we should also deal with 
   5.912       * the case of updating L2 entries.
   5.913       */
   5.914 -
   5.915 -    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
   5.916 -                                mk_l1_pgentry(val))) )
   5.917 -        err = -EINVAL;
   5.918 -
   5.919 -    if ( unlikely(shadow_mode_enabled(d)) )
   5.920 +    if ( likely(!shadow_mode_enabled(d)) )
   5.921 +    {
   5.922 +        if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
   5.923 +                                    mk_l1_pgentry(val))) )
   5.924 +            err = -EINVAL;
   5.925 +    }
   5.926 +    else
   5.927      {
   5.928 -        unsigned long sval = 0;
   5.929 -
   5.930 -        l1pte_propagate_from_guest(d, &val, &sval);
   5.931 -
   5.932 -        if ( unlikely(__put_user(sval, ((unsigned long *)(
   5.933 -            &shadow_linear_pg_table[l1_linear_offset(va)])))) )
   5.934 +        if ( unlikely(percpu_info[cpu].foreign &&
   5.935 +                      (shadow_mode_translate(d) ||
   5.936 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
   5.937          {
   5.938 +            // The foreign domain's pfn's are in a different namespace.
   5.939 +            // We wouldn't be able to figure out how to (re-)shadow our
   5.940 +            // gpte without additional context.
   5.941 +            //
   5.942 +            domain_crash();
   5.943 +        }
   5.944 +    
   5.945 +        check_pagetable(ed, "pre-va"); /* debug */
   5.946 +        shadow_lock(d);
   5.947 +        
   5.948 +        // This is actually overkill - we don't need to sync the L1 itself,
   5.949 +        // just everything involved in getting to this L1 (i.e. we need
   5.950 +        // linear_pg_table[l1_linear_offset(va)] to be in sync)...
   5.951 +        //
   5.952 +        __shadow_sync_va(ed, va);
   5.953 +
   5.954 +        if ( unlikely(__put_user(val, &l1_pgentry_val(
   5.955 +                                     linear_pg_table[l1_linear_offset(va)]))) )
   5.956 +            err = -EINVAL;
   5.957 +        else
   5.958 +        {
   5.959 +            // also need to update the shadow
   5.960 +            unsigned long spte;
   5.961 +
   5.962 +            l1pte_propagate_from_guest(d, val, &spte);
   5.963 +            shadow_set_l1e(va, spte, 0);
   5.964 +
   5.965              /*
   5.966 -             * Since L2's are guranteed RW, failure indicates either that the
   5.967 -             * page was not shadowed, or that the L2 entry has not yet been
   5.968 -             * updated to reflect the shadow.
   5.969 +             * If we're in log-dirty mode then we need to note that we've updated
   5.970 +             * the PTE in the PT-holding page. We need the machine frame number
   5.971 +             * for this.
   5.972               */
   5.973 -            if ( shadow_mode_external(current->domain) )
   5.974 -                BUG(); // can't use linear_l2_table with external tables.
   5.975 -
   5.976 -            l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)];
   5.977 -            unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
   5.978 -
   5.979 -            if (get_shadow_status(d, gpfn))
   5.980 -            {
   5.981 -                unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
   5.982 -                unsigned long *gl1e = map_domain_mem(gmfn << PAGE_SHIFT);
   5.983 -                unsigned l1_idx = l1_table_offset(va);
   5.984 -                gl1e[l1_idx] = sval;
   5.985 -                unmap_domain_mem(gl1e);
   5.986 -                put_shadow_status(d);
   5.987 -
   5.988 -                perfc_incrc(shadow_update_va_fail1);
   5.989 -            }
   5.990 -            else
   5.991 -                perfc_incrc(shadow_update_va_fail2);
   5.992 +            if ( shadow_mode_log_dirty(d) )
   5.993 +                mark_dirty(d, va_to_l1mfn(ed, va));
   5.994 +
   5.995 +            shadow_unlock(d);
   5.996 +            check_pagetable(ed, "post-va"); /* debug */
   5.997          }
   5.998 -
   5.999 -        /*
  5.1000 -         * If we're in log-dirty mode then we need to note that we've updated
  5.1001 -         * the PTE in the PT-holding page. We need the machine frame number
  5.1002 -         * for this.
  5.1003 -         */
  5.1004 -        if ( shadow_mode_log_dirty(d) )
  5.1005 -            mark_dirty(d, va_to_l1mfn(va));
  5.1006 -  
  5.1007 -        check_pagetable(d, ed->arch.guest_table, "va"); /* debug */
  5.1008      }
  5.1009  
  5.1010      deferred_ops = percpu_info[cpu].deferred_ops;
  5.1011 @@ -1994,9 +2053,17 @@ int do_update_va_mapping(unsigned long v
  5.1012  
  5.1013      if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
  5.1014           unlikely(flags & UVMF_FLUSH_TLB) )
  5.1015 +    {
  5.1016          local_flush_tlb();
  5.1017 +        if ( unlikely(shadow_mode_enabled(d)) )
  5.1018 +            shadow_sync_all(d);
  5.1019 +    }
  5.1020      else if ( unlikely(flags & UVMF_INVLPG) )
  5.1021 +    {
  5.1022          __flush_tlb_one(va);
  5.1023 +        if ( unlikely(shadow_mode_enabled(d)) )
  5.1024 +            shadow_invlpg(current, va);
  5.1025 +    }
  5.1026  
  5.1027      if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
  5.1028          (void)map_ldt_shadow_page(0);
  5.1029 @@ -2067,6 +2134,8 @@ long set_gdt(struct exec_domain *ed,
  5.1030      if ( (pfn = frames[0]) >= max_page )
  5.1031          goto fail;
  5.1032  
  5.1033 +    shadow_sync_all(d);
  5.1034 +
  5.1035      /* The first page is special because Xen owns a range of entries in it. */
  5.1036      if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
  5.1037      {
  5.1038 @@ -2146,7 +2215,9 @@ long do_set_gdt(unsigned long *frame_lis
  5.1039  long do_update_descriptor(
  5.1040      unsigned long pa, unsigned long word1, unsigned long word2)
  5.1041  {
  5.1042 -    unsigned long pfn = pa >> PAGE_SHIFT;
  5.1043 +    struct domain *dom = current->domain;
  5.1044 +    unsigned long gpfn = pa >> PAGE_SHIFT;
  5.1045 +    unsigned long mfn;
  5.1046      struct desc_struct *gdt_pent, d;
  5.1047      struct pfn_info *page;
  5.1048      struct exec_domain *ed;
  5.1049 @@ -2155,16 +2226,21 @@ long do_update_descriptor(
  5.1050      d.a = (u32)word1;
  5.1051      d.b = (u32)word2;
  5.1052  
  5.1053 -    LOCK_BIGLOCK(current->domain);
  5.1054 -
  5.1055 -    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
  5.1056 -        UNLOCK_BIGLOCK(current->domain);
  5.1057 +    LOCK_BIGLOCK(dom);
  5.1058 +
  5.1059 +    if ( !(mfn = __gpfn_to_mfn(dom, gpfn)) ) {
  5.1060 +        UNLOCK_BIGLOCK(dom);
  5.1061          return -EINVAL;
  5.1062      }
  5.1063  
  5.1064 -    page = &frame_table[pfn];
  5.1065 -    if ( unlikely(!get_page(page, current->domain)) ) {
  5.1066 -        UNLOCK_BIGLOCK(current->domain);
  5.1067 +    if ( (pa & 7) || (mfn >= max_page) || !check_descriptor(&d) ) {
  5.1068 +        UNLOCK_BIGLOCK(dom);
  5.1069 +        return -EINVAL;
  5.1070 +    }
  5.1071 +
  5.1072 +    page = &frame_table[mfn];
  5.1073 +    if ( unlikely(!get_page(page, dom)) ) {
  5.1074 +        UNLOCK_BIGLOCK(dom);
  5.1075          return -EINVAL;
  5.1076      }
  5.1077  
  5.1078 @@ -2173,8 +2249,8 @@ long do_update_descriptor(
  5.1079      {
  5.1080      case PGT_gdt_page:
  5.1081          /* Disallow updates of Xen-reserved descriptors in the current GDT. */
  5.1082 -        for_each_exec_domain(current->domain, ed) {
  5.1083 -            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) &&
  5.1084 +        for_each_exec_domain(dom, ed) {
  5.1085 +            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == mfn) &&
  5.1086                   (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
  5.1087                   (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
  5.1088                  goto out;
  5.1089 @@ -2192,11 +2268,25 @@ long do_update_descriptor(
  5.1090          break;
  5.1091      }
  5.1092  
  5.1093 +    if ( shadow_mode_enabled(dom) )
  5.1094 +    {
  5.1095 +        shadow_lock(dom);
  5.1096 +
  5.1097 +        if ( shadow_mode_log_dirty(dom) )
  5.1098 +            __mark_dirty(dom, mfn);
  5.1099 +
  5.1100 +        if ( page_is_page_table(page) )
  5.1101 +            shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
  5.1102 +    }
  5.1103 +
  5.1104      /* All is good so make the update. */
  5.1105 -    gdt_pent = map_domain_mem(pa);
  5.1106 +    gdt_pent = map_domain_mem((mfn << PAGE_SHIFT) | (pa & ~PAGE_MASK));
  5.1107      memcpy(gdt_pent, &d, 8);
  5.1108      unmap_domain_mem(gdt_pent);
  5.1109  
  5.1110 +    if ( shadow_mode_enabled(dom) )
  5.1111 +        shadow_unlock(dom);
  5.1112 +
  5.1113      put_page_type(page);
  5.1114  
  5.1115      ret = 0; /* success */
  5.1116 @@ -2204,7 +2294,7 @@ long do_update_descriptor(
  5.1117   out:
  5.1118      put_page(page);
  5.1119  
  5.1120 -    UNLOCK_BIGLOCK(current->domain);
  5.1121 +    UNLOCK_BIGLOCK(dom);
  5.1122  
  5.1123      return ret;
  5.1124  }
  5.1125 @@ -2229,8 +2319,8 @@ int ptwr_debug = 0x0;
  5.1126  /* Flush the given writable p.t. page and write-protect it again. */
  5.1127  void ptwr_flush(const int which)
  5.1128  {
  5.1129 -    unsigned long  sstat, spte, pte, *ptep, l1va;
  5.1130 -    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
  5.1131 +    unsigned long  pte, *ptep, l1va;
  5.1132 +    l1_pgentry_t  *pl1e, ol1e, nl1e;
  5.1133      l2_pgentry_t  *pl2e;
  5.1134      int            i, cpu = smp_processor_id();
  5.1135      struct exec_domain *ed = current;
  5.1136 @@ -2239,6 +2329,9 @@ void ptwr_flush(const int which)
  5.1137      unsigned int   modified = 0;
  5.1138  #endif
  5.1139  
  5.1140 +    // not supported in combination with various shadow modes!
  5.1141 +    ASSERT( !shadow_mode_enabled(d) );
  5.1142 +    
  5.1143      l1va = ptwr_info[cpu].ptinfo[which].l1va;
  5.1144      ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
  5.1145  
  5.1146 @@ -2248,7 +2341,7 @@ void ptwr_flush(const int which)
  5.1147  
  5.1148      if ( unlikely(__get_user(pte, ptep)) )
  5.1149      {
  5.1150 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
  5.1151 +        MEM_LOG("ptwr: Could not read pte at %p", ptep);
  5.1152          /*
  5.1153           * Really a bug. We could read this PTE during the initial fault,
  5.1154           * and pagetables can't have changed meantime. XXX Multi-CPU guests?
  5.1155 @@ -2259,23 +2352,10 @@ void ptwr_flush(const int which)
  5.1156                  PTWR_PRINT_WHICH, ptep, pte);
  5.1157      pte &= ~_PAGE_RW;
  5.1158  
  5.1159 -    if ( unlikely(shadow_mode_enabled(d)) )
  5.1160 -    {
  5.1161 -        /* Write-protect the p.t. page in the shadow page table. */
  5.1162 -        l1pte_propagate_from_guest(d, &pte, &spte);
  5.1163 -        __put_user(spte, (unsigned long *)
  5.1164 -                   &shadow_linear_pg_table[l1_linear_offset(l1va)]);
  5.1165 -
  5.1166 -        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
  5.1167 -        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
  5.1168 -        if ( sstat & PSH_shadowed )
  5.1169 -            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
  5.1170 -    }
  5.1171 -
  5.1172      /* Write-protect the p.t. page in the guest page table. */
  5.1173      if ( unlikely(__put_user(pte, ptep)) )
  5.1174      {
  5.1175 -        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
  5.1176 +        MEM_LOG("ptwr: Could not update pte at %p", ptep);
  5.1177          /*
  5.1178           * Really a bug. We could write this PTE during the initial fault,
  5.1179           * and pagetables can't have changed meantime. XXX Multi-CPU guests?
  5.1180 @@ -2318,13 +2398,7 @@ void ptwr_flush(const int which)
  5.1181          if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
  5.1182          {
  5.1183              if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
  5.1184 -            {
  5.1185 -                if ( unlikely(sl1e != NULL) )
  5.1186 -                    l1pte_propagate_from_guest(
  5.1187 -                        d, &l1_pgentry_val(nl1e), 
  5.1188 -                        &l1_pgentry_val(sl1e[i]));
  5.1189                  put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]);
  5.1190 -            }
  5.1191              continue;
  5.1192          }
  5.1193  
  5.1194 @@ -2343,24 +2417,20 @@ void ptwr_flush(const int which)
  5.1195              domain_crash();
  5.1196          }
  5.1197          
  5.1198 -        if ( unlikely(sl1e != NULL) )
  5.1199 -            l1pte_propagate_from_guest(
  5.1200 -                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
  5.1201 -
  5.1202          if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
  5.1203              put_page_from_l1e(ol1e, d);
  5.1204      }
  5.1205      unmap_domain_mem(pl1e);
  5.1206 -
  5.1207 +    
  5.1208      perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
  5.1209  
  5.1210      /*
  5.1211       * STEP 3. Reattach the L1 p.t. page into the current address space.
  5.1212       */
  5.1213  
  5.1214 -    if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode_enabled(d)) )
  5.1215 +    if ( which == PTWR_PT_ACTIVE )
  5.1216      {
  5.1217 -        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
  5.1218 +        pl2e = &linear_l2_table(ed)[ptwr_info[cpu].ptinfo[which].l2_idx];
  5.1219          *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
  5.1220      }
  5.1221  
  5.1222 @@ -2369,23 +2439,21 @@ void ptwr_flush(const int which)
  5.1223       */
  5.1224  
  5.1225      ptwr_info[cpu].ptinfo[which].l1va = 0;
  5.1226 -
  5.1227 -    if ( unlikely(sl1e != NULL) )
  5.1228 -    {
  5.1229 -        unmap_domain_mem(sl1e);
  5.1230 -        put_shadow_status(d);
  5.1231 -    }
  5.1232  }
  5.1233  
  5.1234  /* Write page fault handler: check if guest is trying to modify a PTE. */
  5.1235  int ptwr_do_page_fault(unsigned long addr)
  5.1236  {
  5.1237 +    struct exec_domain *ed = current;
  5.1238      unsigned long    pte, pfn, l2e;
  5.1239      struct pfn_info *page;
  5.1240      l2_pgentry_t    *pl2e;
  5.1241      int              which, cpu = smp_processor_id();
  5.1242      u32              l2_idx;
  5.1243  
  5.1244 +    // not supported in combination with various shadow modes!
  5.1245 +    ASSERT( !shadow_mode_enabled(ed->domain) );
  5.1246 +    
  5.1247  #ifdef __x86_64__
  5.1248      return 0; /* Writable pagetables need fixing for x86_64. */
  5.1249  #endif
  5.1250 @@ -2394,10 +2462,7 @@ int ptwr_do_page_fault(unsigned long add
  5.1251       * Attempt to read the PTE that maps the VA being accessed. By checking for
  5.1252       * PDE validity in the L2 we avoid many expensive fixups in __get_user().
  5.1253       */
  5.1254 -    if ( shadow_mode_external(current->domain) )
  5.1255 -        BUG(); // can't use linear_l2_table with external tables.
  5.1256 -
  5.1257 -    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
  5.1258 +    if ( !(l2_pgentry_val(linear_l2_table(ed)[addr>>L2_PAGETABLE_SHIFT]) &
  5.1259             _PAGE_PRESENT) ||
  5.1260           __get_user(pte, (unsigned long *)
  5.1261                      &linear_pg_table[l1_linear_offset(addr)]) )
  5.1262 @@ -2425,7 +2490,7 @@ int ptwr_do_page_fault(unsigned long add
  5.1263  
  5.1264      if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
  5.1265      {
  5.1266 -        MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
  5.1267 +        MEM_LOG("PTWR failure! Pagetable maps itself at %p", addr);
  5.1268          domain_crash();
  5.1269      }
  5.1270  
  5.1271 @@ -2433,10 +2498,7 @@ int ptwr_do_page_fault(unsigned long add
  5.1272       * Is the L1 p.t. mapped into the current address space? If so we call it
  5.1273       * an ACTIVE p.t., otherwise it is INACTIVE.
  5.1274       */
  5.1275 -    if ( shadow_mode_external(current->domain) )
  5.1276 -        BUG(); // can't use linear_l2_table with external tables.
  5.1277 -
  5.1278 -    pl2e = &linear_l2_table[l2_idx];
  5.1279 +    pl2e = &linear_l2_table(ed)[l2_idx];
  5.1280      l2e  = l2_pgentry_val(*pl2e);
  5.1281      which = PTWR_PT_INACTIVE;
  5.1282      if ( (l2e >> PAGE_SHIFT) == pfn )
  5.1283 @@ -2472,8 +2534,7 @@ int ptwr_do_page_fault(unsigned long add
  5.1284      ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
  5.1285      
  5.1286      /* For safety, disconnect the L1 p.t. page from current space. */
  5.1287 -    if ( (which == PTWR_PT_ACTIVE) && 
  5.1288 -         likely(!shadow_mode_enabled(current->domain)) )
  5.1289 +    if ( which == PTWR_PT_ACTIVE )
  5.1290      {
  5.1291          *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
  5.1292  #if 1
  5.1293 @@ -2496,7 +2557,7 @@ int ptwr_do_page_fault(unsigned long add
  5.1294      if ( unlikely(__put_user(pte, (unsigned long *)
  5.1295                               &linear_pg_table[addr>>PAGE_SHIFT])) )
  5.1296      {
  5.1297 -        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
  5.1298 +        MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
  5.1299                  &linear_pg_table[addr>>PAGE_SHIFT]);
  5.1300          /* Toss the writable pagetable state and crash. */
  5.1301          unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
  5.1302 @@ -2542,7 +2603,7 @@ void ptwr_status(void)
  5.1303          [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
  5.1304  
  5.1305      if ( __get_user(pte, ptep) ) {
  5.1306 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
  5.1307 +        MEM_LOG("ptwr: Could not read pte at %p", ptep);
  5.1308          domain_crash();
  5.1309      }
  5.1310  
  5.1311 @@ -2558,7 +2619,7 @@ void ptwr_status(void)
  5.1312  
  5.1313      if ( __get_user(pte, (unsigned long *)
  5.1314                      ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
  5.1315 -        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
  5.1316 +        MEM_LOG("ptwr: Could not read pte at %p", (unsigned long *)
  5.1317                  ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
  5.1318          domain_crash();
  5.1319      }
  5.1320 @@ -2566,433 +2627,6 @@ void ptwr_status(void)
  5.1321      page = &frame_table[pfn];
  5.1322  }
  5.1323  
  5.1324 -void audit_domain(struct domain *d)
  5.1325 -{
  5.1326 -    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
  5.1327 -
  5.1328 -    void adjust (struct pfn_info *page, int dir, int adjtype)
  5.1329 -    {
  5.1330 -        int count = page->count_info & PGC_count_mask;
  5.1331 -
  5.1332 -        if ( adjtype )
  5.1333 -        {
  5.1334 -            int tcount = page->u.inuse.type_info & PGT_count_mask;
  5.1335 -            
  5.1336 -            ttot++;
  5.1337 -
  5.1338 -            tcount += dir;
  5.1339 -
  5.1340 -            if ( tcount < 0 )
  5.1341 -            {
  5.1342 -                /* This will only come out once. */
  5.1343 -                printk("Audit %d: type count whent below zero pfn=%x "
  5.1344 -                       "taf=%x otaf=%x\n",
  5.1345 -                       d->id, page-frame_table,
  5.1346 -                       page->u.inuse.type_info,
  5.1347 -                       page->tlbflush_timestamp);
  5.1348 -            }
  5.1349 -            
  5.1350 -            page->u.inuse.type_info =
  5.1351 -                (page->u.inuse.type_info & ~PGT_count_mask) | 
  5.1352 -                (tcount & PGT_count_mask);
  5.1353 -        }
  5.1354 -
  5.1355 -        ctot++;
  5.1356 -        count += dir;
  5.1357 -        if ( count < 0 )
  5.1358 -        {
  5.1359 -            /* This will only come out once. */
  5.1360 -            printk("Audit %d: general count whent below zero pfn=%x "
  5.1361 -                   "taf=%x otaf=%x\n",
  5.1362 -                   d->id, page-frame_table,
  5.1363 -                   page->u.inuse.type_info,
  5.1364 -                   page->tlbflush_timestamp);
  5.1365 -        }
  5.1366 -            
  5.1367 -        page->count_info =
  5.1368 -            (page->count_info & ~PGC_count_mask) | 
  5.1369 -            (count & PGC_count_mask);            
  5.1370 -
  5.1371 -    }
  5.1372 -
  5.1373 -    void scan_for_pfn(struct domain *d, unsigned long xpfn)
  5.1374 -    {
  5.1375 -        unsigned long pfn, *pt;
  5.1376 -        struct list_head *list_ent;
  5.1377 -        struct pfn_info *page;
  5.1378 -        int i;
  5.1379 -
  5.1380 -        list_ent = d->page_list.next;
  5.1381 -        for ( i = 0; (list_ent != &d->page_list); i++ )
  5.1382 -        {
  5.1383 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
  5.1384 -            page = &frame_table[pfn];
  5.1385 -            
  5.1386 -            switch ( page->u.inuse.type_info & PGT_type_mask )
  5.1387 -            {
  5.1388 -            case PGT_l1_page_table:
  5.1389 -            case PGT_l2_page_table:
  5.1390 -                pt = map_domain_mem(pfn<<PAGE_SHIFT);
  5.1391 -                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.1392 -                    if ( (pt[i] & _PAGE_PRESENT) &&
  5.1393 -                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
  5.1394 -                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
  5.1395 -                               d->id, i, pfn, page->u.inuse.type_info,
  5.1396 -                               page->count_info);
  5.1397 -                unmap_domain_mem(pt);           
  5.1398 -            }
  5.1399 -
  5.1400 -            list_ent = frame_table[pfn].list.next;
  5.1401 -        }
  5.1402 -
  5.1403 -    }
  5.1404 -
  5.1405 -    void scan_for_pfn_remote(unsigned long xpfn)
  5.1406 -    {
  5.1407 -        struct domain *e;
  5.1408 -        for_each_domain ( e )
  5.1409 -            scan_for_pfn( e, xpfn );            
  5.1410 -    }   
  5.1411 -
  5.1412 -    int i, l1, l2;
  5.1413 -    unsigned long pfn;
  5.1414 -    struct list_head *list_ent;
  5.1415 -    struct pfn_info *page;
  5.1416 -
  5.1417 -    if ( d != current->domain )
  5.1418 -        domain_pause(d);
  5.1419 -    synchronise_pagetables(~0UL);
  5.1420 -
  5.1421 -    printk("pt base=%lx sh_info=%x\n",
  5.1422 -           pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
  5.1423 -           virt_to_page(d->shared_info)-frame_table);
  5.1424 -           
  5.1425 -    spin_lock(&d->page_alloc_lock);
  5.1426 -
  5.1427 -    /* PHASE 0 */
  5.1428 -
  5.1429 -    list_ent = d->page_list.next;
  5.1430 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  5.1431 -    {
  5.1432 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
  5.1433 -        page = &frame_table[pfn];
  5.1434 -
  5.1435 -        if ( page_get_owner(page) != d )
  5.1436 -            BUG();
  5.1437 -
  5.1438 -        if ( (page->u.inuse.type_info & PGT_count_mask) >
  5.1439 -             (page->count_info & PGC_count_mask) )
  5.1440 -            printk("taf > caf %x %x pfn=%lx\n",
  5.1441 -                   page->u.inuse.type_info, page->count_info, pfn );
  5.1442 - 
  5.1443 -#if 0   /* SYSV shared memory pages plus writeable files. */
  5.1444 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
  5.1445 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
  5.1446 -        {
  5.1447 -            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
  5.1448 -                  pfn,
  5.1449 -                  page->u.inuse.type_info,
  5.1450 -                  page->count_info );
  5.1451 -            scan_for_pfn_remote(pfn);
  5.1452 -        }
  5.1453 -#endif
  5.1454 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
  5.1455 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
  5.1456 -        {
  5.1457 -            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
  5.1458 -                  pfn,
  5.1459 -                  page->u.inuse.type_info,
  5.1460 -                  page->count_info );
  5.1461 -        }
  5.1462 -
  5.1463 -        /* Use tlbflush_timestamp to store original type_info. */
  5.1464 -        page->tlbflush_timestamp = page->u.inuse.type_info;
  5.1465 -
  5.1466 -        list_ent = frame_table[pfn].list.next;
  5.1467 -    }
  5.1468 -
  5.1469 -
  5.1470 -    /* PHASE 1 */
  5.1471 -    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
  5.1472 -        adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.guest_table)
  5.1473 -                           >>PAGE_SHIFT], -1, 1);
  5.1474 -
  5.1475 -    list_ent = d->page_list.next;
  5.1476 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  5.1477 -    {
  5.1478 -        unsigned long *pt;
  5.1479 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
  5.1480 -        page = &frame_table[pfn];
  5.1481 -
  5.1482 -        if ( page_get_owner(page) != d )
  5.1483 -            BUG();
  5.1484 -
  5.1485 -        switch ( page->u.inuse.type_info & PGT_type_mask )
  5.1486 -        {
  5.1487 -        case PGT_l2_page_table:
  5.1488 -
  5.1489 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
  5.1490 -                printk("Audit %d: L2 not validated %x\n",
  5.1491 -                       d->id, page->u.inuse.type_info);
  5.1492 -
  5.1493 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
  5.1494 -                printk("Audit %d: L2 not pinned %x\n",
  5.1495 -                       d->id, page->u.inuse.type_info);
  5.1496 -            else
  5.1497 -                adjust( page, -1, 1 );
  5.1498 -           
  5.1499 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  5.1500 -
  5.1501 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  5.1502 -            {
  5.1503 -                if ( pt[i] & _PAGE_PRESENT )
  5.1504 -                {
  5.1505 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  5.1506 -                    struct pfn_info *l1page = &frame_table[l1pfn];
  5.1507 -
  5.1508 -                    if ( page_get_owner(l1page) != d )
  5.1509 -                    {
  5.1510 -                        printk("L2: Skip bizarre page belonging to other "
  5.1511 -                               "dom %p\n", page_get_owner(l1page));
  5.1512 -                        continue;
  5.1513 -                    }
  5.1514 -                    
  5.1515 -                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
  5.1516 -                         PGT_l2_page_table )
  5.1517 -                        printk("Audit %d: [%x] Found %s Linear PT "
  5.1518 -                               "t=%x pfn=%lx\n", d->id, i, 
  5.1519 -                               (l1pfn==pfn) ? "Self" : "Other",
  5.1520 -                               l1page->u.inuse.type_info,
  5.1521 -                               l1pfn);
  5.1522 -                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
  5.1523 -                              PGT_l1_page_table )
  5.1524 -                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
  5.1525 -                               d->id, i,
  5.1526 -                               l1page->u.inuse.type_info,
  5.1527 -                               l1pfn);
  5.1528 -
  5.1529 -                    adjust(l1page, -1, 1);
  5.1530 -                }
  5.1531 -            }
  5.1532 -
  5.1533 -            unmap_domain_mem(pt);
  5.1534 -
  5.1535 -            break;
  5.1536 -
  5.1537 -
  5.1538 -        case PGT_l1_page_table:
  5.1539 -            
  5.1540 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
  5.1541 -                adjust( page, -1, 1 );
  5.1542 -
  5.1543 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
  5.1544 -                printk("Audit %d: L1 not validated %x\n",
  5.1545 -                       d->id, page->u.inuse.type_info);
  5.1546 -#if 0
  5.1547 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
  5.1548 -                printk("Audit %d: L1 not pinned %x\n",
  5.1549 -                       d->id, page->u.inuse.type_info);
  5.1550 -#endif
  5.1551 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  5.1552 -
  5.1553 -            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.1554 -            {
  5.1555 -                if ( pt[i] & _PAGE_PRESENT )
  5.1556 -                {
  5.1557 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  5.1558 -                    struct pfn_info *l1page = &frame_table[l1pfn];
  5.1559 -
  5.1560 -                    if ( l1pfn < 0x100 )
  5.1561 -                    {
  5.1562 -                        lowmem_mappings++;
  5.1563 -                        continue;
  5.1564 -                    }
  5.1565 -
  5.1566 -                    if ( l1pfn > max_page )
  5.1567 -                    {
  5.1568 -                        io_mappings++;
  5.1569 -                        continue;
  5.1570 -                    }
  5.1571 -
  5.1572 -                    if ( pt[i] & _PAGE_RW )
  5.1573 -                    {
  5.1574 -
  5.1575 -                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
  5.1576 -                             PGT_l1_page_table ||
  5.1577 -                             (l1page->u.inuse.type_info & PGT_type_mask) ==
  5.1578 -                             PGT_l2_page_table )
  5.1579 -                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
  5.1580 -                                   d->id, i,
  5.1581 -                                   l1page->u.inuse.type_info,
  5.1582 -                                   l1pfn);
  5.1583 -
  5.1584 -                    }
  5.1585 -
  5.1586 -                    if ( page_get_owner(l1page) != d )
  5.1587 -                    {
  5.1588 -                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
  5.1589 -                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
  5.1590 -                               d->id, pfn, i,
  5.1591 -                               page_get_owner(l1page),
  5.1592 -                               l1pfn,
  5.1593 -                               l1page->count_info,
  5.1594 -                               l1page->u.inuse.type_info,
  5.1595 -                               machine_to_phys_mapping[l1pfn]);    
  5.1596 -                        continue;
  5.1597 -                    }
  5.1598 -
  5.1599 -                    adjust(l1page, -1, 0);
  5.1600 -                }
  5.1601 -            }
  5.1602 -
  5.1603 -            unmap_domain_mem(pt);
  5.1604 -
  5.1605 -            break;
  5.1606 -        }       
  5.1607 -
  5.1608 -        list_ent = frame_table[pfn].list.next;
  5.1609 -    }
  5.1610 -
  5.1611 -    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
  5.1612 -        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
  5.1613 -               d->id, lowmem_mappings, io_mappings);
  5.1614 -
  5.1615 -    /* PHASE 2 */
  5.1616 -
  5.1617 -    ctot = ttot = 0;
  5.1618 -    list_ent = d->page_list.next;
  5.1619 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  5.1620 -    {
  5.1621 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
  5.1622 -        page = &frame_table[pfn];
  5.1623 -
  5.1624 -        switch ( page->u.inuse.type_info & PGT_type_mask)
  5.1625 -        {
  5.1626 -        case PGT_l1_page_table:
  5.1627 -        case PGT_l2_page_table:
  5.1628 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
  5.1629 -            {
  5.1630 -                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
  5.1631 -                       d->id, page->u.inuse.type_info, 
  5.1632 -                       page->tlbflush_timestamp,
  5.1633 -                       page->count_info, pfn );
  5.1634 -                scan_for_pfn_remote(pfn);
  5.1635 -            }
  5.1636 -        default:
  5.1637 -            if ( (page->count_info & PGC_count_mask) != 1 )
  5.1638 -            {
  5.1639 -                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
  5.1640 -                       d->id, 
  5.1641 -                       page->count_info,
  5.1642 -                       page->u.inuse.type_info, 
  5.1643 -                       page->tlbflush_timestamp, pfn );
  5.1644 -                scan_for_pfn_remote(pfn);
  5.1645 -            }
  5.1646 -            break;
  5.1647 -        }
  5.1648 -
  5.1649 -        list_ent = frame_table[pfn].list.next;
  5.1650 -    }
  5.1651 -
  5.1652 -    /* PHASE 3 */
  5.1653 -    list_ent = d->page_list.next;
  5.1654 -    l1 = l2 = 0;
  5.1655 -    for ( i = 0; (list_ent != &d->page_list); i++ )
  5.1656 -    {
  5.1657 -        unsigned long *pt;
  5.1658 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
  5.1659 -        page = &frame_table[pfn];
  5.1660 -
  5.1661 -        switch ( page->u.inuse.type_info & PGT_type_mask )
  5.1662 -        {
  5.1663 -        case PGT_l2_page_table:
  5.1664 -	    l2++;
  5.1665 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
  5.1666 -                adjust( page, 1, 1 );          
  5.1667 -
  5.1668 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  5.1669 -
  5.1670 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  5.1671 -            {
  5.1672 -                if ( pt[i] & _PAGE_PRESENT )
  5.1673 -                {
  5.1674 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  5.1675 -                    struct pfn_info *l1page;
  5.1676 -
  5.1677 -                    if (l1pfn>max_page)
  5.1678 -                        continue;
  5.1679 -
  5.1680 -                    l1page = &frame_table[l1pfn];
  5.1681 -
  5.1682 -                    if ( page_get_owner(l1page) == d )
  5.1683 -                        adjust(l1page, 1, 1);
  5.1684 -                }
  5.1685 -            }
  5.1686 -
  5.1687 -            unmap_domain_mem(pt);
  5.1688 -            break;
  5.1689 -
  5.1690 -        case PGT_l1_page_table:
  5.1691 -	    l1++;
  5.1692 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
  5.1693 -                adjust( page, 1, 1 );
  5.1694 -
  5.1695 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
  5.1696 -
  5.1697 -            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  5.1698 -            {
  5.1699 -                if ( pt[i] & _PAGE_PRESENT )
  5.1700 -                {
  5.1701 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
  5.1702 -                    struct pfn_info *l1page;
  5.1703 -
  5.1704 -                    if (l1pfn>max_page)
  5.1705 -                        continue;
  5.1706 -
  5.1707 -                    l1page = &frame_table[l1pfn];
  5.1708 -
  5.1709 -                    if ( (page_get_owner(l1page) != d) ||
  5.1710 -                         (l1pfn < 0x100) || (l1pfn > max_page) )
  5.1711 -                        continue;
  5.1712 -
  5.1713 -                    adjust(l1page, 1, 0);
  5.1714 -                }
  5.1715 -            }
  5.1716 -
  5.1717 -            unmap_domain_mem(pt);
  5.1718 -            break;
  5.1719 -        }
  5.1720 -
  5.1721 -
  5.1722 -        page->tlbflush_timestamp = 0;
  5.1723 -
  5.1724 -        list_ent = frame_table[pfn].list.next;
  5.1725 -    }
  5.1726 -
  5.1727 -    spin_unlock(&d->page_alloc_lock);
  5.1728 -
  5.1729 -    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
  5.1730 -        adjust(&frame_table[pagetable_val(
  5.1731 -            d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
  5.1732 -
  5.1733 -    printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot );
  5.1734 -
  5.1735 -    if ( d != current->domain )
  5.1736 -        domain_unpause(d);
  5.1737 -}
  5.1738 -
  5.1739 -void audit_domains(void)
  5.1740 -{
  5.1741 -    struct domain *d;
  5.1742 -    for_each_domain ( d )
  5.1743 -        audit_domain(d);
  5.1744 -}
  5.1745 -
  5.1746 -void audit_domains_key(unsigned char key)
  5.1747 -{
  5.1748 -    audit_domains();
  5.1749 -}
  5.1750 -
  5.1751  #endif /* NDEBUG */
  5.1752  
  5.1753  /*
     6.1 --- a/xen/arch/x86/shadow.c	Tue Mar 15 15:53:52 2005 +0000
     6.2 +++ b/xen/arch/x86/shadow.c	Wed Mar 16 17:30:37 2005 +0000
     6.3 @@ -1,3 +1,23 @@
     6.4 +/******************************************************************************
     6.5 + * arch/x86/shadow.c
     6.6 + * 
     6.7 + * Copyright (c) 2005 Michael A Fetterman
     6.8 + * 
     6.9 + * This program is free software; you can redistribute it and/or modify
    6.10 + * it under the terms of the GNU General Public License as published by
    6.11 + * the Free Software Foundation; either version 2 of the License, or
    6.12 + * (at your option) any later version.
    6.13 + * 
    6.14 + * This program is distributed in the hope that it will be useful,
    6.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    6.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    6.17 + * GNU General Public License for more details.
    6.18 + * 
    6.19 + * You should have received a copy of the GNU General Public License
    6.20 + * along with this program; if not, write to the Free Software
    6.21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    6.22 + */
    6.23 +
    6.24  
    6.25  #include <xen/config.h>
    6.26  #include <xen/types.h>
    6.27 @@ -8,6 +28,10 @@
    6.28  #include <xen/event.h>
    6.29  #include <xen/trace.h>
    6.30  
    6.31 +static void shadow_free_snapshot(struct domain *d,
    6.32 +                                 struct out_of_sync_entry *entry);
    6.33 +static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
    6.34 +
    6.35  /********
    6.36  
    6.37  There's a per-domain shadow table spin lock which works fine for SMP
    6.38 @@ -20,34 +44,401 @@ hypercall lock anyhow (at least initiall
    6.39  
    6.40  ********/
    6.41  
    6.42 -static inline void free_shadow_page(
    6.43 -    struct domain *d, struct pfn_info *page)
    6.44 +static inline int
    6.45 +shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
    6.46 +               unsigned long new_type)
    6.47  {
    6.48 -    d->arch.shadow_page_count--;
    6.49 +    unsigned long min_type, max_type;
    6.50 +    struct pfn_info *page = pfn_to_page(gmfn);
    6.51 +    int pinned = 0, okay = 1;
    6.52 +
    6.53 +    if ( page_out_of_sync(page) )
    6.54 +    {
    6.55 +        // Don't know how long ago this snapshot was taken.
    6.56 +        // Can't trust it to be recent enough.
    6.57 +        //
    6.58 +        __shadow_sync_mfn(d, gmfn);
    6.59 +    }
    6.60 +
    6.61 +    if ( unlikely(mfn_is_page_table(gmfn)) )
    6.62 +    {
    6.63 +        min_type = shadow_max_pgtable_type(d, gpfn) + PGT_l1_shadow;
    6.64 +        max_type = new_type;
    6.65 +    }
    6.66 +    else
    6.67 +    {
    6.68 +        min_type = PGT_l1_shadow;
    6.69 +        max_type = PGT_l1_shadow;
    6.70 +    }
    6.71 +    FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p min=%p max=%p\n",
    6.72 +            gpfn, gmfn, new_type, min_type, max_type);
    6.73 +
    6.74 +    if ( min_type <= max_type )
    6.75 +        shadow_remove_all_write_access(d, min_type, max_type, gpfn);
    6.76 +
    6.77 +    // To convert this page for use as a page table, its writable count
    6.78 +    // should now be zero.  Test this by grabbing the page as a page table,
    6.79 +    // and then immediately releasing it.  This will also deal with any
    6.80 +    // necessary TLB flushing issues for us.
    6.81 +    //
    6.82 +    // The cruft here about pinning doesn't really work right.  This
    6.83 +    // needs rethinking/rewriting...  Need to gracefully deal with the
    6.84 +    // TLB flushes required when promoting a writable page, and also deal
    6.85 +    // with any outstanding (external) writable refs to this page (by
    6.86 +    // refusing to promote it).  The pinning headache complicates this
    6.87 +    // code -- it would all get much simpler if we stopped using
    6.88 +    // shadow_lock() and moved the shadow code to BIGLOCK().
    6.89 +    //
    6.90 +    if ( unlikely(!get_page(page, d)) )
    6.91 +        BUG();
    6.92 +    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
    6.93 +    {
    6.94 +        pinned = 1;
    6.95 +        put_page_and_type(page);
    6.96 +    }
    6.97 +    if ( get_page_type(page, PGT_base_page_table) )
    6.98 +    {
    6.99 +        put_page_type(page);
   6.100 +        set_bit(_PGC_page_table, &frame_table[gmfn].count_info);
   6.101 +    }
   6.102 +    else
   6.103 +    {
   6.104 +        printk("shadow_promote: get_page_type failed "
   6.105 +               "dom%d gpfn=%p gmfn=%p t=%x\n",
   6.106 +               d->id, gpfn, gmfn, new_type);
   6.107 +        okay = 0;
   6.108 +    }
   6.109 +
   6.110 +    // Now put the type back to writable...
   6.111 +    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
   6.112 +        BUG();
   6.113 +    if ( unlikely(pinned) )
   6.114 +    {
   6.115 +        if ( unlikely(test_and_set_bit(_PGT_pinned,
   6.116 +                                       &page->u.inuse.type_info)) )
   6.117 +            BUG(); // hmm... someone pinned this again?
   6.118 +    }
   6.119 +    else
   6.120 +        put_page_and_type(page);
   6.121 +
   6.122 +    return okay;
   6.123 +}
   6.124 +
   6.125 +static inline void
   6.126 +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
   6.127 +{
   6.128 +    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
   6.129 +
   6.130 +    if ( shadow_max_pgtable_type(d, gpfn) == PGT_none )
   6.131 +    {
   6.132 +        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
   6.133  
   6.134 -    switch ( page->u.inuse.type_info & PGT_type_mask )
   6.135 +        if ( page_out_of_sync(pfn_to_page(gmfn)) )
   6.136 +        {
   6.137 +            remove_out_of_sync_entries(d, gmfn);
   6.138 +        }
   6.139 +    }
   6.140 +}
   6.141 +
   6.142 +/*
   6.143 + * Things in shadow mode that collect get_page() refs to the domain's
   6.144 + * pages are:
   6.145 + * - PGC_allocated takes a gen count, just like normal.
   6.146 + * - A writable page can be pinned (paravirtualized guests may consider
   6.147 + *   these pages to be L1s or L2s, and don't know the difference).
   6.148 + *   Pinning a page takes a gen count (but, for domains in shadow mode,
   6.149 + *   it *doesn't* take a type count)
   6.150 + * - CR3 grabs a ref to whatever it points at, just like normal.
   6.151 + * - Shadow mode grabs an initial gen count for itself, as a placeholder
   6.152 + *   for whatever references will exist.
   6.153 + * - Shadow PTEs that point to a page take a gen count, just like regular
   6.154 + *   PTEs.  However, they don't get a type count, as get_page_type() is
   6.155 + *   hardwired to keep writable pages' counts at 1 for domains in shadow
   6.156 + *   mode.
   6.157 + * - Whenever we shadow a page, the entry in the shadow hash grabs a
   6.158 + *   general ref to the page.
   6.159 + * - Whenever a page goes out of sync, the out of sync entry grabs a
   6.160 + *   general ref to the page.
   6.161 + */
   6.162 +/*
   6.163 + * pfn_info fields for pages allocated as shadow pages:
   6.164 + *
   6.165 + * All 32 bits of count_info are a simple count of refs to this shadow
   6.166 + * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
   6.167 + * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
   6.168 + * references.
   6.169 + *
   6.170 + * u.inuse._domain is left NULL, to prevent accidentally allowing some random
   6.171 + * domain to gain permission to map this page.
   6.172 + *
   6.173 + * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
   6.174 + * shadowed.
   6.175 + * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
   6.176 + * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
   6.177 + * currently exists because this is a shadow of a root page, and we
   6.178 + * don't want to let it disappear just because no CR3 is currently pointing
   6.179 + * at it.
   6.180 + *
   6.181 + * tlbflush_timestamp holds a pickled pointer to the domain.
   6.182 + */
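A minimal sketch, not part of the changeset itself, of how this encoding can be read back; it mirrors the decoding that free_shadow_page() performs below and assumes the pfn_info fields and unpickle_domptr() used in this patch are in scope (the helper names are illustrative only):

    /* Illustrative helpers -- same decoding as free_shadow_page(). */
    static inline struct domain *shadow_page_owner(struct pfn_info *page)
    {
        /* tlbflush_timestamp holds a pickled pointer to the owning domain. */
        return unpickle_domptr(page->tlbflush_timestamp);
    }

    static inline unsigned long shadow_page_gmfn(struct pfn_info *page)
    {
        /* PGT_mfn_mask remembers which guest mfn this shadow describes. */
        return page->u.inuse.type_info & PGT_mfn_mask;
    }

    static inline unsigned long shadow_page_type(struct pfn_info *page)
    {
        /* PGT_type_mask remembers what kind of guest page is shadowed. */
        return page->u.inuse.type_info & PGT_type_mask;
    }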
   6.183 +
   6.184 +static inline unsigned long
   6.185 +alloc_shadow_page(struct domain *d,
   6.186 +                  unsigned long gpfn, unsigned long gmfn,
   6.187 +                  u32 psh_type)
   6.188 +{
   6.189 +    struct pfn_info *page;
   6.190 +    unsigned long smfn;
   6.191 +    int pin = 0;
   6.192 +
   6.193 +    if ( (psh_type != PGT_snapshot) &&
   6.194 +         !shadow_promote(d, gpfn, gmfn, psh_type) )
   6.195      {
   6.196 -    case PGT_l1_page_table:
   6.197 -        perfc_decr(shadow_l1_pages);
   6.198 +        FSH_LOG("promotion of pfn=%p mfn=%p failed!  external gnttab refs?\n",
   6.199 +                gpfn, gmfn);
   6.200 +        return 0;
   6.201 +    }
   6.202 +
   6.203 +    page = alloc_domheap_page(NULL);
   6.204 +    if ( unlikely(page == NULL) )
   6.205 +    {
   6.206 +        printk("Couldn't alloc shadow page! dom%d count=%d\n",
   6.207 +               d->id, d->arch.shadow_page_count);
   6.208 +        printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
   6.209 +               perfc_value(shadow_l1_pages), 
   6.210 +               perfc_value(shadow_l2_pages),
   6.211 +               perfc_value(hl2_table_pages),
   6.212 +               perfc_value(snapshot_pages));
   6.213 +        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
   6.214 +    }
   6.215 +
   6.216 +    smfn = page_to_pfn(page);
   6.217 +
   6.218 +    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
   6.219 +    page->u.inuse.type_info = psh_type | gmfn;
   6.220 +    page->count_info = 0;
   6.221 +    page->tlbflush_timestamp = pickle_domptr(d);
   6.222 +
   6.223 +    switch ( psh_type )
   6.224 +    {
   6.225 +    case PGT_l1_shadow:
   6.226 +        perfc_incr(shadow_l1_pages);
   6.227 +        d->arch.shadow_page_count++;
   6.228          break;
   6.229  
   6.230 -    case PGT_l2_page_table:
   6.231 -        perfc_decr(shadow_l2_pages);
   6.232 +    case PGT_l2_shadow:
   6.233 +        perfc_incr(shadow_l2_pages);
   6.234 +        d->arch.shadow_page_count++;
   6.235 +        if ( PGT_l2_page_table == PGT_root_page_table )
   6.236 +            pin = 1;
   6.237 +
   6.238 +        break;
   6.239 +
   6.240 +    case PGT_hl2_shadow:
   6.241 +        perfc_incr(hl2_table_pages);
   6.242 +        d->arch.hl2_page_count++;
   6.243 +
   6.244 +        // treat an hl2 as an L1 for purposes of promotion,
   6.245 +        // and as an L2 for purposes of pinning.
   6.246 +        //
   6.247 +        if ( PGT_l2_page_table == PGT_root_page_table )
   6.248 +            pin = 1;
   6.249 +
   6.250 +        break;
   6.251 +
   6.252 +    case PGT_snapshot:
   6.253 +        perfc_incr(snapshot_pages);
   6.254 +        d->arch.snapshot_page_count++;
   6.255          break;
   6.256  
   6.257      default:
   6.258 -        printk("Free shadow weird page type pfn=%08x type=%08x\n",
   6.259 -               frame_table-page, page->u.inuse.type_info);
   6.260 +        printk("Alloc shadow weird page type type=%08x\n", psh_type);
   6.261 +        BUG();
   6.262          break;
   6.263      }
   6.264  
   6.265 +    set_shadow_status(d, gpfn, smfn, psh_type);
   6.266 +
   6.267 +    if ( pin )
   6.268 +        shadow_pin(smfn);
   6.269 +
   6.270 +    return smfn;
   6.271 +}
   6.272 +
   6.273 +static void inline
   6.274 +free_shadow_l1_table(struct domain *d, unsigned long smfn)
   6.275 +{
   6.276 +    l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
   6.277 +    int i;
   6.278 +
   6.279 +    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   6.280 +        put_page_from_l1e(pl1e[i], d);
   6.281 +
   6.282 +    unmap_domain_mem(pl1e);
   6.283 +}
   6.284 +
   6.285 +static void inline
   6.286 +free_shadow_hl2_table(struct domain *d, unsigned long smfn)
   6.287 +{
   6.288 +    printk("free_shadow_hl2_table(smfn=%p)\n", smfn);
   6.289 +
   6.290 +    l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
   6.291 +    int i, limit;
   6.292 +
   6.293 +    if ( shadow_mode_external(d) )
   6.294 +        limit = L2_PAGETABLE_ENTRIES;
   6.295 +    else
   6.296 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   6.297 +
   6.298 +    for ( i = 0; i < limit; i++ )
   6.299 +        put_page_from_l1e(pl1e[i], d);
   6.300 +
   6.301 +    unmap_domain_mem(pl1e);
   6.302 +}
   6.303 +
   6.304 +static void inline
   6.305 +free_shadow_l2_table(struct domain *d, unsigned long smfn)
   6.306 +{
   6.307 +    printk("free_shadow_l2_table(smfn=%p)\n", smfn);
   6.308 +
   6.309 +    unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
   6.310 +    int i, external = shadow_mode_external(d);
   6.311 +
   6.312 +    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
   6.313 +        if ( external || is_guest_l2_slot(i) )
   6.314 +            if ( pl2e[i] & _PAGE_PRESENT )
   6.315 +                put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
   6.316 +
   6.317 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
   6.318 +         shadow_mode_translate(d) &&
   6.319 +         !shadow_mode_external(d) )
   6.320 +    {
   6.321 +        // free the ref to the hl2
   6.322 +        //
   6.323 +        put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
   6.324 +                       >> PAGE_SHIFT);
   6.325 +    }
   6.326 +
   6.327 +    unmap_domain_mem(pl2e);
   6.328 +}
   6.329 +
   6.330 +void free_shadow_page(unsigned long smfn)
   6.331 +{
   6.332 +    struct pfn_info *page = &frame_table[smfn];
   6.333 +    struct domain *d = unpickle_domptr(page->tlbflush_timestamp);
   6.334 +    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
   6.335 +    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
   6.336 +    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
   6.337 +
   6.338 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
   6.339 +
   6.340 +    delete_shadow_status(d, gpfn, type);
   6.341 +
   6.342 +    switch ( type )
   6.343 +    {
   6.344 +    case PGT_l1_shadow:
   6.345 +        perfc_decr(shadow_l1_pages);
   6.346 +        shadow_demote(d, gpfn, gmfn);
   6.347 +        free_shadow_l1_table(d, smfn);
   6.348 +        break;
   6.349 +
   6.350 +    case PGT_l2_shadow:
   6.351 +        perfc_decr(shadow_l2_pages);
   6.352 +        shadow_demote(d, gpfn, gmfn);
   6.353 +        free_shadow_l2_table(d, smfn);
   6.354 +        break;
   6.355 +
   6.356 +    case PGT_hl2_shadow:
   6.357 +        perfc_decr(hl2_table_pages);
   6.358 +        shadow_demote(d, gpfn, gmfn);
   6.359 +        free_shadow_hl2_table(d, smfn);
   6.360 +        break;
   6.361 +
   6.362 +    case PGT_snapshot:
   6.363 +        perfc_decr(snapshot_pages);
   6.364 +        break;
   6.365 +
   6.366 +    default:
   6.367 +        printk("Free shadow weird page type mfn=%08x type=%08x\n",
   6.368 +               page-frame_table, page->u.inuse.type_info);
   6.369 +        break;
   6.370 +    }
   6.371 +
   6.372 +    d->arch.shadow_page_count--;
   6.373 +
   6.374 +    // No TLB flushes are needed the next time this page gets allocated.
   6.375 +    //
   6.376 +    page->tlbflush_timestamp = 0;
   6.377 +    page->u.free.cpu_mask = 0;
   6.378 +
   6.379      free_domheap_page(page);
   6.380  }
   6.381  
   6.382 -void free_shadow_state(struct domain *d)
   6.383 +static void inline
   6.384 +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
   6.385 +{
   6.386 +    struct pfn_info *page;
   6.387 +
   6.388 +    page = &frame_table[entry->gmfn];
   6.389 +        
   6.390 +    // Decrement ref count of guest & shadow pages
   6.391 +    //
   6.392 +    put_page(page);
   6.393 +
   6.394 +    // Only use entries that have low bits clear...
   6.395 +    //
   6.396 +    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
   6.397 +        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
   6.398 +
   6.399 +    // Free the snapshot
   6.400 +    //
   6.401 +    shadow_free_snapshot(d, entry);
   6.402 +}
   6.403 +
   6.404 +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
   6.405 +{
   6.406 +    struct out_of_sync_entry *entry = d->arch.out_of_sync;
   6.407 +    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
   6.408 +
   6.409 +    while ( entry )
   6.410 +    {
   6.411 +        if ( entry->gmfn == gmfn )
   6.412 +        {
   6.413 +            release_out_of_sync_entry(d, entry);
   6.414 +            *prev = entry = entry->next;
   6.415 +            continue;
   6.416 +        }
   6.417 +        prev = &entry->next;
   6.418 +        entry = entry->next;
   6.419 +    }
   6.420 +}
   6.421 +
   6.422 +static void free_out_of_sync_state(struct domain *d)
   6.423 +{
   6.424 +    struct out_of_sync_entry *entry;
   6.425 +    struct out_of_sync_entry **tail = NULL;
   6.426 +
   6.427 +    // Add the list of out-of-sync entries to the free list of entries.
   6.428 +    // Not the smartest code.  But it works.
   6.429 +    //
   6.430 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
   6.431 +    {
   6.432 +        release_out_of_sync_entry(d, entry);
   6.433 +        tail = &entry->next;
   6.434 +    }
   6.435 +    if ( tail )
   6.436 +    {
   6.437 +        *tail = d->arch.out_of_sync_free;
   6.438 +        d->arch.out_of_sync_free = d->arch.out_of_sync;
   6.439 +        d->arch.out_of_sync = NULL;
   6.440 +    }
   6.441 +}
   6.442 +
   6.443 +static void free_shadow_pages(struct domain *d)
   6.444  {
   6.445      int                   i, free = 0;
   6.446      struct shadow_status *x, *n;
   6.447 +    struct exec_domain   *e;
   6.448   
   6.449      /*
   6.450       * WARNING! The shadow page table must not currently be in use!
   6.451 @@ -58,21 +449,37 @@ void free_shadow_state(struct domain *d)
   6.452  
   6.453      if( !d->arch.shadow_ht ) return;
   6.454  
   6.455 -    /* Free each hash chain in turn. */
   6.456 +    // first, remove any outstanding refs from out_of_sync entries...
   6.457 +    //
   6.458 +    free_out_of_sync_state(d);
   6.459 +
   6.460 +    // second, remove any outstanding refs from ed->arch.shadow_table...
   6.461 +    //
   6.462 +    for_each_exec_domain(d, e)
   6.463 +    {
   6.464 +        if ( pagetable_val(e->arch.shadow_table) )
   6.465 +        {
   6.466 +            put_shadow_ref(pagetable_val(e->arch.shadow_table) >> PAGE_SHIFT);
   6.467 +            e->arch.shadow_table = mk_pagetable(0);
   6.468 +        }
   6.469 +    }
   6.470 +
   6.471 +    // Now, the only refs to shadow pages that are left are from the shadow
   6.472 +    // pages themselves.  We can just free them.
   6.473 +    //
   6.474      for ( i = 0; i < shadow_ht_buckets; i++ )
   6.475      {
   6.476          /* Skip empty buckets. */
   6.477          x = &d->arch.shadow_ht[i];
   6.478 -        if ( x->pfn == 0 )
   6.479 +        if ( x->gpfn_and_flags == 0 )
   6.480              continue;
   6.481  
   6.482          /* Free the head page. */
   6.483 -        free_shadow_page(
   6.484 -            d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
   6.485 +        free_shadow_page(x->smfn);
   6.486  
   6.487          /* Reinitialise the head node. */
   6.488 -        x->pfn            = 0;
   6.489 -        x->smfn_and_flags = 0;
   6.490 +        x->gpfn_and_flags = 0;
   6.491 +        x->smfn           = 0;
   6.492          n                 = x->next;
   6.493          x->next           = NULL;
   6.494  
   6.495 @@ -82,16 +489,15 @@ void free_shadow_state(struct domain *d)
   6.496          for ( x = n; x != NULL; x = n )
   6.497          { 
   6.498              /* Free the shadow page. */
   6.499 -            free_shadow_page(
   6.500 -                d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
   6.501 +            free_shadow_page(x->smfn);
   6.502  
   6.503              /* Re-initialise the chain node. */
   6.504 -            x->pfn            = 0;
   6.505 -            x->smfn_and_flags = 0;
   6.506 +            x->gpfn_and_flags = 0;
   6.507 +            x->smfn           = 0;
   6.508  
   6.509              /* Add to the free list. */
   6.510 -            n                 = x->next;
   6.511 -            x->next           = d->arch.shadow_ht_free;
   6.512 +            n       = x->next;
   6.513 +            x->next = d->arch.shadow_ht_free;
   6.514              d->arch.shadow_ht_free = x;
   6.515  
   6.516              free++;
   6.517 @@ -103,80 +509,140 @@ void free_shadow_state(struct domain *d)
   6.518      SH_LOG("Free shadow table. Freed=%d.", free);
   6.519  }
   6.520  
   6.521 -static inline int clear_shadow_page(
   6.522 -    struct domain *d, struct shadow_status *x)
   6.523 -{
   6.524 -    unsigned long   *p;
   6.525 -    int              restart = 0;
   6.526 -    struct pfn_info *spage = &frame_table[x->smfn_and_flags & PSH_pfn_mask];
   6.527 -
   6.528 -    // We don't clear hl2_table's here.  At least not yet.
   6.529 -    if ( x->pfn & PSH_hl2 )
   6.530 -        return 0;
   6.531 -
   6.532 -    switch ( spage->u.inuse.type_info & PGT_type_mask )
   6.533 -    {
   6.534 -        /* We clear L2 pages by zeroing the guest entries. */
   6.535 -    case PGT_l2_page_table:
   6.536 -        p = map_domain_mem((spage - frame_table) << PAGE_SHIFT);
   6.537 -        if ( shadow_mode_external(d) )
   6.538 -            memset(p, 0, L2_PAGETABLE_ENTRIES * sizeof(*p));
   6.539 -        else 
   6.540 -            memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
   6.541 -        unmap_domain_mem(p);
   6.542 -        break;
   6.543 -
   6.544 -        /* We clear L1 pages by freeing them: no benefit from zeroing them. */
   6.545 -    case PGT_l1_page_table:
   6.546 -        delete_shadow_status(d, x->pfn);
   6.547 -        free_shadow_page(d, spage);
   6.548 -        restart = 1; /* We need to go to start of list again. */
   6.549 -        break;
   6.550 -    }
   6.551 -
   6.552 -    return restart;
   6.553 -}
   6.554 -
   6.555 -static void clear_shadow_state(struct domain *d)
   6.556 -{
   6.557 -    int                   i;
   6.558 -    struct shadow_status *x;
   6.559 - 
   6.560 -    shadow_audit(d, 1);
   6.561 -
   6.562 -    for ( i = 0; i < shadow_ht_buckets; i++ )
   6.563 -    {
   6.564 -    retry:
   6.565 -        /* Skip empty buckets. */
   6.566 -        x = &d->arch.shadow_ht[i];
   6.567 -        if ( x->pfn == 0 )
   6.568 -            continue;
   6.569 -
   6.570 -        if ( clear_shadow_page(d, x) )
   6.571 -            goto retry;
   6.572 -
   6.573 -        for ( x = x->next; x != NULL; x = x->next )
   6.574 -            if ( clear_shadow_page(d, x) )
   6.575 -                goto retry;
   6.576 -
   6.577 -        shadow_audit(d, 0);
   6.578 -    }
   6.579 -
   6.580 -    SH_VLOG("Scan shadow table. l1=%d l2=%d",
   6.581 -            perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
   6.582 -}
   6.583 -
   6.584 -
   6.585  void shadow_mode_init(void)
   6.586  {
   6.587  }
   6.588  
   6.589 +static void alloc_monitor_pagetable(struct exec_domain *ed)
   6.590 +{
   6.591 +    unsigned long mmfn;
   6.592 +    l2_pgentry_t *mpl2e;
   6.593 +    struct pfn_info *mmfn_info;
   6.594 +    struct domain *d = ed->domain;
   6.595 +
   6.596 +    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
   6.597 +
   6.598 +    mmfn_info = alloc_domheap_page(NULL);
   6.599 +    ASSERT( mmfn_info ); 
   6.600 +
   6.601 +    mmfn = (unsigned long) (mmfn_info - frame_table);
   6.602 +    mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
   6.603 +    memset(mpl2e, 0, PAGE_SIZE);
   6.604 +
   6.605 +    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   6.606 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   6.607 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   6.608 +
   6.609 +    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
   6.610 +        mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
   6.611 +                      | __PAGE_HYPERVISOR);
   6.612 +
   6.613 +    // map the phys_to_machine map into the Read-Only MPT space for this domain
   6.614 +    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
   6.615 +        mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
   6.616 +
   6.617 +    ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
   6.618 +    ed->arch.monitor_vtable = mpl2e;
   6.619 +}
   6.620 +
   6.621 +/*
   6.622 + * Free the pages for monitor_table and hl2_table
   6.623 + */
   6.624 +void free_monitor_pagetable(struct exec_domain *ed)
   6.625 +{
   6.626 +    l2_pgentry_t *mpl2e, hl2e;
   6.627 +    unsigned long mfn;
   6.628 +
   6.629 +    ASSERT( pagetable_val(ed->arch.monitor_table) );
   6.630 +    ASSERT( shadow_mode_external(ed->domain) );
   6.631 +    
   6.632 +    mpl2e = ed->arch.monitor_vtable;
   6.633 +
   6.634 +    /*
   6.635 +     * First get the mfn for hl2_table by looking at monitor_table
   6.636 +     */
   6.637 +    hl2e = mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT];
   6.638 +    ASSERT(l2_pgentry_val(hl2e) & _PAGE_PRESENT);
   6.639 +    mfn = l2_pgentry_val(hl2e) >> PAGE_SHIFT;
   6.640 +    ASSERT(mfn);
   6.641 +
   6.642 +    put_shadow_ref(mfn);
   6.643 +    unmap_domain_mem(mpl2e);
   6.644 +
   6.645 +    /*
   6.646 +     * Then free monitor_table.
   6.647 +     */
   6.648 +    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
   6.649 +    free_domheap_page(&frame_table[mfn]);
   6.650 +
   6.651 +    ed->arch.monitor_table = mk_pagetable(0);
   6.652 +    ed->arch.monitor_vtable = 0;
   6.653 +}
   6.654  
   6.655  int __shadow_mode_enable(struct domain *d, unsigned int mode)
   6.656  {
   6.657 -    d->arch.shadow_mode = mode;
   6.658 +    struct exec_domain *ed;
   6.659 +
   6.660 +    for_each_exec_domain(d, ed)
   6.661 +    {
   6.662 +        invalidate_shadow_ldt(ed);
   6.663 +
   6.664 +        // We need to set these up for __update_pagetables().
   6.665 +        // See the comment there.
   6.666 +
   6.667 +        /*
   6.668 +         * arch.guest_vtable
   6.669 +         */
   6.670 +        if ( ed->arch.guest_vtable &&
   6.671 +             (ed->arch.guest_vtable != __linear_l2_table) )
   6.672 +        {
   6.673 +            unmap_domain_mem(ed->arch.guest_vtable);
   6.674 +        }
   6.675 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   6.676 +            ed->arch.guest_vtable = __linear_l2_table;
   6.677 +        else
   6.678 +            ed->arch.guest_vtable = NULL;
   6.679  
   6.680 -    if (!d->arch.shadow_ht)
   6.681 +        /*
   6.682 +         * arch.shadow_vtable
   6.683 +         */
   6.684 +        if ( ed->arch.shadow_vtable &&
   6.685 +             (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
   6.686 +        {
   6.687 +            unmap_domain_mem(ed->arch.shadow_vtable);
   6.688 +        }
   6.689 +        if ( !(mode & SHM_external) )
   6.690 +            ed->arch.shadow_vtable = __shadow_linear_l2_table;
   6.691 +        else
   6.692 +            ed->arch.shadow_vtable = NULL;
   6.693 +
   6.694 +        /*
   6.695 +         * arch.hl2_vtable
   6.696 +         */
   6.697 +        if ( ed->arch.hl2_vtable &&
   6.698 +             (ed->arch.hl2_vtable != __linear_hl2_table) )
   6.699 +        {
   6.700 +            unmap_domain_mem(ed->arch.hl2_vtable);
   6.701 +        }
   6.702 +        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
   6.703 +            ed->arch.hl2_vtable = __linear_hl2_table;
   6.704 +        else
   6.705 +            ed->arch.hl2_vtable = NULL;
   6.706 +
   6.707 +        /*
   6.708 +         * arch.monitor_table & arch.monitor_vtable
   6.709 +         */
   6.710 +        if ( ed->arch.monitor_vtable )
   6.711 +        {
   6.712 +            free_monitor_pagetable(ed);
   6.713 +        }
   6.714 +        if ( mode & SHM_external )
   6.715 +        {
   6.716 +            alloc_monitor_pagetable(ed);
   6.717 +        }
   6.718 +    }
   6.719 +
   6.720 +    if ( !d->arch.shadow_ht )
   6.721      {
   6.722          d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
   6.723          if ( d->arch.shadow_ht == NULL )
   6.724 @@ -186,7 +652,7 @@ int __shadow_mode_enable(struct domain *
   6.725             shadow_ht_buckets * sizeof(struct shadow_status));
   6.726      }
   6.727  
   6.728 -    if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap)
   6.729 +    if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap )
   6.730      {
   6.731          d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
   6.732          d->arch.shadow_dirty_bitmap = 
   6.733 @@ -201,6 +667,63 @@ int __shadow_mode_enable(struct domain *
   6.734                 d->arch.shadow_dirty_bitmap_size/8);
   6.735      }
   6.736  
   6.737 +    printk("audit1\n");
   6.738 +    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__);
   6.739 +    printk("audit1 done\n");
   6.740 +
   6.741 +    // Get rid of any shadow pages from any previous shadow mode.
   6.742 +    //
   6.743 +    free_shadow_pages(d);
   6.744 +
   6.745 +    printk("audit2\n");
   6.746 +    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__);
   6.747 +    printk("audit2 done\n");
   6.748 +
   6.749 +    // Turn off writable page tables.
   6.750 +    // They don't mix with shadow mode.
   6.751 +    //
   6.752 +    vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
   6.753 +
   6.754 +    /*
   6.755 +     * Tear down its counts by disassembling its page-table-based ref counts.
   6.756 +     * Also remove CR3's gcount/tcount.
   6.757 +     * That leaves things like GDTs and LDTs and external refs intact.
   6.758 +     *
   6.759 +     * Most pages will be writable tcount=0.
   6.760 +     * Some will still be L1 tcount=0 or L2 tcount=0.
   6.761 +     * Maybe some pages will be type none tcount=0.
   6.762 +     * Pages granted external writable refs (via grant tables?) will
   6.763 +     * still have a non-zero tcount.  That's OK.
   6.764 +     *
   6.765 +     * gcounts will generally be 1 for PGC_allocated.
   6.766 +     * GDTs and LDTs will have additional gcounts.
   6.767 +     * Any grant-table based refs will still be in the gcount.
   6.768 +     *
   6.769 +     * We attempt to grab writable refs to each page (thus setting its type).
   6.770 +     * Immediately put back those type refs.
   6.771 +     *
   6.772 +     * Assert that no pages are left with L1/L2/L3/L4 type.
   6.773 +     */
   6.774 +    audit_adjust_pgtables(d, -1, 1);
   6.775 +    d->arch.shadow_mode = mode;
   6.776 +
   6.777 +    struct list_head *list_ent = d->page_list.next;
   6.778 +    while ( list_ent != &d->page_list )
   6.779 +    {
   6.780 +        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
   6.781 +        if ( !get_page_type(page, PGT_writable_page) )
   6.782 +            BUG();
   6.783 +        put_page_type(page);
   6.784 +
   6.785 +        list_ent = page->list.next;
   6.786 +    }
   6.787 +
   6.788 +    audit_adjust_pgtables(d, 1, 1);
   6.789 +
   6.790 +    printk("audit3\n");
   6.791 +    _audit_domain(d, AUDIT_ALREADY_LOCKED, __FILE__, __LINE__);
   6.792 +    printk("audit3 done\n");
   6.793 +
   6.794      return 0;
   6.795  
   6.796   nomem:
   6.797 @@ -219,13 +742,10 @@ int shadow_mode_enable(struct domain *d,
   6.798      return rc;
   6.799  }
   6.800  
   6.801 -void __shadow_mode_disable(struct domain *d)
   6.802 +static void free_shadow_ht_entries(struct domain *d)
   6.803  {
   6.804      struct shadow_status *x, *n;
   6.805  
   6.806 -    free_shadow_state(d);
   6.807 -    d->arch.shadow_mode = 0;
   6.808 -
   6.809      SH_VLOG("freed tables count=%d l1=%d l2=%d",
   6.810              d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
   6.811              perfc_value(shadow_l2_pages));
   6.812 @@ -239,6 +759,8 @@ void __shadow_mode_disable(struct domain
   6.813      }
   6.814  
   6.815      d->arch.shadow_ht_extras = NULL;
   6.816 +    d->arch.shadow_ht_free = NULL;
   6.817 +
   6.818      ASSERT(d->arch.shadow_extras_count == 0);
   6.819      SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
   6.820  
   6.821 @@ -253,6 +775,45 @@ void __shadow_mode_disable(struct domain
   6.822      d->arch.shadow_ht = NULL;
   6.823  }
   6.824  
   6.825 +static void free_out_of_sync_entries(struct domain *d)
   6.826 +{
   6.827 +    struct out_of_sync_entry *x, *n;
   6.828 +
   6.829 +    n = d->arch.out_of_sync_extras;
   6.830 +    while ( (x = n) != NULL )
   6.831 +    {
   6.832 +        d->arch.out_of_sync_extras_count--;
   6.833 +        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
   6.834 +        xfree(x);
   6.835 +    }
   6.836 +
   6.837 +    d->arch.out_of_sync_extras = NULL;
   6.838 +    d->arch.out_of_sync_free = NULL;
   6.839 +    d->arch.out_of_sync = NULL;
   6.840 +
   6.841 +    ASSERT(d->arch.out_of_sync_extras_count == 0);
   6.842 +    FSH_LOG("freed extra out_of_sync entries, now %d",
   6.843 +            d->arch.out_of_sync_extras_count);
   6.844 +}
   6.845 +
   6.846 +void __shadow_mode_disable(struct domain *d)
   6.847 +{
   6.848 +    // This needs rethinking for the full shadow mode stuff.
   6.849 +    //
   6.850 +    // Among other things, ref counts need to be restored to a sensible
   6.851 +    // state for a non-shadow-mode guest...
   6.852 +    // This is probably easiest to do by stealing code from audit_domain().
   6.853 +    //
   6.854 +    BUG();
   6.855 +
   6.856 +    free_shadow_pages(d);
   6.857 +    
   6.858 +    d->arch.shadow_mode = 0;
   6.859 +
   6.860 +    free_shadow_ht_entries(d);
   6.861 +    free_out_of_sync_entries(d);
   6.862 +}
   6.863 +
   6.864  static int shadow_mode_table_op(
   6.865      struct domain *d, dom0_shadow_control_t *sc)
   6.866  {
   6.867 @@ -272,7 +833,7 @@ static int shadow_mode_table_op(
   6.868      switch ( op )
   6.869      {
   6.870      case DOM0_SHADOW_CONTROL_OP_FLUSH:
   6.871 -        free_shadow_state(d);
   6.872 +        free_shadow_pages(d);
   6.873  
   6.874          d->arch.shadow_fault_count       = 0;
   6.875          d->arch.shadow_dirty_count       = 0;
   6.876 @@ -282,7 +843,7 @@ static int shadow_mode_table_op(
   6.877          break;
   6.878     
   6.879      case DOM0_SHADOW_CONTROL_OP_CLEAN:
   6.880 -        clear_shadow_state(d);
   6.881 +        free_shadow_pages(d);
   6.882  
   6.883          sc->stats.fault_count       = d->arch.shadow_fault_count;
   6.884          sc->stats.dirty_count       = d->arch.shadow_dirty_count;
   6.885 @@ -394,13 +955,13 @@ int shadow_mode_control(struct domain *d
   6.886          break;
   6.887  
   6.888      case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
   6.889 -        free_shadow_state(d);
   6.890 +        free_shadow_pages(d);
   6.891          rc = __shadow_mode_enable(d, SHM_enable);
   6.892          break;
   6.893  
   6.894      case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
   6.895 -        free_shadow_state(d);
   6.896 -        rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_log_dirty);
   6.897 +        free_shadow_pages(d);
   6.898 +        rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
   6.899          break;
   6.900  
   6.901      default:
   6.902 @@ -418,87 +979,108 @@ int shadow_mode_control(struct domain *d
   6.903      return rc;
   6.904  }
   6.905  
   6.906 -static inline struct pfn_info *alloc_shadow_page(struct domain *d)
   6.907 -{
   6.908 -    struct pfn_info *page = alloc_domheap_page(NULL);
   6.909 -
   6.910 -    d->arch.shadow_page_count++;
   6.911 -
   6.912 -    if ( unlikely(page == NULL) )
   6.913 -    {
   6.914 -        printk("Couldn't alloc shadow page! count=%d\n",
   6.915 -               d->arch.shadow_page_count);
   6.916 -        SH_VLOG("Shadow tables l1=%d l2=%d",
   6.917 -                perfc_value(shadow_l1_pages), 
   6.918 -                perfc_value(shadow_l2_pages));
   6.919 -        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
   6.920 -    }
   6.921 -
   6.922 -    return page;
   6.923 -}
   6.924 -
   6.925 -void unshadow_table(unsigned long gpfn, unsigned int type)
   6.926 -{
   6.927 -    unsigned long  smfn;
   6.928 -    struct domain *d = page_get_owner(&frame_table[gpfn]);
   6.929 -
   6.930 -    SH_VLOG("unshadow_table type=%08x gpfn=%p", type, gpfn);
   6.931 -
   6.932 -    perfc_incrc(unshadow_table_count);
   6.933 -
   6.934 -    /*
   6.935 -     * This function is the same for all p.t. pages. Even for multi-processor 
   6.936 -     * guests there won't be a race here as this CPU was the one that 
   6.937 -     * cmpxchg'ed the page to invalid.
   6.938 -     */
   6.939 -    smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
   6.940 -    delete_shadow_status(d, gpfn);
   6.941 -    free_shadow_page(d, &frame_table[smfn]);
   6.942 -}
   6.943 -
   6.944  /*
   6.945 - * XXX KAF:
   6.946 - *  1. Why is this VMX specific?
   6.947 - *  2. Why is VMX using clear_state() rather than free_state()?
   6.948 - *     (could we get rid of clear_state and fold into free_state?)
   6.949 + * XXX KAF: Why is this VMX specific?
   6.950   */
   6.951  void vmx_shadow_clear_state(struct domain *d)
   6.952  {
   6.953      SH_VVLOG("vmx_clear_shadow_state:");
   6.954      shadow_lock(d);
   6.955 -    clear_shadow_state(d);
   6.956 +    free_shadow_pages(d);
   6.957      shadow_unlock(d);
   6.958  }
   6.959  
   6.960 -unsigned long shadow_l2_table( 
   6.961 -    struct domain *d, unsigned long gmfn)
   6.962 +static unsigned long
   6.963 +shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
   6.964 +                unsigned long smfn)
   6.965  {
   6.966 -    struct pfn_info *spfn_info;
   6.967 -    unsigned long    spfn;
   6.968 -    unsigned long    gpfn;
   6.969 +    unsigned long hl2mfn;
   6.970 +    l1_pgentry_t *hl2;
   6.971 +    l2_pgentry_t *gl2;
   6.972 +    int i, limit;
   6.973 +
   6.974 +    ASSERT(PGT_base_page_table == PGT_l2_page_table);
   6.975 +
   6.976 +    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
   6.977 +    {
   6.978 +        printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
   6.979 +        BUG(); /* XXX Deal gracefully with failure. */
   6.980 +    }
   6.981 +
   6.982 +    perfc_incrc(shadow_hl2_table_count);
   6.983 +
   6.984 +    ASSERT( pagetable_val(current->arch.guest_table) == (gmfn << PAGE_SHIFT) );
   6.985 +    gl2 = current->arch.guest_vtable;
   6.986 +
   6.987 +    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
   6.988 +
   6.989 +    if ( shadow_mode_external(d) )
   6.990 +        limit = L2_PAGETABLE_ENTRIES;
   6.991 +    else
   6.992 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
   6.993 +
   6.994 +    for ( i = 0; i < limit; i++ )
   6.995 +    {
   6.996 +        unsigned long gl2e = l2_pgentry_val(gl2[i]);
   6.997 +        unsigned long mfn;
   6.998  
   6.999 -    gpfn = __mfn_to_gpfn(d, gmfn);
  6.1000 +        if ( gl2e & _PAGE_PRESENT )
  6.1001 +        {
  6.1002 +            mfn = __gpfn_to_mfn(d, gl2e >> PAGE_SHIFT);
  6.1003 +            hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1004 +            get_page(pfn_to_page(mfn), d);
  6.1005 +        }
  6.1006 +        else
  6.1007 +            hl2[i] = mk_l1_pgentry(0);
  6.1008 +    }
  6.1009 +
  6.1010 +    if ( !shadow_mode_external(d) )
  6.1011 +    {
  6.1012 +        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
  6.1013 +               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  6.1014  
  6.1015 -    SH_VVLOG("shadow_l2_table( %p )", gmfn);
  6.1016 +        // Set up easy access to the GL2, SL2, and HL2 frames.
  6.1017 +        //
  6.1018 +        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
  6.1019 +            mk_l1_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1020 +        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  6.1021 +            mk_l1_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1022 +        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
  6.1023 +            mk_l1_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1024 +    }
  6.1025 +
  6.1026 +    unmap_domain_mem(hl2);
  6.1027 +
  6.1028 +    return hl2mfn;
  6.1029 +}
  6.1030 +
  6.1031 +/*
  6.1032 + * This could take and use a snapshot, and validate the entire page at
  6.1033 + * once, or it could continue to fault in entries one at a time...
  6.1034 + * Might be worth investigating...
  6.1035 + */
  6.1036 +static unsigned long shadow_l2_table(
  6.1037 +    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  6.1038 +{
  6.1039 +    unsigned long smfn;
  6.1040 +    l2_pgentry_t *spl2e;
  6.1041 +
  6.1042 +    SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
  6.1043  
  6.1044      perfc_incrc(shadow_l2_table_count);
  6.1045  
  6.1046 -    if ( (spfn_info = alloc_shadow_page(d)) == NULL )
  6.1047 +    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
  6.1048 +    {
  6.1049 +        printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
  6.1050          BUG(); /* XXX Deal gracefully with failure. */
  6.1051 -
  6.1052 -    spfn_info->u.inuse.type_info = PGT_l2_page_table;
  6.1053 -    perfc_incr(shadow_l2_pages);
  6.1054 +    }
  6.1055  
  6.1056 -    spfn = page_to_pfn(spfn_info);
  6.1057 -  /* Mark pfn as being shadowed; update field to point at shadow. */
  6.1058 -    set_shadow_status(d, gpfn, spfn | PSH_shadowed);
  6.1059 - 
  6.1060 -#ifdef __i386__
  6.1061 +    spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  6.1062 +
  6.1063      /* Install hypervisor and 2x linear p.t. mappings. */
  6.1064 -    if ( !shadow_mode_translate(d) )
  6.1065 +    if ( (PGT_base_page_table == PGT_l2_page_table) &&
  6.1066 +         !shadow_mode_external(d) )
  6.1067      {
  6.1068 -        l2_pgentry_t *spl2e;
  6.1069 -        spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
  6.1070          /*
  6.1071           * We could proactively fill in PDEs for pages that are already
  6.1072           * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
  6.1073 @@ -511,156 +1093,714 @@ unsigned long shadow_l2_table(
  6.1074          memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  6.1075                 &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  6.1076                 HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  6.1077 -        spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  6.1078 -            mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1079 -        spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  6.1080 -            mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1081 -        spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  6.1082 +
  6.1083 +        if ( shadow_mode_translate(d) ) // NB: not external
  6.1084 +        {
  6.1085 +            unsigned long hl2mfn;
  6.1086 +            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  6.1087 +                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  6.1088 +
  6.1089 +            // shadow_mode_translate (but not external) sl2 tables hold a
  6.1090 +            // ref to their hl2.
  6.1091 +            //
  6.1092 +            get_shadow_ref(hl2mfn);
  6.1093 +            
  6.1094 +            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  6.1095 +                mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1096 +        }
  6.1097 +        else
  6.1098 +            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  6.1099 +                mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1100 +
  6.1101 +        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  6.1102 +            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.1103 +
  6.1104 +        spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
  6.1105              mk_l2_pgentry(__pa(page_get_owner(
  6.1106                  &frame_table[gmfn])->arch.mm_perdomain_pt) |
  6.1107                            __PAGE_HYPERVISOR);
  6.1108 -
  6.1109 -        unmap_domain_mem(spl2e);
  6.1110 +    }
  6.1111 +    else
  6.1112 +    {
  6.1113 +        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));        
  6.1114      }
  6.1115 -#endif
  6.1116 +
  6.1117 +    unmap_domain_mem(spl2e);
  6.1118  
  6.1119 -    SH_VLOG("shadow_l2_table( %p -> %p)", gmfn, spfn);
  6.1120 -    return spfn;
  6.1121 +    SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
  6.1122 +    return smfn;
  6.1123  }
  6.1124  
  6.1125 -static void shadow_map_l1_into_current_l2(unsigned long va)
  6.1126 +void shadow_map_l1_into_current_l2(unsigned long va)
  6.1127  { 
  6.1128      struct exec_domain *ed = current;
  6.1129      struct domain *d = ed->domain;
  6.1130 -    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, sl1mfn, sl1ss;
  6.1131 -    struct pfn_info  *sl1mfn_info;
  6.1132 -    int               i;
  6.1133 +    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, gl1mfn, sl1mfn;
  6.1134 +    int i, init_table = 0;
  6.1135  
  6.1136      __guest_get_l2e(ed, va, &gl2e);
  6.1137 -
  6.1138 +    ASSERT(gl2e & _PAGE_PRESENT);
  6.1139      gl1pfn = gl2e >> PAGE_SHIFT;
  6.1140  
  6.1141 -    sl1ss = __shadow_status(d, gl1pfn);
  6.1142 -    if ( !(sl1ss & PSH_shadowed) )
  6.1143 +    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
  6.1144      {
  6.1145          /* This L1 is NOT already shadowed so we need to shadow it. */
  6.1146 -        SH_VVLOG("4a: l1 not shadowed ( %p )", sl1ss);
  6.1147 +        SH_VVLOG("4a: l1 not shadowed");
  6.1148  
  6.1149 -        sl1mfn_info = alloc_shadow_page(d);
  6.1150 -        sl1mfn_info->u.inuse.type_info = PGT_l1_page_table;
  6.1151 -   
  6.1152 -        sl1mfn = sl1mfn_info - frame_table;
  6.1153 +        gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  6.1154 +        if ( unlikely(!gl1mfn) )
  6.1155 +        {
  6.1156 +            // Attempt to use an invalid pfn as an L1 page.
  6.1157 +            // XXX this needs to be more graceful!
  6.1158 +            BUG();
  6.1159 +        }
  6.1160 +
  6.1161 +        if ( unlikely(!(sl1mfn =
  6.1162 +                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
  6.1163 +        {
  6.1164 +            printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
  6.1165 +                   gl1pfn, gl1mfn);
  6.1166 +            BUG(); /* XXX Need to deal gracefully with failure. */
  6.1167 +        }
  6.1168  
  6.1169          perfc_incrc(shadow_l1_table_count);
  6.1170 -        perfc_incr(shadow_l1_pages);
  6.1171 -
  6.1172 -        set_shadow_status(d, gl1pfn, PSH_shadowed | sl1mfn);
  6.1173 -
  6.1174 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  6.1175 -
  6.1176 -        __guest_set_l2e(ed, va, gl2e);
  6.1177 -        __shadow_set_l2e(ed, va, sl2e);
  6.1178 -
  6.1179 -        gpl1e = (unsigned long *) &(linear_pg_table[
  6.1180 -            (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]);
  6.1181 -
  6.1182 -        spl1e = (unsigned long *) &(shadow_linear_pg_table[
  6.1183 -            (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]);
  6.1184 -
  6.1185 -        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  6.1186 -            l1pte_propagate_from_guest(d, &gpl1e[i], &spl1e[i]);
  6.1187 +        init_table = 1;
  6.1188      }
  6.1189      else
  6.1190      {
  6.1191          /* This L1 is shadowed already, but the L2 entry is missing. */
  6.1192 -        SH_VVLOG("4b: was shadowed, l2 missing ( %p )", sl1ss);
  6.1193 +        SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
  6.1194 +    }
  6.1195 +
  6.1196 +#ifndef NDEBUG
  6.1197 +    unsigned long old_sl2e;
  6.1198 +    __shadow_get_l2e(ed, va, &old_sl2e);
  6.1199 +    ASSERT( !(old_sl2e & _PAGE_PRESENT) );
  6.1200 +#endif
  6.1201 +
  6.1202 +    get_shadow_ref(sl1mfn);
  6.1203 +    l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  6.1204 +    __guest_set_l2e(ed, va, gl2e);
  6.1205 +    __shadow_set_l2e(ed, va, sl2e);
  6.1206  
  6.1207 -        sl1mfn = sl1ss & PSH_pfn_mask;
  6.1208 -        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
  6.1209 -        __guest_set_l2e(ed, va, gl2e);
  6.1210 -        __shadow_set_l2e(ed, va, sl2e);
  6.1211 -    }              
  6.1212 +    if ( init_table )
  6.1213 +    {
  6.1214 +        gpl1e = (unsigned long *)
  6.1215 +            &(linear_pg_table[l1_linear_offset(va) &
  6.1216 +                              ~(L1_PAGETABLE_ENTRIES-1)]);
  6.1217 +
  6.1218 +        spl1e = (unsigned long *)
  6.1219 +            &(shadow_linear_pg_table[l1_linear_offset(va) &
  6.1220 +                                     ~(L1_PAGETABLE_ENTRIES-1)]);
  6.1221 +
  6.1222 +        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  6.1223 +        {
  6.1224 +            l1pte_propagate_from_guest(d, gpl1e[i], &spl1e[i]);
  6.1225 +            if ( spl1e[i] & _PAGE_PRESENT )
  6.1226 +                get_page_from_l1e(mk_l1_pgentry(spl1e[i]), d);
  6.1227 +        }
  6.1228 +    }
  6.1229  }
  6.1230  
  6.1231  void shadow_invlpg(struct exec_domain *ed, unsigned long va)
  6.1232  {
  6.1233 +    struct domain *d = ed->domain;
  6.1234      unsigned long gpte, spte;
  6.1235  
  6.1236 -    ASSERT(shadow_mode_enabled(ed->domain));
  6.1237 +    ASSERT(shadow_mode_enabled(d));
  6.1238 +
  6.1239 +    shadow_lock(d);
  6.1240 +
  6.1241 +    __shadow_sync_va(ed, va);
  6.1242  
  6.1243 -    /*
  6.1244 -     * XXX KAF: Why is this set-to-zero required?
  6.1245 -     *          Why, on failure, must we bin all our shadow state?
  6.1246 -     */
  6.1247 -    if (__put_user(0L, (unsigned long *)
  6.1248 -                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
  6.1249 -        vmx_shadow_clear_state(ed->domain);
  6.1250 +    // XXX mafetter: will need to think about 4MB pages...
  6.1251 +
  6.1252 +    // It's not strictly necessary to update the shadow here,
  6.1253 +    // but it might save a fault later.
  6.1254 +    //
  6.1255 +    if (__get_user(gpte, (unsigned long *)
  6.1256 +                   &linear_pg_table[va >> PAGE_SHIFT])) {
  6.1257 +        perfc_incrc(shadow_invlpg_faults);
  6.1258          return;
  6.1259      }
  6.1260 +    l1pte_propagate_from_guest(d, gpte, &spte);
  6.1261 +    shadow_set_l1e(va, spte, 1);
  6.1262  
  6.1263 -    if (__get_user(gpte, (unsigned long *)
  6.1264 -                   &linear_pg_table[va >> PAGE_SHIFT])) {
  6.1265 +    shadow_unlock(d);
  6.1266 +}
  6.1267 +
  6.1268 +struct out_of_sync_entry *
  6.1269 +shadow_alloc_oos_entry(struct domain *d)
  6.1270 +{
  6.1271 +    struct out_of_sync_entry *f, *extra;
  6.1272 +    unsigned size, i;
  6.1273 +
  6.1274 +    if ( unlikely(d->arch.out_of_sync_free == NULL) )
  6.1275 +    {
  6.1276 +        FSH_LOG("Allocate more fullshadow tuple blocks.");
  6.1277 +
  6.1278 +        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
  6.1279 +        extra = xmalloc_bytes(size);
  6.1280 +
  6.1281 +        /* XXX Should be more graceful here. */
  6.1282 +        if ( extra == NULL )
  6.1283 +            BUG();
  6.1284 +
  6.1285 +        memset(extra, 0, size);
  6.1286 +
  6.1287 +        /* Record the allocation block so it can be correctly freed later. */
  6.1288 +        d->arch.out_of_sync_extras_count++;
  6.1289 +        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = 
  6.1290 +            d->arch.out_of_sync_extras;
  6.1291 +        d->arch.out_of_sync_extras = &extra[0];
  6.1292 +
  6.1293 +        /* Thread a free chain through the newly-allocated nodes. */
  6.1294 +        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
  6.1295 +            extra[i].next = &extra[i+1];
  6.1296 +        extra[i].next = NULL;
  6.1297 +
  6.1298 +        /* Add the new nodes to the free list. */
  6.1299 +        d->arch.out_of_sync_free = &extra[0];
  6.1300 +    }
  6.1301 +
  6.1302 +    /* Allocate a new node from the quicklist. */
  6.1303 +    f = d->arch.out_of_sync_free;
  6.1304 +    d->arch.out_of_sync_free = f->next;
  6.1305 +
  6.1306 +    return f;
  6.1307 +}
  6.1308 +
  6.1309 +static unsigned long
  6.1310 +shadow_make_snapshot(
  6.1311 +    struct domain *d, unsigned long gpfn, unsigned long gmfn)
  6.1312 +{
  6.1313 +    unsigned long smfn;
  6.1314 +    void *original, *snapshot;
  6.1315 +
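          +    // The _PGC_out_of_sync bit is set at most once per guest frame.  If it
          +    // was already set, a snapshot of this frame already exists (findable via
          +    // __shadow_status with PGT_snapshot), so don't take a second copy --
          +    // just tell the caller that the snapshot lives elsewhere.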
  6.1316 +    if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
  6.1317 +    {
  6.1318 +        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
  6.1319 +        return SHADOW_SNAPSHOT_ELSEWHERE;
  6.1320 +    }
  6.1321 +
  6.1322 +    perfc_incrc(shadow_make_snapshot);
  6.1323 +
  6.1324 +    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
  6.1325 +    {
  6.1326 +        printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
   6.1327 +               "Dom%d snapshot_count=%d\n",
  6.1328 +               gpfn, gmfn, d->id, d->arch.snapshot_page_count);
  6.1329 +        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
  6.1330 +    }
  6.1331 +
  6.1332 +    get_shadow_ref(smfn);
  6.1333 +
  6.1334 +    original = map_domain_mem(gmfn << PAGE_SHIFT);
  6.1335 +    snapshot = map_domain_mem(smfn << PAGE_SHIFT);
  6.1336 +    memcpy(snapshot, original, PAGE_SIZE);
  6.1337 +    unmap_domain_mem(original);
  6.1338 +    unmap_domain_mem(snapshot);
  6.1339 +
  6.1340 +    return smfn;
  6.1341 +}
  6.1342 +
  6.1343 +static void
  6.1344 +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
  6.1345 +{
  6.1346 +    void *snapshot;
  6.1347 +
  6.1348 +    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
  6.1349          return;
  6.1350 +
  6.1351 +    // Clear the out_of_sync bit.
  6.1352 +    //
  6.1353 +    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
  6.1354 +
  6.1355 +    // XXX Need to think about how to protect the domain's
  6.1356 +    // information less expensively.
  6.1357 +    //
  6.1358 +    snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
  6.1359 +    memset(snapshot, 0, PAGE_SIZE);
  6.1360 +    unmap_domain_mem(snapshot);
  6.1361 +
  6.1362 +    put_shadow_ref(entry->snapshot_mfn);
  6.1363 +}
  6.1364 +
  6.1365 +struct out_of_sync_entry *
  6.1366 +shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
  6.1367 +                             unsigned long mfn)
  6.1368 +{
  6.1369 +    struct domain *d = ed->domain;
  6.1370 +    struct pfn_info *page = &frame_table[mfn];
  6.1371 +    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
  6.1372 +
  6.1373 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  6.1374 +    ASSERT(pfn_is_ram(mfn));
  6.1375 +    //ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
  6.1376 +    if (!((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page))
  6.1377 +    {
  6.1378 +        printk("assertion failed: gpfn=%p gmfn=%p t=%p\n",
  6.1379 +               gpfn, mfn, page->u.inuse.type_info);
  6.1380 +        BUG();
  6.1381      }
  6.1382  
  6.1383 -    l1pte_propagate_from_guest(ed->domain, &gpte, &spte);
  6.1384 +    FSH_LOG("mark_mfn_out_of_sync(gpfn=%p, mfn=%p) c=%p t=%p",
  6.1385 +            gpfn, mfn, page->count_info, page->u.inuse.type_info);
  6.1386 +
  6.1387 +    // XXX this will require some more thought...  Cross-domain sharing and
  6.1388 +    //     modification of page tables?  Hmm...
  6.1389 +    //
  6.1390 +    if ( d != page_get_owner(page) )
  6.1391 +        BUG();
  6.1392 +
  6.1393 +    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
  6.1394 +
  6.1395 +    entry->gpfn = gpfn;
  6.1396 +    entry->gmfn = mfn;
  6.1397 +    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
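          +    // writable_pl1e records the machine address of a shadow L1 entry that
          +    // maps this page writable; shadow_mark_out_of_sync() fills it in when a
          +    // faulting virtual address is known.  A value with low bits set (such as
          +    // -1) is not a valid PTE address and is skipped by __shadow_sync_all().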
  6.1398 +    entry->writable_pl1e = -1;
  6.1399 +
  6.1400 +    // increment guest's ref count to represent the entry in the
  6.1401 +    // full shadow out-of-sync list.
  6.1402 +    //
  6.1403 +    get_page(page, d);
  6.1404 +
  6.1405 +    // Add to the out-of-sync list
  6.1406 +    //
  6.1407 +    entry->next = d->arch.out_of_sync;
  6.1408 +    d->arch.out_of_sync = entry;
  6.1409 +
  6.1410 +    return entry;
  6.1411 +}
  6.1412 +
  6.1413 +void shadow_mark_out_of_sync(
  6.1414 +    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
  6.1415 +{
  6.1416 +    struct out_of_sync_entry *entry =
  6.1417 +        shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
  6.1418 +    unsigned long sl2e;
  6.1419 +
   6.1420 +    // We need the address of the shadow PTE that maps @va.
  6.1421 +    // It might not exist yet.  Make sure it's there.
  6.1422 +    //
  6.1423 +    __shadow_get_l2e(ed, va, &sl2e);
  6.1424 +    if ( !(sl2e & _PAGE_PRESENT) )
  6.1425 +    {
  6.1426 +        // either this L1 isn't shadowed yet, or the shadow isn't linked into
  6.1427 +        // the current L2.
  6.1428 +        shadow_map_l1_into_current_l2(va);
  6.1429 +        __shadow_get_l2e(ed, va, &sl2e);
  6.1430 +    }
  6.1431 +    ASSERT(sl2e & _PAGE_PRESENT);
  6.1432 +
  6.1433 +    // NB: this is stored as a machine address.
  6.1434 +    entry->writable_pl1e =
  6.1435 +        ((sl2e & PAGE_MASK) |
  6.1436 +         (sizeof(l1_pgentry_t) * l1_table_offset(va)));
  6.1437 +    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
  6.1438 +
  6.1439 +    // Increment shadow's page count to represent the reference
  6.1440 +    // inherent in entry->writable_pl1e
  6.1441 +    //
  6.1442 +    get_shadow_ref(sl2e >> PAGE_SHIFT);
  6.1443 +
  6.1444 +    FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
  6.1445 +            va, entry->writable_pl1e);
  6.1446 +}
  6.1447 +
  6.1448 +/*
  6.1449 + * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
  6.1450 + * Returns 0 otherwise.
  6.1451 + */
  6.1452 +static int snapshot_entry_matches(
  6.1453 +    struct exec_domain *ed, unsigned long gmfn, unsigned index)
  6.1454 +{
  6.1455 +    unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
  6.1456 +    unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
  6.1457 +    unsigned long *guest, *snapshot;
  6.1458 +    int compare;
  6.1459 +
  6.1460 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
  6.1461 +
  6.1462 +    perfc_incrc(snapshot_entry_matches_calls);
  6.1463 +
  6.1464 +    if ( !smfn )
  6.1465 +        return 0;
  6.1466 +
  6.1467 +    guest    = map_domain_mem(gmfn << PAGE_SHIFT);
  6.1468 +    snapshot = map_domain_mem(smfn << PAGE_SHIFT);
  6.1469 +
   6.1470 +    // This could probably be smarter, but this is sufficient for
  6.1471 +    // our current needs.
  6.1472 +    //
  6.1473 +    compare = (guest[index] == snapshot[index]);
  6.1474 +
  6.1475 +    unmap_domain_mem(guest);
  6.1476 +    unmap_domain_mem(snapshot);
  6.1477 +
  6.1478 +#ifdef PERF_COUNTERS
  6.1479 +    if ( compare )
  6.1480 +        perfc_incrc(snapshot_entry_matches_true);
  6.1481 +#endif
  6.1482 +
  6.1483 +    return compare;
  6.1484 +}
  6.1485 +
  6.1486 +/*
  6.1487 + * Returns 1 if va's shadow mapping is out-of-sync.
  6.1488 + * Returns 0 otherwise.
  6.1489 + */
  6.1490 +int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
  6.1491 +{
  6.1492 +    struct domain *d = ed->domain;
  6.1493 +    unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
  6.1494 +    unsigned long l2e;
  6.1495 +    unsigned long l1mfn;
  6.1496 +
  6.1497 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  6.1498 +
  6.1499 +    perfc_incrc(shadow_out_of_sync_calls);
  6.1500 +
  6.1501 +    if ( page_out_of_sync(&frame_table[l2mfn]) &&
  6.1502 +         !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
  6.1503 +        return 1;
  6.1504 +
  6.1505 +    __guest_get_l2e(ed, va, &l2e);
  6.1506 +    if ( !(l2e & _PAGE_PRESENT) )
  6.1507 +        return 0;
  6.1508 +
  6.1509 +    l1mfn = __gpfn_to_mfn(d, l2e >> PAGE_SHIFT);
  6.1510 +
  6.1511 +    // If the l1 pfn is invalid, it can't be out of sync...
  6.1512 +    if ( !l1mfn )
  6.1513 +        return 0;
  6.1514 +
  6.1515 +    if ( page_out_of_sync(&frame_table[l1mfn]) &&
  6.1516 +         !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
  6.1517 +        return 1;
  6.1518 +
  6.1519 +    return 0;
  6.1520 +}
  6.1521 +
  6.1522 +static u32 remove_all_write_access_in_ptpage(
  6.1523 +    struct domain *d, unsigned long pt_mfn, unsigned long readonly_mfn)
  6.1524 +{
  6.1525 +    unsigned long *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
  6.1526 +    unsigned long match =
  6.1527 +        (readonly_mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT;
  6.1528 +    unsigned long mask = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT;
  6.1529 +    int i;
  6.1530 +    u32 count = 0;
  6.1531 +    int is_l1_shadow =
  6.1532 +        ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
  6.1533 +         PGT_l1_shadow);
  6.1534 +
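          +    // An entry needs fixing iff it is present, writable, and points at
          +    // readonly_mfn; comparing (pt[i] ^ match) under 'mask' tests all three
          +    // conditions in a single operation.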
  6.1535 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  6.1536 +    {
  6.1537 +        if ( unlikely(((pt[i] ^ match) & mask) == 0) )
  6.1538 +        {
  6.1539 +            unsigned long old = pt[i];
  6.1540 +            unsigned long new = old & ~_PAGE_RW;
  6.1541 +
  6.1542 +            if ( is_l1_shadow )
  6.1543 +                get_page_from_l1e(mk_l1_pgentry(new), d);
  6.1544 +
  6.1545 +            count++;
  6.1546 +            pt[i] = new;
  6.1547 +
  6.1548 +            if ( is_l1_shadow )
  6.1549 +                put_page_from_l1e(mk_l1_pgentry(old), d);
  6.1550 +
  6.1551 +            FSH_LOG("removed write access to mfn=%p in smfn=%p entry %x "
   6.1552 +                    "is_l1_shadow=%d",
  6.1553 +                    readonly_mfn, pt_mfn, i, is_l1_shadow);
  6.1554 +        }
  6.1555 +    }
  6.1556 +
  6.1557 +    unmap_domain_mem(pt);
  6.1558 +
  6.1559 +    return count;
  6.1560 +}
  6.1561 +
  6.1562 +u32 shadow_remove_all_write_access(
  6.1563 +    struct domain *d, unsigned min_type, unsigned max_type, unsigned long gpfn)
  6.1564 +{
  6.1565 +    int i;
  6.1566 +    struct shadow_status *a;
  6.1567 +    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
  6.1568 +    unsigned long sl1mfn = __shadow_status(d, gpfn, PGT_l1_shadow);
  6.1569 +    u32 count = 0;
  6.1570 +
  6.1571 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  6.1572 +    ASSERT(gmfn);
  6.1573  
  6.1574 -    if (__put_user(spte, (unsigned long *)
  6.1575 -                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
  6.1576 -        return;
  6.1577 +    for (i = 0; i < shadow_ht_buckets; i++)
  6.1578 +    {
  6.1579 +        a = &d->arch.shadow_ht[i];
  6.1580 +        while ( a && a->gpfn_and_flags )
  6.1581 +        {
  6.1582 +            if ( ((a->gpfn_and_flags & PGT_type_mask) >= min_type) &&
  6.1583 +                 ((a->gpfn_and_flags & PGT_type_mask) <= max_type) )
  6.1584 +            {
  6.1585 +                switch ( a->gpfn_and_flags & PGT_type_mask )
  6.1586 +                {
  6.1587 +                case PGT_l1_shadow:
  6.1588 +                    count +=
  6.1589 +                        remove_all_write_access_in_ptpage(d, a->smfn, gmfn);
  6.1590 +                    break;
  6.1591 +                case PGT_l2_shadow:
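          +                    // An L2 shadow does not map gmfn directly, but it may
          +                    // map gmfn's shadow L1 (sl1mfn) writable -- e.g. via the
          +                    // shadow linear-table slots -- so revoke write access
          +                    // to sl1mfn there instead.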
  6.1592 +                    if ( sl1mfn )
  6.1593 +                        count +=
  6.1594 +                            remove_all_write_access_in_ptpage(d, a->smfn,
  6.1595 +                                                              sl1mfn);
  6.1596 +                    break;
  6.1597 +                case PGT_hl2_shadow:
  6.1598 +                    // nothing to do here...
  6.1599 +                    break;
  6.1600 +                default:
  6.1601 +                    // need to flush this out for 4 level page tables.
  6.1602 +                    BUG();
  6.1603 +                }
  6.1604 +            }
  6.1605 +            a = a->next;
  6.1606 +        }
  6.1607 +    }
  6.1608 +
  6.1609 +    return count;
  6.1610 +}
  6.1611 +
  6.1612 +static u32 remove_all_access_in_page(
  6.1613 +    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
  6.1614 +{
  6.1615 +    unsigned long *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
  6.1616 +    unsigned long match = (forbidden_gmfn << PAGE_SHIFT) | _PAGE_PRESENT;
  6.1617 +    unsigned long mask  = PAGE_MASK | _PAGE_PRESENT;
  6.1618 +    int i;
  6.1619 +    u32 count = 0;
  6.1620 +    int is_l1_shadow =
  6.1621 +        ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
  6.1622 +         PGT_l1_shadow);
  6.1623 +
  6.1624 +    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
  6.1625 +    {
  6.1626 +        if ( unlikely(((pl1e[i] ^ match) & mask) == 0) )
  6.1627 +        {
  6.1628 +            unsigned long ol2e = pl1e[i];
  6.1629 +            pl1e[i] = 0;
  6.1630 +            count++;
  6.1631 +
  6.1632 +            if ( is_l1_shadow )
  6.1633 +                put_page_from_l1e(mk_l1_pgentry(ol2e), d);
  6.1634 +            else /* must be an hl2 page */
  6.1635 +                put_page(&frame_table[forbidden_gmfn]);
  6.1636 +        }
  6.1637 +    }
  6.1638 +
  6.1639 +    unmap_domain_mem(pl1e);
  6.1640 +
  6.1641 +    return count;
  6.1642 +}
  6.1643 +
  6.1644 +u32 shadow_remove_all_access(struct domain *d, unsigned long gmfn)
  6.1645 +{
  6.1646 +    int i;
  6.1647 +    struct shadow_status *a;
  6.1648 +    u32 count = 0;
  6.1649 +
  6.1650 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  6.1651 +
  6.1652 +    for (i = 0; i < shadow_ht_buckets; i++)
  6.1653 +    {
  6.1654 +        a = &d->arch.shadow_ht[i];
  6.1655 +        while ( a && a->gpfn_and_flags )
  6.1656 +        {
  6.1657 +            if ( ((a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow) ||
  6.1658 +                 ((a->gpfn_and_flags & PGT_type_mask) == PGT_hl2_shadow) )
  6.1659 +            {
  6.1660 +                count += remove_all_access_in_page(d, a->smfn, gmfn);
  6.1661 +            }
  6.1662 +            a = a->next;
  6.1663 +        }
  6.1664      }
  6.1665 +
  6.1666 +    return count;
  6.1667 +}    
  6.1668 +
  6.1669 +static int resync_all(struct domain *d, u32 stype)
  6.1670 +{
  6.1671 +    struct out_of_sync_entry *entry;
  6.1672 +    unsigned i;
  6.1673 +    unsigned long smfn;
  6.1674 +    unsigned long *guest, *shadow, *snapshot;
  6.1675 +    int need_flush = 0, external = shadow_mode_external(d);
  6.1676 +
  6.1677 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  6.1678 +
  6.1679 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
  6.1680 +    {
  6.1681 +        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
  6.1682 +            continue;
  6.1683 +
  6.1684 +        if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
  6.1685 +            continue;
  6.1686 +
  6.1687 +        FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
  6.1688 +                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
  6.1689 +
  6.1690 +        // Compare guest's new contents to its snapshot, validating
  6.1691 +        // and updating its shadow as appropriate.
  6.1692 +        //
  6.1693 +        guest    = map_domain_mem(entry->gmfn         << PAGE_SHIFT);
  6.1694 +        snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
  6.1695 +        shadow   = map_domain_mem(smfn                << PAGE_SHIFT);
  6.1696 +
  6.1697 +        switch ( stype ) {
  6.1698 +        case PGT_l1_shadow:
  6.1699 +            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  6.1700 +            {
  6.1701 +                unsigned new_pte = guest[i];
  6.1702 +                if ( new_pte != snapshot[i] )
  6.1703 +                {
  6.1704 +                    need_flush |= validate_pte_change(d, new_pte, &shadow[i]);
  6.1705 +
  6.1706 +                    // can't update snapshots of linear page tables -- they
  6.1707 +                    // are used multiple times...
  6.1708 +                    //
  6.1709 +                    // snapshot[i] = new_pte;
  6.1710 +                }
  6.1711 +            }
  6.1712 +            break;
  6.1713 +        case PGT_l2_shadow:
  6.1714 +            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  6.1715 +            {
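          +                // In non-external modes the Xen-private L2 slots are
          +                // maintained by the hypervisor rather than the guest, so
          +                // only guest-visible slots are compared; external shadows
          +                // resync every slot.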
  6.1716 +                if ( !is_guest_l2_slot(i) && !external )
  6.1717 +                    continue;
  6.1718 +
  6.1719 +                unsigned new_pde = guest[i];
  6.1720 +                if ( new_pde != snapshot[i] )
  6.1721 +                {
  6.1722 +                    need_flush |= validate_pde_change(d, new_pde, &shadow[i]);
  6.1723 +
  6.1724 +                    // can't update snapshots of linear page tables -- they
  6.1725 +                    // are used multiple times...
  6.1726 +                    //
  6.1727 +                    // snapshot[i] = new_pde;
  6.1728 +                }
  6.1729 +            }
  6.1730 +            break;
  6.1731 +        default:
  6.1732 +            BUG();
  6.1733 +            break;
  6.1734 +        }
  6.1735 +
  6.1736 +        unmap_domain_mem(shadow);
  6.1737 +        unmap_domain_mem(snapshot);
  6.1738 +        unmap_domain_mem(guest);
  6.1739 +    }
  6.1740 +
  6.1741 +    return need_flush;
  6.1742 +}
  6.1743 +
  6.1744 +void __shadow_sync_all(struct domain *d)
  6.1745 +{
  6.1746 +    struct out_of_sync_entry *entry;
  6.1747 +    int need_flush = 0;
  6.1748 +
  6.1749 +    perfc_incrc(shadow_sync_all);
  6.1750 +
  6.1751 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  6.1752 +
  6.1753 +    // First, remove all write permissions to the page tables
  6.1754 +    //
  6.1755 +    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
  6.1756 +    {
  6.1757 +        // Skip entries that have low bits set...  Those aren't
  6.1758 +        // real PTEs.
  6.1759 +        //
  6.1760 +        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
  6.1761 +            continue;
  6.1762 +
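          +        // Clear _PAGE_RW in the recorded shadow L1 entry.  Take the
          +        // reference for the new read-only entry before dropping the one
          +        // held by the old writable entry, so the frame's reference count
          +        // never transiently drops to zero.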
  6.1763 +        unsigned long *ppte = map_domain_mem(entry->writable_pl1e);
  6.1764 +        unsigned long opte = *ppte;
  6.1765 +        unsigned long npte = opte & ~_PAGE_RW;
  6.1766 +
  6.1767 +        get_page_from_l1e(mk_l1_pgentry(npte), d);
  6.1768 +        *ppte = npte;
  6.1769 +        put_page_from_l1e(mk_l1_pgentry(opte), d);
  6.1770 +
  6.1771 +        unmap_domain_mem(ppte);
  6.1772 +    }
  6.1773 +
  6.1774 +    // XXX mafetter: SMP perf bug.
  6.1775 +    //
  6.1776 +    // With the current algorithm, we've gotta flush all the TLBs
  6.1777 +    // before we can safely continue.  I don't think we want to
  6.1778 +    // do it this way, so I think we should consider making
  6.1779 +    // entirely private copies of the shadow for each vcpu, and/or
  6.1780 +    // possibly having a mix of private and shared shadow state
  6.1781 +    // (any path from a PTE that grants write access to an out-of-sync
  6.1782 +    // page table page needs to be vcpu private).
  6.1783 +    //
  6.1784 +    flush_tlb_all();
  6.1785 +
  6.1786 +    // Second, resync all L1 pages, then L2 pages, etc...
  6.1787 +    //
  6.1788 +    need_flush |= resync_all(d, PGT_l1_shadow);
  6.1789 +    if ( shadow_mode_translate(d) )
  6.1790 +        need_flush |= resync_all(d, PGT_hl2_shadow);
  6.1791 +    need_flush |= resync_all(d, PGT_l2_shadow);
  6.1792 +
  6.1793 +    if ( need_flush )
  6.1794 +        local_flush_tlb();
  6.1795 +
  6.1796 +    free_out_of_sync_state(d);
  6.1797  }
  6.1798  
  6.1799  int shadow_fault(unsigned long va, struct xen_regs *regs)
  6.1800  {
  6.1801 -    unsigned long gpte, spte = 0;
  6.1802 +    unsigned long gpte, spte = 0, orig_gpte;
  6.1803      struct exec_domain *ed = current;
  6.1804      struct domain *d = ed->domain;
  6.1805 +    unsigned long gpde;
  6.1806  
  6.1807      SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
  6.1808 -
  6.1809 -    check_pagetable(d, ed->arch.guest_table, "pre-sf");
  6.1810 +    perfc_incrc(shadow_fault_calls);
  6.1811 +    
  6.1812 +    check_pagetable(ed, "pre-sf");
  6.1813  
  6.1814      /*
  6.1815 -     * STEP 1. A fast-reject set of checks with no locking.
  6.1816 +     * Don't let someone else take the guest's table pages out-of-sync.
  6.1817       */
  6.1818 -
  6.1819 -    if ( unlikely(__get_user(gpte, (unsigned long *)
  6.1820 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
  6.1821 -    {
  6.1822 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
  6.1823 -        return 0;
  6.1824 -    }
  6.1825 +    shadow_lock(d);
  6.1826  
  6.1827 -    if ( !(gpte & _PAGE_PRESENT) )
  6.1828 -    {
  6.1829 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
  6.1830 -        return 0;
  6.1831 -    }
  6.1832 -
  6.1833 -    if ( (regs->error_code & 2)  && !(gpte & _PAGE_RW) )
  6.1834 -    {
  6.1835 -        /* Write fault on a read-only mapping. */
  6.1836 -        return 0;
  6.1837 -    }
  6.1838 +    /* XXX - FIX THIS COMMENT!!!
  6.1839 +     * STEP 1. Check to see if this fault might have been caused by an
  6.1840 +     *         out-of-sync table page entry, or if we should pass this
  6.1841 +     *         fault onto the guest.
  6.1842 +     */
  6.1843 +    __shadow_sync_va(ed, va);
  6.1844  
  6.1845      /*
  6.1846 -     * STEP 2. Take the shadow lock and re-check the guest PTE.
  6.1847 +     * STEP 2. Check the guest PTE.
  6.1848       */
  6.1849 -
  6.1850 -    shadow_lock(d);
  6.1851 - 
  6.1852 -    if ( unlikely(__get_user(gpte, (unsigned long *)
  6.1853 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
  6.1854 +    __guest_get_l2e(ed, va, &gpde);
  6.1855 +    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
  6.1856      {
  6.1857 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" );
  6.1858 +        SH_VVLOG("shadow_fault - EXIT: L1 not present" );
  6.1859 +        perfc_incrc(shadow_fault_bail_pde_not_present);
  6.1860          shadow_unlock(d);
  6.1861          return 0;
  6.1862      }
  6.1863  
  6.1864 +    // This can't fault because we hold the shadow lock and we've ensured that
  6.1865 +    // the mapping is in-sync, so the check of the PDE's present bit, above,
  6.1866 +    // covers this access.
  6.1867 +    //
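          +    // orig_gpte is kept so we can tell below whether this handler modified
          +    // the guest PTE (e.g. set its dirty/accessed bits) and therefore needs
          +    // to mark the guest page-table page dirty in log-dirty mode.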
  6.1868 +    orig_gpte = gpte = l1_pgentry_val(linear_pg_table[l1_linear_offset(va)]);
  6.1869      if ( unlikely(!(gpte & _PAGE_PRESENT)) )
  6.1870      {
  6.1871 -        SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte );
  6.1872 +        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
  6.1873 +        perfc_incrc(shadow_fault_bail_pte_not_present);
  6.1874          shadow_unlock(d);
  6.1875          return 0;
  6.1876      }
  6.1877 @@ -672,11 +1812,12 @@ int shadow_fault(unsigned long va, struc
  6.1878          {
  6.1879              /* Write fault on a read-only mapping. */
  6.1880              SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
  6.1881 +            perfc_incrc(shadow_fault_bail_ro_mapping);
  6.1882              shadow_unlock(d);
  6.1883              return 0;
  6.1884          }
  6.1885  
  6.1886 -        l1pte_write_fault(d, &gpte, &spte);
  6.1887 +        l1pte_write_fault(ed, &gpte, &spte, va);
  6.1888      }
  6.1889      else
  6.1890      {
  6.1891 @@ -689,120 +1830,141 @@ int shadow_fault(unsigned long va, struc
  6.1892  
  6.1893      /* XXX Watch out for read-only L2 entries! (not used in Linux). */
  6.1894      if ( unlikely(__put_user(gpte, (unsigned long *)
  6.1895 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
  6.1896 +                             &linear_pg_table[l1_linear_offset(va)])) )
  6.1897 +    {
  6.1898 +        printk("shadow_fault(): crashing domain %d "
  6.1899 +               "due to a read-only L2 page table (gpde=%p), va=%p\n",
  6.1900 +               d->id, gpde, va);
  6.1901          domain_crash();
  6.1902 -
  6.1903 -    /*
  6.1904 -     * Update of shadow PTE can fail because the L1 p.t. is not shadowed,
  6.1905 -     * or because the shadow isn't linked into this shadow L2 p.t.
  6.1906 -     */
  6.1907 -    if ( unlikely(__put_user(spte, (unsigned long *)
  6.1908 -                             &shadow_linear_pg_table[va >> PAGE_SHIFT])) )
  6.1909 -    {
  6.1910 -        SH_VVLOG("3: not shadowed/mapped gpte=%p spte=%p", gpte, spte);
  6.1911 -        shadow_map_l1_into_current_l2(va);
  6.1912 -        shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte);
  6.1913      }
  6.1914  
  6.1915 -    perfc_incrc(shadow_fixup_count);
  6.1916 +    // if necessary, record the page table page as dirty
  6.1917 +    if ( unlikely(shadow_mode_log_dirty(d)) && (orig_gpte != gpte) )
  6.1918 +        mark_dirty(d, __gpfn_to_mfn(d, gpde >> PAGE_SHIFT));
  6.1919 +
  6.1920 +    shadow_set_l1e(va, spte, 1);
  6.1921 +
  6.1922 +    perfc_incrc(shadow_fault_fixed);
  6.1923      d->arch.shadow_fault_count++;
  6.1924  
  6.1925      shadow_unlock(d);
  6.1926  
  6.1927 -    check_pagetable(d, ed->arch.guest_table, "post-sf");
  6.1928 +    check_pagetable(ed, "post-sf");
  6.1929      return EXCRET_fault_fixed;
  6.1930  }
  6.1931  
  6.1932 -
  6.1933 -void shadow_l1_normal_pt_update(
  6.1934 -    unsigned long pa, unsigned long gpte,
  6.1935 -    unsigned long *prev_smfn_ptr,
  6.1936 -    l1_pgentry_t **prev_spl1e_ptr)
  6.1937 -{
  6.1938 -    unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr;    
  6.1939 -    l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr;
  6.1940 -
  6.1941 -    /* N.B. To get here, we know the l1 page *must* be shadowed. */
  6.1942 -    SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%p, "
  6.1943 -             "prev_smfn=%p, prev_spl1e=%p",
  6.1944 -             pa, gpte, prev_smfn, prev_spl1e);
  6.1945 -
  6.1946 -    smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  6.1947 -
  6.1948 -    if ( smfn == prev_smfn )
  6.1949 -    {
  6.1950 -        spl1e = prev_spl1e;
  6.1951 -    }
  6.1952 -    else
  6.1953 -    {
  6.1954 -        if ( prev_spl1e != NULL )
  6.1955 -            unmap_domain_mem( prev_spl1e );
  6.1956 -        spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  6.1957 -        *prev_smfn_ptr  = smfn;
  6.1958 -        *prev_spl1e_ptr = spl1e;
  6.1959 -    }
  6.1960 -
  6.1961 -    l1pte_propagate_from_guest(current->domain, &gpte, &spte);
  6.1962 -    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte);
  6.1963 -}
  6.1964 -
  6.1965 -void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde)
  6.1966 -{
  6.1967 -    unsigned long sl2mfn, spde = 0;
  6.1968 -    l2_pgentry_t *spl2e;
  6.1969 -    unsigned long sl1mfn;
  6.1970 -
  6.1971 -    /* N.B. To get here, we know the l2 page *must* be shadowed. */
  6.1972 -    SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%p",pa,gpde);
  6.1973 -
  6.1974 -    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  6.1975 -
  6.1976 -    /*
  6.1977 -     * Only propagate to shadow if _PAGE_ACCESSED is set in the guest.
  6.1978 -     * Otherwise, to ensure coherency, we blow away the existing shadow value.
  6.1979 -     */
  6.1980 -    if ( gpde & _PAGE_ACCESSED )
  6.1981 -    {
  6.1982 -        sl1mfn = (gpde & _PAGE_PRESENT) ?
  6.1983 -            __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0;
  6.1984 -        l2pde_general(current->domain, &gpde, &spde, sl1mfn);
  6.1985 -    }
  6.1986 -
  6.1987 -    spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT);
  6.1988 -    spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde);
  6.1989 -    unmap_domain_mem(spl2e);
  6.1990 -}
  6.1991 -
  6.1992 -unsigned long mk_hl2_table(struct exec_domain *ed)
  6.1993 +/*
  6.1994 + * What lives where in the 32-bit address space in the various shadow modes,
  6.1995 + * and what it uses to get/maintain that mapping.
  6.1996 + *
  6.1997 + * SHADOW MODE:      none         enable         translate         external
  6.1998 + * 
  6.1999 + * 4KB things:
  6.2000 + * guest_vtable    lin_l2     mapped per gpdt  lin_l2 via hl2   mapped per gpdt
  6.2001 + * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gpdt
  6.2002 + * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gpdt
  6.2003 + * monitor_vtable    n/a            n/a             n/a           mapped once
  6.2004 + *
  6.2005 + * 4MB things:
  6.2006 + * guest_linear  lin via gpdt   lin via gpdt     lin via hl2      lin via hl2
  6.2007 + * shadow_linear     n/a      sh_lin via spdt  sh_lin via spdt  sh_lin via spdt
  6.2008 + * monitor_linear    n/a            n/a             n/a              ???
  6.2009 + * perdomain      perdomain      perdomain       perdomain        perdomain
  6.2010 + * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
  6.2011 + * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
  6.2012 + * P2M               n/a            n/a           R/O M2P          R/O M2P
  6.2013 + *
  6.2014 + * NB:
  6.2015 + * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
  6.2016 + * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
  6.2017 + * all play a part in maintaining these mappings.
  6.2018 + */
  6.2019 +void __update_pagetables(struct exec_domain *ed)
  6.2020  {
  6.2021      struct domain *d = ed->domain;
  6.2022      unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
  6.2023      unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
  6.2024 -    unsigned long hl2mfn, status;
  6.2025 -    struct pfn_info *hl2_info;
  6.2026 -    l1_pgentry_t *hl2;
  6.2027 +    unsigned long smfn, hl2mfn;
  6.2028 +
  6.2029 +    int max_mode = ( shadow_mode_external(d) ? SHM_external
  6.2030 +                     : shadow_mode_translate(d) ? SHM_translate
  6.2031 +                     : shadow_mode_enabled(d) ? SHM_enable
  6.2032 +                     : 0 );
  6.2033 +
  6.2034 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
  6.2035 +    ASSERT( max_mode );
  6.2036  
  6.2037 -    perfc_incr(hl2_table_pages);
  6.2038 +    /*
  6.2039 +     *  arch.guest_vtable
  6.2040 +     */
  6.2041 +    if ( max_mode & (SHM_enable | SHM_external) )
  6.2042 +    {
  6.2043 +        if ( likely(ed->arch.guest_vtable != NULL) )
  6.2044 +            unmap_domain_mem(ed->arch.guest_vtable);
  6.2045 +        ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
  6.2046 +    }
  6.2047  
  6.2048 -    if ( (hl2_info = alloc_shadow_page(d)) == NULL )
  6.2049 -        BUG(); /* XXX Deal gracefully with failure. */
  6.2050 +    /*
  6.2051 +     *  arch.shadow_table
  6.2052 +     */
  6.2053 +    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
  6.2054 +        smfn = shadow_l2_table(d, gpfn, gmfn);
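          +    // Take a reference on the new shadow before dropping the reference on
          +    // the old one, so the swap is safe even when both are the same table.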
  6.2055 +    get_shadow_ref(smfn);
  6.2056 +    if ( pagetable_val(ed->arch.shadow_table) )
  6.2057 +        put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
  6.2058 +    ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
  6.2059  
  6.2060 -    hl2_info->u.inuse.type_info = PGT_l1_page_table;
  6.2061 +    SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
  6.2062  
  6.2063 -    hl2mfn = page_to_pfn(hl2_info);
  6.2064 -    status = hl2mfn | PSH_hl2;
  6.2065 -    set_shadow_status(ed->domain, gpfn | PSH_hl2, status);
  6.2066 +    /*
  6.2067 +     * arch.shadow_vtable
  6.2068 +     */
  6.2069 +    if ( max_mode == SHM_external )
  6.2070 +    {
  6.2071 +        if ( ed->arch.shadow_vtable )
  6.2072 +            unmap_domain_mem(ed->arch.shadow_vtable);
  6.2073 +        ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
  6.2074 +    }
  6.2075 +
  6.2076 +    /*
  6.2077 +     * arch.hl2_vtable
  6.2078 +     */
  6.2079 +
  6.2080 +    // if max_mode == SHM_translate, then the hl2 is already installed
  6.2081 +    // correctly in its smfn, and there's nothing to do.
  6.2082 +    //
  6.2083 +    if ( max_mode == SHM_external )
  6.2084 +    {
  6.2085 +        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
  6.2086 +            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
  6.2087 +        get_shadow_ref(hl2mfn);
  6.2088  
  6.2089 -    // need to optimize this...
  6.2090 -    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
  6.2091 -    memset(hl2, 0, PAGE_SIZE);
  6.2092 -    unmap_domain_mem(hl2);
  6.2093 +        if ( ed->arch.hl2_vtable )
  6.2094 +            unmap_domain_mem(ed->arch.hl2_vtable);
  6.2095 +        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
  6.2096 +    }
  6.2097 +
  6.2098 +    /*
  6.2099 +     * fixup pointers in monitor table, as necessary
  6.2100 +     */
  6.2101 +    if ( max_mode == SHM_external )
  6.2102 +    {
  6.2103 +        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
  6.2104  
  6.2105 -    return status;
  6.2106 +        ASSERT( shadow_mode_translate(d) );
  6.2107 +
  6.2108 +        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
  6.2109 +            mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.2110 +
  6.2111 +        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  6.2112 +            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.2113 +
  6.2114 +        // XXX - maybe this can be optimized somewhat??
  6.2115 +        local_flush_tlb();
  6.2116 +    }
  6.2117  }
  6.2118  
  6.2119  
  6.2120 -
  6.2121  /************************************************************************/
  6.2122  /************************************************************************/
  6.2123  /************************************************************************/
  6.2124 @@ -838,12 +2000,13 @@ int shadow_status_noswap;
  6.2125  
  6.2126  static int check_pte(
  6.2127      struct domain *d, unsigned long *pgpte, unsigned long *pspte, 
  6.2128 -    int level, int l2_idx, int l1_idx)
  6.2129 +    int level, int l2_idx, int l1_idx, int oos_ptes)
  6.2130  {
  6.2131      unsigned gpte = *pgpte;
  6.2132      unsigned spte = *pspte;
  6.2133 -    unsigned long mask, gpfn, smfn;
  6.2134 +    unsigned long mask, gpfn, smfn, gmfn;
  6.2135      int errors = 0;
  6.2136 +    int page_table_page;
  6.2137  
  6.2138      if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
  6.2139          return errors;  /* always safe */
  6.2140 @@ -862,21 +2025,36 @@ static int check_pte(
  6.2141      if ( (spte & mask) != (gpte & mask) )
  6.2142          FAIL("Corrupt?");
  6.2143  
  6.2144 -    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
  6.2145 +    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes )
  6.2146          FAIL("Dirty coherence");
  6.2147  
  6.2148 -    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
  6.2149 +    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) && !oos_ptes )
  6.2150          FAIL("Accessed coherence");
  6.2151  
  6.2152 -    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
  6.2153 -        FAIL("RW coherence");
  6.2154 -
  6.2155 -    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) )
  6.2156 -        FAIL("RW2 coherence");
  6.2157 - 
  6.2158      smfn = spte >> PAGE_SHIFT;
  6.2159      gpfn = gpte >> PAGE_SHIFT;
  6.2160 +    gmfn = __gpfn_to_mfn(d, gpfn);
  6.2161  
  6.2162 +    page_table_page = mfn_is_page_table(gmfn);
  6.2163 +
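          +    // When the guest page is out of sync, 'gpte' comes from the snapshot
          +    // and may be stale, so divergence in the RW bits is legitimate until
          +    // the next resync; skip these checks for oos PTEs.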
  6.2164 +    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) && !oos_ptes )
  6.2165 +    {
  6.2166 +        printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n",
  6.2167 +               gpfn, gmfn, smfn,
  6.2168 +               frame_table[gmfn].u.inuse.type_info,
  6.2169 +               page_table_page, oos_ptes);
  6.2170 +        FAIL("RW coherence");
  6.2171 +    }
  6.2172 +
  6.2173 +    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) && !oos_ptes )
  6.2174 +    {
  6.2175 +        printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n",
  6.2176 +               gpfn, gmfn, smfn,
  6.2177 +               frame_table[gmfn].u.inuse.type_info,
  6.2178 +               page_table_page, oos_ptes);
  6.2179 +        FAIL("RW2 coherence");
  6.2180 +    }
  6.2181 + 
  6.2182      if ( gpfn == smfn )
  6.2183      {
  6.2184          if ( level > 1 )
  6.2185 @@ -887,23 +2065,26 @@ static int check_pte(
  6.2186          if ( level < 2 )
  6.2187              FAIL("Shadow in L1 entry?");
  6.2188  
  6.2189 -        if ( __shadow_status(d, gpfn) != (PSH_shadowed | smfn) )
  6.2190 -            FAIL("smfn problem g.sf=%p", 
  6.2191 -                 __shadow_status(d, gpfn) );
  6.2192 +        if ( level == 2 )
  6.2193 +        {
  6.2194 +            if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
  6.2195 +                FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
  6.2196 +                     __shadow_status(d, gpfn, PGT_l1_shadow));
  6.2197 +        }
  6.2198 +        else
  6.2199 +            BUG(); // XXX -- not handled yet.
  6.2200      }
  6.2201  
  6.2202      return errors;
  6.2203  }
  6.2204  
  6.2205 -
  6.2206  static int check_l1_table(
  6.2207 -    struct domain *d,
  6.2208 +    struct domain *d, unsigned long gpfn,
  6.2209      unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
  6.2210  {
  6.2211      int i;
  6.2212      unsigned long *gpl1e, *spl1e;
  6.2213 -    int cpu = current->processor;
  6.2214 -    int errors = 0;
  6.2215 +    int errors = 0, oos_ptes = 0;
  6.2216  
  6.2217      // First check to see if this guest page is currently the active
  6.2218      // PTWR page.  If so, then we compare the (old) cached copy of the
  6.2219 @@ -912,6 +2093,8 @@ static int check_l1_table(
  6.2220      //
  6.2221      if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
  6.2222      {
  6.2223 +        int cpu = current->processor;
  6.2224 +
  6.2225          for ( i = 0; i < ARRAY_SIZE(ptwr_info->ptinfo); i++)
  6.2226          {
  6.2227              if ( ptwr_info[cpu].ptinfo[i].l1va &&
  6.2228 @@ -925,11 +2108,18 @@ static int check_l1_table(
  6.2229          }
  6.2230      }
  6.2231  
  6.2232 +    if ( page_out_of_sync(pfn_to_page(gmfn)) )
  6.2233 +    {
  6.2234 +        gmfn = __shadow_status(d, gpfn, PGT_snapshot);
  6.2235 +        oos_ptes = 1;
  6.2236 +        ASSERT(gmfn);
  6.2237 +    }
  6.2238 +
  6.2239      gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
  6.2240      spl1e = map_domain_mem(smfn << PAGE_SHIFT);
  6.2241  
  6.2242      for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
  6.2243 -        errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i);
  6.2244 +        errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
  6.2245   
  6.2246      unmap_domain_mem(spl1e);
  6.2247      unmap_domain_mem(gpl1e);
  6.2248 @@ -944,20 +2134,23 @@ static int check_l1_table(
  6.2249      } while ( 0 )
  6.2250  
  6.2251  int check_l2_table(
  6.2252 -    struct domain *d, unsigned long gpfn, unsigned long smfn)
  6.2253 +    struct domain *d, unsigned long gmfn, unsigned long smfn, int oos_pdes)
  6.2254  {
  6.2255 -    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
  6.2256 -    l2_pgentry_t *gpl2e = (l2_pgentry_t *) map_domain_mem( gmfn << PAGE_SHIFT );
  6.2257 -    l2_pgentry_t *spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
  6.2258 +    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
  6.2259 +    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  6.2260      int i;
  6.2261      int errors = 0;
  6.2262 +    int limit;
  6.2263  
  6.2264 -    if ( page_get_owner(pfn_to_page(gmfn)) != d )
  6.2265 +    if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
  6.2266          FAILPT("domain doesn't own page");
  6.2267 +    if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
  6.2268 +        FAILPT("bogus owner for snapshot page");
  6.2269      if ( page_get_owner(pfn_to_page(smfn)) != NULL )
  6.2270          FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
  6.2271                 smfn, page_get_owner(pfn_to_page(smfn))->id);
  6.2272  
  6.2273 +#if 0
  6.2274      if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  6.2275                  &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  6.2276                  ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
  6.2277 @@ -974,40 +2167,62 @@ int check_l2_table(
  6.2278      if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  6.2279            l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
  6.2280          FAILPT("hypervisor linear map inconsistent");
  6.2281 +#endif
  6.2282  
  6.2283 -    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
  6.2284 +    if ( !shadow_mode_external(d) &&
  6.2285 +         (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
  6.2286                                 L2_PAGETABLE_SHIFT]) != 
  6.2287            ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
  6.2288 +    {
  6.2289          FAILPT("hypervisor shadow linear map inconsistent %p %p",
  6.2290                 l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >>
  6.2291                                      L2_PAGETABLE_SHIFT]),
  6.2292                 (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  6.2293 -
  6.2294 -    if ( !shadow_mode_translate(d) ) {
  6.2295 -        if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  6.2296 -              ((v2m(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt) |
  6.2297 -                __PAGE_HYPERVISOR))) )
  6.2298 -            FAILPT("hypervisor per-domain map inconsistent");
  6.2299      }
  6.2300  
  6.2301 +    if ( !shadow_mode_external(d) &&
  6.2302 +         (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  6.2303 +              ((__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR))) )
  6.2304 +    {
   6.2305 +        FAILPT("hypervisor per-domain map inconsistent: saw %p, expected (va=%p) %p",
  6.2306 +               l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
  6.2307 +               d->arch.mm_perdomain_pt,
  6.2308 +               (__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR));
  6.2309 +    }
  6.2310 +
  6.2311 +    if ( shadow_mode_external(d) )
  6.2312 +        limit = L2_PAGETABLE_ENTRIES;
  6.2313 +    else
  6.2314 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  6.2315 +
  6.2316      /* Check the whole L2. */
  6.2317 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  6.2318 -        errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0);
  6.2319 +    for ( i = 0; i < limit; i++ )
  6.2320 +        errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0, 0);
  6.2321  
  6.2322      unmap_domain_mem(spl2e);
  6.2323      unmap_domain_mem(gpl2e);
  6.2324  
  6.2325 +#if 1
  6.2326 +    if ( errors )
  6.2327 +        printk("check_l2_table returning %d errors\n", errors);
  6.2328 +#endif
  6.2329 +
  6.2330      return errors;
  6.2331  }
  6.2332  
  6.2333 -int _check_pagetable(struct domain *d, pagetable_t pt, char *s)
  6.2334 +int _check_pagetable(struct exec_domain *ed, char *s)
  6.2335  {
  6.2336 +    struct domain *d = ed->domain;
  6.2337 +    pagetable_t pt = ed->arch.guest_table;
  6.2338      unsigned long gptbase = pagetable_val(pt);
  6.2339 -    unsigned long ptbase_pfn, smfn, ss;
  6.2340 +    unsigned long ptbase_pfn, smfn;
  6.2341      unsigned long i;
  6.2342      l2_pgentry_t *gpl2e, *spl2e;
  6.2343      unsigned long ptbase_mfn = 0;
  6.2344 -    int errors = 0;
  6.2345 +    int errors = 0, limit, oos_pdes = 0;
  6.2346 +
  6.2347 +    audit_domain(d);
  6.2348 +    shadow_lock(d);
  6.2349  
  6.2350      sh_check_name = s;
  6.2351      SH_VVLOG("%s-PT Audit", s);
  6.2352 @@ -1017,30 +2232,31 @@ int _check_pagetable(struct domain *d, p
  6.2353      ptbase_pfn = gptbase >> PAGE_SHIFT;
  6.2354      ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn);
  6.2355  
  6.2356 -    ss = __shadow_status(d, ptbase_pfn);
  6.2357 -  
  6.2358 -    if ( ! (ss & PSH_shadowed) )
  6.2359 +    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
  6.2360      {
  6.2361          printk("%s-PT %p not shadowed\n", s, gptbase);
  6.2362          errors++;
  6.2363 -
  6.2364 -        if ( ss != 0 )
  6.2365 -            BUG();
  6.2366 -        return errors;
  6.2367 -    }   
  6.2368 +        goto out;
  6.2369 +    }
  6.2370 +    if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
  6.2371 +    {
  6.2372 +        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
  6.2373 +        oos_pdes = 1;
  6.2374 +        ASSERT(ptbase_mfn);
  6.2375 +    }
  6.2376   
  6.2377 -    smfn = ss & PSH_pfn_mask;
  6.2378 -
  6.2379 -    if ( ss != (PSH_shadowed | smfn) )
  6.2380 -        FAILPT("ptbase shadow inconsistent1");
  6.2381 -
  6.2382 -    errors += check_l2_table(d, ptbase_pfn, smfn);
  6.2383 +    errors += check_l2_table(d, ptbase_mfn, smfn, oos_pdes);
  6.2384  
  6.2385      gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
  6.2386      spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
  6.2387  
  6.2388      /* Go back and recurse. */
  6.2389 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  6.2390 +    if ( shadow_mode_external(d) )
  6.2391 +        limit = L2_PAGETABLE_ENTRIES;
  6.2392 +    else
  6.2393 +        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
  6.2394 +
  6.2395 +    for ( i = 0; i < limit; i++ )
  6.2396      {
  6.2397          unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT;
  6.2398          unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  6.2399 @@ -1048,7 +2264,7 @@ int _check_pagetable(struct domain *d, p
  6.2400  
  6.2401          if ( l2_pgentry_val(spl2e[i]) != 0 )
  6.2402          {
  6.2403 -            errors += check_l1_table(d, gl1mfn, sl1mfn, i);
  6.2404 +            errors += check_l1_table(d, gl1pfn, gl1mfn, sl1mfn, i);
  6.2405          }
  6.2406      }
  6.2407  
  6.2408 @@ -1057,22 +2273,23 @@ int _check_pagetable(struct domain *d, p
  6.2409  
  6.2410      SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
  6.2411               sh_l2_present, sh_l1_present);
  6.2412 - 
  6.2413 -#if 1
  6.2414 +
  6.2415 + out:
  6.2416      if ( errors )
  6.2417          BUG();
  6.2418 -#endif
  6.2419 +
  6.2420 +    shadow_unlock(d);
  6.2421  
  6.2422      return errors;
  6.2423  }
  6.2424  
  6.2425 -int _check_all_pagetables(struct domain *d, char *s)
  6.2426 +int _check_all_pagetables(struct exec_domain *ed, char *s)
  6.2427  {
  6.2428 -    int i, j;
  6.2429 +    struct domain *d = ed->domain;
  6.2430 +    int i;
  6.2431      struct shadow_status *a;
  6.2432      unsigned long gmfn;
  6.2433      int errors = 0;
  6.2434 -    int cpu;
  6.2435  
  6.2436      shadow_status_noswap = 1;
  6.2437  
  6.2438 @@ -1084,22 +2301,34 @@ int _check_all_pagetables(struct domain 
  6.2439      for (i = 0; i < shadow_ht_buckets; i++)
  6.2440      {
  6.2441          a = &d->arch.shadow_ht[i];
  6.2442 -        while ( a && a->pfn )
  6.2443 +        while ( a && a->gpfn_and_flags )
  6.2444          {
  6.2445 -            gmfn = __gpfn_to_mfn(d, a->pfn);
  6.2446 -            switch ( frame_table[a->pfn].u.inuse.type_info & PGT_type_mask )
  6.2447 +            gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
  6.2448 +
  6.2449 +            switch ( a->gpfn_and_flags & PGT_type_mask )
  6.2450              {
  6.2451 -            case PGT_l1_page_table:
  6.2452 -                errors += check_l1_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask, 0);
  6.2453 +            case PGT_l1_shadow:
  6.2454 +                errors += check_l1_table(d, a->gpfn_and_flags & PGT_mfn_mask,
  6.2455 +                                         gmfn, a->smfn, 0);
  6.2456                  break;
  6.2457 -            case PGT_l2_page_table:
  6.2458 -                errors += check_l2_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask);
  6.2459 +            case PGT_l2_shadow:
  6.2460 +                errors += check_l2_table(d, gmfn, a->smfn,
  6.2461 +                                         page_out_of_sync(pfn_to_page(gmfn)));
  6.2462 +                break;
  6.2463 +            case PGT_l3_shadow:
  6.2464 +            case PGT_l4_shadow:
  6.2465 +            case PGT_hl2_shadow:
  6.2466 +                BUG(); // XXX - ought to fix this...
  6.2467 +                break;
  6.2468 +            case PGT_snapshot:
  6.2469                  break;
  6.2470              default:
  6.2471                  errors++;
  6.2472 -                printk("unexpected page type 0x%08x, pfn=0x%08x, gmfn=0x%08x\n",
  6.2473 -                       frame_table[gmfn].u.inuse.type_info,
  6.2474 -                       a->pfn, gmfn);
  6.2475 +                printk("unexpected shadow type %p, gpfn=%p, "
  6.2476 +                       "gmfn=%p smfn=%p\n",
  6.2477 +                       a->gpfn_and_flags & PGT_type_mask,
  6.2478 +                       a->gpfn_and_flags & PGT_mfn_mask,
  6.2479 +                       gmfn, a->smfn);
  6.2480                  BUG();
  6.2481              }
  6.2482              a = a->next;
  6.2483 @@ -1108,52 +2337,8 @@ int _check_all_pagetables(struct domain 
  6.2484  
  6.2485      shadow_status_noswap = 0;
  6.2486  
  6.2487 -    for (i = 0; i < 1024; i++)
  6.2488 -    {
  6.2489 -        if ( l2_pgentry_val(shadow_linear_l2_table[i]) & _PAGE_PRESENT )
  6.2490 -        {
  6.2491 -            unsigned base = i << 10;
  6.2492 -            for (j = 0; j < 1024; j++)
  6.2493 -            {
  6.2494 -                if ( (l1_pgentry_val(shadow_linear_pg_table[base + j]) & PAGE_MASK) == 0x0143d000 )
  6.2495 -                {
  6.2496 -                    printk("sh_ln_pg_tb[0x%08x] => 0x%08lx ",
  6.2497 -                           base + j,
  6.2498 -                           l1_pgentry_val(shadow_linear_pg_table[base + j]));
  6.2499 -                    if ( l1_pgentry_val(shadow_linear_pg_table[base + j]) & _PAGE_PRESENT )
  6.2500 -                        printk(" first entry => 0x%08lx\n",
  6.2501 -                               *(unsigned long *)((base + j) << PAGE_SHIFT));
  6.2502 -                    else
  6.2503 -                        printk(" page not present\n");
  6.2504 -                }
  6.2505 -            }
  6.2506 -        }
  6.2507 -    }
  6.2508 -
  6.2509      if ( errors )
  6.2510 -    {
  6.2511 -        printk("VM_ASSIST(d, VMASST_TYPE_writable_pagetables) => %d\n",
  6.2512 -               VM_ASSIST(d, VMASST_TYPE_writable_pagetables));
  6.2513 -        for ( cpu = 0; cpu < smp_num_cpus; cpu++ )
  6.2514 -        {
  6.2515 -            for ( j = 0; j < ARRAY_SIZE(ptwr_info->ptinfo); j++)
  6.2516 -            {
  6.2517 -                printk("ptwr_info[%d].ptinfo[%d].l1va => 0x%08x\n",
  6.2518 -                       cpu, j, ptwr_info[cpu].ptinfo[j].l1va);
  6.2519 -                printk("ptwr_info[%d].ptinfo[%d].pl1e => 0x%08x\n",
  6.2520 -                       cpu, j, ptwr_info[cpu].ptinfo[j].pl1e);
  6.2521 -                if (cpu == smp_processor_id())
  6.2522 -                    printk("v2m(ptwr_info[%d].ptinfo[%d].pl1e) => 0x%08x\n",
  6.2523 -                           cpu, j, v2m(ptwr_info[cpu].ptinfo[j].pl1e));
  6.2524 -                printk("ptwr_info[%d].ptinfo[%d].page => 0x%08x\n",
  6.2525 -                       cpu, j, ptwr_info[cpu].ptinfo[j].page);
  6.2526 -                if (cpu == smp_processor_id())
  6.2527 -                    printk("v2m(ptwr_info[%d].ptinfo[%d].page) => 0x%08x\n",
  6.2528 -                           cpu, j, v2m(ptwr_info[cpu].ptinfo[j].page));
  6.2529 -            }
  6.2530 -        }
  6.2531          BUG();
  6.2532 -    }
  6.2533  
  6.2534      return errors;
  6.2535  }
     7.1 --- a/xen/arch/x86/traps.c	Tue Mar 15 15:53:52 2005 +0000
     7.2 +++ b/xen/arch/x86/traps.c	Wed Mar 16 17:30:37 2005 +0000
     7.3 @@ -114,7 +114,7 @@ asmlinkage void fatal_trap(int trapnr, s
     7.4      if ( trapnr == TRAP_page_fault )
     7.5      {
     7.6          __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
     7.7 -        printk("Faulting linear address might be %0lx %lx\n", cr2, cr2);
     7.8 +        printk("Faulting linear address might be %p\n", cr2);
     7.9      }
    7.10  
    7.11      printk("************************************\n");
    7.12 @@ -269,6 +269,8 @@ asmlinkage int do_page_fault(struct xen_
    7.13  
    7.14      DEBUGGER_trap_entry(TRAP_page_fault, regs);
    7.15  
    7.16 +    //printk("do_page_fault(eip=%p, va=%p, code=%d)\n", regs->eip, addr, regs->error_code);
    7.17 +
    7.18      perfc_incrc(page_faults);
    7.19  
    7.20      if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
    7.21 @@ -295,9 +297,12 @@ asmlinkage int do_page_fault(struct xen_
    7.22          UNLOCK_BIGLOCK(d);
    7.23      }
    7.24  
    7.25 -    if ( unlikely(shadow_mode_enabled(d)) && 
    7.26 -         (addr < PAGE_OFFSET) && shadow_fault(addr, regs) )
    7.27 +    if ( unlikely(shadow_mode_enabled(d)) &&
    7.28 +         ((addr < PAGE_OFFSET) || shadow_mode_external(d)) &&
    7.29 +         shadow_fault(addr, regs) )
    7.30 +    {
    7.31          return EXCRET_fault_fixed;
    7.32 +    }
    7.33  
    7.34      if ( unlikely(addr >= LDT_VIRT_START(ed)) && 
    7.35           (addr < (LDT_VIRT_START(ed) + (ed->arch.ldt_ents*LDT_ENTRY_SIZE))) )
     8.1 --- a/xen/arch/x86/vmx.c	Tue Mar 15 15:53:52 2005 +0000
     8.2 +++ b/xen/arch/x86/vmx.c	Wed Mar 16 17:30:37 2005 +0000
     8.3 @@ -106,6 +106,7 @@ static void inline __update_guest_eip(un
     8.4  
     8.5  static int vmx_do_page_fault(unsigned long va, struct xen_regs *regs) 
     8.6  {
     8.7 +    struct exec_domain *ed = current;
     8.8      unsigned long eip;
     8.9      unsigned long gpte, gpa;
    8.10      int result;
    8.11 @@ -123,9 +124,9 @@ static int vmx_do_page_fault(unsigned lo
    8.12       * If vpagetable is zero, then we are still emulating 1:1 page tables,
    8.13       * and we should have never gotten here.
    8.14       */
    8.15 -    if ( !current->arch.guest_vtable )
    8.16 +    if ( !test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state) )
    8.17      {
    8.18 -        printk("vmx_do_page_fault while still running on 1:1 page table\n");
    8.19 +        printk("vmx_do_page_fault while running on 1:1 page table\n");
    8.20          return 0;
    8.21      }
    8.22  
    8.23 @@ -269,21 +270,17 @@ static void vmx_vmexit_do_invlpg(unsigne
    8.24  {
    8.25      unsigned long eip;
    8.26      struct exec_domain *ed = current;
    8.27 -    unsigned int index;
    8.28  
    8.29      __vmread(GUEST_EIP, &eip);
    8.30  
    8.31 -    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%p, va=%p",
    8.32 -            eip, va);
    8.33 +    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%p, va=%p",
    8.34 +                eip, va);
    8.35  
    8.36      /*
    8.37       * We do the safest things first, then try to update the shadow
    8.38       * copying from guest
    8.39       */
    8.40      shadow_invlpg(ed, va);
    8.41 -    index = l2_table_offset(va);
    8.42 -    ed->arch.hl2_vtable[index] = 
    8.43 -        mk_l2_pgentry(0); /* invalidate pgd cache */
    8.44  }
    8.45  
    8.46  static void vmx_io_instruction(struct xen_regs *regs, 
    8.47 @@ -428,14 +425,6 @@ static void mov_to_cr(int gp, int cr, st
    8.48              }
    8.49              old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT;
    8.50  
    8.51 -            /* We know that none of the previous 1:1 shadow pages are
    8.52 -             * going to be used again, so might as well flush them.
    8.53 -             * XXXX wait until the last VCPU boots before doing the flush !!
    8.54 -             */
    8.55 -            shadow_lock(d->domain);
    8.56 -            free_shadow_state(d->domain); // XXX SMP
    8.57 -            shadow_unlock(d->domain);
    8.58 -
    8.59              /*
    8.60               * Now arch.guest_table points to machine physical.
    8.61               */
    8.62 @@ -469,7 +458,6 @@ static void mov_to_cr(int gp, int cr, st
    8.63              break;
    8.64          }
    8.65          
    8.66 -        hl2_table_invalidate(d);
    8.67          /*
    8.68           * We make a new one if the shadow does not exist.
    8.69           */
    8.70 @@ -482,8 +470,7 @@ static void mov_to_cr(int gp, int cr, st
    8.71              mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
    8.72              if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table))
    8.73                  __vmx_bug(regs);
    8.74 -            vmx_shadow_clear_state(d->domain);
    8.75 -            shadow_invalidate(d);
    8.76 +            shadow_sync_all(d->domain);
    8.77          } else {
    8.78              /*
    8.79               * If different, make a shadow. Check if the PDBR is valid
    8.80 @@ -525,8 +512,6 @@ static void mov_to_cr(int gp, int cr, st
    8.81           */
    8.82          if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
    8.83              vmx_shadow_clear_state(d->domain);
    8.84 -            shadow_invalidate(d);
    8.85 -            hl2_table_invalidate(d);
    8.86          }
    8.87          break;
    8.88      default:
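
The CR3-write path above now distinguishes two cases: reloading the same top-level table only requires syncing any out-of-sync pages (shadow_sync_all), while loading a different table goes down the "make a new shadow" path. A self-contained toy model of that decision; struct dom, sync_all and build_new_shadow are stand-ins invented for this sketch, not Xen functions.

    #include <stdio.h>

    /* Toy stand-in for the relevant bits of the real domain state. */
    struct dom {
        unsigned long guest_table_mfn;   /* mfn currently installed as CR3 */
        int           out_of_sync;       /* any guest PT pages out of sync? */
    };

    static void sync_all(struct dom *d)  { d->out_of_sync = 0; }

    static void build_new_shadow(struct dom *d, unsigned long mfn)
    {
        d->guest_table_mfn = mfn;        /* pretend we validated and shadowed it */
        d->out_of_sync = 0;
    }

    /* Same table => just resync; different table => build a new shadow. */
    static void on_cr3_write(struct dom *d, unsigned long new_mfn)
    {
        if ( new_mfn == d->guest_table_mfn )
            sync_all(d);
        else
            build_new_shadow(d, new_mfn);
    }

    int main(void)
    {
        struct dom d = { 0x1234, 1 };
        on_cr3_write(&d, 0x1234);                                /* resync path */
        printf("oos=%d mfn=%lx\n", d.out_of_sync, d.guest_table_mfn);
        on_cr3_write(&d, 0x5678);                                /* new shadow path */
        printf("oos=%d mfn=%lx\n", d.out_of_sync, d.guest_table_mfn);
        return 0;
    }
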
     9.1 --- a/xen/arch/x86/x86_32/domain_page.c	Tue Mar 15 15:53:52 2005 +0000
     9.2 +++ b/xen/arch/x86/x86_32/domain_page.c	Wed Mar 16 17:30:37 2005 +0000
     9.3 @@ -85,6 +85,8 @@ void *map_domain_mem(unsigned long pa)
     9.4  void unmap_domain_mem(void *va)
     9.5  {
     9.6      unsigned int idx;
     9.7 +    ASSERT((void *)MAPCACHE_VIRT_START <= va);
     9.8 +    ASSERT(va < (void *)MAPCACHE_VIRT_END);
     9.9      idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
    9.10      mapcache[idx] |= READY_FOR_TLB_FLUSH;
    9.11  }
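
The two assertions added above bound the pointer handed to unmap_domain_mem() before it is turned into a mapcache index. A standalone sketch of that index arithmetic; the MAPCACHE_* values and table size are made-up stand-ins for the real memory-layout constants.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT            12
    #define MAPCACHE_ENTRIES      1024UL
    #define MAPCACHE_VIRT_START   0xFF000000UL
    #define MAPCACHE_VIRT_END     (MAPCACHE_VIRT_START + (MAPCACHE_ENTRIES << PAGE_SHIFT))

    /* Slot index backing a mapped virtual address; the asserts mirror the
     * bounds checks added in the hunk above. */
    static unsigned int mapcache_idx(unsigned long va)
    {
        assert(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
        return (va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
    }

    int main(void)
    {
        printf("%u\n", mapcache_idx(MAPCACHE_VIRT_START + 3 * (1UL << PAGE_SHIFT))); /* 3 */
        return 0;
    }
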
    10.1 --- a/xen/common/dom_mem_ops.c	Tue Mar 15 15:53:52 2005 +0000
    10.2 +++ b/xen/common/dom_mem_ops.c	Wed Mar 16 17:30:37 2005 +0000
    10.3 @@ -14,6 +14,7 @@
    10.4  #include <xen/sched.h>
    10.5  #include <xen/event.h>
    10.6  #include <asm/domain_page.h>
    10.7 +#include <asm/shadow.h>
    10.8  
    10.9  /*
   10.10   * To allow safe resume of do_dom_mem_op() after preemption, we need to know 
   10.11 @@ -111,6 +112,27 @@ free_dom_mem(struct domain *d,
   10.12              if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
   10.13                  put_page(page);
   10.14  
   10.15 +            if ( unlikely(shadow_mode_enabled(d)) )
   10.16 +            {
   10.17 +                // XXX This needs more thought.  This isn't pretty,
   10.18 +                // and it's not fast.  But it's a place holder.
   10.19 +                //
   10.20 +                shadow_lock(d);
   10.21 +                if ( page_out_of_sync(page) )
   10.22 +                    __shadow_sync_mfn(d, mpfn + j);
   10.23 +                shadow_remove_all_access(d, mpfn + j);
   10.24 +
   10.25 +                if (page->count_info != 1)
   10.26 +                {
   10.27 +                    printk("free_dom_mem in shadow mode didn't release page "
   10.28 +                           "mfn=%p c=%p\n", mpfn+j, page->count_info);
   10.29 +                    shadow_unlock(d);
   10.30 +                    audit_domain(d);
   10.31 +                    BUG();
   10.32 +                }
   10.33 +                shadow_unlock(d);
   10.34 +            }
   10.35 +
   10.36              put_page(page);
   10.37          }
   10.38      }
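
The new shadow-mode block above enforces an invariant on the free path: after syncing an out-of-sync page and removing every shadow-held mapping of it, exactly one reference should remain before the final put_page. A toy model of that bookkeeping, with a fabricated per-page structure rather than the real count_info:

    #include <assert.h>
    #include <stdio.h>

    /* Toy page: one general refcount plus a count of shadow-held references. */
    struct page {
        unsigned int count_info;
        unsigned int shadow_refs;
    };

    /* Stand-in for shadow_remove_all_access(): dropping every shadow mapping
     * of the frame also drops the references that backed those mappings. */
    static void remove_all_shadow_access(struct page *pg)
    {
        pg->count_info -= pg->shadow_refs;
        pg->shadow_refs = 0;
    }

    int main(void)
    {
        /* 1 remaining ref + 2 refs held via shadow L1 entries. */
        struct page pg = { 3, 2 };

        remove_all_shadow_access(&pg);

        /* The check free_dom_mem now makes before its final put_page. */
        assert(pg.count_info == 1);
        printf("ok: count_info=%u\n", pg.count_info);
        return 0;
    }
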
    11.1 --- a/xen/common/page_alloc.c	Tue Mar 15 15:53:52 2005 +0000
    11.2 +++ b/xen/common/page_alloc.c	Wed Mar 16 17:30:37 2005 +0000
    11.3 @@ -29,6 +29,7 @@
    11.4  #include <xen/slab.h>
    11.5  #include <xen/irq.h>
    11.6  #include <asm/domain_page.h>
    11.7 +#include <asm/shadow.h>
    11.8  
    11.9  /*
   11.10   * Comma-separated list of hexadecimal page numbers containing bad bytes.
   11.11 @@ -566,7 +567,23 @@ void free_domheap_pages(struct pfn_info 
   11.12  
   11.13          for ( i = 0; i < (1 << order); i++ )
   11.14          {
   11.15 -            ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
   11.16 +            if ( ((pg[i].u.inuse.type_info & PGT_count_mask) != 0) &&
   11.17 +                shadow_mode_enabled(d) )
   11.18 +            {
   11.19 +                // XXX This needs more thought...
   11.20 +                //
   11.21 +                printk("%s: needing to call shadow_remove_all_access for mfn=%p\n",
   11.22 +                       __func__, page_to_pfn(&pg[i]));
   11.23 +                printk("Amfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
   11.24 +                       pg[i].count_info, pg[i].u.inuse.type_info);
   11.25 +                shadow_lock(d);
   11.26 +                shadow_remove_all_access(d, page_to_pfn(&pg[i]));
   11.27 +                shadow_unlock(d);
   11.28 +                printk("Bmfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
   11.29 +                       pg[i].count_info, pg[i].u.inuse.type_info);
   11.30 +            }
   11.31 +
   11.32 +            ASSERT( (pg[i].u.inuse.type_info & PGT_count_mask) == 0 );
   11.33              pg[i].tlbflush_timestamp  = tlbflush_current_time();
   11.34              pg[i].u.free.cpu_mask     = cpu_mask;
   11.35              list_del(&pg[i].list);
    12.1 --- a/xen/common/schedule.c	Tue Mar 15 15:53:52 2005 +0000
    12.2 +++ b/xen/common/schedule.c	Wed Mar 16 17:30:37 2005 +0000
    12.3 @@ -423,6 +423,9 @@ void __enter_scheduler(void)
    12.4      
    12.5      perfc_incrc(sched_ctx);
    12.6  
    12.7 +    // Q: With full shadow mode, do we need to flush out-of-sync pages
    12.8 +    //    before switching domains?  Current belief is NO.
    12.9 +
   12.10      if ( !is_idle_task(prev->domain) )
   12.11      {
   12.12          LOCK_BIGLOCK(prev->domain);
    13.1 --- a/xen/include/asm-x86/domain.h	Tue Mar 15 15:53:52 2005 +0000
    13.2 +++ b/xen/include/asm-x86/domain.h	Wed Mar 16 17:30:37 2005 +0000
    13.3 @@ -35,11 +35,21 @@ struct arch_domain
    13.4      unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
    13.5  
    13.6      /* shadow mode stats */
    13.7 -    unsigned int shadow_page_count;     
    13.8 -    unsigned int shadow_fault_count;     
    13.9 -    unsigned int shadow_dirty_count;     
   13.10 -    unsigned int shadow_dirty_net_count;     
   13.11 -    unsigned int shadow_dirty_block_count;     
   13.12 +    unsigned int shadow_page_count;
   13.13 +    unsigned int hl2_page_count;
   13.14 +    unsigned int snapshot_page_count;
   13.15 +
   13.16 +    unsigned int shadow_fault_count;
   13.17 +    unsigned int shadow_dirty_count;
   13.18 +    unsigned int shadow_dirty_net_count;
   13.19 +    unsigned int shadow_dirty_block_count;
   13.20 +
   13.21 +    /* full shadow mode */
   13.22 +    struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
   13.23 +    struct out_of_sync_entry *out_of_sync_free;
   13.24 +    struct out_of_sync_entry *out_of_sync_extras;
   13.25 +    unsigned int out_of_sync_extras_count;
   13.26 +
   13.27  } __cacheline_aligned;
   13.28  
   13.29  struct arch_exec_domain
   13.30 @@ -109,8 +119,8 @@ struct arch_exec_domain
   13.31  
   13.32      l2_pgentry_t *guest_vtable;         /* virtual address of pagetable */
   13.33      l2_pgentry_t *shadow_vtable;        /* virtual address of shadow_table */
   13.34 -    l2_pgentry_t *hl2_vtable;			/* virtual address of hl2_table */
   13.35      l2_pgentry_t *monitor_vtable;		/* virtual address of monitor_table */
   13.36 +    l1_pgentry_t *hl2_vtable;			/* virtual address of hl2_table */
   13.37  
   13.38      /* Virtual CR2 value. Can be read/written by guest. */
   13.39      unsigned long guest_cr2;
    14.1 --- a/xen/include/asm-x86/mm.h	Tue Mar 15 15:53:52 2005 +0000
    14.2 +++ b/xen/include/asm-x86/mm.h	Wed Mar 16 17:30:37 2005 +0000
    14.3 @@ -69,7 +69,16 @@ struct pfn_info
    14.4  #define PGT_gdt_page        (5<<29) /* using this page in a GDT? */
    14.5  #define PGT_ldt_page        (6<<29) /* using this page in an LDT? */
    14.6  #define PGT_writable_page   (7<<29) /* has writable mappings of this page? */
    14.7 +
    14.8 +#define PGT_l1_shadow       PGT_l1_page_table
    14.9 +#define PGT_l2_shadow       PGT_l2_page_table
   14.10 +#define PGT_l3_shadow       PGT_l3_page_table
   14.11 +#define PGT_l4_shadow       PGT_l4_page_table
   14.12 +#define PGT_hl2_shadow      (5<<29)
   14.13 +#define PGT_snapshot        (6<<29)
   14.14 +
   14.15  #define PGT_type_mask       (7<<29) /* Bits 29-31. */
   14.16 +
   14.17   /* Has this page been validated for use as its current type? */
   14.18  #define _PGT_validated      28
   14.19  #define PGT_validated       (1U<<_PGT_validated)
   14.20 @@ -86,11 +95,19 @@ struct pfn_info
   14.21   /* 17-bit count of uses of this frame as its current type. */
   14.22  #define PGT_count_mask      ((1U<<17)-1)
   14.23  
   14.24 +#define PGT_mfn_mask        ((1U<<21)-1) /* mfn mask for shadow types */
   14.25 +
   14.26   /* Cleared when the owning guest 'frees' this page. */
   14.27  #define _PGC_allocated      31
   14.28  #define PGC_allocated       (1U<<_PGC_allocated)
   14.29 - /* 31-bit count of references to this frame. */
   14.30 -#define PGC_count_mask      ((1U<<31)-1)
   14.31 + /* Set when fullshadow mode marks a page out-of-sync */
   14.32 +#define _PGC_out_of_sync     30
   14.33 +#define PGC_out_of_sync     (1U<<_PGC_out_of_sync)
   14.34 + /* Set when fullshadow mode is using a page as a page table */
   14.35 +#define _PGC_page_table      29
   14.36 +#define PGC_page_table      (1U<<_PGC_page_table)
   14.37 + /* 29-bit count of references to this frame. */
   14.38 +#define PGC_count_mask      ((1U<<29)-1)
   14.39  
   14.40  /* We trust the slab allocator in slab.c, and our use of it. */
   14.41  #define PageSlab(page)	    (1)
   14.42 @@ -112,6 +129,8 @@ static inline u32 pickle_domptr(struct d
   14.43  #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
   14.44  #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
   14.45  
   14.46 +#define page_out_of_sync(_p)  ((_p)->count_info & PGC_out_of_sync)
   14.47 +
   14.48  #define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                   \
   14.49      do {                                                                    \
   14.50          page_set_owner((_pfn), (_dom));                                     \
   14.51 @@ -135,6 +154,11 @@ void init_frametable(void);
   14.52  
   14.53  int alloc_page_type(struct pfn_info *page, unsigned int type);
   14.54  void free_page_type(struct pfn_info *page, unsigned int type);
   14.55 +extern void invalidate_shadow_ldt(struct exec_domain *d);
   14.56 +extern u32 shadow_remove_all_write_access(
   14.57 +    struct domain *d, unsigned min_type, unsigned max_type,
   14.58 +    unsigned long gpfn);
   14.59 +extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
   14.60  
   14.61  static inline void put_page(struct pfn_info *page)
   14.62  {
   14.63 @@ -166,8 +190,10 @@ static inline int get_page(struct pfn_in
   14.64               unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
   14.65               unlikely(d != _domain) )                /* Wrong owner? */
   14.66          {
   14.67 -            DPRINTK("Error pfn %p: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
   14.68 -                    page_to_pfn(page), domain, unpickle_domptr(d),
   14.69 +            DPRINTK("Error pfn %p: rd=%p(%d), od=%p(%d), caf=%08x, taf=%08x\n",
   14.70 +                    page_to_pfn(page), domain, (domain ? domain->id : -1),
   14.71 +                    page_get_owner(page),
   14.72 +                    (page_get_owner(page) ? page_get_owner(page)->id : -1),
   14.73                      x, page->u.inuse.type_info);
   14.74              return 0;
   14.75          }
   14.76 @@ -184,6 +210,8 @@ static inline int get_page(struct pfn_in
   14.77  
   14.78  void put_page_type(struct pfn_info *page);
   14.79  int  get_page_type(struct pfn_info *page, u32 type);
   14.80 +int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
   14.81 +void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
   14.82  
   14.83  static inline void put_page_and_type(struct pfn_info *page)
   14.84  {
   14.85 @@ -207,6 +235,22 @@ static inline int get_page_and_type(stru
   14.86      return rc;
   14.87  }
   14.88  
   14.89 +static inline int mfn_is_page_table(unsigned long mfn)
   14.90 +{
   14.91 +    if ( !pfn_is_ram(mfn) )
   14.92 +        return 0;
   14.93 +
   14.94 +    return frame_table[mfn].count_info & PGC_page_table;
   14.95 +}
   14.96 +
   14.97 +static inline int page_is_page_table(struct pfn_info *page)
   14.98 +{
   14.99 +    if ( !pfn_is_ram(page_to_pfn(page)) )
  14.100 +        return 0;
  14.101 +
  14.102 +    return page->count_info & PGC_page_table;
  14.103 +}
  14.104 +
  14.105  #define ASSERT_PAGE_IS_TYPE(_p, _t)                            \
  14.106      ASSERT(((_p)->u.inuse.type_info & PGT_type_mask) == (_t)); \
  14.107      ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0)
  14.108 @@ -307,6 +351,7 @@ void ptwr_flush(const int);
  14.109  int ptwr_do_page_fault(unsigned long);
  14.110  
  14.111  int new_guest_cr3(unsigned long pfn);
  14.112 +void propagate_page_fault(unsigned long addr, u16 error_code);
  14.113  
  14.114  #define __cleanup_writable_pagetable(_what)                                 \
  14.115  do {                                                                        \
  14.116 @@ -326,14 +371,24 @@ do {                                    
  14.117                                       PTWR_CLEANUP_INACTIVE);              \
  14.118      } while ( 0 )
  14.119  
  14.120 +int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
  14.121 +
  14.122  #ifndef NDEBUG
  14.123 -void audit_domain(struct domain *d);
  14.124 +
  14.125 +#define AUDIT_ALREADY_LOCKED ( 1u << 0 )
  14.126 +#define AUDIT_ERRORS_OK      ( 1u << 1 )
  14.127 +#define AUDIT_QUIET          ( 1u << 2 )
  14.128 +
  14.129 +void _audit_domain(struct domain *d, int flags, const char *file, int line);
  14.130 +#define audit_domain(_d) _audit_domain((_d), 0, __FILE__, __LINE__)
  14.131  void audit_domains(void);
  14.132 +
  14.133  #else
  14.134 +
  14.135 +#define _audit_domain(_d, _f, _file, _line) ((void)0)
  14.136  #define audit_domain(_d) ((void)0)
  14.137  #define audit_domains()  ((void)0)
  14.138 +
  14.139  #endif
  14.140  
  14.141 -void propagate_page_fault(unsigned long addr, u16 error_code);
  14.142 -
  14.143  #endif /* __ASM_X86_MM_H__ */
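
The new definitions above repack the per-frame words: count_info gives up its top three bits to PGC_allocated, PGC_out_of_sync and PGC_page_table, leaving a 29-bit reference count, while type_info keeps the type in bits 29-31 and, for shadow pages, appears to use the low 21 bits (PGT_mfn_mask) as a back-pointer to the guest frame (cf. the printk in get_shadow_ref later in this changeset). A self-contained sketch of that layout, using only the masks defined in the hunk:

    #include <stdint.h>
    #include <stdio.h>

    /* count_info: bit 31 allocated, bit 30 out-of-sync, bit 29 page-table,
     * bits 0-28 reference count. */
    #define PGC_allocated    (1U << 31)
    #define PGC_out_of_sync  (1U << 30)
    #define PGC_page_table   (1U << 29)
    #define PGC_count_mask   ((1U << 29) - 1)

    /* type_info: type in bits 29-31, 17-bit type-use count, and a 21-bit
     * mfn mask used by the shadow types. */
    #define PGT_hl2_shadow   (5U << 29)
    #define PGT_type_mask    (7U << 29)
    #define PGT_count_mask   ((1U << 17) - 1)
    #define PGT_mfn_mask     ((1U << 21) - 1)

    int main(void)
    {
        uint32_t count_info = PGC_allocated | PGC_page_table | 5;   /* 5 refs */
        uint32_t type_info  = PGT_hl2_shadow | 0x1234;              /* shadows gmfn 0x1234 */

        printf("refs=%u page_table=%d out_of_sync=%d\n",
               count_info & PGC_count_mask,
               !!(count_info & PGC_page_table),
               !!(count_info & PGC_out_of_sync));
        printf("type=%u shadowed mfn=%#x\n",
               (type_info & PGT_type_mask) >> 29, type_info & PGT_mfn_mask);
        return 0;
    }
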
    15.1 --- a/xen/include/asm-x86/page.h	Tue Mar 15 15:53:52 2005 +0000
    15.2 +++ b/xen/include/asm-x86/page.h	Wed Mar 16 17:30:37 2005 +0000
    15.3 @@ -57,9 +57,11 @@ typedef struct { unsigned long pt_lo; } 
    15.4  #include <asm/flushtlb.h>
    15.5  
    15.6  #define linear_pg_table ((l1_pgentry_t *)LINEAR_PT_VIRT_START)
    15.7 -#define linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START+(LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
    15.8 +#define __linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \
    15.9 +     (LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
   15.10 +#define linear_l2_table(_ed) ((_ed)->arch.guest_vtable)
   15.11  
   15.12 -#define va_to_l1mfn(_va) (l2_pgentry_val(linear_l2_table[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT)
   15.13 +#define va_to_l1mfn(_ed, _va) (l2_pgentry_val(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT)
   15.14  
   15.15  extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
   15.16  
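
With linear_l2_table(_ed) now resolving to the per-vcpu guest_vtable, va_to_l1mfn(_ed, _va) is just "index the guest L2 by the top bits of the address, then shift out the flags". A standalone sketch of that arithmetic for the 32-bit, non-PAE layout; the table contents are fabricated.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define L2_PAGETABLE_SHIFT  22
    #define ENTRIES_PER_L2      1024

    /* Toy "guest_vtable": look up the L2 entry covering va and recover the
     * frame number of the L1 table it points at. */
    static unsigned long va_to_l1mfn(const uint32_t *l2_vtable, unsigned long va)
    {
        return l2_vtable[va >> L2_PAGETABLE_SHIFT] >> PAGE_SHIFT;
    }

    int main(void)
    {
        static uint32_t l2[ENTRIES_PER_L2];
        unsigned long va = 0x08049000UL;                         /* arbitrary address */

        l2[va >> L2_PAGETABLE_SHIFT] = (0x4321UL << PAGE_SHIFT) | 0x67;  /* mfn | flags */
        printf("l1 mfn = %#lx\n", va_to_l1mfn(l2, va));          /* 0x4321 */
        return 0;
    }
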
    16.1 --- a/xen/include/asm-x86/shadow.h	Tue Mar 15 15:53:52 2005 +0000
    16.2 +++ b/xen/include/asm-x86/shadow.h	Wed Mar 16 17:30:37 2005 +0000
    16.3 @@ -1,3 +1,22 @@
    16.4 +/******************************************************************************
    16.5 + * include/asm-x86/shadow.h
    16.6 + * 
    16.7 + * Copyright (c) 2005 Michael A Fetterman
    16.8 + * 
    16.9 + * This program is free software; you can redistribute it and/or modify
   16.10 + * it under the terms of the GNU General Public License as published by
   16.11 + * the Free Software Foundation; either version 2 of the License, or
   16.12 + * (at your option) any later version.
   16.13 + * 
   16.14 + * This program is distributed in the hope that it will be useful,
   16.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   16.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   16.17 + * GNU General Public License for more details.
   16.18 + * 
   16.19 + * You should have received a copy of the GNU General Public License
   16.20 + * along with this program; if not, write to the Free Software
   16.21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   16.22 + */
   16.23  
   16.24  #ifndef _XEN_SHADOW_H
   16.25  #define _XEN_SHADOW_H
   16.26 @@ -8,29 +27,26 @@
   16.27  #include <asm/processor.h>
   16.28  #include <asm/domain_page.h>
   16.29  
   16.30 -/* Shadow PT flag bits in shadow_status */
   16.31 -#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
   16.32 -#define PSH_hl2         (1<<30) /* page is an hl2 */
   16.33 -#define PSH_pfn_mask    ((1<<21)-1)
   16.34 +/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
   16.35  
   16.36 -/* Shadow PT operation mode: shadow-mode variable in arch_domain. */
   16.37  #define SHM_enable    (1<<0) /* we're in one of the shadow modes */
   16.38  #define SHM_log_dirty (1<<1) /* enable log dirty mode */
   16.39 -#define SHM_translate (1<<2) /* do p2m translation on guest tables */
    16.40 +#define SHM_translate (1<<2) /* do p2m translation on guest tables */
   16.41  #define SHM_external  (1<<3) /* external page table, not used by Xen */
   16.42  
   16.43  #define shadow_mode_enabled(_d)   ((_d)->arch.shadow_mode)
   16.44  #define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
   16.45  #define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
   16.46 -#ifndef __x86_64__ /* XXX Currently breaks the 64-bit build. */
   16.47  #define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
   16.48 -#else
   16.49 -#define shadow_mode_external(_d)  (0)
   16.50 -#endif
   16.51  
   16.52  #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
   16.53 -#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
   16.54 +#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
   16.55       (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
   16.56 +#define shadow_linear_l2_table(_ed) ((_ed)->arch.shadow_vtable)
   16.57 +
   16.58 +// easy access to the hl2 table (for translated but not external modes only)
   16.59 +#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
   16.60 +     (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
   16.61  
   16.62  #define shadow_lock_init(_d) spin_lock_init(&(_d)->arch.shadow_lock)
   16.63  #define shadow_lock(_d)      spin_lock(&(_d)->arch.shadow_lock)
   16.64 @@ -39,18 +55,86 @@
   16.65  extern void shadow_mode_init(void);
   16.66  extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
   16.67  extern int shadow_fault(unsigned long va, struct xen_regs *regs);
   16.68 -extern void shadow_l1_normal_pt_update(
   16.69 -    unsigned long pa, unsigned long gpte, 
   16.70 -    unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr);
   16.71 -extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde);
   16.72 -extern void unshadow_table(unsigned long gpfn, unsigned int type);
   16.73  extern int shadow_mode_enable(struct domain *p, unsigned int mode);
   16.74 -extern void free_shadow_state(struct domain *d);
   16.75  extern void shadow_invlpg(struct exec_domain *, unsigned long);
   16.76 -extern unsigned long mk_hl2_table(struct exec_domain *ed);
   16.77 +extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
   16.78 +    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn);
   16.79 +extern void free_monitor_pagetable(struct exec_domain *ed);
   16.80 +extern void __shadow_sync_all(struct domain *d);
   16.81 +extern int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va);
   16.82 +
   16.83 +static inline unsigned long __shadow_status(
   16.84 +    struct domain *d, unsigned long gpfn, unsigned long stype);
   16.85  
   16.86  extern void vmx_shadow_clear_state(struct domain *);
   16.87  
   16.88 +/************************************************************************/
   16.89 +
   16.90 +static void inline
   16.91 +__shadow_sync_mfn(struct domain *d, unsigned long mfn)
   16.92 +{
   16.93 +    if ( d->arch.out_of_sync )
   16.94 +    {
   16.95 +        // XXX - could be smarter
   16.96 +        //
   16.97 +        __shadow_sync_all(d);
   16.98 +    }
   16.99 +}
  16.100 +
  16.101 +static void inline
  16.102 +__shadow_sync_va(struct exec_domain *ed, unsigned long va)
  16.103 +{
  16.104 +    struct domain *d = ed->domain;
  16.105 +
  16.106 +    if ( d->arch.out_of_sync && __shadow_out_of_sync(ed, va) )
  16.107 +    {
  16.108 +        // XXX - could be smarter
  16.109 +        //
  16.110 +        __shadow_sync_all(ed->domain);
  16.111 +    }
  16.112 +}
  16.113 +
  16.114 +static void inline
  16.115 +shadow_sync_all(struct domain *d)
  16.116 +{
  16.117 +    if ( unlikely(shadow_mode_enabled(d)) )
  16.118 +    {
  16.119 +        shadow_lock(d);
  16.120 +
  16.121 +        if ( d->arch.out_of_sync )
  16.122 +            __shadow_sync_all(d);
  16.123 +
  16.124 +        ASSERT(d->arch.out_of_sync == NULL);
  16.125 +
  16.126 +        shadow_unlock(d);
  16.127 +    }
  16.128 +}
  16.129 +
  16.130 +// SMP BUG: This routine can't ever be used properly in an SMP context.
  16.131 +//          It should be something like get_shadow_and_sync_va().
  16.132 +//          This probably shouldn't exist.
  16.133 +//
  16.134 +static void inline
  16.135 +shadow_sync_va(struct exec_domain *ed, unsigned long gva)
  16.136 +{
  16.137 +    struct domain *d = ed->domain;
  16.138 +    if ( unlikely(shadow_mode_enabled(d)) )
  16.139 +    {
  16.140 +        shadow_lock(d);
  16.141 +        __shadow_sync_va(ed, gva);
  16.142 +        shadow_unlock(d);
  16.143 +    }
  16.144 +}
  16.145 +
  16.146 +extern void __shadow_mode_disable(struct domain *d);
  16.147 +static inline void shadow_mode_disable(struct domain *d)
  16.148 +{
  16.149 +    if ( shadow_mode_enabled(d) )
  16.150 +        __shadow_mode_disable(d);
  16.151 +}
  16.152 +
  16.153 +/************************************************************************/
  16.154 +
  16.155  #define __mfn_to_gpfn(_d, mfn)                         \
  16.156      ( (shadow_mode_translate(_d))                      \
  16.157        ? machine_to_phys_mapping[(mfn)]                 \
  16.158 @@ -61,39 +145,41 @@ extern void vmx_shadow_clear_state(struc
  16.159        ? phys_to_machine_mapping(gpfn)                  \
  16.160        : (gpfn) )
  16.161  
  16.162 -extern void __shadow_mode_disable(struct domain *d);
  16.163 -static inline void shadow_mode_disable(struct domain *d)
  16.164 -{
  16.165 -    if ( shadow_mode_enabled(d) )
  16.166 -        __shadow_mode_disable(d);
  16.167 -}
  16.168 +/************************************************************************/
  16.169 +
  16.170 +struct shadow_status {
  16.171 +    unsigned long gpfn_and_flags; /* Guest pfn plus flags. */
  16.172 +    struct shadow_status *next;   /* Pull-to-front list.   */
  16.173 +    unsigned long smfn;           /* Shadow mfn.           */
  16.174 +};
  16.175 +
  16.176 +#define shadow_ht_extra_size 128
  16.177 +#define shadow_ht_buckets    256
  16.178  
  16.179 -extern unsigned long shadow_l2_table( 
  16.180 -    struct domain *d, unsigned long gmfn);
  16.181 -  
  16.182 -static inline void shadow_invalidate(struct exec_domain *ed) {
  16.183 -    if ( !VMX_DOMAIN(ed) )
  16.184 -        BUG();
  16.185 -    memset(ed->arch.shadow_vtable, 0, PAGE_SIZE);
  16.186 -}
  16.187 +struct out_of_sync_entry {
  16.188 +    struct out_of_sync_entry *next;
  16.189 +    unsigned long gpfn;    /* why is this here? */
  16.190 +    unsigned long gmfn;
  16.191 +    unsigned long snapshot_mfn;
  16.192 +    unsigned long writable_pl1e; /* NB: this is a machine address */
  16.193 +};
  16.194 +
  16.195 +#define out_of_sync_extra_size 127
  16.196 +
  16.197 +#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
  16.198 +
  16.199 +/************************************************************************/
  16.200  
  16.201  #define SHADOW_DEBUG 0
  16.202  #define SHADOW_VERBOSE_DEBUG 0
  16.203 +#define SHADOW_VVERBOSE_DEBUG 0
  16.204  #define SHADOW_HASH_DEBUG 0
  16.205 +#define FULLSHADOW_DEBUG 0
  16.206  
  16.207  #if SHADOW_DEBUG
  16.208  extern int shadow_status_noswap;
  16.209  #endif
  16.210  
  16.211 -struct shadow_status {
  16.212 -    unsigned long pfn;            /* Guest pfn.             */
  16.213 -    unsigned long smfn_and_flags; /* Shadow mfn plus flags. */
  16.214 -    struct shadow_status *next;   /* Pull-to-front list.    */
  16.215 -};
  16.216 -
  16.217 -#define shadow_ht_extra_size 128
  16.218 -#define shadow_ht_buckets    256
  16.219 -
  16.220  #ifdef VERBOSE
  16.221  #define SH_LOG(_f, _a...)                                               \
  16.222      printk("DOM%uP%u: SH_LOG(%d): " _f "\n",                            \
  16.223 @@ -102,7 +188,7 @@ struct shadow_status {
  16.224  #define SH_LOG(_f, _a...) 
  16.225  #endif
  16.226  
  16.227 -#if SHADOW_DEBUG
  16.228 +#if SHADOW_VERBOSE_DEBUG
  16.229  #define SH_VLOG(_f, _a...)                                              \
  16.230      printk("DOM%uP%u: SH_VLOG(%d): " _f "\n",                           \
  16.231             current->domain->id, current->processor, __LINE__ , ## _a )
  16.232 @@ -110,7 +196,7 @@ struct shadow_status {
  16.233  #define SH_VLOG(_f, _a...) 
  16.234  #endif
  16.235  
  16.236 -#if SHADOW_VERBOSE_DEBUG
  16.237 +#if SHADOW_VVERBOSE_DEBUG
  16.238  #define SH_VVLOG(_f, _a...)                                             \
  16.239      printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n",                          \
  16.240             current->domain->id, current->processor, __LINE__ , ## _a )
  16.241 @@ -118,60 +204,148 @@ struct shadow_status {
  16.242  #define SH_VVLOG(_f, _a...)
  16.243  #endif
  16.244  
  16.245 -// BUG: mafetter: this assumes ed == current, so why pass ed?
  16.246 -static inline void __shadow_get_l2e(
  16.247 -    struct exec_domain *ed, unsigned long va, unsigned long *sl2e)
  16.248 -{
  16.249 -    if ( !likely(shadow_mode_enabled(ed->domain)) )
  16.250 -        BUG();
  16.251 +#if FULLSHADOW_DEBUG
  16.252 +#define FSH_LOG(_f, _a...)                                              \
  16.253 +    printk("DOM%uP%u: FSH_LOG(%d): " _f "\n",                           \
  16.254 +           current->domain->id, current->processor, __LINE__ , ## _a )
  16.255 +#else
  16.256 +#define FSH_LOG(_f, _a...) 
  16.257 +#endif
  16.258 +
  16.259 +
  16.260 +/************************************************************************/
  16.261  
  16.262 -    if ( shadow_mode_translate(ed->domain) )
  16.263 -        *sl2e = l2_pgentry_val(
  16.264 -            ed->arch.shadow_vtable[l2_table_offset(va)]);       
  16.265 -    else 
  16.266 -        *sl2e = l2_pgentry_val(
  16.267 -            shadow_linear_l2_table[l2_table_offset(va)]);
  16.268 +static inline void
  16.269 +__shadow_get_l2e(
  16.270 +    struct exec_domain *ed, unsigned long va, unsigned long *psl2e)
  16.271 +{
  16.272 +    ASSERT(shadow_mode_enabled(ed->domain));
  16.273 +
  16.274 +    *psl2e = l2_pgentry_val( ed->arch.shadow_vtable[l2_table_offset(va)]);
  16.275 +}
  16.276 +
  16.277 +static inline void
  16.278 +__shadow_set_l2e(
  16.279 +    struct exec_domain *ed, unsigned long va, unsigned long value)
  16.280 +{
  16.281 +    ASSERT(shadow_mode_enabled(ed->domain));
  16.282 +
  16.283 +    ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  16.284  }
  16.285  
  16.286 -static inline void __shadow_set_l2e(
  16.287 +static inline void
  16.288 +__guest_get_l2e(
  16.289 +    struct exec_domain *ed, unsigned long va, unsigned long *pl2e)
  16.290 +{
  16.291 +    *pl2e = l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]);
  16.292 +}
  16.293 +
  16.294 +static inline void
  16.295 +__guest_set_l2e(
  16.296      struct exec_domain *ed, unsigned long va, unsigned long value)
  16.297  {
  16.298 -    if ( !likely(shadow_mode_enabled(ed->domain)) )
  16.299 -        BUG();
  16.300 +    if ( unlikely(shadow_mode_translate(ed->domain)) )
  16.301 +    {
  16.302 +        unsigned long mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
  16.303 +        unsigned long old_hl2e =
  16.304 +            l1_pgentry_val(ed->arch.hl2_vtable[l2_table_offset(va)]);
  16.305 +        unsigned long new_hl2e =
  16.306 +            (mfn ? ((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR) : 0);
  16.307  
  16.308 -    if ( shadow_mode_translate(ed->domain) ) 
  16.309 -        ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  16.310 -    else 
  16.311 -        shadow_linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
  16.312 +        // only do the ref counting if something important changed.
  16.313 +        //
  16.314 +        if ( (old_hl2e ^ new_hl2e) & (PAGE_MASK | _PAGE_PRESENT) )
  16.315 +        {
  16.316 +            if ( new_hl2e & _PAGE_PRESENT )
  16.317 +                get_page_from_l1e(mk_l1_pgentry(new_hl2e), ed->domain);
  16.318 +            if ( old_hl2e & _PAGE_PRESENT )
  16.319 +                put_page_from_l1e(mk_l1_pgentry(old_hl2e), ed->domain);
  16.320 +        }
  16.321 +
  16.322 +        ed->arch.hl2_vtable[l2_table_offset(va)] = mk_l1_pgentry(new_hl2e);
  16.323 +    }
  16.324 +
  16.325 +    ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  16.326  }
  16.327  
  16.328 -static inline void __guest_get_l2e(
  16.329 -    struct exec_domain *ed, unsigned long va, unsigned long *l2e)
  16.330 +/************************************************************************/
  16.331 +
  16.332 +/*
  16.333 + * Add another shadow reference to smfn.
  16.334 + */
  16.335 +static inline int
  16.336 +get_shadow_ref(unsigned long smfn)
  16.337  {
  16.338 -    *l2e = ( shadow_mode_translate(ed->domain) ) ?
  16.339 -        l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]) :
  16.340 -        l2_pgentry_val(linear_l2_table[l2_table_offset(va)]);
  16.341 +    u32 x, nx;
  16.342 +
  16.343 +    ASSERT(pfn_is_ram(smfn));
  16.344 +
  16.345 +    x = frame_table[smfn].count_info;
  16.346 +    nx = x + 1;
  16.347 +
  16.348 +    if ( unlikely(nx == 0) )
  16.349 +    {
  16.350 +        printk("get_shadow_ref overflow, gmfn=%p smfn=%p\n",
  16.351 +               frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn);
  16.352 +        BUG();
  16.353 +    }
  16.354 +    
  16.355 +    // Guarded by the shadow lock...
  16.356 +    //
  16.357 +    frame_table[smfn].count_info = nx;
  16.358 +
  16.359 +    return 1;
  16.360  }
  16.361  
  16.362 -static inline void __guest_set_l2e(
  16.363 -    struct exec_domain *ed, unsigned long va, unsigned long value)
  16.364 +extern void free_shadow_page(unsigned long smfn);
  16.365 +
  16.366 +/*
  16.367 + * Drop a shadow reference to smfn.
  16.368 + */
  16.369 +static inline void
  16.370 +put_shadow_ref(unsigned long smfn)
  16.371  {
  16.372 -    if ( shadow_mode_translate(ed->domain) )
  16.373 -    {
  16.374 -        unsigned long pfn;
  16.375 +    u32 x, nx;
  16.376 +
  16.377 +    ASSERT(pfn_is_ram(smfn));
  16.378 +
  16.379 +    x = frame_table[smfn].count_info;
  16.380 +    nx = x - 1;
  16.381  
  16.382 -        pfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
  16.383 -        ed->arch.hl2_vtable[l2_table_offset(va)] =
  16.384 -            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  16.385 +    if ( unlikely(x == 0) )
  16.386 +    {
  16.387 +        printk("put_shadow_ref underflow, gmfn=%p smfn=%p\n",
  16.388 +               frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn);
  16.389 +        BUG();
  16.390 +    }
  16.391  
  16.392 -        ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  16.393 -    }
  16.394 -    else
  16.395 +    // Guarded by the shadow lock...
  16.396 +    //
  16.397 +    frame_table[smfn].count_info = nx;
  16.398 +
  16.399 +    if ( unlikely(nx == 0) )
  16.400      {
  16.401 -        linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
  16.402 +        free_shadow_page(smfn);
  16.403      }
  16.404  }
  16.405  
  16.406 +static inline void
  16.407 +shadow_pin(unsigned long smfn)
  16.408 +{
  16.409 +    ASSERT( !(frame_table[smfn].u.inuse.type_info & PGT_pinned) );
  16.410 +
  16.411 +    frame_table[smfn].u.inuse.type_info |= PGT_pinned;
  16.412 +    get_shadow_ref(smfn);
  16.413 +}
  16.414 +
  16.415 +static inline void
  16.416 +shadow_unpin(unsigned long smfn)
  16.417 +{
  16.418 +    frame_table[smfn].u.inuse.type_info &= ~PGT_pinned;
  16.419 +    put_shadow_ref(smfn);
  16.420 +}
  16.421 +
  16.422 +
  16.423  /************************************************************************/
  16.424  
  16.425  static inline int __mark_dirty(struct domain *d, unsigned int mfn)
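
A toy model of the reference discipline the helpers above establish: get_shadow_ref/put_shadow_ref keep a per-frame count (under the shadow lock in the real code), the last put frees the shadow page, and pinning is just a named extra reference flagged by PGT_pinned. Everything below is illustrative; the frame structure and the PGT_pinned value are stand-ins.

    #include <assert.h>
    #include <stdio.h>

    #define PGT_pinned (1U << 27)            /* illustrative bit position */

    struct frame { unsigned int count; unsigned int type_info; };

    static int freed;

    static void free_shadow_page(struct frame *f) { (void)f; freed++; }

    static void get_shadow_ref(struct frame *f)
    {
        assert(f->count + 1 != 0);           /* overflow would be a BUG() */
        f->count++;
    }

    static void put_shadow_ref(struct frame *f)
    {
        assert(f->count != 0);               /* underflow would be a BUG() */
        if ( --f->count == 0 )
            free_shadow_page(f);
    }

    static void shadow_pin(struct frame *f)
    {
        assert(!(f->type_info & PGT_pinned));
        f->type_info |= PGT_pinned;
        get_shadow_ref(f);                   /* the pin owns one reference */
    }

    static void shadow_unpin(struct frame *f)
    {
        f->type_info &= ~PGT_pinned;
        put_shadow_ref(f);
    }

    int main(void)
    {
        struct frame smfn = { 0, 0 };

        get_shadow_ref(&smfn);               /* e.g. referenced from an sl2e */
        shadow_pin(&smfn);                   /* keep it alive across CR3 reloads */
        put_shadow_ref(&smfn);               /* sl2e reference dropped */
        shadow_unpin(&smfn);                 /* last reference gone -> freed */
        printf("freed=%d\n", freed);         /* 1 */
        return 0;
    }
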
  16.426 @@ -182,7 +356,7 @@ static inline int __mark_dirty(struct do
  16.427      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  16.428      ASSERT(d->arch.shadow_dirty_bitmap != NULL);
  16.429  
  16.430 -    pfn = machine_to_phys_mapping[mfn];
  16.431 +    pfn = __mfn_to_gpfn(d, mfn);
  16.432  
  16.433      /*
  16.434       * Values with the MSB set denote MFNs that aren't really part of the 
  16.435 @@ -229,23 +403,41 @@ static inline int mark_dirty(struct doma
  16.436  
  16.437  /************************************************************************/
  16.438  
  16.439 +extern void shadow_mark_out_of_sync(
  16.440 +    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn,
  16.441 +    unsigned long va);
  16.442 +
  16.443  static inline void l1pte_write_fault(
  16.444 -    struct domain *d, unsigned long *gpte_p, unsigned long *spte_p)
  16.445 -{ 
  16.446 +    struct exec_domain *ed, unsigned long *gpte_p, unsigned long *spte_p,
  16.447 +    unsigned long va)
  16.448 +{
  16.449 +    struct domain *d = ed->domain;
  16.450      unsigned long gpte = *gpte_p;
  16.451 -    unsigned long spte = *spte_p;
  16.452 -    unsigned long pfn = gpte >> PAGE_SHIFT;
  16.453 -    unsigned long mfn = __gpfn_to_mfn(d, pfn);
  16.454 +    unsigned long spte;
  16.455 +    unsigned long gpfn = gpte >> PAGE_SHIFT;
  16.456 +    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
  16.457 +
  16.458 +    //printk("l1pte_write_fault gmfn=%p\n", mfn);
  16.459 +
  16.460 +    if ( unlikely(!mfn) )
  16.461 +    {
  16.462 +        SH_LOG("l1pte_write_fault: invalid gpfn=%p", gpfn);
  16.463 +        *spte_p = 0;
  16.464 +        return;
  16.465 +    }
  16.466  
  16.467      ASSERT(gpte & _PAGE_RW);
  16.468      gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  16.469 -
  16.470 -    if ( shadow_mode_log_dirty(d) )
  16.471 -        __mark_dirty(d, pfn);
  16.472 -
  16.473      spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  16.474  
  16.475      SH_VVLOG("l1pte_write_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
  16.476 +
  16.477 +    if ( shadow_mode_log_dirty(d) )
  16.478 +        __mark_dirty(d, mfn);
  16.479 +
  16.480 +    if ( mfn_is_page_table(mfn) )
  16.481 +        shadow_mark_out_of_sync(ed, gpfn, mfn, va);
  16.482 +
  16.483      *gpte_p = gpte;
  16.484      *spte_p = spte;
  16.485  }
  16.486 @@ -258,11 +450,21 @@ static inline void l1pte_read_fault(
  16.487      unsigned long pfn = gpte >> PAGE_SHIFT;
  16.488      unsigned long mfn = __gpfn_to_mfn(d, pfn);
  16.489  
  16.490 +    if ( unlikely(!mfn) )
  16.491 +    {
  16.492 +        SH_LOG("l1pte_read_fault: invalid gpfn=%p", pfn);
  16.493 +        *spte_p = 0;
  16.494 +        return;
  16.495 +    }
  16.496 +
  16.497      gpte |= _PAGE_ACCESSED;
  16.498      spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  16.499  
  16.500 -    if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) )
  16.501 +    if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) ||
  16.502 +         mfn_is_page_table(mfn) )
  16.503 +    {
  16.504          spte &= ~_PAGE_RW;
  16.505 +    }
  16.506  
  16.507      SH_VVLOG("l1pte_read_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
  16.508      *gpte_p = gpte;
  16.509 @@ -270,9 +472,8 @@ static inline void l1pte_read_fault(
  16.510  }
  16.511  
  16.512  static inline void l1pte_propagate_from_guest(
  16.513 -    struct domain *d, unsigned long *gpte_p, unsigned long *spte_p)
  16.514 +    struct domain *d, unsigned long gpte, unsigned long *spte_p)
  16.515  { 
  16.516 -    unsigned long gpte = *gpte_p;
  16.517      unsigned long spte = *spte_p;
  16.518      unsigned long pfn = gpte >> PAGE_SHIFT;
  16.519      unsigned long mfn = __gpfn_to_mfn(d, pfn);
  16.520 @@ -281,33 +482,36 @@ static inline void l1pte_propagate_from_
  16.521      unsigned long old_spte = spte;
  16.522  #endif
  16.523  
  16.524 -    /* Use 1:1 page table to identify MMIO address space */
  16.525 -    if ( shadow_mode_external(d) && mmio_space(gpte) ) {
  16.526 +    if ( unlikely(!mfn) )
  16.527 +    {
  16.528 +        // likely an MMIO address space mapping...
  16.529 +        //
  16.530          *spte_p = 0;
  16.531          return;
  16.532      }
  16.533 -    
  16.534 +
  16.535      spte = 0;
  16.536      if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  16.537           (_PAGE_PRESENT|_PAGE_ACCESSED) ) {
  16.538          
  16.539          spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  16.540          
  16.541 -        if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) )
  16.542 +        if ( shadow_mode_log_dirty(d) ||
  16.543 +             !(gpte & _PAGE_DIRTY) ||
  16.544 +             mfn_is_page_table(mfn) )
  16.545 +        {
  16.546              spte &= ~_PAGE_RW;
  16.547 +        }
  16.548      }
  16.549 -        
  16.550 +
  16.551  #if SHADOW_VERBOSE_DEBUG
  16.552      if ( old_spte || spte || gpte )
  16.553 -        SH_VVLOG("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p ", gpte, old_spte, spte);
  16.554 +        debugtrace_printk("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p\n", gpte, old_spte, spte);
  16.555  #endif
  16.556  
  16.557 -    *gpte_p = gpte;
  16.558      *spte_p = spte;
  16.559  }
  16.560  
  16.561 -
  16.562 -
  16.563  static inline void l2pde_general(
  16.564      struct domain *d,
  16.565      unsigned long *gpde_p,
  16.566 @@ -315,33 +519,104 @@ static inline void l2pde_general(
  16.567      unsigned long sl1mfn)
  16.568  {
  16.569      unsigned long gpde = *gpde_p;
  16.570 -    unsigned long spde = *spde_p;
  16.571 +    unsigned long spde;
  16.572  
  16.573      spde = 0;
  16.574 -
  16.575 -    if ( sl1mfn != 0 )
  16.576 +    if ( (gpde & _PAGE_PRESENT) && (sl1mfn != 0) )
  16.577      {
  16.578          spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | 
  16.579              _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
  16.580          gpde |= _PAGE_ACCESSED; /* N.B. PDEs do not have a dirty bit. */
  16.581  
  16.582 -        /* Detect linear p.t. mappings and write-protect them. */
  16.583 -        if ( (frame_table[sl1mfn].u.inuse.type_info & PGT_type_mask) ==
  16.584 -             PGT_l2_page_table ) 
  16.585 -        {
  16.586 -            if ( !shadow_mode_translate(d) )
  16.587 -                spde = gpde & ~_PAGE_RW;
  16.588 -
  16.589 -        }
  16.590 +        // XXX mafetter: Hmm...
  16.591 +        //     Shouldn't the dirty log be checked/updated here?
  16.592 +        //     Actually, it needs to be done in this function's callers.
  16.593 +        //
  16.594 +        *gpde_p = gpde;
  16.595      }
  16.596  
  16.597 -    *gpde_p = gpde;
  16.598      *spde_p = spde;
  16.599  }
  16.600  
  16.601 +static inline void l2pde_propagate_from_guest(
  16.602 +    struct domain *d, unsigned long *gpde_p, unsigned long *spde_p)
  16.603 +{
  16.604 +    unsigned long gpde = *gpde_p, sl1mfn;
  16.605 +
  16.606 +    sl1mfn =  __shadow_status(d, gpde >> PAGE_SHIFT, PGT_l1_shadow);
  16.607 +    l2pde_general(d, gpde_p, spde_p, sl1mfn);
  16.608 +}
  16.609 +    
  16.610 +/************************************************************************/
  16.611 +
  16.612 +// returns true if a tlb flush is needed
  16.613 +//
  16.614 +static int inline
  16.615 +validate_pte_change(
  16.616 +    struct domain *d,
  16.617 +    unsigned long new_pte,
  16.618 +    unsigned long *shadow_pte_p)
  16.619 +{
  16.620 +    unsigned long old_spte, new_spte;
  16.621 +
  16.622 +    perfc_incrc(validate_pte_change);
  16.623 +
  16.624 +#if 0
  16.625 +    FSH_LOG("validate_pte(old=%p new=%p)\n", old_pte, new_pte);
  16.626 +#endif
  16.627 +
  16.628 +    old_spte = *shadow_pte_p;
  16.629 +    l1pte_propagate_from_guest(d, new_pte, shadow_pte_p);
  16.630 +    new_spte = *shadow_pte_p;
  16.631 +
  16.632 +    // only do the ref counting if something important changed.
  16.633 +    //
  16.634 +    if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) )
  16.635 +    {
  16.636 +        if ( new_spte & _PAGE_PRESENT )
  16.637 +            get_page_from_l1e(mk_l1_pgentry(new_spte), d);
  16.638 +        if ( old_spte & _PAGE_PRESENT )
  16.639 +            put_page_from_l1e(mk_l1_pgentry(old_spte), d);
  16.640 +    }
  16.641 +
  16.642 +    // paranoia rules!
  16.643 +    return 1;
  16.644 +}
  16.645 +
  16.646 +// returns true if a tlb flush is needed
  16.647 +//
  16.648 +static int inline
  16.649 +validate_pde_change(
  16.650 +    struct domain *d,
  16.651 +    unsigned long new_pde,
  16.652 +    unsigned long *shadow_pde_p)
  16.653 +{
  16.654 +    unsigned long old_spde = *shadow_pde_p;
  16.655 +    unsigned long new_spde;
  16.656 +
  16.657 +    perfc_incrc(validate_pde_change);
  16.658 +
  16.659 +    l2pde_propagate_from_guest(d, &new_pde, shadow_pde_p);
  16.660 +    new_spde = *shadow_pde_p;
  16.661 +
  16.662 +    // only do the ref counting if something important changed.
  16.663 +    //
  16.664 +    if ( (old_spde ^ new_spde) & (PAGE_MASK | _PAGE_PRESENT) )
  16.665 +    {
  16.666 +        if ( new_spde & _PAGE_PRESENT )
  16.667 +            get_shadow_ref(new_spde >> PAGE_SHIFT);
  16.668 +        if ( old_spde & _PAGE_PRESENT )
  16.669 +            put_shadow_ref(old_spde >> PAGE_SHIFT);
  16.670 +    }
  16.671 +
  16.672 +    // paranoia rules!
  16.673 +    return 1;
  16.674 +}
  16.675 +
  16.676  /*********************************************************************/
  16.677  
  16.678  #if SHADOW_HASH_DEBUG
  16.679 +
  16.680  static void shadow_audit(struct domain *d, int print)
  16.681  {
  16.682      int live = 0, free = 0, j = 0, abs;
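
validate_pte_change/validate_pde_change only touch reference counts when bits that matter actually change; the test is a mask applied to the XOR of the old and new entries. A tiny standalone illustration -- the flag values are the conventional x86 PTE bits, included here only to make the example compile.

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_MASK      (~0xFFFUL)
    #define _PAGE_PRESENT  0x001UL
    #define _PAGE_RW       0x002UL
    #define _PAGE_ACCESSED 0x020UL
    #define _PAGE_DIRTY    0x040UL

    /* True when a PTE update changes the frame, writability or presence --
     * the cases where get_page_from_l1e/put_page_from_l1e must run. */
    static bool refcount_needed(unsigned long old_spte, unsigned long new_spte)
    {
        return ((old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT)) != 0;
    }

    int main(void)
    {
        unsigned long spte = (0x1234UL << 12) | _PAGE_PRESENT | _PAGE_RW;

        /* Flipping accessed/dirty alone does not require refcounting... */
        printf("%d\n", refcount_needed(spte, spte | _PAGE_ACCESSED | _PAGE_DIRTY)); /* 0 */
        /* ...but dropping RW, clearing present, or remapping the frame does. */
        printf("%d\n", refcount_needed(spte, spte & ~_PAGE_RW));                    /* 1 */
        printf("%d\n", refcount_needed(spte, 0));                                   /* 1 */
        return 0;
    }
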
  16.683 @@ -350,26 +625,25 @@ static void shadow_audit(struct domain *
  16.684      for ( j = 0; j < shadow_ht_buckets; j++ )
  16.685      {
  16.686          a = &d->arch.shadow_ht[j];        
  16.687 -        if ( a->pfn )
  16.688 +        if ( a->gpfn_and_flags )
  16.689          {
  16.690              live++;
  16.691 -            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
  16.692 +            ASSERT(a->smfn);
  16.693          }
  16.694          else
  16.695              ASSERT(!a->next);
  16.696 -        ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL);
  16.697 +
  16.698          a = a->next;
  16.699          while ( a && (live < 9999) )
  16.700          { 
  16.701              live++; 
  16.702 -            if ( (a->pfn == 0) || (a->smfn_and_flags == 0) )
  16.703 +            if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
  16.704              {
  16.705 -                printk("XXX live=%d pfn=%p sp=%p next=%p\n",
  16.706 -                       live, a->pfn, a->smfn_and_flags, a->next);
  16.707 +                printk("XXX live=%d gpfn+flags=%p sp=%p next=%p\n",
  16.708 +                       live, a->gpfn_and_flags, a->smfn, a->next);
  16.709                  BUG();
  16.710              }
  16.711 -            ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL);
  16.712 -            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
  16.713 +            ASSERT(a->smfn);
  16.714              a = a->next; 
  16.715          }
  16.716          ASSERT(live < 9999);
  16.717 @@ -379,21 +653,26 @@ static void shadow_audit(struct domain *
  16.718          free++; 
  16.719  
  16.720      if ( print )
  16.721 -        printk("Xlive=%d free=%d\n",live,free);
  16.722 +        printk("Xlive=%d free=%d\n", live, free);
  16.723  
  16.724      // BUG: this only works if there's only a single domain which is
  16.725      //      using shadow tables.
  16.726      //
  16.727 -    abs = ( perfc_value(shadow_l1_pages) +
  16.728 -            perfc_value(shadow_l2_pages) +
  16.729 -            perfc_value(hl2_table_pages) ) - live;
  16.730 +    abs = (
  16.731 +        perfc_value(shadow_l1_pages) +
  16.732 +        perfc_value(shadow_l2_pages) +
  16.733 +        perfc_value(hl2_table_pages) +
  16.734 +        perfc_value(snapshot_pages)
  16.735 +        ) - live;
  16.736  #ifdef PERF_COUNTERS
  16.737      if ( (abs < -1) || (abs > 1) )
  16.738      {
  16.739 -        printk("live=%d free=%d l1=%d l2=%d hl2=%d\n", live, free,
  16.740 +        printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d\n",
  16.741 +               live, free,
  16.742                 perfc_value(shadow_l1_pages),
  16.743                 perfc_value(shadow_l2_pages),
  16.744 -               perfc_value(hl2_table_pages));
  16.745 +               perfc_value(hl2_table_pages),
  16.746 +               perfc_value(snapshot_pages));
  16.747          BUG();
  16.748      }
  16.749  #endif
  16.750 @@ -414,30 +693,36 @@ static inline struct shadow_status *hash
  16.751   * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
  16.752   *      which, depending on full shadow mode, may or may not equal
  16.753   *      its mfn).
  16.754 - *      The shadow status it returns is a mfn.
  16.755 + *      It returns the shadow's mfn, or zero if it doesn't exist.
  16.756   */
  16.757 +
  16.758  static inline unsigned long __shadow_status(
  16.759 -    struct domain *d, unsigned int gpfn)
  16.760 +    struct domain *d, unsigned long gpfn, unsigned long stype)
  16.761  {
  16.762      struct shadow_status *p, *x, *head;
  16.763 +    unsigned long key = gpfn | stype;
  16.764  
  16.765      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  16.766 +    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
  16.767 +    ASSERT(stype && !(stype & ~PGT_type_mask));
  16.768 +
  16.769 +    perfc_incrc(shadow_status_calls);
  16.770  
  16.771      x = head = hash_bucket(d, gpfn);
  16.772      p = NULL;
  16.773  
  16.774 -    //SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x);
  16.775 +    //SH_VVLOG("lookup gpfn=%08x type=%08x bucket=%p", gpfn, stype, x);
  16.776      shadow_audit(d, 0);
  16.777  
  16.778      do
  16.779      {
  16.780 -        ASSERT(x->pfn || ((x == head) && (x->next == NULL)));
  16.781 +        ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
  16.782  
  16.783 -        if ( x->pfn == gpfn )
  16.784 +        if ( x->gpfn_and_flags == key )
  16.785          {
  16.786  #if SHADOW_DEBUG
  16.787              if ( unlikely(shadow_status_noswap) )
  16.788 -                return x->smfn_and_flags;
  16.789 +                return x->smfn;
  16.790  #endif
  16.791              /* Pull-to-front if 'x' isn't already the head item. */
  16.792              if ( unlikely(x != head) )
  16.793 @@ -448,13 +733,16 @@ static inline unsigned long __shadow_sta
  16.794                  head->next = x;
  16.795  
  16.796                  /* Swap 'x' contents with head contents. */
  16.797 -                SWAP(head->pfn, x->pfn);
  16.798 -                SWAP(head->smfn_and_flags, x->smfn_and_flags);
  16.799 +                SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
  16.800 +                SWAP(head->smfn, x->smfn);
  16.801 +            }
  16.802 +            else
  16.803 +            {
  16.804 +                perfc_incrc(shadow_status_hit_head);
  16.805              }
  16.806  
  16.807 -            SH_VVLOG("lookup gpfn=%p => status=%p",
  16.808 -                     gpfn, head->smfn_and_flags);
  16.809 -            return head->smfn_and_flags;
  16.810 +            SH_VVLOG("lookup gpfn=%p => status=%p", key, head->smfn);
  16.811 +            return head->smfn;
  16.812          }
  16.813  
  16.814          p = x;
  16.815 @@ -462,17 +750,68 @@ static inline unsigned long __shadow_sta
  16.816      }
  16.817      while ( x != NULL );
  16.818  
  16.819 -    SH_VVLOG("lookup gpfn=%p => status=0", gpfn);
  16.820 +    SH_VVLOG("lookup gpfn=%p => status=0", key);
  16.821 +    perfc_incrc(shadow_status_miss);
  16.822      return 0;
  16.823  }
  16.824  
  16.825  /*
   16.826 + * Not clear if pull-to-front is worthwhile for this or not,
  16.827 + * as it generally needs to scan the entire bucket anyway.
  16.828 + * Much simpler without.
  16.829 + *
  16.830 + * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
  16.831 + */
  16.832 +static inline unsigned long
  16.833 +shadow_max_pgtable_type(struct domain *d, unsigned long gpfn)
  16.834 +{
  16.835 +    struct shadow_status *x;
  16.836 +    unsigned long pttype = PGT_none, type;
  16.837 +
  16.838 +    ASSERT(spin_is_locked(&d->arch.shadow_lock));
  16.839 +    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
  16.840 +
  16.841 +    x = hash_bucket(d, gpfn);
  16.842 +
  16.843 +    while ( x && x->gpfn_and_flags )
  16.844 +    {
  16.845 +        if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
  16.846 +        {
  16.847 +            type = x->gpfn_and_flags & PGT_type_mask;
  16.848 +
  16.849 +            // Treat an HL2 as if it's an L1
  16.850 +            //
  16.851 +            if ( type == PGT_hl2_shadow )
  16.852 +                type = PGT_l1_shadow;
  16.853 +
  16.854 +            // Ignore snapshots -- they don't in and of themselves constitute
  16.855 +            // treating a page as a page table
  16.856 +            //
  16.857 +            if ( type == PGT_snapshot )
  16.858 +                goto next;
  16.859 +
  16.860 +            // Early exit if we found the max possible value
  16.861 +            //
  16.862 +            if ( type == PGT_base_page_table )
  16.863 +                return type;
  16.864 +
  16.865 +            if ( type > pttype )
  16.866 +                pttype = type;
  16.867 +        }
  16.868 +    next:
  16.869 +        x = x->next;
  16.870 +    }
  16.871 +
  16.872 +    return pttype;
  16.873 +}
  16.874 +
  16.875 +/*
  16.876   * N.B. We can make this locking more fine grained (e.g., per shadow page) if
  16.877   * it ever becomes a problem, but since we need a spin lock on the hash table 
  16.878   * anyway it's probably not worth being too clever.
  16.879   */
  16.880  static inline unsigned long get_shadow_status(
  16.881 -    struct domain *d, unsigned int gpfn )
  16.882 +    struct domain *d, unsigned long gpfn, unsigned long stype)
  16.883  {
  16.884      unsigned long res;
  16.885  
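
The hash key used by __shadow_status and friends is simply gpfn | stype: the guest frame number is constrained to the low 21 bits (PGT_mfn_mask) and the shadow type lives in bits 29-31 (PGT_type_mask), so the two fields cannot collide and one bucket can hold, say, an L1 shadow, an HL2 shadow and a snapshot of the same guest frame side by side. A quick standalone check of that packing, reusing the masks from this changeset:

    #include <assert.h>
    #include <stdio.h>

    #define PGT_mfn_mask   ((1UL << 21) - 1)
    #define PGT_type_mask  (7UL << 29)
    #define PGT_hl2_shadow (5UL << 29)

    int main(void)
    {
        unsigned long gpfn  = 0x1ABCD;           /* must satisfy gpfn == (gpfn & PGT_mfn_mask) */
        unsigned long stype = PGT_hl2_shadow;

        assert((gpfn & ~PGT_mfn_mask) == 0);
        assert((stype & ~PGT_type_mask) == 0);

        unsigned long key = gpfn | stype;        /* what gpfn_and_flags stores */

        printf("gpfn=%#lx type=%#lx\n", key & PGT_mfn_mask, key & PGT_type_mask);
        assert((key & PGT_mfn_mask) == gpfn && (key & PGT_type_mask) == stype);
        return 0;
    }
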
  16.886 @@ -484,65 +823,66 @@ static inline unsigned long get_shadow_s
  16.887       * has changed type. If we're in log dirty mode, we should set the
  16.888       * appropriate bit in the dirty bitmap.
  16.889       * N.B. The VA update path doesn't use this and is handled independently. 
  16.890 -
  16.891 -     XXX need to think this through for vmx guests, but probably OK
  16.892 +     *
  16.893 +     * XXX need to think this through for vmx guests, but probably OK
  16.894       */
  16.895  
  16.896      shadow_lock(d);
  16.897  
  16.898      if ( shadow_mode_log_dirty(d) )
  16.899 -        __mark_dirty(d, gpfn);
  16.900 +        __mark_dirty(d, __gpfn_to_mfn(d, gpfn));
  16.901  
  16.902 -    if ( !(res = __shadow_status(d, gpfn)) )
  16.903 +    if ( !(res = __shadow_status(d, gpfn, stype)) )
  16.904          shadow_unlock(d);
  16.905  
  16.906      return res;
  16.907  }
  16.908  
  16.909  
  16.910 -static inline void put_shadow_status(
  16.911 -    struct domain *d)
  16.912 +static inline void put_shadow_status(struct domain *d)
  16.913  {
  16.914      shadow_unlock(d);
  16.915  }
  16.916  
  16.917  
  16.918  static inline void delete_shadow_status( 
  16.919 -    struct domain *d, unsigned int gpfn)
  16.920 +    struct domain *d, unsigned int gpfn, unsigned int stype)
  16.921  {
  16.922      struct shadow_status *p, *x, *n, *head;
  16.923 +    unsigned long key = gpfn | stype;
  16.924  
  16.925      ASSERT(spin_is_locked(&d->arch.shadow_lock));
  16.926 -    ASSERT(gpfn != 0);
  16.927 +    ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask));
  16.928 +    ASSERT(stype && !(stype & ~PGT_type_mask));
  16.929  
  16.930      head = hash_bucket(d, gpfn);
  16.931  
  16.932 -    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head);
  16.933 +    SH_VLOG("delete gpfn=%p t=%p bucket=%p", gpfn, stype, head);
  16.934      shadow_audit(d, 0);
  16.935  
  16.936      /* Match on head item? */
  16.937 -    if ( head->pfn == gpfn )
  16.938 +    if ( head->gpfn_and_flags == key )
  16.939      {
  16.940          if ( (n = head->next) != NULL )
  16.941          {
  16.942              /* Overwrite head with contents of following node. */
  16.943 -            head->pfn            = n->pfn;
  16.944 -            head->smfn_and_flags = n->smfn_and_flags;
  16.945 +            head->gpfn_and_flags = n->gpfn_and_flags;
  16.946 +            head->smfn           = n->smfn;
  16.947  
  16.948              /* Delete following node. */
  16.949              head->next           = n->next;
  16.950  
  16.951              /* Add deleted node to the free list. */
  16.952 -            n->pfn            = 0;
  16.953 -            n->smfn_and_flags = 0;
  16.954 +            n->gpfn_and_flags = 0;
  16.955 +            n->smfn           = 0;
  16.956              n->next           = d->arch.shadow_ht_free;
  16.957              d->arch.shadow_ht_free = n;
  16.958          }
  16.959          else
  16.960          {
  16.961              /* This bucket is now empty. Initialise the head node. */
  16.962 -            head->pfn            = 0;
  16.963 -            head->smfn_and_flags = 0;
  16.964 +            head->gpfn_and_flags = 0;
  16.965 +            head->smfn           = 0;
  16.966          }
  16.967  
  16.968          goto found;
  16.969 @@ -553,14 +893,14 @@ static inline void delete_shadow_status(
  16.970  
  16.971      do
  16.972      {
  16.973 -        if ( x->pfn == gpfn )
  16.974 +        if ( x->gpfn_and_flags == key )
  16.975          {
  16.976              /* Delete matching node. */
  16.977              p->next = x->next;
  16.978  
  16.979              /* Add deleted node to the free list. */
  16.980 -            x->pfn            = 0;
  16.981 -            x->smfn_and_flags = 0;
  16.982 +            x->gpfn_and_flags = 0;
  16.983 +            x->smfn           = 0;
  16.984              x->next           = d->arch.shadow_ht_free;
  16.985              d->arch.shadow_ht_free = x;
  16.986  
  16.987 @@ -576,34 +916,46 @@ static inline void delete_shadow_status(
  16.988      BUG();
  16.989  
  16.990   found:
  16.991 +    // release ref to page
  16.992 +    put_page(pfn_to_page(__gpfn_to_mfn(d, gpfn)));
  16.993 +
  16.994      shadow_audit(d, 0);
  16.995  }
  16.996  
  16.997 -
  16.998  static inline void set_shadow_status(
  16.999 -    struct domain *d, unsigned int gpfn, unsigned long s)
 16.1000 +    struct domain *d, unsigned long gpfn,
 16.1001 +    unsigned long smfn, unsigned long stype)
 16.1002  {
 16.1003      struct shadow_status *x, *head, *extra;
 16.1004      int i;
 16.1005 +    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
 16.1006 +    unsigned long key = gpfn | stype;
 16.1007  
 16.1008      ASSERT(spin_is_locked(&d->arch.shadow_lock));
 16.1009 -    ASSERT(gpfn != 0);
 16.1010 -    ASSERT(s & (PSH_shadowed | PSH_hl2));
 16.1011 +    ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask));
 16.1012 +    ASSERT(pfn_is_ram(gmfn)); // XXX need to be more graceful
 16.1013 +    ASSERT(smfn && !(smfn & ~PGT_mfn_mask));
 16.1014 +    ASSERT(stype && !(stype & ~PGT_type_mask));
 16.1015  
 16.1016      x = head = hash_bucket(d, gpfn);
 16.1017     
 16.1018 -    SH_VVLOG("set gpfn=%08x s=%p bucket=%p(%p)", gpfn, s, x, x->next);
 16.1019 +    SH_VLOG("set gpfn=%p smfn=%p t=%p bucket=%p(%p)",
 16.1020 +             gpfn, smfn, stype, x, x->next);
 16.1021      shadow_audit(d, 0);
 16.1022  
 16.1023 +    // grab a reference to the guest page to represent the entry in the shadow
 16.1024 +    // hash table
 16.1025 +    //
 16.1026 +    get_page(pfn_to_page(gmfn), d);
 16.1027 +
 16.1028      /*
 16.1029       * STEP 1. If page is already in the table, update it in place.
 16.1030       */
 16.1031 -
 16.1032      do
 16.1033      {
 16.1034 -        if ( x->pfn == gpfn )
 16.1035 +        if ( x->gpfn_and_flags == key )
 16.1036          {
 16.1037 -            x->smfn_and_flags = s;
 16.1038 +            x->smfn = smfn;
 16.1039              goto done;
 16.1040          }
 16.1041  
 16.1042 @@ -616,10 +968,10 @@ static inline void set_shadow_status(
 16.1043       */
 16.1044  
 16.1045      /* If the bucket is empty then insert the new page as the head item. */
 16.1046 -    if ( head->pfn == 0 )
 16.1047 +    if ( head->gpfn_and_flags == 0 )
 16.1048      {
 16.1049 -        head->pfn            = gpfn;
 16.1050 -        head->smfn_and_flags = s;
 16.1051 +        head->gpfn_and_flags = key;
 16.1052 +        head->smfn           = smfn;
 16.1053          ASSERT(head->next == NULL);
 16.1054          goto done;
 16.1055      }
 16.1056 @@ -658,35 +1010,107 @@ static inline void set_shadow_status(
 16.1057      d->arch.shadow_ht_free = x->next;
 16.1058  
 16.1059      /* Initialise the new node and insert directly after the head item. */
 16.1060 -    x->pfn            = gpfn;
 16.1061 -    x->smfn_and_flags = s;
 16.1062 +    x->gpfn_and_flags = key;
 16.1063 +    x->smfn           = smfn;
 16.1064      x->next           = head->next;
 16.1065      head->next        = x;
 16.1066  
 16.1067   done:
 16.1068      shadow_audit(d, 0);
 16.1069  }
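
[Editor's note] The insertion side mirrors the delete: the lookup key now packs the guest pfn and the shadow type into one word (key = gpfn | stype), an existing entry is updated in place, an empty bucket reuses its embedded head node, and otherwise a node is taken from the free list (the path that allocates a fresh "extra" block when the free list is empty falls between the hunks shown here). A small sketch of the key packing, assuming, as the ASSERTs above do, that the pfn bits and the type bits never overlap; the DEMO_* masks are made-up stand-ins for PGT_mfn_mask and PGT_type_mask:

/* Hypothetical mask values for illustration only. */
#define DEMO_MFN_MASK   0x000fffffUL   /* low bits: frame number  */
#define DEMO_TYPE_MASK  0xe0000000UL   /* high bits: shadow type  */

static inline unsigned long demo_make_key(unsigned long gpfn, unsigned long stype)
{
    /* Valid only because the two fields occupy disjoint bits, which is
     * exactly what the ASSERTs in set_shadow_status() check. */
    return gpfn | stype;
}

static inline unsigned long demo_key_to_gpfn(unsigned long key)
{
    return key & DEMO_MFN_MASK;
}

static inline unsigned long demo_key_to_type(unsigned long key)
{
    return key & DEMO_TYPE_MASK;
}

Packing the type into the key is what lets a single hash table hold, say, an L1 shadow and an hl2 for the same guest frame without the entries colliding.
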
 16.1070 -  
 16.1071 +
 16.1072 +/************************************************************************/
 16.1073 +
 16.1074 +extern void shadow_map_l1_into_current_l2(unsigned long va);
 16.1075 +
  16.1076 +static inline void
 16.1077 +shadow_set_l1e(unsigned long va, unsigned long new_spte, int create_l1_shadow)
 16.1078 +{
 16.1079 +    struct exec_domain *ed = current;
 16.1080 +    struct domain *d = ed->domain;
 16.1081 +    unsigned long sl2e, old_spte;
 16.1082 +
 16.1083 +#if 0
 16.1084 +    printk("shadow_set_l1e(va=%p, new_spte=%p, create=%d)\n",
 16.1085 +           va, new_spte, create_l1_shadow);
 16.1086 +#endif
 16.1087 +
 16.1088 +    __shadow_get_l2e(ed, va, &sl2e);
 16.1089 +    if ( !(sl2e & _PAGE_PRESENT) )
 16.1090 +    {
 16.1091 +        /*
 16.1092 +         * Either the L1 is not shadowed, or the shadow isn't linked into
 16.1093 +         * the current shadow L2.
 16.1094 +         */
 16.1095 +        if ( create_l1_shadow )
 16.1096 +        {
 16.1097 +            perfc_incrc(shadow_set_l1e_force_map);
 16.1098 +            shadow_map_l1_into_current_l2(va);
 16.1099 +        }
 16.1100 +        else /* check to see if it exists; if so, link it in */
 16.1101 +        {
 16.1102 +            unsigned long gpde =
 16.1103 +                l2_pgentry_val(linear_l2_table(ed)[l2_table_offset(va)]);
 16.1104 +            unsigned long gl1pfn = gpde >> PAGE_SHIFT;
 16.1105 +            unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
 16.1106 +
 16.1107 +            ASSERT( gpde & _PAGE_PRESENT );
 16.1108 +
 16.1109 +            if ( sl1mfn )
 16.1110 +            {
 16.1111 +                perfc_incrc(shadow_set_l1e_unlinked);
 16.1112 +                get_shadow_ref(sl1mfn);
 16.1113 +                l2pde_general(d, &gpde, &sl2e, sl1mfn);
 16.1114 +                __guest_set_l2e(ed, va, gpde);
 16.1115 +                __shadow_set_l2e(ed, va, sl2e);
 16.1116 +            }
 16.1117 +            else
 16.1118 +            {
 16.1119 +                // no shadow exists, so there's nothing to do.
 16.1120 +                perfc_incrc(shadow_set_l1e_fail);
 16.1121 +                return;
 16.1122 +            }
 16.1123 +        }
 16.1124 +    }
 16.1125 +
 16.1126 +    old_spte = l1_pgentry_val(shadow_linear_pg_table[l1_linear_offset(va)]);
 16.1127 +    shadow_linear_pg_table[l1_linear_offset(va)] = mk_l1_pgentry(new_spte);
 16.1128 +
 16.1129 +    // only do the ref counting if something important changed.
 16.1130 +    //
 16.1131 +    if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) )
 16.1132 +    {
 16.1133 +        if ( new_spte & _PAGE_PRESENT )
 16.1134 +            get_page_from_l1e(mk_l1_pgentry(new_spte), d);
 16.1135 +        if ( old_spte & _PAGE_PRESENT )
 16.1136 +            put_page_from_l1e(mk_l1_pgentry(old_spte), d);
 16.1137 +    }
 16.1138 +}
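
[Editor's note] The tail of shadow_set_l1e() only adjusts reference counts when something the refcounting cares about actually changed: the XOR of old and new PTE is masked with PAGE_MASK | _PAGE_RW | _PAGE_PRESENT, so accessed/dirty updates leave the counts alone. A self-contained sketch of that pattern, with hypothetical DEMO_* flag values and printf stubs in place of get_page_from_l1e()/put_page_from_l1e():

#include <stdio.h>

#define DEMO_PAGE_MASK   (~0xfffUL)   /* frame-number bits                 */
#define DEMO_PRESENT     0x001UL
#define DEMO_RW          0x002UL
#define DEMO_ACCESSED    0x020UL      /* changes to this bit are ignored   */

/* Stand-ins for get_page_from_l1e()/put_page_from_l1e(). */
static void demo_get_ref(unsigned long pte) { printf("get ref on %#lx\n", pte & DEMO_PAGE_MASK); }
static void demo_put_ref(unsigned long pte) { printf("put ref on %#lx\n", pte & DEMO_PAGE_MASK); }

static void demo_set_spte(unsigned long *slot, unsigned long new_spte)
{
    unsigned long old_spte = *slot;
    *slot = new_spte;

    /* Only touch the refcounts when frame, writability or presence changed. */
    if ( (old_spte ^ new_spte) & (DEMO_PAGE_MASK | DEMO_RW | DEMO_PRESENT) )
    {
        if ( new_spte & DEMO_PRESENT )
            demo_get_ref(new_spte);
        if ( old_spte & DEMO_PRESENT )
            demo_put_ref(old_spte);
    }
}

int main(void)
{
    unsigned long slot = 0x1000UL | DEMO_PRESENT | DEMO_RW;

    demo_set_spte(&slot, 0x1000UL | DEMO_PRESENT | DEMO_RW | DEMO_ACCESSED); /* no ref traffic  */
    demo_set_spte(&slot, 0x2000UL | DEMO_PRESENT);                           /* get new, put old */
    return 0;
}
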
 16.1139 +
 16.1140 +/************************************************************************/
 16.1141 +
 16.1142  static inline unsigned long gva_to_gpte(unsigned long gva)
 16.1143  {
 16.1144 -    unsigned long gpde, gpte, pfn, index;
 16.1145 +    unsigned long gpde, gpte;
 16.1146      struct exec_domain *ed = current;
 16.1147  
 16.1148 +    ASSERT( shadow_mode_translate(current->domain) );
 16.1149 +
 16.1150      __guest_get_l2e(ed, gva, &gpde);
 16.1151 -    if (!(gpde & _PAGE_PRESENT))
 16.1152 +    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
 16.1153          return 0;
 16.1154  
 16.1155 -    index = l2_table_offset(gva);
 16.1156 -
 16.1157 -    if (!l2_pgentry_val(ed->arch.hl2_vtable[index])) {
 16.1158 -        pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT);
 16.1159 -        ed->arch.hl2_vtable[index] = 
 16.1160 -            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 16.1161 -    }
 16.1162 +    // This is actually overkill - we only need to make sure the hl2
 16.1163 +    // is in-sync.
 16.1164 +    //
 16.1165 +    shadow_sync_va(ed, gva);
 16.1166  
 16.1167      if ( unlikely(__get_user(gpte, (unsigned long *)
 16.1168                               &linear_pg_table[gva >> PAGE_SHIFT])) )
 16.1169 +    {
 16.1170 +        FSH_LOG("gva_to_gpte got a fault on gva=%p\n", gva);
 16.1171          return 0;
 16.1172 +    }
 16.1173  
 16.1174      return gpte;
 16.1175  }
 16.1176 @@ -702,94 +1126,19 @@ static inline unsigned long gva_to_gpa(u
 16.1177      return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK); 
 16.1178  }
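
[Editor's note] gva_to_gpte() bails out (returning 0) if the guest PDE is not present or if reading the PTE through the linear page table faults, and relies on shadow_sync_va() to bring the relevant entries up to date first; gva_to_gpa() then just splices the frame address and the page offset back together. A tiny worked example of that last step, with made-up values:

#include <stdio.h>

#define DEMO_PAGE_MASK (~0xfffUL)

int main(void)
{
    unsigned long gpte = 0x5432067UL;   /* frame 0x5432000 | PRESENT|RW|A|D */
    unsigned long gva  = 0x080481a4UL;  /* offset 0x1a4 within its page     */
    unsigned long gpa  = (gpte & DEMO_PAGE_MASK) + (gva & ~DEMO_PAGE_MASK);

    printf("gpa = %#lx\n", gpa);        /* prints 0x54321a4 */
    return 0;
}
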
 16.1179  
 16.1180 -static inline void hl2_table_invalidate(struct exec_domain *ed)
 16.1181 -{
 16.1182 -    /*
 16.1183 -     * Need to optimize this
 16.1184 -     */
 16.1185 -    memset(ed->arch.hl2_vtable, 0, PAGE_SIZE);
 16.1186 -}
 16.1187 -
 16.1188 -static inline void __update_pagetables(struct exec_domain *ed)
 16.1189 -{
 16.1190 -    struct domain *d = ed->domain;
 16.1191 -    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
 16.1192 -    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
 16.1193 -    unsigned long smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
 16.1194 -
 16.1195 -    SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
 16.1196 -
 16.1197 -    if ( unlikely(smfn == 0) )
 16.1198 -        smfn = shadow_l2_table(d, gmfn);
 16.1199 -
 16.1200 -    ed->arch.shadow_table = mk_pagetable(smfn<<PAGE_SHIFT);
 16.1201 -
 16.1202 -    if ( shadow_mode_translate(d) )
 16.1203 -    {
 16.1204 -        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
 16.1205 -        l2_pgentry_t *gpl2e, *spl2e;
 16.1206 -        unsigned long hl2_status, hl2mfn, offset;
 16.1207 -        int need_flush = 0;
 16.1208 -
 16.1209 -        if ( ed->arch.guest_vtable )
 16.1210 -            unmap_domain_mem(ed->arch.guest_vtable);
 16.1211 -        if ( ed->arch.shadow_vtable )
 16.1212 -            unmap_domain_mem(ed->arch.shadow_vtable);
 16.1213 -        if ( ed->arch.hl2_vtable )
 16.1214 -            unmap_domain_mem(ed->arch.hl2_vtable);
 16.1215 +/************************************************************************/
 16.1216  
 16.1217 -        gpl2e = ed->arch.guest_vtable =
 16.1218 -            map_domain_mem(pagetable_val(ed->arch.guest_table));
 16.1219 -        spl2e = ed->arch.shadow_vtable =
 16.1220 -            map_domain_mem(pagetable_val(ed->arch.shadow_table));
 16.1221 -
 16.1222 -        hl2_status = __shadow_status(d, gpfn | PSH_hl2);
 16.1223 -        if ( unlikely(!(hl2_status & PSH_hl2)) )
 16.1224 -            hl2_status = mk_hl2_table(ed);
 16.1225 -
 16.1226 -        hl2mfn = hl2_status & PSH_pfn_mask;
 16.1227 -        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
 16.1228 -
 16.1229 -        offset = l2_table_offset(LINEAR_PT_VIRT_START);
 16.1230 -        if ( hl2mfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) )
 16.1231 -        {
 16.1232 -            mpl2e[offset] =
 16.1233 -                mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 16.1234 -            need_flush = 1;
 16.1235 -        }
 16.1236 -
 16.1237 -        if ( shadow_mode_external(d ) )
 16.1238 -        {
 16.1239 -            offset = l2_table_offset(SH_LINEAR_PT_VIRT_START);
 16.1240 -            if ( smfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) )
 16.1241 -            {
 16.1242 -                mpl2e[offset] =
 16.1243 -                    mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 16.1244 -                need_flush = 1;
 16.1245 -            }
 16.1246 -        }
 16.1247 -
 16.1248 -        if ( VMX_DOMAIN(ed) )
 16.1249 -        {
 16.1250 -            // Why is VMX mode doing this?
 16.1251 -            shadow_invalidate(ed);
 16.1252 -            hl2_table_invalidate(ed);
 16.1253 -        }
 16.1254 -
 16.1255 -        if ( need_flush )
 16.1256 -            local_flush_tlb();
 16.1257 -    }
 16.1258 -}
 16.1259 -
 16.1260 +extern void __update_pagetables(struct exec_domain *ed);
 16.1261  static inline void update_pagetables(struct exec_domain *ed)
 16.1262  {
 16.1263      struct domain *d = ed->domain;
 16.1264 +
 16.1265 +#ifdef CONFIG_VMX
 16.1266      int paging_enabled =
 16.1267 -#ifdef CONFIG_VMX
 16.1268          !VMX_DOMAIN(ed) ||
 16.1269          test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state);
 16.1270  #else
 16.1271 -        1;
 16.1272 +    const int paging_enabled = 1;
 16.1273  #endif
 16.1274  
 16.1275      /*
 16.1276 @@ -805,12 +1154,8 @@ static inline void update_pagetables(str
 16.1277          shadow_unlock(d);
 16.1278      }
 16.1279  
 16.1280 -    if ( !shadow_mode_external(d) )
 16.1281 +    if ( likely(!shadow_mode_external(d)) )
 16.1282      {
 16.1283 -        /*
 16.1284 -         * Internal page tables:
 16.1285 -         * No need to allocate a separate page table for Xen.
 16.1286 -         */
 16.1287  #ifdef __x86_64__
 16.1288          if ( !(ed->arch.flags & TF_kernel_mode) )
 16.1289              ed->arch.monitor_table = ed->arch.guest_table_user;
 16.1290 @@ -821,27 +1166,17 @@ static inline void update_pagetables(str
 16.1291          else
 16.1292              ed->arch.monitor_table = ed->arch.guest_table;
 16.1293      }
 16.1294 -    else
 16.1295 -    {
 16.1296 -        /*
 16.1297 -         * External page tables:
 16.1298 -         * Allocate a monitor page table if we don't already have one.
 16.1299 -         */
 16.1300 -        if ( unlikely(!pagetable_val(ed->arch.monitor_table)) )
 16.1301 -            ed->arch.monitor_table =
 16.1302 -                mk_pagetable(alloc_monitor_pagetable(ed) << PAGE_SHIFT);
 16.1303 -    }
 16.1304  }
 16.1305  
 16.1306  #if SHADOW_DEBUG
 16.1307 -extern int _check_pagetable(struct domain *d, pagetable_t pt, char *s);
 16.1308 -extern int _check_all_pagetables(struct domain *d, char *s);
 16.1309 +extern int _check_pagetable(struct exec_domain *ed, char *s);
 16.1310 +extern int _check_all_pagetables(struct exec_domain *ed, char *s);
 16.1311  
 16.1312 -#define check_pagetable(_d, _pt, _s) _check_pagetable(_d, _pt, _s)
 16.1313 -//#define check_pagetable(_d, _pt, _s) _check_all_pagetables(_d, _s)
 16.1314 +#define check_pagetable(_ed, _s) _check_pagetable(_ed, _s)
 16.1315 +//#define check_pagetable(_ed, _s) _check_all_pagetables(_ed, _s)
 16.1316  
 16.1317  #else
 16.1318 -#define check_pagetable(_d, _pt, _s) ((void)0)
 16.1319 +#define check_pagetable(_ed, _s) ((void)0)
 16.1320  #endif
 16.1321  
 16.1322  #endif /* XEN_SHADOW_H */
    17.1 --- a/xen/include/asm-x86/x86_32/page.h	Tue Mar 15 15:53:52 2005 +0000
    17.2 +++ b/xen/include/asm-x86/x86_32/page.h	Wed Mar 16 17:30:37 2005 +0000
    17.3 @@ -68,7 +68,7 @@ typedef l2_pgentry_t root_pgentry_t;
    17.4  #define L1_DISALLOW_MASK (3UL << 7)
    17.5  #define L2_DISALLOW_MASK (7UL << 7)
    17.6  #define L3_DISALLOW_MASK (7UL << 7)
    17.7 -#define L2_DISALLOW_MASK (7UL << 7)
    17.8 +#define L4_DISALLOW_MASK (7UL << 7)
    17.9  
   17.10  #endif /* __X86_32_PAGE_H__ */
   17.11  
    18.1 --- a/xen/include/xen/domain.h	Tue Mar 15 15:53:52 2005 +0000
    18.2 +++ b/xen/include/xen/domain.h	Wed Mar 16 17:30:37 2005 +0000
    18.3 @@ -27,6 +27,4 @@ extern void domain_relinquish_memory(str
    18.4  
    18.5  extern void dump_pageframe_info(struct domain *d);
    18.6  
    18.7 -extern unsigned long alloc_monitor_pagetable(struct exec_domain *ed);
    18.8 -
    18.9  #endif /* __XEN_DOMAIN_H__ */
    19.1 --- a/xen/include/xen/perfc_defn.h	Tue Mar 15 15:53:52 2005 +0000
    19.2 +++ b/xen/include/xen/perfc_defn.h	Wed Mar 16 17:30:37 2005 +0000
    19.3 @@ -48,3 +48,33 @@ PERFCOUNTER_ARRAY( exceptions, "exceptio
    19.4  #define VMX_PERF_VECTOR_SIZE 0x20
    19.5  PERFCOUNTER_ARRAY( vmexits, "vmexits", VMX_PERF_EXIT_REASON_SIZE )
    19.6  PERFCOUNTER_ARRAY( cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE )
    19.7 +
    19.8 +
    19.9 +PERFCOUNTER_CPU( shadow_hl2_table_count,   "shadow_hl2_table count" )
   19.10 +PERFCOUNTER_CPU( shadow_set_l1e_force_map, "shadow_set_l1e forced to map l1" )
   19.11 +PERFCOUNTER_CPU( shadow_set_l1e_unlinked,  "shadow_set_l1e found unlinked l1" )
   19.12 +PERFCOUNTER_CPU( shadow_set_l1e_fail,      "shadow_set_l1e failed (no sl1)" )
   19.13 +PERFCOUNTER_CPU( shadow_invlpg_faults,     "shadow_invlpg's get_user faulted")
   19.14 +
   19.15 +
   19.16 +/* STATUS counters do not reset when 'P' is hit */
   19.17 +PERFSTATUS( snapshot_pages,  "current # fshadow snapshot pages" )
   19.18 +
   19.19 +PERFCOUNTER_CPU(shadow_status_calls,    "calls to __shadow_status" )
   19.20 +PERFCOUNTER_CPU(shadow_status_miss,     "missed shadow cache" )
   19.21 +PERFCOUNTER_CPU(shadow_status_hit_head, "hits on head of bucket" )
   19.22 +
   19.23 +PERFCOUNTER_CPU(shadow_sync_all,                   "calls to shadow_sync_all")
   19.24 +PERFCOUNTER_CPU(shadow_make_snapshot,              "snapshots created")
   19.25 +PERFCOUNTER_CPU(shadow_mark_mfn_out_of_sync_calls, "calls to shadow_mk_out_of_sync")
   19.26 +PERFCOUNTER_CPU(shadow_out_of_sync_calls,          "calls to shadow_out_of_sync")
   19.27 +PERFCOUNTER_CPU(snapshot_entry_matches_calls,      "calls to ss_entry_matches")
   19.28 +PERFCOUNTER_CPU(snapshot_entry_matches_true,       "ss_entry_matches returns true")
   19.29 +
   19.30 +PERFCOUNTER_CPU(shadow_fault_calls,                "calls to shadow_fault")
   19.31 +PERFCOUNTER_CPU(shadow_fault_bail_pde_not_present, "sf bailed due to pde not present")
   19.32 +PERFCOUNTER_CPU(shadow_fault_bail_pte_not_present, "sf bailed due to pte not present")
   19.33 +PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping,      "sf bailed due to a ro mapping")
   19.34 +PERFCOUNTER_CPU(shadow_fault_fixed,                "sf fixed the pgfault")
   19.35 +PERFCOUNTER_CPU(validate_pte_change,               "calls to validate_pte_change")
   19.36 +PERFCOUNTER_CPU(validate_pde_change,               "calls to validate_pde_change")
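
[Editor's note] These additions follow the conventions of perfc_defn.h: PERFCOUNTER_CPU declares a per-CPU event counter that the shadow code bumps with perfc_incrc() (as in shadow_set_l1e above), and PERFSTATUS declares a value that is not cleared by a counter reset. As a rough illustration of how such a definition file is typically consumed, here is a simplified x-macro sketch; the DEMO_* names are hypothetical and the real Xen perfc machinery differs in detail:

/* Simplified x-macro illustration only -- not the real Xen macros. */
#include <stdio.h>

#define DEMO_COUNTERS                                                 \
    DEMO_COUNTER(shadow_fault_calls, "calls to shadow_fault")         \
    DEMO_COUNTER(shadow_fault_fixed, "sf fixed the pgfault")

/* First expansion: one integer field per counter. */
#define DEMO_COUNTER(name, desc) unsigned long name;
struct demo_perfc { DEMO_COUNTERS } demo_perfc;
#undef DEMO_COUNTER

#define demo_incrc(x) (demo_perfc.x++)   /* stands in for perfc_incrc() */

int main(void)
{
    demo_incrc(shadow_fault_calls);
    demo_incrc(shadow_fault_fixed);

    /* Second expansion: dump each counter with its description string. */
#define DEMO_COUNTER(name, desc) printf("%-24s %-32s %lu\n", #name, desc, demo_perfc.name);
    DEMO_COUNTERS
#undef DEMO_COUNTER
    return 0;
}
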