ia64/xen-unstable

changeset 1660:55d2aa0544ae

bitkeeper revision 1.1041.1.11 (40e4a214S0nMcVOgrtI0299yUPBMXA)

Merge http://xen.bkbits.net/xeno-unstable.bk
into gandalf.hpl.hp.com:/var/bk/djm/xeno-unstable-common.bk
author xenbk@gandalf.hpl.hp.com
date Thu Jul 01 23:45:24 2004 +0000 (2004-07-01)
parents d9a96380ff94 09821113c10b
children 37c17d6bda09 53a47b057466 466bdc820e84
files .rootkeys BitKeeper/etc/logging_ok xen/arch/x86/dom0_ops.c xen/arch/x86/shadow.c xen/common/debug.c xen/common/dom0_ops.c xen/common/domain.c xen/common/kernel.c xen/common/shadow.c xen/include/asm-x86/shadow.h xen/include/xen/shadow.h
line diff
     1.1 --- a/.rootkeys	Thu Jul 01 23:32:40 2004 +0000
     1.2 +++ b/.rootkeys	Thu Jul 01 23:45:24 2004 +0000
     1.3 @@ -284,6 +284,7 @@ 3ddb79bcsjinG9k1KcvbVBuas1R2dA xen/arch/
     1.4  3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/x86/boot/x86_32.S
     1.5  40e42bdbNu4MjI750THP_8J1S-Sa0g xen/arch/x86/boot/x86_64.S
     1.6  3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/x86/delay.c
     1.7 +40e34414WiQO4h2m3tcpaCPn7SyYyg xen/arch/x86/dom0_ops.c
     1.8  3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/domain_page.c
     1.9  3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/entry.S
    1.10  3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/x86/extable.c
    1.11 @@ -305,6 +306,7 @@ 4022a73czgX7d-2zfF_cb33oVemApQ xen/arch/
    1.12  3ddb79bc1_2bAt67x9MFCP4AZrQnvQ xen/arch/x86/process.c
    1.13  3ddb79bc7KxGCEJsgBnkDX7XjD_ZEQ xen/arch/x86/rwlock.c
    1.14  3ddb79bcrD6Z_rUvSDgrvjyb4846Eg xen/arch/x86/setup.c
    1.15 +405b8599xI_PoEr3zZoJ2on-jdn7iw xen/arch/x86/shadow.c
    1.16  3ddb79bcSx2e8JSR3pdSGa8x1ScYzA xen/arch/x86/smp.c
    1.17  3ddb79bcfUN3-UBCPzX26IU8bq-3aw xen/arch/x86/smpboot.c
    1.18  3ddb79bc-Udq7ol-NX4q9XsYnN7A2Q xen/arch/x86/time.c
    1.19 @@ -332,7 +334,6 @@ 4064773cJ31vZt-zhbSoxqft1Jaw0w xen/commo
    1.20  40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
    1.21  40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
    1.22  3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
    1.23 -405b8599xI_PoEr3zZoJ2on-jdn7iw xen/common/shadow.c
    1.24  3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c
    1.25  3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c
    1.26  3e7f358awXBC3Vw-wFRwPw18qL1khg xen/common/string.c
    1.27 @@ -422,6 +423,7 @@ 4022a73diKn2Ax4-R4gzk59lm1YdDg xen/inclu
    1.28  3ddb79c2QF5-pZGzuX4QukPCDAl59A xen/include/asm-x86/processor.h
    1.29  40cf1596bim9F9DNdV75klgRSZ6Y2A xen/include/asm-x86/ptrace.h
    1.30  3ddb79c2plf7ciNgoNjU-RsbUzawsw xen/include/asm-x86/rwlock.h
    1.31 +405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/asm-x86/shadow.h
    1.32  3ddb79c3Hgbb2g8CyWLMCK-6_ZVQSQ xen/include/asm-x86/smp.h
    1.33  3ddb79c3jn8ALV_S9W5aeTYUQRKBpg xen/include/asm-x86/smpboot.h
    1.34  3ddb79c3NiyQE2vQnyGiaBnNjBO1rA xen/include/asm-x86/spinlock.h
    1.35 @@ -479,7 +481,7 @@ 3e4540ccU1sgCx8seIMGlahmMfv7yQ xen/inclu
    1.36  40589969nPq3DMzv24RDb5LXE9brHw xen/include/xen/sched-if.h
    1.37  3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xen/sched.h
    1.38  403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xen/serial.h
    1.39 -405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/xen/shadow.h
    1.40 +40e3392dib7GrcBAu5cT-EUZTYzeEQ xen/include/xen/shadow.h
    1.41  3ddb79c14dXIhP7C2ahnoD08K90G_w xen/include/xen/slab.h
    1.42  3ddb79c09xbS-xxfKxuV3JETIhBzmg xen/include/xen/smp.h
    1.43  3ddb79c1Vi5VleJAOKHAlY0G2zAsgw xen/include/xen/softirq.h
     2.1 --- a/BitKeeper/etc/logging_ok	Thu Jul 01 23:32:40 2004 +0000
     2.2 +++ b/BitKeeper/etc/logging_ok	Thu Jul 01 23:45:24 2004 +0000
     2.3 @@ -9,6 +9,7 @@ bd240@labyrinth.cl.cam.ac.uk
     2.4  br260@br260.wolfson.cam.ac.uk
     2.5  br260@labyrinth.cl.cam.ac.uk
     2.6  br260@laudney.cl.cam.ac.uk
     2.7 +djm@kirby.fc.hp.com
     2.8  gm281@boulderdash.cl.cam.ac.uk
     2.9  iap10@freefall.cl.cam.ac.uk
    2.10  iap10@labyrinth.cl.cam.ac.uk
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/x86/dom0_ops.c	Thu Jul 01 23:45:24 2004 +0000
     3.3 @@ -0,0 +1,223 @@
     3.4 +/******************************************************************************
     3.5 + * Arch-specific dom0_ops.c
     3.6 + * 
     3.7 + * Process command requests from domain-0 guest OS.
     3.8 + * 
     3.9 + * Copyright (c) 2002, K A Fraser
    3.10 + */
    3.11 +
    3.12 +#include <xen/config.h>
    3.13 +#include <xen/types.h>
    3.14 +#include <xen/lib.h>
    3.15 +#include <xen/mm.h>
    3.16 +#include <hypervisor-ifs/dom0_ops.h>
    3.17 +#include <xen/sched.h>
    3.18 +#include <xen/event.h>
    3.19 +#include <asm/domain_page.h>
    3.20 +#include <asm/msr.h>
    3.21 +#include <asm/pdb.h>
    3.22 +#include <xen/trace.h>
    3.23 +#include <xen/console.h>
    3.24 +#include <xen/shadow.h>
    3.25 +#include <hypervisor-ifs/sched_ctl.h>
    3.26 +
    3.27 +#define TRC_DOM0OP_ENTER_BASE  0x00020000
    3.28 +#define TRC_DOM0OP_LEAVE_BASE  0x00030000
    3.29 +
    3.30 +extern unsigned int alloc_new_dom_mem(struct domain *, unsigned int);
    3.31 +
    3.32 +static int msr_cpu_mask;
    3.33 +static unsigned long msr_addr;
    3.34 +static unsigned long msr_lo;
    3.35 +static unsigned long msr_hi;
    3.36 +
    3.37 +static void write_msr_for(void *unused)
    3.38 +{
    3.39 +    if (((1 << current->processor) & msr_cpu_mask))
    3.40 +        wrmsr(msr_addr, msr_lo, msr_hi);
    3.41 +}
    3.42 +
    3.43 +static void read_msr_for(void *unused)
    3.44 +{
    3.45 +    if (((1 << current->processor) & msr_cpu_mask))
    3.46 +        rdmsr(msr_addr, msr_lo, msr_hi);
    3.47 +}
    3.48 +
    3.49 +long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op)
    3.50 +{
    3.51 +    long ret = 0;
    3.52 +
    3.53 +    if ( !IS_PRIV(current) )
    3.54 +        return -EPERM;
    3.55 +
    3.56 +    switch ( op->cmd )
    3.57 +    {
    3.58 +
    3.59 +    case DOM0_MSR:
    3.60 +    {
    3.61 +        if ( op->u.msr.write )
    3.62 +        {
    3.63 +            msr_cpu_mask = op->u.msr.cpu_mask;
    3.64 +            msr_addr = op->u.msr.msr;
    3.65 +            msr_lo = op->u.msr.in1;
    3.66 +            msr_hi = op->u.msr.in2;
    3.67 +            smp_call_function(write_msr_for, NULL, 1, 1);
    3.68 +            write_msr_for(NULL);
    3.69 +        }
    3.70 +        else
    3.71 +        {
    3.72 +            msr_cpu_mask = op->u.msr.cpu_mask;
    3.73 +            msr_addr = op->u.msr.msr;
    3.74 +            smp_call_function(read_msr_for, NULL, 1, 1);
    3.75 +            read_msr_for(NULL);
    3.76 +
    3.77 +            op->u.msr.out1 = msr_lo;
    3.78 +            op->u.msr.out2 = msr_hi;
    3.79 +            copy_to_user(u_dom0_op, op, sizeof(*op));
    3.80 +        }
    3.81 +        ret = 0;
    3.82 +    }
    3.83 +    break;
    3.84 +
    3.85 +    case DOM0_GETDOMAININFO:
    3.86 +    { 
    3.87 +        full_execution_context_t *c;
    3.88 +        struct domain            *d;
    3.89 +        unsigned long             flags;
    3.90 +        int                       i;
    3.91 +
    3.92 +        read_lock_irqsave(&tasklist_lock, flags);
    3.93 +
    3.94 +        for_each_domain ( d )
    3.95 +        {
    3.96 +            if ( d->domain >= op->u.getdomaininfo.domain )
    3.97 +                break;
    3.98 +        }
    3.99 +
   3.100 +        if ( (d == NULL) || !get_domain(d) )
   3.101 +        {
   3.102 +            read_unlock_irqrestore(&tasklist_lock, flags);
   3.103 +            ret = -ESRCH;
   3.104 +            break;
   3.105 +        }
   3.106 +
   3.107 +        read_unlock_irqrestore(&tasklist_lock, flags);
   3.108 +
   3.109 +        op->u.getdomaininfo.domain = d->domain;
   3.110 +        strcpy(op->u.getdomaininfo.name, d->name);
   3.111 +        
   3.112 +        op->u.getdomaininfo.flags =
   3.113 +            (test_bit(DF_DYING,     &d->flags) ? DOMFLAGS_DYING    : 0) |
   3.114 +            (test_bit(DF_CRASHED,   &d->flags) ? DOMFLAGS_CRASHED  : 0) |
   3.115 +            (test_bit(DF_SHUTDOWN,  &d->flags) ? DOMFLAGS_SHUTDOWN : 0) |
   3.116 +            (test_bit(DF_CTRLPAUSE, &d->flags) ? DOMFLAGS_PAUSED   : 0) |
   3.117 +            (test_bit(DF_BLOCKED,   &d->flags) ? DOMFLAGS_BLOCKED  : 0) |
   3.118 +            (test_bit(DF_RUNNING,   &d->flags) ? DOMFLAGS_RUNNING  : 0);
   3.119 +
   3.120 +        op->u.getdomaininfo.flags |= d->processor << DOMFLAGS_CPUSHIFT;
   3.121 +        op->u.getdomaininfo.flags |= 
   3.122 +            d->shutdown_code << DOMFLAGS_SHUTDOWNSHIFT;
   3.123 +
   3.124 +        op->u.getdomaininfo.tot_pages   = d->tot_pages;
   3.125 +        op->u.getdomaininfo.max_pages   = d->max_pages;
   3.126 +        op->u.getdomaininfo.cpu_time    = d->cpu_time;
   3.127 +        op->u.getdomaininfo.shared_info_frame = 
   3.128 +            __pa(d->shared_info) >> PAGE_SHIFT;
   3.129 +
   3.130 +        if ( op->u.getdomaininfo.ctxt != NULL )
   3.131 +        {
   3.132 +            if ( (c = kmalloc(sizeof(*c))) == NULL )
   3.133 +            {
   3.134 +                ret = -ENOMEM;
   3.135 +                put_domain(d);
   3.136 +                break;
   3.137 +            }
   3.138 +
   3.139 +            if ( d != current )
   3.140 +                domain_pause(d);
   3.141 +
   3.142 +            c->flags = 0;
   3.143 +            memcpy(&c->cpu_ctxt, 
   3.144 +                   &d->shared_info->execution_context,
   3.145 +                   sizeof(d->shared_info->execution_context));
   3.146 +            if ( test_bit(DF_DONEFPUINIT, &d->flags) )
   3.147 +                c->flags |= ECF_I387_VALID;
   3.148 +            memcpy(&c->fpu_ctxt,
   3.149 +                   &d->thread.i387,
   3.150 +                   sizeof(d->thread.i387));
   3.151 +            memcpy(&c->trap_ctxt,
   3.152 +                   d->thread.traps,
   3.153 +                   sizeof(d->thread.traps));
   3.154 +#ifdef ARCH_HAS_FAST_TRAP
   3.155 +            if ( (d->thread.fast_trap_desc.a == 0) &&
   3.156 +                 (d->thread.fast_trap_desc.b == 0) )
   3.157 +                c->fast_trap_idx = 0;
   3.158 +            else
   3.159 +                c->fast_trap_idx = 
   3.160 +                    d->thread.fast_trap_idx;
   3.161 +#endif
   3.162 +            c->ldt_base = d->mm.ldt_base;
   3.163 +            c->ldt_ents = d->mm.ldt_ents;
   3.164 +            c->gdt_ents = 0;
   3.165 +            if ( GET_GDT_ADDRESS(d) == GDT_VIRT_START )
   3.166 +            {
   3.167 +                for ( i = 0; i < 16; i++ )
   3.168 +                    c->gdt_frames[i] = 
   3.169 +                        l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
   3.170 +                c->gdt_ents = 
   3.171 +                    (GET_GDT_ENTRIES(d) + 1) >> 3;
   3.172 +            }
   3.173 +            c->guestos_ss  = d->thread.guestos_ss;
   3.174 +            c->guestos_esp = d->thread.guestos_sp;
   3.175 +            c->pt_base   = 
   3.176 +                pagetable_val(d->mm.pagetable);
   3.177 +            memcpy(c->debugreg, 
   3.178 +                   d->thread.debugreg, 
   3.179 +                   sizeof(d->thread.debugreg));
   3.180 +            c->event_callback_cs  =
   3.181 +                d->event_selector;
   3.182 +            c->event_callback_eip =
   3.183 +                d->event_address;
   3.184 +            c->failsafe_callback_cs  = 
   3.185 +                d->failsafe_selector;
   3.186 +            c->failsafe_callback_eip = 
   3.187 +                d->failsafe_address;
   3.188 +
   3.189 +            if ( d != current )
   3.190 +                domain_unpause(d);
   3.191 +
   3.192 +            if ( copy_to_user(op->u.getdomaininfo.ctxt, c, sizeof(*c)) )
   3.193 +                ret = -EINVAL;
   3.194 +
   3.195 +            if ( c != NULL )
   3.196 +                kfree(c);
   3.197 +        }
   3.198 +
   3.199 +        if ( copy_to_user(u_dom0_op, op, sizeof(*op)) )     
   3.200 +            ret = -EINVAL;
   3.201 +
   3.202 +        put_domain(d);
   3.203 +    }
   3.204 +    break;
   3.205 +
   3.206 +    case DOM0_SHADOW_CONTROL:
   3.207 +    {
   3.208 +        struct domain *d; 
   3.209 +        ret = -ESRCH;
   3.210 +        d = find_domain_by_id(op->u.shadow_control.domain);
   3.211 +        if ( d != NULL )
   3.212 +        {
   3.213 +            ret = shadow_mode_control(d, &op->u.shadow_control);
   3.214 +            put_domain(d);
   3.215 +            copy_to_user(u_dom0_op, op, sizeof(*op));
   3.216 +        } 
   3.217 +    }
   3.218 +    break;
   3.219 +
   3.220 +    default:
   3.221 +        ret = -ENOSYS;
   3.222 +
   3.223 +    }
   3.224 +
   3.225 +    return ret;
   3.226 +}
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/xen/arch/x86/shadow.c	Thu Jul 01 23:45:24 2004 +0000
     4.3 @@ -0,0 +1,1058 @@
     4.4 +/* -*-  Mode:C++; c-file-style:BSD; c-basic-offset:4; tab-width:4 -*- */
     4.5 +
     4.6 +#include <xen/config.h>
     4.7 +#include <xen/types.h>
     4.8 +#include <xen/mm.h>
     4.9 +#include <xen/shadow.h>
    4.10 +#include <asm/domain_page.h>
    4.11 +#include <asm/page.h>
    4.12 +#include <xen/event.h>
    4.13 +#include <xen/trace.h>
    4.14 +
    4.15 +
    4.16 +/********
    4.17 +
    4.18 +To use these shadow page tables, guests must not rely on the ACCESSED
    4.19 +and DIRTY bits on L2 PTEs being accurate -- they will typically all be set.
    4.20 +
    4.21 +I doubt this will break anything. (If guests want to use the va_update
    4.22 +mechanism they've signed up for this anyhow...)
    4.23 +
    4.24 +There's a per-domain shadow table spin lock which works fine for SMP
    4.25 +hosts. We don't have to worry about interrupts as no shadow operations
    4.26 +happen in an interrupt context. It's probably not quite ready for SMP
    4.27 +guest operation as we have to worry about synchronisation between gpte
    4.28 +and spte updates. It's possible that this might only happen in a
    4.29 +hypercall context, in which case we'll probably have a per-domain
    4.30 +hypercall lock anyhow (at least initially).
    4.31 +
    4.32 +********/
    4.33 +
    4.34 +
    4.35 +/**
    4.36 +
    4.37 +FIXME:
    4.38 +
    4.39 +The shadow table flush command is dangerous on SMP systems as the
    4.40 +guest may be using the L2 on one CPU while the other is trying to 
    4.41 +blow the table away. 
    4.42 +
    4.43 +The current save/restore code works around this by not calling FLUSH,
    4.44 +but by calling CLEAN2, which leaves all L2s intact (this is probably
    4.45 +quicker anyhow).
    4.46 +
    4.47 +Even so, we have to be very careful. The flush code may need to cause
    4.48 +a TLB flush on another CPU. It needs to do this while holding the
    4.49 +shadow table lock. The trouble is, the guest may be in the shadow page
    4.50 +fault handler spinning waiting to grab the shadow lock. It may have
    4.51 +interrupts disabled, hence we can't use the normal flush_tlb_cpu
    4.52 +mechanism.
    4.53 +
    4.54 +For the moment, we have a grim race whereby the spinlock in the shadow
    4.55 +fault handler is actually a try lock, in a loop with a helper for the
    4.56 +tlb flush code.
    4.57 +
    4.58 +A better solution would be to take a new flush lock, then raise a
    4.59 +per-domain soft irq on the other CPU.  The softirq will switch to
    4.60 +init's PTs, then do an atomic inc of a variable to count himself in,
    4.61 +then spin on a lock.  Having noticed that the other guy has counted
    4.62 +in, flush the shadow table, then release him by dropping the lock. He
    4.63 +will then reload cr3 from mm.page_table on the way out of the softirq.
    4.64 +
    4.65 +In domain-softirq context we know that the guy holds no locks and has
    4.66 +interrupts enabled. Nothing can go wrong ;-)
    4.67 +
    4.68 +**/
    4.69 +
    4.70 +static inline void free_shadow_page( struct mm_struct *m, 
    4.71 +                                     struct pfn_info *pfn_info )
    4.72 +{
    4.73 +    unsigned long flags;
    4.74 +    unsigned long type = pfn_info->type_and_flags & PGT_type_mask;
    4.75 +
    4.76 +    m->shadow_page_count--;
    4.77 +
    4.78 +    if (type == PGT_l1_page_table)
    4.79 +        perfc_decr(shadow_l1_pages);
    4.80 +    else if (type == PGT_l2_page_table)
    4.81 +        perfc_decr(shadow_l2_pages);
    4.82 +    else printk("Free shadow weird page type pfn=%08x type=%08x\n",
    4.83 +                frame_table-pfn_info, pfn_info->type_and_flags);
    4.84 +    
    4.85 +    pfn_info->type_and_flags = 0;
    4.86 +
    4.87 +    spin_lock_irqsave(&free_list_lock, flags);
    4.88 +    list_add(&pfn_info->list, &free_list);
    4.89 +    free_pfns++;
    4.90 +    spin_unlock_irqrestore(&free_list_lock, flags);
    4.91 +}
    4.92 +
    4.93 +static void __free_shadow_table( struct mm_struct *m )
    4.94 +{
    4.95 +    int j, free=0;
    4.96 +    struct shadow_status *a,*next;
    4.97 + 
    4.98 +    // the code assumes you're not using the page tables i.e.
    4.99 +    // the domain is stopped and cr3 is something else!!
   4.100 +
   4.101 +    // walk the hash table and call free_shadow_page on all pages
   4.102 +
   4.103 +    shadow_audit(m,1);
   4.104 +
   4.105 +    for(j=0;j<shadow_ht_buckets;j++)
   4.106 +    {
   4.107 +        a = &m->shadow_ht[j];        
   4.108 +        if (a->pfn)
   4.109 +        {
   4.110 +            free_shadow_page( m, 
   4.111 +                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
   4.112 +            a->pfn = 0;
   4.113 +            a->spfn_and_flags = 0;
   4.114 +            free++;
   4.115 +        }
   4.116 +        next=a->next;
   4.117 +        a->next=NULL;
   4.118 +        a=next;
   4.119 +        while(a)
   4.120 +        { 
   4.121 +            struct shadow_status *next = a->next;
   4.122 +
   4.123 +            free_shadow_page( m, 
   4.124 +                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
   4.125 +            a->pfn = 0;
   4.126 +            a->spfn_and_flags = 0;
   4.127 +            free++;
   4.128 +            a->next = m->shadow_ht_free;           
   4.129 +            m->shadow_ht_free = a;
   4.130 +            a=next;
   4.131 +        }
   4.132 +        shadow_audit(m,0);
   4.133 +    }
   4.134 +    SH_LOG("Free shadow table. Freed= %d",free);
   4.135 +}
   4.136 +
   4.137 +
   4.138 +#define TABLE_OP_ZERO_L2 1
   4.139 +#define TABLE_OP_ZERO_L1 2
   4.140 +#define TABLE_OP_FREE_L1 3
   4.141 +
   4.142 +static inline int shadow_page_op( struct mm_struct *m, unsigned int op, 
   4.143 +								  unsigned int gpfn,
   4.144 +                                  struct pfn_info *spfn_info, int *work )
   4.145 +{
   4.146 +    unsigned int spfn = spfn_info-frame_table;
   4.147 +	int restart = 0;
   4.148 +
   4.149 +    switch( op )
   4.150 +    {
   4.151 +	case TABLE_OP_ZERO_L2:
   4.152 +	{
   4.153 +		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   4.154 +             PGT_l2_page_table )
   4.155 +		{
   4.156 +			unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   4.157 +#ifdef __i386__
   4.158 +			memset(spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e));
   4.159 +#endif
   4.160 +			unmap_domain_mem( spl1e );
   4.161 +		}
   4.162 +    }
   4.163 +	break;
   4.164 +	
   4.165 +	case TABLE_OP_ZERO_L1:
   4.166 +	{
   4.167 +		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   4.168 +             PGT_l1_page_table )
   4.169 +		{
   4.170 +			unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   4.171 +			memset( spl1e, 0, ENTRIES_PER_L1_PAGETABLE * sizeof(*spl1e) );
   4.172 +			unmap_domain_mem( spl1e );
   4.173 +		}
   4.174 +    }
   4.175 +	break;
   4.176 +
   4.177 +	case TABLE_OP_FREE_L1:
   4.178 +	{
   4.179 +		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   4.180 +             PGT_l1_page_table )
   4.181 +		{
   4.182 +			// lock is already held
   4.183 +			delete_shadow_status( m, gpfn );
   4.184 +			restart = 1; // we need to go to start of list again
   4.185 +		}
   4.186 +    }
   4.187 +
   4.188 +	break;
   4.189 +	
   4.190 +	default:
   4.191 +		BUG();
   4.192 +
   4.193 +    }
   4.194 +    return restart;
   4.195 +}
   4.196 +
   4.197 +static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
   4.198 +{
   4.199 +    int j, work=0;
   4.200 +    struct shadow_status *a, *next;
   4.201 + 
   4.202 +    // the code assumes you're not using the page tables i.e.
   4.203 +    // the domain is stopped and cr3 is something else!!
   4.204 +
   4.205 +    // walk the hash table and call shadow_page_op on each entry
   4.206 +
   4.207 +    shadow_audit(m,1);
   4.208 +
   4.209 +    for(j=0;j<shadow_ht_buckets;j++)
   4.210 +    {
   4.211 +	retry:
   4.212 +        a = &m->shadow_ht[j];     
   4.213 +		next = a->next;
   4.214 +        if (a->pfn)
   4.215 +        {
   4.216 +            if ( shadow_page_op( m, op, a->pfn,								 
   4.217 +								 &frame_table[a->spfn_and_flags & PSH_pfn_mask], 
   4.218 +								 &work ) )
   4.219 +				goto retry;
   4.220 +        }
   4.221 +        a=next;
   4.222 +        while(a)
   4.223 +        { 
   4.224 +			next = a->next;
   4.225 +            if ( shadow_page_op( m, op, a->pfn,
   4.226 +								 &frame_table[a->spfn_and_flags & PSH_pfn_mask],
   4.227 +								 &work ) )
   4.228 +				goto retry;
   4.229 +            a=next;
   4.230 +        }
   4.231 +        shadow_audit(m,0);
   4.232 +    }
   4.233 +    SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
   4.234 +}
   4.235 +
   4.236 +
   4.237 +void shadow_mode_init(void)
   4.238 +{
   4.239 +}
   4.240 +
   4.241 +int shadow_mode_enable( struct domain *p, unsigned int mode )
   4.242 +{
   4.243 +    struct mm_struct *m = &p->mm;
   4.244 +    struct shadow_status **fptr;
   4.245 +    int i;
   4.246 +
   4.247 +    m->shadow_mode = mode;
   4.248 + 
   4.249 +    // allocate hashtable
   4.250 +    m->shadow_ht = kmalloc(shadow_ht_buckets * 
   4.251 +                           sizeof(struct shadow_status));
   4.252 +    if( m->shadow_ht == NULL )
   4.253 +        goto nomem;
   4.254 +
   4.255 +    memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status));
   4.256 +
   4.257 +    // allocate space for first lot of extra nodes
   4.258 +    m->shadow_ht_extras = kmalloc(sizeof(void*) + 
   4.259 +                                  (shadow_ht_extra_size * 
   4.260 +                                   sizeof(struct shadow_status)));
   4.261 +    if( m->shadow_ht_extras == NULL )
   4.262 +        goto nomem;
   4.263 +
   4.264 +    memset( m->shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size * 
   4.265 +                                                     sizeof(struct shadow_status)) );
   4.266 +
   4.267 +    m->shadow_extras_count++;
   4.268 + 
   4.269 +    // add extras to free list
   4.270 +    fptr = &m->shadow_ht_free;
   4.271 +    for ( i=0; i<shadow_ht_extra_size; i++ )
   4.272 +    {
   4.273 +        *fptr = &m->shadow_ht_extras[i];
   4.274 +        fptr = &(m->shadow_ht_extras[i].next);
   4.275 +    }
   4.276 +    *fptr = NULL;
   4.277 +    *((struct shadow_status ** ) 
   4.278 +      &m->shadow_ht_extras[shadow_ht_extra_size]) = NULL;
   4.279 +
   4.280 +    if ( mode == SHM_logdirty )
   4.281 +    {
   4.282 +        m->shadow_dirty_bitmap_size = (p->max_pages+63)&(~63);
   4.283 +        m->shadow_dirty_bitmap = 
   4.284 +            kmalloc( m->shadow_dirty_bitmap_size/8);
   4.285 +        if( m->shadow_dirty_bitmap == NULL )
   4.286 +        {
   4.287 +            m->shadow_dirty_bitmap_size = 0;
   4.288 +            goto nomem;
   4.289 +        }
   4.290 +        memset(m->shadow_dirty_bitmap,0,m->shadow_dirty_bitmap_size/8);
   4.291 +    }
   4.292 +
   4.293 +    // call shadow_mk_pagetable
   4.294 +    __shadow_mk_pagetable( m );
   4.295 +    return 0;
   4.296 +
   4.297 +nomem:
   4.298 +    return -ENOMEM;
   4.299 +}
   4.300 +
   4.301 +void shadow_mode_disable( struct domain *p )
   4.302 +{
   4.303 +    struct mm_struct *m = &p->mm;
   4.304 +    struct shadow_status *next;
   4.305 +
   4.306 +    __free_shadow_table( m );
   4.307 +    m->shadow_mode = 0;
   4.308 +
   4.309 +    SH_LOG("freed tables count=%d l1=%d l2=%d",
   4.310 +           m->shadow_page_count, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
   4.311 +
   4.312 +    next = m->shadow_ht_extras;
   4.313 +    while( next )
   4.314 +    {
   4.315 +        struct shadow_status * this = next;
   4.316 +        m->shadow_extras_count--;
   4.317 +        next = *((struct shadow_status **)(&next[shadow_ht_extra_size]));
   4.318 +        kfree( this );
   4.319 +    }
   4.320 +
   4.321 +    SH_LOG("freed extras, now %d", m->shadow_extras_count);
   4.322 +
   4.323 +    if( m->shadow_dirty_bitmap  )
   4.324 +    {
   4.325 +        kfree( m->shadow_dirty_bitmap );
   4.326 +        m->shadow_dirty_bitmap = 0;
   4.327 +        m->shadow_dirty_bitmap_size = 0;
   4.328 +    }
   4.329 +
   4.330 +    // free the hashtable itself
   4.331 +    kfree( &m->shadow_ht[0] );
   4.332 +}
   4.333 +
   4.334 +static int shadow_mode_table_op(struct domain *d, 
   4.335 +							    dom0_shadow_control_t *sc)
   4.336 +{
   4.337 +    unsigned int op = sc->op;
   4.338 +    struct mm_struct *m = &d->mm;
   4.339 +    int rc = 0;
   4.340 +
   4.341 +    // since Dom0 did the hypercall, we should be running with its page
   4.342 +    // tables right now. Calling flush on yourself would be really
   4.343 +    // stupid.
   4.344 +
   4.345 +    ASSERT(spin_is_locked(&d->mm.shadow_lock));
   4.346 +
   4.347 +    if ( m == &current->mm )
   4.348 +    {
   4.349 +        printk("Don't try and flush your own page tables!\n");
   4.350 +        return -EINVAL;
   4.351 +    }
   4.352 +   
   4.353 +    SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
   4.354 +
   4.355 +    shadow_audit(m,1);
   4.356 +
   4.357 +    switch(op)
   4.358 +    {
   4.359 +    case DOM0_SHADOW_CONTROL_OP_FLUSH:
   4.360 +        // XXX THIS IS VERY DANGEROUS : MUST ENSURE THE PTs ARE NOT IN USE ON
   4.361 +		// ANOTHER CPU -- fix when we get sched sync pause.
   4.362 +        __free_shadow_table( m );  
   4.363 +        break;
   4.364 +   
   4.365 +    case DOM0_SHADOW_CONTROL_OP_CLEAN:   // zero all non-hypervisor
   4.366 +	{
   4.367 +		__scan_shadow_table( m, TABLE_OP_ZERO_L2 );
   4.368 +		__scan_shadow_table( m, TABLE_OP_ZERO_L1 );
   4.369 +
   4.370 +		goto send_bitmap;
   4.371 +	}
   4.372 +		
   4.373 +
   4.374 +    case DOM0_SHADOW_CONTROL_OP_CLEAN2:  // zero all L2, free L1s
   4.375 +    {
   4.376 +		int i,j,zero=1;
   4.377 +		
   4.378 +		__scan_shadow_table( m, TABLE_OP_ZERO_L2 );
   4.379 +		__scan_shadow_table( m, TABLE_OP_FREE_L1 );
   4.380 +		
   4.381 +	send_bitmap:
   4.382 +		sc->stats.fault_count       = d->mm.shadow_fault_count;
   4.383 +		sc->stats.dirty_count       = d->mm.shadow_dirty_count;
   4.384 +		sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
   4.385 +		sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
   4.386 +
   4.387 +		d->mm.shadow_fault_count       = 0;
   4.388 +		d->mm.shadow_dirty_count       = 0;
   4.389 +		d->mm.shadow_dirty_net_count   = 0;
   4.390 +		d->mm.shadow_dirty_block_count = 0;
   4.391 +	
   4.392 +		sc->pages = d->tot_pages;
   4.393 +
   4.394 +		if( d->tot_pages > sc->pages || 
   4.395 +			!sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
   4.396 +		{
   4.397 +			rc = -EINVAL;
   4.398 +			goto out;
   4.399 +		}
   4.400 +
   4.401 +	
   4.402 +#define chunk (8*1024) // do this in 1KB chunks for L1 cache
   4.403 +	
   4.404 +		for(i=0;i<d->tot_pages;i+=chunk)
   4.405 +		{
   4.406 +			int bytes = ((  ((d->tot_pages-i) > (chunk))?
   4.407 +							(chunk):(d->tot_pages-i) ) + 7) / 8;
   4.408 +	    
   4.409 +			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
   4.410 +						  d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   4.411 +						  bytes );
   4.412 +	    
   4.413 +			for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
   4.414 +			{
   4.415 +				if( d->mm.shadow_dirty_bitmap[j] != 0 )
   4.416 +					zero = 0;
   4.417 +			}
   4.418 +
   4.419 +			memset( d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   4.420 +					0, bytes);
   4.421 +		}
   4.422 +
   4.423 +        /* Might as well stop the domain as an optimization. */
   4.424 +		if ( zero )
   4.425 +            domain_pause_by_systemcontroller(d);
   4.426 +
   4.427 +		break;
   4.428 +    }
   4.429 +
   4.430 +    case DOM0_SHADOW_CONTROL_OP_PEEK:
   4.431 +    {
   4.432 +		int i;
   4.433 +
   4.434 +		sc->stats.fault_count       = d->mm.shadow_fault_count;
   4.435 +		sc->stats.dirty_count       = d->mm.shadow_dirty_count;
   4.436 +		sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
   4.437 +		sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
   4.438 +	
   4.439 +		if( d->tot_pages > sc->pages || 
   4.440 +			!sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
   4.441 +		{
   4.442 +			rc = -EINVAL;
   4.443 +			goto out;
   4.444 +		}
   4.445 +	
   4.446 +		sc->pages = d->tot_pages;
   4.447 +	
   4.448 +#define chunk (8*1024) // do this in 1KB chunks for L1 cache
   4.449 +	
   4.450 +		for(i=0;i<d->tot_pages;i+=chunk)
   4.451 +		{
   4.452 +			int bytes = ((  ((d->tot_pages-i) > (chunk))?
   4.453 +							(chunk):(d->tot_pages-i) ) + 7) / 8;
   4.454 +	    
   4.455 +			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
   4.456 +						  d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   4.457 +						  bytes );	    
   4.458 +		}
   4.459 +
   4.460 +		break;
   4.461 +    }
   4.462 +
   4.463 +	default:
   4.464 +		BUG();
   4.465 +
   4.466 +    }
   4.467 +
   4.468 +
   4.469 +out:
   4.470 +
   4.471 +    SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
   4.472 +
   4.473 +    shadow_audit(m,1);
   4.474 +
   4.475 +    // call shadow_mk_pagetable
   4.476 +    __shadow_mk_pagetable( m );
   4.477 +
   4.478 +    return rc;
   4.479 +}
   4.480 +
   4.481 +int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc )
   4.482 +{
   4.483 +    unsigned int cmd = sc->op;
   4.484 +    int rc = 0;
   4.485 +
   4.486 +    spin_lock(&p->mm.shadow_lock);
   4.487 +
   4.488 +    if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
   4.489 +    {
   4.490 +        shadow_mode_disable(p);
   4.491 +    }
   4.492 +    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
   4.493 +    {
   4.494 +        if(p->mm.shadow_mode) shadow_mode_disable(p);
   4.495 +        shadow_mode_enable(p, SHM_test);
   4.496 +    } 
   4.497 +    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
   4.498 +    {
   4.499 +        if(p->mm.shadow_mode) shadow_mode_disable(p);
   4.500 +        shadow_mode_enable(p, SHM_logdirty);
   4.501 +    } 
   4.502 +    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN2 )
   4.503 +    {
   4.504 +        rc = shadow_mode_table_op(p, sc);
   4.505 +    }
   4.506 +    else
   4.507 +    {
   4.508 +        rc = -EINVAL;
   4.509 +    }
   4.510 +
   4.511 +	flush_tlb_cpu(p->processor);
   4.512 +   
   4.513 +    spin_unlock(&p->mm.shadow_lock);
   4.514 +
   4.515 +    return rc;
   4.516 +}
   4.517 +
   4.518 +
   4.519 +
   4.520 +static inline struct pfn_info *alloc_shadow_page( struct mm_struct *m )
   4.521 +{
   4.522 +    m->shadow_page_count++;
   4.523 +
   4.524 +    return alloc_domain_page( NULL );
   4.525 +}
   4.526 +
   4.527 +
   4.528 +void unshadow_table( unsigned long gpfn, unsigned int type )
   4.529 +{
   4.530 +    unsigned long spfn;
   4.531 +
   4.532 +    SH_VLOG("unshadow_table type=%08x gpfn=%08lx",
   4.533 +            type,
   4.534 +            gpfn );
   4.535 +
   4.536 +    perfc_incrc(unshadow_table_count);
   4.537 +
   4.538 +    // this function is the same for both l1 and l2 tables
   4.539 +
   4.540 +    // even in the SMP guest case, there won't be a race here as
   4.541 +    // this CPU was the one that cmpxchg'ed the page to invalid
   4.542 +
   4.543 +    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
   4.544 +
   4.545 +    delete_shadow_status(&current->mm, gpfn);
   4.546 +
   4.547 +    free_shadow_page( &current->mm, &frame_table[spfn] );
   4.548 +
   4.549 +}
   4.550 +
   4.551 +
   4.552 +unsigned long shadow_l2_table( 
   4.553 +    struct mm_struct *m, unsigned long gpfn )
   4.554 +{
   4.555 +    struct pfn_info *spfn_info;
   4.556 +    unsigned long spfn;
   4.557 +    l2_pgentry_t *spl2e, *gpl2e;
   4.558 +    int i;
   4.559 +
   4.560 +    SH_VVLOG("shadow_l2_table( %08lx )",gpfn);
   4.561 +
   4.562 +    perfc_incrc(shadow_l2_table_count);
   4.563 +
   4.564 +    // XXX in future, worry about racing in SMP guests 
   4.565 +    //      -- use cmpxchg with PSH_pending flag to show progress (and spin)
   4.566 +
   4.567 +    spfn_info = alloc_shadow_page(m);
   4.568 +
   4.569 +    ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache
   4.570 +
   4.571 +    spfn_info->type_and_flags = PGT_l2_page_table;
   4.572 +    perfc_incr(shadow_l2_pages);
   4.573 +
   4.574 +    spfn = (unsigned long) (spfn_info - frame_table);
   4.575 +
   4.576 +    // mark pfn as being shadowed, update field to point at shadow
   4.577 +    set_shadow_status(m, gpfn, spfn | PSH_shadowed);
   4.578 + 
   4.579 +    // we need to do this before the linear map is set up
   4.580 +    spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
   4.581 +
   4.582 +#ifdef __i386__
   4.583 +    // get hypervisor and 2x linear PT mappings installed 
   4.584 +    memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   4.585 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   4.586 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   4.587 +    spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   4.588 +        mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
   4.589 +    spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   4.590 +        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
   4.591 +    spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
   4.592 +        mk_l2_pgentry(__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | 
   4.593 +                      __PAGE_HYPERVISOR);
   4.594 +#endif
   4.595 +
   4.596 +    // can't use the linear map as we may not be in the right PT
   4.597 +    gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
   4.598 +
   4.599 +    // proactively create entries for pages that are already shadowed
   4.600 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   4.601 +    {
   4.602 +        unsigned long spte = 0;
   4.603 +
   4.604 +#if 0  // Turns out this doesn't really help
   4.605 +        unsigned long gpte;
   4.606 +
   4.607 +        gpte = l2_pgentry_val(gpl2e[i]);
   4.608 +
   4.609 +        if (gpte & _PAGE_PRESENT)
   4.610 +        {
   4.611 +            unsigned long s_sh = 
   4.612 +                __shadow_status(p, gpte>>PAGE_SHIFT);
   4.613 +
   4.614 +            l2pde_general( m, &gpte, &spte, s_sh );
   4.615 +
   4.616 +        }
   4.617 +#endif
   4.618 +
   4.619 +        spl2e[i] = mk_l2_pgentry( spte );
   4.620 +
   4.621 +    }
   4.622 +
   4.623 +    // it's arguable we should 'preemptively shadow' a few active L1 pages
   4.624 +    // to avoid taking a string of faults when 'jacking' a running domain
   4.625 +
   4.626 +    unmap_domain_mem( gpl2e );
   4.627 +    unmap_domain_mem( spl2e );
   4.628 +
   4.629 +    SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn);
   4.630 +
   4.631 +    return spfn;
   4.632 +}
   4.633 +
   4.634 +
   4.635 +int shadow_fault( unsigned long va, long error_code )
   4.636 +{
   4.637 +    unsigned long gpte, spte;
   4.638 +    struct mm_struct *m = &current->mm;
   4.639 +
   4.640 +    SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code );
   4.641 +
   4.642 +    check_pagetable( current, current->mm.pagetable, "pre-sf" );
   4.643 +
   4.644 +    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
   4.645 +    {
   4.646 +        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
   4.647 +        return 0;  // propagate to guest
   4.648 +    }
   4.649 +
   4.650 +    if ( ! (gpte & _PAGE_PRESENT) )
   4.651 +    {
   4.652 +        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
   4.653 +        return 0;  // we're not going to be able to help
   4.654 +    }
   4.655 +
   4.656 +    if ( (error_code & 2)  && ! (gpte & _PAGE_RW) )
   4.657 +    {
   4.658 +        // write fault on RO page
   4.659 +        return 0;
   4.660 +    }
   4.661 +
   4.662 +    // take the lock and reread gpte
   4.663 +
   4.664 +    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
   4.665 +	{
   4.666 +		extern volatile unsigned long flush_cpumask;
   4.667 +		if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
   4.668 +			local_flush_tlb();
   4.669 +		rep_nop();
   4.670 +	}
   4.671 +	
   4.672 +	ASSERT(spin_is_locked(&current->mm.shadow_lock));
   4.673 +	
   4.674 +    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
   4.675 +    {
   4.676 +        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
   4.677 +        spin_unlock(&m->shadow_lock);
   4.678 +        return 0;  // propagate to guest
   4.679 +    }
   4.680 +
   4.681 +    if ( unlikely(!(gpte & _PAGE_PRESENT)) )
   4.682 +    {
   4.683 +        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
   4.684 +        spin_unlock(&m->shadow_lock);
   4.685 +        return 0;  // we're not going to be able to help
   4.686 +    }
   4.687 +
   4.688 +    if ( error_code & 2  )  
   4.689 +    {  // write fault
   4.690 +        if ( likely(gpte & _PAGE_RW) )
   4.691 +        {
   4.692 +            l1pte_write_fault( m, &gpte, &spte );
   4.693 +        }
   4.694 +        else
   4.695 +        {   // write fault on RO page
   4.696 +            SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte );
   4.697 +            spin_unlock(&m->shadow_lock);
   4.698 +            return 0; // propagate to guest
   4.699 +            // not clear whether we should set accessed bit here...
   4.700 +        }
   4.701 +    }
   4.702 +    else
   4.703 +    {
   4.704 +        l1pte_read_fault( m, &gpte, &spte );
   4.705 +    }
   4.706 +
   4.707 +    SH_VVLOG("plan: gpte=%08lx  spte=%08lx", gpte, spte );
   4.708 +
   4.709 +    // write back updated gpte
   4.710 +    // XXX watch out for read-only L2 entries! (not used in Linux)
   4.711 +    if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
   4.712 +        BUG();  // fixme!
   4.713 +
   4.714 +    if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) )
   4.715 +    { 
   4.716 +        // failed:
   4.717 +        //  the L1 may not be shadowed, or the L2 entry may be insufficient
   4.718 +
   4.719 +        unsigned long gpde, spde, gl1pfn, sl1pfn;
   4.720 +
   4.721 +        SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx  spte=%08lx",gpte,spte );
   4.722 +
   4.723 +        gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]);
   4.724 +
   4.725 +        gl1pfn = gpde>>PAGE_SHIFT;
   4.726 +
   4.727 +        
   4.728 +        if ( ! (sl1pfn=__shadow_status(&current->mm, gl1pfn) ) )
   4.729 +        {
   4.730 +            // this L1 is NOT already shadowed so we need to shadow it
   4.731 +            struct pfn_info *sl1pfn_info;
   4.732 +            unsigned long *gpl1e, *spl1e;
   4.733 +            int i;
   4.734 +            sl1pfn_info = alloc_shadow_page( &current->mm ); 
   4.735 +            sl1pfn_info->type_and_flags = PGT_l1_page_table;
   4.736 +			
   4.737 +            sl1pfn = sl1pfn_info - frame_table;
   4.738 +
   4.739 +            SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn);
   4.740 +            perfc_incrc(shadow_l1_table_count);
   4.741 +            perfc_incr(shadow_l1_pages);
   4.742 +
   4.743 +            set_shadow_status(&current->mm, gl1pfn, PSH_shadowed | sl1pfn);
   4.744 +
   4.745 +            l2pde_general( m, &gpde, &spde, sl1pfn );
   4.746 +
   4.747 +            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
   4.748 +            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] =  mk_l2_pgentry(spde);
   4.749 +
   4.750 +            gpl1e = (unsigned long *) &(linear_pg_table[
   4.751 +                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]);
   4.752 +
   4.753 +            spl1e = (unsigned long *) &shadow_linear_pg_table[
   4.754 +                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ];
   4.755 +
   4.756 +
   4.757 +            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
   4.758 +            {
   4.759 +                l1pte_no_fault( m, &gpl1e[i], &spl1e[i] );
   4.760 +            }
   4.761 +
   4.762 +
   4.763 +        }
   4.764 +        else
   4.765 +        {
   4.766 +            // this L1 was shadowed (by another PT) but we didn't have an L2
   4.767 +            // entry for it
   4.768 +
   4.769 +            SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn);
   4.770 +
   4.771 +            l2pde_general( m, &gpde, &spde, sl1pfn );
   4.772 +
   4.773 +            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
   4.774 +            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
   4.775 +   
   4.776 +        }              
   4.777 +
   4.778 +        shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte);
   4.779 +        // (we need to do the above even if we've just made the shadow L1)
   4.780 +
   4.781 +    } // end of fixup writing the shadow L1 directly failed
   4.782 +     
   4.783 +    perfc_incrc(shadow_fixup_count);
   4.784 +
   4.785 +	m->shadow_fault_count++;
   4.786 +
   4.787 +    check_pagetable( current, current->mm.pagetable, "post-sf" );
   4.788 +
   4.789 +    spin_unlock(&m->shadow_lock);
   4.790 +
   4.791 +    return 1; // let's try the faulting instruction again...
   4.792 +
   4.793 +}
   4.794 +
   4.795 +
   4.796 +void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
   4.797 +                                 unsigned long *prev_spfn_ptr,
   4.798 +                                 l1_pgentry_t **prev_spl1e_ptr )
   4.799 +{
   4.800 +    unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr;    
   4.801 +    l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr;
   4.802 +
   4.803 +
   4.804 +    SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%p\n",
   4.805 +             pa,gpte,prev_spfn, prev_spl1e);
   4.806 +
   4.807 +    // to get here, we know the l1 page *must* be shadowed
   4.808 +
   4.809 +    gpfn = pa >> PAGE_SHIFT;
   4.810 +    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
   4.811 +
   4.812 +    if ( spfn == prev_spfn )
   4.813 +    {
   4.814 +        spl1e = prev_spl1e;
   4.815 +    }
   4.816 +    else
   4.817 +    {
   4.818 +        if( prev_spl1e ) unmap_domain_mem( prev_spl1e );
   4.819 +        spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   4.820 +        *prev_spfn_ptr  = spfn;
   4.821 +        *prev_spl1e_ptr = spl1e;
   4.822 +    }
   4.823 +
   4.824 +    // XXX we assume only pagetables can be shadowed; 
   4.825 +    // this will have to change to allow arbitrary CoW etc.
   4.826 +
   4.827 +    l1pte_no_fault( &current->mm, &gpte, &spte );
   4.828 +
   4.829 +
   4.830 +    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = mk_l1_pgentry( spte );
   4.831 +
   4.832 +}
   4.833 +
   4.834 +void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte )
   4.835 +{
   4.836 +    unsigned long gpfn, spfn, spte;
   4.837 +    l2_pgentry_t * sp2le;
   4.838 +    unsigned long s_sh=0;
   4.839 +
   4.840 +    SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte);
   4.841 +
   4.842 +    // to get here, we know the l2 page has a shadow
   4.843 +
   4.844 +    gpfn = pa >> PAGE_SHIFT;
   4.845 +    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
   4.846 +
   4.847 +
   4.848 +    spte = 0;
   4.849 +
   4.850 +    if( gpte & _PAGE_PRESENT )
   4.851 +        s_sh = __shadow_status(&current->mm, gpte >> PAGE_SHIFT);
   4.852 +
   4.853 +    sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   4.854 +    // no real need for a cache here
   4.855 +
   4.856 +    l2pde_general( &current->mm, &gpte, &spte, s_sh );
   4.857 +
   4.858 +    // XXXX Should mark guest pte as DIRTY and ACCESSED too!!!!!
   4.859 +
   4.860 +    sp2le[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t) ] = 
   4.861 +        mk_l2_pgentry( spte );
   4.862 +
   4.863 +    unmap_domain_mem( (void *) sp2le );
   4.864 +}
   4.865 +
   4.866 +
   4.867 +#if SHADOW_DEBUG
   4.868 +
   4.869 +static int sh_l2_present;
   4.870 +static int sh_l1_present;
   4.871 +char * sh_check_name;
   4.872 +
   4.873 +#define FAIL(_f, _a...)                             \
   4.874 +{printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n",  sh_check_name, level, i, ## _a , gpte, spte ); BUG();}
   4.875 +
   4.876 +static int check_pte( struct mm_struct *m, 
   4.877 +                      unsigned long gpte, unsigned long spte, int level, int i )
   4.878 +{
   4.879 +    unsigned long mask, gpfn, spfn;
   4.880 +
   4.881 +    if ( spte == 0 || spte == 0xdeadface || spte == 0x00000E00)
   4.882 +        return 1;  // always safe
   4.883 +
   4.884 +    if ( !(spte & _PAGE_PRESENT) )
   4.885 +        FAIL("Non zero not present spte");
   4.886 +
   4.887 +    if( level == 2 ) sh_l2_present++;
   4.888 +    if( level == 1 ) sh_l1_present++;
   4.889 +
   4.890 +    if ( !(gpte & _PAGE_PRESENT) )
   4.891 +        FAIL("Guest not present yet shadow is");
   4.892 +
   4.893 +    mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000);
   4.894 +
   4.895 +    if ( (spte & mask) != (gpte & mask ) )
   4.896 +        FAIL("Corrupt?");
   4.897 +
   4.898 +    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
   4.899 +        FAIL("Dirty coherence");
   4.900 +
   4.901 +    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
   4.902 +        FAIL("Accessed coherence");
   4.903 +
   4.904 +    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
   4.905 +        FAIL("RW coherence");
   4.906 +
   4.907 +    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY) ))
   4.908 +        FAIL("RW2 coherence");
   4.909 + 
   4.910 +    spfn = spte>>PAGE_SHIFT;
   4.911 +    gpfn = gpte>>PAGE_SHIFT;
   4.912 +
   4.913 +    if ( gpfn == spfn )
   4.914 +    {
   4.915 +        if ( level > 1 )
   4.916 +            FAIL("Linear map ???");    // XXX this will fail on BSD
   4.917 +
   4.918 +        return 1;
   4.919 +    }
   4.920 +    else
   4.921 +    {
   4.922 +        if ( level < 2 )
   4.923 +            FAIL("Shadow in L1 entry?");
   4.924 +
   4.925 +        if ( __shadow_status(p, gpfn) != (PSH_shadowed | spfn) )
   4.926 +            FAIL("spfn problem g.sf=%08lx", 
   4.927 +                 __shadow_status(p, gpfn) );
   4.928 +    }
   4.929 +
   4.930 +    return 1;
   4.931 +}
   4.932 +
   4.933 +
   4.934 +static int check_l1_table( struct mm_struct *m, unsigned long va, 
   4.935 +                           unsigned long g2, unsigned long s2 )
   4.936 +{
   4.937 +    int j;
   4.938 +    unsigned long *gpl1e, *spl1e;
   4.939 +
   4.940 +    //gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]);
   4.941 +    //spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]);
   4.942 +
   4.943 +    gpl1e = map_domain_mem( g2<<PAGE_SHIFT );
   4.944 +    spl1e = map_domain_mem( s2<<PAGE_SHIFT );
   4.945 +
   4.946 +    for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
   4.947 +    {
   4.948 +        unsigned long gpte = gpl1e[j];
   4.949 +        unsigned long spte = spl1e[j];
   4.950 +  
   4.951 +        check_pte( p, gpte, spte, 1, j );
   4.952 +    }
   4.953 + 
   4.954 +    unmap_domain_mem( spl1e );
   4.955 +    unmap_domain_mem( gpl1e );
   4.956 +
   4.957 +    return 1;
   4.958 +}
   4.959 +
   4.960 +#define FAILPT(_f, _a...)                             \
   4.961 +{printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); BUG();}
   4.962 +
   4.963 +int check_pagetable( struct mm_struct *m, pagetable_t pt, char *s )
   4.964 +{
   4.965 +    unsigned long gptbase = pagetable_val(pt);
   4.966 +    unsigned long gpfn, spfn;
   4.967 +    int i;
   4.968 +    l2_pgentry_t *gpl2e, *spl2e;
   4.969 +
   4.970 +    sh_check_name = s;
   4.971 +
   4.972 +    SH_VVLOG("%s-PT Audit",s);
   4.973 +
   4.974 +    sh_l2_present = sh_l1_present = 0;
   4.975 +
   4.976 +    gpfn =  gptbase >> PAGE_SHIFT;
   4.977 +
   4.978 +    if ( ! (__shadow_status(p, gpfn) & PSH_shadowed) )
   4.979 +    {
   4.980 +        printk("%s-PT %08lx not shadowed\n", s, gptbase);
   4.981 +
   4.982 +        if( __shadow_status(p, gpfn) != 0 ) BUG();
   4.983 +
   4.984 +        return 0;
   4.985 +    }
   4.986 + 
   4.987 +    spfn = __shadow_status(p, gpfn) & PSH_pfn_mask;
   4.988 +
   4.989 +    if ( ! __shadow_status(p, gpfn) == (PSH_shadowed | spfn) )
   4.990 +        FAILPT("ptbase shadow inconsistent1");
   4.991 +
   4.992 +    gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
   4.993 +    spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   4.994 +
   4.995 +    //ipl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   4.996 +
   4.997 +
   4.998 +    if ( memcmp( &spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   4.999 +                 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  4.1000 +                 ((SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT))-DOMAIN_ENTRIES_PER_L2_PAGETABLE)
  4.1001 +                 * sizeof(l2_pgentry_t)) )
  4.1002 +    {
  4.1003 +        printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
  4.1004 +        for (i=DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
  4.1005 +             i<(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT));
  4.1006 +             i++ )
  4.1007 +            printk("+++ (%d) %08lx %08lx\n",i,
  4.1008 +                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]) );
  4.1009 +        FAILPT("hypervisor entries inconsistent");
  4.1010 +    }
  4.1011 +
  4.1012 +    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  4.1013 +          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
  4.1014 +        FAILPT("hypervisor linear map inconsistent");
  4.1015 +
  4.1016 +    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  4.1017 +          ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
  4.1018 +        FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx",
  4.1019 +               l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]),
  4.1020 +               (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR
  4.1021 +            );
  4.1022 +
  4.1023 +    if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  4.1024 +          ((__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) )
  4.1025 +        FAILPT("hypervisor per-domain map inconsistent");
  4.1026 +
  4.1027 +
  4.1028 +    // check the whole L2
  4.1029 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  4.1030 +    {
  4.1031 +        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
  4.1032 +        unsigned long spte = l2_pgentry_val(spl2e[i]);
  4.1033 +
  4.1034 +        check_pte( p, gpte, spte, 2, i );
  4.1035 +    }
  4.1036 +
  4.1037 +
  4.1038 +    // go back and recurse
  4.1039 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  4.1040 +    {
  4.1041 +        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
  4.1042 +        unsigned long spte = l2_pgentry_val(spl2e[i]);
  4.1043 +
  4.1044 +        if ( spte )    
  4.1045 +            check_l1_table( p,
  4.1046 +                            i<<L2_PAGETABLE_SHIFT,
  4.1047 +                            gpte>>PAGE_SHIFT, spte>>PAGE_SHIFT );
  4.1048 +
  4.1049 +    }
  4.1050 +
  4.1051 +    unmap_domain_mem( spl2e );
  4.1052 +    unmap_domain_mem( gpl2e );
  4.1053 +
  4.1054 +    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n",
  4.1055 +             sh_l2_present, sh_l1_present );
  4.1056 + 
  4.1057 +    return 1;
  4.1058 +}
  4.1059 +
  4.1060 +
  4.1061 +#endif
     5.1 --- a/xen/common/debug.c	Thu Jul 01 23:32:40 2004 +0000
     5.2 +++ b/xen/common/debug.c	Thu Jul 01 23:45:24 2004 +0000
     5.3 @@ -69,7 +69,7 @@ void pdb_do_debug (dom0_op_t *op)
     5.4  	    struct domain *d;
     5.5  
     5.6  	    d = find_domain_by_id(op->u.debug.domain);
     5.7 -	    if ( d->mm.shadow_mode )
     5.8 +	    if ( shadow_mode(d) )
     5.9  	      cr3 = pagetable_val(d->mm.shadow_table);
    5.10  	    else
    5.11  	      cr3 = pagetable_val(d->mm.pagetable);
     6.1 --- a/xen/common/dom0_ops.c	Thu Jul 01 23:32:40 2004 +0000
     6.2 +++ b/xen/common/dom0_ops.c	Thu Jul 01 23:45:24 2004 +0000
     6.3 @@ -14,7 +14,6 @@
     6.4  #include <xen/sched.h>
     6.5  #include <xen/event.h>
     6.6  #include <asm/domain_page.h>
     6.7 -#include <asm/msr.h>
     6.8  #include <asm/pdb.h>
     6.9  #include <xen/trace.h>
    6.10  #include <xen/console.h>
    6.11 @@ -25,23 +24,7 @@
    6.12  #define TRC_DOM0OP_LEAVE_BASE  0x00030000
    6.13  
    6.14  extern unsigned int alloc_new_dom_mem(struct domain *, unsigned int);
    6.15 -
    6.16 -static int msr_cpu_mask;
    6.17 -static unsigned long msr_addr;
    6.18 -static unsigned long msr_lo;
    6.19 -static unsigned long msr_hi;
    6.20 -
    6.21 -static void write_msr_for(void *unused)
    6.22 -{
    6.23 -    if (((1 << current->processor) & msr_cpu_mask))
    6.24 -        wrmsr(msr_addr, msr_lo, msr_hi);
    6.25 -}
    6.26 -
    6.27 -static void read_msr_for(void *unused)
    6.28 -{
    6.29 -    if (((1 << current->processor) & msr_cpu_mask))
    6.30 -        rdmsr(msr_addr, msr_lo, msr_hi);
    6.31 -}
    6.32 +extern long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op);
    6.33  
    6.34  long do_dom0_op(dom0_op_t *u_dom0_op)
    6.35  {
    6.36 @@ -271,127 +254,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
    6.37      }
    6.38      break;
    6.39  
    6.40 -    case DOM0_GETDOMAININFO:
    6.41 -    { 
    6.42 -        full_execution_context_t *c;
    6.43 -        struct domain            *d;
    6.44 -        unsigned long             flags;
    6.45 -        int                       i;
    6.46 -
    6.47 -        read_lock_irqsave(&tasklist_lock, flags);
    6.48 -
    6.49 -        for_each_domain ( d )
    6.50 -        {
    6.51 -            if ( d->domain >= op->u.getdomaininfo.domain )
    6.52 -                break;
    6.53 -        }
    6.54 -
    6.55 -        if ( (d == NULL) || !get_domain(d) )
    6.56 -        {
    6.57 -            read_unlock_irqrestore(&tasklist_lock, flags);
    6.58 -            ret = -ESRCH;
    6.59 -            break;
    6.60 -        }
    6.61 -
    6.62 -        read_unlock_irqrestore(&tasklist_lock, flags);
    6.63 -
    6.64 -        op->u.getdomaininfo.domain = d->domain;
    6.65 -        strcpy(op->u.getdomaininfo.name, d->name);
    6.66 -        
    6.67 -        op->u.getdomaininfo.flags =
    6.68 -            (test_bit(DF_DYING,     &d->flags) ? DOMFLAGS_DYING    : 0) |
    6.69 -            (test_bit(DF_CRASHED,   &d->flags) ? DOMFLAGS_CRASHED  : 0) |
    6.70 -            (test_bit(DF_SHUTDOWN,  &d->flags) ? DOMFLAGS_SHUTDOWN : 0) |
    6.71 -            (test_bit(DF_CTRLPAUSE, &d->flags) ? DOMFLAGS_PAUSED   : 0) |
    6.72 -            (test_bit(DF_BLOCKED,   &d->flags) ? DOMFLAGS_BLOCKED  : 0) |
    6.73 -            (test_bit(DF_RUNNING,   &d->flags) ? DOMFLAGS_RUNNING  : 0);
    6.74 -
    6.75 -        op->u.getdomaininfo.flags |= d->processor << DOMFLAGS_CPUSHIFT;
    6.76 -        op->u.getdomaininfo.flags |= 
    6.77 -            d->shutdown_code << DOMFLAGS_SHUTDOWNSHIFT;
    6.78 -
    6.79 -        op->u.getdomaininfo.tot_pages   = d->tot_pages;
    6.80 -        op->u.getdomaininfo.max_pages   = d->max_pages;
    6.81 -        op->u.getdomaininfo.cpu_time    = d->cpu_time;
    6.82 -        op->u.getdomaininfo.shared_info_frame = 
    6.83 -            __pa(d->shared_info) >> PAGE_SHIFT;
    6.84 -
    6.85 -        if ( op->u.getdomaininfo.ctxt != NULL )
    6.86 -        {
    6.87 -            if ( (c = kmalloc(sizeof(*c))) == NULL )
    6.88 -            {
    6.89 -                ret = -ENOMEM;
    6.90 -                put_domain(d);
    6.91 -                break;
    6.92 -            }
    6.93 -
    6.94 -            if ( d != current )
    6.95 -                domain_pause(d);
    6.96 -
    6.97 -            c->flags = 0;
    6.98 -            memcpy(&c->cpu_ctxt, 
    6.99 -                   &d->shared_info->execution_context,
   6.100 -                   sizeof(d->shared_info->execution_context));
   6.101 -            if ( test_bit(DF_DONEFPUINIT, &d->flags) )
   6.102 -                c->flags |= ECF_I387_VALID;
   6.103 -            memcpy(&c->fpu_ctxt,
   6.104 -                   &d->thread.i387,
   6.105 -                   sizeof(d->thread.i387));
   6.106 -            memcpy(&c->trap_ctxt,
   6.107 -                   d->thread.traps,
   6.108 -                   sizeof(d->thread.traps));
   6.109 -#ifdef ARCH_HAS_FAST_TRAP
   6.110 -            if ( (d->thread.fast_trap_desc.a == 0) &&
   6.111 -                 (d->thread.fast_trap_desc.b == 0) )
   6.112 -                c->fast_trap_idx = 0;
   6.113 -            else
   6.114 -                c->fast_trap_idx = 
   6.115 -                    d->thread.fast_trap_idx;
   6.116 -#endif
   6.117 -            c->ldt_base = d->mm.ldt_base;
   6.118 -            c->ldt_ents = d->mm.ldt_ents;
   6.119 -            c->gdt_ents = 0;
   6.120 -            if ( GET_GDT_ADDRESS(d) == GDT_VIRT_START )
   6.121 -            {
   6.122 -                for ( i = 0; i < 16; i++ )
   6.123 -                    c->gdt_frames[i] = 
   6.124 -                        l1_pgentry_to_pagenr(d->mm.perdomain_pt[i]);
   6.125 -                c->gdt_ents = 
   6.126 -                    (GET_GDT_ENTRIES(d) + 1) >> 3;
   6.127 -            }
   6.128 -            c->guestos_ss  = d->thread.guestos_ss;
   6.129 -            c->guestos_esp = d->thread.guestos_sp;
   6.130 -            c->pt_base   = 
   6.131 -                pagetable_val(d->mm.pagetable);
   6.132 -            memcpy(c->debugreg, 
   6.133 -                   d->thread.debugreg, 
   6.134 -                   sizeof(d->thread.debugreg));
   6.135 -            c->event_callback_cs  =
   6.136 -                d->event_selector;
   6.137 -            c->event_callback_eip =
   6.138 -                d->event_address;
   6.139 -            c->failsafe_callback_cs  = 
   6.140 -                d->failsafe_selector;
   6.141 -            c->failsafe_callback_eip = 
   6.142 -                d->failsafe_address;
   6.143 -
   6.144 -            if ( d != current )
   6.145 -                domain_unpause(d);
   6.146 -
   6.147 -            if ( copy_to_user(op->u.getdomaininfo.ctxt, c, sizeof(*c)) )
   6.148 -                ret = -EINVAL;
   6.149 -
   6.150 -            if ( c != NULL )
   6.151 -                kfree(c);
   6.152 -        }
   6.153 -
   6.154 -        if ( copy_to_user(u_dom0_op, op, sizeof(*op)) )     
   6.155 -            ret = -EINVAL;
   6.156 -
   6.157 -        put_domain(d);
   6.158 -    }
   6.159 -    break;
   6.160 -
   6.161      case DOM0_GETPAGEFRAMEINFO:
   6.162      {
   6.163          struct pfn_info *page;
   6.164 @@ -448,32 +310,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   6.165      }
   6.166      break;
   6.167  
   6.168 -    case DOM0_MSR:
   6.169 -    {
   6.170 -        if ( op->u.msr.write )
   6.171 -        {
   6.172 -            msr_cpu_mask = op->u.msr.cpu_mask;
   6.173 -            msr_addr = op->u.msr.msr;
   6.174 -            msr_lo = op->u.msr.in1;
   6.175 -            msr_hi = op->u.msr.in2;
   6.176 -            smp_call_function(write_msr_for, NULL, 1, 1);
   6.177 -            write_msr_for(NULL);
   6.178 -        }
   6.179 -        else
   6.180 -        {
   6.181 -            msr_cpu_mask = op->u.msr.cpu_mask;
   6.182 -            msr_addr = op->u.msr.msr;
   6.183 -            smp_call_function(read_msr_for, NULL, 1, 1);
   6.184 -            read_msr_for(NULL);
   6.185 -
   6.186 -            op->u.msr.out1 = msr_lo;
   6.187 -            op->u.msr.out2 = msr_hi;
   6.188 -            copy_to_user(u_dom0_op, op, sizeof(*op));
   6.189 -        }
   6.190 -        ret = 0;
   6.191 -    }
   6.192 -    break;
   6.193 -
   6.194  #ifdef XEN_DEBUGGER
   6.195      case DOM0_DEBUG:
   6.196      {
   6.197 @@ -543,20 +379,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   6.198      }
   6.199      break;
   6.200  
   6.201 -    case DOM0_SHADOW_CONTROL:
   6.202 -    {
   6.203 -        struct domain *d; 
   6.204 -        ret = -ESRCH;
   6.205 -        d = find_domain_by_id(op->u.shadow_control.domain);
   6.206 -        if ( d != NULL )
   6.207 -        {
   6.208 -            ret = shadow_mode_control(d, &op->u.shadow_control);
   6.209 -            put_domain(d);
   6.210 -            copy_to_user(u_dom0_op, op, sizeof(*op));
   6.211 -        } 
   6.212 -    }
   6.213 -    break;
   6.214 -
   6.215      case DOM0_SCHED_ID:
   6.216      {
   6.217          op->u.sched_id.sched_id = sched_id();
   6.218 @@ -696,7 +518,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   6.219      break;
   6.220  
   6.221      default:
   6.222 -        ret = -ENOSYS;
   6.223 +        ret = arch_do_dom0_op(op,u_dom0_op);
   6.224  
   6.225      }
   6.226  
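The common default case above now forwards anything it does not recognise to the architecture layer, and the x86-specific cases removed from this file presumably reappear behind that hook in the new arch-specific dom0_ops.c. Below is a minimal sketch of the shape such a handler takes, assuming the conventional switch-on-op->cmd dispatch; apart from arch_do_dom0_op, DOM0_MSR and the -ENOSYS fallback, the details are illustrative only:

    long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op)
    {
        long ret;

        switch ( op->cmd )
        {
        case DOM0_MSR:
            /* x86-only MSR read/write, moved out of common code above. */
            ret = 0;
            break;

        default:
            ret = -ENOSYS;   /* same fallback the old common default gave */
            break;
        }

        return ret;
    }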
     7.1 --- a/xen/common/domain.c	Thu Jul 01 23:32:40 2004 +0000
     7.2 +++ b/xen/common/domain.c	Thu Jul 01 23:45:24 2004 +0000
     7.3 @@ -15,7 +15,6 @@
     7.4  #include <asm/io.h>
     7.5  #include <asm/domain_page.h>
     7.6  #include <asm/flushtlb.h>
     7.7 -#include <asm/msr.h>
     7.8  #include <asm/i387.h>
     7.9  #include <hypervisor-ifs/dom0_ops.h>
    7.10  
    7.11 @@ -57,7 +56,7 @@ struct domain *do_createdomain(domid_t d
    7.12      atomic_set(&d->refcnt, 1);
    7.13      atomic_set(&d->pausecnt, 0);
    7.14  
    7.15 -    spin_lock_init(&d->mm.shadow_lock);
    7.16 +    shadow_lock_init(d);
    7.17  
    7.18      d->domain    = dom_id;
    7.19      d->processor = cpu;
    7.20 @@ -335,7 +334,7 @@ void domain_relinquish_memory(struct dom
    7.21          write_ptbase(&current->mm);
    7.22  
    7.23      /* Exit shadow mode before deconstructing final guest page table. */
    7.24 -    if ( d->mm.shadow_mode )
    7.25 +    if ( shadow_mode(d) )
    7.26          shadow_mode_disable(d);
    7.27  
    7.28      /* Drop the in-use reference to the page-table base. */
     8.1 --- a/xen/common/kernel.c	Thu Jul 01 23:32:40 2004 +0000
     8.2 +++ b/xen/common/kernel.c	Thu Jul 01 23:45:24 2004 +0000
     8.3 @@ -22,7 +22,6 @@
     8.4  #include <xen/shadow.h>
     8.5  #include <xen/trace.h>
     8.6  #include <asm/io.h>
     8.7 -#include <asm/msr.h>
     8.8  #include <asm/uaccess.h>
     8.9  #include <asm/domain_page.h>
    8.10  #include <hypervisor-ifs/dom0_ops.h>
     9.1 --- a/xen/common/shadow.c	Thu Jul 01 23:32:40 2004 +0000
     9.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.3 @@ -1,1058 +0,0 @@
     9.4 -/* -*-  Mode:C++; c-file-style:BSD; c-basic-offset:4; tab-width:4 -*- */
     9.5 -
     9.6 -#include <xen/config.h>
     9.7 -#include <xen/types.h>
     9.8 -#include <xen/mm.h>
     9.9 -#include <xen/shadow.h>
    9.10 -#include <asm/domain_page.h>
    9.11 -#include <asm/page.h>
    9.12 -#include <xen/event.h>
    9.13 -#include <xen/trace.h>
    9.14 -
    9.15 -
    9.16 -/********
    9.17 -
    9.18 -To use these shadow page tables, guests must not rely on the ACCESSED
    9.19 -and DIRTY bits on L2 pte's being accurate -- they will typically all be set.
    9.20 -
    9.21 -I doubt this will break anything. (If guests want to use the va_update
    9.22 -mechanism they've signed up for this anyhow...)
    9.23 -
    9.24 -There's a per-domain shadow table spin lock which works fine for SMP
    9.25 -hosts. We don't have to worry about interrupts as no shadow operations
    9.26 -happen in an interrupt context. It's probably not quite ready for SMP
     9.27 -guest operation as we have to worry about synchronisation between gpte
     9.28 -and spte updates. It's possible that this might only happen in a
     9.29 -hypercall context, in which case we'll probably have a per-domain
    9.30 -hypercall lock anyhow (at least initially).
    9.31 -
    9.32 -********/
    9.33 -
    9.34 -
    9.35 -/**
    9.36 -
    9.37 -FIXME:
    9.38 -
    9.39 -The shadow table flush command is dangerous on SMP systems as the
    9.40 -guest may be using the L2 on one CPU while the other is trying to 
    9.41 -blow the table away. 
    9.42 -
    9.43 -The current save restore code works around this by not calling FLUSH,
     9.44 -but by calling CLEAN2 which leaves all L2s intact (this is probably
    9.45 -quicker anyhow).
    9.46 -
    9.47 -Even so, we have to be very careful. The flush code may need to cause
    9.48 -a TLB flush on another CPU. It needs to do this while holding the
    9.49 -shadow table lock. The trouble is, the guest may be in the shadow page
    9.50 -fault handler spinning waiting to grab the shadow lock. It may have
     9.51 -interrupts disabled, hence we can't use the normal flush_tlb_cpu
    9.52 -mechanism.
    9.53 -
    9.54 -For the moment, we have a grim race whereby the spinlock in the shadow
    9.55 -fault handler is actually a try lock, in a loop with a helper for the
    9.56 -tlb flush code.
    9.57 -
     9.58 -A better solution would be to take a new flush lock, then raise a
    9.59 -per-domain soft irq on the other CPU.  The softirq will switch to
    9.60 -init's PTs, then do an atomic inc of a variable to count himself in,
    9.61 -then spin on a lock.  Having noticed that the other guy has counted
    9.62 -in, flush the shadow table, then release him by dropping the lock. He
    9.63 -will then reload cr3 from mm.page_table on the way out of the softirq.
    9.64 -
     9.65 -In domain-softirq context we know that the guy holds no locks and has
    9.66 -interrupts enabled. Nothing can go wrong ;-)
    9.67 -
    9.68 -**/
    9.69 -
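The handshake the comment above proposes, condensed into a few lines; all of the identifiers introduced here (flush_lock, flush_acks, the softirq raise, idle_task_mm) are illustrative placeholders rather than interfaces that exist in this tree:

    /* Flushing CPU (already holds the shadow lock): */
    spin_lock(&flush_lock);                   /* hypothetical per-domain flush lock */
    raise_flush_softirq(other_cpu);           /* hypothetical softirq raise         */
    while ( atomic_read(&flush_acks) == 0 )   /* wait for the other CPU to check in */
        rep_nop();
    __free_shadow_table(m);                   /* safe: that CPU is off these PTs    */
    spin_unlock(&flush_lock);                 /* lets the other CPU go              */

    /* Other CPU, in domain-softirq context (no locks held, interrupts on): */
    write_ptbase(&idle_task_mm);              /* hypothetical: switch to init's PTs */
    atomic_inc(&flush_acks);                  /* count ourselves in                 */
    spin_lock(&flush_lock);                   /* spins until the flush has finished */
    spin_unlock(&flush_lock);
    write_ptbase(&d->mm);                     /* reload cr3 from mm.pagetable       */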
    9.70 -static inline void free_shadow_page( struct mm_struct *m, 
    9.71 -                                     struct pfn_info *pfn_info )
    9.72 -{
    9.73 -    unsigned long flags;
    9.74 -    unsigned long type = pfn_info->type_and_flags & PGT_type_mask;
    9.75 -
    9.76 -    m->shadow_page_count--;
    9.77 -
    9.78 -    if (type == PGT_l1_page_table)
    9.79 -        perfc_decr(shadow_l1_pages);
    9.80 -    else if (type == PGT_l2_page_table)
    9.81 -        perfc_decr(shadow_l2_pages);
    9.82 -    else printk("Free shadow weird page type pfn=%08x type=%08x\n",
     9.83 -                pfn_info-frame_table, pfn_info->type_and_flags);
    9.84 -    
    9.85 -    pfn_info->type_and_flags = 0;
    9.86 -
    9.87 -    spin_lock_irqsave(&free_list_lock, flags);
    9.88 -    list_add(&pfn_info->list, &free_list);
    9.89 -    free_pfns++;
    9.90 -    spin_unlock_irqrestore(&free_list_lock, flags);
    9.91 -}
    9.92 -
    9.93 -static void __free_shadow_table( struct mm_struct *m )
    9.94 -{
    9.95 -    int j, free=0;
    9.96 -    struct shadow_status *a,*next;
    9.97 - 
    9.98 -    // the code assumes you're not using the page tables i.e.
    9.99 -    // the domain is stopped and cr3 is something else!!
   9.100 -
   9.101 -    // walk the hash table and call free_shadow_page on all pages
   9.102 -
   9.103 -    shadow_audit(m,1);
   9.104 -
   9.105 -    for(j=0;j<shadow_ht_buckets;j++)
   9.106 -    {
   9.107 -        a = &m->shadow_ht[j];        
   9.108 -        if (a->pfn)
   9.109 -        {
   9.110 -            free_shadow_page( m, 
   9.111 -                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
   9.112 -            a->pfn = 0;
   9.113 -            a->spfn_and_flags = 0;
   9.114 -            free++;
   9.115 -        }
   9.116 -        next=a->next;
   9.117 -        a->next=NULL;
   9.118 -        a=next;
   9.119 -        while(a)
   9.120 -        { 
   9.121 -            struct shadow_status *next = a->next;
   9.122 -
   9.123 -            free_shadow_page( m, 
   9.124 -                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
   9.125 -            a->pfn = 0;
   9.126 -            a->spfn_and_flags = 0;
   9.127 -            free++;
   9.128 -            a->next = m->shadow_ht_free;           
   9.129 -            m->shadow_ht_free = a;
   9.130 -            a=next;
   9.131 -        }
   9.132 -        shadow_audit(m,0);
   9.133 -    }
   9.134 -    SH_LOG("Free shadow table. Freed= %d",free);
   9.135 -}
   9.136 -
   9.137 -
   9.138 -#define TABLE_OP_ZERO_L2 1
   9.139 -#define TABLE_OP_ZERO_L1 2
   9.140 -#define TABLE_OP_FREE_L1 3
   9.141 -
   9.142 -static inline int shadow_page_op( struct mm_struct *m, unsigned int op, 
   9.143 -								  unsigned int gpfn,
   9.144 -                                  struct pfn_info *spfn_info, int *work )
   9.145 -{
   9.146 -    unsigned int spfn = spfn_info-frame_table;
   9.147 -	int restart = 0;
   9.148 -
   9.149 -    switch( op )
   9.150 -    {
   9.151 -	case TABLE_OP_ZERO_L2:
   9.152 -	{
   9.153 -		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   9.154 -             PGT_l2_page_table )
   9.155 -		{
   9.156 -			unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   9.157 -#ifdef __i386__
   9.158 -			memset(spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e));
   9.159 -#endif
   9.160 -			unmap_domain_mem( spl1e );
   9.161 -		}
   9.162 -    }
   9.163 -	break;
   9.164 -	
   9.165 -	case TABLE_OP_ZERO_L1:
   9.166 -	{
   9.167 -		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   9.168 -             PGT_l1_page_table )
   9.169 -		{
   9.170 -			unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   9.171 -			memset( spl1e, 0, ENTRIES_PER_L1_PAGETABLE * sizeof(*spl1e) );
   9.172 -			unmap_domain_mem( spl1e );
   9.173 -		}
   9.174 -    }
   9.175 -	break;
   9.176 -
   9.177 -	case TABLE_OP_FREE_L1:
   9.178 -	{
   9.179 -		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   9.180 -             PGT_l1_page_table )
   9.181 -		{
   9.182 -			// lock is already held
   9.183 -			delete_shadow_status( m, gpfn );
   9.184 -			restart = 1; // we need to go to start of list again
   9.185 -		}
   9.186 -    }
   9.187 -
   9.188 -	break;
   9.189 -	
   9.190 -	default:
   9.191 -		BUG();
   9.192 -
   9.193 -    }
   9.194 -    return restart;
   9.195 -}
   9.196 -
   9.197 -static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
   9.198 -{
   9.199 -    int j, work=0;
   9.200 -    struct shadow_status *a, *next;
   9.201 - 
   9.202 -    // the code assumes you're not using the page tables i.e.
   9.203 -    // the domain is stopped and cr3 is something else!!
   9.204 -
    9.205 -    // walk the hash table and call shadow_page_op on all entries
   9.206 -
   9.207 -    shadow_audit(m,1);
   9.208 -
   9.209 -    for(j=0;j<shadow_ht_buckets;j++)
   9.210 -    {
   9.211 -	retry:
   9.212 -        a = &m->shadow_ht[j];     
   9.213 -		next = a->next;
   9.214 -        if (a->pfn)
   9.215 -        {
   9.216 -            if ( shadow_page_op( m, op, a->pfn,								 
   9.217 -								 &frame_table[a->spfn_and_flags & PSH_pfn_mask], 
   9.218 -								 &work ) )
   9.219 -				goto retry;
   9.220 -        }
   9.221 -        a=next;
   9.222 -        while(a)
   9.223 -        { 
   9.224 -			next = a->next;
   9.225 -            if ( shadow_page_op( m, op, a->pfn,
   9.226 -								 &frame_table[a->spfn_and_flags & PSH_pfn_mask],
   9.227 -								 &work ) )
   9.228 -				goto retry;
   9.229 -            a=next;
   9.230 -        }
   9.231 -        shadow_audit(m,0);
   9.232 -    }
   9.233 -    SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
   9.234 -}
   9.235 -
   9.236 -
   9.237 -void shadow_mode_init(void)
   9.238 -{
   9.239 -}
   9.240 -
   9.241 -int shadow_mode_enable( struct domain *p, unsigned int mode )
   9.242 -{
   9.243 -    struct mm_struct *m = &p->mm;
   9.244 -    struct shadow_status **fptr;
   9.245 -    int i;
   9.246 -
   9.247 -    m->shadow_mode = mode;
   9.248 - 
   9.249 -    // allocate hashtable
   9.250 -    m->shadow_ht = kmalloc(shadow_ht_buckets * 
   9.251 -                           sizeof(struct shadow_status));
   9.252 -    if( m->shadow_ht == NULL )
   9.253 -        goto nomem;
   9.254 -
   9.255 -    memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status));
   9.256 -
   9.257 -    // allocate space for first lot of extra nodes
   9.258 -    m->shadow_ht_extras = kmalloc(sizeof(void*) + 
   9.259 -                                  (shadow_ht_extra_size * 
   9.260 -                                   sizeof(struct shadow_status)));
   9.261 -    if( m->shadow_ht_extras == NULL )
   9.262 -        goto nomem;
   9.263 -
   9.264 -    memset( m->shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size * 
   9.265 -                                                     sizeof(struct shadow_status)) );
   9.266 -
   9.267 -    m->shadow_extras_count++;
   9.268 - 
   9.269 -    // add extras to free list
   9.270 -    fptr = &m->shadow_ht_free;
   9.271 -    for ( i=0; i<shadow_ht_extra_size; i++ )
   9.272 -    {
   9.273 -        *fptr = &m->shadow_ht_extras[i];
   9.274 -        fptr = &(m->shadow_ht_extras[i].next);
   9.275 -    }
   9.276 -    *fptr = NULL;
   9.277 -    *((struct shadow_status ** ) 
   9.278 -      &m->shadow_ht_extras[shadow_ht_extra_size]) = NULL;
   9.279 -
   9.280 -    if ( mode == SHM_logdirty )
   9.281 -    {
   9.282 -        m->shadow_dirty_bitmap_size = (p->max_pages+63)&(~63);
   9.283 -        m->shadow_dirty_bitmap = 
   9.284 -            kmalloc( m->shadow_dirty_bitmap_size/8);
   9.285 -        if( m->shadow_dirty_bitmap == NULL )
   9.286 -        {
   9.287 -            m->shadow_dirty_bitmap_size = 0;
   9.288 -            goto nomem;
   9.289 -        }
   9.290 -        memset(m->shadow_dirty_bitmap,0,m->shadow_dirty_bitmap_size/8);
   9.291 -    }
   9.292 -
   9.293 -    // call shadow_mk_pagetable
   9.294 -    __shadow_mk_pagetable( m );
   9.295 -    return 0;
   9.296 -
   9.297 -nomem:
   9.298 -    return -ENOMEM;
   9.299 -}
   9.300 -
   9.301 -void shadow_mode_disable( struct domain *p )
   9.302 -{
   9.303 -    struct mm_struct *m = &p->mm;
   9.304 -    struct shadow_status *next;
   9.305 -
   9.306 -    __free_shadow_table( m );
   9.307 -    m->shadow_mode = 0;
   9.308 -
   9.309 -    SH_LOG("freed tables count=%d l1=%d l2=%d",
   9.310 -           m->shadow_page_count, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
   9.311 -
   9.312 -    next = m->shadow_ht_extras;
   9.313 -    while( next )
   9.314 -    {
   9.315 -        struct shadow_status * this = next;
   9.316 -        m->shadow_extras_count--;
   9.317 -        next = *((struct shadow_status **)(&next[shadow_ht_extra_size]));
   9.318 -        kfree( this );
   9.319 -    }
   9.320 -
   9.321 -    SH_LOG("freed extras, now %d", m->shadow_extras_count);
   9.322 -
   9.323 -    if( m->shadow_dirty_bitmap  )
   9.324 -    {
   9.325 -        kfree( m->shadow_dirty_bitmap );
   9.326 -        m->shadow_dirty_bitmap = 0;
   9.327 -        m->shadow_dirty_bitmap_size = 0;
   9.328 -    }
   9.329 -
   9.330 -    // free the hashtable itself
   9.331 -    kfree( &m->shadow_ht[0] );
   9.332 -}
   9.333 -
   9.334 -static int shadow_mode_table_op(struct domain *d, 
   9.335 -							    dom0_shadow_control_t *sc)
   9.336 -{
   9.337 -    unsigned int op = sc->op;
   9.338 -    struct mm_struct *m = &d->mm;
   9.339 -    int rc = 0;
   9.340 -
    9.341 -    // since Dom0 did the hypercall, we should be running with its page
   9.342 -    // tables right now. Calling flush on yourself would be really
   9.343 -    // stupid.
   9.344 -
   9.345 -    ASSERT(spin_is_locked(&d->mm.shadow_lock));
   9.346 -
   9.347 -    if ( m == &current->mm )
   9.348 -    {
   9.349 -        printk("Don't try and flush your own page tables!\n");
   9.350 -        return -EINVAL;
   9.351 -    }
   9.352 -   
   9.353 -    SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
   9.354 -
   9.355 -    shadow_audit(m,1);
   9.356 -
   9.357 -    switch(op)
   9.358 -    {
   9.359 -    case DOM0_SHADOW_CONTROL_OP_FLUSH:
   9.360 -        // XXX THIS IS VERY DANGEROUS : MUST ENSURE THE PTs ARE NOT IN USE ON
   9.361 -		// OTHER CPU -- fix when we get sched sync pause.
   9.362 -        __free_shadow_table( m );  
   9.363 -        break;
   9.364 -   
   9.365 -    case DOM0_SHADOW_CONTROL_OP_CLEAN:   // zero all-non hypervisor
   9.366 -	{
   9.367 -		__scan_shadow_table( m, TABLE_OP_ZERO_L2 );
   9.368 -		__scan_shadow_table( m, TABLE_OP_ZERO_L1 );
   9.369 -
   9.370 -		goto send_bitmap;
   9.371 -	}
   9.372 -		
   9.373 -
   9.374 -    case DOM0_SHADOW_CONTROL_OP_CLEAN2:  // zero all L2, free L1s
   9.375 -    {
   9.376 -		int i,j,zero=1;
   9.377 -		
   9.378 -		__scan_shadow_table( m, TABLE_OP_ZERO_L2 );
   9.379 -		__scan_shadow_table( m, TABLE_OP_FREE_L1 );
   9.380 -		
   9.381 -	send_bitmap:
   9.382 -		sc->stats.fault_count       = d->mm.shadow_fault_count;
   9.383 -		sc->stats.dirty_count       = d->mm.shadow_dirty_count;
   9.384 -		sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
   9.385 -		sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
   9.386 -
   9.387 -		d->mm.shadow_fault_count       = 0;
   9.388 -		d->mm.shadow_dirty_count       = 0;
   9.389 -		d->mm.shadow_dirty_net_count   = 0;
   9.390 -		d->mm.shadow_dirty_block_count = 0;
   9.391 -	
   9.392 -		sc->pages = d->tot_pages;
   9.393 -
   9.394 -		if( d->tot_pages > sc->pages || 
   9.395 -			!sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
   9.396 -		{
   9.397 -			rc = -EINVAL;
   9.398 -			goto out;
   9.399 -		}
   9.400 -
   9.401 -	
   9.402 -#define chunk (8*1024) // do this in 1KB chunks for L1 cache
   9.403 -	
   9.404 -		for(i=0;i<d->tot_pages;i+=chunk)
   9.405 -		{
   9.406 -			int bytes = ((  ((d->tot_pages-i) > (chunk))?
   9.407 -							(chunk):(d->tot_pages-i) ) + 7) / 8;
   9.408 -	    
   9.409 -			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
   9.410 -						  d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   9.411 -						  bytes );
   9.412 -	    
   9.413 -			for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
   9.414 -			{
   9.415 -				if( d->mm.shadow_dirty_bitmap[j] != 0 )
   9.416 -					zero = 0;
   9.417 -			}
   9.418 -
   9.419 -			memset( d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   9.420 -					0, bytes);
   9.421 -		}
   9.422 -
   9.423 -        /* Might as well stop the domain as an optimization. */
   9.424 -		if ( zero )
   9.425 -            domain_pause_by_systemcontroller(d);
   9.426 -
   9.427 -		break;
   9.428 -    }
   9.429 -
   9.430 -    case DOM0_SHADOW_CONTROL_OP_PEEK:
   9.431 -    {
   9.432 -		int i;
   9.433 -
   9.434 -		sc->stats.fault_count       = d->mm.shadow_fault_count;
   9.435 -		sc->stats.dirty_count       = d->mm.shadow_dirty_count;
   9.436 -		sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
   9.437 -		sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
   9.438 -	
   9.439 -		if( d->tot_pages > sc->pages || 
   9.440 -			!sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
   9.441 -		{
   9.442 -			rc = -EINVAL;
   9.443 -			goto out;
   9.444 -		}
   9.445 -	
   9.446 -		sc->pages = d->tot_pages;
   9.447 -	
   9.448 -#define chunk (8*1024) // do this in 1KB chunks for L1 cache
   9.449 -	
   9.450 -		for(i=0;i<d->tot_pages;i+=chunk)
   9.451 -		{
   9.452 -			int bytes = ((  ((d->tot_pages-i) > (chunk))?
   9.453 -							(chunk):(d->tot_pages-i) ) + 7) / 8;
   9.454 -	    
   9.455 -			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
   9.456 -						  d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
   9.457 -						  bytes );	    
   9.458 -		}
   9.459 -
   9.460 -		break;
   9.461 -    }
   9.462 -
   9.463 -	default:
   9.464 -		BUG();
   9.465 -
   9.466 -    }
   9.467 -
   9.468 -
   9.469 -out:
   9.470 -
   9.471 -    SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
   9.472 -
   9.473 -    shadow_audit(m,1);
   9.474 -
   9.475 -    // call shadow_mk_pagetable
   9.476 -    __shadow_mk_pagetable( m );
   9.477 -
   9.478 -    return rc;
   9.479 -}
   9.480 -
   9.481 -int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc )
   9.482 -{
   9.483 -    unsigned int cmd = sc->op;
   9.484 -    int rc = 0;
   9.485 -
   9.486 -    spin_lock(&p->mm.shadow_lock);
   9.487 -
   9.488 -    if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
   9.489 -    {
   9.490 -        shadow_mode_disable(p);
   9.491 -    }
   9.492 -    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
   9.493 -    {
   9.494 -        if(p->mm.shadow_mode) shadow_mode_disable(p);
   9.495 -        shadow_mode_enable(p, SHM_test);
   9.496 -    } 
   9.497 -    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
   9.498 -    {
   9.499 -        if(p->mm.shadow_mode) shadow_mode_disable(p);
   9.500 -        shadow_mode_enable(p, SHM_logdirty);
   9.501 -    } 
   9.502 -    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN2 )
   9.503 -    {
   9.504 -        rc = shadow_mode_table_op(p, sc);
   9.505 -    }
   9.506 -    else
   9.507 -    {
   9.508 -        rc = -EINVAL;
   9.509 -    }
   9.510 -
   9.511 -	flush_tlb_cpu(p->processor);
   9.512 -   
   9.513 -    spin_unlock(&p->mm.shadow_lock);
   9.514 -
   9.515 -    return rc;
   9.516 -}
   9.517 -
   9.518 -
   9.519 -
   9.520 -static inline struct pfn_info *alloc_shadow_page( struct mm_struct *m )
   9.521 -{
   9.522 -    m->shadow_page_count++;
   9.523 -
   9.524 -    return alloc_domain_page( NULL );
   9.525 -}
   9.526 -
   9.527 -
   9.528 -void unshadow_table( unsigned long gpfn, unsigned int type )
   9.529 -{
   9.530 -    unsigned long spfn;
   9.531 -
   9.532 -    SH_VLOG("unshadow_table type=%08x gpfn=%08lx",
   9.533 -            type,
   9.534 -            gpfn );
   9.535 -
   9.536 -    perfc_incrc(unshadow_table_count);
   9.537 -
   9.538 -    // this function is the same for both l1 and l2 tables
   9.539 -
   9.540 -    // even in the SMP guest case, there won't be a race here as
   9.541 -    // this CPU was the one that cmpxchg'ed the page to invalid
   9.542 -
   9.543 -    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
   9.544 -
   9.545 -    delete_shadow_status(&current->mm, gpfn);
   9.546 -
   9.547 -    free_shadow_page( &current->mm, &frame_table[spfn] );
   9.548 -
   9.549 -}
   9.550 -
   9.551 -
   9.552 -unsigned long shadow_l2_table( 
   9.553 -    struct mm_struct *m, unsigned long gpfn )
   9.554 -{
   9.555 -    struct pfn_info *spfn_info;
   9.556 -    unsigned long spfn;
   9.557 -    l2_pgentry_t *spl2e, *gpl2e;
   9.558 -    int i;
   9.559 -
   9.560 -    SH_VVLOG("shadow_l2_table( %08lx )",gpfn);
   9.561 -
   9.562 -    perfc_incrc(shadow_l2_table_count);
   9.563 -
   9.564 -    // XXX in future, worry about racing in SMP guests 
   9.565 -    //      -- use cmpxchg with PSH_pending flag to show progress (and spin)
   9.566 -
   9.567 -    spfn_info = alloc_shadow_page(m);
   9.568 -
   9.569 -    ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache
   9.570 -
   9.571 -    spfn_info->type_and_flags = PGT_l2_page_table;
   9.572 -    perfc_incr(shadow_l2_pages);
   9.573 -
   9.574 -    spfn = (unsigned long) (spfn_info - frame_table);
   9.575 -
   9.576 -    // mark pfn as being shadowed, update field to point at shadow
   9.577 -    set_shadow_status(m, gpfn, spfn | PSH_shadowed);
   9.578 - 
   9.579 -    // we need to do this before the linear map is set up
   9.580 -    spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
   9.581 -
   9.582 -#ifdef __i386__
    9.583 -    // get hypervisor and 2x linear PT mappings installed
   9.584 -    memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   9.585 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   9.586 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   9.587 -    spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   9.588 -        mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
   9.589 -    spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
   9.590 -        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
   9.591 -    spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
   9.592 -        mk_l2_pgentry(__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | 
   9.593 -                      __PAGE_HYPERVISOR);
   9.594 -#endif
   9.595 -
   9.596 -    // can't use the linear map as we may not be in the right PT
   9.597 -    gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
   9.598 -
   9.599 -    // proactively create entries for pages that are already shadowed
   9.600 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
   9.601 -    {
   9.602 -        unsigned long spte = 0;
   9.603 -
   9.604 -#if 0  // Turns out this doesn't really help
   9.605 -        unsigned long gpte;
   9.606 -
   9.607 -        gpte = l2_pgentry_val(gpl2e[i]);
   9.608 -
   9.609 -        if (gpte & _PAGE_PRESENT)
   9.610 -        {
   9.611 -            unsigned long s_sh = 
   9.612 -                __shadow_status(p, gpte>>PAGE_SHIFT);
   9.613 -
   9.614 -            l2pde_general( m, &gpte, &spte, s_sh );
   9.615 -
   9.616 -        }
   9.617 -#endif
   9.618 -
   9.619 -        spl2e[i] = mk_l2_pgentry( spte );
   9.620 -
   9.621 -    }
   9.622 -
    9.623 -    // it's arguable we should 'preemptively shadow' a few active L1 pages
   9.624 -    // to avoid taking a string of faults when 'jacking' a running domain
   9.625 -
   9.626 -    unmap_domain_mem( gpl2e );
   9.627 -    unmap_domain_mem( spl2e );
   9.628 -
   9.629 -    SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn);
   9.630 -
   9.631 -    return spfn;
   9.632 -}
   9.633 -
   9.634 -
   9.635 -int shadow_fault( unsigned long va, long error_code )
   9.636 -{
   9.637 -    unsigned long gpte, spte;
   9.638 -    struct mm_struct *m = &current->mm;
   9.639 -
   9.640 -    SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code );
   9.641 -
   9.642 -    check_pagetable( current, current->mm.pagetable, "pre-sf" );
   9.643 -
   9.644 -    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
   9.645 -    {
   9.646 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
   9.647 -        return 0;  // propagate to guest
   9.648 -    }
   9.649 -
   9.650 -    if ( ! (gpte & _PAGE_PRESENT) )
   9.651 -    {
   9.652 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
   9.653 -        return 0;  // we're not going to be able to help
   9.654 -    }
   9.655 -
   9.656 -    if ( (error_code & 2)  && ! (gpte & _PAGE_RW) )
   9.657 -    {
   9.658 -        // write fault on RO page
   9.659 -        return 0;
   9.660 -    }
   9.661 -
   9.662 -    // take the lock and reread gpte
   9.663 -
   9.664 -    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
   9.665 -	{
   9.666 -		extern volatile unsigned long flush_cpumask;
   9.667 -		if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
   9.668 -			local_flush_tlb();
   9.669 -		rep_nop();
   9.670 -	}
   9.671 -	
   9.672 -	ASSERT(spin_is_locked(&current->mm.shadow_lock));
   9.673 -	
   9.674 -    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
   9.675 -    {
   9.676 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
   9.677 -        spin_unlock(&m->shadow_lock);
   9.678 -        return 0;  // propagate to guest
   9.679 -    }
   9.680 -
   9.681 -    if ( unlikely(!(gpte & _PAGE_PRESENT)) )
   9.682 -    {
   9.683 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
   9.684 -        spin_unlock(&m->shadow_lock);
   9.685 -        return 0;  // we're not going to be able to help
   9.686 -    }
   9.687 -
   9.688 -    if ( error_code & 2  )  
   9.689 -    {  // write fault
   9.690 -        if ( likely(gpte & _PAGE_RW) )
   9.691 -        {
   9.692 -            l1pte_write_fault( m, &gpte, &spte );
   9.693 -        }
   9.694 -        else
   9.695 -        {   // write fault on RO page
   9.696 -            SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte );
   9.697 -            spin_unlock(&m->shadow_lock);
   9.698 -            return 0; // propagate to guest
   9.699 -            // not clear whether we should set accessed bit here...
   9.700 -        }
   9.701 -    }
   9.702 -    else
   9.703 -    {
   9.704 -        l1pte_read_fault( m, &gpte, &spte );
   9.705 -    }
   9.706 -
   9.707 -    SH_VVLOG("plan: gpte=%08lx  spte=%08lx", gpte, spte );
   9.708 -
   9.709 -    // write back updated gpte
   9.710 -    // XXX watch out for read-only L2 entries! (not used in Linux)
   9.711 -    if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
   9.712 -        BUG();  // fixme!
   9.713 -
   9.714 -    if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) )
   9.715 -    { 
   9.716 -        // failed:
   9.717 -        //  the L1 may not be shadowed, or the L2 entry may be insufficient
   9.718 -
   9.719 -        unsigned long gpde, spde, gl1pfn, sl1pfn;
   9.720 -
   9.721 -        SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx  spte=%08lx",gpte,spte );
   9.722 -
   9.723 -        gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]);
   9.724 -
   9.725 -        gl1pfn = gpde>>PAGE_SHIFT;
   9.726 -
   9.727 -        
   9.728 -        if ( ! (sl1pfn=__shadow_status(&current->mm, gl1pfn) ) )
   9.729 -        {
   9.730 -            // this L1 is NOT already shadowed so we need to shadow it
   9.731 -            struct pfn_info *sl1pfn_info;
   9.732 -            unsigned long *gpl1e, *spl1e;
   9.733 -            int i;
   9.734 -            sl1pfn_info = alloc_shadow_page( &current->mm ); 
   9.735 -            sl1pfn_info->type_and_flags = PGT_l1_page_table;
   9.736 -			
   9.737 -            sl1pfn = sl1pfn_info - frame_table;
   9.738 -
   9.739 -            SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn);
   9.740 -            perfc_incrc(shadow_l1_table_count);
   9.741 -            perfc_incr(shadow_l1_pages);
   9.742 -
   9.743 -            set_shadow_status(&current->mm, gl1pfn, PSH_shadowed | sl1pfn);
   9.744 -
   9.745 -            l2pde_general( m, &gpde, &spde, sl1pfn );
   9.746 -
   9.747 -            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
   9.748 -            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] =  mk_l2_pgentry(spde);
   9.749 -
   9.750 -            gpl1e = (unsigned long *) &(linear_pg_table[
   9.751 -                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]);
   9.752 -
   9.753 -            spl1e = (unsigned long *) &shadow_linear_pg_table[
   9.754 -                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ];
   9.755 -
   9.756 -
   9.757 -            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
   9.758 -            {
   9.759 -                l1pte_no_fault( m, &gpl1e[i], &spl1e[i] );
   9.760 -            }
   9.761 -
   9.762 -
   9.763 -        }
   9.764 -        else
   9.765 -        {
   9.766 -            // this L1 was shadowed (by another PT) but we didn't have an L2
   9.767 -            // entry for it
   9.768 -
   9.769 -            SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn);
   9.770 -
   9.771 -            l2pde_general( m, &gpde, &spde, sl1pfn );
   9.772 -
   9.773 -            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
   9.774 -            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
   9.775 -   
   9.776 -        }              
   9.777 -
   9.778 -        shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte);
   9.779 -        // (we need to do the above even if we've just made the shadow L1)
   9.780 -
   9.781 -    } // end of fixup writing the shadow L1 directly failed
   9.782 -     
   9.783 -    perfc_incrc(shadow_fixup_count);
   9.784 -
   9.785 -	m->shadow_fault_count++;
   9.786 -
   9.787 -    check_pagetable( current, current->mm.pagetable, "post-sf" );
   9.788 -
   9.789 -    spin_unlock(&m->shadow_lock);
   9.790 -
   9.791 -    return 1; // let's try the faulting instruction again...
   9.792 -
   9.793 -}
   9.794 -
   9.795 -
   9.796 -void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
   9.797 -                                 unsigned long *prev_spfn_ptr,
   9.798 -                                 l1_pgentry_t **prev_spl1e_ptr )
   9.799 -{
   9.800 -    unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr;    
   9.801 -    l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr;
   9.802 -
   9.803 -
   9.804 -    SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%p\n",
   9.805 -             pa,gpte,prev_spfn, prev_spl1e);
   9.806 -
   9.807 -    // to get here, we know the l1 page *must* be shadowed
   9.808 -
   9.809 -    gpfn = pa >> PAGE_SHIFT;
   9.810 -    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
   9.811 -
   9.812 -    if ( spfn == prev_spfn )
   9.813 -    {
   9.814 -        spl1e = prev_spl1e;
   9.815 -    }
   9.816 -    else
   9.817 -    {
   9.818 -        if( prev_spl1e ) unmap_domain_mem( prev_spl1e );
   9.819 -        spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   9.820 -        *prev_spfn_ptr  = spfn;
   9.821 -        *prev_spl1e_ptr = spl1e;
   9.822 -    }
   9.823 -
   9.824 -    // XXX we assume only pagetables can be shadowed; 
   9.825 -    // this will have to change to allow arbitrary CoW etc.
   9.826 -
   9.827 -    l1pte_no_fault( &current->mm, &gpte, &spte );
   9.828 -
   9.829 -
   9.830 -    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = mk_l1_pgentry( spte );
   9.831 -
   9.832 -}
   9.833 -
   9.834 -void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte )
   9.835 -{
   9.836 -    unsigned long gpfn, spfn, spte;
   9.837 -    l2_pgentry_t * sp2le;
   9.838 -    unsigned long s_sh=0;
   9.839 -
   9.840 -    SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte);
   9.841 -
   9.842 -    // to get here, we know the l2 page has a shadow
   9.843 -
   9.844 -    gpfn = pa >> PAGE_SHIFT;
   9.845 -    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
   9.846 -
   9.847 -
   9.848 -    spte = 0;
   9.849 -
   9.850 -    if( gpte & _PAGE_PRESENT )
   9.851 -        s_sh = __shadow_status(&current->mm, gpte >> PAGE_SHIFT);
   9.852 -
   9.853 -    sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   9.854 -    // no real need for a cache here
   9.855 -
   9.856 -    l2pde_general( &current->mm, &gpte, &spte, s_sh );
   9.857 -
   9.858 -    // XXXX Should mark guest pte as DIRTY and ACCESSED too!!!!!
   9.859 -
   9.860 -    sp2le[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t) ] = 
   9.861 -        mk_l2_pgentry( spte );
   9.862 -
   9.863 -    unmap_domain_mem( (void *) sp2le );
   9.864 -}
   9.865 -
   9.866 -
   9.867 -#if SHADOW_DEBUG
   9.868 -
   9.869 -static int sh_l2_present;
   9.870 -static int sh_l1_present;
   9.871 -char * sh_check_name;
   9.872 -
   9.873 -#define FAIL(_f, _a...)                             \
   9.874 -{printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n",  sh_check_name, level, i, ## _a , gpte, spte ); BUG();}
   9.875 -
   9.876 -static int check_pte( struct mm_struct *m, 
   9.877 -                      unsigned long gpte, unsigned long spte, int level, int i )
   9.878 -{
   9.879 -    unsigned long mask, gpfn, spfn;
   9.880 -
   9.881 -    if ( spte == 0 || spte == 0xdeadface || spte == 0x00000E00)
   9.882 -        return 1;  // always safe
   9.883 -
   9.884 -    if ( !(spte & _PAGE_PRESENT) )
   9.885 -        FAIL("Non zero not present spte");
   9.886 -
   9.887 -    if( level == 2 ) sh_l2_present++;
   9.888 -    if( level == 1 ) sh_l1_present++;
   9.889 -
   9.890 -    if ( !(gpte & _PAGE_PRESENT) )
   9.891 -        FAIL("Guest not present yet shadow is");
   9.892 -
   9.893 -    mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000);
   9.894 -
   9.895 -    if ( (spte & mask) != (gpte & mask ) )
   9.896 -        FAIL("Corrupt?");
   9.897 -
   9.898 -    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
   9.899 -        FAIL("Dirty coherence");
   9.900 -
   9.901 -    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
   9.902 -        FAIL("Accessed coherence");
   9.903 -
   9.904 -    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
   9.905 -        FAIL("RW coherence");
   9.906 -
   9.907 -    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY) ))
   9.908 -        FAIL("RW2 coherence");
   9.909 - 
   9.910 -    spfn = spte>>PAGE_SHIFT;
   9.911 -    gpfn = gpte>>PAGE_SHIFT;
   9.912 -
   9.913 -    if ( gpfn == spfn )
   9.914 -    {
   9.915 -        if ( level > 1 )
   9.916 -            FAIL("Linear map ???");    // XXX this will fail on BSD
   9.917 -
   9.918 -        return 1;
   9.919 -    }
   9.920 -    else
   9.921 -    {
   9.922 -        if ( level < 2 )
   9.923 -            FAIL("Shadow in L1 entry?");
   9.924 -
   9.925 -        if ( __shadow_status(p, gpfn) != (PSH_shadowed | spfn) )
   9.926 -            FAIL("spfn problem g.sf=%08lx", 
   9.927 -                 __shadow_status(p, gpfn) );
   9.928 -    }
   9.929 -
   9.930 -    return 1;
   9.931 -}
   9.932 -
   9.933 -
   9.934 -static int check_l1_table( struct mm_struct *m, unsigned long va, 
   9.935 -                           unsigned long g2, unsigned long s2 )
   9.936 -{
   9.937 -    int j;
   9.938 -    unsigned long *gpl1e, *spl1e;
   9.939 -
   9.940 -    //gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]);
   9.941 -    //spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]);
   9.942 -
   9.943 -    gpl1e = map_domain_mem( g2<<PAGE_SHIFT );
   9.944 -    spl1e = map_domain_mem( s2<<PAGE_SHIFT );
   9.945 -
   9.946 -    for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
   9.947 -    {
   9.948 -        unsigned long gpte = gpl1e[j];
   9.949 -        unsigned long spte = spl1e[j];
   9.950 -  
   9.951 -        check_pte( p, gpte, spte, 1, j );
   9.952 -    }
   9.953 - 
   9.954 -    unmap_domain_mem( spl1e );
   9.955 -    unmap_domain_mem( gpl1e );
   9.956 -
   9.957 -    return 1;
   9.958 -}
   9.959 -
   9.960 -#define FAILPT(_f, _a...)                             \
   9.961 -{printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); BUG();}
   9.962 -
   9.963 -int check_pagetable( struct mm_struct *m, pagetable_t pt, char *s )
   9.964 -{
   9.965 -    unsigned long gptbase = pagetable_val(pt);
   9.966 -    unsigned long gpfn, spfn;
   9.967 -    int i;
   9.968 -    l2_pgentry_t *gpl2e, *spl2e;
   9.969 -
   9.970 -    sh_check_name = s;
   9.971 -
   9.972 -    SH_VVLOG("%s-PT Audit",s);
   9.973 -
   9.974 -    sh_l2_present = sh_l1_present = 0;
   9.975 -
   9.976 -    gpfn =  gptbase >> PAGE_SHIFT;
   9.977 -
   9.978 -    if ( ! (__shadow_status(p, gpfn) & PSH_shadowed) )
   9.979 -    {
   9.980 -        printk("%s-PT %08lx not shadowed\n", s, gptbase);
   9.981 -
   9.982 -        if( __shadow_status(p, gpfn) != 0 ) BUG();
   9.983 -
   9.984 -        return 0;
   9.985 -    }
   9.986 - 
   9.987 -    spfn = __shadow_status(p, gpfn) & PSH_pfn_mask;
   9.988 -
   9.989 -    if ( ! __shadow_status(p, gpfn) == (PSH_shadowed | spfn) )
   9.990 -        FAILPT("ptbase shadow inconsistent1");
   9.991 -
   9.992 -    gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
   9.993 -    spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   9.994 -
   9.995 -    //ipl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
   9.996 -
   9.997 -
   9.998 -    if ( memcmp( &spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   9.999 -                 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  9.1000 -                 ((SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT))-DOMAIN_ENTRIES_PER_L2_PAGETABLE)
  9.1001 -                 * sizeof(l2_pgentry_t)) )
  9.1002 -    {
  9.1003 -        printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
  9.1004 -        for (i=DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
  9.1005 -             i<(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT));
  9.1006 -             i++ )
  9.1007 -            printk("+++ (%d) %08lx %08lx\n",i,
  9.1008 -                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]) );
  9.1009 -        FAILPT("hypervisor entries inconsistent");
  9.1010 -    }
  9.1011 -
  9.1012 -    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  9.1013 -          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
  9.1014 -        FAILPT("hypervisor linear map inconsistent");
  9.1015 -
  9.1016 -    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
  9.1017 -          ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
  9.1018 -        FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx",
  9.1019 -               l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]),
  9.1020 -               (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR
  9.1021 -            );
  9.1022 -
  9.1023 -    if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  9.1024 -          ((__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) )
  9.1025 -        FAILPT("hypervisor per-domain map inconsistent");
  9.1026 -
  9.1027 -
  9.1028 -    // check the whole L2
  9.1029 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  9.1030 -    {
  9.1031 -        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
  9.1032 -        unsigned long spte = l2_pgentry_val(spl2e[i]);
  9.1033 -
  9.1034 -        check_pte( p, gpte, spte, 2, i );
  9.1035 -    }
  9.1036 -
  9.1037 -
  9.1038 -    // go back and recurse
  9.1039 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  9.1040 -    {
  9.1041 -        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
  9.1042 -        unsigned long spte = l2_pgentry_val(spl2e[i]);
  9.1043 -
  9.1044 -        if ( spte )    
  9.1045 -            check_l1_table( p,
  9.1046 -                            i<<L2_PAGETABLE_SHIFT,
  9.1047 -                            gpte>>PAGE_SHIFT, spte>>PAGE_SHIFT );
  9.1048 -
  9.1049 -    }
  9.1050 -
  9.1051 -    unmap_domain_mem( spl2e );
  9.1052 -    unmap_domain_mem( gpl2e );
  9.1053 -
  9.1054 -    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n",
  9.1055 -             sh_l2_present, sh_l1_present );
  9.1056 - 
  9.1057 -    return 1;
  9.1058 -}
  9.1059 -
  9.1060 -
  9.1061 -#endif
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/xen/include/asm-x86/shadow.h	Thu Jul 01 23:45:24 2004 +0000
    10.3 @@ -0,0 +1,604 @@
    10.4 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*- */
    10.5 +
    10.6 +#ifndef _XEN_SHADOW_H
    10.7 +#define _XEN_SHADOW_H
    10.8 +
    10.9 +#include <xen/config.h>
   10.10 +#include <xen/types.h>
   10.11 +#include <xen/perfc.h>
   10.12 +#include <asm/processor.h>
   10.13 +
   10.14 +
   10.15 +/* Shadow PT flag bits in pfn_info */
   10.16 +#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
   10.17 +#define PSH_pending     (1<<29) /* page is in the process of being shadowed */
   10.18 +#define PSH_pfn_mask    ((1<<21)-1)
   10.19 +
   10.20 +/* Shadow PT operation mode : shadowmode variable in mm_struct */
   10.21 +#define SHM_test        (1) /* just run domain on shadow PTs */
   10.22 +#define SHM_logdirty    (2) /* log pages that are dirtied */
   10.23 +#define SHM_translate   (3) /* lookup machine pages in translation table */
   10.24 +//#define SHM_cow       (4) /* copy on write all dirtied pages */
   10.25 +
   10.26 +
   10.27 +#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
   10.28 +#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
   10.29 +
   10.30 +extern void shadow_mode_init(void);
   10.31 +extern int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc );
   10.32 +extern int shadow_fault( unsigned long va, long error_code );
   10.33 +extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, 
   10.34 +                                        unsigned long *prev_spfn_ptr,
   10.35 +                                        l1_pgentry_t **prev_spl1e_ptr  );
   10.36 +extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
   10.37 +extern void unshadow_table( unsigned long gpfn, unsigned int type );
   10.38 +extern int shadow_mode_enable( struct domain *p, unsigned int mode );
   10.39 +extern void shadow_mode_disable( struct domain *p );
   10.40 +extern unsigned long shadow_l2_table( 
   10.41 +    struct mm_struct *m, unsigned long gpfn );
   10.42 +
   10.43 +#define SHADOW_DEBUG 0
   10.44 +#define SHADOW_HASH_DEBUG 0
   10.45 +#define SHADOW_OPTIMISE 1
   10.46 +
   10.47 +struct shadow_status {
   10.48 +    unsigned long pfn;            // gpfn 
   10.49 +    unsigned long spfn_and_flags; // spfn plus flags
   10.50 +    struct shadow_status *next;   // use pull-to-front list.
   10.51 +};
   10.52 +
   10.53 +#define shadow_ht_extra_size         128 /*128*/
   10.54 +#define shadow_ht_buckets            256 /*256*/
   10.55 +
   10.56 +#ifndef NDEBUG
   10.57 +#define SH_LOG(_f, _a...)                             \
   10.58 +printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",    \
   10.59 +       current->domain , __LINE__ , ## _a )
   10.60 +#else
   10.61 +#define SH_LOG(_f, _a...) 
   10.62 +#endif
   10.63 +
   10.64 +#if SHADOW_DEBUG
   10.65 +#define SH_VLOG(_f, _a...)                             \
   10.66 +    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
   10.67 +           current->domain , __LINE__ , ## _a )
   10.68 +#else
   10.69 +#define SH_VLOG(_f, _a...) 
   10.70 +#endif
   10.71 +
   10.72 +#if 0
   10.73 +#define SH_VVLOG(_f, _a...)                             \
   10.74 +    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",  \
   10.75 +           current->domain , __LINE__ , ## _a )
   10.76 +#else
   10.77 +#define SH_VVLOG(_f, _a...) 
   10.78 +#endif
   10.79 +
   10.80 +
   10.81 +/************************************************************************/
   10.82 +
   10.83 +#define shadow_mode(d)		(d->mm.shadow_mode)
   10.84 +#define	shadow_lock_init(d)	spin_lock_init(&d->mm.shadow_lock)
   10.85 +
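These two wrappers are exactly what the common-code hunks earlier in the changeset now call instead of reaching into d->mm directly; for example the pre-patch and post-patch forms in xen/common/domain.c are equivalent:

    /* shadow_mode(d) in domain_relinquish_memory() expands to:  */
    if ( d->mm.shadow_mode )
        shadow_mode_disable(d);

    /* shadow_lock_init(d) in do_createdomain() expands to:      */
    spin_lock_init(&d->mm.shadow_lock);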
   10.86 +/************************************************************************/
   10.87 +
   10.88 +static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn )
   10.89 +{
   10.90 +    unsigned int pfn;
   10.91 +    int rc = 0;
   10.92 +
   10.93 +    ASSERT(spin_is_locked(&m->shadow_lock));
   10.94 +
   10.95 +    pfn = machine_to_phys_mapping[mfn];
   10.96 +
   10.97 +    /* We use values with the top bit set to mark MFNs that aren't
    10.98 +       really part of the domain's pseudo-physical memory map e.g.
   10.99 +       the shared info frame. Nothing to do here...
  10.100 +    */
  10.101 +    if ( unlikely(pfn & 0x80000000U) ) return rc; 
  10.102 +
  10.103 +    ASSERT(m->shadow_dirty_bitmap);
  10.104 +    if( likely(pfn<m->shadow_dirty_bitmap_size) )
  10.105 +    {
  10.106 +	/* These updates occur with mm.shadow_lock held, so use 
  10.107 +	   (__) version of test_and_set */
  10.108 +	if( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 )
  10.109 +	{
  10.110 +	    // if we set it
  10.111 +	    m->shadow_dirty_count++;
  10.112 +	    rc = 1;
  10.113 +	}
  10.114 +    }
  10.115 +    else
  10.116 +    {
  10.117 +        extern void show_traceX(void);
  10.118 +        SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
  10.119 +               mfn, pfn, m->shadow_dirty_bitmap_size, m );
  10.120 +        SH_LOG("dom=%u caf=%08x taf=%08x\n", 
  10.121 +               frame_table[mfn].u.domain->domain,
  10.122 +               frame_table[mfn].count_and_flags, 
  10.123 +               frame_table[mfn].type_and_flags );
  10.124 +    }
  10.125 +	
  10.126 +    return rc;
  10.127 +}
  10.128 +
  10.129 +
  10.130 +static inline int mark_dirty( struct mm_struct *m, unsigned int mfn )
  10.131 +{
  10.132 +    int rc;
  10.133 +    ASSERT(local_irq_is_enabled());
  10.134 +    //if(spin_is_locked(&m->shadow_lock)) printk("+");
  10.135 +    spin_lock(&m->shadow_lock);
  10.136 +    rc = __mark_dirty( m, mfn );
  10.137 +    spin_unlock(&m->shadow_lock);
  10.138 +    return rc;
  10.139 +}
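As a worked example of the indexing __test_and_set_bit performs on the dirty bitmap above (plain C, independent of the bitop helpers; on a 32-bit build an unsigned long is 32 bits wide):

    unsigned int pfn  = 4100;
    unsigned int word = pfn / (8 * sizeof(unsigned long));    /* 4100 / 32 = 128 */
    unsigned int bit  = pfn % (8 * sizeof(unsigned long));    /* 4100 % 32 = 4   */
    m->shadow_dirty_bitmap[word] |= 1UL << bit;    /* non-atomic; shadow_lock held */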
  10.140 +
  10.141 +
  10.142 +/************************************************************************/
  10.143 +
  10.144 +static inline void l1pte_write_fault( struct mm_struct *m, 
  10.145 +                                      unsigned long *gpte_p, unsigned long *spte_p )
  10.146 +{ 
  10.147 +    unsigned long gpte = *gpte_p;
  10.148 +    unsigned long spte = *spte_p;
  10.149 +
  10.150 +    switch( m->shadow_mode )
  10.151 +    {
  10.152 +    case SHM_test:
  10.153 +        spte = gpte;
  10.154 +        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  10.155 +        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
  10.156 +        break;
  10.157 +
  10.158 +    case SHM_logdirty:
  10.159 +        spte = gpte;
  10.160 +        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  10.161 +        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
  10.162 +        __mark_dirty( m, (gpte >> PAGE_SHIFT) );
  10.163 +        break;
  10.164 +    }
  10.165 +
  10.166 +    *gpte_p = gpte;
  10.167 +    *spte_p = spte;
  10.168 +}
  10.169 +
  10.170 +static inline void l1pte_read_fault( struct mm_struct *m, 
  10.171 +                                     unsigned long *gpte_p, unsigned long *spte_p )
  10.172 +{ 
  10.173 +    unsigned long gpte = *gpte_p;
  10.174 +    unsigned long spte = *spte_p;
  10.175 +
  10.176 +    switch( m->shadow_mode )
  10.177 +    {
  10.178 +    case SHM_test:
  10.179 +        spte = gpte;
  10.180 +        gpte |= _PAGE_ACCESSED;
  10.181 +        spte |= _PAGE_ACCESSED;
  10.182 +        if ( ! (gpte & _PAGE_DIRTY ) )
  10.183 +            spte &= ~ _PAGE_RW;
  10.184 +        break;
  10.185 +
  10.186 +    case SHM_logdirty:
  10.187 +        spte = gpte;
  10.188 +        gpte |= _PAGE_ACCESSED;
  10.189 +        spte |= _PAGE_ACCESSED;
  10.190 +        spte &= ~ _PAGE_RW;
  10.191 +        break;
  10.192 +    }
  10.193 +
  10.194 +    *gpte_p = gpte;
  10.195 +    *spte_p = spte;
  10.196 +}
  10.197 +
  10.198 +static inline void l1pte_no_fault( struct mm_struct *m, 
  10.199 +                                   unsigned long *gpte_p, unsigned long *spte_p )
  10.200 +{ 
  10.201 +    unsigned long gpte = *gpte_p;
  10.202 +    unsigned long spte = *spte_p;
  10.203 +
  10.204 +    switch( m->shadow_mode )
  10.205 +    {
  10.206 +    case SHM_test:
  10.207 +        spte = 0;
  10.208 +        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  10.209 +             (_PAGE_PRESENT|_PAGE_ACCESSED) )
  10.210 +        {
  10.211 +            spte = gpte;
  10.212 +            if ( ! (gpte & _PAGE_DIRTY ) )
  10.213 +                spte &= ~ _PAGE_RW;
  10.214 +        }
  10.215 +        break;
  10.216 +
  10.217 +    case SHM_logdirty:
  10.218 +        spte = 0;
  10.219 +        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  10.220 +             (_PAGE_PRESENT|_PAGE_ACCESSED) )
  10.221 +        {
  10.222 +            spte = gpte;
  10.223 +            spte &= ~ _PAGE_RW;
  10.224 +        }
  10.225 +
  10.226 +        break;
  10.227 +    }
  10.228 +
  10.229 +    *gpte_p = gpte;
  10.230 +    *spte_p = spte;
  10.231 +}
  10.232 +
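A concrete trace of the l1pte propagation rules above, using the standard x86 flag values (_PAGE_PRESENT 0x001, _PAGE_RW 0x002, _PAGE_ACCESSED 0x020, _PAGE_DIRTY 0x040); the frame number is made up:

    unsigned long gpte = 0x01234063;    /* frame 0x1234, PRESENT|RW|ACCESSED|DIRTY */
    unsigned long spte;

    /* SHM_test, l1pte_no_fault: the guest PTE is already dirty, so RW survives.  */
    spte = gpte;                        /* spte == 0x01234063                      */

    /* SHM_logdirty, l1pte_no_fault: RW is always stripped, so the guest's first  */
    /* write faults; l1pte_write_fault then restores RW and __mark_dirty logs it. */
    spte = gpte & ~_PAGE_RW;            /* spte == 0x01234061                      */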
  10.233 +static inline void l2pde_general( struct mm_struct *m, 
  10.234 +                                  unsigned long *gpde_p, unsigned long *spde_p,
  10.235 +                                  unsigned long sl1pfn)
  10.236 +{
  10.237 +    unsigned long gpde = *gpde_p;
  10.238 +    unsigned long spde = *spde_p;
  10.239 +
  10.240 +    spde = 0;
  10.241 +
  10.242 +    if ( sl1pfn )
  10.243 +    {
  10.244 +        spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) | 
  10.245 +            _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
  10.246 +        gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
  10.247 +
  10.248 +        if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK)  ) )
  10.249 +        {   
  10.250 +            // detect linear map, and keep pointing at guest
  10.251 +            SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
  10.252 +            spde = gpde & ~_PAGE_RW;
  10.253 +        }
  10.254 +    }
  10.255 +
  10.256 +    *gpde_p = gpde;
  10.257 +    *spde_p = spde;
  10.258 +}
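
The l1pte_* helpers above differ mainly in how much write access they grant to the shadow PTE. The following self-contained sketch (illustration only: it stubs the _PAGE_* constants with fixed bit values rather than including the real Xen headers, and propagate_no_fault_test is a hypothetical name) reproduces the SHM_test rule of l1pte_no_fault() to show why a clean-but-writable guest PTE stays read-only in the shadow until the first write fault.

#include <assert.h>
#include <stdio.h>

/* Stub flag values for illustration only -- the real constants live in
 * Xen's page headers; the exact bit positions do not matter for the logic. */
#define _PAGE_PRESENT  0x001UL
#define _PAGE_RW       0x002UL
#define _PAGE_ACCESSED 0x020UL
#define _PAGE_DIRTY    0x040UL

/* SHM_test rule from l1pte_no_fault(): only propagate a PTE the guest has
 * already marked present+accessed, and strip RW until it is also dirty. */
static unsigned long propagate_no_fault_test(unsigned long gpte)
{
    unsigned long spte = 0;
    if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED)) ==
         (_PAGE_PRESENT|_PAGE_ACCESSED) )
    {
        spte = gpte;
        if ( !(gpte & _PAGE_DIRTY) )
            spte &= ~_PAGE_RW;
    }
    return spte;
}

int main(void)
{
    unsigned long gpte = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_RW;
    unsigned long spte = propagate_no_fault_test(gpte);

    /* Not dirty yet, so the shadow copy is read-only: the guest's first
     * write will fault and go through l1pte_write_fault(), which sets
     * DIRTY and restores RW. */
    assert(!(spte & _PAGE_RW));
    printf("gpte=%#lx -> spte=%#lx\n", gpte, spte);
    return 0;
}
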
  10.259 +
  10.260 +/*********************************************************************/
  10.261 +
  10.262 +
  10.263 +
  10.264 +#if SHADOW_HASH_DEBUG
  10.265 +static void shadow_audit(struct mm_struct *m, int print)
  10.266 +{
  10.267 +    int live=0, free=0, j=0, abs;
  10.268 +    struct shadow_status *a;
  10.269 +
  10.270 +    for( j = 0; j < shadow_ht_buckets; j++ )
  10.271 +    {
  10.272 +        a = &m->shadow_ht[j];        
  10.273 +        if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
  10.274 +        ASSERT((a->pfn&0xf0000000)==0);
  10.275 +        ASSERT(a->pfn<0x00100000);
  10.276 +        a=a->next;
  10.277 +        while(a && live<9999)
  10.278 +        { 
  10.279 +            live++; 
  10.280 +            if(a->pfn == 0 || a->spfn_and_flags == 0)
  10.281 +            {
  10.282 +                printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
  10.283 +                       live, a->pfn, a->spfn_and_flags, a->next);
  10.284 +                BUG();
  10.285 +            }
  10.286 +            ASSERT(a->pfn);
  10.287 +            ASSERT((a->pfn&0xf0000000)==0);
  10.288 +            ASSERT(a->pfn<0x00100000);
  10.289 +            ASSERT(a->spfn_and_flags&PSH_pfn_mask);
  10.290 +            a=a->next; 
  10.291 +        }
  10.292 +        ASSERT(live<9999);
  10.293 +    }
  10.294 +
  10.295 +    a = m->shadow_ht_free;
  10.296 +    while(a) { free++; a=a->next; }
  10.297 +
  10.298 +    if(print) printk("Xlive=%d free=%d\n",live,free);
  10.299 +
  10.300 +    abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
  10.301 +    if( abs < -1 || abs > 1 )
  10.302 +    {
  10.303 +        printk("live=%d free=%d l1=%d l2=%d\n",live,free,
  10.304 +               perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
  10.305 +        BUG();
  10.306 +    }
  10.307 +
  10.308 +}
  10.309 +
  10.310 +#else
  10.311 +#define shadow_audit(p, print)
  10.312 +#endif
  10.313 +
  10.314 +
  10.315 +
  10.316 +static inline struct shadow_status* hash_bucket( struct mm_struct *m,
  10.317 +                                                 unsigned int gpfn )
  10.318 +{
  10.319 +    return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
  10.320 +}
  10.321 +
  10.322 +
  10.323 +static inline unsigned long __shadow_status( struct mm_struct *m,
  10.324 +                                             unsigned int gpfn )
  10.325 +{
  10.326 +    struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
  10.327 +
  10.328 +    b = B;
  10.329 +    ob = NULL;
  10.330 +
  10.331 +    SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
  10.332 +    shadow_audit(m,0);  // if in debug mode
  10.333 +
  10.334 +    do
  10.335 +    {
  10.336 +        if ( b->pfn == gpfn )
  10.337 +        {
  10.338 +            unsigned long t;
  10.339 +            struct shadow_status *x;
  10.340 +
  10.341 +            // swap with head
  10.342 +            t=B->pfn; B->pfn=b->pfn; b->pfn=t;
  10.343 +            t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; 
  10.344 +            b->spfn_and_flags=t;
  10.345 +
  10.346 +            if( ob )
  10.347 +            {   // pull to front
  10.348 +                *ob=b->next;
  10.349 +                x=B->next;
  10.350 +                B->next=b;
  10.351 +                b->next=x;
  10.352 +            }
  10.353 +            return B->spfn_and_flags;
  10.354 +        }
  10.355 +#if SHADOW_HASH_DEBUG
  10.356 +        else
  10.357 +        {
  10.358 +            if(b!=B)ASSERT(b->pfn);
  10.359 +        }
  10.360 +#endif
  10.361 +        ob=&b->next;
  10.362 +        b=b->next;
  10.363 +    }
  10.364 +    while (b);
  10.365 +
  10.366 +    return 0;
  10.367 +}
  10.368 +
  10.369 +/* We could make this locking more fine-grained (e.g. per shadow page) if it
  10.370 +ever becomes a problem, but since we need a spin lock on the hash table
  10.371 +anyway it's probably not worth being too clever. */
  10.372 +
  10.373 +static inline unsigned long get_shadow_status( struct mm_struct *m,
  10.374 +                                               unsigned int gpfn )
  10.375 +{
  10.376 +    unsigned long res;
  10.377 +
  10.378 +    /* If we get here, we know that this domain is running in shadow mode. 
  10.379 +       We also know that some sort of update has happened to the underlying
  10.380 +       page table page: either a PTE has been updated, or the page has
  10.381 +       changed type. If we're in log dirty mode, we should set the appropriate
  10.382 +       bit in the dirty bitmap.
  10.383 +       NB: the VA update path doesn't use this, so it needs to be handled 
  10.384 +       independently. 
  10.385 +    */
  10.386 +
  10.387 +    ASSERT(local_irq_is_enabled());
  10.388 +    //if(spin_is_locked(&m->shadow_lock)) printk("*");
  10.389 +    spin_lock(&m->shadow_lock);
  10.390 +
  10.391 +    if( m->shadow_mode == SHM_logdirty )
  10.392 +        __mark_dirty( m, gpfn );
  10.393 +
  10.394 +    res = __shadow_status( m, gpfn );
  10.395 +    if (!res) spin_unlock(&m->shadow_lock);
  10.396 +    return res;
  10.397 +}
  10.398 +
  10.399 +
  10.400 +static inline void put_shadow_status( struct mm_struct *m )
  10.401 +{
  10.402 +    spin_unlock(&m->shadow_lock);
  10.403 +}
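
The get/put pair above uses an asymmetric protocol: shadow_lock is released inside get_shadow_status() when no shadow exists, and otherwise left held for the caller to drop with put_shadow_status(). A minimal caller-side sketch of that pairing, assuming a Xen build environment and a hypothetical update_shadow_pte() helper standing in for the caller's own work:

/* Assumes a Xen build environment; <asm/shadow.h> provides the real
 * declarations used below. */
#include <asm/shadow.h>

/* Hypothetical stand-in for whatever the caller does with the shadow pfn. */
extern void update_shadow_pte(struct mm_struct *m, unsigned long spfn);

void example_update(struct mm_struct *m, unsigned int gpfn)
{
    unsigned long res = get_shadow_status(m, gpfn);

    if ( res )
    {
        /* A shadow exists: shadow_lock is still held here. */
        update_shadow_pte(m, res & PSH_pfn_mask);
        put_shadow_status(m);   /* drops shadow_lock */
    }
    /* If res == 0, get_shadow_status() already dropped the lock;
     * calling put_shadow_status() here would be a double unlock. */
}
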
  10.404 +
  10.405 +
  10.406 +static inline void delete_shadow_status( struct mm_struct *m,
  10.407 +                                         unsigned int gpfn )
  10.408 +{
  10.409 +    struct shadow_status *b, *B, **ob;
  10.410 +
  10.411 +    ASSERT(spin_is_locked(&m->shadow_lock));
  10.412 +
  10.413 +    B = b = hash_bucket( m, gpfn );
  10.414 +
  10.415 +    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
  10.416 +    shadow_audit(m,0);
  10.417 +    ASSERT(gpfn);
  10.418 +
  10.419 +    if( b->pfn == gpfn )
  10.420 +    {
  10.421 +        if (b->next)
  10.422 +        {
  10.423 +            struct shadow_status *D=b->next;
  10.424 +            b->spfn_and_flags = b->next->spfn_and_flags;
  10.425 +            b->pfn = b->next->pfn;
  10.426 +
  10.427 +            b->next = b->next->next;
  10.428 +            D->next = m->shadow_ht_free;
  10.429 +            D->pfn = 0;
  10.430 +            D->spfn_and_flags = 0;
  10.431 +            m->shadow_ht_free = D;
  10.432 +        }
  10.433 +        else
  10.434 +        {
  10.435 +            b->pfn = 0;
  10.436 +            b->spfn_and_flags = 0;
  10.437 +        }
  10.438 +
  10.439 +#if SHADOW_HASH_DEBUG
  10.440 +        if( __shadow_status(m,gpfn) ) BUG();  
  10.441 +        shadow_audit(m,0);
  10.442 +#endif
  10.443 +        return;
  10.444 +    }
  10.445 +
  10.446 +    ob = &b->next;
  10.447 +    b=b->next;
  10.448 +
  10.449 +    do
  10.450 +    {
  10.451 +        if ( b->pfn == gpfn )
  10.452 +        {
  10.453 +            b->pfn = 0;
  10.454 +            b->spfn_and_flags = 0;
  10.455 +
  10.456 +            // b is in the list
  10.457 +            *ob=b->next;
  10.458 +            b->next = m->shadow_ht_free;
  10.459 +            m->shadow_ht_free = b;
  10.460 +
  10.461 +#if SHADOW_HASH_DEBUG
  10.462 +            if( __shadow_status(m,gpfn) ) BUG();
  10.463 +#endif
  10.464 +            shadow_audit(m,0);
  10.465 +            return;
  10.466 +        }
  10.467 +
  10.468 +        ob = &b->next;
  10.469 +        b=b->next;
  10.470 +    }
  10.471 +    while (b);
  10.472 +
  10.473 +    // if we got here, it wasn't in the list
  10.474 +    BUG();
  10.475 +}
  10.476 +
  10.477 +
  10.478 +static inline void set_shadow_status( struct mm_struct *m,
  10.479 +                                      unsigned int gpfn, unsigned long s )
  10.480 +{
  10.481 +    struct shadow_status *b, *B, *extra, **fptr;
  10.482 +    int i;
  10.483 +
  10.484 +    ASSERT(spin_is_locked(&m->shadow_lock));
  10.485 +
  10.486 +    B = b = hash_bucket( m, gpfn );
  10.487 +   
  10.488 +    ASSERT(gpfn);
  10.489 +    SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next );
  10.490 +
  10.491 +    shadow_audit(m,0);
  10.492 +
  10.493 +    do
  10.494 +    {
  10.495 +        if ( b->pfn == gpfn )
  10.496 +        {
  10.497 +            b->spfn_and_flags = s;
  10.498 +            shadow_audit(m,0);
  10.499 +            return;
  10.500 +        }
  10.501 +
  10.502 +        b=b->next;
  10.503 +    }
  10.504 +    while (b);
  10.505 +
  10.506 +    // if we got here, this is an insert rather than update
  10.507 +
  10.508 +    ASSERT( s );  // deletes must have succeeded by here
  10.509 +
  10.510 +    if ( B->pfn == 0 )
  10.511 +    {
  10.512 +        // we can use this head
  10.513 +        ASSERT( B->next == 0 );
  10.514 +        B->pfn = gpfn;
  10.515 +        B->spfn_and_flags = s;
  10.516 +        shadow_audit(m,0);
  10.517 +        return;
  10.518 +    }
  10.519 +
  10.520 +    if( unlikely(m->shadow_ht_free == NULL) )
  10.521 +    {
  10.522 +        SH_LOG("allocate more shadow hashtable blocks");
  10.523 +
  10.524 +        // we need to allocate more space
  10.525 +        extra = kmalloc(sizeof(void*) + (shadow_ht_extra_size * 
  10.526 +                                         sizeof(struct shadow_status)));
  10.527 +
  10.528 +        if( ! extra ) BUG(); // should be more graceful here....
  10.529 +
  10.530 +        memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size * 
  10.531 +                                          sizeof(struct shadow_status)));
  10.532 +
  10.533 +        m->shadow_extras_count++;
  10.534 +
  10.535 +        // add extras to free list
  10.536 +        fptr = &m->shadow_ht_free;
  10.537 +        for ( i=0; i<shadow_ht_extra_size; i++ )
  10.538 +        {
  10.539 +            *fptr = &extra[i];
  10.540 +            fptr = &(extra[i].next);
  10.541 +        }
  10.542 +        *fptr = NULL;
  10.543 +
  10.544 +        *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) = 
  10.545 +            m->shadow_ht_extras;
  10.546 +        m->shadow_ht_extras = extra;
  10.547 +
  10.548 +    }
  10.549 +
  10.550 +    // should really put this in B to go right to front
  10.551 +    b = m->shadow_ht_free;
  10.552 +    m->shadow_ht_free = b->next;
  10.553 +    b->spfn_and_flags = s;
  10.554 +    b->pfn = gpfn;
  10.555 +    b->next = B->next;
  10.556 +    B->next = b;
  10.557 +
  10.558 +    shadow_audit(m,0);
  10.559 +
  10.560 +    return;
  10.561 +}
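
When the free list runs dry, set_shadow_status() grows the table by allocating a block of shadow_ht_extra_size entries plus one trailing pointer that chains whole blocks onto m->shadow_ht_extras for later teardown. A standalone sketch of just that layout trick, using libc malloc and a simplified entry struct in place of the hypervisor's allocator and struct shadow_status:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EXTRA_SIZE 4   /* stands in for shadow_ht_extra_size */

struct entry { unsigned long pfn, spfn_and_flags; struct entry *next; };

int main(void)
{
    struct entry *blocks = NULL;   /* stands in for m->shadow_ht_extras */
    size_t bytes = sizeof(void *) + EXTRA_SIZE * sizeof(struct entry);
    struct entry *extra = malloc(bytes);

    if ( !extra )
        return 1;
    memset(extra, 0, bytes);

    /* The pointer tucked in after the last entry links this block onto the
     * list of all blocks, mirroring the hypervisor code above. */
    *(struct entry **)&extra[EXTRA_SIZE] = blocks;
    blocks = extra;

    printf("allocated %zu bytes for %d entries + 1 link pointer\n",
           bytes, EXTRA_SIZE);
    free(extra);
    return 0;
}
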
  10.562 +
  10.563 +static inline void __shadow_mk_pagetable( struct mm_struct *mm )
  10.564 +{
  10.565 +    unsigned long gpfn, spfn=0;
  10.566 +
  10.567 +    gpfn =  pagetable_val(mm->pagetable) >> PAGE_SHIFT;
  10.568 +
  10.569 +    if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
  10.570 +    {
  10.571 +        spfn = shadow_l2_table(mm, gpfn );
  10.572 +    }      
  10.573 +    mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
  10.574 +}
  10.575 +
  10.576 +static inline void shadow_mk_pagetable( struct mm_struct *mm )
  10.577 +{
  10.578 +    SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
  10.579 +             pagetable_val(mm->pagetable), mm->shadow_mode );
  10.580 +
  10.581 +    if ( unlikely(mm->shadow_mode) )
  10.582 +    {
  10.583 +        ASSERT(local_irq_is_enabled());
  10.584 +        spin_lock(&mm->shadow_lock);
  10.585 +
  10.586 +        __shadow_mk_pagetable( mm );
  10.587 +
  10.588 +        spin_unlock(&mm->shadow_lock);
  10.589 +    }
  10.590 +
  10.591 +    SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
  10.592 +             pagetable_val(mm->pagetable), mm->shadow_mode, 
  10.593 +             pagetable_val(mm->shadow_table) );
  10.594 +
  10.595 +}
  10.596 +
  10.597 +
  10.598 +#if SHADOW_DEBUG
  10.599 +extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
  10.600 +#else
  10.601 +#define check_pagetable(m, pt, s) ((void)0)
  10.602 +#endif
  10.603 +
  10.604 +
  10.605 +#endif /* XEN_SHADOW_H */
  10.606 +
  10.607 +
    11.1 --- a/xen/include/xen/shadow.h	Thu Jul 01 23:32:40 2004 +0000
    11.2 +++ b/xen/include/xen/shadow.h	Thu Jul 01 23:45:24 2004 +0000
    11.3 @@ -1,599 +1,1 @@
    11.4 -/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*- */
    11.5 -
    11.6 -#ifndef _XEN_SHADOW_H
    11.7 -#define _XEN_SHADOW_H
    11.8 -
    11.9 -#include <xen/config.h>
   11.10 -#include <xen/types.h>
   11.11 -#include <xen/perfc.h>
   11.12 -#include <asm/processor.h>
   11.13 -
   11.14 -
   11.15 -/* Shadow PT flag bits in pfn_info */
   11.16 -#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
   11.17 -#define PSH_pending     (1<<29) /* page is in the process of being shadowed */
   11.18 -#define PSH_pfn_mask    ((1<<21)-1)
   11.19 -
   11.20 -/* Shadow PT operation mode : shadowmode variable in mm_struct */
   11.21 -#define SHM_test        (1) /* just run domain on shadow PTs */
   11.22 -#define SHM_logdirty    (2) /* log pages that are dirtied */
   11.23 -#define SHM_translate   (3) /* lookup machine pages in translation table */
   11.24 -//#define SHM_cow       (4) /* copy on write all dirtied pages */
   11.25 -
   11.26 -
   11.27 -#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
   11.28 -#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
   11.29 -
   11.30 -extern void shadow_mode_init(void);
   11.31 -extern int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc );
   11.32 -extern int shadow_fault( unsigned long va, long error_code );
   11.33 -extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, 
   11.34 -                                        unsigned long *prev_spfn_ptr,
   11.35 -                                        l1_pgentry_t **prev_spl1e_ptr  );
   11.36 -extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
   11.37 -extern void unshadow_table( unsigned long gpfn, unsigned int type );
   11.38 -extern int shadow_mode_enable( struct domain *p, unsigned int mode );
   11.39 -extern void shadow_mode_disable( struct domain *p );
   11.40 -extern unsigned long shadow_l2_table( 
   11.41 -    struct mm_struct *m, unsigned long gpfn );
   11.42 -
   11.43 -#define SHADOW_DEBUG 0
   11.44 -#define SHADOW_HASH_DEBUG 0
   11.45 -#define SHADOW_OPTIMISE 1
   11.46 -
   11.47 -struct shadow_status {
   11.48 -    unsigned long pfn;            // gpfn 
   11.49 -    unsigned long spfn_and_flags; // spfn plus flags
   11.50 -    struct shadow_status *next;   // use pull-to-front list.
   11.51 -};
   11.52 -
   11.53 -#define shadow_ht_extra_size         128 /*128*/
   11.54 -#define shadow_ht_buckets            256 /*256*/
   11.55 -
   11.56 -#ifndef NDEBUG
   11.57 -#define SH_LOG(_f, _a...)                             \
   11.58 -printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",    \
   11.59 -       current->domain , __LINE__ , ## _a )
   11.60 -#else
   11.61 -#define SH_LOG(_f, _a...) 
   11.62 -#endif
   11.63 -
   11.64 -#if SHADOW_DEBUG
   11.65 -#define SH_VLOG(_f, _a...)                             \
   11.66 -    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
   11.67 -           current->domain , __LINE__ , ## _a )
   11.68 -#else
   11.69 -#define SH_VLOG(_f, _a...) 
   11.70 -#endif
   11.71 -
   11.72 -#if 0
   11.73 -#define SH_VVLOG(_f, _a...)                             \
   11.74 -    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",  \
   11.75 -           current->domain , __LINE__ , ## _a )
   11.76 -#else
   11.77 -#define SH_VVLOG(_f, _a...) 
   11.78 -#endif
   11.79 -
   11.80 -
   11.81 -/************************************************************************/
   11.82 -
   11.83 -static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn )
   11.84 -{
   11.85 -    unsigned int pfn;
   11.86 -    int rc = 0;
   11.87 -
   11.88 -    ASSERT(spin_is_locked(&m->shadow_lock));
   11.89 -
   11.90 -    pfn = machine_to_phys_mapping[mfn];
   11.91 -
   11.92 -    /* We use values with the top bit set to mark MFNs that aren't
   11.93 -       really part of the domain's psuedo-physical memory map e.g.
   11.94 -       the shared info frame. Nothing to do here...
   11.95 -    */
   11.96 -    if ( unlikely(pfn & 0x80000000U) ) return rc; 
   11.97 -
   11.98 -    ASSERT(m->shadow_dirty_bitmap);
   11.99 -    if( likely(pfn<m->shadow_dirty_bitmap_size) )
  11.100 -    {
  11.101 -	/* These updates occur with mm.shadow_lock held, so use 
  11.102 -	   (__) version of test_and_set */
  11.103 -	if( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 )
  11.104 -	{
  11.105 -	    // if we set it
  11.106 -	    m->shadow_dirty_count++;
  11.107 -	    rc = 1;
  11.108 -	}
  11.109 -    }
  11.110 -    else
  11.111 -    {
  11.112 -        extern void show_traceX(void);
  11.113 -        SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
  11.114 -               mfn, pfn, m->shadow_dirty_bitmap_size, m );
  11.115 -        SH_LOG("dom=%u caf=%08x taf=%08x\n", 
  11.116 -               frame_table[mfn].u.domain->domain,
  11.117 -               frame_table[mfn].count_and_flags, 
  11.118 -               frame_table[mfn].type_and_flags );
  11.119 -    }
  11.120 -	
  11.121 -    return rc;
  11.122 -}
  11.123 -
  11.124 -
  11.125 -static inline int mark_dirty( struct mm_struct *m, unsigned int mfn )
  11.126 -{
  11.127 -    int rc;
  11.128 -    ASSERT(local_irq_is_enabled());
  11.129 -    //if(spin_is_locked(&m->shadow_lock)) printk("+");
  11.130 -    spin_lock(&m->shadow_lock);
  11.131 -    rc = __mark_dirty( m, mfn );
  11.132 -    spin_unlock(&m->shadow_lock);
  11.133 -    return rc;
  11.134 -}
  11.135 -
  11.136 -
  11.137 -/************************************************************************/
  11.138 -
  11.139 -static inline void l1pte_write_fault( struct mm_struct *m, 
  11.140 -                                      unsigned long *gpte_p, unsigned long *spte_p )
  11.141 -{ 
  11.142 -    unsigned long gpte = *gpte_p;
  11.143 -    unsigned long spte = *spte_p;
  11.144 -
  11.145 -    switch( m->shadow_mode )
  11.146 -    {
  11.147 -    case SHM_test:
  11.148 -        spte = gpte;
  11.149 -        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  11.150 -        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
  11.151 -        break;
  11.152 -
  11.153 -    case SHM_logdirty:
  11.154 -        spte = gpte;
  11.155 -        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  11.156 -        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
  11.157 -        __mark_dirty( m, (gpte >> PAGE_SHIFT) );
  11.158 -        break;
  11.159 -    }
  11.160 -
  11.161 -    *gpte_p = gpte;
  11.162 -    *spte_p = spte;
  11.163 -}
  11.164 -
  11.165 -static inline void l1pte_read_fault( struct mm_struct *m, 
  11.166 -                                     unsigned long *gpte_p, unsigned long *spte_p )
  11.167 -{ 
  11.168 -    unsigned long gpte = *gpte_p;
  11.169 -    unsigned long spte = *spte_p;
  11.170 -
  11.171 -    switch( m->shadow_mode )
  11.172 -    {
  11.173 -    case SHM_test:
  11.174 -        spte = gpte;
  11.175 -        gpte |= _PAGE_ACCESSED;
  11.176 -        spte |= _PAGE_ACCESSED;
  11.177 -        if ( ! (gpte & _PAGE_DIRTY ) )
  11.178 -            spte &= ~ _PAGE_RW;
  11.179 -        break;
  11.180 -
  11.181 -    case SHM_logdirty:
  11.182 -        spte = gpte;
  11.183 -        gpte |= _PAGE_ACCESSED;
  11.184 -        spte |= _PAGE_ACCESSED;
  11.185 -        spte &= ~ _PAGE_RW;
  11.186 -        break;
  11.187 -    }
  11.188 -
  11.189 -    *gpte_p = gpte;
  11.190 -    *spte_p = spte;
  11.191 -}
  11.192 -
  11.193 -static inline void l1pte_no_fault( struct mm_struct *m, 
  11.194 -                                   unsigned long *gpte_p, unsigned long *spte_p )
  11.195 -{ 
  11.196 -    unsigned long gpte = *gpte_p;
  11.197 -    unsigned long spte = *spte_p;
  11.198 -
  11.199 -    switch( m->shadow_mode )
  11.200 -    {
  11.201 -    case SHM_test:
  11.202 -        spte = 0;
  11.203 -        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  11.204 -             (_PAGE_PRESENT|_PAGE_ACCESSED) )
  11.205 -        {
  11.206 -            spte = gpte;
  11.207 -            if ( ! (gpte & _PAGE_DIRTY ) )
  11.208 -                spte &= ~ _PAGE_RW;
  11.209 -        }
  11.210 -        break;
  11.211 -
  11.212 -    case SHM_logdirty:
  11.213 -        spte = 0;
  11.214 -        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  11.215 -             (_PAGE_PRESENT|_PAGE_ACCESSED) )
  11.216 -        {
  11.217 -            spte = gpte;
  11.218 -            spte &= ~ _PAGE_RW;
  11.219 -        }
  11.220 -
  11.221 -        break;
  11.222 -    }
  11.223 -
  11.224 -    *gpte_p = gpte;
  11.225 -    *spte_p = spte;
  11.226 -}
  11.227 -
  11.228 -static inline void l2pde_general( struct mm_struct *m, 
  11.229 -                                  unsigned long *gpde_p, unsigned long *spde_p,
  11.230 -                                  unsigned long sl1pfn)
  11.231 -{
  11.232 -    unsigned long gpde = *gpde_p;
  11.233 -    unsigned long spde = *spde_p;
  11.234 -
  11.235 -    spde = 0;
  11.236 -
  11.237 -    if ( sl1pfn )
  11.238 -    {
  11.239 -        spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) | 
  11.240 -            _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
  11.241 -        gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
  11.242 -
  11.243 -        if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK)  ) )
  11.244 -        {   
  11.245 -            // detect linear map, and keep pointing at guest
  11.246 -            SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
  11.247 -            spde = gpde & ~_PAGE_RW;
  11.248 -        }
  11.249 -    }
  11.250 -
  11.251 -    *gpde_p = gpde;
  11.252 -    *spde_p = spde;
  11.253 -}
  11.254 -
  11.255 -/*********************************************************************/
  11.256 -
  11.257 -
  11.258 -
  11.259 -#if SHADOW_HASH_DEBUG
  11.260 -static void shadow_audit(struct mm_struct *m, int print)
  11.261 -{
  11.262 -    int live=0, free=0, j=0, abs;
  11.263 -    struct shadow_status *a;
  11.264 -
  11.265 -    for( j = 0; j < shadow_ht_buckets; j++ )
  11.266 -    {
  11.267 -        a = &m->shadow_ht[j];        
  11.268 -        if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
  11.269 -        ASSERT((a->pfn&0xf0000000)==0);
  11.270 -        ASSERT(a->pfn<0x00100000);
  11.271 -        a=a->next;
  11.272 -        while(a && live<9999)
  11.273 -        { 
  11.274 -            live++; 
  11.275 -            if(a->pfn == 0 || a->spfn_and_flags == 0)
  11.276 -            {
  11.277 -                printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
  11.278 -                       live, a->pfn, a->spfn_and_flags, a->next);
  11.279 -                BUG();
  11.280 -            }
  11.281 -            ASSERT(a->pfn);
  11.282 -            ASSERT((a->pfn&0xf0000000)==0);
  11.283 -            ASSERT(a->pfn<0x00100000);
  11.284 -            ASSERT(a->spfn_and_flags&PSH_pfn_mask);
  11.285 -            a=a->next; 
  11.286 -        }
  11.287 -        ASSERT(live<9999);
  11.288 -    }
  11.289 -
  11.290 -    a = m->shadow_ht_free;
  11.291 -    while(a) { free++; a=a->next; }
  11.292 -
  11.293 -    if(print) printk("Xlive=%d free=%d\n",live,free);
  11.294 -
  11.295 -    abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
  11.296 -    if( abs < -1 || abs > 1 )
  11.297 -    {
  11.298 -        printk("live=%d free=%d l1=%d l2=%d\n",live,free,
  11.299 -               perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
  11.300 -        BUG();
  11.301 -    }
  11.302 -
  11.303 -}
  11.304 -
  11.305 -#else
  11.306 -#define shadow_audit(p, print)
  11.307 -#endif
  11.308 -
  11.309 -
  11.310 -
  11.311 -static inline struct shadow_status* hash_bucket( struct mm_struct *m,
  11.312 -                                                 unsigned int gpfn )
  11.313 -{
  11.314 -    return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
  11.315 -}
  11.316 -
  11.317 -
  11.318 -static inline unsigned long __shadow_status( struct mm_struct *m,
  11.319 -                                             unsigned int gpfn )
  11.320 -{
  11.321 -    struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
  11.322 -
  11.323 -    b = B;
  11.324 -    ob = NULL;
  11.325 -
  11.326 -    SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
  11.327 -    shadow_audit(m,0);  // if in debug mode
  11.328 -
  11.329 -    do
  11.330 -    {
  11.331 -        if ( b->pfn == gpfn )
  11.332 -        {
  11.333 -            unsigned long t;
  11.334 -            struct shadow_status *x;
  11.335 -
  11.336 -            // swap with head
  11.337 -            t=B->pfn; B->pfn=b->pfn; b->pfn=t;
  11.338 -            t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; 
  11.339 -            b->spfn_and_flags=t;
  11.340 -
  11.341 -            if( ob )
  11.342 -            {   // pull to front
  11.343 -                *ob=b->next;
  11.344 -                x=B->next;
  11.345 -                B->next=b;
  11.346 -                b->next=x;
  11.347 -            }
  11.348 -            return B->spfn_and_flags;
  11.349 -        }
  11.350 -#if SHADOW_HASH_DEBUG
  11.351 -        else
  11.352 -        {
  11.353 -            if(b!=B)ASSERT(b->pfn);
  11.354 -        }
  11.355 -#endif
  11.356 -        ob=&b->next;
  11.357 -        b=b->next;
  11.358 -    }
  11.359 -    while (b);
  11.360 -
  11.361 -    return 0;
  11.362 -}
  11.363 -
  11.364 -/* we can make this locking more fine grained e.g. per shadow page if it 
  11.365 -ever becomes a problem, but since we need a spin lock on the hash table 
  11.366 -anyway its probably not worth being too clever. */
  11.367 -
  11.368 -static inline unsigned long get_shadow_status( struct mm_struct *m,
  11.369 -                                               unsigned int gpfn )
  11.370 -{
  11.371 -    unsigned long res;
  11.372 -
  11.373 -    /* If we get here, we know that this domain is running in shadow mode. 
  11.374 -       We also know that some sort of update has happened to the underlying
  11.375 -       page table page: either a PTE has been updated, or the page has
  11.376 -       changed type. If we're in log dirty mode, we should set the approrpiate
  11.377 -       bit in the dirty bitmap.
  11.378 -       NB: the VA update path doesn't use this so needs to be handled 
  11.379 -       independnetly. 
  11.380 -    */
  11.381 -
  11.382 -    ASSERT(local_irq_is_enabled());
  11.383 -    //if(spin_is_locked(&m->shadow_lock)) printk("*");
  11.384 -    spin_lock(&m->shadow_lock);
  11.385 -
  11.386 -    if( m->shadow_mode == SHM_logdirty )
  11.387 -        __mark_dirty( m, gpfn );
  11.388 -
  11.389 -    res = __shadow_status( m, gpfn );
  11.390 -    if (!res) spin_unlock(&m->shadow_lock);
  11.391 -    return res;
  11.392 -}
  11.393 -
  11.394 -
  11.395 -static inline void put_shadow_status( struct mm_struct *m )
  11.396 -{
  11.397 -    spin_unlock(&m->shadow_lock);
  11.398 -}
  11.399 -
  11.400 -
  11.401 -static inline void delete_shadow_status( struct mm_struct *m,
  11.402 -                                         unsigned int gpfn )
  11.403 -{
  11.404 -    struct shadow_status *b, *B, **ob;
  11.405 -
  11.406 -    ASSERT(spin_is_locked(&m->shadow_lock));
  11.407 -
  11.408 -    B = b = hash_bucket( m, gpfn );
  11.409 -
  11.410 -    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
  11.411 -    shadow_audit(m,0);
  11.412 -    ASSERT(gpfn);
  11.413 -
  11.414 -    if( b->pfn == gpfn )
  11.415 -    {
  11.416 -        if (b->next)
  11.417 -        {
  11.418 -            struct shadow_status *D=b->next;
  11.419 -            b->spfn_and_flags = b->next->spfn_and_flags;
  11.420 -            b->pfn = b->next->pfn;
  11.421 -
  11.422 -            b->next = b->next->next;
  11.423 -            D->next = m->shadow_ht_free;
  11.424 -            D->pfn = 0;
  11.425 -            D->spfn_and_flags = 0;
  11.426 -            m->shadow_ht_free = D;
  11.427 -        }
  11.428 -        else
  11.429 -        {
  11.430 -            b->pfn = 0;
  11.431 -            b->spfn_and_flags = 0;
  11.432 -        }
  11.433 -
  11.434 -#if SHADOW_HASH_DEBUG
  11.435 -        if( __shadow_status(m,gpfn) ) BUG();  
  11.436 -        shadow_audit(m,0);
  11.437 -#endif
  11.438 -        return;
  11.439 -    }
  11.440 -
  11.441 -    ob = &b->next;
  11.442 -    b=b->next;
  11.443 -
  11.444 -    do
  11.445 -    {
  11.446 -        if ( b->pfn == gpfn )
  11.447 -        {
  11.448 -            b->pfn = 0;
  11.449 -            b->spfn_and_flags = 0;
  11.450 -
  11.451 -            // b is in the list
  11.452 -            *ob=b->next;
  11.453 -            b->next = m->shadow_ht_free;
  11.454 -            m->shadow_ht_free = b;
  11.455 -
  11.456 -#if SHADOW_HASH_DEBUG
  11.457 -            if( __shadow_status(m,gpfn) ) BUG();
  11.458 -#endif
  11.459 -            shadow_audit(m,0);
  11.460 -            return;
  11.461 -        }
  11.462 -
  11.463 -        ob = &b->next;
  11.464 -        b=b->next;
  11.465 -    }
  11.466 -    while (b);
  11.467 -
  11.468 -    // if we got here, it wasn't in the list
  11.469 -    BUG();
  11.470 -}
  11.471 -
  11.472 -
  11.473 -static inline void set_shadow_status( struct mm_struct *m,
  11.474 -                                      unsigned int gpfn, unsigned long s )
  11.475 -{
  11.476 -    struct shadow_status *b, *B, *extra, **fptr;
  11.477 -    int i;
  11.478 -
  11.479 -    ASSERT(spin_is_locked(&m->shadow_lock));
  11.480 -
  11.481 -    B = b = hash_bucket( m, gpfn );
  11.482 -   
  11.483 -    ASSERT(gpfn);
  11.484 -    SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next );
  11.485 -
  11.486 -    shadow_audit(m,0);
  11.487 -
  11.488 -    do
  11.489 -    {
  11.490 -        if ( b->pfn == gpfn )
  11.491 -        {
  11.492 -            b->spfn_and_flags = s;
  11.493 -            shadow_audit(m,0);
  11.494 -            return;
  11.495 -        }
  11.496 -
  11.497 -        b=b->next;
  11.498 -    }
  11.499 -    while (b);
  11.500 -
  11.501 -    // if we got here, this is an insert rather than update
  11.502 -
  11.503 -    ASSERT( s );  // deletes must have succeeded by here
  11.504 -
  11.505 -    if ( B->pfn == 0 )
  11.506 -    {
  11.507 -        // we can use this head
  11.508 -        ASSERT( B->next == 0 );
  11.509 -        B->pfn = gpfn;
  11.510 -        B->spfn_and_flags = s;
  11.511 -        shadow_audit(m,0);
  11.512 -        return;
  11.513 -    }
  11.514 -
  11.515 -    if( unlikely(m->shadow_ht_free == NULL) )
  11.516 -    {
  11.517 -        SH_LOG("allocate more shadow hashtable blocks");
  11.518 -
  11.519 -        // we need to allocate more space
  11.520 -        extra = kmalloc(sizeof(void*) + (shadow_ht_extra_size * 
  11.521 -                                         sizeof(struct shadow_status)));
  11.522 -
  11.523 -        if( ! extra ) BUG(); // should be more graceful here....
  11.524 -
  11.525 -        memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size * 
  11.526 -                                          sizeof(struct shadow_status)));
  11.527 -
  11.528 -        m->shadow_extras_count++;
  11.529 -
  11.530 -        // add extras to free list
  11.531 -        fptr = &m->shadow_ht_free;
  11.532 -        for ( i=0; i<shadow_ht_extra_size; i++ )
  11.533 -        {
  11.534 -            *fptr = &extra[i];
  11.535 -            fptr = &(extra[i].next);
  11.536 -        }
  11.537 -        *fptr = NULL;
  11.538 -
  11.539 -        *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) = 
  11.540 -            m->shadow_ht_extras;
  11.541 -        m->shadow_ht_extras = extra;
  11.542 -
  11.543 -    }
  11.544 -
  11.545 -    // should really put this in B to go right to front
  11.546 -    b = m->shadow_ht_free;
  11.547 -    m->shadow_ht_free = b->next;
  11.548 -    b->spfn_and_flags = s;
  11.549 -    b->pfn = gpfn;
  11.550 -    b->next = B->next;
  11.551 -    B->next = b;
  11.552 -
  11.553 -    shadow_audit(m,0);
  11.554 -
  11.555 -    return;
  11.556 -}
  11.557 -
  11.558 -static inline void __shadow_mk_pagetable( struct mm_struct *mm )
  11.559 -{
  11.560 -    unsigned long gpfn, spfn=0;
  11.561 -
  11.562 -    gpfn =  pagetable_val(mm->pagetable) >> PAGE_SHIFT;
  11.563 -
  11.564 -    if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
  11.565 -    {
  11.566 -        spfn = shadow_l2_table(mm, gpfn );
  11.567 -    }      
  11.568 -    mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
  11.569 -}
  11.570 -
  11.571 -static inline void shadow_mk_pagetable( struct mm_struct *mm )
  11.572 -{
  11.573 -    SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
  11.574 -             pagetable_val(mm->pagetable), mm->shadow_mode );
  11.575 -
  11.576 -    if ( unlikely(mm->shadow_mode) )
  11.577 -    {
  11.578 -        ASSERT(local_irq_is_enabled());
  11.579 -        spin_lock(&mm->shadow_lock);
  11.580 -
  11.581 -        __shadow_mk_pagetable( mm );
  11.582 -
  11.583 -        spin_unlock(&mm->shadow_lock);
  11.584 -    }
  11.585 -
  11.586 -    SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
  11.587 -             pagetable_val(mm->pagetable), mm->shadow_mode, 
  11.588 -             pagetable_val(mm->shadow_table) );
  11.589 -
  11.590 -}
  11.591 -
  11.592 -
  11.593 -#if SHADOW_DEBUG
  11.594 -extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
  11.595 -#else
  11.596 -#define check_pagetable(m, pt, s) ((void)0)
  11.597 -#endif
  11.598 -
  11.599 -
  11.600 -#endif /* XEN_SHADOW_H */
  11.601 -
  11.602 -
  11.603 +#include <asm/shadow.h>