ia64/xen-unstable

changeset 14805:90a6af455bbd

[HVM] Save/restore: merge xc_linux_save and xc_hvm_save
into xc_domain_save, like we did for xc_domain_restore
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Wed Apr 11 14:45:14 2007 +0100 (2007-04-11)
parents 6e7ef794cdbc
children 2aa05978f2ca
files tools/libxc/Makefile tools/libxc/ia64/xc_ia64_linux_save.c tools/libxc/xc_domain_save.c tools/libxc/xc_hvm_save.c tools/libxc/xc_linux_save.c tools/libxc/xenguest.h tools/libxc/xg_private.c tools/xcutils/xc_save.c
line diff
     1.1 --- a/tools/libxc/Makefile	Wed Apr 11 09:29:00 2007 +0100
     1.2 +++ b/tools/libxc/Makefile	Wed Apr 11 14:45:14 2007 +0100
     1.3 @@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
     1.4  
     1.5  GUEST_SRCS-y :=
     1.6  GUEST_SRCS-y += xg_private.c
     1.7 -GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
     1.8 -GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
     1.9 +GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
    1.10 +GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
    1.11  
    1.12  # symlink libelf from xen/common/libelf/
    1.13  LIBELF_SRCS := libelf-tools.c libelf-loader.c
     2.1 --- a/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Apr 11 09:29:00 2007 +0100
     2.2 +++ b/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Apr 11 14:45:14 2007 +0100
     2.3 @@ -134,8 +134,10 @@ retry:
     2.4  }
     2.5  
     2.6  int
     2.7 -xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
     2.8 -              uint32_t max_factor, uint32_t flags, int (*suspend)(int))
     2.9 +xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
    2.10 +               uint32_t max_factor, uint32_t flags, int (*suspend)(int),
    2.11 +               int hvm, void *(*init_qemu_maps)(int, unsigned),
    2.12 +               void (*qemu_flip_buffer)(int, int))
    2.13  {
    2.14      DECLARE_DOMCTL;
    2.15      xc_dominfo_t info;
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/tools/libxc/xc_domain_save.c	Wed Apr 11 14:45:14 2007 +0100
     3.3 @@ -0,0 +1,1609 @@
     3.4 +/******************************************************************************
     3.5 + * xc_linux_save.c
     3.6 + *
     3.7 + * Save the state of a running Linux session.
     3.8 + *
     3.9 + * Copyright (c) 2003, K A Fraser.
    3.10 + */
    3.11 +
    3.12 +#include <inttypes.h>
    3.13 +#include <time.h>
    3.14 +#include <stdlib.h>
    3.15 +#include <unistd.h>
    3.16 +#include <sys/time.h>
    3.17 +
    3.18 +#include "xc_private.h"
    3.19 +#include "xc_dom.h"
    3.20 +#include "xg_private.h"
    3.21 +#include "xg_save_restore.h"
    3.22 +
    3.23 +#include <xen/hvm/params.h>
    3.24 +#include <xen/hvm/e820.h>
    3.25 +
    3.26 +/*
    3.27 +** Default values for important tuning parameters. Can override by passing
    3.28 +** non-zero replacement values to xc_domain_save().
    3.29 +**
    3.30 +** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
    3.31 +**
    3.32 +*/
    3.33 +#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
    3.34 +#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
    3.35 +
    3.36 +/* max mfn of the whole machine */
    3.37 +static unsigned long max_mfn;
    3.38 +
    3.39 +/* virtual starting address of the hypervisor */
    3.40 +static unsigned long hvirt_start;
    3.41 +
    3.42 +/* #levels of page tables used by the current guest */
    3.43 +static unsigned int pt_levels;
    3.44 +
    3.45 +/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
    3.46 +static unsigned long *qemu_bitmaps[2];
    3.47 +static int qemu_active;
    3.48 +static int qemu_non_active;
    3.49 +
    3.50 +/* number of pfns this guest has (i.e. number of entries in the P2M) */
    3.51 +static unsigned long p2m_size;
    3.52 +
    3.53 +/* Live mapping of the table mapping each PFN to its current MFN. */
    3.54 +static xen_pfn_t *live_p2m = NULL;
    3.55 +
    3.56 +/* Live mapping of system MFN to PFN table. */
    3.57 +static xen_pfn_t *live_m2p = NULL;
    3.58 +static unsigned long m2p_mfn0;
    3.59 +
    3.60 +/* grep fodder: machine_to_phys */
    3.61 +
    3.62 +#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
    3.63 +
    3.64 +/*
    3.65 + * Returns TRUE if the given machine frame number has a unique mapping
    3.66 + * in the guest's pseudophysical map.
    3.67 + */
    3.68 +#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    3.69 +    (((_mfn) < (max_mfn)) &&                    \
    3.70 +     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
    3.71 +      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
    3.72 +
    3.73 +/* Returns TRUE if MFN is successfully converted to a PFN. */
    3.74 +#define translate_mfn_to_pfn(_pmfn)                             \
    3.75 +({                                                              \
    3.76 +    unsigned long mfn = *(_pmfn);                               \
    3.77 +    int _res = 1;                                               \
    3.78 +    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
    3.79 +        _res = 0;                                               \
    3.80 +    else                                                        \
    3.81 +        *(_pmfn) = mfn_to_pfn(mfn);                             \
    3.82 +    _res;                                                       \
    3.83 +})
    3.84 +
    3.85 +/*
    3.86 +** During (live) save/migrate, we maintain a number of bitmaps to track
    3.87 +** which pages we have to send, to fixup, and to skip.
    3.88 +*/
    3.89 +
    3.90 +#define BITS_PER_LONG (sizeof(unsigned long) * 8)
    3.91 +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
    3.92 +#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
    3.93 +
    3.94 +#define BITMAP_ENTRY(_nr,_bmap) \
    3.95 +   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
    3.96 +
    3.97 +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
    3.98 +
    3.99 +static inline int test_bit (int nr, volatile void * addr)
   3.100 +{
   3.101 +    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
   3.102 +}
   3.103 +
   3.104 +static inline void clear_bit (int nr, volatile void * addr)
   3.105 +{
   3.106 +    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
   3.107 +}
   3.108 +
   3.109 +static inline void set_bit ( int nr, volatile void * addr)
   3.110 +{
   3.111 +    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
   3.112 +}
   3.113 +
   3.114 +/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
   3.115 +static inline unsigned int hweight32(unsigned int w)
   3.116 +{
   3.117 +    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
   3.118 +    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
   3.119 +    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
   3.120 +    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
   3.121 +    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
   3.122 +}
   3.123 +
   3.124 +static inline int count_bits ( int nr, volatile void *addr)
   3.125 +{
   3.126 +    int i, count = 0;
   3.127 +    volatile unsigned long *p = (volatile unsigned long *)addr;
   3.128 +    /* We know that the array is padded to unsigned long. */
   3.129 +    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
   3.130 +        count += hweight32(*p);
   3.131 +    return count;
   3.132 +}
   3.133 +
   3.134 +static inline int permute( int i, int nr, int order_nr  )
   3.135 +{
   3.136 +    /* Need a simple permutation function so that we scan pages in a
   3.137 +       pseudo random order, enabling us to get a better estimate of
   3.138 +       the domain's page dirtying rate as we go (there are often
   3.139 +       contiguous ranges of pfns that have similar behaviour, and we
   3.140 +       want to mix them up. */
   3.141 +
   3.142 +    /* e.g. nr->oder 15->4 16->4 17->5 */
   3.143 +    /* 512MB domain, 128k pages, order 17 */
   3.144 +
   3.145 +    /*
   3.146 +      QPONMLKJIHGFEDCBA
   3.147 +             QPONMLKJIH
   3.148 +      GFEDCBA
   3.149 +     */
   3.150 +
   3.151 +    /*
   3.152 +      QPONMLKJIHGFEDCBA
   3.153 +                  EDCBA
   3.154 +             QPONM
   3.155 +      LKJIHGF
   3.156 +      */
   3.157 +
   3.158 +    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
   3.159 +    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
   3.160 +
   3.161 +    return i;
   3.162 +}
   3.163 +
   3.164 +static uint64_t tv_to_us(struct timeval *new)
   3.165 +{
   3.166 +    return (new->tv_sec * 1000000) + new->tv_usec;
   3.167 +}
   3.168 +
   3.169 +static uint64_t llgettimeofday(void)
   3.170 +{
   3.171 +    struct timeval now;
   3.172 +    gettimeofday(&now, NULL);
   3.173 +    return tv_to_us(&now);
   3.174 +}
   3.175 +
   3.176 +static uint64_t tv_delta(struct timeval *new, struct timeval *old)
   3.177 +{
   3.178 +    return (((new->tv_sec - old->tv_sec)*1000000) +
   3.179 +            (new->tv_usec - old->tv_usec));
   3.180 +}
   3.181 +
   3.182 +static int noncached_write(int fd, int live, void *buffer, int len) 
   3.183 +{
   3.184 +    static int write_count = 0;
   3.185 +
   3.186 +    int rc = write(fd,buffer,len);
   3.187 +
   3.188 +    write_count += len;
   3.189 +    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
   3.190 +    {
   3.191 +        /* Time to discard cache - dont care if this fails */
   3.192 +        discard_file_cache(fd, 0 /* no flush */);
   3.193 +        write_count = 0;
   3.194 +    }
   3.195 +
   3.196 +    return rc;
   3.197 +}
   3.198 +
   3.199 +#ifdef ADAPTIVE_SAVE
   3.200 +
   3.201 +/*
   3.202 +** We control the rate at which we transmit (or save) to minimize impact
   3.203 +** on running domains (including the target if we're doing live migrate).
   3.204 +*/
   3.205 +
   3.206 +#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
   3.207 +#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
   3.208 +
   3.209 +/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
   3.210 +#define RATE_TO_BTU      781250
   3.211 +
   3.212 +/* Amount in bytes we allow ourselves to send in a burst */
   3.213 +#define BURST_BUDGET (100*1024)
   3.214 +
   3.215 +/* We keep track of the current and previous transmission rate */
   3.216 +static int mbit_rate, ombit_rate = 0;
   3.217 +
   3.218 +/* Have we reached the maximum transmission rate? */
   3.219 +#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
   3.220 +
   3.221 +static inline void initialize_mbit_rate()
   3.222 +{
   3.223 +    mbit_rate = START_MBIT_RATE;
   3.224 +}
   3.225 +
   3.226 +static int ratewrite(int io_fd, int live, void *buf, int n)
   3.227 +{
   3.228 +    static int budget = 0;
   3.229 +    static int burst_time_us = -1;
   3.230 +    static struct timeval last_put = { 0 };
   3.231 +    struct timeval now;
   3.232 +    struct timespec delay;
   3.233 +    long long delta;
   3.234 +
   3.235 +    if ( START_MBIT_RATE == 0 )
   3.236 +        return noncached_write(io_fd, live, buf, n);
   3.237 +
   3.238 +    budget -= n;
   3.239 +    if ( budget < 0 )
   3.240 +    {
   3.241 +        if ( mbit_rate != ombit_rate )
   3.242 +        {
   3.243 +            burst_time_us = RATE_TO_BTU / mbit_rate;
   3.244 +            ombit_rate = mbit_rate;
   3.245 +            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
   3.246 +                    mbit_rate, BURST_BUDGET, burst_time_us);
   3.247 +        }
   3.248 +        if ( last_put.tv_sec == 0 )
   3.249 +        {
   3.250 +            budget += BURST_BUDGET;
   3.251 +            gettimeofday(&last_put, NULL);
   3.252 +        }
   3.253 +        else
   3.254 +        {
   3.255 +            while ( budget < 0 )
   3.256 +            {
   3.257 +                gettimeofday(&now, NULL);
   3.258 +                delta = tv_delta(&now, &last_put);
   3.259 +                while ( delta > burst_time_us )
   3.260 +                {
   3.261 +                    budget += BURST_BUDGET;
   3.262 +                    last_put.tv_usec += burst_time_us;
    3.263 +                    if ( last_put.tv_usec > 1000000 )
   3.264 +                    {
   3.265 +                        last_put.tv_usec -= 1000000;
   3.266 +                        last_put.tv_sec++;
   3.267 +                    }
   3.268 +                    delta -= burst_time_us;
   3.269 +                }
   3.270 +                if ( budget > 0 )
   3.271 +                    break;
   3.272 +                delay.tv_sec = 0;
   3.273 +                delay.tv_nsec = 1000 * (burst_time_us - delta);
   3.274 +                while ( delay.tv_nsec > 0 )
   3.275 +                    if ( nanosleep(&delay, &delay) == 0 )
   3.276 +                        break;
   3.277 +            }
   3.278 +        }
   3.279 +    }
   3.280 +    return noncached_write(io_fd, live, buf, n);
   3.281 +}
   3.282 +
   3.283 +#else /* ! ADAPTIVE SAVE */
   3.284 +
   3.285 +#define RATE_IS_MAX() (0)
   3.286 +#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
   3.287 +#define initialize_mbit_rate()
   3.288 +
   3.289 +#endif
   3.290 +
   3.291 +static inline ssize_t write_exact(int fd, void *buf, size_t count)
   3.292 +{
   3.293 +    return (write(fd, buf, count) == count);
   3.294 +}
   3.295 +
   3.296 +static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
   3.297 +                       xc_shadow_op_stats_t *stats, int print)
   3.298 +{
   3.299 +    static struct timeval wall_last;
   3.300 +    static long long      d0_cpu_last;
   3.301 +    static long long      d1_cpu_last;
   3.302 +
   3.303 +    struct timeval        wall_now;
   3.304 +    long long             wall_delta;
   3.305 +    long long             d0_cpu_now, d0_cpu_delta;
   3.306 +    long long             d1_cpu_now, d1_cpu_delta;
   3.307 +
   3.308 +    gettimeofday(&wall_now, NULL);
   3.309 +
   3.310 +    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   3.311 +    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   3.312 +
   3.313 +    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
   3.314 +        DPRINTF("ARRHHH!!\n");
   3.315 +
   3.316 +    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
   3.317 +    if ( wall_delta == 0 )
   3.318 +        wall_delta = 1;
   3.319 +
   3.320 +    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
   3.321 +    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
   3.322 +
   3.323 +    if ( print )
   3.324 +        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
   3.325 +                "dirtied %dMb/s %" PRId32 " pages\n",
   3.326 +                wall_delta,
   3.327 +                (int)((d0_cpu_delta*100)/wall_delta),
   3.328 +                (int)((d1_cpu_delta*100)/wall_delta),
   3.329 +                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
   3.330 +                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
   3.331 +                stats->dirty_count);
   3.332 +
   3.333 +#ifdef ADAPTIVE_SAVE
   3.334 +    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
   3.335 +    {
   3.336 +        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
   3.337 +            + 50;
   3.338 +        if ( mbit_rate > MAX_MBIT_RATE )
   3.339 +            mbit_rate = MAX_MBIT_RATE;
   3.340 +    }
   3.341 +#endif
   3.342 +
   3.343 +    d0_cpu_last = d0_cpu_now;
   3.344 +    d1_cpu_last = d1_cpu_now;
   3.345 +    wall_last   = wall_now;
   3.346 +
   3.347 +    return 0;
   3.348 +}
   3.349 +
   3.350 +
   3.351 +static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
   3.352 +                          unsigned long *arr, int runs)
   3.353 +{
   3.354 +    long long start, now;
   3.355 +    xc_shadow_op_stats_t stats;
   3.356 +    int j;
   3.357 +
   3.358 +    start = llgettimeofday();
   3.359 +
   3.360 +    for ( j = 0; j < runs; j++ )
   3.361 +    {
   3.362 +        int i;
   3.363 +
   3.364 +        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
   3.365 +                          arr, p2m_size, NULL, 0, NULL);
   3.366 +        DPRINTF("#Flush\n");
   3.367 +        for ( i = 0; i < 40; i++ )
   3.368 +        {
   3.369 +            usleep(50000);
   3.370 +            now = llgettimeofday();
   3.371 +            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
   3.372 +                              NULL, 0, NULL, 0, &stats);
   3.373 +            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
   3.374 +                    ((now-start)+500)/1000,
   3.375 +                    stats.fault_count, stats.dirty_count);
   3.376 +        }
   3.377 +    }
   3.378 +
   3.379 +    return -1;
   3.380 +}
   3.381 +
   3.382 +
   3.383 +static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
   3.384 +                             int dom, xc_dominfo_t *info,
   3.385 +                             vcpu_guest_context_t *ctxt)
   3.386 +{
   3.387 +    int i = 0;
   3.388 +
   3.389 +    if ( !(*suspend)(dom) )
   3.390 +    {
   3.391 +        ERROR("Suspend request failed");
   3.392 +        return -1;
   3.393 +    }
   3.394 +
   3.395 + retry:
   3.396 +
   3.397 +    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
   3.398 +    {
   3.399 +        ERROR("Could not get domain info");
   3.400 +        return -1;
   3.401 +    }
   3.402 +
   3.403 +    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
   3.404 +        ERROR("Could not get vcpu context");
   3.405 +
   3.406 +
   3.407 +    if ( info->dying )
   3.408 +    {
   3.409 +        ERROR("domain is dying");
   3.410 +        return -1;
   3.411 +    }
   3.412 +
   3.413 +    if ( info->crashed )
   3.414 +    {
   3.415 +        ERROR("domain has crashed");
   3.416 +        return -1;
   3.417 +    }
   3.418 +
   3.419 +    if ( info->shutdown )
   3.420 +    {
   3.421 +        switch ( info->shutdown_reason )
   3.422 +        {
   3.423 +        case SHUTDOWN_poweroff:
   3.424 +        case SHUTDOWN_reboot:
   3.425 +            ERROR("domain has shut down");
   3.426 +            return -1;
   3.427 +        case SHUTDOWN_suspend:
   3.428 +            return 0;
   3.429 +        case SHUTDOWN_crash:
   3.430 +            ERROR("domain has crashed");
   3.431 +            return -1;
   3.432 +        }
   3.433 +    }
   3.434 +
   3.435 +    if ( info->paused )
   3.436 +    {
   3.437 +        /* Try unpausing domain, wait, and retest. */
   3.438 +        xc_domain_unpause( xc_handle, dom );
   3.439 +        ERROR("Domain was paused. Wait and re-test.");
   3.440 +        usleep(10000); /* 10ms */
   3.441 +        goto retry;
   3.442 +    }
   3.443 +
   3.444 +    if ( ++i < 100 )
   3.445 +    {
   3.446 +        ERROR("Retry suspend domain");
   3.447 +        usleep(10000); /* 10ms */
   3.448 +        goto retry;
   3.449 +    }
   3.450 +
   3.451 +    ERROR("Unable to suspend domain.");
   3.452 +
   3.453 +    return -1;
   3.454 +}
   3.455 +
   3.456 +/*
   3.457 +** Map the top-level page of MFNs from the guest. The guest might not have
   3.458 +** finished resuming from a previous restore operation, so we wait a while for
   3.459 +** it to update the MFN to a reasonable value.
   3.460 +*/
   3.461 +static void *map_frame_list_list(int xc_handle, uint32_t dom,
   3.462 +                                 shared_info_t *shinfo)
   3.463 +{
   3.464 +    int count = 100;
   3.465 +    void *p;
   3.466 +
   3.467 +    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
   3.468 +        usleep(10000);
   3.469 +
   3.470 +    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
   3.471 +    {
   3.472 +        ERROR("Timed out waiting for frame list updated.");
   3.473 +        return NULL;
   3.474 +    }
   3.475 +
   3.476 +    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
   3.477 +                             shinfo->arch.pfn_to_mfn_frame_list_list);
   3.478 +    if ( p == NULL )
   3.479 +        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
   3.480 +
   3.481 +    return p;
   3.482 +}
   3.483 +
   3.484 +/*
   3.485 +** During transfer (or in the state file), all page-table pages must be
   3.486 +** converted into a 'canonical' form where references to actual mfns
   3.487 +** are replaced with references to the corresponding pfns.
   3.488 +**
   3.489 +** This function performs the appropriate conversion, taking into account
   3.490 +** which entries do not require canonicalization (in particular, those
   3.491 +** entries which map the virtual address reserved for the hypervisor).
   3.492 +*/
   3.493 +static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
   3.494 +                           const void *spage, void *dpage)
   3.495 +{
   3.496 +
   3.497 +    int i, pte_last, xen_start, xen_end, race = 0; 
   3.498 +    uint64_t pte;
   3.499 +
   3.500 +    /*
   3.501 +    ** We need to determine which entries in this page table hold
   3.502 +    ** reserved hypervisor mappings. This depends on the current
   3.503 +    ** page table type as well as the number of paging levels.
   3.504 +    */
   3.505 +    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
   3.506 +
   3.507 +    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
   3.508 +        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
   3.509 +
   3.510 +    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
   3.511 +        xen_start = L3_PAGETABLE_ENTRIES_PAE;
   3.512 +
   3.513 +    /*
   3.514 +    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
   3.515 +    ** We can spot this by looking for the guest linear mapping which
   3.516 +    ** Xen always ensures is present in that L2. Guests must ensure
   3.517 +    ** that this check will fail for other L2s.
   3.518 +    */
   3.519 +    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
   3.520 +    {
   3.521 +        int hstart;
   3.522 +        uint64_t he;
   3.523 +
   3.524 +        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
   3.525 +        he = ((const uint64_t *) spage)[hstart];
   3.526 +
   3.527 +        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
   3.528 +        {
   3.529 +            /* hvirt starts with xen stuff... */
   3.530 +            xen_start = hstart;
   3.531 +        }
   3.532 +        else if ( hvirt_start != 0xf5800000 )
   3.533 +        {
   3.534 +            /* old L2s from before hole was shrunk... */
   3.535 +            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
   3.536 +            he = ((const uint64_t *) spage)[hstart];
   3.537 +            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
   3.538 +                xen_start = hstart;
   3.539 +        }
   3.540 +    }
   3.541 +
   3.542 +    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
   3.543 +    {
   3.544 +        /*
   3.545 +        ** XXX SMH: should compute these from hvirt_start (which we have)
   3.546 +        ** and hvirt_end (which we don't)
   3.547 +        */
   3.548 +        xen_start = 256;
   3.549 +        xen_end   = 272;
   3.550 +    }
   3.551 +
   3.552 +    /* Now iterate through the page table, canonicalizing each PTE */
   3.553 +    for (i = 0; i < pte_last; i++ )
   3.554 +    {
   3.555 +        unsigned long pfn, mfn;
   3.556 +
   3.557 +        if ( pt_levels == 2 )
   3.558 +            pte = ((const uint32_t*)spage)[i];
   3.559 +        else
   3.560 +            pte = ((const uint64_t*)spage)[i];
   3.561 +
   3.562 +        if ( (i >= xen_start) && (i < xen_end) )
   3.563 +            pte = 0;
   3.564 +
   3.565 +        if ( pte & _PAGE_PRESENT )
   3.566 +        {
   3.567 +            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
   3.568 +            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
   3.569 +            {
   3.570 +                /* This will happen if the type info is stale which
   3.571 +                   is quite feasible under live migration */
   3.572 +                pfn  = 0;  /* zap it - we'll retransmit this page later */
   3.573 +                race = 1;  /* inform the caller of race; fatal if !live */ 
   3.574 +            }
   3.575 +            else
   3.576 +                pfn = mfn_to_pfn(mfn);
   3.577 +
   3.578 +            pte &= ~MADDR_MASK_X86;
   3.579 +            pte |= (uint64_t)pfn << PAGE_SHIFT;
   3.580 +
   3.581 +            /*
   3.582 +             * PAE guest L3Es can contain these flags when running on
   3.583 +             * a 64bit hypervisor. We zap these here to avoid any
   3.584 +             * surprise at restore time...
   3.585 +             */
   3.586 +            if ( (pt_levels == 3) &&
   3.587 +                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
   3.588 +                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
   3.589 +                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
   3.590 +        }
   3.591 +
   3.592 +        if ( pt_levels == 2 )
   3.593 +            ((uint32_t*)dpage)[i] = pte;
   3.594 +        else
   3.595 +            ((uint64_t*)dpage)[i] = pte;
   3.596 +    }
   3.597 +
   3.598 +    return race;
   3.599 +}
   3.600 +
   3.601 +static xen_pfn_t *xc_map_m2p(int xc_handle,
   3.602 +                                 unsigned long max_mfn,
   3.603 +                                 int prot)
   3.604 +{
   3.605 +    struct xen_machphys_mfn_list xmml;
   3.606 +    privcmd_mmap_entry_t *entries;
   3.607 +    unsigned long m2p_chunks, m2p_size;
   3.608 +    xen_pfn_t *m2p;
   3.609 +    xen_pfn_t *extent_start;
   3.610 +    int i, rc;
   3.611 +
   3.612 +    m2p_size   = M2P_SIZE(max_mfn);
   3.613 +    m2p_chunks = M2P_CHUNKS(max_mfn);
   3.614 +
   3.615 +    xmml.max_extents = m2p_chunks;
   3.616 +    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
   3.617 +    {
   3.618 +        ERROR("failed to allocate space for m2p mfns");
   3.619 +        return NULL;
   3.620 +    }
   3.621 +    set_xen_guest_handle(xmml.extent_start, extent_start);
   3.622 +
   3.623 +    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
   3.624 +         (xmml.nr_extents != m2p_chunks) )
   3.625 +    {
   3.626 +        ERROR("xc_get_m2p_mfns");
   3.627 +        return NULL;
   3.628 +    }
   3.629 +
   3.630 +    if ( (m2p = mmap(NULL, m2p_size, prot,
   3.631 +                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
   3.632 +    {
   3.633 +        ERROR("failed to mmap m2p");
   3.634 +        return NULL;
   3.635 +    }
   3.636 +
   3.637 +    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
   3.638 +    {
   3.639 +        ERROR("failed to allocate space for mmap entries");
   3.640 +        return NULL;
   3.641 +    }
   3.642 +
   3.643 +    for ( i = 0; i < m2p_chunks; i++ )
   3.644 +    {
   3.645 +        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
   3.646 +        entries[i].mfn = extent_start[i];
   3.647 +        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
   3.648 +    }
   3.649 +
   3.650 +    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
   3.651 +                                     entries, m2p_chunks)) < 0 )
   3.652 +    {
   3.653 +        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
   3.654 +        return NULL;
   3.655 +    }
   3.656 +
   3.657 +    m2p_mfn0 = entries[0].mfn;
   3.658 +
   3.659 +    free(extent_start);
   3.660 +    free(entries);
   3.661 +
   3.662 +    return m2p;
   3.663 +}
   3.664 +
   3.665 +
   3.666 +static xen_pfn_t *map_and_save_p2m_table(int xc_handle, 
   3.667 +                                         int io_fd, 
   3.668 +                                         uint32_t dom,
   3.669 +                                         vcpu_guest_context_t *ctxt,
   3.670 +                                         unsigned long p2m_size,
   3.671 +                                         shared_info_t *live_shinfo)
   3.672 +{
   3.673 +    /* Double and single indirect references to the live P2M table */
   3.674 +    xen_pfn_t *live_p2m_frame_list_list = NULL;
   3.675 +    xen_pfn_t *live_p2m_frame_list = NULL;
   3.676 +
   3.677 +    /* A copy of the pfn-to-mfn table frame list. */
   3.678 +    xen_pfn_t *p2m_frame_list = NULL;
   3.679 +
   3.680 +    /* The mapping of the live p2m table itself */
   3.681 +    xen_pfn_t *p2m = NULL;
   3.682 +
   3.683 +    int i, success = 0;
   3.684 +
   3.685 +    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
   3.686 +                                                   live_shinfo);
   3.687 +    if ( !live_p2m_frame_list_list )
   3.688 +        goto out;
   3.689 +
   3.690 +    live_p2m_frame_list =
   3.691 +        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   3.692 +                             live_p2m_frame_list_list,
   3.693 +                             P2M_FLL_ENTRIES);
   3.694 +    if ( !live_p2m_frame_list )
   3.695 +    {
   3.696 +        ERROR("Couldn't map p2m_frame_list");
   3.697 +        goto out;
   3.698 +    }
   3.699 +
   3.700 +
   3.701 +    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
   3.702 +       the guest must not change which frames are used for this purpose.
   3.703 +       (its not clear why it would want to change them, and we'll be OK
   3.704 +       from a safety POV anyhow. */
   3.705 +
   3.706 +    p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   3.707 +                               live_p2m_frame_list,
   3.708 +                               P2M_FL_ENTRIES);
   3.709 +    if ( !p2m )
   3.710 +    {
   3.711 +        ERROR("Couldn't map p2m table");
   3.712 +        goto out;
   3.713 +    }
   3.714 +    live_p2m = p2m; /* So that translation macros will work */
   3.715 +    
   3.716 +    /* Get a local copy of the live_P2M_frame_list */
   3.717 +    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
   3.718 +    {
   3.719 +        ERROR("Couldn't allocate p2m_frame_list array");
   3.720 +        goto out;
   3.721 +    }
   3.722 +    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
   3.723 +
   3.724 +    /* Canonicalise the pfn-to-mfn table frame-number list. */
   3.725 +    for ( i = 0; i < p2m_size; i += fpp )
   3.726 +    {
   3.727 +        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
   3.728 +        {
   3.729 +            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
   3.730 +            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
   3.731 +                  (uint64_t)p2m_frame_list[i/fpp]);
   3.732 +            goto out;
   3.733 +        }
   3.734 +    }
   3.735 +
   3.736 +    /*
   3.737 +     * Write an extended-info structure to inform the restore code that
   3.738 +     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
   3.739 +     * slow paths in the restore code.
   3.740 +     */
   3.741 +    if ( (pt_levels == 3) &&
   3.742 +         (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
   3.743 +    {
   3.744 +        unsigned long signature = ~0UL;
   3.745 +        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
   3.746 +        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
   3.747 +        char chunk_sig[]  = "vcpu";
   3.748 +        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
   3.749 +             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
   3.750 +             !write_exact(io_fd, &chunk_sig, 4) ||
   3.751 +             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
   3.752 +             !write_exact(io_fd, ctxt,       sizeof(*ctxt)) )
   3.753 +        {
   3.754 +            ERROR("write: extended info");
   3.755 +            goto out;
   3.756 +        }
   3.757 +    }
   3.758 +
   3.759 +    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
   3.760 +    {
   3.761 +        ERROR("write: p2m_frame_list");
   3.762 +        goto out;
   3.763 +    }    
   3.764 +
   3.765 +    success = 1;
   3.766 +
   3.767 + out:
   3.768 +    
   3.769 +    if ( !success && p2m )
   3.770 +        munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
   3.771 +
   3.772 +    if ( live_p2m_frame_list_list )
   3.773 +        munmap(live_p2m_frame_list_list, PAGE_SIZE);
   3.774 +
   3.775 +    if ( live_p2m_frame_list )
   3.776 +        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
   3.777 +
   3.778 +    if ( p2m_frame_list ) 
   3.779 +        free(p2m_frame_list);
   3.780 +
   3.781 +    return success ? p2m : NULL;
   3.782 +}
   3.783 +
   3.784 +
   3.785 +
   3.786 +int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   3.787 +                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
   3.788 +                   int hvm, void *(*init_qemu_maps)(int, unsigned), 
   3.789 +                   void (*qemu_flip_buffer)(int, int))
   3.790 +{
   3.791 +    xc_dominfo_t info;
   3.792 +
   3.793 +    int rc = 1, i, j, last_iter, iter = 0;
   3.794 +    int live  = (flags & XCFLAGS_LIVE);
   3.795 +    int debug = (flags & XCFLAGS_DEBUG);
   3.796 +    int race = 0, sent_last_iter, skip_this_iter;
   3.797 +
   3.798 +    /* The new domain's shared-info frame number. */
   3.799 +    unsigned long shared_info_frame;
   3.800 +
   3.801 +    /* A copy of the CPU context of the guest. */
   3.802 +    vcpu_guest_context_t ctxt;
   3.803 +
   3.804 +    /* A table containing the type of each PFN (/not/ MFN!). */
   3.805 +    unsigned long *pfn_type = NULL;
   3.806 +    unsigned long *pfn_batch = NULL;
   3.807 +
   3.808 +    /* A copy of one frame of guest memory. */
   3.809 +    char page[PAGE_SIZE];
   3.810 +
   3.811 +    /* Live mapping of shared info structure */
   3.812 +    shared_info_t *live_shinfo = NULL;
   3.813 +
   3.814 +    /* base of the region in which domain memory is mapped */
   3.815 +    unsigned char *region_base = NULL;
   3.816 +
   3.817 +    /* power of 2 order of p2m_size */
   3.818 +    int order_nr;
   3.819 +
   3.820 +    /* bitmap of pages:
   3.821 +       - that should be sent this iteration (unless later marked as skip);
   3.822 +       - to skip this iteration because already dirty;
   3.823 +       - to fixup by sending at the end if not already resent; */
   3.824 +    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
   3.825 +
   3.826 +    xc_shadow_op_stats_t stats;
   3.827 +
   3.828 +    unsigned long needed_to_fix = 0;
   3.829 +    unsigned long total_sent    = 0;
   3.830 +
   3.831 +    uint64_t vcpumap = 1ULL;
   3.832 +
   3.833 +    /* HVM: a buffer for holding HVM context */
   3.834 +    uint32_t hvm_buf_size = 0;
   3.835 +    uint8_t *hvm_buf = NULL;
   3.836 +
   3.837 +    /* HVM: magic frames for ioreqs and xenstore comms. */
   3.838 +    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
   3.839 +
   3.840 +    /* If no explicit control parameters given, use defaults */
   3.841 +    max_iters  = max_iters  ? : DEF_MAX_ITERS;
   3.842 +    max_factor = max_factor ? : DEF_MAX_FACTOR;
   3.843 +
   3.844 +    initialize_mbit_rate();
   3.845 +
   3.846 +    if ( !get_platform_info(xc_handle, dom,
   3.847 +                            &max_mfn, &hvirt_start, &pt_levels) )
   3.848 +    {
   3.849 +        ERROR("Unable to get platform info.");
   3.850 +        return 1;
   3.851 +    }
   3.852 +
   3.853 +    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
   3.854 +    {
   3.855 +        ERROR("Could not get domain info");
   3.856 +        return 1;
   3.857 +    }
   3.858 +
   3.859 +    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
   3.860 +    {
   3.861 +        ERROR("Could not get vcpu context");
   3.862 +        goto out;
   3.863 +    }
   3.864 +    shared_info_frame = info.shared_info_frame;
   3.865 +
   3.866 +    /* Map the shared info frame */
   3.867 +    if ( !hvm )
   3.868 +    {
   3.869 +        live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   3.870 +                                           PROT_READ, shared_info_frame);
   3.871 +        if ( !live_shinfo )
   3.872 +        {
   3.873 +            ERROR("Couldn't map live_shinfo");
   3.874 +            goto out;
   3.875 +        }
   3.876 +    }
   3.877 +
   3.878 +    /* Get the size of the P2M table */
   3.879 +    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
   3.880 +
   3.881 +    /* Domain is still running at this point */
   3.882 +    if ( live )
   3.883 +    {
   3.884 +        /* Live suspend. Enable log-dirty mode. */
   3.885 +        if ( xc_shadow_control(xc_handle, dom,
   3.886 +                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
   3.887 +                               NULL, 0, NULL, 0, NULL) < 0 )
   3.888 +        {
   3.889 +            ERROR("Couldn't enable shadow mode");
   3.890 +            goto out;
   3.891 +        }
   3.892 +
   3.893 +        if ( hvm )
   3.894 +        {
   3.895 +            /* Get qemu-dm logging dirty pages too */
   3.896 +            void *seg = init_qemu_maps(dom, BITMAP_SIZE);
   3.897 +            qemu_bitmaps[0] = seg;
   3.898 +            qemu_bitmaps[1] = seg + BITMAP_SIZE;
   3.899 +            qemu_active = 0;
   3.900 +            qemu_non_active = 1;
   3.901 +        }
   3.902 +    }
   3.903 +    else
   3.904 +    {
   3.905 +        /* This is a non-live suspend. Suspend the domain. */
   3.906 +        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
   3.907 +        {
   3.908 +            ERROR("Domain appears not to have suspended");
   3.909 +            goto out;
   3.910 +        }
   3.911 +    }
   3.912 +
   3.913 +    last_iter = !live;
   3.914 +
   3.915 +    /* pretend we sent all the pages last iteration */
   3.916 +    sent_last_iter = p2m_size;
   3.917 +
   3.918 +    /* calculate the power of 2 order of p2m_size, e.g.
   3.919 +       15->4 16->4 17->5 */
   3.920 +    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
   3.921 +        continue;
   3.922 +
   3.923 +    /* Setup to_send / to_fix and to_skip bitmaps */
   3.924 +    to_send = malloc(BITMAP_SIZE);
   3.925 +    to_fix  = calloc(1, BITMAP_SIZE);
   3.926 +    to_skip = malloc(BITMAP_SIZE);
   3.927 +
   3.928 +    if ( !to_send || !to_fix || !to_skip )
   3.929 +    {
   3.930 +        ERROR("Couldn't allocate to_send array");
   3.931 +        goto out;
   3.932 +    }
   3.933 +
   3.934 +    memset(to_send, 0xff, BITMAP_SIZE);
   3.935 +
   3.936 +    if ( lock_pages(to_send, BITMAP_SIZE) )
   3.937 +    {
   3.938 +        ERROR("Unable to lock to_send");
   3.939 +        return 1;
   3.940 +    }
   3.941 +
   3.942 +    /* (to fix is local only) */
   3.943 +    if ( lock_pages(to_skip, BITMAP_SIZE) )
   3.944 +    {
   3.945 +        ERROR("Unable to lock to_skip");
   3.946 +        return 1;
   3.947 +    }
   3.948 +
   3.949 +    if ( hvm ) 
   3.950 +    {
   3.951 +        /* Need another buffer for HVM context */
   3.952 +        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
   3.953 +        if ( hvm_buf_size == -1 )
   3.954 +        {
   3.955 +            ERROR("Couldn't get HVM context size from Xen");
   3.956 +            goto out;
   3.957 +        }
   3.958 +        hvm_buf = malloc(hvm_buf_size);
   3.959 +        if ( !hvm_buf )
   3.960 +        {
   3.961 +            ERROR("Couldn't allocate memory");
   3.962 +            goto out;
   3.963 +        }
   3.964 +    }
   3.965 +
   3.966 +    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
   3.967 +
   3.968 +    /* We want zeroed memory so use calloc rather than malloc. */
   3.969 +    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
   3.970 +    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
   3.971 +    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
   3.972 +    {
   3.973 +        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
   3.974 +        errno = ENOMEM;
   3.975 +        goto out;
   3.976 +    }
   3.977 +
   3.978 +    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
   3.979 +    {
   3.980 +        ERROR("Unable to lock");
   3.981 +        goto out;
   3.982 +    }
   3.983 +
   3.984 +    /* Setup the mfn_to_pfn table mapping */
   3.985 +    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
   3.986 +    {
   3.987 +        ERROR("Failed to map live M2P table");
   3.988 +        goto out;
   3.989 +    }
   3.990 +
   3.991 +    /* Start writing out the saved-domain record. */
   3.992 +    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
   3.993 +    {
   3.994 +        ERROR("write: p2m_size");
   3.995 +        goto out;
   3.996 +    }
   3.997 +
   3.998 +    if ( !hvm )
   3.999 +    {
  3.1000 +        int err = 0;
  3.1001 +        unsigned long mfn;
  3.1002 +
  3.1003 +        /* Map the P2M table, and write the list of P2M frames */
  3.1004 +        live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom, 
  3.1005 +                                          &ctxt, p2m_size, live_shinfo);
  3.1006 +        if ( live_p2m == NULL )
  3.1007 +        {
  3.1008 +            ERROR("Failed to map/save the p2m frame list");
  3.1009 +            goto out;
  3.1010 +        }
  3.1011 +
  3.1012 +        /*
  3.1013 +         * Quick belt and braces sanity check.
  3.1014 +         */
  3.1015 +        
  3.1016 +        for ( i = 0; i < p2m_size; i++ )
  3.1017 +        {
  3.1018 +            mfn = live_p2m[i];
  3.1019 +            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
  3.1020 +            {
  3.1021 +                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
  3.1022 +                        mfn, mfn_to_pfn(mfn));
  3.1023 +                err++;
  3.1024 +            }
  3.1025 +        }
  3.1026 +        DPRINTF("Had %d unexplained entries in p2m table\n", err);
  3.1027 +    }
  3.1028 +
  3.1029 +    print_stats(xc_handle, dom, 0, &stats, 0);
  3.1030 +
  3.1031 +    /* Now write out each data page, canonicalising page tables as we go... */
  3.1032 +    for ( ; ; )
  3.1033 +    {
  3.1034 +        unsigned int prev_pc, sent_this_iter, N, batch;
  3.1035 +
  3.1036 +        iter++;
  3.1037 +        sent_this_iter = 0;
  3.1038 +        skip_this_iter = 0;
  3.1039 +        prev_pc = 0;
  3.1040 +        N = 0;
  3.1041 +
  3.1042 +        DPRINTF("Saving memory pages: iter %d   0%%", iter);
  3.1043 +
  3.1044 +        while ( N < p2m_size )
  3.1045 +        {
  3.1046 +            unsigned int this_pc = (N * 100) / p2m_size;
  3.1047 +            int rc;
  3.1048 +
  3.1049 +            if ( (this_pc - prev_pc) >= 5 )
  3.1050 +            {
  3.1051 +                DPRINTF("\b\b\b\b%3d%%", this_pc);
  3.1052 +                prev_pc = this_pc;
  3.1053 +            }
  3.1054 +
  3.1055 +            if ( !last_iter )
  3.1056 +            {
  3.1057 +                /* Slightly wasteful to peek the whole array every time,
  3.1058 +                   but this is fast enough for the moment. */
  3.1059 +                rc = xc_shadow_control(
  3.1060 +                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
  3.1061 +                    p2m_size, NULL, 0, NULL);
  3.1062 +                if ( rc != p2m_size )
  3.1063 +                {
  3.1064 +                    ERROR("Error peeking shadow bitmap");
  3.1065 +                    goto out;
  3.1066 +                }
  3.1067 +            }
  3.1068 +
  3.1069 +            /* load pfn_type[] with the mfn of all the pages we're doing in
  3.1070 +               this batch. */
  3.1071 +            for  ( batch = 0;
  3.1072 +                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
  3.1073 +                   N++ )
  3.1074 +            {
  3.1075 +                int n = permute(N, p2m_size, order_nr);
  3.1076 +
  3.1077 +                if ( debug )
  3.1078 +                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
  3.1079 +                            iter, (unsigned long)n, hvm ? 0 : live_p2m[n],
  3.1080 +                            test_bit(n, to_send),
  3.1081 +                            hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF));
  3.1082 +
  3.1083 +                if ( !last_iter &&
  3.1084 +                     test_bit(n, to_send) &&
  3.1085 +                     test_bit(n, to_skip) )
  3.1086 +                    skip_this_iter++; /* stats keeping */
  3.1087 +
  3.1088 +                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  3.1089 +                       (test_bit(n, to_send) && last_iter) ||
  3.1090 +                       (test_bit(n, to_fix)  && last_iter)) )
  3.1091 +                    continue;
  3.1092 +
  3.1093 +                /* Skip PFNs that aren't really there */
  3.1094 +                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
  3.1095 +                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) 
  3.1096 +                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
  3.1097 +                    continue;
  3.1098 +
  3.1099 +                /*
  3.1100 +                ** we get here if:
  3.1101 +                **  1. page is marked to_send & hasn't already been re-dirtied
  3.1102 +                **  2. (ignore to_skip in last iteration)
  3.1103 +                **  3. add in pages that still need fixup (net bufs)
  3.1104 +                */
  3.1105 +
  3.1106 +                pfn_batch[batch] = n;
  3.1107 +
  3.1108 +                /* Hypercall interfaces operate in PFNs for HVM guests
  3.1109 +                * and MFNs for PV guests */
  3.1110 +                if ( hvm ) 
  3.1111 +                    pfn_type[batch] = n;
  3.1112 +                else
  3.1113 +                    pfn_type[batch] = live_p2m[n];
  3.1114 +                    
  3.1115 +                if ( !is_mapped(pfn_type[batch]) )
  3.1116 +                {
  3.1117 +                    /*
  3.1118 +                    ** not currently in pseudo-physical map -- set bit
  3.1119 +                    ** in to_fix since we must send this page in last_iter
  3.1120 +                    ** unless it's sent sooner anyhow, or it never enters
  3.1121 +                    ** pseudo-physical map (e.g. for ballooned down doms)
  3.1122 +                    */
  3.1123 +                    set_bit(n, to_fix);
  3.1124 +                    continue;
  3.1125 +                }
  3.1126 +
  3.1127 +                if ( last_iter &&
  3.1128 +                     test_bit(n, to_fix) &&
  3.1129 +                     !test_bit(n, to_send) )
  3.1130 +                {
  3.1131 +                    needed_to_fix++;
  3.1132 +                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
  3.1133 +                            iter, n, pfn_type[batch]);
  3.1134 +                }
  3.1135 +                
  3.1136 +                clear_bit(n, to_fix);
  3.1137 +                
  3.1138 +                batch++;
  3.1139 +            }
  3.1140 +
  3.1141 +            if ( batch == 0 )
  3.1142 +                goto skip; /* vanishingly unlikely... */
  3.1143 +
  3.1144 +            region_base = xc_map_foreign_batch(
  3.1145 +                xc_handle, dom, PROT_READ, pfn_type, batch);
  3.1146 +            if ( region_base == NULL )
  3.1147 +            {
  3.1148 +                ERROR("map batch failed");
  3.1149 +                goto out;
  3.1150 +            }
  3.1151 +
  3.1152 +            if ( !hvm )
  3.1153 +            {
  3.1154 +                /* Get page types */
  3.1155 +                for ( j = 0; j < batch; j++ )
  3.1156 +                    ((uint32_t *)pfn_type)[j] = pfn_type[j];
  3.1157 +                if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
  3.1158 +                                           (uint32_t *)pfn_type) )
  3.1159 +                {
  3.1160 +                    ERROR("get_pfn_type_batch failed");
  3.1161 +                    goto out;
  3.1162 +                }
  3.1163 +                for ( j = batch-1; j >= 0; j-- )
  3.1164 +                    pfn_type[j] = ((uint32_t *)pfn_type)[j];
  3.1165 +
  3.1166 +                for ( j = 0; j < batch; j++ )
  3.1167 +                {
  3.1168 +                    
  3.1169 +                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
  3.1170 +                         XEN_DOMCTL_PFINFO_XTAB )
  3.1171 +                    {
  3.1172 +                        DPRINTF("type fail: page %i mfn %08lx\n", 
  3.1173 +                                j, pfn_type[j]);
  3.1174 +                        continue;
  3.1175 +                    }
  3.1176 +                    
  3.1177 +                    if ( debug )
  3.1178 +                        DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
  3.1179 +                                " sum= %08lx\n",
  3.1180 +                                iter,
  3.1181 +                                (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
  3.1182 +                                pfn_batch[j],
  3.1183 +                                pfn_type[j],
  3.1184 +                                mfn_to_pfn(pfn_type[j] &
  3.1185 +                                           ~XEN_DOMCTL_PFINFO_LTAB_MASK),
  3.1186 +                                csum_page(region_base + (PAGE_SIZE*j)));
  3.1187 +                    
  3.1188 +                    /* canonicalise mfn->pfn */
  3.1189 +                    pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
  3.1190 +                        pfn_batch[j];
  3.1191 +                }
  3.1192 +            }
  3.1193 +
  3.1194 +            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
  3.1195 +            {
  3.1196 +                ERROR("Error when writing to state file (2) (errno %d)",
  3.1197 +                      errno);
  3.1198 +                goto out;
  3.1199 +            }
  3.1200 +
  3.1201 +            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
  3.1202 +            {
  3.1203 +                ERROR("Error when writing to state file (3) (errno %d)",
  3.1204 +                      errno);
  3.1205 +                goto out;
  3.1206 +            }
  3.1207 +
  3.1208 +            /* entering this loop, pfn_type is now in pfns (Not mfns) */
  3.1209 +            for ( j = 0; j < batch; j++ )
  3.1210 +            {
  3.1211 +                unsigned long pfn, pagetype;
  3.1212 +                void *spage = (char *)region_base + (PAGE_SIZE*j);
  3.1213 +
  3.1214 +                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
  3.1215 +                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
  3.1216 +
  3.1217 +                /* write out pages in batch */
  3.1218 +                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
  3.1219 +                    continue;
  3.1220 +
  3.1221 +                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
  3.1222 +
  3.1223 +                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
  3.1224 +                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
  3.1225 +                {
  3.1226 +                    /* We have a pagetable page: need to rewrite it. */
  3.1227 +                    race = 
  3.1228 +                        canonicalize_pagetable(pagetype, pfn, spage, page); 
  3.1229 +
  3.1230 +                    if ( race && !live )
  3.1231 +                    {
  3.1232 +                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
  3.1233 +                              pagetype);
  3.1234 +                        goto out;
  3.1235 +                    }
  3.1236 +
  3.1237 +                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
  3.1238 +                    {
  3.1239 +                        ERROR("Error when writing to state file (4)"
  3.1240 +                              " (errno %d)", errno);
  3.1241 +                        goto out;
  3.1242 +                    }
  3.1243 +                }
  3.1244 +                else
  3.1245 +                {
  3.1246 +                    /* We have a normal page: just write it directly. */
  3.1247 +                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
  3.1248 +                         PAGE_SIZE )
  3.1249 +                    {
  3.1250 +                        ERROR("Error when writing to state file (5)"
  3.1251 +                              " (errno %d)", errno);
  3.1252 +                        goto out;
  3.1253 +                    }
  3.1254 +                }
  3.1255 +            } /* end of the write out for this batch */
  3.1256 +
  3.1257 +            sent_this_iter += batch;
  3.1258 +
  3.1259 +            munmap(region_base, batch*PAGE_SIZE);
  3.1260 +
  3.1261 +        } /* end of this while loop for this iteration */
  3.1262 +
  3.1263 +      skip:
  3.1264 +
  3.1265 +        total_sent += sent_this_iter;
  3.1266 +
  3.1267 +        DPRINTF("\r %d: sent %d, skipped %d, ",
  3.1268 +                iter, sent_this_iter, skip_this_iter );
  3.1269 +
  3.1270 +        if ( last_iter )
  3.1271 +        {
  3.1272 +            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  3.1273 +
  3.1274 +            DPRINTF("Total pages sent= %ld (%.2fx)\n",
  3.1275 +                    total_sent, ((float)total_sent)/p2m_size );
  3.1276 +            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
  3.1277 +        }
  3.1278 +
  3.1279 +        if ( last_iter && debug )
  3.1280 +        {
  3.1281 +            int minusone = -1;
  3.1282 +            memset(to_send, 0xff, BITMAP_SIZE);
  3.1283 +            debug = 0;
  3.1284 +            DPRINTF("Entering debug resend-all mode\n");
  3.1285 +
  3.1286 +            /* send "-1" to put receiver into debug mode */
  3.1287 +            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
  3.1288 +            {
  3.1289 +                ERROR("Error when writing to state file (6) (errno %d)",
  3.1290 +                      errno);
  3.1291 +                goto out;
  3.1292 +            }
  3.1293 +
  3.1294 +            continue;
  3.1295 +        }
  3.1296 +
  3.1297 +        if ( last_iter )
  3.1298 +            break;
  3.1299 +
  3.1300 +        if ( live )
  3.1301 +        {
  3.1302 +            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
  3.1303 +                 (iter >= max_iters) ||
  3.1304 +                 (sent_this_iter+skip_this_iter < 50) ||
  3.1305 +                 (total_sent > p2m_size*max_factor) )
  3.1306 +            {
  3.1307 +                DPRINTF("Start last iteration\n");
  3.1308 +                last_iter = 1;
  3.1309 +
  3.1310 +                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
  3.1311 +                                       &ctxt) )
  3.1312 +                {
  3.1313 +                    ERROR("Domain appears not to have suspended");
  3.1314 +                    goto out;
  3.1315 +                }
  3.1316 +
  3.1317 +                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
  3.1318 +                        info.shared_info_frame,
  3.1319 +                        (unsigned long)ctxt.user_regs.eip,
  3.1320 +                        (unsigned long)ctxt.user_regs.edx);
  3.1321 +            }
  3.1322 +
  3.1323 +            if ( xc_shadow_control(xc_handle, dom, 
  3.1324 +                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
  3.1325 +                                   p2m_size, NULL, 0, &stats) != p2m_size )
  3.1326 +            {
  3.1327 +                ERROR("Error flushing shadow PT");
  3.1328 +                goto out;
  3.1329 +            }
  3.1330 +
  3.1331 +            if ( hvm ) 
  3.1332 +            {
  3.1333 +                /* Pull in the dirty bits from qemu-dm too */
  3.1334 +                if ( !last_iter )
  3.1335 +                {
  3.1336 +                    qemu_active = qemu_non_active;
  3.1337 +                    qemu_non_active = qemu_active ? 0 : 1;
  3.1338 +                    qemu_flip_buffer(dom, qemu_active);
  3.1339 +                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
  3.1340 +                    {
  3.1341 +                        to_send[j] |= qemu_bitmaps[qemu_non_active][j];
  3.1342 +                        qemu_bitmaps[qemu_non_active][j] = 0;
  3.1343 +                    }
  3.1344 +                }
  3.1345 +                else
  3.1346 +                {
  3.1347 +                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
  3.1348 +                        to_send[j] |= qemu_bitmaps[qemu_active][j];
  3.1349 +                }
  3.1350 +            }
  3.1351 +
  3.1352 +            sent_last_iter = sent_this_iter;
  3.1353 +
  3.1354 +            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
  3.1355 +
  3.1356 +        }
  3.1357 +    } /* end of infinite for loop */
  3.1358 +
  3.1359 +    DPRINTF("All memory is saved\n");
  3.1360 +
  3.1361 +    {
  3.1362 +        struct {
  3.1363 +            int minustwo;
  3.1364 +            int max_vcpu_id;
  3.1365 +            uint64_t vcpumap;
  3.1366 +        } chunk = { -2, info.max_vcpu_id };
  3.1367 +
  3.1368 +        if ( info.max_vcpu_id >= 64 )
  3.1369 +        {
  3.1370 +            ERROR("Too many VCPUS in guest!");
  3.1371 +            goto out;
  3.1372 +        }
  3.1373 +
  3.1374 +        for ( i = 1; i <= info.max_vcpu_id; i++ )
  3.1375 +        {
  3.1376 +            xc_vcpuinfo_t vinfo;
  3.1377 +            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
  3.1378 +                 vinfo.online )
  3.1379 +                vcpumap |= 1ULL << i;
  3.1380 +        }
  3.1381 +
  3.1382 +        chunk.vcpumap = vcpumap;
  3.1383 +        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
  3.1384 +        {
  3.1385 +            ERROR("Error when writing to state file (errno %d)", errno);
  3.1386 +            goto out;
  3.1387 +        }
  3.1388 +    }
  3.1389 +
  3.1390 +    /* Zero terminate */
  3.1391 +    i = 0;
  3.1392 +    if ( !write_exact(io_fd, &i, sizeof(int)) )
  3.1393 +    {
  3.1394 +        ERROR("Error when writing to state file (6') (errno %d)", errno);
  3.1395 +        goto out;
  3.1396 +    }
  3.1397 +
  3.1398 +    if ( hvm ) 
  3.1399 +    {
  3.1400 +        uint32_t rec_size;
  3.1401 +
  3.1402 +        /* Save magic-page locations. */
  3.1403 +        memset(magic_pfns, 0, sizeof(magic_pfns));
  3.1404 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
  3.1405 +                         (unsigned long *)&magic_pfns[0]);
  3.1406 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
  3.1407 +                         (unsigned long *)&magic_pfns[1]);
  3.1408 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
  3.1409 +                         (unsigned long *)&magic_pfns[2]);
  3.1410 +        if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
  3.1411 +        {
  3.1412 +            ERROR("Error when writing to state file (7)");
  3.1413 +            goto out;
  3.1414 +        }
  3.1415 +
  3.1416 +        /* Save vcpu contexts */
  3.1417 +
  3.1418 +        for ( i = 0; i <= info.max_vcpu_id; i++ )
  3.1419 +        {
  3.1420 +            if ( !(vcpumap & (1ULL << i)) )
  3.1421 +                continue;
  3.1422 +            
  3.1423 +            if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
  3.1424 +            {
  3.1425 +                ERROR("HVM:Could not get vcpu context");
  3.1426 +                goto out;
  3.1427 +            }
  3.1428 +            
  3.1429 +            DPRINTF("write vcpu %d context.\n", i); 
  3.1430 +            if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
  3.1431 +            {
  3.1432 +                ERROR("write vcpu context failed!\n");
  3.1433 +                goto out;
  3.1434 +            }
  3.1435 +        }
  3.1436 +
  3.1437 +        /* Get HVM context from Xen and save it too */
  3.1438 +        if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 
  3.1439 +                                                  hvm_buf_size)) == -1 )
  3.1440 +        {
  3.1441 +            ERROR("HVM:Could not get hvm buffer");
  3.1442 +            goto out;
  3.1443 +        }
  3.1444 +        
  3.1445 +        if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
  3.1446 +        {
  3.1447 +            ERROR("error write hvm buffer size");
  3.1448 +            goto out;
  3.1449 +        }
  3.1450 +        
  3.1451 +        if ( !write_exact(io_fd, hvm_buf, rec_size) )
  3.1452 +        {
  3.1453 +            ERROR("write HVM info failed!\n");
  3.1454 +            goto out;
  3.1455 +        }
  3.1456 +        
  3.1457 +        /* HVM guests are done now */
  3.1458 +        rc = 0;
  3.1459 +        goto out;
  3.1460 +    }
  3.1461 +
  3.1462 +    /* PV guests only from now on */
  3.1463 +
  3.1464 +    /* Send through a list of all the PFNs that were not in map at the close */
  3.1465 +    {
  3.1466 +        unsigned int i,j;
  3.1467 +        unsigned long pfntab[1024];
  3.1468 +
  3.1469 +        for ( i = 0, j = 0; i < p2m_size; i++ )
  3.1470 +        {
  3.1471 +            if ( !is_mapped(live_p2m[i]) )
  3.1472 +                j++;
  3.1473 +        }
  3.1474 +
  3.1475 +        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
  3.1476 +        {
  3.1477 +            ERROR("Error when writing to state file (6a) (errno %d)", errno);
  3.1478 +            goto out;
  3.1479 +        }
  3.1480 +
  3.1481 +        for ( i = 0, j = 0; i < p2m_size; )
  3.1482 +        {
  3.1483 +            if ( !is_mapped(live_p2m[i]) )
  3.1484 +                pfntab[j++] = i;
  3.1485 +
  3.1486 +            i++;
  3.1487 +            if ( (j == 1024) || (i == p2m_size) )
  3.1488 +            {
  3.1489 +                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
  3.1490 +                {
  3.1491 +                    ERROR("Error when writing to state file (6b) (errno %d)",
  3.1492 +                          errno);
  3.1493 +                    goto out;
  3.1494 +                }
  3.1495 +                j = 0;
  3.1496 +            }
  3.1497 +        }
  3.1498 +    }
  3.1499 +
  3.1500 +    /* Canonicalise the suspend-record frame number. */
  3.1501 +    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
  3.1502 +    {
  3.1503 +        ERROR("Suspend record is not in range of pseudophys map");
  3.1504 +        goto out;
  3.1505 +    }
  3.1506 +
  3.1507 +    for ( i = 0; i <= info.max_vcpu_id; i++ )
  3.1508 +    {
  3.1509 +        if ( !(vcpumap & (1ULL << i)) )
  3.1510 +            continue;
  3.1511 +
  3.1512 +        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
  3.1513 +        {
  3.1514 +            ERROR("No context for VCPU%d", i);
  3.1515 +            goto out;
  3.1516 +        }
  3.1517 +
  3.1518 +        /* Canonicalise each GDT frame number. */
  3.1519 +        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
  3.1520 +        {
  3.1521 +            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
  3.1522 +            {
  3.1523 +                ERROR("GDT frame is not in range of pseudophys map");
  3.1524 +                goto out;
  3.1525 +            }
  3.1526 +        }
  3.1527 +
  3.1528 +        /* Canonicalise the page table base pointer. */
  3.1529 +        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
  3.1530 +        {
  3.1531 +            ERROR("PT base is not in range of pseudophys map");
  3.1532 +            goto out;
  3.1533 +        }
  3.1534 +        ctxt.ctrlreg[3] = 
  3.1535 +            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
  3.1536 +
  3.1537 +        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
  3.1538 +        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
  3.1539 +        {
  3.1540 +            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
  3.1541 +            {
  3.1542 +                ERROR("PT base is not in range of pseudophys map");
  3.1543 +                goto out;
  3.1544 +            }
  3.1545 +            /* Least-significant bit means 'valid PFN'. */
  3.1546 +            ctxt.ctrlreg[1] = 1 |
  3.1547 +                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
  3.1548 +        }
  3.1549 +
  3.1550 +        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
  3.1551 +        {
  3.1552 +            ERROR("Error when writing to state file (1) (errno %d)", errno);
  3.1553 +            goto out;
  3.1554 +        }
  3.1555 +    }
  3.1556 +
  3.1557 +    /*
  3.1558 +     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
  3.1559 +     */
  3.1560 +    memcpy(page, live_shinfo, PAGE_SIZE);
  3.1561 +    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
  3.1562 +    if ( !write_exact(io_fd, page, PAGE_SIZE) )
  3.1563 +    {
  3.1564 +        ERROR("Error when writing to state file (1) (errno %d)", errno);
  3.1565 +        goto out;
  3.1566 +    }
  3.1567 +
  3.1568 +    /* Success! */
  3.1569 +    rc = 0;
  3.1570 +
  3.1571 + out:
  3.1572 +
  3.1573 +    if ( live )
  3.1574 +    {
  3.1575 +        if ( xc_shadow_control(xc_handle, dom, 
  3.1576 +                               XEN_DOMCTL_SHADOW_OP_OFF,
  3.1577 +                               NULL, 0, NULL, 0, NULL) < 0 )
  3.1578 +            DPRINTF("Warning - couldn't disable shadow mode");
  3.1579 +    }
  3.1580 +
  3.1581 +    /* Flush last write and discard cache for file. */
  3.1582 +    discard_file_cache(io_fd, 1 /* flush */);
  3.1583 +
  3.1584 +    if ( live_shinfo )
  3.1585 +        munmap(live_shinfo, PAGE_SIZE);
  3.1586 +
  3.1587 +    if ( live_p2m )
  3.1588 +        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
  3.1589 +
  3.1590 +    if ( live_m2p )
  3.1591 +        munmap(live_m2p, M2P_SIZE(max_mfn));
  3.1592 +
  3.1593 +    free(pfn_type);
  3.1594 +    free(pfn_batch);
  3.1595 +    free(to_send);
  3.1596 +    free(to_fix);
  3.1597 +    free(to_skip);
  3.1598 +
  3.1599 +    DPRINTF("Save exit rc=%d\n",rc);
  3.1600 +
  3.1601 +    return !!rc;
  3.1602 +}
  3.1603 +
  3.1604 +/*
  3.1605 + * Local variables:
  3.1606 + * mode: C
  3.1607 + * c-set-style: "BSD"
  3.1608 + * c-basic-offset: 4
  3.1609 + * tab-width: 4
  3.1610 + * indent-tabs-mode: nil
  3.1611 + * End:
  3.1612 + */
     4.1 --- a/tools/libxc/xc_hvm_save.c	Wed Apr 11 09:29:00 2007 +0100
     4.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.3 @@ -1,755 +0,0 @@
     4.4 -/******************************************************************************
     4.5 - * xc_hvm_save.c
     4.6 - *
     4.7 - * Save the state of a running HVM guest.
     4.8 - *
     4.9 - * Copyright (c) 2003, K A Fraser.
    4.10 - * Copyright (c) 2006 Intel Corperation
    4.11 - * rewriten for hvm guest by Zhai Edwin <edwin.zhai@intel.com>
    4.12 - *
    4.13 - * This program is free software; you can redistribute it and/or modify it
    4.14 - * under the terms and conditions of the GNU General Public License,
    4.15 - * version 2, as published by the Free Software Foundation.
    4.16 - *
    4.17 - * This program is distributed in the hope it will be useful, but WITHOUT
    4.18 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    4.19 - * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
    4.20 - * more details.
    4.21 - *
    4.22 - * You should have received a copy of the GNU General Public License along with
    4.23 - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
    4.24 - * Place - Suite 330, Boston, MA 02111-1307 USA.
    4.25 - *
    4.26 - */
    4.27 -
    4.28 -#include <inttypes.h>
    4.29 -#include <time.h>
    4.30 -#include <stdlib.h>
    4.31 -#include <unistd.h>
    4.32 -#include <sys/time.h>
    4.33 -
    4.34 -#include "xc_private.h"
    4.35 -#include "xg_private.h"
    4.36 -#include "xg_save_restore.h"
    4.37 -
    4.38 -#include <xen/hvm/e820.h>
    4.39 -#include <xen/hvm/params.h>
    4.40 -
    4.41 -/*
    4.42 -** Default values for important tuning parameters. Can override by passing
    4.43 -** non-zero replacement values to xc_hvm_save().
    4.44 -**
    4.45 -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
    4.46 -**
    4.47 -*/
    4.48 -#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
    4.49 -#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
    4.50 -
    4.51 -/* Shared-memory bitmaps for getting log-dirty bits from qemu */
    4.52 -static unsigned long *qemu_bitmaps[2];
    4.53 -static int qemu_active;
    4.54 -static int qemu_non_active;
    4.55 -
    4.56 -/*
    4.57 -** During (live) save/migrate, we maintain a number of bitmaps to track
    4.58 -** which pages we have to send, to fixup, and to skip.
    4.59 -*/
    4.60 -
    4.61 -#define BITS_PER_LONG (sizeof(unsigned long) * 8)
    4.62 -#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
    4.63 -#define BITMAP_SIZE   (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long))
    4.64 -
    4.65 -#define BITMAP_ENTRY(_nr,_bmap) \
    4.66 -   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
    4.67 -
    4.68 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
    4.69 -
    4.70 -static inline int test_bit (int nr, volatile void * addr)
    4.71 -{
    4.72 -    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
    4.73 -}
    4.74 -
    4.75 -static inline void clear_bit (int nr, volatile void * addr)
    4.76 -{
    4.77 -    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
    4.78 -}
    4.79 -
    4.80 -static inline int permute( int i, int nr, int order_nr  )
    4.81 -{
    4.82 -    /* Need a simple permutation function so that we scan pages in a
    4.83 -       pseudo random order, enabling us to get a better estimate of
    4.84 -       the domain's page dirtying rate as we go (there are often
    4.85 -       contiguous ranges of pfns that have similar behaviour, and we
    4.86 -       want to mix them up. */
    4.87 -
    4.88 -    /* e.g. nr->oder 15->4 16->4 17->5 */
    4.89 -    /* 512MB domain, 128k pages, order 17 */
    4.90 -
    4.91 -    /*
    4.92 -      QPONMLKJIHGFEDCBA
    4.93 -             QPONMLKJIH
    4.94 -      GFEDCBA
    4.95 -     */
    4.96 -
    4.97 -    /*
    4.98 -      QPONMLKJIHGFEDCBA
    4.99 -                  EDCBA
   4.100 -             QPONM
   4.101 -      LKJIHGF
   4.102 -      */
   4.103 -
   4.104 -    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
   4.105 -    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
   4.106 -
   4.107 -    return i;
   4.108 -}
   4.109 -
   4.110 -
   4.111 -static uint64_t tv_to_us(struct timeval *new)
   4.112 -{
   4.113 -    return (new->tv_sec * 1000000) + new->tv_usec;
   4.114 -}
   4.115 -
   4.116 -static uint64_t llgettimeofday(void)
   4.117 -{
   4.118 -    struct timeval now;
   4.119 -    gettimeofday(&now, NULL);
   4.120 -    return tv_to_us(&now);
   4.121 -}
   4.122 -
   4.123 -static uint64_t tv_delta(struct timeval *new, struct timeval *old)
   4.124 -{
   4.125 -    return (((new->tv_sec - old->tv_sec)*1000000) +
   4.126 -            (new->tv_usec - old->tv_usec));
   4.127 -}
   4.128 -
   4.129 -
   4.130 -#define RATE_IS_MAX() (0)
   4.131 -#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
   4.132 -#define initialize_mbit_rate()
   4.133 -
   4.134 -static inline ssize_t write_exact(int fd, void *buf, size_t count)
   4.135 -{
   4.136 -    return (write(fd, buf, count) == count);
   4.137 -}
   4.138 -
   4.139 -static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
   4.140 -                       xc_shadow_op_stats_t *stats, int print)
   4.141 -{
   4.142 -    static struct timeval wall_last;
   4.143 -    static long long      d0_cpu_last;
   4.144 -    static long long      d1_cpu_last;
   4.145 -
   4.146 -    struct timeval        wall_now;
   4.147 -    long long             wall_delta;
   4.148 -    long long             d0_cpu_now, d0_cpu_delta;
   4.149 -    long long             d1_cpu_now, d1_cpu_delta;
   4.150 -
   4.151 -    gettimeofday(&wall_now, NULL);
   4.152 -
   4.153 -    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   4.154 -    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   4.155 -
   4.156 -    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
   4.157 -        DPRINTF("ARRHHH!!\n");
   4.158 -
   4.159 -    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
   4.160 -    if ( wall_delta == 0 )
   4.161 -        wall_delta = 1;
   4.162 -
   4.163 -    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
   4.164 -    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
   4.165 -
   4.166 -    if ( print )
   4.167 -        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
   4.168 -                "dirtied %dMb/s %" PRId32 " pages\n",
   4.169 -                wall_delta,
   4.170 -                (int)((d0_cpu_delta*100)/wall_delta),
   4.171 -                (int)((d1_cpu_delta*100)/wall_delta),
   4.172 -                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
   4.173 -                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
   4.174 -                stats->dirty_count);
   4.175 -
   4.176 -    d0_cpu_last = d0_cpu_now;
   4.177 -    d1_cpu_last = d1_cpu_now;
   4.178 -    wall_last   = wall_now;
   4.179 -
   4.180 -    return 0;
   4.181 -}
   4.182 -
   4.183 -static int analysis_phase(int xc_handle, uint32_t domid, int pfn_array_size,
   4.184 -                          unsigned long *arr, int runs)
   4.185 -{
   4.186 -    long long start, now;
   4.187 -    xc_shadow_op_stats_t stats;
   4.188 -    int j;
   4.189 -
   4.190 -    start = llgettimeofday();
   4.191 -
   4.192 -    for ( j = 0; j < runs; j++ )
   4.193 -    {
   4.194 -        int i;
   4.195 -
   4.196 -        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
   4.197 -                          arr, pfn_array_size, NULL, 0, NULL);
   4.198 -        DPRINTF("#Flush\n");
   4.199 -        for ( i = 0; i < 40; i++ )
   4.200 -        {
   4.201 -            usleep(50000);
   4.202 -            now = llgettimeofday();
   4.203 -            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
   4.204 -                              NULL, 0, NULL, 0, &stats);
   4.205 -            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
   4.206 -                    ((now-start)+500)/1000,
   4.207 -                    stats.fault_count, stats.dirty_count);
   4.208 -        }
   4.209 -    }
   4.210 -
   4.211 -    return -1;
   4.212 -}
   4.213 -
   4.214 -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
   4.215 -                             int dom, xc_dominfo_t *info,
   4.216 -                             vcpu_guest_context_t *ctxt)
   4.217 -{
   4.218 -    int i = 0;
   4.219 -
   4.220 -    if ( !(*suspend)(dom) )
   4.221 -    {
   4.222 -        ERROR("Suspend request failed");
   4.223 -        return -1;
   4.224 -    }
   4.225 -
   4.226 - retry:
   4.227 -
   4.228 -    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
   4.229 -    {
   4.230 -        ERROR("Could not get domain info");
   4.231 -        return -1;
   4.232 -    }
   4.233 -
   4.234 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
   4.235 -        ERROR("Could not get vcpu context");
   4.236 -
   4.237 -    if ( info->shutdown && (info->shutdown_reason == SHUTDOWN_suspend) )
   4.238 -        return 0; /* success */
   4.239 -
   4.240 -    if ( info->paused )
   4.241 -    {
   4.242 -        /* Try unpausing domain, wait, and retest. */
   4.243 -        xc_domain_unpause( xc_handle, dom );
   4.244 -        ERROR("Domain was paused. Wait and re-test.");
   4.245 -        usleep(10000);  /* 10ms */
   4.246 -        goto retry;
   4.247 -    }
   4.248 -
   4.249 -    if ( ++i < 100 )
   4.250 -    {
   4.251 -        ERROR("Retry suspend domain.");
   4.252 -        usleep(10000); /* 10ms */
   4.253 -        goto retry;
   4.254 -    }
   4.255 -
   4.256 -    ERROR("Unable to suspend domain.");
   4.257 -
   4.258 -    return -1;
   4.259 -}
   4.260 -
   4.261 -int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   4.262 -                uint32_t max_factor, uint32_t flags, int (*suspend)(int),
   4.263 -                void *(*init_qemu_maps)(int, unsigned), 
   4.264 -                void (*qemu_flip_buffer)(int, int))
   4.265 -{
   4.266 -    xc_dominfo_t info;
   4.267 -
   4.268 -    int rc = 1, i, j, last_iter, iter = 0;
   4.269 -    int live  = !!(flags & XCFLAGS_LIVE);
   4.270 -    int debug = !!(flags & XCFLAGS_DEBUG);
   4.271 -    int sent_last_iter, skip_this_iter;
   4.272 -
   4.273 -    /* The highest guest-physical frame number used by the current guest */
   4.274 -    unsigned long max_pfn;
   4.275 -
   4.276 -    /* The size of an array big enough to contain all guest pfns */
   4.277 -    unsigned long pfn_array_size;
   4.278 -
   4.279 -    /* Magic frames: ioreqs and xenstore comms. */
   4.280 -    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
   4.281 -
   4.282 -    /* A copy of the CPU context of the guest. */
   4.283 -    vcpu_guest_context_t ctxt;
   4.284 -
   4.285 -    /* A table containg the PFNs (/not/ MFN!) to map. */
   4.286 -    xen_pfn_t *pfn_batch = NULL;
   4.287 -
   4.288 -    /* A copy of hvm domain context buffer*/
   4.289 -    uint32_t hvm_buf_size;
   4.290 -    uint8_t *hvm_buf = NULL;
   4.291 -
   4.292 -    /* base of the region in which domain memory is mapped */
   4.293 -    unsigned char *region_base = NULL;
   4.294 -
   4.295 -    uint32_t rec_size, nr_vcpus;
   4.296 -
   4.297 -    /* power of 2 order of pfn_array_size */
   4.298 -    int order_nr;
   4.299 -
   4.300 -    /* bitmap of pages:
   4.301 -       - that should be sent this iteration (unless later marked as skip);
   4.302 -       - to skip this iteration because already dirty; */
   4.303 -    unsigned long *to_send = NULL, *to_skip = NULL;
   4.304 -
   4.305 -    xc_shadow_op_stats_t stats;
   4.306 -
   4.307 -    unsigned long total_sent = 0;
   4.308 -
   4.309 -    uint64_t vcpumap = 1ULL;
   4.310 -
   4.311 -    DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
   4.312 -            "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags,
   4.313 -            live, debug);
   4.314 -    
   4.315 -    /* If no explicit control parameters given, use defaults */
   4.316 -    max_iters  = max_iters  ? : DEF_MAX_ITERS;
   4.317 -    max_factor = max_factor ? : DEF_MAX_FACTOR;
   4.318 -
   4.319 -    initialize_mbit_rate();
   4.320 -
   4.321 -    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
   4.322 -    {
   4.323 -        ERROR("HVM: Could not get domain info");
   4.324 -        return 1;
   4.325 -    }
   4.326 -    nr_vcpus = info.nr_online_vcpus;
   4.327 -
   4.328 -    if ( mlock(&ctxt, sizeof(ctxt)) )
   4.329 -    {
   4.330 -        ERROR("HVM: Unable to mlock ctxt");
   4.331 -        return 1;
   4.332 -    }
   4.333 -
   4.334 -    /* Only have to worry about vcpu 0 even for SMP */
   4.335 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
   4.336 -    {
   4.337 -        ERROR("HVM: Could not get vcpu context");
   4.338 -        goto out;
   4.339 -    }
   4.340 -
   4.341 -    DPRINTF("saved hvm domain info: max_memkb=0x%lx, nr_pages=0x%lx\n",
   4.342 -            info.max_memkb, info.nr_pages); 
   4.343 -
   4.344 -    if ( live )
   4.345 -    {
   4.346 -        /* Live suspend. Enable log-dirty mode. */
   4.347 -        if ( xc_shadow_control(xc_handle, dom,
   4.348 -                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
   4.349 -                               NULL, 0, NULL, 0, NULL) < 0 )
   4.350 -        {
   4.351 -            ERROR("Couldn't enable shadow mode");
   4.352 -            goto out;
   4.353 -        }
   4.354 -    }
   4.355 -    else
   4.356 -    {
   4.357 -        /* This is a non-live suspend. Suspend the domain .*/
   4.358 -        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
   4.359 -        {
   4.360 -            ERROR("HVM Domain appears not to have suspended");
   4.361 -            goto out;
   4.362 -        }
   4.363 -    }
   4.364 -
   4.365 -    last_iter = !live;
   4.366 -
   4.367 -    max_pfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
   4.368 -
   4.369 -    DPRINTF("after 1st handle hvm domain max_pfn=0x%lx, "
   4.370 -            "max_memkb=0x%lx, live=%d.\n",
   4.371 -            max_pfn, info.max_memkb, live);
   4.372 -
   4.373 -    /* Size of any array that covers 0 ... max_pfn */
   4.374 -    pfn_array_size = max_pfn + 1;
   4.375 -    if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) )
   4.376 -    {
   4.377 -        ERROR("Error when writing to state file (1)");
   4.378 -        goto out;
   4.379 -    }
   4.380 -
   4.381 -    /* pretend we sent all the pages last iteration */
   4.382 -    sent_last_iter = pfn_array_size;
   4.383 -
   4.384 -    /* calculate the power of 2 order of pfn_array_size, e.g.
   4.385 -       15->4 16->4 17->5 */
   4.386 -    for ( i = pfn_array_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
   4.387 -        continue;
   4.388 -
   4.389 -    /* Setup to_send / to_fix and to_skip bitmaps */
   4.390 -    to_send = malloc(BITMAP_SIZE);
   4.391 -    to_skip = malloc(BITMAP_SIZE);
   4.392 -
   4.393 -    if ( live )
   4.394 -    {
   4.395 -        /* Get qemu-dm logging dirty pages too */
   4.396 -        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
   4.397 -        qemu_bitmaps[0] = seg;
   4.398 -        qemu_bitmaps[1] = seg + BITMAP_SIZE;
   4.399 -        qemu_active = 0;
   4.400 -        qemu_non_active = 1;
   4.401 -    }
   4.402 -
   4.403 -    hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
   4.404 -    if ( hvm_buf_size == -1 )
   4.405 -    {
   4.406 -        ERROR("Couldn't get HVM context size from Xen");
   4.407 -        goto out;
   4.408 -    }
   4.409 -    hvm_buf = malloc(hvm_buf_size);
   4.410 -
   4.411 -    if ( !to_send || !to_skip || !hvm_buf )
   4.412 -    {
   4.413 -        ERROR("Couldn't allocate memory");
   4.414 -        goto out;
   4.415 -    }
   4.416 -
   4.417 -    memset(to_send, 0xff, BITMAP_SIZE);
   4.418 -
   4.419 -    if ( lock_pages(to_send, BITMAP_SIZE) )
   4.420 -    {
   4.421 -        ERROR("Unable to lock to_send");
   4.422 -        return 1;
   4.423 -    }
   4.424 -
   4.425 -    /* (to fix is local only) */
   4.426 -    if ( lock_pages(to_skip, BITMAP_SIZE) )
   4.427 -    {
   4.428 -        ERROR("Unable to lock to_skip");
   4.429 -        return 1;
   4.430 -    }
   4.431 -
   4.432 -    analysis_phase(xc_handle, dom, pfn_array_size, to_skip, 0);
   4.433 -
   4.434 -    /* We want zeroed memory so use calloc rather than malloc. */
   4.435 -    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
   4.436 -    if ( pfn_batch == NULL )
   4.437 -    {
   4.438 -        ERROR("failed to alloc memory for pfn_batch array");
   4.439 -        errno = ENOMEM;
   4.440 -        goto out;
   4.441 -    }
   4.442 -
   4.443 -    for ( ; ; )
   4.444 -    {
   4.445 -        unsigned int prev_pc, sent_this_iter, N, batch;
   4.446 -
   4.447 -        iter++;
   4.448 -        sent_this_iter = 0;
   4.449 -        skip_this_iter = 0;
   4.450 -        prev_pc = 0;
   4.451 -        N=0;
   4.452 -
   4.453 -        DPRINTF("Saving memory pages: iter %d   0%%", iter);
   4.454 -
   4.455 -        while ( N < pfn_array_size )
   4.456 -        {
   4.457 -            unsigned int this_pc = (N * 100) / pfn_array_size;
   4.458 -            int rc;
   4.459 -
   4.460 -            if ( (this_pc - prev_pc) >= 5 )
   4.461 -            {
   4.462 -                DPRINTF("\b\b\b\b%3d%%", this_pc);
   4.463 -                prev_pc = this_pc;
   4.464 -            }
   4.465 -
   4.466 -            if ( !last_iter )
   4.467 -            {
   4.468 -                /* Slightly wasteful to peek the whole array evey time,
   4.469 -                   but this is fast enough for the moment. */
   4.470 -                rc = xc_shadow_control(
   4.471 -                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
   4.472 -                    pfn_array_size, NULL, 0, NULL);
   4.473 -                if ( rc != pfn_array_size )
   4.474 -                {
   4.475 -                    ERROR("Error peeking shadow bitmap");
   4.476 -                    goto out;
   4.477 -                }
   4.478 -            }
   4.479 -
   4.480 -            /* load pfn_batch[] with the mfn of all the pages we're doing in
   4.481 -               this batch. */
   4.482 -            for ( batch = 0;
   4.483 -                  (batch < MAX_BATCH_SIZE) && (N < pfn_array_size);
   4.484 -                  N++ )
   4.485 -            {
   4.486 -                int n = permute(N, pfn_array_size, order_nr);
   4.487 -
   4.488 -                if ( 0 && debug )
   4.489 -                    DPRINTF("%d pfn= %08lx %d \n",
   4.490 -                            iter, (unsigned long)n, test_bit(n, to_send));
   4.491 -
   4.492 -                if ( !last_iter &&
   4.493 -                     test_bit(n, to_send) &&
   4.494 -                     test_bit(n, to_skip) )
   4.495 -                    skip_this_iter++; /* stats keeping */
   4.496 -
   4.497 -                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
   4.498 -                       (test_bit(n, to_send) && last_iter)) )
   4.499 -                    continue;
   4.500 -
   4.501 -                /* Skip PFNs that aren't really there */
   4.502 -                if ( (n >= 0xa0 && n < 0xc0) /* VGA hole */
   4.503 -                     || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) &&
   4.504 -                         n < (1ULL << 32) >> PAGE_SHIFT) /* 4G MMIO hole */ )
   4.505 -                    continue;
   4.506 -
   4.507 -                /*
   4.508 -                ** we get here if:
   4.509 -                **  1. page is marked to_send & hasn't already been re-dirtied
   4.510 -                **  2. (ignore to_skip in last iteration)
   4.511 -                */
   4.512 -
   4.513 -                pfn_batch[batch] = n;
   4.514 -
   4.515 -                batch++;
   4.516 -            }
   4.517 -
   4.518 -            if ( batch == 0 )
   4.519 -                goto skip; /* vanishingly unlikely... */
   4.520 -
   4.521 -            region_base = xc_map_foreign_batch(
   4.522 -                xc_handle, dom, PROT_READ, pfn_batch, batch);
   4.523 -            if ( region_base == 0 )
   4.524 -            {
   4.525 -                ERROR("map batch failed");
   4.526 -                goto out;
   4.527 -            }
   4.528 -
   4.529 -            /* write num of pfns */
   4.530 -            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
   4.531 -            {
   4.532 -                ERROR("Error when writing to state file (2)");
   4.533 -                goto out;
   4.534 -            }
   4.535 -
   4.536 -            /* write all the pfns */
   4.537 -            if ( !write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch) )
   4.538 -            {
   4.539 -                ERROR("Error when writing to state file (3)");
   4.540 -                goto out;
   4.541 -            }
   4.542 -
   4.543 -            for ( j = 0; j < batch; j++ )
   4.544 -            {
   4.545 -                if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK )
   4.546 -                    continue;
   4.547 -                if ( ratewrite(io_fd, region_base + j*PAGE_SIZE,
   4.548 -                               PAGE_SIZE) != PAGE_SIZE )
   4.549 -                {
   4.550 -                    ERROR("ERROR when writing to state file (4)");
   4.551 -                    goto out;
   4.552 -                }
   4.553 -            }
   4.554 -
   4.555 -            sent_this_iter += batch;
   4.556 -
   4.557 -            munmap(region_base, batch*PAGE_SIZE);
   4.558 -
   4.559 -        } /* end of this while loop for this iteration */
   4.560 -
   4.561 -      skip:
   4.562 -
   4.563 -        total_sent += sent_this_iter;
   4.564 -
   4.565 -        DPRINTF("\r %d: sent %d, skipped %d, ",
   4.566 -                iter, sent_this_iter, skip_this_iter );
   4.567 -
   4.568 -        if ( last_iter )
   4.569 -        {
   4.570 -            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
   4.571 -            DPRINTF("Total pages sent= %ld (%.2fx)\n",
   4.572 -                    total_sent, ((float)total_sent)/pfn_array_size );
   4.573 -        }
   4.574 -
   4.575 -        if ( last_iter && debug )
   4.576 -        {
   4.577 -            int minusone = -1;
   4.578 -            memset(to_send, 0xff, BITMAP_SIZE);
   4.579 -            debug = 0;
   4.580 -            DPRINTF("Entering debug resend-all mode\n");
   4.581 -
   4.582 -            /* send "-1" to put receiver into debug mode */
   4.583 -            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
   4.584 -            {
   4.585 -                ERROR("Error when writing to state file (6)");
   4.586 -                goto out;
   4.587 -            }
   4.588 -
   4.589 -            continue;
   4.590 -        }
   4.591 -
   4.592 -        if ( last_iter )
   4.593 -            break;
   4.594 -
   4.595 -        if ( live )
   4.596 -        {
   4.597 -            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
   4.598 -                 (iter >= max_iters) ||
   4.599 -                 (sent_this_iter+skip_this_iter < 50) ||
   4.600 -                 (total_sent > pfn_array_size*max_factor) )
   4.601 -            {
   4.602 -                DPRINTF("Start last iteration for HVM domain\n");
   4.603 -                last_iter = 1;
   4.604 -
   4.605 -                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
   4.606 -                                       &ctxt))
   4.607 -                {
   4.608 -                    ERROR("Domain appears not to have suspended");
   4.609 -                    goto out;
   4.610 -                }
   4.611 -
   4.612 -                DPRINTF("SUSPEND eip %08lx edx %08lx\n",
   4.613 -                        (unsigned long)ctxt.user_regs.eip,
   4.614 -                        (unsigned long)ctxt.user_regs.edx);
   4.615 -            }
   4.616 -
   4.617 -            if ( xc_shadow_control(xc_handle, dom, 
   4.618 -                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
   4.619 -                                   pfn_array_size, NULL, 
   4.620 -                                   0, &stats) != pfn_array_size )
   4.621 -            {
   4.622 -                ERROR("Error flushing shadow PT");
   4.623 -                goto out;
   4.624 -            }
   4.625 -
   4.626 -            /* Pull in the dirty bits from qemu too */
   4.627 -            if ( !last_iter )
   4.628 -            {
   4.629 -                qemu_active = qemu_non_active;
   4.630 -                qemu_non_active = qemu_active ? 0 : 1;
   4.631 -                qemu_flip_buffer(dom, qemu_active);
   4.632 -                for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
   4.633 -                {
   4.634 -                    to_send[j] |= qemu_bitmaps[qemu_non_active][j];
   4.635 -                    qemu_bitmaps[qemu_non_active][j] = 0;
   4.636 -                }
   4.637 -            }
   4.638 -            else
   4.639 -            {
   4.640 -                for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
   4.641 -                    to_send[j] |= qemu_bitmaps[qemu_active][j];
   4.642 -            }
   4.643 -
   4.644 -            sent_last_iter = sent_this_iter;
   4.645 -
   4.646 -            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
   4.647 -        }
   4.648 -    } /* end of while 1 */
   4.649 -
   4.650 -
   4.651 -    DPRINTF("All HVM memory is saved\n");
   4.652 -
   4.653 -    {
   4.654 -        struct {
   4.655 -            int minustwo;
   4.656 -            int max_vcpu_id;
   4.657 -            uint64_t vcpumap;
   4.658 -        } chunk = { -2, info.max_vcpu_id };
   4.659 -
   4.660 -        if (info.max_vcpu_id >= 64) {
   4.661 -            ERROR("Too many VCPUS in guest!");
   4.662 -            goto out;
   4.663 -        }
   4.664 -
   4.665 -        for (i = 1; i <= info.max_vcpu_id; i++) {
   4.666 -            xc_vcpuinfo_t vinfo;
   4.667 -            if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
   4.668 -                vinfo.online)
   4.669 -                vcpumap |= 1ULL << i;
   4.670 -        }
   4.671 -
   4.672 -        chunk.vcpumap = vcpumap;
   4.673 -        if(!write_exact(io_fd, &chunk, sizeof(chunk))) {
   4.674 -            ERROR("Error when writing to state file (errno %d)", errno);
   4.675 -            goto out;
   4.676 -        }
   4.677 -    }
   4.678 -
   4.679 -    /* Zero terminate */
   4.680 -    i = 0;
   4.681 -    if ( !write_exact(io_fd, &i, sizeof(int)) )
   4.682 -    {
   4.683 -        ERROR("Error when writing to state file (6)");
   4.684 -        goto out;
   4.685 -    }
   4.686 -
   4.687 -    /* Save magic-page locations. */
   4.688 -    memset(magic_pfns, 0, sizeof(magic_pfns));
   4.689 -    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
   4.690 -                     (unsigned long *)&magic_pfns[0]);
   4.691 -    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
   4.692 -                     (unsigned long *)&magic_pfns[1]);
   4.693 -    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
   4.694 -                     (unsigned long *)&magic_pfns[2]);
   4.695 -    if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
   4.696 -    {
   4.697 -        ERROR("Error when writing to state file (7)");
   4.698 -        goto out;
   4.699 -    }
   4.700 -
   4.701 -    /* save vcpu/vmcs contexts */
   4.702 -    for ( i = 0; i < nr_vcpus; i++ )
   4.703 -    {
   4.704 -        if ( !(vcpumap & (1ULL << i)) )
   4.705 -            continue;
   4.706 -
   4.707 -        if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
   4.708 -        {
   4.709 -            ERROR("HVM:Could not get vcpu context");
   4.710 -            goto out;
   4.711 -        }
   4.712 -
   4.713 -        DPRINTF("write vcpu %d context.\n", i); 
   4.714 -        if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
   4.715 -        {
   4.716 -            ERROR("write vcpu context failed!\n");
   4.717 -            goto out;
   4.718 -        }
   4.719 -    }
   4.720 -
   4.721 -    if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 
   4.722 -                                              hvm_buf_size)) == -1 )
   4.723 -    {
   4.724 -        ERROR("HVM:Could not get hvm buffer");
   4.725 -        goto out;
   4.726 -    }
   4.727 -
   4.728 -    if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
   4.729 -    {
   4.730 -        ERROR("error write hvm buffer size");
   4.731 -        goto out;
   4.732 -    }
   4.733 -
   4.734 -    if ( !write_exact(io_fd, hvm_buf, rec_size) )
   4.735 -    {
   4.736 -        ERROR("write HVM info failed!\n");
   4.737 -        goto out;
   4.738 -    }
   4.739 -
   4.740 -    /* Success! */
   4.741 -    rc = 0;
   4.742 -
   4.743 - out:
   4.744 -
   4.745 -    if ( live )
   4.746 -    {
   4.747 -        if ( xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
   4.748 -                               NULL, 0, NULL, 0, NULL) < 0 )
   4.749 -            DPRINTF("Warning - couldn't disable shadow mode");
   4.750 -    }
   4.751 -
   4.752 -    free(hvm_buf);
   4.753 -    free(pfn_batch);
   4.754 -    free(to_send);
   4.755 -    free(to_skip);
   4.756 -
   4.757 -    return !!rc;
   4.758 -}
     5.1 --- a/tools/libxc/xc_linux_save.c	Wed Apr 11 09:29:00 2007 +0100
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,1414 +0,0 @@
     5.4 -/******************************************************************************
     5.5 - * xc_linux_save.c
     5.6 - *
     5.7 - * Save the state of a running Linux session.
     5.8 - *
     5.9 - * Copyright (c) 2003, K A Fraser.
    5.10 - */
    5.11 -
    5.12 -#include <inttypes.h>
    5.13 -#include <time.h>
    5.14 -#include <stdlib.h>
    5.15 -#include <unistd.h>
    5.16 -#include <sys/time.h>
    5.17 -
    5.18 -#include "xc_private.h"
    5.19 -#include "xc_dom.h"
    5.20 -#include "xg_private.h"
    5.21 -#include "xg_save_restore.h"
    5.22 -
    5.23 -/*
    5.24 -** Default values for important tuning parameters. Can override by passing
    5.25 -** non-zero replacement values to xc_linux_save().
    5.26 -**
    5.27 -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
    5.28 -**
    5.29 -*/
    5.30 -#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
    5.31 -#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
    5.32 -
    5.33 -/* max mfn of the whole machine */
    5.34 -static unsigned long max_mfn;
    5.35 -
    5.36 -/* virtual starting address of the hypervisor */
    5.37 -static unsigned long hvirt_start;
    5.38 -
    5.39 -/* #levels of page tables used by the current guest */
    5.40 -static unsigned int pt_levels;
    5.41 -
    5.42 -/* number of pfns this guest has (i.e. number of entries in the P2M) */
    5.43 -static unsigned long p2m_size;
    5.44 -
    5.45 -/* Live mapping of the table mapping each PFN to its current MFN. */
    5.46 -static xen_pfn_t *live_p2m = NULL;
    5.47 -
    5.48 -/* Live mapping of system MFN to PFN table. */
    5.49 -static xen_pfn_t *live_m2p = NULL;
    5.50 -static unsigned long m2p_mfn0;
    5.51 -
    5.52 -/* grep fodder: machine_to_phys */
    5.53 -
    5.54 -#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
    5.55 -
    5.56 -/*
    5.57 - * Returns TRUE if the given machine frame number has a unique mapping
    5.58 - * in the guest's pseudophysical map.
    5.59 - */
    5.60 -#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    5.61 -    (((_mfn) < (max_mfn)) &&                    \
    5.62 -     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
    5.63 -      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
    5.64 -
    5.65 -/* Returns TRUE if MFN is successfully converted to a PFN. */
    5.66 -#define translate_mfn_to_pfn(_pmfn)                             \
    5.67 -({                                                              \
    5.68 -    unsigned long mfn = *(_pmfn);                               \
    5.69 -    int _res = 1;                                               \
    5.70 -    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
    5.71 -        _res = 0;                                               \
    5.72 -    else                                                        \
    5.73 -        *(_pmfn) = mfn_to_pfn(mfn);                             \
    5.74 -    _res;                                                       \
    5.75 -})
    5.76 -
    5.77 -/*
    5.78 -** During (live) save/migrate, we maintain a number of bitmaps to track
    5.79 -** which pages we have to send, to fixup, and to skip.
    5.80 -*/
    5.81 -
    5.82 -#define BITS_PER_LONG (sizeof(unsigned long) * 8)
    5.83 -#define BITMAP_SIZE   ((p2m_size + BITS_PER_LONG - 1) / 8)
    5.84 -
    5.85 -#define BITMAP_ENTRY(_nr,_bmap) \
    5.86 -   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
    5.87 -
    5.88 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
    5.89 -
    5.90 -static inline int test_bit (int nr, volatile void * addr)
    5.91 -{
    5.92 -    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
    5.93 -}
    5.94 -
    5.95 -static inline void clear_bit (int nr, volatile void * addr)
    5.96 -{
    5.97 -    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
    5.98 -}
    5.99 -
   5.100 -static inline void set_bit ( int nr, volatile void * addr)
   5.101 -{
   5.102 -    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
   5.103 -}
   5.104 -
   5.105 -/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
   5.106 -static inline unsigned int hweight32(unsigned int w)
   5.107 -{
   5.108 -    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
   5.109 -    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
   5.110 -    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
   5.111 -    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
   5.112 -    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
   5.113 -}
   5.114 -
   5.115 -static inline int count_bits ( int nr, volatile void *addr)
   5.116 -{
   5.117 -    int i, count = 0;
   5.118 -    volatile unsigned long *p = (volatile unsigned long *)addr;
   5.119 -    /* We know that the array is padded to unsigned long. */
   5.120 -    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
   5.121 -        count += hweight32(*p);
   5.122 -    return count;
   5.123 -}
   5.124 -
   5.125 -static inline int permute( int i, int nr, int order_nr  )
   5.126 -{
   5.127 -    /* Need a simple permutation function so that we scan pages in a
   5.128 -       pseudo random order, enabling us to get a better estimate of
   5.129 -       the domain's page dirtying rate as we go (there are often
   5.130 -       contiguous ranges of pfns that have similar behaviour, and we
   5.131 -       want to mix them up. */
   5.132 -
   5.133 -    /* e.g. nr->oder 15->4 16->4 17->5 */
   5.134 -    /* 512MB domain, 128k pages, order 17 */
   5.135 -
   5.136 -    /*
   5.137 -      QPONMLKJIHGFEDCBA
   5.138 -             QPONMLKJIH
   5.139 -      GFEDCBA
   5.140 -     */
   5.141 -
   5.142 -    /*
   5.143 -      QPONMLKJIHGFEDCBA
   5.144 -                  EDCBA
   5.145 -             QPONM
   5.146 -      LKJIHGF
   5.147 -      */
   5.148 -
   5.149 -    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
   5.150 -    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
   5.151 -
   5.152 -    return i;
   5.153 -}
   5.154 -
   5.155 -static uint64_t tv_to_us(struct timeval *new)
   5.156 -{
   5.157 -    return (new->tv_sec * 1000000) + new->tv_usec;
   5.158 -}
   5.159 -
   5.160 -static uint64_t llgettimeofday(void)
   5.161 -{
   5.162 -    struct timeval now;
   5.163 -    gettimeofday(&now, NULL);
   5.164 -    return tv_to_us(&now);
   5.165 -}
   5.166 -
   5.167 -static uint64_t tv_delta(struct timeval *new, struct timeval *old)
   5.168 -{
   5.169 -    return (((new->tv_sec - old->tv_sec)*1000000) +
   5.170 -            (new->tv_usec - old->tv_usec));
   5.171 -}
   5.172 -
   5.173 -static int noncached_write(int fd, int live, void *buffer, int len) 
   5.174 -{
   5.175 -    static int write_count = 0;
   5.176 -
   5.177 -    int rc = write(fd,buffer,len);
   5.178 -
   5.179 -    write_count += len;
   5.180 -    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
   5.181 -    {
   5.182 -        /* Time to discard cache - dont care if this fails */
   5.183 -        discard_file_cache(fd, 0 /* no flush */);
   5.184 -        write_count = 0;
   5.185 -    }
   5.186 -
   5.187 -    return rc;
   5.188 -}
   5.189 -
   5.190 -#ifdef ADAPTIVE_SAVE
   5.191 -
   5.192 -/*
   5.193 -** We control the rate at which we transmit (or save) to minimize impact
   5.194 -** on running domains (including the target if we're doing live migrate).
   5.195 -*/
   5.196 -
   5.197 -#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
   5.198 -#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
   5.199 -
   5.200 -/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
   5.201 -#define RATE_TO_BTU      781250
   5.202 -
   5.203 -/* Amount in bytes we allow ourselves to send in a burst */
   5.204 -#define BURST_BUDGET (100*1024)
   5.205 -
   5.206 -/* We keep track of the current and previous transmission rate */
   5.207 -static int mbit_rate, ombit_rate = 0;
   5.208 -
   5.209 -/* Have we reached the maximum transmission rate? */
   5.210 -#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
   5.211 -
   5.212 -static inline void initialize_mbit_rate()
   5.213 -{
   5.214 -    mbit_rate = START_MBIT_RATE;
   5.215 -}
   5.216 -
   5.217 -static int ratewrite(int io_fd, int live, void *buf, int n)
   5.218 -{
   5.219 -    static int budget = 0;
   5.220 -    static int burst_time_us = -1;
   5.221 -    static struct timeval last_put = { 0 };
   5.222 -    struct timeval now;
   5.223 -    struct timespec delay;
   5.224 -    long long delta;
   5.225 -
   5.226 -    if ( START_MBIT_RATE == 0 )
   5.227 -        return noncached_write(io_fd, live, buf, n);
   5.228 -
   5.229 -    budget -= n;
   5.230 -    if ( budget < 0 )
   5.231 -    {
   5.232 -        if ( mbit_rate != ombit_rate )
   5.233 -        {
   5.234 -            burst_time_us = RATE_TO_BTU / mbit_rate;
   5.235 -            ombit_rate = mbit_rate;
   5.236 -            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
   5.237 -                    mbit_rate, BURST_BUDGET, burst_time_us);
   5.238 -        }
   5.239 -        if ( last_put.tv_sec == 0 )
   5.240 -        {
   5.241 -            budget += BURST_BUDGET;
   5.242 -            gettimeofday(&last_put, NULL);
   5.243 -        }
   5.244 -        else
   5.245 -        {
   5.246 -            while ( budget < 0 )
   5.247 -            {
   5.248 -                gettimeofday(&now, NULL);
   5.249 -                delta = tv_delta(&now, &last_put);
   5.250 -                while ( delta > burst_time_us )
   5.251 -                {
   5.252 -                    budget += BURST_BUDGET;
   5.253 -                    last_put.tv_usec += burst_time_us;
    5.254 -                    if ( last_put.tv_usec > 1000000 )
   5.255 -                    {
   5.256 -                        last_put.tv_usec -= 1000000;
   5.257 -                        last_put.tv_sec++;
   5.258 -                    }
   5.259 -                    delta -= burst_time_us;
   5.260 -                }
   5.261 -                if ( budget > 0 )
   5.262 -                    break;
   5.263 -                delay.tv_sec = 0;
   5.264 -                delay.tv_nsec = 1000 * (burst_time_us - delta);
   5.265 -                while ( delay.tv_nsec > 0 )
   5.266 -                    if ( nanosleep(&delay, &delay) == 0 )
   5.267 -                        break;
   5.268 -            }
   5.269 -        }
   5.270 -    }
   5.271 -    return noncached_write(io_fd, live, buf, n);
   5.272 -}
   5.273 -
   5.274 -#else /* ! ADAPTIVE SAVE */
   5.275 -
   5.276 -#define RATE_IS_MAX() (0)
   5.277 -#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
   5.278 -#define initialize_mbit_rate()
   5.279 -
   5.280 -#endif
   5.281 -
   5.282 -static inline ssize_t write_exact(int fd, void *buf, size_t count)
   5.283 -{
   5.284 -    return (write(fd, buf, count) == count);
   5.285 -}
   5.286 -
   5.287 -static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
   5.288 -                       xc_shadow_op_stats_t *stats, int print)
   5.289 -{
   5.290 -    static struct timeval wall_last;
   5.291 -    static long long      d0_cpu_last;
   5.292 -    static long long      d1_cpu_last;
   5.293 -
   5.294 -    struct timeval        wall_now;
   5.295 -    long long             wall_delta;
   5.296 -    long long             d0_cpu_now, d0_cpu_delta;
   5.297 -    long long             d1_cpu_now, d1_cpu_delta;
   5.298 -
   5.299 -    gettimeofday(&wall_now, NULL);
   5.300 -
   5.301 -    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   5.302 -    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   5.303 -
   5.304 -    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
   5.305 -        DPRINTF("ARRHHH!!\n");
   5.306 -
   5.307 -    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
   5.308 -    if ( wall_delta == 0 )
   5.309 -        wall_delta = 1;
   5.310 -
   5.311 -    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
   5.312 -    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
   5.313 -
   5.314 -    if ( print )
   5.315 -        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
   5.316 -                "dirtied %dMb/s %" PRId32 " pages\n",
   5.317 -                wall_delta,
   5.318 -                (int)((d0_cpu_delta*100)/wall_delta),
   5.319 -                (int)((d1_cpu_delta*100)/wall_delta),
   5.320 -                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
   5.321 -                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
   5.322 -                stats->dirty_count);
   5.323 -
   5.324 -#ifdef ADAPTIVE_SAVE
   5.325 -    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
   5.326 -    {
   5.327 -        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
   5.328 -            + 50;
   5.329 -        if ( mbit_rate > MAX_MBIT_RATE )
   5.330 -            mbit_rate = MAX_MBIT_RATE;
   5.331 -    }
   5.332 -#endif
   5.333 -
   5.334 -    d0_cpu_last = d0_cpu_now;
   5.335 -    d1_cpu_last = d1_cpu_now;
   5.336 -    wall_last   = wall_now;
   5.337 -
   5.338 -    return 0;
   5.339 -}
   5.340 -
   5.341 -
   5.342 -static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
   5.343 -                          unsigned long *arr, int runs)
   5.344 -{
   5.345 -    long long start, now;
   5.346 -    xc_shadow_op_stats_t stats;
   5.347 -    int j;
   5.348 -
   5.349 -    start = llgettimeofday();
   5.350 -
   5.351 -    for ( j = 0; j < runs; j++ )
   5.352 -    {
   5.353 -        int i;
   5.354 -
   5.355 -        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
   5.356 -                          arr, p2m_size, NULL, 0, NULL);
   5.357 -        DPRINTF("#Flush\n");
   5.358 -        for ( i = 0; i < 40; i++ )
   5.359 -        {
   5.360 -            usleep(50000);
   5.361 -            now = llgettimeofday();
   5.362 -            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
   5.363 -                              NULL, 0, NULL, 0, &stats);
   5.364 -            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
   5.365 -                    ((now-start)+500)/1000,
   5.366 -                    stats.fault_count, stats.dirty_count);
   5.367 -        }
   5.368 -    }
   5.369 -
   5.370 -    return -1;
   5.371 -}
   5.372 -
   5.373 -
   5.374 -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
   5.375 -                             int dom, xc_dominfo_t *info,
   5.376 -                             vcpu_guest_context_t *ctxt)
   5.377 -{
   5.378 -    int i = 0;
   5.379 -
   5.380 -    if ( !(*suspend)(dom) )
   5.381 -    {
   5.382 -        ERROR("Suspend request failed");
   5.383 -        return -1;
   5.384 -    }
   5.385 -
   5.386 - retry:
   5.387 -
   5.388 -    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
   5.389 -    {
   5.390 -        ERROR("Could not get domain info");
   5.391 -        return -1;
   5.392 -    }
   5.393 -
   5.394 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
   5.395 -        ERROR("Could not get vcpu context");
   5.396 -
   5.397 -
   5.398 -    if ( info->dying )
   5.399 -    {
   5.400 -        ERROR("domain is dying");
   5.401 -        return -1;
   5.402 -    }
   5.403 -
   5.404 -    if ( info->crashed )
   5.405 -    {
   5.406 -        ERROR("domain has crashed");
   5.407 -        return -1;
   5.408 -    }
   5.409 -
   5.410 -    if ( info->shutdown )
   5.411 -    {
   5.412 -        switch ( info->shutdown_reason )
   5.413 -        {
   5.414 -        case SHUTDOWN_poweroff:
   5.415 -        case SHUTDOWN_reboot:
   5.416 -            ERROR("domain has shut down");
   5.417 -            return -1;
   5.418 -        case SHUTDOWN_suspend:
   5.419 -            return 0;
   5.420 -        case SHUTDOWN_crash:
   5.421 -            ERROR("domain has crashed");
   5.422 -            return -1;
   5.423 -        }
   5.424 -    }
   5.425 -
   5.426 -    if ( info->paused )
   5.427 -    {
   5.428 -        /* Try unpausing domain, wait, and retest. */
   5.429 -        xc_domain_unpause( xc_handle, dom );
   5.430 -        ERROR("Domain was paused. Wait and re-test.");
   5.431 -        usleep(10000); /* 10ms */
   5.432 -        goto retry;
   5.433 -    }
   5.434 -
   5.435 -    if ( ++i < 100 )
   5.436 -    {
   5.437 -        ERROR("Retry suspend domain");
   5.438 -        usleep(10000); /* 10ms */
   5.439 -        goto retry;
   5.440 -    }
   5.441 -
   5.442 -    ERROR("Unable to suspend domain.");
   5.443 -
   5.444 -    return -1;
   5.445 -}
   5.446 -
   5.447 -/*
   5.448 -** Map the top-level page of MFNs from the guest. The guest might not have
   5.449 -** finished resuming from a previous restore operation, so we wait a while for
   5.450 -** it to update the MFN to a reasonable value.
   5.451 -*/
   5.452 -static void *map_frame_list_list(int xc_handle, uint32_t dom,
   5.453 -                                 shared_info_t *shinfo)
   5.454 -{
   5.455 -    int count = 100;
   5.456 -    void *p;
   5.457 -
   5.458 -    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
   5.459 -        usleep(10000);
   5.460 -
   5.461 -    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
   5.462 -    {
   5.463 -        ERROR("Timed out waiting for frame list updated.");
   5.464 -        return NULL;
   5.465 -    }
   5.466 -
   5.467 -    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
   5.468 -                             shinfo->arch.pfn_to_mfn_frame_list_list);
   5.469 -    if ( p == NULL )
   5.470 -        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
   5.471 -
   5.472 -    return p;
   5.473 -}
   5.474 -
   5.475 -/*
   5.476 -** During transfer (or in the state file), all page-table pages must be
   5.477 -** converted into a 'canonical' form where references to actual mfns
   5.478 -** are replaced with references to the corresponding pfns.
   5.479 -**
   5.480 -** This function performs the appropriate conversion, taking into account
   5.481 -** which entries do not require canonicalization (in particular, those
   5.482 -** entries which map the virtual address reserved for the hypervisor).
   5.483 -*/
   5.484 -static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
   5.485 -                           const void *spage, void *dpage)
   5.486 -{
   5.487 -
   5.488 -    int i, pte_last, xen_start, xen_end, race = 0; 
   5.489 -    uint64_t pte;
   5.490 -
   5.491 -    /*
   5.492 -    ** We need to determine which entries in this page table hold
   5.493 -    ** reserved hypervisor mappings. This depends on the current
   5.494 -    ** page table type as well as the number of paging levels.
   5.495 -    */
   5.496 -    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
   5.497 -
   5.498 -    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
   5.499 -        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
   5.500 -
   5.501 -    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
   5.502 -        xen_start = L3_PAGETABLE_ENTRIES_PAE;
   5.503 -
   5.504 -    /*
   5.505 -    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
   5.506 -    ** We can spot this by looking for the guest linear mapping which
   5.507 -    ** Xen always ensures is present in that L2. Guests must ensure
   5.508 -    ** that this check will fail for other L2s.
   5.509 -    */
   5.510 -    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
   5.511 -    {
   5.512 -        int hstart;
   5.513 -        uint64_t he;
   5.514 -
   5.515 -        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
   5.516 -        he = ((const uint64_t *) spage)[hstart];
   5.517 -
   5.518 -        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
   5.519 -        {
   5.520 -            /* hvirt starts with xen stuff... */
   5.521 -            xen_start = hstart;
   5.522 -        }
   5.523 -        else if ( hvirt_start != 0xf5800000 )
   5.524 -        {
   5.525 -            /* old L2s from before hole was shrunk... */
   5.526 -            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
   5.527 -            he = ((const uint64_t *) spage)[hstart];
   5.528 -            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
   5.529 -                xen_start = hstart;
   5.530 -        }
   5.531 -    }
   5.532 -
   5.533 -    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
   5.534 -    {
   5.535 -        /*
   5.536 -        ** XXX SMH: should compute these from hvirt_start (which we have)
   5.537 -        ** and hvirt_end (which we don't)
   5.538 -        */
   5.539 -        xen_start = 256;
   5.540 -        xen_end   = 272;
   5.541 -    }
   5.542 -
   5.543 -    /* Now iterate through the page table, canonicalizing each PTE */
   5.544 -    for (i = 0; i < pte_last; i++ )
   5.545 -    {
   5.546 -        unsigned long pfn, mfn;
   5.547 -
   5.548 -        if ( pt_levels == 2 )
   5.549 -            pte = ((const uint32_t*)spage)[i];
   5.550 -        else
   5.551 -            pte = ((const uint64_t*)spage)[i];
   5.552 -
   5.553 -        if ( (i >= xen_start) && (i < xen_end) )
   5.554 -            pte = 0;
   5.555 -
   5.556 -        if ( pte & _PAGE_PRESENT )
   5.557 -        {
   5.558 -            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
   5.559 -            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
   5.560 -            {
   5.561 -                /* This will happen if the type info is stale which
   5.562 -                   is quite feasible under live migration */
   5.563 -                pfn  = 0;  /* zap it - we'll retransmit this page later */
   5.564 -                race = 1;  /* inform the caller of race; fatal if !live */ 
   5.565 -            }
   5.566 -            else
   5.567 -                pfn = mfn_to_pfn(mfn);
   5.568 -
   5.569 -            pte &= ~MADDR_MASK_X86;
   5.570 -            pte |= (uint64_t)pfn << PAGE_SHIFT;
   5.571 -
   5.572 -            /*
   5.573 -             * PAE guest L3Es can contain these flags when running on
   5.574 -             * a 64bit hypervisor. We zap these here to avoid any
   5.575 -             * surprise at restore time...
   5.576 -             */
   5.577 -            if ( (pt_levels == 3) &&
   5.578 -                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
   5.579 -                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
   5.580 -                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
   5.581 -        }
   5.582 -
   5.583 -        if ( pt_levels == 2 )
   5.584 -            ((uint32_t*)dpage)[i] = pte;
   5.585 -        else
   5.586 -            ((uint64_t*)dpage)[i] = pte;
   5.587 -    }
   5.588 -
   5.589 -    return race;
   5.590 -}
   5.591 -
   5.592 -static xen_pfn_t *xc_map_m2p(int xc_handle,
   5.593 -                                 unsigned long max_mfn,
   5.594 -                                 int prot)
   5.595 -{
   5.596 -    struct xen_machphys_mfn_list xmml;
   5.597 -    privcmd_mmap_entry_t *entries;
   5.598 -    unsigned long m2p_chunks, m2p_size;
   5.599 -    xen_pfn_t *m2p;
   5.600 -    xen_pfn_t *extent_start;
   5.601 -    int i, rc;
   5.602 -
   5.603 -    m2p_size   = M2P_SIZE(max_mfn);
   5.604 -    m2p_chunks = M2P_CHUNKS(max_mfn);
   5.605 -
   5.606 -    xmml.max_extents = m2p_chunks;
   5.607 -    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
   5.608 -    {
   5.609 -        ERROR("failed to allocate space for m2p mfns");
   5.610 -        return NULL;
   5.611 -    }
   5.612 -    set_xen_guest_handle(xmml.extent_start, extent_start);
   5.613 -
   5.614 -    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
   5.615 -         (xmml.nr_extents != m2p_chunks) )
   5.616 -    {
   5.617 -        ERROR("xc_get_m2p_mfns");
   5.618 -        return NULL;
   5.619 -    }
   5.620 -
   5.621 -    if ( (m2p = mmap(NULL, m2p_size, prot,
   5.622 -                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
   5.623 -    {
   5.624 -        ERROR("failed to mmap m2p");
   5.625 -        return NULL;
   5.626 -    }
   5.627 -
   5.628 -    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
   5.629 -    {
   5.630 -        ERROR("failed to allocate space for mmap entries");
   5.631 -        return NULL;
   5.632 -    }
   5.633 -
   5.634 -    for ( i = 0; i < m2p_chunks; i++ )
   5.635 -    {
   5.636 -        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
   5.637 -        entries[i].mfn = extent_start[i];
   5.638 -        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
   5.639 -    }
   5.640 -
   5.641 -    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
   5.642 -                                     entries, m2p_chunks)) < 0 )
   5.643 -    {
   5.644 -        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
   5.645 -        return NULL;
   5.646 -    }
   5.647 -
   5.648 -    m2p_mfn0 = entries[0].mfn;
   5.649 -
   5.650 -    free(extent_start);
   5.651 -    free(entries);
   5.652 -
   5.653 -    return m2p;
   5.654 -}
   5.655 -
   5.656 -int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   5.657 -                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
   5.658 -{
   5.659 -    xc_dominfo_t info;
   5.660 -
   5.661 -    int rc = 1, i, j, last_iter, iter = 0;
   5.662 -    int live  = (flags & XCFLAGS_LIVE);
   5.663 -    int debug = (flags & XCFLAGS_DEBUG);
   5.664 -    int race = 0, sent_last_iter, skip_this_iter;
   5.665 -
   5.666 -    /* The new domain's shared-info frame number. */
   5.667 -    unsigned long shared_info_frame;
   5.668 -
   5.669 -    /* A copy of the CPU context of the guest. */
   5.670 -    vcpu_guest_context_t ctxt;
   5.671 -
   5.672 -    /* A table containg the type of each PFN (/not/ MFN!). */
   5.673 -    unsigned long *pfn_type = NULL;
   5.674 -    unsigned long *pfn_batch = NULL;
   5.675 -
   5.676 -    /* A temporary mapping, and a copy, of one frame of guest memory. */
   5.677 -    char page[PAGE_SIZE];
   5.678 -
   5.679 -    /* Double and single indirect references to the live P2M table */
   5.680 -    xen_pfn_t *live_p2m_frame_list_list = NULL;
   5.681 -    xen_pfn_t *live_p2m_frame_list = NULL;
   5.682 -
   5.683 -    /* A copy of the pfn-to-mfn table frame list. */
   5.684 -    xen_pfn_t *p2m_frame_list = NULL;
   5.685 -
   5.686 -    /* Live mapping of shared info structure */
   5.687 -    shared_info_t *live_shinfo = NULL;
   5.688 -
   5.689 -    /* base of the region in which domain memory is mapped */
   5.690 -    unsigned char *region_base = NULL;
   5.691 -
   5.692 -    /* power of 2 order of p2m_size */
   5.693 -    int order_nr;
   5.694 -
   5.695 -    /* bitmap of pages:
   5.696 -       - that should be sent this iteration (unless later marked as skip);
   5.697 -       - to skip this iteration because already dirty;
   5.698 -       - to fixup by sending at the end if not already resent; */
   5.699 -    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
   5.700 -
   5.701 -    xc_shadow_op_stats_t stats;
   5.702 -
   5.703 -    unsigned long needed_to_fix = 0;
   5.704 -    unsigned long total_sent    = 0;
   5.705 -
   5.706 -    uint64_t vcpumap = 1ULL;
   5.707 -
   5.708 -    /* If no explicit control parameters given, use defaults */
   5.709 -    max_iters  = max_iters  ? : DEF_MAX_ITERS;
   5.710 -    max_factor = max_factor ? : DEF_MAX_FACTOR;
   5.711 -
   5.712 -    initialize_mbit_rate();
   5.713 -
   5.714 -    if ( !get_platform_info(xc_handle, dom,
   5.715 -                            &max_mfn, &hvirt_start, &pt_levels) )
   5.716 -    {
   5.717 -        ERROR("Unable to get platform info.");
   5.718 -        return 1;
   5.719 -    }
   5.720 -
   5.721 -    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
   5.722 -    {
   5.723 -        ERROR("Could not get domain info");
   5.724 -        return 1;
   5.725 -    }
   5.726 -
   5.727 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
   5.728 -    {
   5.729 -        ERROR("Could not get vcpu context");
   5.730 -        goto out;
   5.731 -    }
   5.732 -    shared_info_frame = info.shared_info_frame;
   5.733 -
   5.734 -    /* Map the shared info frame */
   5.735 -    if ( !(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   5.736 -                                              PROT_READ, shared_info_frame)) )
   5.737 -    {
   5.738 -        ERROR("Couldn't map live_shinfo");
   5.739 -        goto out;
   5.740 -    }
   5.741 -
   5.742 -    p2m_size = live_shinfo->arch.max_pfn;
   5.743 -
   5.744 -    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
   5.745 -                                                   live_shinfo);
   5.746 -    if ( !live_p2m_frame_list_list )
   5.747 -        goto out;
   5.748 -
   5.749 -    live_p2m_frame_list =
   5.750 -        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   5.751 -                             live_p2m_frame_list_list,
   5.752 -                             P2M_FLL_ENTRIES);
   5.753 -    if ( !live_p2m_frame_list )
   5.754 -    {
   5.755 -        ERROR("Couldn't map p2m_frame_list");
   5.756 -        goto out;
   5.757 -    }
   5.758 -
   5.759 -    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
   5.760 -       the guest must not change which frames are used for this purpose.
   5.761 -       (its not clear why it would want to change them, and we'll be OK
   5.762 -       from a safety POV anyhow. */
   5.763 -
   5.764 -    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   5.765 -                                    live_p2m_frame_list,
   5.766 -                                    P2M_FL_ENTRIES);
   5.767 -    if ( !live_p2m )
   5.768 -    {
   5.769 -        ERROR("Couldn't map p2m table");
   5.770 -        goto out;
   5.771 -    }
   5.772 -
   5.773 -    /* Setup the mfn_to_pfn table mapping */
   5.774 -    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
   5.775 -    {
   5.776 -        ERROR("Failed to map live M2P table");
   5.777 -        goto out;
   5.778 -    }
   5.779 -
   5.780 -
   5.781 -    /* Get a local copy of the live_P2M_frame_list */
   5.782 -    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
   5.783 -    {
   5.784 -        ERROR("Couldn't allocate p2m_frame_list array");
   5.785 -        goto out;
   5.786 -    }
   5.787 -    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
   5.788 -
   5.789 -    /* Canonicalise the pfn-to-mfn table frame-number list. */
   5.790 -    for ( i = 0; i < p2m_size; i += fpp )
   5.791 -    {
   5.792 -        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
   5.793 -        {
   5.794 -            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
   5.795 -            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
   5.796 -                  (uint64_t)p2m_frame_list[i/fpp]);
   5.797 -            goto out;
   5.798 -        }
   5.799 -    }
   5.800 -
   5.801 -    /* Domain is still running at this point */
   5.802 -    if ( live )
   5.803 -    {
   5.804 -        /* Live suspend. Enable log-dirty mode. */
   5.805 -        if ( xc_shadow_control(xc_handle, dom,
   5.806 -                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
   5.807 -                               NULL, 0, NULL, 0, NULL) < 0 )
   5.808 -        {
   5.809 -            ERROR("Couldn't enable shadow mode");
   5.810 -            goto out;
   5.811 -        }
   5.812 -    }
   5.813 -    else
   5.814 -    {
   5.815 -        /* This is a non-live suspend. Suspend the domain .*/
   5.816 -        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
   5.817 -        {
   5.818 -            ERROR("Domain appears not to have suspended");
   5.819 -            goto out;
   5.820 -        }
   5.821 -    }
   5.822 -
   5.823 -    last_iter = !live;
   5.824 -
   5.825 -    /* pretend we sent all the pages last iteration */
   5.826 -    sent_last_iter = p2m_size;
   5.827 -
   5.828 -    /* calculate the power of 2 order of p2m_size, e.g.
   5.829 -       15->4 16->4 17->5 */
   5.830 -    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
   5.831 -        continue;
   5.832 -
   5.833 -    /* Setup to_send / to_fix and to_skip bitmaps */
   5.834 -    to_send = malloc(BITMAP_SIZE);
   5.835 -    to_fix  = calloc(1, BITMAP_SIZE);
   5.836 -    to_skip = malloc(BITMAP_SIZE);
   5.837 -
   5.838 -    if ( !to_send || !to_fix || !to_skip )
   5.839 -    {
   5.840 -        ERROR("Couldn't allocate to_send array");
   5.841 -        goto out;
   5.842 -    }
   5.843 -
   5.844 -    memset(to_send, 0xff, BITMAP_SIZE);
   5.845 -
   5.846 -    if ( lock_pages(to_send, BITMAP_SIZE) )
   5.847 -    {
   5.848 -        ERROR("Unable to lock to_send");
   5.849 -        return 1;
   5.850 -    }
   5.851 -
   5.852 -    /* (to fix is local only) */
   5.853 -    if ( lock_pages(to_skip, BITMAP_SIZE) )
   5.854 -    {
   5.855 -        ERROR("Unable to lock to_skip");
   5.856 -        return 1;
   5.857 -    }
   5.858 -
   5.859 -    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
   5.860 -
   5.861 -    /* We want zeroed memory so use calloc rather than malloc. */
   5.862 -    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
   5.863 -    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
   5.864 -    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
   5.865 -    {
   5.866 -        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
   5.867 -        errno = ENOMEM;
   5.868 -        goto out;
   5.869 -    }
   5.870 -
   5.871 -    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
   5.872 -    {
   5.873 -        ERROR("Unable to lock");
   5.874 -        goto out;
   5.875 -    }
   5.876 -
   5.877 -    /*
   5.878 -     * Quick belt and braces sanity check.
   5.879 -     */
   5.880 -    {
   5.881 -        int err=0;
   5.882 -        unsigned long mfn;
   5.883 -        for ( i = 0; i < p2m_size; i++ )
   5.884 -        {
   5.885 -            mfn = live_p2m[i];
   5.886 -            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
   5.887 -            {
   5.888 -                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
   5.889 -                        mfn, mfn_to_pfn(mfn));
   5.890 -                err++;
   5.891 -            }
   5.892 -        }
   5.893 -        DPRINTF("Had %d unexplained entries in p2m table\n", err);
   5.894 -    }
   5.895 -
   5.896 -    /* Start writing out the saved-domain record. */
   5.897 -    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
   5.898 -    {
   5.899 -        ERROR("write: p2m_size");
   5.900 -        goto out;
   5.901 -    }
   5.902 -
   5.903 -    /*
   5.904 -     * Write an extended-info structure to inform the restore code that
   5.905 -     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
   5.906 -     * slow paths in the restore code.
   5.907 -     */
   5.908 -    if ( (pt_levels == 3) &&
   5.909 -         (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
   5.910 -    {
   5.911 -        unsigned long signature = ~0UL;
   5.912 -        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
   5.913 -        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
   5.914 -        char chunk_sig[]  = "vcpu";
   5.915 -        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
   5.916 -             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
   5.917 -             !write_exact(io_fd, &chunk_sig, 4) ||
   5.918 -             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
   5.919 -             !write_exact(io_fd, &ctxt,      sizeof(ctxt)) )
   5.920 -        {
   5.921 -            ERROR("write: extended info");
   5.922 -            goto out;
   5.923 -        }
   5.924 -    }
   5.925 -
   5.926 -    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
   5.927 -    {
   5.928 -        ERROR("write: p2m_frame_list");
   5.929 -        goto out;
   5.930 -    }
   5.931 -
   5.932 -    print_stats(xc_handle, dom, 0, &stats, 0);
   5.933 -
   5.934 -    /* Now write out each data page, canonicalising page tables as we go... */
   5.935 -    for ( ; ; )
   5.936 -    {
   5.937 -        unsigned int prev_pc, sent_this_iter, N, batch;
   5.938 -
   5.939 -        iter++;
   5.940 -        sent_this_iter = 0;
   5.941 -        skip_this_iter = 0;
   5.942 -        prev_pc = 0;
   5.943 -        N = 0;
   5.944 -
   5.945 -        DPRINTF("Saving memory pages: iter %d   0%%", iter);
   5.946 -
   5.947 -        while ( N < p2m_size )
   5.948 -        {
   5.949 -            unsigned int this_pc = (N * 100) / p2m_size;
   5.950 -            int rc;
   5.951 -
   5.952 -            if ( (this_pc - prev_pc) >= 5 )
   5.953 -            {
   5.954 -                DPRINTF("\b\b\b\b%3d%%", this_pc);
   5.955 -                prev_pc = this_pc;
   5.956 -            }
   5.957 -
   5.958 -            if ( !last_iter )
   5.959 -            {
   5.960 -                /* Slightly wasteful to peek the whole array evey time,
   5.961 -                   but this is fast enough for the moment. */
   5.962 -                rc = xc_shadow_control(
   5.963 -                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
   5.964 -                    p2m_size, NULL, 0, NULL);
   5.965 -                if ( rc != p2m_size )
   5.966 -                {
   5.967 -                    ERROR("Error peeking shadow bitmap");
   5.968 -                    goto out;
   5.969 -                }
   5.970 -            }
   5.971 -
   5.972 -            /* load pfn_type[] with the mfn of all the pages we're doing in
   5.973 -               this batch. */
   5.974 -            for  ( batch = 0;
   5.975 -                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
   5.976 -                   N++ )
   5.977 -            {
   5.978 -                int n = permute(N, p2m_size, order_nr);
   5.979 -
   5.980 -                if ( debug )
   5.981 -                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
   5.982 -                            iter, (unsigned long)n, live_p2m[n],
   5.983 -                            test_bit(n, to_send),
   5.984 -                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
   5.985 -
   5.986 -                if ( !last_iter &&
   5.987 -                     test_bit(n, to_send) &&
   5.988 -                     test_bit(n, to_skip) )
   5.989 -                    skip_this_iter++; /* stats keeping */
   5.990 -
   5.991 -                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
   5.992 -                       (test_bit(n, to_send) && last_iter) ||
   5.993 -                       (test_bit(n, to_fix)  && last_iter)) )
   5.994 -                    continue;
   5.995 -
   5.996 -                /*
   5.997 -                ** we get here if:
   5.998 -                **  1. page is marked to_send & hasn't already been re-dirtied
   5.999 -                **  2. (ignore to_skip in last iteration)
  5.1000 -                **  3. add in pages that still need fixup (net bufs)
  5.1001 -                */
  5.1002 -
  5.1003 -                pfn_batch[batch] = n;
  5.1004 -                pfn_type[batch]  = live_p2m[n];
  5.1005 -
  5.1006 -                if ( !is_mapped(pfn_type[batch]) )
  5.1007 -                {
  5.1008 -                    /*
  5.1009 -                    ** not currently in psuedo-physical map -- set bit
  5.1010 -                    ** in to_fix since we must send this page in last_iter
  5.1011 -                    ** unless its sent sooner anyhow, or it never enters
  5.1012 -                    ** pseudo-physical map (e.g. for ballooned down domains)
  5.1013 -                    */
  5.1014 -                    set_bit(n, to_fix);
  5.1015 -                    continue;
  5.1016 -                }
  5.1017 -
  5.1018 -                if ( last_iter &&
  5.1019 -                     test_bit(n, to_fix) &&
  5.1020 -                     !test_bit(n, to_send) )
  5.1021 -                {
  5.1022 -                    needed_to_fix++;
  5.1023 -                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
  5.1024 -                            iter, n, pfn_type[batch]);
  5.1025 -                }
  5.1026 -
  5.1027 -                clear_bit(n, to_fix);
  5.1028 -
  5.1029 -                batch++;
  5.1030 -            }
  5.1031 -
  5.1032 -            if ( batch == 0 )
  5.1033 -                goto skip; /* vanishingly unlikely... */
  5.1034 -
  5.1035 -            region_base = xc_map_foreign_batch(
  5.1036 -                xc_handle, dom, PROT_READ, pfn_type, batch);
  5.1037 -            if ( region_base == NULL )
  5.1038 -            {
  5.1039 -                ERROR("map batch failed");
  5.1040 -                goto out;
  5.1041 -            }
  5.1042 -
  5.1043 -            for ( j = 0; j < batch; j++ )
  5.1044 -                ((uint32_t *)pfn_type)[j] = pfn_type[j];
  5.1045 -            if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
  5.1046 -                                       (uint32_t *)pfn_type) )
  5.1047 -            {
  5.1048 -                ERROR("get_pfn_type_batch failed");
  5.1049 -                goto out;
  5.1050 -            }
  5.1051 -            for ( j = batch-1; j >= 0; j-- )
  5.1052 -                pfn_type[j] = ((uint32_t *)pfn_type)[j];
  5.1053 -
  5.1054 -            for ( j = 0; j < batch; j++ )
  5.1055 -            {
  5.1056 -
  5.1057 -                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
  5.1058 -                     XEN_DOMCTL_PFINFO_XTAB )
  5.1059 -                {
  5.1060 -                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
  5.1061 -                    continue;
  5.1062 -                }
  5.1063 -
  5.1064 -                if ( debug )
  5.1065 -                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
  5.1066 -                            " sum= %08lx\n",
  5.1067 -                            iter,
  5.1068 -                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
  5.1069 -                            pfn_batch[j],
  5.1070 -                            pfn_type[j],
  5.1071 -                            mfn_to_pfn(pfn_type[j] &
  5.1072 -                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
  5.1073 -                            csum_page(region_base + (PAGE_SIZE*j)));
  5.1074 -
  5.1075 -                /* canonicalise mfn->pfn */
  5.1076 -                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
  5.1077 -                    pfn_batch[j];
  5.1078 -            }
  5.1079 -
  5.1080 -            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
  5.1081 -            {
  5.1082 -                ERROR("Error when writing to state file (2) (errno %d)",
  5.1083 -                      errno);
  5.1084 -                goto out;
  5.1085 -            }
  5.1086 -
  5.1087 -            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*j) )
  5.1088 -            {
  5.1089 -                ERROR("Error when writing to state file (3) (errno %d)",
  5.1090 -                      errno);
  5.1091 -                goto out;
  5.1092 -            }
  5.1093 -
  5.1094 -            /* entering this loop, pfn_type is now in pfns (Not mfns) */
  5.1095 -            for ( j = 0; j < batch; j++ )
  5.1096 -            {
  5.1097 -                unsigned long pfn, pagetype;
  5.1098 -                void *spage = (char *)region_base + (PAGE_SIZE*j);
  5.1099 -
  5.1100 -                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
  5.1101 -                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
  5.1102 -
  5.1103 -                /* write out pages in batch */
  5.1104 -                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
  5.1105 -                    continue;
  5.1106 -
  5.1107 -                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
  5.1108 -
  5.1109 -                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
  5.1110 -                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
  5.1111 -                {
  5.1112 -                    /* We have a pagetable page: need to rewrite it. */
  5.1113 -                    race = 
  5.1114 -                        canonicalize_pagetable(pagetype, pfn, spage, page); 
  5.1115 -
  5.1116 -                    if ( race && !live )
  5.1117 -                    {
  5.1118 -                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
  5.1119 -                              pagetype);
  5.1120 -                        goto out;
  5.1121 -                    }
  5.1122 -
  5.1123 -                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
  5.1124 -                    {
  5.1125 -                        ERROR("Error when writing to state file (4)"
  5.1126 -                              " (errno %d)", errno);
  5.1127 -                        goto out;
  5.1128 -                    }
  5.1129 -                }
  5.1130 -                else
  5.1131 -                {
  5.1132 -                    /* We have a normal page: just write it directly. */
  5.1133 -                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
  5.1134 -                         PAGE_SIZE )
  5.1135 -                    {
  5.1136 -                        ERROR("Error when writing to state file (5)"
  5.1137 -                              " (errno %d)", errno);
  5.1138 -                        goto out;
  5.1139 -                    }
  5.1140 -                }
  5.1141 -            } /* end of the write out for this batch */
  5.1142 -
  5.1143 -            sent_this_iter += batch;
  5.1144 -
  5.1145 -            munmap(region_base, batch*PAGE_SIZE);
  5.1146 -
  5.1147 -        } /* end of this while loop for this iteration */
  5.1148 -
  5.1149 -      skip:
  5.1150 -
  5.1151 -        total_sent += sent_this_iter;
  5.1152 -
  5.1153 -        DPRINTF("\r %d: sent %d, skipped %d, ",
  5.1154 -                iter, sent_this_iter, skip_this_iter );
  5.1155 -
  5.1156 -        if ( last_iter )
  5.1157 -        {
  5.1158 -            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  5.1159 -
  5.1160 -            DPRINTF("Total pages sent= %ld (%.2fx)\n",
  5.1161 -                    total_sent, ((float)total_sent)/p2m_size );
  5.1162 -            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
  5.1163 -        }
  5.1164 -
  5.1165 -        if ( last_iter && debug )
  5.1166 -        {
  5.1167 -            int minusone = -1;
  5.1168 -            memset(to_send, 0xff, BITMAP_SIZE);
  5.1169 -            debug = 0;
  5.1170 -            DPRINTF("Entering debug resend-all mode\n");
  5.1171 -
  5.1172 -            /* send "-1" to put receiver into debug mode */
  5.1173 -            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
  5.1174 -            {
  5.1175 -                ERROR("Error when writing to state file (6) (errno %d)",
  5.1176 -                      errno);
  5.1177 -                goto out;
  5.1178 -            }
  5.1179 -
  5.1180 -            continue;
  5.1181 -        }
  5.1182 -
  5.1183 -        if ( last_iter )
  5.1184 -            break;
  5.1185 -
  5.1186 -        if ( live )
  5.1187 -        {
  5.1188 -            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
  5.1189 -                 (iter >= max_iters) ||
  5.1190 -                 (sent_this_iter+skip_this_iter < 50) ||
  5.1191 -                 (total_sent > p2m_size*max_factor) )
  5.1192 -            {
  5.1193 -                DPRINTF("Start last iteration\n");
  5.1194 -                last_iter = 1;
  5.1195 -
  5.1196 -                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
  5.1197 -                                       &ctxt) )
  5.1198 -                {
  5.1199 -                    ERROR("Domain appears not to have suspended");
  5.1200 -                    goto out;
  5.1201 -                }
  5.1202 -
  5.1203 -                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
  5.1204 -                        info.shared_info_frame,
  5.1205 -                        (unsigned long)ctxt.user_regs.eip,
  5.1206 -                        (unsigned long)ctxt.user_regs.edx);
  5.1207 -            }
  5.1208 -
  5.1209 -            if ( xc_shadow_control(xc_handle, dom, 
  5.1210 -                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
  5.1211 -                                   p2m_size, NULL, 0, &stats) != p2m_size )
  5.1212 -            {
  5.1213 -                ERROR("Error flushing shadow PT");
  5.1214 -                goto out;
  5.1215 -            }
  5.1216 -
  5.1217 -            sent_last_iter = sent_this_iter;
  5.1218 -
  5.1219 -            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
  5.1220 -
  5.1221 -        }
  5.1222 -    } /* end of infinite for loop */
  5.1223 -
  5.1224 -    DPRINTF("All memory is saved\n");
  5.1225 -
  5.1226 -    {
  5.1227 -        struct {
  5.1228 -            int minustwo;
  5.1229 -            int max_vcpu_id;
  5.1230 -            uint64_t vcpumap;
  5.1231 -        } chunk = { -2, info.max_vcpu_id };
  5.1232 -
  5.1233 -        if ( info.max_vcpu_id >= 64 )
  5.1234 -        {
  5.1235 -            ERROR("Too many VCPUS in guest!");
  5.1236 -            goto out;
  5.1237 -        }
  5.1238 -
  5.1239 -        for ( i = 1; i <= info.max_vcpu_id; i++ )
  5.1240 -        {
  5.1241 -            xc_vcpuinfo_t vinfo;
  5.1242 -            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
  5.1243 -                 vinfo.online )
  5.1244 -                vcpumap |= 1ULL << i;
  5.1245 -        }
  5.1246 -
  5.1247 -        chunk.vcpumap = vcpumap;
  5.1248 -        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
  5.1249 -        {
  5.1250 -            ERROR("Error when writing to state file (errno %d)", errno);
  5.1251 -            goto out;
  5.1252 -        }
  5.1253 -    }
  5.1254 -
  5.1255 -    /* Zero terminate */
  5.1256 -    i = 0;
  5.1257 -    if ( !write_exact(io_fd, &i, sizeof(int)) )
  5.1258 -    {
  5.1259 -        ERROR("Error when writing to state file (6') (errno %d)", errno);
  5.1260 -        goto out;
  5.1261 -    }
  5.1262 -
  5.1263 -    /* Send through a list of all the PFNs that were not in map at the close */
  5.1264 -    {
  5.1265 -        unsigned int i,j;
  5.1266 -        unsigned long pfntab[1024];
  5.1267 -
  5.1268 -        for ( i = 0, j = 0; i < p2m_size; i++ )
  5.1269 -        {
  5.1270 -            if ( !is_mapped(live_p2m[i]) )
  5.1271 -                j++;
  5.1272 -        }
  5.1273 -
  5.1274 -        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
  5.1275 -        {
  5.1276 -            ERROR("Error when writing to state file (6a) (errno %d)", errno);
  5.1277 -            goto out;
  5.1278 -        }
  5.1279 -
  5.1280 -        for ( i = 0, j = 0; i < p2m_size; )
  5.1281 -        {
  5.1282 -            if ( !is_mapped(live_p2m[i]) )
  5.1283 -                pfntab[j++] = i;
  5.1284 -
  5.1285 -            i++;
  5.1286 -            if ( (j == 1024) || (i == p2m_size) )
  5.1287 -            {
  5.1288 -                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
  5.1289 -                {
  5.1290 -                    ERROR("Error when writing to state file (6b) (errno %d)",
  5.1291 -                          errno);
  5.1292 -                    goto out;
  5.1293 -                }
  5.1294 -                j = 0;
  5.1295 -            }
  5.1296 -        }
  5.1297 -    }
  5.1298 -
  5.1299 -    /* Canonicalise the suspend-record frame number. */
  5.1300 -    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
  5.1301 -    {
  5.1302 -        ERROR("Suspend record is not in range of pseudophys map");
  5.1303 -        goto out;
  5.1304 -    }
  5.1305 -
  5.1306 -    for ( i = 0; i <= info.max_vcpu_id; i++ )
  5.1307 -    {
  5.1308 -        if ( !(vcpumap & (1ULL << i)) )
  5.1309 -            continue;
  5.1310 -
  5.1311 -        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
  5.1312 -        {
  5.1313 -            ERROR("No context for VCPU%d", i);
  5.1314 -            goto out;
  5.1315 -        }
  5.1316 -
  5.1317 -        /* Canonicalise each GDT frame number. */
  5.1318 -        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
  5.1319 -        {
  5.1320 -            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
  5.1321 -            {
  5.1322 -                ERROR("GDT frame is not in range of pseudophys map");
  5.1323 -                goto out;
  5.1324 -            }
  5.1325 -        }
  5.1326 -
  5.1327 -        /* Canonicalise the page table base pointer. */
  5.1328 -        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
  5.1329 -        {
  5.1330 -            ERROR("PT base is not in range of pseudophys map");
  5.1331 -            goto out;
  5.1332 -        }
  5.1333 -        ctxt.ctrlreg[3] = 
  5.1334 -            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
  5.1335 -
  5.1336 -        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
  5.1337 -        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
  5.1338 -        {
  5.1339 -            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
  5.1340 -            {
  5.1341 -                ERROR("PT base is not in range of pseudophys map");
  5.1342 -                goto out;
  5.1343 -            }
  5.1344 -            /* Least-significant bit means 'valid PFN'. */
  5.1345 -            ctxt.ctrlreg[1] = 1 |
  5.1346 -                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
  5.1347 -        }
  5.1348 -
  5.1349 -        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
  5.1350 -        {
  5.1351 -            ERROR("Error when writing to state file (1) (errno %d)", errno);
  5.1352 -            goto out;
  5.1353 -        }
  5.1354 -    }
  5.1355 -
  5.1356 -    /*
  5.1357 -     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
  5.1358 -     */
  5.1359 -    memcpy(page, live_shinfo, PAGE_SIZE);
  5.1360 -    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
  5.1361 -    if ( !write_exact(io_fd, page, PAGE_SIZE) )
  5.1362 -    {
  5.1363 -        ERROR("Error when writing to state file (1) (errno %d)", errno);
  5.1364 -        goto out;
  5.1365 -    }
  5.1366 -
  5.1367 -    /* Success! */
  5.1368 -    rc = 0;
  5.1369 -
  5.1370 - out:
  5.1371 -
  5.1372 -    if ( live )
  5.1373 -    {
  5.1374 -        if ( xc_shadow_control(xc_handle, dom, 
  5.1375 -                               XEN_DOMCTL_SHADOW_OP_OFF,
  5.1376 -                               NULL, 0, NULL, 0, NULL) < 0 )
  5.1377 -            DPRINTF("Warning - couldn't disable shadow mode");
  5.1378 -    }
  5.1379 -
  5.1380 -    /* Flush last write and discard cache for file. */
  5.1381 -    discard_file_cache(io_fd, 1 /* flush */);
  5.1382 -
  5.1383 -    if ( live_shinfo )
  5.1384 -        munmap(live_shinfo, PAGE_SIZE);
  5.1385 -
  5.1386 -    if ( live_p2m_frame_list_list )
  5.1387 -        munmap(live_p2m_frame_list_list, PAGE_SIZE);
  5.1388 -
  5.1389 -    if ( live_p2m_frame_list )
  5.1390 -        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
  5.1391 -
  5.1392 -    if ( live_p2m )
  5.1393 -        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
  5.1394 -
  5.1395 -    if ( live_m2p )
  5.1396 -        munmap(live_m2p, M2P_SIZE(max_mfn));
  5.1397 -
  5.1398 -    free(pfn_type);
  5.1399 -    free(pfn_batch);
  5.1400 -    free(to_send);
  5.1401 -    free(to_fix);
  5.1402 -    free(to_skip);
  5.1403 -
  5.1404 -    DPRINTF("Save exit rc=%d\n",rc);
  5.1405 -
  5.1406 -    return !!rc;
  5.1407 -}
  5.1408 -
  5.1409 -/*
  5.1410 - * Local variables:
  5.1411 - * mode: C
  5.1412 - * c-set-style: "BSD"
  5.1413 - * c-basic-offset: 4
  5.1414 - * tab-width: 4
  5.1415 - * indent-tabs-mode: nil
  5.1416 - * End:
  5.1417 - */
     6.1 --- a/tools/libxc/xenguest.h	Wed Apr 11 09:29:00 2007 +0100
     6.2 +++ b/tools/libxc/xenguest.h	Wed Apr 11 14:45:14 2007 +0100
     6.3 @@ -16,26 +16,19 @@
     6.4  
     6.5  
     6.6  /**
     6.7 - * This function will save a domain running Linux.
     6.8 + * This function will save a running domain.
     6.9   *
    6.10   * @parm xc_handle a handle to an open hypervisor interface
    6.11   * @parm fd the file descriptor to save a domain to
    6.12   * @parm dom the id of the domain
    6.13   * @return 0 on success, -1 on failure
    6.14   */
    6.15 -int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
    6.16 -                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
    6.17 -                  int (*suspend)(int domid));
    6.18 +int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
    6.19 +                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
    6.20 +                   int (*suspend)(int domid), int hvm,
    6.21 +                   void *(*init_qemu_maps)(int, unsigned),  /* HVM only */
    6.22 +                   void (*qemu_flip_buffer)(int, int));     /* HVM only */
    6.23  
    6.24 -/**
    6.25 - * This function will save a hvm domain running unmodified guest.
    6.26 - * @return 0 on success, -1 on failure
    6.27 - */
    6.28 -int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
    6.29 -                uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
    6.30 -                int (*suspend)(int domid),  
    6.31 -                void *(*init_qemu_maps)(int, unsigned), 
    6.32 -                void (*qemu_flip_buffer)(int, int));
    6.33  
    6.34  /**
    6.35   * This function will restore a saved domain.
     7.1 --- a/tools/libxc/xg_private.c	Wed Apr 11 09:29:00 2007 +0100
     7.2 +++ b/tools/libxc/xg_private.c	Wed Apr 11 14:45:14 2007 +0100
     7.3 @@ -198,17 +198,6 @@ unsigned long csum_page(void *page)
     7.4      return -1;
     7.5  }
     7.6  
     7.7 -__attribute__((weak)) 
     7.8 -    int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
     7.9 -                    uint32_t max_factor, uint32_t flags,
    7.10 -                    int (*suspend)(int domid), 
    7.11 -                    void *(*init_qemu_maps)(int, unsigned), 
    7.12 -                    void (*qemu_flip_buffer)(int, int))
    7.13 -{
    7.14 -    errno = ENOSYS;
    7.15 -    return -1;
    7.16 -}
    7.17 -
    7.18  __attribute__((weak)) int xc_get_hvm_param(
    7.19      int handle, domid_t dom, int param, unsigned long *value)
    7.20  {
     8.1 --- a/tools/xcutils/xc_save.c	Wed Apr 11 09:29:00 2007 +0100
     8.2 +++ b/tools/xcutils/xc_save.c	Wed Apr 11 14:45:14 2007 +0100
     8.3 @@ -174,12 +174,9 @@ main(int argc, char **argv)
     8.4      max_f = atoi(argv[4]);
     8.5      flags = atoi(argv[5]);
     8.6  
     8.7 -    if (flags & XCFLAGS_HVM)
     8.8 -        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
     8.9 -                          &suspend, &init_qemu_maps, &qemu_flip_buffer);
    8.10 -    else 
    8.11 -        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
    8.12 -                            &suspend);
    8.13 +    ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
    8.14 +                         &suspend, !!(flags & XCFLAGS_HVM),
    8.15 +                         &init_qemu_maps, &qemu_flip_buffer);
    8.16  
    8.17      xc_interface_close(xc_fd);
    8.18