ia64/xen-unstable

changeset 13496:4138b80a8a23

[HVM] save restore: guest memory handling

Add support for saving/restoring HVM guest memory.

Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>

author Tim Deegan <Tim.Deegan@xensource.com>
date Thu Jan 18 16:48:08 2007 +0000 (2007-01-18)
parents 239c8504f48d
children 025218cdb17d
files tools/libxc/xc_hvm_restore.c tools/libxc/xc_hvm_save.c
line diff
     1.1 --- a/tools/libxc/xc_hvm_restore.c	Thu Jan 18 16:48:07 2007 +0000
     1.2 +++ b/tools/libxc/xc_hvm_restore.c	Thu Jan 18 16:48:08 2007 +0000
     1.3 @@ -32,11 +32,329 @@
     1.4  #include <xen/hvm/params.h>
     1.5  #include <xen/hvm/e820.h>
     1.6  
     1.7 +/* max mfn of the whole machine */
     1.8 +static unsigned long max_mfn;
     1.9 +
    1.10 +/* virtual starting address of the hypervisor */
    1.11 +static unsigned long hvirt_start;
    1.12 +
     1.13 +/* #levels of page tables used by the current guest */
    1.14 +static unsigned int pt_levels;
    1.15 +
    1.16 +/* total number of pages used by the current guest */
    1.17 +static unsigned long max_pfn;
    1.18 +
    1.19 +/* A table mapping each PFN to its new MFN. */
    1.20 +static xen_pfn_t *p2m = NULL;
    1.21 +
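          +/* Read exactly count bytes from fd into buf, retrying on EINTR and
          + * short reads.  Returns 1 on success, 0 on error or EOF. */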
    1.22 +static ssize_t
    1.23 +read_exact(int fd, void *buf, size_t count)
    1.24 +{
    1.25 +    int r = 0, s;
    1.26 +    unsigned char *b = buf;
    1.27 +
    1.28 +    while (r < count) {
    1.29 +        s = read(fd, &b[r], count - r);
    1.30 +        if ((s == -1) && (errno == EINTR))
    1.31 +            continue;
    1.32 +        if (s <= 0) {
    1.33 +            break;
    1.34 +        }
    1.35 +        r += s;
    1.36 +    }
    1.37 +
    1.38 +    return (r == count) ? 1 : 0;
    1.39 +}
    1.40 +
    1.41  int xc_hvm_restore(int xc_handle, int io_fd,
    1.42                       uint32_t dom, unsigned long nr_pfns,
    1.43                       unsigned int store_evtchn, unsigned long *store_mfn,
    1.44                       unsigned int console_evtchn, unsigned long *console_mfn,
    1.45                       unsigned int pae, unsigned int apic)
    1.46  {
    1.47 -    return 0;
    1.48 +    DECLARE_DOMCTL;
    1.49 +
    1.50 +    /* The new domain's shared-info frame number. */
    1.51 +    unsigned long shared_info_frame;
    1.52 +
    1.53 +    /* A copy of the CPU context of the guest. */
    1.54 +    vcpu_guest_context_t ctxt;
    1.55 +
    1.56 +    char *region_base;
    1.57 +
    1.58 +    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
    1.59 +
    1.60 +    xc_dominfo_t info;
    1.61 +    unsigned int rc = 1, n, i;
    1.62 +    uint32_t rec_len, nr_vcpus;
    1.63 +    hvm_domain_context_t hvm_ctxt;
    1.64 +    unsigned long long v_end, memsize;
    1.65 +    unsigned long shared_page_nr;
    1.66 +
    1.67 +    unsigned long mfn, pfn;
    1.68 +    unsigned int prev_pc, this_pc;
    1.69 +    int verify = 0;
    1.70 +
    1.71 +    /* Types of the pfns in the current region */
    1.72 +    unsigned long region_pfn_type[MAX_BATCH_SIZE];
    1.73 +
     1.74 +    /* hvm guest mem size (MB) */
    1.75 +    memsize = (unsigned long long)*store_mfn;
    1.76 +    v_end = memsize << 20;
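          +    /* NB: the caller packs the guest memory size (in MB) into
          +     * *store_mfn on entry; v_end is then the top of guest physical
          +     * RAM in bytes, and *store_mfn is recomputed further down. */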
    1.77 +
    1.78 +    DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, *store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n", 
    1.79 +            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, *console_mfn, pae, apic);
    1.80 +
    1.81 +    max_pfn = nr_pfns;
    1.82 +
    1.83 +    if(!get_platform_info(xc_handle, dom,
    1.84 +                          &max_mfn, &hvirt_start, &pt_levels)) {
    1.85 +        ERROR("Unable to get platform info.");
    1.86 +        return 1;
    1.87 +    }
    1.88 +
    1.89 +    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, hvirt_start=%lx, pt_levels=%d\n",
    1.90 +            max_pfn,
    1.91 +            max_mfn,
    1.92 +            hvirt_start,
    1.93 +            pt_levels);
    1.94 +
    1.95 +    if (mlock(&ctxt, sizeof(ctxt))) {
     1.96 +        /* needed for the build dom0 op, but might as well do it early */
    1.97 +        ERROR("Unable to mlock ctxt");
    1.98 +        return 1;
    1.99 +    }
   1.100 +
   1.101 +
   1.102 +    p2m        = malloc(max_pfn * sizeof(xen_pfn_t));
   1.103 +
   1.104 +    if (p2m == NULL) {
   1.105 +        ERROR("memory alloc failed");
   1.106 +        errno = ENOMEM;
   1.107 +        goto out;
   1.108 +    }
   1.109 +
   1.110 +    /* Get the domain's shared-info frame. */
   1.111 +    domctl.cmd = XEN_DOMCTL_getdomaininfo;
   1.112 +    domctl.domain = (domid_t)dom;
   1.113 +    if (xc_domctl(xc_handle, &domctl) < 0) {
   1.114 +        ERROR("Could not get information on new domain");
   1.115 +        goto out;
   1.116 +    }
   1.117 +    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
   1.118 +
   1.119 +    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
   1.120 +        errno = ENOMEM;
   1.121 +        goto out;
   1.122 +    }
   1.123 +
   1.124 +    for ( i = 0; i < max_pfn; i++ )
   1.125 +        p2m[i] = i;
   1.126 +    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn; i++ )
   1.127 +        p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
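          +    /* PFNs above HVM_BELOW_4G_RAM_END are shifted up by the size of
          +     * the 4G MMIO hole, so allocated RAM skips that hole. */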
   1.128 +
   1.129 +    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
   1.130 +    rc = xc_domain_memory_populate_physmap(
   1.131 +        xc_handle, dom, (max_pfn > 0xa0) ? 0xa0 : max_pfn,
   1.132 +        0, 0, &p2m[0x00]);
   1.133 +    if ( (rc == 0) && (max_pfn > 0xc0) )
   1.134 +        rc = xc_domain_memory_populate_physmap(
   1.135 +            xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]);
   1.136 +    if ( rc != 0 )
   1.137 +    {
   1.138 +        PERROR("Could not allocate memory for HVM guest.\n");
   1.139 +        goto out;
   1.140 +    }
   1.141 +
   1.142 +
   1.143 +    /**********XXXXXXXXXXXXXXXX******************/
   1.144 +    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
   1.145 +        ERROR("Could not get domain info");
   1.146 +        return 1;
   1.147 +    }
   1.148 +
   1.149 +    domctl.cmd = XEN_DOMCTL_getdomaininfo;
   1.150 +    domctl.domain = (domid_t)dom;
   1.151 +    if (xc_domctl(xc_handle, &domctl) < 0) {
   1.152 +        ERROR("Could not get information on new domain");
   1.153 +        goto out;
   1.154 +    }
   1.155 +
   1.156 +    for ( i = 0; i < max_pfn; i++)
   1.157 +        p2m[i] = i;
   1.158 +
   1.159 +    prev_pc = 0;
   1.160 +
   1.161 +    n = 0;
   1.162 +    while (1) {
   1.163 +
   1.164 +        int j;
   1.165 +
   1.166 +        this_pc = (n * 100) / max_pfn;
   1.167 +        if ( (this_pc - prev_pc) >= 5 )
   1.168 +        {
   1.169 +            PPRINTF("\b\b\b\b%3d%%", this_pc);
   1.170 +            prev_pc = this_pc;
   1.171 +        }
   1.172 +
   1.173 +        if (!read_exact(io_fd, &j, sizeof(int))) {
    1.174 +            ERROR("HVM restore: error when reading batch size");
   1.175 +            goto out;
   1.176 +        }
   1.177 +
   1.178 +        PPRINTF("batch %d\n",j);
   1.179 +
   1.180 +        if (j == -1) {
   1.181 +            verify = 1;
   1.182 +            DPRINTF("Entering page verify mode\n");
   1.183 +            continue;
   1.184 +        }
   1.185 +
   1.186 +        if (j == 0)
   1.187 +            break;  /* our work here is done */
   1.188 +
   1.189 +        if (j > MAX_BATCH_SIZE) {
   1.190 +            ERROR("Max batch size exceeded. Giving up.");
   1.191 +            goto out;
   1.192 +        }
   1.193 +
   1.194 +        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
   1.195 +            ERROR("Error when reading region pfn types");
   1.196 +            goto out;
   1.197 +        }
   1.198 +
   1.199 +        region_base = xc_map_foreign_batch(
   1.200 +            xc_handle, dom, PROT_WRITE, region_pfn_type, j);
   1.201 +
   1.202 +        for ( i = 0; i < j; i++ )
   1.203 +        {
   1.204 +            void *page;
   1.205 +
   1.206 +            pfn = region_pfn_type[i];
   1.207 +            if ( pfn > max_pfn )
   1.208 +            {
   1.209 +                ERROR("pfn out of range");
   1.210 +                goto out;
   1.211 +            }
   1.212 +
   1.213 +            if ( pfn >= 0xa0 && pfn < 0xc0) {
    1.214 +                ERROR("HVM restore: pfn in VGA hole");
   1.215 +                goto out;
   1.216 +            }
   1.217 +
   1.218 +
   1.219 +            mfn = p2m[pfn];
   1.220 +
   1.221 +            /* In verify mode, we use a copy; otherwise we work in place */
   1.222 +            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
   1.223 +
   1.224 +            if (!read_exact(io_fd, page, PAGE_SIZE)) {
   1.225 +                ERROR("Error when reading page (%x)", i);
   1.226 +                goto out;
   1.227 +            }
   1.228 +
   1.229 +            if (verify) {
   1.230 +
   1.231 +                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
   1.232 +
   1.233 +                if (res) {
   1.234 +
   1.235 +                    int v;
   1.236 +
   1.237 +                    DPRINTF("************** pfn=%lx mfn=%lx gotcs=%08lx "
   1.238 +                            "actualcs=%08lx\n", pfn, p2m[pfn],
   1.239 +                            csum_page(region_base + i*PAGE_SIZE),
   1.240 +                            csum_page(buf));
   1.241 +
   1.242 +                    for (v = 0; v < 4; v++) {
   1.243 +
   1.244 +                        unsigned long *p = (unsigned long *)
   1.245 +                            (region_base + i*PAGE_SIZE);
   1.246 +                        if (buf[v] != p[v])
   1.247 +                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
   1.248 +                    }
   1.249 +                }
   1.250 +            }
   1.251 +
   1.252 +        } /* end of 'batch' for loop */
   1.253 +        munmap(region_base, j*PAGE_SIZE);
    1.254 +        n += j; /* crude stats */
    1.255 +
    1.256 +    } /* while (1) */
   1.257 +    
   1.258 +/*    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/
   1.259 +    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
   1.260 +    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn);
   1.261 +
   1.262 +    if ( v_end > HVM_BELOW_4G_RAM_END )
   1.263 +        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
   1.264 +    else
   1.265 +        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
   1.266 +
   1.267 +    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1);
   1.268 +    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
   1.269 +    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
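          +    /* Special pages live at the top of guest RAM, capped below the
          +     * 4G MMIO hole: ioreq page at shared_page_nr, xenstore page one
          +     * below it, buffered-ioreq page two below it. */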
   1.270 +
    1.271 +    /* calculate the store_mfn; a wrong value causes a hang in introduceDomain */
   1.272 +    *store_mfn = (v_end >> PAGE_SHIFT) - 2;
   1.273 +    DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", *store_mfn, v_end);
   1.274 +
   1.275 +    /* restore hvm context including pic/pit/shpage */
   1.276 +    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
    1.277 +        ERROR("error reading hvm context size!\n");
   1.278 +        goto out;
   1.279 +    }
   1.280 +    if (rec_len != sizeof(hvm_ctxt)) {
    1.281 +        ERROR("hvm context size mismatch!\n");
   1.282 +        goto out;
   1.283 +    }
   1.284 +
   1.285 +    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
    1.286 +        ERROR("error reading hvm context!\n");
   1.287 +        goto out;
   1.288 +    }
   1.289 +
   1.290 +    if (( rc = xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt))) {
    1.291 +        ERROR("error setting hvm context!\n");
   1.292 +        goto out;
   1.293 +    }
   1.294 +
   1.295 +    if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
    1.296 +        ERROR("error reading nr vcpus!\n");
   1.297 +        goto out;
   1.298 +    }
   1.299 +    DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus);
   1.300 +
   1.301 +    for (i =0; i < nr_vcpus; i++) {
   1.302 +        if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
    1.303 +            ERROR("error reading vcpu context size!\n");
   1.304 +            goto out;
   1.305 +        }
   1.306 +        if (rec_len != sizeof(ctxt)) {
    1.307 +            ERROR("vcpu context size mismatch!\n");
   1.308 +            goto out;
   1.309 +        }
   1.310 +
   1.311 +        if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
    1.312 +            ERROR("error reading vcpu context.\n");
   1.313 +            goto out;
   1.314 +        }
   1.315 +
   1.316 +        if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) {
   1.317 +            ERROR("Could not set vcpu context, rc=%d", rc);
   1.318 +            goto out;
   1.319 +        }
   1.320 +    }
   1.321 +
   1.322 +    rc = 0;
   1.323 +    goto out;
   1.324 +
   1.325 + out:
   1.326 +    if ( (rc != 0) && (dom != 0) )
   1.327 +        xc_domain_destroy(xc_handle, dom);
   1.328 +    free(p2m);
   1.329 +
   1.330 +    DPRINTF("Restore exit with rc=%d\n", rc);
   1.331 +
   1.332 +    return rc;
   1.333  }
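
For reference, the stream format consumed by xc_hvm_restore above (and produced
by xc_hvm_save below) amounts to the following sketch. Field names are
illustrative only; the leading max_pfn header is written by the save side and
reaches xc_hvm_restore as its nr_pfns argument, presumably read off the stream
by the caller:

    unsigned long max_pfn;            /* header: total guest pages */
    /* repeated page batches: */
    int batch;                        /* -1 => verify mode, 0 => end of pages */
    unsigned long pfns[batch];
    uint8_t pages[batch][PAGE_SIZE];
    /* then: */
    uint32_t hvm_ctxt_size;           /* must equal sizeof(hvm_domain_context_t) */
    hvm_domain_context_t hvm_ctxt;    /* pic/pit/shpage state */
    uint32_t nr_vcpus;
    /* per vcpu: */
    uint32_t vcpu_ctxt_size;          /* must equal sizeof(vcpu_guest_context_t) */
    vcpu_guest_context_t ctxt;
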
     2.1 --- a/tools/libxc/xc_hvm_save.c	Thu Jan 18 16:48:07 2007 +0000
     2.2 +++ b/tools/libxc/xc_hvm_save.c	Thu Jan 18 16:48:08 2007 +0000
     2.3 @@ -32,9 +32,696 @@
     2.4  #include "xg_private.h"
     2.5  #include "xg_save_restore.h"
     2.6  
     2.7 +/*
     2.8 +** Default values for important tuning parameters. Can override by passing
     2.9 +** non-zero replacement values to xc_hvm_save().
    2.10 +**
    2.11 +** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
    2.12 +**
    2.13 +*/
    2.14 +#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
    2.15 +#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
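          +/* e.g. calling xc_hvm_save() with max_iters == 0 and max_factor == 0
          + * picks up these defaults (see the checks at the top of the
          + * function below). */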
    2.16 +
    2.17 +/* max mfn of the whole machine */
    2.18 +static unsigned long max_mfn;
    2.19 +
    2.20 +/* virtual starting address of the hypervisor */
    2.21 +static unsigned long hvirt_start;
    2.22 +
     2.23 +/* #levels of page tables used by the current guest */
    2.24 +static unsigned int pt_levels;
    2.25 +
    2.26 +/* total number of pages used by the current guest */
    2.27 +static unsigned long max_pfn;
    2.28 +
    2.29 +/*
    2.30 +** During (live) save/migrate, we maintain a number of bitmaps to track
    2.31 +** which pages we have to send, to fixup, and to skip.
    2.32 +*/
    2.33 +
    2.34 +#define BITS_PER_LONG (sizeof(unsigned long) * 8)
    2.35 +#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)
    2.36 +
    2.37 +#define BITMAP_ENTRY(_nr,_bmap) \
    2.38 +   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
    2.39 +
    2.40 +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
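          +/* One bit per guest pfn; BITMAP_SIZE is in bytes, rounded up to a
          + * whole number of unsigned longs. */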
    2.41 +
    2.42 +static inline int test_bit (int nr, volatile void * addr)
    2.43 +{
    2.44 +    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
    2.45 +}
    2.46 +
    2.47 +static inline void clear_bit (int nr, volatile void * addr)
    2.48 +{
    2.49 +    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
    2.50 +}
    2.51 +
    2.52 +static inline int permute( int i, int nr, int order_nr  )
    2.53 +{
    2.54 +    /* Need a simple permutation function so that we scan pages in a
    2.55 +       pseudo random order, enabling us to get a better estimate of
    2.56 +       the domain's page dirtying rate as we go (there are often
    2.57 +       contiguous ranges of pfns that have similar behaviour, and we
     2.58 +       want to mix them up.) */
    2.59 +
     2.60 +    /* e.g. nr->order 15->4 16->4 17->5 */
    2.61 +    /* 512MB domain, 128k pages, order 17 */
    2.62 +
    2.63 +    /*
    2.64 +      QPONMLKJIHGFEDCBA
    2.65 +             QPONMLKJIH
    2.66 +      GFEDCBA
    2.67 +     */
    2.68 +
    2.69 +    /*
    2.70 +      QPONMLKJIHGFEDCBA
    2.71 +                  EDCBA
    2.72 +             QPONM
    2.73 +      LKJIHGF
    2.74 +      */
    2.75 +
    2.76 +    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    2.77 +    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
    2.78 +
    2.79 +    return i;
    2.80 +}
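          +/* Worked example (illustrative): with order_nr == 17 the body
          + * rotates a 17-bit index left by 10 bits, so i == 1 -> 0x400,
          + * i == 2 -> 0x800, scattering runs of consecutive pfns. */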
    2.81 +
    2.82 +static uint64_t tv_to_us(struct timeval *new)
    2.83 +{
    2.84 +    return (new->tv_sec * 1000000) + new->tv_usec;
    2.85 +}
    2.86 +
    2.87 +static uint64_t llgettimeofday(void)
    2.88 +{
    2.89 +    struct timeval now;
    2.90 +    gettimeofday(&now, NULL);
    2.91 +    return tv_to_us(&now);
    2.92 +}
    2.93 +
    2.94 +static uint64_t tv_delta(struct timeval *new, struct timeval *old)
    2.95 +{
    2.96 +    return ((new->tv_sec - old->tv_sec)*1000000 ) +
    2.97 +        (new->tv_usec - old->tv_usec);
    2.98 +}
    2.99 +
   2.100 +
   2.101 +#define RATE_IS_MAX() (0)
   2.102 +#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
   2.103 +#define initialize_mbit_rate()
   2.104 +
   2.105 +static inline ssize_t write_exact(int fd, void *buf, size_t count)
   2.106 +{
   2.107 +    if(write(fd, buf, count) != count)
   2.108 +        return 0;
   2.109 +    return 1;
   2.110 +}
   2.111 +
   2.112 +static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
   2.113 +                       xc_shadow_op_stats_t *stats, int print)
   2.114 +{
   2.115 +    static struct timeval wall_last;
   2.116 +    static long long      d0_cpu_last;
   2.117 +    static long long      d1_cpu_last;
   2.118 +
   2.119 +    struct timeval        wall_now;
   2.120 +    long long             wall_delta;
   2.121 +    long long             d0_cpu_now, d0_cpu_delta;
   2.122 +    long long             d1_cpu_now, d1_cpu_delta;
   2.123 +
   2.124 +    gettimeofday(&wall_now, NULL);
   2.125 +
   2.126 +    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   2.127 +    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   2.128 +
   2.129 +    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
   2.130 +        DPRINTF("ARRHHH!!\n");
   2.131 +
   2.132 +    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
   2.133 +
   2.134 +    if (wall_delta == 0) wall_delta = 1;
   2.135 +
   2.136 +    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
   2.137 +    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
   2.138 +
   2.139 +    if (print)
   2.140 +        DPRINTF(
   2.141 +                "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
   2.142 +                "dirtied %dMb/s %" PRId32 " pages\n",
   2.143 +                wall_delta,
   2.144 +                (int)((d0_cpu_delta*100)/wall_delta),
   2.145 +                (int)((d1_cpu_delta*100)/wall_delta),
   2.146 +                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
   2.147 +                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
   2.148 +                stats->dirty_count);
   2.149 +
   2.150 +    d0_cpu_last = d0_cpu_now;
   2.151 +    d1_cpu_last = d1_cpu_now;
   2.152 +    wall_last   = wall_now;
   2.153 +
   2.154 +    return 0;
   2.155 +}
   2.156 +
   2.157 +static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
   2.158 +                          unsigned long *arr, int runs)
   2.159 +{
   2.160 +    long long start, now;
   2.161 +    xc_shadow_op_stats_t stats;
   2.162 +    int j;
   2.163 +
   2.164 +    start = llgettimeofday();
   2.165 +
   2.166 +    for (j = 0; j < runs; j++) {
   2.167 +        int i;
   2.168 +
   2.169 +        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
   2.170 +                          arr, max_pfn, NULL, 0, NULL);
   2.171 +        DPRINTF("#Flush\n");
   2.172 +        for ( i = 0; i < 40; i++ ) {
   2.173 +            usleep(50000);
   2.174 +            now = llgettimeofday();
   2.175 +            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
   2.176 +                              NULL, 0, NULL, 0, &stats);
   2.177 +
   2.178 +            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
   2.179 +                    ((now-start)+500)/1000,
   2.180 +                    stats.fault_count, stats.dirty_count);
   2.181 +        }
   2.182 +    }
   2.183 +
   2.184 +    return -1;
   2.185 +}
   2.186 +
   2.187 +static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
   2.188 +                             int dom, xc_dominfo_t *info,
   2.189 +                             vcpu_guest_context_t *ctxt)
   2.190 +{
   2.191 +    int i = 0;
   2.192 +
   2.193 +    if (!(*suspend)(dom)) {
   2.194 +        ERROR("Suspend request failed");
   2.195 +        return -1;
   2.196 +    }
   2.197 +
   2.198 + retry:
   2.199 +
   2.200 +    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
   2.201 +        ERROR("Could not get domain info");
   2.202 +        return -1;
   2.203 +    }
   2.204 +
   2.205 +    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
   2.206 +        ERROR("Could not get vcpu context");
   2.207 +
   2.208 +
   2.209 +    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
   2.210 +        return 0; // success
   2.211 +
   2.212 +    if (info->paused) {
   2.213 +        // try unpausing domain, wait, and retest
   2.214 +        xc_domain_unpause( xc_handle, dom );
   2.215 +
   2.216 +        ERROR("Domain was paused. Wait and re-test.");
   2.217 +        usleep(10000);  // 10ms
   2.218 +
   2.219 +        goto retry;
   2.220 +    }
   2.221 +
   2.222 +
   2.223 +    if( ++i < 100 ) {
   2.224 +        ERROR("Retry suspend domain.");
   2.225 +        usleep(10000);  // 10ms
   2.226 +        goto retry;
   2.227 +    }
   2.228 +
   2.229 +    ERROR("Unable to suspend domain.");
   2.230 +
   2.231 +    return -1;
   2.232 +}
   2.233 +
   2.234  int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   2.235                    uint32_t max_factor, uint32_t flags, int (*suspend)(int))
   2.236  {
   2.237 +    xc_dominfo_t info;
   2.238  
   2.239 -    return 0;
   2.240 +    int rc = 1, i, last_iter, iter = 0;
   2.241 +    int live  = (flags & XCFLAGS_LIVE);
   2.242 +    int debug = (flags & XCFLAGS_DEBUG);
   2.243 +    int sent_last_iter, skip_this_iter;
   2.244 +
   2.245 +    /* The new domain's shared-info frame number. */
   2.246 +    unsigned long shared_info_frame;
   2.247 +
   2.248 +    /* A copy of the CPU context of the guest. */
   2.249 +    vcpu_guest_context_t ctxt;
   2.250 +
    2.251 +    /* A table containing the type of each PFN (/not/ MFN!). */
   2.252 +    unsigned long *pfn_type = NULL;
   2.253 +    unsigned long *pfn_batch = NULL;
   2.254 +
   2.255 +    /* A copy of hvm domain context */
   2.256 +    hvm_domain_context_t hvm_ctxt;
   2.257 +
   2.258 +    /* Live mapping of shared info structure */
   2.259 +    shared_info_t *live_shinfo = NULL;
   2.260 +
   2.261 +    /* base of the region in which domain memory is mapped */
   2.262 +    unsigned char *region_base = NULL;
   2.263 +
   2.264 +    uint32_t nr_pfns, rec_size, nr_vcpus;
   2.265 +    unsigned long *page_array = NULL;
   2.266 +
   2.267 +    /* power of 2 order of max_pfn */
   2.268 +    int order_nr;
   2.269 +
   2.270 +    /* bitmap of pages:
   2.271 +       - that should be sent this iteration (unless later marked as skip);
   2.272 +       - to skip this iteration because already dirty; */
   2.273 +    unsigned long *to_send = NULL, *to_skip = NULL;
   2.274 +
   2.275 +    xc_shadow_op_stats_t stats;
   2.276 +
   2.277 +    unsigned long total_sent    = 0;
   2.278 +
   2.279 +    DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, live=%d, debug=%d.\n",
   2.280 +            dom, max_iters, max_factor, flags,
   2.281 +            live, debug);
   2.282 +
   2.283 +    /* If no explicit control parameters given, use defaults */
   2.284 +    if(!max_iters)
   2.285 +        max_iters = DEF_MAX_ITERS;
   2.286 +    if(!max_factor)
   2.287 +        max_factor = DEF_MAX_FACTOR;
   2.288 +
   2.289 +    initialize_mbit_rate();
   2.290 +
   2.291 +    if(!get_platform_info(xc_handle, dom,
   2.292 +                          &max_mfn, &hvirt_start, &pt_levels)) {
   2.293 +        ERROR("HVM:Unable to get platform info.");
   2.294 +        return 1;
   2.295 +    }
   2.296 +
   2.297 +    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
   2.298 +        ERROR("HVM:Could not get domain info");
   2.299 +        return 1;
   2.300 +    }
   2.301 +    nr_vcpus = info.nr_online_vcpus;
   2.302 +
   2.303 +    if (mlock(&ctxt, sizeof(ctxt))) {
   2.304 +        ERROR("HVM:Unable to mlock ctxt");
   2.305 +        return 1;
   2.306 +    }
   2.307 +
   2.308 +    /* Only have to worry about vcpu 0 even for SMP */
   2.309 +    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
   2.310 +        ERROR("HVM:Could not get vcpu context");
   2.311 +        goto out;
   2.312 +    }
   2.313 +    shared_info_frame = info.shared_info_frame;
   2.314 +
   2.315 +    /* A cheesy test to see whether the domain contains valid state. */
   2.316 +    if (ctxt.ctrlreg[3] == 0)
   2.317 +    {
   2.318 +        ERROR("Domain is not in a valid HVM guest state");
   2.319 +        goto out;
   2.320 +    }
   2.321 +
   2.322 +   /* cheesy sanity check */
   2.323 +    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
   2.324 +        ERROR("Invalid HVM state record -- pfn count out of range: %lu",
   2.325 +            (info.max_memkb >> (PAGE_SHIFT - 10)));
   2.326 +        goto out;
   2.327 +    }
   2.328 +
   2.329 +    /* Map the shared info frame */
   2.330 +    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   2.331 +                                            PROT_READ, shared_info_frame))) {
   2.332 +        ERROR("HVM:Couldn't map live_shinfo");
   2.333 +        goto out;
   2.334 +    }
   2.335 +
   2.336 +    max_pfn = live_shinfo->arch.max_pfn;
   2.337 +
   2.338 +    DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx, nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages); 
   2.339 +
   2.340 +    /* nr_pfns: total pages excluding vga acc mem
    2.341 +     * max_pfn: nr_pfns + 0x20 (the VGA hole, 0xa0~0xc0)
   2.342 +     * getdomaininfo.tot_pages: all the allocated pages for this domain
   2.343 +     */
   2.344 +    if (live) {
    2.345 +        ERROR("HVM domains do not support live migration yet.\n");
   2.346 +        goto out;
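          +        /* NB: the live-migration path below is unreachable for now;
          +         * it is kept in place for when live HVM migration lands. */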
   2.347 +
   2.348 +        if (xc_shadow_control(xc_handle, dom,
   2.349 +                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
   2.350 +                              NULL, 0, NULL, 0, NULL) < 0) {
   2.351 +            ERROR("Couldn't enable shadow mode");
   2.352 +            goto out;
   2.353 +        }
   2.354 +
   2.355 +        /* excludes vga acc mem */
   2.356 +        nr_pfns = info.nr_pages - 0x800;
   2.357 +
   2.358 +        last_iter = 0;
   2.359 +        DPRINTF("hvm domain live migration debug start: logdirty enable.\n");
   2.360 +    } else {
   2.361 +        /* This is a non-live suspend. Issue the call back to get the
   2.362 +           domain suspended */
   2.363 +
   2.364 +        last_iter = 1;
   2.365 +
   2.366 +        /* suspend hvm domain */
   2.367 +        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
   2.368 +            ERROR("HVM Domain appears not to have suspended");
   2.369 +            goto out;
   2.370 +        }
   2.371 +        nr_pfns = info.nr_pages;
   2.372 +        DPRINTF("after suspend hvm domain nr_pages=0x%x.\n", nr_pfns);
   2.373 +    }
   2.374 +
   2.375 +    DPRINTF("after 1st handle hvm domain nr_pfns=0x%x, nr_pages=0x%lx, max_memkb=0x%lx, live=%d.\n",
   2.376 +            nr_pfns,
   2.377 +            info.nr_pages,
   2.378 +            info.max_memkb,
   2.379 +            live);
   2.380 +
   2.381 +    nr_pfns = info.nr_pages;
   2.382 +
    2.383 +    /* XXX: calculate the VGA hole */
   2.384 +    max_pfn = nr_pfns + 0x20;
   2.385 +
   2.386 +    skip_this_iter = 0;/*XXX*/
   2.387 +    /* pretend we sent all the pages last iteration */
   2.388 +    sent_last_iter = max_pfn;
   2.389 +
   2.390 +    /* calculate the power of 2 order of max_pfn, e.g.
   2.391 +       15->4 16->4 17->5 */
   2.392 +    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
   2.393 +        continue;
   2.394 +
    2.395 +    /* Set up the to_send and to_skip bitmaps */
   2.396 +    to_send = malloc(BITMAP_SIZE);
   2.397 +    to_skip = malloc(BITMAP_SIZE);
   2.398 +
   2.399 +    if (!to_send ||!to_skip) {
   2.400 +        ERROR("Couldn't allocate to_send array");
   2.401 +        goto out;
   2.402 +    }
   2.403 +
   2.404 +    memset(to_send, 0xff, BITMAP_SIZE);
   2.405 +
   2.406 +    if (lock_pages(to_send, BITMAP_SIZE)) {
   2.407 +        ERROR("Unable to lock to_send");
   2.408 +        return 1;
   2.409 +    }
   2.410 +
    2.411 +    /* (to_skip is local only) */
   2.412 +    if (lock_pages(to_skip, BITMAP_SIZE)) {
   2.413 +        ERROR("Unable to lock to_skip");
   2.414 +        return 1;
   2.415 +    }
   2.416 +
   2.417 +    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
   2.418 +
   2.419 +    /* get all the HVM domain pfns */
   2.420 +    if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * max_pfn)) == NULL) {
    2.421 +        ERROR("HVM: malloc failed!\n");
   2.422 +        goto out;
   2.423 +    }
   2.424 +
   2.425 +    for ( i = 0; i < max_pfn; i++)
   2.426 +        page_array[i] = i;
   2.427 +
   2.428 +
   2.429 +    /* We want zeroed memory so use calloc rather than malloc. */
   2.430 +    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
   2.431 +    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
   2.432 +
   2.433 +    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
   2.434 +        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
   2.435 +        errno = ENOMEM;
   2.436 +        goto out;
   2.437 +    }
   2.438 +
   2.439 +    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
   2.440 +        ERROR("Unable to lock");
   2.441 +        goto out;
   2.442 +    }
   2.443 +
   2.444 +    /* Start writing out the saved-domain record. */
   2.445 +    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
   2.446 +        ERROR("write: max_pfn");
   2.447 +        goto out;
   2.448 +    }
   2.449 +
   2.450 +    while(1) {
   2.451 +
   2.452 +        unsigned int prev_pc, sent_this_iter, N, batch;
   2.453 +
   2.454 +        iter++;
   2.455 +        sent_this_iter = 0;
   2.456 +        skip_this_iter = 0;
   2.457 +        prev_pc = 0;
   2.458 +        N=0;
   2.459 +
   2.460 +        DPRINTF("Saving HVM domain memory pages: iter %d   0%%", iter);
   2.461 +
   2.462 +        while( N < max_pfn ){
   2.463 +
   2.464 +            unsigned int this_pc = (N * 100) / max_pfn;
   2.465 +
   2.466 +            if ((this_pc - prev_pc) >= 5) {
   2.467 +                DPRINTF("\b\b\b\b%3d%%", this_pc);
   2.468 +                prev_pc = this_pc;
   2.469 +            }
   2.470 +
    2.471 +            /* slightly wasteful to peek the whole array every time,
   2.472 +               but this is fast enough for the moment. */
   2.473 +            if (!last_iter && xc_shadow_control(
   2.474 +                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
   2.475 +                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
   2.476 +                ERROR("Error peeking HVM shadow bitmap");
   2.477 +                goto out;
   2.478 +            }
   2.479 +
   2.480 +
   2.481 +            /* load pfn_type[] with the mfn of all the pages we're doing in
   2.482 +               this batch. */
   2.483 +            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
   2.484 +
   2.485 +                int n = permute(N, max_pfn, order_nr);
   2.486 +
   2.487 +                if (debug) {
   2.488 +                    DPRINTF("%d pfn= %08lx mfn= %08lx %d \n",
   2.489 +                            iter, (unsigned long)n, page_array[n],
   2.490 +                            test_bit(n, to_send));
   2.491 +                }
   2.492 +
    2.493 +                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
   2.494 +                    skip_this_iter++; /* stats keeping */
   2.495 +
   2.496 +                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
   2.497 +                      (test_bit(n, to_send) && last_iter)))
   2.498 +                    continue;
   2.499 +
   2.500 +                if (n >= 0xa0 && n < 0xc0) {
   2.501 +/*                    DPRINTF("get a vga hole pfn= %x.\n", n);*/
   2.502 +                    continue;
   2.503 +                }
   2.504 +                /*
   2.505 +                ** we get here if:
   2.506 +                **  1. page is marked to_send & hasn't already been re-dirtied
   2.507 +                **  2. (ignore to_skip in last iteration)
   2.508 +                */
   2.509 +
   2.510 +                pfn_batch[batch] = n;
   2.511 +                pfn_type[batch]  = page_array[n];
   2.512 +
   2.513 +                batch++;
   2.514 +            }
   2.515 +
   2.516 +            if (batch == 0)
   2.517 +                goto skip; /* vanishingly unlikely... */
   2.518 +
    2.519 +            /* map_foreign uses pfns now! */
   2.520 +            if ((region_base = xc_map_foreign_batch(
   2.521 +                     xc_handle, dom, PROT_READ, pfn_batch, batch)) == 0) {
   2.522 +                ERROR("map batch failed");
   2.523 +                goto out;
   2.524 +            }
   2.525 +
   2.526 +            /* write num of pfns */
   2.527 +            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
   2.528 +                ERROR("Error when writing to state file (2)");
   2.529 +                goto out;
   2.530 +            }
   2.531 +
   2.532 +            /* write all the pfns */
   2.533 +            if(!write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch)) {
   2.534 +                ERROR("Error when writing to state file (3)");
   2.535 +                goto out;
   2.536 +            }
   2.537 +
   2.538 +            if (ratewrite(io_fd, region_base, PAGE_SIZE * batch) != PAGE_SIZE * batch) {
    2.539 +                ERROR("Error when writing to state file (4)");
   2.540 +                goto out;
   2.541 +            }
   2.542 +
   2.543 +
   2.544 +            sent_this_iter += batch;
   2.545 +
   2.546 +            munmap(region_base, batch*PAGE_SIZE);
   2.547 +
   2.548 +        } /* end of this while loop for this iteration */
   2.549 +
   2.550 +      skip:
   2.551 +
   2.552 +        total_sent += sent_this_iter;
   2.553 +
   2.554 +        DPRINTF("\r %d: sent %d, skipped %d, ",
   2.555 +                iter, sent_this_iter, skip_this_iter );
   2.556 +
   2.557 +        if (last_iter) {
   2.558 +            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
   2.559 +
   2.560 +            DPRINTF("Total pages sent= %ld (%.2fx)\n",
   2.561 +                    total_sent, ((float)total_sent)/max_pfn );
   2.562 +        }
   2.563 +
   2.564 +        if (last_iter && debug){
   2.565 +            int minusone = -1;
   2.566 +            memset(to_send, 0xff, BITMAP_SIZE);
   2.567 +            debug = 0;
   2.568 +            DPRINTF("Entering debug resend-all mode\n");
   2.569 +
   2.570 +            /* send "-1" to put receiver into debug mode */
   2.571 +            if(!write_exact(io_fd, &minusone, sizeof(int))) {
   2.572 +                ERROR("Error when writing to state file (6)");
   2.573 +                goto out;
   2.574 +            }
   2.575 +
   2.576 +            continue;
   2.577 +        }
   2.578 +
   2.579 +        if (last_iter) break;
   2.580 +
   2.581 +        if (live) {
   2.582 +
   2.583 +
   2.584 +            if(
   2.585 +                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
   2.586 +                (iter >= max_iters) ||
   2.587 +                (sent_this_iter+skip_this_iter < 50) ||
   2.588 +                (total_sent > max_pfn*max_factor) ) {
   2.589 +
   2.590 +                DPRINTF("Start last iteration for HVM domain\n");
   2.591 +                last_iter = 1;
   2.592 +
   2.593 +                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
   2.594 +                                      &ctxt)) {
   2.595 +                    ERROR("Domain appears not to have suspended");
   2.596 +                    goto out;
   2.597 +                }
   2.598 +
   2.599 +                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
   2.600 +                        info.shared_info_frame,
   2.601 +                        (unsigned long)ctxt.user_regs.eip,
   2.602 +                        (unsigned long)ctxt.user_regs.edx);
   2.603 +            }
   2.604 +
   2.605 +            if (xc_shadow_control(xc_handle, dom, 
   2.606 +                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
   2.607 +                                  max_pfn, NULL, 0, &stats) != max_pfn) {
   2.608 +                ERROR("Error flushing shadow PT");
   2.609 +                goto out;
   2.610 +            }
   2.611 +
   2.612 +            sent_last_iter = sent_this_iter;
   2.613 +
   2.614 +            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
   2.615 +
   2.616 +        }
   2.617 +
   2.618 +
   2.619 +    } /* end of while 1 */
   2.620 +
   2.621 +
   2.622 +    DPRINTF("All HVM memory is saved\n");
   2.623 +
   2.624 +    /* Zero terminate */
   2.625 +    i = 0;
   2.626 +    if (!write_exact(io_fd, &i, sizeof(int))) {
   2.627 +        ERROR("Error when writing to state file (6)");
   2.628 +        goto out;
   2.629 +    }
   2.630 +
   2.631 +    /* save hvm hypervisor state including pic/pit/shpage */
   2.632 +    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
   2.633 +        ERROR("Unable to mlock ctxt");
   2.634 +        return 1;
   2.635 +    }
   2.636 +
   2.637 +    if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)){
   2.638 +        ERROR("HVM:Could not get hvm context");
   2.639 +        goto out;
   2.640 +    }
   2.641 +
   2.642 +    rec_size = sizeof(hvm_ctxt);
   2.643 +    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
   2.644 +        ERROR("error write hvm ctxt size");
   2.645 +        goto out;
   2.646 +    }
   2.647 +
   2.648 +    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
    2.649 +        ERROR("writing HVM info failed!\n");
   2.650 +    }
   2.651 +
   2.652 +    /* save vcpu/vmcs context */
   2.653 +    if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
   2.654 +        ERROR("error write nr vcpus");
   2.655 +        goto out;
   2.656 +    }
   2.657 +
    2.658 +    /* XXX: need an online map to exclude offline cpus */
   2.659 +    for (i = 0; i < nr_vcpus; i++) {
   2.660 +
   2.661 +        if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
   2.662 +            ERROR("HVM:Could not get vcpu context");
   2.663 +            goto out;
   2.664 +        }
   2.665 +
   2.666 +        rec_size = sizeof(ctxt);
    2.667 +        DPRINTF("write vcpu context %d of total %d.\n", i, nr_vcpus);
   2.668 +        if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
   2.669 +            ERROR("error write vcpu ctxt size");
   2.670 +            goto out;
   2.671 +        }
   2.672 +
   2.673 +        if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
    2.674 +            ERROR("writing vmcs failed!\n");
   2.675 +            goto out;
   2.676 +        }
   2.677 +    }
   2.678 + 
   2.679 +    /* Success! */
   2.680 +    rc = 0;
   2.681 +
   2.682 + out:
   2.683 +
   2.684 +    if (live) {
   2.685 +        if(xc_shadow_control(xc_handle, dom, 
   2.686 +                             XEN_DOMCTL_SHADOW_OP_OFF,
   2.687 +                             NULL, 0, NULL, 0, NULL) < 0) {
   2.688 +            DPRINTF("Warning - couldn't disable shadow mode");
   2.689 +        }
   2.690 +    }
   2.691 +
   2.692 +    free(page_array);
   2.693 +
   2.694 +    free(pfn_type);
   2.695 +    free(pfn_batch);
   2.696 +    free(to_send);
   2.697 +    free(to_skip);
   2.698 +
   2.699 +    return !!rc;
   2.700  }
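
For callers, a minimal sketch of driving the save side added above; the fd
plumbing and the suspend callback are hypothetical placeholders, not part of
this changeset:

    /* Sketch only: error handling and fd setup elided. */
    static int my_suspend(int domid)
    {
        /* Ask the control stack to suspend the domain; xc_hvm_save
         * treats a zero return as failure.  Placeholder. */
        return 1;
    }

    void save_example(int xc_handle, int io_fd, uint32_t dom)
    {
        /* max_iters/max_factor of 0 select DEF_MAX_ITERS/DEF_MAX_FACTOR;
         * flags without XCFLAGS_LIVE requests a non-live save (live is
         * rejected by this version anyway). */
        xc_hvm_save(xc_handle, io_fd, dom, 0, 0, 0, my_suspend);
    }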