+++ /dev/null
-/******************************************************************************
- * xc_domain_restore.c
- *
- * Restore the state of a guest session.
- *
- * Copyright (c) 2003, K A Fraser.
- * Copyright (c) 2006, Intel Corporation
- * Copyright (c) 2007, XenSource Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- */
-
-/*
- * The superpages flag in restore has two different meanings depending on
- * the type of domain.
- *
- * For an HVM domain, the flag means to look for properly aligned contiguous
- * pages and try to allocate a superpage to satisfy it. If that fails,
- * fall back to small pages.
- *
- * For a PV domain, the flag means allocate all memory as superpages. If that
- * fails, the restore fails. This behavior is required for PV guests who
- * want to use superpages.
- */
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <inttypes.h>
-
-#include "xg_private.h"
-#include "xg_save_restore.h"
-#include "xc_dom.h"
-
-#include <xen/hvm/ioreq.h>
-#include <xen/hvm/params.h>
-
-struct restore_ctx { /* State of one restore, passed to the helpers in this file. */
- unsigned long max_mfn; /* max mfn of the current host machine */
- unsigned long hvirt_start; /* virtual starting address of the hypervisor */
- unsigned int pt_levels; /* #levels of page tables used by the current guest */
- unsigned long nr_pfns; /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
- xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
- xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. INVALID_P2M_ENTRY-1 marks a pfn queued in p2m_batch; INVALID_P2M_ENTRY-2 marks a superpage base already queued by alloc_superpage_mfns(). */
- xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
- xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage alloc */
- int superpages; /* Superpage allocation has been requested */
- int hvm; /* This is an hvm domain */
- int completed; /* Set when a consistent image is available; rdexact() then waits at most HEARTBEAT_MS for data before each read */
- int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
- int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */
- struct domain_info_context dinfo; /* Guest parameters; guest_width and p2m_size are the members used in this file */
-};
-
-#define HEARTBEAT_MS 1000
-
-#ifndef __MINIOS__
-/*
- * Read exactly 'size' bytes from 'fd' into 'buf'.
- *
- * Once ctx->completed is set, every read is preceded by a select() that
- * waits at most HEARTBEAT_MS milliseconds for data.  If nothing arrives in
- * that time the call fails with errno set to ETIMEDOUT.  A select() failure
- * other than EINTR fails the call with select()'s own errno, so that it is
- * not mistaken for a missed heartbeat.
- *
- * Returns 0 on success and -1 on failure.  An end of stream before 'size'
- * bytes were read is a failure with errno set to 0.
- */
-static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
-                       int fd, void* buf, size_t size)
-{
-    char *dst = buf; /* byte pointer: arithmetic on void * is a GNU extension */
-    size_t offset = 0;
-    ssize_t len;
-    struct timeval tv;
-    fd_set rfds;
-
-    while ( offset < size )
-    {
-        if ( ctx->completed ) {
-            /* expect a heartbeat every HEARTBEAT_MS ms maximum */
-            tv.tv_sec = HEARTBEAT_MS / 1000;
-            tv.tv_usec = (HEARTBEAT_MS % 1000) * 1000;
-
-            FD_ZERO(&rfds);
-            FD_SET(fd, &rfds);
-            len = select(fd + 1, &rfds, NULL, NULL, &tv);
-            if ( len == -1 && errno == EINTR )
-                continue;
-            if ( len == -1 ) {
-                /*
-                 * The descriptor sets are unspecified after a failed
-                 * select(), so rfds must not be examined.  Keep the errno
-                 * of select() across the logging call.
-                 */
-                int saved_errno = errno;
-
-                ERROR("%s failed (select rc: %zd, errno: %d)",
-                      __func__, len, saved_errno);
-                errno = saved_errno;
-                return -1;
-            }
-            if ( !FD_ISSET(fd, &rfds) ) {
-                ERROR("%s failed (select returned %zd)", __func__, len);
-                errno = ETIMEDOUT;
-                return -1;
-            }
-        }
-
-        len = read(fd, dst + offset, size - offset);
-        if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) )
-            continue;
-        if ( len == 0 ) {
-            ERROR("0-length read");
-            errno = 0;
-        }
-        if ( len <= 0 ) {
-            ERROR("%s failed (read rc: %zd, errno: %d)", __func__, len, errno);
-            return -1;
-        }
-        offset += len;
-    }
-
-    return 0;
-}
-
-#define RDEXACT(fd,buf,size) rdexact(xch, ctx, fd, buf, size)
-#else
-#define RDEXACT read_exact
-#endif
-
-#define SUPERPAGE_PFN_SHIFT 9 /* log2 of the number of pfns in one superpage */
-#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT) /* pfns per superpage (512) */
-#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1))) /* _pfn rounded down to a multiple of SUPERPAGE_NR_PFNS */
-#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 ) /* non-zero if pfn is a multiple of SUPERPAGE_NR_PFNS */
-
-/*
-** Allocation helper for pv guests restored with superpages.
-**
-** The first nr_mfns entries of p2m_batch hold the pfns that need backing
-** memory.  They are reduced to one base pfn per superpage that has not been
-** marked yet (marker: INVALID_P2M_ENTRY-2 in the p2m slot of the base pfn).
-** The base pfns are also stored in p2m_saved_batch, because after the
-** allocation call the mfns are read back from p2m_batch.  Finally the p2m
-** array is filled in for all SUPERPAGE_NR_PFNS pfns of every superpage
-** that was allocated.
-**
-** Returns 0 on success and 1 if the allocation call fails.
-*/
-static int alloc_superpage_mfns(
-    xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns)
-{
-    int idx, nr_super = 0;
-    unsigned long off;
-
-    /* Collapse the batch to the superpage bases that are not marked yet. */
-    for ( idx = 0; idx < nr_mfns; idx++ )
-    {
-        unsigned long cur_pfn = ctx->p2m_batch[idx];
-        unsigned long super_base = SUPERPAGE(cur_pfn);
-
-        if ( ctx->p2m[super_base] == (INVALID_P2M_ENTRY-2) )
-            continue;
-
-        ctx->p2m_saved_batch[nr_super] = super_base;
-        ctx->p2m_batch[nr_super] = super_base;
-        nr_super++;
-        ctx->p2m[super_base] = INVALID_P2M_ENTRY-2;
-    }
-
-    if ( xc_domain_populate_physmap_exact(xch, dom, nr_super,
-                                          SUPERPAGE_PFN_SHIFT, 0,
-                                          ctx->p2m_batch) != 0 )
-        return 1;
-
-    /* Record the mfn of every pfn inside each new superpage. */
-    for ( idx = 0; idx < nr_super; idx++ )
-    {
-        unsigned long first_mfn = ctx->p2m_batch[idx];
-        unsigned long first_pfn = ctx->p2m_saved_batch[idx];
-
-        for ( off = 0; off < SUPERPAGE_NR_PFNS; off++ )
-            ctx->p2m[first_pfn + off] = first_mfn + off;
-    }
-
-    return 0;
-}
-/*
-** Page-table pages travel in the state file (or over the wire) in a
-** 'canonical' form: every reference to an mfn has been replaced by the
-** matching pfn.  This function undoes that, rewriting each present entry
-** of 'page' so that it refers to the mfn now recorded in the p2m array.
-** pfns that have no mfn yet get one allocated on the way.
-**
-** Returns 1 on success and 0 on failure (a frame number outside the p2m,
-** or a failed allocation, in which case errno is set to ENOMEM).
-*/
-static int uncanonicalize_pagetable(
-    xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
-{
-    struct domain_info_context *dinfo = &ctx->dinfo;
-    uint64_t *ptes = page;
-    const int nr_ptes = PAGE_SIZE / 8;
-    int slot, rc, nr_wanted = 0, next_mfn = 0;
-    unsigned long pfn;
-    uint64_t entry;
-
-    /* Pass 1: queue every referenced pfn that has no mfn yet. */
-    for ( slot = 0; slot < nr_ptes; slot++ )
-    {
-        entry = ptes[slot];
-
-        /* XXX SMH: below needs fixing for PROT_NONE etc */
-        if ( (entry & _PAGE_PRESENT) == 0 )
-            continue;
-
-        pfn = (entry >> PAGE_SHIFT) & MFN_MASK_X86;
-
-        if ( pfn >= dinfo->p2m_size )
-        {
-            /* This "page table page" is probably not one; bail. */
-            ERROR("Frame number in page table is out of range: "
-                  "i=%d pfn=0x%lx p2m_size=%lu",
-                  slot, pfn, dinfo->p2m_size);
-            return 0;
-        }
-
-        if ( ctx->p2m[pfn] != INVALID_P2M_ENTRY )
-            continue;
-
-        /* A 'valid' pfn without an mfn: queue it and mark the p2m slot. */
-        ctx->p2m_batch[nr_wanted] = pfn;
-        nr_wanted++;
-        ctx->p2m[pfn] -= 1;
-    }
-
-    /* Back the queued pfns with memory. */
-    if ( nr_wanted != 0 )
-    {
-        if ( ctx->superpages && !ctx->hvm )
-            rc = alloc_superpage_mfns(xch, dom, ctx, nr_wanted);
-        else
-            rc = xc_domain_populate_physmap_exact(xch, dom, nr_wanted, 0, 0,
-                                                  ctx->p2m_batch);
-
-        if ( rc != 0 )
-        {
-            ERROR("Failed to allocate memory for batch.!\n");
-            errno = ENOMEM;
-            return 0;
-        }
-    }
-
-    /* Pass 2: replace the pfn in each present entry with its mfn. */
-    for ( slot = 0; slot < nr_ptes; slot++ )
-    {
-        entry = ptes[slot];
-
-        /* XXX SMH: below needs fixing for PROT_NONE etc */
-        if ( (entry & _PAGE_PRESENT) == 0 )
-            continue;
-
-        pfn = (entry >> PAGE_SHIFT) & MFN_MASK_X86;
-
-        /* Slots still carrying the pass-1 marker take the next new mfn. */
-        if ( ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) )
-        {
-            ctx->p2m[pfn] = ctx->p2m_batch[next_mfn];
-            next_mfn++;
-        }
-
-        entry &= ~MADDR_MASK_X86;
-        entry |= (uint64_t)ctx->p2m[pfn] << PAGE_SHIFT;
-
-        ptes[slot] = (uint64_t)entry;
-    }
-
-    return 1;
-}
-
-
-/*
- * Load the p2m frame list, plus potential extended info chunk.
- *
- * The stream starts either with the first entry of the p2m frame list or
- * with the extended-info signature (~0UL).  In the latter case a sequence
- * of chunks follows, each made of a 4-character signature, a 4-byte length
- * and that many payload bytes:
- *   "vcpu" - a vcpu context; its size selects dinfo->guest_width and
- *            ctx->pt_levels, and *pae_extended_cr3 is set to 1 if the
- *            context has the pae_extended_cr3 vm_assist bit set;
- *   "extv" - sets *ext_vcpucontext to 1;
- *   "xcnt" - the payload starts with the 4-byte value stored in
- *            *vcpuextstate_size.
- * Payload bytes not consumed by a chunk handler are read and discarded.
- *
- * Returns a buffer obtained from malloc() holding the frame list, or NULL
- * on failure.  Nothing in this function releases the buffer on success.
- */
-static xen_pfn_t *load_p2m_frame_list(
-    xc_interface *xch, struct restore_ctx *ctx,
-    int io_fd, int *pae_extended_cr3, int *ext_vcpucontext,
-    uint32_t *vcpuextstate_size)
-{
-    xen_pfn_t *p2m_frame_list;
-    vcpu_guest_context_any_t ctxt;
-    xen_pfn_t p2m_fl_zero;
-    struct domain_info_context *dinfo = &ctx->dinfo;
-
-    /* Read first entry of P2M list, or extended-info signature (~0UL). */
-    if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(long)) )
-    {
-        PERROR("read extended-info signature failed");
-        return NULL;
-    }
-
-    if ( p2m_fl_zero == ~0UL )
-    {
-        uint32_t tot_bytes;
-
-        /* Next 4 bytes: total size of following extended info. */
-        if ( RDEXACT(io_fd, &tot_bytes, sizeof(tot_bytes)) )
-        {
-            PERROR("read extended-info size failed");
-            return NULL;
-        }
-
-        while ( tot_bytes )
-        {
-            uint32_t chunk_bytes;
-            char chunk_sig[4];
-
-            /* 4-character chunk signature + 4-byte remaining chunk size. */
-            if ( RDEXACT(io_fd, chunk_sig, sizeof(chunk_sig)) ||
-                 RDEXACT(io_fd, &chunk_bytes, sizeof(chunk_bytes)) )
-            {
-                PERROR("read extended-info chunk signature failed");
-                return NULL;
-            }
-
-            /*
-             * The chunk header plus payload must fit in what is left.  The
-             * test is written without "chunk_bytes + 8", which wraps for
-             * sizes close to UINT32_MAX and would let tot_bytes underflow.
-             */
-            if ( (tot_bytes < 8) || (chunk_bytes > (tot_bytes - 8)) )
-            {
-                ERROR("bad extended-info chunk size %"PRIu32
-                      " (%"PRIu32" bytes remaining)", chunk_bytes, tot_bytes);
-                return NULL;
-            }
-            tot_bytes -= 8;
-
-            /* VCPU context structure? */
-            if ( !strncmp(chunk_sig, "vcpu", 4) )
-            {
-                /* Pick a guest word-size and PT depth from the ctxt size */
-                if ( chunk_bytes == sizeof (ctxt.x32) )
-                {
-                    dinfo->guest_width = 4;
-                    ctx->pt_levels = 3;
-                }
-                else if ( chunk_bytes == sizeof (ctxt.x64) )
-                {
-                    dinfo->guest_width = 8;
-                    ctx->pt_levels = 4;
-                }
-                else
-                {
-                    ERROR("bad extended-info context size %d", chunk_bytes);
-                    return NULL;
-                }
-
-                if ( RDEXACT(io_fd, &ctxt, chunk_bytes) )
-                {
-                    PERROR("read extended-info vcpu context failed");
-                    return NULL;
-                }
-                tot_bytes -= chunk_bytes;
-                chunk_bytes = 0;
-
-                if ( GET_FIELD(&ctxt, vm_assist, dinfo->guest_width)
-                     & (1UL << VMASST_TYPE_pae_extended_cr3) )
-                    *pae_extended_cr3 = 1;
-            }
-            else if ( !strncmp(chunk_sig, "extv", 4) )
-            {
-                *ext_vcpucontext = 1;
-            }
-            else if ( !strncmp(chunk_sig, "xcnt", 4) )
-            {
-                /*
-                 * Only the leading 4 bytes are consumed here, so account
-                 * for exactly those; anything beyond them is discarded by
-                 * the loop below, keeping the stream position in step.
-                 */
-                if ( chunk_bytes < sizeof(*vcpuextstate_size) )
-                {
-                    ERROR("extended vcpu state size chunk too short (%"PRIu32
-                          " bytes)", chunk_bytes);
-                    return NULL;
-                }
-                if ( RDEXACT(io_fd, vcpuextstate_size, sizeof(*vcpuextstate_size)) )
-                {
-                    PERROR("read extended vcpu state size failed");
-                    return NULL;
-                }
-                tot_bytes -= sizeof(*vcpuextstate_size);
-                chunk_bytes -= sizeof(*vcpuextstate_size);
-            }
-
-            /* Any remaining bytes of this chunk: read and discard. */
-            while ( chunk_bytes )
-            {
-                unsigned long sz = min_t(unsigned long, chunk_bytes, sizeof(xen_pfn_t));
-                if ( RDEXACT(io_fd, &p2m_fl_zero, sz) )
-                {
-                    PERROR("read-and-discard extended-info chunk bytes failed");
-                    return NULL;
-                }
-                chunk_bytes -= sz;
-                tot_bytes -= sz;
-            }
-        }
-
-        /* Now read the real first entry of P2M list. */
-        if ( RDEXACT(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) )
-        {
-            PERROR("read first entry of p2m_frame_list failed");
-            return NULL;
-        }
-    }
-
-    /* Now that we know the guest's word-size, can safely allocate
-     * the p2m frame list */
-    if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL )
-    {
-        ERROR("Couldn't allocate p2m_frame_list array");
-        return NULL;
-    }
-
-    /* First entry has already been read. */
-    p2m_frame_list[0] = p2m_fl_zero;
-    if ( RDEXACT(io_fd, &p2m_frame_list[1],
-                 (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) )
-    {
-        PERROR("read p2m_frame_list failed");
-        free(p2m_frame_list);
-        return NULL;
-    }
-
-    return p2m_frame_list;
-}
-
-typedef struct { /* Buffered tail of the stream; buffer_tail() and tailbuf_free() use ishvm to pick the union member. */
- int ishvm;
- union {
- struct tailbuf_pv {
- unsigned int pfncount; /* number of entries read into pfntab */
- unsigned long* pfntab;
- unsigned int vcpucount; /* number of bits set in the vcpumap for ids 0..max_vcpu_id */
- unsigned char* vcpubuf; /* raw vcpu context bytes as read from the stream */
- unsigned char shared_info_page[PAGE_SIZE];
- } pv;
- struct tailbuf_hvm {
- uint64_t magicpfns[3];
- uint32_t hvmbufsize, reclen; /* allocated size of hvmbuf, and length of the record last read into it */
- uint8_t* hvmbuf;
- struct {
- uint32_t magic;
- uint32_t version;
- uint64_t len;
- } qemuhdr;
- uint32_t qemubufsize; /* number of bytes of QEMU data held in qemubuf */
- uint8_t* qemubuf;
- } hvm;
- } u;
-} tailbuf_t;
-
-/*
- * read stream until EOF, growing buffer as necessary
- *
- * Used for the legacy QEMU record, which carries no length.  The data must
- * start with the 4-byte magic "QEVM".  On success the buffer and the number
- * of bytes read are stored in buf->qemubuf / buf->qemubufsize and 0 is
- * returned.  On failure the buffer is released and -1 is returned.
- */
-static int compat_buffer_qemu(xc_interface *xch, struct restore_ctx *ctx,
-                              int fd, struct tailbuf_hvm *buf)
-{
-    uint8_t *qbuf, *tmp;
-    int blen = 0, dlen = 0;
-    int rc;
-
-    /* currently save records tend to be about 7K */
-    blen = 8192;
-    if ( !(qbuf = malloc(blen)) ) {
-        ERROR("Error allocating QEMU buffer");
-        return -1;
-    }
-
-    while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
-        DPRINTF("Read %d bytes of QEMU data\n", rc);
-        dlen += rc;
-
-        if (dlen == blen) {
-            DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
-            blen += 4096;
-            tmp = realloc(qbuf, blen);
-            if ( !tmp ) {
-                ERROR("Error growing QEMU buffer to %d bytes", blen);
-                free(qbuf);
-                return -1;
-            }
-            qbuf = tmp;
-        }
-    }
-
-    if ( rc < 0 ) {
-        ERROR("Error reading QEMU data");
-        free(qbuf);
-        return -1;
-    }
-
-    /*
-     * The magic can only be examined if at least 4 bytes arrived; the rest
-     * of the buffer is uninitialised memory.
-     */
-    if ( dlen < 4 ) {
-        ERROR("QEMU record too short (%d bytes)", dlen);
-        free(qbuf);
-        return -1;
-    }
-
-    if ( memcmp(qbuf, "QEVM", 4) ) {
-        ERROR("Invalid QEMU magic: 0x%08"PRIx32, *(uint32_t*)qbuf);
-        free(qbuf);
-        return -1;
-    }
-
-    buf->qemubuf = qbuf;
-    buf->qemubufsize = dlen;
-
-    return 0;
-}
-
-/*
- * Read a length-prefixed QEMU record: a 32-bit length followed by that many
- * bytes.  buf->qemubuf is allocated or enlarged when the record is longer
- * than buf->qemubufsize, and buf->qemubufsize is set to the record length.
- * Returns 0 on success, -1 on a read or allocation failure.
- */
-static int buffer_qemu(xc_interface *xch, struct restore_ctx *ctx,
-                       int fd, struct tailbuf_hvm *buf)
-{
-    uint32_t qlen;
-    uint8_t *grown;
-
-    if ( RDEXACT(fd, &qlen, sizeof(qlen)) )
-    {
-        PERROR("Error reading QEMU header length");
-        return -1;
-    }
-
-    if ( qlen > buf->qemubufsize )
-    {
-        if ( buf->qemubuf == NULL )
-        {
-            /* No buffer yet: allocate a fresh one. */
-            buf->qemubuf = malloc(qlen);
-            if ( buf->qemubuf == NULL )
-            {
-                ERROR("Error allocating QEMU state buffer");
-                return -1;
-            }
-        }
-        else
-        {
-            /* Enlarge the existing buffer; it is kept if this fails. */
-            grown = realloc(buf->qemubuf, qlen);
-            if ( grown == NULL )
-            {
-                ERROR("Error reallocating QEMU state buffer");
-                return -1;
-            }
-            buf->qemubuf = grown;
-        }
-    }
-    buf->qemubufsize = qlen;
-
-    if ( RDEXACT(fd, buf->qemubuf, buf->qemubufsize) )
-    {
-        PERROR("Error reading QEMU state");
-        return -1;
-    }
-
-    return 0;
-}
-
-/*
- * Write the buffered QEMU data (buf->qemubuf, buf->qemubufsize bytes) to the
- * file XC_DEVICE_MODEL_RESTORE_FILE".<dom>".
- *
- * Returns 0 on success and -1 on failure with errno describing the failing
- * step (path too long, fopen, fwrite or fclose).
- */
-static int dump_qemu(xc_interface *xch, uint32_t dom, struct tailbuf_hvm *buf)
-{
-    int saved_errno, len;
-    char path[256];
-    FILE *fp;
-
-    len = snprintf(path, sizeof(path), XC_DEVICE_MODEL_RESTORE_FILE".%u", dom);
-    if ( (len < 0) || ((size_t)len >= sizeof(path)) ) {
-        errno = ENAMETOOLONG;
-        return -1;
-    }
-
-    fp = fopen(path, "wb");
-    if ( !fp )
-        return -1;
-
-    DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
-    if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
-        /* fclose() may change errno; report the one from fwrite(). */
-        saved_errno = errno;
-        fclose(fp);
-        errno = saved_errno;
-        return -1;
-    }
-
-    /*
-     * Buffered data is only flushed here, so a write error can surface as
-     * an fclose() failure and must not be ignored.
-     */
-    if ( fclose(fp) != 0 )
-        return -1;
-
-    return 0;
-}
-
-/*
- * Buffer the tail of an HVM stream: the magic pfns, the length-prefixed HVM
- * params record, and the QEMU record that follows a 21-byte signature.
- *
- * buf->hvmbuf is allocated or enlarged when the record is longer than
- * buf->hvmbufsize.  max_vcpu_id, vcpumap, ext_vcpucontext and
- * vcpuextstate_size are not used by this function.
- *
- * Returns 0 on success and -1 on failure.
- */
-static int buffer_tail_hvm(xc_interface *xch, struct restore_ctx *ctx,
-                           struct tailbuf_hvm *buf, int fd,
-                           unsigned int max_vcpu_id, uint64_t *vcpumap,
-                           int ext_vcpucontext,
-                           uint32_t vcpuextstate_size)
-{
-    uint8_t *tmp;
-    unsigned char qemusig[21];
-
-    if ( RDEXACT(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
-        PERROR("Error reading magic PFNs");
-        return -1;
-    }
-
-    if ( RDEXACT(fd, &buf->reclen, sizeof(buf->reclen)) ) {
-        PERROR("Error reading HVM params size");
-        return -1;
-    }
-
-    if ( buf->reclen > buf->hvmbufsize ) {
-        if ( buf->hvmbuf) {
-            tmp = realloc(buf->hvmbuf, buf->reclen);
-            if ( tmp ) {
-                buf->hvmbuf = tmp;
-                buf->hvmbufsize = buf->reclen;
-            } else {
-                ERROR("Error reallocating HVM param buffer");
-                return -1;
-            }
-        } else {
-            buf->hvmbuf = malloc(buf->reclen);
-            if ( !buf->hvmbuf ) {
-                ERROR("Error allocating HVM param buffer");
-                return -1;
-            }
-            buf->hvmbufsize = buf->reclen;
-        }
-    }
-
-    if ( RDEXACT(fd, buf->hvmbuf, buf->reclen) ) {
-        PERROR("Error reading HVM params");
-        return -1;
-    }
-
-    if ( RDEXACT(fd, qemusig, sizeof(qemusig)) ) {
-        PERROR("Error reading QEMU signature");
-        return -1;
-    }
-
-    /* The legacy live-migration QEMU record has no length information.
-     * Short of reimplementing the QEMU parser, we're forced to just read
-     * until EOF.
-     *
-     * Gets around this by sending a different signatures for the new
-     * live-migration QEMU record and Remus which includes a length
-     * prefix
-     */
-    if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
-        return compat_buffer_qemu(xch, ctx, fd, buf);
-    else if ( !memcmp(qemusig, "DeviceModelRecord0002", sizeof(qemusig)) ||
-              !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
-        return buffer_qemu(xch, ctx, fd, buf);
-
-    /*
-     * qemusig has no room for a terminator, so print it with a bounded
-     * precision instead of overwriting (and thereby hiding) its last byte.
-     */
-    ERROR("Invalid QEMU signature: %.*s", (int)sizeof(qemusig),
-          (const char *)qemusig);
-    return -1;
-}
-
-/*
- * Buffer the tail of a PV stream: the pfn table, the vcpu contexts of every
- * vcpu whose bit is set in vcpumap (ids 0..max_vcpu_id), and the shared
- * info page.
- *
- * buf->pfntab and buf->vcpubuf are resized to what the current record needs
- * on every call, so a later call with larger counts cannot write past a
- * buffer sized for an earlier one.
- *
- * Returns 0 on success and -1 on failure.
- */
-static int buffer_tail_pv(xc_interface *xch, struct restore_ctx *ctx,
-                          struct tailbuf_pv *buf, int fd,
-                          unsigned int max_vcpu_id, uint64_t *vcpumap,
-                          int ext_vcpucontext,
-                          uint32_t vcpuextstate_size)
-{
-    unsigned int i;
-    size_t pfnlen, vcpulen;
-    void *tmp;
-    struct domain_info_context *dinfo = &ctx->dinfo;
-
-    /* PFN tab */
-    if ( RDEXACT(fd, &buf->pfncount, sizeof(buf->pfncount)) )
-    {
-        PERROR("Error when reading pfn count");
-        return -1;
-    }
-    if ( buf->pfncount > (1U << 28) ) /* up to 1TB of address space */
-    {
-        /* Not a failed system call, so errno carries no information here. */
-        ERROR("Error when reading pfn count");
-        return -1;
-    }
-
-    pfnlen = sizeof(unsigned long) * buf->pfncount;
-    /* Never request 0 bytes: realloc(ptr, 0) may free ptr and return NULL. */
-    tmp = realloc(buf->pfntab, pfnlen ? pfnlen : 1);
-    if ( !tmp ) {
-        ERROR("Error allocating PFN tail buffer");
-        goto free_pfntab;
-    }
-    buf->pfntab = tmp;
-
-    if ( RDEXACT(fd, buf->pfntab, pfnlen) ) {
-        PERROR("Error when reading pfntab");
-        goto free_pfntab;
-    }
-
-    /* VCPU contexts */
-    buf->vcpucount = 0;
-    for (i = 0; i <= max_vcpu_id; i++) {
-        if ( (!(vcpumap[i/64] & (1ULL << (i%64)))) )
-            continue;
-        buf->vcpucount++;
-    }
-
-    vcpulen = ((dinfo->guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
-               : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
-    if ( ext_vcpucontext )
-        vcpulen += 128 * buf->vcpucount;
-    vcpulen += vcpuextstate_size * buf->vcpucount;
-
-    tmp = realloc(buf->vcpubuf, vcpulen ? vcpulen : 1);
-    if ( !tmp ) {
-        ERROR("Error allocating VCPU ctxt tail buffer");
-        goto free_pfntab;
-    }
-    buf->vcpubuf = tmp;
-
-    if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) {
-        PERROR("Error when reading ctxt");
-        goto free_vcpus;
-    }
-
-    /* load shared_info_page */
-    if ( RDEXACT(fd, buf->shared_info_page, PAGE_SIZE) ) {
-        PERROR("Error when reading shared info page");
-        goto free_vcpus;
-    }
-
-    return 0;
-
-  free_vcpus:
-    free(buf->vcpubuf);
-    buf->vcpubuf = NULL;
-  free_pfntab:
-    free(buf->pfntab);
-    buf->pfntab = NULL;
-
-    return -1;
-}
-
-/*
- * Buffer the tail of the stream, dispatching on buf->ishvm to the HVM or
- * the PV reader.  All other arguments are passed through unchanged and the
- * reader's result is returned.
- */
-static int buffer_tail(xc_interface *xch, struct restore_ctx *ctx,
-                       tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
-                       uint64_t *vcpumap, int ext_vcpucontext,
-                       uint32_t vcpuextstate_size)
-{
-    if ( !buf->ishvm )
-        return buffer_tail_pv(xch, ctx, &buf->u.pv, fd, max_vcpu_id, vcpumap,
-                              ext_vcpucontext, vcpuextstate_size);
-
-    return buffer_tail_hvm(xch, ctx, &buf->u.hvm, fd, max_vcpu_id, vcpumap,
-                           ext_vcpucontext, vcpuextstate_size);
-}
-
-/* Release the HVM tail buffers and reset both pointers to NULL. */
-static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
-{
-    /* free(NULL) is a no-op, so no guard is needed. */
-    free(buf->hvmbuf);
-    buf->hvmbuf = NULL;
-
-    free(buf->qemubuf);
-    buf->qemubuf = NULL;
-}
-
-/* Release the PV tail buffers and reset both pointers to NULL. */
-static void tailbuf_free_pv(struct tailbuf_pv *buf)
-{
-    /* free(NULL) is a no-op, so no guard is needed. */
-    free(buf->vcpubuf);
-    buf->vcpubuf = NULL;
-
-    free(buf->pfntab);
-    buf->pfntab = NULL;
-}
-
-/* Release the buffers of whichever union member buf->ishvm selects. */
-static void tailbuf_free(tailbuf_t *buf)
-{
-    if ( !buf->ishvm )
-    {
-        tailbuf_free_pv(&buf->u.pv);
-        return;
-    }
-
-    tailbuf_free_hvm(&buf->u.hvm);
-}
-
-struct toolstack_data_t { /* Record read from the stream for XC_SAVE_ID_TOOLSTACK. */
- uint8_t *data; /* heap buffer holding the record; released by pagebuf_free() */
- uint32_t len; /* record length in bytes, as read from the stream */
-};
-
-typedef struct { /* Accumulates the page batches and control records read by pagebuf_get_one(). */
- void* pages;
- /* pages is of length nr_physpages, pfn_types is of length nr_pages */
- unsigned int nr_physpages, nr_pages;
-
- /* checkpoint compression state */
- int compressing; /* when set, pagebuf_get_one() reads only the pfn types of a batch, not page contents */
- unsigned long compbuf_pos, compbuf_size; /* compbuf_size: bytes of XC_SAVE_ID_COMPRESSED_DATA payload appended to pages */
-
- /* Types of the pfns in the current region */
- unsigned long* pfn_types;
-
- int verify; /* set to 1 by XC_SAVE_ID_ENABLE_VERIFY_MODE */
-
- int new_ctxt_format; /* set to 1 by XC_SAVE_ID_VCPU_INFO */
- int max_vcpu_id; /* read from XC_SAVE_ID_VCPU_INFO; rejected if >= XC_SR_MAX_VCPUS */
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64]; /* read from XC_SAVE_ID_VCPU_INFO */
- uint64_t identpt; /* this and the uint64_t fields below hold the 64-bit payload of the XC_SAVE_ID_HVM_* record read into them */
- uint64_t paging_ring_pfn;
- uint64_t monitor_ring_pfn;
- uint64_t sharing_ring_pfn;
- uint64_t vm86_tss;
- uint64_t console_pfn;
- uint64_t acpi_ioport_location;
- uint64_t viridian;
- uint64_t vm_generationid_addr;
- uint64_t ioreq_server_pfn;
- uint64_t nr_ioreq_server_pages;
-
- struct toolstack_data_t tdata; /* filled by XC_SAVE_ID_TOOLSTACK */
-} pagebuf_t;
-
-/* Reset every byte of *buf to zero.  Always returns 0. */
-static int pagebuf_init(pagebuf_t* buf)
-{
-    (void)memset(buf, 0, sizeof(pagebuf_t));
-
-    return 0;
-}
-
-/*
- * Release the toolstack record, the page data and the pfn type array held
- * by *buf, and reset the three pointers to NULL.
- */
-static void pagebuf_free(pagebuf_t* buf)
-{
-    /* free(NULL) is a no-op, so no guards are needed. */
-    free(buf->tdata.data);
-    buf->tdata.data = NULL;
-
-    free(buf->pages);
-    buf->pages = NULL;
-
-    free(buf->pfn_types);
-    buf->pfn_types = NULL;
-}
-
-/*
- * Read the 64-bit payload of a record whose value is preceded by 4 bytes of
- * padding.  The padding is read into *val and then overwritten by the
- * payload.  Returns 0 on success and non-zero if either read fails.
- */
-static int pagebuf_read_padded_u64(xc_interface *xch, struct restore_ctx *ctx,
-                                   int fd, uint64_t *val)
-{
-    return RDEXACT(fd, val, sizeof(uint32_t)) ||
-           RDEXACT(fd, val, sizeof(uint64_t));
-}
-
-/*
- * Read a batch of 'count' pfn types and, unless buf->compressing is set,
- * the contents of the pages in that batch that carry data.
- *
- * Returns 'count' when the batch has been buffered, 0 when only the pfn
- * types were read because the pages are sent compressed (the caller should
- * go on to the next record), and -1 on error.
- */
-static int pagebuf_get_batch(xc_interface *xch, struct restore_ctx *ctx,
-                             pagebuf_t* buf, int fd, int count)
-{
-    int countpages, oldcount, i;
-    void* ptmp;
-
-    oldcount = buf->nr_pages;
-    buf->nr_pages += count;
-    if (!buf->pfn_types) {
-        if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
-            ERROR("Could not allocate PFN type buffer");
-            return -1;
-        }
-    } else {
-        if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
-            ERROR("Could not reallocate PFN type buffer");
-            return -1;
-        }
-        buf->pfn_types = ptmp;
-    }
-    if ( RDEXACT(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
-        PERROR("Error when reading region pfn types");
-        return -1;
-    }
-
-    /* Pages of these types are listed in the batch but carry no data. */
-    countpages = count;
-    for (i = oldcount; i < buf->nr_pages; ++i)
-    {
-        unsigned long pagetype;
-
-        pagetype = buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-        if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ||
-             pagetype == XEN_DOMCTL_PFINFO_BROKEN ||
-             pagetype == XEN_DOMCTL_PFINFO_XALLOC )
-            --countpages;
-    }
-
-    if (!countpages)
-        return count;
-
-    /* If Remus Checkpoint Compression is turned on, we will only be
-     * receiving the pfn lists now. The compressed pages will come in later,
-     * following a <XC_SAVE_ID_COMPRESSED_DATA, compressedChunkSize> tuple.
-     */
-    if (buf->compressing)
-        return 0;
-
-    oldcount = buf->nr_physpages;
-    buf->nr_physpages += countpages;
-    if (!buf->pages) {
-        if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
-            ERROR("Could not allocate page buffer");
-            return -1;
-        }
-    } else {
-        if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
-            ERROR("Could not reallocate page buffer");
-            return -1;
-        }
-        buf->pages = ptmp;
-    }
-    if ( RDEXACT(fd, (char *)buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
-        PERROR("Error when reading pages");
-        return -1;
-    }
-
-    return count;
-}
-
-/*
- * Read records from 'fd' until a page batch, a compressed data chunk or the
- * end marker has been read.  Control records (negative XC_SAVE_ID_* values)
- * are applied to *buf, *ctx or the domain as they are met, and reading then
- * continues with the next record.  This is done in a loop, so the stack use
- * does not depend on how many control records the stream contains.
- *
- * Returns 0 for the end marker (a record value of 0), a positive value
- * after a batch or compressed chunk has been buffered, and -1 on error.
- */
-static int pagebuf_get_one(xc_interface *xch, struct restore_ctx *ctx,
-                           pagebuf_t* buf, int fd, uint32_t dom)
-{
-    int count, rc;
-    void* ptmp;
-    unsigned long compbuf_size;
-
-    for ( ; ; )
-    {
-        if ( RDEXACT(fd, &count, sizeof(count)) )
-        {
-            PERROR("Error when reading batch size");
-            return -1;
-        }
-
-        /* Inside the switch, "continue" moves on to the next record. */
-        switch ( count )
-        {
-        case 0:
-            return 0;
-
-        case XC_SAVE_ID_ENABLE_VERIFY_MODE:
-            DPRINTF("Entering page verify mode\n");
-            buf->verify = 1;
-            continue;
-
-        case XC_SAVE_ID_VCPU_INFO:
-            buf->new_ctxt_format = 1;
-            /* max_vcpu_id is signed: a negative value must be rejected
-             * before it is used to size the vcpumap read. */
-            if ( RDEXACT(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
-                 buf->max_vcpu_id < 0 ||
-                 buf->max_vcpu_id >= XC_SR_MAX_VCPUS ||
-                 RDEXACT(fd, buf->vcpumap, vcpumap_sz(buf->max_vcpu_id)) ) {
-                PERROR("Error when reading max_vcpu_id");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_IDENT_PT:
-            /* Skip padding 4 bytes then read the EPT identity PT location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->identpt) )
-            {
-                PERROR("error read the address of the EPT identity map");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_PAGING_RING_PFN:
-            /* Skip padding 4 bytes then read the paging ring location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->paging_ring_pfn) )
-            {
-                PERROR("error read the paging ring pfn");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_MONITOR_RING_PFN:
-            /* Skip padding 4 bytes then read the mem access ring location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->monitor_ring_pfn) )
-            {
-                PERROR("error read the access ring pfn");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_SHARING_RING_PFN:
-            /* Skip padding 4 bytes then read the sharing ring location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->sharing_ring_pfn) )
-            {
-                PERROR("error read the sharing ring pfn");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_VM86_TSS:
-            /* Skip padding 4 bytes then read the vm86 TSS location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->vm86_tss) )
-            {
-                PERROR("error read the address of the vm86 TSS");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_TMEM:
-            DPRINTF("xc_domain_restore start tmem\n");
-            if ( xc_tmem_restore(xch, dom, fd) ) {
-                PERROR("error reading/restoring tmem");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_TMEM_EXTRA:
-            if ( xc_tmem_restore_extra(xch, dom, fd) ) {
-                PERROR("error reading/restoring tmem extra");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_TSC_INFO:
-        {
-            uint32_t tsc_mode, khz, incarn;
-            uint64_t nsec;
-            if ( RDEXACT(fd, &tsc_mode, sizeof(uint32_t)) ||
-                 RDEXACT(fd, &nsec, sizeof(uint64_t)) ||
-                 RDEXACT(fd, &khz, sizeof(uint32_t)) ||
-                 RDEXACT(fd, &incarn, sizeof(uint32_t)) ||
-                 xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
-                PERROR("error reading/restoring tsc info");
-                return -1;
-            }
-            continue;
-        }
-
-        case XC_SAVE_ID_HVM_CONSOLE_PFN :
-            /* Skip padding 4 bytes then read the console pfn location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->console_pfn) )
-            {
-                PERROR("error read the address of the console pfn");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_LAST_CHECKPOINT:
-            ctx->last_checkpoint = 1;
-            continue;
-
-        case XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION:
-            /* Skip padding 4 bytes then read the acpi ioport location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->acpi_ioport_location) )
-            {
-                PERROR("error read the acpi ioport location");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_VIRIDIAN:
-            /* Skip padding 4 bytes then read the viridian value. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->viridian) )
-            {
-                PERROR("error reading the viridian enlightenments");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_TOOLSTACK:
-            if ( RDEXACT(fd, &buf->tdata.len, sizeof(buf->tdata.len)) )
-            {
-                PERROR("error read toolstack id size");
-                return -1;
-            }
-            /*
-             * Go through a temporary so the old buffer stays owned by *buf
-             * if realloc() fails, and never request 0 bytes, for which
-             * realloc() may free the buffer and return NULL.
-             */
-            ptmp = realloc(buf->tdata.data,
-                           buf->tdata.len ? buf->tdata.len : 1);
-            if ( ptmp == NULL )
-            {
-                PERROR("error memory allocation");
-                return -1;
-            }
-            buf->tdata.data = ptmp;
-            if ( RDEXACT(fd, buf->tdata.data, buf->tdata.len) )
-            {
-                PERROR("error read toolstack id");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_ENABLE_COMPRESSION:
-            /* We cannot set compression flag directly in pagebuf structure,
-             * since this pagebuf still has uncompressed pages that are yet to
-             * be applied. We enable the compression field in pagebuf structure
-             * after receiving the first tailbuf.
-             */
-            ctx->compressing = 1;
-            continue;
-
-        case XC_SAVE_ID_COMPRESSED_DATA:
-            /* read the length of compressed chunk coming in */
-            if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) )
-            {
-                PERROR("Error when reading compbuf_size");
-                return -1;
-            }
-            if (!compbuf_size) return 1;
-
-            buf->compbuf_size += compbuf_size;
-            if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) {
-                ERROR("Could not (re)allocate compression buffer");
-                return -1;
-            }
-            buf->pages = ptmp;
-
-            /* Append the chunk after the data buffered so far. */
-            if ( RDEXACT(fd, (char *)buf->pages + (buf->compbuf_size - compbuf_size),
-                         compbuf_size) ) {
-                PERROR("Error when reading compression buffer");
-                return -1;
-            }
-            /* NOTE(review): converted from unsigned long to int on return,
-             * as in the original; confirm callers only test the sign. */
-            return compbuf_size;
-
-        case XC_SAVE_ID_HVM_GENERATION_ID_ADDR:
-            /* Skip padding 4 bytes then read the generation id buffer location. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->vm_generationid_addr) )
-            {
-                PERROR("error read the generation id buffer location");
-                return -1;
-            }
-            DPRINTF("read generation id buffer address");
-            continue;
-
-        case XC_SAVE_ID_HVM_IOREQ_SERVER_PFN:
-            /* Skip padding 4 bytes then read the ioreq server gmfn base. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->ioreq_server_pfn) )
-            {
-                PERROR("error read the ioreq server gmfn base");
-                return -1;
-            }
-            continue;
-
-        case XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES:
-            /* Skip padding 4 bytes then read the ioreq server gmfn count. */
-            if ( pagebuf_read_padded_u64(xch, ctx, fd, &buf->nr_ioreq_server_pages) )
-            {
-                PERROR("error read the ioreq server gmfn count");
-                return -1;
-            }
-            continue;
-
-        default:
-            if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
-                ERROR("Max batch size exceeded (%d). Giving up.", count);
-                errno = EMSGSIZE;
-                return -1;
-            }
-
-            /* A page batch: 0 means only pfn types were read, keep going. */
-            rc = pagebuf_get_batch(xch, ctx, buf, fd, count);
-            if ( rc != 0 )
-                return rc;
-            continue;
-        }
-    }
-}
-
-/*
- * Reset the page and compression counters of *buf, then call
- * pagebuf_get_one() repeatedly for as long as it returns a positive value.
- * The buffer contents are released if the last call reported an error.
- * Returns the result of the last pagebuf_get_one() call (0 or negative).
- */
-static int pagebuf_get(xc_interface *xch, struct restore_ctx *ctx,
-                       pagebuf_t* buf, int fd, uint32_t dom)
-{
-    int rc;
-
-    buf->nr_pages = 0;
-    buf->nr_physpages = 0;
-    buf->compbuf_size = 0;
-    buf->compbuf_pos = 0;
-
-    while ( (rc = pagebuf_get_one(xch, ctx, buf, fd, dom)) > 0 )
-        continue;
-
-    if ( rc < 0 )
-        pagebuf_free(buf);
-
-    return rc;
-}
-
-static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx,
- xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
- struct xc_mmu* mmu,
- pagebuf_t* pagebuf, int curbatch, int *invalid_pages)
-{
- int i, j, curpage, nr_mfns;
- int k, scount;
- unsigned long superpage_start=INVALID_P2M_ENTRY;
- /* used by debug verify code */
- unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
- /* Our mapping of the current region (batch) */
- char *region_base;
- /* A temporary mapping, and a copy, of one frame of guest memory. */
- unsigned long *page = NULL;
- int nraces = 0;
- struct domain_info_context *dinfo = &ctx->dinfo;
- int* pfn_err = NULL;
- int rc = -1;
- int local_invalid_pages = 0;
- /* We have handled curbatch pages before this batch, and there are
- * *invalid_pages pages that are not in pagebuf->pages. So the first
- * page for this page is (curbatch - *invalid_pages) page.
- */
- int first_page = curbatch - *invalid_pages;
-
- unsigned long mfn, pfn, pagetype;
-
- j = pagebuf->nr_pages - curbatch;
- if (j > MAX_BATCH_SIZE)
- j = MAX_BATCH_SIZE;
-
- /* First pass for this batch: work out how much memory to alloc, and detect superpages */
- nr_mfns = scount = 0;
- for ( i = 0; i < j; i++ )
- {
- unsigned long pfn, pagetype;
- pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- /* For allocation purposes, treat XEN_DOMCTL_PFINFO_XALLOC as a normal page */
- if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
- (ctx->p2m[pfn] == INVALID_P2M_ENTRY) )
- {
- /* Have a live PFN which hasn't had an MFN allocated */
-
- /* Logic if we're in the middle of detecting a candidate superpage */
- if ( superpage_start != INVALID_P2M_ENTRY )
- {
- /* Is this the next expected continuation? */
- if ( pfn == superpage_start + scount )
- {
- if ( !ctx->superpages )
- {
- ERROR("Unexpexted codepath with no superpages");
- return -1;
- }
-
- scount++;
-
- /* If we've found a whole superpage, allocate it and update p2m */
- if ( scount == SUPERPAGE_NR_PFNS )
- {
- unsigned long supermfn;
-
-
- supermfn=superpage_start;
- if ( xc_domain_populate_physmap_exact(xch, dom, 1,
- SUPERPAGE_PFN_SHIFT, 0, &supermfn) != 0 )
- {
- DPRINTF("No 2M page available for pfn 0x%lx, fall back to 4K page.\n",
- superpage_start);
- /* If we're falling back from a failed allocation, subtract one
- * from count, since the last page == pfn, which will behandled
- * anyway. */
- scount--;
- goto fallback;
- }
-
- DPRINTF("Mapping superpage (%d) pfn %lx, mfn %lx\n", scount, superpage_start, supermfn);
- for (k=0; k<scount; k++)
- {
- /* We just allocated a new mfn above; update p2m */
- ctx->p2m[superpage_start+k] = supermfn+k;
- ctx->nr_pfns++;
- /* region_map[] will be set below */
- }
- superpage_start=INVALID_P2M_ENTRY;
- scount=0;
- }
- continue;
- }
-
- fallback:
- DPRINTF("Falling back %d pages pfn %lx\n", scount, superpage_start);
- for (k=0; k<scount; k++)
- {
- ctx->p2m_batch[nr_mfns++] = superpage_start+k;
- ctx->p2m[superpage_start+k]--;
- }
- superpage_start = INVALID_P2M_ENTRY;
- scount=0;
- }
-
- /* Are we ready to start a new superpage candidate? */
- if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) )
- {
- superpage_start=pfn;
- scount++;
- }
- else
- {
- /* Add the current pfn to pfn_batch */
- ctx->p2m_batch[nr_mfns++] = pfn;
- ctx->p2m[pfn]--;
- }
- }
- }
-
- /* Clean up any partial superpage candidates */
- if ( superpage_start != INVALID_P2M_ENTRY )
- {
- DPRINTF("Falling back %d pages pfn %lx\n", scount, superpage_start);
- for (k=0; k<scount; k++)
- {
- ctx->p2m_batch[nr_mfns++] = superpage_start+k;
- ctx->p2m[superpage_start+k]--;
- }
- superpage_start = INVALID_P2M_ENTRY;
- }
-
- /* Now allocate a bunch of mfns for this batch */
- if ( nr_mfns )
- {
- DPRINTF("Mapping order 0, %d; first pfn %lx\n", nr_mfns, ctx->p2m_batch[0]);
-
- if (!ctx->hvm && ctx->superpages)
- rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
- else
- rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
- ctx->p2m_batch);
-
- if (rc)
- {
- ERROR("Failed to allocate memory for batch.!\n");
- errno = ENOMEM;
- return -1;
- }
- }
-
- /* Second pass for this batch: update p2m[] and region_mfn[] */
- nr_mfns = 0;
- for ( i = 0; i < j; i++ )
- {
- unsigned long pfn, pagetype;
- pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype != XEN_DOMCTL_PFINFO_XTAB
- && ctx->p2m[pfn] == (INVALID_P2M_ENTRY-1) )
- {
- /* We just allocated a new mfn above; update p2m */
- ctx->p2m[pfn] = ctx->p2m_batch[nr_mfns++];
- ctx->nr_pfns++;
- }
-
- /* setup region_mfn[] for batch map, if necessary.
- * For HVM guests, this interface takes PFNs, not MFNs */
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB
- || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
- region_mfn[i] = ~0UL; /* map will fail but we don't care */
- else
- region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn];
- }
-
- /* Map relevant mfns */
- pfn_err = calloc(j, sizeof(*pfn_err));
- if ( pfn_err == NULL )
- {
- PERROR("allocation for pfn_err failed");
- return -1;
- }
- region_base = xc_map_foreign_bulk(
- xch, dom, PROT_WRITE, region_mfn, pfn_err, j);
-
- if ( region_base == NULL )
- {
- PERROR("map batch failed");
- free(pfn_err);
- return -1;
- }
-
- for ( i = 0, curpage = -1; i < j; i++ )
- {
- pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB
- || pagetype == XEN_DOMCTL_PFINFO_XALLOC)
- {
- local_invalid_pages++;
- /* a bogus/unmapped/allocate-only page: skip it */
- continue;
- }
-
- if ( pagetype == XEN_DOMCTL_PFINFO_BROKEN )
- {
- if ( xc_set_broken_page_p2m(xch, dom, pfn) )
- {
- ERROR("Set p2m for broken page failed, "
- "dom=%d, pfn=%lx\n", dom, pfn);
- goto err_mapped;
- }
-
- local_invalid_pages++;
- continue;
- }
-
- if (pfn_err[i])
- {
- ERROR("unexpected PFN mapping failure pfn %lx map_mfn %lx p2m_mfn %lx",
- pfn, region_mfn[i], ctx->p2m[pfn]);
- goto err_mapped;
- }
-
- ++curpage;
-
- if ( pfn > dinfo->p2m_size )
- {
- ERROR("pfn out of range");
- goto err_mapped;
- }
-
- pfn_type[pfn] = pagetype;
-
- mfn = ctx->p2m[pfn];
-
- /* In verify mode, we use a copy; otherwise we work in place */
- page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
- /* Remus - page decompression */
- if (pagebuf->compressing)
- {
- if (xc_compression_uncompress_page(xch, pagebuf->pages,
- pagebuf->compbuf_size,
- &pagebuf->compbuf_pos,
- (char *)page))
- {
- ERROR("Failed to uncompress page (pfn=%lx)\n", pfn);
- goto err_mapped;
- }
- }
- else
- memcpy(page, pagebuf->pages + (first_page + curpage) * PAGE_SIZE,
- PAGE_SIZE);
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** A page table page - need to 'uncanonicalize' it, i.e.
- ** replace all the references to pfns with the corresponding
- ** mfns for the new domain.
- **
- ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
- ** so we may need to update the p2m after the main loop.
- ** Hence we defer canonicalization of L1s until then.
- */
- if ((ctx->pt_levels != 3) ||
- pae_extended_cr3 ||
- (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
- if (!uncanonicalize_pagetable(xch, dom, ctx, page)) {
- /*
- ** Failing to uncanonicalize a page table can be ok
- ** under live migration since the pages type may have
- ** changed by now (and we'll get an update later).
- */
- DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
- pagetype >> 28, pfn, mfn);
- nraces++;
- continue;
- }
- }
- }
- else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
- {
- ERROR("Bogus page type %lx page table is out of range: "
- "i=%d p2m_size=%lu", pagetype, i, dinfo->p2m_size);
- goto err_mapped;
- }
-
- if ( pagebuf->verify )
- {
- int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
- if ( res )
- {
- int v;
-
- DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
- "actualcs=%08lx\n", pfn, pfn_type[pfn],
- csum_page(region_base + i * PAGE_SIZE),
- csum_page(buf));
-
- for ( v = 0; v < 4; v++ )
- {
- unsigned long *p = (unsigned long *)
- (region_base + i*PAGE_SIZE);
- if ( buf[v] != p[v] )
- DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
- }
- }
- }
-
- if ( !ctx->hvm &&
- xc_add_mmu_update(xch, mmu,
- (((unsigned long long)mfn) << PAGE_SHIFT)
- | MMU_MACHPHYS_UPDATE, pfn) )
- {
- PERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
- goto err_mapped;
- }
- } /* end of 'batch' for loop */
-
- rc = nraces;
- *invalid_pages += local_invalid_pages;
-
- err_mapped:
- munmap(region_base, j*PAGE_SIZE);
- free(pfn_err);
-
- return rc;
-}
-
-int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
- unsigned int store_evtchn, unsigned long *store_mfn,
- domid_t store_domid, unsigned int console_evtchn,
- unsigned long *console_mfn, domid_t console_domid,
- unsigned int hvm, unsigned int pae, int superpages,
- int checkpointed_stream,
- struct restore_callbacks *callbacks)
-{
- DECLARE_DOMCTL;
- xc_dominfo_t info;
- int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
- uint32_t vcpuextstate_size = 0;
- unsigned long mfn, pfn;
- int nraces = 0;
-
- /* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
- unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
- shared_info_any_t *old_shared_info =
- (shared_info_any_t *)shared_info_page;
- shared_info_any_t *new_shared_info;
-
- /* A copy of the CPU context of the guest. */
- DECLARE_HYPERCALL_BUFFER(vcpu_guest_context_any_t, ctxt);
-
- /* A copy of the CPU eXtended States of the guest. */
- DECLARE_HYPERCALL_BUFFER(void, buffer);
-
- /* A table containing the type of each PFN (/not/ MFN!). */
- unsigned long *pfn_type = NULL;
-
- /* A table of MFNs to map in the current region */
- xen_pfn_t *region_mfn = NULL;
-
- /* A copy of the pfn-to-mfn table frame list. */
- xen_pfn_t *p2m_frame_list = NULL;
-
- /* A temporary mapping of the guest's start_info page. */
- start_info_any_t *start_info;
-
- /* Our mapping of the current region (batch) */
- char *region_base;
-
- struct xc_mmu *mmu = NULL;
-
- struct mmuext_op pin[MAX_PIN_BATCH];
- unsigned int nr_pins;
-
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64] = { 1ULL };
- unsigned int max_vcpu_id = 0;
- int new_ctxt_format = 0;
-
- pagebuf_t pagebuf;
- tailbuf_t tailbuf, tmptail;
- struct toolstack_data_t tdata, tdatatmp;
- void* vcpup;
- uint64_t console_pfn = 0;
-
- int orig_io_fd_flags;
-
- struct restore_ctx _ctx;
- struct restore_ctx *ctx = &_ctx;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- if ( getenv("XG_MIGRATION_V2") )
- {
- return xc_domain_restore2(
- xch, io_fd, dom, store_evtchn, store_mfn,
- store_domid, console_evtchn, console_mfn, console_domid,
- hvm, pae, superpages, checkpointed_stream, callbacks);
- }
-
- DPRINTF("%s: starting restore of new domid %u", __func__, dom);
-
- pagebuf_init(&pagebuf);
- memset(&tailbuf, 0, sizeof(tailbuf));
- tailbuf.ishvm = hvm;
- memset(&tdata, 0, sizeof(tdata));
-
- memset(ctx, 0, sizeof(*ctx));
-
- ctx->superpages = superpages;
- ctx->hvm = hvm;
- ctx->last_checkpoint = !checkpointed_stream;
-
- ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));
-
- if ( ctxt == NULL )
- {
- PERROR("Unable to allocate VCPU ctxt buffer");
- return 1;
- }
-
-
- if ( (orig_io_fd_flags = fcntl(io_fd, F_GETFL, 0)) < 0 ) {
- PERROR("unable to read IO FD flags");
- goto out;
- }
-
- if ( RDEXACT(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
- {
- PERROR("read: p2m_size");
- goto out;
- }
- DPRINTF("%s: p2m_size = %lx\n", __func__, dinfo->p2m_size);
-
- if ( !get_platform_info(xch, dom,
- &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
- {
- ERROR("Unable to get platform info.");
- return 1;
- }
-
- /* The *current* word size of the guest isn't very interesting; for now
- * assume the guest will be the same as we are. We'll fix that later
- * if we discover otherwise. */
- dinfo->guest_width = sizeof(unsigned long);
- ctx->pt_levels = (dinfo->guest_width == 8) ? 4 : 3;
-
- if ( !hvm )
- {
- /* Load the p2m frame list, plus potential extended info chunk */
- p2m_frame_list = load_p2m_frame_list(xch, ctx,
- io_fd, &pae_extended_cr3, &ext_vcpucontext,
- &vcpuextstate_size);
-
- if ( !p2m_frame_list )
- goto out;
-
- /* Now that we know the word size, tell Xen about it */
- memset(&domctl, 0, sizeof(domctl));
- domctl.domain = dom;
- domctl.cmd = XEN_DOMCTL_set_address_size;
- domctl.u.address_size.size = dinfo->guest_width * 8;
- frc = do_domctl(xch, &domctl);
- if ( frc != 0 )
- {
- PERROR("Unable to set guest address size.");
- goto out;
- }
- }
-
- /* We want zeroed memory so use calloc rather than malloc. */
- ctx->p2m = calloc(dinfo->p2m_size, sizeof(xen_pfn_t));
- pfn_type = calloc(dinfo->p2m_size, sizeof(unsigned long));
-
- region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- if (!ctx->hvm && ctx->superpages)
- {
- ctx->p2m_saved_batch =
- malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- if ( ctx->p2m_saved_batch == NULL )
- {
- ERROR("saved batch memory alloc failed");
- errno = ENOMEM;
- goto out;
- }
- }
-
- if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
- (region_mfn == NULL) || (ctx->p2m_batch == NULL) )
- {
- ERROR("memory alloc failed");
- errno = ENOMEM;
- goto out;
- }
-
- memset(region_mfn, 0,
- ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
- memset(ctx->p2m_batch, 0,
- ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
-
- /* Get the domain's shared-info frame. */
- if ( xc_domain_getinfo(xch, (domid_t)dom, 1, &info) != 1 )
- {
- PERROR("Could not get information on new domain");
- goto out;
- }
- shared_info_frame = info.shared_info_frame;
-
- /* Mark all PFNs as invalid; we allocate on demand */
- for ( pfn = 0; pfn < dinfo->p2m_size; pfn++ )
- ctx->p2m[pfn] = INVALID_P2M_ENTRY;
-
- mmu = xc_alloc_mmu_updates(xch, dom);
- if ( mmu == NULL )
- {
- PERROR("Could not initialise for MMU updates");
- goto out;
- }
-
- xc_set_progress_prefix(xch, "Reloading memory pages");
- xc_report_progress_step(xch, 0, dinfo->p2m_size);
-
- /*
- * Now simply read each saved frame into its new machine frame.
- * We uncanonicalise page tables as we go.
- */
-
- n = m = 0;
- loadpages:
- for ( ; ; )
- {
- int j, curbatch, invalid_pages;
-
- xc_report_progress_step(xch, n, dinfo->p2m_size);
-
- if ( !ctx->completed ) {
- pagebuf.nr_physpages = pagebuf.nr_pages = 0;
- pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
- if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) {
- PERROR("Error when reading batch");
- goto out;
- }
- }
- j = pagebuf.nr_pages;
-
- DBGPRINTF("batch %d\n",j);
-
- if ( j == 0 ) {
- /* catch vcpu updates */
- if (pagebuf.new_ctxt_format) {
- max_vcpu_id = pagebuf.max_vcpu_id;
- memcpy(vcpumap, pagebuf.vcpumap, vcpumap_sz(max_vcpu_id));
- }
- /* should this be deferred? does it change? */
- if ( pagebuf.identpt )
- xc_hvm_param_set(xch, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
- if ( pagebuf.paging_ring_pfn )
- xc_hvm_param_set(xch, dom, HVM_PARAM_PAGING_RING_PFN, pagebuf.paging_ring_pfn);
- if ( pagebuf.monitor_ring_pfn )
- xc_hvm_param_set(xch, dom, HVM_PARAM_MONITOR_RING_PFN, pagebuf.monitor_ring_pfn);
- if ( pagebuf.sharing_ring_pfn )
- xc_hvm_param_set(xch, dom, HVM_PARAM_SHARING_RING_PFN, pagebuf.sharing_ring_pfn);
- if ( pagebuf.vm86_tss )
- xc_hvm_param_set(xch, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
- if ( pagebuf.console_pfn )
- console_pfn = pagebuf.console_pfn;
- if ( pagebuf.vm_generationid_addr )
- xc_hvm_param_set(xch, dom, HVM_PARAM_VM_GENERATION_ID_ADDR,
- pagebuf.vm_generationid_addr);
-
- break; /* our work here is done */
- }
-
- /* break pagebuf into batches */
- curbatch = 0;
- invalid_pages = 0;
- while ( curbatch < j ) {
- int brc;
-
- brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
- pae_extended_cr3, mmu, &pagebuf, curbatch,
- &invalid_pages);
- if ( brc < 0 )
- goto out;
-
- nraces += brc;
-
- curbatch += MAX_BATCH_SIZE;
- }
-
- pagebuf.nr_physpages = pagebuf.nr_pages = 0;
- pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
-
- n += j; /* crude stats */
-
- /*
- * Discard cache for portion of file read so far up to last
- * page boundary every 16MB or so.
- */
- m += j;
- if ( m > MAX_PAGECACHE_USAGE )
- {
- discard_file_cache(xch, io_fd, 0 /* no flush */);
- m = 0;
- }
- }
-
- /*
- * Ensure we flush all machphys updates before potential PAE-specific
- * reallocations below.
- */
- if ( !hvm && xc_flush_mmu_updates(xch, mmu) )
- {
- PERROR("Error doing flush_mmu_updates()");
- goto out;
- }
-
- // DPRINTF("Received all pages (%d races)\n", nraces);
-
- if ( !ctx->completed ) {
-
- if ( buffer_tail(xch, ctx, &tailbuf, io_fd, max_vcpu_id, vcpumap,
- ext_vcpucontext, vcpuextstate_size) < 0 ) {
- ERROR ("error buffering image tail");
- goto out;
- }
-
- ctx->completed = 1;
-
- /*
- * If more checkpoints are expected then shift into
- * nonblocking mode for the remainder.
- */
- if ( !ctx->last_checkpoint )
- fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK);
-
- /*
- * If sender had sent enable compression flag, switch to compressed
- * checkpoints mode once the first checkpoint is received.
- */
- if (ctx->compressing)
- pagebuf.compressing = 1;
- }
-
- if (pagebuf.viridian != 0)
- xc_hvm_param_set(xch, dom, HVM_PARAM_VIRIDIAN, pagebuf.viridian);
-
- /*
- * If we are migrating in from a host that does not support
- * secondary emulators then nr_ioreq_server_pages will be 0, since
- * there will be no XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES chunk in
- * the image.
- * If we are migrating from a host that does support secondary
- * emulators then the XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES chunk
- * will exist and is guaranteed to have a non-zero value. The
- * existence of that chunk also implies the existence of the
- * XC_SAVE_ID_HVM_IOREQ_SERVER_PFN chunk, which is also guaranteed
- * to have a non-zero value.
- */
- if (!pagebuf.nr_ioreq_server_pages ^ !pagebuf.ioreq_server_pfn) {
- ERROR("Inconsistent IOREQ Server settings (nr=%"PRIx64", pfn=%"PRIx64")",
- pagebuf.nr_ioreq_server_pages, pagebuf.ioreq_server_pfn);
- } else {
- if (pagebuf.nr_ioreq_server_pages != 0 &&
- pagebuf.ioreq_server_pfn != 0) {
- xc_hvm_param_set(xch, dom, HVM_PARAM_NR_IOREQ_SERVER_PAGES,
- pagebuf.nr_ioreq_server_pages);
- xc_hvm_param_set(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN,
- pagebuf.ioreq_server_pfn);
- }
- }
-
- if (pagebuf.acpi_ioport_location == 1) {
- DBGPRINTF("Use new firmware ioport from the checkpoint\n");
- xc_hvm_param_set(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, 1);
- } else if (pagebuf.acpi_ioport_location == 0) {
- DBGPRINTF("Use old firmware ioport from the checkpoint\n");
- } else {
- ERROR("Error, unknow acpi ioport location (%"PRId64")", pagebuf.acpi_ioport_location);
- }
-
- tdatatmp = tdata;
- tdata = pagebuf.tdata;
- pagebuf.tdata = tdatatmp;
-
- if ( ctx->last_checkpoint )
- {
- // DPRINTF("Last checkpoint, finishing\n");
- goto finish;
- }
-
- // DPRINTF("Buffered checkpoint\n");
-
- if ( pagebuf_get(xch, ctx, &pagebuf, io_fd, dom) ) {
- PERROR("error when buffering batch, finishing");
- /*
- * Remus: discard the current incomplete checkpoint and restore
- * backup from the last complete checkpoint.
- */
- goto finish;
- }
- memset(&tmptail, 0, sizeof(tmptail));
- tmptail.ishvm = hvm;
- if ( buffer_tail(xch, ctx, &tmptail, io_fd, max_vcpu_id, vcpumap,
- ext_vcpucontext, vcpuextstate_size) < 0 ) {
- ERROR ("error buffering image tail, finishing");
- /*
- * Remus: discard the current incomplete checkpoint and restore
- * backup from the last complete checkpoint.
- */
- goto finish;
- }
- tailbuf_free(&tailbuf);
- memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
-
- goto loadpages;
-
- /* With Remus: restore from last complete checkpoint */
- finish:
- if ( hvm )
- goto finish_hvm;
-
- if ( (ctx->pt_levels == 3) && !pae_extended_cr3 )
- {
- /*
- ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
- ** is a little awkward and involves (a) finding all such PGDs and
- ** replacing them with 'lowmem' versions; (b) upating the p2m[]
- ** with the new info; and (c) canonicalizing all the L1s using the
- ** (potentially updated) p2m[].
- **
- ** This is relatively slow (and currently involves two passes through
- ** the pfn_type[] array), but at least seems to be correct. May wish
- ** to consider more complex approaches to optimize this later.
- */
-
- int j, k;
-
- /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
- XEN_DOMCTL_PFINFO_L3TAB) &&
- (ctx->p2m[i] > 0xfffffUL) )
- {
- unsigned long new_mfn;
- uint64_t l3ptes[4];
- uint64_t *l3tab;
-
- l3tab = (uint64_t *)
- xc_map_foreign_range(xch, dom, PAGE_SIZE,
- PROT_READ, ctx->p2m[i]);
- if ( l3tab == NULL )
- {
- PERROR("xc_map_foreign_range failed (for l3tab)");
- goto out;
- }
-
- for ( j = 0; j < 4; j++ )
- l3ptes[j] = l3tab[j];
-
- munmap(l3tab, PAGE_SIZE);
-
- new_mfn = xc_make_page_below_4G(xch, dom, ctx->p2m[i]);
- if ( !new_mfn )
- {
- PERROR("Couldn't get a page below 4GB :-(");
- goto out;
- }
-
- ctx->p2m[i] = new_mfn;
- if ( xc_add_mmu_update(xch, mmu,
- (((unsigned long long)new_mfn)
- << PAGE_SHIFT) |
- MMU_MACHPHYS_UPDATE, i) )
- {
- PERROR("Couldn't m2p on PAE root pgdir");
- goto out;
- }
-
- l3tab = (uint64_t *)
- xc_map_foreign_range(xch, dom, PAGE_SIZE,
- PROT_READ | PROT_WRITE, ctx->p2m[i]);
- if ( l3tab == NULL )
- {
- PERROR("xc_map_foreign_range failed (for l3tab, 2nd)");
- goto out;
- }
-
- for ( j = 0; j < 4; j++ )
- l3tab[j] = l3ptes[j];
-
- munmap(l3tab, PAGE_SIZE);
- }
- }
-
- /* Second pass: find all L1TABs and uncanonicalize them */
- j = 0;
-
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
- XEN_DOMCTL_PFINFO_L1TAB) )
- {
- region_mfn[j] = ctx->p2m[i];
- j++;
- }
-
- if ( (i == (dinfo->p2m_size-1)) || (j == MAX_BATCH_SIZE) )
- {
- region_base = xc_map_foreign_pages(
- xch, dom, PROT_READ | PROT_WRITE, region_mfn, j);
- if ( region_base == NULL )
- {
- PERROR("map batch failed");
- goto out;
- }
-
- for ( k = 0; k < j; k++ )
- {
- if ( !uncanonicalize_pagetable(
- xch, dom, ctx,
- region_base + k*PAGE_SIZE) )
- {
- ERROR("failed uncanonicalize pt!");
- goto out;
- }
- }
-
- munmap(region_base, j*PAGE_SIZE);
- j = 0;
- }
- }
-
- if ( xc_flush_mmu_updates(xch, mmu) )
- {
- PERROR("Error doing xc_flush_mmu_updates()");
- goto out;
- }
- }
-
- /*
- * Pin page tables. Do this after writing to them as otherwise Xen
- * will barf when doing the type-checking.
- */
- nr_pins = 0;
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
- continue;
-
- switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
- {
- case XEN_DOMCTL_PFINFO_L1TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L2TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L3TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
- break;
-
- case XEN_DOMCTL_PFINFO_L4TAB:
- pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
- break;
-
- default:
- continue;
- }
-
- pin[nr_pins].arg1.mfn = ctx->p2m[i];
- nr_pins++;
-
- /* Batch full? Then flush. */
- if ( nr_pins == MAX_PIN_BATCH )
- {
- if ( xc_mmuext_op(xch, pin, nr_pins, dom) < 0 )
- {
- PERROR("Failed to pin batch of %d page tables", nr_pins);
- goto out;
- }
- nr_pins = 0;
- }
- }
-
- /* Flush final partial batch. */
- if ( (nr_pins != 0) && (xc_mmuext_op(xch, pin, nr_pins, dom) < 0) )
- {
- PERROR("Failed to pin batch of %d page tables", nr_pins);
- goto out;
- }
-
- DPRINTF("Memory reloaded (%ld pages)\n", ctx->nr_pfns);
-
- /* Get the list of PFNs that are not in the psuedo-phys map */
- {
- int nr_frees = 0;
-
- for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
- {
- unsigned long pfn = tailbuf.u.pv.pfntab[i];
-
- if ( ctx->p2m[pfn] != INVALID_P2M_ENTRY )
- {
- /* pfn is not in physmap now, but was at some point during
- the save/migration process - need to free it */
- tailbuf.u.pv.pfntab[nr_frees++] = ctx->p2m[pfn];
- ctx->p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
- }
- }
-
- if ( nr_frees > 0 )
- {
- if ( (frc = xc_domain_decrease_reservation(xch, dom, nr_frees, 0, tailbuf.u.pv.pfntab)) != nr_frees )
- {
- PERROR("Could not decrease reservation : %d", frc);
- goto out;
- }
- else
- DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
- }
- }
-
- vcpup = tailbuf.u.pv.vcpubuf;
- for ( i = 0; i <= max_vcpu_id; i++ )
- {
- if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
- continue;
-
- memcpy(ctxt, vcpup, ((dinfo->guest_width == 8) ? sizeof(ctxt->x64)
- : sizeof(ctxt->x32)));
- vcpup += (dinfo->guest_width == 8) ? sizeof(ctxt->x64) : sizeof(ctxt->x32);
-
- DPRINTF("read VCPU %d\n", i);
-
- if ( !new_ctxt_format )
- SET_FIELD(ctxt, flags,
- GET_FIELD(ctxt, flags, dinfo->guest_width) | VGCF_online,
- dinfo->guest_width);
-
- if ( i == 0 )
- {
- /*
- * Uncanonicalise the start info frame number and poke in
- * updated values into the start info itself.
- *
- * The start info MFN is the 3rd argument to the
- * HYPERVISOR_sched_op hypercall when op==SCHEDOP_shutdown
- * and reason==SHUTDOWN_suspend, it is canonicalised in
- * xc_domain_save and therefore the PFN is found in the
- * edx register.
- */
- pfn = GET_FIELD(ctxt, user_regs.edx, dinfo->guest_width);
- if ( (pfn >= dinfo->p2m_size) ||
- (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("Suspend record frame number is bad");
- goto out;
- }
- mfn = ctx->p2m[pfn];
- SET_FIELD(ctxt, user_regs.edx, mfn, dinfo->guest_width);
- start_info = xc_map_foreign_range(
- xch, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
- if ( start_info == NULL )
- {
- PERROR("xc_map_foreign_range failed (for start_info)");
- goto out;
- }
-
- SET_FIELD(start_info, nr_pages, dinfo->p2m_size, dinfo->guest_width);
- SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT, dinfo->guest_width);
- SET_FIELD(start_info, flags, 0, dinfo->guest_width);
- if ( GET_FIELD(start_info, store_mfn, dinfo->guest_width) > dinfo->p2m_size )
- {
- ERROR("Suspend record xenstore frame number is bad");
- munmap(start_info, PAGE_SIZE);
- goto out;
- }
- *store_mfn = ctx->p2m[GET_FIELD(start_info, store_mfn, dinfo->guest_width)];
- SET_FIELD(start_info, store_mfn, *store_mfn, dinfo->guest_width);
- SET_FIELD(start_info, store_evtchn, store_evtchn, dinfo->guest_width);
- if ( GET_FIELD(start_info, console.domU.mfn, dinfo->guest_width) > dinfo->p2m_size )
- {
- ERROR("Suspend record console frame number is bad");
- munmap(start_info, PAGE_SIZE);
- goto out;
- }
- *console_mfn = ctx->p2m[GET_FIELD(start_info, console.domU.mfn, dinfo->guest_width)];
- SET_FIELD(start_info, console.domU.mfn, *console_mfn, dinfo->guest_width);
- SET_FIELD(start_info, console.domU.evtchn, console_evtchn, dinfo->guest_width);
- munmap(start_info, PAGE_SIZE);
- }
- /* Uncanonicalise each GDT frame number. */
- if ( GET_FIELD(ctxt, gdt_ents, dinfo->guest_width) > 8192 )
- {
- ERROR("GDT entry count out of range");
- goto out;
- }
-
- for ( j = 0; (512*j) < GET_FIELD(ctxt, gdt_ents, dinfo->guest_width); j++ )
- {
- pfn = GET_FIELD(ctxt, gdt_frames[j], dinfo->guest_width);
- if ( (pfn >= dinfo->p2m_size) ||
- (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("GDT frame number %i (0x%lx) is bad",
- j, (unsigned long)pfn);
- goto out;
- }
- SET_FIELD(ctxt, gdt_frames[j], ctx->p2m[pfn], dinfo->guest_width);
- }
- /* Uncanonicalise the page table base pointer. */
- pfn = UNFOLD_CR3(GET_FIELD(ctxt, ctrlreg[3], dinfo->guest_width));
-
- if ( pfn >= dinfo->p2m_size )
- {
- ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
- pfn, dinfo->p2m_size, pfn_type[pfn]);
- goto out;
- }
-
- if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
- ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
- {
- ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
- pfn, dinfo->p2m_size, pfn_type[pfn],
- (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- goto out;
- }
- SET_FIELD(ctxt, ctrlreg[3], FOLD_CR3(ctx->p2m[pfn]), dinfo->guest_width);
-
- /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
- if ( (ctx->pt_levels == 4) && (ctxt->x64.ctrlreg[1] & 1) )
- {
- pfn = UNFOLD_CR3(ctxt->x64.ctrlreg[1] & ~1);
- if ( pfn >= dinfo->p2m_size )
- {
- ERROR("User PT base is bad: pfn=%lu p2m_size=%lu",
- pfn, dinfo->p2m_size);
- goto out;
- }
- if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
- ((unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
- {
- ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
- pfn, dinfo->p2m_size, pfn_type[pfn],
- (unsigned long)ctx->pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
- goto out;
- }
- ctxt->x64.ctrlreg[1] = FOLD_CR3(ctx->p2m[pfn]);
- }
- frc = xc_vcpu_setcontext(xch, dom, i, ctxt);
- if ( frc != 0 )
- {
- PERROR("Couldn't build vcpu%d", i);
- goto out;
- }
-
- if ( !ext_vcpucontext )
- goto vcpu_ext_state_restore;
- memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
- vcpup += 128;
- domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
- domctl.domain = dom;
- frc = xc_domctl(xch, &domctl);
- if ( frc != 0 )
- {
- PERROR("Couldn't set extended vcpu%d info", i);
- goto out;
- }
-
- vcpu_ext_state_restore:
- if ( !vcpuextstate_size )
- continue;
-
- memcpy(&domctl.u.vcpuextstate.xfeature_mask, vcpup,
- sizeof(domctl.u.vcpuextstate.xfeature_mask));
- vcpup += sizeof(domctl.u.vcpuextstate.xfeature_mask);
- memcpy(&domctl.u.vcpuextstate.size, vcpup,
- sizeof(domctl.u.vcpuextstate.size));
- vcpup += sizeof(domctl.u.vcpuextstate.size);
-
- buffer = xc_hypercall_buffer_alloc(xch, buffer,
- domctl.u.vcpuextstate.size);
- if ( !buffer )
- {
- PERROR("Could not allocate buffer to restore eXtended States");
- goto out;
- }
- memcpy(buffer, vcpup, domctl.u.vcpuextstate.size);
- vcpup += domctl.u.vcpuextstate.size;
-
- domctl.cmd = XEN_DOMCTL_setvcpuextstate;
- domctl.domain = dom;
- domctl.u.vcpuextstate.vcpu = i;
- set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
- frc = xc_domctl(xch, &domctl);
- if ( frc != 0 )
- {
- PERROR("Couldn't set eXtended States for vcpu%d", i);
- goto out;
- }
- xc_hypercall_buffer_free(xch, buffer);
- }
-
- memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
-
- DPRINTF("Completed checkpoint load\n");
-
- /* Restore contents of shared-info page. No checking needed. */
- new_shared_info = xc_map_foreign_range(
- xch, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
- if ( new_shared_info == NULL )
- {
- PERROR("xc_map_foreign_range failed (for new_shared_info)");
- goto out;
- }
-
- /* restore saved vcpu_info and arch specific info */
- MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info, dinfo->guest_width);
- MEMCPY_FIELD(new_shared_info, old_shared_info, arch, dinfo->guest_width);
-
- /* clear any pending events and the selector */
- MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0, dinfo->guest_width);
- for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
- SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0, dinfo->guest_width);
-
- /* mask event channels */
- MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff, dinfo->guest_width);
-
- /* leave wallclock time. set by hypervisor */
- munmap(new_shared_info, PAGE_SIZE);
-
- /* Uncanonicalise the pfn-to-mfn table frame-number list. */
- for ( i = 0; i < P2M_FL_ENTRIES; i++ )
- {
- pfn = p2m_frame_list[i];
- if ( (pfn >= dinfo->p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
- {
- ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn);
- goto out;
- }
- p2m_frame_list[i] = ctx->p2m[pfn];
- }
-
- /* Copy the P2M we've constructed to the 'live' P2M */
- if ( !(ctx->live_p2m = xc_map_foreign_pages(xch, dom, PROT_WRITE,
- p2m_frame_list, P2M_FL_ENTRIES)) )
- {
- PERROR("Couldn't map p2m table");
- goto out;
- }
-
- /* If the domain we're restoring has a different word size to ours,
- * we need to adjust the live_p2m assignment appropriately */
- if ( dinfo->guest_width > sizeof (xen_pfn_t) )
- for ( i = dinfo->p2m_size - 1; i >= 0; i-- )
- ((int64_t *)ctx->live_p2m)[i] = (long)ctx->p2m[i];
- else if ( dinfo->guest_width < sizeof (xen_pfn_t) )
- for ( i = 0; i < dinfo->p2m_size; i++ )
- ((uint32_t *)ctx->live_p2m)[i] = ctx->p2m[i];
- else
- memcpy(ctx->live_p2m, ctx->p2m, dinfo->p2m_size * sizeof(xen_pfn_t));
- munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
-
- frc = xc_dom_gnttab_seed(xch, dom, *console_mfn, *store_mfn,
- console_domid, store_domid);
- if (frc != 0)
- {
- ERROR("error seeding grant table");
- goto out;
- }
-
- DPRINTF("Domain ready to be built.\n");
- rc = 0;
- goto out;
-
- finish_hvm:
- if ( tdata.data != NULL )
- {
- if ( callbacks != NULL && callbacks->toolstack_restore != NULL )
- {
- frc = callbacks->toolstack_restore(dom, tdata.data, tdata.len,
- callbacks->data);
- free(tdata.data);
- if ( frc < 0 )
- {
- PERROR("error calling toolstack_restore");
- goto out;
- }
- } else {
- rc = -1;
- ERROR("toolstack data available but no callback provided\n");
- free(tdata.data);
- goto out;
- }
- }
-
- /* Dump the QEMU state to a state file for QEMU to load */
- if ( dump_qemu(xch, dom, &tailbuf.u.hvm) ) {
- PERROR("Error dumping QEMU state to file");
- goto out;
- }
-
- /* These comms pages need to be zeroed at the start of day */
- if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[0]) ||
- xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[1]) ||
- xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[2]) )
- {
- PERROR("error zeroing magic pages");
- goto out;
- }
-
- if ( (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_PAE_ENABLED, pae))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_STORE_EVTCHN,
- store_evtchn))
- || (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_CONSOLE_EVTCHN,
- console_evtchn)) )
- {
- PERROR("error setting HVM params: %i", frc);
- goto out;
- }
- *store_mfn = tailbuf.u.hvm.magicpfns[2];
-
- if ( console_pfn ) {
- if ( xc_clear_domain_page(xch, dom, console_pfn) ) {
- PERROR("error zeroing console page");
- goto out;
- }
- if ( (frc = xc_hvm_param_set(xch, dom,
- HVM_PARAM_CONSOLE_PFN, console_pfn)) ) {
- PERROR("error setting HVM param: %i", frc);
- goto out;
- }
- *console_mfn = console_pfn;
- }
-
- frc = xc_domain_hvm_setcontext(xch, dom, tailbuf.u.hvm.hvmbuf,
- tailbuf.u.hvm.reclen);
- if ( frc )
- {
- PERROR("error setting the HVM context");
- goto out;
- }
-
- frc = xc_dom_gnttab_hvm_seed(xch, dom, *console_mfn, *store_mfn,
- console_domid, store_domid);
- if (frc != 0)
- {
- ERROR("error seeding grant table");
- goto out;
- }
-
- /* HVM success! */
- rc = 0;
-
- out:
- if ( (rc != 0) && (dom != 0) )
- xc_domain_destroy(xch, dom);
- xc_hypercall_buffer_free(xch, ctxt);
- free(mmu);
- free(ctx->p2m);
- free(pfn_type);
- free(region_mfn);
- free(ctx->p2m_batch);
- pagebuf_free(&pagebuf);
- tailbuf_free(&tailbuf);
-
- /* discard cache for save file */
- discard_file_cache(xch, io_fd, 1 /*flush*/);
-
- fcntl(io_fd, F_SETFL, orig_io_fd_flags);
-
- DPRINTF("Restore exit of domid %u with rc=%d\n", dom, rc);
-
- return rc;
-}
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+++ /dev/null
-/******************************************************************************
- * xc_linux_save.c
- *
- * Save the state of a running Linux session.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Copyright (c) 2003, K A Fraser.
- */
-
-#include <inttypes.h>
-#include <time.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <assert.h>
-
-#include "xc_private.h"
-#include "xc_bitops.h"
-#include "xc_dom.h"
-#include "xg_private.h"
-#include "xg_save_restore.h"
-
-#include <xen/hvm/params.h>
-
-/*
-** Default values for important tuning parameters. Can override by passing
-** non-zero replacement values to xc_domain_save().
-**
-** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
-**
-*/
-#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
-#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
-
-struct save_ctx {
- unsigned long hvirt_start; /* virtual starting address of the hypervisor */
- unsigned int pt_levels; /* #levels of page tables used by the current guest */
- unsigned long max_mfn; /* max mfn of the whole machine */
- xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
- xen_pfn_t *live_m2p; /* Live mapping of system MFN to PFN table. */
- unsigned long m2p_mfn0;
- struct domain_info_context dinfo;
-};
-
-/* buffer for output */
-struct outbuf {
- void* buf;
- size_t size;
- size_t pos;
- int write_count;
-};
-
-#define OUTBUF_SIZE (16384 * 1024)
-
-/* grep fodder: machine_to_phys */
-
-#define mfn_to_pfn(_mfn) (ctx->live_m2p[(_mfn)])
-
-#define pfn_to_mfn(_pfn) \
- ((xen_pfn_t) ((dinfo->guest_width==8) \
- ? (((uint64_t *)ctx->live_p2m)[(_pfn)]) \
- : ((((uint32_t *)ctx->live_p2m)[(_pfn)]) == 0xffffffffU \
- ? (-1UL) : (((uint32_t *)ctx->live_p2m)[(_pfn)]))))
-
-/*
- * Returns TRUE if the given machine frame number has a unique mapping
- * in the guest's pseudophysical map.
- */
-#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
- (((_mfn) < (ctx->max_mfn)) && \
- ((mfn_to_pfn(_mfn) < (dinfo->p2m_size)) && \
- (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
-
-#define SUPERPAGE_PFN_SHIFT 9
-#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
-
-#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
-
-static uint64_t tv_to_us(struct timeval *new)
-{
- return (new->tv_sec * 1000000) + new->tv_usec;
-}
-
-static uint64_t llgettimeofday(void)
-{
- struct timeval now;
- gettimeofday(&now, NULL);
- return tv_to_us(&now);
-}
-
-static uint64_t tv_delta(struct timeval *new, struct timeval *old)
-{
- return (((new->tv_sec - old->tv_sec)*1000000) +
- (new->tv_usec - old->tv_usec));
-}
-
-static int noncached_write(xc_interface *xch,
- struct outbuf* ob,
- int fd, void *buffer, int len)
-{
- int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
-
- ob->write_count += len;
- if ( ob->write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
- {
- /* Time to discard cache - dont care if this fails */
- int saved_errno = errno;
- discard_file_cache(xch, fd, 0 /* no flush */);
- errno = saved_errno;
- ob->write_count = 0;
- }
-
- return rc;
-}
-
-static int outbuf_init(xc_interface *xch, struct outbuf* ob, size_t size)
-{
- memset(ob, 0, sizeof(*ob));
-
- if ( !(ob->buf = malloc(size)) ) {
- DPRINTF("error allocating output buffer of size %zu\n", size);
- return -1;
- }
-
- ob->size = size;
-
- return 0;
-}
-
-static int outbuf_free(struct outbuf *ob)
-{
- free(ob->buf);
- ob->buf = NULL;
- return 0;
-}
-
-static inline int outbuf_write(xc_interface *xch,
- struct outbuf* ob, void* buf, size_t len)
-{
- if ( len > ob->size - ob->pos ) {
- errno = ERANGE;
- DBGPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos);
- return -1;
- }
-
- memcpy(ob->buf + ob->pos, buf, len);
- ob->pos += len;
-
- return 0;
-}
-
-/* prep for nonblocking I/O */
-static int outbuf_flush(xc_interface *xch, struct outbuf* ob, int fd)
-{
- int rc;
- int cur = 0;
-
- if ( !ob->pos )
- return 0;
-
- rc = write(fd, ob->buf, ob->pos);
- while (rc < 0 || cur + rc < ob->pos) {
- if (rc < 0 && errno != EAGAIN && errno != EINTR) {
- DPRINTF("error flushing output: %d\n", errno);
- return -1;
- }
- if (rc > 0)
- cur += rc;
-
- rc = write(fd, ob->buf + cur, ob->pos - cur);
- }
-
- ob->pos = 0;
-
- return 0;
-}
-
-/* if there's no room in the buffer, flush it and try again. */
-static inline int outbuf_hardwrite(xc_interface *xch,
- struct outbuf* ob, int fd, void* buf,
- size_t len)
-{
- if ( !len )
- return 0;
-
- if ( !outbuf_write(xch, ob, buf, len) )
- return 0;
-
- if ( outbuf_flush(xch, ob, fd) < 0 )
- return -1;
-
- return outbuf_write(xch, ob, buf, len);
-}
-
-/* start buffering output once we've reached checkpoint mode. */
-static inline int write_buffer(xc_interface *xch,
- int dobuf, struct outbuf* ob, int fd, void* buf,
- size_t len)
-{
- if ( dobuf )
- return outbuf_hardwrite(xch, ob, fd, buf, len);
- else
- return write_exact(fd, buf, len);
-}
-
-/* like write_buffer for noncached, which returns number of bytes written */
-static inline int write_uncached(xc_interface *xch,
- int dobuf, struct outbuf* ob, int fd,
- void* buf, size_t len)
-{
- if ( dobuf )
- return outbuf_hardwrite(xch, ob, fd, buf, len) ? -1 : len;
- else
- return noncached_write(xch, ob, fd, buf, len);
-}
-
-static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx,
- int dobuf, struct outbuf* ob, int fd)
-{
- int rc = 0;
- int header = sizeof(int) + sizeof(unsigned long);
- int marker = XC_SAVE_ID_COMPRESSED_DATA;
- unsigned long compbuf_len = 0;
-
- for(;;)
- {
- /* check for available space (atleast 8k) */
- if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size)
- {
- if (outbuf_flush(xch, ob, fd) < 0)
- {
- ERROR("Error when flushing outbuf intermediate");
- return -1;
- }
- }
-
- rc = xc_compression_compress_pages(xch, compress_ctx,
- ob->buf + ob->pos + header,
- ob->size - ob->pos - header,
- &compbuf_len);
- if (!rc)
- break;
-
- if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0)
- {
- PERROR("Error when writing marker (errno %d)", errno);
- return -1;
- }
-
- if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0)
- {
- PERROR("Error when writing compbuf_len (errno %d)", errno);
- return -1;
- }
-
- ob->pos += (size_t) compbuf_len;
- if (!dobuf && outbuf_flush(xch, ob, fd) < 0)
- {
- ERROR("Error when writing compressed chunk");
- return -1;
- }
- }
-
- return 0;
-}
-
-struct time_stats {
- struct timeval wall;
- long long d0_cpu, d1_cpu;
-};
-
-static int print_stats(xc_interface *xch, uint32_t domid, int pages_sent,
- struct time_stats *last,
- xc_shadow_op_stats_t *stats, int print)
-{
- struct time_stats now;
-
- gettimeofday(&now.wall, NULL);
-
- now.d0_cpu = xc_domain_get_cpu_usage(xch, 0, /* FIXME */ 0)/1000;
- now.d1_cpu = xc_domain_get_cpu_usage(xch, domid, /* FIXME */ 0)/1000;
-
- if ( (now.d0_cpu == -1) || (now.d1_cpu == -1) )
- DPRINTF("ARRHHH!!\n");
-
- if ( print )
- {
- long long wall_delta;
- long long d0_cpu_delta;
- long long d1_cpu_delta;
-
- wall_delta = tv_delta(&now.wall,&last->wall)/1000;
- if ( wall_delta == 0 )
- wall_delta = 1;
-
- d0_cpu_delta = (now.d0_cpu - last->d0_cpu)/1000;
- d1_cpu_delta = (now.d1_cpu - last->d1_cpu)/1000;
-
- DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
- "dirtied %dMb/s %" PRId32 " pages\n",
- wall_delta,
- (int)((d0_cpu_delta*100)/wall_delta),
- (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
- (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
- stats->dirty_count);
- }
-
- *last = now;
-
- return 0;
-}
-
-
-static int analysis_phase(xc_interface *xch, uint32_t domid, struct save_ctx *ctx,
- xc_hypercall_buffer_t *arr, int runs)
-{
- long long start, now;
- xc_shadow_op_stats_t stats;
- int j;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- start = llgettimeofday();
-
- for ( j = 0; j < runs; j++ )
- {
- int i;
-
- xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
- arr, dinfo->p2m_size, NULL, 0, NULL);
- DPRINTF("#Flush\n");
- for ( i = 0; i < 40; i++ )
- {
- usleep(50000);
- now = llgettimeofday();
- xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
- NULL, 0, NULL, 0, &stats);
- DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
- ((now-start)+500)/1000,
- stats.fault_count, stats.dirty_count);
- }
- }
-
- return -1;
-}
-
-static int suspend_and_state(int (*suspend)(void*), void* data,
- xc_interface *xch, int io_fd, int dom,
- xc_dominfo_t *info)
-{
- if ( !(*suspend)(data) )
- {
- ERROR("Suspend request failed");
- return -1;
- }
-
- if ( (xc_domain_getinfo(xch, dom, 1, info) != 1) ||
- !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
- {
- ERROR("Domain not in suspended state");
- return -1;
- }
-
- return 0;
-}
-
-/*
-** Map the top-level page of MFNs from the guest. The guest might not have
-** finished resuming from a previous restore operation, so we wait a while for
-** it to update the MFN to a reasonable value.
-*/
-static void *map_frame_list_list(xc_interface *xch, uint32_t dom,
- struct save_ctx *ctx,
- shared_info_any_t *shinfo)
-{
- int count = 100;
- void *p;
- struct domain_info_context *dinfo = &ctx->dinfo;
- uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list, dinfo->guest_width);
-
- while ( count-- && (fll == 0) )
- {
- usleep(10000);
- fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list, dinfo->guest_width);
- }
-
- if ( fll == 0 )
- {
- ERROR("Timed out waiting for frame list updated.");
- return NULL;
- }
-
- p = xc_map_foreign_range(xch, dom, PAGE_SIZE, PROT_READ, fll);
- if ( p == NULL )
- PERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
-
- return p;
-}
-
-/*
-** During transfer (or in the state file), all page-table pages must be
-** converted into a 'canonical' form where references to actual mfns
-** are replaced with references to the corresponding pfns.
-**
-** This function performs the appropriate conversion, taking into account
-** which entries do not require canonicalization (in particular, those
-** entries which map the virtual address reserved for the hypervisor).
-*/
-static int canonicalize_pagetable(struct save_ctx *ctx,
- unsigned long type, unsigned long pfn,
- const void *spage, void *dpage)
-{
- struct domain_info_context *dinfo = &ctx->dinfo;
- int i, pte_last, xen_start, xen_end, race = 0;
- uint64_t pte;
-
- /*
- ** We need to determine which entries in this page table hold
- ** reserved hypervisor mappings. This depends on the current
- ** page table type as well as the number of paging levels.
- */
- xen_start = xen_end = pte_last = PAGE_SIZE / 8;
-
- if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
- xen_start = L3_PAGETABLE_ENTRIES_PAE;
-
- /*
- ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
- ** We can spot this by looking for the guest's mappingof the m2p.
- ** Guests must ensure that this check will fail for other L2s.
- */
- if ( (ctx->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
- {
- int hstart;
- uint64_t he;
-
- hstart = (ctx->hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
-
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
- {
- /* hvirt starts with xen stuff... */
- xen_start = hstart;
- }
- else if ( ctx->hvirt_start != 0xf5800000 )
- {
- /* old L2s from before hole was shrunk... */
- hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
- he = ((const uint64_t *) spage)[hstart];
- if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == ctx->m2p_mfn0 )
- xen_start = hstart;
- }
- }
-
- if ( (ctx->pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** XXX SMH: should compute these from hvirt_start (which we have)
- ** and hvirt_end (which we don't)
- */
- xen_start = 256;
- xen_end = 272;
- }
-
- /* Now iterate through the page table, canonicalizing each PTE */
- for (i = 0; i < pte_last; i++ )
- {
- unsigned long pfn, mfn;
-
- pte = ((const uint64_t*)spage)[i];
-
- if ( (i >= xen_start) && (i < xen_end) )
- pte = 0;
-
- if ( pte & _PAGE_PRESENT )
- {
- mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- /* This will happen if the type info is stale which
- is quite feasible under live migration */
- pfn = 0; /* zap it - we'll retransmit this page later */
- /* XXX: We can't spot Xen mappings in compat-mode L2es
- * from 64-bit tools, but the only thing in them is the
- * compat m2p, so we quietly zap them. This doesn't
- * count as a race, so don't report it. */
- if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
- && sizeof (unsigned long) > dinfo->guest_width) )
- race = 1; /* inform the caller; fatal if !live */
- }
- else
- pfn = mfn_to_pfn(mfn);
-
- pte &= ~MADDR_MASK_X86;
- pte |= (uint64_t)pfn << PAGE_SHIFT;
-
- /*
- * PAE guest L3Es can contain these flags when running on
- * a 64bit hypervisor. We zap these here to avoid any
- * surprise at restore time...
- */
- if ( (ctx->pt_levels == 3) &&
- (type == XEN_DOMCTL_PFINFO_L3TAB) &&
- (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
- pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
- }
-
- ((uint64_t*)dpage)[i] = pte;
- }
-
- return race;
-}
-
-xen_pfn_t *xc_map_m2p(xc_interface *xch,
- unsigned long max_mfn,
- int prot,
- unsigned long *mfn0)
-{
- privcmd_mmap_entry_t *entries;
- unsigned long m2p_chunks, m2p_size;
- xen_pfn_t *m2p;
- xen_pfn_t *extent_start;
- int i;
-
- m2p = NULL;
- m2p_size = M2P_SIZE(max_mfn);
- m2p_chunks = M2P_CHUNKS(max_mfn);
-
- extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
- if ( !extent_start )
- {
- ERROR("failed to allocate space for m2p mfns");
- goto err0;
- }
-
- if ( xc_machphys_mfn_list(xch, m2p_chunks, extent_start) )
- {
- PERROR("xc_get_m2p_mfns");
- goto err1;
- }
-
- entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
- if (entries == NULL)
- {
- ERROR("failed to allocate space for mmap entries");
- goto err1;
- }
-
- for ( i = 0; i < m2p_chunks; i++ )
- entries[i].mfn = extent_start[i];
-
- m2p = xc_map_foreign_ranges(xch, DOMID_XEN,
- m2p_size, prot, M2P_CHUNK_SIZE,
- entries, m2p_chunks);
- if (m2p == NULL)
- {
- PERROR("xc_mmap_foreign_ranges failed");
- goto err2;
- }
-
- if (mfn0)
- *mfn0 = entries[0].mfn;
-
-err2:
- free(entries);
-err1:
- free(extent_start);
-
-err0:
- return m2p;
-}
-
-
-static xen_pfn_t *map_and_save_p2m_table(xc_interface *xch,
- int io_fd,
- uint32_t dom,
- struct save_ctx *ctx,
- shared_info_any_t *live_shinfo)
-{
- vcpu_guest_context_any_t ctxt;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- /* Double and single indirect references to the live P2M table */
- void *live_p2m_frame_list_list = NULL;
- void *live_p2m_frame_list = NULL;
-
- /* Copies of the above. */
- xen_pfn_t *p2m_frame_list_list = NULL;
- xen_pfn_t *p2m_frame_list = NULL;
-
- /* The mapping of the live p2m table itself */
- xen_pfn_t *p2m = NULL;
-
- int i, success = 0;
-
- live_p2m_frame_list_list = map_frame_list_list(xch, dom, ctx,
- live_shinfo);
- if ( !live_p2m_frame_list_list )
- goto out;
-
- /* Get a local copy of the live_P2M_frame_list_list */
- if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
- {
- ERROR("Couldn't allocate p2m_frame_list_list array");
- goto out;
- }
- memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
-
- /* Canonicalize guest's unsigned long vs ours */
- if ( dinfo->guest_width > sizeof(unsigned long) )
- for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
- if ( i < PAGE_SIZE/dinfo->guest_width )
- p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
- else
- p2m_frame_list_list[i] = 0;
- else if ( dinfo->guest_width < sizeof(unsigned long) )
- for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
- p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
-
- live_p2m_frame_list =
- xc_map_foreign_pages(xch, dom, PROT_READ,
- p2m_frame_list_list,
- P2M_FLL_ENTRIES);
- if ( !live_p2m_frame_list )
- {
- PERROR("Couldn't map p2m_frame_list");
- goto out;
- }
-
- /* Get a local copy of the live_P2M_frame_list */
- if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
- {
- ERROR("Couldn't allocate p2m_frame_list array");
- goto out;
- }
- memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
- memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
-
- munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
- live_p2m_frame_list = NULL;
-
- /* Canonicalize guest's unsigned long vs ours */
- if ( dinfo->guest_width > sizeof(unsigned long) )
- for ( i = 0; i < P2M_FL_ENTRIES; i++ )
- p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
- else if ( dinfo->guest_width < sizeof(unsigned long) )
- for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
- p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
-
-
- /* Map all the frames of the pfn->mfn table. For migrate to succeed,
- the guest must not change which frames are used for this purpose.
- (its not clear why it would want to change them, and we'll be OK
- from a safety POV anyhow. */
-
- p2m = xc_map_foreign_pages(xch, dom, PROT_READ,
- p2m_frame_list,
- P2M_FL_ENTRIES);
- if ( !p2m )
- {
- PERROR("Couldn't map p2m table");
- goto out;
- }
- ctx->live_p2m = p2m; /* So that translation macros will work */
-
- /* Canonicalise the pfn-to-mfn table frame-number list. */
- for ( i = 0; i < dinfo->p2m_size; i += FPP )
- {
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
- {
- ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
- ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
- i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], ctx->max_mfn);
- if ( p2m_frame_list[i/FPP] < ctx->max_mfn )
- {
- ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
- (uint64_t)p2m_frame_list[i/FPP],
- (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]]);
- ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
- (uint64_t)ctx->live_m2p[p2m_frame_list[i/FPP]],
- (uint64_t)p2m[ctx->live_m2p[p2m_frame_list[i/FPP]]]);
-
- }
- goto out;
- }
- p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
- }
-
- if ( xc_vcpu_getcontext(xch, dom, 0, &ctxt) )
- {
- PERROR("Could not get vcpu context");
- goto out;
- }
-
- /*
- * Write an extended-info structure to inform the restore code that
- * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
- * slow paths in the restore code.
- */
- {
- unsigned long signature = ~0UL;
- uint32_t chunk1_sz = ((dinfo->guest_width==8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32));
- uint32_t chunk2_sz = 0;
- uint32_t chunk3_sz = 4;
- uint32_t xcnt_size = 0;
- uint32_t tot_sz;
- DECLARE_DOMCTL;
-
- domctl.cmd = XEN_DOMCTL_getvcpuextstate;
- domctl.domain = dom;
- domctl.u.vcpuextstate.vcpu = 0;
- domctl.u.vcpuextstate.size = 0;
- domctl.u.vcpuextstate.xfeature_mask = 0;
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No extended context for VCPU%d", i);
- goto out;
- }
- xcnt_size = domctl.u.vcpuextstate.size + 2 * sizeof(uint64_t);
-
- tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
- if ( domctl.u.vcpuextstate.xfeature_mask )
- tot_sz += chunk3_sz + 8;
-
- if ( write_exact(io_fd, &signature, sizeof(signature)) ||
- write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
- write_exact(io_fd, "vcpu", 4) ||
- write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
- write_exact(io_fd, &ctxt, chunk1_sz) ||
- write_exact(io_fd, "extv", 4) ||
- write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) ||
- (domctl.u.vcpuextstate.xfeature_mask) ?
- (write_exact(io_fd, "xcnt", 4) ||
- write_exact(io_fd, &chunk3_sz, sizeof(chunk3_sz)) ||
- write_exact(io_fd, &xcnt_size, 4)) :
- 0 )
- {
- PERROR("write: extended info");
- goto out;
- }
- }
-
- if ( write_exact(io_fd, p2m_frame_list,
- P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
- {
- PERROR("write: p2m_frame_list");
- goto out;
- }
-
- success = 1;
-
- out:
-
- if ( !success && p2m )
- munmap(p2m, P2M_FL_ENTRIES * PAGE_SIZE);
-
- if ( live_p2m_frame_list_list )
- munmap(live_p2m_frame_list_list, PAGE_SIZE);
-
- if ( live_p2m_frame_list )
- munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
-
- free(p2m_frame_list_list);
-
- free(p2m_frame_list);
-
- return success ? p2m : NULL;
-}
-
-/* must be done AFTER suspend_and_state() */
-static int save_tsc_info(xc_interface *xch, uint32_t dom, int io_fd)
-{
- int marker = XC_SAVE_ID_TSC_INFO;
- uint32_t tsc_mode, khz, incarn;
- uint64_t nsec;
-
- if ( xc_domain_get_tsc_info(xch, dom, &tsc_mode,
- &nsec, &khz, &incarn) < 0 ||
- write_exact(io_fd, &marker, sizeof(marker)) ||
- write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
- write_exact(io_fd, &nsec, sizeof(nsec)) ||
- write_exact(io_fd, &khz, sizeof(khz)) ||
- write_exact(io_fd, &incarn, sizeof(incarn)) )
- return -1;
- return 0;
-}
-
-int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags,
- struct save_callbacks* callbacks, int hvm)
-{
- xc_dominfo_t info;
- DECLARE_DOMCTL;
-
- int rc, frc, i, j, last_iter = 0, iter = 0;
- int live = (flags & XCFLAGS_LIVE);
- int debug = (flags & XCFLAGS_DEBUG);
- int superpages = !!hvm;
- int race = 0, skip_this_iter = 0;
- unsigned int sent_this_iter = 0;
- int tmem_saved = 0;
-
- /* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
-
- /* A copy of the CPU context of the guest. */
- vcpu_guest_context_any_t ctxt;
-
- /* A table containing the type of each PFN (/not/ MFN!). */
- xen_pfn_t *pfn_type = NULL;
- unsigned long *pfn_batch = NULL;
- int *pfn_err = NULL;
-
- /* A copy of one frame of guest memory. */
- char page[PAGE_SIZE];
-
- /* Live mapping of shared info structure */
- shared_info_any_t *live_shinfo = NULL;
-
- /* base of the region in which domain memory is mapped */
- unsigned char *region_base = NULL;
-
- /* A copy of the CPU eXtended States of the guest. */
- DECLARE_HYPERCALL_BUFFER(void, buffer);
-
- /* bitmap of pages:
- - that should be sent this iteration (unless later marked as skip);
- - to skip this iteration because already dirty;
- - to fixup by sending at the end if not already resent; */
- DECLARE_HYPERCALL_BUFFER(unsigned long, to_skip);
- DECLARE_HYPERCALL_BUFFER(unsigned long, to_send);
- unsigned long *to_fix = NULL;
-
- struct time_stats time_stats;
- xc_shadow_op_stats_t shadow_stats;
-
- unsigned long needed_to_fix = 0;
- unsigned long total_sent = 0;
-
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64] = { 1ULL };
-
- /* HVM: a buffer for holding HVM context */
- uint32_t hvm_buf_size = 0;
- uint8_t *hvm_buf = NULL;
-
- /* HVM: magic frames for ioreqs and xenstore comms. */
- uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
- unsigned long mfn;
-
- /* Without checkpoint compression, the dirty pages, pfn arrays
- * and tailbuf (vcpu ctx, shared info page, etc.) are written
- * directly to outbuf. All of this is done while the domain is
- * suspended.
- *
- * When checkpoint compression is enabled, the dirty pages are
- * buffered, compressed "after" the domain is resumed and then
- * written to outbuf. Since tailbuf data are collected while a
- * domain is suspended, they cannot be directly written to the
- * outbuf as there is no dirty page data preceeding tailbuf.
- *
- * So,two output buffers are maintained. Tailbuf data goes into
- * ob_tailbuf. The dirty pages are compressed after resuming the
- * domain and written to ob_pagebuf. ob_tailbuf is then appended
- * to ob_pagebuf and finally flushed out.
- */
- struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL;
- struct save_ctx _ctx;
- struct save_ctx *ctx = &_ctx;
- struct domain_info_context *dinfo = &ctx->dinfo;
-
- /* Compression context */
- comp_ctx *compress_ctx= NULL;
- /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only
- * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for
- * first time.
- */
- int compressing = 0;
-
- int completed = 0;
-
- if ( getenv("XG_MIGRATION_V2") )
- {
- return xc_domain_save2(xch, io_fd, dom, max_iters,
- max_factor, flags, callbacks, hvm);
- }
-
- DPRINTF("%s: starting save of domid %u", __func__, dom);
-
- if ( hvm && !callbacks->switch_qemu_logdirty )
- {
- ERROR("No switch_qemu_logdirty callback provided.");
- errno = EINVAL;
- goto exit;
- }
-
- outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE);
-
- memset(ctx, 0, sizeof(*ctx));
-
- /* If no explicit control parameters given, use defaults */
- max_iters = max_iters ? : DEF_MAX_ITERS;
- max_factor = max_factor ? : DEF_MAX_FACTOR;
-
- if ( !get_platform_info(xch, dom,
- &ctx->max_mfn, &ctx->hvirt_start, &ctx->pt_levels, &dinfo->guest_width) )
- {
- ERROR("Unable to get platform info.");
- goto exit;
- }
-
- if ( xc_domain_getinfo(xch, dom, 1, &info) != 1 )
- {
- PERROR("Could not get domain info");
- goto exit;
- }
-
- shared_info_frame = info.shared_info_frame;
-
- /* Map the shared info frame */
- if ( !hvm )
- {
- live_shinfo = xc_map_foreign_range(xch, dom, PAGE_SIZE,
- PROT_READ, shared_info_frame);
- if ( !live_shinfo )
- {
- PERROR("Couldn't map live_shinfo");
- goto out;
- }
- }
-
- /* Get the size of the P2M table */
- if ( xc_domain_nr_gpfns(xch, dom, &dinfo->p2m_size) < 0 )
- {
- ERROR("Could not get maximum GPFN!");
- goto out;
- }
-
- if ( dinfo->p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
- {
- errno = E2BIG;
- ERROR("Cannot save this big a guest");
- goto out;
- }
-
- /* Domain is still running at this point */
- if ( live )
- {
- /* Live suspend. Enable log-dirty mode. */
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL) < 0 )
- {
- /* log-dirty already enabled? There's no test op,
- so attempt to disable then reenable it */
- frc = xc_shadow_control(xch, dom, XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL);
- if ( frc >= 0 )
- {
- frc = xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
- NULL, 0, NULL, 0, NULL);
- }
-
- if ( frc < 0 )
- {
- PERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
- goto out;
- }
- }
-
- /* Enable qemu-dm logging dirty pages to xen */
- if ( hvm && callbacks->switch_qemu_logdirty(dom, 1, callbacks->data) )
- {
- PERROR("Couldn't enable qemu log-dirty mode (errno %d)", errno);
- goto out;
- }
- }
- else
- {
- /* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(callbacks->suspend, callbacks->data, xch,
- io_fd, dom, &info) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
- }
-
- if ( flags & XCFLAGS_CHECKPOINT_COMPRESS )
- {
- if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size)))
- {
- ERROR("Failed to create compression context");
- goto out;
- }
- outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4);
- }
-
- last_iter = !live;
-
- /* Setup to_send / to_fix and to_skip bitmaps */
- to_send = xc_hypercall_buffer_alloc_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
- to_skip = xc_hypercall_buffer_alloc_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
- to_fix = calloc(1, bitmap_size(dinfo->p2m_size));
-
- if ( !to_send || !to_fix || !to_skip )
- {
- errno = ENOMEM;
- ERROR("Couldn't allocate to_send array");
- goto out;
- }
-
- memset(to_send, 0xff, bitmap_size(dinfo->p2m_size));
-
- if ( hvm )
- {
- /* Need another buffer for HVM context */
- hvm_buf_size = xc_domain_hvm_getcontext(xch, dom, 0, 0);
- if ( hvm_buf_size == -1 )
- {
- PERROR("Couldn't get HVM context size from Xen");
- goto out;
- }
- hvm_buf = malloc(hvm_buf_size);
- if ( !hvm_buf )
- {
- errno = ENOMEM;
- ERROR("Couldn't allocate memory");
- goto out;
- }
- }
-
- analysis_phase(xch, dom, ctx, HYPERCALL_BUFFER(to_skip), 0);
-
- pfn_type = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
- pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
- pfn_err = malloc(MAX_BATCH_SIZE * sizeof(*pfn_err));
- if ( (pfn_type == NULL) || (pfn_batch == NULL) || (pfn_err == NULL) )
- {
- ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
- errno = ENOMEM;
- goto out;
- }
- memset(pfn_type, 0,
- ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
-
- /* Setup the mfn_to_pfn table mapping */
- if ( !(ctx->live_m2p = xc_map_m2p(xch, ctx->max_mfn, PROT_READ, &ctx->m2p_mfn0)) )
- {
- PERROR("Failed to map live M2P table");
- goto out;
- }
-
- /* Start writing out the saved-domain record. */
- if ( write_exact(io_fd, &dinfo->p2m_size, sizeof(unsigned long)) )
- {
- PERROR("write: p2m_size");
- goto out;
- }
-
- if ( !hvm )
- {
- int err = 0;
-
- /* Map the P2M table, and write the list of P2M frames */
- ctx->live_p2m = map_and_save_p2m_table(xch, io_fd, dom, ctx, live_shinfo);
- if ( ctx->live_p2m == NULL )
- {
- PERROR("Failed to map/save the p2m frame list");
- goto out;
- }
-
- /*
- * Quick belt and braces sanity check.
- */
-
- for ( i = 0; i < dinfo->p2m_size; i++ )
- {
- mfn = pfn_to_mfn(i);
- if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
- {
- DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
- mfn, mfn_to_pfn(mfn));
- err++;
- }
- }
- DPRINTF("Had %d unexplained entries in p2m table\n", err);
- }
-
- print_stats(xch, dom, 0, &time_stats, &shadow_stats, 0);
-
- tmem_saved = xc_tmem_save(xch, dom, io_fd, live, XC_SAVE_ID_TMEM);
- if ( tmem_saved == -1 )
- {
- PERROR("Error when writing to state file (tmem)");
- goto out;
- }
-
- if ( !live && save_tsc_info(xch, dom, io_fd) < 0 )
- {
- PERROR("Error when writing to state file (tsc)");
- goto out;
- }
-
- copypages:
-#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len))
-#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len))
-#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd))
-
- ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */
- /* Now write out each data page, canonicalising page tables as we go... */
- for ( ; ; )
- {
- unsigned int N, batch, run;
- char reportbuf[80];
-
- snprintf(reportbuf, sizeof(reportbuf),
- "Saving memory: iter %d (last sent %u skipped %u)",
- iter, sent_this_iter, skip_this_iter);
-
- xc_set_progress_prefix(xch, reportbuf);
- xc_report_progress_step(xch, 0, dinfo->p2m_size);
-
- iter++;
- sent_this_iter = 0;
- skip_this_iter = 0;
- N = 0;
-
- while ( N < dinfo->p2m_size )
- {
- xc_report_progress_step(xch, N, dinfo->p2m_size);
-
- if ( !last_iter )
- {
- /* Slightly wasteful to peek the whole array every time,
- but this is fast enough for the moment. */
- frc = xc_shadow_control(
- xch, dom, XEN_DOMCTL_SHADOW_OP_PEEK, HYPERCALL_BUFFER(to_skip),
- dinfo->p2m_size, NULL, 0, NULL);
- if ( frc != dinfo->p2m_size )
- {
- ERROR("Error peeking shadow bitmap");
- goto out;
- }
- }
-
- /* load pfn_type[] with the mfn of all the pages we're doing in
- this batch. */
- for ( batch = 0;
- (batch < MAX_BATCH_SIZE) && (N < dinfo->p2m_size);
- N++ )
- {
- int n = N;
-
- if ( debug )
- {
- DPRINTF("%d pfn= %08lx mfn= %08lx %d",
- iter, (unsigned long)n,
- hvm ? 0 : pfn_to_mfn(n),
- test_bit(n, to_send));
- if ( !hvm && is_mapped(pfn_to_mfn(n)) )
- DPRINTF(" [mfn]= %08lx",
- mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
- DPRINTF("\n");
- }
-
- if ( completed )
- {
- /* for sparse bitmaps, word-by-word may save time */
- if ( !to_send[N >> ORDER_LONG] )
- {
- /* incremented again in for loop! */
- N += BITS_PER_LONG - 1;
- continue;
- }
-
- if ( !test_bit(n, to_send) )
- continue;
-
- pfn_batch[batch] = n;
- if ( hvm )
- pfn_type[batch] = n;
- else
- pfn_type[batch] = pfn_to_mfn(n);
- }
- else
- {
- int dont_skip = (last_iter || (superpages && iter==1));
-
- if ( !dont_skip &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
-
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && dont_skip) ||
- (test_bit(n, to_fix) && last_iter)) )
- continue;
-
- /* First time through, try to keep superpages in the same batch */
- if ( superpages && iter == 1
- && SUPER_PAGE_START(n)
- && batch + SUPERPAGE_NR_PFNS > MAX_BATCH_SIZE )
- break;
-
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in first and last iterations)
- ** 3. add in pages that still need fixup (net bufs)
- */
-
- pfn_batch[batch] = n;
-
- /* Hypercall interfaces operate in PFNs for HVM guests
- * and MFNs for PV guests */
- if ( hvm )
- pfn_type[batch] = n;
- else
- pfn_type[batch] = pfn_to_mfn(n);
-
- if ( !is_mapped(pfn_type[batch]) )
- {
- /*
- ** not currently in psuedo-physical map -- set bit
- ** in to_fix since we must send this page in last_iter
- ** unless its sent sooner anyhow, or it never enters
- ** pseudo-physical map (e.g. for ballooned down doms)
- */
- set_bit(n, to_fix);
- continue;
- }
-
- if ( last_iter &&
- test_bit(n, to_fix) &&
- !test_bit(n, to_send) )
- {
- needed_to_fix++;
- DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
- iter, n, pfn_type[batch]);
- }
-
- clear_bit(n, to_fix);
- }
-
- batch++;
- }
-
- if ( batch == 0 )
- goto skip; /* vanishingly unlikely... */
-
- region_base = xc_map_foreign_bulk(
- xch, dom, PROT_READ, pfn_type, pfn_err, batch);
- if ( region_base == NULL )
- {
- PERROR("map batch failed");
- goto out;
- }
-
- /* Get page types */
- if ( xc_get_pfn_type_batch(xch, dom, batch, pfn_type) )
- {
- PERROR("get_pfn_type_batch failed");
- goto out;
- }
-
- for ( run = j = 0; j < batch; j++ )
- {
- unsigned long gmfn = pfn_batch[j];
-
- if ( !hvm )
- gmfn = pfn_to_mfn(gmfn);
-
- if ( pfn_type[j] == XEN_DOMCTL_PFINFO_BROKEN )
- {
- pfn_type[j] |= pfn_batch[j];
- ++run;
- continue;
- }
-
- if ( pfn_err[j] )
- {
- if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB )
- continue;
-
- DPRINTF("map fail: page %i mfn %08lx err %d\n",
- j, gmfn, pfn_err[j]);
- pfn_type[j] = XEN_DOMCTL_PFINFO_XTAB;
- continue;
- }
-
- if ( pfn_type[j] == XEN_DOMCTL_PFINFO_XTAB )
- {
- DPRINTF("type fail: page %i mfn %08lx\n", j, gmfn);
- continue;
- }
-
- if ( superpages && iter==1 && test_bit(gmfn, to_skip))
- pfn_type[j] = XEN_DOMCTL_PFINFO_XALLOC;
-
- /* canonicalise mfn->pfn */
- pfn_type[j] |= pfn_batch[j];
- ++run;
-
- if ( debug )
- {
- if ( hvm )
- DPRINTF("%d pfn=%08lx sum=%08lx\n",
- iter,
- pfn_type[j],
- csum_page(region_base + (PAGE_SIZE*j)));
- else
- DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
- " sum= %08lx\n",
- iter,
- pfn_type[j],
- gmfn,
- mfn_to_pfn(gmfn),
- csum_page(region_base + (PAGE_SIZE*j)));
- }
- }
-
- if ( !run )
- {
- munmap(region_base, batch*PAGE_SIZE);
- continue; /* bail on this batch: no valid pages */
- }
-
- if ( wrexact(io_fd, &batch, sizeof(unsigned int)) )
- {
- PERROR("Error when writing to state file (2)");
- goto out;
- }
-
- if ( sizeof(unsigned long) < sizeof(*pfn_type) )
- for ( j = 0; j < batch; j++ )
- ((unsigned long *)pfn_type)[j] = pfn_type[j];
- if ( wrexact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
- {
- PERROR("Error when writing to state file (3)");
- goto out;
- }
- if ( sizeof(unsigned long) < sizeof(*pfn_type) )
- while ( --j >= 0 )
- pfn_type[j] = ((unsigned long *)pfn_type)[j];
-
- /* entering this loop, pfn_type is now in pfns (Not mfns) */
- run = 0;
- for ( j = 0; j < batch; j++ )
- {
- unsigned long pfn, pagetype;
- void *spage = (char *)region_base + (PAGE_SIZE*j);
-
- pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype != 0 )
- {
- /* If the page is not a normal data page, write out any
- run of pages we may have previously acumulated */
- if ( !compressing && run )
- {
- if ( wruncached(io_fd, live,
- (char*)region_base+(PAGE_SIZE*(j-run)),
- PAGE_SIZE*run) != PAGE_SIZE*run )
- {
- PERROR("Error when writing to state file (4a)"
- " (errno %d)", errno);
- goto out;
- }
- run = 0;
- }
- }
-
- /*
- * skip pages that aren't present,
- * or are broken, or are alloc-only
- */
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB
- || pagetype == XEN_DOMCTL_PFINFO_BROKEN
- || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
- continue;
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /* We have a pagetable page: need to rewrite it. */
- race =
- canonicalize_pagetable(ctx, pagetype, pfn, spage, page);
-
- if ( race && !live )
- {
- ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
- pagetype);
- goto out;
- }
-
- if (compressing)
- {
- int c_err;
- /* Mark pagetable page to be sent uncompressed */
- c_err = xc_compression_add_page(xch, compress_ctx, page,
- pfn, 1 /* raw page */);
- if (c_err == -2) /* OOB PFN */
- {
- ERROR("Could not add pagetable page "
- "(pfn:%" PRIpfn "to page buffer\n", pfn);
- goto out;
- }
-
- if (c_err == -1)
- {
- /*
- * We are out of buffer space to hold dirty
- * pages. Compress and flush the current buffer
- * to make space. This is a corner case, that
- * slows down checkpointing as the compression
- * happens while domain is suspended. Happens
- * seldom and if you find this occuring
- * frequently, increase the PAGE_BUFFER_SIZE
- * in xc_compression.c.
- */
- if (wrcompressed(io_fd) < 0)
- {
- ERROR("Error when writing compressed"
- " data (4b)\n");
- goto out;
- }
- }
- }
- else if ( wruncached(io_fd, live, page,
- PAGE_SIZE) != PAGE_SIZE )
- {
- PERROR("Error when writing to state file (4b)"
- " (errno %d)", errno);
- goto out;
- }
- }
- else
- {
- /* We have a normal page: accumulate it for writing. */
- if (compressing)
- {
- int c_err;
- /* For checkpoint compression, accumulate the page in the
- * page buffer, to be compressed later.
- */
- c_err = xc_compression_add_page(xch, compress_ctx, spage,
- pfn, 0 /* not raw page */);
-
- if (c_err == -2) /* OOB PFN */
- {
- ERROR("Could not add page "
- "(pfn:%" PRIpfn "to page buffer\n", pfn);
- goto out;
- }
-
- if (c_err == -1)
- {
- if (wrcompressed(io_fd) < 0)
- {
- ERROR("Error when writing compressed"
- " data (4c)\n");
- goto out;
- }
- }
- }
- else
- run++;
- }
- } /* end of the write out for this batch */
-
- if ( run )
- {
- /* write out the last accumulated run of pages */
- if ( wruncached(io_fd, live,
- (char*)region_base+(PAGE_SIZE*(j-run)),
- PAGE_SIZE*run) != PAGE_SIZE*run )
- {
- PERROR("Error when writing to state file (4c)"
- " (errno %d)", errno);
- goto out;
- }
- }
-
- sent_this_iter += batch;
-
- munmap(region_base, batch*PAGE_SIZE);
-
- } /* end of this while loop for this iteration */
-
- skip:
-
- xc_report_progress_step(xch, dinfo->p2m_size, dinfo->p2m_size);
-
- total_sent += sent_this_iter;
-
- if ( last_iter )
- {
- print_stats( xch, dom, sent_this_iter, &time_stats, &shadow_stats, 1);
-
- DPRINTF("Total pages sent= %ld (%.2fx)\n",
- total_sent, ((float)total_sent)/dinfo->p2m_size );
- DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
- }
-
- if ( last_iter && debug )
- {
- int id = XC_SAVE_ID_ENABLE_VERIFY_MODE;
- memset(to_send, 0xff, bitmap_size(dinfo->p2m_size));
- debug = 0;
- DPRINTF("Entering debug resend-all mode\n");
-
- /* send "-1" to put receiver into debug mode */
- if ( wrexact(io_fd, &id, sizeof(int)) )
- {
- PERROR("Error when writing to state file (6)");
- goto out;
- }
-
- continue;
- }
-
- if ( last_iter )
- break;
-
- if ( live )
- {
- if ( (iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 50) ||
- (total_sent > dinfo->p2m_size*max_factor) )
- {
- DPRINTF("Start last iteration\n");
- last_iter = 1;
-
- if ( suspend_and_state(callbacks->suspend, callbacks->data,
- xch, io_fd, dom, &info) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
-
- DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
- if ( (tmem_saved > 0) &&
- (xc_tmem_save_extra(xch,dom,io_fd,XC_SAVE_ID_TMEM_EXTRA) == -1) )
- {
- PERROR("Error when writing to state file (tmem)");
- goto out;
- }
-
- if ( save_tsc_info(xch, dom, io_fd) < 0 )
- {
- PERROR("Error when writing to state file (tsc)");
- goto out;
- }
-
-
- }
-
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, HYPERCALL_BUFFER(to_send),
- dinfo->p2m_size, NULL, 0, &shadow_stats) != dinfo->p2m_size )
- {
- PERROR("Error flushing shadow PT");
- goto out;
- }
-
- print_stats(xch, dom, sent_this_iter, &time_stats, &shadow_stats, 1);
-
- }
- } /* end of infinite for loop */
-
- DPRINTF("All memory is saved\n");
-
- /* After last_iter, buffer the rest of pagebuf & tailbuf data into a
- * separate output buffer and flush it after the compressed page chunks.
- */
- if (compressing)
- {
- ob = &ob_tailbuf;
- ob->pos = 0;
- }
-
- {
- struct chunk {
- int id;
- int max_vcpu_id;
- uint64_t vcpumap[XC_SR_MAX_VCPUS/64];
- } chunk = { XC_SAVE_ID_VCPU_INFO, info.max_vcpu_id };
-
- if ( info.max_vcpu_id >= XC_SR_MAX_VCPUS )
- {
- errno = E2BIG;
- ERROR("Too many VCPUS in guest!");
- goto out;
- }
-
- for ( i = 1; i <= info.max_vcpu_id; i++ )
- {
- xc_vcpuinfo_t vinfo;
- if ( (xc_vcpu_getinfo(xch, dom, i, &vinfo) == 0) &&
- vinfo.online )
- vcpumap[i/64] |= 1ULL << (i%64);
- }
-
- memcpy(chunk.vcpumap, vcpumap, vcpumap_sz(info.max_vcpu_id));
- if ( wrexact(io_fd, &chunk, offsetof(struct chunk, vcpumap)
- + vcpumap_sz(info.max_vcpu_id)) )
- {
- PERROR("Error when writing to state file");
- goto out;
- }
- }
-
- if ( hvm )
- {
- struct {
- int id;
- uint32_t pad;
- uint64_t data;
- } chunk = { 0, };
-
- chunk.id = XC_SAVE_ID_HVM_GENERATION_ID_ADDR;
- xc_hvm_param_get(xch, dom, HVM_PARAM_VM_GENERATION_ID_ADDR, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the generation id buffer location for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_IDENT_PT;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_IDENT_PT, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the ident_pt for EPT guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_PAGING_RING_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_PAGING_RING_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the paging ring pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_MONITOR_RING_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_MONITOR_RING_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the access ring pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_SHARING_RING_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_SHARING_RING_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the sharing ring pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_VM86_TSS;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_VM86_TSS, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the vm86 TSS for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_CONSOLE_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_CONSOLE_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the console pfn for guest");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_ACPI_IOPORTS_LOCATION, &chunk.data);
-
- if ((chunk.data != 0) && wrexact(io_fd, &chunk, sizeof(chunk)))
- {
- PERROR("Error when writing the firmware ioport version");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_VIRIDIAN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_VIRIDIAN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the viridian flag");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_IOREQ_SERVER_PFN;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_IOREQ_SERVER_PFN, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the ioreq server gmfn base");
- goto out;
- }
-
- chunk.id = XC_SAVE_ID_HVM_NR_IOREQ_SERVER_PAGES;
- chunk.data = 0;
- xc_hvm_param_get(xch, dom, HVM_PARAM_NR_IOREQ_SERVER_PAGES, &chunk.data);
-
- if ( (chunk.data != 0) &&
- wrexact(io_fd, &chunk, sizeof(chunk)) )
- {
- PERROR("Error when writing the ioreq server gmfn count");
- goto out;
- }
- }
-
- if ( callbacks != NULL && callbacks->toolstack_save != NULL )
- {
- int id = XC_SAVE_ID_TOOLSTACK;
- uint8_t *buf;
- uint32_t len;
-
- if ( callbacks->toolstack_save(dom, &buf, &len, callbacks->data) < 0 )
- {
- PERROR("Error calling toolstack_save");
- goto out;
- }
- wrexact(io_fd, &id, sizeof(id));
- wrexact(io_fd, &len, sizeof(len));
- wrexact(io_fd, buf, len);
- free(buf);
- }
-
- if ( !callbacks->checkpoint )
- {
- /*
- * If this is not a checkpointed save then this must be the first and
- * last checkpoint.
- */
- i = XC_SAVE_ID_LAST_CHECKPOINT;
- if ( wrexact(io_fd, &i, sizeof(int)) )
- {
- PERROR("Error when writing last checkpoint chunk");
- goto out;
- }
- }
-
- /* Enable compression logic on both sides by sending this
- * one time marker.
- * NOTE: We could have simplified this procedure by sending
- * the enable/disable compression flag before the beginning of
- * the main for loop. But this would break compatibility for
- * live migration code, with older versions of xen. So we have
- * to enable it after the last_iter, when the XC_SAVE_ID_*
- * elements are sent.
- */
- if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS))
- {
- i = XC_SAVE_ID_ENABLE_COMPRESSION;
- if ( wrexact(io_fd, &i, sizeof(int)) )
- {
- PERROR("Error when writing enable_compression marker");
- goto out;
- }
- }
-
- /* Zero terminate */
- i = 0;
- if ( wrexact(io_fd, &i, sizeof(int)) )
- {
- PERROR("Error when writing to state file (6')");
- goto out;
- }
-
- if ( hvm )
- {
- uint32_t rec_size;
-
- /* Save magic-page locations. */
- memset(magic_pfns, 0, sizeof(magic_pfns));
- xc_hvm_param_get(xch, dom, HVM_PARAM_IOREQ_PFN, &magic_pfns[0]);
- xc_hvm_param_get(xch, dom, HVM_PARAM_BUFIOREQ_PFN, &magic_pfns[1]);
- xc_hvm_param_get(xch, dom, HVM_PARAM_STORE_PFN, &magic_pfns[2]);
- if ( wrexact(io_fd, magic_pfns, sizeof(magic_pfns)) )
- {
- PERROR("Error when writing to state file (7)");
- goto out;
- }
-
- /* Get HVM context from Xen and save it too */
- if ( (rec_size = xc_domain_hvm_getcontext(xch, dom, hvm_buf,
- hvm_buf_size)) == -1 )
- {
- PERROR("HVM:Could not get hvm buffer");
- goto out;
- }
-
- if ( wrexact(io_fd, &rec_size, sizeof(uint32_t)) )
- {
- PERROR("error write hvm buffer size");
- goto out;
- }
-
- if ( wrexact(io_fd, hvm_buf, rec_size) )
- {
- PERROR("write HVM info failed!");
- goto out;
- }
-
- /* HVM guests are done now */
- goto success;
- }
-
- /* PV guests only from now on */
-
- /* Send through a list of all the PFNs that were not in map at the close */
- {
- unsigned int i,j;
- unsigned long pfntab[1024];
-
- for ( i = 0, j = 0; i < dinfo->p2m_size; i++ )
- {
- if ( !is_mapped(pfn_to_mfn(i)) )
- j++;
- }
-
- if ( wrexact(io_fd, &j, sizeof(unsigned int)) )
- {
- PERROR("Error when writing to state file (6a)");
- goto out;
- }
-
- for ( i = 0, j = 0; i < dinfo->p2m_size; )
- {
- if ( !is_mapped(pfn_to_mfn(i)) )
- pfntab[j++] = i;
-
- i++;
- if ( (j == 1024) || (i == dinfo->p2m_size) )
- {
- if ( wrexact(io_fd, &pfntab, sizeof(unsigned long)*j) )
- {
- PERROR("Error when writing to state file (6b)");
- goto out;
- }
- j = 0;
- }
- }
- }
-
- if ( xc_vcpu_getcontext(xch, dom, 0, &ctxt) )
- {
- PERROR("Could not get vcpu context");
- goto out;
- }
-
- /*
- * Canonicalise the start info frame number.
- *
- * The start info MFN is the 3rd argument to the
- * HYPERVISOR_sched_op hypercall when op==SCHEDOP_shutdown and
- * reason==SHUTDOWN_suspend and is therefore found in the edx
- * register.
- */
- mfn = GET_FIELD(&ctxt, user_regs.edx, dinfo->guest_width);
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- errno = ERANGE;
- ERROR("Suspend record is not in range of pseudophys map");
- goto out;
- }
- SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn), dinfo->guest_width);
-
- for ( i = 0; i <= info.max_vcpu_id; i++ )
- {
- if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
- continue;
-
- if ( (i != 0) && xc_vcpu_getcontext(xch, dom, i, &ctxt) )
- {
- PERROR("No context for VCPU%d", i);
- goto out;
- }
-
- /* Canonicalise each GDT frame number. */
- for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents, dinfo->guest_width); j++ )
- {
- mfn = GET_FIELD(&ctxt, gdt_frames[j], dinfo->guest_width);
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
- {
- errno = ERANGE;
- ERROR("GDT frame is not in range of pseudophys map");
- goto out;
- }
- SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn), dinfo->guest_width);
- }
-
- /* Canonicalise the page table base pointer. */
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(
- UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3], dinfo->guest_width))) )
- {
- errno = ERANGE;
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- SET_FIELD(&ctxt, ctrlreg[3],
- FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(
- GET_FIELD(&ctxt, ctrlreg[3], dinfo->guest_width)
- ))), dinfo->guest_width);
-
- /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
- if ( (ctx->pt_levels == 4) && ctxt.x64.ctrlreg[1] )
- {
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
- {
- errno = ERANGE;
- ERROR("PT base is not in range of pseudophys map");
- goto out;
- }
- /* Least-significant bit means 'valid PFN'. */
- ctxt.x64.ctrlreg[1] = 1 |
- FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
- }
-
- if ( wrexact(io_fd, &ctxt, ((dinfo->guest_width==8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32))) )
- {
- PERROR("Error when writing to state file (1)");
- goto out;
- }
-
- domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
- domctl.domain = dom;
- memset(&domctl.u, 0, sizeof(domctl.u));
- domctl.u.ext_vcpucontext.vcpu = i;
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No extended context for VCPU%d", i);
- goto out;
- }
- if ( wrexact(io_fd, &domctl.u.ext_vcpucontext, 128) )
- {
- PERROR("Error when writing to state file (2)");
- goto out;
- }
-
- /* Check there are no PV MSRs in use. */
- domctl.cmd = XEN_DOMCTL_get_vcpu_msrs;
- domctl.domain = dom;
- memset(&domctl.u, 0, sizeof(domctl.u));
- domctl.u.vcpu_msrs.vcpu = i;
- domctl.u.vcpu_msrs.msr_count = 0;
- set_xen_guest_handle_raw(domctl.u.vcpu_msrs.msrs, (void*)1);
-
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- if ( errno == ENOBUFS )
- {
- errno = EOPNOTSUPP;
- PERROR("Unable to migrate PV guest using MSRs (yet)");
- }
- else
- PERROR("Error querying maximum number of MSRs for VCPU%d", i);
- goto out;
- }
-
- /* Start to fetch CPU eXtended States */
- /* Get buffer size first */
- domctl.cmd = XEN_DOMCTL_getvcpuextstate;
- domctl.domain = dom;
- domctl.u.vcpuextstate.vcpu = i;
- domctl.u.vcpuextstate.xfeature_mask = 0;
- domctl.u.vcpuextstate.size = 0;
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No eXtended states (XSAVE) for VCPU%d", i);
- goto out;
- }
-
- if ( !domctl.u.vcpuextstate.xfeature_mask )
- continue;
-
- /* Getting eXtended states data */
- buffer = xc_hypercall_buffer_alloc(xch, buffer, domctl.u.vcpuextstate.size);
- if ( !buffer )
- {
- PERROR("Insufficient memory for getting eXtended states for"
- "VCPU%d", i);
- goto out;
- }
- set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
- if ( xc_domctl(xch, &domctl) < 0 )
- {
- PERROR("No eXtended states (XSAVE) for VCPU%d", i);
- xc_hypercall_buffer_free(xch, buffer);
- goto out;
- }
-
- if ( wrexact(io_fd, &domctl.u.vcpuextstate.xfeature_mask,
- sizeof(domctl.u.vcpuextstate.xfeature_mask)) ||
- wrexact(io_fd, &domctl.u.vcpuextstate.size,
- sizeof(domctl.u.vcpuextstate.size)) ||
- wrexact(io_fd, buffer, domctl.u.vcpuextstate.size) )
- {
- PERROR("Error when writing to state file VCPU extended state");
- xc_hypercall_buffer_free(xch, buffer);
- goto out;
- }
- xc_hypercall_buffer_free(xch, buffer);
- }
-
- /*
- * Reset the MFN to be a known-invalid value. See map_frame_list_list().
- */
- memcpy(page, live_shinfo, PAGE_SIZE);
- SET_FIELD(((shared_info_any_t *)page),
- arch.pfn_to_mfn_frame_list_list, 0, dinfo->guest_width);
- if ( wrexact(io_fd, page, PAGE_SIZE) )
- {
- PERROR("Error when writing to state file (1)");
- goto out;
- }
-
- /* Flush last write and check for errors. */
- if ( fsync(io_fd) && errno != EINVAL )
- {
- PERROR("Error when flushing state file");
- goto out;
- }
-
- /* Success! */
- success:
- rc = errno = 0;
- goto out_rc;
-
- out:
- rc = errno;
- assert(rc);
- out_rc:
- completed = 1;
-
- if ( !rc && callbacks->postcopy )
- callbacks->postcopy(callbacks->data);
-
- /* guest has been resumed. Now we can compress data
- * at our own pace.
- */
- if (!rc && compressing)
- {
- ob = &ob_pagebuf;
- if (wrcompressed(io_fd) < 0)
- {
- ERROR("Error when writing compressed data, after postcopy\n");
- goto out;
- }
- /* Append the tailbuf data to the main outbuf */
- if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) )
- {
- PERROR("Error when copying tailbuf into outbuf");
- goto out;
- }
- }
-
- /* Flush last write and discard cache for file. */
- if ( ob && outbuf_flush(xch, ob, io_fd) < 0 ) {
- PERROR("Error when flushing output buffer");
- if (!rc)
- rc = errno;
- }
-
- discard_file_cache(xch, io_fd, 1 /* flush */);
-
- /* Enable compression now, finally */
- compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS);
-
- /* checkpoint_cb can spend arbitrarily long in between rounds */
- if (!rc && callbacks->checkpoint &&
- callbacks->checkpoint(callbacks->data) > 0)
- {
- /* reset stats timer */
- print_stats(xch, dom, 0, &time_stats, &shadow_stats, 0);
-
- /* last_iter = 1; */
- if ( suspend_and_state(callbacks->suspend, callbacks->data, xch,
- io_fd, dom, &info) )
- {
- ERROR("Domain appears not to have suspended");
- goto out;
- }
- DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
- print_stats(xch, dom, 0, &time_stats, &shadow_stats, 1);
-
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_CLEAN, HYPERCALL_BUFFER(to_send),
- dinfo->p2m_size, NULL, 0, &shadow_stats) != dinfo->p2m_size )
- {
- PERROR("Error flushing shadow PT");
- }
-
- goto copypages;
- }
-
- if ( tmem_saved != 0 && live )
- xc_tmem_save_done(xch, dom);
-
- if ( live )
- {
- if ( xc_shadow_control(xch, dom,
- XEN_DOMCTL_SHADOW_OP_OFF,
- NULL, 0, NULL, 0, NULL) < 0 )
- DPRINTF("Warning - couldn't disable shadow mode");
- if ( hvm && callbacks->switch_qemu_logdirty(dom, 0, callbacks->data) )
- DPRINTF("Warning - couldn't disable qemu log-dirty mode");
- }
-
- if (compress_ctx)
- xc_compression_free_context(xch, compress_ctx);
-
- if ( live_shinfo )
- munmap(live_shinfo, PAGE_SIZE);
-
- if ( ctx->live_p2m )
- munmap(ctx->live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
-
- if ( ctx->live_m2p )
- munmap(ctx->live_m2p, M2P_SIZE(ctx->max_mfn));
-
- xc_hypercall_buffer_free_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
- xc_hypercall_buffer_free_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
-
- free(pfn_type);
- free(pfn_batch);
- free(pfn_err);
- free(to_fix);
- free(hvm_buf);
- outbuf_free(&ob_pagebuf);
-
- errno = rc;
-exit:
- DPRINTF("Save exit of domid %u with errno=%d\n", dom, errno);
-
- return !!errno;
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */