direct-io.hg

changeset 7702:b3c2bc39d815

Enable save/restore for PAE domains.

This includes quite a few cleanups / refactoring of the old code, some
of which is intended to prepare for 64-bit save/restore.

Signed-off-by: Steven Hand <steven@xensource.com>
author smh22@firebug.cl.cam.ac.uk
date Tue Nov 08 18:42:07 2005 +0100 (2005-11-08)
parents abbe3df33774
children 539b2757642e
files tools/libxc/xc_linux_restore.c tools/libxc/xc_linux_save.c tools/libxc/xenctrl.h tools/libxc/xg_private.h tools/libxc/xg_save_restore.h
line diff
     1.1 --- a/tools/libxc/xc_linux_restore.c	Tue Nov 08 18:39:58 2005 +0100
     1.2 +++ b/tools/libxc/xc_linux_restore.c	Tue Nov 08 18:42:07 2005 +0100
     1.3 @@ -8,32 +8,30 @@
     1.4  
     1.5  #include <stdlib.h>
     1.6  #include <unistd.h>
     1.7 -#include "xg_private.h"
     1.8 -#include <xenctrl.h>
     1.9 -#include <xen/memory.h>
    1.10 -
    1.11 -#define MAX_BATCH_SIZE 1024
    1.12 -
    1.13 -#define DEBUG 0
    1.14  
    1.15 -#if 1
    1.16 -#define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
    1.17 -#else
    1.18 -#define ERR(_f, _a...) ((void)0)
    1.19 -#endif
    1.20 +#include "xg_private.h"
    1.21 +#include "xg_save_restore.h"
    1.22  
    1.23 -#if DEBUG
    1.24 -#define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
    1.25 -#else
    1.26 -#define DPRINTF(_f, _a...) ((void)0)
    1.27 -#endif
    1.28  
    1.29 -#define PROGRESS 0
    1.30 -#if PROGRESS
    1.31 -#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ); fflush(stderr)
    1.32 -#else
    1.33 -#define PPRINTF(_f, _a...)
    1.34 -#endif
    1.35 +
    1.36 +/* max mfn of the whole machine */
    1.37 +static uint32_t max_mfn; 
    1.38 +
    1.39 +/* virtual starting address of the hypervisor */
    1.40 +static uint32_t hvirt_start; 
    1.41 +
    1.42 +/* #levels of page tables used by the current guest */
    1.43 +static uint32_t pt_levels; 
    1.44 +
    1.45 +/* total number of pages used by the current guest */
    1.46 +static unsigned long max_pfn;
    1.47 +
    1.48 +/* Live mapping of the table mapping each PFN to its current MFN. */
    1.49 +static unsigned long *live_p2m = NULL;
    1.50 +
    1.51 +/* A table mapping each PFN to its new MFN. */
    1.52 +static unsigned long *p2m = NULL;
    1.53 +
    1.54  
    1.55  static ssize_t
    1.56  read_exact(int fd, void *buf, size_t count)
    1.57 @@ -45,24 +43,93 @@ read_exact(int fd, void *buf, size_t cou
    1.58          s = read(fd, &b[r], count - r);
    1.59          if ((s == -1) && (errno == EINTR))
    1.60              continue;
    1.61 -        if (s <= 0)
    1.62 +        if (s <= 0) { 
    1.63              break;
    1.64 +        } 
    1.65          r += s;
    1.66      }
    1.67  
    1.68 -    return r;
    1.69 +    return (r == count) ? 1 : 0; 
    1.70  }
    1.71  
    1.72 -int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, unsigned long nr_pfns,
    1.73 +
    1.74 +/*
    1.75 +** In the state file (or during transfer), all page-table pages are 
    1.76 +** converted into a 'canonical' form where references to actual mfns 
    1.77 +** are replaced with references to the corresponding pfns. 
    1.78 +** This function inverts that operation, replacing the pfn values with 
    1.79 +** the (now known) appropriate mfn values. 
    1.80 +*/
    1.81 +int uncanonicalize_pagetable(unsigned long type, void *page) 
    1.82 +{ 
    1.83 +    int i, pte_last, xen_start, xen_end; 
    1.84 +    unsigned long pfn; 
    1.85 +    uint64_t pte; 
    1.86 +
    1.87 +    /* 
    1.88 +    ** We need to determine which entries in this page table hold
    1.89 +    ** reserved hypervisor mappings. This depends on the current
    1.90 +    ** page table type as well as the number of paging levels. 
    1.91 +    */
    1.92 +    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
    1.93 +    
    1.94 +    if (pt_levels == 2 && type == L2TAB)
    1.95 +        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
    1.96 +
    1.97 +    if (pt_levels == 3 && type == L3TAB) 
    1.98 +        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
    1.99 +
   1.100 +
   1.101 +    /* Now iterate through the page table, uncanonicalizing each PTE */
   1.102 +    for(i = 0; i < pte_last; i++) { 
   1.103 +        
   1.104 +        if(pt_levels == 2) 
   1.105 +            pte = ((uint32_t *)page)[i]; 
   1.106 +        else 
   1.107 +            pte = ((uint64_t *)page)[i]; 
   1.108 +        
   1.109 +        if(i >= xen_start && i < xen_end) 
   1.110 +            pte = 0; 
   1.111 +        
   1.112 +        if(pte & _PAGE_PRESENT) { 
   1.113 +            
   1.114 +            pfn = pte >> PAGE_SHIFT; 
   1.115 +            
   1.116 +            if(pfn >= max_pfn) { 
   1.117 +                ERR("Frame number in type %lu page table is out of range: "
   1.118 +                    "i=%d pfn=0x%lx max_pfn=%lu", 
   1.119 +                    type >> 28, i, pfn, max_pfn);
   1.120 +                return 0; 
   1.121 +            } 
   1.122 +            
   1.123 +            
   1.124 +            if(type == L1TAB) 
   1.125 +                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
   1.126 +            else 
   1.127 +                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
   1.128 +            
   1.129 +            pte |= p2m[pfn] << PAGE_SHIFT;
   1.130 +            
   1.131 +            if(pt_levels == 2) 
   1.132 +                ((uint32_t *)page)[i] = (uint32_t)pte; 
   1.133 +            else 
   1.134 +                ((uint64_t *)page)[i] = (uint64_t)pte; 
   1.135 +        }
   1.136 +    }
   1.137 +    
   1.138 +    return 1; 
   1.139 +}
   1.140 +
   1.141 +int xc_linux_restore(int xc_handle, int io_fd, 
   1.142 +                     uint32_t dom, unsigned long nr_pfns, 
   1.143                       unsigned int store_evtchn, unsigned long *store_mfn,
   1.144                       unsigned int console_evtchn, unsigned long *console_mfn)
   1.145  {
   1.146      dom0_op_t op;
   1.147 -    int rc = 1, i, n, k;
   1.148 -    unsigned long mfn, pfn, xpfn;
   1.149 +    int rc = 1, i, n;
   1.150 +    unsigned long mfn, pfn; 
   1.151      unsigned int prev_pc, this_pc;
   1.152      int verify = 0;
   1.153 -    int err;
   1.154  
   1.155      /* The new domain's shared-info frame number. */
   1.156      unsigned long shared_info_frame;
   1.157 @@ -72,29 +139,21 @@ int xc_linux_restore(int xc_handle, int 
   1.158      /* A copy of the CPU context of the guest. */
   1.159      vcpu_guest_context_t ctxt;
   1.160  
   1.161 -    /* A table containg the type of each PFN (/not/ MFN!). */
   1.162 +    /* A table containing the type of each PFN (/not/ MFN!). */
   1.163      unsigned long *pfn_type = NULL;
   1.164  
   1.165      /* A table of MFNs to map in the current region */
   1.166      unsigned long *region_mfn = NULL;
   1.167  
   1.168      /* A temporary mapping, and a copy, of one frame of guest memory. */
   1.169 -    unsigned long *ppage = NULL;
   1.170 +    unsigned long *page = NULL;
   1.171  
   1.172      /* A copy of the pfn-to-mfn table frame list. */
   1.173 -    unsigned long pfn_to_mfn_frame_list[1024];
   1.174 -
   1.175 -    /* A table mapping each PFN to its new MFN. */
   1.176 -    unsigned long *pfn_to_mfn_table = NULL;
   1.177 -
   1.178 -    /* used by mapper for updating the domain's copy of the table */
   1.179 -    unsigned long *live_pfn_to_mfn_table = NULL;
   1.180 +    unsigned long *p2m_frame_list = NULL; 
   1.181  
   1.182      /* A temporary mapping of the guest's start_info page. */
   1.183      start_info_t *start_info;
   1.184  
   1.185 -    int pt_levels = 2; /* XXX auto-detect this */
   1.186 -
   1.187      char *region_base;
   1.188  
   1.189      xc_mmu_t *mmu = NULL;
   1.190 @@ -102,37 +161,60 @@ int xc_linux_restore(int xc_handle, int 
   1.191      /* used by debug verify code */
   1.192      unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
   1.193  
   1.194 -#define MAX_PIN_BATCH 1024
   1.195      struct mmuext_op pin[MAX_PIN_BATCH];
   1.196      unsigned int nr_pins = 0;
   1.197  
   1.198 -    DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);
   1.199 +
   1.200 +    max_pfn = nr_pfns; 
   1.201 +
   1.202 +    DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);
   1.203 +
   1.204 +
   1.205 +    if(!get_platform_info(xc_handle, dom, 
   1.206 +                          &max_mfn, &hvirt_start, &pt_levels)) {
   1.207 +        ERR("Unable to get platform info."); 
   1.208 +        return 1;
   1.209 +    }
   1.210 +
   1.211  
   1.212      if (mlock(&ctxt, sizeof(ctxt))) {
   1.213 -        /* needed for when we do the build dom0 op, 
   1.214 -           but might as well do early */
   1.215 +        /* needed for build dom0 op, but might as well do early */
   1.216          ERR("Unable to mlock ctxt");
   1.217          return 1;
   1.218      }
   1.219  
   1.220 -    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
   1.221 -        ERR("read pfn_to_mfn_frame_list failed");
   1.222 +
   1.223 +    /* Only have to worry about vcpu 0 even for SMP */
   1.224 +    if (xc_domain_get_vcpu_context( xc_handle, dom, 0, &ctxt)) {
   1.225 +        ERR("Could not get vcpu context");
   1.226          goto out;
   1.227      }
   1.228  
   1.229 +    
   1.230 +    /* Read the saved P2M frame list */
   1.231 +    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
   1.232 +        ERR("Couldn't allocate p2m_frame_list array");
   1.233 +        goto out;
   1.234 +    }
   1.235 +    
   1.236 +    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { 
   1.237 +        ERR("read p2m_frame_list failed");
   1.238 +        goto out;
   1.239 +    }
   1.240 +
   1.241 +    
   1.242      /* We want zeroed memory so use calloc rather than malloc. */
   1.243 -    pfn_to_mfn_table = calloc(4, nr_pfns);
   1.244 -    pfn_type = calloc(4, nr_pfns);    
   1.245 -    region_mfn = calloc(4, MAX_BATCH_SIZE);
   1.246 +    p2m        = calloc(sizeof(unsigned long), max_pfn); 
   1.247 +    pfn_type   = calloc(sizeof(unsigned long), max_pfn);    
   1.248 +    region_mfn = calloc(sizeof(unsigned long), MAX_BATCH_SIZE);
   1.249  
   1.250 -    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) || 
   1.251 -        (region_mfn == NULL)) {
   1.252 +    if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
   1.253          ERR("memory alloc failed");
   1.254          errno = ENOMEM;
   1.255          goto out;
   1.256      }
   1.257      
   1.258 -    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
   1.259 +    if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
   1.260          ERR("Could not mlock region_mfn");
   1.261          goto out;
   1.262      }
   1.263 @@ -146,35 +228,30 @@ int xc_linux_restore(int xc_handle, int 
   1.264      }
   1.265      shared_info_frame = op.u.getdomaininfo.shared_info_frame;
   1.266  
   1.267 -    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
   1.268 -    if (err != 0) {
   1.269 +    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) { 
   1.270          errno = ENOMEM;
   1.271          goto out;
   1.272      }
   1.273 -
   1.274 -    err = xc_domain_memory_increase_reservation(xc_handle, dom,
   1.275 -                                                nr_pfns, 0, 0, NULL);
   1.276 -    if (err != 0) {
   1.277 -        ERR("Failed to increase reservation by %lx\n", 
   1.278 -            nr_pfns * PAGE_SIZE / 1024); 
   1.279 +    
   1.280 +    if(xc_domain_memory_increase_reservation(
   1.281 +           xc_handle, dom, max_pfn, 0, 0, NULL) != 0) { 
   1.282 +        ERR("Failed to increase reservation by %lx KB\n", max_pfn); 
   1.283          errno = ENOMEM;
   1.284          goto out;
   1.285      }
   1.286  
   1.287      /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
   1.288 -    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
   1.289 -        nr_pfns) {
   1.290 +    if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
   1.291          ERR("Did not read correct number of frame numbers for new dom");
   1.292          goto out;
   1.293      }
   1.294 -
   1.295 -    mmu = xc_init_mmu_updates(xc_handle, dom);
   1.296 -    if (mmu == NULL) {
   1.297 +    
   1.298 +    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) { 
   1.299          ERR("Could not initialise for MMU updates");
   1.300          goto out;
   1.301      }
   1.302  
   1.303 -    DPRINTF("Reloading memory pages:   0%%");
   1.304 +    DPRINTF("Reloading memory pages:   0%%\n");
   1.305  
   1.306      /*
   1.307       * Now simply read each saved frame into its new machine frame.
   1.308 @@ -183,258 +260,229 @@ int xc_linux_restore(int xc_handle, int 
   1.309      prev_pc = 0;
   1.310  
   1.311      n = 0;
   1.312 -    while ( 1 )
   1.313 -    {
   1.314 +    while (1) { 
   1.315 +
   1.316          int j;
   1.317          unsigned long region_pfn_type[MAX_BATCH_SIZE];
   1.318  
   1.319 -        this_pc = (n * 100) / nr_pfns;
   1.320 +        this_pc = (n * 100) / max_pfn;
   1.321          if ( (this_pc - prev_pc) >= 5 )
   1.322          {
   1.323              PPRINTF("\b\b\b\b%3d%%", this_pc);
   1.324              prev_pc = this_pc;
   1.325          }
   1.326  
   1.327 -        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
   1.328 -        {
   1.329 +        if (!read_exact(io_fd, &j, sizeof(int))) { 
   1.330              ERR("Error when reading batch size");
   1.331              goto out;
   1.332          }
   1.333  
   1.334          PPRINTF("batch %d\n",j);
   1.335   
   1.336 -        if ( j == -1 )
   1.337 -        {
   1.338 +        if (j == -1) {
   1.339              verify = 1;
   1.340 -            printf("Entering page verify mode\n");
   1.341 +            fprintf(stderr, "Entering page verify mode\n");
   1.342              continue;
   1.343          }
   1.344  
   1.345 -        if ( j == 0 )
   1.346 +        if (j == 0)
   1.347              break;  /* our work here is done */
   1.348  
   1.349 -        if ( j > MAX_BATCH_SIZE )
   1.350 -        {
   1.351 +        if (j > MAX_BATCH_SIZE) { 
   1.352              ERR("Max batch size exceeded. Giving up.");
   1.353              goto out;
   1.354          }
   1.355   
   1.356 -        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
   1.357 -             j*sizeof(unsigned long) ) {
   1.358 +        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) { 
   1.359              ERR("Error when reading region pfn types");
   1.360              goto out;
   1.361          }
   1.362  
   1.363 -        for ( i = 0; i < j; i++ )
   1.364 -        {
   1.365 -            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
   1.366 -            {
   1.367 +        for (i = 0; i < j; i++) { 
   1.368 +
   1.369 +            if ((region_pfn_type[i] & LTAB_MASK) == XTAB)
   1.370                  region_mfn[i] = 0; /* we know map will fail, but don't care */
   1.371 -            }
   1.372 -            else
   1.373 -            {  
   1.374 -                pfn = region_pfn_type[i] & ~LTAB_MASK;
   1.375 -                region_mfn[i] = pfn_to_mfn_table[pfn];
   1.376 -            }          
   1.377 +            else 
   1.378 +                region_mfn[i] = p2m[region_pfn_type[i] & ~LTAB_MASK]; 
   1.379 +
   1.380          }
   1.381   
   1.382 -        if ( (region_base = xc_map_foreign_batch( xc_handle, dom, 
   1.383 -                                                  PROT_WRITE,
   1.384 -                                                  region_mfn,
   1.385 -                                                  j )) == 0 )
   1.386 -        {
   1.387 +        if (!(region_base = xc_map_foreign_batch(
   1.388 +                  xc_handle, dom, PROT_WRITE, region_mfn, j))) {  
   1.389              ERR("map batch failed");
   1.390              goto out;
   1.391          }
   1.392  
   1.393          for ( i = 0; i < j; i++ )
   1.394          {
   1.395 -            unsigned long *ppage;
   1.396 -
   1.397 -            pfn = region_pfn_type[i] & ~LTAB_MASK;
   1.398 +            void *page;
   1.399 +            unsigned long pagetype; 
   1.400  
   1.401 -            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;
   1.402 +            pfn      = region_pfn_type[i] & ~LTAB_MASK;
   1.403 +            pagetype = region_pfn_type[i] & LTAB_MASK; 
   1.404  
   1.405 -            if (pfn>nr_pfns)
   1.406 -            {
   1.407 +            if (pagetype == XTAB) 
   1.408 +                /* a bogus/unmapped page: skip it */
   1.409 +                continue;
   1.410 +            
   1.411 +            if (pfn > max_pfn) {
   1.412                  ERR("pfn out of range");
   1.413                  goto out;
   1.414              }
   1.415  
   1.416 -            region_pfn_type[i] &= LTAB_MASK;
   1.417 -
   1.418 -            pfn_type[pfn] = region_pfn_type[i];
   1.419 -
   1.420 -            mfn = pfn_to_mfn_table[pfn];
   1.421 +            pfn_type[pfn] = pagetype; 
   1.422  
   1.423 -            if ( verify )
   1.424 -                ppage = (unsigned long*) buf;  /* debug case */
   1.425 -            else
   1.426 -                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
   1.427 +            mfn = p2m[pfn];
   1.428  
   1.429 -            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
   1.430 -            {
   1.431 -                ERR("Error when reading pagetable page");
   1.432 +            /* In verify mode, we use a copy; otherwise we work in place */
   1.433 +            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE); 
   1.434 +
   1.435 +            if (!read_exact(io_fd, page, PAGE_SIZE)) { 
   1.436 +                ERR("Error when reading page (type was %lx)", pagetype);
   1.437                  goto out;
   1.438              }
   1.439  
   1.440 -            switch( region_pfn_type[i] & LTABTYPE_MASK )
   1.441 -            {
   1.442 -            case 0:
   1.443 -                break;
   1.444 -
   1.445 -            case L1TAB:
   1.446 -            {
   1.447 -                for ( k = 0; k < 1024; k++ ) 
   1.448 -                {
   1.449 -                    if ( ppage[k] & _PAGE_PRESENT ) 
   1.450 -                    {
   1.451 -                        xpfn = ppage[k] >> PAGE_SHIFT;
   1.452 -                        if ( xpfn >= nr_pfns )
   1.453 -                        {
   1.454 -                            ERR("Frame number in type %lu page "
   1.455 -                                "table is out of range. i=%d k=%d "
   1.456 -                                "pfn=0x%lx nr_pfns=%lu", 
   1.457 -                                region_pfn_type[i]>>28, i, 
   1.458 -                                k, xpfn, nr_pfns);
   1.459 -                            goto out;
   1.460 -                        }
   1.461 -
   1.462 -                        ppage[k] &= (PAGE_SIZE - 1) & 
   1.463 -                            ~(_PAGE_GLOBAL | _PAGE_PAT);
   1.464 -                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
   1.465 -                    }
   1.466 -                }
   1.467 -            }
   1.468 -            break;
   1.469 +            pagetype &= LTABTYPE_MASK; 
   1.470  
   1.471 -            case L2TAB:
   1.472 -            {
   1.473 -                for ( k = 0; 
   1.474 -                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT); 
   1.475 -                      k++ )
   1.476 -                {
   1.477 -                    if ( ppage[k] & _PAGE_PRESENT )
   1.478 -                    {
   1.479 -                        xpfn = ppage[k] >> PAGE_SHIFT;
   1.480 +            if(pagetype >= L1TAB && pagetype <= L4TAB) { 
   1.481 +                
   1.482 +                /* 
   1.483 +                ** A page table page - need to 'uncanonicalize' it, i.e. 
   1.484 +                ** replace all the references to pfns with the corresponding 
   1.485 +                ** mfns for the new domain. 
   1.486 +                */ 
   1.487 +                if(!uncanonicalize_pagetable(pagetype, page))
   1.488 +                    goto out; 
   1.489  
   1.490 -                        if ( xpfn >= nr_pfns )
   1.491 -                        {
   1.492 -                            ERR("Frame number in type %lu page"
   1.493 -                                " table is out of range. i=%d k=%d "
   1.494 -                                "pfn=%lu nr_pfns=%lu",
   1.495 -                                region_pfn_type[i]>>28, i, k, 
   1.496 -                                xpfn, nr_pfns);
   1.497 -                            goto out;
   1.498 -                        }
   1.499 +            } else if(pagetype != NOTAB) { 
   1.500  
   1.501 -                        ppage[k] &= (PAGE_SIZE - 1) & 
   1.502 -                            ~(_PAGE_GLOBAL | _PAGE_PSE);
   1.503 -                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
   1.504 -                    }
   1.505 -                }
   1.506 -            }
   1.507 -            break;
   1.508 -
   1.509 -            default:
   1.510 -                ERR("Bogus page type %lx page table is "
   1.511 -                    "out of range. i=%d nr_pfns=%lu", 
   1.512 -                    region_pfn_type[i], i, nr_pfns);
   1.513 +                ERR("Bogus page type %lx page table is out of range: "
   1.514 +                    "i=%d max_pfn=%lu", pagetype, i, max_pfn);
   1.515                  goto out;
   1.516  
   1.517 -            } /* end of page type switch statement */
   1.518 +            } 
   1.519  
   1.520 -            if ( verify )
   1.521 -            {
   1.522 -                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
   1.523 -                if ( res )
   1.524 -                {
   1.525 +
   1.526 +
   1.527 +            if (verify) {
   1.528 +
   1.529 +                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
   1.530 +
   1.531 +                if (res) { 
   1.532 +
   1.533                      int v;
   1.534 -                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
   1.535 -                           "actualcs=%08lx\n", pfn, pfn_type[pfn], 
   1.536 -                           csum_page(region_base + i*PAGE_SIZE), 
   1.537 -                           csum_page(buf));
   1.538 -                    for ( v = 0; v < 4; v++ )
   1.539 -                    {
   1.540 -                        unsigned long *p = (unsigned long *)
   1.541 +
   1.542 +                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
   1.543 +                            "actualcs=%08lx\n", pfn, pfn_type[pfn], 
   1.544 +                            csum_page(region_base + i*PAGE_SIZE), 
   1.545 +                            csum_page(buf));
   1.546 +
   1.547 +                    for (v = 0; v < 4; v++) {
   1.548 +                        
   1.549 +                        unsigned long *p = (unsigned long *) 
   1.550                              (region_base + i*PAGE_SIZE);
   1.551 -                        if ( buf[v] != p[v] )
   1.552 -                            printf("    %d: %08lx %08lx\n",
   1.553 -                                   v, buf[v], p[v] );
   1.554 +                        if (buf[v] != p[v])
   1.555 +                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
   1.556                      }
   1.557                  }
   1.558              }
   1.559  
   1.560 -            if ( xc_add_mmu_update(xc_handle, mmu,
   1.561 -                                   (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
   1.562 -                                   pfn) )
   1.563 -            {
   1.564 -                printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
   1.565 +            if (xc_add_mmu_update(xc_handle, mmu, 
   1.566 +                                  (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
   1.567 +                                  pfn)) {
   1.568 +                ERR("machpys mfn=%ld pfn=%ld", mfn, pfn);
   1.569                  goto out;
   1.570              }
   1.571 -
   1.572          } /* end of 'batch' for loop */
   1.573  
   1.574 -        munmap( region_base, j*PAGE_SIZE );
   1.575 -        n+=j; /* crude stats */
   1.576 +        munmap(region_base, j*PAGE_SIZE);
   1.577 +        n+= j; /* crude stats */
   1.578      }
   1.579  
   1.580      DPRINTF("Received all pages\n");
   1.581  
   1.582 -    if ( pt_levels == 3 )
   1.583 -    {
   1.584 +    if (pt_levels == 3) {
   1.585 +
   1.586          /* Get all PGDs below 4GB. */
   1.587 -        for ( i = 0; i < nr_pfns; i++ )
   1.588 -        {
   1.589 -            if ( ((pfn_type[i] & LTABTYPE_MASK) == L3TAB) &&
   1.590 -                 (pfn_to_mfn_table[i] > 0xfffffUL) )
   1.591 -            {
   1.592 -                unsigned long new_mfn = xc_make_page_below_4G(
   1.593 -                    xc_handle, dom, pfn_to_mfn_table[i]);
   1.594 -                if ( new_mfn == 0 )
   1.595 -                {
   1.596 -                    fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
   1.597 +        for (i = 0; i < max_pfn; i++) {
   1.598 +            
   1.599 +            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
   1.600 +
   1.601 +                unsigned long new_mfn; 
   1.602 +
   1.603 +                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
   1.604 +                    ERR("Couldn't get a page below 4GB :-(");
   1.605                      goto out;
   1.606                  }
   1.607 -                pfn_to_mfn_table[i] = new_mfn;
   1.608 -                if ( xc_add_mmu_update(
   1.609 -                    xc_handle, mmu, (new_mfn << PAGE_SHIFT) |
   1.610 -                    MMU_MACHPHYS_UPDATE, i) )
   1.611 -                {
   1.612 -                    fprintf(stderr, "Couldn't m2p on PAE root pgdir\n");
   1.613 +                
   1.614 +                p2m[i] = new_mfn;
   1.615 +                if (xc_add_mmu_update(
   1.616 +                        xc_handle, mmu, 
   1.617 +                        (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
   1.618 +                    ERR("Couldn't m2p on PAE root pgdir");
   1.619                      goto out;
   1.620                  }
   1.621              }
   1.622          }
   1.623 +        
   1.624      }
   1.625  
   1.626 -    if ( xc_finish_mmu_updates(xc_handle, mmu) )
   1.627 +
   1.628 +    if (xc_finish_mmu_updates(xc_handle, mmu)) { 
   1.629 +        ERR("Error doing finish_mmu_updates()"); 
   1.630          goto out;
   1.631 +    } 
   1.632  
   1.633      /*
   1.634       * Pin page tables. Do this after writing to them as otherwise Xen
   1.635       * will barf when doing the type-checking.
   1.636       */
   1.637 -    for ( i = 0; i < nr_pfns; i++ )
   1.638 -    {
   1.639 +    for (i = 0; i < max_pfn; i++) {
   1.640 +
   1.641          if ( (pfn_type[i] & LPINTAB) == 0 )
   1.642              continue;
   1.643 -        if ( pfn_type[i] == (L1TAB|LPINTAB) )
   1.644 +        
   1.645 +        switch(pfn_type[i]) { 
   1.646 +
   1.647 +        case (L1TAB|LPINTAB): 
   1.648              pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
   1.649 -        else /* pfn_type[i] == (L2TAB|LPINTAB) */
   1.650 +            break; 
   1.651 +            
   1.652 +        case (L2TAB|LPINTAB): 
   1.653              pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
   1.654 -        pin[nr_pins].arg1.mfn = pfn_to_mfn_table[i];
   1.655 -        if ( ++nr_pins == MAX_PIN_BATCH )
   1.656 -        {
   1.657 -            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
   1.658 +            break; 
   1.659 +            
   1.660 +        case (L3TAB|LPINTAB): 
   1.661 +            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
   1.662 +            break; 
   1.663 +
   1.664 +        case (L4TAB|LPINTAB):
   1.665 +            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
   1.666 +            break; 
   1.667 +            
   1.668 +        default: 
   1.669 +            continue; 
   1.670 +        }
   1.671 +
   1.672 +        pin[nr_pins].arg1.mfn = p2m[i];
   1.673 +        
   1.674 +        if (++nr_pins == MAX_PIN_BATCH) {
   1.675 +            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) { 
   1.676 +                ERR("Failed to pin batch of %d page tables", nr_pins); 
   1.677                  goto out;
   1.678 +            } 
   1.679 +            DPRINTF("successfully pinned batch of %d page tables", nr_pins); 
   1.680              nr_pins = 0;
   1.681          }
   1.682      }
   1.683 -
   1.684 -    if ( (nr_pins != 0) &&
   1.685 -         (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
   1.686 -        goto out;
   1.687 +    
   1.688 +    if (nr_pins != 0) { 
   1.689 +        if((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) { 
   1.690 +            ERR("Failed (2) to pin batch of %d page tables", nr_pins); 
   1.691 +            DPRINTF("rc is %d\n", rc); 
   1.692 +            goto out;
   1.693 +        }
   1.694 +    }
   1.695  
   1.696      DPRINTF("\b\b\b\b100%%\n");
   1.697      DPRINTF("Memory reloaded.\n");
   1.698 @@ -445,111 +493,115 @@ int xc_linux_restore(int xc_handle, int 
   1.699          unsigned long *pfntab;
   1.700          int rc;
   1.701  
   1.702 -        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
   1.703 -        {
   1.704 +        if (!read_exact(io_fd, &count, sizeof(count))) { 
   1.705              ERR("Error when reading pfn count");
   1.706              goto out;
   1.707          }
   1.708  
   1.709 -        pfntab = malloc( sizeof(unsigned int) * count );
   1.710 -        if ( pfntab == NULL )
   1.711 -        {
   1.712 +        if(!(pfntab = malloc(sizeof(unsigned long) * count))) { 
   1.713              ERR("Out of memory");
   1.714              goto out;
   1.715          }
   1.716 -
   1.717 -        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
   1.718 -             sizeof(unsigned int)*count )
   1.719 -        {
   1.720 +        
   1.721 +        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) { 
   1.722              ERR("Error when reading pfntab");
   1.723              goto out;
   1.724          }
   1.725  
   1.726 -        for ( i = 0; i < count; i++ )
   1.727 -        {
   1.728 +        for (i = 0; i < count; i++) {
   1.729 +
   1.730              unsigned long pfn = pfntab[i];
   1.731 -            pfntab[i]=pfn_to_mfn_table[pfn];
   1.732 -            pfn_to_mfn_table[pfn] = 0x80000001;  // not in pmap
   1.733 +
   1.734 +            if(pfn > max_pfn) 
   1.735 +                /* shouldn't happen - continue optimistically */
   1.736 +                continue; 
   1.737 +
   1.738 +            pfntab[i] = p2m[pfn];
   1.739 +            p2m[pfn]  = 0x80000001;  // not in pmap
   1.740          }
   1.741 +        
   1.742 +        if (count > 0) {
   1.743  
   1.744 -        if ( count > 0 )
   1.745 -        {
   1.746              struct xen_memory_reservation reservation = {
   1.747                  .extent_start = pfntab,
   1.748                  .nr_extents   = count,
   1.749                  .extent_order = 0,
   1.750                  .domid        = dom
   1.751              };
   1.752 -            if ( (rc = xc_memory_op(xc_handle,
   1.753 -                                    XENMEM_decrease_reservation,
   1.754 -                                    &reservation)) != count )
   1.755 -            {
   1.756 -                ERR("Could not decrease reservation : %d",rc);
   1.757 +
   1.758 +            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
   1.759 +                                   &reservation)) != count) { 
   1.760 +                ERR("Could not decrease reservation : %d", rc);
   1.761                  goto out;
   1.762 -            }
   1.763 -            else
   1.764 -            {
   1.765 -                printf("Decreased reservation by %d pages\n", count);
   1.766 -            }
   1.767 +            } else
   1.768 +                DPRINTF("Decreased reservation by %d pages\n", count);
   1.769          } 
   1.770      }
   1.771  
   1.772 -    if ( read_exact(io_fd, &ctxt,            sizeof(ctxt)) != sizeof(ctxt) ||
   1.773 -         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
   1.774 -    {
   1.775 +    if (!read_exact(io_fd, &ctxt, sizeof(ctxt)) || 
   1.776 +        !read_exact(io_fd, shared_info_page, PAGE_SIZE)) { 
   1.777          ERR("Error when reading ctxt or shared info page");
   1.778          goto out;
   1.779      }
   1.780  
   1.781      /* Uncanonicalise the suspend-record frame number and poke resume rec. */
   1.782      pfn = ctxt.user_regs.edx;
   1.783 -    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
   1.784 -    {
   1.785 +    if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
   1.786          ERR("Suspend record frame number is bad");
   1.787          goto out;
   1.788      }
   1.789 -    ctxt.user_regs.edx = mfn = pfn_to_mfn_table[pfn];
   1.790 +    ctxt.user_regs.edx = mfn = p2m[pfn];
   1.791      start_info = xc_map_foreign_range(
   1.792          xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
   1.793 -    start_info->nr_pages    = nr_pfns;
   1.794 +    start_info->nr_pages    = max_pfn;
   1.795      start_info->shared_info = shared_info_frame << PAGE_SHIFT;
   1.796      start_info->flags       = 0;
   1.797 -    *store_mfn = start_info->store_mfn   =
   1.798 -        pfn_to_mfn_table[start_info->store_mfn];
   1.799 -    start_info->store_evtchn = store_evtchn;
   1.800 -    *console_mfn = start_info->console_mfn   =
   1.801 -        pfn_to_mfn_table[start_info->console_mfn];
   1.802 -    start_info->console_evtchn = console_evtchn;
   1.803 +    *store_mfn = start_info->store_mfn       = p2m[start_info->store_mfn];
   1.804 +    start_info->store_evtchn                 = store_evtchn;
   1.805 +    *console_mfn = start_info->console_mfn   = p2m[start_info->console_mfn];
   1.806 +    start_info->console_evtchn               = console_evtchn;
   1.807      munmap(start_info, PAGE_SIZE);
   1.808  
   1.809      /* Uncanonicalise each GDT frame number. */
   1.810 -    if ( ctxt.gdt_ents > 8192 )
   1.811 -    {
   1.812 +    if (ctxt.gdt_ents > 8192) {
   1.813          ERR("GDT entry count out of range");
   1.814          goto out;
   1.815      }
   1.816  
   1.817 -    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
   1.818 -    {
   1.819 +    for (i = 0; i < ctxt.gdt_ents; i += 512) {
   1.820          pfn = ctxt.gdt_frames[i];
   1.821 -        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
   1.822 -        {
   1.823 +        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
   1.824              ERR("GDT frame number is bad");
   1.825              goto out;
   1.826          }
   1.827 -        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
   1.828 +        ctxt.gdt_frames[i] = p2m[pfn];
   1.829      }
   1.830  
   1.831      /* Uncanonicalise the page table base pointer. */
   1.832      pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
   1.833 -    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
   1.834 -    {
   1.835 -        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
   1.836 -               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
   1.837 +
   1.838 +    if (pfn >= max_pfn) {
   1.839 +        DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx\n",
   1.840 +                pfn, max_pfn, pfn_type[pfn]); 
   1.841          ERR("PT base is bad.");
   1.842          goto out;
   1.843      }
   1.844 -    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;
   1.845 +
   1.846 +    if ((pt_levels == 2) && ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB)) { 
   1.847 +        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
   1.848 +                pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
   1.849 +        ERR("PT base is bad.");
   1.850 +        goto out;
   1.851 +    }
   1.852 +
   1.853 +    if ((pt_levels == 3) && ((pfn_type[pfn]&LTABTYPE_MASK) != L3TAB)) { 
   1.854 +        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
   1.855 +                pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
   1.856 +        ERR("PT base is bad.");
   1.857 +        goto out;
   1.858 +    }
   1.859 +    
   1.860 +    ctxt.ctrlreg[3] = p2m[pfn] << PAGE_SHIFT;
   1.861  
   1.862      /* clear any pending events and the selector */
   1.863      memset(&(shared_info->evtchn_pending[0]), 0,
   1.864 @@ -558,40 +610,31 @@ int xc_linux_restore(int xc_handle, int 
   1.865          shared_info->vcpu_data[i].evtchn_pending_sel = 0;
   1.866  
   1.867      /* Copy saved contents of shared-info page. No checking needed. */
   1.868 -    ppage = xc_map_foreign_range(
   1.869 +    page = xc_map_foreign_range(
   1.870          xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
   1.871 -    memcpy(ppage, shared_info, sizeof(shared_info_t));
   1.872 -    munmap(ppage, PAGE_SIZE);
   1.873 -
   1.874 +    memcpy(page, shared_info, sizeof(shared_info_t));
   1.875 +    munmap(page, PAGE_SIZE);
   1.876 +    
   1.877      /* Uncanonicalise the pfn-to-mfn table frame-number list. */
   1.878 -    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
   1.879 -    {
   1.880 -        unsigned long pfn, mfn;
   1.881 -
   1.882 -        pfn = pfn_to_mfn_frame_list[i];
   1.883 -        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
   1.884 -        {
   1.885 +    for (i = 0; i < P2M_FL_ENTRIES; i++) {
   1.886 +        pfn = p2m_frame_list[i];
   1.887 +        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
   1.888              ERR("PFN-to-MFN frame number is bad");
   1.889              goto out;
   1.890          }
   1.891 -        mfn = pfn_to_mfn_table[pfn];
   1.892 -        pfn_to_mfn_frame_list[i] = mfn;
   1.893 +
   1.894 +        p2m_frame_list[i] = p2m[pfn];
   1.895      }
   1.896      
   1.897 -    if ( (live_pfn_to_mfn_table = 
   1.898 -          xc_map_foreign_batch(xc_handle, dom, 
   1.899 -                               PROT_WRITE,
   1.900 -                               pfn_to_mfn_frame_list,
   1.901 -                               (nr_pfns+1023)/1024 )) == 0 )
   1.902 -    {
   1.903 -        ERR("Couldn't map pfn_to_mfn table");
   1.904 +    /* Copy the P2M we've constructed to the 'live' P2M */
   1.905 +    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE, 
   1.906 +                                          p2m_frame_list, P2M_FL_ENTRIES))) {
   1.907 +        ERR("Couldn't map p2m table");
   1.908          goto out;
   1.909      }
   1.910  
   1.911 -    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table, 
   1.912 -           nr_pfns*sizeof(unsigned long) );
   1.913 -
   1.914 -    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);
   1.915 +    memcpy(live_p2m, p2m, P2M_SIZE); 
   1.916 +    munmap(live_p2m, P2M_SIZE); 
   1.917  
   1.918      /*
   1.919       * Safety checking of saved context:
   1.920 @@ -605,25 +648,23 @@ int xc_linux_restore(int xc_handle, int 
   1.921       *  8. debugregs are checked by Xen.
   1.922       *  9. callback code selectors need checking.
   1.923       */
   1.924 -    for ( i = 0; i < 256; i++ )
   1.925 -    {
   1.926 +    for ( i = 0; i < 256; i++ ) {
   1.927          ctxt.trap_ctxt[i].vector = i;
   1.928 -        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
   1.929 +        if ((ctxt.trap_ctxt[i].cs & 3) == 0)
   1.930              ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
   1.931      }
   1.932 -    if ( (ctxt.kernel_ss & 3) == 0 )
   1.933 +    if ((ctxt.kernel_ss & 3) == 0)
   1.934          ctxt.kernel_ss = FLAT_KERNEL_DS;
   1.935  #if defined(__i386__)
   1.936 -    if ( (ctxt.event_callback_cs & 3) == 0 )
   1.937 +    if ((ctxt.event_callback_cs & 3) == 0)
   1.938          ctxt.event_callback_cs = FLAT_KERNEL_CS;
   1.939 -    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
   1.940 +    if ((ctxt.failsafe_callback_cs & 3) == 0)
   1.941          ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
   1.942  #endif
   1.943 -    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
   1.944 -         (ctxt.ldt_ents > 8192) ||
   1.945 -         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
   1.946 -         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
   1.947 -    {
   1.948 +    if (((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
   1.949 +        (ctxt.ldt_ents > 8192) ||
   1.950 +        (ctxt.ldt_base > hvirt_start) ||
   1.951 +        ((ctxt.ldt_base + ctxt.ldt_ents*8) > hvirt_start)) {
   1.952          ERR("Bad LDT base or size");
   1.953          goto out;
   1.954      }
   1.955 @@ -636,8 +677,7 @@ int xc_linux_restore(int xc_handle, int 
   1.956      op.u.setdomaininfo.ctxt   = &ctxt;
   1.957      rc = xc_dom0_op(xc_handle, &op);
   1.958  
   1.959 -    if ( rc != 0 )
   1.960 -    {
   1.961 +    if (rc != 0) {
   1.962          ERR("Couldn't build the domain");
   1.963          goto out;
   1.964      }
   1.965 @@ -646,9 +686,10 @@ int xc_linux_restore(int xc_handle, int 
   1.966      if ( (rc != 0) && (dom != 0) )
   1.967          xc_domain_destroy(xc_handle, dom);
   1.968      free(mmu);
   1.969 -    free(pfn_to_mfn_table);
   1.970 +    free(p2m);
   1.971      free(pfn_type);
   1.972  
   1.973      DPRINTF("Restore exit with rc=%d\n", rc);
   1.974 +
   1.975      return rc;
   1.976  }
     2.1 --- a/tools/libxc/xc_linux_save.c	Tue Nov 08 18:39:58 2005 +0100
     2.2 +++ b/tools/libxc/xc_linux_save.c	Tue Nov 08 18:42:07 2005 +0100
     2.3 @@ -13,10 +13,7 @@
     2.4  #include <sys/time.h>
     2.5  
     2.6  #include "xg_private.h"
     2.7 -
     2.8 -#define BATCH_SIZE 1024   /* 1024 pages (4MB) at a time */
     2.9 -
    2.10 -#define MAX_MBIT_RATE 500
    2.11 +#include "xg_save_restore.h"
    2.12  
    2.13  /*
    2.14  ** Default values for important tuning parameters. Can override by passing
    2.15 @@ -25,75 +22,77 @@
    2.16  ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too. 
    2.17  ** 
    2.18  */
    2.19 -#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */ 
    2.20 -#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */
    2.21 -
    2.22 -/* Flags to control behaviour of xc_linux_save */
    2.23 -#define XCFLAGS_LIVE      1
    2.24 -#define XCFLAGS_DEBUG     2
    2.25 -
    2.26 -#define DEBUG 0
    2.27 +#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */ 
    2.28 +#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
    2.29  
    2.30 -#if 1
    2.31 -#define ERR(_f, _a...) do { fprintf(stderr, _f "\n" , ## _a); fflush(stderr); } while (0)
    2.32 -#else
    2.33 -#define ERR(_f, _a...) ((void)0)
    2.34 -#endif
    2.35  
    2.36 -#if DEBUG
    2.37 -#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
    2.38 -#else
    2.39 -#define DPRINTF(_f, _a...) ((void)0)
    2.40 -#endif
    2.41 +/* max mfn of the whole machine */
    2.42 +static uint32_t max_mfn; 
    2.43  
    2.44 -#define PROGRESS 0
    2.45 -#if PROGRESS
    2.46 -#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
    2.47 -#else
    2.48 -#define PPRINTF(_f, _a...)
    2.49 -#endif
    2.50 +/* virtual starting address of the hypervisor */
    2.51 +static uint32_t hvirt_start; 
    2.52 +
    2.53 +/* #levels of page tables used by the current guest */
    2.54 +static uint32_t pt_levels; 
    2.55 +
    2.56 +/* total number of pages used by the current guest */
    2.57 +static unsigned long max_pfn;
    2.58 +
    2.59 +/* Live mapping of the table mapping each PFN to its current MFN. */
    2.60 +static unsigned long *live_p2m = NULL;
    2.61 +
    2.62 +/* Live mapping of system MFN to PFN table. */
    2.63 +static unsigned long *live_m2p = NULL;
    2.64 +
    2.65  
    2.66  /*
    2.67   * Returns TRUE if the given machine frame number has a unique mapping
    2.68   * in the guest's pseudophysical map.
    2.69   */
    2.70 -
    2.71 -#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    2.72 -    (((_mfn) < (1024*1024)) &&                                            \
    2.73 -     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                         \
    2.74 -       (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))
    2.75 -
    2.76 +#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    2.77 +(((_mfn) < (max_mfn)) &&                        \
    2.78 + ((live_m2p[_mfn] < (max_pfn)) &&               \
    2.79 +  (live_p2m[live_m2p[_mfn]] == (_mfn))))
    2.80 +    
    2.81   
    2.82  /* Returns TRUE if MFN is successfully converted to a PFN. */
    2.83 -#define translate_mfn_to_pfn(_pmfn)            \
    2.84 -({                                             \
    2.85 -    unsigned long mfn = *(_pmfn);              \
    2.86 -    int _res = 1;                              \
    2.87 -    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )      \
    2.88 -        _res = 0;                              \
    2.89 -    else                                       \
    2.90 -        *(_pmfn) = live_mfn_to_pfn_table[mfn]; \
    2.91 -    _res;                                      \
    2.92 +#define translate_mfn_to_pfn(_pmfn)                             \
    2.93 +({                                                              \
    2.94 +    unsigned long mfn = *(_pmfn);                               \
    2.95 +    int _res = 1;                                               \
    2.96 +    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
    2.97 +        _res = 0;                                               \
    2.98 +    else                                                        \
    2.99 +        *(_pmfn) = live_m2p[mfn];                               \
   2.100 +    _res;                                                       \
   2.101  })
   2.102  
   2.103 -#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
   2.104 +/* 
   2.105 +** During (live) save/migrate, we maintain a number of bitmaps to track 
   2.106 +** which pages we have to send, to fixup, and to skip. 
   2.107 +*/
   2.108  
   2.109 -static inline int test_bit ( int nr, volatile void * addr)
   2.110 +#define BITS_PER_LONG (sizeof(unsigned long) * 8) 
   2.111 +#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / BITS_PER_LONG)
   2.112 +
   2.113 +#define BITMAP_ENTRY(_nr,_bmap) \
   2.114 +   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
   2.115 +
   2.116 +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
   2.117 +
   2.118 +static inline int test_bit (int nr, volatile void * addr)
   2.119  {
   2.120 -    return (((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> 
   2.121 -            (nr % (sizeof(unsigned long)*8))) & 1;
   2.122 +    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; 
   2.123  }
   2.124  
   2.125 -static inline void clear_bit ( int nr, volatile void * addr)
   2.126 +static inline void clear_bit (int nr, volatile void * addr)
   2.127  {
   2.128 -    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &= 
   2.129 -        ~(1 << (nr % (sizeof(unsigned long)*8) ) );
   2.130 +    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr)); /* 1UL: the mask must be as wide as the unsigned long bitmap word */
   2.131  }
   2.132  
   2.133  static inline void set_bit ( int nr, volatile void * addr)
   2.134  {
   2.135 -    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |= 
   2.136 -        (1 << (nr % (sizeof(unsigned long)*8) ) );
   2.137 +    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr)); /* 1UL: the mask must be as wide as the unsigned long bitmap word */
   2.138  }
   2.139  
   2.140  /* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
   2.141 @@ -142,102 +141,106 @@ static inline int permute( int i, int nr
   2.142  
   2.143      do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
   2.144      while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
   2.145 -
   2.146 +    
   2.147      return i;
   2.148  }
   2.149  
   2.150 -static long long tv_to_us( struct timeval *new )
   2.151 +
   2.152 +
   2.153 +
   2.154 +static uint64_t tv_to_us(struct timeval *new)
   2.155  {
   2.156      return (new->tv_sec * 1000000) + new->tv_usec;
   2.157  }
   2.158  
   2.159 -static long long llgettimeofday( void )
   2.160 +static uint64_t llgettimeofday(void)
   2.161  {
   2.162      struct timeval now;
   2.163      gettimeofday(&now, NULL);
   2.164      return tv_to_us(&now);
   2.165  }
   2.166  
   2.167 -static long long tv_delta( struct timeval *new, struct timeval *old )
   2.168 +static uint64_t tv_delta(struct timeval *new, struct timeval *old)
   2.169  {
   2.170      return ((new->tv_sec - old->tv_sec)*1000000 ) + 
   2.171          (new->tv_usec - old->tv_usec);
   2.172  }
   2.173  
   2.174  
   2.175 -#define START_MBIT_RATE 0 //ioctxt->resource
   2.176 +#ifdef ADAPTIVE_SAVE
   2.177  
   2.178 -static int mbit_rate, ombit_rate = 0;
   2.179 -static int burst_time_us = -1;
   2.180  
   2.181 -#define MBIT_RATE mbit_rate
   2.182 +/*
   2.183 +** We control the rate at which we transmit (or save) to minimize impact
   2.184 +** on running domains (including the target if we're doing live migrate). 
   2.185 +*/
   2.186 +
   2.187 +#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
   2.188 +#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
   2.189 +
   2.190 +
   2.191 +/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
   2.192 +#define RATE_TO_BTU      781250
   2.193 +
   2.194 +/* Amount in bytes we allow ourselves to send in a burst */
   2.195  #define BURST_BUDGET (100*1024)
   2.196  
   2.197 -/* 
   2.198 -   1000000/((100)*1024*1024/8/(100*1024))
   2.199 -   7812
   2.200 -   1000000/((100)*1024/8/(100))
   2.201 -   7812
   2.202 -   1000000/((100)*128/(100))
   2.203 -   7812
   2.204 -   100000000/((100)*128)
   2.205 -   7812
   2.206 -   100000000/128
   2.207 -   781250
   2.208 - */
   2.209 -#define RATE_TO_BTU 781250
   2.210 -#define BURST_TIME_US burst_time_us
   2.211  
   2.212 -static int
   2.213 -ratewrite(int io_fd, void *buf, int n)
   2.214 +/* We keep track of the current and previous transmission rate */
   2.215 +static int mbit_rate, ombit_rate = 0;
   2.216 +
   2.217 +/* Have we reached the maximum transmission rate? */
   2.218 +#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE) 
   2.219 +
   2.220 +
   2.221 +static inline void initialize_mbit_rate(void) /* reset the transmit rate to its starting value */
   2.222 +{
   2.223 +    mbit_rate = START_MBIT_RATE;
   2.224 +}
   2.225 +
   2.226 +
   2.227 +static int ratewrite(int io_fd, void *buf, int n)
   2.228  {
   2.229      static int budget = 0;
   2.230 +    static int burst_time_us = -1;
   2.231      static struct timeval last_put = { 0 };
   2.232      struct timeval now;
   2.233      struct timespec delay;
   2.234      long long delta;
   2.235  
   2.236 -    if ( START_MBIT_RATE == 0 )
   2.237 +    if (START_MBIT_RATE == 0)
   2.238          return write(io_fd, buf, n);
   2.239      
   2.240      budget -= n;
   2.241 -    if ( budget < 0 )
   2.242 -    {
   2.243 -        if ( MBIT_RATE != ombit_rate )
   2.244 -        {
   2.245 -            BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
   2.246 -            ombit_rate = MBIT_RATE;
   2.247 +    if (budget < 0) {
   2.248 +        if (mbit_rate != ombit_rate) {
   2.249 +            burst_time_us = RATE_TO_BTU / mbit_rate;
   2.250 +            ombit_rate = mbit_rate;
   2.251              DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
   2.252 -                    MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
   2.253 +                    mbit_rate, BURST_BUDGET, burst_time_us);
   2.254          }
   2.255 -        if ( last_put.tv_sec == 0 )
   2.256 -        {
   2.257 +        if (last_put.tv_sec == 0) {
   2.258              budget += BURST_BUDGET;
   2.259              gettimeofday(&last_put, NULL);
   2.260 -        }
   2.261 -        else
   2.262 -        {
   2.263 -            while ( budget < 0 )
   2.264 -            {
   2.265 +        } else {
   2.266 +            while (budget < 0) {
   2.267                  gettimeofday(&now, NULL);
   2.268                  delta = tv_delta(&now, &last_put);
   2.269 -                while ( delta > BURST_TIME_US )
   2.270 -                {
   2.271 +                while (delta > burst_time_us) {
   2.272                      budget += BURST_BUDGET;
   2.273 -                    last_put.tv_usec += BURST_TIME_US;
   2.274 -                    if ( last_put.tv_usec > 1000000 )
   2.275 -                    {
   2.276 +                    last_put.tv_usec += burst_time_us;
   2.277 +                    if (last_put.tv_usec > 1000000) {
   2.278                          last_put.tv_usec -= 1000000;
   2.279                          last_put.tv_sec++;
   2.280                      }
   2.281 -                    delta -= BURST_TIME_US;
   2.282 +                    delta -= burst_time_us;
   2.283                  }
   2.284 -                if ( budget > 0 )
   2.285 +                if (budget > 0)
   2.286                      break;
   2.287                  delay.tv_sec = 0;
   2.288 -                delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
   2.289 -                while ( delay.tv_nsec > 0 )
   2.290 -                    if ( nanosleep(&delay, &delay) == 0 )
   2.291 +                delay.tv_nsec = 1000 * (burst_time_us - delta);
   2.292 +                while (delay.tv_nsec > 0)
   2.293 +                    if (nanosleep(&delay, &delay) == 0)
   2.294                          break;
   2.295              }
   2.296          }
   2.297 @@ -245,35 +248,52 @@ ratewrite(int io_fd, void *buf, int n)
   2.298      return write(io_fd, buf, n);
   2.299  }
   2.300  
   2.301 -static int print_stats( int xc_handle, uint32_t domid, 
   2.302 -                        int pages_sent, xc_shadow_control_stats_t *stats,
   2.303 -                        int print )
   2.304 +#else /* ! ADAPTIVE SAVE */
   2.305 +
   2.306 +#define RATE_IS_MAX() (0) 
   2.307 +#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n)) 
   2.308 +#define initialize_mbit_rate() 
   2.309 +
   2.310 +#endif
   2.311 +
   2.312 +
   2.313 +static inline ssize_t write_exact(int fd, void *buf, size_t count) /* write all count bytes of buf to fd; returns 1 on success, 0 on failure */
   2.314 +{
   2.315 +    ssize_t n = 0; /* bytes accepted by the most recent write() */
   2.316 +    for (; count && (n = write(fd, buf, count)) > 0; count -= (size_t)n) buf = (char *)buf + n; /* resume after a short write */
   2.317 +    return count == 0; /* 1 only once every byte has been written */
   2.318 +} 
   2.319 +
   2.320 +
   2.321 +
   2.322 +static int print_stats(int xc_handle, uint32_t domid, int pages_sent, 
   2.323 +                       xc_shadow_control_stats_t *stats, int print)
   2.324  {
   2.325      static struct timeval wall_last;
   2.326      static long long      d0_cpu_last;
   2.327      static long long      d1_cpu_last;
   2.328 -
   2.329 +    
   2.330      struct timeval        wall_now;
   2.331      long long             wall_delta;
   2.332      long long             d0_cpu_now, d0_cpu_delta;
   2.333      long long             d1_cpu_now, d1_cpu_delta;
   2.334 -
   2.335 +    
   2.336      gettimeofday(&wall_now, NULL);
   2.337 -
   2.338 +    
   2.339      d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   2.340      d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   2.341  
   2.342      if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 
   2.343          fprintf(stderr, "ARRHHH!!\n");
   2.344 -
   2.345 +    
   2.346      wall_delta = tv_delta(&wall_now,&wall_last)/1000;
   2.347 +    
   2.348 +    if (wall_delta == 0) wall_delta = 1;
   2.349 +    
   2.350 +    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
   2.351 +    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
   2.352  
   2.353 -    if ( wall_delta == 0 ) wall_delta = 1;
   2.354 -
   2.355 -    d0_cpu_delta  = (d0_cpu_now - d0_cpu_last)/1000;
   2.356 -    d1_cpu_delta  = (d1_cpu_now - d1_cpu_last)/1000;
   2.357 -
   2.358 -    if ( print )
   2.359 +    if (print)
   2.360          fprintf(stderr,
   2.361                  "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
   2.362                  "dirtied %dMb/s %" PRId32 " pages\n",
   2.363 @@ -284,23 +304,25 @@ static int print_stats( int xc_handle, u
   2.364                  (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
   2.365                  stats->dirty_count);
   2.366  
   2.367 -    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
   2.368 -    {
   2.369 +#ifdef ADAPTIVE_SAVE    
   2.370 +    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
   2.371          mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
   2.372              + 50;
   2.373          if (mbit_rate > MAX_MBIT_RATE)
   2.374              mbit_rate = MAX_MBIT_RATE;
   2.375      }
   2.376 -
   2.377 -    d0_cpu_last  = d0_cpu_now;
   2.378 -    d1_cpu_last  = d1_cpu_now;
   2.379 -    wall_last = wall_now; 
   2.380 +#endif
   2.381 +    
   2.382 +    d0_cpu_last = d0_cpu_now;
   2.383 +    d1_cpu_last = d1_cpu_now;
   2.384 +    wall_last   = wall_now; 
   2.385  
   2.386      return 0;
   2.387  }
   2.388  
   2.389 -static int analysis_phase( int xc_handle, uint32_t domid, 
   2.390 -                           int nr_pfns, unsigned long *arr, int runs )
   2.391 +
   2.392 +static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn, 
   2.393 +                          unsigned long *arr, int runs)
   2.394  {
   2.395      long long start, now;
   2.396      xc_shadow_control_stats_t stats;
   2.397 @@ -308,22 +330,18 @@ static int analysis_phase( int xc_handle
   2.398  
   2.399      start = llgettimeofday();
   2.400  
   2.401 -    for ( j = 0; j < runs; j++ )
   2.402 -    {
   2.403 +    for (j = 0; j < runs; j++) {
   2.404          int i;
   2.405 -
   2.406 -        xc_shadow_control( xc_handle, domid, 
   2.407 -                           DOM0_SHADOW_CONTROL_OP_CLEAN,
   2.408 -                           arr, nr_pfns, NULL);
   2.409 +        
   2.410 +        xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
   2.411 +                          arr, max_pfn, NULL);
   2.412          fprintf(stderr, "#Flush\n");
   2.413 -        for ( i = 0; i < 40; i++ )
   2.414 -        {     
   2.415 +        for ( i = 0; i < 40; i++ ) {     
   2.416              usleep(50000);     
   2.417              now = llgettimeofday();
   2.418 -            xc_shadow_control( xc_handle, domid, 
   2.419 -                               DOM0_SHADOW_CONTROL_OP_PEEK,
   2.420 -                               NULL, 0, &stats);
   2.421 -
   2.422 +            xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
   2.423 +                              NULL, 0, &stats);
   2.424 +            
   2.425              fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
   2.426                      " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", 
   2.427                      ((now-start)+500)/1000, 
   2.428 @@ -331,7 +349,7 @@ static int analysis_phase( int xc_handle
   2.429                      stats.dirty_net_count, stats.dirty_block_count);
   2.430          }
   2.431      }
   2.432 -
   2.433 +    
   2.434      return -1;
   2.435  }
   2.436  
   2.437 @@ -345,67 +363,150 @@ static int suspend_and_state(int xc_hand
   2.438  
   2.439      printf("suspend\n");
   2.440      fflush(stdout);
   2.441 -    if ( fgets(ans, sizeof(ans), stdin) == NULL )
   2.442 -    {
   2.443 +    if (fgets(ans, sizeof(ans), stdin) == NULL) {
   2.444          ERR("failed reading suspend reply");
   2.445          return -1;
   2.446      }
   2.447 -    if ( strncmp(ans, "done\n", 5) )
   2.448 -    {
   2.449 +    if (strncmp(ans, "done\n", 5)) {
   2.450          ERR("suspend reply incorrect: %s", ans);
   2.451          return -1;
   2.452      }
   2.453  
   2.454   retry:
   2.455  
   2.456 -    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
   2.457 -    {
   2.458 +    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
   2.459          ERR("Could not get domain info");
   2.460          return -1;
   2.461      }
   2.462  
   2.463 -    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, 
   2.464 -                                    ctxt) )
   2.465 -    {
   2.466 +    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt)) 
   2.467          ERR("Could not get vcpu context");
   2.468 -    }
   2.469  
   2.470 -    if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
   2.471 -    {
   2.472 +
   2.473 +    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
   2.474          return 0; // success
   2.475 -    }
   2.476  
   2.477 -    if ( info->paused )
   2.478 -    {
   2.479 +    if (info->paused) {
   2.480          // try unpausing domain, wait, and retest 
   2.481          xc_domain_unpause( xc_handle, dom );
   2.482 -
   2.483 +        
   2.484          ERR("Domain was paused. Wait and re-test.");
   2.485          usleep(10000);  // 10ms
   2.486 -
   2.487 +        
   2.488          goto retry;
   2.489      }
   2.490  
   2.491  
   2.492 -    if( ++i < 100 )
   2.493 -    {
   2.494 +    if( ++i < 100 ) {
   2.495          ERR("Retry suspend domain.");
   2.496          usleep(10000);  // 10ms 
   2.497          goto retry;
   2.498      }
   2.499 -
   2.500 +    
   2.501      ERR("Unable to suspend domain.");
   2.502  
   2.503      return -1;
   2.504  }
   2.505  
   2.506 +
   2.507 +/*
   2.508 +** During transfer (or in the state file), all page-table pages must be  
   2.509 +** converted into a 'canonical' form where references to actual mfns 
   2.510 +** are replaced with references to the corresponding pfns. 
   2.511 +**
   2.512 +** This function performs the appropriate conversion, taking into account 
   2.513 +** which entries do not require canonicalization (in particular, those 
   2.514 +** entries which map the virtual address reserved for the hypervisor). 
   2.515 +*/
   2.516 +void canonicalize_pagetable(unsigned long type, unsigned long pfn, 
   2.517 +                             const void *spage, void *dpage) 
   2.518 +{ 
   2.519 +    
   2.520 +    int i, pte_last, xen_start, xen_end;
   2.521 +    uint64_t pte;
   2.522 +
   2.523 +    /* 
   2.524 +    ** We need to determine which entries in this page table hold
   2.525 +    ** reserved hypervisor mappings. This depends on the current
   2.526 +    ** page table type as well as the number of paging levels. 
   2.527 +    */
   2.528 +    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
   2.529 +    
   2.530 +    if (pt_levels == 2 && type == L2TAB)
   2.531 +        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
   2.532 +
   2.533 +    if (pt_levels == 3 && type == L3TAB) 
   2.534 +        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
   2.535 +        
   2.536 +    /* 
   2.537 +    ** in PAE only the L2 mapping the top 1GB contains Xen mappings. 
   2.538 +    ** We can spot this by looking for the guest linear mapping which
   2.539 +    ** Xen always ensures is present in that L2. Guests must ensure 
   2.540 +    ** that this check will fail for other L2s. 
   2.541 +    */
   2.542 +    if (pt_levels == 3 && type == L2TAB) {
   2.543 +
   2.544 +/* XXX index of the L2 entry in PAE mode which holds the guest LPT */
   2.545 +#define PAE_GLPT_L2ENTRY (495) 
   2.546 +        pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY]; 
   2.547 +
   2.548 +        if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
   2.549 +            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; 
   2.550 +    }
   2.551 +
   2.552 +    /* Now iterate through the page table, canonicalizing each PTE */
   2.553 +    for (i = 0; i < pte_last; i++ ) {
   2.554 +
   2.555 +        unsigned long pfn, mfn; 
   2.556 +        
   2.557 +        if (pt_levels == 2)
   2.558 +            pte = ((uint32_t*)spage)[i];
   2.559 +        else
   2.560 +            pte = ((uint64_t*)spage)[i];
   2.561 +        
   2.562 +        if (i >= xen_start && i < xen_end)
   2.563 +            pte = 0;
   2.564 +        
   2.565 +        if (pte & _PAGE_PRESENT) {
   2.566 +            
   2.567 +            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;      
   2.568 +            pfn = live_m2p[mfn];
   2.569 +            
   2.570 +            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
   2.571 +                /* I don't think this should ever happen */
   2.572 +                DPRINTF("FNI: [%08lx,%d] pte=%llx,"
   2.573 +                        " mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
   2.574 +                        type, i, (uint64_t)pte, mfn, 
   2.575 +                        live_m2p[mfn],
   2.576 +                        (live_m2p[mfn] < max_pfn) ? 
   2.577 +                        live_p2m[live_m2p[mfn]] : 0xdeadbeaf);
   2.578 +                
   2.579 +                pfn = 0; /* be suspicious */
   2.580 +            }
   2.581 +            
   2.582 +            pte &= 0xffffff0000000fffULL;
   2.583 +            pte |= (uint64_t)pfn << PAGE_SHIFT;
   2.584 +        }
   2.585 +        
   2.586 +        if (pt_levels == 2)
   2.587 +            ((uint32_t*)dpage)[i] = pte;
   2.588 +        else
   2.589 +            ((uint64_t*)dpage)[i] = pte;		       
   2.590 +        
   2.591 +    } 
   2.592 +    
   2.593 +    return; 
   2.594 +}
   2.595 +
   2.596 +
   2.597 +
   2.598 +
   2.599  int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 
   2.600                    uint32_t max_factor, uint32_t flags)
   2.601  {
   2.602      xc_dominfo_t info;
   2.603  
   2.604 -    int rc = 1, i, j, k, last_iter, iter = 0;
   2.605 -    unsigned long mfn;
   2.606 +    int rc = 1, i, j, last_iter, iter = 0;
   2.607      int live  = (flags & XCFLAGS_LIVE); 
   2.608      int debug = (flags & XCFLAGS_DEBUG); 
   2.609      int sent_last_iter, skip_this_iter;
   2.610 @@ -421,18 +522,16 @@ int xc_linux_save(int xc_handle, int io_
   2.611      unsigned long *pfn_batch = NULL;
   2.612  
   2.613      /* A temporary mapping, and a copy, of one frame of guest memory. */
   2.614 -    unsigned long page[1024];
   2.615 +    char page[PAGE_SIZE]; 
   2.616 +
   2.617 +    /* Double and single indirect references to the live P2M table */
   2.618 +    unsigned long *live_p2m_frame_list_list = NULL;
   2.619 +    unsigned long *live_p2m_frame_list = NULL;
   2.620  
   2.621      /* A copy of the pfn-to-mfn table frame list. */
   2.622 -    unsigned long *live_pfn_to_mfn_frame_list_list = NULL;
   2.623 -    unsigned long *live_pfn_to_mfn_frame_list = NULL;
   2.624 -    unsigned long pfn_to_mfn_frame_list[1024];
   2.625 +    unsigned long *p2m_frame_list = NULL;
   2.626  
   2.627 -    /* Live mapping of the table mapping each PFN to its current MFN. */
   2.628 -    unsigned long *live_pfn_to_mfn_table = NULL;
   2.629 -    /* Live mapping of system MFN to PFN table. */
   2.630 -    unsigned long *live_mfn_to_pfn_table = NULL;
   2.631 -    unsigned long mfn_to_pfn_table_start_mfn;
   2.632 +    unsigned long m2p_start_mfn;
   2.633      
   2.634      /* Live mapping of shared info structure */
   2.635      shared_info_t *live_shinfo = NULL;
   2.636 @@ -440,10 +539,9 @@ int xc_linux_save(int xc_handle, int io_
   2.637      /* base of the region in which domain memory is mapped */
   2.638      unsigned char *region_base = NULL;
   2.639  
   2.640 -    /* number of pages we're dealing with */
   2.641 -    unsigned long nr_pfns;
   2.642  
   2.643 -    /* power of 2 order of nr_pfns */
   2.644 +    
   2.645 +    /* power of 2 order of max_pfn */
   2.646      int order_nr; 
   2.647  
   2.648      /* bitmap of pages:
   2.649 @@ -454,207 +552,197 @@ int xc_linux_save(int xc_handle, int io_
   2.650      
   2.651      xc_shadow_control_stats_t stats;
   2.652  
   2.653 -    int needed_to_fix = 0;
   2.654 -    int total_sent    = 0;
   2.655 -
   2.656 -    MBIT_RATE = START_MBIT_RATE;
   2.657 +    unsigned long needed_to_fix = 0;
   2.658 +    unsigned long total_sent    = 0;
   2.659  
   2.660  
   2.661      /* If no explicit control parameters given, use defaults */
   2.662 -    if( !max_iters ) 
   2.663 +    if(!max_iters) 
   2.664          max_iters = DEF_MAX_ITERS; 
   2.665 -    if( !max_factor ) 
   2.666 +    if(!max_factor) 
   2.667          max_factor = DEF_MAX_FACTOR; 
   2.668 +    
   2.669 +    initialize_mbit_rate(); 
   2.670  
   2.671 +    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live ? 
   2.672 +            "true" : "false"); 
   2.673  
   2.674 -    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false"); 
   2.675 +    if(!get_platform_info(xc_handle, dom, 
   2.676 +                          &max_mfn, &hvirt_start, &pt_levels)) {
   2.677 +        ERR("Unable to get platform info."); 
   2.678 +        return 1;
   2.679 +    }
   2.680  
   2.681 -    if ( mlock(&ctxt, sizeof(ctxt)) ) 
   2.682 -    {
   2.683 +    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
   2.684 +        ERR("Could not get domain info");
   2.685 +        return 1; 
   2.686 +    }
   2.687 +
   2.688 +    if (mlock(&ctxt, sizeof(ctxt))) {
   2.689          ERR("Unable to mlock ctxt");
   2.690          return 1;
   2.691      }
   2.692      
   2.693 -    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
   2.694 -    {
   2.695 -        ERR("Could not get domain info");
   2.696 -        goto out;
   2.697 -    }
   2.698 -    if ( xc_domain_get_vcpu_context(xc_handle, dom, /* FIXME */ 0, &ctxt) )
   2.699 -    {
   2.700 +    /* Only have to worry about vcpu 0 even for SMP */
   2.701 +    if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
   2.702          ERR("Could not get vcpu context");
   2.703          goto out;
   2.704      }
   2.705      shared_info_frame = info.shared_info_frame;
   2.706  
   2.707      /* A cheesy test to see whether the domain contains valid state. */
   2.708 -    if ( ctxt.ctrlreg[3] == 0 )
   2.709 +    if (ctxt.ctrlreg[3] == 0)
   2.710      {
   2.711          ERR("Domain is not in a valid Linux guest OS state");
   2.712          goto out;
   2.713      }
   2.714 -    
   2.715 -    nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
   2.716 -
   2.717 -    /* cheesy sanity check */
   2.718 -    if ( nr_pfns > 1024*1024 )
   2.719 -    {
   2.720 -        ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
   2.721 +  
   2.722 +   /* cheesy sanity check */
   2.723 +    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
   2.724 +        ERR("Invalid state record -- pfn count out of range: %lu", 
   2.725 +            (info.max_memkb >> (PAGE_SHIFT - 10))); 
   2.726          goto out;
   2.727 -    }
   2.728 -
   2.729 +     }
   2.730 + 
   2.731      /* Map the shared info frame */
   2.732 -    live_shinfo = xc_map_foreign_range(
   2.733 -        xc_handle, dom, PAGE_SIZE, PROT_READ, shared_info_frame);
   2.734 -    if ( !live_shinfo )
   2.735 -    {
   2.736 +    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
   2.737 +                                            PROT_READ, shared_info_frame))) {
   2.738          ERR("Couldn't map live_shinfo");
   2.739          goto out;
   2.740      }
   2.741  
   2.742 -    live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(
   2.743 -        xc_handle, dom,
   2.744 -        PAGE_SIZE, PROT_READ, live_shinfo->arch.pfn_to_mfn_frame_list_list);
   2.745 +    max_pfn = live_shinfo->arch.max_pfn;
   2.746  
   2.747 -    if (!live_pfn_to_mfn_frame_list_list){
   2.748 -        ERR("Couldn't map pfn_to_mfn_frame_list_list");
   2.749 +    live_p2m_frame_list_list = 
   2.750 +        xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, 
   2.751 +                             live_shinfo->arch.pfn_to_mfn_frame_list_list);
   2.752 +
   2.753 +    if (!live_p2m_frame_list_list) {
   2.754 +        ERR("Couldn't map p2m_frame_list_list");
   2.755          goto out;
   2.756      }
   2.757  
   2.758 -    live_pfn_to_mfn_frame_list = 
   2.759 -        xc_map_foreign_batch(xc_handle, dom, 
   2.760 -                             PROT_READ,
   2.761 -                             live_pfn_to_mfn_frame_list_list,
   2.762 -                             (nr_pfns+(1024*1024)-1)/(1024*1024) );
   2.763 -
   2.764 -    if ( !live_pfn_to_mfn_frame_list)
   2.765 -    {
   2.766 -        ERR("Couldn't map pfn_to_mfn_frame_list");
   2.767 +    live_p2m_frame_list = 
   2.768 +        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   2.769 +                             live_p2m_frame_list_list,
   2.770 +                             P2M_FLL_ENTRIES); 
   2.771 +    
   2.772 +    if (!live_p2m_frame_list) {
   2.773 +        ERR("Couldn't map p2m_frame_list");
   2.774          goto out;
   2.775      }
   2.776  
   2.777 -
   2.778      /* Map all the frames of the pfn->mfn table. For migrate to succeed, 
   2.779         the guest must not change which frames are used for this purpose. 
   2.780         (its not clear why it would want to change them, and we'll be OK
   2.781         from a safety POV anyhow. */
   2.782  
   2.783 -    live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom, 
   2.784 -                                                 PROT_READ,
   2.785 -                                                 live_pfn_to_mfn_frame_list,
   2.786 -                                                 (nr_pfns+1023)/1024 );  
   2.787 -    if ( !live_pfn_to_mfn_table )
   2.788 -    {
   2.789 -        ERR("Couldn't map pfn_to_mfn table");
   2.790 +    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   2.791 +                                    live_p2m_frame_list,
   2.792 +                                    P2M_FL_ENTRIES); 
   2.793 +
   2.794 +    if (!live_p2m) {
   2.795 +        ERR("Couldn't map p2m table");
   2.796          goto out;
   2.797      }
   2.798  
   2.799      /* Setup the mfn_to_pfn table mapping */
   2.800 -    mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
   2.801 -
   2.802 -    live_mfn_to_pfn_table = 
   2.803 -        xc_map_foreign_range(xc_handle, DOMID_XEN, 
   2.804 -                             PAGE_SIZE*1024, PROT_READ, 
   2.805 -                             mfn_to_pfn_table_start_mfn );
   2.806 +    m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
   2.807 +    live_m2p      = xc_map_foreign_range(xc_handle, DOMID_XEN, M2P_SIZE, 
   2.808 +                                         PROT_READ, m2p_start_mfn);
   2.809 +    
   2.810 +    /* Get a local copy of the live_P2M_frame_list */
   2.811 +    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
   2.812 +        ERR("Couldn't allocate p2m_frame_list array");
   2.813 +        goto out;
   2.814 +    }
   2.815 +    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); 
   2.816  
   2.817      /* Canonicalise the pfn-to-mfn table frame-number list. */
   2.818 -    memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
   2.819 -
   2.820 -    for ( i = 0; i < nr_pfns; i += 1024 )
   2.821 -    {
   2.822 -        if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) )
   2.823 -        {
   2.824 +    for (i = 0; i < max_pfn; i += ulpp) {
   2.825 +        if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) { 
   2.826              ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
   2.827              goto out;
   2.828          }
   2.829      }
   2.830  
   2.831 -
   2.832      /* Domain is still running at this point */
   2.833  
   2.834 -    if ( live )
   2.835 -    {
   2.836 -        if ( xc_shadow_control( xc_handle, dom, 
   2.837 -                                DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
   2.838 -                                NULL, 0, NULL ) < 0 )
   2.839 -        {
   2.840 +    if (live) {
   2.841 +
   2.842 +        if (xc_shadow_control(xc_handle, dom, 
   2.843 +                              DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
   2.844 +                              NULL, 0, NULL ) < 0) { 
   2.845              ERR("Couldn't enable shadow mode");
   2.846              goto out;
   2.847          }
   2.848 -
   2.849 +        
   2.850          last_iter = 0;
   2.851 -    } 
   2.852 -    else
   2.853 -    {
   2.854 +        
   2.855 +    } else {
   2.856 +        
   2.857          /* This is a non-live suspend. Issue the call back to get the
   2.858             domain suspended */
   2.859 -
   2.860 +        
   2.861          last_iter = 1;
   2.862 -
   2.863 -        if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
   2.864 -        {
   2.865 +        
   2.866 +        if (suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt)) {
   2.867              ERR("Domain appears not to have suspended");
   2.868              goto out;
   2.869          }
   2.870 -
   2.871 +        
   2.872      }
   2.873 -    sent_last_iter = 1<<20; /* 4GB of pages */
   2.874  
   2.875 -    /* calculate the power of 2 order of nr_pfns, e.g.
   2.876 +#if 0
   2.877 +    sent_last_iter = 0xFFFFFFFF; /* Pretend we sent a /lot/ last time */
   2.878 +#else
   2.879 +    sent_last_iter = 1 << 20; 
   2.880 +#endif
   2.881 +
   2.882 +
   2.883 +    /* calculate the power of 2 order of max_pfn, e.g.
   2.884         15->4 16->4 17->5 */
   2.885 -    for ( i = nr_pfns-1, order_nr = 0; i ; i >>= 1, order_nr++ )
   2.886 +    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
   2.887          continue;
   2.888  
   2.889 -    /* Setup to_send bitmap */
   2.890 -    {
   2.891 -        /* size these for a maximal 4GB domain, to make interaction
   2.892 -           with balloon driver easier. It's only user space memory,
   2.893 -           ater all... (3x 128KB) */
   2.894 -
   2.895 -        int sz = ( 1<<20 ) / 8;
   2.896 - 
   2.897 -        to_send = malloc( sz );
   2.898 -        to_fix  = calloc( 1, sz );
   2.899 -        to_skip = malloc( sz );
   2.900 +#undef BITMAP_SIZE
   2.901 +#define BITMAP_SIZE ((1<<20)/8) 
   2.902  
   2.903 -        if ( !to_send || !to_fix || !to_skip )
   2.904 -        {
   2.905 -            ERR("Couldn't allocate to_send array");
   2.906 -            goto out;
   2.907 -        }
   2.908 -
   2.909 -        memset(to_send, 0xff, sz);
   2.910 +    /* Setup to_send / to_fix and to_skip bitmaps */
   2.911 +    to_send = malloc(BITMAP_SIZE); 
   2.912 +    to_fix  = calloc(1, BITMAP_SIZE); 
   2.913 +    to_skip = malloc(BITMAP_SIZE); 
   2.914 +    
   2.915 +    if (!to_send || !to_fix || !to_skip) {
   2.916 +        ERR("Couldn't allocate to_send array");
   2.917 +        goto out;
   2.918 +    }
   2.919 +    
   2.920 +    memset(to_send, 0xff, BITMAP_SIZE);
   2.921  
   2.922 -        if ( mlock(to_send, sz) )
   2.923 -        {
   2.924 -            ERR("Unable to mlock to_send");
   2.925 -            return 1;
   2.926 -        }
   2.927 -
   2.928 -        /* (to fix is local only) */
   2.929 -
   2.930 -        if ( mlock(to_skip, sz) )
   2.931 -        {
   2.932 -            ERR("Unable to mlock to_skip");
   2.933 -            return 1;
   2.934 -        }
   2.935 -
   2.936 +    if (mlock(to_send, BITMAP_SIZE)) {
   2.937 +        ERR("Unable to mlock to_send");
   2.938 +        return 1;
   2.939      }
   2.940  
   2.941 -    analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
   2.942 +    /* (to fix is local only) */
   2.943 +    if (mlock(to_skip, BITMAP_SIZE)) {
   2.944 +        ERR("Unable to mlock to_skip");
   2.945 +        return 1;
   2.946 +    }
   2.947 +        
   2.948 +    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
   2.949  
   2.950      /* We want zeroed memory so use calloc rather than malloc. */
   2.951 -    pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
   2.952 -    pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
   2.953 +    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
   2.954 +    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
   2.955  
   2.956 -    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
   2.957 -    {
   2.958 +    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
   2.959          errno = ENOMEM;
   2.960          goto out;
   2.961      }
   2.962  
   2.963 -    if ( mlock(pfn_type, BATCH_SIZE * sizeof(unsigned long)) )
   2.964 -    {
   2.965 +    if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
   2.966          ERR("Unable to mlock");
   2.967          goto out;
   2.968      }
   2.969 @@ -663,46 +751,40 @@ int xc_linux_save(int xc_handle, int io_
   2.970      /*
   2.971       * Quick belt and braces sanity check.
   2.972       */
   2.973 -#if DEBUG
   2.974      {
   2.975          int err=0;
   2.976 -        for ( i = 0; i < nr_pfns; i++ )
   2.977 -        {
   2.978 -            mfn = live_pfn_to_mfn_table[i];
   2.979 -     
   2.980 -            if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
   2.981 -            {
   2.982 -                fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
   2.983 -                        i,mfn,live_mfn_to_pfn_table[mfn]);
   2.984 +        unsigned long mfn; 
   2.985 +        for (i = 0; i < max_pfn; i++) {
   2.986 +
   2.987 +            mfn = live_p2m[i];
   2.988 +            if((live_m2p[mfn] != i) && (mfn != 0xffffffffUL)) { 
   2.989 +                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i, 
   2.990 +                        mfn, live_m2p[mfn]);
   2.991                  err++;
   2.992              }
   2.993          }
   2.994 -        fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
   2.995 +        DPRINTF("Had %d unexplained entries in p2m table\n", err);
   2.996      }
   2.997 -#endif
   2.998  
   2.999  
  2.1000      /* Start writing out the saved-domain record. */
  2.1001  
  2.1002 -    if ( write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
  2.1003 -         sizeof(unsigned long) )
  2.1004 -    {
  2.1005 -        ERR("write: nr_pfns");
  2.1006 +    if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) { 
  2.1007 +        ERR("write: max_pfn");
  2.1008          goto out;
  2.1009      }
  2.1010  
  2.1011 -    if ( write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE )
  2.1012 -    {
  2.1013 -        ERR("write: pfn_to_mfn_frame_list");
  2.1014 +    if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { 
  2.1015 +        ERR("write: p2m_frame_list");
  2.1016          goto out;
  2.1017      }
  2.1018 -
  2.1019 -    print_stats( xc_handle, dom, 0, &stats, 0 );
  2.1020 +    
  2.1021 +    print_stats(xc_handle, dom, 0, &stats, 0);
  2.1022  
  2.1023      /* Now write out each data page, canonicalising page tables as we go... */
  2.1024 -    
  2.1025 -    for ( ; ; )
  2.1026 -    {
  2.1027 +
  2.1028 +    while(1) {
  2.1029 +
  2.1030          unsigned int prev_pc, sent_this_iter, N, batch;
  2.1031  
  2.1032          iter++;
  2.1033 @@ -713,24 +795,20 @@ int xc_linux_save(int xc_handle, int io_
  2.1034  
  2.1035          DPRINTF("Saving memory pages: iter %d   0%%", iter);
  2.1036  
  2.1037 -        while ( N < nr_pfns )
  2.1038 -        {
  2.1039 -            unsigned int this_pc = (N * 100) / nr_pfns;
  2.1040 +        while( N < max_pfn ){
  2.1041  
  2.1042 -            if ( (this_pc - prev_pc) >= 5 )
  2.1043 -            {
  2.1044 +            unsigned int this_pc = (N * 100) / max_pfn;
  2.1045 +
  2.1046 +            if ((this_pc - prev_pc) >= 5) {
  2.1047                  DPRINTF("\b\b\b\b%3d%%", this_pc);
  2.1048                  prev_pc = this_pc;
  2.1049              }
  2.1050 -
  2.1051 +            
  2.1052              /* slightly wasteful to peek the whole array evey time, 
  2.1053                 but this is fast enough for the moment. */
  2.1054 -
  2.1055 -            if ( !last_iter && 
  2.1056 -                 xc_shadow_control(xc_handle, dom, 
  2.1057 -                                   DOM0_SHADOW_CONTROL_OP_PEEK,
  2.1058 -                                   to_skip, nr_pfns, NULL) != nr_pfns )
  2.1059 -            {
  2.1060 +            if (!last_iter && xc_shadow_control(
  2.1061 +                    xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
  2.1062 +                    to_skip, max_pfn, NULL) != max_pfn) {
  2.1063                  ERR("Error peeking shadow bitmap");
  2.1064                  goto out;
  2.1065              }
  2.1066 @@ -738,219 +816,168 @@ int xc_linux_save(int xc_handle, int io_
  2.1067  
  2.1068              /* load pfn_type[] with the mfn of all the pages we're doing in
  2.1069                 this batch. */
  2.1070 -
  2.1071 -            for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
  2.1072 -            {
  2.1073 -                int n = permute(N, nr_pfns, order_nr );
  2.1074 -
  2.1075 -                if ( 0 && debug ) {
  2.1076 -                    fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d  "
  2.1077 -                            " [mfn]= %08lx\n",
  2.1078 -                            iter, (unsigned long)n, live_pfn_to_mfn_table[n],
  2.1079 -                            test_bit(n,to_send),
  2.1080 -                            live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
  2.1081 -                                                 0xFFFFF]);
  2.1082 -                }
  2.1083 +            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
  2.1084  
  2.1085 -                if ( !last_iter && 
  2.1086 -                     test_bit(n, to_send) && 
  2.1087 -                     test_bit(n, to_skip) ) {
  2.1088 -                    skip_this_iter++; /* stats keeping */
  2.1089 -                }
  2.1090 +                int n = permute(N, max_pfn, order_nr);
  2.1091  
  2.1092 -                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  2.1093 -                       (test_bit(n, to_send) && last_iter) ||
  2.1094 -                       (test_bit(n, to_fix)  && last_iter)) ) {
  2.1095 -                    continue;
  2.1096 +                if (debug) {
  2.1097 +                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
  2.1098 +                            iter, (unsigned long)n, live_p2m[n],
  2.1099 +                            test_bit(n, to_send), 
  2.1100 +                            live_m2p[live_p2m[n]&0xFFFFF]);
  2.1101                  }
  2.1102 +                
  2.1103 +                if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip)) 
  2.1104 +                    skip_this_iter++; /* stats keeping */
  2.1105  
  2.1106 -                /* we get here if:
  2.1107 -                   1. page is marked to_send & hasn't already been re-dirtied
  2.1108 -                   2. (ignore to_skip in last iteration)
  2.1109 -                   3. add in pages that still need fixup (net bufs)
  2.1110 +                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  2.1111 +                      (test_bit(n, to_send) && last_iter) ||
  2.1112 +                      (test_bit(n, to_fix)  && last_iter)))
  2.1113 +                    continue;
  2.1114 +
  2.1115 +                /* 
  2.1116 +                ** we get here if:
  2.1117 +                **  1. page is marked to_send & hasn't already been re-dirtied
  2.1118 +                **  2. (ignore to_skip in last iteration)
  2.1119 +                **  3. add in pages that still need fixup (net bufs)
  2.1120                  */
  2.1121    
  2.1122                  pfn_batch[batch] = n;
  2.1123 -                pfn_type[batch] = live_pfn_to_mfn_table[n];
  2.1124 +                pfn_type[batch]  = live_p2m[n];
  2.1125  
  2.1126 -                if( ! is_mapped(pfn_type[batch]) )
  2.1127 -                {
  2.1128 +                if(!is_mapped(pfn_type[batch])) {
  2.1129 +
  2.1130                      /* not currently in pusedo-physical map -- set bit
  2.1131                         in to_fix that we must send this page in last_iter
  2.1132                         unless its sent sooner anyhow */
  2.1133  
  2.1134 -                    set_bit( n, to_fix );
  2.1135 -                    if( iter>1 )
  2.1136 +                    set_bit(n, to_fix);
  2.1137 +                    if(iter > 1)
  2.1138                          DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
  2.1139 -                                iter,n,pfn_type[batch]);
  2.1140 +                                iter, n, pfn_type[batch]);
  2.1141                      continue;
  2.1142                  }
  2.1143  
  2.1144 -                if ( last_iter && 
  2.1145 -                     test_bit(n, to_fix) && 
  2.1146 -                     !test_bit(n, to_send) )
  2.1147 -                {
  2.1148 +                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
  2.1149                      needed_to_fix++;
  2.1150                      DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
  2.1151                              iter,n,pfn_type[batch]);
  2.1152                  }
  2.1153  
  2.1154                  clear_bit(n, to_fix); 
  2.1155 -
  2.1156 +                
  2.1157                  batch++;
  2.1158              }
  2.1159       
  2.1160 -            if ( batch == 0 )
  2.1161 +            if (batch == 0)
  2.1162                  goto skip; /* vanishingly unlikely... */
  2.1163        
  2.1164 -            if ( (region_base = xc_map_foreign_batch(xc_handle, dom, 
  2.1165 -                                                     PROT_READ,
  2.1166 -                                                     pfn_type,
  2.1167 -                                                     batch)) == 0 ){
  2.1168 +            if ((region_base = xc_map_foreign_batch(
  2.1169 +                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) { 
  2.1170                  ERR("map batch failed");
  2.1171                  goto out;
  2.1172              }
  2.1173       
  2.1174 -            if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
  2.1175 +            if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
  2.1176                  ERR("get_pfn_type_batch failed");
  2.1177                  goto out;
  2.1178              }
  2.1179       
  2.1180 -            for ( j = 0; j < batch; j++ )
  2.1181 -            {
  2.1182 -                if ( (pfn_type[j] & LTAB_MASK) == XTAB )
  2.1183 -                {
  2.1184 -                    DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
  2.1185 +            for (j = 0; j < batch; j++) {
  2.1186 +
  2.1187 +                if ((pfn_type[j] & LTAB_MASK) == XTAB) {
  2.1188 +                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
  2.1189                      continue;
  2.1190                  }
  2.1191    
  2.1192 -                if ( 0 && debug )
  2.1193 +                if (debug) 
  2.1194                      fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
  2.1195                              " sum= %08lx\n",
  2.1196                              iter, 
  2.1197                              (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
  2.1198                              pfn_type[j],
  2.1199 -                            live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
  2.1200 +                            live_m2p[pfn_type[j]&(~LTAB_MASK)],
  2.1201                              csum_page(region_base + (PAGE_SIZE*j)));
  2.1202 -
  2.1203 +                
  2.1204                  /* canonicalise mfn->pfn */
  2.1205                  pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
  2.1206              }
  2.1207  
  2.1208 -            if ( write(io_fd, &batch, sizeof(int)) != sizeof(int) )
  2.1209 -            {
  2.1210 +            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) { 
  2.1211                  ERR("Error when writing to state file (2)");
  2.1212                  goto out;
  2.1213              }
  2.1214  
  2.1215 -            if ( write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
  2.1216 -                 (sizeof(unsigned long) * j) )
  2.1217 -            {
  2.1218 +            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) { 
  2.1219                  ERR("Error when writing to state file (3)");
  2.1220                  goto out;
  2.1221              }
  2.1222 -     
  2.1223 +            
  2.1224              /* entering this loop, pfn_type is now in pfns (Not mfns) */
  2.1225 -            for ( j = 0; j < batch; j++ )
  2.1226 -            {
  2.1227 +            for (j = 0; j < batch; j++) {
  2.1228 +                
  2.1229 +                unsigned long pfn      = pfn_type[j] & ~LTAB_MASK; 
  2.1230 +                unsigned long pagetype = pfn_type[j] & LTAB_MASK; 
  2.1231 +                void *spage            = (void *) region_base + (PAGE_SIZE*j); 
  2.1232 +
  2.1233 +
  2.1234                  /* write out pages in batch */
  2.1235 -                if ( (pfn_type[j] & LTAB_MASK) == XTAB )
  2.1236 -                {
  2.1237 -                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
  2.1238 +                if (pagetype == XTAB) {
  2.1239 +                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n", j, pfn_type[j]);
  2.1240                      continue;
  2.1241                  }
  2.1242 -  
  2.1243 -                if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) || 
  2.1244 -                     ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
  2.1245 -                    memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
  2.1246 -      
  2.1247 -                    for ( k = 0; 
  2.1248 -                          k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ? 
  2.1249 -                               (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
  2.1250 -                               1024); 
  2.1251 -                          k++ )
  2.1252 -                    {
  2.1253 -                        unsigned long pfn;
  2.1254 -
  2.1255 -                        if ( !(page[k] & _PAGE_PRESENT) )
  2.1256 -                            continue;
  2.1257 -                        
  2.1258 -                        mfn = page[k] >> PAGE_SHIFT;      
  2.1259 -                        pfn = live_mfn_to_pfn_table[mfn];
  2.1260  
  2.1261 -                        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
  2.1262 -                        {
  2.1263 -                            /* I don't think this should ever happen */
  2.1264 -                            fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
  2.1265 -                                    "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
  2.1266 -                                    j, pfn_type[j], k,
  2.1267 -                                    page[k], mfn, live_mfn_to_pfn_table[mfn],
  2.1268 -                                    (live_mfn_to_pfn_table[mfn]<nr_pfns)? 
  2.1269 -                                    live_pfn_to_mfn_table[
  2.1270 -                                        live_mfn_to_pfn_table[mfn]] : 
  2.1271 -                                    0xdeadbeef);
  2.1272 -
  2.1273 -                            pfn = 0; /* be suspicious */
  2.1274 -                        }
  2.1275 -
  2.1276 -                        page[k] &= PAGE_SIZE - 1;
  2.1277 -                        page[k] |= pfn << PAGE_SHIFT;
  2.1278 -   
  2.1279 -#if 0
  2.1280 -                        fprintf(stderr,
  2.1281 -                                "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
  2.1282 -                                "xpfn=%d\n",
  2.1283 -                                pfn_type[j]>>28,
  2.1284 -                                j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
  2.1285 -#endif     
  2.1286 -   
  2.1287 -                    } /* end of page table rewrite for loop */
  2.1288 -      
  2.1289 +                pagetype &= LTABTYPE_MASK; 
  2.1290 +                
  2.1291 +                if (pagetype >= L1TAB && pagetype <= L4TAB) {
  2.1292 +                    
  2.1293 +                    /* We have a pagetable page: need to rewrite it. */
  2.1294 +                    canonicalize_pagetable(pagetype, pfn, spage, page); 
  2.1295 +                    
  2.1296                      if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
  2.1297                          ERR("Error when writing to state file (4)");
  2.1298                          goto out;
  2.1299                      }
  2.1300 -      
  2.1301 -                }  /* end of it's a PT page */ else {  /* normal page */
  2.1302 +                    
  2.1303 +                }  else {  
  2.1304  
  2.1305 -                    if ( ratewrite(io_fd, region_base + (PAGE_SIZE*j), 
  2.1306 -                                   PAGE_SIZE) != PAGE_SIZE )
  2.1307 -                    {
  2.1308 +                    /* We have a normal page: just write it directly. */
  2.1309 +                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
  2.1310                          ERR("Error when writing to state file (5)");
  2.1311                          goto out;
  2.1312                      }
  2.1313                  }
  2.1314              } /* end of the write out for this batch */
  2.1315 -     
  2.1316 +            
  2.1317              sent_this_iter += batch;
  2.1318 -
  2.1319 +            
  2.1320          } /* end of this while loop for this iteration */
  2.1321 -
  2.1322 +        
  2.1323          munmap(region_base, batch*PAGE_SIZE);
  2.1324 -
  2.1325 -    skip: 
  2.1326 -
  2.1327 +        
  2.1328 +      skip: 
  2.1329 +        
  2.1330          total_sent += sent_this_iter;
  2.1331  
  2.1332          DPRINTF("\r %d: sent %d, skipped %d, ", 
  2.1333                  iter, sent_this_iter, skip_this_iter );
  2.1334  
  2.1335 -        if ( last_iter ) {
  2.1336 +        if (last_iter) {
  2.1337              print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  2.1338  
  2.1339 -            DPRINTF("Total pages sent= %d (%.2fx)\n", 
  2.1340 -                    total_sent, ((float)total_sent)/nr_pfns );
  2.1341 -            DPRINTF("(of which %d were fixups)\n", needed_to_fix  );
  2.1342 +            DPRINTF("Total pages sent= %ld (%.2fx)\n", 
  2.1343 +                    total_sent, ((float)total_sent)/max_pfn );
  2.1344 +            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
  2.1345          }       
  2.1346  
  2.1347          if (last_iter && debug){
  2.1348              int minusone = -1;
  2.1349 -            memset( to_send, 0xff, (nr_pfns+8)/8 );
  2.1350 +            memset( to_send, 0xff, (max_pfn+8)/8 );
  2.1351              debug = 0;
  2.1352              fprintf(stderr, "Entering debug resend-all mode\n");
  2.1353      
  2.1354              /* send "-1" to put receiver into debug mode */
  2.1355 -            if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
  2.1356 +            if(!write_exact(io_fd, &minusone, sizeof(int))) { 
  2.1357                  ERR("Error when writing to state file (6)");
  2.1358                  goto out;
  2.1359              }
  2.1360 @@ -958,42 +985,39 @@ int xc_linux_save(int xc_handle, int io_
  2.1361              continue;
  2.1362          }
  2.1363  
  2.1364 -        if ( last_iter ) break; 
  2.1365 +        if (last_iter) break; 
  2.1366  
  2.1367 -        if ( live )
  2.1368 -        {
  2.1369 -            if ( 
  2.1370 -                ( ( sent_this_iter > sent_last_iter ) &&
  2.1371 -                  (mbit_rate == MAX_MBIT_RATE ) ) ||
  2.1372 -                (iter >= max_iters) || 
  2.1373 -                (sent_this_iter+skip_this_iter < 50) || 
  2.1374 -                (total_sent > nr_pfns*max_factor) )
  2.1375 -            {
  2.1376 +        if (live) {
  2.1377 +
  2.1378 +
  2.1379 +            if( 
  2.1380 +                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
  2.1381 +                (iter >= max_iters) ||
  2.1382 +                (sent_this_iter+skip_this_iter < 50) ||
  2.1383 +                (total_sent > max_pfn*max_factor) ) { 
  2.1384 +
  2.1385                  DPRINTF("Start last iteration\n");
  2.1386                  last_iter = 1;
  2.1387 -
  2.1388 -                if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
  2.1389 -                {
  2.1390 +                
  2.1391 +                if (suspend_and_state(xc_handle, io_fd, dom, &info, &ctxt)) {
  2.1392                      ERR("Domain appears not to have suspended");
  2.1393                      goto out;
  2.1394                  }
  2.1395 -
  2.1396 +                
  2.1397                  DPRINTF("SUSPEND shinfo %08lx eip %08u edx %08u\n",
  2.1398                          info.shared_info_frame,
  2.1399                          ctxt.user_regs.eip, ctxt.user_regs.edx);
  2.1400              } 
  2.1401 -
  2.1402 -            if ( xc_shadow_control( xc_handle, dom, 
  2.1403 -                                    DOM0_SHADOW_CONTROL_OP_CLEAN,
  2.1404 -                                    to_send, nr_pfns, &stats ) != nr_pfns ) 
  2.1405 -            {
  2.1406 +            
  2.1407 +            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
  2.1408 +                                  to_send, max_pfn, &stats ) != max_pfn) {  
  2.1409                  ERR("Error flushing shadow PT");
  2.1410                  goto out;
  2.1411              }
  2.1412  
  2.1413              sent_last_iter = sent_this_iter;
  2.1414  
  2.1415 -            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  2.1416 +            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
  2.1417       
  2.1418          }
  2.1419  
  2.1420 @@ -1005,9 +1029,10 @@ int xc_linux_save(int xc_handle, int io_
  2.1421      /* Success! */
  2.1422      rc = 0;
  2.1423      
  2.1424 +    /* ^^^^^^ XXX SMH: hmm.. not sure that's really success! */
  2.1425 +    
  2.1426      /* Zero terminate */
  2.1427 -    if ( write(io_fd, &rc, sizeof(int)) != sizeof(int) )
  2.1428 -    {
  2.1429 +    if (!write_exact(io_fd, &rc, sizeof(int))) { 
  2.1430          ERR("Error when writing to state file (6)");
  2.1431          goto out;
  2.1432      }
  2.1433 @@ -1015,84 +1040,76 @@ int xc_linux_save(int xc_handle, int io_
  2.1434      /* Send through a list of all the PFNs that were not in map at the close */
  2.1435      {
  2.1436          unsigned int i,j;
  2.1437 -        unsigned int pfntab[1024];
  2.1438 +        unsigned long pfntab[1024]; 
  2.1439  
  2.1440 -        for ( i = 0, j = 0; i < nr_pfns; i++ )
  2.1441 -            if ( !is_mapped(live_pfn_to_mfn_table[i]) )
  2.1442 +        for ( i = 0, j = 0; i < max_pfn; i++ ) {
  2.1443 +            if ( ! is_mapped(live_p2m[i]) )
  2.1444                  j++;
  2.1445 +        }
  2.1446  
  2.1447 -        if ( write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int) )
  2.1448 -        {
  2.1449 +        if(!write_exact(io_fd, &j, sizeof(unsigned int))) { 
  2.1450              ERR("Error when writing to state file (6a)");
  2.1451              goto out;
  2.1452 -        } 
  2.1453 +        }	
  2.1454 +        
  2.1455 +        for ( i = 0, j = 0; i < max_pfn; ) {
  2.1456  
  2.1457 -        for ( i = 0, j = 0; i < nr_pfns; )
  2.1458 -        {
  2.1459 -            if ( !is_mapped(live_pfn_to_mfn_table[i]) )
  2.1460 -            {
  2.1461 +            if (!is_mapped(live_p2m[i]))
  2.1462                  pfntab[j++] = i;
  2.1463 -            }
  2.1464 +
  2.1465              i++;
  2.1466 -            if ( j == 1024 || i == nr_pfns )
  2.1467 -            {
  2.1468 -                if ( write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
  2.1469 -                     (sizeof(unsigned long) * j) )
  2.1470 -                {
  2.1471 +            if (j == 1024 || i == max_pfn) {
  2.1472 +                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) { 
  2.1473                      ERR("Error when writing to state file (6b)");
  2.1474                      goto out;
  2.1475                  } 
  2.1476                  j = 0;
  2.1477              }
  2.1478          }
  2.1479 -    }
  2.1480  
  2.1481 +    }
  2.1482 +    
  2.1483      /* Canonicalise the suspend-record frame number. */
  2.1484 -    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
  2.1485 -    {
  2.1486 +    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
  2.1487          ERR("Suspend record is not in range of pseudophys map");
  2.1488          goto out;
  2.1489      }
  2.1490  
  2.1491      /* Canonicalise each GDT frame number. */
  2.1492 -    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
  2.1493 -    {
  2.1494 -        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) 
  2.1495 -        {
  2.1496 +    for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
  2.1497 +        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
  2.1498              ERR("GDT frame is not in range of pseudophys map");
  2.1499              goto out;
  2.1500          }
  2.1501      }
  2.1502  
  2.1503      /* Canonicalise the page table base pointer. */
  2.1504 -    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) )
  2.1505 -    {
  2.1506 +    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
  2.1507          ERR("PT base is not in range of pseudophys map");
  2.1508          goto out;
  2.1509      }
  2.1510 -    ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
  2.1511 +    ctxt.ctrlreg[3] = live_m2p[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
  2.1512          PAGE_SHIFT;
  2.1513  
  2.1514 -    if ( write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
  2.1515 -         write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE)
  2.1516 -    {
  2.1517 +    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
  2.1518 +        !write_exact(io_fd, live_shinfo, PAGE_SIZE)) { 
  2.1519          ERR("Error when writing to state file (1)");
  2.1520          goto out;
  2.1521      }
  2.1522 -
  2.1523 +    
  2.1524   out:
  2.1525  
  2.1526 -    if ( live_shinfo )
  2.1527 +    if (live_shinfo)
  2.1528          munmap(live_shinfo, PAGE_SIZE);
  2.1529 +    
  2.1530 +    if (live_p2m_frame_list) 
  2.1531 +        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); 
  2.1532  
  2.1533 -    if ( live_pfn_to_mfn_frame_list ) 
  2.1534 -        munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
  2.1535 +    if(live_p2m) 
  2.1536 +        munmap(live_p2m, P2M_SIZE); 
  2.1537  
  2.1538 -    if ( live_pfn_to_mfn_table ) 
  2.1539 -        munmap(live_pfn_to_mfn_table, nr_pfns*4);
  2.1540 -
  2.1541 -    if ( live_mfn_to_pfn_table ) 
  2.1542 -        munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
  2.1543 +    if(live_m2p) 
  2.1544 +        munmap(live_m2p, M2P_SIZE); 
  2.1545  
  2.1546      free(pfn_type);
  2.1547      free(pfn_batch);
  2.1548 @@ -1101,6 +1118,7 @@ int xc_linux_save(int xc_handle, int io_
  2.1549      free(to_skip);
  2.1550  
  2.1551      DPRINTF("Save exit rc=%d\n",rc);
  2.1552 +
  2.1553      return !!rc;
  2.1554  }
  2.1555  
     3.1 --- a/tools/libxc/xenctrl.h	Tue Nov 08 18:39:58 2005 +0100
     3.2 +++ b/tools/libxc/xenctrl.h	Tue Nov 08 18:42:07 2005 +0100
     3.3 @@ -17,6 +17,7 @@
     3.4  #include <xen/event_channel.h>
     3.5  #include <xen/sched.h>
     3.6  #include <xen/sched_ctl.h>
     3.7 +#include <xen/memory.h>
     3.8  #include <xen/acm.h>
     3.9  
    3.10  #ifdef __ia64__
     4.1 --- a/tools/libxc/xg_private.h	Tue Nov 08 18:39:58 2005 +0100
     4.2 +++ b/tools/libxc/xg_private.h	Tue Nov 08 18:42:07 2005 +0100
     4.3 @@ -11,8 +11,10 @@
     4.4  #include <sys/stat.h>
     4.5  
     4.6  #include "xenctrl.h"
     4.7 +#include "xenguest.h" 
     4.8  
     4.9  #include <xen/linux/privcmd.h>
    4.10 +#include <xen/memory.h>
    4.11  
    4.12  char *xc_read_kernel_image(const char *filename, unsigned long *size);
    4.13  unsigned long csum_page (void * page);
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/tools/libxc/xg_save_restore.h	Tue Nov 08 18:42:07 2005 +0100
     5.3 @@ -0,0 +1,123 @@
     5.4 +/*
     5.5 +** xg_save_restore.h
     5.6 +** 
     5.7 +** Definitions and utilities for save / restore. 
     5.8 +*/
     5.9 +
    5.10 +#define DEBUG    1
    5.11 +#define PROGRESS 0
    5.12 +
    5.13 +#define ERR(_f, _a...) do {                     \
    5.14 +    fprintf(stderr, _f "\n" , ## _a);           \
    5.15 +    fflush(stderr); }                           \
    5.16 +while (0)
    5.17 +
    5.18 +#if DEBUG
    5.19 +#define DPRINTF(_f, _a...) fprintf(stderr, _f , ## _a)
    5.20 +#else
    5.21 +#define DPRINTF(_f, _a...) ((void)0)
    5.22 +#endif
    5.23 +
    5.24 +
    5.25 +#if PROGRESS
    5.26 +#define PPRINTF(_f, _a...) fprintf(stderr, _f , ## _a)
    5.27 +#else
    5.28 +#define PPRINTF(_f, _a...)
    5.29 +#endif
    5.30 +
    5.31 +
    5.32 +/*
    5.33 +** We process save/restore/migrate in batches of pages; the below 
    5.34 +** determines how many pages we (at maximum) deal with in each batch. 
    5.35 +*/
    5.36 +#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
    5.37 +
    5.38 +/* When pinning page tables at the end of restore, we also use batching. */
    5.39 +#define MAX_PIN_BATCH  1024
    5.40 +
    5.41 +
    5.42 +
    5.43 +/*
    5.44 +** Determine various platform information required for save/restore, in 
    5.45 +** particular: 
    5.46 +**
    5.47 +**    - the maximum MFN on this machine, used to compute the size of 
    5.48 +**      the M2P table; 
    5.49 +** 
    5.50 +**    - the starting virtual address of the hypervisor; we use this 
    5.51 +**      to determine which parts of guest address space(s) do and don't 
    5.52 +**      require canonicalization during save/restore; and 
    5.53 +** 
    5.54 +**    - the number of page-table levels for save/ restore. This should 
    5.55 +**      be a property of the domain, but for the moment we just read it 
    5.56 +**      from the hypervisor.
    5.57 +**
    5.58 +** Returns 1 on success, 0 on failure. 
    5.59 +*/
    5.60 +static int get_platform_info(int xc_handle, uint32_t dom, 
    5.61 +                             /* OUT */ uint32_t *max_mfn,  
    5.62 +                             /* OUT */ uint32_t *hvirt_start, 
    5.63 +                             /* OUT */ uint32_t *pt_levels)
    5.64 +    /* NOTE: 'dom' is accepted but not referenced in the body below. */
    5.65 +{ 
    5.66 +    xen_capabilities_info_t xen_caps = "";
    5.67 +    xen_parameters_info_t xen_parms;
    5.68 +    xc_physinfo_t physinfo;
    5.69 +    /* Each query below returns 0 (failure) as soon as one call fails. */
    5.70 +    if (xc_physinfo(xc_handle, &physinfo) != 0) 
    5.71 +        return 0;
    5.72 +    
    5.73 +    if (xc_version(xc_handle, XENVER_parameters, &xen_parms) != 0)
    5.74 +        return 0;
    5.75 +    
    5.76 +    if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
    5.77 +        return 0;
    5.78 +    /* NOTE(review): total_pages stands in for the max MFN - confirm. */
    5.79 +    *max_mfn =     physinfo.total_pages;
    5.80 +    *hvirt_start = xen_parms.virt_start;
    5.81 +    /* Order matters: x86_32 is a prefix of x86_32p, so test the p form first. */
    5.82 +    if (strstr(xen_caps, "xen-3.0-x86_64"))
    5.83 +        *pt_levels = 4;
    5.84 +    else if (strstr(xen_caps, "xen-3.0-x86_32p"))
    5.85 +        *pt_levels = 3; 
    5.86 +    else if (strstr(xen_caps, "xen-3.0-x86_32"))
    5.87 +        *pt_levels = 2; 
    5.88 +    else 
    5.89 +        return 0; 
    5.90 +    /* A capability string matching none of the above was rejected with 0. */
    5.91 +    return 1;
    5.92 +} 
    5.93 +
    5.94 +
    5.95 +/* 
    5.96 +** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables. 
    5.97 +** The M2P simply holds the corresponding PFN, while the top bit of a P2M
    5.98 +** entry tells us whether or not the PFN is currently mapped.
    5.99 +*/
   5.100 +
   5.101 +#define PFN_TO_KB(_pfn) ((_pfn) * PAGE_SIZE / 1024)
   5.102 +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
   5.103 +
   5.104 +/* Size in bytes of the M2P and P2M (both rounded up to nearest PAGE_SIZE) */
   5.105 +#define M2P_SIZE ROUNDUP((max_mfn * sizeof(unsigned long)), PAGE_SHIFT) 
   5.106 +#define P2M_SIZE ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT) 
   5.107 +
   5.108 +
   5.109 +/* Number of unsigned longs in a page */
   5.110 +#define ulpp            (PAGE_SIZE/sizeof(unsigned long))
   5.111 +
   5.112 +/* Number of entries in the pfn_to_mfn_frame_list */
   5.113 +#define P2M_FL_ENTRIES  (((max_pfn)+ulpp-1)/ulpp)
   5.114 +
   5.115 +/* Size in bytes of the pfn_to_mfn_frame_list     */
   5.116 +#define P2M_FL_SIZE     ((P2M_FL_ENTRIES)*sizeof(unsigned long))
   5.117 +
   5.118 +/* Number of entries in the pfn_to_mfn_frame_list_list */
   5.119 +#define P2M_FLL_ENTRIES (((max_pfn)+(ulpp*ulpp)-1)/(ulpp*ulpp))
   5.120 +
   5.121 +/* Returns TRUE if the PFN is currently mapped */
   5.122 +#define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
   5.123 +
   5.124 +
   5.125 +
   5.126 +