direct-io.hg

changeset 7702:b3c2bc39d815

Enable save/restore for PAE domains.

This includes quite a few cleanups / refactoring of the old code, some
of which is intended to prepare for 64-bit save/restore.

Signed-off-by: Steven Hand <steven@xensource.com>
author smh22@firebug.cl.cam.ac.uk
date Tue Nov 08 18:42:07 2005 +0100 (2005-11-08)
parents abbe3df33774
children 539b2757642e
files tools/libxc/xc_linux_restore.c tools/libxc/xc_linux_save.c tools/libxc/xenctrl.h tools/libxc/xg_private.h tools/libxc/xg_save_restore.h
line diff
     1.1 --- a/tools/libxc/xc_linux_restore.c	Tue Nov 08 18:39:58 2005 +0100
     1.2 +++ b/tools/libxc/xc_linux_restore.c	Tue Nov 08 18:42:07 2005 +0100
     1.3 @@ -8,32 +8,30 @@
     1.4  
     1.5  #include <stdlib.h>
     1.6  #include <unistd.h>
     1.7 +
     1.8  #include "xg_private.h"
     1.9 -#include <xenctrl.h>
    1.10 -#include <xen/memory.h>
    1.11 +#include "xg_save_restore.h"
    1.12 +
    1.13 +
    1.14  
    1.15 -#define MAX_BATCH_SIZE 1024
    1.16 +/* max mfn of the whole machine */
    1.17 +static uint32_t max_mfn; 
    1.18  
    1.19 -#define DEBUG 0
    1.20 +/* virtual starting address of the hypervisor */
    1.21 +static uint32_t hvirt_start; 
    1.22  
    1.23 -#if 1
    1.24 -#define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
    1.25 -#else
    1.26 -#define ERR(_f, _a...) ((void)0)
    1.27 -#endif
    1.28 +/* #levels of page tables used by the current guest */
    1.29 +static uint32_t pt_levels; 
    1.30 +
    1.31 +/* total number of pages used by the current guest */
    1.32 +static unsigned long max_pfn;
    1.33  
    1.34 -#if DEBUG
    1.35 -#define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
    1.36 -#else
    1.37 -#define DPRINTF(_f, _a...) ((void)0)
    1.38 -#endif
    1.39 +/* Live mapping of the table mapping each PFN to its current MFN. */
    1.40 +static unsigned long *live_p2m = NULL;
    1.41  
    1.42 -#define PROGRESS 0
    1.43 -#if PROGRESS
    1.44 -#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ); fflush(stderr)
    1.45 -#else
    1.46 -#define PPRINTF(_f, _a...)
    1.47 -#endif
    1.48 +/* A table mapping each PFN to its new MFN. */
    1.49 +static unsigned long *p2m = NULL;
    1.50 +
    1.51  
    1.52  static ssize_t
    1.53  read_exact(int fd, void *buf, size_t count)
    1.54 @@ -45,24 +43,93 @@ read_exact(int fd, void *buf, size_t cou
    1.55          s = read(fd, &b[r], count - r);
    1.56          if ((s == -1) && (errno == EINTR))
    1.57              continue;
    1.58 -        if (s <= 0)
    1.59 +        if (s <= 0) { 
    1.60              break;
    1.61 +        } 
    1.62          r += s;
    1.63      }
    1.64  
    1.65 -    return r;
    1.66 +    return (r == count) ? 1 : 0; 
    1.67  }
    1.68  
    1.69 -int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, unsigned long nr_pfns,
    1.70 +
    1.71 +/*
    1.72 +** In the state file (or during transfer), all page-table pages are 
    1.73 +** converted into a 'canonical' form where references to actual mfns 
    1.74 +** are replaced with references to the corresponding pfns. 
    1.75 +** This function inverts that operation, replacing the pfn values with 
    1.76 +** the (now known) appropriate mfn values. 
    1.77 +*/
    1.78 +int uncanonicalize_pagetable(unsigned long type, void *page) 
    1.79 +{ 
    1.80 +    int i, pte_last, xen_start, xen_end; 
    1.81 +    unsigned long pfn; 
    1.82 +    uint64_t pte; 
    1.83 +
    1.84 +    /* 
    1.85 +    ** We need to determine which entries in this page table hold
    1.86 +    ** reserved hypervisor mappings. This depends on the current
    1.87 +    ** page table type as well as the number of paging levels. 
    1.88 +    */
    1.89 +    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
    1.90 +    
    1.91 +    if (pt_levels == 2 && type == L2TAB)
    1.92 +        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
    1.93 +
    1.94 +    if (pt_levels == 3 && type == L3TAB) 
    1.95 +        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
    1.96 +
    1.97 +
    1.98 +    /* Now iterate through the page table, uncanonicalizing each PTE */
    1.99 +    for(i = 0; i < pte_last; i++) { 
   1.100 +        
   1.101 +        if(pt_levels == 2) 
   1.102 +            pte = ((uint32_t *)page)[i]; 
   1.103 +        else 
   1.104 +            pte = ((uint64_t *)page)[i]; 
   1.105 +        
   1.106 +        if(i >= xen_start && i < xen_end) 
   1.107 +            pte = 0; 
   1.108 +        
   1.109 +        if(pte & _PAGE_PRESENT) { 
   1.110 +            
   1.111 +            pfn = pte >> PAGE_SHIFT; 
   1.112 +            
   1.113 +            if(pfn >= max_pfn) { 
   1.114 +                ERR("Frame number in type %lu page table is out of range: "
   1.115 +                    "i=%d pfn=0x%lx max_pfn=%lu", 
   1.116 +                    type >> 28, i, pfn, max_pfn);
   1.117 +                return 0; 
   1.118 +            } 
   1.119 +            
   1.120 +            
   1.121 +            if(type == L1TAB) 
   1.122 +                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
   1.123 +            else 
   1.124 +                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
   1.125 +            
   1.126 +            pte |= p2m[pfn] << PAGE_SHIFT;
   1.127 +            
   1.128 +            if(pt_levels == 2) 
   1.129 +                ((uint32_t *)page)[i] = (uint32_t)pte; 
   1.130 +            else 
   1.131 +                ((uint64_t *)page)[i] = (uint64_t)pte; 
   1.132 +        }
   1.133 +    }
   1.134 +    
   1.135 +    return 1; 
   1.136 +}
   1.137 +
   1.138 +int xc_linux_restore(int xc_handle, int io_fd, 
   1.139 +                     uint32_t dom, unsigned long nr_pfns, 
   1.140                       unsigned int store_evtchn, unsigned long *store_mfn,
   1.141                       unsigned int console_evtchn, unsigned long *console_mfn)
   1.142  {
   1.143      dom0_op_t op;
   1.144 -    int rc = 1, i, n, k;
   1.145 -    unsigned long mfn, pfn, xpfn;
   1.146 +    int rc = 1, i, n;
   1.147 +    unsigned long mfn, pfn; 
   1.148      unsigned int prev_pc, this_pc;
   1.149      int verify = 0;
   1.150 -    int err;
   1.151  
   1.152      /* The new domain's shared-info frame number. */
   1.153      unsigned long shared_info_frame;
   1.154 @@ -72,29 +139,21 @@ int xc_linux_restore(int xc_handle, int 
   1.155      /* A copy of the CPU context of the guest. */
   1.156      vcpu_guest_context_t ctxt;
   1.157  
   1.158 -    /* A table containg the type of each PFN (/not/ MFN!). */
   1.159 +    /* A table containing the type of each PFN (/not/ MFN!). */
   1.160      unsigned long *pfn_type = NULL;
   1.161  
   1.162      /* A table of MFNs to map in the current region */
   1.163      unsigned long *region_mfn = NULL;
   1.164  
   1.165      /* A temporary mapping, and a copy, of one frame of guest memory. */
   1.166 -    unsigned long *ppage = NULL;
   1.167 +    unsigned long *page = NULL;
   1.168  
   1.169      /* A copy of the pfn-to-mfn table frame list. */
   1.170 -    unsigned long pfn_to_mfn_frame_list[1024];
   1.171 -
   1.172 -    /* A table mapping each PFN to its new MFN. */
   1.173 -    unsigned long *pfn_to_mfn_table = NULL;
   1.174 -
   1.175 -    /* used by mapper for updating the domain's copy of the table */
   1.176 -    unsigned long *live_pfn_to_mfn_table = NULL;
   1.177 +    unsigned long *p2m_frame_list = NULL; 
   1.178  
   1.179      /* A temporary mapping of the guest's start_info page. */
   1.180      start_info_t *start_info;
   1.181  
   1.182 -    int pt_levels = 2; /* XXX auto-detect this */
   1.183 -
   1.184      char *region_base;
   1.185  
   1.186      xc_mmu_t *mmu = NULL;
   1.187 @@ -102,37 +161,60 @@ int xc_linux_restore(int xc_handle, int 
   1.188      /* used by debug verify code */
   1.189      unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
   1.190  
   1.191 -#define MAX_PIN_BATCH 1024
   1.192      struct mmuext_op pin[MAX_PIN_BATCH];
   1.193      unsigned int nr_pins = 0;
   1.194  
   1.195 -    DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);
   1.196 +
   1.197 +    max_pfn = nr_pfns; 
   1.198 +
   1.199 +    DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);
   1.200 +
   1.201 +
   1.202 +    if(!get_platform_info(xc_handle, dom, 
   1.203 +                          &max_mfn, &hvirt_start, &pt_levels)) {
   1.204 +        ERR("Unable to get platform info."); 
   1.205 +        return 1;
   1.206 +    }
   1.207 +
   1.208  
   1.209      if (mlock(&ctxt, sizeof(ctxt))) {
   1.210 -        /* needed for when we do the build dom0 op, 
   1.211 -           but might as well do early */
   1.212 +        /* needed for build dom0 op, but might as well do early */
   1.213          ERR("Unable to mlock ctxt");
   1.214          return 1;
   1.215      }
   1.216  
   1.217 -    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
   1.218 -        ERR("read pfn_to_mfn_frame_list failed");
   1.219 +
   1.220 +    /* Only have to worry about vcpu 0 even for SMP */
   1.221 +    if (xc_domain_get_vcpu_context( xc_handle, dom, 0, &ctxt)) {
   1.222 +        ERR("Could not get vcpu context");
   1.223          goto out;
   1.224      }
   1.225  
   1.226 +    
   1.227 +    /* Read the saved P2M frame list */
   1.228 +    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
   1.229 +        ERR("Couldn't allocate p2m_frame_list array");
   1.230 +        goto out;
   1.231 +    }
   1.232 +    
   1.233 +    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { 
   1.234 +        ERR("read p2m_frame_list failed");
   1.235 +        goto out;
   1.236 +    }
   1.237 +
   1.238 +    
   1.239      /* We want zeroed memory so use calloc rather than malloc. */
   1.240 -    pfn_to_mfn_table = calloc(4, nr_pfns);
   1.241 -    pfn_type = calloc(4, nr_pfns);    
   1.242 -    region_mfn = calloc(4, MAX_BATCH_SIZE);
   1.243 +    p2m        = calloc(sizeof(unsigned long), max_pfn); 
   1.244 +    pfn_type   = calloc(sizeof(unsigned long), max_pfn);    
   1.245 +    region_mfn = calloc(sizeof(unsigned long), MAX_BATCH_SIZE);
   1.246  
   1.247 -    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) || 
   1.248 -        (region_mfn == NULL)) {
   1.249 +    if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
   1.250          ERR("memory alloc failed");
   1.251          errno = ENOMEM;
   1.252          goto out;
   1.253      }
   1.254      
   1.255 -    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
   1.256 +    if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
   1.257          ERR("Could not mlock region_mfn");
   1.258          goto out;
   1.259      }
   1.260 @@ -146,35 +228,30 @@ int xc_linux_restore(int xc_handle, int 
   1.261      }
   1.262      shared_info_frame = op.u.getdomaininfo.shared_info_frame;
   1.263  
   1.264 -    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
   1.265 -    if (err != 0) {
   1.266 +    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) { 
   1.267          errno = ENOMEM;
   1.268          goto out;
   1.269      }
   1.270 -
   1.271 -    err = xc_domain_memory_increase_reservation(xc_handle, dom,
   1.272 -                                                nr_pfns, 0, 0, NULL);
   1.273 -    if (err != 0) {
   1.274 -        ERR("Failed to increase reservation by %lx\n", 
   1.275 -            nr_pfns * PAGE_SIZE / 1024); 
   1.276 +    
   1.277 +    if(xc_domain_memory_increase_reservation(
   1.278 +           xc_handle, dom, max_pfn, 0, 0, NULL) != 0) { 
   1.279 +        ERR("Failed to increase reservation by %lx KB\n", max_pfn); 
   1.280          errno = ENOMEM;
   1.281          goto out;
   1.282      }
   1.283  
   1.284      /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
   1.285 -    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
   1.286 -        nr_pfns) {
   1.287 +    if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
   1.288          ERR("Did not read correct number of frame numbers for new dom");
   1.289          goto out;
   1.290      }
   1.291 -
   1.292 -    mmu = xc_init_mmu_updates(xc_handle, dom);
   1.293 -    if (mmu == NULL) {
   1.294 +    
   1.295 +    if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) { 
   1.296          ERR("Could not initialise for MMU updates");
   1.297          goto out;
   1.298      }
   1.299  
   1.300 -    DPRINTF("Reloading memory pages:   0%%");
   1.301 +    DPRINTF("Reloading memory pages:   0%%\n");
   1.302  
   1.303      /*
   1.304       * Now simply read each saved frame into its new machine frame.
   1.305 @@ -183,258 +260,229 @@ int xc_linux_restore(int xc_handle, int 
   1.306      prev_pc = 0;
   1.307  
   1.308      n = 0;
   1.309 -    while ( 1 )
   1.310 -    {
   1.311 +    while (1) { 
   1.312 +
   1.313          int j;
   1.314          unsigned long region_pfn_type[MAX_BATCH_SIZE];
   1.315  
   1.316 -        this_pc = (n * 100) / nr_pfns;
   1.317 +        this_pc = (n * 100) / max_pfn;
   1.318          if ( (this_pc - prev_pc) >= 5 )
   1.319          {
   1.320              PPRINTF("\b\b\b\b%3d%%", this_pc);
   1.321              prev_pc = this_pc;
   1.322          }
   1.323  
   1.324 -        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
   1.325 -        {
   1.326 +        if (!read_exact(io_fd, &j, sizeof(int))) { 
   1.327              ERR("Error when reading batch size");
   1.328              goto out;
   1.329          }
   1.330  
   1.331          PPRINTF("batch %d\n",j);
   1.332   
   1.333 -        if ( j == -1 )
   1.334 -        {
   1.335 +        if (j == -1) {
   1.336              verify = 1;
   1.337 -            printf("Entering page verify mode\n");
   1.338 +            fprintf(stderr, "Entering page verify mode\n");
   1.339              continue;
   1.340          }
   1.341  
   1.342 -        if ( j == 0 )
   1.343 +        if (j == 0)
   1.344              break;  /* our work here is done */
   1.345  
   1.346 -        if ( j > MAX_BATCH_SIZE )
   1.347 -        {
   1.348 +        if (j > MAX_BATCH_SIZE) { 
   1.349              ERR("Max batch size exceeded. Giving up.");
   1.350              goto out;
   1.351          }
   1.352   
   1.353 -        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
   1.354 -             j*sizeof(unsigned long) ) {
   1.355 +        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) { 
   1.356              ERR("Error when reading region pfn types");
   1.357              goto out;
   1.358          }
   1.359  
   1.360 -        for ( i = 0; i < j; i++ )
   1.361 -        {
   1.362 -            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
   1.363 -            {
   1.364 +        for (i = 0; i < j; i++) { 
   1.365 +
   1.366 +            if ((region_pfn_type[i] & LTAB_MASK) == XTAB)
   1.367                  region_mfn[i] = 0; /* we know map will fail, but don't care */
   1.368 -            }
   1.369 -            else
   1.370 -            {  
   1.371 -                pfn = region_pfn_type[i] & ~LTAB_MASK;
   1.372 -                region_mfn[i] = pfn_to_mfn_table[pfn];
   1.373 -            }          
   1.374 +            else 
   1.375 +                region_mfn[i] = p2m[region_pfn_type[i] & ~LTAB_MASK]; 
   1.376 +
   1.377          }
   1.378   
   1.379 -        if ( (region_base = xc_map_foreign_batch( xc_handle, dom, 
   1.380 -                                                  PROT_WRITE,
   1.381 -                                                  region_mfn,
   1.382 -                                                  j )) == 0 )
   1.383 -        {
   1.384 +        if (!(region_base = xc_map_foreign_batch(
   1.385 +                  xc_handle, dom, PROT_WRITE, region_mfn, j))) {  
   1.386              ERR("map batch failed");
   1.387              goto out;
   1.388          }
   1.389  
   1.390          for ( i = 0; i < j; i++ )
   1.391          {
   1.392 -            unsigned long *ppage;
   1.393 +            void *page;
   1.394 +            unsigned long pagetype; 
   1.395  
   1.396 -            pfn = region_pfn_type[i] & ~LTAB_MASK;
   1.397 +            pfn      = region_pfn_type[i] & ~LTAB_MASK;
   1.398 +            pagetype = region_pfn_type[i] & LTAB_MASK; 
   1.399  
   1.400 -            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;
   1.401 -
   1.402 -            if (pfn>nr_pfns)
   1.403 -            {
   1.404 +            if (pagetype == XTAB) 
   1.405 +                /* a bogus/unmapped page: skip it */
   1.406 +                continue;
   1.407 +            
   1.408 +            if (pfn > max_pfn) {
   1.409                  ERR("pfn out of range");
   1.410                  goto out;
   1.411              }
   1.412  
   1.413 -            region_pfn_type[i] &= LTAB_MASK;
   1.414 +            pfn_type[pfn] = pagetype; 
   1.415  
   1.416 -            pfn_type[pfn] = region_pfn_type[i];
   1.417 -
   1.418 -            mfn = pfn_to_mfn_table[pfn];
   1.419 +            mfn = p2m[pfn];
   1.420  
   1.421 -            if ( verify )
   1.422 -                ppage = (unsigned long*) buf;  /* debug case */
   1.423 -            else
   1.424 -                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
   1.425 +            /* In verify mode, we use a copy; otherwise we work in place */
   1.426 +            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE); 
   1.427  
   1.428 -            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
   1.429 -            {
   1.430 -                ERR("Error when reading pagetable page");
   1.431 +            if (!read_exact(io_fd, page, PAGE_SIZE)) { 
   1.432 +                ERR("Error when reading page (type was %lx)", pagetype);
   1.433                  goto out;
   1.434              }
   1.435  
   1.436 -            switch( region_pfn_type[i] & LTABTYPE_MASK )
   1.437 -            {
   1.438 -            case 0:
   1.439 -                break;
   1.440 -
   1.441 -            case L1TAB:
   1.442 -            {
   1.443 -                for ( k = 0; k < 1024; k++ ) 
   1.444 -                {
   1.445 -                    if ( ppage[k] & _PAGE_PRESENT ) 
   1.446 -                    {
   1.447 -                        xpfn = ppage[k] >> PAGE_SHIFT;
   1.448 -                        if ( xpfn >= nr_pfns )
   1.449 -                        {
   1.450 -                            ERR("Frame number in type %lu page "
   1.451 -                                "table is out of range. i=%d k=%d "
   1.452 -                                "pfn=0x%lx nr_pfns=%lu", 
   1.453 -                                region_pfn_type[i]>>28, i, 
   1.454 -                                k, xpfn, nr_pfns);
   1.455 -                            goto out;
   1.456 -                        }
   1.457 -
   1.458 -                        ppage[k] &= (PAGE_SIZE - 1) & 
   1.459 -                            ~(_PAGE_GLOBAL | _PAGE_PAT);
   1.460 -                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
   1.461 -                    }
   1.462 -                }
   1.463 -            }
   1.464 -            break;
   1.465 +            pagetype &= LTABTYPE_MASK; 
   1.466  
   1.467 -            case L2TAB:
   1.468 -            {
   1.469 -                for ( k = 0; 
   1.470 -                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT); 
   1.471 -                      k++ )
   1.472 -                {
   1.473 -                    if ( ppage[k] & _PAGE_PRESENT )
   1.474 -                    {
   1.475 -                        xpfn = ppage[k] >> PAGE_SHIFT;
   1.476 +            if(pagetype >= L1TAB && pagetype <= L4TAB) { 
   1.477 +                
   1.478 +                /* 
   1.479 +                ** A page table page - need to 'uncanonicalize' it, i.e. 
   1.480 +                ** replace all the references to pfns with the corresponding 
   1.481 +                ** mfns for the new domain. 
   1.482 +                */ 
   1.483 +                if(!uncanonicalize_pagetable(pagetype, page))
   1.484 +                    goto out; 
   1.485  
   1.486 -                        if ( xpfn >= nr_pfns )
   1.487 -                        {
   1.488 -                            ERR("Frame number in type %lu page"
   1.489 -                                " table is out of range. i=%d k=%d "
   1.490 -                                "pfn=%lu nr_pfns=%lu",
   1.491 -                                region_pfn_type[i]>>28, i, k, 
   1.492 -                                xpfn, nr_pfns);
   1.493 -                            goto out;
   1.494 -                        }
   1.495 +            } else if(pagetype != NOTAB) { 
   1.496  
   1.497 -                        ppage[k] &= (PAGE_SIZE - 1) & 
   1.498 -                            ~(_PAGE_GLOBAL | _PAGE_PSE);
   1.499 -                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
   1.500 -                    }
   1.501 -                }
   1.502 -            }
   1.503 -            break;
   1.504 -
   1.505 -            default:
   1.506 -                ERR("Bogus page type %lx page table is "
   1.507 -                    "out of range. i=%d nr_pfns=%lu", 
   1.508 -                    region_pfn_type[i], i, nr_pfns);
   1.509 +                ERR("Bogus page type %lx page table is out of range: "
   1.510 +                    "i=%d max_pfn=%lu", pagetype, i, max_pfn);
   1.511                  goto out;
   1.512  
   1.513 -            } /* end of page type switch statement */
   1.514 +            } 
   1.515 +
   1.516 +
   1.517  
   1.518 -            if ( verify )
   1.519 -            {
   1.520 -                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
   1.521 -                if ( res )
   1.522 -                {
   1.523 +            if (verify) {
   1.524 +
   1.525 +                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
   1.526 +
   1.527 +                if (res) { 
   1.528 +
   1.529                      int v;
   1.530 -                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
   1.531 -                           "actualcs=%08lx\n", pfn, pfn_type[pfn], 
   1.532 -                           csum_page(region_base + i*PAGE_SIZE), 
   1.533 -                           csum_page(buf));
   1.534 -                    for ( v = 0; v < 4; v++ )
   1.535 -                    {
   1.536 -                        unsigned long *p = (unsigned long *)
   1.537 +
   1.538 +                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
   1.539 +                            "actualcs=%08lx\n", pfn, pfn_type[pfn], 
   1.540 +                            csum_page(region_base + i*PAGE_SIZE), 
   1.541 +                            csum_page(buf));
   1.542 +
   1.543 +                    for (v = 0; v < 4; v++) {
   1.544 +                        
   1.545 +                        unsigned long *p = (unsigned long *) 
   1.546                              (region_base + i*PAGE_SIZE);
   1.547 -                        if ( buf[v] != p[v] )
   1.548 -                            printf("    %d: %08lx %08lx\n",
   1.549 -                                   v, buf[v], p[v] );
   1.550 +                        if (buf[v] != p[v])
   1.551 +                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
   1.552                      }
   1.553                  }
   1.554              }
   1.555  
   1.556 -            if ( xc_add_mmu_update(xc_handle, mmu,
   1.557 -                                   (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
   1.558 -                                   pfn) )
   1.559 -            {
   1.560 -                printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
   1.561 +            if (xc_add_mmu_update(xc_handle, mmu, 
   1.562 +                                  (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
   1.563 +                                  pfn)) {
   1.564 +                ERR("machpys mfn=%ld pfn=%ld", mfn, pfn);
   1.565                  goto out;
   1.566              }
   1.567 -
   1.568          } /* end of 'batch' for loop */
   1.569  
   1.570 -        munmap( region_base, j*PAGE_SIZE );
   1.571 -        n+=j; /* crude stats */
   1.572 +        munmap(region_base, j*PAGE_SIZE);
   1.573 +        n+= j; /* crude stats */
   1.574      }
   1.575  
   1.576      DPRINTF("Received all pages\n");
   1.577  
   1.578 -    if ( pt_levels == 3 )
   1.579 -    {
   1.580 +    if (pt_levels == 3) {
   1.581 +
   1.582          /* Get all PGDs below 4GB. */
   1.583 -        for ( i = 0; i < nr_pfns; i++ )
   1.584 -        {
   1.585 -            if ( ((pfn_type[i] & LTABTYPE_MASK) == L3TAB) &&
   1.586 -                 (pfn_to_mfn_table[i] > 0xfffffUL) )
   1.587 -            {
   1.588 -                unsigned long new_mfn = xc_make_page_below_4G(
   1.589 -                    xc_handle, dom, pfn_to_mfn_table[i]);
   1.590 -                if ( new_mfn == 0 )
   1.591 -                {
   1.592 -                    fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
   1.593 +        for (i = 0; i < max_pfn; i++) {
   1.594 +            
   1.595 +            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
   1.596 +
   1.597 +                unsigned long new_mfn; 
   1.598 +
   1.599 +                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
   1.600 +                    ERR("Couldn't get a page below 4GB :-(");
   1.601                      goto out;
   1.602                  }
   1.603 -                pfn_to_mfn_table[i] = new_mfn;
   1.604 -                if ( xc_add_mmu_update(
   1.605 -                    xc_handle, mmu, (new_mfn << PAGE_SHIFT) |
   1.606 -                    MMU_MACHPHYS_UPDATE, i) )
   1.607 -                {
   1.608 -                    fprintf(stderr, "Couldn't m2p on PAE root pgdir\n");
   1.609 +                
   1.610 +                p2m[i] = new_mfn;
   1.611 +                if (xc_add_mmu_update(
   1.612 +                        xc_handle, mmu, 
   1.613 +                        (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
   1.614 +                    ERR("Couldn't m2p on PAE root pgdir");
   1.615                      goto out;
   1.616                  }
   1.617              }
   1.618          }
   1.619 +        
   1.620      }
   1.621  
   1.622 -    if ( xc_finish_mmu_updates(xc_handle, mmu) )
   1.623 +
   1.624 +    if (xc_finish_mmu_updates(xc_handle, mmu)) { 
   1.625 +        ERR("Error doing finish_mmu_updates()"); 
   1.626          goto out;
   1.627 +    } 
   1.628  
   1.629      /*
   1.630       * Pin page tables. Do this after writing to them as otherwise Xen
   1.631       * will barf when doing the type-checking.
   1.632       */
   1.633 -    for ( i = 0; i < nr_pfns; i++ )
   1.634 -    {
   1.635 +    for (i = 0; i < max_pfn; i++) {
   1.636 +
   1.637          if ( (pfn_type[i] & LPINTAB) == 0 )
   1.638              continue;
   1.639 -        if ( pfn_type[i] == (L1TAB|LPINTAB) )
   1.640 +        
   1.641 +        switch(pfn_type[i]) { 
   1.642 +
   1.643 +        case (L1TAB|LPINTAB): 
   1.644              pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
   1.645 -        else /* pfn_type[i] == (L2TAB|LPINTAB) */
   1.646 +            break; 
   1.647 +            
   1.648 +        case (L2TAB|LPINTAB): 
   1.649              pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
   1.650 -        pin[nr_pins].arg1.mfn = pfn_to_mfn_table[i];
   1.651 -        if ( ++nr_pins == MAX_PIN_BATCH )
   1.652 -        {
   1.653 -            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
   1.654 +            break; 
   1.655 +            
   1.656 +        case (L3TAB|LPINTAB): 
   1.657 +            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
   1.658 +            break; 
   1.659 +
   1.660 +        case (L4TAB|LPINTAB):
   1.661 +            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
   1.662 +            break; 
   1.663 +            
   1.664 +        default: 
   1.665 +            continue; 
   1.666 +        }
   1.667 +
   1.668 +        pin[nr_pins].arg1.mfn = p2m[i];
   1.669 +        
   1.670 +        if (++nr_pins == MAX_PIN_BATCH) {
   1.671 +            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) { 
   1.672 +                ERR("Failed to pin batch of %d page tables", nr_pins); 
   1.673                  goto out;
   1.674 +            } 
   1.675 +            DPRINTF("successfully pinned batch of %d page tables", nr_pins); 
   1.676              nr_pins = 0;
   1.677          }
   1.678      }
   1.679 -
   1.680 -    if ( (nr_pins != 0) &&
   1.681 -         (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
   1.682 -        goto out;
   1.683 +    
   1.684 +    if (nr_pins != 0) { 
   1.685 +        if((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) { 
   1.686 +            ERR("Failed (2) to pin batch of %d page tables", nr_pins); 
   1.687 +            DPRINTF("rc is %d\n", rc); 
   1.688 +            goto out;
   1.689 +        }
   1.690 +    }
   1.691  
   1.692      DPRINTF("\b\b\b\b100%%\n");
   1.693      DPRINTF("Memory reloaded.\n");
   1.694 @@ -445,111 +493,115 @@ int xc_linux_restore(int xc_handle, int 
   1.695          unsigned long *pfntab;
   1.696          int rc;
   1.697  
   1.698 -        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
   1.699 -        {
   1.700 +        if (!read_exact(io_fd, &count, sizeof(count))) { 
   1.701              ERR("Error when reading pfn count");
   1.702              goto out;
   1.703          }
   1.704  
   1.705 -        pfntab = malloc( sizeof(unsigned int) * count );
   1.706 -        if ( pfntab == NULL )
   1.707 -        {
   1.708 +        if(!(pfntab = malloc(sizeof(unsigned long) * count))) { 
   1.709              ERR("Out of memory");
   1.710              goto out;
   1.711          }
   1.712 -
   1.713 -        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
   1.714 -             sizeof(unsigned int)*count )
   1.715 -        {
   1.716 +        
   1.717 +        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) { 
   1.718              ERR("Error when reading pfntab");
   1.719              goto out;
   1.720          }
   1.721  
   1.722 -        for ( i = 0; i < count; i++ )
   1.723 -        {
   1.724 +        for (i = 0; i < count; i++) {
   1.725 +
   1.726              unsigned long pfn = pfntab[i];
   1.727 -            pfntab[i]=pfn_to_mfn_table[pfn];
   1.728 -            pfn_to_mfn_table[pfn] = 0x80000001;  // not in pmap
   1.729 +
   1.730 +            if(pfn > max_pfn) 
   1.731 +                /* shouldn't happen - continue optimistically */
   1.732 +                continue; 
   1.733 +
   1.734 +            pfntab[i] = p2m[pfn];
   1.735 +            p2m[pfn]  = 0x80000001;  // not in pmap
   1.736          }
   1.737 +        
   1.738 +        if (count > 0) {
   1.739  
   1.740 -        if ( count > 0 )
   1.741 -        {
   1.742              struct xen_memory_reservation reservation = {
   1.743                  .extent_start = pfntab,
   1.744                  .nr_extents   = count,
   1.745                  .extent_order = 0,
   1.746                  .domid        = dom
   1.747              };
   1.748 -            if ( (rc = xc_memory_op(xc_handle,
   1.749 -                                    XENMEM_decrease_reservation,
   1.750 -                                    &reservation)) != count )
   1.751 -            {
   1.752 -                ERR("Could not decrease reservation : %d",rc);
   1.753 +
   1.754 +            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
   1.755 +                                   &reservation)) != count) { 
   1.756 +                ERR("Could not decrease reservation : %d", rc);
   1.757                  goto out;
   1.758 -            }
   1.759 -            else
   1.760 -            {
   1.761 -                printf("Decreased reservation by %d pages\n", count);
   1.762 -            }
   1.763 +            } else
   1.764 +                DPRINTF("Decreased reservation by %d pages\n", count);
   1.765          } 
   1.766      }
   1.767  
   1.768 -    if ( read_exact(io_fd, &ctxt,            sizeof(ctxt)) != sizeof(ctxt) ||
   1.769 -         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
   1.770 -    {
   1.771 +    if (!read_exact(io_fd, &ctxt, sizeof(ctxt)) || 
   1.772 +        !read_exact(io_fd, shared_info_page, PAGE_SIZE)) { 
   1.773          ERR("Error when reading ctxt or shared info page");
   1.774          goto out;
   1.775      }
   1.776  
   1.777      /* Uncanonicalise the suspend-record frame number and poke resume rec. */
   1.778      pfn = ctxt.user_regs.edx;
   1.779 -    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
   1.780 -    {
   1.781 +    if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
   1.782          ERR("Suspend record frame number is bad");
   1.783          goto out;
   1.784      }
   1.785 -    ctxt.user_regs.edx = mfn = pfn_to_mfn_table[pfn];
   1.786 +    ctxt.user_regs.edx = mfn = p2m[pfn];
   1.787      start_info = xc_map_foreign_range(
   1.788          xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
   1.789 -    start_info->nr_pages    = nr_pfns;
   1.790 +    start_info->nr_pages    = max_pfn;
   1.791      start_info->shared_info = shared_info_frame << PAGE_SHIFT;
   1.792      start_info->flags       = 0;
   1.793 -    *store_mfn = start_info->store_mfn   =
   1.794 -        pfn_to_mfn_table[start_info->store_mfn];
   1.795 -    start_info->store_evtchn = store_evtchn;
   1.796 -    *console_mfn = start_info->console_mfn   =
   1.797 -        pfn_to_mfn_table[start_info->console_mfn];
   1.798 -    start_info->console_evtchn = console_evtchn;
   1.799 +    *store_mfn = start_info->store_mfn       = p2m[start_info->store_mfn];
   1.800 +    start_info->store_evtchn                 = store_evtchn;
   1.801 +    *console_mfn = start_info->console_mfn   = p2m[start_info->console_mfn];
   1.802 +    start_info->console_evtchn               = console_evtchn;
   1.803      munmap(start_info, PAGE_SIZE);
   1.804  
   1.805      /* Uncanonicalise each GDT frame number. */
   1.806 -    if ( ctxt.gdt_ents > 8192 )
   1.807 -    {
   1.808 +    if (ctxt.gdt_ents > 8192) {
   1.809          ERR("GDT entry count out of range");
   1.810          goto out;
   1.811      }
   1.812  
   1.813 -    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
   1.814 -    {
   1.815 +    for (i = 0; i < ctxt.gdt_ents; i += 512) {
   1.816          pfn = ctxt.gdt_frames[i];
   1.817 -        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
   1.818 -        {
   1.819 +        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
   1.820              ERR("GDT frame number is bad");
   1.821              goto out;
   1.822          }
   1.823 -        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
   1.824 +        ctxt.gdt_frames[i] = p2m[pfn];
   1.825      }
   1.826  
   1.827      /* Uncanonicalise the page table base pointer. */
   1.828      pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
   1.829 -    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
   1.830 -    {
   1.831 -        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
   1.832 -               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
   1.833 +
   1.834 +    if (pfn >= max_pfn) {
   1.835 +        DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx\n",
   1.836 +                pfn, max_pfn, pfn_type[pfn]); 
   1.837          ERR("PT base is bad.");
   1.838          goto out;
   1.839      }
   1.840 -    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;
   1.841 +
   1.842 +    if ((pt_levels == 2) && ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB)) { 
   1.843 +        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
   1.844 +                pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
   1.845 +        ERR("PT base is bad.");
   1.846 +        goto out;
   1.847 +    }
   1.848 +
   1.849 +    if ((pt_levels == 3) && ((pfn_type[pfn]&LTABTYPE_MASK) != L3TAB)) { 
   1.850 +        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
   1.851 +                pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
   1.852 +        ERR("PT base is bad.");
   1.853 +        goto out;
   1.854 +    }
   1.855 +    
   1.856 +    ctxt.ctrlreg[3] = p2m[pfn] << PAGE_SHIFT;
   1.857  
   1.858      /* clear any pending events and the selector */
   1.859      memset(&(shared_info->evtchn_pending[0]), 0,
   1.860 @@ -558,40 +610,31 @@ int xc_linux_restore(int xc_handle, int 
   1.861          shared_info->vcpu_data[i].evtchn_pending_sel = 0;
   1.862  
   1.863      /* Copy saved contents of shared-info page. No checking needed. */
   1.864 -    ppage = xc_map_foreign_range(
   1.865 +    page = xc_map_foreign_range(
   1.866          xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
   1.867 -    memcpy(ppage, shared_info, sizeof(shared_info_t));
   1.868 -    munmap(ppage, PAGE_SIZE);
   1.869 -
   1.870 +    memcpy(page, shared_info, sizeof(shared_info_t));
   1.871 +    munmap(page, PAGE_SIZE);
   1.872 +    
   1.873      /* Uncanonicalise the pfn-to-mfn table frame-number list. */
   1.874 -    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
   1.875 -    {
   1.876 -        unsigned long pfn, mfn;
   1.877 -
   1.878 -        pfn = pfn_to_mfn_frame_list[i];
   1.879 -        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
   1.880 -        {
   1.881 +    for (i = 0; i < P2M_FL_ENTRIES; i++) {
   1.882 +        pfn = p2m_frame_list[i];
   1.883 +        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
   1.884              ERR("PFN-to-MFN frame number is bad");
   1.885              goto out;
   1.886          }
   1.887 -        mfn = pfn_to_mfn_table[pfn];
   1.888 -        pfn_to_mfn_frame_list[i] = mfn;
   1.889 +
   1.890 +        p2m_frame_list[i] = p2m[pfn];
   1.891      }
   1.892      
   1.893 -    if ( (live_pfn_to_mfn_table = 
   1.894 -          xc_map_foreign_batch(xc_handle, dom, 
   1.895 -                               PROT_WRITE,
   1.896 -                               pfn_to_mfn_frame_list,
   1.897 -                               (nr_pfns+1023)/1024 )) == 0 )
   1.898 -    {
   1.899 -        ERR("Couldn't map pfn_to_mfn table");
   1.900 +    /* Copy the P2M we've constructed to the 'live' P2M */
   1.901 +    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE, 
   1.902 +                                          p2m_frame_list, P2M_FL_ENTRIES))) {
   1.903 +        ERR("Couldn't map p2m table");
   1.904          goto out;
   1.905      }
   1.906  
   1.907 -    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table, 
   1.908 -           nr_pfns*sizeof(unsigned long) );
   1.909 -
   1.910 -    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);
   1.911 +    memcpy(live_p2m, p2m, P2M_SIZE); 
   1.912 +    munmap(live_p2m, P2M_SIZE); 
   1.913  
   1.914      /*
   1.915       * Safety checking of saved context:
   1.916 @@ -605,25 +648,23 @@ int xc_linux_restore(int xc_handle, int 
   1.917       *  8. debugregs are checked by Xen.
   1.918       *  9. callback code selectors need checking.
   1.919       */
   1.920 -    for ( i = 0; i < 256; i++ )
   1.921 -    {
   1.922 +    for ( i = 0; i < 256; i++ ) {
   1.923          ctxt.trap_ctxt[i].vector = i;
   1.924 -        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
   1.925 +        if ((ctxt.trap_ctxt[i].cs & 3) == 0)
   1.926              ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
   1.927      }
   1.928 -    if ( (ctxt.kernel_ss & 3) == 0 )
   1.929 +    if ((ctxt.kernel_ss & 3) == 0)
   1.930          ctxt.kernel_ss = FLAT_KERNEL_DS;
   1.931  #if defined(__i386__)
   1.932 -    if ( (ctxt.event_callback_cs & 3) == 0 )
   1.933 +    if ((ctxt.event_callback_cs & 3) == 0)
   1.934          ctxt.event_callback_cs = FLAT_KERNEL_CS;
   1.935 -    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
   1.936 +    if ((ctxt.failsafe_callback_cs & 3) == 0)
   1.937          ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
   1.938  #endif
   1.939 -    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
   1.940 -         (ctxt.ldt_ents > 8192) ||
   1.941 -         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
   1.942 -         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
   1.943 -    {
   1.944 +    if (((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
   1.945 +        (ctxt.ldt_ents > 8192) ||
   1.946 +        (ctxt.ldt_base > hvirt_start) ||
   1.947 +        ((ctxt.ldt_base + ctxt.ldt_ents*8) > hvirt_start)) {
   1.948          ERR("Bad LDT base or size");
   1.949          goto out;
   1.950      }
   1.951 @@ -636,8 +677,7 @@ int xc_linux_restore(int xc_handle, int 
   1.952      op.u.setdomaininfo.ctxt   = &ctxt;
   1.953      rc = xc_dom0_op(xc_handle, &op);
   1.954  
   1.955 -    if ( rc != 0 )
   1.956 -    {
   1.957 +    if (rc != 0) {
   1.958          ERR("Couldn't build the domain");
   1.959          goto out;
   1.960      }
   1.961 @@ -646,9 +686,10 @@ int xc_linux_restore(int xc_handle, int 
   1.962      if ( (rc != 0) && (dom != 0) )
   1.963          xc_domain_destroy(xc_handle, dom);
   1.964      free(mmu);
   1.965 -    free(pfn_to_mfn_table);
   1.966 +    free(p2m);
   1.967      free(pfn_type);
   1.968  
   1.969      DPRINTF("Restore exit with rc=%d\n", rc);
   1.970 +
   1.971      return rc;
   1.972  }
     2.1 --- a/tools/libxc/xc_linux_save.c	Tue Nov 08 18:39:58 2005 +0100
     2.2 +++ b/tools/libxc/xc_linux_save.c	Tue Nov 08 18:42:07 2005 +0100
     2.3 @@ -13,10 +13,7 @@
     2.4  #include <sys/time.h>
     2.5  
     2.6  #include "xg_private.h"
     2.7 -
     2.8 -#define BATCH_SIZE 1024   /* 1024 pages (4MB) at a time */
     2.9 -
    2.10 -#define MAX_MBIT_RATE 500
    2.11 +#include "xg_save_restore.h"
    2.12  
    2.13  /*
    2.14  ** Default values for important tuning parameters. Can override by passing
    2.15 @@ -25,75 +22,77 @@
    2.16  ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too. 
    2.17  ** 
    2.18  */
    2.19 -#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */ 
    2.20 -#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */
    2.21 +#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */ 
    2.22 +#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
    2.23 +
    2.24  
    2.25 -/* Flags to control behaviour of xc_linux_save */
    2.26 -#define XCFLAGS_LIVE      1
    2.27 -#define XCFLAGS_DEBUG     2
    2.28 +/* max mfn of the whole machine */
    2.29 +static uint32_t max_mfn; 
    2.30  
    2.31 -#define DEBUG 0
    2.32 +/* virtual starting address of the hypervisor */
    2.33 +static uint32_t hvirt_start; 
    2.34  
    2.35 -#if 1
    2.36 -#define ERR(_f, _a...) do { fprintf(stderr, _f "\n" , ## _a); fflush(stderr); } while (0)
    2.37 -#else
    2.38 -#define ERR(_f, _a...) ((void)0)
    2.39 -#endif
   2.40 +/* #levels of page tables used by the current guest */
    2.41 +static uint32_t pt_levels; 
    2.42 +
    2.43 +/* total number of pages used by the current guest */
    2.44 +static unsigned long max_pfn;
    2.45  
    2.46 -#if DEBUG
    2.47 -#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
    2.48 -#else
    2.49 -#define DPRINTF(_f, _a...) ((void)0)
    2.50 -#endif
    2.51 +/* Live mapping of the table mapping each PFN to its current MFN. */
    2.52 +static unsigned long *live_p2m = NULL;
    2.53  
    2.54 -#define PROGRESS 0
    2.55 -#if PROGRESS
    2.56 -#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
    2.57 -#else
    2.58 -#define PPRINTF(_f, _a...)
    2.59 -#endif
    2.60 +/* Live mapping of system MFN to PFN table. */
    2.61 +static unsigned long *live_m2p = NULL;
    2.62 +
    2.63  
    2.64  /*
    2.65   * Returns TRUE if the given machine frame number has a unique mapping
    2.66   * in the guest's pseudophysical map.
    2.67   */
    2.68 -
    2.69 -#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    2.70 -    (((_mfn) < (1024*1024)) &&                                            \
    2.71 -     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                         \
    2.72 -       (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))
    2.73 -
    2.74 +#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    2.75 +(((_mfn) < (max_mfn)) &&                        \
    2.76 + ((live_m2p[_mfn] < (max_pfn)) &&               \
    2.77 +  (live_p2m[live_m2p[_mfn]] == (_mfn))))
    2.78 +    
    2.79   
    2.80  /* Returns TRUE if MFN is successfully converted to a PFN. */
    2.81 -#define translate_mfn_to_pfn(_pmfn)            \
    2.82 -({                                             \
    2.83 -    unsigned long mfn = *(_pmfn);              \
    2.84 -    int _res = 1;                              \
    2.85 -    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )      \
    2.86 -        _res = 0;                              \
    2.87 -    else                                       \
    2.88 -        *(_pmfn) = live_mfn_to_pfn_table[mfn]; \
    2.89 -    _res;                                      \
    2.90 +#define translate_mfn_to_pfn(_pmfn)                             \
    2.91 +({                                                              \
    2.92 +    unsigned long mfn = *(_pmfn);                               \
    2.93 +    int _res = 1;                                               \
    2.94 +    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
    2.95 +        _res = 0;                                               \
    2.96 +    else                                                        \
    2.97 +        *(_pmfn) = live_m2p[mfn];                               \
    2.98 +    _res;                                                       \
    2.99  })
   2.100  
   2.101 -#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
   2.102 +/* 
   2.103 +** During (live) save/migrate, we maintain a number of bitmaps to track 
   2.104 +** which pages we have to send, to fixup, and to skip. 
   2.105 +*/
   2.106 +
   2.107 +#define BITS_PER_LONG (sizeof(unsigned long) * 8) 
   2.108 +#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / BITS_PER_LONG)
   2.109  
   2.110 -static inline int test_bit ( int nr, volatile void * addr)
   2.111 +#define BITMAP_ENTRY(_nr,_bmap) \
   2.112 +   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
   2.113 +
   2.114 +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
   2.115 +
   2.116 +static inline int test_bit (int nr, volatile void * addr)
   2.117  {
   2.118 -    return (((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> 
   2.119 -            (nr % (sizeof(unsigned long)*8))) & 1;
   2.120 +    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; 
   2.121  }
   2.122  
   2.123 -static inline void clear_bit ( int nr, volatile void * addr)
   2.124 +static inline void clear_bit (int nr, volatile void * addr)
   2.125  {
   2.126 -    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &= 
   2.127 -        ~(1 << (nr % (sizeof(unsigned long)*8) ) );
   2.128 +    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr)); /* word-wide mask */
   2.129  }
   2.130  
   2.131  static inline void set_bit ( int nr, volatile void * addr)
   2.132  {
   2.133 -    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |= 
   2.134 -        (1 << (nr % (sizeof(unsigned long)*8) ) );
   2.135 +    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr)); /* word-wide mask */
   2.136  }
   2.137  
   2.138  /* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
   2.139 @@ -142,102 +141,106 @@ static inline int permute( int i, int nr
   2.140  
   2.141      do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
   2.142      while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
   2.143 -
   2.144 +    
   2.145      return i;
   2.146  }
   2.147  
   2.148 -static long long tv_to_us( struct timeval *new )
   2.149 +
   2.150 +
   2.151 +
   2.152 +static uint64_t tv_to_us(struct timeval *new)
   2.153  {
   2.154      return (new->tv_sec * 1000000) + new->tv_usec;
   2.155  }
   2.156  
   2.157 -static long long llgettimeofday( void )
   2.158 +static uint64_t llgettimeofday(void)
   2.159  {
   2.160      struct timeval now;
   2.161      gettimeofday(&now, NULL);
   2.162      return tv_to_us(&now);
   2.163  }
   2.164  
   2.165 -static long long tv_delta( struct timeval *new, struct timeval *old )
   2.166 +static uint64_t tv_delta(struct timeval *new, struct timeval *old)
   2.167  {
   2.168      return ((new->tv_sec - old->tv_sec)*1000000 ) + 
   2.169          (new->tv_usec - old->tv_usec);
   2.170  }
   2.171  
   2.172  
   2.173 -#define START_MBIT_RATE 0 //ioctxt->resource
   2.174 +#ifdef ADAPTIVE_SAVE
   2.175 +
   2.176 +
   2.177 +/*
   2.178 +** We control the rate at which we transmit (or save) to minimize impact
   2.179 +** on running domains (including the target if we're doing live migrate). 
   2.180 +*/
   2.181  
   2.182 -static int mbit_rate, ombit_rate = 0;
   2.183 -static int burst_time_us = -1;
   2.184 +#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
   2.185 +#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
   2.186 +
   2.187  
   2.188 -#define MBIT_RATE mbit_rate
   2.189 +/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
   2.190 +#define RATE_TO_BTU      781250
   2.191 +
   2.192 +/* Amount in bytes we allow ourselves to send in a burst */
   2.193  #define BURST_BUDGET (100*1024)
   2.194  
   2.195 -/* 
   2.196 -   1000000/((100)*1024*1024/8/(100*1024))
   2.197 -   7812
   2.198 -   1000000/((100)*1024/8/(100))
   2.199 -   7812
   2.200 -   1000000/((100)*128/(100))
   2.201 -   7812
   2.202 -   100000000/((100)*128)
   2.203 -   7812
   2.204 -   100000000/128
   2.205 -   781250
   2.206 - */
   2.207 -#define RATE_TO_BTU 781250
   2.208 -#define BURST_TIME_US burst_time_us
   2.209 +
   2.210 +/* We keep track of the current and previous transmission rate */
   2.211 +static int mbit_rate, ombit_rate = 0;
   2.212 +
   2.213 +/* Have we reached the maximum transmission rate? */
   2.214 +#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE) 
   2.215 +
   2.216  
   2.217 -static int
   2.218 -ratewrite(int io_fd, void *buf, int n)
   2.219 +static inline void initialize_mbit_rate(void) 
   2.220 +{
   2.221 +    mbit_rate = START_MBIT_RATE;  /* begin at the initial transmit rate */
   2.222 +}
   2.223 +
   2.224 +
   2.225 +static int ratewrite(int io_fd, void *buf, int n)
   2.226  {
   2.227      static int budget = 0;
   2.228 +    static int burst_time_us = -1;
   2.229      static struct timeval last_put = { 0 };
   2.230      struct timeval now;
   2.231      struct timespec delay;
   2.232      long long delta;
   2.233  
   2.234 -    if ( START_MBIT_RATE == 0 )
   2.235 +    if (START_MBIT_RATE == 0)
   2.236          return write(io_fd, buf, n);
   2.237      
   2.238      budget -= n;
   2.239 -    if ( budget < 0 )
   2.240 -    {
   2.241 -        if ( MBIT_RATE != ombit_rate )
   2.242 -        {
   2.243 -            BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
   2.244 -            ombit_rate = MBIT_RATE;
   2.245 +    if (budget < 0) {
   2.246 +        if (mbit_rate != ombit_rate) {
   2.247 +            burst_time_us = RATE_TO_BTU / mbit_rate;
   2.248 +            ombit_rate = mbit_rate;
   2.249              DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
   2.250 -                    MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
   2.251 +                    mbit_rate, BURST_BUDGET, burst_time_us);
   2.252          }
   2.253 -        if ( last_put.tv_sec == 0 )
   2.254 -        {
   2.255 +        if (last_put.tv_sec == 0) {
   2.256              budget += BURST_BUDGET;
   2.257              gettimeofday(&last_put, NULL);
   2.258 -        }
   2.259 -        else
   2.260 -        {
   2.261 -            while ( budget < 0 )
   2.262 -            {
   2.263 +        } else {
   2.264 +            while (budget < 0) {
   2.265                  gettimeofday(&now, NULL);
   2.266                  delta = tv_delta(&now, &last_put);
   2.267 -                while ( delta > BURST_TIME_US )
   2.268 -                {
   2.269 +                while (delta > burst_time_us) {
   2.270                      budget += BURST_BUDGET;
   2.271 -                    last_put.tv_usec += BURST_TIME_US;
   2.272 -                    if ( last_put.tv_usec > 1000000 )
   2.273 -                    {
   2.274 +                    last_put.tv_usec += burst_time_us;
   2.275 +                    if (last_put.tv_usec > 1000000) {
   2.276                          last_put.tv_usec -= 1000000;
   2.277                          last_put.tv_sec++;
   2.278                      }
   2.279 -                    delta -= BURST_TIME_US;
   2.280 +                    delta -= burst_time_us;
   2.281                  }
   2.282 -                if ( budget > 0 )
   2.283 +                if (budget > 0)
   2.284                      break;
   2.285                  delay.tv_sec = 0;
   2.286 -                delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
   2.287 -                while ( delay.tv_nsec > 0 )
   2.288 -                    if ( nanosleep(&delay, &delay) == 0 )
   2.289 +                delay.tv_nsec = 1000 * (burst_time_us - delta);
   2.290 +                while (delay.tv_nsec > 0)
   2.291 +                    if (nanosleep(&delay, &delay) == 0)
   2.292                          break;
   2.293              }
   2.294          }
   2.295 @@ -245,35 +248,52 @@ ratewrite(int io_fd, void *buf, int n)
   2.296      return write(io_fd, buf, n);
   2.297  }
   2.298  
   2.299 -static int print_stats( int xc_handle, uint32_t domid, 
   2.300 -                        int pages_sent, xc_shadow_control_stats_t *stats,
   2.301 -                        int print )
   2.302 +#else /* ! ADAPTIVE SAVE */
   2.303 +
   2.304 +#define RATE_IS_MAX() (0) 
   2.305 +#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n)) 
   2.306 +#define initialize_mbit_rate() 
   2.307 +
   2.308 +#endif
   2.309 +
   2.310 +
   2.311 +static inline ssize_t write_exact(int fd, void *buf, size_t count)
   2.312 +{
   2.313 +    ssize_t n = 0;  /* write() may be short; loop until all is sent */
   2.314 +    for (; count > 0 && (n = write(fd, buf, count)) > 0; count -= n)
   2.315 +        buf = (char *)buf + n;
   2.316 +    return count == 0;  /* 1 on full write, 0 on error or no progress */
   2.317 +}
   2.318 +
   2.319 +
   2.320 +static int print_stats(int xc_handle, uint32_t domid, int pages_sent, 
   2.321 +                       xc_shadow_control_stats_t *stats, int print)
   2.322  {
   2.323      static struct timeval wall_last;
   2.324      static long long      d0_cpu_last;
   2.325      static long long      d1_cpu_last;
   2.326 -
   2.327 +    
   2.328      struct timeval        wall_now;
   2.329      long long             wall_delta;
   2.330      long long             d0_cpu_now, d0_cpu_delta;
   2.331      long long             d1_cpu_now, d1_cpu_delta;
   2.332 -
   2.333 +    
   2.334      gettimeofday(&wall_now, NULL);
   2.335 -
   2.336 +    
   2.337      d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
   2.338      d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
   2.339  
   2.340      if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 
   2.341          fprintf(stderr, "ARRHHH!!\n");
   2.342 -
   2.343 +    
   2.344      wall_delta = tv_delta(&wall_now,&wall_last)/1000;
   2.345 -
   2.346 -    if ( wall_delta == 0 ) wall_delta = 1;
   2.347 +    
   2.348 +    if (wall_delta == 0) wall_delta = 1;
   2.349 +    
   2.350 +    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
   2.351 +    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
   2.352  
   2.353 -    d0_cpu_delta  = (d0_cpu_now - d0_cpu_last)/1000;
   2.354 -    d1_cpu_delta  = (d1_cpu_now - d1_cpu_last)/1000;
   2.355 -
   2.356 -    if ( print )
   2.357 +    if (print)
   2.358          fprintf(stderr,
   2.359                  "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
   2.360                  "dirtied %dMb/s %" PRId32 " pages\n",
   2.361 @@ -284,23 +304,25 @@ static int print_stats( int xc_handle, u
   2.362                  (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
   2.363                  stats->dirty_count);
   2.364  
   2.365 -    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
   2.366 -    {
   2.367 +#ifdef ADAPTIVE_SAVE    
   2.368 +    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
   2.369          mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
   2.370              + 50;
   2.371          if (mbit_rate > MAX_MBIT_RATE)
   2.372              mbit_rate = MAX_MBIT_RATE;
   2.373      }
   2.374 -
   2.375 -    d0_cpu_last  = d0_cpu_now;
   2.376 -    d1_cpu_last  = d1_cpu_now;
   2.377 -    wall_last = wall_now; 
   2.378 +#endif
   2.379 +    
   2.380 +    d0_cpu_last = d0_cpu_now;
   2.381 +    d1_cpu_last = d1_cpu_now;
   2.382 +    wall_last   = wall_now; 
   2.383  
   2.384      return 0;
   2.385  }
   2.386  
   2.387 -static int analysis_phase( int xc_handle, uint32_t domid, 
   2.388 -                           int nr_pfns, unsigned long *arr, int runs )
   2.389 +
   2.390 +static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn, 
   2.391 +                          unsigned long *arr, int runs)
   2.392  {
   2.393      long long start, now;
   2.394      xc_shadow_control_stats_t stats;
   2.395 @@ -308,22 +330,18 @@ static int analysis_phase( int xc_handle
   2.396  
   2.397      start = llgettimeofday();
   2.398  
   2.399 -    for ( j = 0; j < runs; j++ )
   2.400 -    {
   2.401 +    for (j = 0; j < runs; j++) {
   2.402          int i;
   2.403 -
   2.404 -        xc_shadow_control( xc_handle, domid, 
   2.405 -                           DOM0_SHADOW_CONTROL_OP_CLEAN,
   2.406 -                           arr, nr_pfns, NULL);
   2.407 +        
   2.408 +        xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
   2.409 +                          arr, max_pfn, NULL);
   2.410          fprintf(stderr, "#Flush\n");
   2.411 -        for ( i = 0; i < 40; i++ )
   2.412 -        {     
   2.413 +        for ( i = 0; i < 40; i++ ) {     
   2.414              usleep(50000);     
   2.415              now = llgettimeofday();
   2.416 -            xc_shadow_control( xc_handle, domid, 
   2.417 -                               DOM0_SHADOW_CONTROL_OP_PEEK,
   2.418 -                               NULL, 0, &stats);
   2.419 -
   2.420 +            xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
   2.421 +                              NULL, 0, &stats);
   2.422 +            
   2.423              fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
   2.424                      " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", 
   2.425                      ((now-start)+500)/1000, 
   2.426 @@ -331,7 +349,7 @@ static int analysis_phase( int xc_handle
   2.427                      stats.dirty_net_count, stats.dirty_block_count);
   2.428          }
   2.429      }
   2.430 -
   2.431 +    
   2.432      return -1;
   2.433  }
   2.434  
   2.435 @@ -345,67 +363,150 @@ static int suspend_and_state(int xc_hand
   2.436  
   2.437      printf("suspend\n");
   2.438      fflush(stdout);
   2.439 -    if ( fgets(ans, sizeof(ans), stdin) == NULL )
   2.440 -    {
   2.441 +    if (fgets(ans, sizeof(ans), stdin) == NULL) {
   2.442          ERR("failed reading suspend reply");
   2.443          return -1;
   2.444      }
   2.445 -    if ( strncmp(ans, "done\n", 5) )
   2.446 -    {
   2.447 +    if (strncmp(ans, "done\n", 5)) {
   2.448          ERR("suspend reply incorrect: %s", ans);
   2.449          return -1;
   2.450      }
   2.451  
   2.452   retry:
   2.453  
   2.454 -    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
   2.455 -    {
   2.456 +    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
   2.457          ERR("Could not get domain info");
   2.458          return -1;
   2.459      }
   2.460  
   2.461 -    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, 
   2.462 -                                    ctxt) )
   2.463 -    {
   2.464 +    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt)) 
   2.465          ERR("Could not get vcpu context");
   2.466 -    }
   2.467 +
   2.468  
   2.469 -    if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
   2.470 -    {
   2.471 +    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
   2.472          return 0; // success
   2.473 -    }
   2.474  
   2.475 -    if ( info->paused )
   2.476 -    {
   2.477 +    if (info->paused) {
   2.478          // try unpausing domain, wait, and retest 
   2.479          xc_domain_unpause( xc_handle, dom );
   2.480 -
   2.481 +        
   2.482          ERR("Domain was paused. Wait and re-test.");
   2.483          usleep(10000);  // 10ms
   2.484 -
   2.485 +        
   2.486          goto retry;
   2.487      }
   2.488  
   2.489  
   2.490 -    if( ++i < 100 )
   2.491 -    {
   2.492 +    if( ++i < 100 ) {
   2.493          ERR("Retry suspend domain.");
   2.494          usleep(10000);  // 10ms 
   2.495          goto retry;
   2.496      }
   2.497 -
   2.498 +    
   2.499      ERR("Unable to suspend domain.");
   2.500  
   2.501      return -1;
   2.502  }
   2.503  
   2.504 +
   2.505 +/*
   2.506 +** During transfer (or in the state file), all page-table pages must be  
   2.507 +** converted into a 'canonical' form where references to actual mfns 
   2.508 +** are replaced with references to the corresponding pfns. 
   2.509 +**
   2.510 +** This function performs the appropriate conversion, taking into account 
   2.511 +** which entries do not require canonicalization (in particular, those 
   2.512 +** entries which map the virtual address reserved for the hypervisor). 
   2.513 +*/
   2.514 +void canonicalize_pagetable(unsigned long type, unsigned long pfn, 
   2.515 +                             const void *spage, void *dpage) 
   2.516 +{ 
   2.517 +    
   2.518 +    int i, pte_last, xen_start, xen_end;
   2.519 +    uint64_t pte;
   2.520 +
   2.521 +    /* 
   2.522 +    ** We need to determine which entries in this page table hold
   2.523 +    ** reserved hypervisor mappings. This depends on the current
   2.524 +    ** page table type as well as the number of paging levels. 
   2.525 +    */
   2.526 +    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
   2.527 +    
   2.528 +    if (pt_levels == 2 && type == L2TAB)
   2.529 +        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
   2.530 +
   2.531 +    if (pt_levels == 3 && type == L3TAB) 
   2.532 +        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
   2.533 +        
   2.534 +    /* 
   2.535 +    ** in PAE only the L2 mapping the top 1GB contains Xen mappings. 
   2.536 +    ** We can spot this by looking for the guest linear mapping which
   2.537 +    ** Xen always ensures is present in that L2. Guests must ensure 
   2.538 +    ** that this check will fail for other L2s. 
   2.539 +    */
   2.540 +    if (pt_levels == 3 && type == L2TAB) {
   2.541 +
   2.542 +/* XXX index of the L2 entry in PAE mode which holds the guest LPT */
   2.543 +#define PAE_GLPT_L2ENTRY (495) 
   2.544 +        pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY]; 
   2.545 +
   2.546 +        if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
   2.547 +            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; 
   2.548 +    }
   2.549 +
   2.550 +    /* Now iterate through the page table, canonicalizing each PTE */
   2.551 +    for (i = 0; i < pte_last; i++ ) {
   2.552 +
   2.553 +        unsigned long pfn, mfn; 
   2.554 +        
   2.555 +        if (pt_levels == 2)
   2.556 +            pte = ((uint32_t*)spage)[i];
   2.557 +        else
   2.558 +            pte = ((uint64_t*)spage)[i];
   2.559 +        
   2.560 +        if (i >= xen_start && i < xen_end)
   2.561 +            pte = 0;
   2.562 +        
   2.563 +        if (pte & _PAGE_PRESENT) {
   2.564 +            
   2.565 +            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;      
   2.566 +            pfn = live_m2p[mfn];
   2.567 +            
   2.568 +            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
   2.569 +                /* I don't think this should ever happen */
   2.570 +                DPRINTF("FNI: [%08lx,%d] pte=%llx,"
   2.571 +                        " mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
   2.572 +                        type, i, (uint64_t)pte, mfn, 
   2.573 +                        live_m2p[mfn],
   2.574 +                        (live_m2p[mfn] < max_pfn) ? 
   2.575 +                        live_p2m[live_m2p[mfn]] : 0xdeadbeaf);
   2.576 +                
   2.577 +                pfn = 0; /* be suspicious */
   2.578 +            }
   2.579 +            
   2.580 +            pte &= 0xffffff0000000fffULL;
   2.581 +            pte |= (uint64_t)pfn << PAGE_SHIFT;
   2.582 +        }
   2.583 +        
   2.584 +        if (pt_levels == 2)
   2.585 +            ((uint32_t*)dpage)[i] = pte;
   2.586 +        else
   2.587 +            ((uint64_t*)dpage)[i] = pte;		       
   2.588 +        
   2.589 +    } 
   2.590 +    
   2.591 +    return; 
   2.592 +}
   2.593 +
   2.594 +
   2.595 +
   2.596 +
   2.597  int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 
   2.598                    uint32_t max_factor, uint32_t flags)
   2.599  {
   2.600      xc_dominfo_t info;
   2.601  
   2.602 -    int rc = 1, i, j, k, last_iter, iter = 0;
   2.603 -    unsigned long mfn;
   2.604 +    int rc = 1, i, j, last_iter, iter = 0;
   2.605      int live  = (flags & XCFLAGS_LIVE); 
   2.606      int debug = (flags & XCFLAGS_DEBUG); 
   2.607      int sent_last_iter, skip_this_iter;
   2.608 @@ -421,18 +522,16 @@ int xc_linux_save(int xc_handle, int io_
   2.609      unsigned long *pfn_batch = NULL;
   2.610  
   2.611      /* A temporary mapping, and a copy, of one frame of guest memory. */
   2.612 -    unsigned long page[1024];
   2.613 +    char page[PAGE_SIZE]; 
   2.614 +
   2.615 +    /* Double and single indirect references to the live P2M table */
   2.616 +    unsigned long *live_p2m_frame_list_list = NULL;
   2.617 +    unsigned long *live_p2m_frame_list = NULL;
   2.618  
   2.619      /* A copy of the pfn-to-mfn table frame list. */
   2.620 -    unsigned long *live_pfn_to_mfn_frame_list_list = NULL;
   2.621 -    unsigned long *live_pfn_to_mfn_frame_list = NULL;
   2.622 -    unsigned long pfn_to_mfn_frame_list[1024];
   2.623 +    unsigned long *p2m_frame_list = NULL;
   2.624  
   2.625 -    /* Live mapping of the table mapping each PFN to its current MFN. */
   2.626 -    unsigned long *live_pfn_to_mfn_table = NULL;
   2.627 -    /* Live mapping of system MFN to PFN table. */
   2.628 -    unsigned long *live_mfn_to_pfn_table = NULL;
   2.629 -    unsigned long mfn_to_pfn_table_start_mfn;
   2.630 +    unsigned long m2p_start_mfn;
   2.631      
   2.632      /* Live mapping of shared info structure */
   2.633      shared_info_t *live_shinfo = NULL;
   2.634 @@ -440,10 +539,9 @@ int xc_linux_save(int xc_handle, int io_
   2.635      /* base of the region in which domain memory is mapped */
   2.636      unsigned char *region_base = NULL;
   2.637  
   2.638 -    /* number of pages we're dealing with */
   2.639 -    unsigned long nr_pfns;
   2.640  
   2.641 -    /* power of 2 order of nr_pfns */
   2.642 +    
   2.643 +    /* power of 2 order of max_pfn */
   2.644      int order_nr; 
   2.645  
   2.646      /* bitmap of pages:
   2.647 @@ -454,207 +552,197 @@ int xc_linux_save(int xc_handle, int io_
   2.648      
   2.649      xc_shadow_control_stats_t stats;
   2.650  
   2.651 -    int needed_to_fix = 0;
   2.652 -    int total_sent    = 0;
   2.653 -
   2.654 -    MBIT_RATE = START_MBIT_RATE;
   2.655 +    unsigned long needed_to_fix = 0;
   2.656 +    unsigned long total_sent    = 0;
   2.657  
   2.658  
   2.659      /* If no explicit control parameters given, use defaults */
   2.660 -    if( !max_iters ) 
   2.661 +    if(!max_iters) 
   2.662          max_iters = DEF_MAX_ITERS; 
   2.663 -    if( !max_factor ) 
   2.664 +    if(!max_factor) 
   2.665          max_factor = DEF_MAX_FACTOR; 
   2.666 +    
   2.667 +    initialize_mbit_rate(); 
   2.668 +
   2.669 +    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live ? 
   2.670 +            "true" : "false"); 
   2.671  
   2.672 -
   2.673 -    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false"); 
   2.674 +    if(!get_platform_info(xc_handle, dom, 
   2.675 +                          &max_mfn, &hvirt_start, &pt_levels)) {
   2.676 +        ERR("Unable to get platform info."); 
   2.677 +        return 1;
   2.678 +    }
   2.679  
   2.680 -    if ( mlock(&ctxt, sizeof(ctxt)) ) 
   2.681 -    {
   2.682 +    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
   2.683 +        ERR("Could not get domain info");
   2.684 +        return 1; 
   2.685 +    }
   2.686 +
   2.687 +    if (mlock(&ctxt, sizeof(ctxt))) {
   2.688          ERR("Unable to mlock ctxt");
   2.689          return 1;
   2.690      }
   2.691      
   2.692 -    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
   2.693 -    {
   2.694 -        ERR("Could not get domain info");
   2.695 -        goto out;
   2.696 -    }
   2.697 -    if ( xc_domain_get_vcpu_context(xc_handle, dom, /* FIXME */ 0, &ctxt) )
   2.698 -    {
   2.699 +    /* Only have to worry about vcpu 0 even for SMP */
   2.700 +    if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
   2.701          ERR("Could not get vcpu context");
   2.702          goto out;
   2.703      }
   2.704      shared_info_frame = info.shared_info_frame;
   2.705  
   2.706      /* A cheesy test to see whether the domain contains valid state. */
   2.707 -    if ( ctxt.ctrlreg[3] == 0 )
   2.708 +    if (ctxt.ctrlreg[3] == 0)
   2.709      {
   2.710          ERR("Domain is not in a valid Linux guest OS state");
   2.711          goto out;
   2.712      }
   2.713 -    
   2.714 -    nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
   2.715 -
   2.716 -    /* cheesy sanity check */
   2.717 -    if ( nr_pfns > 1024*1024 )
   2.718 -    {
   2.719 -        ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
   2.720 +  
   2.721 +   /* cheesy sanity check */
   2.722 +    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
   2.723 +        ERR("Invalid state record -- pfn count out of range: %lu", 
   2.724 +            (info.max_memkb >> (PAGE_SHIFT - 10))); 
   2.725          goto out;
   2.726 -    }
   2.727 -
   2.728 +     }
   2.729 + 
   2.730      /* Map the shared info frame */
   2.731 -    live_shinfo = xc_map_foreign_range(
   2.732 -        xc_handle, dom, PAGE_SIZE, PROT_READ, shared_info_frame);
   2.733 -    if ( !live_shinfo )
   2.734 -    {
   2.735 +    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
   2.736 +                                            PROT_READ, shared_info_frame))) {
   2.737          ERR("Couldn't map live_shinfo");
   2.738          goto out;
   2.739      }
   2.740  
   2.741 -    live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(
   2.742 -        xc_handle, dom,
   2.743 -        PAGE_SIZE, PROT_READ, live_shinfo->arch.pfn_to_mfn_frame_list_list);
   2.744 +    max_pfn = live_shinfo->arch.max_pfn;
   2.745  
   2.746 -    if (!live_pfn_to_mfn_frame_list_list){
   2.747 -        ERR("Couldn't map pfn_to_mfn_frame_list_list");
   2.748 +    live_p2m_frame_list_list = 
   2.749 +        xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, 
   2.750 +                             live_shinfo->arch.pfn_to_mfn_frame_list_list);
   2.751 +
   2.752 +    if (!live_p2m_frame_list_list) {
   2.753 +        ERR("Couldn't map p2m_frame_list_list");
   2.754          goto out;
   2.755      }
   2.756  
   2.757 -    live_pfn_to_mfn_frame_list = 
   2.758 -        xc_map_foreign_batch(xc_handle, dom, 
   2.759 -                             PROT_READ,
   2.760 -                             live_pfn_to_mfn_frame_list_list,
   2.761 -                             (nr_pfns+(1024*1024)-1)/(1024*1024) );
   2.762 -
   2.763 -    if ( !live_pfn_to_mfn_frame_list)
   2.764 -    {
   2.765 -        ERR("Couldn't map pfn_to_mfn_frame_list");
   2.766 +    live_p2m_frame_list = 
   2.767 +        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   2.768 +                             live_p2m_frame_list_list,
   2.769 +                             P2M_FLL_ENTRIES); 
   2.770 +    
   2.771 +    if (!live_p2m_frame_list) {
   2.772 +        ERR("Couldn't map p2m_frame_list");
   2.773          goto out;
   2.774      }
   2.775  
   2.776 -
   2.777      /* Map all the frames of the pfn->mfn table. For migrate to succeed, 
   2.778         the guest must not change which frames are used for this purpose. 
   2.779         (its not clear why it would want to change them, and we'll be OK
   2.780         from a safety POV anyhow. */
   2.781  
   2.782 -    live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom, 
   2.783 -                                                 PROT_READ,
   2.784 -                                                 live_pfn_to_mfn_frame_list,
   2.785 -                                                 (nr_pfns+1023)/1024 );  
   2.786 -    if ( !live_pfn_to_mfn_table )
   2.787 -    {
   2.788 -        ERR("Couldn't map pfn_to_mfn table");
   2.789 +    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
   2.790 +                                    live_p2m_frame_list,
   2.791 +                                    P2M_FL_ENTRIES); 
   2.792 +
   2.793 +    if (!live_p2m) {
   2.794 +        ERR("Couldn't map p2m table");
   2.795          goto out;
   2.796      }
   2.797  
   2.798      /* Setup the mfn_to_pfn table mapping */
   2.799 -    mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
   2.800 -
   2.801 -    live_mfn_to_pfn_table = 
   2.802 -        xc_map_foreign_range(xc_handle, DOMID_XEN, 
   2.803 -                             PAGE_SIZE*1024, PROT_READ, 
   2.804 -                             mfn_to_pfn_table_start_mfn );
   2.805 +    m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
   2.806 +    live_m2p      = xc_map_foreign_range(xc_handle, DOMID_XEN, M2P_SIZE, 
   2.807 +                                         PROT_READ, m2p_start_mfn);
   2.808 +    
   2.809 +    /* Get a local copy of the live_P2M_frame_list */
   2.810 +    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
   2.811 +        ERR("Couldn't allocate p2m_frame_list array");
   2.812 +        goto out;
   2.813 +    }
   2.814 +    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); 
   2.815  
   2.816      /* Canonicalise the pfn-to-mfn table frame-number list. */
   2.817 -    memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
   2.818 -
   2.819 -    for ( i = 0; i < nr_pfns; i += 1024 )
   2.820 -    {
   2.821 -        if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) )
   2.822 -        {
   2.823 +    for (i = 0; i < max_pfn; i += ulpp) {
   2.824 +        if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) { 
   2.825              ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
   2.826              goto out;
   2.827          }
   2.828      }
   2.829  
   2.830 -
   2.831      /* Domain is still running at this point */
   2.832  
   2.833 -    if ( live )
   2.834 -    {
   2.835 -        if ( xc_shadow_control( xc_handle, dom, 
   2.836 -                                DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
   2.837 -                                NULL, 0, NULL ) < 0 )
   2.838 -        {
   2.839 +    if (live) {
   2.840 +
   2.841 +        if (xc_shadow_control(xc_handle, dom, 
   2.842 +                              DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
   2.843 +                              NULL, 0, NULL ) < 0) { 
   2.844              ERR("Couldn't enable shadow mode");
   2.845              goto out;
   2.846          }
   2.847 -
   2.848 +        
   2.849          last_iter = 0;
   2.850 -    } 
   2.851 -    else
   2.852 -    {
   2.853 +        
   2.854 +    } else {
   2.855 +        
   2.856          /* This is a non-live suspend. Issue the call back to get the
   2.857             domain suspended */
   2.858 -
   2.859 +        
   2.860          last_iter = 1;
   2.861 -
   2.862 -        if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
   2.863 -        {
   2.864 +        
   2.865 +        if (suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt)) {
   2.866              ERR("Domain appears not to have suspended");
   2.867              goto out;
   2.868          }
   2.869 -
   2.870 +        
   2.871      }
   2.872 -    sent_last_iter = 1<<20; /* 4GB of pages */
   2.873  
   2.874 -    /* calculate the power of 2 order of nr_pfns, e.g.
   2.875 +#if 0
   2.876 +    sent_last_iter = 0xFFFFFFFF; /* Pretend we sent a /lot/ last time */
   2.877 +#else
   2.878 +    sent_last_iter = 1 << 20; 
   2.879 +#endif
   2.880 +
   2.881 +
   2.882 +    /* calculate the power of 2 order of max_pfn, e.g.
   2.883         15->4 16->4 17->5 */
   2.884 -    for ( i = nr_pfns-1, order_nr = 0; i ; i >>= 1, order_nr++ )
   2.885 +    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
   2.886          continue;
   2.887  
   2.888 -    /* Setup to_send bitmap */
   2.889 -    {
   2.890 -        /* size these for a maximal 4GB domain, to make interaction
   2.891 -           with balloon driver easier. It's only user space memory,
   2.892 -           ater all... (3x 128KB) */
   2.893 -
   2.894 -        int sz = ( 1<<20 ) / 8;
   2.895 - 
   2.896 -        to_send = malloc( sz );
   2.897 -        to_fix  = calloc( 1, sz );
   2.898 -        to_skip = malloc( sz );
   2.899 +#undef BITMAP_SIZE
   2.900 +#define BITMAP_SIZE ((1<<20)/8) 
   2.901  
   2.902 -        if ( !to_send || !to_fix || !to_skip )
   2.903 -        {
   2.904 -            ERR("Couldn't allocate to_send array");
   2.905 -            goto out;
   2.906 -        }
   2.907 -
   2.908 -        memset(to_send, 0xff, sz);
   2.909 +    /* Setup to_send / to_fix and to_skip bitmaps */
   2.910 +    to_send = malloc(BITMAP_SIZE); 
   2.911 +    to_fix  = calloc(1, BITMAP_SIZE); 
   2.912 +    to_skip = malloc(BITMAP_SIZE); 
   2.913 +    
   2.914 +    if (!to_send || !to_fix || !to_skip) {
   2.915 +        ERR("Couldn't allocate to_send array");
   2.916 +        goto out;
   2.917 +    }
   2.918 +    
   2.919 +    memset(to_send, 0xff, BITMAP_SIZE);
   2.920  
   2.921 -        if ( mlock(to_send, sz) )
   2.922 -        {
   2.923 -            ERR("Unable to mlock to_send");
   2.924 -            return 1;
   2.925 -        }
   2.926 -
   2.927 -        /* (to fix is local only) */
   2.928 -
   2.929 -        if ( mlock(to_skip, sz) )
   2.930 -        {
   2.931 -            ERR("Unable to mlock to_skip");
   2.932 -            return 1;
   2.933 -        }
   2.934 -
   2.935 +    if (mlock(to_send, BITMAP_SIZE)) {
   2.936 +        ERR("Unable to mlock to_send");
   2.937 +        return 1;
   2.938      }
   2.939  
   2.940 -    analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
   2.941 +    /* (to_fix is local only, so it is not mlock()ed) */
   2.942 +    if (mlock(to_skip, BITMAP_SIZE)) {
   2.943 +        ERR("Unable to mlock to_skip");
   2.944 +        return 1;
   2.945 +    }
   2.946 +        
   2.947 +    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
   2.948  
   2.949      /* We want zeroed memory so use calloc rather than malloc. */
   2.950 -    pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
   2.951 -    pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
   2.952 +    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
   2.953 +    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
   2.954  
   2.955 -    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
   2.956 -    {
   2.957 +    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
   2.958          errno = ENOMEM;
   2.959          goto out;
   2.960      }
   2.961  
   2.962 -    if ( mlock(pfn_type, BATCH_SIZE * sizeof(unsigned long)) )
   2.963 -    {
   2.964 +    if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
   2.965          ERR("Unable to mlock");
   2.966          goto out;
   2.967      }
   2.968 @@ -663,46 +751,40 @@ int xc_linux_save(int xc_handle, int io_
   2.969      /*
   2.970       * Quick belt and braces sanity check.
   2.971       */
   2.972 -#if DEBUG
   2.973      {
   2.974          int err=0;
   2.975 -        for ( i = 0; i < nr_pfns; i++ )
   2.976 -        {
   2.977 -            mfn = live_pfn_to_mfn_table[i];
   2.978 -     
   2.979 -            if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
   2.980 -            {
   2.981 -                fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
   2.982 -                        i,mfn,live_mfn_to_pfn_table[mfn]);
   2.983 +        unsigned long mfn; 
   2.984 +        for (i = 0; i < max_pfn; i++) {
   2.985 +
   2.986 +            mfn = live_p2m[i];
   2.987 +            if((live_m2p[mfn] != i) && (mfn != 0xffffffffUL)) { 
   2.988 +                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i, 
   2.989 +                        mfn, live_m2p[mfn]);
   2.990                  err++;
   2.991              }
   2.992          }
   2.993 -        fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
   2.994 +        DPRINTF("Had %d unexplained entries in p2m table\n", err);
   2.995      }
   2.996 -#endif
   2.997  
   2.998  
   2.999      /* Start writing out the saved-domain record. */
  2.1000  
  2.1001 -    if ( write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
  2.1002 -         sizeof(unsigned long) )
  2.1003 -    {
  2.1004 -        ERR("write: nr_pfns");
  2.1005 +    if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) { 
  2.1006 +        ERR("write: max_pfn");
  2.1007          goto out;
  2.1008      }
  2.1009  
  2.1010 -    if ( write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE )
  2.1011 -    {
  2.1012 -        ERR("write: pfn_to_mfn_frame_list");
  2.1013 +    if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { 
  2.1014 +        ERR("write: p2m_frame_list");
  2.1015          goto out;
  2.1016      }
  2.1017 -
  2.1018 -    print_stats( xc_handle, dom, 0, &stats, 0 );
  2.1019 +    
  2.1020 +    print_stats(xc_handle, dom, 0, &stats, 0);
  2.1021  
  2.1022      /* Now write out each data page, canonicalising page tables as we go... */
  2.1023 -    
  2.1024 -    for ( ; ; )
  2.1025 -    {
  2.1026 +
  2.1027 +    while(1) {
  2.1028 +
  2.1029          unsigned int prev_pc, sent_this_iter, N, batch;
  2.1030  
  2.1031          iter++;
  2.1032 @@ -713,24 +795,20 @@ int xc_linux_save(int xc_handle, int io_
  2.1033  
  2.1034          DPRINTF("Saving memory pages: iter %d   0%%", iter);
  2.1035  
  2.1036 -        while ( N < nr_pfns )
  2.1037 -        {
  2.1038 -            unsigned int this_pc = (N * 100) / nr_pfns;
  2.1039 +        while( N < max_pfn ){
  2.1040  
  2.1041 -            if ( (this_pc - prev_pc) >= 5 )
  2.1042 -            {
  2.1043 +            unsigned int this_pc = (N * 100) / max_pfn;
  2.1044 +
  2.1045 +            if ((this_pc - prev_pc) >= 5) {
  2.1046                  DPRINTF("\b\b\b\b%3d%%", this_pc);
  2.1047                  prev_pc = this_pc;
  2.1048              }
  2.1049 -
  2.1050 +            
  2.1051              /* slightly wasteful to peek the whole array evey time, 
  2.1052                 but this is fast enough for the moment. */
  2.1053 -
  2.1054 -            if ( !last_iter && 
  2.1055 -                 xc_shadow_control(xc_handle, dom, 
  2.1056 -                                   DOM0_SHADOW_CONTROL_OP_PEEK,
  2.1057 -                                   to_skip, nr_pfns, NULL) != nr_pfns )
  2.1058 -            {
  2.1059 +            if (!last_iter && xc_shadow_control(
  2.1060 +                    xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
  2.1061 +                    to_skip, max_pfn, NULL) != max_pfn) {
  2.1062                  ERR("Error peeking shadow bitmap");
  2.1063                  goto out;
  2.1064              }
  2.1065 @@ -738,219 +816,168 @@ int xc_linux_save(int xc_handle, int io_
  2.1066  
  2.1067              /* load pfn_type[] with the mfn of all the pages we're doing in
  2.1068                 this batch. */
  2.1069 -
  2.1070 -            for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
  2.1071 -            {
  2.1072 -                int n = permute(N, nr_pfns, order_nr );
  2.1073 +            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
  2.1074  
  2.1075 -                if ( 0 && debug ) {
  2.1076 -                    fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d  "
  2.1077 -                            " [mfn]= %08lx\n",
  2.1078 -                            iter, (unsigned long)n, live_pfn_to_mfn_table[n],
  2.1079 -                            test_bit(n,to_send),
  2.1080 -                            live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
  2.1081 -                                                 0xFFFFF]);
  2.1082 -                }
  2.1083 +                int n = permute(N, max_pfn, order_nr);
  2.1084  
  2.1085 -                if ( !last_iter && 
  2.1086 -                     test_bit(n, to_send) && 
  2.1087 -                     test_bit(n, to_skip) ) {
  2.1088 +                if (debug) {
  2.1089 +                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
  2.1090 +                            iter, (unsigned long)n, live_p2m[n],
  2.1091 +                            test_bit(n, to_send), 
  2.1092 +                            live_m2p[live_p2m[n]&0xFFFFF]);
  2.1093 +                }
  2.1094 +                
  2.1095 +                if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip)) 
  2.1096                      skip_this_iter++; /* stats keeping */
  2.1097 -                }
  2.1098  
  2.1099 -                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  2.1100 -                       (test_bit(n, to_send) && last_iter) ||
  2.1101 -                       (test_bit(n, to_fix)  && last_iter)) ) {
  2.1102 +                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  2.1103 +                      (test_bit(n, to_send) && last_iter) ||
  2.1104 +                      (test_bit(n, to_fix)  && last_iter)))
  2.1105                      continue;
  2.1106 -                }
  2.1107  
  2.1108 -                /* we get here if:
  2.1109 -                   1. page is marked to_send & hasn't already been re-dirtied
  2.1110 -                   2. (ignore to_skip in last iteration)
  2.1111 -                   3. add in pages that still need fixup (net bufs)
  2.1112 +                /* 
  2.1113 +                ** we get here if:
  2.1114 +                **  1. page is marked to_send & hasn't already been re-dirtied
  2.1115 +                **  2. (ignore to_skip in last iteration)
  2.1116 +                **  3. add in pages that still need fixup (net bufs)
  2.1117                  */
  2.1118    
  2.1119                  pfn_batch[batch] = n;
  2.1120 -                pfn_type[batch] = live_pfn_to_mfn_table[n];
  2.1121 +                pfn_type[batch]  = live_p2m[n];
  2.1122  
  2.1123 -                if( ! is_mapped(pfn_type[batch]) )
  2.1124 -                {
  2.1125 +                if(!is_mapped(pfn_type[batch])) {
  2.1126 +
  2.1127                      /* not currently in pusedo-physical map -- set bit
  2.1128                         in to_fix that we must send this page in last_iter
  2.1129                         unless its sent sooner anyhow */
  2.1130  
  2.1131 -                    set_bit( n, to_fix );
  2.1132 -                    if( iter>1 )
  2.1133 +                    set_bit(n, to_fix);
  2.1134 +                    if(iter > 1)
  2.1135                          DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
  2.1136 -                                iter,n,pfn_type[batch]);
  2.1137 +                                iter, n, pfn_type[batch]);
  2.1138                      continue;
  2.1139                  }
  2.1140  
  2.1141 -                if ( last_iter && 
  2.1142 -                     test_bit(n, to_fix) && 
  2.1143 -                     !test_bit(n, to_send) )
  2.1144 -                {
  2.1145 +                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
  2.1146                      needed_to_fix++;
  2.1147                      DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
  2.1148                              iter,n,pfn_type[batch]);
  2.1149                  }
  2.1150  
  2.1151                  clear_bit(n, to_fix); 
  2.1152 -
  2.1153 +                
  2.1154                  batch++;
  2.1155              }
  2.1156       
  2.1157 -            if ( batch == 0 )
  2.1158 +            if (batch == 0)
  2.1159                  goto skip; /* vanishingly unlikely... */
  2.1160        
  2.1161 -            if ( (region_base = xc_map_foreign_batch(xc_handle, dom, 
  2.1162 -                                                     PROT_READ,
  2.1163 -                                                     pfn_type,
  2.1164 -                                                     batch)) == 0 ){
  2.1165 +            if ((region_base = xc_map_foreign_batch(
  2.1166 +                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) { 
  2.1167                  ERR("map batch failed");
  2.1168                  goto out;
  2.1169              }
  2.1170       
  2.1171 -            if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
  2.1172 +            if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
  2.1173                  ERR("get_pfn_type_batch failed");
  2.1174                  goto out;
  2.1175              }
  2.1176       
  2.1177 -            for ( j = 0; j < batch; j++ )
  2.1178 -            {
  2.1179 -                if ( (pfn_type[j] & LTAB_MASK) == XTAB )
  2.1180 -                {
  2.1181 -                    DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
  2.1182 +            for (j = 0; j < batch; j++) {
  2.1183 +
  2.1184 +                if ((pfn_type[j] & LTAB_MASK) == XTAB) {
  2.1185 +                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
  2.1186                      continue;
  2.1187                  }
  2.1188    
  2.1189 -                if ( 0 && debug )
  2.1190 +                if (debug) 
  2.1191                      fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
  2.1192                              " sum= %08lx\n",
  2.1193                              iter, 
  2.1194                              (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
  2.1195                              pfn_type[j],
  2.1196 -                            live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
  2.1197 +                            live_m2p[pfn_type[j]&(~LTAB_MASK)],
  2.1198                              csum_page(region_base + (PAGE_SIZE*j)));
  2.1199 -
  2.1200 +                
  2.1201                  /* canonicalise mfn->pfn */
  2.1202                  pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
  2.1203              }
  2.1204  
  2.1205 -            if ( write(io_fd, &batch, sizeof(int)) != sizeof(int) )
  2.1206 -            {
  2.1207 +            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) { 
  2.1208                  ERR("Error when writing to state file (2)");
  2.1209                  goto out;
  2.1210              }
  2.1211  
  2.1212 -            if ( write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
  2.1213 -                 (sizeof(unsigned long) * j) )
  2.1214 -            {
  2.1215 +            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) { 
  2.1216                  ERR("Error when writing to state file (3)");
  2.1217                  goto out;
  2.1218              }
  2.1219 -     
  2.1220 +            
  2.1221              /* entering this loop, pfn_type is now in pfns (Not mfns) */
  2.1222 -            for ( j = 0; j < batch; j++ )
  2.1223 -            {
  2.1224 +            for (j = 0; j < batch; j++) {
  2.1225 +                
  2.1226 +                unsigned long pfn      = pfn_type[j] & ~LTAB_MASK; 
  2.1227 +                unsigned long pagetype = pfn_type[j] & LTAB_MASK; 
  2.1228 +                void *spage            = (void *) region_base + (PAGE_SIZE*j); 
  2.1229 +
  2.1230 +
  2.1231                  /* write out pages in batch */
  2.1232 -                if ( (pfn_type[j] & LTAB_MASK) == XTAB )
  2.1233 -                {
  2.1234 -                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
  2.1235 +                if (pagetype == XTAB) {
  2.1236 +                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n", j, pfn_type[j]);
  2.1237                      continue;
  2.1238                  }
  2.1239 -  
  2.1240 -                if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) || 
  2.1241 -                     ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
  2.1242 -                    memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
  2.1243 -      
  2.1244 -                    for ( k = 0; 
  2.1245 -                          k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ? 
  2.1246 -                               (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
  2.1247 -                               1024); 
  2.1248 -                          k++ )
  2.1249 -                    {
  2.1250 -                        unsigned long pfn;
  2.1251  
  2.1252 -                        if ( !(page[k] & _PAGE_PRESENT) )
  2.1253 -                            continue;
  2.1254 -                        
  2.1255 -                        mfn = page[k] >> PAGE_SHIFT;      
  2.1256 -                        pfn = live_mfn_to_pfn_table[mfn];
  2.1257 -
  2.1258 -                        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
  2.1259 -                        {
  2.1260 -                            /* I don't think this should ever happen */
  2.1261 -                            fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
  2.1262 -                                    "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
  2.1263 -                                    j, pfn_type[j], k,
  2.1264 -                                    page[k], mfn, live_mfn_to_pfn_table[mfn],
  2.1265 -                                    (live_mfn_to_pfn_table[mfn]<nr_pfns)? 
  2.1266 -                                    live_pfn_to_mfn_table[
  2.1267 -                                        live_mfn_to_pfn_table[mfn]] : 
  2.1268 -                                    0xdeadbeef);
  2.1269 -
  2.1270 -                            pfn = 0; /* be suspicious */
  2.1271 -                        }
  2.1272 -
  2.1273 -                        page[k] &= PAGE_SIZE - 1;
  2.1274 -                        page[k] |= pfn << PAGE_SHIFT;
  2.1275 -   
  2.1276 -#if 0
  2.1277 -                        fprintf(stderr,
  2.1278 -                                "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
  2.1279 -                                "xpfn=%d\n",
  2.1280 -                                pfn_type[j]>>28,
  2.1281 -                                j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
  2.1282 -#endif     
  2.1283 -   
  2.1284 -                    } /* end of page table rewrite for loop */
  2.1285 -      
  2.1286 +                pagetype &= LTABTYPE_MASK; 
  2.1287 +                
  2.1288 +                if (pagetype >= L1TAB && pagetype <= L4TAB) {
  2.1289 +                    
  2.1290 +                    /* We have a pagetable page: need to rewrite it. */
  2.1291 +                    canonicalize_pagetable(pagetype, pfn, spage, page); 
  2.1292 +                    
  2.1293                      if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
  2.1294                          ERR("Error when writing to state file (4)");
  2.1295                          goto out;
  2.1296                      }
  2.1297 -      
  2.1298 -                }  /* end of it's a PT page */ else {  /* normal page */
  2.1299 +                    
  2.1300 +                }  else {  
  2.1301  
  2.1302 -                    if ( ratewrite(io_fd, region_base + (PAGE_SIZE*j), 
  2.1303 -                                   PAGE_SIZE) != PAGE_SIZE )
  2.1304 -                    {
  2.1305 +                    /* We have a normal page: just write it directly. */
  2.1306 +                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
  2.1307                          ERR("Error when writing to state file (5)");
  2.1308                          goto out;
  2.1309                      }
  2.1310                  }
  2.1311              } /* end of the write out for this batch */
  2.1312 -     
  2.1313 +            
  2.1314              sent_this_iter += batch;
  2.1315 -
  2.1316 +            
  2.1317          } /* end of this while loop for this iteration */
  2.1318 -
  2.1319 +        
  2.1320          munmap(region_base, batch*PAGE_SIZE);
  2.1321 -
  2.1322 -    skip: 
  2.1323 -
  2.1324 +        
  2.1325 +      skip: 
  2.1326 +        
  2.1327          total_sent += sent_this_iter;
  2.1328  
  2.1329          DPRINTF("\r %d: sent %d, skipped %d, ", 
  2.1330                  iter, sent_this_iter, skip_this_iter );
  2.1331  
  2.1332 -        if ( last_iter ) {
  2.1333 +        if (last_iter) {
  2.1334              print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  2.1335  
  2.1336 -            DPRINTF("Total pages sent= %d (%.2fx)\n", 
  2.1337 -                    total_sent, ((float)total_sent)/nr_pfns );
  2.1338 -            DPRINTF("(of which %d were fixups)\n", needed_to_fix  );
  2.1339 +            DPRINTF("Total pages sent= %ld (%.2fx)\n", 
  2.1340 +                    total_sent, ((float)total_sent)/max_pfn );
  2.1341 +            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
  2.1342          }       
  2.1343  
  2.1344          if (last_iter && debug){
  2.1345              int minusone = -1;
  2.1346 -            memset( to_send, 0xff, (nr_pfns+8)/8 );
  2.1347 +            memset( to_send, 0xff, (max_pfn+8)/8 );
  2.1348              debug = 0;
  2.1349              fprintf(stderr, "Entering debug resend-all mode\n");
  2.1350      
  2.1351              /* send "-1" to put receiver into debug mode */
  2.1352 -            if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
  2.1353 +            if(!write_exact(io_fd, &minusone, sizeof(int))) { 
  2.1354                  ERR("Error when writing to state file (6)");
  2.1355                  goto out;
  2.1356              }
  2.1357 @@ -958,42 +985,39 @@ int xc_linux_save(int xc_handle, int io_
  2.1358              continue;
  2.1359          }
  2.1360  
  2.1361 -        if ( last_iter ) break; 
  2.1362 +        if (last_iter) break; 
  2.1363 +
  2.1364 +        if (live) {
  2.1365 +
  2.1366  
  2.1367 -        if ( live )
  2.1368 -        {
  2.1369 -            if ( 
  2.1370 -                ( ( sent_this_iter > sent_last_iter ) &&
  2.1371 -                  (mbit_rate == MAX_MBIT_RATE ) ) ||
  2.1372 -                (iter >= max_iters) || 
  2.1373 -                (sent_this_iter+skip_this_iter < 50) || 
  2.1374 -                (total_sent > nr_pfns*max_factor) )
  2.1375 -            {
  2.1376 +            if( 
  2.1377 +                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
  2.1378 +                (iter >= max_iters) ||
  2.1379 +                (sent_this_iter+skip_this_iter < 50) ||
  2.1380 +                (total_sent > max_pfn*max_factor) ) { 
  2.1381 +
  2.1382                  DPRINTF("Start last iteration\n");
  2.1383                  last_iter = 1;
  2.1384 -
  2.1385 -                if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
  2.1386 -                {
  2.1387 +                
  2.1388 +                if (suspend_and_state(xc_handle, io_fd, dom, &info, &ctxt)) {
  2.1389                      ERR("Domain appears not to have suspended");
  2.1390                      goto out;
  2.1391                  }
  2.1392 -
  2.1393 +                
  2.1394                  DPRINTF("SUSPEND shinfo %08lx eip %08u edx %08u\n",
  2.1395                          info.shared_info_frame,
  2.1396                          ctxt.user_regs.eip, ctxt.user_regs.edx);
  2.1397              } 
  2.1398 -
  2.1399 -            if ( xc_shadow_control( xc_handle, dom, 
  2.1400 -                                    DOM0_SHADOW_CONTROL_OP_CLEAN,
  2.1401 -                                    to_send, nr_pfns, &stats ) != nr_pfns ) 
  2.1402 -            {
  2.1403 +            
  2.1404 +            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
  2.1405 +                                  to_send, max_pfn, &stats ) != max_pfn) {  
  2.1406                  ERR("Error flushing shadow PT");
  2.1407                  goto out;
  2.1408              }
  2.1409  
  2.1410              sent_last_iter = sent_this_iter;
  2.1411  
  2.1412 -            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  2.1413 +            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
  2.1414       
  2.1415          }
  2.1416  
  2.1417 @@ -1005,9 +1029,10 @@ int xc_linux_save(int xc_handle, int io_
  2.1418      /* Success! */
  2.1419      rc = 0;
  2.1420      
  2.1421 +    /* ^^^^^^ XXX SMH: hmm.. not sure that's really success! */
  2.1422 +    
  2.1423      /* Zero terminate */
  2.1424 -    if ( write(io_fd, &rc, sizeof(int)) != sizeof(int) )
  2.1425 -    {
  2.1426 +    if (!write_exact(io_fd, &rc, sizeof(int))) { 
  2.1427          ERR("Error when writing to state file (6)");
  2.1428          goto out;
  2.1429      }
  2.1430 @@ -1015,84 +1040,76 @@ int xc_linux_save(int xc_handle, int io_
  2.1431      /* Send through a list of all the PFNs that were not in map at the close */
  2.1432      {
  2.1433          unsigned int i,j;
  2.1434 -        unsigned int pfntab[1024];
  2.1435 +        unsigned long pfntab[1024]; 
  2.1436  
  2.1437 -        for ( i = 0, j = 0; i < nr_pfns; i++ )
  2.1438 -            if ( !is_mapped(live_pfn_to_mfn_table[i]) )
  2.1439 +        for ( i = 0, j = 0; i < max_pfn; i++ ) {
  2.1440 +            if ( ! is_mapped(live_p2m[i]) )
  2.1441                  j++;
  2.1442 +        }
  2.1443  
  2.1444 -        if ( write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int) )
  2.1445 -        {
  2.1446 +        if(!write_exact(io_fd, &j, sizeof(unsigned int))) { 
  2.1447              ERR("Error when writing to state file (6a)");
  2.1448              goto out;
  2.1449 -        } 
  2.1450 +        }	
  2.1451 +        
  2.1452 +        for ( i = 0, j = 0; i < max_pfn; ) {
  2.1453  
  2.1454 -        for ( i = 0, j = 0; i < nr_pfns; )
  2.1455 -        {
  2.1456 -            if ( !is_mapped(live_pfn_to_mfn_table[i]) )
  2.1457 -            {
  2.1458 +            if (!is_mapped(live_p2m[i]))
  2.1459                  pfntab[j++] = i;
  2.1460 -            }
  2.1461 +
  2.1462              i++;
  2.1463 -            if ( j == 1024 || i == nr_pfns )
  2.1464 -            {
  2.1465 -                if ( write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
  2.1466 -                     (sizeof(unsigned long) * j) )
  2.1467 -                {
  2.1468 +            if (j == 1024 || i == max_pfn) {
  2.1469 +                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) { 
  2.1470                      ERR("Error when writing to state file (6b)");
  2.1471                      goto out;
  2.1472                  } 
  2.1473                  j = 0;
  2.1474              }
  2.1475          }
  2.1476 +
  2.1477      }
  2.1478 -
  2.1479 +    
  2.1480      /* Canonicalise the suspend-record frame number. */
  2.1481 -    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
  2.1482 -    {
  2.1483 +    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
  2.1484          ERR("Suspend record is not in range of pseudophys map");
  2.1485          goto out;
  2.1486      }
  2.1487  
  2.1488      /* Canonicalise each GDT frame number. */
  2.1489 -    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
  2.1490 -    {
  2.1491 -        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) 
  2.1492 -        {
  2.1493 +    for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
  2.1494 +        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
  2.1495              ERR("GDT frame is not in range of pseudophys map");
  2.1496              goto out;
  2.1497          }
  2.1498      }
  2.1499  
  2.1500      /* Canonicalise the page table base pointer. */
  2.1501 -    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) )
  2.1502 -    {
  2.1503 +    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
  2.1504          ERR("PT base is not in range of pseudophys map");
  2.1505          goto out;
  2.1506      }
  2.1507 -    ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
  2.1508 +    ctxt.ctrlreg[3] = live_m2p[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
  2.1509          PAGE_SHIFT;
  2.1510  
  2.1511 -    if ( write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
  2.1512 -         write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE)
  2.1513 -    {
  2.1514 +    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
  2.1515 +        !write_exact(io_fd, live_shinfo, PAGE_SIZE)) { 
  2.1516          ERR("Error when writing to state file (1)");
  2.1517          goto out;
  2.1518      }
  2.1519 -
  2.1520 +    
  2.1521   out:
  2.1522  
  2.1523 -    if ( live_shinfo )
  2.1524 +    if (live_shinfo)
  2.1525          munmap(live_shinfo, PAGE_SIZE);
  2.1526 -
  2.1527 -    if ( live_pfn_to_mfn_frame_list ) 
  2.1528 -        munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
  2.1529 +    
  2.1530 +    if (live_p2m_frame_list) 
  2.1531 +        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); 
  2.1532  
  2.1533 -    if ( live_pfn_to_mfn_table ) 
  2.1534 -        munmap(live_pfn_to_mfn_table, nr_pfns*4);
  2.1535 +    if(live_p2m) 
  2.1536 +        munmap(live_p2m, P2M_SIZE); 
  2.1537  
  2.1538 -    if ( live_mfn_to_pfn_table ) 
  2.1539 -        munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
  2.1540 +    if(live_m2p) 
  2.1541 +        munmap(live_m2p, M2P_SIZE); 
  2.1542  
  2.1543      free(pfn_type);
  2.1544      free(pfn_batch);
  2.1545 @@ -1101,6 +1118,7 @@ int xc_linux_save(int xc_handle, int io_
  2.1546      free(to_skip);
  2.1547  
  2.1548      DPRINTF("Save exit rc=%d\n",rc);
  2.1549 +
  2.1550      return !!rc;
  2.1551  }
  2.1552  
     3.1 --- a/tools/libxc/xenctrl.h	Tue Nov 08 18:39:58 2005 +0100
     3.2 +++ b/tools/libxc/xenctrl.h	Tue Nov 08 18:42:07 2005 +0100
     3.3 @@ -17,6 +17,7 @@
     3.4  #include <xen/event_channel.h>
     3.5  #include <xen/sched.h>
     3.6  #include <xen/sched_ctl.h>
     3.7 +#include <xen/memory.h>
     3.8  #include <xen/acm.h>
     3.9  
    3.10  #ifdef __ia64__
     4.1 --- a/tools/libxc/xg_private.h	Tue Nov 08 18:39:58 2005 +0100
     4.2 +++ b/tools/libxc/xg_private.h	Tue Nov 08 18:42:07 2005 +0100
     4.3 @@ -11,8 +11,10 @@
     4.4  #include <sys/stat.h>
     4.5  
     4.6  #include "xenctrl.h"
     4.7 +#include "xenguest.h" 
     4.8  
     4.9  #include <xen/linux/privcmd.h>
    4.10 +#include <xen/memory.h>
    4.11  
    4.12  char *xc_read_kernel_image(const char *filename, unsigned long *size);
    4.13  unsigned long csum_page (void * page);
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/tools/libxc/xg_save_restore.h	Tue Nov 08 18:42:07 2005 +0100
     5.3 @@ -0,0 +1,123 @@
     5.4 +/*
     5.5 +** xg_save_restore.h
     5.6 +** 
     5.7 +** Definitions and utilities for save / restore. 
     5.8 +*/
     5.9 +
    5.10 +#define DEBUG    1
    5.11 +#define PROGRESS 0
    5.12 +
    5.13 +#define ERR(_f, _a...) do {                     \
    5.14 +    fprintf(stderr, _f "\n" , ## _a);           \
    5.15 +    fflush(stderr); }                           \
    5.16 +while (0)
    5.17 +
    5.18 +#if DEBUG
    5.19 +#define DPRINTF(_f, _a...) fprintf(stderr, _f , ## _a)
    5.20 +#else
    5.21 +#define DPRINTF(_f, _a...) ((void)0)
    5.22 +#endif
    5.23 +
    5.24 +
    5.25 +#if PROGRESS
    5.26 +#define PPRINTF(_f, _a...) fprintf(stderr, _f , ## _a)
    5.27 +#else
    5.28 +#define PPRINTF(_f, _a...)
    5.29 +#endif
    5.30 +
    5.31 +
    5.32 +/*
    5.33 +** We process save/restore/migrate in batches of pages; the below 
    5.34 +** determines how many pages we (at maximum) deal with in each batch. 
    5.35 +*/
    5.36 +#define MAX_BATCH_SIZE 1024   /* up to 1024 pages (4MB) at a time */
    5.37 +
    5.38 +/* When pinning page tables at the end of restore, we also use batching. */
    5.39 +#define MAX_PIN_BATCH  1024
    5.40 +
    5.41 +
    5.42 +
    5.43 +/*
    5.44 +** Determine various platform information required for save/restore, in 
    5.45 +** particular: 
    5.46 +**
    5.47 +**    - the maximum MFN on this machine, used to compute the size of 
    5.48 +**      the M2P table; 
    5.49 +** 
    5.50 +**    - the starting virtual address of the hypervisor; we use this 
    5.51 +**      to determine which parts of guest address space(s) do and don't 
    5.52 +**      require canonicalization during save/restore; and 
    5.53 +** 
    5.54 +**    - the number of page-table levels for save/restore. This should 
    5.55 +**      be a property of the domain, but for the moment we just read it 
    5.56 +**      from the hypervisor.
    5.57 +**
    5.58 +** Returns 1 on success, 0 on failure. 
    5.59 +*/
    5.60 +static int get_platform_info(int xc_handle, uint32_t dom, 
    5.61 +                             /* OUT */ uint32_t *max_mfn,  
    5.62 +                             /* OUT */ uint32_t *hvirt_start, 
    5.63 +                             /* OUT */ uint32_t *pt_levels)
    5.64 +    
    5.65 +{ 
    5.66 +    xen_capabilities_info_t xen_caps = "";
    5.67 +    xen_parameters_info_t xen_parms;
    5.68 +    xc_physinfo_t physinfo;
    5.69 +    
    5.70 +    if (xc_physinfo(xc_handle, &physinfo) != 0) 
    5.71 +        return 0;
    5.72 +    
    5.73 +    if (xc_version(xc_handle, XENVER_parameters, &xen_parms) != 0)
    5.74 +        return 0;
    5.75 +    
    5.76 +    if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
    5.77 +        return 0;
    5.78 +
    5.79 +    *max_mfn =     physinfo.total_pages;
    5.80 +    *hvirt_start = xen_parms.virt_start;
    5.81 +
    5.82 +    if (strstr(xen_caps, "xen-3.0-x86_64"))
    5.83 +        *pt_levels = 4;
    5.84 +    else if (strstr(xen_caps, "xen-3.0-x86_32p"))
    5.85 +        *pt_levels = 3; 
    5.86 +    else if (strstr(xen_caps, "xen-3.0-x86_32"))
    5.87 +        *pt_levels = 2; 
    5.88 +    else 
    5.89 +        return 0; 
    5.90 +    
    5.91 +    return 1;
    5.92 +} 
    5.93 +
    5.94 +
    5.95 +/* 
    5.96 +** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables. 
    5.97 +** The M2P simply holds the corresponding PFN, while the top bit of a P2M
    5.98 +** entry tells us whether or not the PFN is currently mapped.
    5.99 +*/
   5.100 +
   5.101 +#define PFN_TO_KB(_pfn) ((_pfn) * PAGE_SIZE / 1024)
   5.102 +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
   5.103 +
   5.104 +/* Size in bytes of the M2P and P2M (both rounded up to nearest PAGE_SIZE) */
   5.105 +#define M2P_SIZE ROUNDUP((max_mfn * sizeof(unsigned long)), PAGE_SHIFT) 
   5.106 +#define P2M_SIZE ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT) 
   5.107 +
   5.108 +
   5.109 +/* Number of unsigned longs in a page */
   5.110 +#define ulpp            (PAGE_SIZE/sizeof(unsigned long))
   5.111 +
   5.112 +/* Number of entries in the pfn_to_mfn_frame_list */
   5.113 +#define P2M_FL_ENTRIES  (((max_pfn)+ulpp-1)/ulpp)
   5.114 +
   5.115 +/* Size in bytes of the pfn_to_mfn_frame_list     */
   5.116 +#define P2M_FL_SIZE     ((P2M_FL_ENTRIES)*sizeof(unsigned long))
   5.117 +
   5.118 +/* Number of entries in the pfn_to_mfn_frame_list_list */
   5.119 +#define P2M_FLL_ENTRIES (((max_pfn)+(ulpp*ulpp)-1)/(ulpp*ulpp))
   5.120 +
   5.121 +/* Returns TRUE if the PFN is currently mapped */
   5.122 +#define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
   5.123 +
   5.124 +
   5.125 +
   5.126 +