ia64/xen-unstable

changeset 13448:895d873a00b4

Enable lazy (on-demand) allocation of memory to a guest being restored; this
means that ballooned-down domains only require as much memory as is currently
being used (rather than their maximum) when being restored from save, or when
being migrated.

Signed-off-by: Steven Hand <steven@xensource.com>
author Steven Hand <steven@xensource.com>
date Tue Jan 16 10:02:50 2007 +0000 (2007-01-16)
parents 887168cf7532
children fd2667419c53
files tools/libxc/xc_linux_restore.c tools/python/xen/xend/XendCheckpoint.py
line diff
     1.1 --- a/tools/libxc/xc_linux_restore.c	Mon Jan 15 18:09:16 2007 +0000
     1.2 +++ b/tools/libxc/xc_linux_restore.c	Tue Jan 16 10:02:50 2007 +0000
     1.3 @@ -12,7 +12,7 @@
     1.4  #include "xg_private.h"
     1.5  #include "xg_save_restore.h"
     1.6  
     1.7 -/* max mfn of the whole machine */
     1.8 +/* max mfn of the current host machine */
     1.9  static unsigned long max_mfn;
    1.10  
    1.11  /* virtual starting address of the hypervisor */
    1.12 @@ -30,6 +30,9 @@ static xen_pfn_t *live_p2m = NULL;
    1.13  /* A table mapping each PFN to its new MFN. */
    1.14  static xen_pfn_t *p2m = NULL;
    1.15  
    1.16 +/* A table of P2M mappings in the current region */
    1.17 +static xen_pfn_t *p2m_batch = NULL;
    1.18 +
    1.19  
    1.20  static ssize_t
    1.21  read_exact(int fd, void *buf, size_t count)
    1.22 @@ -57,46 +60,78 @@ read_exact(int fd, void *buf, size_t cou
    1.23  ** This function inverts that operation, replacing the pfn values with
    1.24  ** the (now known) appropriate mfn values.
    1.25  */
    1.26 -static int uncanonicalize_pagetable(unsigned long type, void *page)
    1.27 +static int uncanonicalize_pagetable(int xc_handle, uint32_t dom, 
    1.28 +                                    unsigned long type, void *page)
    1.29  {
    1.30      int i, pte_last;
    1.31      unsigned long pfn;
    1.32      uint64_t pte;
    1.33 +    int nr_mfns = 0; 
    1.34  
    1.35      pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
    1.36  
    1.37 -    /* Now iterate through the page table, uncanonicalizing each PTE */
    1.38 +    /* First pass: work out how many (if any) MFNs we need to alloc */
    1.39 +    for(i = 0; i < pte_last; i++) {
    1.40 +        
    1.41 +        if(pt_levels == 2)
    1.42 +            pte = ((uint32_t *)page)[i];
    1.43 +        else
    1.44 +            pte = ((uint64_t *)page)[i];
    1.45 +        
    1.46 +        /* XXX SMH: below needs fixing for PROT_NONE etc */
    1.47 +        if(!(pte & _PAGE_PRESENT))
    1.48 +            continue; 
    1.49 +        
    1.50 +        pfn = (pte >> PAGE_SHIFT) & 0xffffffff;
    1.51 +        
    1.52 +        if(pfn >= max_pfn) {
    1.53 +            /* This "page table page" is probably not one; bail. */
    1.54 +            ERROR("Frame number in type %lu page table is out of range: "
    1.55 +                  "i=%d pfn=0x%lx max_pfn=%lu",
    1.56 +                  type >> 28, i, pfn, max_pfn);
    1.57 +            return 0;
    1.58 +        }
    1.59 +        
    1.60 +        if(p2m[pfn] == INVALID_P2M_ENTRY) {
    1.61 +            /* Have a 'valid' PFN without a matching MFN - need to alloc */
    1.62 +            p2m_batch[nr_mfns++] = pfn; 
    1.63 +        }
    1.64 +    }
    1.65 +    
    1.66 +    
    1.67 +    /* Allocate the requisite number of mfns */
    1.68 +    if (nr_mfns && xc_domain_memory_populate_physmap(
    1.69 +            xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) { 
    1.70 +        ERROR("Failed to allocate memory for batch.!\n"); 
    1.71 +        errno = ENOMEM;
    1.72 +        return 0; 
    1.73 +    }
    1.74 +    
    1.75 +    /* Second pass: uncanonicalize each present PTE */
    1.76 +    nr_mfns = 0;
    1.77      for(i = 0; i < pte_last; i++) {
    1.78  
    1.79          if(pt_levels == 2)
    1.80              pte = ((uint32_t *)page)[i];
    1.81          else
    1.82              pte = ((uint64_t *)page)[i];
    1.83 -
    1.84 -        if(pte & _PAGE_PRESENT) {
    1.85 -
    1.86 -            pfn = (pte >> PAGE_SHIFT) & 0xffffffff;
    1.87 -
    1.88 -            if(pfn >= max_pfn) {
    1.89 -                /* This "page table page" is probably not one; bail. */
    1.90 -                ERROR("Frame number in type %lu page table is out of range: "
    1.91 -                    "i=%d pfn=0x%lx max_pfn=%lu",
    1.92 -                    type >> 28, i, pfn, max_pfn);
    1.93 -                return 0;
    1.94 -            }
    1.95 +        
    1.96 +        /* XXX SMH: below needs fixing for PROT_NONE etc */
    1.97 +        if(!(pte & _PAGE_PRESENT))
    1.98 +            continue;
    1.99 +        
   1.100 +        pfn = (pte >> PAGE_SHIFT) & 0xffffffff;
   1.101 +        
   1.102 +        if(p2m[pfn] == INVALID_P2M_ENTRY)
   1.103 +            p2m[pfn] = p2m_batch[nr_mfns++];
   1.104  
   1.105 -
   1.106 -            pte &= 0xffffff0000000fffULL;
   1.107 -            pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
   1.108 +        pte &= 0xffffff0000000fffULL;
   1.109 +        pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
   1.110  
   1.111 -            if(pt_levels == 2)
   1.112 -                ((uint32_t *)page)[i] = (uint32_t)pte;
   1.113 -            else
   1.114 -                ((uint64_t *)page)[i] = (uint64_t)pte;
   1.115 -
   1.116 -
   1.117 -
   1.118 -        }
   1.119 +        if(pt_levels == 2)
   1.120 +            ((uint32_t *)page)[i] = (uint32_t)pte;
   1.121 +        else
   1.122 +            ((uint64_t *)page)[i] = (uint64_t)pte;
   1.123      }
   1.124  
   1.125      return 1;
   1.126 @@ -140,6 +175,7 @@ int xc_linux_restore(int xc_handle, int 
   1.127      /* A temporary mapping of the guest's start_info page. */
   1.128      start_info_t *start_info;
   1.129  
   1.130 +    /* Our mapping of the current region (batch) */
   1.131      char *region_base;
   1.132  
   1.133      xc_mmu_t *mmu = NULL;
   1.134 @@ -244,8 +280,10 @@ int xc_linux_restore(int xc_handle, int 
   1.135      p2m        = calloc(max_pfn, sizeof(xen_pfn_t));
   1.136      pfn_type   = calloc(max_pfn, sizeof(unsigned long));
   1.137      region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
   1.138 +    p2m_batch  = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
   1.139  
   1.140 -    if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
   1.141 +    if ((p2m == NULL) || (pfn_type == NULL) ||
   1.142 +        (region_mfn == NULL) || (p2m_batch == NULL)) {
   1.143          ERROR("memory alloc failed");
   1.144          errno = ENOMEM;
   1.145          goto out;
   1.146 @@ -256,6 +294,11 @@ int xc_linux_restore(int xc_handle, int 
   1.147          goto out;
   1.148      }
   1.149  
   1.150 +    if (lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
   1.151 +        ERROR("Could not lock p2m_batch");
   1.152 +        goto out;
   1.153 +    }
   1.154 +
   1.155      /* Get the domain's shared-info frame. */
   1.156      domctl.cmd = XEN_DOMCTL_getdomaininfo;
   1.157      domctl.domain = (domid_t)dom;
   1.158 @@ -270,17 +313,9 @@ int xc_linux_restore(int xc_handle, int 
   1.159          goto out;
   1.160      }
   1.161  
   1.162 +    /* Mark all PFNs as invalid; we allocate on demand */
   1.163      for ( pfn = 0; pfn < max_pfn; pfn++ )
   1.164 -        p2m[pfn] = pfn;
   1.165 -
   1.166 -    if (xc_domain_memory_populate_physmap(xc_handle, dom, max_pfn,
   1.167 -                                          0, 0, p2m) != 0) {
   1.168 -        ERROR("Failed to increase reservation by %lx KB", PFN_TO_KB(max_pfn));
   1.169 -        errno = ENOMEM;
   1.170 -        goto out;
   1.171 -    }
   1.172 -
   1.173 -    DPRINTF("Increased domain reservation by %lx KB\n", PFN_TO_KB(max_pfn));
   1.174 +        p2m[pfn] = INVALID_P2M_ENTRY;
   1.175  
   1.176      if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
   1.177          ERROR("Could not initialise for MMU updates");
   1.178 @@ -298,7 +333,7 @@ int xc_linux_restore(int xc_handle, int 
   1.179      n = 0;
   1.180      while (1) {
   1.181  
   1.182 -        int j;
   1.183 +        int j, nr_mfns = 0; 
   1.184  
   1.185          this_pc = (n * 100) / max_pfn;
   1.186          if ( (this_pc - prev_pc) >= 5 )
   1.187 @@ -333,6 +368,33 @@ int xc_linux_restore(int xc_handle, int 
   1.188              goto out;
   1.189          }
   1.190  
   1.191 +        /* First pass for this batch: work out how much memory to alloc */
   1.192 +        nr_mfns = 0; 
   1.193 +        for ( i = 0; i < j; i++ )
   1.194 +        {
   1.195 +            unsigned long pfn, pagetype;
   1.196 +            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
   1.197 +            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
   1.198 +
   1.199 +            if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && 
   1.200 +                 (p2m[pfn] == INVALID_P2M_ENTRY) )
   1.201 +            {
   1.202 +                /* Have a live PFN which hasn't had an MFN allocated */
   1.203 +                p2m_batch[nr_mfns++] = pfn; 
   1.204 +            }
   1.205 +        } 
   1.206 +
   1.207 +
   1.208 +        /* Now allocate a bunch of mfns for this batch */
   1.209 +        if (nr_mfns && xc_domain_memory_populate_physmap(
   1.210 +                xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) { 
   1.211 +            ERROR("Failed to allocate memory for batch.!\n"); 
   1.212 +            errno = ENOMEM;
   1.213 +            goto out;
   1.214 +        }
   1.215 +
   1.216 +        /* Second pass for this batch: update p2m[] and region_mfn[] */
   1.217 +        nr_mfns = 0; 
   1.218          for ( i = 0; i < j; i++ )
   1.219          {
   1.220              unsigned long pfn, pagetype;
   1.221 @@ -340,13 +402,23 @@ int xc_linux_restore(int xc_handle, int 
   1.222              pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
   1.223  
   1.224              if ( pagetype == XEN_DOMCTL_PFINFO_XTAB)
   1.225 -                region_mfn[i] = 0; /* we know map will fail, but don't care */
   1.226 -            else
   1.227 -                region_mfn[i] = p2m[pfn];
   1.228 -        }
   1.229 +                region_mfn[i] = ~0UL; /* map will fail but we don't care */
   1.230 +            else 
   1.231 +            {
   1.232 +                if (p2m[pfn] == INVALID_P2M_ENTRY) {
   1.233 +                    /* We just allocated a new mfn above; update p2m */
   1.234 +                    p2m[pfn] = p2m_batch[nr_mfns++]; 
   1.235 +                }
   1.236  
   1.237 +                /* setup region_mfn[] for batch map */
   1.238 +                region_mfn[i] = p2m[pfn]; 
   1.239 +            }
   1.240 +        } 
   1.241 +
   1.242 +        /* Map relevant mfns */
   1.243          region_base = xc_map_foreign_batch(
   1.244              xc_handle, dom, PROT_WRITE, region_mfn, j);
   1.245 +
   1.246          if ( region_base == NULL )
   1.247          {
   1.248              ERROR("map batch failed");
   1.249 @@ -401,7 +473,8 @@ int xc_linux_restore(int xc_handle, int 
   1.250                      pae_extended_cr3 ||
   1.251                      (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
   1.252  
   1.253 -                    if (!uncanonicalize_pagetable(pagetype, page)) {
   1.254 +                    if (!uncanonicalize_pagetable(xc_handle, dom, 
   1.255 +                                                  pagetype, page)) {
   1.256                          /*
   1.257                          ** Failing to uncanonicalize a page table can be ok
   1.258                          ** under live migration since the pages type may have
   1.259 @@ -411,10 +484,8 @@ int xc_linux_restore(int xc_handle, int 
   1.260                                  pagetype >> 28, pfn, mfn);
   1.261                          nraces++;
   1.262                          continue;
   1.263 -                    }
   1.264 -
   1.265 +                    } 
   1.266                  }
   1.267 -
   1.268              }
   1.269              else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
   1.270              {
   1.271 @@ -486,7 +557,7 @@ int xc_linux_restore(int xc_handle, int 
   1.272          */
   1.273  
   1.274          int j, k;
   1.275 -
   1.276 +        
   1.277          /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
   1.278          for ( i = 0; i < max_pfn; i++ )
   1.279          {
   1.280 @@ -555,7 +626,8 @@ int xc_linux_restore(int xc_handle, int 
   1.281                  }
   1.282  
   1.283                  for(k = 0; k < j; k++) {
   1.284 -                    if(!uncanonicalize_pagetable(XEN_DOMCTL_PFINFO_L1TAB,
   1.285 +                    if(!uncanonicalize_pagetable(xc_handle, dom, 
   1.286 +                                                 XEN_DOMCTL_PFINFO_L1TAB,
   1.287                                                   region_base + k*PAGE_SIZE)) {
   1.288                          ERROR("failed uncanonicalize pt!");
   1.289                          goto out;
   1.290 @@ -631,7 +703,7 @@ int xc_linux_restore(int xc_handle, int 
   1.291      {
   1.292          unsigned int count;
   1.293          unsigned long *pfntab;
   1.294 -        int rc;
   1.295 +        int nr_frees, rc;
   1.296  
   1.297          if (!read_exact(io_fd, &count, sizeof(count))) {
   1.298              ERROR("Error when reading pfn count");
   1.299 @@ -648,29 +720,30 @@ int xc_linux_restore(int xc_handle, int 
   1.300              goto out;
   1.301          }
   1.302  
   1.303 +        nr_frees = 0; 
   1.304          for (i = 0; i < count; i++) {
   1.305  
   1.306              unsigned long pfn = pfntab[i];
   1.307  
   1.308 -            if(pfn > max_pfn)
   1.309 -                /* shouldn't happen - continue optimistically */
   1.310 -                continue;
   1.311 -
   1.312 -            pfntab[i] = p2m[pfn];
   1.313 -            p2m[pfn]  = INVALID_P2M_ENTRY; // not in pseudo-physical map
   1.314 +            if(p2m[pfn] != INVALID_P2M_ENTRY) {
   1.315 +                /* pfn is not in physmap now, but was at some point during 
   1.316 +                   the save/migration process - need to free it */
   1.317 +                pfntab[nr_frees++] = p2m[pfn];
   1.318 +                p2m[pfn]  = INVALID_P2M_ENTRY; // not in pseudo-physical map
   1.319 +            }
   1.320          }
   1.321  
   1.322 -        if (count > 0) {
   1.323 +        if (nr_frees > 0) {
   1.324  
   1.325              struct xen_memory_reservation reservation = {
   1.326 -                .nr_extents   = count,
   1.327 +                .nr_extents   = nr_frees,
   1.328                  .extent_order = 0,
   1.329                  .domid        = dom
   1.330              };
   1.331              set_xen_guest_handle(reservation.extent_start, pfntab);
   1.332  
   1.333              if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
   1.334 -                                   &reservation)) != count) {
   1.335 +                                   &reservation)) != nr_frees) {
   1.336                  ERROR("Could not decrease reservation : %d", rc);
   1.337                  goto out;
   1.338              } else
   1.339 @@ -791,6 +864,6 @@ int xc_linux_restore(int xc_handle, int 
   1.340      free(pfn_type);
   1.341  
   1.342      DPRINTF("Restore exit with rc=%d\n", rc);
   1.343 -
   1.344 +    
   1.345      return rc;
   1.346  }
     2.1 --- a/tools/python/xen/xend/XendCheckpoint.py	Mon Jan 15 18:09:16 2007 +0000
     2.2 +++ b/tools/python/xen/xend/XendCheckpoint.py	Tue Jan 16 10:02:50 2007 +0000
     2.3 @@ -147,18 +147,20 @@ def restore(xd, fd, dominfo = None, paus
     2.4      assert store_port
     2.5      assert console_port
     2.6  
     2.7 +    nr_pfns = (dominfo.getMemoryTarget() + 3) / 4 
     2.8 +
     2.9      try:
    2.10          l = read_exact(fd, sizeof_unsigned_long,
    2.11                         "not a valid guest state file: pfn count read")
    2.12 -        nr_pfns = unpack("L", l)[0]    # native sizeof long
    2.13 -        if nr_pfns > 16*1024*1024:     # XXX 
    2.14 +        max_pfn = unpack("L", l)[0]    # native sizeof long
    2.15 +        if max_pfn > 16*1024*1024:     # XXX 
    2.16              raise XendError(
    2.17                  "not a valid guest state file: pfn count out of range")
    2.18  
    2.19          balloon.free(xc.pages_to_kib(nr_pfns))
    2.20  
    2.21          cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
    2.22 -                        fd, dominfo.getDomid(), nr_pfns,
    2.23 +                        fd, dominfo.getDomid(), max_pfn,
    2.24                          store_port, console_port])
    2.25          log.debug("[xc_restore]: %s", string.join(cmd))
    2.26