ia64/xen-unstable

changeset 17619:c99a88623eda

xend: Fix and improve error handling for failed suspend/migrate

This has been broken since cset 16964:5d84464dc1fc.
Also deal better with very early errors (close the sender-side socket).

Signed-off-by: Steven Hand <steven.hand@cl.cam.ac.uk>
author Keir Fraser <keir.fraser@citrix.com>
date Thu May 08 14:33:31 2008 +0100 (2008-05-08)
parents 8bd776540ab3
children 810d8c3ac992
files tools/python/xen/xend/XendCheckpoint.py tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py
line diff
     1.1 --- a/tools/python/xen/xend/XendCheckpoint.py	Thu May 08 14:32:11 2008 +0100
     1.2 +++ b/tools/python/xen/xend/XendCheckpoint.py	Thu May 08 14:33:31 2008 +0100
     1.3 @@ -81,8 +81,6 @@ def save(fd, dominfo, network, live, dst
     1.4      # thing is useful for debugging.
     1.5      dominfo.setName('migrating-' + domain_name)
     1.6  
     1.7 -    done_suspend = 0
     1.8 -
     1.9      try:
    1.10          dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP1, domain_name)
    1.11  
    1.12 @@ -110,7 +108,6 @@ def save(fd, dominfo, network, live, dst
    1.13                  log.debug("Suspending %d ...", dominfo.getDomid())
    1.14                  dominfo.shutdown('suspend')
    1.15                  dominfo.waitForShutdown()
    1.16 -                done_suspend = 1
    1.17                  dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2,
    1.18                                         domain_name)
    1.19                  log.info("Domain %d suspended.", dominfo.getDomid())
    1.20 @@ -154,16 +151,9 @@ def save(fd, dominfo, network, live, dst
    1.21              pass
    1.22  
    1.23      except Exception, exn:
    1.24 -        log.exception("Save failed on domain %s (%s).", domain_name,
    1.25 +        log.exception("Save failed on domain %s (%s) - resuming.", domain_name,
    1.26                        dominfo.getDomid())
    1.27 -        
    1.28 -        # If we didn't get as far as suspending the domain (for
    1.29 -        # example, we couldn't balloon enough memory for the new
    1.30 -        # domain), then we don't want to re-plumb the devices, as the
    1.31 -        # domU will not be expecting it.
    1.32 -        if done_suspend:
    1.33 -            log.debug("XendCheckpoint.save: resumeDomain")
    1.34 -            dominfo.resumeDomain()
    1.35 +        dominfo.resumeDomain()
    1.36   
    1.37          try:
    1.38              dominfo.setName(domain_name)
     2.1 --- a/tools/python/xen/xend/XendDomain.py	Thu May 08 14:32:11 2008 +0100
     2.2 +++ b/tools/python/xen/xend/XendDomain.py	Thu May 08 14:33:31 2008 +0100
     2.3 @@ -1308,8 +1308,10 @@ class XendDomain:
     2.4  
     2.5          sock.send("receive\n")
     2.6          sock.recv(80)
     2.7 -        XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
     2.8 -        sock.close()
     2.9 +        try:
    2.10 +            XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
    2.11 +        finally:
    2.12 +            sock.close()
    2.13  
    2.14      def domain_save(self, domid, dst, checkpoint=False):
    2.15          """Start saving a domain to file.
     3.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Thu May 08 14:32:11 2008 +0100
     3.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Thu May 08 14:33:31 2008 +0100
     3.3 @@ -2378,8 +2378,19 @@ class XendDomainInfo:
     3.4      def resumeDomain(self):
     3.5          log.debug("XendDomainInfo.resumeDomain(%s)", str(self.domid))
     3.6  
     3.7 -        if self.domid is None:
     3.8 +        # resume a suspended domain (e.g. after live checkpoint, or after
     3.9 +        # a later error during save or migrate); checks that the domain
    3.10 +        # is currently suspended first so safe to call from anywhere
    3.11 +
    3.12 +        xeninfo = dom_get(self.domid)
    3.13 +        if xeninfo is None: 
    3.14              return
    3.15 +        if not xeninfo['shutdown']:
    3.16 +            return
    3.17 +        reason = shutdown_reason(xeninfo['shutdown_reason'])
    3.18 +        if reason != 'suspend':
    3.19 +            return
    3.20 +
    3.21          try:
    3.22              # could also fetch a parsed note from xenstore
    3.23              fast = self.info.get_notes().get('SUSPEND_CANCEL') and 1 or 0