ia64/xen-unstable

changeset 9727:1fe63743a147

This patch enables external devices, such as a mounted hard
drive image or a TPM, to be migrated to a remote machine. The patch
hooks into the checkpointing (XendCheckpoint.py) code and performs
migration in 4 different steps:

In a 1st step (step = 0 in the code) migration of all devices of a
domain is 'tested', meaning that their driver implementations (blkif.py,
netif.py, tpmif.py, usbif.py, pciif.py) are asked whether migration is
possible at all. Currently all device representations respond with a
'yes' (=0), although probably a VM mounting a hard drive partition
should respond with a 'no' (-1) already. This first step is a quick
check to see whether devices can be migrated.

The 2nd step is to do whatever can be done before the domain is
suspended. At this point migration of the device could be initiated, if
at all possible.

The 3rd step is to migrate a device after the domain has been suspended,
meaning that it is not scheduled anymore and the VM is 'settled'. All
devices are called again and a good implementation would initiate the
migration in a background process to achieve as much concurrency as
possible.

The 4th step is to synchronize with the 3rd step. At this point the
implementor has to make sure that anything that was initiated in step 3
has completed. Once step 4 has been processed for all devices, the VM will
resume on the remote machine.

I have implemented hooks for migration of a virtual TPM in
xen/xend/server/tpmif.py. These hooks call a configurable external
migration tool using the os.popen() call with a fixed command line
parameter set. The implementation refuses to migrate a VM attached to a
virtual TPM if no tool has been provided for migration.
All other devices do not currently override the 'migrate' method defined
in DevController.py and therefore will just let migration happen.

I have added hooks for error recovery such that whatever part of
migration has been initiated can be rolled back when any of the devices
fail to migrate in one of the steps. The interface (in tpmif.py) to the
external application now uses os.popen() to allow error handling by
reading the application's output.

Signed-off-by: Stefan Berger <stefanb@us.ibm.com>
author emellor@leeni.uk.xensource.com
date Fri Apr 14 21:22:09 2006 +0100 (2006-04-14)
parents 4b168245977a
children 5102cd121a36
files tools/examples/xend-config.sxp tools/python/xen/xend/XendCheckpoint.py tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/XendRoot.py tools/python/xen/xend/server/DevController.py tools/python/xen/xend/server/tpmif.py
line diff
     1.1 --- a/tools/examples/xend-config.sxp	Fri Apr 14 21:15:38 2006 +0100
     1.2 +++ b/tools/examples/xend-config.sxp	Fri Apr 14 21:22:09 2006 +0100
     1.3 @@ -127,3 +127,6 @@
     1.4  
     1.5  # Whether to enable core-dumps when domains crash.
     1.6  #(enable-dump no)
     1.7 +
     1.8 +# The tool used for initiating virtual TPM migration
     1.9 +#(external-migration-tool '')
     2.1 --- a/tools/python/xen/xend/XendCheckpoint.py	Fri Apr 14 21:15:38 2006 +0100
     2.2 +++ b/tools/python/xen/xend/XendCheckpoint.py	Fri Apr 14 21:22:09 2006 +0100
     2.3 @@ -53,7 +53,7 @@ def read_exact(fd, size, errmsg):
     2.4  
     2.5  
     2.6  
     2.7 -def save(fd, dominfo, live):
     2.8 +def save(fd, dominfo, live, dst):
     2.9      write_exact(fd, SIGNATURE, "could not write guest state file: signature")
    2.10  
    2.11      config = sxp.to_string(dominfo.sxpr())
    2.12 @@ -65,6 +65,8 @@ def save(fd, dominfo, live):
    2.13      dominfo.setName('migrating-' + domain_name)
    2.14  
    2.15      try:
    2.16 +        dominfo.migrateDevices(live, dst, 1, domain_name)
    2.17 +
    2.18          write_exact(fd, pack("!i", len(config)),
    2.19                      "could not write guest state file: config len")
    2.20          write_exact(fd, config, "could not write guest state file: config")
    2.21 @@ -85,7 +87,9 @@ def save(fd, dominfo, live):
    2.22                  log.debug("Suspending %d ...", dominfo.getDomid())
    2.23                  dominfo.shutdown('suspend')
    2.24                  dominfo.waitForShutdown()
    2.25 +                dominfo.migrateDevices(live, dst, 2, domain_name)
    2.26                  log.info("Domain %d suspended.", dominfo.getDomid())
    2.27 +                dominfo.migrateDevices(live, dst, 3, domain_name)
    2.28                  tochild.write("done\n")
    2.29                  tochild.flush()
    2.30                  log.debug('Written done')
     3.1 --- a/tools/python/xen/xend/XendDomain.py	Fri Apr 14 21:15:38 2006 +0100
     3.2 +++ b/tools/python/xen/xend/XendDomain.py	Fri Apr 14 21:22:09 2006 +0100
     3.3 @@ -405,6 +405,9 @@ class XendDomain:
     3.4          if dominfo.getDomid() == PRIV_DOMAIN:
     3.5              raise XendError("Cannot migrate privileged domain %i" % domid)
     3.6  
     3.7 +        """ The following call may raise a XendError exception """
     3.8 +        dominfo.testMigrateDevices(live, dst)
     3.9 +
    3.10          if port == 0:
    3.11              port = xroot.get_xend_relocation_port()
    3.12          try:
    3.13 @@ -414,8 +417,8 @@ class XendDomain:
    3.14              raise XendError("can't connect: %s" % err[1])
    3.15  
    3.16          sock.send("receive\n")
    3.17 -        sock.recv(80) 
    3.18 -        XendCheckpoint.save(sock.fileno(), dominfo, live)
    3.19 +        sock.recv(80)
    3.20 +        XendCheckpoint.save(sock.fileno(), dominfo, live, dst)
    3.21  
    3.22  
    3.23      def domain_save(self, domid, dst):
    3.24 @@ -435,7 +438,7 @@ class XendDomain:
    3.25              fd = os.open(dst, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
    3.26              try:
    3.27                  # For now we don't support 'live checkpoint' 
    3.28 -                return XendCheckpoint.save(fd, dominfo, False)
    3.29 +                return XendCheckpoint.save(fd, dominfo, False, dst)
    3.30              finally:
    3.31                  os.close(fd)
    3.32          except OSError, ex:
     4.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Fri Apr 14 21:15:38 2006 +0100
     4.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Fri Apr 14 21:22:09 2006 +0100
     4.3 @@ -1395,6 +1395,38 @@ class XendDomainInfo:
     4.4          if self.image:
     4.5              self.image.createDeviceModel()
     4.6  
     4.7 +    ## public:
     4.8 +
     4.9 +    def testMigrateDevices(self, live, dst):
    4.10 +        """ Notify all device about intention of migration
    4.11 +        @raise: XendError for a device that cannot be migrated
    4.12 +        """
    4.13 +        for (n, c) in self.info['device']:
    4.14 +            rc = self.migrateDevice(n, c, live, dst, 0)
    4.15 +            if rc != 0:
    4.16 +                raise XendError("Device of type '%s' refuses migration." % n)
    4.17 +
    4.18 +    def migrateDevices(self, live, dst, step, domName=''):
    4.19 +        """Notify the devices about migration
    4.20 +        """
    4.21 +        ctr = 0
    4.22 +        try:
    4.23 +            for (n, c) in self.info['device']:
    4.24 +                self.migrateDevice(n, c, live, dst, step, domName)
    4.25 +                ctr = ctr + 1
    4.26 +        except:
    4.27 +            for (n, c) in self.info['device']:
    4.28 +                if ctr == 0:
    4.29 +                    step = step - 1
    4.30 +                ctr = ctr - 1
    4.31 +                self.recoverMigrateDevice(n, c, live, dst, step, domName)
    4.32 +            raise
    4.33 +
    4.34 +    def migrateDevice(self, deviceClass, deviceConfig, live, dst, step, domName=''):
    4.35 +        return self.getDeviceController(deviceClass).migrate(deviceConfig, live, dst, step, domName)
    4.36 +
    4.37 +    def recoverMigrateDevice(self, deviceClass, deviceConfig, live, dst, step, domName=''):
    4.38 +        return self.getDeviceController(deviceClass).recover_migrate(deviceConfig, live, dst, step, domName)
    4.39  
    4.40      def waitForDevices(self):
    4.41          """Wait for this domain's configured devices to connect.
     5.1 --- a/tools/python/xen/xend/XendRoot.py	Fri Apr 14 21:15:38 2006 +0100
     5.2 +++ b/tools/python/xen/xend/XendRoot.py	Fri Apr 14 21:22:09 2006 +0100
     5.3 @@ -86,6 +86,9 @@ class XendRoot:
     5.4      server (deprecated)."""
     5.5      xend_unix_server_default = 'no'
     5.6  
     5.7 +    """Default external migration tool """
     5.8 +    external_migration_tool_default = ''
     5.9 +
    5.10      """Default path the unix-domain server listens at."""
    5.11      xend_unix_path_default = '/var/lib/xend/xend-socket'
    5.12  
    5.13 @@ -250,6 +253,9 @@ class XendRoot:
    5.14          else:
    5.15              return None
    5.16  
    5.17 +    def get_external_migration_tool(self):
    5.18 +        """@return the name of the tool to handle virtual TPM migration."""
    5.19 +        return self.get_config_value('external-migration-tool', self.external_migration_tool_default)
    5.20  
    5.21      def get_enable_dump(self):
    5.22          return self.get_config_bool('enable-dump', 'no')
     6.1 --- a/tools/python/xen/xend/server/DevController.py	Fri Apr 14 21:15:38 2006 +0100
     6.2 +++ b/tools/python/xen/xend/server/DevController.py	Fri Apr 14 21:22:09 2006 +0100
     6.3 @@ -267,6 +267,41 @@ class DevController:
     6.4  
     6.5          raise NotImplementedError()
     6.6  
     6.7 +    def migrate(self, deviceConfig, live, dst, step, domName):
     6.8 +        """ Migration of a device. The 'live' parameter indicates
     6.9 +            whether the device is live-migrated (live=1). 'dst' then gives
    6.10 +            the hostname of the machine to migrate to.
    6.11 +        This function is called for 4 steps:
    6.12 +        If step == 0: Check whether the device is ready to be migrated
    6.13 +                      or can at all be migrated; return a '-1' if
    6.14 +                      the device is NOT ready, a '0' otherwise. If it is
    6.15 +                      not ready ( = not possible to migrate this device),
    6.16 +                      migration will not take place.
    6.17 +           step == 1: Called immediately after step 0; migration
    6.18 +                      of the kernel has started;
    6.19 +           step == 2: Called after the suspend has been issued
    6.20 +                      to the domain and the domain is not scheduled anymore.
    6.21 +                      Synchronize with what was started in step 1, if necessary.
    6.22 +                      Now the device should initiate its transfer to the
    6.23 +                      given target. Since there might be more than just
    6.24 +                      one device initiating a migration, this step should
    6.25 +                      put the process performing the transfer into the
    6.26 +                      background and return immediately to achieve as much
    6.27 +                      concurrency as possible.
    6.28 +           step == 3: Synchronize with the migration of the device that
    6.29 +                      was initiated in step 2.
    6.30 +                      Make sure that the migration has finished and only
    6.31 +                      then return from the call.
    6.32 +        """
    6.33 +        return 0
    6.34 +
    6.35 +
    6.36 +    def recover_migrate(self, deviceConfig, list, dst, step, domName):
    6.37 +        """ Recover from device migration. The given step was the
    6.38 +            last one that was successfully executed.
    6.39 +        """
    6.40 +        return 0
    6.41 +
    6.42  
    6.43      def getDomid(self):
    6.44          """Stub to {@link XendDomainInfo.getDomid}, for use by our
     7.1 --- a/tools/python/xen/xend/server/tpmif.py	Fri Apr 14 21:15:38 2006 +0100
     7.2 +++ b/tools/python/xen/xend/server/tpmif.py	Fri Apr 14 21:22:09 2006 +0100
     7.3 @@ -23,9 +23,17 @@
     7.4  
     7.5  from xen.xend import sxp
     7.6  from xen.xend.XendLogging import log
     7.7 +from xen.xend.XendError import XendError
     7.8 +from xen.xend import XendRoot
     7.9  
    7.10  from xen.xend.server.DevController import DevController
    7.11  
    7.12 +import os
    7.13 +import re
    7.14 +
    7.15 +
    7.16 +xroot = XendRoot.instance()
    7.17 +
    7.18  
    7.19  class TPMifController(DevController):
    7.20      """TPM interface controller. Handles all TPM devices for a domain.
    7.21 @@ -61,3 +69,43 @@ class TPMifController(DevController):
    7.22              result.append(['instance', instance])
    7.23  
    7.24          return result
    7.25 +
    7.26 +    def migrate(self, deviceConfig, live, dst, step, domName):
    7.27 +        """@see DevContoller.migrate"""
    7.28 +        if live:
    7.29 +            tool = xroot.get_external_migration_tool()
    7.30 +            if tool != '':
    7.31 +                log.info("Request to live-migrate device to %s. step=%d.",
    7.32 +                         dst, step)
    7.33 +
    7.34 +                if step == 0:
    7.35 +                    """Assuming for now that everything is ok and migration
    7.36 +                       with the given tool can proceed.
    7.37 +                    """
    7.38 +                    return 0
    7.39 +                else:
    7.40 +                    fd = os.popen("%s -type vtpm -step %d -host %s -domname %s" %
    7.41 +                                  (tool, step, dst, domName),
    7.42 +                                  'r')
    7.43 +                    for line in fd.readlines():
    7.44 +                        mo = re.search('Error', line)
    7.45 +                        if mo:
    7.46 +                            raise XendError("vtpm: Fatal error in migration step %d." %
    7.47 +                                            step)
    7.48 +                    return 0
    7.49 +            else:
    7.50 +                log.debug("External migration tool not in configuration.")
    7.51 +                return -1
    7.52 +        return 0
    7.53 +
    7.54 +    def recover_migrate(self, deviceConfig, live, dst, step, domName):
    7.55 +        """@see DevContoller.recover_migrate"""
    7.56 +        if live:
    7.57 +            tool = xroot.get_external_migration_tool()
    7.58 +            if tool != '':
    7.59 +                log.info("Request to recover live-migrated device. last good step=%d.",
    7.60 +                         step)
    7.61 +                fd = os.popen("%s -type vtpm -step %d -host %s -domname %s -recover" %
    7.62 +                              (tool, step, dst, domName),
    7.63 +                              'r')
    7.64 +        return 0