xenbits.xensource.com Git - xen.git/commitdiff
remus: proper cleanup on checkpoint failure.
author: Shriram Rajagopalan <rshriram@cs.ubc.ca>
Fri, 8 Apr 2011 15:49:04 +0000 (16:49 +0100)
committer: Shriram Rajagopalan <rshriram@cs.ubc.ca>
Fri, 8 Apr 2011 15:49:04 +0000 (16:49 +0100)
While running remus, when an error occurs during checkpointing
(e.g., timeouts on primary, failing to checkpoint network buffer
or disk or even communication failure) the domU is sometimes
left in suspended state on primary. Instead of blindly closing
the checkpoint file handle, attempt to resume the domain before
the close.

Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca>
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>
tools/python/xen/lowlevel/checkpoint/checkpoint.c
tools/python/xen/remus/save.py

index 7545d7deb6652e0c213936963b11f28d444d50e3..1581b64095a53eb60d64094317b30f9aefe19696 100644 (file)
@@ -80,6 +80,9 @@ static PyObject* pycheckpoint_close(PyObject* obj, PyObject* args)
 {
   CheckpointObject* self = (CheckpointObject*)obj;
 
+  if (checkpoint_resume(&self->cps) < 0)
+    fprintf(stderr, "%s\n", checkpoint_error(&self->cps));
+
   checkpoint_close(&self->cps);
 
   Py_XDECREF(self->suspend_cb);
index 71517da8c1b4275367d7fa6340318c19b1cee790..9858aec571cb5a214a2a1e752f79d1a84a9d22e7 100644 (file)
@@ -158,9 +158,13 @@ class Saver(object):
             self.checkpointer.open(self.vm.domid)
             self.checkpointer.start(self.fd, self.suspendcb, self.resumecb,
                                     self.checkpointcb, self.interval)
-            self.checkpointer.close()
         except xen.lowlevel.checkpoint.error, e:
             raise CheckpointError(e)
+        finally:
+            try: #errors in checkpoint close are not critical atm.
+                self.checkpointer.close()
+            except:
+                pass
 
     def _resume(self):
         """low-overhead version of XendDomainInfo.resumeDomain"""