From 9a11d0b69663d60174c1365dd02846bca4a37609 Mon Sep 17 00:00:00 2001
From: Ted M Lin
Date: Wed, 4 Aug 2010 15:55:23 -0400
Subject: [PATCH] MAR-112: RRD transfer lock can prevent master transition

If the master loses connectivity during an RRD transfer, the RRD mutex
will never be released. When performing an emergency master transition,
the first thing that occurs is to save the local RRDs to disk, which
requires the RRD mutex. This will never succeed, thus XAPI gets stuck
trying to shut down.

Instead of locking the mutex, we try to acquire it for a few seconds.
If the lock cannot be taken, the loss of some RRD data is preferable to
not being able to transition at all.

Signed-off-by: Ted M Lin
---
 ocaml/xapi/monitor_rrds.ml | 45 ++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/ocaml/xapi/monitor_rrds.ml b/ocaml/xapi/monitor_rrds.ml
index a9123c7b..39940e89 100644
--- a/ocaml/xapi/monitor_rrds.ml
+++ b/ocaml/xapi/monitor_rrds.ml
@@ -272,20 +272,37 @@ let send_host_rrd_to_master () =
    We save our host RRD and running VM RRDs on the local filesystem and pick them up when we restart. 
*)
 let backup ?(save_stats_locally=true) () =
   debug "backup safe_stats_locally=%b" save_stats_locally;
-  let vrrds =
-    Mutex.execute mutex (fun () -> Hashtbl.fold (fun k v acc -> (k,v.rrd)::acc) vm_rrds [])
-  in
-  List.iter (fun (uuid,rrd) ->
-    debug "Backup: saving RRD for VM uuid=%s to local disk" uuid;
-    let rrd = Mutex.execute mutex (fun () -> Rrd.to_string rrd) in
-    archive_rrd uuid ~save_stats_locally rrd)
-    vrrds;
-  match !host_rrd with
-    | Some rrdi ->
-      debug "Backup: saving RRD for host to local disk";
-      let rrd = Mutex.execute mutex (fun () -> Rrd.to_string rrdi.rrd) in
-      archive_rrd (Helpers.get_localhost_uuid ()) ~save_stats_locally rrd
-    | None -> ()
+  (* MAR-112: never block on [mutex]: a stalled RRD transfer can hold it
+     forever. Poll with [Mutex.try_lock] (5 attempts, 1s apart) and skip the
+     backup on failure; losing some RRD data beats a stuck transition. *)
+  let rec acquire attempts_left =
+    Mutex.try_lock mutex
+    || (attempts_left > 1 && (Thread.delay 1.; acquire (attempts_left - 1)))
+  in
+  if not (acquire 5) then debug "Could not acquire RRD lock, skipping RRD backup"
+  else begin
+    (* Serialise everything under this one hold of the lock: re-taking it per
+       RRD with a blocking [Mutex.execute] could hang if it got stuck again.
+       The archiving (I/O) is then done with the lock released. *)
+    let (vm_dumps, host_dump) =
+      try
+        let vms = Hashtbl.fold (fun k v acc -> (k, Rrd.to_string v.rrd)::acc) vm_rrds [] in
+        let host = match !host_rrd with
+          | Some rrdi -> Some (Rrd.to_string rrdi.rrd)
+          | None -> None in
+        (vms, host)
+      with exn -> Mutex.unlock mutex; raise exn
+    in
+    Mutex.unlock mutex;
+    List.iter (fun (uuid, dump) ->
+      debug "Backup: saving RRD for VM uuid=%s to local disk" uuid;
+      archive_rrd uuid ~save_stats_locally dump) vm_dumps;
+    match host_dump with
+      | Some dump ->
+        debug "Backup: saving RRD for host to local disk";
+        archive_rrd (Helpers.get_localhost_uuid ()) ~save_stats_locally dump
+      | None -> ()
+  end
 (** Maybe_remove_rrd - remove an RRD from the local filesystem, if it exists *)
 let maybe_remove_rrd uuid =
-- 
2.39.5