From: David Scott Date: Wed, 26 Jan 2011 17:39:06 +0000 (+0000) Subject: When the watchdog restarts xapi, invoke a full fork(),exec() rather than just a fork(). X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=ee555868f43f9d22e338ac449d8205680b3bf0f0;p=xcp%2Fxen-api.git When the watchdog restarts xapi, invoke a full fork(),exec() rather than just a fork(). There are three benefits: (1) this allows you to update the xapi binary and then use 'xe host-restart-agent' (2) top-level module declarations are re-evaluated and side-effects re-performed. This is particularly important after changing between a master and a slave, since this information is currently cached on demand. If you accidentally 'demand' the information before the fork() then it used to be persisted over the fork. (3) a xapi internal-start is now exactly the same as 'service xapi restart' (rather than there being subtle differences) Signed-off-by: David Scott --- diff --git a/ocaml/xapi/pool_role.ml b/ocaml/xapi/pool_role.ml index e65259da..e48f6a5d 100644 --- a/ocaml/xapi/pool_role.ml +++ b/ocaml/xapi/pool_role.ml @@ -27,7 +27,7 @@ type t = let role = ref None let role_m = Mutex.create () -let string_of_t = function +let string_of = function | Master -> "master" | Slave x -> "slave:" ^ x | Broken -> "broken" @@ -54,7 +54,7 @@ let set_role r = let old_role = get_role () in Mutex.execute role_m (fun () -> - Unixext.write_string_to_file Xapi_globs.pool_config_file (string_of_t r)); + Unixext.write_string_to_file Xapi_globs.pool_config_file (string_of r)); Localdb.put Constants.this_node_just_became_master (string_of_bool (old_role <> Master && r = Master)) let is_master () = get_role () = Master diff --git a/ocaml/xapi/pool_role.mli b/ocaml/xapi/pool_role.mli index ed0ec678..321ad9ee 100644 --- a/ocaml/xapi/pool_role.mli +++ b/ocaml/xapi/pool_role.mli @@ -17,6 +17,9 @@ type t = | Slave of string (** IP address *) | Broken +(** Returns a printable version ot [t] *) +val string_of: t -> string + (** Returns the role of this node *) val get_role: unit -> t (** Reset the role on disk, takes effect on next server restart only! *) diff --git a/ocaml/xapi/xapi.ml b/ocaml/xapi/xapi.ml index a8791804..6c2a769c 100644 --- a/ocaml/xapi/xapi.ml +++ b/ocaml/xapi/xapi.ml @@ -240,7 +240,6 @@ let init_args() = Debug.name_thread "thread_zero"; (* Immediately register callback functions *) register_callback_fns(); - let writelog = ref false in Arg.parse [ "-daemon", Arg.Set daemonize, "run as a daemon in the background"; "-config", Arg.Set_string Xapi_globs.config_file, "set config file to use"; @@ -676,7 +675,7 @@ let server_init() = ignore(Http_svr.start (localhost_sock, "inet-RPC")); in - let print_server_starting_message() = debug "on_system_boot=%b" !Xapi_globs.on_system_boot in + let print_server_starting_message() = debug "on_system_boot=%b pool_role=%s" !Xapi_globs.on_system_boot (Pool_role.string_of (Pool_role.get_role ())) in (* Record the initial value of Master_connection.connection_timeout and set it to 'never'. When we are a slave who has just started up we want to wait forever for the master to appear. (See CA-25481) *) @@ -722,7 +721,7 @@ let server_init() = let service_name = Db.Host.get_external_auth_service_name ~__context ~self:host in let last_error = ref None in (* watchdog to indicate that on_xapi_initialize wasn't successful after 2 min initializing *) - Thread.create (fun ()-> Thread.delay (2.0 *. 60.0); (* wait 2min before testing for success *) + let (_: Thread.t) = Thread.create (fun ()-> Thread.delay (2.0 *. 60.0); (* wait 2min before testing for success *) if not !Xapi_globs.event_hook_auth_on_xapi_initialize_succeeded then begin (* no success after 2 min *) let obj_uuid = Helpers.get_localhost_uuid () in @@ -741,7 +740,7 @@ let server_init() = ); )); end - ) (); + ) () in (); (* ignore Thread.t *) (* persistent loop trying to initialize the external authentication service *) (* obs: this loop will also end after a host.disable_external_auth call *) while (not !Xapi_globs.event_hook_auth_on_xapi_initialize_succeeded) do @@ -961,118 +960,119 @@ let delay_on_eintr f = | e -> raise e let watchdog f = - if !nowatchdog then delay_on_eintr f - else - begin - (* parent process blocks sigint and forward sigterm to child. *) - ignore(Unix.sigprocmask Unix.SIG_BLOCK [Sys.sigint]); - Sys.catch_break false; - Logs.append "watchdog" Log.Info "syslog:xapi_watchdog"; - - (* watchdog logic *) - let loginfo fmt = W.info fmt in - - let restart = ref true - and error_msg = ref "" and exit_code = ref 0 - and last_badsig = ref (0.) and pid = ref 0 - and last_badexit = ref (0.) and no_retry_interval = 60. in - - while !restart - do - begin - loginfo "(Re)starting xapi..."; - if !pid = 0 then - begin - let newpid = Unix.fork () in - if newpid = 0 then - begin - try - ignore(Unix.sigprocmask Unix.SIG_UNBLOCK [Sys.sigint]); - delay_on_eintr f; - exit 127 - with e -> + if !nowatchdog then begin + try + ignore(Unix.sigprocmask Unix.SIG_UNBLOCK [Sys.sigint]); + delay_on_eintr f; + exit 127 + with e -> error "Caught exception at toplevel: '%s'" (Printexc.to_string e); log_backtrace (); raise e (* will exit the process with rc=2 *) + end else begin + (* parent process blocks sigint and forward sigterm to child. *) + ignore(Unix.sigprocmask Unix.SIG_BLOCK [Sys.sigint]); + Sys.catch_break false; + Logs.append "watchdog" Log.Info "syslog:xapi_watchdog"; + + (* watchdog logic *) + let loginfo fmt = W.info fmt in + + let restart = ref true + and error_msg = ref "" + and exit_code = ref 0 + and last_badsig = ref (0.) + and pid = ref None + and last_badexit = ref (0.) + and no_retry_interval = 60. in + + while !restart do begin + loginfo "(Re)starting xapi..."; + if !pid = None then begin + let cmd = Sys.argv.(0) in + + let overriden_args = [ "-onsystemboot"; "-daemon" ] in + let core_args = + List.filter (fun x -> not(List.mem x overriden_args)) + (List.tl (Array.to_list Sys.argv)) in + let args = + "-nowatchdog" :: core_args + @ (if !Xapi_globs.on_system_boot then [ "-onsystemboot" ] else []) in + + let newpid = Forkhelpers.safe_close_and_exec ~env:(Unix.environment ()) None None None [] cmd args in + + (* parent just reset the sighandler *) + Sys.set_signal Sys.sigterm (Sys.Signal_handle (fun i -> restart := false; Unix.kill (Forkhelpers.getpid newpid) Sys.sigterm)); + pid := Some newpid; + + (* CA-22875: make sure we unset on_system_boot so we don't + execute on-boot stuff more than once eg over a + 'pool-emergency-transition-to-master' or an HA failover *) + Xapi_globs.on_system_boot := false; + + end; + try + (* remove the pid in all case, except stop *) + match snd (Forkhelpers.waitpid (Opt.unbox !pid)) with + | Unix.WEXITED i when i = Xapi_globs.restart_return_code -> + pid := None; + loginfo "restarting xapi in different operating mode"; + () + | Unix.WEXITED i when i=0-> + loginfo "received exit code 0. Not restarting."; + pid := None; + restart := false; + error_msg := ""; + | Unix.WEXITED i -> + loginfo "received exit code %d" i; + exit_code := i; + pid := None; + let ctime = Unix.time () in + if ctime < (!last_badexit +. no_retry_interval) then begin + restart := false; + loginfo "Received 2 bad exits within no-retry-interval. Giving up."; + end else begin + (* restart := true; -- don't need to do this - it's true already *) + loginfo "Received bad exit, retrying"; + last_badexit := ctime + end + | Unix.WSIGNALED i -> + loginfo "received signal %d" i; + pid := None; + (* arbitrary choice of signals, probably need more + though, for real use *) + if i = Sys.sigsegv || i = Sys.sigpipe then begin + let ctime = Unix.time () in + if ctime < (!last_badsig +. no_retry_interval) then begin + restart := false; + error_msg := sprintf "xapi died with signal %d: not restarting (2 bad signals within no_retry_interval)" i; + exit_code := 13 + end else begin + loginfo "xapi died with signal %d: restarting" i; + last_badsig := ctime + end + end else begin + restart := false; + error_msg := sprintf "xapi died with signal %d: not restarting (watchdog never restarts on this signal)" i; + exit_code := 12 + end + | Unix.WSTOPPED i -> + loginfo "receive stop code %i" i; + Unix.sleep 1; + (* well, just resume the stop process. the watchdog + cannot do anything if the process is stop *) + Unix.kill (Forkhelpers.getpid (Opt.unbox !pid)) Sys.sigcont; + with + Unix.Unix_error(Unix.EINTR,_,_) -> () + | e -> loginfo "Watchdog received unexpected exception: %s" (Printexc.to_string e) end; - (* parent just reset the sighandler *) - Sys.set_signal Sys.sigterm (Sys.Signal_handle (fun i -> restart := false; Unix.kill newpid Sys.sigterm)); - pid := newpid; - (* CA-22875: make sure we unset on_system_boot so we don't execute on-boot stuff more than once eg over a - 'pool-emergency-transition-to-master' or an HA failover *) - Xapi_globs.on_system_boot := false - end; - try - (* remove the pid in all case, except stop *) - match snd (Unix.waitpid [] !pid) with - | Unix.WEXITED i when i = Xapi_globs.restart_return_code -> - pid := 0; - loginfo "restarting xapi in different operating mode"; - () - | Unix.WEXITED i when i=0-> - loginfo "received exit code 0. Not restarting."; - pid := 0; - restart := false; - error_msg := ""; - | Unix.WEXITED i -> - loginfo "received exit code %d" i; - exit_code := i; - pid := 0; - let ctime = Unix.time () in - if ctime < (!last_badexit +. no_retry_interval) then - begin - restart := false; - loginfo "Received 2 bad exits within no-retry-interval. Giving up."; - end - else - begin - (* restart := true; -- don't need to do this - it's true already *) - loginfo "Received bad exit, retrying"; - last_badexit := ctime - end - | Unix.WSIGNALED i -> - loginfo "received signal %d" i; - pid := 0; - (* arbitrary choice of signals, probably need more - though, for real use *) - if i = Sys.sigsegv || i = Sys.sigpipe then - begin - let ctime = Unix.time () in - if ctime < (!last_badsig +. no_retry_interval) then - begin - restart := false; - error_msg := sprintf "xapi died with signal %d: not restarting (2 bad signals within no_retry_interval)" i; - exit_code := 13 - end else - begin - loginfo "xapi died with signal %d: restarting" i; - last_badsig := ctime - end - end - else - begin - restart := false; - error_msg := sprintf "xapi died with signal %d: not restarting (watchdog never restarts on this signal)" i; - exit_code := 12 - end - | Unix.WSTOPPED i -> - loginfo "receive stop code %i" i; - Unix.sleep 1; - (* well, just resume the stop process. the watchdog - cannot do anything if the process is stop *) - Unix.kill !pid Sys.sigcont; - with - Unix.Unix_error(Unix.EINTR,_,_) -> () - | e -> loginfo "Watchdog received unexpected exception: %s" (Printexc.to_string e) - end; - done; - if !error_msg <> "" then - begin - loginfo "xapi watchdog exiting."; - loginfo "Fatal: %s" !error_msg; - eprintf "%s\n" !error_msg; - end; - exit !exit_code + done; + if !error_msg <> "" then begin + loginfo "xapi watchdog exiting."; + loginfo "Fatal: %s" !error_msg; + eprintf "%s\n" !error_msg; + end; + exit !exit_code end let _ = diff --git a/ocaml/xapi/xapi_pool_transition.ml b/ocaml/xapi/xapi_pool_transition.ml index 5311f425..5cb14190 100644 --- a/ocaml/xapi/xapi_pool_transition.ml +++ b/ocaml/xapi/xapi_pool_transition.ml @@ -158,6 +158,7 @@ let become_another_masters_slave master_address = if Pool_role.get_role () = new_role then begin debug "We are already a slave of %s; nothing to do" master_address; end else begin + debug "Setting pool.conf to point to %s" master_address; Pool_role.set_role new_role; run_external_scripts false; Xapi_fuse.light_fuse_and_run ()