]> xenbits.xensource.com Git - xcp/xen-api.git/commitdiff
When the watchdog restarts xapi, invoke a full fork(),exec() rather than just a fork().
authorDavid Scott <dave.scott@eu.citrix.com>
Wed, 26 Jan 2011 17:39:06 +0000 (17:39 +0000)
committerDavid Scott <dave.scott@eu.citrix.com>
Wed, 26 Jan 2011 17:39:06 +0000 (17:39 +0000)
There are three benefits:
  (1) this allows you to update the xapi binary and then use 'xe host-restart-agent'
  (2) top-level module declarations are re-evaluated and side-effects re-performed. This is particularly important after changing between a master and a slave, since this information is currently cached on demand. If you accidentally 'demand' the information before the fork() then it used to be persisted over the fork.
  (3) a xapi internal-start is now exactly the same as 'service xapi restart' (rather than there being subtle differences)

Signed-off-by: David Scott <dave.scott@eu.citrix.com>
ocaml/xapi/pool_role.ml
ocaml/xapi/pool_role.mli
ocaml/xapi/xapi.ml
ocaml/xapi/xapi_pool_transition.ml

index e65259da774c2a0b5893404bf6bd7c6772445d04..e48f6a5d79e9441f9900c1e787bb6fcff91ae13a 100644 (file)
@@ -27,7 +27,7 @@ type t =
 let role = ref None
 let role_m = Mutex.create ()
 
-let string_of_t = function
+let string_of = function
   | Master -> "master"
   | Slave x -> "slave:" ^ x
   | Broken -> "broken"
@@ -54,7 +54,7 @@ let set_role r =
   let old_role = get_role () in
   Mutex.execute role_m
     (fun () ->
-       Unixext.write_string_to_file Xapi_globs.pool_config_file (string_of_t r));
+       Unixext.write_string_to_file Xapi_globs.pool_config_file (string_of r));
   Localdb.put Constants.this_node_just_became_master (string_of_bool (old_role <> Master && r = Master))
 
 let is_master () = get_role () = Master
index ed0ec678a51e794c151ba161d648ab6c434f6b79..321ad9ee388ba83c700a1d10282a7af1a5dc70b3 100644 (file)
@@ -17,6 +17,9 @@ type t =
   | Slave of string (** IP address *)
   | Broken
 
+(** Returns a printable version ot [t] *)
+val string_of: t -> string
+
 (** Returns the role of this node *)
 val get_role: unit -> t
 (** Reset the role on disk, takes effect on next server restart only! *)
index a87918046a9fcc858b1db98576b5ea0d0da46b3f..6c2a769c3f408c60163f70863c7834d29e97e87e 100644 (file)
@@ -240,7 +240,6 @@ let init_args() =
   Debug.name_thread "thread_zero";
   (* Immediately register callback functions *)
   register_callback_fns();
-  let writelog = ref false in
   Arg.parse [
               "-daemon", Arg.Set daemonize, "run as a daemon in the background";
               "-config", Arg.Set_string Xapi_globs.config_file, "set config file to use";
@@ -676,7 +675,7 @@ let server_init() =
     ignore(Http_svr.start (localhost_sock, "inet-RPC"));
     in
 
-  let print_server_starting_message() = debug "on_system_boot=%b" !Xapi_globs.on_system_boot in
+  let print_server_starting_message() = debug "on_system_boot=%b pool_role=%s" !Xapi_globs.on_system_boot (Pool_role.string_of (Pool_role.get_role ())) in
 
   (* Record the initial value of Master_connection.connection_timeout and set it to 'never'. When we are a slave who
      has just started up we want to wait forever for the master to appear. (See CA-25481) *)
@@ -722,7 +721,7 @@ let server_init() =
       let service_name = Db.Host.get_external_auth_service_name ~__context ~self:host in
       let last_error = ref None in
       (* watchdog to indicate that on_xapi_initialize wasn't successful after 2 min initializing *)
-      Thread.create (fun ()-> Thread.delay (2.0 *. 60.0); (* wait 2min before testing for success *)
+      let (_: Thread.t) = Thread.create (fun ()-> Thread.delay (2.0 *. 60.0); (* wait 2min before testing for success *)
         if not !Xapi_globs.event_hook_auth_on_xapi_initialize_succeeded then
         begin (* no success after 2 min *)
           let obj_uuid = Helpers.get_localhost_uuid () in
@@ -741,7 +740,7 @@ let server_init() =
             );
           ));
         end
-      ) ();
+      ) () in (); (* ignore Thread.t *)
       (* persistent loop trying to initialize the external authentication service *)
       (* obs: this loop will also end after a host.disable_external_auth call *)
       while (not !Xapi_globs.event_hook_auth_on_xapi_initialize_succeeded) do
@@ -961,118 +960,119 @@ let delay_on_eintr f =
   | e -> raise e
 
 let watchdog f =
-  if !nowatchdog then delay_on_eintr f
-  else
-    begin
-      (* parent process blocks sigint and forward sigterm to child. *)
-      ignore(Unix.sigprocmask Unix.SIG_BLOCK [Sys.sigint]);
-      Sys.catch_break false;
-      Logs.append "watchdog" Log.Info "syslog:xapi_watchdog";
-
-      (* watchdog logic *)
-      let loginfo fmt = W.info fmt in
-
-      let restart = ref true
-      and error_msg = ref "" and exit_code = ref 0
-                            and last_badsig = ref (0.) and pid = ref 0
-                                                       and last_badexit = ref (0.) and no_retry_interval = 60. in
-
-      while !restart
-      do
-       begin
-         loginfo "(Re)starting xapi...";
-         if !pid = 0 then
-           begin
-             let newpid = Unix.fork () in
-             if newpid = 0 then
-               begin
-                 try
-                   ignore(Unix.sigprocmask Unix.SIG_UNBLOCK [Sys.sigint]);
-                   delay_on_eintr f;
-                   exit 127
-                 with e ->
+       if !nowatchdog then begin
+               try
+                       ignore(Unix.sigprocmask Unix.SIG_UNBLOCK [Sys.sigint]);
+                       delay_on_eintr f;
+                       exit 127
+               with e ->
                    error "Caught exception at toplevel: '%s'" (Printexc.to_string e);
                    log_backtrace ();
                    raise e (* will exit the process with rc=2 *)
+       end else begin
+               (* parent process blocks sigint and forward sigterm to child. *)
+               ignore(Unix.sigprocmask Unix.SIG_BLOCK [Sys.sigint]);
+               Sys.catch_break false;
+               Logs.append "watchdog" Log.Info "syslog:xapi_watchdog";
+               
+               (* watchdog logic *)
+               let loginfo fmt = W.info fmt in
+               
+               let restart = ref true
+               and error_msg = ref "" 
+               and exit_code = ref 0
+               and last_badsig = ref (0.) 
+               and pid = ref None
+               and last_badexit = ref (0.) 
+               and no_retry_interval = 60. in
+
+               while !restart do begin
+                       loginfo "(Re)starting xapi...";
+                       if !pid = None then begin
+                               let cmd = Sys.argv.(0) in
+
+                               let overriden_args = [ "-onsystemboot"; "-daemon" ] in
+                               let core_args = 
+                                       List.filter (fun x -> not(List.mem x overriden_args))
+                                               (List.tl (Array.to_list Sys.argv)) in
+                               let args = 
+                                       "-nowatchdog" :: core_args 
+                                       @ (if !Xapi_globs.on_system_boot then [ "-onsystemboot" ] else []) in
+
+                               let newpid = Forkhelpers.safe_close_and_exec ~env:(Unix.environment ()) None None None [] cmd args in
+
+                               (* parent just reset the sighandler *)
+                               Sys.set_signal Sys.sigterm (Sys.Signal_handle (fun i -> restart := false; Unix.kill (Forkhelpers.getpid newpid) Sys.sigterm));
+                               pid := Some newpid;
+
+                               (* CA-22875: make sure we unset on_system_boot so we don't 
+                                  execute on-boot stuff more than once eg over a 
+                                  'pool-emergency-transition-to-master' or an HA failover *)
+                               Xapi_globs.on_system_boot := false;
+
+                       end;
+                       try
+                               (* remove the pid in all case, except stop *)
+                               match snd (Forkhelpers.waitpid (Opt.unbox !pid)) with
+                                       | Unix.WEXITED i when i = Xapi_globs.restart_return_code ->
+                                               pid := None;
+                                               loginfo "restarting xapi in different operating mode";
+                                               ()
+                                       | Unix.WEXITED i when i=0->
+                                               loginfo "received exit code 0. Not restarting.";
+                                               pid := None;
+                                               restart := false;
+                                               error_msg := "";
+                                       | Unix.WEXITED i ->
+                                               loginfo "received exit code %d" i;
+                                               exit_code := i;
+                                               pid := None;
+                                               let ctime = Unix.time () in
+                                               if ctime < (!last_badexit +. no_retry_interval) then begin
+                                                       restart := false;
+                                                       loginfo "Received 2 bad exits within no-retry-interval. Giving up.";
+                                               end else begin
+                                                       (* restart := true; -- don't need to do this - it's true already *)
+                                                       loginfo "Received bad exit, retrying";
+                                                       last_badexit := ctime
+                                               end
+                                       | Unix.WSIGNALED i ->
+                                               loginfo "received signal %d" i;
+                                               pid := None;
+                                               (* arbitrary choice of signals, probably need more
+                                                  though, for real use *)
+                                               if i = Sys.sigsegv || i = Sys.sigpipe then begin
+                                                       let ctime = Unix.time () in
+                                                       if ctime < (!last_badsig +. no_retry_interval) then begin
+                                                               restart := false;
+                                                               error_msg := sprintf "xapi died with signal %d: not restarting (2 bad signals within no_retry_interval)" i;
+                                                               exit_code := 13
+                                                       end else begin
+                                                               loginfo "xapi died with signal %d: restarting" i;
+                                                               last_badsig := ctime
+                                                       end
+                                               end else begin
+                                                       restart := false;
+                                                       error_msg := sprintf "xapi died with signal %d: not restarting (watchdog never restarts on this signal)" i;
+                                                       exit_code := 12
+                                               end
+                                       | Unix.WSTOPPED i ->
+                                               loginfo "receive stop code %i" i;
+                                               Unix.sleep 1;
+                                               (* well, just resume the stop process. the watchdog
+                                                  cannot do anything if the process is stop *)
+                                               Unix.kill (Forkhelpers.getpid (Opt.unbox !pid)) Sys.sigcont;
+                       with
+                                       Unix.Unix_error(Unix.EINTR,_,_) -> ()
+                               | e -> loginfo "Watchdog received unexpected exception: %s" (Printexc.to_string e)
                end;
-             (* parent just reset the sighandler *)
-             Sys.set_signal Sys.sigterm (Sys.Signal_handle (fun i -> restart := false; Unix.kill newpid Sys.sigterm));
-             pid := newpid;
-             (* CA-22875: make sure we unset on_system_boot so we don't execute on-boot stuff more than once eg over a 
-                'pool-emergency-transition-to-master' or an HA failover *)
-             Xapi_globs.on_system_boot := false
-           end;
-         try
-           (* remove the pid in all case, except stop *)
-           match snd (Unix.waitpid [] !pid) with
-           | Unix.WEXITED i when i = Xapi_globs.restart_return_code ->
-               pid := 0;
-               loginfo "restarting xapi in different operating mode";
-               ()
-           | Unix.WEXITED i when i=0->
-               loginfo "received exit code 0. Not restarting.";
-               pid := 0;
-               restart := false;
-               error_msg := "";
-           | Unix.WEXITED i ->
-               loginfo "received exit code %d" i;
-               exit_code := i;
-               pid := 0;
-               let ctime = Unix.time () in
-               if ctime < (!last_badexit +. no_retry_interval) then
-                 begin
-                   restart := false;
-                   loginfo "Received 2 bad exits within no-retry-interval. Giving up.";
-                 end
-               else
-                 begin
-                   (* restart := true; -- don't need to do this - it's true already *)
-                   loginfo "Received bad exit, retrying";
-                   last_badexit := ctime
-                 end
-           | Unix.WSIGNALED i ->
-               loginfo "received signal %d" i;
-               pid := 0;
-               (* arbitrary choice of signals, probably need more
-                  though, for real use *)
-               if i = Sys.sigsegv || i = Sys.sigpipe then
-                 begin
-                   let ctime = Unix.time () in
-                   if ctime < (!last_badsig +. no_retry_interval) then
-                     begin
-                       restart := false;
-                       error_msg := sprintf "xapi died with signal %d: not restarting (2 bad signals within no_retry_interval)" i;
-                       exit_code := 13
-                     end else
-                       begin
-                         loginfo "xapi died with signal %d: restarting" i;
-                         last_badsig := ctime
-                       end
-                 end
-               else
-                 begin
-                   restart := false;
-                   error_msg := sprintf "xapi died with signal %d: not restarting (watchdog never restarts on this signal)" i;
-                   exit_code := 12
-                 end
-           | Unix.WSTOPPED i ->
-               loginfo "receive stop code %i" i;
-               Unix.sleep 1;
-               (* well, just resume the stop process. the watchdog
-                  cannot do anything if the process is stop *)
-               Unix.kill !pid Sys.sigcont;
-         with
-           Unix.Unix_error(Unix.EINTR,_,_) -> ()
-         | e -> loginfo "Watchdog received unexpected exception: %s" (Printexc.to_string e)
-       end;
-      done;
-      if !error_msg <> "" then
-       begin
-         loginfo "xapi watchdog exiting.";
-         loginfo "Fatal: %s" !error_msg;
-         eprintf "%s\n" !error_msg;
-       end;
-      exit !exit_code    
+               done;
+               if !error_msg <> "" then begin
+                       loginfo "xapi watchdog exiting.";
+                       loginfo "Fatal: %s" !error_msg;
+                       eprintf "%s\n" !error_msg;
+               end;
+               exit !exit_code    
     end
 
 let _ =
index 5311f425163ea7e1b167a5e6d8832feafe1ad15d..5cb1419078c3acbf680a8fcc465d0e5cf47232b9 100644 (file)
@@ -158,6 +158,7 @@ let become_another_masters_slave master_address =
   if Pool_role.get_role () = new_role then begin
     debug "We are already a slave of %s; nothing to do" master_address;
   end else begin
+         debug "Setting pool.conf to point to %s" master_address;
     Pool_role.set_role new_role;
     run_external_scripts false;
     Xapi_fuse.light_fuse_and_run ()