]> xenbits.xensource.com Git - people/dstodden/blktap.git/commitdiff
CA-26523: Maintain child affiliation in blktapctrl.
authorDaniel Stodden <daniel.stodden@citrix.com>
Mon, 6 Apr 2009 20:22:21 +0000 (13:22 -0700)
committerDaniel Stodden <daniel.stodden@citrix.com>
Mon, 6 Apr 2009 20:22:21 +0000 (13:22 -0700)
Early fatal tapdisk failures are rare but presently go unnoticed. The
less improbable pathologic example case covered here is an AIO init
failure due to resource exhaustion.

At the very least, we would (a) start tapdisk1 synchronously and test
for errors or (b) defer post-daemonization to somewhere after PID_MSG.

Synchronous starts recently becoming more popular -- we used to
daemonize from the first instruction -- sheds light on a more general
issue: tapdisk1 shouldn't detach from blttapctrl at all.

Ergo:

 - Eliminate daemon()ization from tapdisk1.

 - Leaves detaching as a (hopefully unpopular) option, in case
   something is ever found to disagree.

 - Makes blktapctrl reap children.

 - Assocites child exists with respective tapdisk-channels.

 - Lets channels detect and escalate (via XS) unclean tapdisk losses.

At this point, even doing the right thing (tm) and suspending guests
in face of crashed tapdisks becomes an option, but this takes an
additional patch or two on the agent.

daemon/tapdisk-channel.c
daemon/tapdisk-daemon.c
daemon/tapdisk-dispatch.h
drivers/tapdisk.c

index b3b8ed71f952d84199003abd666de7e4c9e087d1..0fbb9afd14c1dfd0e45547a2a53338c52b2dd939 100644 (file)
@@ -858,27 +858,24 @@ tapdisk_channel_start_process(tapdisk_channel_t *channel,
 {
        pid_t child;
        char *argv[] = { "tapdisk", write_dev, read_dev, NULL };
+       int i;
 
        if ((child = fork()) == -1)
                return -errno;
 
-       if (!child) {
-               int i;
-               for (i = 0 ; i < sysconf(_SC_OPEN_MAX) ; i++)
-                       if (i != STDIN_FILENO &&
-                           i != STDOUT_FILENO &&
-                           i != STDERR_FILENO)
-                               close(i);
+       if (child)
+               return child;
 
-               execvp("tapdisk", argv);
-               _exit(1);
-       } else {
-               pid_t got;
-               do {
-                       got = waitpid(child, NULL, 0);
-               } while (got != child);
-       }
-       return 0;
+       for (i = 0; i < sysconf(_SC_OPEN_MAX) ; i++)
+               if (i != STDIN_FILENO &&
+                   i != STDOUT_FILENO &&
+                   i != STDERR_FILENO)
+                       close(i);
+
+       execvp("tapdisk", argv);
+
+       PERROR("execvp");
+       _exit(1);
 }
 
 static int
@@ -926,9 +923,13 @@ tapdisk_channel_launch_tapdisk(tapdisk_channel_t *channel)
                goto fail;
        }
 
-       err = tapdisk_channel_start_process(channel, write_dev, read_dev);
-       if (err)
+       channel->tapdisk_pid = 
+               tapdisk_channel_start_process(channel, write_dev, read_dev);
+       if (channel->tapdisk_pid < 0) {
+               err = channel->tapdisk_pid;
+               channel->tapdisk_pid = -1;
                goto fail;
+       }
 
        channel->open       = 1;
        channel->channel_id = channel->write_fd;
@@ -1273,6 +1274,12 @@ mem_fail:
 void
 tapdisk_channel_close(tapdisk_channel_t *channel)
 {
+       if (channel->tapdisk_pid != -1) {
+               DPRINTF("%s: waiting for child %d to exit\n", 
+                       channel->path, channel->tapdisk_pid);
+               return;
+       }
+
        if (channel->channel_id)
                DPRINTF("%s: closing channel %d:%d\n",
                        channel->path, channel->channel_id, channel->cookie);
@@ -1375,6 +1382,47 @@ fail:
        return err;
 }
 
+void
+tapdisk_channel_reap(tapdisk_channel_t *channel, int status)
+{
+       /* Nothing left to defer for in channel_close() */
+       channel->tapdisk_pid = -1; 
+
+       /* No IPC after this point */
+       channel->open = 0;
+
+       /*
+        * A tapdisk writes CLOSE_RSP and exists. What triggers first,
+        * SIGCHLD or the readfd? Cannot always guarantee events to be
+        * ordered synchronously on the observer side, so we refrain
+        * from assuming any particular order.
+        *
+        * Hence we escalate only the obvious cases. This also means:
+        * A tapdisk broken enough to return status 0 AND dropping the
+        * shutdown message would make us leak a channel.
+        */
+
+       if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
+               tapdisk_channel_fatal(channel,
+                                     "tapdisk died with status %d", 
+                                     WEXITSTATUS(status));
+               return;
+       }
+       
+       if (WIFSIGNALED(status)) {
+               tapdisk_channel_fatal(channel,
+                                     "tapdisk killed by signal %d", 
+                                     WTERMSIG(status));
+               return;
+       }
+
+       if (channel->state == TAPDISK_CHANNEL_CLOSED) {
+               /* We saw the shutdown message */
+               tapdisk_channel_close(channel);
+               return;
+       }
+}
+
 int
 tapdisk_channel_receive_message(tapdisk_channel_t *c, tapdisk_message_t *m)
 {
index 6e033cb7953c8e54bfc302b5853c64928b148fb7..7f41efb14474befa6868e6eb36ef271e264ace03 100644 (file)
@@ -32,6 +32,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <signal.h>
 
 #include <xs.h>
 #include "disktypes.h"
@@ -48,6 +50,8 @@ typedef struct tapdisk_daemon {
        struct xs_handle             *xsh;
        struct list_head              channels;
        struct xenbus_watch           watch;
+
+       sigset_t                      sigunmask;
 } tapdisk_daemon_t;
 
 static tapdisk_daemon_t tapdisk_daemon;
@@ -110,11 +114,18 @@ tapdisk_daemon_write_pidfile(long pid)
        return 0;
 }
 
+static void
+tapdisk_daemon_sa_none(int signo)
+{
+       /* only take a syscall restart */
+}
+
 static int
 tapdisk_daemon_init(void)
 {
        char *devname;
        int i, err, blktap_major;
+       sigset_t mask;
 
        memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
 
@@ -141,6 +152,22 @@ tapdisk_daemon_init(void)
                goto fail;
        }
 
+       /* 
+        * Spoil any opportunity for set/check races by forcing
+        * children to later serialize their demise into the event
+        * loop.
+        *
+        * NB. It's no coincidence we're blocking those signals right
+        * here. XS watches spawn threads [*shiver*]. The new mask is
+        * heritage.
+        */
+       sigemptyset(&mask);
+
+       sigaddset(&mask, SIGCHLD);
+       signal(SIGCHLD, tapdisk_daemon_sa_none);
+
+       sigprocmask(SIG_BLOCK, &mask, &tapdisk_daemon.sigunmask);
+
        for (i = 0; i < 2; i++) {
                tapdisk_daemon.xsh = xs_daemon_open();
                if (!tapdisk_daemon.xsh) {
@@ -155,6 +182,8 @@ tapdisk_daemon_init(void)
                goto fail;
        }
 
+       fcntl(xs_fileno(tapdisk_daemon.xsh), F_SETFD, O_NONBLOCK);
+       
        INIT_LIST_HEAD(&tapdisk_daemon.channels);
 
        free(devname);
@@ -382,6 +411,64 @@ tapdisk_daemon_free(void)
        memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
 }
 
+static pid_t
+tapdisk_daemon_wait(int *_status)
+{
+       tapdisk_channel_t *channel;
+       pid_t pid;
+       int status;
+
+       pid = waitpid(-1, &status, WNOHANG);
+       if (pid == 0)
+               return -1; /* No state changes */
+
+       if (pid < 0) {
+               if (errno != ECHILD) /* No children */
+                       PERROR("waitpid");
+               return -1;
+       }
+
+       *_status = status;
+
+       if (WIFEXITED(status)) {
+               DPRINTF("child %d exited with status %d", pid, 
+                       WEXITSTATUS(status));
+               return pid;
+       }
+
+       if (WIFSIGNALED(status)) {
+               DPRINTF("child %d killed by signal %d", pid, WTERMSIG(status));
+               return pid;
+       }
+               
+       /* WIFSTOPPED? Oh well. */
+       DPRINTF("ignoring child %d transition to state 0x%x.", pid, status);
+
+       return 0;
+}
+
+static void
+tapdisk_daemon_reap_channels(void)
+{
+       do {
+               tapdisk_channel_t *channel, *next;
+               pid_t pid;
+               int status;
+
+               pid = tapdisk_daemon_wait(&status);
+               if (pid < 0)
+                       break;
+
+               if (!pid)
+                       /* ignorable child state. */
+                       continue;
+               
+               tapdisk_daemon_for_each_channel(channel, next)
+                       if (channel->tapdisk_pid == pid)
+                               tapdisk_channel_reap(channel, status);
+       } while (1);
+}
+
 static int
 tapdisk_daemon_read_message(int fd, tapdisk_message_t *message, int timeout)
 {
@@ -485,17 +572,23 @@ tapdisk_daemon_check_fds(fd_set *readfds)
 static int
 tapdisk_daemon_run(void)
 {
-       int err, max;
+       int nfds, max;
        fd_set readfds;
 
        while (1) {
                max = tapdisk_daemon_set_fds(&readfds);
 
-               err = select(max + 1, &readfds, NULL, NULL, NULL);
-               if (err < 0)
-                       continue;
+               nfds = pselect(max + 1, &readfds, NULL, NULL, NULL,
+                              &tapdisk_daemon.sigunmask);
+               if (nfds < 0) {
+                       if (errno != EINTR)
+                               PERROR("select");
+               }
+
+               if (nfds > 0)
+                       tapdisk_daemon_check_fds(&readfds);
 
-               tapdisk_daemon_check_fds(&readfds);
+               tapdisk_daemon_reap_channels();
        }
 
        return 0;
index 58c7ebe9d2c4d5099796b11fabf7dbbe2fa476ed..77d4248b78684f72225e7aef7613478d0988f7d7 100644 (file)
@@ -92,5 +92,6 @@ void tapdisk_daemon_find_channel(tapdisk_channel_t *);
 void tapdisk_daemon_close_channel(tapdisk_channel_t *);
 
 int tapdisk_channel_receive_message(tapdisk_channel_t *, tapdisk_message_t *);
+void tapdisk_channel_reap(tapdisk_channel_t *channel, int status);
 
 #endif
index db1366afa4721326c2d6bc40d2ac7df8d8cf3c12..4fbc52b0e52c7ef2d28a6cad7192fbb181a8d630 100644 (file)
 #include <errno.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <libgen.h>
+#include <getopt.h>
 
 #include "tapdisk-utils.h"
 #include "tapdisk-server.h"
 
+static const char *program;
+
 static void
-usage(void)
+usage(FILE *stream)
 {
-       fprintf(stderr, "blktap-utils: v2.0.0\n");
-       fprintf(stderr, "usage: tapdisk <READ fifo> <WRITE fifo>\n");
-       exit(EINVAL);
+       fprintf(stream, "blktap-utils: v2.0.0\n");
+       fprintf(stream, "Usage: %s [-h ] [-D] <READ fifo> <WRITE fifo>\n", program);
 }
 
 int
 main(int argc, char *argv[])
 {
-       int err;
+       const struct option longopts[] = {
+               { "help",               0, 0, 'h' },
+               { "detach",             0, 0, 'D' }
+       };
+       int err, detach = 0;
+       
+       program = basename(argv[0]);
+
+       do {
+               int c;
+
+               c = getopt_long(argc, argv, "hD", longopts, NULL);
+               if (c == -1)
+                       break;
 
-       if (argc != 3)
-               usage();
+               switch (c) {
+               case 'h':
+                       usage(stdout);
+                       return 0;
+               case 'D':
+                       detach = 1;
+                       break;
+               default:
+                       goto usage;
+               }
+       } while (1);
+       
+       if (argc - optind < 2)
+               goto usage;
+       
+       if (detach) {
+               /* NB. This is expected to rarely, if ever, be
+                  used. Blktapctrl is already detached, and breaking
+                  affiliation will be exactly not desirable. */
+               err = daemon(0, 0);
+               if (err) {
+                       PERROR("daemon");
+                       return -errno;
+               }
+       }
 
-       daemon(0, 0);
        tapdisk_start_logging("TAPDISK");
 
        err = tapdisk_server_initialize(argv[1], argv[2]);
@@ -62,5 +100,10 @@ main(int argc, char *argv[])
 
 out:
        tapdisk_stop_logging();
-       return err;
+
+       return err ? 1 : 0;
+
+usage:
+       usage(stderr);
+       return 1;
 }