ia64/xen-unstable

changeset 9093:689da5e0a970

Added a basic integrity checker, and some basic ability to recover from store
corruption, rather than just spewing error messages and exiting.

Added a xenstore-control executable, which sends commands to xenstored.
Currently, the only command is 'check', which triggers an integrity check.
(The integrity check is also triggered whenever a corrupted store is detected.)

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@leeni.uk.xensource.com
date Thu Mar 02 02:09:23 2006 +0100 (2006-03-02)
parents 849723752858
children cff782f65c4d
files .hgignore tools/xenstore/Makefile tools/xenstore/xenstore_control.c tools/xenstore/xenstored_core.c tools/xenstore/xenstored_core.h
line diff
     1.1 --- a/.hgignore	Thu Mar 02 02:01:17 2006 +0100
     1.2 +++ b/.hgignore	Thu Mar 02 02:09:23 2006 +0100
     1.3 @@ -166,6 +166,7 @@
     1.4  ^tools/xenstore/xenstore-read$
     1.5  ^tools/xenstore/xenstore-rm$
     1.6  ^tools/xenstore/xenstore-write$
     1.7 +^tools/xenstore/xenstore-control$
     1.8  ^tools/xenstore/xenstore-ls$
     1.9  ^tools/xenstore/xenstored$
    1.10  ^tools/xenstore/xenstored_test$
     2.1 --- a/tools/xenstore/Makefile	Thu Mar 02 02:01:17 2006 +0100
     2.2 +++ b/tools/xenstore/Makefile	Thu Mar 02 02:09:23 2006 +0100
     2.3 @@ -27,7 +27,10 @@ CLIENTS := xenstore-exists xenstore-list
     2.4  CLIENTS += xenstore-write
     2.5  CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
     2.6  
     2.7 -all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-ls
     2.8 +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls
     2.9 +
    2.10 +test_interleaved_transactions: test_interleaved_transactions.o
    2.11 +	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
    2.12  
    2.13  testcode: xs_test xenstored_test xs_random
    2.14  
    2.15 @@ -35,13 +38,16 @@ xenstored: xenstored_core.o xenstored_wa
    2.16  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
    2.17  
    2.18  $(CLIENTS): xenstore-%: xenstore_%.o libxenstore.so
    2.19 -	$(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
    2.20 +	$(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
    2.21  
    2.22  $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
    2.23  	$(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
    2.24  
    2.25 +xenstore-control: xenstore_control.o libxenstore.so
    2.26 +	$(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
    2.27 +
    2.28  xenstore-ls: xsls.o libxenstore.so
    2.29 -	$(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
    2.30 +	$(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
    2.31  
    2.32  xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o
    2.33  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
    2.34 @@ -77,7 +83,8 @@ libxenstore.so: xs.opic xs_lib.opic
    2.35  clean: testsuite-clean
    2.36  	rm -f *.o *.opic *.so
    2.37  	rm -f xenstored xs_random xs_stress xs_crashme
    2.38 -	rm -f xs_test xenstored_test xs_tdb_dump xenstore-ls $(CLIENTS)
    2.39 +	rm -f xs_test xenstored_test xs_tdb_dump xenstore-control xenstore-ls
    2.40 +	rm -f $(CLIENTS)
    2.41  	$(RM) $(PROG_DEP)
    2.42  
    2.43  print-dir:
    2.44 @@ -129,7 +136,7 @@ TAGS:
    2.45  tarball: clean
    2.46  	cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/
    2.47  
    2.48 -install: libxenstore.so xenstored xenstore-ls $(CLIENTS)
    2.49 +install: all
    2.50  	$(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored
    2.51  	$(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored
    2.52  	$(INSTALL_DIR) -p $(DESTDIR)/usr/bin
    2.53 @@ -137,6 +144,7 @@ install: libxenstore.so xenstored xensto
    2.54  	$(INSTALL_DIR) -p $(DESTDIR)/usr/include
    2.55  	$(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin
    2.56  	$(INSTALL_PROG) $(CLIENTS) $(DESTDIR)/usr/bin
    2.57 +	$(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin
    2.58  	$(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin
    2.59  	$(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
    2.60  	$(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR)
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/tools/xenstore/xenstore_control.c	Thu Mar 02 02:09:23 2006 +0100
     3.3 @@ -0,0 +1,28 @@
     3.4 +#include <stdio.h>
     3.5 +#include <stdlib.h>
     3.6 +#include <string.h>
     3.7 +
     3.8 +#include "xs.h"
     3.9 +
    3.10 +
    3.11 +int main(int argc, char **argv)
    3.12 +{
    3.13 +  if (argc < 2 ||
    3.14 +      strcmp(argv[1], "check"))
    3.15 +  {
    3.16 +    fprintf(stderr,
    3.17 +            "Usage:\n"
    3.18 +            "\n"
    3.19 +            "       %s check\n"
    3.20 +            "\n", argv[0]);
    3.21 +    return 2;
    3.22 +  }
    3.23 +
    3.24 +  struct xs_handle * xsh = xs_daemon_open();
    3.25 +
    3.26 +  xs_debug_command(xsh, argv[1], NULL, 0);
    3.27 +
    3.28 +  xs_daemon_close(xsh);
    3.29 +
    3.30 +  return 0;
    3.31 +}
     4.1 --- a/tools/xenstore/xenstored_core.c	Thu Mar 02 02:01:17 2006 +0100
     4.2 +++ b/tools/xenstore/xenstored_core.c	Thu Mar 02 02:09:23 2006 +0100
     4.3 @@ -60,6 +60,18 @@ static int reopen_log_pipe[2];
     4.4  static char *tracefile = NULL;
     4.5  static TDB_CONTEXT *tdb_ctx;
     4.6  
     4.7 +static void corrupt(struct connection *conn, const char *fmt, ...);
     4.8 +static void check_store();
     4.9 +
    4.10 +#define log(...)							\
    4.11 +	do {								\
    4.12 +		char *s = talloc_asprintf(NULL, __VA_ARGS__);		\
    4.13 +		trace("%s\n", s);					\
    4.14 +		syslog(LOG_ERR, "%s",  s);				\
    4.15 +		talloc_free(s);						\
    4.16 +	} while (0)
    4.17 +
    4.18 +
    4.19  #ifdef TESTING
    4.20  static bool failtest = false;
    4.21  
    4.22 @@ -104,33 +116,6 @@ int test_mkdir(const char *dir, int perm
    4.23  
    4.24  #include "xenstored_test.h"
    4.25  
    4.26 -/* FIXME: Ideally, this should never be called.  Some can be eliminated. */
    4.27 -/* Something is horribly wrong: shutdown immediately. */
    4.28 -void __attribute__((noreturn)) corrupt(struct connection *conn,
    4.29 -				       const char *fmt, ...)
    4.30 -{
    4.31 -	va_list arglist;
    4.32 -	char *str;
    4.33 -	int saved_errno = errno;
    4.34 -
    4.35 -	va_start(arglist, fmt);
    4.36 -	str = talloc_vasprintf(NULL, fmt, arglist);
    4.37 -	va_end(arglist);
    4.38 -
    4.39 -	trace("xenstored corruption: connection id %i: err %s: %s",
    4.40 -		conn ? (int)conn->id : -1, strerror(saved_errno), str);
    4.41 -	eprintf("xenstored corruption: connection id %i: err %s: %s",
    4.42 -		conn ? (int)conn->id : -1, strerror(saved_errno), str);
    4.43 -#ifdef TESTING
    4.44 -	/* Allow them to attach debugger. */
    4.45 -	sleep(30);
    4.46 -#endif
    4.47 -	syslog(LOG_DAEMON,
    4.48 -	       "xenstored corruption: connection id %i: err %s: %s",
    4.49 -	       conn ? (int)conn->id : -1, strerror(saved_errno), str);
    4.50 -	_exit(2);
    4.51 -}
    4.52 -
    4.53  TDB_CONTEXT *tdb_context(struct connection *conn)
    4.54  {
    4.55  	/* conn = NULL used in manual_node at setup. */
    4.56 @@ -216,7 +201,8 @@ static void trace_io(const struct connec
    4.57  	now = time(NULL);
    4.58  	tm = localtime(&now);
    4.59  
    4.60 -	trace("%s %p %02d:%02d:%02d %s (", prefix, conn,
    4.61 +	trace("%s %p %p %04d%02d%02d %02d:%02d:%02d %s (", prefix, conn,
    4.62 +	      conn->transaction, tm->year + 1900, tm->mon + 1, tm->mday,
    4.63  	      tm->tm_hour, tm->tm_min, tm->tm_sec,
    4.64  	      sockmsg_string(data->hdr.msg.type));
    4.65  	
    4.66 @@ -837,8 +823,6 @@ static int destroy_node(void *_node)
    4.67  	return 0;
    4.68  }
    4.69  
    4.70 -/* Be careful: create heirarchy, put entry in existing parent *last*.
    4.71 - * This helps fsck if we die during this. */
    4.72  static struct node *create_node(struct connection *conn, 
    4.73  				const char *name,
    4.74  				void *data, unsigned int datalen)
    4.75 @@ -939,8 +923,9 @@ static void delete_node(struct connectio
    4.76  {
    4.77  	unsigned int i;
    4.78  
    4.79 -	/* Delete self, then delete children.  If something goes wrong,
    4.80 -	 * consistency check will clean up this way. */
    4.81 +	/* Delete self, then delete children.  If we crash, then the worst
    4.82 +	   that can happen is the children will continue to take up space, but
    4.83 +	   will otherwise be unreachable. */
    4.84  	delete_node_single(conn, node);
    4.85  
    4.86  	/* Delete children, too. */
    4.87 @@ -950,9 +935,14 @@ static void delete_node(struct connectio
    4.88  		child = read_node(conn, 
    4.89  				  talloc_asprintf(node, "%s/%s", node->name,
    4.90  						  node->children + i));
    4.91 -		if (!child)
    4.92 -			corrupt(conn, "No child '%s' found", child);
    4.93 -		delete_node(conn, child);
    4.94 +		if (child) {
    4.95 +			delete_node(conn, child);
    4.96 +		}
    4.97 +		else {
    4.98 +			trace("delete_node: No child '%s/%s' found!\n",
    4.99 +			      node->name, node->children + i);
   4.100 +			/* Skip it, we've already deleted the parent. */
   4.101 +		}
   4.102  	}
   4.103  }
   4.104  
   4.105 @@ -976,12 +966,15 @@ static bool delete_child(struct connecti
   4.106  		}
   4.107  	}
   4.108  	corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
   4.109 +	return false;
   4.110  }
   4.111  
   4.112  
   4.113  static int _rm(struct connection *conn, struct node *node, const char *name)
   4.114  {
   4.115 -	/* Delete from parent first, then if something explodes fsck cleans. */
   4.116 +	/* Delete from parent first, then if we crash, the worst that can
   4.117 +	   happen is the child will continue to take up space, but will
   4.118 +	   otherwise be unreachable. */
   4.119  	struct node *parent = read_node(conn, get_parent(name));
   4.120  	if (!parent) {
   4.121  		send_error(conn, EINVAL);
   4.122 @@ -1000,10 +993,11 @@ static int _rm(struct connection *conn, 
   4.123  
   4.124  static void internal_rm(const char *name)
   4.125  {
   4.126 -	char *tname = talloc_strdup(talloc_autofree_context(), name);
   4.127 +	char *tname = talloc_strdup(NULL, name);
   4.128  	struct node *node = read_node(NULL, tname);
   4.129  	if (node)
   4.130  		_rm(NULL, node, tname);
   4.131 +	talloc_free(tname);
   4.132  }
   4.133  
   4.134  
   4.135 @@ -1149,18 +1143,19 @@ static void process_message(struct conne
   4.136  	case XS_DEBUG:
   4.137  		if (streq(in->buffer, "print"))
   4.138  			xprintf("debug: %s", in->buffer + get_string(in, 0));
   4.139 +		if (streq(in->buffer, "check"))
   4.140 +			check_store();
   4.141  #ifdef TESTING
   4.142  		/* For testing, we allow them to set id. */
   4.143  		if (streq(in->buffer, "setid")) {
   4.144  			conn->id = atoi(in->buffer + get_string(in, 0));
   4.145 -			send_ack(conn, XS_DEBUG);
   4.146  		} else if (streq(in->buffer, "failtest")) {
   4.147  			if (get_string(in, 0) < in->used)
   4.148  				srandom(atoi(in->buffer + get_string(in, 0)));
   4.149 -			send_ack(conn, XS_DEBUG);
   4.150  			failtest = true;
   4.151  		}
   4.152  #endif /* TESTING */
   4.153 +		send_ack(conn, XS_DEBUG);
   4.154  		break;
   4.155  
   4.156  	case XS_WATCH:
   4.157 @@ -1258,7 +1253,7 @@ static void handle_input(struct connecti
   4.158  
   4.159  		if (in->hdr.msg.len > PATH_MAX) {
   4.160  #ifndef TESTING
   4.161 -			syslog(LOG_DAEMON, "Client tried to feed us %i",
   4.162 +			syslog(LOG_ERR, "Client tried to feed us %i",
   4.163  			       in->hdr.msg.len);
   4.164  #endif
   4.165  			goto bad_client;
   4.166 @@ -1425,10 +1420,16 @@ static void setup_structure(void)
   4.167  		   balloon driver will pick up stale entries.  In the case of
   4.168  		   the balloon driver, this can be fatal.
   4.169  		*/
   4.170 -		char *tlocal = talloc_strdup(talloc_autofree_context(),
   4.171 -					     "/local");
   4.172 +		char *tlocal = talloc_strdup(NULL, "/local");
   4.173 +
   4.174 +		check_store();
   4.175 +
   4.176  		internal_rm("/local");
   4.177  		create_node(NULL, tlocal, NULL, 0);
   4.178 +
   4.179 +		talloc_free(tlocal);
   4.180 +
   4.181 +		check_store();
   4.182  	}
   4.183  	else {
   4.184  		tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
   4.185 @@ -1439,11 +1440,93 @@ static void setup_structure(void)
   4.186  		manual_node("/", "tool");
   4.187  		manual_node("/tool", "xenstored");
   4.188  		manual_node("/tool/xenstored", NULL);
   4.189 +
   4.190 +		check_store();
   4.191  	}
   4.192 +}
   4.193 +
   4.194 +static char *child_name(const char *s1, const char *s2)
   4.195 +{
   4.196 +	if (strcmp(s1, "/")) {
   4.197 +		return talloc_asprintf(NULL, "%s/%s", s1, s2);
   4.198 +	}
   4.199 +	else {
   4.200 +		return talloc_asprintf(NULL, "/%s", s2);
   4.201 +	}
   4.202 +}
   4.203 +
   4.204 +static void check_store_(const char *name)
   4.205 +{
   4.206 +	struct node *node = read_node(NULL, name);
   4.207 +
   4.208 +	if (node) {
   4.209 +		size_t i = 0;
   4.210 +
   4.211 +		while (i < node->childlen) {
   4.212 +			size_t childlen = strlen(node->children + i);
   4.213 +			char * childname = child_name(node->name,
   4.214 +						      node->children + i);
   4.215 +			struct node *childnode = read_node(NULL, childname);
   4.216 +			
   4.217 +			if (childnode) {
   4.218 +				check_store_(childname);
   4.219 +				i += childlen + 1;
   4.220 +			}
   4.221 +			else {
   4.222 +				log("check_store: No child '%s' found!\n",
   4.223 +				    childname);
   4.224 +
   4.225 +				memdel(node->children, i, childlen + 1,
   4.226 +				       node->childlen);
   4.227 +				node->childlen -= childlen + 1;
   4.228 +				write_node(NULL, node);
   4.229 +			}
   4.230  
   4.231 -	/* FIXME: Fsck */
   4.232 +			talloc_free(childname);
   4.233 +		}
   4.234 +	}
   4.235 +	else {
   4.236 +		/* Impossible, because no database should ever be without the
   4.237 +		   root, and otherwise, we've just checked in our caller
   4.238 +		   (which made a recursive call to get here). */
   4.239 +		   
   4.240 +		log("check_store: No child '%s' found: impossible!", name);
   4.241 +	}
   4.242 +}
   4.243 +
   4.244 +
   4.245 +static void check_store()
   4.246 +{
   4.247 +	char * root = talloc_strdup(NULL, "/");
   4.248 +	log("Checking store ...");
   4.249 +	check_store_(root);
   4.250 +	log("Checking store complete.");
   4.251 +	talloc_free(root);
   4.252  }
   4.253  
   4.254 +
   4.255 +/* Something is horribly wrong: check the store. */
   4.256 +static void corrupt(struct connection *conn, const char *fmt, ...)
   4.257 +{
   4.258 +	va_list arglist;
   4.259 +	char *str;
   4.260 +	int saved_errno = errno;
   4.261 +
   4.262 +	va_start(arglist, fmt);
   4.263 +	str = talloc_vasprintf(NULL, fmt, arglist);
   4.264 +	va_end(arglist);
   4.265 +
   4.266 +	log("corruption detected by connection %i: err %s: %s",
   4.267 +	    conn ? (int)conn->id : -1, strerror(saved_errno), str);
   4.268 +
   4.269 +#ifdef TESTING
   4.270 +	/* Allow them to attach debugger. */
   4.271 +	sleep(30);
   4.272 +#endif
   4.273 +	check_store();
   4.274 +}
   4.275 +
   4.276 +
   4.277  static void write_pidfile(const char *pidfile)
   4.278  {
   4.279  	char buf[100];
     5.1 --- a/tools/xenstore/xenstored_core.h	Thu Mar 02 02:01:17 2006 +0100
     5.2 +++ b/tools/xenstore/xenstored_core.h	Thu Mar 02 02:09:23 2006 +0100
     5.3 @@ -148,10 +148,6 @@ int destroy_tdb(void *_tdb);
     5.4  /* Replace the tdb: required for transaction code */
     5.5  bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
     5.6  
     5.7 -/* Fail due to excessive corruption, capitalist pigdogs! */
     5.8 -void __attribute__((noreturn)) corrupt(struct connection *conn,
     5.9 -				       const char *fmt, ...);
    5.10 -
    5.11  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
    5.12  
    5.13