ia64/xen-unstable

changeset 7039:76af1a1df67c

Make xenstored use tdb, transactions can soft-fail (EAGAIN)
Transactions no longer take root dir, no longer lock & block: commit can fail spuriously with EAGAIN, not ETIMEDOUT.
Speeds up transactions by over 1000 times, should be NFS safe.
New program: xs_tdb_dump to dump raw TDB contents.
Don't do failure testing: we are no longer robust against all ENOMEM 8(
Introduce "struct node" which contains perms, children and data.
Make struct xs_permissions unpadded, so we can write to tdb w/o valgrind complaints.
Gently modify TDB to use talloc, not do alloc on tdb_delete.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
author Rusty Russell <rusty@rustcorp.com.au>
date Fri Sep 23 14:25:01 2005 +0100 (2005-09-23)
parents 6aef7d1062bb
children 10d6bda59ea4
files tools/xenstore/Makefile tools/xenstore/tdb.c tools/xenstore/tdb.h tools/xenstore/testsuite/04rm.test tools/xenstore/testsuite/08transaction.slowtest tools/xenstore/testsuite/08transaction.test tools/xenstore/testsuite/12readonly.test tools/xenstore/testsuite/14complexperms.test tools/xenstore/testsuite/16block-watch-crash.test tools/xenstore/xenstored.h tools/xenstore/xenstored_core.c tools/xenstore/xenstored_core.h tools/xenstore/xenstored_domain.c tools/xenstore/xenstored_transaction.c tools/xenstore/xenstored_transaction.h tools/xenstore/xenstored_watch.c tools/xenstore/xenstored_watch.h tools/xenstore/xs.c tools/xenstore/xs.h tools/xenstore/xs_lib.c tools/xenstore/xs_lib.h tools/xenstore/xs_random.c tools/xenstore/xs_stress.c tools/xenstore/xs_tdb_dump.c tools/xenstore/xs_test.c
line diff
     1.1 --- a/tools/xenstore/Makefile	Fri Sep 23 14:24:58 2005 +0100
     1.2 +++ b/tools/xenstore/Makefile	Fri Sep 23 14:25:01 2005 +0100
     1.3 @@ -28,11 +28,11 @@ CLIENTS := xenstore-exists xenstore-list
     1.4  CLIENTS += xenstore-write
     1.5  CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
     1.6  
     1.7 -all: libxenstore.so xenstored $(CLIENTS)
     1.8 +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump
     1.9  
    1.10  testcode: xs_test xenstored_test xs_random xs_dom0_test
    1.11  
    1.12 -xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o
    1.13 +xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o tdb.o
    1.14  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
    1.15  
    1.16  $(CLIENTS): libxenstore.so
    1.17 @@ -42,7 +42,10 @@ xenstored: xenstored_core.o xenstored_wa
    1.18  $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
    1.19  	$(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
    1.20  
    1.21 -xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o
    1.22 +xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o
    1.23 +	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
    1.24 +
    1.25 +xs_tdb_dump: xs_tdb_dump.o utils.o tdb.o talloc.o
    1.26  	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
    1.27  
    1.28  xs_test: xs_test.o xs_lib.o utils.o
    1.29 @@ -103,7 +106,7 @@ RANDSEED=$(shell date +%s)
    1.30  randomcheck: xs_random xenstored_test $(TESTDIR)
    1.31  	$(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED) && echo
    1.32  	$(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED) && echo
    1.33 -	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
    1.34 +#	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
    1.35  
    1.36  crashme:  xs_crashme xenstored_test $(TESTDIR)
    1.37  	rm -rf $(TESTDIR)/store $(TESTDIR)/transactions /tmp/xs_crashme.vglog* /tmp/trace
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/tools/xenstore/tdb.c	Fri Sep 23 14:25:01 2005 +0100
     2.3 @@ -0,0 +1,2151 @@
     2.4 + /* 
     2.5 +   Unix SMB/CIFS implementation.
     2.6 +
     2.7 +   trivial database library
     2.8 +
     2.9 +   Copyright (C) Andrew Tridgell              1999-2004
    2.10 +   Copyright (C) Paul `Rusty' Russell		   2000
    2.11 +   Copyright (C) Jeremy Allison			   2000-2003
    2.12 +   
    2.13 +     ** NOTE! The following LGPL license applies to the tdb
    2.14 +     ** library. This does NOT imply that all of Samba is released
    2.15 +     ** under the LGPL
    2.16 +   
    2.17 +   This library is free software; you can redistribute it and/or
    2.18 +   modify it under the terms of the GNU Lesser General Public
    2.19 +   License as published by the Free Software Foundation; either
    2.20 +   version 2 of the License, or (at your option) any later version.
    2.21 +
    2.22 +   This library is distributed in the hope that it will be useful,
    2.23 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
    2.24 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    2.25 +   Lesser General Public License for more details.
    2.26 +
    2.27 +   You should have received a copy of the GNU Lesser General Public
    2.28 +   License along with this library; if not, write to the Free Software
    2.29 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    2.30 +*/
    2.31 +
    2.32 +
    2.33 +#ifndef _SAMBA_BUILD_
    2.34 +#if HAVE_CONFIG_H
    2.35 +#include <config.h>
    2.36 +#endif
    2.37 +
    2.38 +#include <stdlib.h>
    2.39 +#include <stdio.h>
    2.40 +#include <stdint.h>
    2.41 +#include <fcntl.h>
    2.42 +#include <unistd.h>
    2.43 +#include <string.h>
    2.44 +#include <fcntl.h>
    2.45 +#include <errno.h>
    2.46 +#include <sys/mman.h>
    2.47 +#include <sys/stat.h>
    2.48 +#include "tdb.h"
    2.49 +#include <stdarg.h>
    2.50 +#include "talloc.h"
    2.51 +#define HAVE_MMAP
    2.52 +#else
    2.53 +#include "includes.h"
    2.54 +#include "lib/tdb/include/tdb.h"
    2.55 +#include "system/time.h"
    2.56 +#include "system/shmem.h"
    2.57 +#include "system/filesys.h"
    2.58 +#endif
    2.59 +
    2.60 +#define TDB_MAGIC_FOOD "TDB file\n"
    2.61 +#define TDB_VERSION (0x26011967 + 6)
    2.62 +#define TDB_MAGIC (0x26011999U)
    2.63 +#define TDB_FREE_MAGIC (~TDB_MAGIC)
    2.64 +#define TDB_DEAD_MAGIC (0xFEE1DEAD)
    2.65 +#define TDB_ALIGNMENT 4
    2.66 +#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
    2.67 +#define DEFAULT_HASH_SIZE 131
    2.68 +#define TDB_PAGE_SIZE 0x2000
    2.69 +#define FREELIST_TOP (sizeof(struct tdb_header))
    2.70 +#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
    2.71 +#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
    2.72 +#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
    2.73 +#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
    2.74 +#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
    2.75 +#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1))
    2.76 +
    2.77 +
    2.78 +/* NB assumes there is a local variable called "tdb" that is the
    2.79 + * current context, also takes doubly-parenthesized print-style
    2.80 + * argument. */
    2.81 +#define TDB_LOG(x) tdb->log_fn x
    2.82 +
    2.83 +/* lock offsets */
    2.84 +#define GLOBAL_LOCK 0
    2.85 +#define ACTIVE_LOCK 4
    2.86 +
    2.87 +#ifndef MAP_FILE
    2.88 +#define MAP_FILE 0
    2.89 +#endif
    2.90 +
    2.91 +#ifndef MAP_FAILED
    2.92 +#define MAP_FAILED ((void *)-1)
    2.93 +#endif
    2.94 +
    2.95 +#ifndef discard_const_p
    2.96 +# if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
    2.97 +#  define discard_const(ptr) ((void *)((intptr_t)(ptr)))
    2.98 +# else
    2.99 +#  define discard_const(ptr) ((void *)(ptr))
   2.100 +# endif
   2.101 +# define discard_const_p(type, ptr) ((type *)discard_const(ptr))
   2.102 +#endif
   2.103 +
   2.104 +/* free memory if the pointer is valid and zero the pointer */
   2.105 +#ifndef SAFE_FREE
   2.106 +#define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
   2.107 +#endif
   2.108 +
   2.109 +#define BUCKET(hash) ((hash) % tdb->header.hash_size)
   2.110 +TDB_DATA tdb_null;
   2.111 +
   2.112 +/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
   2.113 +static TDB_CONTEXT *tdbs = NULL;
   2.114 +
   2.115 +static int tdb_munmap(TDB_CONTEXT *tdb)
   2.116 +{
   2.117 +	if (tdb->flags & TDB_INTERNAL)
   2.118 +		return 0;
   2.119 +
   2.120 +#ifdef HAVE_MMAP
   2.121 +	if (tdb->map_ptr) {
   2.122 +		int ret = munmap(tdb->map_ptr, tdb->map_size);
   2.123 +		if (ret != 0)
   2.124 +			return ret;
   2.125 +	}
   2.126 +#endif
   2.127 +	tdb->map_ptr = NULL;
   2.128 +	return 0;
   2.129 +}
   2.130 +
   2.131 +static void tdb_mmap(TDB_CONTEXT *tdb)
   2.132 +{
   2.133 +	if (tdb->flags & TDB_INTERNAL)
   2.134 +		return;
   2.135 +
   2.136 +#ifdef HAVE_MMAP
   2.137 +	if (!(tdb->flags & TDB_NOMMAP)) {
   2.138 +		tdb->map_ptr = mmap(NULL, tdb->map_size, 
   2.139 +				    PROT_READ|(tdb->read_only? 0:PROT_WRITE), 
   2.140 +				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
   2.141 +
   2.142 +		/*
   2.143 +		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
   2.144 +		 */
   2.145 +
   2.146 +		if (tdb->map_ptr == MAP_FAILED) {
   2.147 +			tdb->map_ptr = NULL;
   2.148 +			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n", 
   2.149 +				 tdb->map_size, strerror(errno)));
   2.150 +		}
   2.151 +	} else {
   2.152 +		tdb->map_ptr = NULL;
   2.153 +	}
   2.154 +#else
   2.155 +	tdb->map_ptr = NULL;
   2.156 +#endif
   2.157 +}
   2.158 +
   2.159 +/* Endian conversion: we only ever deal with 4 byte quantities */
   2.160 +static void *convert(void *buf, u32 size)
   2.161 +{
   2.162 +	u32 i, *p = buf;
   2.163 +	for (i = 0; i < size / 4; i++)
   2.164 +		p[i] = TDB_BYTEREV(p[i]);
   2.165 +	return buf;
   2.166 +}
   2.167 +#define DOCONV() (tdb->flags & TDB_CONVERT)
   2.168 +#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
   2.169 +
   2.170 +/* the body of the database is made of one list_struct for the free space
   2.171 +   plus a separate data list for each hash value */
   2.172 +struct list_struct {
   2.173 +	tdb_off next; /* offset of the next record in the list */
   2.174 +	tdb_len rec_len; /* total byte length of record */
   2.175 +	tdb_len key_len; /* byte length of key */
   2.176 +	tdb_len data_len; /* byte length of data */
   2.177 +	u32 full_hash; /* the full 32 bit hash of the key */
   2.178 +	u32 magic;   /* try to catch errors */
   2.179 +	/* the following union is implied:
   2.180 +		union {
   2.181 +			char record[rec_len];
   2.182 +			struct {
   2.183 +				char key[key_len];
   2.184 +				char data[data_len];
   2.185 +			}
   2.186 +			u32 totalsize; (tailer)
   2.187 +		}
   2.188 +	*/
   2.189 +};
   2.190 +
   2.191 +/* a byte range locking function - return 0 on success
   2.192 +   this functions locks/unlocks 1 byte at the specified offset.
   2.193 +
   2.194 +   On error, errno is also set so that errors are passed back properly
   2.195 +   through tdb_open(). */
   2.196 +static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset, 
   2.197 +		      int rw_type, int lck_type, int probe)
   2.198 +{
   2.199 +	struct flock fl;
   2.200 +	int ret;
   2.201 +
   2.202 +	if (tdb->flags & TDB_NOLOCK)
   2.203 +		return 0;
   2.204 +	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
   2.205 +		errno = EACCES;
   2.206 +		return -1;
   2.207 +	}
   2.208 +
   2.209 +	fl.l_type = rw_type;
   2.210 +	fl.l_whence = SEEK_SET;
   2.211 +	fl.l_start = offset;
   2.212 +	fl.l_len = 1;
   2.213 +	fl.l_pid = 0;
   2.214 +
   2.215 +	do {
   2.216 +		ret = fcntl(tdb->fd,lck_type,&fl);
   2.217 +	} while (ret == -1 && errno == EINTR);
   2.218 +
   2.219 +	if (ret == -1) {
   2.220 +		if (!probe && lck_type != F_SETLK) {
   2.221 +			/* Ensure error code is set for log fun to examine. */
   2.222 +			tdb->ecode = TDB_ERR_LOCK;
   2.223 +			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n", 
   2.224 +				 tdb->fd, offset, rw_type, lck_type));
   2.225 +		}
   2.226 +		/* Generic lock error. errno set by fcntl.
   2.227 +		 * EAGAIN is an expected return from non-blocking
   2.228 +		 * locks. */
   2.229 +		if (errno != EAGAIN) {
   2.230 +		TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n", 
   2.231 +				 tdb->fd, offset, rw_type, lck_type, 
   2.232 +				 strerror(errno)));
   2.233 +		}
   2.234 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
   2.235 +	}
   2.236 +	return 0;
   2.237 +}
   2.238 +
   2.239 +/* lock a list in the database. list -1 is the alloc list */
   2.240 +static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
   2.241 +{
   2.242 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
   2.243 +		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n", 
   2.244 +			   list, ltype));
   2.245 +		return -1;
   2.246 +	}
   2.247 +	if (tdb->flags & TDB_NOLOCK)
   2.248 +		return 0;
   2.249 +
   2.250 +	/* Since fcntl locks don't nest, we do a lock for the first one,
   2.251 +	   and simply bump the count for future ones */
   2.252 +	if (tdb->locked[list+1].count == 0) {
   2.253 +		if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
   2.254 +			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n", 
   2.255 +					   list, ltype, strerror(errno)));
   2.256 +			return -1;
   2.257 +		}
   2.258 +		tdb->locked[list+1].ltype = ltype;
   2.259 +	}
   2.260 +	tdb->locked[list+1].count++;
   2.261 +	return 0;
   2.262 +}
   2.263 +
   2.264 +/* unlock the database: returns void because it's too late for errors. */
   2.265 +	/* changed to return int it may be interesting to know there
   2.266 +	   has been an error  --simo */
   2.267 +static int tdb_unlock(TDB_CONTEXT *tdb, int list,
   2.268 +		      int ltype __attribute__((unused)))
   2.269 +{
   2.270 +	int ret = -1;
   2.271 +
   2.272 +	if (tdb->flags & TDB_NOLOCK)
   2.273 +		return 0;
   2.274 +
   2.275 +	/* Sanity checks */
   2.276 +	if (list < -1 || list >= (int)tdb->header.hash_size) {
   2.277 +		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
   2.278 +		return ret;
   2.279 +	}
   2.280 +
   2.281 +	if (tdb->locked[list+1].count==0) {
   2.282 +		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
   2.283 +		return ret;
   2.284 +	}
   2.285 +
   2.286 +	if (tdb->locked[list+1].count == 1) {
   2.287 +		/* Down to last nested lock: unlock underneath */
   2.288 +		ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
   2.289 +	} else {
   2.290 +		ret = 0;
   2.291 +	}
   2.292 +	tdb->locked[list+1].count--;
   2.293 +
   2.294 +	if (ret)
   2.295 +		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n")); 
   2.296 +	return ret;
   2.297 +}
   2.298 +
   2.299 +/* This is based on the hash algorithm from gdbm */
   2.300 +static u32 default_tdb_hash(TDB_DATA *key)
   2.301 +{
   2.302 +	u32 value;	/* Used to compute the hash value.  */
   2.303 +	u32   i;	/* Used to cycle through random values. */
   2.304 +
   2.305 +	/* Set the initial value from the key size. */
   2.306 +	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
   2.307 +		value = (value + (key->dptr[i] << (i*5 % 24)));
   2.308 +
   2.309 +	return (1103515243 * value + 12345);  
   2.310 +}
   2.311 +
   2.312 +/* check for an out of bounds access - if it is out of bounds then
   2.313 +   see if the database has been expanded by someone else and expand
   2.314 +   if necessary 
   2.315 +   note that "len" is the minimum length needed for the db
   2.316 +*/
   2.317 +static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
   2.318 +{
   2.319 +	struct stat st;
   2.320 +	if (len <= tdb->map_size)
   2.321 +		return 0;
   2.322 +	if (tdb->flags & TDB_INTERNAL) {
   2.323 +		if (!probe) {
   2.324 +			/* Ensure ecode is set for log fn. */
   2.325 +			tdb->ecode = TDB_ERR_IO;
   2.326 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
   2.327 +				 (int)len, (int)tdb->map_size));
   2.328 +		}
   2.329 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
   2.330 +	}
   2.331 +
   2.332 +	if (fstat(tdb->fd, &st) == -1)
   2.333 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
   2.334 +
   2.335 +	if (st.st_size < (off_t)len) {
   2.336 +		if (!probe) {
   2.337 +			/* Ensure ecode is set for log fn. */
   2.338 +			tdb->ecode = TDB_ERR_IO;
   2.339 +			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
   2.340 +				 (int)len, (int)st.st_size));
   2.341 +		}
   2.342 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
   2.343 +	}
   2.344 +
   2.345 +	/* Unmap, update size, remap */
   2.346 +	if (tdb_munmap(tdb) == -1)
   2.347 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
   2.348 +	tdb->map_size = st.st_size;
   2.349 +	tdb_mmap(tdb);
   2.350 +	return 0;
   2.351 +}
   2.352 +
   2.353 +/* write a lump of data at a specified offset */
   2.354 +static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
   2.355 +{
   2.356 +	if (tdb_oob(tdb, off + len, 0) != 0)
   2.357 +		return -1;
   2.358 +
   2.359 +	if (tdb->map_ptr)
   2.360 +		memcpy(off + (char *)tdb->map_ptr, buf, len);
   2.361 +#ifdef HAVE_PWRITE
   2.362 +	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
   2.363 +#else
   2.364 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
   2.365 +		 || write(tdb->fd, buf, len) != (off_t)len) {
   2.366 +#endif
   2.367 +		/* Ensure ecode is set for log fn. */
   2.368 +		tdb->ecode = TDB_ERR_IO;
   2.369 +		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
   2.370 +			   off, len, strerror(errno)));
   2.371 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
   2.372 +	}
   2.373 +	return 0;
   2.374 +}
   2.375 +
   2.376 +/* read a lump of data at a specified offset, maybe convert */
   2.377 +static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
   2.378 +{
   2.379 +	if (tdb_oob(tdb, off + len, 0) != 0)
   2.380 +		return -1;
   2.381 +
   2.382 +	if (tdb->map_ptr)
   2.383 +		memcpy(buf, off + (char *)tdb->map_ptr, len);
   2.384 +#ifdef HAVE_PREAD
   2.385 +	else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
   2.386 +#else
   2.387 +	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
   2.388 +		 || read(tdb->fd, buf, len) != (off_t)len) {
   2.389 +#endif
   2.390 +		/* Ensure ecode is set for log fn. */
   2.391 +		tdb->ecode = TDB_ERR_IO;
   2.392 +		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
   2.393 +			   off, len, strerror(errno)));
   2.394 +		return TDB_ERRCODE(TDB_ERR_IO, -1);
   2.395 +	}
   2.396 +	if (cv)
   2.397 +		convert(buf, len);
   2.398 +	return 0;
   2.399 +}
   2.400 +
   2.401 +/* don't allocate memory: used in tdb_delete path. */
   2.402 +static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
   2.403 +{
   2.404 +	char buf[64];
   2.405 +	u32 len;
   2.406 +
   2.407 +	if (tdb_oob(tdb, off + key.dsize, 0) != 0)
   2.408 +		return -1;
   2.409 +
   2.410 +	if (tdb->map_ptr)
   2.411 +		return !memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize);
   2.412 +
   2.413 +	while (key.dsize) {
   2.414 +		len = key.dsize;
   2.415 +		if (len > sizeof(buf))
   2.416 +			len = sizeof(buf);
   2.417 +		if (tdb_read(tdb, off, buf, len, 0) != 0)
   2.418 +			return -1;
   2.419 +		if (memcmp(buf, key.dptr, len) != 0)
   2.420 +			return 0;
   2.421 +		key.dptr += len;
   2.422 +		key.dsize -= len;
   2.423 +		off += len;
   2.424 +	}
   2.425 +	return 1;
   2.426 +}
   2.427 +
   2.428 +/* read a lump of data, allocating the space for it */
   2.429 +static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
   2.430 +{
   2.431 +	char *buf;
   2.432 +
   2.433 +	if (!(buf = talloc_size(tdb, len))) {
   2.434 +		/* Ensure ecode is set for log fn. */
   2.435 +		tdb->ecode = TDB_ERR_OOM;
   2.436 +		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
   2.437 +			   len, strerror(errno)));
   2.438 +		return TDB_ERRCODE(TDB_ERR_OOM, buf);
   2.439 +	}
   2.440 +	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
   2.441 +		SAFE_FREE(buf);
   2.442 +		return NULL;
   2.443 +	}
   2.444 +	return buf;
   2.445 +}
   2.446 +
   2.447 +/* read/write a tdb_off */
   2.448 +static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
   2.449 +{
   2.450 +	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
   2.451 +}
   2.452 +static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
   2.453 +{
   2.454 +	tdb_off off = *d;
   2.455 +	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
   2.456 +}
   2.457 +
   2.458 +/* read/write a record */
   2.459 +static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
   2.460 +{
   2.461 +	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
   2.462 +		return -1;
   2.463 +	if (TDB_BAD_MAGIC(rec)) {
   2.464 +		/* Ensure ecode is set for log fn. */
   2.465 +		tdb->ecode = TDB_ERR_CORRUPT;
   2.466 +		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
   2.467 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
   2.468 +	}
   2.469 +	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
   2.470 +}
   2.471 +static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
   2.472 +{
   2.473 +	struct list_struct r = *rec;
   2.474 +	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
   2.475 +}
   2.476 +
   2.477 +/* read a freelist record and check for simple errors */
   2.478 +static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
   2.479 +{
   2.480 +	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
   2.481 +		return -1;
   2.482 +
   2.483 +	if (rec->magic == TDB_MAGIC) {
   2.484 +		/* this happens when a app is showdown while deleting a record - we should
   2.485 +		   not completely fail when this happens */
   2.486 +		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n", 
   2.487 +			 rec->magic, off));
   2.488 +		rec->magic = TDB_FREE_MAGIC;
   2.489 +		if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
   2.490 +			return -1;
   2.491 +	}
   2.492 +
   2.493 +	if (rec->magic != TDB_FREE_MAGIC) {
   2.494 +		/* Ensure ecode is set for log fn. */
   2.495 +		tdb->ecode = TDB_ERR_CORRUPT;
   2.496 +		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n", 
   2.497 +			   rec->magic, off));
   2.498 +		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
   2.499 +	}
   2.500 +	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
   2.501 +		return -1;
   2.502 +	return 0;
   2.503 +}
   2.504 +
   2.505 +/* update a record tailer (must hold allocation lock) */
   2.506 +static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
   2.507 +			 const struct list_struct *rec)
   2.508 +{
   2.509 +	tdb_off totalsize;
   2.510 +
   2.511 +	/* Offset of tailer from record header */
   2.512 +	totalsize = sizeof(*rec) + rec->rec_len;
   2.513 +	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
   2.514 +			 &totalsize);
   2.515 +}
   2.516 +
   2.517 +static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
   2.518 +{
   2.519 +	struct list_struct rec;
   2.520 +	tdb_off tailer_ofs, tailer;
   2.521 +
   2.522 +	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
   2.523 +		printf("ERROR: failed to read record at %u\n", offset);
   2.524 +		return 0;
   2.525 +	}
   2.526 +
   2.527 +	printf(" rec: offset=0x%08x next=0x%08x rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
   2.528 +	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
   2.529 +
   2.530 +	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
   2.531 +	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
   2.532 +		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
   2.533 +		return rec.next;
   2.534 +	}
   2.535 +
   2.536 +	if (tailer != rec.rec_len + sizeof(rec)) {
   2.537 +		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
   2.538 +				(unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
   2.539 +	}
   2.540 +	return rec.next;
   2.541 +}
   2.542 +
   2.543 +static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
   2.544 +{
   2.545 +	tdb_off rec_ptr, top;
   2.546 +
   2.547 +	top = TDB_HASH_TOP(i);
   2.548 +
   2.549 +	if (tdb_lock(tdb, i, F_WRLCK) != 0)
   2.550 +		return -1;
   2.551 +
   2.552 +	if (ofs_read(tdb, top, &rec_ptr) == -1)
   2.553 +		return tdb_unlock(tdb, i, F_WRLCK);
   2.554 +
   2.555 +	if (rec_ptr)
   2.556 +		printf("hash=%d\n", i);
   2.557 +
   2.558 +	while (rec_ptr) {
   2.559 +		rec_ptr = tdb_dump_record(tdb, rec_ptr);
   2.560 +	}
   2.561 +
   2.562 +	return tdb_unlock(tdb, i, F_WRLCK);
   2.563 +}
   2.564 +
   2.565 +void tdb_dump_all(TDB_CONTEXT *tdb)
   2.566 +{
   2.567 +	unsigned int i;
   2.568 +	for (i=0;i<tdb->header.hash_size;i++) {
   2.569 +		tdb_dump_chain(tdb, i);
   2.570 +	}
   2.571 +	printf("freelist:\n");
   2.572 +	tdb_dump_chain(tdb, -1);
   2.573 +}
   2.574 +
   2.575 +int tdb_printfreelist(TDB_CONTEXT *tdb)
   2.576 +{
   2.577 +	int ret;
   2.578 +	long total_free = 0;
   2.579 +	tdb_off offset, rec_ptr;
   2.580 +	struct list_struct rec;
   2.581 +
   2.582 +	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
   2.583 +		return ret;
   2.584 +
   2.585 +	offset = FREELIST_TOP;
   2.586 +
   2.587 +	/* read in the freelist top */
   2.588 +	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
   2.589 +		tdb_unlock(tdb, -1, F_WRLCK);
   2.590 +		return 0;
   2.591 +	}
   2.592 +
   2.593 +	printf("freelist top=[0x%08x]\n", rec_ptr );
   2.594 +	while (rec_ptr) {
   2.595 +		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
   2.596 +			tdb_unlock(tdb, -1, F_WRLCK);
   2.597 +			return -1;
   2.598 +		}
   2.599 +
   2.600 +		if (rec.magic != TDB_FREE_MAGIC) {
   2.601 +			printf("bad magic 0x%08x in free list\n", rec.magic);
   2.602 +			tdb_unlock(tdb, -1, F_WRLCK);
   2.603 +			return -1;
   2.604 +		}
   2.605 +
   2.606 +		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n", 
   2.607 +		       rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
   2.608 +		total_free += rec.rec_len;
   2.609 +
   2.610 +		/* move to the next record */
   2.611 +		rec_ptr = rec.next;
   2.612 +	}
   2.613 +	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free, 
   2.614 +               (int)total_free);
   2.615 +
   2.616 +	return tdb_unlock(tdb, -1, F_WRLCK);
   2.617 +}
   2.618 +
   2.619 +/* Remove an element from the freelist.  Must have alloc lock. */
   2.620 +static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
   2.621 +{
   2.622 +	tdb_off last_ptr, i;
   2.623 +
   2.624 +	/* read in the freelist top */
   2.625 +	last_ptr = FREELIST_TOP;
   2.626 +	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
   2.627 +		if (i == off) {
   2.628 +			/* We've found it! */
   2.629 +			return ofs_write(tdb, last_ptr, &next);
   2.630 +		}
   2.631 +		/* Follow chain (next offset is at start of record) */
   2.632 +		last_ptr = i;
   2.633 +	}
   2.634 +	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
   2.635 +	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
   2.636 +}
   2.637 +
   2.638 +/* Add an element into the freelist. Merge adjacent records if
   2.639 +   neccessary. */
   2.640 +static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
   2.641 +{
   2.642 +	tdb_off right, left;
   2.643 +
   2.644 +	/* Allocation and tailer lock */
   2.645 +	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
   2.646 +		return -1;
   2.647 +
   2.648 +	/* set an initial tailer, so if we fail we don't leave a bogus record */
   2.649 +	if (update_tailer(tdb, offset, rec) != 0) {
   2.650 +		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
   2.651 +		goto fail;
   2.652 +	}
   2.653 +
   2.654 +	/* Look right first (I'm an Australian, dammit) */
   2.655 +	right = offset + sizeof(*rec) + rec->rec_len;
   2.656 +	if (right + sizeof(*rec) <= tdb->map_size) {
   2.657 +		struct list_struct r;
   2.658 +
   2.659 +		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
   2.660 +			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
   2.661 +			goto left;
   2.662 +		}
   2.663 +
   2.664 +		/* If it's free, expand to include it. */
   2.665 +		if (r.magic == TDB_FREE_MAGIC) {
   2.666 +			if (remove_from_freelist(tdb, right, r.next) == -1) {
   2.667 +				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
   2.668 +				goto left;
   2.669 +			}
   2.670 +			rec->rec_len += sizeof(r) + r.rec_len;
   2.671 +		}
   2.672 +	}
   2.673 +
   2.674 +left:
   2.675 +	/* Look left */
   2.676 +	left = offset - sizeof(tdb_off);
   2.677 +	if (left > TDB_DATA_START(tdb->header.hash_size)) {
   2.678 +		struct list_struct l;
   2.679 +		tdb_off leftsize;
   2.680 +		
   2.681 +		/* Read in tailer and jump back to header */
   2.682 +		if (ofs_read(tdb, left, &leftsize) == -1) {
   2.683 +			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
   2.684 +			goto update;
   2.685 +		}
   2.686 +		left = offset - leftsize;
   2.687 +
   2.688 +		/* Now read in record */
   2.689 +		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
   2.690 +			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
   2.691 +			goto update;
   2.692 +		}
   2.693 +
   2.694 +		/* If it's free, expand to include it. */
   2.695 +		if (l.magic == TDB_FREE_MAGIC) {
   2.696 +			if (remove_from_freelist(tdb, left, l.next) == -1) {
   2.697 +				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
   2.698 +				goto update;
   2.699 +			} else {
   2.700 +				offset = left;
   2.701 +				rec->rec_len += leftsize;
   2.702 +			}
   2.703 +		}
   2.704 +	}
   2.705 +
   2.706 +update:
   2.707 +	if (update_tailer(tdb, offset, rec) == -1) {
   2.708 +		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
   2.709 +		goto fail;
   2.710 +	}
   2.711 +
   2.712 +	/* Now, prepend to free list */
   2.713 +	rec->magic = TDB_FREE_MAGIC;
   2.714 +
   2.715 +	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
   2.716 +	    rec_write(tdb, offset, rec) == -1 ||
   2.717 +	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
   2.718 +		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
   2.719 +		goto fail;
   2.720 +	}
   2.721 +
   2.722 +	/* And we're done. */
   2.723 +	tdb_unlock(tdb, -1, F_WRLCK);
   2.724 +	return 0;
   2.725 +
   2.726 + fail:
   2.727 +	tdb_unlock(tdb, -1, F_WRLCK);
   2.728 +	return -1;
   2.729 +}
   2.730 +
   2.731 +
   2.732 +/* expand a file from `size' to `size+addition' bytes.  we prefer to use ftruncate, as that is what posix
   2.733 +  says to use for mmap expansion.  Returns 0 on success, -1 if extending or filling the file fails. */
   2.734 +static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
   2.735 +{
   2.736 +	char buf[1024];	/* block of fill bytes written repeatedly over the new region */
   2.737 +#if HAVE_FTRUNCATE_EXTEND
   2.738 +	if (ftruncate(tdb->fd, size+addition) != 0) {
   2.739 +		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n", 
   2.740 +			   size+addition, strerror(errno)));
   2.741 +		return -1;
   2.742 +	}
   2.743 +#else
   2.744 +	char b = 0;	/* single byte written at the new last offset to extend the file */
   2.745 +
   2.746 +#ifdef HAVE_PWRITE
   2.747 +	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
   2.748 +#else
   2.749 +	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 || 
   2.750 +	    write(tdb->fd, &b, 1) != 1) {
   2.751 +#endif
   2.752 +		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n", 
   2.753 +			   size+addition, strerror(errno)));
   2.754 +		return -1;
   2.755 +	}
   2.756 +#endif
   2.757 +
   2.758 +	/* now fill the file with something. This ensures that the file isn't sparse, which would be
   2.759 +	   very bad if we ran out of disk. This must be done with write, not via mmap */
   2.760 +	memset(buf, 0x42, sizeof(buf));
   2.761 +	while (addition) {
   2.762 +		int n = addition>sizeof(buf)?sizeof(buf):addition;	/* this chunk: at most sizeof(buf) bytes */
   2.763 +#ifdef HAVE_PWRITE
   2.764 +		int ret = pwrite(tdb->fd, buf, n, size);
   2.765 +#else
   2.766 +		int ret;
   2.767 +		if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
   2.768 +			return -1;
   2.769 +		ret = write(tdb->fd, buf, n);
   2.770 +#endif
   2.771 +		if (ret != n) {	/* a short write is treated as failure */
   2.772 +			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n", 
   2.773 +				   n, strerror(errno)));
   2.774 +			return -1;
   2.775 +		}
   2.776 +		addition -= n;
   2.777 +		size += n;	/* `size' doubles as the next write offset */
   2.778 +	}
   2.779 +	return 0;
   2.780 +}
   2.781 +
   2.782 +
   2.783 +/* expand the database at least size bytes by expanding the underlying
   2.784 +   file and doing the mmap again if necessary.  Holds tdb_lock(tdb, -1, F_WRLCK) throughout and hands the new space to tdb_free() as one free record; returns 0, or -1 on failure */
   2.785 +static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
   2.786 +{
   2.787 +	struct list_struct rec;
   2.788 +	tdb_off offset;
   2.789 +
   2.790 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
   2.791 +		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
   2.792 +		return -1;
   2.793 +	}
   2.794 +
   2.795 +	/* must know about any previous expansions by another process */
   2.796 +	tdb_oob(tdb, tdb->map_size + 1, 1);
   2.797 +
   2.798 +	/* always make room for at least 10 more records, and round
   2.799 +           the database up to a multiple of TDB_PAGE_SIZE */
   2.800 +	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;	/* from here on `size' is the number of bytes actually added */
   2.801 +
   2.802 +	if (!(tdb->flags & TDB_INTERNAL))
   2.803 +		tdb_munmap(tdb);
   2.804 +
   2.805 +	/*
   2.806 +	 * We must ensure the file is unmapped before doing this
   2.807 +	 * to ensure consistency with systems like OpenBSD where
   2.808 +	 * writes and mmaps are not consistent.
   2.809 +	 */
   2.810 +
   2.811 +	/* expand the file itself */
   2.812 +	if (!(tdb->flags & TDB_INTERNAL)) {
   2.813 +		if (expand_file(tdb, tdb->map_size, size) != 0)
   2.814 +			goto fail;
   2.815 +	}
   2.816 +
   2.817 +	tdb->map_size += size;
   2.818 +
   2.819 +	if (tdb->flags & TDB_INTERNAL) {
   2.820 +		char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
   2.821 +							tdb->map_size);
   2.822 +		if (!new_map_ptr) {
   2.823 +			tdb->map_size -= size;	/* undo the size bump: the old buffer is still the map */
   2.824 +			goto fail;
   2.825 +		}
   2.826 +		tdb->map_ptr = new_map_ptr;
   2.827 +	} else {
   2.828 +		/*
   2.829 +		 * We must ensure the file is remapped before adding the space
   2.830 +		 * to ensure consistency with systems like OpenBSD where
   2.831 +		 * writes and mmaps are not consistent.
   2.832 +		 */
   2.833 +
   2.834 +		/* We're ok if the mmap fails as we'll fallback to read/write */
   2.835 +		tdb_mmap(tdb);
   2.836 +	}
   2.837 +
   2.838 +	/* form a new freelist record */
   2.839 +	memset(&rec,'\0',sizeof(rec));
   2.840 +	rec.rec_len = size - sizeof(rec);	/* the added bytes minus this record's own header */
   2.841 +
   2.842 +	/* link it into the free list */
   2.843 +	offset = tdb->map_size - size;	/* start of the newly added region */
   2.844 +	if (tdb_free(tdb, offset, &rec) == -1)
   2.845 +		goto fail;
   2.846 +
   2.847 +	tdb_unlock(tdb, -1, F_WRLCK);
   2.848 +	return 0;
   2.849 + fail:
   2.850 +	tdb_unlock(tdb, -1, F_WRLCK);
   2.851 +	return -1;
   2.852 +}
   2.853 +
   2.854 +
   2.855 +/* 
   2.856 +   the core of tdb_allocate - called when we have decided which
   2.857 +   free list entry to use (offset rec_ptr, header in *rec).  last_ptr is the offset where the link to that entry is stored.  Returns rec_ptr, or 0 if any write fails.
   2.858 + */
   2.859 +static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
   2.860 +				struct list_struct *rec, tdb_off last_ptr)
   2.861 +{
   2.862 +	struct list_struct newrec;
   2.863 +	tdb_off newrec_ptr;
   2.864 +
   2.865 +	memset(&newrec, '\0', sizeof(newrec));
   2.866 +
   2.867 +	/* found it - now possibly split it up  */
   2.868 +	if (rec->rec_len > length + MIN_REC_SIZE) {
   2.869 +		/* Length of left piece */
   2.870 +		length = TDB_ALIGN(length, TDB_ALIGNMENT);
   2.871 +		
   2.872 +		/* Right piece to go on free list */
   2.873 +		newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
   2.874 +		newrec_ptr = rec_ptr + sizeof(*rec) + length;
   2.875 +		
   2.876 +		/* And left record is shortened */
   2.877 +		rec->rec_len = length;
   2.878 +	} else {
   2.879 +		newrec_ptr = 0;	/* 0 means "no split": the whole entry is handed out */
   2.880 +	}
   2.881 +	
   2.882 +	/* Remove allocated record from the free list */
   2.883 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
   2.884 +		return 0;
   2.885 +	}
   2.886 +	
   2.887 +	/* Update header: do this before we drop alloc
   2.888 +	   lock, otherwise tdb_free() might try to
   2.889 +	   merge with us, thinking we're free.
   2.890 +	   (Thanks Jeremy Allison). */
   2.891 +	rec->magic = TDB_MAGIC;
   2.892 +	if (rec_write(tdb, rec_ptr, rec) == -1) {
   2.893 +		return 0;
   2.894 +	}
   2.895 +	
   2.896 +	/* Did we create new block? */
   2.897 +	if (newrec_ptr) {
   2.898 +		/* Update allocated record tailer (we
   2.899 +		   shortened it). */
   2.900 +		if (update_tailer(tdb, rec_ptr, rec) == -1) {
   2.901 +			return 0;
   2.902 +		}
   2.903 +		
   2.904 +		/* Free new record */
   2.905 +		if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
   2.906 +			return 0;
   2.907 +		}
   2.908 +	}
   2.909 +	
   2.910 +	/* all done - return the new record offset */
   2.911 +	return rec_ptr;
   2.912 +}
   2.913 +
   2.914 +/* allocate some space from the free list. The offset returned points
   2.915 +   to an unconnected list_struct within the database with room for at
   2.916 +   least length bytes of total data
   2.917 +   (the tailer is added to length internally; *rec receives the header of the chosen record)
   2.918 +   0 is returned if the space could not be allocated
   2.919 + */
   2.920 +static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
   2.921 +			    struct list_struct *rec)
   2.922 +{
   2.923 +	tdb_off rec_ptr, last_ptr, newrec_ptr;
   2.924 +	struct {
   2.925 +		tdb_off rec_ptr, last_ptr;
   2.926 +		tdb_len rec_len;
   2.927 +	} bestfit = { 0, 0, 0 };
   2.928 +
   2.929 +	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
   2.930 +		return 0;
   2.931 +
   2.932 +	/* Extra bytes required for tailer */
   2.933 +	length += sizeof(tdb_off);
   2.934 +
   2.935 + again:
   2.936 +	last_ptr = FREELIST_TOP;
   2.937 +
   2.938 +	/* read in the freelist top */
   2.939 +	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
   2.940 +		goto fail;
   2.941 +
   2.942 +	bestfit.rec_ptr = 0;	/* reset on every pass, including the retry after tdb_expand */
   2.943 +
   2.944 +	/* 
   2.945 +	   this is a best fit allocation strategy. Originally we used
   2.946 +	   a first fit strategy, but it suffered from massive fragmentation
   2.947 +	   issues when faced with a slowly increasing record size.
   2.948 +	 */
   2.949 +	while (rec_ptr) {
   2.950 +		if (rec_free_read(tdb, rec_ptr, rec) == -1) {
   2.951 +			goto fail;
   2.952 +		}
   2.953 +
   2.954 +		if (rec->rec_len >= length) {
   2.955 +			if (bestfit.rec_ptr == 0 ||
   2.956 +			    rec->rec_len < bestfit.rec_len) {
   2.957 +				bestfit.rec_len = rec->rec_len;
   2.958 +				bestfit.rec_ptr = rec_ptr;
   2.959 +				bestfit.last_ptr = last_ptr;
   2.960 +				/* consider a fit to be good enough if we aren't wasting more than half the space */
   2.961 +				if (bestfit.rec_len < 2*length) {
   2.962 +					break;
   2.963 +				}
   2.964 +			}
   2.965 +		}
   2.966 +
   2.967 +		/* move to the next record */
   2.968 +		last_ptr = rec_ptr;
   2.969 +		rec_ptr = rec->next;
   2.970 +	}
   2.971 +
   2.972 +	if (bestfit.rec_ptr != 0) {
   2.973 +		if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {	/* re-read: *rec may hold a later list entry by now */
   2.974 +			goto fail;
   2.975 +		}
   2.976 +
   2.977 +		newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
   2.978 +		tdb_unlock(tdb, -1, F_WRLCK);
   2.979 +		return newrec_ptr;
   2.980 +	}
   2.981 +
   2.982 +	/* we didn't find enough space. See if we can expand the
   2.983 +	   database and if we can then try again */
   2.984 +	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
   2.985 +		goto again;
   2.986 + fail:
   2.987 +	tdb_unlock(tdb, -1, F_WRLCK);
   2.988 +	return 0;
   2.989 +}
   2.990 +
   2.991 +/* initialise a new database with a specified hash size: the header plus hash_size+1 tdb_off slots are built in memory, then either kept as the map (TDB_INTERNAL) or written to a truncated tdb->fd */
   2.992 +static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
   2.993 +{
   2.994 +	struct tdb_header *newdb;
   2.995 +	int size, ret = -1;
   2.996 +
   2.997 +	/* We make it up in memory, then write it out if not internal */
   2.998 +	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
   2.999 +	if (!(newdb = talloc_zero_size(tdb, size)))
  2.1000 +		return TDB_ERRCODE(TDB_ERR_OOM, -1);
  2.1001 +
  2.1002 +	/* Fill in the header */
  2.1003 +	newdb->version = TDB_VERSION;
  2.1004 +	newdb->hash_size = hash_size;
  2.1005 +	if (tdb->flags & TDB_INTERNAL) {
  2.1006 +		tdb->map_size = size;
  2.1007 +		tdb->map_ptr = (char *)newdb;	/* the buffer becomes the database image, so it is not freed here */
  2.1008 +		memcpy(&tdb->header, newdb, sizeof(tdb->header));
  2.1009 +		/* Convert the `ondisk' version if asked. */
  2.1010 +		CONVERT(*newdb);
  2.1011 +		return 0;
  2.1012 +	}
  2.1013 +	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
  2.1014 +		goto fail;
  2.1015 +
  2.1016 +	if (ftruncate(tdb->fd, 0) == -1)
  2.1017 +		goto fail;
  2.1018 +
  2.1019 +	/* This creates an endian-converted header, as if read from disk */
  2.1020 +	CONVERT(*newdb);
  2.1021 +	memcpy(&tdb->header, newdb, sizeof(tdb->header));
  2.1022 +	/* Don't endian-convert the magic food! */
  2.1023 +	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
  2.1024 +	if (write(tdb->fd, newdb, size) != size)
  2.1025 +		ret = -1;
  2.1026 +	else
  2.1027 +		ret = 0;
  2.1028 +
  2.1029 +  fail:
  2.1030 +	SAFE_FREE(newdb);	/* on-disk case: the in-memory copy is released on success and failure alike */
  2.1031 +	return ret;
  2.1032 +}
  2.1033 +
  2.1034 +/* Walk the hash chain for `hash' looking for a live record whose key equals
  2.1035 +   `key'.  Returns the record offset (header copied into *r), or 0 on fail. */
  2.1036 +static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
  2.1037 +			struct list_struct *r)
  2.1038 +{
  2.1039 +	tdb_off cur;
  2.1040 +	int match;
  2.1041 +
  2.1042 +	/* start from the head of the chain */
  2.1043 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &cur) == -1)
  2.1044 +		return 0;
  2.1045 +
  2.1046 +	for (; cur != 0; cur = r->next) {
  2.1047 +		if (rec_read(tdb, cur, r) == -1)
  2.1048 +			return 0;
  2.1049 +
  2.1050 +		/* cheap header checks first; only a likely hit has its stored key compared */
  2.1051 +		if (TDB_DEAD(r) || r->full_hash != hash || r->key_len != key.dsize)
  2.1052 +			continue;
  2.1053 +		match = tdb_key_eq(tdb, cur + sizeof(*r), key);
  2.1054 +		if (match < 0)
  2.1055 +			return 0;
  2.1056 +		if (match > 0)
  2.1057 +			return cur;
  2.1058 +	}
  2.1059 +	/* ran off the end of the chain without a match */
  2.1060 +	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
  2.1061 +}
  2.1062 +
  2.1063 +/* tdb_find() under the chain lock for `hash': the lock is kept only if a record is found */
  2.1064 +static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
  2.1065 +			     struct list_struct *rec)
  2.1066 +{
  2.1067 +	u32 found;
  2.1068 +	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
  2.1069 +		return 0;
  2.1070 +	found = tdb_find(tdb, key, hash, rec);
  2.1071 +	if (found == 0)	/* nothing found: do not leave the chain locked */
  2.1072 +		tdb_unlock(tdb, BUCKET(hash), locktype);
  2.1073 +	return found;
  2.1074 +}
  2.1075 +
  2.1076 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
  2.1077 +{
  2.1078 +	return tdb->ecode;
  2.1079 +}
  2.1080 +
  2.1081 +static struct tdb_errname {
  2.1082 +	enum TDB_ERROR ecode; const char *estring;
  2.1083 +} emap[] = { {TDB_SUCCESS, "Success"},
  2.1084 +	     {TDB_ERR_CORRUPT, "Corrupt database"},
  2.1085 +	     {TDB_ERR_IO, "IO Error"},
  2.1086 +	     {TDB_ERR_LOCK, "Locking error"},
  2.1087 +	     {TDB_ERR_OOM, "Out of memory"},
  2.1088 +	     {TDB_ERR_EXISTS, "Record exists"},
  2.1089 +	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
  2.1090 +	     {TDB_ERR_NOEXIST, "Record does not exist"} };
  2.1091 +
  2.1092 +/* Error string for the last tdb error */
  2.1093 +const char *tdb_errorstr(TDB_CONTEXT *tdb)
  2.1094 +{
  2.1095 +	u32 i;
  2.1096 +	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
  2.1097 +		if (tdb->ecode == emap[i].ecode)
  2.1098 +			return emap[i].estring;
  2.1099 +	return "Invalid error code";
  2.1100 +}
  2.1101 +
  2.1102 +/* Overwrite the data of an existing record in place - this only works if
  2.1103 +   the key exists and key, new data and tailer fit in the record's rec_len.
  2.1104 +   returns 0 on success, -1 on failure (ecode is set to TDB_SUCCESS when there is simply no room).
  2.1105 +*/
  2.1106 +
  2.1107 +static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
  2.1108 +{
  2.1109 +	struct list_struct hdr;
  2.1110 +	tdb_off where;
  2.1111 +
  2.1112 +	/* locate the record; nothing to update if the lookup fails */
  2.1113 +	where = tdb_find(tdb, key, hash, &hdr);
  2.1114 +	if (where == 0)
  2.1115 +		return -1;
  2.1116 +
  2.1117 +	/* the record must hold key, data and tailer */
  2.1118 +	if (key.dsize + dbuf.dsize + sizeof(tdb_off) > hdr.rec_len) {
  2.1119 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
  2.1120 +		return -1;
  2.1121 +	}
  2.1122 +
  2.1123 +	/* the data sits right behind the header and the key */
  2.1124 +	if (tdb_write(tdb, where + sizeof(hdr) + hdr.key_len,
  2.1125 +		      dbuf.dptr, dbuf.dsize) == -1)
  2.1126 +		return -1;
  2.1127 +
  2.1128 +	/* the header is rewritten only when the data length changed */
  2.1129 +	if (hdr.data_len == dbuf.dsize)
  2.1130 +		return 0;
  2.1131 +	hdr.data_len = dbuf.dsize;
  2.1132 +	return rec_write(tdb, where, &hdr);
  2.1133 +}
  2.1134 +
  2.1135 +/* look up `key' and return a copy of its data obtained from tdb_alloc_read() */
  2.1136 +/* If an entry doesn't exist tdb_null is returned and tdb_err
  2.1137 + * will be set to TDB_ERR_NOEXIST. If a key has no data attached
  2.1138 + * then the TDB_DATA will have zero length but a non-zero pointer.
  2.1139 + * The chain is read-locked only while the data is copied out.
  2.1140 + */
  2.1141 +
  2.1142 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
  2.1143 +{
  2.1144 +	struct list_struct hdr;
  2.1145 +	TDB_DATA value;
  2.1146 +	tdb_off where;
  2.1147 +	u32 h = tdb->hash_fn(&key);
  2.1148 +
  2.1149 +	where = tdb_find_lock_hash(tdb, key, h, F_RDLCK, &hdr);
  2.1150 +	if (where == 0)
  2.1151 +		return tdb_null;
  2.1152 +
  2.1153 +	/* the data follows the header and the key */
  2.1154 +	value.dsize = hdr.data_len;
  2.1155 +	value.dptr = tdb_alloc_read(tdb, where + sizeof(hdr) + hdr.key_len,
  2.1156 +				    hdr.data_len);
  2.1157 +	tdb_unlock(tdb, BUCKET(hdr.full_hash), F_RDLCK);
  2.1158 +	return value;
  2.1159 +}
  2.1160 +
  2.1161 +/* test whether `key' is present in the chain for `hash'
  2.1162 +
  2.1163 +   returns 1 when the key is found and 0 when it is not; this doesn't
  2.1164 +   match the conventions in the rest of this module, but is compatible
  2.1165 +   with gdbm.  The read lock kept by the lookup is dropped before returning.
  2.1166 +*/
  2.1167 +static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
  2.1168 +{
  2.1169 +	struct list_struct hdr;
  2.1170 +	int found = (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &hdr) != 0);
  2.1171 +
  2.1172 +	if (found)
  2.1173 +		tdb_unlock(tdb, BUCKET(hdr.full_hash), F_RDLCK);
  2.1174 +	return found;
  2.1175 +}
  2.1176 +
  2.1177 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
  2.1178 +{
  2.1179 +	/* hash the key with the context's hash function, then defer to the per-hash lookup */
  2.1180 +	return tdb_exists_hash(tdb, key, tdb->hash_fn(&key));
  2.1181 +}
  2.1182 +
  2.1183 +/* read-lock the record at `off' so it is not deleted underneath us; offset 0 is a no-op returning 0 */
  2.1184 +static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
  2.1185 +{
  2.1186 +	return (off == 0) ? 0 : tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0);
  2.1187 +}
  2.1188 +/*
  2.1189 +  Try to write-lock the record at `off'.  Our own traversals are checked
  2.1190 +  first, because write locks override our own fcntl readlocks.
  2.1191 +  Uses F_SETLK, *not* F_SETLKW: failing to get the lock is not an error.
  2.1192 +*/
  2.1193 + 
  2.1194 +static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
  2.1195 +{
  2.1196 +	struct tdb_traverse_lock *trav = &tdb->travlocks;
  2.1197 +
  2.1198 +	while (trav != NULL && trav->off != off)
  2.1199 +		trav = trav->next;
  2.1200 +	return trav ? -1 : tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
  2.1201 +}
  2.1202 +
  2.1203 +/* Release the byte-range lock on `off' (do_delete pairs this with write_lock_record).
  2.1204 +   Issued with F_SETLK, *not* F_SETLKW, like the matching lock request. */
  2.1205 +
  2.1206 +static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
  2.1207 +{
  2.1208 +	int rc = tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
  2.1209 +
  2.1210 +	return rc;
  2.1211 +}
  2.1212 +/* fcntl locks don't stack: only drop the lock on `off' when exactly one traversal entry refers to it */
  2.1213 +static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
  2.1214 +{
  2.1215 +	struct tdb_traverse_lock *trav;
  2.1216 +	u32 holders = 0;
  2.1217 +	if (!off)
  2.1218 +		return 0;
  2.1219 +	for (trav = &tdb->travlocks; trav != NULL; trav = trav->next)
  2.1220 +		holders += (trav->off == off);
  2.1221 +	if (holders != 1)
  2.1222 +		return 0;
  2.1223 +	return tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0);
  2.1224 +}
  2.1225 +
  2.1226 +/* actually delete an entry in the database given the offset: the record is unlinked from its hash chain and passed to tdb_free(), or only marked dead when it cannot be write-locked.  0 on success, -1 on failure */
  2.1227 +static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
  2.1228 +{
  2.1229 +	tdb_off last_ptr, i;
  2.1230 +	struct list_struct lastrec;
  2.1231 +
  2.1232 +	if (tdb->read_only) return -1;
  2.1233 +
  2.1234 +	if (write_lock_record(tdb, rec_ptr) == -1) {
  2.1235 +		/* Someone traversing here: mark it as dead */
  2.1236 +		rec->magic = TDB_DEAD_MAGIC;
  2.1237 +		return rec_write(tdb, rec_ptr, rec);
  2.1238 +	}
  2.1239 +	if (write_unlock_record(tdb, rec_ptr) != 0)	/* the lock was only a probe: release it straight away */
  2.1240 +		return -1;
  2.1241 +
  2.1242 +	/* find previous record in hash chain */
  2.1243 +	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
  2.1244 +		return -1;
  2.1245 +	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
  2.1246 +		if (rec_read(tdb, i, &lastrec) == -1)
  2.1247 +			return -1;
  2.1248 +
  2.1249 +	/* unlink it: next ptr is at start of record. */
  2.1250 +	if (last_ptr == 0)
  2.1251 +		last_ptr = TDB_HASH_TOP(rec->full_hash);	/* record was first in the chain: patch the chain head instead */
  2.1252 +	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
  2.1253 +		return -1;
  2.1254 +
  2.1255 +	/* recover the space */
  2.1256 +	if (tdb_free(tdb, rec_ptr, rec) == -1)
  2.1257 +		return -1;
  2.1258 +	return 0;
  2.1259 +}
  2.1260 +
  2.1261 +/* Uses traverse lock: 0 = finish, -1 = error, other = record offset.  On a positive return the record is locked via lock_record(), its chain is still locked F_WRLCK, and *rec holds its header */
  2.1262 +static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
  2.1263 +			 struct list_struct *rec)
  2.1264 +{
  2.1265 +	int want_next = (tlock->off != 0);	/* resuming after a previously returned record? */
  2.1266 +
  2.1267 +	/* Lock each chain from the start one. */
  2.1268 +	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
  2.1269 +
  2.1270 +		/* this is an optimisation for the common case where
  2.1271 +		   the hash chain is empty, which is particularly
  2.1272 +		   common for the use of tdb with ldb, where large
  2.1273 +		   hashes are used. In that case we spend most of our
  2.1274 +		   time in tdb_brlock(), locking empty hash chains.
  2.1275 +
  2.1276 +		   To avoid this, we do an unlocked pre-check to see
  2.1277 +		   if the hash chain is empty before starting to look
  2.1278 +		   inside it. If it is empty then we can avoid that
  2.1279 +		   hash chain. If it isn't empty then we can't believe
  2.1280 +		   the value we get back, as we read it without a
  2.1281 +		   lock, so instead we get the lock and re-fetch the
  2.1282 +		   value below.
  2.1283 +
  2.1284 +		   Notice that not doing this optimisation on the
  2.1285 +		   first hash chain is critical. We must guarantee
  2.1286 +		   that we have done at least one fcntl lock at the
  2.1287 +		   start of a search to guarantee that memory is
  2.1288 +		   coherent on SMP systems. If records are added by
  2.1289 +		   others during the search then thats OK, and we
  2.1290 +		   could possibly miss those with this trick, but we
  2.1291 +		   could miss them anyway without this trick, so the
  2.1292 +		   semantics don't change.
  2.1293 +
  2.1294 +		   With a non-indexed ldb search this trick gains us a
  2.1295 +		   factor of around 80 in speed on a linux 2.6.x
  2.1296 +		   system (testing using ldbtest).
  2.1297 +		 */
  2.1298 +		if (!tlock->off && tlock->hash != 0) {
  2.1299 +			u32 off;
  2.1300 +			if (tdb->map_ptr) {
  2.1301 +				for (;tlock->hash < tdb->header.hash_size;tlock->hash++) {	/* skip the whole run of empty chains through the mapping */
  2.1302 +					if (0 != *(u32 *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
  2.1303 +						break;
  2.1304 +					}
  2.1305 +				}
  2.1306 +				if (tlock->hash == tdb->header.hash_size) {
  2.1307 +					continue;
  2.1308 +				}
  2.1309 +			} else {
  2.1310 +				if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
  2.1311 +				    off == 0) {
  2.1312 +					continue;
  2.1313 +				}
  2.1314 +			}
  2.1315 +		}
  2.1316 +
  2.1317 +		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
  2.1318 +			return -1;
  2.1319 +
  2.1320 +		/* No previous record?  Start at top of chain. */
  2.1321 +		if (!tlock->off) {
  2.1322 +			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
  2.1323 +				     &tlock->off) == -1)
  2.1324 +				goto fail;
  2.1325 +		} else {
  2.1326 +			/* Otherwise unlock the previous record. */
  2.1327 +			if (unlock_record(tdb, tlock->off) != 0)
  2.1328 +				goto fail;
  2.1329 +		}
  2.1330 +
  2.1331 +		if (want_next) {
  2.1332 +			/* We have offset of old record: grab next */
  2.1333 +			if (rec_read(tdb, tlock->off, rec) == -1)
  2.1334 +				goto fail;
  2.1335 +			tlock->off = rec->next;
  2.1336 +		}
  2.1337 +
  2.1338 +		/* Iterate through chain */
  2.1339 +		while( tlock->off) {
  2.1340 +			tdb_off current;
  2.1341 +			if (rec_read(tdb, tlock->off, rec) == -1)
  2.1342 +				goto fail;
  2.1343 +
  2.1344 +			/* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
  2.1345 +			if (tlock->off == rec->next) {
  2.1346 +				TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
  2.1347 +				goto fail;
  2.1348 +			}
  2.1349 +
  2.1350 +			if (!TDB_DEAD(rec)) {
  2.1351 +				/* Woohoo: we found one! */
  2.1352 +				if (lock_record(tdb, tlock->off) != 0)
  2.1353 +					goto fail;
  2.1354 +				return tlock->off;
  2.1355 +			}
  2.1356 +
  2.1357 +			/* Try to clean dead ones from old traverses */
  2.1358 +			current = tlock->off;
  2.1359 +			tlock->off = rec->next;	/* step past the dead record before it is deleted */
  2.1360 +			if (!tdb->read_only && 
  2.1361 +			    do_delete(tdb, current, rec) != 0)
  2.1362 +				goto fail;
  2.1363 +		}
  2.1364 +		tdb_unlock(tdb, tlock->hash, F_WRLCK);
  2.1365 +		want_next = 0;	/* later chains are walked from their head */
  2.1366 +	}
  2.1367 +	/* We finished iteration without finding anything */
  2.1368 +	return TDB_ERRCODE(TDB_SUCCESS, 0);
  2.1369 +
  2.1370 + fail:
  2.1371 +	tlock->off = 0;
  2.1372 +	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
  2.1373 +		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
  2.1374 +	return -1;
  2.1375 +}
  2.1376 +
  2.1377 +/* traverse the entire database - calling fn(tdb, key, data, private) on each element.
  2.1378 +   return -1 on error (including a failed record unlock when fn stops the walk) or the record count traversed
  2.1379 +   if fn is NULL then it is not called
  2.1380 +   a non-zero return value from fn() indicates that the traversal should stop
  2.1381 +  */
  2.1382 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
  2.1383 +{
  2.1384 +	TDB_DATA key, dbuf;
  2.1385 +	struct list_struct rec;
  2.1386 +	struct tdb_traverse_lock tl = { NULL, 0, 0 };
  2.1387 +	int ret, count = 0;
  2.1388 +
  2.1389 +	/* This was in the initialization, above, but the IRIX compiler
  2.1390 +	 * did not like it.  crh
  2.1391 +	 */
  2.1392 +	tl.next = tdb->travlocks.next;
  2.1393 +
  2.1394 +	/* fcntl locks don't stack: beware traverse inside traverse */
  2.1395 +	tdb->travlocks.next = &tl;
  2.1396 +
  2.1397 +	/* tdb_next_lock places locks on the record returned, and its chain */
  2.1398 +	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
  2.1399 +		count++;
  2.1400 +		/* now read the full record: key and data come back in one buffer */
  2.1401 +		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec), 
  2.1402 +					  rec.key_len + rec.data_len);
  2.1403 +		if (!key.dptr) {
  2.1404 +			ret = -1;
  2.1405 +			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
  2.1406 +				goto out;
  2.1407 +			if (unlock_record(tdb, tl.off) != 0)
  2.1408 +				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
  2.1409 +			goto out;
  2.1410 +		}
  2.1411 +		key.dsize = rec.key_len;
  2.1412 +		dbuf.dptr = key.dptr + rec.key_len;
  2.1413 +		dbuf.dsize = rec.data_len;
  2.1414 +
  2.1415 +		/* Drop chain lock, call out */
  2.1416 +		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
  2.1417 +			SAFE_FREE(key.dptr);	/* do not leak the record copy on this error path */
  2.1418 +			ret = -1;
  2.1419 +			goto out;
  2.1420 +		}
  2.1421 +		if (fn && fn(tdb, key, dbuf, private)) {
  2.1422 +			/* They want us to terminate traversal */
  2.1423 +			ret = count;
  2.1424 +			if (unlock_record(tdb, tl.off) != 0) {
  2.1425 +				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));
  2.1426 +				ret = -1;
  2.1427 +			}
  2.1428 +			SAFE_FREE(key.dptr);
  2.1429 +			goto out;	/* shared exit turns ret into count or -1 and unhooks tl */
  2.1430 +		}
  2.1431 +		SAFE_FREE(key.dptr);
  2.1432 +	}
  2.1433 +out:
  2.1434 +	tdb->travlocks.next = tl.next;
  2.1435 +	if (ret < 0)
  2.1436 +		return -1;
  2.1437 +	else
  2.1438 +		return count;
  2.1439 +}
  2.1440 +
  2.1441 +/* find the first entry in the database and return its key */
  2.1442 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
  2.1443 +{
  2.1444 +	TDB_DATA key;
  2.1445 +	struct list_struct rec;
  2.1446 +
  2.1447 +	/* release any old lock */
  2.1448 +	if (unlock_record(tdb, tdb->travlocks.off) != 0)
  2.1449 +		return tdb_null;
  2.1450 +	tdb->travlocks.off = tdb->travlocks.hash = 0;
  2.1451 +
  2.1452 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
  2.1453 +		return tdb_null;
  2.1454 +	/* now read the key */
  2.1455 +	key.dsize = rec.key_len;
  2.1456 +	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
  2.1457 +	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
  2.1458 +		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
  2.1459 +	return key;
  2.1460 +}
  2.1461 +
  2.1462 +/* find the next entry in the database, returning its key */
  2.1463 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
  2.1464 +{
  2.1465 +	u32 oldhash;
  2.1466 +	TDB_DATA key = tdb_null;
  2.1467 +	struct list_struct rec;
  2.1468 +	char *k = NULL;
  2.1469 +
  2.1470 +	/* Is locked key the old key?  If so, traverse will be reliable. */
  2.1471 +	if (tdb->travlocks.off) {
  2.1472 +		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
  2.1473 +			return tdb_null;
  2.1474 +		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
  2.1475 +		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
  2.1476 +					    rec.key_len))
  2.1477 +		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
  2.1478 +			/* No, it wasn't: unlock it and start from scratch */
  2.1479 +			if (unlock_record(tdb, tdb->travlocks.off) != 0)
  2.1480 +				return tdb_null;
  2.1481 +			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
  2.1482 +				return tdb_null;
  2.1483 +			tdb->travlocks.off = 0;
  2.1484 +		}
  2.1485 +
  2.1486 +		SAFE_FREE(k);
  2.1487 +	}
  2.1488 +
  2.1489 +	if (!tdb->travlocks.off) {
  2.1490 +		/* No previous element: do normal find, and lock record */
  2.1491 +		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
  2.1492 +		if (!tdb->travlocks.off)
  2.1493 +			return tdb_null;
  2.1494 +		tdb->travlocks.hash = BUCKET(rec.full_hash);
  2.1495 +		if (lock_record(tdb, tdb->travlocks.off) != 0) {
  2.1496 +			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
  2.1497 +			return tdb_null;
  2.1498 +		}
  2.1499 +	}
  2.1500 +	oldhash = tdb->travlocks.hash;
  2.1501 +
  2.1502 +	/* Grab next record: locks chain and returned record,
  2.1503 +	   unlocks old record */
  2.1504 +	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
  2.1505 +		key.dsize = rec.key_len;
  2.1506 +		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
  2.1507 +					  key.dsize);
  2.1508 +		/* Unlock the chain of this new record */
  2.1509 +		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
  2.1510 +			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
  2.1511 +	}
  2.1512 +	/* Unlock the chain of old record */
  2.1513 +	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
  2.1514 +		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
  2.1515 +	return key;
  2.1516 +}
  2.1517 +
  2.1518 +/* remove the record matching `key' from the chain for `hash': returns do_delete()'s result, or -1 if the lookup fails */
  2.1519 +static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
  2.1520 +{
  2.1521 +	struct list_struct hdr;
  2.1522 +	tdb_off where = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &hdr);
  2.1523 +	int rc;
  2.1524 +
  2.1525 +	if (where == 0)
  2.1526 +		return -1;
  2.1527 +	rc = do_delete(tdb, where, &hdr);
  2.1528 +	if (tdb_unlock(tdb, BUCKET(hdr.full_hash), F_WRLCK) != 0)
  2.1529 +		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
  2.1530 +	return rc;
  2.1531 +}
  2.1532 +
  2.1533 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
  2.1534 +{
  2.1535 +	/* hash the key with the context's hash function, then defer to the per-hash delete */
  2.1536 +	return tdb_delete_hash(tdb, key, tdb->hash_fn(&key));
  2.1537 +}
  2.1538 +
  2.1539 +/* store an element in the database, replacing any existing element
  2.1540 +   with the same key 
  2.1541 +   flag == TDB_INSERT fails with TDB_ERR_EXISTS if the key is already present; flag == TDB_MODIFY fails if the key does not exist
  2.1542 +   return 0 on success, -1 on failure
  2.1543 +*/
  2.1544 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
  2.1545 +{
  2.1546 +	struct list_struct rec;
  2.1547 +	u32 hash;
  2.1548 +	tdb_off rec_ptr;
  2.1549 +	char *p = NULL;
  2.1550 +	int ret = 0;
  2.1551 +
  2.1552 +	/* find which hash bucket it is in */
  2.1553 +	hash = tdb->hash_fn(&key);
  2.1554 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
  2.1555 +		return -1;
  2.1556 +
  2.1557 +	/* check for it existing, on insert. */
  2.1558 +	if (flag == TDB_INSERT) {
  2.1559 +		if (tdb_exists_hash(tdb, key, hash)) {
  2.1560 +			tdb->ecode = TDB_ERR_EXISTS;
  2.1561 +			goto fail;
  2.1562 +		}
  2.1563 +	} else {
  2.1564 +		/* first try in-place update, on modify or replace. */
  2.1565 +		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
  2.1566 +			goto out;
  2.1567 +		if (tdb->ecode == TDB_ERR_NOEXIST &&
  2.1568 +		    flag == TDB_MODIFY) {
  2.1569 +			/* if the record doesn't exist and we are in TDB_MODIFY mode then
  2.1570 +			 we should fail the store */
  2.1571 +			goto fail;
  2.1572 +		}
  2.1573 +	}
  2.1574 +	/* reset the error code potentially set by the tdb_update() */
  2.1575 +	tdb->ecode = TDB_SUCCESS;
  2.1576 +
  2.1577 +	/* delete any existing record - if it doesn't exist we don't
  2.1578 +           care.  Doing this first reduces fragmentation, and avoids
  2.1579 +           coalescing with `allocated' block before it's updated. */
  2.1580 +	if (flag != TDB_INSERT)
  2.1581 +		tdb_delete_hash(tdb, key, hash);
  2.1582 +
  2.1583 +	/* Copy key+value *before* allocating free space in case malloc
  2.1584 +	   fails and we are left with a dead spot in the tdb. */
  2.1585 +
  2.1586 +	if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
  2.1587 +		tdb->ecode = TDB_ERR_OOM;
  2.1588 +		goto fail;
  2.1589 +	}
  2.1590 +
  2.1591 +	memcpy(p, key.dptr, key.dsize);
  2.1592 +	if (dbuf.dsize)
  2.1593 +		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
  2.1594 +
  2.1595 +	/* we have to allocate some space */
  2.1596 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
  2.1597 +		goto fail;
  2.1598 +
  2.1599 +	/* Read hash top into next ptr */
  2.1600 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
  2.1601 +		goto fail;
  2.1602 +
  2.1603 +	rec.key_len = key.dsize;
  2.1604 +	rec.data_len = dbuf.dsize;
  2.1605 +	rec.full_hash = hash;
  2.1606 +	rec.magic = TDB_MAGIC;
  2.1607 +
  2.1608 +	/* write out and point the top of the hash chain at it */
  2.1609 +	if (rec_write(tdb, rec_ptr, &rec) == -1
  2.1610 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
  2.1611 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
  2.1612 +		/* Need to tdb_unallocate() here */
  2.1613 +		goto fail;
  2.1614 +	}
  2.1615 + out:
  2.1616 +	SAFE_FREE(p); 
  2.1617 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
  2.1618 +	return ret;
  2.1619 +fail:
  2.1620 +	ret = -1;	/* failure shares the cleanup under `out' */
  2.1621 +	goto out;
  2.1622 +}
  2.1623 +
  2.1624 +/* Attempt to append data to an entry in place - this only works if the key exists
  2.1625 +   and the record's rec_len can hold key + old data + new data + the tailer.
  2.1626 +   Returns 0 on success, -1 on failure (ecode is reset to TDB_SUCCESS when the only problem is lack of room). Record must be locked before calling.
  2.1627 +*/
  2.1628 +static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
  2.1629 +{
  2.1630 +	struct list_struct rec;
  2.1631 +	tdb_off rec_ptr;
  2.1632 +
  2.1633 +	/* find entry; a missing key is reported as failure so the caller can fall back */
  2.1634 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
  2.1635 +		return -1;
  2.1636 +
  2.1637 +	/* Append of 0 is always ok (nothing is written). */
  2.1638 +	if (new_dbuf.dsize == 0)
  2.1639 +		return 0;
  2.1640 +
  2.1641 +	/* must be long enough for key, old data + new data and tailer */
  2.1642 +	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
  2.1643 +		/* No room. */
  2.1644 +		tdb->ecode = TDB_SUCCESS; /* Not really an error */
  2.1645 +		return -1;
  2.1646 +	}
  2.1647 +
  2.1648 +	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len, /* offset just past the existing data */
  2.1649 +		      new_dbuf.dptr, new_dbuf.dsize) == -1)
  2.1650 +		return -1;
  2.1651 +
  2.1652 +	/* update size: the header is rewritten only after the new bytes were written */
  2.1653 +	rec.data_len += new_dbuf.dsize;
  2.1654 +	return rec_write(tdb, rec_ptr, &rec);
  2.1655 +}
  2.1656 +
  2.1657 +/* Append to an entry. Create if not exist.
  2.1658 +   Returns 0 on success, -1 on failure; when the key is missing the result of tdb_store(..., TDB_INSERT) is returned. */
  2.1659 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
  2.1660 +{
  2.1661 +	struct list_struct rec;
  2.1662 +	u32 hash;
  2.1663 +	tdb_off rec_ptr;
  2.1664 +	char *p = NULL;
  2.1665 +	int ret = 0;
  2.1666 +	size_t new_data_size = 0;
  2.1667 +
  2.1668 +	/* find which hash bucket it is in and write-lock that chain */
  2.1669 +	hash = tdb->hash_fn(&key);
  2.1670 +	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
  2.1671 +		return -1;
  2.1672 +
  2.1673 +	/* first try in-place. */
  2.1674 +	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
  2.1675 +		goto out;
  2.1676 +
  2.1677 +	/* reset the error code potentially set by the tdb_append_inplace() */
  2.1678 +	tdb->ecode = TDB_SUCCESS;
  2.1679 +
  2.1680 +	/* find entry again: in-place failure does not say whether the key is missing or just lacks room */
  2.1681 +	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
  2.1682 +		if (tdb->ecode != TDB_ERR_NOEXIST)
  2.1683 +			goto fail;
  2.1684 +
  2.1685 +		/* Not found - create. */
  2.1686 +		/* NOTE(review): tdb_store() unlocks this bucket on its own exit path, so it presumably re-takes the lock we already hold; this relies on tdb_lock() nesting - confirm. */
  2.1687 +		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
  2.1688 +		goto out;
  2.1689 +	}
  2.1690 +
  2.1691 +	new_data_size = rec.data_len + new_dbuf.dsize;
  2.1692 +
  2.1693 +	/* Copy key+old_value+value *before* allocating free space in case malloc
  2.1694 +	   fails and we are left with a dead spot in the tdb. */
  2.1695 +
  2.1696 +	if (!(p = (char *)talloc_size(tdb, key.dsize + new_data_size))) {
  2.1697 +		tdb->ecode = TDB_ERR_OOM;
  2.1698 +		goto fail;
  2.1699 +	}
  2.1700 +
  2.1701 +	/* Copy the key in place. */
  2.1702 +	memcpy(p, key.dptr, key.dsize);
  2.1703 +
  2.1704 +	/* Now read the old data into place. */
  2.1705 +	if (rec.data_len &&
  2.1706 +		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
  2.1707 +			goto fail;
  2.1708 +
  2.1709 +	/* Finally append the new data. */
  2.1710 +	if (new_dbuf.dsize)
  2.1711 +		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
  2.1712 +
  2.1713 +	/* delete any existing record - if it doesn't exist we don't
  2.1714 +           care.  Doing this first reduces fragmentation, and avoids
  2.1715 +           coalescing with `allocated' block before it's updated. */
  2.1716 +
  2.1717 +	tdb_delete_hash(tdb, key, hash); /* result deliberately ignored, see comment above */
  2.1718 +
  2.1719 +	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
  2.1720 +		goto fail;
  2.1721 +
  2.1722 +	/* Read hash top into next ptr */
  2.1723 +	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
  2.1724 +		goto fail;
  2.1725 +
  2.1726 +	rec.key_len = key.dsize;
  2.1727 +	rec.data_len = new_data_size;
  2.1728 +	rec.full_hash = hash;
  2.1729 +	rec.magic = TDB_MAGIC;
  2.1730 +
  2.1731 +	/* write out and point the top of the hash chain at it */
  2.1732 +	if (rec_write(tdb, rec_ptr, &rec) == -1
  2.1733 +	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
  2.1734 +	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
  2.1735 +		/* Need to tdb_unallocate() here: the old record is already deleted and the new block is not released on this path */
  2.1736 +		goto fail;
  2.1737 +	}
  2.1738 +
  2.1739 + out:
  2.1740 +	SAFE_FREE(p); 
  2.1741 +	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
  2.1742 +	return ret;
  2.1743 +
  2.1744 +fail:
  2.1745 +	ret = -1;
  2.1746 +	goto out;
  2.1747 +}
  2.1748 +
  2.1749 +static int tdb_already_open(dev_t device, /* device number of the candidate file */
  2.1750 +			    ino_t ino) /* inode number of the candidate file */
  2.1751 +{ /* Returns 1 if some context on the global tdbs list has this device/inode pair, else 0. */
  2.1752 +	TDB_CONTEXT *i;
  2.1753 +	
  2.1754 +	for (i = tdbs; i; i = i->next) {
  2.1755 +		if (i->device == device && i->inode == ino) {
  2.1756 +			return 1;
  2.1757 +		}
  2.1758 +	}
  2.1759 +
  2.1760 +	return 0;
  2.1761 +}
  2.1762 +
  2.1763 +/* open the database, creating it if necessary 
  2.1764 +
  2.1765 +   The open_flags and mode are passed straight to the open call on the
  2.1766 +   database file. A flags value of O_WRONLY is invalid. The hash size
  2.1767 +   is advisory, use zero for a default value.
  2.1768 +
  2.1769 +   Return is NULL on error, in which case errno is also set.  Don't 
  2.1770 +   try to call tdb_error or tdb_errname, just do strerror(errno).
  2.1771 +   This is tdb_open_ex() with no log function and no hash function supplied, so that function's defaults apply.
  2.1772 +   @param name may be NULL for internal databases. */
  2.1773 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
  2.1774 +		      int open_flags, mode_t mode)
  2.1775 +{
  2.1776 +	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
  2.1777 +}
  2.1778 +
  2.1779 +/* a default logging function: accepts the log_fn arguments and discards them */
  2.1780 +static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
  2.1781 +			int level __attribute__((unused)),
  2.1782 +			const char *fmt __attribute__((unused)), ...)
  2.1783 +{
  2.1784 +}
  2.1785 +
  2.1786 +
  2.1787 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags, /* hash_size 0 selects DEFAULT_HASH_SIZE */
  2.1788 +			 int open_flags, mode_t mode, /* handed to open(2); O_WRONLY is rejected with EINVAL */
  2.1789 +			 tdb_log_func log_fn, /* NULL selects null_log_fn */
  2.1790 +			 tdb_hash_func hash_fn) /* NULL selects default_tdb_hash */
  2.1791 +{ /* Returns the new context, linked onto the global tdbs list, or NULL on failure with errno preserved across the cleanup. */
  2.1792 +	TDB_CONTEXT *tdb;
  2.1793 +	struct stat st;
  2.1794 +	int rev = 0, locked = 0;
  2.1795 +	uint8_t *vp;
  2.1796 +	u32 vertest;
  2.1797 +
  2.1798 +	if (!(tdb = talloc_zero(name, TDB_CONTEXT))) { /* NOTE(review): name is passed as the talloc parent - confirm callers hand in a talloc'd string or NULL */
  2.1799 +		/* Can't log this */
  2.1800 +		errno = ENOMEM;
  2.1801 +		goto fail;
  2.1802 +	}
  2.1803 +	tdb->fd = -1;
  2.1804 +	tdb->name = NULL;
  2.1805 +	tdb->map_ptr = NULL;
  2.1806 +	tdb->flags = tdb_flags;
  2.1807 +	tdb->open_flags = open_flags;
  2.1808 +	tdb->log_fn = log_fn?log_fn:null_log_fn;
  2.1809 +	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
  2.1810 +
  2.1811 +	if ((open_flags & O_ACCMODE) == O_WRONLY) {
  2.1812 +		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
  2.1813 +			 name));
  2.1814 +		errno = EINVAL;
  2.1815 +		goto fail;
  2.1816 +	}
  2.1817 +	
  2.1818 +	if (hash_size == 0)
  2.1819 +		hash_size = DEFAULT_HASH_SIZE;
  2.1820 +	if ((open_flags & O_ACCMODE) == O_RDONLY) {
  2.1821 +		tdb->read_only = 1;
  2.1822 +		/* read only databases don't do locking or clear if first */
  2.1823 +		tdb->flags |= TDB_NOLOCK;
  2.1824 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
  2.1825 +	}
  2.1826 +
  2.1827 +	/* internal databases don't mmap or lock, and start off cleared */
  2.1828 +	if (tdb->flags & TDB_INTERNAL) {
  2.1829 +		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
  2.1830 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
  2.1831 +		if (tdb_new_database(tdb, hash_size) != 0) {
  2.1832 +			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!\n"));
  2.1833 +			goto fail;
  2.1834 +		}
  2.1835 +		goto internal;
  2.1836 +	}
  2.1837 +
  2.1838 +	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
  2.1839 +		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
  2.1840 +			 name, strerror(errno)));
  2.1841 +		goto fail;	/* errno set by open(2) */
  2.1842 +	}
  2.1843 +
  2.1844 +	/* ensure there is only one process initialising at once */
  2.1845 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
  2.1846 +		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
  2.1847 +			 name, strerror(errno)));
  2.1848 +		goto fail;	/* errno set by tdb_brlock */
  2.1849 +	}
  2.1850 +
  2.1851 +	/* we need to zero database if we are the only one with it open: getting the exclusive ACTIVE_LOCK without blocking is the test */
  2.1852 +	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
  2.1853 +		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
  2.1854 +		open_flags |= O_CREAT; /* lets the invalid-header branch below re-initialise the truncated file */
  2.1855 +		if (ftruncate(tdb->fd, 0) == -1) {
  2.1856 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
  2.1857 +				 "failed to truncate %s: %s\n",
  2.1858 +				 name, strerror(errno)));
  2.1859 +			goto fail; /* errno set by ftruncate */
  2.1860 +		}
  2.1861 +	}
  2.1862 +
  2.1863 +	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
  2.1864 +	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
  2.1865 +	    || (tdb->header.version != TDB_VERSION
  2.1866 +		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
  2.1867 +		/* it's not a valid database (short read, bad magic or unknown version) - possibly initialise it */
  2.1868 +		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
  2.1869 +			errno = EIO; /* ie bad format or something */
  2.1870 +			goto fail;
  2.1871 +		}
  2.1872 +		rev = (tdb->flags & TDB_CONVERT);
  2.1873 +	}
  2.1874 +	vp = (uint8_t *)&tdb->header.version; /* read the version bytes most-significant first to detect a big-endian header */
  2.1875 +	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
  2.1876 +		  (((u32)vp[2]) << 8) | (u32)vp[3];
  2.1877 +	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
  2.1878 +	if (!rev)
  2.1879 +		tdb->flags &= ~TDB_CONVERT;
  2.1880 +	else {
  2.1881 +		tdb->flags |= TDB_CONVERT;
  2.1882 +		convert(&tdb->header, sizeof(tdb->header));
  2.1883 +	}
  2.1884 +	if (fstat(tdb->fd, &st) == -1)
  2.1885 +		goto fail;
  2.1886 +
  2.1887 +	/* Is it already in the open list?  If so, fail. NOTE(review): if tdb_brlock() uses fcntl record locks, the close() on the fail path also drops the locks held through the context that is already open - confirm. */
  2.1888 +	if (tdb_already_open(st.st_dev, st.st_ino)) {
  2.1889 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
  2.1890 +			 "%s (%d,%d) is already open in this process\n",
  2.1891 +			 name, (int)st.st_dev, (int)st.st_ino));
  2.1892 +		errno = EBUSY;
  2.1893 +		goto fail;
  2.1894 +	}
  2.1895 +
  2.1896 +	if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
  2.1897 +		errno = ENOMEM;
  2.1898 +		goto fail;
  2.1899 +	}
  2.1900 +
  2.1901 +	tdb->map_size = st.st_size;
  2.1902 +	tdb->device = st.st_dev;
  2.1903 +	tdb->inode = st.st_ino;
  2.1904 +	tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
  2.1905 +					tdb->header.hash_size+1); /* one entry per hash chain plus one extra */
  2.1906 +	if (!tdb->locked) {
  2.1907 +		TDB_LOG((tdb, 2, "tdb_open_ex: "
  2.1908 +			 "failed to allocate lock structure for %s\n",
  2.1909 +			 name));
  2.1910 +		errno = ENOMEM;
  2.1911 +		goto fail;
  2.1912 +	}
  2.1913 +	tdb_mmap(tdb);
  2.1914 +	if (locked) {
  2.1915 +		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) { /* drop the exclusive lock taken for the clear-if-first check */
  2.1916 +			TDB_LOG((tdb, 0, "tdb_open_ex: "
  2.1917 +				 "failed to release ACTIVE_LOCK on %s: %s\n",
  2.1918 +				 name, strerror(errno)));
  2.1919 +			goto fail;
  2.1920 +		}
  2.1921 +
  2.1922 +	}
  2.1923 +
  2.1924 +	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
  2.1925 +	   we didn't get the initial exclusive lock as we need to let all other
  2.1926 +	   users know we're using it. */
  2.1927 +
  2.1928 +	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
  2.1929 +		/* leave this lock in place to indicate it's in use */
  2.1930 +		if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
  2.1931 +			goto fail;
  2.1932 +	}
  2.1933 +
  2.1934 +
  2.1935 + internal:
  2.1936 +	/* Internal (memory-only) databases skip all the code above to
  2.1937 +	 * do with disk files, and resume here by releasing their
  2.1938 +	 * global lock and hooking into the active list. */
  2.1939 +	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
  2.1940 +		goto fail;
  2.1941 +	tdb->next = tdbs;
  2.1942 +	tdbs = tdb;
  2.1943 +	return tdb;
  2.1944 +
  2.1945 + fail:
  2.1946 +	{ int save_errno = errno; /* the cleanup calls below may overwrite errno */
  2.1947 +
  2.1948 +	if (!tdb)
  2.1949 +		return NULL;
  2.1950 +	
  2.1951 +	if (tdb->map_ptr) {
  2.1952 +		if (tdb->flags & TDB_INTERNAL)
  2.1953 +			SAFE_FREE(tdb->map_ptr);
  2.1954 +		else
  2.1955 +			tdb_munmap(tdb);
  2.1956 +	}
  2.1957 +	SAFE_FREE(tdb->name);
  2.1958 +	if (tdb->fd != -1)
  2.1959 +		if (close(tdb->fd) != 0)
  2.1960 +			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
  2.1961 +	SAFE_FREE(tdb->locked);
  2.1962 +	SAFE_FREE(tdb);
  2.1963 +	errno = save_errno;
  2.1964 +	return NULL;
  2.1965 +	}
  2.1966 +}
  2.1967 +
  2.1968 +/**
  2.1969 + * Close a database.
  2.1970 + * Releases the mapping, name, file descriptor and lock array, unlinks the context from the tdbs list, then frees it.
  2.1971 + * @returns the result of close(2) on the descriptor (0 when there was none): -1 for error; 0 for success.
  2.1972 + **/
  2.1973 +int tdb_close(TDB_CONTEXT *tdb)
  2.1974 +{
  2.1975 +	TDB_CONTEXT **i;
  2.1976 +	int ret = 0;
  2.1977 +
  2.1978 +	if (tdb->map_ptr) {
  2.1979 +		if (tdb->flags & TDB_INTERNAL)
  2.1980 +			SAFE_FREE(tdb->map_ptr);
  2.1981 +		else
  2.1982 +			tdb_munmap(tdb);
  2.1983 +	}
  2.1984 +	SAFE_FREE(tdb->name);
  2.1985 +	if (tdb->fd != -1)
  2.1986 +		ret = close(tdb->fd);
  2.1987 +	SAFE_FREE(tdb->locked);
  2.1988 +
  2.1989 +	/* Remove from contexts list (no-op if the context was never linked) */
  2.1990 +	for (i = &tdbs; *i; i = &(*i)->next) {
  2.1991 +		if (*i == tdb) {
  2.1992 +			*i = tdb->next;
  2.1993 +			break;
  2.1994 +		}
  2.1995 +	}
  2.1996 +
  2.1997 +	memset(tdb, 0, sizeof(*tdb)); /* scrub the context before releasing it */
  2.1998 +	SAFE_FREE(tdb);
  2.1999 +
  2.2000 +	return ret;
  2.2001 +}
  2.2002 +
  2.2003 +/* lock entire database: write-lock every hash chain. Returns 0 on success; -1 with ecode TDB_ERR_LOCK for read-only dbs, or TDB_ERR_NOLOCK if any chain could not be locked. */
  2.2004 +int tdb_lockall(TDB_CONTEXT *tdb)
  2.2005 +{
  2.2006 +	u32 i;
  2.2007 +
  2.2008 +	/* There are no locks on read-only dbs */
  2.2009 +	if (tdb->read_only)
  2.2010 +		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
  2.2011 +	for (i = 0; i < tdb->header.hash_size; i++) 
  2.2012 +		if (tdb_lock(tdb, i, F_WRLCK))
  2.2013 +			break;
  2.2014 +
  2.2015 +	/* If error, release locks we have... (chains 0..i-1; chain i itself was not acquired) */
  2.2016 +	if (i < tdb->header.hash_size) {
  2.2017 +		u32 j;
  2.2018 +
  2.2019 +		for ( j = 0; j < i; j++)
  2.2020 +			tdb_unlock(tdb, j, F_WRLCK);
  2.2021 +		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
  2.2022 +	}
  2.2023 +
  2.2024 +	return 0;
  2.2025 +}
  2.2026 +void tdb_unlockall(TDB_CONTEXT *tdb)
  2.2027 +{ /* Releases the write lock on every hash chain; tdb_unlock() results are not checked. */
  2.2028 +	u32 i;
  2.2029 +	for (i=0; i < tdb->header.hash_size; i++)
  2.2030 +		tdb_unlock(tdb, i, F_WRLCK);
  2.2031 +}
  2.2032 +
  2.2033 +/* lock/unlock one hash chain. This is meant to be used to reduce
  2.2034 +   contention - it cannot guarantee how many records will be locked. Write-locks the chain key hashes to and returns tdb_lock()'s result. */
  2.2035 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
  2.2036 +{
  2.2037 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
  2.2038 +}
  2.2039 +
  2.2040 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
  2.2041 +{ /* Releases the write lock on the chain key hashes to; returns tdb_unlock()'s result. */
  2.2042 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
  2.2043 +}
  2.2044 +
  2.2045 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
  2.2046 +{ /* Read-locks the chain key hashes to; returns tdb_lock()'s result. */
  2.2047 +	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
  2.2048 +}
  2.2049 +
  2.2050 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
  2.2051 +{ /* Releases the read lock on the chain key hashes to; returns tdb_unlock()'s result. */
  2.2052 +	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
  2.2053 +}
  2.2054 +
  2.2055 +
  2.2056 +/* register a logging function; passing NULL installs the no-op null_log_fn */
  2.2057 +void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
  2.2058 +{
  2.2059 +	tdb->log_fn = fn?fn:null_log_fn;
  2.2060 +}
  2.2061 +
  2.2062 +
  2.2063 +/* reopen a tdb - this can be used after a fork to ensure that we have an independent
  2.2064 +   seek pointer from our parent and to re-establish locks. Returns 0 on success; on failure returns -1 after calling tdb_close(tdb), so the caller must not use tdb again. */
  2.2065 +int tdb_reopen(TDB_CONTEXT *tdb)
  2.2066 +{
  2.2067 +	struct stat st;
  2.2068 +
  2.2069 +	if (tdb->flags & TDB_INTERNAL)
  2.2070 +		return 0; /* Nothing to do. */
  2.2071 +	if (tdb_munmap(tdb) != 0) {
  2.2072 +		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
  2.2073 +		goto fail;
  2.2074 +	}
  2.2075 +	if (close(tdb->fd) != 0) /* a failed close is only logged; the reopen still proceeds */
  2.2076 +		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
  2.2077 +	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0); /* never create or truncate when reopening */
  2.2078 +	if (tdb->fd == -1) {
  2.2079 +		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
  2.2080 +		goto fail;
  2.2081 +	}
  2.2082 +	if (fstat(tdb->fd, &st) != 0) {
  2.2083 +		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
  2.2084 +		goto fail;
  2.2085 +	}
  2.2086 +	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) { /* the name now refers to a different file than the one originally opened */
  2.2087 +		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
  2.2088 +		goto fail;
  2.2089 +	}
  2.2090 +	tdb_mmap(tdb);
  2.2091 +	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
  2.2092 +		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
  2.2093 +		goto fail;
  2.2094 +	}
  2.2095 +
  2.2096 +	return 0;
  2.2097 +
  2.2098 +fail:
  2.2099 +	tdb_close(tdb); /* tears down and releases the whole context */
  2.2100 +	return -1;
  2.2101 +}
  2.2102 +
  2.2103 +/* Not general: only works if single writer. Writes the raw database image to outfile and opens it read-write; returns the new context, or NULL with errno preserved (outfile is unlinked if the copy or the open of the copy fails). */
  2.2104 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile)
  2.2105 +{
  2.2106 +	int fd, saved_errno;
  2.2107 +	TDB_CONTEXT *copy;
  2.2108 +
  2.2109 +	fd = open(outfile, O_TRUNC|O_CREAT|O_WRONLY, 0640);
  2.2110 +	if (fd < 0)
  2.2111 +		return NULL;
  2.2112 +	if (tdb->map_ptr) { /* image available in memory: write it out in one call */
  2.2113 +		if (write(fd,tdb->map_ptr,tdb->map_size) != (int)tdb->map_size) /* a short write counts as failure */
  2.2114 +			goto fail;
  2.2115 +	} else { /* no mapping: stream the file through a stack buffer */
  2.2116 +		char buf[65536];
  2.2117 +		int r;
  2.2118 +
  2.2119 +		lseek(tdb->fd, 0, SEEK_SET); /* moves the source descriptor's file offset; result not checked */
  2.2120 +		while ((r = read(tdb->fd, buf, sizeof(buf))) > 0) {
  2.2121 +			if (write(fd, buf, r) != r)
  2.2122 +				goto fail;
  2.2123 +		}
  2.2124 +		if (r < 0)
  2.2125 +			goto fail;
  2.2126 +	}
  2.2127 +	copy = tdb_open(outfile, 0, 0, O_RDWR, 0);
  2.2128 +	if (!copy)
  2.2129 +		goto fail;
  2.2130 +	close(fd); /* the copy holds its own descriptor from tdb_open */
  2.2131 +	return copy;
  2.2132 +
  2.2133 +fail:
  2.2134 +	saved_errno = errno; /* close/unlink below may overwrite errno */
  2.2135 +	close(fd);
  2.2136 +	unlink(outfile);
  2.2137 +	errno = saved_errno;
  2.2138 +	return NULL;
  2.2139 +}
  2.2140 +
  2.2141 +/* reopen all tdb's on the global list; returns 0, or -1 as soon as one tdb_reopen() fails (later contexts are then left untouched) */
  2.2142 +int tdb_reopen_all(void)
  2.2143 +{
  2.2144 +	TDB_CONTEXT *tdb;
  2.2145 +
  2.2146 +	for (tdb=tdbs; tdb; tdb = tdb->next) {
  2.2147 +		/* Ensure no clear-if-first. The flag is cleared on the context itself and stays cleared afterwards. */
  2.2148 +		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
  2.2149 +		if (tdb_reopen(tdb) != 0)
  2.2150 +			return -1; /* tdb_reopen() has already closed this context, so it must not be dereferenced again */
  2.2151 +	}
  2.2152 +
  2.2153 +	return 0;
  2.2154 +}
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/tools/xenstore/tdb.h	Fri Sep 23 14:25:01 2005 +0100
     3.3 @@ -0,0 +1,157 @@
     3.4 +#ifndef __TDB_H__
     3.5 +#define __TDB_H__
     3.6 +
     3.7 +/* 
     3.8 +   Unix SMB/CIFS implementation.
     3.9 +
    3.10 +   trivial database library
    3.11 +
    3.12 +   Copyright (C) Andrew Tridgell 1999-2004
    3.13 +   
    3.14 +     ** NOTE! The following LGPL license applies to the tdb
    3.15 +     ** library. This does NOT imply that all of Samba is released
    3.16 +     ** under the LGPL
    3.17 +   
    3.18 +   This library is free software; you can redistribute it and/or
    3.19 +   modify it under the terms of the GNU Lesser General Public
    3.20 +   License as published by the Free Software Foundation; either
    3.21 +   version 2 of the License, or (at your option) any later version.
    3.22 +
    3.23 +   This library is distributed in the hope that it will be useful,
    3.24 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
    3.25 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    3.26 +   Lesser General Public License for more details.
    3.27 +
    3.28 +   You should have received a copy of the GNU Lesser General Public
    3.29 +   License along with this library; if not, write to the Free Software
    3.30 +   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    3.31 +*/
    3.32 +
    3.33 +#ifdef  __cplusplus
    3.34 +extern "C" {
    3.35 +#endif
    3.36 +
    3.37 +
    3.38 +/* flags to tdb_store() */
    3.39 +#define TDB_REPLACE 1
    3.40 +#define TDB_INSERT 2
    3.41 +#define TDB_MODIFY 3
    3.42 +
    3.43 +/* flags for tdb_open() */
    3.44 +#define TDB_DEFAULT 0 /* just a readability place holder */
    3.45 +#define TDB_CLEAR_IF_FIRST 1
    3.46 +#define TDB_INTERNAL 2 /* don't store on disk */
    3.47 +#define TDB_NOLOCK   4 /* don't do any locking */
    3.48 +#define TDB_NOMMAP   8 /* don't use mmap */
    3.49 +#define TDB_CONVERT 16 /* convert endian (internal use) */
    3.50 +#define TDB_BIGENDIAN 32 /* header is big-endian (internal use) */
    3.51 +
    3.52 +#define TDB_ERRCODE(code, ret) ((tdb->ecode = (code)), ret) /* sets ecode on the variable named `tdb` in the caller's scope, then evaluates to ret */
    3.53 +
    3.54 +/* error codes */
    3.55 +enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
    3.56 +		TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
    3.57 +		TDB_ERR_NOEXIST};
    3.58 +
    3.59 +#ifndef u32
    3.60 +#define u32 unsigned
    3.61 +#endif
    3.62 +
    3.63 +typedef struct TDB_DATA { /* a key or a value: a pointer plus a byte count */
    3.64 +	char *dptr; /* start of the bytes */
    3.65 +	size_t dsize; /* number of bytes at dptr */
    3.66 +} TDB_DATA;
    3.67 +
    3.68 +typedef u32 tdb_len;
    3.69 +typedef u32 tdb_off;
    3.70 +
    3.71 +/* this is stored at the front of every database */
    3.72 +struct tdb_header {
    3.73 +	char magic_food[32]; /* for /etc/magic */
    3.74 +	u32 version; /* version of the code */
    3.75 +	u32 hash_size; /* number of hash entries */
    3.76 +	tdb_off rwlocks;
    3.77 +	tdb_off reserved[31];
    3.78 +};
    3.79 +
    3.80 +struct tdb_lock_type { /* element type of the tdb_context.locked array */
    3.81 +	u32 count; /* NOTE(review): presumably the nesting depth of the lock held on this chain - confirm against tdb_lock() */
    3.82 +	u32 ltype; /* NOTE(review): presumably the lock type (F_RDLCK/F_WRLCK) currently held - confirm against tdb_lock() */
    3.83 +};
    3.84 +
    3.85 +struct tdb_traverse_lock {
    3.86 +	struct tdb_traverse_lock *next;
    3.87 +	u32 off;
    3.88 +	u32 hash;
    3.89 +};
    3.90 +
    3.91 +#ifndef PRINTF_ATTRIBUTE
    3.92 +#define PRINTF_ATTRIBUTE(a,b)
    3.93 +#endif
    3.94 +
    3.95 +/* this is the context structure that is returned from a db open */
    3.96 +typedef struct tdb_context {
    3.97 +	char *name; /* the name of the database */
    3.98 +	void *map_ptr; /* where it is currently mapped */
    3.99 +	int fd; /* open file descriptor for the database */
   3.100 +	tdb_len map_size; /* how much space has been mapped */
   3.101 +	int read_only; /* opened read-only */
   3.102 +	struct tdb_lock_type *locked; /* array of chain locks */
   3.103 +	enum TDB_ERROR ecode; /* error code for last tdb error */
   3.104 +	struct tdb_header header; /* a cached copy of the header */
   3.105 +	u32 flags; /* the flags passed to tdb_open, as adjusted during open (e.g. TDB_NOLOCK, TDB_CONVERT, TDB_BIGENDIAN) */
   3.106 +	struct tdb_traverse_lock travlocks; /* current traversal locks */
   3.107 +	struct tdb_context *next; /* all tdbs to avoid multiple opens */
   3.108 +	dev_t device;	/* uniquely identifies this tdb */
   3.109 +	ino_t inode;	/* uniquely identifies this tdb */
   3.110 +	void (*log_fn)(struct tdb_context *tdb, int level, const char *, ...) PRINTF_ATTRIBUTE(3,4); /* logging function */
   3.111 +	u32 (*hash_fn)(TDB_DATA *key); /* maps a key to its 32-bit hash; used to pick the hash chain */
   3.112 +	int open_flags; /* flags used in the open - needed by reopen */
   3.113 +} TDB_CONTEXT;
   3.114 +
   3.115 +typedef int (*tdb_traverse_func)(TDB_CONTEXT *, TDB_DATA, TDB_DATA, void *);
   3.116 +typedef void (*tdb_log_func)(TDB_CONTEXT *, int , const char *, ...);
   3.117 +typedef u32 (*tdb_hash_func)(TDB_DATA *key);
   3.118 +
   3.119 +TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
   3.120 +		      int open_flags, mode_t mode);
   3.121 +TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
   3.122 +			 int open_flags, mode_t mode,
   3.123 +			 tdb_log_func log_fn,
   3.124 +			 tdb_hash_func hash_fn);
   3.125 +
   3.126 +int tdb_reopen(TDB_CONTEXT *tdb);
   3.127 +int tdb_reopen_all(void);
   3.128 +void tdb_logging_function(TDB_CONTEXT *tdb, tdb_log_func);
   3.129 +enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb);
   3.130 +const char *tdb_errorstr(TDB_CONTEXT *tdb);
   3.131 +TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key);
   3.132 +int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key);
   3.133 +int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
   3.134 +int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf);
   3.135 +int tdb_close(TDB_CONTEXT *tdb);
   3.136 +TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb);
   3.137 +TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key);
   3.138 +int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *);
   3.139 +int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key);
   3.140 +int tdb_lockall(TDB_CONTEXT *tdb);
   3.141 +void tdb_unlockall(TDB_CONTEXT *tdb);
   3.142 +
   3.143 +/* Low level locking functions: use with care */
   3.144 +int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key);
   3.145 +int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key);
   3.146 +int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
   3.147 +int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key);
   3.148 +TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile);
   3.149 +
   3.150 +/* Debug functions. Not used in production. */
   3.151 +void tdb_dump_all(TDB_CONTEXT *tdb);
   3.152 +int tdb_printfreelist(TDB_CONTEXT *tdb);
   3.153 +
   3.154 +extern TDB_DATA tdb_null;
   3.155 +
   3.156 +#ifdef  __cplusplus
   3.157 +}
   3.158 +#endif
   3.159 +
   3.160 +#endif /* tdb.h */
     4.1 --- a/tools/xenstore/testsuite/04rm.test	Fri Sep 23 14:24:58 2005 +0100
     4.2 +++ b/tools/xenstore/testsuite/04rm.test	Fri Sep 23 14:25:01 2005 +0100
     4.3 @@ -6,6 +6,8 @@ rm /dir/test
     4.4  # Create file and remove it
     4.5  write /test contents
     4.6  rm /test
     4.7 +expect tool
     4.8 +dir /
     4.9  
    4.10  # Create directory and remove it.
    4.11  mkdir /dir
    4.12 @@ -15,3 +17,4 @@ rm /dir
    4.13  mkdir /dir
    4.14  write /dir/test contents
    4.15  rm /dir
    4.16 +
     5.1 --- a/tools/xenstore/testsuite/08transaction.slowtest	Fri Sep 23 14:24:58 2005 +0100
     5.2 +++ b/tools/xenstore/testsuite/08transaction.slowtest	Fri Sep 23 14:25:01 2005 +0100
     5.3 @@ -1,21 +1,43 @@
     5.4 -# Test transaction timeouts.  Take a second each.
     5.5 +# Test transaction clashes.
     5.6  
     5.7  mkdir /test
     5.8  write /test/entry1 contents
     5.9  
    5.10 -# Transactions can take as long as the want...
    5.11 -start /test
    5.12 -sleep 1100
    5.13 -rm /test/entry1
    5.14 -commit
    5.15 -dir /test
    5.16 +# Start transaction, do read-only op, transaction succeeds
    5.17 +1 start
    5.18 +1 write /test/entry1 contents2
    5.19 +expect contents
    5.20 +read /test/entry1
    5.21 +1 commit
    5.22 +expect contents2
    5.23 +read /test/entry1
    5.24 +
    5.25 +# Start transaction, abort other transaction, transaction succeeds.
    5.26 +1 start
    5.27 +1 write /test/entry1 contents3
    5.28 +start
    5.29 +write /test/entry1 contents
    5.30 +abort
    5.31 +1 commit
    5.32 +expect contents3
    5.33 +read /test/entry1
    5.34  
    5.35 -# ... as long as noone is waiting.
    5.36 -1 start /test
    5.37 -notimeout
    5.38 -2 mkdir /test/dir
    5.39 -1 mkdir /test/dir
    5.40 -expect 1:dir
    5.41 -1 dir /test
    5.42 -expect 1: commit failed: Connection timed out
    5.43 +# Start transaction, do write op, transaction fails
    5.44 +1 start
    5.45 +1 write /test/entry1 contents4
    5.46 +write /test/entry1 contents
    5.47 +expect 1: commit failed: Resource temporarily unavailable
    5.48  1 commit
    5.49 +expect contents
    5.50 +read /test/entry1
    5.51 +
    5.52 +# Start transaction, do other transaction, transaction fails
    5.53 +1 start
    5.54 +1 write /test/entry1 contents4
    5.55 +start
    5.56 +write /test/entry1 contents5
    5.57 +commit
    5.58 +expect 1: commit failed: Resource temporarily unavailable
    5.59 +1 commit
    5.60 +expect contents5
    5.61 +read /test/entry1
     6.1 --- a/tools/xenstore/testsuite/08transaction.test	Fri Sep 23 14:24:58 2005 +0100
     6.2 +++ b/tools/xenstore/testsuite/08transaction.test	Fri Sep 23 14:25:01 2005 +0100
     6.3 @@ -3,7 +3,7 @@
     6.4  mkdir /test
     6.5  
     6.6  # Simple transaction: create a file inside transaction.
     6.7 -1 start /test
     6.8 +1 start
     6.9  1 write /test/entry1 contents
    6.10  2 dir /test
    6.11  expect 1:entry1
    6.12 @@ -15,7 +15,7 @@ 2 read /test/entry1
    6.13  rm /test/entry1
    6.14  
    6.15  # Create a file and abort transaction.
    6.16 -1 start /test
    6.17 +1 start
    6.18  1 write /test/entry1 contents
    6.19  2 dir /test
    6.20  expect 1:entry1
    6.21 @@ -25,7 +25,7 @@ 2 dir /test
    6.22  
    6.23  write /test/entry1 contents
    6.24  # Delete in transaction, commit
    6.25 -1 start /test
    6.26 +1 start
    6.27  1 rm /test/entry1
    6.28  expect 2:entry1
    6.29  2 dir /test
    6.30 @@ -35,7 +35,7 @@ 2 dir /test
    6.31  
    6.32  # Delete in transaction, abort.
    6.33  write /test/entry1 contents
    6.34 -1 start /test
    6.35 +1 start
    6.36  1 rm /test/entry1
    6.37  expect 2:entry1
    6.38  2 dir /test
    6.39 @@ -47,7 +47,7 @@ 2 dir /test
    6.40  # Events inside transactions don't trigger watches until (successful) commit.
    6.41  mkdir /test/dir
    6.42  1 watch /test token
    6.43 -2 start /test
    6.44 +2 start
    6.45  2 mkdir /test/dir/sub
    6.46  expect 1: waitwatch failed: Connection timed out
    6.47  1 waitwatch
    6.48 @@ -55,7 +55,7 @@ 2 close
    6.49  1 close
    6.50  
    6.51  1 watch /test token
    6.52 -2 start /test
    6.53 +2 start
    6.54  2 mkdir /test/dir/sub
    6.55  2 abort
    6.56  expect 1: waitwatch failed: Connection timed out
    6.57 @@ -63,7 +63,7 @@ 1 waitwatch
    6.58  1 close
    6.59  
    6.60  1 watch /test token
    6.61 -2 start /test
    6.62 +2 start
    6.63  2 mkdir /test/dir/sub
    6.64  2 commit
    6.65  expect 1:/test/dir/sub:token
    6.66 @@ -73,7 +73,7 @@ 1 close
    6.67  
    6.68  # Rm inside transaction works like rm outside: children get notified.
    6.69  1 watch /test/dir/sub token
    6.70 -2 start /test
    6.71 +2 start
    6.72  2 rm /test/dir
    6.73  2 commit
    6.74  expect 1:/test/dir/sub:token
    6.75 @@ -83,7 +83,7 @@ 1 close
    6.76  
    6.77  # Multiple events from single transaction don't trigger assert
    6.78  1 watch /test token
    6.79 -2 start /test
    6.80 +2 start
    6.81  2 write /test/1 contents
    6.82  2 write /test/2 contents
    6.83  2 commit
     7.1 --- a/tools/xenstore/testsuite/12readonly.test	Fri Sep 23 14:24:58 2005 +0100
     7.2 +++ b/tools/xenstore/testsuite/12readonly.test	Fri Sep 23 14:25:01 2005 +0100
     7.3 @@ -13,23 +13,23 @@ expect 0 READ
     7.4  getperm /test
     7.5  watch /test token
     7.6  unwatch /test token 
     7.7 -start /
     7.8 +start
     7.9  commit
    7.10 -start /
    7.11 +start
    7.12  abort
    7.13  
    7.14  # These don't work
    7.15 -expect write failed: Read-only file system
    7.16 +expect write failed: Permission denied
    7.17  write /test2 contents
    7.18 -expect write failed: Read-only file system
    7.19 +expect write failed: Permission denied
    7.20  write /test contents
    7.21 -expect setperm failed: Read-only file system
    7.22 +expect setperm failed: Permission denied
    7.23  setperm /test 100 NONE
    7.24 -expect setperm failed: Read-only file system
    7.25 +expect setperm failed: Permission denied
    7.26  setperm /test 100 NONE
    7.27 -expect shutdown failed: Read-only file system
    7.28 +expect shutdown failed: Permission denied
    7.29  shutdown
    7.30 -expect introduce failed: Read-only file system
    7.31 +expect introduce failed: Permission denied
    7.32  introduce 1 100 7 /home
    7.33  
    7.34  # Check that watches work like normal.
     8.1 --- a/tools/xenstore/testsuite/14complexperms.test	Fri Sep 23 14:24:58 2005 +0100
     8.2 +++ b/tools/xenstore/testsuite/14complexperms.test	Fri Sep 23 14:25:01 2005 +0100
     8.3 @@ -33,14 +33,6 @@ unwatch /dir/file token
     8.4  expect *No such file or directory
     8.5  unwatch /dir/file token 
     8.6  expect *Permission denied
     8.7 -start /dir/file
     8.8 -expect *No such file or directory
     8.9 -abort
    8.10 -expect *Permission denied
    8.11 -start /dir/file
    8.12 -expect *No such file or directory
    8.13 -commit
    8.14 -expect *Permission denied
    8.15  introduce 2 100 7 /dir/file
    8.16  
    8.17  # Now it exists
    8.18 @@ -73,12 +65,4 @@ unwatch /dir/file token
    8.19  expect *No such file or directory
    8.20  unwatch /dir/file token 
    8.21  expect *Permission denied
    8.22 -start /dir/file
    8.23 -expect *No such file or directory
    8.24 -abort
    8.25 -expect *Permission denied
    8.26 -start /dir/file
    8.27 -expect *No such file or directory
    8.28 -commit
    8.29 -expect *Permission denied
    8.30  introduce 2 100 7 /dir/file
     9.1 --- a/tools/xenstore/testsuite/16block-watch-crash.test	Fri Sep 23 14:24:58 2005 +0100
     9.2 +++ b/tools/xenstore/testsuite/16block-watch-crash.test	Fri Sep 23 14:25:01 2005 +0100
     9.3 @@ -1,13 +1,14 @@
     9.4  # Test case where blocked connection gets sent watch.
     9.5  
     9.6 -mkdir /test
     9.7 -watch /test token
     9.8 -1 start /test
     9.9 -# This will block on above
    9.10 -noackwrite /test/entry contents
    9.11 -1 write /test/entry2 contents
    9.12 -1 commit
    9.13 -readack
    9.14 -expect /test/entry2:token
    9.15 -waitwatch
    9.16 -ackwatch token
    9.17 +# FIXME: We no longer block connections 
    9.18 +# mkdir /test
    9.19 +# watch /test token
    9.20 +# 1 start
    9.21 +# # This will block on above
    9.22 +# noackwrite /test/entry contents
    9.23 +# 1 write /test/entry2 contents
    9.24 +# 1 commit
    9.25 +# readack
    9.26 +# expect /test/entry2:token
    9.27 +# waitwatch
    9.28 +# ackwatch token
    10.1 --- a/tools/xenstore/xenstored.h	Fri Sep 23 14:24:58 2005 +0100
    10.2 +++ b/tools/xenstore/xenstored.h	Fri Sep 23 14:25:01 2005 +0100
    10.3 @@ -75,7 +75,7 @@ static struct xsd_errors xsd_errors[] __
    10.4  	XSD_ERROR(ENOSYS),
    10.5  	XSD_ERROR(EROFS),
    10.6  	XSD_ERROR(EBUSY),
    10.7 -	XSD_ERROR(ETIMEDOUT),
    10.8 +	XSD_ERROR(EAGAIN),
    10.9  	XSD_ERROR(EISCONN),
   10.10  };
   10.11  struct xsd_sockmsg
    11.1 --- a/tools/xenstore/xenstored_core.c	Fri Sep 23 14:24:58 2005 +0100
    11.2 +++ b/tools/xenstore/xenstored_core.c	Fri Sep 23 14:25:01 2005 +0100
    11.3 @@ -50,10 +50,12 @@
    11.4  #include "xenstored_transaction.h"
    11.5  #include "xenstored_domain.h"
    11.6  #include "xenctrl.h"
    11.7 +#include "tdb.h"
    11.8  
    11.9  static bool verbose;
   11.10  LIST_HEAD(connections);
   11.11  static int tracefd = -1;
   11.12 +static TDB_CONTEXT *tdb_ctx;
   11.13  
   11.14  #ifdef TESTING
   11.15  static bool failtest = false;
   11.16 @@ -126,6 +128,23 @@ void __attribute__((noreturn)) corrupt(s
   11.17  	_exit(2);
   11.18  }
   11.19  
   11.20 +TDB_CONTEXT *tdb_context(struct connection *conn)
   11.21 +{
   11.22 +	/* conn = NULL used in manual_node at setup. */
   11.23 +	if (!conn || !conn->transaction)
   11.24 +		return tdb_ctx;
   11.25 +	return tdb_transaction_context(conn->transaction);
   11.26 +}
   11.27 +
   11.28 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
   11.29 +{
   11.30 +	if (rename(newname, xs_daemon_tdb()) != 0)
   11.31 +		return false;
   11.32 +	tdb_close(tdb_ctx);
   11.33 +	tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
   11.34 +	return true;
   11.35 +}
   11.36 +
   11.37  static char *sockmsg_string(enum xsd_sockmsg_type type)
   11.38  {
   11.39  	switch (type) {
   11.40 @@ -202,37 +221,6 @@ void trace_destroy(const void *data, con
   11.41  	write(tracefd, string, strlen(string));
   11.42  }
   11.43  
   11.44 -void trace_watch_timeout(const struct connection *conn, const char *node, const char *token)
   11.45 -{
   11.46 -	char string[64];
   11.47 -	if (tracefd < 0)
   11.48 -		return;
   11.49 -	write(tracefd, "WATCH_TIMEOUT ", strlen("WATCH_TIMEOUT "));
   11.50 -	sprintf(string, " %p ", conn);
   11.51 -	write(tracefd, string, strlen(string));
   11.52 -	write(tracefd, " (", 2);
   11.53 -	write(tracefd, node, strlen(node));
   11.54 -	write(tracefd, " ", 1);
   11.55 -	write(tracefd, token, strlen(token));
   11.56 -	write(tracefd, ")\n", 2);
   11.57 -}
   11.58 -
   11.59 -static void trace_blocked(const struct connection *conn,
   11.60 -			  const struct buffered_data *data)
   11.61 -{
   11.62 -	char string[64];
   11.63 -
   11.64 -	if (tracefd < 0)
   11.65 -		return;
   11.66 -
   11.67 -	write(tracefd, "BLOCKED", strlen("BLOCKED"));
   11.68 -	sprintf(string, " %p (", conn);
   11.69 -	write(tracefd, string, strlen(string));
   11.70 -	write(tracefd, sockmsg_string(data->hdr.msg.type),
   11.71 -	      strlen(sockmsg_string(data->hdr.msg.type)));
   11.72 -	write(tracefd, ")\n", 2);
   11.73 -}
   11.74 -
   11.75  void trace(const char *fmt, ...)
   11.76  {
   11.77  	va_list arglist;
   11.78 @@ -253,7 +241,6 @@ static bool write_message(struct connect
   11.79  	int ret;
   11.80  	struct buffered_data *out = conn->out;
   11.81  
   11.82 -	assert(conn->state != BLOCKED);
   11.83  	if (out->inhdr) {
   11.84  		if (verbose)
   11.85  			xprintf("Writing msg %s (%s) out to %p\n",
   11.86 @@ -351,24 +338,6 @@ static int initialize_set(fd_set *inset,
   11.87  	return max;
   11.88  }
   11.89  
   11.90 -/* Read everything from a talloc_open'ed fd. */
   11.91 -void *read_all(int *fd, unsigned int *size)
   11.92 -{
   11.93 -	unsigned int max = 4;
   11.94 -	int ret;
   11.95 -	void *buffer = talloc_size(fd, max);
   11.96 -
   11.97 -	*size = 0;
   11.98 -	while ((ret = read(*fd, buffer + *size, max - *size)) > 0) {
   11.99 -		*size += ret;
  11.100 -		if (*size == max)
  11.101 -			buffer = talloc_realloc_size(fd, buffer, max *= 2);
  11.102 -	}
  11.103 -	if (ret < 0)
  11.104 -		return NULL;
  11.105 -	return buffer;
  11.106 -}
  11.107 -
  11.108  static int destroy_fd(void *_fd)
  11.109  {
  11.110  	int *fd = _fd;
  11.111 @@ -409,42 +378,167 @@ bool is_child(const char *child, const c
  11.112  	return child[len] == '/' || child[len] == '\0';
  11.113  }
  11.114  
  11.115 -/* Answer never ends in /. */
  11.116 -char *node_dir_outside_transaction(const char *node)
  11.117 +/* If it fails, returns NULL and sets errno. */
  11.118 +static struct node *read_node(struct connection *conn, const char *name)
  11.119  {
  11.120 -	if (streq(node, "/"))
  11.121 -		return talloc_strdup(node, xs_daemon_store());
  11.122 -	return talloc_asprintf(node, "%s%s", xs_daemon_store(), node);
  11.123 +	TDB_DATA key, data;
  11.124 +	u32 *p;
  11.125 +	struct node *node;
  11.126 +
  11.127 +	key.dptr = (void *)name;
  11.128 +	key.dsize = strlen(name);
  11.129 +	data = tdb_fetch(tdb_context(conn), key);
  11.130 +
  11.131 +	if (data.dptr == NULL) {
  11.132 +		if (tdb_error(tdb_context(conn)) == TDB_ERR_NOEXIST)
  11.133 +			errno = ENOENT;
  11.134 +		else
  11.135 +			errno = EIO;
  11.136 +		return NULL;
  11.137 +	}
  11.138 +
  11.139 +	node = talloc(name, struct node);
  11.140 +	node->name = talloc_strdup(node, name);
  11.141 +	node->parent = NULL;
  11.142 +	node->tdb = tdb_context(conn);
  11.143 +	talloc_steal(node, data.dptr);
  11.144 +
  11.145 +	/* Header: number of permissions, datalen, childlen */
  11.146 +	p = (u32 *)data.dptr;
  11.147 +	node->num_perms = p[0];
  11.148 +	node->datalen = p[1];
  11.149 +	node->childlen = p[2];
  11.150 +
  11.151 +	/* Permissions are struct xs_permissions. */
  11.152 +	node->perms = (void *)&p[3];
  11.153 +	/* Data is binary blob (usually ascii, no nul). */
  11.154 +	node->data = node->perms + node->num_perms;
  11.155 +	/* Children is strings, nul separated. */
  11.156 +	node->children = node->data + node->datalen;
  11.157 +
  11.158 +	return node;
  11.159  }
  11.160  
  11.161 -static char *node_dir(struct transaction *trans, const char *node)
  11.162 +static bool write_node(struct connection *conn, const struct node *node)
  11.163  {
  11.164 -	if (!trans || !within_transaction(trans, node))
  11.165 -		return node_dir_outside_transaction(node);
  11.166 -	return node_dir_inside_transaction(trans, node);
  11.167 +	TDB_DATA key, data;
  11.168 +	void *p;
  11.169 +
  11.170 +	key.dptr = (void *)node->name;
  11.171 +	key.dsize = strlen(node->name);
  11.172 +
  11.173 +	data.dsize = 3*sizeof(u32)
  11.174 +		+ node->num_perms*sizeof(node->perms[0])
  11.175 +		+ node->datalen + node->childlen;
  11.176 +	data.dptr = talloc_size(node, data.dsize);
  11.177 +	((u32 *)data.dptr)[0] = node->num_perms;
  11.178 +	((u32 *)data.dptr)[1] = node->datalen;
  11.179 +	((u32 *)data.dptr)[2] = node->childlen;
  11.180 +	p = data.dptr + 3 * sizeof(u32);
  11.181 +
  11.182 +	memcpy(p, node->perms, node->num_perms*sizeof(node->perms[0]));
  11.183 +	p += node->num_perms*sizeof(node->perms[0]);
  11.184 +	memcpy(p, node->data, node->datalen);
  11.185 +	p += node->datalen;
  11.186 +	memcpy(p, node->children, node->childlen);
  11.187 +
  11.188 +	/* TDB should set errno, but doesn't even set ecode AFAICT. */
  11.189 +	if (tdb_store(tdb_context(conn), key, data, TDB_REPLACE) != 0) {
  11.190 +		errno = ENOSPC;
  11.191 +		return false;
  11.192 +	}
  11.193 +	return true;
  11.194  }
  11.195  
  11.196 -static char *datafile(const char *dir)
  11.197 +static enum xs_perm_type perm_for_conn(struct connection *conn,
  11.198 +				       struct xs_permissions *perms,
  11.199 +				       unsigned int num)
  11.200  {
  11.201 -	return talloc_asprintf(dir, "%s/.data", dir);
  11.202 +	unsigned int i;
  11.203 +	enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  11.204 +
  11.205 +	if (!conn->can_write)
  11.206 +		mask &= ~XS_PERM_WRITE;
  11.207 +
  11.208 +	/* Owners and tools get it all... */
  11.209 +	if (!conn->id || perms[0].id == conn->id)
  11.210 +		return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
  11.211 +
  11.212 +	for (i = 1; i < num; i++)
  11.213 +		if (perms[i].id == conn->id)
  11.214 +			return perms[i].perms & mask;
  11.215 +
  11.216 +	return perms[0].perms & mask;
  11.217  }
  11.218  
  11.219 -static char *node_datafile(struct transaction *trans, const char *node)
  11.220 +static char *get_parent(const char *node)
  11.221  {
  11.222 -	return datafile(node_dir(trans, node));
  11.223 +	char *slash = strrchr(node + 1, '/');
  11.224 +	if (!slash)
  11.225 +		return talloc_strdup(node, "/");
  11.226 +	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  11.227  }
  11.228  
  11.229 -static char *permfile(const char *dir)
  11.230 +/* What do parents say? */
  11.231 +static enum xs_perm_type ask_parents(struct connection *conn, const char *name)
  11.232  {
  11.233 -	return talloc_asprintf(dir, "%s/.perms", dir);
  11.234 +	struct node *node;
  11.235 +
  11.236 +	do {
  11.237 +		name = get_parent(name);
  11.238 +		node = read_node(conn, name);
  11.239 +		if (node)
  11.240 +			break;
  11.241 +	} while (!streq(name, "/"));
  11.242 +
  11.243 +	/* No permission at root?  We're in trouble. */
  11.244 +	if (!node)
  11.245 +		corrupt(conn, "No permissions file at root");
  11.246 +
  11.247 +	return perm_for_conn(conn, node->perms, node->num_perms);
  11.248  }
  11.249  
  11.250 -static char *node_permfile(struct transaction *trans, const char *node)
  11.251 +/* We have a weird permissions system.  You can allow someone into a
  11.252 + * specific node without allowing it in the parents.  If it's going to
  11.253 + * fail, however, we don't want the errno to indicate any information
  11.254 + * about the node. */
  11.255 +static int errno_from_parents(struct connection *conn, const char *node,
  11.256 +			      int errnum, enum xs_perm_type perm)
  11.257  {
  11.258 -	return permfile(node_dir(trans, node));
  11.259 +	/* We always tell them about memory failures. */
  11.260 +	if (errnum == ENOMEM)
  11.261 +		return errnum;
  11.262 +
  11.263 +	if (ask_parents(conn, node) & perm)
  11.264 +		return errnum;
  11.265 +	return EACCES;
  11.266  }
  11.267  
  11.268 -struct buffered_data *new_buffer(void *ctx)
  11.269 +/* If it fails, returns NULL and sets errno. */
  11.270 +struct node *get_node(struct connection *conn,
  11.271 +		      const char *name,
  11.272 +		      enum xs_perm_type perm)
  11.273 +{
  11.274 +	struct node *node;
  11.275 +
  11.276 +	if (!name || !is_valid_nodename(name)) {
  11.277 +		errno = EINVAL;
  11.278 +		return NULL;
  11.279 +	}
  11.280 +	node = read_node(conn, name);
  11.281 +	/* If we don't have permission, we don't have node. */
  11.282 +	if (node) {
  11.283 +		if ((perm_for_conn(conn, node->perms, node->num_perms) & perm)
  11.284 +		    != perm)
  11.285 +			node = NULL;
  11.286 +	}
  11.287 +	/* Clean up errno if they weren't supposed to know. */
  11.288 +	if (!node) 
  11.289 +		errno = errno_from_parents(conn, name, errno, perm);
  11.290 +	return node;
  11.291 +}
  11.292 +
  11.293 +static struct buffered_data *new_buffer(void *ctx)
  11.294  {
  11.295  	struct buffered_data *data;
  11.296  
  11.297 @@ -457,7 +551,8 @@ struct buffered_data *new_buffer(void *c
  11.298  }
  11.299  
  11.300  /* Return length of string (including nul) at this offset. */
  11.301 -unsigned int get_string(const struct buffered_data *data, unsigned int offset)
  11.302 +static unsigned int get_string(const struct buffered_data *data,
  11.303 +			       unsigned int offset)
  11.304  {
  11.305  	const char *nul;
  11.306  
  11.307 @@ -508,7 +603,6 @@ void send_reply(struct connection *conn,
  11.308  		conn->waiting_reply = bdata;
  11.309  	} else
  11.310  		conn->out = bdata;
  11.311 -	assert(conn->state != BLOCKED);
  11.312  	conn->state = BUSY;
  11.313  }
  11.314  
  11.315 @@ -567,29 +661,6 @@ static const char *onearg(struct buffere
  11.316  	return in->buffer;
  11.317  }
  11.318  
  11.319 -/* If it fails, returns NULL and sets errno. */
  11.320 -static struct xs_permissions *get_perms(const char *dir, unsigned int *num)
  11.321 -{
  11.322 -	unsigned int size;
  11.323 -	char *strings;
  11.324 -	struct xs_permissions *ret;
  11.325 -	int *fd;
  11.326 -
  11.327 -	fd = talloc_open(permfile(dir), O_RDONLY, 0);
  11.328 -	if (!fd)
  11.329 -		return NULL;
  11.330 -	strings = read_all(fd, &size);
  11.331 -	if (!strings)
  11.332 -		return NULL;
  11.333 -
  11.334 -	*num = xs_count_strings(strings, size);
  11.335 -	ret = talloc_array(dir, struct xs_permissions, *num);
  11.336 -	if (!xs_strings_to_perms(ret, *num, strings))
  11.337 -		corrupt(NULL, "Permissions corrupt for %s", dir);
  11.338 -
  11.339 -	return ret;
  11.340 -}
  11.341 -
  11.342  static char *perms_to_strings(const void *ctx,
  11.343  			      struct xs_permissions *perms, unsigned int num,
  11.344  			      unsigned int *len)
  11.345 @@ -610,173 +681,6 @@ static char *perms_to_strings(const void
  11.346  	return strings;
  11.347  }
  11.348  
  11.349 -/* Destroy this, and its children, and its children's children. */
  11.350 -int destroy_path(void *path)
  11.351 -{
  11.352 -	DIR *dir;
  11.353 -	struct dirent *dirent;
  11.354 -
  11.355 -	dir = opendir(path);
  11.356 -	if (!dir) {
  11.357 -		if (unlink(path) == 0 || errno == ENOENT)
  11.358 -			return 0;
  11.359 -		corrupt(NULL, "Destroying path %s", path);
  11.360 -	}
  11.361 -
  11.362 -	while ((dirent = readdir(dir)) != NULL) {
  11.363 -		char fullpath[strlen(path) + 1 + strlen(dirent->d_name) + 1];
  11.364 -		sprintf(fullpath, "%s/%s", (char *)path, dirent->d_name);
  11.365 -		if (!streq(dirent->d_name,".") && !streq(dirent->d_name,".."))
  11.366 -			destroy_path(fullpath);
  11.367 -	}
  11.368 -	closedir(dir);
  11.369 -	if (rmdir(path) != 0)
  11.370 -		corrupt(NULL, "Destroying directory %s", path);
  11.371 -	return 0;
  11.372 -}
  11.373 -
  11.374 -/* Create a self-destructing temporary path */
  11.375 -static char *temppath(const char *path)
  11.376 -{
  11.377 -	char *tmppath = talloc_asprintf(path, "%s.tmp", path);
  11.378 -	talloc_set_destructor(tmppath, destroy_path);
  11.379 -	return tmppath;
  11.380 -}
  11.381 -
  11.382 -/* Create a self-destructing temporary file */
  11.383 -static char *tempfile(const char *path, void *contents, unsigned int len)
  11.384 -{
  11.385 -	int *fd;
  11.386 -	char *tmppath = temppath(path);
  11.387 -
  11.388 -	fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  11.389 -	if (!fd)
  11.390 -		return NULL;
  11.391 -	if (!xs_write_all(*fd, contents, len))
  11.392 -		return NULL;
  11.393 -
  11.394 -	return tmppath;
  11.395 -}
  11.396 -
  11.397 -static int destroy_opendir(void *_dir)
  11.398 -{
  11.399 -	DIR **dir = _dir;
  11.400 -	closedir(*dir);
  11.401 -	return 0;
  11.402 -}
  11.403 -
  11.404 -/* Return a pointer to a DIR*, self-closing and attached to this pathname. */
  11.405 -DIR **talloc_opendir(const char *pathname)
  11.406 -{
  11.407 -	DIR **dir;
  11.408 -
  11.409 -	dir = talloc(pathname, DIR *);
  11.410 -	*dir = opendir(pathname);
  11.411 -	if (!*dir) {
  11.412 -		int saved_errno = errno;
  11.413 -		talloc_free(dir);
  11.414 -		errno = saved_errno;
  11.415 -		return NULL;
  11.416 -	}
  11.417 -	talloc_set_destructor(dir, destroy_opendir);
  11.418 -	return dir;
  11.419 -}
  11.420 -
  11.421 -/* We assume rename() doesn't fail on moves in same dir. */
  11.422 -static void commit_tempfile(const char *path)
  11.423 -{
  11.424 -	char realname[strlen(path) + 1];
  11.425 -	unsigned int len = strrchr(path, '.') - path;
  11.426 -
  11.427 -	memcpy(realname, path, len);
  11.428 -	realname[len] = '\0';
  11.429 -	if (rename(path, realname) != 0)
  11.430 -		corrupt(NULL, "Committing %s", realname);
  11.431 -	talloc_set_destructor(path, NULL);
  11.432 -}
  11.433 -
  11.434 -static bool set_perms(struct transaction *transaction,
  11.435 -		      const char *node,
  11.436 -		      struct xs_permissions *perms, unsigned int num)
  11.437 -{
  11.438 -	unsigned int len;
  11.439 -	char *permpath, *strings;
  11.440 -
  11.441 -	strings = perms_to_strings(node, perms, num, &len);
  11.442 -	if (!strings)
  11.443 -		return false;
  11.444 -
  11.445 -	/* Create then move. */
  11.446 -	permpath = tempfile(node_permfile(transaction, node), strings, len);
  11.447 -	if (!permpath)
  11.448 -		return false;
  11.449 -
  11.450 -	commit_tempfile(permpath);
  11.451 -	return true;
  11.452 -}
  11.453 -
  11.454 -static char *get_parent(const char *node)
  11.455 -{
  11.456 -	char *slash = strrchr(node + 1, '/');
  11.457 -	if (!slash)
  11.458 -		return talloc_strdup(node, "/");
  11.459 -	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
  11.460 -}
  11.461 -
  11.462 -static enum xs_perm_type perm_for_id(domid_t id,
  11.463 -				     struct xs_permissions *perms,
  11.464 -				     unsigned int num)
  11.465 -{
  11.466 -	unsigned int i;
  11.467 -
  11.468 -	/* Owners and tools get it all... */
  11.469 -	if (!id || perms[0].id == id)
  11.470 -		return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
  11.471 -
  11.472 -	for (i = 1; i < num; i++)
  11.473 -		if (perms[i].id == id)
  11.474 -			return perms[i].perms;
  11.475 -
  11.476 -	return perms[0].perms;
  11.477 -}
  11.478 -
  11.479 -/* What do parents say? */
  11.480 -static enum xs_perm_type ask_parents(struct connection *conn,
  11.481 -				     const char *node)
  11.482 -{
  11.483 -	struct xs_permissions *perms;
  11.484 -	unsigned int num;
  11.485 -
  11.486 -	do {
  11.487 -		node = get_parent(node);
  11.488 -		perms = get_perms(node_dir(conn->transaction, node), &num);
  11.489 -		if (perms)
  11.490 -			break;
  11.491 -	} while (!streq(node, "/"));
  11.492 -
  11.493 -	/* No permission at root?  We're in trouble. */
  11.494 -	if (!perms)
  11.495 -		corrupt(conn, "No permissions file at root");
  11.496 -
  11.497 -	return perm_for_id(conn->id, perms, num);
  11.498 -}
  11.499 -
  11.500 -/* We have a weird permissions system.  You can allow someone into a
  11.501 - * specific node without allowing it in the parents.  If it's going to
  11.502 - * fail, however, we don't want the errno to indicate any information
  11.503 - * about the node. */
  11.504 -static int errno_from_parents(struct connection *conn, const char *node,
  11.505 -			      int errnum)
  11.506 -{
  11.507 -	/* We always tell them about memory failures. */
  11.508 -	if (errnum == ENOMEM)
  11.509 -		return errnum;
  11.510 -
  11.511 -	if (ask_parents(conn, node) & XS_PERM_READ)
  11.512 -		return errnum;
  11.513 -	return EACCES;
  11.514 -}
  11.515 -
  11.516  char *canonicalize(struct connection *conn, const char *node)
  11.517  {
  11.518  	const char *prefix;
  11.519 @@ -789,46 +693,6 @@ char *canonicalize(struct connection *co
  11.520  	return (char *)node;
  11.521  }
  11.522  
  11.523 -bool check_node_perms(struct connection *conn, const char *node,
  11.524 -		      enum xs_perm_type perm)
  11.525 -{
  11.526 -	struct xs_permissions *perms;
  11.527 -	unsigned int num;
  11.528 -
  11.529 -	if (!node || !is_valid_nodename(node)) {
  11.530 -		errno = EINVAL;
  11.531 -		return false;
  11.532 -	}
  11.533 -
  11.534 -	if (!conn->can_write && (perm & XS_PERM_WRITE)) {
  11.535 -		errno = EROFS;
  11.536 -		return false;
  11.537 -	}
  11.538 -
  11.539 -	perms = get_perms(node_dir(conn->transaction, node), &num);
  11.540 -
  11.541 -	if (perms) {
  11.542 -		if (perm_for_id(conn->id, perms, num) & perm)
  11.543 -			return true;
  11.544 -		errno = EACCES;
  11.545 -		return false;
  11.546 -	}
  11.547 -
  11.548 -	/* If it's OK not to exist, we consult parents. */
  11.549 -	if (errno == ENOENT && (perm & XS_PERM_ENOENT_OK)) {
  11.550 -		if (ask_parents(conn, node) & perm)
  11.551 -			return true;
  11.552 -		/* Parents say they should not know. */
  11.553 -		errno = EACCES;
  11.554 -		return false;
  11.555 -	}
  11.556 -
  11.557 -	/* They might not have permission to even *see* this node, in
  11.558 -	 * which case we return EACCES even if it's ENOENT or EIO. */
  11.559 -	errno = errno_from_parents(conn, node, errno);
  11.560 -	return false;
  11.561 -}
  11.562 -
  11.563  bool check_event_node(const char *node)
  11.564  {
  11.565  	if (!node || !strstarts(node, "@")) {
  11.566 @@ -838,142 +702,144 @@ bool check_event_node(const char *node)
  11.567  	return true;
  11.568  }
  11.569  
  11.570 -static void send_directory(struct connection *conn, const char *node)
  11.571 +static void send_directory(struct connection *conn, const char *name)
  11.572  {
  11.573 -	char *path, *reply;
  11.574 -	unsigned int reply_len = 0;
  11.575 -	DIR **dir;
  11.576 -	struct dirent *dirent;
  11.577 +	struct node *node;
  11.578  
  11.579 -	node = canonicalize(conn, node);
  11.580 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  11.581 -		send_error(conn, errno);
  11.582 -		return;
  11.583 -	}
  11.584 -
  11.585 -	path = node_dir(conn->transaction, node);
  11.586 -	dir = talloc_opendir(path);
  11.587 -	if (!dir) {
  11.588 +	name = canonicalize(conn, name);
  11.589 +	node = get_node(conn, name, XS_PERM_READ);
  11.590 +	if (!node) {
  11.591  		send_error(conn, errno);
  11.592  		return;
  11.593  	}
  11.594  
  11.595 -	reply = talloc_strdup(node, "");
  11.596 -	while ((dirent = readdir(*dir)) != NULL) {
  11.597 -		int len = strlen(dirent->d_name) + 1;
  11.598 -
  11.599 -		if (!valid_chars(dirent->d_name))
  11.600 -			continue;
  11.601 -
  11.602 -		reply = talloc_realloc(path, reply, char, reply_len + len);
  11.603 -		strcpy(reply + reply_len, dirent->d_name);
  11.604 -		reply_len += len;
  11.605 -	}
  11.606 -
  11.607 -	send_reply(conn, XS_DIRECTORY, reply, reply_len);
  11.608 +	send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
  11.609  }
  11.610  
  11.611 -static void do_read(struct connection *conn, const char *node)
  11.612 +static void do_read(struct connection *conn, const char *name)
  11.613  {
  11.614 -	char *value;
  11.615 -	unsigned int size;
  11.616 -	int *fd;
  11.617 +	struct node *node;
  11.618  
  11.619 -	node = canonicalize(conn, node);
  11.620 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  11.621 -		send_error(conn, errno);
  11.622 -		return;
  11.623 -	}
  11.624 -
  11.625 -	fd = talloc_open(node_datafile(conn->transaction, node), O_RDONLY, 0);
  11.626 -	if (!fd) {
  11.627 -		/* Data file doesn't exist?  We call that a directory */
  11.628 -		if (errno == ENOENT)
  11.629 -			errno = EISDIR;
  11.630 +	name = canonicalize(conn, name);
  11.631 +	node = get_node(conn, name, XS_PERM_READ);
  11.632 +	if (!node) {
  11.633  		send_error(conn, errno);
  11.634  		return;
  11.635  	}
  11.636  
  11.637 -	value = read_all(fd, &size);
  11.638 -	if (!value)
  11.639 -		send_error(conn, errno);
  11.640 -	else
  11.641 -		send_reply(conn, XS_READ, value, size);
  11.642 +	send_reply(conn, XS_READ, node->data, node->datalen);
  11.643  }
  11.644  
  11.645 -/* Commit this directory, eg. comitting a/b.tmp/c causes a/b.tmp -> a.b */
  11.646 -static bool commit_dir(char *dir)
  11.647 +static void delete_node_single(struct connection *conn, struct node *node)
  11.648  {
  11.649 -	char *dot, *slash, *dest;
  11.650 +	TDB_DATA key;
  11.651 +
  11.652 +	key.dptr = (void *)node->name;
  11.653 +	key.dsize = strlen(node->name);
  11.654  
  11.655 -	dot = strrchr(dir, '.');
  11.656 -	slash = strchr(dot, '/');
  11.657 -	if (slash)
  11.658 -		*slash = '\0';
  11.659 +	if (tdb_delete(tdb_context(conn), key) != 0)
  11.660 +		corrupt(conn, "Could not delete '%s'", node->name);
  11.661 +}
  11.662  
  11.663 -	dest = talloc_asprintf(dir, "%.*s", (int)(dot - dir), dir);
  11.664 -	return rename(dir, dest) == 0;
  11.665 +/* Must not be / */
  11.666 +static char *basename(const char *name)
  11.667 +{
  11.668 +	return strrchr(name, '/') + 1;
  11.669  }
  11.670  
  11.671 -/* Create a temporary directory.  Put data in it (if data != NULL) */
  11.672 -static char *tempdir(struct connection *conn,
  11.673 -		     const char *node, void *data, unsigned int datalen)
  11.674 +static struct node *construct_node(struct connection *conn, const char *name)
  11.675  {
  11.676 -	struct xs_permissions *perms;
  11.677 -	char *permstr;
  11.678 -	unsigned int num, len;
  11.679 -	int *fd;
  11.680 -	char *dir;
  11.681 +	const char *base;
  11.682 +	unsigned int baselen;
  11.683 +	struct node *parent, *node;
  11.684 +	char *children, *parentname = get_parent(name);
  11.685 +
  11.686 +	/* If parent doesn't exist, create it. */
  11.687 +	parent = read_node(conn, parentname);
  11.688 +	if (!parent)
  11.689 +		parent = construct_node(conn, parentname);
  11.690 +	if (!parent)
  11.691 +		return NULL;
  11.692 +	
  11.693 +	/* Add child to parent. */
  11.694 +	base = basename(name);
  11.695 +	baselen = strlen(base) + 1;
  11.696 +	children = talloc_array(name, char, parent->childlen + baselen);
  11.697 +	memcpy(children, parent->children, parent->childlen);
  11.698 +	memcpy(children + parent->childlen, base, baselen);
  11.699 +	parent->children = children;
  11.700 +	parent->childlen += baselen;
  11.701 +
  11.702 +	/* Allocate node */
  11.703 +	node = talloc(name, struct node);
  11.704 +	node->tdb = tdb_context(conn);
  11.705 +	node->name = talloc_strdup(node, name);
  11.706 +
  11.707 +	/* Inherit permissions, except domains own what they create */
  11.708 +	node->num_perms = parent->num_perms;
  11.709 +	node->perms = talloc_memdup(node, parent->perms,
  11.710 +				    node->num_perms * sizeof(node->perms[0]));
  11.711 +	if (conn->id)
  11.712 +		node->perms[0].id = conn->id;
  11.713  
  11.714 -	dir = temppath(node_dir(conn->transaction, node));
  11.715 -	if (mkdir(dir, 0750) != 0) {
  11.716 -		if (errno != ENOENT)
  11.717 -			return NULL;
  11.718 +	/* No children, no data */
  11.719 +	node->children = node->data = NULL;
  11.720 +	node->childlen = node->datalen = 0;
  11.721 +	node->parent = parent;
  11.722 +	return node;
  11.723 +}
  11.724 +
  11.725 +static int destroy_node(void *_node)
  11.726 +{
  11.727 +	struct node *node = _node;
  11.728 +	TDB_DATA key;
  11.729 +
  11.730 +	if (streq(node->name, "/"))
  11.731 +		corrupt(NULL, "Destroying root node!");
  11.732 +
  11.733 +	key.dptr = (void *)node->name;
  11.734 +	key.dsize = strlen(node->name);
  11.735 +
  11.736 +	tdb_delete(node->tdb, key);
  11.737 +	return 0;
  11.738 +}
  11.739  
  11.740 -		dir = tempdir(conn, get_parent(node), NULL, 0);
  11.741 -		if (!dir)
  11.742 -			return NULL;
  11.743 +/* Be careful: create hierarchy, put entry in existing parent *last*.
  11.744 + * This helps fsck if we die during this. */
  11.745 +static struct node *create_node(struct connection *conn, 
  11.746 +				const char *name,
  11.747 +				void *data, unsigned int datalen)
  11.748 +{
  11.749 +	struct node *node, *i;
  11.750  
  11.751 -		dir = talloc_asprintf(dir, "%s%s", dir, strrchr(node, '/'));
  11.752 -		if (mkdir(dir, 0750) != 0)
  11.753 +	node = construct_node(conn, name);
  11.754 +	if (!node)
  11.755 +		return NULL;
  11.756 +
  11.757 +	node->data = data;
  11.758 +	node->datalen = datalen;
  11.759 +
  11.760 +	/* Write the nodes out from the new node up to its existing ancestor,
  11.761 +	 * setting a destructor on each in case something goes wrong. */
  11.762 +	for (i = node; i; i = i->parent) {
  11.763 +		if (!write_node(conn, i))
  11.764  			return NULL;
  11.765 -		talloc_set_destructor(dir, destroy_path);
  11.766 +		talloc_set_destructor(i, destroy_node);
  11.767  	}
  11.768  
  11.769 -	perms = get_perms(get_parent(dir), &num);
  11.770 -	assert(perms);
  11.771 -	/* Domains own what they create. */
  11.772 -	if (conn->id)
  11.773 -		perms->id = conn->id;
  11.774 -
  11.775 -	permstr = perms_to_strings(dir, perms, num, &len);
  11.776 -	fd = talloc_open(permfile(dir), O_WRONLY|O_CREAT|O_EXCL, 0640);
  11.777 -	if (!fd || !xs_write_all(*fd, permstr, len))
  11.778 -		return NULL;
  11.779 -
  11.780 -	if (data) {
  11.781 -		char *datapath = datafile(dir);
  11.782 -
  11.783 -		fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640);
  11.784 -		if (!fd || !xs_write_all(*fd, data, datalen))
  11.785 -			return NULL;
  11.786 -	}
  11.787 -	return dir;
  11.788 -}
  11.789 -
  11.790 -static bool node_exists(struct connection *conn, const char *node)
  11.791 -{
  11.792 -	struct stat st;
  11.793 -
  11.794 -	return lstat(node_dir(conn->transaction, node), &st) == 0;
  11.795 +	/* OK, now remove destructors so they stay around */
  11.796 +	for (i = node; i; i = i->parent)
  11.797 +		talloc_set_destructor(i, NULL);
  11.798 +	return node;
  11.799  }
  11.800  
  11.801  /* path, data... */
  11.802  static void do_write(struct connection *conn, struct buffered_data *in)
  11.803  {
  11.804  	unsigned int offset, datalen;
  11.805 +	struct node *node;
  11.806  	char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
  11.807 -	char *node, *tmppath;
  11.808 +	char *name;
  11.809  
  11.810  	/* Extra "strings" can be created by binary data. */
  11.811  	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) {
  11.812 @@ -981,99 +847,115 @@ static void do_write(struct connection *
  11.813  		return;
  11.814  	}
  11.815  
  11.816 -	node = canonicalize(conn, vec[0]);
  11.817 -	if (!within_transaction(conn->transaction, node)) {
  11.818 -		send_error(conn, EROFS);
  11.819 -		return;
  11.820 -	}
  11.821 -
  11.822 -	if (transaction_block(conn, node))
  11.823 -		return;
  11.824 -
  11.825  	offset = strlen(vec[0]) + 1;
  11.826  	datalen = in->used - offset;
  11.827  
  11.828 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  11.829 -		send_error(conn, errno);
  11.830 -		return;
  11.831 +	name = canonicalize(conn, vec[0]);
  11.832 +	node = get_node(conn, name, XS_PERM_WRITE);
  11.833 +	if (!node) {
  11.834 +		/* No permissions, invalid input? */
  11.835 +		if (errno != ENOENT) {
  11.836 +			send_error(conn, errno);
  11.837 +			return;
  11.838 +		}
  11.839 +		node = create_node(conn, name, in->buffer + offset, datalen);
  11.840 +		if (!node) {
  11.841 +			send_error(conn, errno);
  11.842 +			return;
  11.843 +		}
  11.844 +	} else {
  11.845 +		node->data = in->buffer + offset;
  11.846 +		node->datalen = datalen;
  11.847 +		if (!write_node(conn, node)){
  11.848 +			send_error(conn, errno);
  11.849 +			return;
  11.850 +		}
  11.851  	}
  11.852  
  11.853 -	if (!node_exists(conn, node)) {
  11.854 -		char *dir;
  11.855 +	add_change_node(conn->transaction, name, false);
  11.856 +	fire_watches(conn, name, false);
  11.857 +	send_ack(conn, XS_WRITE);
  11.858 +}
  11.859  
  11.860 -		/* Does not exist... */
  11.861 +static void do_mkdir(struct connection *conn, const char *name)
  11.862 +{
  11.863 +	struct node *node;
  11.864 +
  11.865 +	name = canonicalize(conn, name);
  11.866 +	node = get_node(conn, name, XS_PERM_WRITE);
  11.867 +
  11.868 +	/* If it already exists, fine. */
  11.869 +	if (!node) {
  11.870 +		/* No permissions? */
  11.871  		if (errno != ENOENT) {
  11.872  			send_error(conn, errno);
  11.873  			return;
  11.874  		}
  11.875 -
  11.876 -		dir = tempdir(conn, node, in->buffer + offset, datalen);
  11.877 -		if (!dir || !commit_dir(dir)) {
  11.878 -			send_error(conn, errno);
  11.879 -			return;
  11.880 -		}
  11.881 -		
  11.882 -	} else {
  11.883 -		/* Exists... */
  11.884 -		tmppath = tempfile(node_datafile(conn->transaction, node),
  11.885 -				   in->buffer + offset, datalen);
  11.886 -		if (!tmppath) {
  11.887 +		node = create_node(conn, name, NULL, 0);
  11.888 +		if (!node) {
  11.889  			send_error(conn, errno);
  11.890  			return;
  11.891  		}
  11.892 -
  11.893 -		commit_tempfile(tmppath);
  11.894 -	}
  11.895 -
  11.896 -	add_change_node(conn->transaction, node, false);
  11.897 -	fire_watches(conn, node, false);
  11.898 -	send_ack(conn, XS_WRITE);
  11.899 -}
  11.900 -
  11.901 -static void do_mkdir(struct connection *conn, const char *node)
  11.902 -{
  11.903 -	char *dir;
  11.904 -
  11.905 -	node = canonicalize(conn, node);
  11.906 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) {
  11.907 -		send_error(conn, errno);
  11.908 -		return;
  11.909 +		add_change_node(conn->transaction, name, false);
  11.910 +		fire_watches(conn, name, false);
  11.911  	}
  11.912 -
  11.913 -	if (!within_transaction(conn->transaction, node)) {
  11.914 -		send_error(conn, EROFS);
  11.915 -		return;
  11.916 -	}
  11.917 -
  11.918 -	if (transaction_block(conn, node))
  11.919 -		return;
  11.920 -
  11.921 -	/* If it already exists, fine. */
  11.922 -	if (node_exists(conn, node)) {
  11.923 -		send_ack(conn, XS_MKDIR);
  11.924 -		return;
  11.925 -	}
  11.926 -
  11.927 -	dir = tempdir(conn, node, NULL, 0);
  11.928 -	if (!dir || !commit_dir(dir)) {
  11.929 -		send_error(conn, errno);
  11.930 -		return;
  11.931 -	}
  11.932 -
  11.933 -	add_change_node(conn->transaction, node, false);
  11.934 -	fire_watches(conn, node, false);
  11.935  	send_ack(conn, XS_MKDIR);
  11.936  }
  11.937  
  11.938 -static void do_rm(struct connection *conn, const char *node)
  11.939 +/* Remove node and, recursively, every child named in its child list. */
  11.940 +static void delete_node(struct connection *conn, struct node *node)
  11.941 +{
  11.942 +	unsigned int i;
  11.943 +
  11.944 +	/* Delete self, then delete children.  If something goes wrong,
  11.945 +	 * consistency check will clean up this way. */
  11.946 +	delete_node_single(conn, node);
  11.947 +
  11.948 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  11.949 +		struct node *child;
  11.950 +
  11.951 +		child = read_node(conn, 
  11.952 +				  talloc_asprintf(node, "%s/%s", node->name,
  11.953 +						  node->children + i));
  11.954 +		if (!child) /* child is NULL: print the missing name instead */
  11.955 +			corrupt(conn, "No child '%s' found", node->children+i);
  11.956 +		delete_node(conn, child);
  11.957 +	}
  11.958 +}
  11.959 +
  11.960 +/* Delete memory using memmove. */
  11.961 +static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
  11.962  {
  11.963 -	char *tmppath, *path;
  11.964 +	memmove(mem + off, mem + off + len, total - off - len);
  11.965 +}
  11.966 +
  11.967 +static bool delete_child(struct connection *conn,
  11.968 +			 struct node *node, const char *childname)
  11.969 +{
  11.970 +	unsigned int i;
  11.971  
  11.972 -	node = canonicalize(conn, node);
  11.973 -	if (!check_node_perms(conn, node, XS_PERM_WRITE)) {
  11.974 +	for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
  11.975 +		if (streq(node->children+i, childname)) {
  11.976 +			memdel(node->children, i, strlen(childname) + 1,
  11.977 +			       node->childlen);
  11.978 +			node->childlen -= strlen(childname) + 1;
  11.979 +			return write_node(conn, node);
  11.980 +		}
  11.981 +	}
  11.982 +	corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
  11.983 +}
  11.984 +
  11.985 +static void do_rm(struct connection *conn, const char *name)
  11.986 +{
  11.987 +	struct node *node, *parent;
  11.988 +
  11.989 +	name = canonicalize(conn, name);
  11.990 +	node = get_node(conn, name, XS_PERM_WRITE);
  11.991 +	if (!node) {
  11.992  		/* Didn't exist already?  Fine, if parent exists. */
  11.993  		if (errno == ENOENT) {
  11.994 -			if (node_exists(conn, get_parent(node))) {
  11.995 +			node = read_node(conn, get_parent(name));
  11.996 +			if (node) {
  11.997  				send_ack(conn, XS_RM);
  11.998  				return;
  11.999  			}
 11.1000 @@ -1084,53 +966,43 @@ static void do_rm(struct connection *con
 11.1001  		return;
 11.1002  	}
 11.1003  
 11.1004 -	if (!within_transaction(conn->transaction, node)) {
 11.1005 -		send_error(conn, EROFS);
 11.1006 +	if (streq(name, "/")) {
 11.1007 +		send_error(conn, EINVAL);
 11.1008  		return;
 11.1009  	}
 11.1010  
 11.1011 -	if (transaction_block(conn, node))
 11.1012 -		return;
 11.1013 -
 11.1014 -	if (streq(node, "/")) {
 11.1015 +	/* Delete from parent first, then if something explodes fsck cleans. */
 11.1016 +	parent = read_node(conn, get_parent(name));
 11.1017 +	if (!parent) {
 11.1018  		send_error(conn, EINVAL);
 11.1019  		return;
 11.1020  	}
 11.1021  
 11.1022 -	/* We move the directory to temporary name, destructor cleans up. */
 11.1023 -	path = node_dir(conn->transaction, node);
 11.1024 -	tmppath = talloc_asprintf(node, "%s.tmp", path);
 11.1025 -	talloc_set_destructor(tmppath, destroy_path);
 11.1026 +	if (!delete_child(conn, parent, basename(name))) {
 11.1027 +		send_error(conn, EINVAL);
 11.1028 +		return;
 11.1029 +	}
 11.1030  
 11.1031 -	if (rename(path, tmppath) != 0) {
 11.1032 +	delete_node(conn, node);
 11.1033 +	add_change_node(conn->transaction, name, true);
 11.1034 +	fire_watches(conn, name, true);
 11.1035 +	send_ack(conn, XS_RM);
 11.1036 +}
 11.1037 +
 11.1038 +static void do_get_perms(struct connection *conn, const char *name)
 11.1039 +{
 11.1040 +	struct node *node;
 11.1041 +	char *strings;
 11.1042 +	unsigned int len;
 11.1043 +
 11.1044 +	name = canonicalize(conn, name);
 11.1045 +	node = get_node(conn, name, XS_PERM_READ);
 11.1046 +	if (!node) {
 11.1047  		send_error(conn, errno);
 11.1048  		return;
 11.1049  	}
 11.1050  
 11.1051 -	add_change_node(conn->transaction, node, true);
 11.1052 -	fire_watches(conn, node, true);
 11.1053 -	send_ack(conn, XS_RM);
 11.1054 -}
 11.1055 -
 11.1056 -static void do_get_perms(struct connection *conn, const char *node)
 11.1057 -{
 11.1058 -	struct xs_permissions *perms;
 11.1059 -	char *strings;
 11.1060 -	unsigned int len, num;
 11.1061 -
 11.1062 -	node = canonicalize(conn, node);
 11.1063 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
 11.1064 -		send_error(conn, errno);
 11.1065 -		return;
 11.1066 -	}
 11.1067 -
 11.1068 -	perms = get_perms(node_dir(conn->transaction, node), &num);
 11.1069 -	if (!perms) {
 11.1070 -		send_error(conn, errno);
 11.1071 -		return;
 11.1072 -	}
 11.1073 -
 11.1074 -	strings = perms_to_strings(node, perms, num, &len);
 11.1075 +	strings = perms_to_strings(node, node->perms, node->num_perms, &len);
 11.1076  	if (!strings)
 11.1077  		send_error(conn, errno);
 11.1078  	else
 11.1079 @@ -1140,8 +1012,8 @@ static void do_get_perms(struct connecti
 11.1080  static void do_set_perms(struct connection *conn, struct buffered_data *in)
 11.1081  {
 11.1082  	unsigned int num;
 11.1083 -	char *node, *permstr;
 11.1084 -	struct xs_permissions *perms;
 11.1085 +	char *name, *permstr;
 11.1086 +	struct node *node;
 11.1087  
 11.1088  	num = xs_count_strings(in->buffer, in->used);
 11.1089  	if (num < 2) {
 11.1090 @@ -1150,37 +1022,30 @@ static void do_set_perms(struct connecti
 11.1091  	}
 11.1092  
 11.1093  	/* First arg is node name. */
 11.1094 -	node = canonicalize(conn, in->buffer);
 11.1095 +	name = canonicalize(conn, in->buffer);
 11.1096  	permstr = in->buffer + strlen(in->buffer) + 1;
 11.1097  	num--;
 11.1098  
 11.1099 -	if (!within_transaction(conn->transaction, node)) {
 11.1100 -		send_error(conn, EROFS);
 11.1101 -		return;
 11.1102 -	}
 11.1103 -
 11.1104 -	if (transaction_block(conn, node))
 11.1105 -		return;
 11.1106 -
 11.1107  	/* We must own node to do this (tools can do this too). */
 11.1108 -	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_OWNER)) {
 11.1109 +	node = get_node(conn, name, XS_PERM_WRITE|XS_PERM_OWNER);
 11.1110 +	if (!node) {
 11.1111  		send_error(conn, errno);
 11.1112  		return;
 11.1113  	}
 11.1114  
 11.1115 -	perms = talloc_array(node, struct xs_permissions, num);
 11.1116 -	if (!xs_strings_to_perms(perms, num, permstr)) {
 11.1117 +	node->perms = talloc_array(node, struct xs_permissions, num);
 11.1118 +	node->num_perms = num;
 11.1119 +	if (!xs_strings_to_perms(node->perms, num, permstr)) {
 11.1120 +		send_error(conn, errno);
 11.1121 +		return;
 11.1122 +	}
 11.1123 +	if (!write_node(conn, node)) {
 11.1124  		send_error(conn, errno);
 11.1125  		return;
 11.1126  	}
 11.1127  
 11.1128 -	if (!set_perms(conn->transaction, node, perms, num)) {
 11.1129 -		send_error(conn, errno);
 11.1130 -		return;
 11.1131 -	}
 11.1132 -
 11.1133 -	add_change_node(conn->transaction, node, false);
 11.1134 -	fire_watches(conn, node, false);
 11.1135 +	add_change_node(conn->transaction, name, false);
 11.1136 +	fire_watches(conn, name, false);
 11.1137  	send_ack(conn, XS_SET_PERMS);
 11.1138  }
 11.1139  
 11.1140 @@ -1221,14 +1086,10 @@ static void process_message(struct conne
 11.1141  	case XS_SHUTDOWN:
 11.1142  		/* FIXME: Implement gentle shutdown too. */
 11.1143  		/* Only tools can do this. */
 11.1144 -		if (conn->id != 0) {
 11.1145 +		if (conn->id != 0 || !conn->can_write) {
 11.1146  			send_error(conn, EACCES);
 11.1147  			break;
 11.1148  		}
 11.1149 -		if (!conn->can_write) {
 11.1150 -			send_error(conn, EROFS);
 11.1151 -			break;
 11.1152 -		}
 11.1153  		send_ack(conn, XS_SHUTDOWN);
 11.1154  		/* Everything hangs off auto-free context, freed at exit. */
 11.1155  		exit(0);
 11.1156 @@ -1263,7 +1124,7 @@ static void process_message(struct conne
 11.1157  		break;
 11.1158  
 11.1159  	case XS_TRANSACTION_START:
 11.1160 -		do_transaction_start(conn, onearg(in));
 11.1161 +		do_transaction_start(conn, in);
 11.1162  		break;
 11.1163  
 11.1164  	case XS_TRANSACTION_END:
 11.1165 @@ -1309,6 +1170,8 @@ static void consider_message(struct conn
 11.1166  	/* For simplicity, we kill the connection on OOM. */
 11.1167  	talloc_set_fail_handler(out_of_mem, &talloc_fail);
 11.1168  	if (setjmp(talloc_fail)) {
 11.1169 +		/* Free in before conn, in case it needs something. */
 11.1170 +		talloc_free(in);
 11.1171  		talloc_free(conn);
 11.1172  		goto end;
 11.1173  	}
 11.1174 @@ -1330,16 +1193,8 @@ static void consider_message(struct conn
 11.1175  	conn->in = new_buffer(conn);
 11.1176  	process_message(conn, in);
 11.1177  
 11.1178 -	if (conn->state == BLOCKED) {
 11.1179 -		/* Blocked by transaction: queue for re-xmit. */
 11.1180 -		talloc_free(conn->in);
 11.1181 -		conn->in = in;
 11.1182 -		in = NULL;
 11.1183 -		trace_blocked(conn, conn->in);
 11.1184 -	}
 11.1185 -
 11.1186 +	talloc_free(in);
 11.1187  end:
 11.1188 -	talloc_free(in);
 11.1189  	talloc_set_fail_handler(NULL, NULL);
 11.1190  	if (talloc_total_blocks(NULL)
 11.1191  	    != talloc_total_blocks(talloc_autofree_context()) + 1) {
 11.1192 @@ -1350,7 +1205,7 @@ end:
 11.1193  
 11.1194  /* Errors in reading or allocating here mean we get out of sync, so we
 11.1195   * drop the whole client connection. */
 11.1196 -void handle_input(struct connection *conn)
 11.1197 +static void handle_input(struct connection *conn)
 11.1198  {
 11.1199  	int bytes;
 11.1200  	struct buffered_data *in;
 11.1201 @@ -1402,41 +1257,12 @@ bad_client:
 11.1202  	talloc_free(conn);
 11.1203  }
 11.1204  
 11.1205 -void handle_output(struct connection *conn)
 11.1206 +static void handle_output(struct connection *conn)
 11.1207  {
 11.1208  	if (!write_message(conn))
 11.1209  		talloc_free(conn);
 11.1210  }
 11.1211  
 11.1212 -/* If a transaction has ended, see if we can unblock any connections. */
 11.1213 -static void unblock_connections(void)
 11.1214 -{
 11.1215 -	struct connection *i, *tmp;
 11.1216 -
 11.1217 -	list_for_each_entry_safe(i, tmp, &connections, list) {
 11.1218 -		switch (i->state) {
 11.1219 -		case BLOCKED:
 11.1220 -			if (!transaction_covering_node(i->blocked_by)) {
 11.1221 -				talloc_free(i->blocked_by);
 11.1222 -				i->blocked_by = NULL;
 11.1223 -				i->state = OK;
 11.1224 -				consider_message(i);
 11.1225 -			}
 11.1226 -			break;
 11.1227 -		case BUSY:
 11.1228 -		case OK:
 11.1229 -			break;
 11.1230 -		}
 11.1231 -	}
 11.1232 -
 11.1233 -	/* To balance bias, move first entry to end. */
 11.1234 -	if (!list_empty(&connections)) {
 11.1235 -		i = list_top(&connections, struct connection, list);
 11.1236 -		list_del(&i->list);
 11.1237 -		list_add_tail(&i->list, &connections);
 11.1238 -	}
 11.1239 -}
 11.1240 -
 11.1241  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
 11.1242  {
 11.1243  	/*
 11.1244 @@ -1451,7 +1277,6 @@ struct connection *new_connection(connwr
 11.1245  		return NULL;
 11.1246  
 11.1247  	new->state = OK;
 11.1248 -	new->blocked_by = NULL;
 11.1249  	new->out = new->waiting_reply = NULL;
 11.1250  	new->waiting_for_ack = NULL;
 11.1251  	new->fd = -1;
 11.1252 @@ -1504,25 +1329,9 @@ static void accept_connection(int sock, 
 11.1253  		close(fd);
 11.1254  }
 11.1255  
 11.1256 -/* Calc timespan from now to absolute time. */
 11.1257 -static void time_relative_to_now(struct timeval *tv)
 11.1258 -{
 11.1259 -	struct timeval now;
 11.1260 -
 11.1261 -	gettimeofday(&now, NULL);
 11.1262 -	if (timercmp(&now, tv, >))
 11.1263 -		timerclear(tv);
 11.1264 -	else {
 11.1265 -		tv->tv_sec -= now.tv_sec;
 11.1266 -		if (now.tv_usec > tv->tv_usec) {
 11.1267 -			tv->tv_sec--;
 11.1268 -			tv->tv_usec += 1000000;
 11.1269 -		}
 11.1270 -		tv->tv_usec -= now.tv_usec;
 11.1271 -	}
 11.1272 -}
 11.1273 -
 11.1274  #ifdef TESTING
 11.1275 +/* Valgrind can check our writes better if we don't use mmap */
 11.1276 +#define TDB_FLAGS TDB_NOMMAP
 11.1277  /* Useful for running under debugger. */
 11.1278  void dump_connection(void)
 11.1279  {
 11.1280 @@ -1532,13 +1341,10 @@ void dump_connection(void)
 11.1281  		printf("Connection %p:\n", i);
 11.1282  		printf("    state = %s\n",
 11.1283  		       i->state == OK ? "OK"
 11.1284 -		       : i->state == BLOCKED ? "BLOCKED"
 11.1285  		       : i->state == BUSY ? "BUSY"
 11.1286  		       : "INVALID");
 11.1287  		if (i->id)
 11.1288  			printf("    id = %i\n", i->id);
 11.1289 -		if (i->blocked_by)
 11.1290 -			printf("    blocked on = %s\n", i->blocked_by);
 11.1291  		if (!i->in->inhdr || i->in->used)
 11.1292  			printf("    got %i bytes of %s\n",
 11.1293  			       i->in->used, i->in->inhdr ? "header" : "data");
 11.1294 @@ -1559,44 +1365,53 @@ void dump_connection(void)
 11.1295  		dump_watches(i);
 11.1296  	}
 11.1297  }
 11.1298 +#else
 11.1299 +#define TDB_FLAGS 0
 11.1300  #endif
 11.1301  
 11.1302 +/* We create initial nodes manually: NAME with at most one CHILD entry. */
 11.1303 +static void manual_node(const char *name, const char *child)
 11.1304 +{
 11.1305 +	struct node *node;
 11.1306 +	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 11.1307 +
 11.1308 +	node = talloc(NULL, struct node);
 11.1309 +	if (!node)
 11.1310 +		barf_perror("Could not allocate initial node %s", name);
 11.1311 +	node->name = name;
 11.1312 +	/* perms is on our stack: fine, node is freed before we return. */
 11.1313 +	node->perms = &perms;
 11.1314 +	node->num_perms = 1;
 11.1315 +	node->data = NULL;
 11.1316 +	node->datalen = 0;
 11.1317 +	node->children = (char *)child;
 11.1318 +	node->childlen = child ? strlen(child) + 1 : 0;
 11.1319 +
 11.1320 +	if (!write_node(NULL, node))
 11.1321 +		barf_perror("Could not create initial node %s", name);
 11.1322 +	talloc_free(node);
 11.1323 +}
 11.1324 +
 11.1325 +#
 11.1326 +
 11.1327  static void setup_structure(void)
 11.1328  {
 11.1329 -	struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
 11.1330 -	char *root, *dir, *permfile;
 11.1331 -
 11.1332 -	/* Create root directory, with permissions. */
 11.1333 -	if (mkdir(xs_daemon_store(), 0750) != 0) {
 11.1334 -		if (errno != EEXIST)
 11.1335 -			barf_perror("Could not create root %s",
 11.1336 -				    xs_daemon_store());
 11.1337 -		return;
 11.1338 -	}
 11.1339 -	root = talloc_strdup(talloc_autofree_context(), "/");
 11.1340 -	if (!set_perms(NULL, root, &perms, 1))
 11.1341 -		barf_perror("Could not create permissions in root");
 11.1342 +	char *tdbname;
 11.1343 +	tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
 11.1344 +	tdb_ctx = tdb_open(tdbname, 0, TDB_FLAGS, O_RDWR, 0);
 11.1345  
 11.1346 -	/* Create tool directory, with xenstored subdir. */
 11.1347 -	dir = talloc_asprintf(root, "%s/%s", xs_daemon_store(), "tool");
 11.1348 -	if (mkdir(dir, 0750) != 0)
 11.1349 -		barf_perror("Making dir %s", dir);
 11.1350 -	
 11.1351 -	permfile = talloc_strdup(root, "/tool");
 11.1352 -	if (!set_perms(NULL, permfile, &perms, 1))
 11.1353 -		barf_perror("Could not create permissions on %s", permfile);
 11.1354 +	if (!tdb_ctx) {
 11.1355 +		tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
 11.1356 +				   0640);
 11.1357 +		if (!tdb_ctx)
 11.1358 +			barf_perror("Could not create tdb file %s", tdbname);
 11.1359  
 11.1360 -	dir = talloc_asprintf(root, "%s/%s", dir, "xenstored");
 11.1361 -	if (mkdir(dir, 0750) != 0)
 11.1362 -		barf_perror("Making dir %s", dir);
 11.1363 -	
 11.1364 -	permfile = talloc_strdup(root, "/tool/xenstored");
 11.1365 -	if (!set_perms(NULL, permfile, &perms, 1))
 11.1366 -		barf_perror("Could not create permissions on %s", permfile);
 11.1367 -	talloc_free(root);
 11.1368 -	if (mkdir(xs_daemon_transactions(), 0750) != 0)
 11.1369 -		barf_perror("Could not create transaction dir %s",
 11.1370 -			    xs_daemon_transactions());
 11.1371 +		manual_node("/", "tool");
 11.1372 +		manual_node("/tool", "xenstored");
 11.1373 +		manual_node("/tool/xenstored", NULL);
 11.1374 +	}
 11.1375 +
 11.1376 +	/* FIXME: Fsck */
 11.1377  }
 11.1378  
 11.1379  static void write_pidfile(const char *pidfile)
 11.1380 @@ -1759,17 +1574,8 @@ int main(int argc, char *argv[])
 11.1381  	/* FIXME: Rewrite so noone can starve. */
 11.1382  	for (;;) {
 11.1383  		struct connection *i;
 11.1384 -		struct timeval *tvp = NULL, tv;
 11.1385  
 11.1386 -		timerclear(&tv);
 11.1387 -		shortest_transaction_timeout(&tv);
 11.1388 -		shortest_watch_ack_timeout(&tv);
 11.1389 -		if (timerisset(&tv)) {
 11.1390 -			time_relative_to_now(&tv);
 11.1391 -			tvp = &tv;
 11.1392 -		}
 11.1393 -
 11.1394 -		if (select(max+1, &inset, &outset, NULL, tvp) < 0) {
 11.1395 +		if (select(max+1, &inset, &outset, NULL, NULL) < 0) {
 11.1396  			if (errno == EINTR)
 11.1397  				continue;
 11.1398  			barf_perror("Select failed");
 11.1399 @@ -1818,14 +1624,6 @@ int main(int argc, char *argv[])
 11.1400  			}
 11.1401  		}
 11.1402  
 11.1403 -		if (tvp) {
 11.1404 -			check_transaction_timeout();
 11.1405 -			check_watch_ack_timeout();
 11.1406 -		}
 11.1407 -
 11.1408 -		/* If transactions ended, we might be able to do more work. */
 11.1409 -		unblock_connections();
 11.1410 -
 11.1411  		max = initialize_set(&inset, &outset, *sock, *ro_sock,
 11.1412  				     event_fd);
 11.1413  	}
    12.1 --- a/tools/xenstore/xenstored_core.h	Fri Sep 23 14:24:58 2005 +0100
    12.2 +++ b/tools/xenstore/xenstored_core.h	Fri Sep 23 14:25:01 2005 +0100
    12.3 @@ -28,6 +28,7 @@
    12.4  #include "xs_lib.h"
    12.5  #include "xenstored.h"
    12.6  #include "list.h"
    12.7 +#include "tdb.h"
    12.8  
    12.9  struct buffered_data
   12.10  {
   12.11 @@ -49,8 +50,6 @@ typedef int connreadfn_t(struct connecti
   12.12  
   12.13  enum state
   12.14  {
   12.15 -	/* Blocked by transaction. */
   12.16 -	BLOCKED,
   12.17  	/* Doing action, not listening */
   12.18  	BUSY,
   12.19  	/* Completed */
   12.20 @@ -70,9 +69,6 @@ struct connection
   12.21  	/* Blocked on transaction?  Busy? */
   12.22  	enum state state;
   12.23  
   12.24 -	/* Node we are waiting for (if state == BLOCKED) */
   12.25 -	char *blocked_by;
   12.26 -
   12.27  	/* Is this a read-only connection? */
   12.28  	bool can_write;
   12.29  
   12.30 @@ -103,9 +99,27 @@ struct connection
   12.31  };
   12.32  extern struct list_head connections;
   12.33  
   12.34 -/* Return length of string (including nul) at this offset. */
   12.35 -unsigned int get_string(const struct buffered_data *data,
   12.36 -			unsigned int offset);
   12.37 +struct node {
   12.38 +	const char *name;
   12.39 +
   12.40 +	/* Database I came from */
   12.41 +	TDB_CONTEXT *tdb;
   12.42 +
   12.43 +	/* Parent (optional) */
   12.44 +	struct node *parent;
   12.45 +
   12.46 +	/* Permissions. */
   12.47 +	unsigned int num_perms;
   12.48 +	struct xs_permissions *perms;
   12.49 +
   12.50 +	/* Contents. */
   12.51 +	unsigned int datalen;
   12.52 +	void *data;
   12.53 +
   12.54 +	/* Children, each nul-terminated. */
   12.55 +	unsigned int childlen;
   12.56 +	char *children;
   12.57 +};
   12.58  
   12.59  /* Break input into vectors, return the number, fill in up to num of them. */
   12.60  unsigned int get_strings(struct buffered_data *data,
   12.61 @@ -114,9 +128,6 @@ unsigned int get_strings(struct buffered
   12.62  /* Is child node a child or equal to parent node? */
   12.63  bool is_child(const char *child, const char *parent);
   12.64  
   12.65 -/* Create a new buffer with lifetime of context. */
   12.66 -struct buffered_data *new_buffer(void *ctx);
   12.67 -
   12.68  void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
   12.69  		const void *data, unsigned int len);
   12.70  
   12.71 @@ -129,15 +140,22 @@ void send_error(struct connection *conn,
   12.72  /* Canonicalize this path if possible. */
   12.73  char *canonicalize(struct connection *conn, const char *node);
   12.74  
   12.75 -/* Check permissions on this node. */
   12.76 -bool check_node_perms(struct connection *conn, const char *node,
   12.77 -		      enum xs_perm_type perm);
   12.78 -
   12.79  /* Check if node is an event node. */
   12.80  bool check_event_node(const char *node);
   12.81  
   12.82 -/* Path to this node outside transaction. */
   12.83 -char *node_dir_outside_transaction(const char *node);
   12.84 +/* Get this node, checking we have permissions. */
   12.85 +struct node *get_node(struct connection *conn,
   12.86 +		      const char *name,
   12.87 +		      enum xs_perm_type perm);
   12.88 +
   12.89 +/* Get TDB context for this connection */
   12.90 +TDB_CONTEXT *tdb_context(struct connection *conn);
   12.91 +
   12.92 +/* Destructor for tdbs: required for transaction code */
   12.93 +int destroy_tdb(void *_tdb);
   12.94 +
   12.95 +/* Replace the tdb: required for transaction code */
   12.96 +bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
   12.97  
   12.98  /* Fail due to excessive corruption, capitalist pigdogs! */
   12.99  void __attribute__((noreturn)) corrupt(struct connection *conn,
  12.100 @@ -145,23 +163,9 @@ void __attribute__((noreturn)) corrupt(s
  12.101  
  12.102  struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
  12.103  
  12.104 -void handle_input(struct connection *conn);
  12.105 -void handle_output(struct connection *conn);
  12.106 -
  12.107  /* Is this a valid node name? */
  12.108  bool is_valid_nodename(const char *node);
  12.109  
  12.110 -/* Return a pointer to an open dir, self-closig and attached to pathname. */
  12.111 -DIR **talloc_opendir(const char *pathname);
  12.112 -
  12.113 -/* Return a pointer to an fd, self-closing and attached to this pathname. */
  12.114 -int *talloc_open(const char *pathname, int flags, int mode);
  12.115 -
  12.116 -/* Convenient talloc-style destructor for paths. */
  12.117 -int destroy_path(void *path);
  12.118 -
  12.119 -/* Read entire contents of a talloced fd. */
  12.120 -void *read_all(int *fd, unsigned int *size);
  12.121  
  12.122  /* Tracing infrastructure. */
  12.123  void trace_create(const void *data, const char *type);
    13.1 --- a/tools/xenstore/xenstored_domain.c	Fri Sep 23 14:24:58 2005 +0100
    13.2 +++ b/tools/xenstore/xenstored_domain.c	Fri Sep 23 14:25:01 2005 +0100
    13.3 @@ -309,16 +309,11 @@ void do_introduce(struct connection *con
    13.4  		return;
    13.5  	}
    13.6  
    13.7 -	if (conn->id != 0) {
    13.8 +	if (conn->id != 0 || !conn->can_write) {
    13.9  		send_error(conn, EACCES);
   13.10  		return;
   13.11  	}
   13.12  
   13.13 -	if (!conn->can_write) {
   13.14 -		send_error(conn, EROFS);
   13.15 -		return;
   13.16 -	}
   13.17 -
   13.18  	/* Sanity check args. */
   13.19  	if ((atoi(vec[2]) <= 0) || !is_valid_nodename(vec[3])) {
   13.20  		send_error(conn, EINVAL);
   13.21 @@ -386,7 +381,7 @@ void do_release(struct connection *conn,
   13.22  
   13.23  	talloc_free(domain->conn);
   13.24  
   13.25 -	fire_watches(NULL, "@releaseDomain", false);
   13.26 +	fire_watches(conn, "@releaseDomain", false);
   13.27  
   13.28  	send_ack(conn, XS_RELEASE);
   13.29  }
    14.1 --- a/tools/xenstore/xenstored_transaction.c	Fri Sep 23 14:24:58 2005 +0100
    14.2 +++ b/tools/xenstore/xenstored_transaction.c	Fri Sep 23 14:25:01 2005 +0100
    14.3 @@ -26,6 +26,7 @@
    14.4  #include <stdarg.h>
    14.5  #include <stdlib.h>
    14.6  #include <fcntl.h>
    14.7 +#include <unistd.h>
    14.8  #include "talloc.h"
    14.9  #include "list.h"
   14.10  #include "xenstored_transaction.h"
   14.11 @@ -51,74 +52,26 @@ struct transaction
   14.12  	/* Global list of transactions. */
   14.13  	struct list_head list;
   14.14  
   14.15 +	/* Generation when transaction started. */
   14.16 +	unsigned int generation;
   14.17 +
   14.18  	/* My owner (conn->transaction == me). */
   14.19  	struct connection *conn;
   14.20  
   14.21 -	/* Subtree this transaction covers */
   14.22 -	char *node;
   14.23 -
   14.24 -	/* Base for this transaction. */
   14.25 -	char *divert;
   14.26 +	/* TDB to work on, and filename */
   14.27 +	TDB_CONTEXT *tdb;
   14.28 +	char *tdb_name;
   14.29  
   14.30  	/* List of changed nodes. */
   14.31  	struct list_head changes;
   14.32 -
   14.33 -	/* Someone's waiting: time limit. */
   14.34 -	struct timeval timeout;
   14.35 -
   14.36 -	/* We've timed out. */
   14.37 -	bool destined_to_fail;
   14.38  };
   14.39  static LIST_HEAD(transactions);
   14.40 -
   14.41 -bool within_transaction(struct transaction *trans, const char *node)
   14.42 -{
   14.43 -	if (!trans)
   14.44 -		return true;
   14.45 -	return is_child(node, trans->node);
   14.46 -}
   14.47 -
   14.48 -/* You are on notice: this transaction is blocking someone. */
   14.49 -static void start_transaction_timeout(struct transaction *trans)
   14.50 -{
   14.51 -	if (timerisset(&trans->timeout))
   14.52 -		return;
   14.53 -
   14.54 -	/* One second timeout. */
   14.55 -	gettimeofday(&trans->timeout, NULL);
   14.56 -	trans->timeout.tv_sec += 1;
   14.57 -}
   14.58 -
   14.59 -struct transaction *transaction_covering_node(const char *node)
   14.60 -{
   14.61 -	struct transaction *i;
   14.62 +static unsigned int generation;
   14.63  
   14.64 -	list_for_each_entry(i, &transactions, list) {
   14.65 -		if (i->destined_to_fail)
   14.66 -			continue;
   14.67 -		if (is_child(i->node, node) || is_child(node, i->node))
   14.68 -			return i;
   14.69 -	}
   14.70 -	return NULL;
   14.71 -}
   14.72 -
   14.73 -bool transaction_block(struct connection *conn, const char *node)
   14.74 +/* Return tdb context to use for this connection. */
   14.75 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans)
   14.76  {
   14.77 -	struct transaction *trans;
   14.78 -
   14.79 -	/* Transactions don't overlap, so we can't be blocked by
   14.80 -	 * others if we're in one. */
   14.81 -	if (conn->transaction)
   14.82 -		return false;
   14.83 -
   14.84 -	trans = transaction_covering_node(node);
   14.85 -	if (trans) {
   14.86 -		start_transaction_timeout(trans);
   14.87 -		conn->state = BLOCKED;
   14.88 -		conn->blocked_by = talloc_strdup(conn, node);
   14.89 -		return true;
   14.90 -	}
   14.91 -	return false;
   14.92 +	return trans->tdb;
   14.93  }
   14.94  
   14.95  /* Callers get a change node (which can fail) and only commit after they've
   14.96 @@ -127,8 +80,11 @@ void add_change_node(struct transaction 
   14.97  {
   14.98  	struct changed_node *i;
   14.99  
  14.100 -	if (!trans)
  14.101 +	if (!trans) {
  14.102 +		/* They're changing the global database. */
  14.103 +		generation++;
  14.104  		return;
  14.105 +	}
  14.106  
  14.107  	list_for_each_entry(i, &trans->changes, list)
  14.108  		if (streq(i->node, node))
  14.109 @@ -140,167 +96,47 @@ void add_change_node(struct transaction 
  14.110  	list_add_tail(&i->list, &trans->changes);
  14.111  }
  14.112  
  14.113 -char *node_dir_inside_transaction(struct transaction *trans, const char *node)
  14.114 -{
  14.115 -	return talloc_asprintf(node, "%s/%s", trans->divert,
  14.116 -			       node + strlen(trans->node));
  14.117 -}
  14.118 -
  14.119 -void shortest_transaction_timeout(struct timeval *tv)
  14.120 -{
  14.121 -	struct transaction *i;
  14.122 -
  14.123 -	list_for_each_entry(i, &transactions, list) {
  14.124 -		if (!timerisset(&i->timeout))
  14.125 -			continue;
  14.126 -
  14.127 -		if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
  14.128 -			*tv = i->timeout;
  14.129 -	}
  14.130 -}	
  14.131 -
  14.132 -void check_transaction_timeout(void)
  14.133 -{
  14.134 -	struct transaction *i;
  14.135 -	struct timeval now;
  14.136 -
  14.137 -	gettimeofday(&now, NULL);
  14.138 -
  14.139 -	list_for_each_entry(i, &transactions, list) {
  14.140 -		if (!timerisset(&i->timeout))
  14.141 -			continue;
  14.142 -
  14.143 -		if (timercmp(&i->timeout, &now, <))
  14.144 -			i->destined_to_fail = true;
  14.145 -	}
  14.146 -}
  14.147 -
  14.148  static int destroy_transaction(void *_transaction)
  14.149  {
  14.150  	struct transaction *trans = _transaction;
  14.151  
  14.152  	list_del(&trans->list);
  14.153  	trace_destroy(trans, "transaction");
  14.154 -	return destroy_path(trans->divert);
  14.155 -}
  14.156 -
  14.157 -static bool copy_file(const char *src, const char *dst)
  14.158 -{
  14.159 -	int *infd, *outfd;
  14.160 -	void *data;
  14.161 -	unsigned int size;
  14.162 -
  14.163 -	infd = talloc_open(src, O_RDONLY, 0);
  14.164 -	if (!infd)
  14.165 -		return false;
  14.166 -	outfd = talloc_open(dst, O_WRONLY|O_CREAT|O_EXCL, 0640);
  14.167 -	if (!outfd)
  14.168 -		return false;
  14.169 -	data = read_all(infd, &size);
  14.170 -	if (!data)
  14.171 -		return false;
  14.172 -	return xs_write_all(*outfd, data, size);
  14.173 +	if (trans->tdb)
  14.174 +		tdb_close(trans->tdb);
  14.175 +	unlink(trans->tdb_name);
  14.176 +	return 0;
  14.177  }
  14.178  
  14.179 -static bool copy_dir(const char *src, const char *dst)
  14.180 +void do_transaction_start(struct connection *conn, struct buffered_data *in)
  14.181  {
  14.182 -	DIR **dir;
  14.183 -	struct dirent *dirent;
  14.184 -
  14.185 -	if (mkdir(dst, 0750) != 0)
  14.186 -		return false;
  14.187 -
  14.188 -	dir = talloc_opendir(src);
  14.189 -	if (!dir)
  14.190 -		return false;
  14.191 -
  14.192 -	while ((dirent = readdir(*dir)) != NULL) {
  14.193 -		struct stat st;
  14.194 -		char *newsrc, *newdst;
  14.195 -
  14.196 -		if (streq(dirent->d_name, ".") || streq(dirent->d_name, ".."))
  14.197 -			continue;
  14.198 -
  14.199 -		newsrc = talloc_asprintf(src, "%s/%s", src, dirent->d_name);
  14.200 -		newdst = talloc_asprintf(src, "%s/%s", dst, dirent->d_name);
  14.201 -		if (stat(newsrc, &st) != 0)
  14.202 -			return false;
  14.203 -		
  14.204 -		if (S_ISDIR(st.st_mode)) {
  14.205 -			if (!copy_dir(newsrc, newdst))
  14.206 -				return false;
  14.207 -		} else {
  14.208 -			if (!copy_file(newsrc, newdst))
  14.209 -				return false;
  14.210 -		}
  14.211 -		/* Free now so we don't run out of file descriptors */
  14.212 -		talloc_free(newsrc);
  14.213 -		talloc_free(newdst);
  14.214 -	}
  14.215 -	return true;
  14.216 -}
  14.217 -
  14.218 -void do_transaction_start(struct connection *conn, const char *node)
  14.219 -{
  14.220 -	struct transaction *transaction;
  14.221 -	char *dir;
  14.222 +	struct transaction *trans;
  14.223  
  14.224  	if (conn->transaction) {
  14.225  		send_error(conn, EBUSY);
  14.226  		return;
  14.227  	}
  14.228  
  14.229 -	node = canonicalize(conn, node);
  14.230 -	if (!check_node_perms(conn, node, XS_PERM_READ)) {
  14.231 +	/* Attach transaction to input for autofree until it's complete */
  14.232 +	trans = talloc(in, struct transaction);
  14.233 +	INIT_LIST_HEAD(&trans->changes);
  14.234 +	trans->conn = conn;
  14.235 +	trans->generation = generation;
  14.236 +	trans->tdb_name = talloc_asprintf(trans, "%s.%p",
  14.237 +					  xs_daemon_tdb(), trans);
  14.238 +	trans->tdb = tdb_copy(tdb_context(conn), trans->tdb_name);
  14.239 +	if (!trans->tdb) {
  14.240  		send_error(conn, errno);
  14.241  		return;
  14.242  	}
  14.243 -
  14.244 -	if (transaction_block(conn, node))
  14.245 -		return;
  14.246 -
  14.247 -	dir = node_dir_outside_transaction(node);
  14.248 -
  14.249 -	/* Attach transaction to node for autofree until it's complete */
  14.250 -	transaction = talloc(node, struct transaction);
  14.251 -	transaction->node = talloc_strdup(transaction, node);
  14.252 -	transaction->divert = talloc_asprintf(transaction, "%s/%p", 
  14.253 -					      xs_daemon_transactions(),
  14.254 -					      transaction);
  14.255 -	INIT_LIST_HEAD(&transaction->changes);
  14.256 -	transaction->conn = conn;
  14.257 -	timerclear(&transaction->timeout);
  14.258 -	transaction->destined_to_fail = false;
  14.259 -	list_add_tail(&transaction->list, &transactions);
  14.260 -	talloc_set_destructor(transaction, destroy_transaction);
  14.261 -	trace_create(transaction, "transaction");
  14.262 +	/* Make it close if we go away. */
  14.263 +	talloc_steal(trans, trans->tdb);
  14.264  
  14.265 -	if (!copy_dir(dir, transaction->divert)) {
  14.266 -		send_error(conn, errno);
  14.267 -		return;
  14.268 -	}
  14.269 -
  14.270 -	talloc_steal(conn, transaction);
  14.271 -	conn->transaction = transaction;
  14.272 -	send_ack(transaction->conn, XS_TRANSACTION_START);
  14.273 -}
  14.274 -
  14.275 -static bool commit_transaction(struct transaction *trans)
  14.276 -{
  14.277 -	char *tmp, *dir;
  14.278 -
  14.279 -	/* Move: orig -> .old, repl -> orig.  Cleanup deletes .old. */
  14.280 -	dir = node_dir_outside_transaction(trans->node);
  14.281 -	tmp = talloc_asprintf(trans, "%s.old", dir);
  14.282 -
  14.283 -	if (rename(dir, tmp) != 0)
  14.284 -		return false;
  14.285 -	if (rename(trans->divert, dir) != 0)
  14.286 -		corrupt(trans->conn, "Failed rename %s to %s",
  14.287 -			trans->divert, dir);
  14.288 -
  14.289 -	trans->divert = tmp;
  14.290 -	return true;
  14.291 +	/* Now we own it. */
  14.292 +	conn->transaction = talloc_steal(conn, trans);
  14.293 +	list_add_tail(&trans->list, &transactions);
  14.294 +	talloc_set_destructor(trans, destroy_transaction);
  14.295 +	send_ack(conn, XS_TRANSACTION_START);
  14.296  }
  14.297  
  14.298  void do_transaction_end(struct connection *conn, const char *arg)
  14.299 @@ -318,25 +154,29 @@ void do_transaction_end(struct connectio
  14.300  		return;
  14.301  	}
  14.302  
  14.303 -	/* Set to NULL so fire_watches sends events. */
  14.304 +	/* Set to NULL so fire_watches sends events and tdb_context works. */
  14.305  	trans = conn->transaction;
  14.306  	conn->transaction = NULL;
  14.307  	/* Attach transaction to arg for auto-cleanup */
  14.308  	talloc_steal(arg, trans);
  14.309  
  14.310  	if (streq(arg, "T")) {
  14.311 -		if (trans->destined_to_fail) {
  14.312 -			send_error(conn, ETIMEDOUT);
  14.313 +		/* FIXME: Merge, rather than failing on any change. */
  14.314 +		if (trans->generation != generation) {
  14.315 +			send_error(conn, EAGAIN);
  14.316  			return;
  14.317  		}
  14.318 -		if (!commit_transaction(trans)) {
  14.319 +		if (!replace_tdb(trans->tdb_name, trans->tdb)) {
  14.320  			send_error(conn, errno);
  14.321  			return;
  14.322  		}
  14.323 +		/* Don't close this: we won! */
  14.324 +		trans->tdb = NULL;
  14.325  
  14.326  		/* Fire off the watches for everything that changed. */
  14.327  		list_for_each_entry(i, &trans->changes, list)
  14.328  			fire_watches(conn, i->node, i->recurse);
  14.329 +		generation++;
  14.330  	}
  14.331  	send_ack(conn, XS_TRANSACTION_END);
  14.332  }
    15.1 --- a/tools/xenstore/xenstored_transaction.h	Fri Sep 23 14:24:58 2005 +0100
    15.2 +++ b/tools/xenstore/xenstored_transaction.h	Fri Sep 23 14:25:01 2005 +0100
    15.3 @@ -22,29 +22,14 @@
    15.4  
    15.5  struct transaction;
    15.6  
    15.7 -void do_transaction_start(struct connection *conn, const char *node);
    15.8 +void do_transaction_start(struct connection *conn, struct buffered_data *node);
    15.9  void do_transaction_end(struct connection *conn, const char *arg);
   15.10  
   15.11 -/* Is node covered by this transaction? */
   15.12 -bool within_transaction(struct transaction *trans, const char *node);
   15.13 -
   15.14 -/* If a write op on this node blocked by another connections' transaction,
   15.15 - * mark conn, setup transaction timeout and return true.
   15.16 - */
   15.17 -bool transaction_block(struct connection *conn, const char *node);
   15.18 -
   15.19 -/* Return transaction which covers this node. */
   15.20 -struct transaction *transaction_covering_node(const char *node);
   15.21 -
   15.22 -/* Return directory of node within transaction t. */
   15.23 -char *node_dir_inside_transaction(struct transaction *t, const char *node);
   15.24 +bool transaction_block(struct connection *conn);
   15.25  
   15.26  /* This node was changed: can fail and longjmp. */
   15.27  void add_change_node(struct transaction *trans, const char *node, bool recurse);
   15.28  
   15.29 -/* Get shortest timeout: leave tv unset if none. */
   15.30 -void shortest_transaction_timeout(struct timeval *tv);
   15.31 -
   15.32 -/* Have any transactions timed out yet? */
   15.33 -void check_transaction_timeout(void);
   15.34 +/* Return tdb context to use for this connection. */
   15.35 +TDB_CONTEXT *tdb_transaction_context(struct transaction *trans);
   15.36  #endif /* _XENSTORED_TRANSACTION_H */
    16.1 --- a/tools/xenstore/xenstored_watch.c	Fri Sep 23 14:24:58 2005 +0100
    16.2 +++ b/tools/xenstore/xenstored_watch.c	Fri Sep 23 14:25:01 2005 +0100
    16.3 @@ -96,36 +96,38 @@ static int destroy_watch_event(void *_ev
    16.4  }
    16.5  
    16.6  static void add_event(struct connection *conn,
    16.7 -		      struct watch *watch, const char *node)
    16.8 +		      struct watch *watch,
    16.9 +		      const char *name)
   16.10  {
   16.11  	struct watch_event *event;
   16.12  
   16.13 -	/* Check read permission: no permission, no watch event.
   16.14 -	 * If it doesn't exist, we need permission to read parent.
   16.15 -	 */
   16.16 -	if (!check_node_perms(conn, node, XS_PERM_READ|XS_PERM_ENOENT_OK) &&
   16.17 -	    !check_event_node(node)) {
   16.18 -		return;
   16.19 +	if (!check_event_node(name)) {
   16.20 +		/* Can this conn load node, or see that it doesn't exist? */
   16.21 +		struct node *node;
   16.22 +
   16.23 +		node = get_node(conn, name, XS_PERM_READ);
   16.24 +		if (!node && errno != ENOENT)
   16.25 +			return;
   16.26  	}
   16.27  
   16.28  	if (watch->relative_path) {
   16.29 -		node += strlen(watch->relative_path);
   16.30 -		if (*node == '/') /* Could be "" */
   16.31 -			node++;
   16.32 +		name += strlen(watch->relative_path);
   16.33 +		if (*name == '/') /* Could be "" */
   16.34 +			name++;
   16.35  	}
   16.36  
   16.37  	event = talloc(watch, struct watch_event);
   16.38 -	event->len = strlen(node) + 1 + strlen(watch->token) + 1;
   16.39 +	event->len = strlen(name) + 1 + strlen(watch->token) + 1;
   16.40  	event->data = talloc_array(event, char, event->len);
   16.41 -	strcpy(event->data, node);
   16.42 -	strcpy(event->data + strlen(node) + 1, watch->token);
   16.43 +	strcpy(event->data, name);
   16.44 +	strcpy(event->data + strlen(name) + 1, watch->token);
   16.45  	talloc_set_destructor(event, destroy_watch_event);
   16.46  	list_add_tail(&event->list, &watch->events);
   16.47  	trace_create(event, "watch_event");
   16.48  }
   16.49  
   16.50  /* FIXME: we fail to fire on out of memory.  Should drop connections. */
   16.51 -void fire_watches(struct connection *conn, const char *node, bool recurse)
   16.52 +void fire_watches(struct connection *conn, const char *name, bool recurse)
   16.53  {
   16.54  	struct connection *i;
   16.55  	struct watch *watch;
   16.56 @@ -137,9 +139,9 @@ void fire_watches(struct connection *con
   16.57  	/* Create an event for each watch. */
   16.58  	list_for_each_entry(i, &connections, list) {
   16.59  		list_for_each_entry(watch, &i->watches, list) {
   16.60 -			if (is_child(node, watch->node))
   16.61 -				add_event(i, watch, node);
   16.62 -			else if (recurse && is_child(watch->node, node))
   16.63 +			if (is_child(name, watch->node))
   16.64 +				add_event(i, watch, name);
   16.65 +			else if (recurse && is_child(watch->node, name))
   16.66  				add_event(i, watch, watch->node);
   16.67  			else
   16.68  				continue;
   16.69 @@ -156,49 +158,6 @@ static int destroy_watch(void *_watch)
   16.70  	return 0;
   16.71  }
   16.72  
   16.73 -void shortest_watch_ack_timeout(struct timeval *tv)
   16.74 -{
   16.75 -	(void)tv;
   16.76 -#if 0 /* FIXME */
   16.77 -	struct watch *watch;
   16.78 -
   16.79 -	list_for_each_entry(watch, &watches, list) {
   16.80 -		struct watch_event *i;
   16.81 -		list_for_each_entry(i, &watch->events, list) {
   16.82 -			if (!timerisset(&i->timeout))
   16.83 -				continue;
   16.84 -			if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
   16.85 -				*tv = i->timeout;
   16.86 -		}
   16.87 -	}
   16.88 -#endif
   16.89 -}	
   16.90 -
   16.91 -void check_watch_ack_timeout(void)
   16.92 -{
   16.93 -#if 0
   16.94 -	struct watch *watch;
   16.95 -	struct timeval now;
   16.96 -
   16.97 -	gettimeofday(&now, NULL);
   16.98 -	list_for_each_entry(watch, &watches, list) {
   16.99 -		struct watch_event *i, *tmp;
  16.100 -		list_for_each_entry_safe(i, tmp, &watch->events, list) {
  16.101 -			if (!timerisset(&i->timeout))
  16.102 -				continue;
  16.103 -			if (timercmp(&i->timeout, &now, <)) {
  16.104 -				xprintf("Warning: timeout on watch event %s"
  16.105 -					" token %s\n",
  16.106 -					i->node, watch->token);
  16.107 -				trace_watch_timeout(watch->conn, i->node,
  16.108 -						    watch->token);
  16.109 -				timerclear(&i->timeout);
  16.110 -			}
  16.111 -		}
  16.112 -	}
  16.113 -#endif
  16.114 -}
  16.115 -
  16.116  void do_watch(struct connection *conn, struct buffered_data *in)
  16.117  {
  16.118  	struct watch *watch;
    17.1 --- a/tools/xenstore/xenstored_watch.h	Fri Sep 23 14:24:58 2005 +0100
    17.2 +++ b/tools/xenstore/xenstored_watch.h	Fri Sep 23 14:25:01 2005 +0100
    17.3 @@ -32,15 +32,9 @@ bool is_watch_event(struct connection *c
    17.4  /* Look through our watches: if any of them have an event, queue it. */
    17.5  void queue_next_event(struct connection *conn);
    17.6  
    17.7 -/* Fire all watches: recurse means all the children are effected (ie. rm).
    17.8 +/* Fire all watches: recurse means all the children are affected (ie. rm).
    17.9   */
   17.10 -void fire_watches(struct connection *conn, const char *node, bool recurse);
   17.11 -
   17.12 -/* Find shortest timeout: if any, reduce tv (may already be set). */
   17.13 -void shortest_watch_ack_timeout(struct timeval *tv);
   17.14 -
   17.15 -/* Check for watches which may have timed out. */
   17.16 -void check_watch_ack_timeout(void);
   17.17 +void fire_watches(struct connection *conn, const char *name, bool recurse);
   17.18  
   17.19  void dump_watches(struct connection *conn);
   17.20  
    18.1 --- a/tools/xenstore/xs.c	Fri Sep 23 14:24:58 2005 +0100
    18.2 +++ b/tools/xenstore/xs.c	Fri Sep 23 14:25:01 2005 +0100
    18.3 @@ -497,13 +497,12 @@ bool xs_unwatch(struct xs_handle *h, con
    18.4  
    18.5  /* Start a transaction: changes by others will not be seen during this
    18.6   * transaction, and changes will not be visible to others until end.
    18.7 - * Transaction only applies to the given subtree.
    18.8   * You can only have one transaction at any time.
    18.9   * Returns false on failure.
   18.10   */
   18.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree)
   18.12 +bool xs_transaction_start(struct xs_handle *h)
   18.13  {
   18.14 -	return xs_bool(xs_single(h, XS_TRANSACTION_START, subtree, NULL));
   18.15 +	return xs_bool(xs_single(h, XS_TRANSACTION_START, "", NULL));
   18.16  }
   18.17  
   18.18  /* End a transaction.
    19.1 --- a/tools/xenstore/xs.h	Fri Sep 23 14:24:58 2005 +0100
    19.2 +++ b/tools/xenstore/xs.h	Fri Sep 23 14:25:01 2005 +0100
    19.3 @@ -109,11 +109,10 @@ bool xs_unwatch(struct xs_handle *h, con
    19.4  
    19.5  /* Start a transaction: changes by others will not be seen during this
    19.6   * transaction, and changes will not be visible to others until end.
    19.7 - * Transaction only applies to the given subtree.
    19.8   * You can only have one transaction at any time.
    19.9   * Returns false on failure.
   19.10   */
   19.11 -bool xs_transaction_start(struct xs_handle *h, const char *subtree);
   19.12 +bool xs_transaction_start(struct xs_handle *h);
   19.13  
   19.14  /* End a transaction.
   19.15   * If abandon is true, transaction is discarded instead of committed.
    20.1 --- a/tools/xenstore/xs_lib.c	Fri Sep 23 14:24:58 2005 +0100
    20.2 +++ b/tools/xenstore/xs_lib.c	Fri Sep 23 14:25:01 2005 +0100
    20.3 @@ -50,6 +50,13 @@ static const char *xs_daemon_path(void)
    20.4  	return buf;
    20.5  }
    20.6  
    20.7 +const char *xs_daemon_tdb(void)
    20.8 +{
    20.9 +	static char buf[PATH_MAX];
   20.10 +	sprintf(buf, "%s/tdb", xs_daemon_rootdir());
   20.11 +	return buf;
   20.12 +}
   20.13 +
   20.14  const char *xs_daemon_socket(void)
   20.15  {
   20.16  	return xs_daemon_path();
   20.17 @@ -66,24 +73,6 @@ const char *xs_daemon_socket_ro(void)
   20.18  	return buf;
   20.19  }
   20.20  
   20.21 -const char *xs_daemon_store(void)
   20.22 -{
   20.23 -	static char buf[PATH_MAX];
   20.24 -	if (snprintf(buf, PATH_MAX, "%s/store",
   20.25 -		     xs_daemon_rootdir()) >= PATH_MAX)
   20.26 -		return NULL;
   20.27 -	return buf;
   20.28 -}
   20.29 -
   20.30 -const char *xs_daemon_transactions(void)
   20.31 -{
   20.32 -	static char buf[PATH_MAX];
   20.33 -	if (snprintf(buf, PATH_MAX, "%s/transactions",
   20.34 -		     xs_daemon_rootdir()) >= PATH_MAX)
   20.35 -		return NULL;
   20.36 -	return buf;
   20.37 -}
   20.38 -
   20.39  const char *xs_domain_dev(void)
   20.40  {
   20.41  	char *s = getenv("XENSTORED_PATH");
    21.1 --- a/tools/xenstore/xs_lib.h	Fri Sep 23 14:24:58 2005 +0100
    21.2 +++ b/tools/xenstore/xs_lib.h	Fri Sep 23 14:25:01 2005 +0100
    21.3 @@ -36,7 +36,7 @@ enum xs_perm_type {
    21.4  
    21.5  struct xs_permissions
    21.6  {
    21.7 -	domid_t id;
    21.8 +	unsigned int id;
    21.9  	enum xs_perm_type perms;
   21.10  };
   21.11  
   21.12 @@ -46,9 +46,8 @@ struct xs_permissions
   21.13  /* Path for various daemon things: env vars can override. */
   21.14  const char *xs_daemon_socket(void);
   21.15  const char *xs_daemon_socket_ro(void);
   21.16 -const char *xs_daemon_store(void);
   21.17 -const char *xs_daemon_transactions(void);
   21.18  const char *xs_domain_dev(void);
   21.19 +const char *xs_daemon_tdb(void);
   21.20  
   21.21  /* Simple write function: loops for you. */
   21.22  bool xs_write_all(int fd, const void *data, unsigned int len);
    22.1 --- a/tools/xenstore/xs_random.c	Fri Sep 23 14:24:58 2005 +0100
    22.2 +++ b/tools/xenstore/xs_random.c	Fri Sep 23 14:25:01 2005 +0100
    22.3 @@ -41,7 +41,7 @@ struct ops
    22.4  			  struct xs_permissions *perms,
    22.5  			  unsigned int num);
    22.6  
    22.7 -	bool (*transaction_start)(void *h, const char *subtree);
    22.8 +	bool (*transaction_start)(void *h);
    22.9  	bool (*transaction_end)(void *h, bool abort);
   22.10  
   22.11  	/* Create and destroy a new handle. */
   22.12 @@ -53,7 +53,6 @@ struct file_ops_info
   22.13  {
   22.14  	const char *base;
   22.15  	char *transact_base;
   22.16 -	char *transact;
   22.17  };
   22.18  
   22.19  static void convert_to_dir(const char *dirname)
   22.20 @@ -96,31 +95,6 @@ static char *path_to_name(struct file_op
   22.21  	return filename;
   22.22  }
   22.23  
   22.24 -/* Is child a subnode of parent, or equal? */
   22.25 -static bool is_child(const char *child, const char *parent)
   22.26 -{
   22.27 -	unsigned int len = strlen(parent);
   22.28 -
   22.29 -	/* / should really be "" for this algorithm to work, but that's a
   22.30 -	 * usability nightmare. */
   22.31 -	if (streq(parent, "/"))
   22.32 -		return true;
   22.33 -
   22.34 -	if (strncmp(child, parent, len) != 0)
   22.35 -		return false;
   22.36 -
   22.37 -	return child[len] == '/' || child[len] == '\0';
   22.38 -}
   22.39 -
   22.40 -static bool write_ok(struct file_ops_info *info, const char *path)
   22.41 -{
   22.42 -	if (info->transact && !is_child(path, info->transact)) {
   22.43 -		errno = EROFS;
   22.44 -		return false;
   22.45 -	}
   22.46 -	return true;
   22.47 -}	
   22.48 -
   22.49  static char **file_directory(struct file_ops_info *info,
   22.50  			     const char *path, unsigned int *num)
   22.51  {
   22.52 @@ -184,8 +158,10 @@ static void *file_read(struct file_ops_i
   22.53  
   22.54  	ret = grab_file(filename, &size);
   22.55  	/* Directory exists, .DATA doesn't. */
   22.56 -	if (!ret && errno == ENOENT && strends(filename, ".DATA"))
   22.57 -		errno = EISDIR;
   22.58 +	if (!ret && errno == ENOENT && strends(filename, ".DATA")) {
   22.59 +		ret = strdup("");
   22.60 +		size = 0;
   22.61 +	}
   22.62  	*len = size;
   22.63  	return ret;
   22.64  }
   22.65 @@ -270,9 +246,6 @@ static bool file_set_perms(struct file_o
   22.66  		return false;
   22.67  	}
   22.68  
   22.69 -	if (!write_ok(info, path))
   22.70 -		return false;
   22.71 -
   22.72  	/* Check non-perm file exists/ */
   22.73  	if (lstat(filename, &st) != 0)
   22.74  		return false;
   22.75 @@ -338,9 +311,6 @@ static bool file_write(struct file_ops_i
   22.76  	char *filename = filename_to_data(path_to_name(info, path));
   22.77  	int fd;
   22.78  
   22.79 -	if (!write_ok(info, path))
   22.80 -		return false;
   22.81 -
   22.82  	make_dirs(parent_filename(filename));
   22.83  	fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0600);
   22.84  	if (fd < 0)
   22.85 @@ -358,9 +328,6 @@ static bool file_mkdir(struct file_ops_i
   22.86  {
   22.87  	char *dirname = path_to_name(info, path);
   22.88  
   22.89 -	if (!write_ok(info, path))
   22.90 -		return false;
   22.91 -
   22.92  	make_dirs(parent_filename(dirname));
   22.93  	if (mkdir(dirname, 0700) != 0)
   22.94  		return (errno == EEXIST);
   22.95 @@ -374,20 +341,12 @@ static bool file_rm(struct file_ops_info
   22.96  	char *filename = path_to_name(info, path);
   22.97  	struct stat st;
   22.98  
   22.99 -	if (info->transact && streq(info->transact, path)) {
  22.100 -		errno = EINVAL;
  22.101 -		return false;
  22.102 -	}
  22.103 -
  22.104  	if (lstat(filename, &st) != 0) {
  22.105  		if (lstat(parent_filename(filename), &st) != 0)
  22.106  			return false;
  22.107  		return true;
  22.108  	}
  22.109  
  22.110 -	if (!write_ok(info, path))
  22.111 -		return false;
  22.112 -
  22.113  	if (streq(path, "/")) {
  22.114  		errno = EINVAL;
  22.115  		return false;
  22.116 @@ -398,28 +357,20 @@ static bool file_rm(struct file_ops_info
  22.117  	return true;
  22.118  }
  22.119  
  22.120 -static bool file_transaction_start(struct file_ops_info *info,
  22.121 -				   const char *subtree)
  22.122 +static bool file_transaction_start(struct file_ops_info *info)
  22.123  {
  22.124  	char *cmd;
  22.125 -	char *filename = path_to_name(info, subtree);
  22.126 -	struct stat st;
  22.127  
  22.128 -	if (info->transact) {
  22.129 +	if (info->transact_base) {
  22.130  		errno = EBUSY;
  22.131  		return false;
  22.132  	}
  22.133  
  22.134 -	if (lstat(filename, &st) != 0)
  22.135 -		return false;
  22.136 -
  22.137 -	cmd = talloc_asprintf(NULL, "cp -r %s %s.transact",
  22.138 -			      info->base, info->base);
  22.139 +	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  22.140 +	cmd = talloc_asprintf(NULL, "cp -r %s %s",
  22.141 +			      info->base, info->transact_base);
  22.142  	do_command(cmd);
  22.143  	talloc_free(cmd);
  22.144 -
  22.145 -	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
  22.146 -	info->transact = talloc_strdup(NULL, subtree);
  22.147  	return true;
  22.148  }
  22.149  
  22.150 @@ -427,7 +378,7 @@ static bool file_transaction_end(struct 
  22.151  {
  22.152  	char *old, *cmd;
  22.153  
  22.154 -	if (!info->transact) {
  22.155 +	if (!info->transact_base) {
  22.156  		errno = ENOENT;
  22.157  		return false;
  22.158  	}
  22.159 @@ -448,9 +399,7 @@ static bool file_transaction_end(struct 
  22.160  
  22.161  success:
  22.162  	talloc_free(cmd);
  22.163 -	talloc_free(info->transact);
  22.164  	talloc_free(info->transact_base);
  22.165 -	info->transact = NULL;
  22.166  	info->transact_base = NULL;
  22.167  	return true;
  22.168  }
  22.169 @@ -461,7 +410,6 @@ static struct file_ops_info *file_handle
  22.170  
  22.171  	info->base = dir;
  22.172  	info->transact_base = NULL;
  22.173 -	info->transact = NULL;
  22.174  	return info;
  22.175  }
  22.176  
  22.177 @@ -898,11 +846,10 @@ static char *do_next_op(struct ops *ops,
  22.178  	case 7: {
  22.179  		if (verbose)
  22.180  			printf("START %s\n", name);
  22.181 -		ret = bool_to_errstring(ops->transaction_start(h, name));
  22.182 +		ret = bool_to_errstring(ops->transaction_start(h));
  22.183  		if (streq(ret, "OK")) {
  22.184  			talloc_free(ret);
  22.185 -			ret = talloc_asprintf(NULL, "OK:START-TRANSACT:%s",
  22.186 -					      name);
  22.187 +			ret = talloc_asprintf(NULL, "OK:START-TRANSACT");
  22.188  		}
  22.189  
  22.190  		break;
  22.191 @@ -978,6 +925,8 @@ static void setup_file_ops(const char *d
  22.192  		barf_perror("Creating directory %s/tool", dir);
  22.193  	if (!file_set_perms(h, talloc_strdup(h, "/"), &perm, 1))
  22.194  		barf_perror("Setting root perms in %s", dir);
  22.195 +	if (!file_set_perms(h, talloc_strdup(h, "/tool"), &perm, 1))
  22.196 +		barf_perror("Setting root perms in %s/tool", dir);
  22.197  	file_close(h);
  22.198  }
  22.199  
  22.200 @@ -1071,7 +1020,7 @@ static unsigned int try_simple(const boo
  22.201  			goto out;
  22.202  
  22.203  		if (!data->fast) {
  22.204 -			if (strstarts(ret, "OK:START-TRANSACT:")) {
  22.205 +			if (streq(ret, "OK:START-TRANSACT")) {
  22.206  				void *pre = data->ops->handle(data->dir);
  22.207  
  22.208  				snapshot = dump(data->ops, pre);
  22.209 @@ -1303,7 +1252,7 @@ static unsigned int try_diff(const bool 
  22.210  			     void *_data)
  22.211  {
  22.212  	void *fileh, *xsh;
  22.213 -	char *transact = NULL;
  22.214 +	bool transact = false;
  22.215  	struct ops *fail;
  22.216  	struct diff_data *data = _data;
  22.217  	unsigned int i, print;
  22.218 @@ -1348,13 +1297,9 @@ static unsigned int try_diff(const bool 
  22.219  			goto out;
  22.220  
  22.221  		if (strstarts(file, "OK:START-TRANSACT:"))
  22.222 -			transact = talloc_strdup(NULL,
  22.223 -						 file +
  22.224 -						 strlen("OK:START-TRANSACT:"));
  22.225 -		else if (streq(file, "OK:STOP-TRANSACT")) {
  22.226 -			talloc_free(transact);
  22.227 -			transact = NULL;
  22.228 -		}
  22.229 +			transact = true;
  22.230 +		else if (streq(file, "OK:STOP-TRANSACT"))
  22.231 +			transact = false;
  22.232  
  22.233  		talloc_free(file);
  22.234  		talloc_free(xs);
  22.235 @@ -1379,7 +1324,7 @@ static unsigned int try_diff(const bool 
  22.236  
  22.237  			fail = NULL;
  22.238  			if (!ops_equal(&xs_ops, xsh_pre, &file_ops, fileh_pre,
  22.239 -				       transact, &fail)) {
  22.240 +				       "/", &fail)) {
  22.241  				if (fail)
  22.242  					barf("%s failed during transact\n",
  22.243  					     fail->name);
  22.244 @@ -1456,9 +1401,6 @@ static unsigned int try_fail(const bool 
  22.245  	fileh = file_handle(data->dir);
  22.246  	xsh = xs_handle(data->dir);
  22.247  
  22.248 -	sprintf(seed, "%i", data->seed);
  22.249 -	free(xs_debug_command(xsh, "failtest", seed, strlen(seed)+1));
  22.250 -
  22.251  	print = number / 76;
  22.252  	if (!print)
  22.253  		print = 1;
  22.254 @@ -1491,8 +1433,12 @@ static unsigned int try_fail(const bool 
  22.255  		if (trymap && !trymap[i])
  22.256  			continue;
  22.257  
  22.258 +		/* Turn on failure. */
  22.259 +		sprintf(seed, "%i", data->seed + i);
  22.260 +		free(xs_debug_command(xsh, "failtest",seed,strlen(seed)+1));
  22.261 +
  22.262  		if (verbose)
  22.263 -			printf("(%i) ", i);
  22.264 +			printf("(%i) seed %s ", i, seed);
  22.265  		ret = do_next_op(&xs_ops, xsh, i + data->seed, verbose);
  22.266  		if (streq(ret, "FAILED:Connection reset by peer")
  22.267  		    || streq(ret, "FAILED:Bad file descriptor")
  22.268 @@ -1549,8 +1495,6 @@ static unsigned int try_fail(const bool 
  22.269  		fail = NULL;
  22.270  		if (!ops_equal(&xs_ops, tmpxsh, &file_ops, tmpfileh, "/",
  22.271  			       &fail)) {
  22.272 -			xs_close(tmpxsh);
  22.273 -			file_close(tmpfileh);
  22.274  			if (fail) {
  22.275  				if (verbose)
  22.276  					printf("%s failed\n", fail->name);
  22.277 @@ -1561,10 +1505,16 @@ static unsigned int try_fail(const bool 
  22.278  				failed = 0;
  22.279  				if (verbose)
  22.280  					printf("(Looks like it succeeded)\n");
  22.281 +				xs_close(tmpxsh);
  22.282 +				file_close(tmpfileh);
  22.283  				goto try_applying;
  22.284  			}
  22.285  			if (verbose)
  22.286 -				printf("Two backends not equal\n");
  22.287 +				printf("Trees differ:\nXS:%s\nFILE:%s\n",
  22.288 +				       dump(&xs_ops, tmpxsh),
  22.289 +				       dump(&file_ops, tmpfileh));
  22.290 +			xs_close(tmpxsh);
  22.291 +			file_close(tmpfileh);
  22.292  			goto out;
  22.293  		}
  22.294  
  22.295 @@ -1572,8 +1522,6 @@ static unsigned int try_fail(const bool 
  22.296  		if (!xsh)
  22.297  			file_transaction_end(fileh, true);
  22.298  
  22.299 -		/* Turn failures back on. */
  22.300 -		free(xs_debug_command(tmpxsh, "failtest",  NULL, 0));
  22.301  		xs_close(tmpxsh);
  22.302  		file_close(tmpfileh);
  22.303  	}
    23.1 --- a/tools/xenstore/xs_stress.c	Fri Sep 23 14:24:58 2005 +0100
    23.2 +++ b/tools/xenstore/xs_stress.c	Fri Sep 23 14:25:01 2005 +0100
    23.3 @@ -8,6 +8,7 @@
    23.4  #include <sys/stat.h>
    23.5  #include <fcntl.h>
    23.6  #include <unistd.h>
    23.7 +#include <errno.h>
    23.8  
    23.9  #define NUM_HANDLES 2
   23.10  #define DIR_FANOUT 3
   23.11 @@ -36,24 +37,18 @@ static void work(unsigned int cycles, un
   23.12  
   23.13  	srandom(childnum);
   23.14  	for (i = 0; i < cycles; i++) {
   23.15 -		unsigned int lockdepth, j, len;
   23.16 -		char file[100] = "", lockdir[100];
   23.17 +		unsigned int j, len;
   23.18 +		char file[100] = "";
   23.19  		char *contents, tmp[100];
   23.20  		struct xs_handle *h = handles[random() % NUM_HANDLES];
   23.21  
   23.22 -		lockdepth = random() % DIR_DEPTH;
   23.23 -		for (j = 0; j < DIR_DEPTH; j++) {
   23.24 -			if (j == lockdepth)
   23.25 -				strcpy(lockdir, file);
   23.26 +		for (j = 0; j < DIR_DEPTH; j++)
   23.27  			sprintf(file + strlen(file), "/%li",
   23.28  				random()%DIR_FANOUT);
   23.29 -		}
   23.30 -		if (streq(lockdir, ""))
   23.31 -			strcpy(lockdir, "/");
   23.32  
   23.33 -		if (!xs_transaction_start(h, lockdir))
   23.34 -			barf_perror("%i: starting transaction %i on %s",
   23.35 -				    childnum, i, lockdir);
   23.36 +		if (!xs_transaction_start(h))
   23.37 +			barf_perror("%i: starting transaction %i",
   23.38 +				    childnum, i);
   23.39  
   23.40  		sprintf(file + strlen(file), "/count");
   23.41  		contents = xs_read(h, file, &len);
   23.42 @@ -68,18 +63,23 @@ static void work(unsigned int cycles, un
   23.43  		/* Abandon 1 in 10 */
   23.44  		if (random() % 10 == 0) {
   23.45  			if (!xs_transaction_end(h, true))
   23.46 -				barf_perror("%i: can't abort transact %s",
   23.47 -					    childnum, lockdir);
   23.48 +				barf_perror("%i: can't abort transact",
   23.49 +					    childnum);
   23.50  			i--;
   23.51  		} else {
   23.52 -			if (!xs_transaction_end(h, false))
   23.53 -				barf_perror("%i: can't commit transact %s",
   23.54 -					    childnum, lockdir);
   23.55 -
   23.56 -			/* Offset when we print . so kids don't all
   23.57 -			 * print at once. */
   23.58 -			if ((i + print/(childnum+1)) % print == 0)
   23.59 -				write(STDOUT_FILENO, &id, 1);
   23.60 +			if (!xs_transaction_end(h, false)) {
   23.61 +				if (errno == EAGAIN) {
   23.62 +					write(STDOUT_FILENO, "!", 1);
   23.63 +					i--;
   23.64 +				} else
   23.65 +					barf_perror("%i: can't commit trans",
   23.66 +						    childnum);
   23.67 +			} else {
   23.68 +				/* Offset when we print . so kids don't all
   23.69 +				 * print at once. */
   23.70 +				if ((i + print/(childnum+1)) % print == 0)
   23.71 +					write(STDOUT_FILENO, &id, 1);
   23.72 +			}
   23.73  		}
   23.74  	}
   23.75  }
   23.76 @@ -201,7 +201,7 @@ int main(int argc, char *argv[])
   23.77  	printf("\nCounting results...\n");
   23.78  	i = tally_counts();
   23.79  	if (i != (unsigned)atoi(argv[1]))
   23.80 -		barf("Total counts %i not %s", i, atoi(argv[1]));
   23.81 +		barf("Total counts %i not %s", i, argv[1]);
   23.82  	printf("Success!\n");
   23.83  	exit(0);
   23.84  }
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/tools/xenstore/xs_tdb_dump.c	Fri Sep 23 14:25:01 2005 +0100
    24.3 @@ -0,0 +1,98 @@
    24.4 +/* Simple program to dump out all records of TDB */
    24.5 +#include <stdint.h>
    24.6 +#include <stdlib.h>
    24.7 +#include <string.h>
    24.8 +#include <fcntl.h>
    24.9 +#include <stdio.h>
   24.10 +
   24.11 +#include "xs_lib.h"
   24.12 +#include "tdb.h"
   24.13 +#include "talloc.h"
   24.14 +#include "utils.h"
   24.15 +
   24.16 +/*
   24.17 + * Layout of a record's value as read back from the TDB: this header,
   24.18 + * then num_perms permission entries, then datalen bytes of data, then
   24.19 + * childlen bytes holding the children as consecutive nul-terminated strings.
   24.20 + */
   24.21 +struct record_hdr {
   24.22 +	u32 num_perms;
   24.23 +	u32 datalen;
   24.24 +	u32 childlen;
   24.25 +	struct xs_permissions perms[0];
   24.26 +};
   24.27 +
   24.28 +/* Number of bytes a record should occupy according to its own header. */
   24.29 +static u32 total_size(struct record_hdr *hdr)
   24.30 +{
   24.31 +	return sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions)
   24.32 +		+ hdr->datalen + hdr->childlen;
   24.33 +}
   24.34 +
   24.35 +/* One-character code for a permission value; '?' if unrecognised. */
   24.36 +static char perm_to_char(enum xs_perm_type perm)
   24.37 +{
   24.38 +	return perm == XS_PERM_READ ? 'r' :
   24.39 +		perm == XS_PERM_WRITE ? 'w' :
   24.40 +		perm == XS_PERM_NONE ? '-' :
   24.41 +		perm == (XS_PERM_READ|XS_PERM_WRITE) ? 'b' :
   24.42 +		'?';
   24.43 +}
   24.44 +
   24.45 +/*
   24.46 + * Dump every record of the TDB file named on the command line.
   24.47 + * Well-formed records are printed to stdout; records that are too short
   24.48 + * or whose size disagrees with their header are reported on stderr.
   24.49 + */
   24.50 +int main(int argc, char *argv[])
   24.51 +{
   24.52 +	TDB_DATA key;
   24.53 +	TDB_CONTEXT *tdb;
   24.54 +
   24.55 +	if (argc != 2)
   24.56 +		barf("Usage: xs_tdb_dump <tdbfile>");
   24.57 +
   24.58 +	tdb = tdb_open(talloc_strdup(NULL, argv[1]), 0, 0, O_RDONLY, 0);
   24.59 +	if (!tdb)
   24.60 +		barf_perror("Could not open %s", argv[1]);
   24.61 +
   24.62 +	key = tdb_firstkey(tdb);
   24.63 +	while (key.dptr) {
   24.64 +		TDB_DATA data;
   24.65 +		struct record_hdr *hdr;
   24.66 +
   24.67 +		data = tdb_fetch(tdb, key);
   24.68 +		hdr = (void *)data.dptr;
   24.69 +		/* printf's '*' precision and %i take an int, so the TDB
   24.70 +		 * sizes are cast explicitly before being passed. */
   24.71 +		if (data.dsize < sizeof(*hdr))
   24.72 +			fprintf(stderr, "%.*s: BAD truncated\n",
   24.73 +				(int)key.dsize, key.dptr);
   24.74 +		else if (data.dsize != total_size(hdr))
   24.75 +			fprintf(stderr, "%.*s: BAD length %i for %i/%i/%i (%i)\n",
   24.76 +				(int)key.dsize, key.dptr, (int)data.dsize,
   24.77 +				hdr->num_perms, hdr->datalen,
   24.78 +				hdr->childlen, total_size(hdr));
   24.79 +		else {
   24.80 +			unsigned int i;
   24.81 +			char *p;
   24.82 +
   24.83 +			printf("%.*s: ", (int)key.dsize, key.dptr);
   24.84 +			for (i = 0; i < hdr->num_perms; i++)
   24.85 +				printf("%s%c%i",
   24.86 +				       i == 0 ? "" : ",",
   24.87 +				       perm_to_char(hdr->perms[i].perms),
   24.88 +				       hdr->perms[i].id);
   24.89 +			/* The data starts right after the permissions. */
   24.90 +			p = (void *)&hdr->perms[hdr->num_perms];
   24.91 +			printf(" %.*s\n", (int)hdr->datalen, p);
   24.92 +			/* The children follow the data, back to back. */
   24.93 +			p += hdr->datalen;
   24.94 +			for (i = 0; i < hdr->childlen; i += strlen(p+i)+1)
   24.95 +				printf("\t-> %s\n", p+i);
   24.96 +		}
   24.97 +		key = tdb_nextkey(tdb, key);
   24.98 +	}
   24.99 +	return 0;
  24.100 +}
  24.101 +
    25.1 --- a/tools/xenstore/xs_test.c	Fri Sep 23 14:24:58 2005 +0100
    25.2 +++ b/tools/xenstore/xs_test.c	Fri Sep 23 14:25:01 2005 +0100
    25.3 @@ -562,9 +562,9 @@ static void do_unwatch(unsigned int hand
    25.4  		failed(handle);
    25.5  }
    25.6  
    25.7 -static void do_start(unsigned int handle, const char *node)
    25.8 +static void do_start(unsigned int handle)
    25.9  {
   25.10 -	if (!xs_transaction_start(handles[handle], node))
   25.11 +	if (!xs_transaction_start(handles[handle]))
   25.12  		failed(handle);
   25.13  }
   25.14  
   25.15 @@ -791,7 +791,7 @@ static void do_command(unsigned int defa
   25.16  		xs_daemon_close(handles[handle]);
   25.17  		handles[handle] = NULL;
   25.18  	} else if (streq(command, "start"))
   25.19 -		do_start(handle, arg(line, 1));
   25.20 +		do_start(handle);
   25.21  	else if (streq(command, "commit"))
   25.22  		do_end(handle, false);
   25.23  	else if (streq(command, "abort"))