ia64/xen-unstable

view tools/xenstore/tdb.c @ 7238:971e7c7411b3

Raise an exception if an error appears on the pipes to our children, and make
sure that the child's pipes are closed even under that exception. Move the
handling of POLLHUP to the end of the loop, so that we guarantee to read any
remaining data from the child if POLLHUP and POLLIN appear at the same time.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@ewan
date Thu Oct 06 10:13:11 2005 +0100 (2005-10-06)
parents ef9591d03fdd
children 93e27f7ca8a8 61b3b357d827
line source
1 /*
2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2004
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
12 ** under the LGPL
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 2 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, write to the Free Software
26 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 */
30 #ifndef _SAMBA_BUILD_
31 #if HAVE_CONFIG_H
32 #include <config.h>
33 #endif
35 #include <stdlib.h>
36 #include <stdio.h>
37 #include <stdint.h>
38 #include <fcntl.h>
39 #include <unistd.h>
40 #include <string.h>
41 #include <fcntl.h>
42 #include <errno.h>
43 #include <sys/mman.h>
44 #include <sys/stat.h>
45 #include "tdb.h"
46 #include <stdarg.h>
47 #include "talloc.h"
48 #define HAVE_MMAP
49 #else
50 #include "includes.h"
51 #include "lib/tdb/include/tdb.h"
52 #include "system/time.h"
53 #include "system/shmem.h"
54 #include "system/filesys.h"
55 #endif
/* On-disk identification: text stored in the header's magic_food field,
   format version, and the magic values stamped into record headers. */
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION (0x26011967 + 6)
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)	/* bitwise complement of TDB_MAGIC */
#define TDB_DEAD_MAGIC (0xFEE1DEAD)

/* Record sizing: lengths are rounded up to TDB_ALIGNMENT bytes. */
#define TDB_ALIGNMENT 4
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
#define DEFAULT_HASH_SIZE 131
#define TDB_PAGE_SIZE 0x2000

/* The freelist head sits directly after the file header. */
#define FREELIST_TOP (sizeof(struct tdb_header))

/* Round x up to a multiple of a.  "a" must be a power of two and is
   evaluated twice, so do not pass an expression with side effects. */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))

/* Reverse the byte order of a 32-bit value.  Every term is parenthesized
   explicitly instead of relying on << binding tighter than |. */
#define TDB_BYTEREV(x) ((((x)&0xff)<<24) | (((x)&0xFF00)<<8) | (((x)>>8)&0xFF00) | ((x)>>24))

/* Record magic tests; r is a pointer to a struct list_struct. */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))

/* File offset of the chain head for a hash value, and of the first byte
   after the last chain head.  The parameter of TDB_DATA_START is
   parenthesized so that any expression can be passed safely. */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
#define TDB_DATA_START(hash_size) (TDB_HASH_TOP((hash_size)-1))
75 /* NB assumes there is a local variable called "tdb" that is the
76 * current context, also takes doubly-parenthesized print-style
77 * argument. */
78 #define TDB_LOG(x) tdb->log_fn x
80 /* lock offsets */
81 #define GLOBAL_LOCK 0
82 #define ACTIVE_LOCK 4
84 #ifndef MAP_FILE
85 #define MAP_FILE 0
86 #endif
88 #ifndef MAP_FAILED
89 #define MAP_FAILED ((void *)-1)
90 #endif
92 #ifndef discard_const_p
93 # if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
94 # define discard_const(ptr) ((void *)((intptr_t)(ptr)))
95 # else
96 # define discard_const(ptr) ((void *)(ptr))
97 # endif
98 # define discard_const_p(type, ptr) ((type *)discard_const(ptr))
99 #endif
101 /* free memory if the pointer is valid and zero the pointer */
102 #ifndef SAFE_FREE
103 #define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
104 #endif
106 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
107 TDB_DATA tdb_null;
109 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
110 static TDB_CONTEXT *tdbs = NULL;
112 static int tdb_munmap(TDB_CONTEXT *tdb)
113 {
114 if (tdb->flags & TDB_INTERNAL)
115 return 0;
117 #ifdef HAVE_MMAP
118 if (tdb->map_ptr) {
119 int ret = munmap(tdb->map_ptr, tdb->map_size);
120 if (ret != 0)
121 return ret;
122 }
123 #endif
124 tdb->map_ptr = NULL;
125 return 0;
126 }
128 static void tdb_mmap(TDB_CONTEXT *tdb)
129 {
130 if (tdb->flags & TDB_INTERNAL)
131 return;
133 #ifdef HAVE_MMAP
134 if (!(tdb->flags & TDB_NOMMAP)) {
135 tdb->map_ptr = mmap(NULL, tdb->map_size,
136 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
137 MAP_SHARED|MAP_FILE, tdb->fd, 0);
139 /*
140 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
141 */
143 if (tdb->map_ptr == MAP_FAILED) {
144 tdb->map_ptr = NULL;
145 TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
146 tdb->map_size, strerror(errno)));
147 }
148 } else {
149 tdb->map_ptr = NULL;
150 }
151 #else
152 tdb->map_ptr = NULL;
153 #endif
154 }
156 /* Endian conversion: we only ever deal with 4 byte quantities */
157 static void *convert(void *buf, u32 size)
158 {
159 u32 i, *p = buf;
160 for (i = 0; i < size / 4; i++)
161 p[i] = TDB_BYTEREV(p[i]);
162 return buf;
163 }
164 #define DOCONV() (tdb->flags & TDB_CONVERT)
165 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
167 /* the body of the database is made of one list_struct for the free space
168 plus a separate data list for each hash value */
169 struct list_struct {
170 tdb_off next; /* offset of the next record in the list */
171 tdb_len rec_len; /* total byte length of record */
172 tdb_len key_len; /* byte length of key */
173 tdb_len data_len; /* byte length of data */
174 u32 full_hash; /* the full 32 bit hash of the key */
175 u32 magic; /* try to catch errors */
176 /* the following union is implied:
177 union {
178 char record[rec_len];
179 struct {
180 char key[key_len];
181 char data[data_len];
182 }
183 u32 totalsize; (tailer)
184 }
185 */
186 };
188 /* a byte range locking function - return 0 on success
189 this functions locks/unlocks 1 byte at the specified offset.
191 On error, errno is also set so that errors are passed back properly
192 through tdb_open(). */
193 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
194 int rw_type, int lck_type, int probe)
195 {
196 struct flock fl;
197 int ret;
199 if (tdb->flags & TDB_NOLOCK)
200 return 0;
201 if ((rw_type == F_WRLCK) && (tdb->read_only)) {
202 errno = EACCES;
203 return -1;
204 }
206 fl.l_type = rw_type;
207 fl.l_whence = SEEK_SET;
208 fl.l_start = offset;
209 fl.l_len = 1;
210 fl.l_pid = 0;
212 do {
213 ret = fcntl(tdb->fd,lck_type,&fl);
214 } while (ret == -1 && errno == EINTR);
216 if (ret == -1) {
217 if (!probe && lck_type != F_SETLK) {
218 /* Ensure error code is set for log fun to examine. */
219 tdb->ecode = TDB_ERR_LOCK;
220 TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
221 tdb->fd, offset, rw_type, lck_type));
222 }
223 /* Generic lock error. errno set by fcntl.
224 * EAGAIN is an expected return from non-blocking
225 * locks. */
226 if (errno != EAGAIN) {
227 TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
228 tdb->fd, offset, rw_type, lck_type,
229 strerror(errno)));
230 }
231 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
232 }
233 return 0;
234 }
236 /* lock a list in the database. list -1 is the alloc list */
237 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
238 {
239 if (list < -1 || list >= (int)tdb->header.hash_size) {
240 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
241 list, ltype));
242 return -1;
243 }
244 if (tdb->flags & TDB_NOLOCK)
245 return 0;
247 /* Since fcntl locks don't nest, we do a lock for the first one,
248 and simply bump the count for future ones */
249 if (tdb->locked[list+1].count == 0) {
250 if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
251 TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
252 list, ltype, strerror(errno)));
253 return -1;
254 }
255 tdb->locked[list+1].ltype = ltype;
256 }
257 tdb->locked[list+1].count++;
258 return 0;
259 }
261 /* unlock the database: returns void because it's too late for errors. */
262 /* changed to return int it may be interesting to know there
263 has been an error --simo */
264 static int tdb_unlock(TDB_CONTEXT *tdb, int list,
265 int ltype __attribute__((unused)))
266 {
267 int ret = -1;
269 if (tdb->flags & TDB_NOLOCK)
270 return 0;
272 /* Sanity checks */
273 if (list < -1 || list >= (int)tdb->header.hash_size) {
274 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
275 return ret;
276 }
278 if (tdb->locked[list+1].count==0) {
279 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
280 return ret;
281 }
283 if (tdb->locked[list+1].count == 1) {
284 /* Down to last nested lock: unlock underneath */
285 ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
286 } else {
287 ret = 0;
288 }
289 tdb->locked[list+1].count--;
291 if (ret)
292 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
293 return ret;
294 }
296 /* This is based on the hash algorithm from gdbm */
297 static u32 default_tdb_hash(TDB_DATA *key)
298 {
299 u32 value; /* Used to compute the hash value. */
300 u32 i; /* Used to cycle through random values. */
302 /* Set the initial value from the key size. */
303 for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
304 value = (value + (key->dptr[i] << (i*5 % 24)));
306 return (1103515243 * value + 12345);
307 }
309 /* check for an out of bounds access - if it is out of bounds then
310 see if the database has been expanded by someone else and expand
311 if necessary
312 note that "len" is the minimum length needed for the db
313 */
314 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
315 {
316 struct stat st;
317 if (len <= tdb->map_size)
318 return 0;
319 if (tdb->flags & TDB_INTERNAL) {
320 if (!probe) {
321 /* Ensure ecode is set for log fn. */
322 tdb->ecode = TDB_ERR_IO;
323 TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
324 (int)len, (int)tdb->map_size));
325 }
326 return TDB_ERRCODE(TDB_ERR_IO, -1);
327 }
329 if (fstat(tdb->fd, &st) == -1)
330 return TDB_ERRCODE(TDB_ERR_IO, -1);
332 if (st.st_size < (off_t)len) {
333 if (!probe) {
334 /* Ensure ecode is set for log fn. */
335 tdb->ecode = TDB_ERR_IO;
336 TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
337 (int)len, (int)st.st_size));
338 }
339 return TDB_ERRCODE(TDB_ERR_IO, -1);
340 }
342 /* Unmap, update size, remap */
343 if (tdb_munmap(tdb) == -1)
344 return TDB_ERRCODE(TDB_ERR_IO, -1);
345 tdb->map_size = st.st_size;
346 tdb_mmap(tdb);
347 return 0;
348 }
350 /* write a lump of data at a specified offset */
351 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
352 {
353 if (tdb_oob(tdb, off + len, 0) != 0)
354 return -1;
356 if (tdb->map_ptr)
357 memcpy(off + (char *)tdb->map_ptr, buf, len);
358 #ifdef HAVE_PWRITE
359 else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
360 #else
361 else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
362 || write(tdb->fd, buf, len) != (off_t)len) {
363 #endif
364 /* Ensure ecode is set for log fn. */
365 tdb->ecode = TDB_ERR_IO;
366 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
367 off, len, strerror(errno)));
368 return TDB_ERRCODE(TDB_ERR_IO, -1);
369 }
370 return 0;
371 }
373 /* read a lump of data at a specified offset, maybe convert */
374 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
375 {
376 if (tdb_oob(tdb, off + len, 0) != 0)
377 return -1;
379 if (tdb->map_ptr)
380 memcpy(buf, off + (char *)tdb->map_ptr, len);
381 #ifdef HAVE_PREAD
382 else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
383 #else
384 else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
385 || read(tdb->fd, buf, len) != (off_t)len) {
386 #endif
387 /* Ensure ecode is set for log fn. */
388 tdb->ecode = TDB_ERR_IO;
389 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
390 off, len, strerror(errno)));
391 return TDB_ERRCODE(TDB_ERR_IO, -1);
392 }
393 if (cv)
394 convert(buf, len);
395 return 0;
396 }
398 /* don't allocate memory: used in tdb_delete path. */
399 static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
400 {
401 char buf[64];
402 u32 len;
404 if (tdb_oob(tdb, off + key.dsize, 0) != 0)
405 return -1;
407 if (tdb->map_ptr)
408 return !memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize);
410 while (key.dsize) {
411 len = key.dsize;
412 if (len > sizeof(buf))
413 len = sizeof(buf);
414 if (tdb_read(tdb, off, buf, len, 0) != 0)
415 return -1;
416 if (memcmp(buf, key.dptr, len) != 0)
417 return 0;
418 key.dptr += len;
419 key.dsize -= len;
420 off += len;
421 }
422 return 1;
423 }
425 /* read a lump of data, allocating the space for it */
426 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
427 {
428 char *buf;
430 if (!(buf = talloc_size(tdb, len))) {
431 /* Ensure ecode is set for log fn. */
432 tdb->ecode = TDB_ERR_OOM;
433 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
434 len, strerror(errno)));
435 return TDB_ERRCODE(TDB_ERR_OOM, buf);
436 }
437 if (tdb_read(tdb, offset, buf, len, 0) == -1) {
438 SAFE_FREE(buf);
439 return NULL;
440 }
441 return buf;
442 }
444 /* read/write a tdb_off */
445 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
446 {
447 return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
448 }
449 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
450 {
451 tdb_off off = *d;
452 return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
453 }
455 /* read/write a record */
456 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
457 {
458 if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
459 return -1;
460 if (TDB_BAD_MAGIC(rec)) {
461 /* Ensure ecode is set for log fn. */
462 tdb->ecode = TDB_ERR_CORRUPT;
463 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
464 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
465 }
466 return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
467 }
468 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
469 {
470 struct list_struct r = *rec;
471 return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
472 }
474 /* read a freelist record and check for simple errors */
475 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
476 {
477 if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
478 return -1;
480 if (rec->magic == TDB_MAGIC) {
481 /* this happens when a app is showdown while deleting a record - we should
482 not completely fail when this happens */
483 TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
484 rec->magic, off));
485 rec->magic = TDB_FREE_MAGIC;
486 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
487 return -1;
488 }
490 if (rec->magic != TDB_FREE_MAGIC) {
491 /* Ensure ecode is set for log fn. */
492 tdb->ecode = TDB_ERR_CORRUPT;
493 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
494 rec->magic, off));
495 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
496 }
497 if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
498 return -1;
499 return 0;
500 }
502 /* update a record tailer (must hold allocation lock) */
503 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
504 const struct list_struct *rec)
505 {
506 tdb_off totalsize;
508 /* Offset of tailer from record header */
509 totalsize = sizeof(*rec) + rec->rec_len;
510 return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
511 &totalsize);
512 }
514 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
515 {
516 struct list_struct rec;
517 tdb_off tailer_ofs, tailer;
519 if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
520 printf("ERROR: failed to read record at %u\n", offset);
521 return 0;
522 }
524 printf(" rec: offset=0x%08x next=0x%08x rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
525 offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
527 tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
528 if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
529 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
530 return rec.next;
531 }
533 if (tailer != rec.rec_len + sizeof(rec)) {
534 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
535 (unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
536 }
537 return rec.next;
538 }
540 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
541 {
542 tdb_off rec_ptr, top;
544 top = TDB_HASH_TOP(i);
546 if (tdb_lock(tdb, i, F_WRLCK) != 0)
547 return -1;
549 if (ofs_read(tdb, top, &rec_ptr) == -1)
550 return tdb_unlock(tdb, i, F_WRLCK);
552 if (rec_ptr)
553 printf("hash=%d\n", i);
555 while (rec_ptr) {
556 rec_ptr = tdb_dump_record(tdb, rec_ptr);
557 }
559 return tdb_unlock(tdb, i, F_WRLCK);
560 }
562 void tdb_dump_all(TDB_CONTEXT *tdb)
563 {
564 unsigned int i;
565 for (i=0;i<tdb->header.hash_size;i++) {
566 tdb_dump_chain(tdb, i);
567 }
568 printf("freelist:\n");
569 tdb_dump_chain(tdb, -1);
570 }
572 int tdb_printfreelist(TDB_CONTEXT *tdb)
573 {
574 int ret;
575 long total_free = 0;
576 tdb_off offset, rec_ptr;
577 struct list_struct rec;
579 if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
580 return ret;
582 offset = FREELIST_TOP;
584 /* read in the freelist top */
585 if (ofs_read(tdb, offset, &rec_ptr) == -1) {
586 tdb_unlock(tdb, -1, F_WRLCK);
587 return 0;
588 }
590 printf("freelist top=[0x%08x]\n", rec_ptr );
591 while (rec_ptr) {
592 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
593 tdb_unlock(tdb, -1, F_WRLCK);
594 return -1;
595 }
597 if (rec.magic != TDB_FREE_MAGIC) {
598 printf("bad magic 0x%08x in free list\n", rec.magic);
599 tdb_unlock(tdb, -1, F_WRLCK);
600 return -1;
601 }
603 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n",
604 rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
605 total_free += rec.rec_len;
607 /* move to the next record */
608 rec_ptr = rec.next;
609 }
610 printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
611 (int)total_free);
613 return tdb_unlock(tdb, -1, F_WRLCK);
614 }
616 /* Remove an element from the freelist. Must have alloc lock. */
617 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
618 {
619 tdb_off last_ptr, i;
621 /* read in the freelist top */
622 last_ptr = FREELIST_TOP;
623 while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
624 if (i == off) {
625 /* We've found it! */
626 return ofs_write(tdb, last_ptr, &next);
627 }
628 /* Follow chain (next offset is at start of record) */
629 last_ptr = i;
630 }
631 TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
632 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
633 }
635 /* Add an element into the freelist. Merge adjacent records if
636 neccessary. */
637 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
638 {
639 tdb_off right, left;
641 /* Allocation and tailer lock */
642 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
643 return -1;
645 /* set an initial tailer, so if we fail we don't leave a bogus record */
646 if (update_tailer(tdb, offset, rec) != 0) {
647 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
648 goto fail;
649 }
651 /* Look right first (I'm an Australian, dammit) */
652 right = offset + sizeof(*rec) + rec->rec_len;
653 if (right + sizeof(*rec) <= tdb->map_size) {
654 struct list_struct r;
656 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
657 TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
658 goto left;
659 }
661 /* If it's free, expand to include it. */
662 if (r.magic == TDB_FREE_MAGIC) {
663 if (remove_from_freelist(tdb, right, r.next) == -1) {
664 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
665 goto left;
666 }
667 rec->rec_len += sizeof(r) + r.rec_len;
668 }
669 }
671 left:
672 /* Look left */
673 left = offset - sizeof(tdb_off);
674 if (left > TDB_DATA_START(tdb->header.hash_size)) {
675 struct list_struct l;
676 tdb_off leftsize;
678 /* Read in tailer and jump back to header */
679 if (ofs_read(tdb, left, &leftsize) == -1) {
680 TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
681 goto update;
682 }
683 left = offset - leftsize;
685 /* Now read in record */
686 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
687 TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
688 goto update;
689 }
691 /* If it's free, expand to include it. */
692 if (l.magic == TDB_FREE_MAGIC) {
693 if (remove_from_freelist(tdb, left, l.next) == -1) {
694 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
695 goto update;
696 } else {
697 offset = left;
698 rec->rec_len += leftsize;
699 }
700 }
701 }
703 update:
704 if (update_tailer(tdb, offset, rec) == -1) {
705 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
706 goto fail;
707 }
709 /* Now, prepend to free list */
710 rec->magic = TDB_FREE_MAGIC;
712 if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
713 rec_write(tdb, offset, rec) == -1 ||
714 ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
715 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
716 goto fail;
717 }
719 /* And we're done. */
720 tdb_unlock(tdb, -1, F_WRLCK);
721 return 0;
723 fail:
724 tdb_unlock(tdb, -1, F_WRLCK);
725 return -1;
726 }
729 /* expand a file. we prefer to use ftruncate, as that is what posix
730 says to use for mmap expansion */
731 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
732 {
733 char buf[1024];
734 #if HAVE_FTRUNCATE_EXTEND
735 if (ftruncate(tdb->fd, size+addition) != 0) {
736 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
737 size+addition, strerror(errno)));
738 return -1;
739 }
740 #else
741 char b = 0;
743 #ifdef HAVE_PWRITE
744 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
745 #else
746 if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 ||
747 write(tdb->fd, &b, 1) != 1) {
748 #endif
749 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
750 size+addition, strerror(errno)));
751 return -1;
752 }
753 #endif
755 /* now fill the file with something. This ensures that the file isn't sparse, which would be
756 very bad if we ran out of disk. This must be done with write, not via mmap */
757 memset(buf, 0x42, sizeof(buf));
758 while (addition) {
759 int n = addition>sizeof(buf)?sizeof(buf):addition;
760 #ifdef HAVE_PWRITE
761 int ret = pwrite(tdb->fd, buf, n, size);
762 #else
763 int ret;
764 if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
765 return -1;
766 ret = write(tdb->fd, buf, n);
767 #endif
768 if (ret != n) {
769 TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
770 n, strerror(errno)));
771 return -1;
772 }
773 addition -= n;
774 size += n;
775 }
776 return 0;
777 }
780 /* expand the database at least size bytes by expanding the underlying
781 file and doing the mmap again if necessary */
782 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
783 {
784 struct list_struct rec;
785 tdb_off offset;
787 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
788 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
789 return -1;
790 }
792 /* must know about any previous expansions by another process */
793 tdb_oob(tdb, tdb->map_size + 1, 1);
795 /* always make room for at least 10 more records, and round
796 the database up to a multiple of TDB_PAGE_SIZE */
797 size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
799 if (!(tdb->flags & TDB_INTERNAL))
800 tdb_munmap(tdb);
802 /*
803 * We must ensure the file is unmapped before doing this
804 * to ensure consistency with systems like OpenBSD where
805 * writes and mmaps are not consistent.
806 */
808 /* expand the file itself */
809 if (!(tdb->flags & TDB_INTERNAL)) {
810 if (expand_file(tdb, tdb->map_size, size) != 0)
811 goto fail;
812 }
814 tdb->map_size += size;
816 if (tdb->flags & TDB_INTERNAL) {
817 char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
818 tdb->map_size);
819 if (!new_map_ptr) {
820 tdb->map_size -= size;
821 goto fail;
822 }
823 tdb->map_ptr = new_map_ptr;
824 } else {
825 /*
826 * We must ensure the file is remapped before adding the space
827 * to ensure consistency with systems like OpenBSD where
828 * writes and mmaps are not consistent.
829 */
831 /* We're ok if the mmap fails as we'll fallback to read/write */
832 tdb_mmap(tdb);
833 }
835 /* form a new freelist record */
836 memset(&rec,'\0',sizeof(rec));
837 rec.rec_len = size - sizeof(rec);
839 /* link it into the free list */
840 offset = tdb->map_size - size;
841 if (tdb_free(tdb, offset, &rec) == -1)
842 goto fail;
844 tdb_unlock(tdb, -1, F_WRLCK);
845 return 0;
846 fail:
847 tdb_unlock(tdb, -1, F_WRLCK);
848 return -1;
849 }
852 /*
853 the core of tdb_allocate - called when we have decided which
854 free list entry to use
855 */
856 static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
857 struct list_struct *rec, tdb_off last_ptr)
858 {
859 struct list_struct newrec;
860 tdb_off newrec_ptr;
862 memset(&newrec, '\0', sizeof(newrec));
864 /* found it - now possibly split it up */
865 if (rec->rec_len > length + MIN_REC_SIZE) {
866 /* Length of left piece */
867 length = TDB_ALIGN(length, TDB_ALIGNMENT);
869 /* Right piece to go on free list */
870 newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
871 newrec_ptr = rec_ptr + sizeof(*rec) + length;
873 /* And left record is shortened */
874 rec->rec_len = length;
875 } else {
876 newrec_ptr = 0;
877 }
879 /* Remove allocated record from the free list */
880 if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
881 return 0;
882 }
884 /* Update header: do this before we drop alloc
885 lock, otherwise tdb_free() might try to
886 merge with us, thinking we're free.
887 (Thanks Jeremy Allison). */
888 rec->magic = TDB_MAGIC;
889 if (rec_write(tdb, rec_ptr, rec) == -1) {
890 return 0;
891 }
893 /* Did we create new block? */
894 if (newrec_ptr) {
895 /* Update allocated record tailer (we
896 shortened it). */
897 if (update_tailer(tdb, rec_ptr, rec) == -1) {
898 return 0;
899 }
901 /* Free new record */
902 if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
903 return 0;
904 }
905 }
907 /* all done - return the new record offset */
908 return rec_ptr;
909 }
911 /* allocate some space from the free list. The offset returned points
912 to a unconnected list_struct within the database with room for at
913 least length bytes of total data
915 0 is returned if the space could not be allocated
916 */
917 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
918 struct list_struct *rec)
919 {
920 tdb_off rec_ptr, last_ptr, newrec_ptr;
921 struct {
922 tdb_off rec_ptr, last_ptr;
923 tdb_len rec_len;
924 } bestfit = { 0, 0, 0 };
926 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
927 return 0;
929 /* Extra bytes required for tailer */
930 length += sizeof(tdb_off);
932 again:
933 last_ptr = FREELIST_TOP;
935 /* read in the freelist top */
936 if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
937 goto fail;
939 bestfit.rec_ptr = 0;
941 /*
942 this is a best fit allocation strategy. Originally we used
943 a first fit strategy, but it suffered from massive fragmentation
944 issues when faced with a slowly increasing record size.
945 */
946 while (rec_ptr) {
947 if (rec_free_read(tdb, rec_ptr, rec) == -1) {
948 goto fail;
949 }
951 if (rec->rec_len >= length) {
952 if (bestfit.rec_ptr == 0 ||
953 rec->rec_len < bestfit.rec_len) {
954 bestfit.rec_len = rec->rec_len;
955 bestfit.rec_ptr = rec_ptr;
956 bestfit.last_ptr = last_ptr;
957 /* consider a fit to be good enough if we aren't wasting more than half the space */
958 if (bestfit.rec_len < 2*length) {
959 break;
960 }
961 }
962 }
964 /* move to the next record */
965 last_ptr = rec_ptr;
966 rec_ptr = rec->next;
967 }
969 if (bestfit.rec_ptr != 0) {
970 if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
971 goto fail;
972 }
974 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
975 tdb_unlock(tdb, -1, F_WRLCK);
976 return newrec_ptr;
977 }
979 /* we didn't find enough space. See if we can expand the
980 database and if we can then try again */
981 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
982 goto again;
983 fail:
984 tdb_unlock(tdb, -1, F_WRLCK);
985 return 0;
986 }
988 /* initialise a new database with a specified hash size */
989 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
990 {
991 struct tdb_header *newdb;
992 int size, ret = -1;
994 /* We make it up in memory, then write it out if not internal */
995 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
996 if (!(newdb = talloc_zero_size(tdb, size)))
997 return TDB_ERRCODE(TDB_ERR_OOM, -1);
999 /* Fill in the header */
1000 newdb->version = TDB_VERSION;
1001 newdb->hash_size = hash_size;
1002 if (tdb->flags & TDB_INTERNAL) {
1003 tdb->map_size = size;
1004 tdb->map_ptr = (char *)newdb;
1005 memcpy(&tdb->header, newdb, sizeof(tdb->header));
1006 /* Convert the `ondisk' version if asked. */
1007 CONVERT(*newdb);
1008 return 0;
1010 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
1011 goto fail;
1013 if (ftruncate(tdb->fd, 0) == -1)
1014 goto fail;
1016 /* This creates an endian-converted header, as if read from disk */
1017 CONVERT(*newdb);
1018 memcpy(&tdb->header, newdb, sizeof(tdb->header));
1019 /* Don't endian-convert the magic food! */
1020 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
1021 if (write(tdb->fd, newdb, size) != size)
1022 ret = -1;
1023 else
1024 ret = 0;
1026 fail:
1027 SAFE_FREE(newdb);
1028 return ret;
1031 /* Returns 0 on fail. On success, return offset of record, and fills
1032 in rec */
1033 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
1034 struct list_struct *r)
1036 tdb_off rec_ptr;
1038 /* read in the hash top */
1039 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
1040 return 0;
1042 /* keep looking until we find the right record */
1043 while (rec_ptr) {
1044 if (rec_read(tdb, rec_ptr, r) == -1)
1045 return 0;
1047 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
1048 /* a very likely hit - read the key */
1049 int cmp = tdb_key_eq(tdb, rec_ptr + sizeof(*r), key);
1050 if (cmp < 0)
1051 return 0;
1052 else if (cmp > 0)
1053 return rec_ptr;
1055 rec_ptr = r->next;
1057 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1060 /* As tdb_find, but if you succeed, keep the lock */
1061 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1062 struct list_struct *rec)
1064 u32 rec_ptr;
1066 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1067 return 0;
1068 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1069 tdb_unlock(tdb, BUCKET(hash), locktype);
1070 return rec_ptr;
1073 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1075 return tdb->ecode;
1078 static struct tdb_errname {
1079 enum TDB_ERROR ecode; const char *estring;
1080 } emap[] = { {TDB_SUCCESS, "Success"},
1081 {TDB_ERR_CORRUPT, "Corrupt database"},
1082 {TDB_ERR_IO, "IO Error"},
1083 {TDB_ERR_LOCK, "Locking error"},
1084 {TDB_ERR_OOM, "Out of memory"},
1085 {TDB_ERR_EXISTS, "Record exists"},
1086 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1087 {TDB_ERR_NOEXIST, "Record does not exist"} };
1089 /* Error string for the last tdb error */
1090 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1092 u32 i;
1093 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1094 if (tdb->ecode == emap[i].ecode)
1095 return emap[i].estring;
1096 return "Invalid error code";
1099 /* update an entry in place - this only works if the new data size
1100 is <= the old data size and the key exists.
1101 on failure return -1.
1102 */
1104 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1106 struct list_struct rec;
1107 tdb_off rec_ptr;
1109 /* find entry */
1110 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1111 return -1;
1113 /* must be long enough key, data and tailer */
1114 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1115 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1116 return -1;
1119 if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1120 dbuf.dptr, dbuf.dsize) == -1)
1121 return -1;
1123 if (dbuf.dsize != rec.data_len) {
1124 /* update size */
1125 rec.data_len = dbuf.dsize;
1126 return rec_write(tdb, rec_ptr, &rec);
1129 return 0;
1132 /* find an entry in the database given a key */
1133 /* If an entry doesn't exist tdb_err will be set to
1134 * TDB_ERR_NOEXIST. If a key has no data attached
1135 * then the TDB_DATA will have zero length but
1136 * a non-zero pointer
1137 */
1139 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1141 tdb_off rec_ptr;
1142 struct list_struct rec;
1143 TDB_DATA ret;
1144 u32 hash;
1146 /* find which hash bucket it is in */
1147 hash = tdb->hash_fn(&key);
1148 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1149 return tdb_null;
1151 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1152 rec.data_len);
1153 ret.dsize = rec.data_len;
1154 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1155 return ret;
1158 /* check if an entry in the database exists
1160 note that 1 is returned if the key is found and 0 is returned if not found
1161 this doesn't match the conventions in the rest of this module, but is
1162 compatible with gdbm
1163 */
1164 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1166 struct list_struct rec;
1168 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1169 return 0;
1170 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1171 return 1;
1174 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1176 u32 hash = tdb->hash_fn(&key);
1177 return tdb_exists_hash(tdb, key, hash);
1180 /* record lock stops delete underneath */
1181 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1183 return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1185 /*
1186 Write locks override our own fcntl readlocks, so check it here.
1187 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1188 an error to fail to get the lock here.
1189 */
1191 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1193 struct tdb_traverse_lock *i;
1194 for (i = &tdb->travlocks; i; i = i->next)
1195 if (i->off == off)
1196 return -1;
1197 return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1200 /*
1201 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1202 an error to fail to get the lock here.
1203 */
1205 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1207 return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1209 /* fcntl locks don't stack: avoid unlocking someone else's */
1210 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1212 struct tdb_traverse_lock *i;
1213 u32 count = 0;
1215 if (off == 0)
1216 return 0;
1217 for (i = &tdb->travlocks; i; i = i->next)
1218 if (i->off == off)
1219 count++;
1220 return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1223 /* actually delete an entry in the database given the offset */
1224 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1226 tdb_off last_ptr, i;
1227 struct list_struct lastrec;
1229 if (tdb->read_only) return -1;
1231 if (write_lock_record(tdb, rec_ptr) == -1) {
1232 /* Someone traversing here: mark it as dead */
1233 rec->magic = TDB_DEAD_MAGIC;
1234 return rec_write(tdb, rec_ptr, rec);
1236 if (write_unlock_record(tdb, rec_ptr) != 0)
1237 return -1;
1239 /* find previous record in hash chain */
1240 if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1241 return -1;
1242 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1243 if (rec_read(tdb, i, &lastrec) == -1)
1244 return -1;
1246 /* unlink it: next ptr is at start of record. */
1247 if (last_ptr == 0)
1248 last_ptr = TDB_HASH_TOP(rec->full_hash);
1249 if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1250 return -1;
1252 /* recover the space */
1253 if (tdb_free(tdb, rec_ptr, rec) == -1)
1254 return -1;
1255 return 0;
1258 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1259 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1260 struct list_struct *rec)
1262 int want_next = (tlock->off != 0);
1264 /* Lock each chain from the start one. */
1265 for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1267 /* this is an optimisation for the common case where
1268 the hash chain is empty, which is particularly
1269 common for the use of tdb with ldb, where large
1270 hashes are used. In that case we spend most of our
1271 time in tdb_brlock(), locking empty hash chains.
1273 To avoid this, we do an unlocked pre-check to see
1274 if the hash chain is empty before starting to look
1275 inside it. If it is empty then we can avoid that
1276 hash chain. If it isn't empty then we can't believe
1277 the value we get back, as we read it without a
1278 lock, so instead we get the lock and re-fetch the
1279 value below.
1281 Notice that not doing this optimisation on the
1282 first hash chain is critical. We must guarantee
1283 that we have done at least one fcntl lock at the
1284 start of a search to guarantee that memory is
1285 coherent on SMP systems. If records are added by
1286 others during the search then thats OK, and we
1287 could possibly miss those with this trick, but we
1288 could miss them anyway without this trick, so the
1289 semantics don't change.
1291 With a non-indexed ldb search this trick gains us a
1292 factor of around 80 in speed on a linux 2.6.x
1293 system (testing using ldbtest).
1294 */
1295 if (!tlock->off && tlock->hash != 0) {
1296 u32 off;
1297 if (tdb->map_ptr) {
1298 for (;tlock->hash < tdb->header.hash_size;tlock->hash++) {
1299 if (0 != *(u32 *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
1300 break;
1303 if (tlock->hash == tdb->header.hash_size) {
1304 continue;
1306 } else {
1307 if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
1308 off == 0) {
1309 continue;
1314 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1315 return -1;
1317 /* No previous record? Start at top of chain. */
1318 if (!tlock->off) {
1319 if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1320 &tlock->off) == -1)
1321 goto fail;
1322 } else {
1323 /* Otherwise unlock the previous record. */
1324 if (unlock_record(tdb, tlock->off) != 0)
1325 goto fail;
1328 if (want_next) {
1329 /* We have offset of old record: grab next */
1330 if (rec_read(tdb, tlock->off, rec) == -1)
1331 goto fail;
1332 tlock->off = rec->next;
1335 /* Iterate through chain */
1336 while( tlock->off) {
1337 tdb_off current;
1338 if (rec_read(tdb, tlock->off, rec) == -1)
1339 goto fail;
1341 /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
1342 if (tlock->off == rec->next) {
1343 TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
1344 goto fail;
1347 if (!TDB_DEAD(rec)) {
1348 /* Woohoo: we found one! */
1349 if (lock_record(tdb, tlock->off) != 0)
1350 goto fail;
1351 return tlock->off;
1354 /* Try to clean dead ones from old traverses */
1355 current = tlock->off;
1356 tlock->off = rec->next;
1357 if (!tdb->read_only &&
1358 do_delete(tdb, current, rec) != 0)
1359 goto fail;
1361 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1362 want_next = 0;
1364 /* We finished iteration without finding anything */
1365 return TDB_ERRCODE(TDB_SUCCESS, 0);
1367 fail:
1368 tlock->off = 0;
1369 if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1370 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1371 return -1;
1374 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1375 return -1 on error or the record count traversed
1376 if fn is NULL then it is not called
1377 a non-zero return value from fn() indicates that the traversal should stop
1378 */
1379 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1381 TDB_DATA key, dbuf;
1382 struct list_struct rec;
1383 struct tdb_traverse_lock tl = { NULL, 0, 0 };
1384 int ret, count = 0;
1386 /* This was in the initializaton, above, but the IRIX compiler
1387 * did not like it. crh
1388 */
1389 tl.next = tdb->travlocks.next;
1391 /* fcntl locks don't stack: beware traverse inside traverse */
1392 tdb->travlocks.next = &tl;
1394 /* tdb_next_lock places locks on the record returned, and its chain */
1395 while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1396 count++;
1397 /* now read the full record */
1398 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1399 rec.key_len + rec.data_len);
1400 if (!key.dptr) {
1401 ret = -1;
1402 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1403 goto out;
1404 if (unlock_record(tdb, tl.off) != 0)
1405 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1406 goto out;
1408 key.dsize = rec.key_len;
1409 dbuf.dptr = key.dptr + rec.key_len;
1410 dbuf.dsize = rec.data_len;
1412 /* Drop chain lock, call out */
1413 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1414 ret = -1;
1415 goto out;
1417 if (fn && fn(tdb, key, dbuf, private)) {
1418 /* They want us to terminate traversal */
1419 ret = count;
1420 if (unlock_record(tdb, tl.off) != 0) {
1421 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1422 ret = -1;
1424 tdb->travlocks.next = tl.next;
1425 SAFE_FREE(key.dptr);
1426 return count;
1428 SAFE_FREE(key.dptr);
1430 out:
1431 tdb->travlocks.next = tl.next;
1432 if (ret < 0)
1433 return -1;
1434 else
1435 return count;
1438 /* find the first entry in the database and return its key */
1439 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1441 TDB_DATA key;
1442 struct list_struct rec;
1444 /* release any old lock */
1445 if (unlock_record(tdb, tdb->travlocks.off) != 0)
1446 return tdb_null;
1447 tdb->travlocks.off = tdb->travlocks.hash = 0;
1449 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1450 return tdb_null;
1451 /* now read the key */
1452 key.dsize = rec.key_len;
1453 key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1454 if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1455 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1456 return key;
1459 /* find the next entry in the database, returning its key */
1460 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1462 u32 oldhash;
1463 TDB_DATA key = tdb_null;
1464 struct list_struct rec;
1465 char *k = NULL;
1467 /* Is locked key the old key? If so, traverse will be reliable. */
1468 if (tdb->travlocks.off) {
1469 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1470 return tdb_null;
1471 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1472 || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1473 rec.key_len))
1474 || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1475 /* No, it wasn't: unlock it and start from scratch */
1476 if (unlock_record(tdb, tdb->travlocks.off) != 0)
1477 return tdb_null;
1478 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1479 return tdb_null;
1480 tdb->travlocks.off = 0;
1483 SAFE_FREE(k);
1486 if (!tdb->travlocks.off) {
1487 /* No previous element: do normal find, and lock record */
1488 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1489 if (!tdb->travlocks.off)
1490 return tdb_null;
1491 tdb->travlocks.hash = BUCKET(rec.full_hash);
1492 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1493 TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1494 return tdb_null;
1497 oldhash = tdb->travlocks.hash;
1499 /* Grab next record: locks chain and returned record,
1500 unlocks old record */
1501 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1502 key.dsize = rec.key_len;
1503 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1504 key.dsize);
1505 /* Unlock the chain of this new record */
1506 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1507 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1509 /* Unlock the chain of old record */
1510 if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1511 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1512 return key;
1515 /* delete an entry in the database given a key */
1516 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1518 tdb_off rec_ptr;
1519 struct list_struct rec;
1520 int ret;
1522 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1523 return -1;
1524 ret = do_delete(tdb, rec_ptr, &rec);
1525 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1526 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1527 return ret;
1530 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1532 u32 hash = tdb->hash_fn(&key);
1533 return tdb_delete_hash(tdb, key, hash);
1536 /* store an element in the database, replacing any existing element
1537 with the same key
1539 return 0 on success, -1 on failure
1540 */
1541 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1543 struct list_struct rec;
1544 u32 hash;
1545 tdb_off rec_ptr;
1546 char *p = NULL;
1547 int ret = 0;
1549 /* find which hash bucket it is in */
1550 hash = tdb->hash_fn(&key);
1551 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1552 return -1;
1554 /* check for it existing, on insert. */
1555 if (flag == TDB_INSERT) {
1556 if (tdb_exists_hash(tdb, key, hash)) {
1557 tdb->ecode = TDB_ERR_EXISTS;
1558 goto fail;
1560 } else {
1561 /* first try in-place update, on modify or replace. */
1562 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1563 goto out;
1564 if (tdb->ecode == TDB_ERR_NOEXIST &&
1565 flag == TDB_MODIFY) {
1566 /* if the record doesn't exist and we are in TDB_MODIFY mode then
1567 we should fail the store */
1568 goto fail;
1571 /* reset the error code potentially set by the tdb_update() */
1572 tdb->ecode = TDB_SUCCESS;
1574 /* delete any existing record - if it doesn't exist we don't
1575 care. Doing this first reduces fragmentation, and avoids
1576 coalescing with `allocated' block before it's updated. */
1577 if (flag != TDB_INSERT)
1578 tdb_delete_hash(tdb, key, hash);
1580 /* Copy key+value *before* allocating free space in case malloc
1581 fails and we are left with a dead spot in the tdb. */
1583 if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
1584 tdb->ecode = TDB_ERR_OOM;
1585 goto fail;
1588 memcpy(p, key.dptr, key.dsize);
1589 if (dbuf.dsize)
1590 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1592 /* we have to allocate some space */
1593 if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1594 goto fail;
1596 /* Read hash top into next ptr */
1597 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1598 goto fail;
1600 rec.key_len = key.dsize;
1601 rec.data_len = dbuf.dsize;
1602 rec.full_hash = hash;
1603 rec.magic = TDB_MAGIC;
1605 /* write out and point the top of the hash chain at it */
1606 if (rec_write(tdb, rec_ptr, &rec) == -1
1607 || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1608 || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1609 /* Need to tdb_unallocate() here */
1610 goto fail;
1612 out:
1613 SAFE_FREE(p);
1614 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1615 return ret;
1616 fail:
1617 ret = -1;
1618 goto out;
1621 /* Attempt to append data to an entry in place - this only works if the new data size
1622 is <= the old data size and the key exists.
1623 on failure return -1. Record must be locked before calling.
1624 */
1625 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1627 struct list_struct rec;
1628 tdb_off rec_ptr;
1630 /* find entry */
1631 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1632 return -1;
1634 /* Append of 0 is always ok. */
1635 if (new_dbuf.dsize == 0)
1636 return 0;
1638 /* must be long enough for key, old data + new data and tailer */
1639 if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1640 /* No room. */
1641 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1642 return -1;
1645 if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1646 new_dbuf.dptr, new_dbuf.dsize) == -1)
1647 return -1;
1649 /* update size */
1650 rec.data_len += new_dbuf.dsize;
1651 return rec_write(tdb, rec_ptr, &rec);
1654 /* Append to an entry. Create if not exist. */
1656 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1658 struct list_struct rec;
1659 u32 hash;
1660 tdb_off rec_ptr;
1661 char *p = NULL;
1662 int ret = 0;
1663 size_t new_data_size = 0;
1665 /* find which hash bucket it is in */
1666 hash = tdb->hash_fn(&key);
1667 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1668 return -1;
1670 /* first try in-place. */
1671 if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1672 goto out;
1674 /* reset the error code potentially set by the tdb_append_inplace() */
1675 tdb->ecode = TDB_SUCCESS;
1677 /* find entry */
1678 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1679 if (tdb->ecode != TDB_ERR_NOEXIST)
1680 goto fail;
1682 /* Not found - create. */
1684 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1685 goto out;
1688 new_data_size = rec.data_len + new_dbuf.dsize;
1690 /* Copy key+old_value+value *before* allocating free space in case malloc
1691 fails and we are left with a dead spot in the tdb. */
1693 if (!(p = (char *)talloc_size(tdb, key.dsize + new_data_size))) {
1694 tdb->ecode = TDB_ERR_OOM;
1695 goto fail;
1698 /* Copy the key in place. */
1699 memcpy(p, key.dptr, key.dsize);
1701 /* Now read the old data into place. */
1702 if (rec.data_len &&
1703 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1704 goto fail;
1706 /* Finally append the new data. */
1707 if (new_dbuf.dsize)
1708 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1710 /* delete any existing record - if it doesn't exist we don't
1711 care. Doing this first reduces fragmentation, and avoids
1712 coalescing with `allocated' block before it's updated. */
1714 tdb_delete_hash(tdb, key, hash);
1716 if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1717 goto fail;
1719 /* Read hash top into next ptr */
1720 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1721 goto fail;
1723 rec.key_len = key.dsize;
1724 rec.data_len = new_data_size;
1725 rec.full_hash = hash;
1726 rec.magic = TDB_MAGIC;
1728 /* write out and point the top of the hash chain at it */
1729 if (rec_write(tdb, rec_ptr, &rec) == -1
1730 || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1731 || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1732 /* Need to tdb_unallocate() here */
1733 goto fail;
1736 out:
1737 SAFE_FREE(p);
1738 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1739 return ret;
1741 fail:
1742 ret = -1;
1743 goto out;
1746 static int tdb_already_open(dev_t device,
1747 ino_t ino)
1749 TDB_CONTEXT *i;
1751 for (i = tdbs; i; i = i->next) {
1752 if (i->device == device && i->inode == ino) {
1753 return 1;
1757 return 0;
1760 /* open the database, creating it if necessary
1762 The open_flags and mode are passed straight to the open call on the
1763 database file. A flags value of O_WRONLY is invalid. The hash size
1764 is advisory, use zero for a default value.
1766 Return is NULL on error, in which case errno is also set. Don't
1767 try to call tdb_error or tdb_errname, just do strerror(errno).
1769 @param name may be NULL for internal databases. */
1770 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1771 int open_flags, mode_t mode)
1773 return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1776 /* a default logging function */
1777 static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
1778 int level __attribute__((unused)),
1779 const char *fmt __attribute__((unused)), ...)
1784 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1785 int open_flags, mode_t mode,
1786 tdb_log_func log_fn,
1787 tdb_hash_func hash_fn)
1789 TDB_CONTEXT *tdb;
1790 struct stat st;
1791 int rev = 0, locked = 0;
1792 uint8_t *vp;
1793 u32 vertest;
1795 if (!(tdb = talloc_zero(name, TDB_CONTEXT))) {
1796 /* Can't log this */
1797 errno = ENOMEM;
1798 goto fail;
1800 tdb->fd = -1;
1801 tdb->name = NULL;
1802 tdb->map_ptr = NULL;
1803 tdb->flags = tdb_flags;
1804 tdb->open_flags = open_flags;
1805 tdb->log_fn = log_fn?log_fn:null_log_fn;
1806 tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1808 if ((open_flags & O_ACCMODE) == O_WRONLY) {
1809 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1810 name));
1811 errno = EINVAL;
1812 goto fail;
1815 if (hash_size == 0)
1816 hash_size = DEFAULT_HASH_SIZE;
1817 if ((open_flags & O_ACCMODE) == O_RDONLY) {
1818 tdb->read_only = 1;
1819 /* read only databases don't do locking or clear if first */
1820 tdb->flags |= TDB_NOLOCK;
1821 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1824 /* internal databases don't mmap or lock, and start off cleared */
1825 if (tdb->flags & TDB_INTERNAL) {
1826 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1827 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1828 if (tdb_new_database(tdb, hash_size) != 0) {
1829 TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1830 goto fail;
1832 goto internal;
1835 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1836 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1837 name, strerror(errno)));
1838 goto fail; /* errno set by open(2) */
1841 /* ensure there is only one process initialising at once */
1842 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1843 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1844 name, strerror(errno)));
1845 goto fail; /* errno set by tdb_brlock */
1848 /* we need to zero database if we are the only one with it open */
1849 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1850 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1851 open_flags |= O_CREAT;
1852 if (ftruncate(tdb->fd, 0) == -1) {
1853 TDB_LOG((tdb, 0, "tdb_open_ex: "
1854 "failed to truncate %s: %s\n",
1855 name, strerror(errno)));
1856 goto fail; /* errno set by ftruncate */
1860 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1861 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1862 || (tdb->header.version != TDB_VERSION
1863 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1864 /* its not a valid database - possibly initialise it */
1865 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1866 errno = EIO; /* ie bad format or something */
1867 goto fail;
1869 rev = (tdb->flags & TDB_CONVERT);
1871 vp = (uint8_t *)&tdb->header.version;
1872 vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1873 (((u32)vp[2]) << 8) | (u32)vp[3];
1874 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1875 if (!rev)
1876 tdb->flags &= ~TDB_CONVERT;
1877 else {
1878 tdb->flags |= TDB_CONVERT;
1879 convert(&tdb->header, sizeof(tdb->header));
1881 if (fstat(tdb->fd, &st) == -1)
1882 goto fail;
1884 /* Is it already in the open list? If so, fail. */
1885 if (tdb_already_open(st.st_dev, st.st_ino)) {
1886 TDB_LOG((tdb, 2, "tdb_open_ex: "
1887 "%s (%d,%d) is already open in this process\n",
1888 name, (int)st.st_dev, (int)st.st_ino));
1889 errno = EBUSY;
1890 goto fail;
1893 if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
1894 errno = ENOMEM;
1895 goto fail;
1898 tdb->map_size = st.st_size;
1899 tdb->device = st.st_dev;
1900 tdb->inode = st.st_ino;
1901 tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
1902 tdb->header.hash_size+1);
1903 if (!tdb->locked) {
1904 TDB_LOG((tdb, 2, "tdb_open_ex: "
1905 "failed to allocate lock structure for %s\n",
1906 name));
1907 errno = ENOMEM;
1908 goto fail;
1910 tdb_mmap(tdb);
1911 if (locked) {
1912 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1913 TDB_LOG((tdb, 0, "tdb_open_ex: "
1914 "failed to take ACTIVE_LOCK on %s: %s\n",
1915 name, strerror(errno)));
1916 goto fail;
1921 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1922 we didn't get the initial exclusive lock as we need to let all other
1923 users know we're using it. */
1925 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1926 /* leave this lock in place to indicate it's in use */
1927 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1928 goto fail;
1932 internal:
1933 /* Internal (memory-only) databases skip all the code above to
1934 * do with disk files, and resume here by releasing their
1935 * global lock and hooking into the active list. */
1936 if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1937 goto fail;
1938 tdb->next = tdbs;
1939 tdbs = tdb;
1940 return tdb;
1942 fail:
1943 { int save_errno = errno;
1945 if (!tdb)
1946 return NULL;
1948 if (tdb->map_ptr) {
1949 if (tdb->flags & TDB_INTERNAL)
1950 SAFE_FREE(tdb->map_ptr);
1951 else
1952 tdb_munmap(tdb);
1954 SAFE_FREE(tdb->name);
1955 if (tdb->fd != -1)
1956 if (close(tdb->fd) != 0)
1957 TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1958 SAFE_FREE(tdb->locked);
1959 SAFE_FREE(tdb);
1960 errno = save_errno;
1961 return NULL;
1965 /**
1966 * Close a database.
1968 * @returns -1 for error; 0 for success.
1969 **/
1970 int tdb_close(TDB_CONTEXT *tdb)
1972 TDB_CONTEXT **i;
1973 int ret = 0;
1975 if (tdb->map_ptr) {
1976 if (tdb->flags & TDB_INTERNAL)
1977 SAFE_FREE(tdb->map_ptr);
1978 else
1979 tdb_munmap(tdb);
1981 SAFE_FREE(tdb->name);
1982 if (tdb->fd != -1)
1983 ret = close(tdb->fd);
1984 SAFE_FREE(tdb->locked);
1986 /* Remove from contexts list */
1987 for (i = &tdbs; *i; i = &(*i)->next) {
1988 if (*i == tdb) {
1989 *i = tdb->next;
1990 break;
1994 memset(tdb, 0, sizeof(*tdb));
1995 SAFE_FREE(tdb);
1997 return ret;
2000 /* lock/unlock entire database */
2001 int tdb_lockall(TDB_CONTEXT *tdb)
2003 u32 i;
2005 /* There are no locks on read-only dbs */
2006 if (tdb->read_only)
2007 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
2008 for (i = 0; i < tdb->header.hash_size; i++)
2009 if (tdb_lock(tdb, i, F_WRLCK))
2010 break;
2012 /* If error, release locks we have... */
2013 if (i < tdb->header.hash_size) {
2014 u32 j;
2016 for ( j = 0; j < i; j++)
2017 tdb_unlock(tdb, j, F_WRLCK);
2018 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
2021 return 0;
2023 void tdb_unlockall(TDB_CONTEXT *tdb)
2025 u32 i;
2026 for (i=0; i < tdb->header.hash_size; i++)
2027 tdb_unlock(tdb, i, F_WRLCK);
2030 /* lock/unlock one hash chain. This is meant to be used to reduce
2031 contention - it cannot guarantee how many records will be locked */
2032 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
2034 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
2037 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
2039 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
2042 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2044 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
2047 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2049 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
2053 /* register a loging function */
2054 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
2056 tdb->log_fn = fn?fn:null_log_fn;
2060 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
2061 seek pointer from our parent and to re-establish locks */
2062 int tdb_reopen(TDB_CONTEXT *tdb)
2064 struct stat st;
2066 if (tdb->flags & TDB_INTERNAL)
2067 return 0; /* Nothing to do. */
2068 if (tdb_munmap(tdb) != 0) {
2069 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2070 goto fail;
2072 if (close(tdb->fd) != 0)
2073 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2074 tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2075 if (tdb->fd == -1) {
2076 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2077 goto fail;
2079 if (fstat(tdb->fd, &st) != 0) {
2080 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2081 goto fail;
2083 if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2084 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2085 goto fail;
2087 tdb_mmap(tdb);
2088 if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
2089 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2090 goto fail;
2093 return 0;
2095 fail:
2096 tdb_close(tdb);
2097 return -1;
2100 /* Not general: only works if single writer. */
2101 TDB_CONTEXT *tdb_copy(TDB_CONTEXT *tdb, const char *outfile)
2103 int fd, saved_errno;
2104 TDB_CONTEXT *copy;
2106 fd = open(outfile, O_TRUNC|O_CREAT|O_WRONLY, 0640);
2107 if (fd < 0)
2108 return NULL;
2109 if (tdb->map_ptr) {
2110 if (write(fd,tdb->map_ptr,tdb->map_size) != (int)tdb->map_size)
2111 goto fail;
2112 } else {
2113 char buf[65536];
2114 int r;
2116 lseek(tdb->fd, 0, SEEK_SET);
2117 while ((r = read(tdb->fd, buf, sizeof(buf))) > 0) {
2118 if (write(fd, buf, r) != r)
2119 goto fail;
2121 if (r < 0)
2122 goto fail;
2124 copy = tdb_open(outfile, 0, 0, O_RDWR, 0);
2125 if (!copy)
2126 goto fail;
2127 close(fd);
2128 return copy;
2130 fail:
2131 saved_errno = errno;
2132 close(fd);
2133 unlink(outfile);
2134 errno = saved_errno;
2135 return NULL;
2138 /* reopen all tdb's */
2139 int tdb_reopen_all(void)
2141 TDB_CONTEXT *tdb;
2143 for (tdb=tdbs; tdb; tdb = tdb->next) {
2144 /* Ensure no clear-if-first. */
2145 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2146 if (tdb_reopen(tdb) != 0)
2147 return -1;
2150 return 0;