#include "xenstored_control.h"
#include "tdb.h"
-#include "hashtable.h"
-
#ifndef NO_SOCKETS
#if defined(HAVE_SYSTEMD)
#define XEN_SYSTEMD_ENABLED 1
static int reopen_log_pipe[2];
static int reopen_log_pipe0_pollfd_idx = -1;
char *tracefile = NULL;
-static TDB_CONTEXT *tdb_ctx = NULL;
+TDB_CONTEXT *tdb_ctx = NULL;
-static void corrupt(struct connection *conn, const char *fmt, ...);
static const char *sockmsg_string(enum xsd_sockmsg_type type);
#define log(...) \
int quota_max_entry_size = 2048; /* 2K */
int quota_max_transaction = 10;
-TDB_CONTEXT *tdb_context(struct connection *conn)
-{
- /* conn = NULL used in manual_node at setup. */
- if (!conn || !conn->transaction)
- return tdb_ctx;
- return tdb_transaction_context(conn->transaction);
-}
-
-bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
-{
- if (!(tdb_ctx->flags & TDB_INTERNAL))
- if (rename(newname, xs_daemon_tdb()) != 0)
- return false;
- tdb_close(tdb_ctx);
- tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
- return true;
-}
-
void trace(const char *fmt, ...)
{
va_list arglist;
TDB_DATA key, data;
struct xs_tdb_record_hdr *hdr;
struct node *node;
- TDB_CONTEXT * context = tdb_context(conn);
-
- key.dptr = (void *)name;
- key.dsize = strlen(name);
- data = tdb_fetch(context, key);
-
- if (data.dptr == NULL) {
- if (tdb_error(context) == TDB_ERR_NOEXIST)
- errno = ENOENT;
- else {
- log("TDB error on read: %s", tdb_errorstr(context));
- errno = EIO;
- }
- return NULL;
- }
node = talloc(ctx, struct node);
if (!node) {
errno = ENOMEM;
return NULL;
}
+
+ if (transaction_prepend(conn, name, &key))
+ return NULL;
+
+ data = tdb_fetch(tdb_ctx, key);
+
+ if (data.dptr == NULL) {
+ if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) {
+ node->generation = NO_GENERATION;
+ access_node(conn, node, NODE_ACCESS_READ, NULL);
+ errno = ENOENT;
+ } else {
+ log("TDB error on read: %s", tdb_errorstr(tdb_ctx));
+ errno = EIO;
+ }
+ talloc_free(node);
+ return NULL;
+ }
+
node->parent = NULL;
- node->tdb = tdb_context(conn);
talloc_steal(node, data.dptr);
/* Datalen, childlen, number of permissions */
/* Children is strings, nul separated. */
node->children = node->data + node->datalen;
+ access_node(conn, node, NODE_ACCESS_READ, NULL);
+
return node;
}
-static int write_node(struct connection *conn, struct node *node)
+int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node)
{
- /*
- * conn will be null when this is called from manual_node.
- * tdb_context copes with this.
- */
-
- TDB_DATA key, data;
+ TDB_DATA data;
void *p;
struct xs_tdb_record_hdr *hdr;
- key.dptr = (void *)node->name;
- key.dsize = strlen(node->name);
-
data.dsize = sizeof(*hdr)
+ node->num_perms*sizeof(node->perms[0])
+ node->datalen + node->childlen;
- if (domain_is_unprivileged(conn) && data.dsize >= quota_max_entry_size)
- goto error;
-
- add_change_node(conn, node, false);
- wrl_apply_debit_direct(conn);
+ if (domain_is_unprivileged(conn) &&
+ data.dsize >= quota_max_entry_size) {
+ errno = ENOSPC;
+ return errno;
+ }
data.dptr = talloc_size(node, data.dsize);
hdr = (void *)data.dptr;
memcpy(p, node->children, node->childlen);
/* TDB should set errno, but doesn't even set ecode AFAICT. */
- if (tdb_store(tdb_context(conn), key, data, TDB_REPLACE) != 0) {
- corrupt(conn, "Write of %s failed", key.dptr);
- goto error;
+ if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) {
+ corrupt(conn, "Write of %s failed", key->dptr);
+ errno = EIO;
+ return errno;
}
return 0;
- error:
- errno = ENOSPC;
- return errno;
+}
+
+static int write_node(struct connection *conn, struct node *node)
+{
+ TDB_DATA key;
+
+ if (access_node(conn, node, NODE_ACCESS_WRITE, &key))
+ return errno;
+
+ return write_node_raw(conn, &key, node);
}
static enum xs_perm_type perm_for_conn(struct connection *conn,
return 0;
}
-static void delete_node_single(struct connection *conn, struct node *node,
- bool changed)
+static void delete_node_single(struct connection *conn, struct node *node)
{
TDB_DATA key;
- key.dptr = (void *)node->name;
- key.dsize = strlen(node->name);
+ if (access_node(conn, node, NODE_ACCESS_DELETE, &key))
+ return;
- if (tdb_delete(tdb_context(conn), key) != 0) {
+ if (tdb_delete(tdb_ctx, key) != 0) {
corrupt(conn, "Could not delete '%s'", node->name);
return;
}
- if (changed) {
- add_change_node(conn, node, true);
- wrl_apply_debit_direct(conn);
- }
-
domain_entry_dec(conn, node);
}
node = talloc(ctx, struct node);
if (!node)
goto nomem;
- node->tdb = tdb_context(conn);
node->name = talloc_strdup(node, name);
if (!node->name)
goto nomem;
key.dptr = (void *)node->name;
key.dsize = strlen(node->name);
- tdb_delete(node->tdb, key);
+ tdb_delete(tdb_ctx, key);
return 0;
}
return 0;
}
-static void delete_node(struct connection *conn, struct node *node,
- bool changed)
+static void delete_node(struct connection *conn, struct node *node)
{
unsigned int i;
char *name;
/* Delete self, then delete children. If we crash, then the worst
that can happen is the children will continue to take up space, but
will otherwise be unreachable. */
- delete_node_single(conn, node, changed);
+ delete_node_single(conn, node);
/* Delete children, too. */
for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
node->children + i);
child = name ? read_node(conn, node, name) : NULL;
if (child) {
- delete_node(conn, child, false);
+ delete_node(conn, child);
}
else {
trace("delete_node: Error deleting child '%s/%s'!\n",
if (delete_child(conn, parent, basename(name)))
return EINVAL;
- delete_node(conn, node, true);
+ delete_node(conn, node);
return 0;
}
}
-static int remember_string(struct hashtable *hash, const char *str)
+int remember_string(struct hashtable *hash, const char *str)
{
char *k = malloc(strlen(str) + 1);
void *private)
{
struct hashtable *reachable = private;
+ char *slash;
char * name = talloc_strndup(NULL, key.dptr, key.dsize);
if (!name) {
return 1;
}
+ if (name[0] != '/') {
+ slash = strchr(name, '/');
+ if (slash)
+ *slash = 0;
+ }
if (!hashtable_search(reachable, name)) {
log("clean_store: '%s' is orphaned!", name);
if (recovery) {
}
log("Checking store ...");
- if (!check_store_(root, reachable))
+ if (!check_store_(root, reachable) &&
+ !check_transactions(reachable))
clean_store(reachable);
log("Checking store complete.");
/* Something is horribly wrong: check the store. */
-static void corrupt(struct connection *conn, const char *fmt, ...)
+void corrupt(struct connection *conn, const char *fmt, ...)
{
va_list arglist;
char *str;
along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
+#include <inttypes.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "xenstore_lib.h"
#include "utils.h"
-struct changed_node
+/*
+ * Some notes regarding detection and handling of transaction conflicts:
+ *
+ * Basic source of reference is the 'generation' count. Each writing access
+ * (either normal write or in a transaction) to the tdb data base will set
+ * the node specific generation count to the global generation count.
+ * For being able to identify a transaction the transaction specific generation
+ * count is initialized with the global generation count when starting the
+ * transaction.
+ * Each time the global generation count is copied to either a node or a
+ * transaction it is incremented. This ensures all nodes and/or transactions
+ * are having a unique generation count.
+ *
+ * Transaction conflicts are detected by checking the generation count of all
+ * nodes read in the transaction to match with the generation count in the
+ * global data base at the end of the transaction. Nodes which have been
+ * modified in the transaction don't have to be checked to match even if they
+ * have been read, as the modified node will be globally visible after the
+ * succeeded transaction possibly overwriting another modification which may
+ * have occurred concurrent to the transaction.
+ *
+ * Examples:
+ * ---------
+ * The following notation is used:
+ * I: initial state
+ * G global generation count
+ * g(X) generation count of node X
+ * G(1) generation count of transaction 1
+ * g(1:Y) saved generation count of node Y in transaction 1
+ * TA1: operation in transaction 1
+ * X=1:X replace global node X with transaction 1 specific value of X
+ *
+ * 1. Simple transaction doing: read node A, write node B
+ * I: g(A) = 1, g(B) = 2, G = 3
+ * Start transaction 1: G(1) = 3, G = 4
+ * TA1: read node A: g(1:A) = 1
+ * TA1: write node B: g(1:B) = 4, G = 5
+ * End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 5, G = 6
+ *
+ * 2. Transaction with conflicting write
+ * I: g(A) = 1, g(B) = 2, G = 3
+ * Start transaction 1: G(1) = 3, G = 4
+ * TA1: read node A: g(1:A) = 1
+ * write node A: g(A) = 4, G = 5
+ * TA1: write node B: g(1:B) = 5, G = 6
+ * End TA1: g(1:A) != g(A) => EAGAIN
+ *
+ * 3. Transaction with conflicting delete
+ * I: g(A) = 1, g(B) = 2, G = 3
+ * Start transaction 1: G(1) = 3, G = 4
+ * TA1: read node A: g(1:A) = 1
+ * delete node A: g(A) = ~0
+ * TA1: write node B: g(1:B) = 4, G = 5
+ * End TA1: g(1:A) != g(A) => EAGAIN
+ *
+ * 4. Two interfering transactions
+ * I: g(A) = 1, g(B) = 2, G = 3
+ * Start transaction 1: G(1) = 3, G = 4
+ * Start transaction 2: G(2) = 4, G = 5
+ * TA1: read node A: g(1:A) = 1
+ * TA2: read node B: g(2:B) = 2
+ * TA1: write node B: g(1:B) = 5, G = 6
+ * TA2: write node A: g(2:A) = 6, G = 7
+ * End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 7, G = 8
+ * End TA2: g(2:B) != g(B) => EAGAIN
+ */
+
+struct accessed_node
{
/* List of all changed nodes in the context of this transaction. */
struct list_head list;
/* The name of the node. */
char *node;
- /* And the children? (ie. rm) */
- bool recurse;
+ /* Generation count (or NO_GENERATION) for conflict checking. */
+ uint64_t generation;
+
+ /* Generation count checking required? */
+ bool check_gen;
+
+ /* Modified? */
+ bool modified;
+
+ /* Transaction node in data base? */
+ bool ta_node;
};
struct changed_domain
/* Generation when transaction started. */
uint64_t generation;
- /* Transaction internal generation. */
- uint64_t trans_gen;
-
- /* TDB to work on, and filename */
- TDB_CONTEXT *tdb;
- char *tdb_name;
-
- /* List of changed nodes. */
- struct list_head changes;
+ /* List of accessed nodes. */
+ struct list_head accessed;
/* List of changed domains - to record the changed domain entry number */
struct list_head changed_domains;
+
+ /* Flag for letting transaction fail. */
+ bool fail;
};
extern int quota_max_transaction;
static uint64_t generation;
-/* Return tdb context to use for this connection. */
-TDB_CONTEXT *tdb_transaction_context(struct transaction *trans)
+static void set_tdb_key(const char *name, TDB_DATA *key)
+{
+ key->dptr = (char *)name;
+ key->dsize = strlen(name);
+}
+
+static struct accessed_node *find_accessed_node(struct transaction *trans,
+ const char *name)
+{
+ struct accessed_node *i;
+
+ list_for_each_entry(i, &trans->accessed, list)
+ if (streq(i->node, name))
+ return i;
+
+ return NULL;
+}
+
+static char *transaction_get_node_name(void *ctx, struct transaction *trans,
+ const char *name)
+{
+ return talloc_asprintf(ctx, "%"PRIu64"/%s", trans->generation, name);
+}
+
+/*
+ * Prepend the transaction to name if node has been modified in the current
+ * transaction.
+ */
+int transaction_prepend(struct connection *conn, const char *name,
+ TDB_DATA *key)
{
- return trans->tdb;
+ char *tdb_name;
+
+ if (!conn || !conn->transaction ||
+ !find_accessed_node(conn->transaction, name)) {
+ set_tdb_key(name, key);
+ return 0;
+ }
+
+ tdb_name = transaction_get_node_name(conn->transaction,
+ conn->transaction, name);
+ if (!tdb_name)
+ return errno;
+
+ set_tdb_key(tdb_name, key);
+
+ return 0;
}
-/* Callers get a change node (which can fail) and only commit after they've
- * finished. This way they don't have to unwind eg. a write. */
-void add_change_node(struct connection *conn, struct node *node, bool recurse)
+/*
+ * A node has been accessed.
+ *
+ * Modifying accesses (write, delete) always update the generation (global and
+ * node->generation).
+ *
+ * Accesses in a transaction will be added to the list of accessed nodes
+ * if not already done. Read type accesses will copy the node to the
+ * transaction specific data base part, write type accesses go there
+ * anyway.
+ *
+ * If not NULL, key will be supplied with name and length of name of the node
+ * to be accessed in the data base.
+ */
+int access_node(struct connection *conn, struct node *node,
+ enum node_access_type type, TDB_DATA *key)
{
- struct changed_node *i;
+ struct accessed_node *i = NULL;
struct transaction *trans;
+ TDB_DATA local_key;
+ const char *trans_name = NULL;
+ int ret;
+ bool introduce = false;
+
+ if (type != NODE_ACCESS_READ) {
+ node->generation = generation++;
+ if (conn && !conn->transaction)
+ wrl_apply_debit_direct(conn);
+ }
if (!conn || !conn->transaction) {
/* They're changing the global database. */
- node->generation = generation++;
- return;
+ if (key)
+ set_tdb_key(node->name, key);
+ return 0;
}
trans = conn->transaction;
- node->generation = generation + trans->trans_gen++;
+ trans_name = transaction_get_node_name(node, trans, node->name);
+ if (!trans_name)
+ goto nomem;
- list_for_each_entry(i, &trans->changes, list) {
- if (streq(i->node, node->name)) {
- if (recurse)
- i->recurse = recurse;
- return;
+ i = find_accessed_node(trans, node->name);
+ if (!i) {
+ i = talloc_zero(trans, struct accessed_node);
+ if (!i)
+ goto nomem;
+ i->node = talloc_strdup(i, node->name);
+ if (!i->node)
+ goto nomem;
+
+ introduce = true;
+ i->ta_node = false;
+
+ /*
+ * Additional transaction-specific node for read type. We only
+ * have to verify read nodes if we didn't write them.
+ *
+ * The node is created and written to DB here to distinguish
+ * from the write types.
+ */
+ if (type == NODE_ACCESS_READ) {
+ i->generation = node->generation;
+ i->check_gen = true;
+ if (node->generation != NO_GENERATION) {
+ set_tdb_key(trans_name, &local_key);
+ ret = write_node_raw(conn, &local_key, node);
+ if (ret)
+ goto err;
+ i->ta_node = true;
+ }
}
+ list_add_tail(&i->list, &trans->accessed);
}
- i = talloc(trans, struct changed_node);
- if (!i) {
- /* All we can do is let the transaction fail. */
- generation++;
- return;
+ if (type != NODE_ACCESS_READ)
+ i->modified = true;
+
+ if (introduce && type == NODE_ACCESS_DELETE)
+ /* Nothing to delete. */
+ return -1;
+
+ if (key) {
+ set_tdb_key(trans_name, key);
+ if (type == NODE_ACCESS_WRITE)
+ i->ta_node = true;
+ if (type == NODE_ACCESS_DELETE)
+ i->ta_node = false;
+ }
+
+ return 0;
+
+nomem:
+ ret = ENOMEM;
+err:
+ talloc_free((void *)trans_name);
+ talloc_free(i);
+ trans->fail = true;
+ errno = ret;
+ return ret;
+}
+
+/*
+ * Finalize transaction:
+ * Walk through accessed nodes and check generation against global data.
+ * If all entries match, read the transaction entries and write them without
+ * transaction prepended. Delete all transaction specific nodes in the data
+ * base.
+ */
+static int finalize_transaction(struct connection *conn,
+ struct transaction *trans)
+{
+ struct accessed_node *i;
+ TDB_DATA key, ta_key, data;
+ struct xs_tdb_record_hdr *hdr;
+ uint64_t gen;
+ char *trans_name;
+ int ret;
+
+ list_for_each_entry(i, &trans->accessed, list) {
+ if (!i->check_gen)
+ continue;
+
+ set_tdb_key(i->node, &key);
+ data = tdb_fetch(tdb_ctx, key);
+ hdr = (void *)data.dptr;
+ if (!data.dptr) {
+ if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST)
+ return EIO;
+ gen = NO_GENERATION;
+ } else
+ gen = hdr->generation;
+ talloc_free(data.dptr);
+ if (i->generation != gen)
+ return EAGAIN;
}
- i->node = talloc_strdup(i, node->name);
- if (!i->node) {
- /* All we can do is let the transaction fail. */
- generation++;
+
+ while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
+ trans_name = transaction_get_node_name(i, trans, i->node);
+ if (!trans_name)
+ /* We are doomed: the transaction is only partial. */
+ goto err;
+
+ set_tdb_key(trans_name, &ta_key);
+
+ if (i->modified) {
+ set_tdb_key(i->node, &key);
+ if (i->ta_node) {
+ data = tdb_fetch(tdb_ctx, ta_key);
+ if (!data.dptr)
+ goto err;
+ hdr = (void *)data.dptr;
+ hdr->generation = generation++;
+ ret = tdb_store(tdb_ctx, key, data,
+ TDB_REPLACE);
+ talloc_free(data.dptr);
+ if (ret)
+ goto err;
+ } else if (tdb_delete(tdb_ctx, key))
+ goto err;
+ fire_watches(conn, trans, i->node, false);
+ }
+
+ if (i->ta_node && tdb_delete(tdb_ctx, ta_key))
+ goto err;
+ list_del(&i->list);
talloc_free(i);
- return;
}
- i->recurse = recurse;
- list_add_tail(&i->list, &trans->changes);
+
+ return 0;
+
+err:
+ corrupt(conn, "Partial transaction");
+ return EIO;
}
static int destroy_transaction(void *_transaction)
{
struct transaction *trans = _transaction;
+ struct accessed_node *i;
+ char *trans_name;
+ TDB_DATA key;
wrl_ntransactions--;
trace_destroy(trans, "transaction");
- if (trans->tdb)
- tdb_close(trans->tdb);
- unlink(trans->tdb_name);
+ while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
+ if (i->ta_node) {
+ trans_name = transaction_get_node_name(i, trans,
+ i->node);
+ if (trans_name) {
+ set_tdb_key(trans_name, &key);
+ tdb_delete(tdb_ctx, key);
+ }
+ }
+ list_del(&i->list);
+ talloc_free(i);
+ }
+
return 0;
}
if (!trans)
return ENOMEM;
- INIT_LIST_HEAD(&trans->changes);
+ INIT_LIST_HEAD(&trans->accessed);
INIT_LIST_HEAD(&trans->changed_domains);
- trans->generation = generation;
- trans->tdb_name = talloc_asprintf(trans, "%s.%p",
- xs_daemon_tdb(), trans);
- if (!trans->tdb_name)
- return ENOMEM;
- trans->tdb = tdb_copy(tdb_context(conn), trans->tdb_name);
- if (!trans->tdb)
- return errno;
- /* Make it close if we go away. */
- talloc_steal(trans, trans->tdb);
+ trans->fail = false;
+ trans->generation = generation++;
/* Pick an unused transaction identifier. */
do {
return 0;
}
+static int transaction_fix_domains(struct transaction *trans, bool update)
+{
+ struct changed_domain *d;
+ int cnt;
+
+ list_for_each_entry(d, &trans->changed_domains, list) {
+ cnt = domain_entry_fix(d->domid, d->nbentry, update);
+ if (!update && cnt >= quota_nb_entry_per_domain)
+ return ENOSPC;
+ }
+
+ return 0;
+}
+
int do_transaction_end(struct connection *conn, struct buffered_data *in)
{
const char *arg = onearg(in);
- struct changed_node *i;
- struct changed_domain *d;
struct transaction *trans;
+ int ret;
if (!arg || (!streq(arg, "T") && !streq(arg, "F")))
return EINVAL;
talloc_steal(in, trans);
if (streq(arg, "T")) {
- /* FIXME: Merge, rather failing on any change. */
- if (trans->generation != generation)
+ if (trans->fail)
+ return ENOMEM;
+ ret = transaction_fix_domains(trans, false);
+ if (ret)
+ return ret;
+ if (finalize_transaction(conn, trans))
return EAGAIN;
wrl_apply_debit_trans_commit(conn);
- if (!replace_tdb(trans->tdb_name, trans->tdb))
- return errno;
- /* Don't close this: we won! */
- trans->tdb = NULL;
-
/* fix domain entry for each changed domain */
- list_for_each_entry(d, &trans->changed_domains, list)
- domain_entry_fix(d->domid, d->nbentry);
-
- /* Fire off the watches for everything that changed. */
- list_for_each_entry(i, &trans->changes, list)
- fire_watches(conn, in, i->node, i->recurse);
- generation += trans->trans_gen;
+ transaction_fix_domains(trans, true);
}
send_ack(conn, XS_TRANSACTION_END);
d = talloc(trans, struct changed_domain);
if (!d) {
/* Let the transaction fail. */
- generation++;
+ trans->fail = true;
return;
}
d->domid = domid;
d = talloc(trans, struct changed_domain);
if (!d) {
/* Let the transaction fail. */
- generation++;
+ trans->fail = true;
return;
}
d->domid = domid;
conn->transaction_started = 0;
}
+int check_transactions(struct hashtable *hash)
+{
+ struct connection *conn;
+ struct transaction *trans;
+ struct accessed_node *i;
+ char *tname, *tnode;
+
+ list_for_each_entry(conn, &connections, list) {
+ list_for_each_entry(trans, &conn->transaction_list, list) {
+ tname = talloc_asprintf(trans, "%"PRIu64,
+ trans->generation);
+ if (!tname || !remember_string(hash, tname))
+ goto nomem;
+
+ list_for_each_entry(i, &trans->accessed, list) {
+ if (!i->ta_node)
+ continue;
+ tnode = transaction_get_node_name(tname, trans,
+ i->node);
+ if (!tnode || !remember_string(hash, tnode))
+ goto nomem;
+ talloc_free(tnode);
+ }
+
+ talloc_free(tname);
+ }
+ }
+
+ return 0;
+
+nomem:
+ talloc_free(tname);
+ return ENOMEM;
+}
+
/*
* Local variables:
* c-file-style: "linux"