ia64/xen-unstable

view tools/xenstore/xenstored_core.c @ 7238:971e7c7411b3

Raise an exception if an error appears on the pipes to our children, and make
sure that the child's pipes are closed even under that exception. Move the
handling of POLLHUP to the end of the loop, so that we guarantee to read any
remaining data from the child if POLLHUP and POLLIN appear at the same time.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@ewan
date Thu Oct 06 10:13:11 2005 +0100 (2005-10-06)
parents ef9591d03fdd
children 93e27f7ca8a8 61b3b357d827 76a7a7aa27e4
line source
1 /*
2 Simple prototype Xen Store Daemon providing simple tree-like database.
3 Copyright (C) 2005 Rusty Russell IBM Corporation
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <sys/socket.h>
23 #include <sys/select.h>
24 #include <sys/un.h>
25 #include <sys/time.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 #include <stdbool.h>
30 #include <stdio.h>
31 #include <stdarg.h>
32 #include <stdlib.h>
33 #include <syslog.h>
34 #include <string.h>
35 #include <errno.h>
36 #include <dirent.h>
37 #include <getopt.h>
38 #include <signal.h>
39 #include <assert.h>
40 #include <setjmp.h>
42 //#define DEBUG
43 #include "utils.h"
44 #include "list.h"
45 #include "talloc.h"
46 #include "xs_lib.h"
47 #include "xenstored.h"
48 #include "xenstored_core.h"
49 #include "xenstored_watch.h"
50 #include "xenstored_transaction.h"
51 #include "xenstored_domain.h"
52 #include "xenctrl.h"
53 #include "tdb.h"
/* When set, write_message() and consider_message() report each message via xprintf(). */
55 static bool verbose;
/* Head of the connection list: new_connection() appends to it, destroy_conn() unlinks from it. */
56 LIST_HEAD(connections);
/* Descriptor the trace helpers write to; while it is negative they return without writing. */
57 static int tracefd = -1;
/* Database context returned by tdb_context() when no transaction applies; swapped by replace_tdb(). */
58 static TDB_CONTEXT *tdb_ctx;
60 #ifdef TESTING
61 static bool failtest = false;
63 /* We override talloc's malloc. */
64 void *test_malloc(size_t size)
65 {
66 /* 1 in 20 means only about 50% of connections establish. */
67 if (failtest && (random() % 32) == 0)
68 return NULL;
69 return malloc(size);
70 }
72 static void stop_failtest(int signum __attribute__((unused)))
73 {
74 failtest = false;
75 }
77 /* Need these before we #define away write_all/mkdir in testing.h */
78 bool test_write_all(int fd, void *contents, unsigned int len);
79 bool test_write_all(int fd, void *contents, unsigned int len)
80 {
81 if (failtest && (random() % 8) == 0) {
82 if (len)
83 len = random() % len;
84 write(fd, contents, len);
85 errno = ENOSPC;
86 return false;
87 }
88 return xs_write_all(fd, contents, len);
89 }
91 int test_mkdir(const char *dir, int perms);
92 int test_mkdir(const char *dir, int perms)
93 {
94 if (failtest && (random() % 8) == 0) {
95 errno = ENOSPC;
96 return -1;
97 }
98 return mkdir(dir, perms);
99 }
100 #endif /* TESTING */
102 #include "xenstored_test.h"
104 /* FIXME: Ideally, this should never be called. Some can be eliminated. */
105 /* Something is horribly wrong: shutdown immediately. */
106 void __attribute__((noreturn)) corrupt(struct connection *conn,
107 const char *fmt, ...)
108 {
109 va_list arglist;
110 char *str;
111 int saved_errno = errno;
113 va_start(arglist, fmt);
114 str = talloc_vasprintf(NULL, fmt, arglist);
115 va_end(arglist);
117 trace("xenstored corruption: connection id %i: err %s: %s",
118 conn ? (int)conn->id : -1, strerror(saved_errno), str);
119 eprintf("xenstored corruption: connection id %i: err %s: %s",
120 conn ? (int)conn->id : -1, strerror(saved_errno), str);
121 #ifdef TESTING
122 /* Allow them to attach debugger. */
123 sleep(30);
124 #endif
125 syslog(LOG_DAEMON,
126 "xenstored corruption: connection id %i: err %s: %s",
127 conn ? (int)conn->id : -1, strerror(saved_errno), str);
128 _exit(2);
129 }
131 TDB_CONTEXT *tdb_context(struct connection *conn)
132 {
133 /* conn = NULL used in manual_node at setup. */
134 if (!conn || !conn->transaction)
135 return tdb_ctx;
136 return tdb_transaction_context(conn->transaction);
137 }
139 bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb)
140 {
141 if (rename(newname, xs_daemon_tdb()) != 0)
142 return false;
143 tdb_close(tdb_ctx);
144 tdb_ctx = talloc_steal(talloc_autofree_context(), newtdb);
145 return true;
146 }
148 static char *sockmsg_string(enum xsd_sockmsg_type type)
149 {
150 switch (type) {
151 case XS_DEBUG: return "DEBUG";
152 case XS_SHUTDOWN: return "SHUTDOWN";
153 case XS_DIRECTORY: return "DIRECTORY";
154 case XS_READ: return "READ";
155 case XS_GET_PERMS: return "GET_PERMS";
156 case XS_WATCH: return "WATCH";
157 case XS_WATCH_ACK: return "WATCH_ACK";
158 case XS_UNWATCH: return "UNWATCH";
159 case XS_TRANSACTION_START: return "TRANSACTION_START";
160 case XS_TRANSACTION_END: return "TRANSACTION_END";
161 case XS_INTRODUCE: return "INTRODUCE";
162 case XS_RELEASE: return "RELEASE";
163 case XS_GET_DOMAIN_PATH: return "GET_DOMAIN_PATH";
164 case XS_WRITE: return "WRITE";
165 case XS_MKDIR: return "MKDIR";
166 case XS_RM: return "RM";
167 case XS_SET_PERMS: return "SET_PERMS";
168 case XS_WATCH_EVENT: return "WATCH_EVENT";
169 case XS_ERROR: return "ERROR";
170 default:
171 return "**UNKNOWN**";
172 }
173 }
175 static void trace_io(const struct connection *conn,
176 const char *prefix,
177 const struct buffered_data *data)
178 {
179 char string[64];
180 unsigned int i;
182 if (tracefd < 0)
183 return;
185 write(tracefd, prefix, strlen(prefix));
186 sprintf(string, " %p ", conn);
187 write(tracefd, string, strlen(string));
188 write(tracefd, sockmsg_string(data->hdr.msg.type),
189 strlen(sockmsg_string(data->hdr.msg.type)));
190 write(tracefd, " (", 2);
191 for (i = 0; i < data->hdr.msg.len; i++) {
192 if (data->buffer[i] == '\0')
193 write(tracefd, " ", 1);
194 else
195 write(tracefd, data->buffer + i, 1);
196 }
197 write(tracefd, ")\n", 2);
198 }
200 void trace_create(const void *data, const char *type)
201 {
202 char string[64];
203 if (tracefd < 0)
204 return;
206 write(tracefd, "CREATE ", strlen("CREATE "));
207 write(tracefd, type, strlen(type));
208 sprintf(string, " %p\n", data);
209 write(tracefd, string, strlen(string));
210 }
212 void trace_destroy(const void *data, const char *type)
213 {
214 char string[64];
215 if (tracefd < 0)
216 return;
218 write(tracefd, "DESTROY ", strlen("DESTROY "));
219 write(tracefd, type, strlen(type));
220 sprintf(string, " %p\n", data);
221 write(tracefd, string, strlen(string));
222 }
224 void trace(const char *fmt, ...)
225 {
226 va_list arglist;
227 char *str;
229 if (tracefd < 0)
230 return;
232 va_start(arglist, fmt);
233 str = talloc_vasprintf(NULL, fmt, arglist);
234 va_end(arglist);
235 write(tracefd, str, strlen(str));
236 talloc_free(str);
237 }
239 static bool write_message(struct connection *conn)
240 {
241 int ret;
242 struct buffered_data *out = conn->out;
244 if (out->inhdr) {
245 if (verbose)
246 xprintf("Writing msg %s (%s) out to %p\n",
247 sockmsg_string(out->hdr.msg.type),
248 out->buffer, conn);
249 ret = conn->write(conn, out->hdr.raw + out->used,
250 sizeof(out->hdr) - out->used);
251 if (ret < 0)
252 return false;
254 out->used += ret;
255 if (out->used < sizeof(out->hdr))
256 return true;
258 out->inhdr = false;
259 out->used = 0;
261 /* Second write might block if non-zero. */
262 if (out->hdr.msg.len && !conn->domain)
263 return true;
264 }
266 ret = conn->write(conn, out->buffer + out->used,
267 out->hdr.msg.len - out->used);
269 if (ret < 0)
270 return false;
272 out->used += ret;
273 if (out->used != out->hdr.msg.len)
274 return true;
276 trace_io(conn, "OUT", out);
277 conn->out = NULL;
278 talloc_free(out);
280 queue_next_event(conn);
282 /* No longer busy? */
283 if (!conn->out)
284 conn->state = OK;
285 return true;
286 }
288 static int destroy_conn(void *_conn)
289 {
290 struct connection *conn = _conn;
292 /* Flush outgoing if possible, but don't block. */
293 if (!conn->domain) {
294 fd_set set;
295 struct timeval none;
297 FD_ZERO(&set);
298 FD_SET(conn->fd, &set);
299 none.tv_sec = none.tv_usec = 0;
301 while (conn->out
302 && select(conn->fd+1, NULL, &set, NULL, &none) == 1)
303 if (!write_message(conn))
304 break;
305 close(conn->fd);
306 }
307 list_del(&conn->list);
308 trace_destroy(conn, "connection");
309 return 0;
310 }
312 static int initialize_set(fd_set *inset, fd_set *outset, int sock, int ro_sock,
313 int event_fd)
314 {
315 struct connection *i;
316 int max;
318 FD_ZERO(inset);
319 FD_ZERO(outset);
320 FD_SET(sock, inset);
321 max = sock;
322 FD_SET(ro_sock, inset);
323 if (ro_sock > max)
324 max = ro_sock;
325 FD_SET(event_fd, inset);
326 if (event_fd > max)
327 max = event_fd;
328 list_for_each_entry(i, &connections, list) {
329 if (i->domain)
330 continue;
331 if (i->state == OK)
332 FD_SET(i->fd, inset);
333 if (i->out)
334 FD_SET(i->fd, outset);
335 if (i->fd > max)
336 max = i->fd;
337 }
338 return max;
339 }
/*
 * talloc destructor used by talloc_open(): closes the descriptor stored
 * in the int being freed.  Always returns 0.
 */
static int destroy_fd(void *_fd)
{
	const int *stored = _fd;

	close(*stored);
	return 0;
}
348 /* Return a pointer to an fd, self-closing and attached to this pathname. */
349 int *talloc_open(const char *pathname, int flags, int mode)
350 {
351 int *fd;
353 fd = talloc(pathname, int);
354 *fd = open(pathname, flags, mode);
355 if (*fd < 0) {
356 int saved_errno = errno;
357 talloc_free(fd);
358 errno = saved_errno;
359 return NULL;
360 }
361 talloc_set_destructor(fd, destroy_fd);
362 return fd;
363 }
/*
 * Is child a subnode of parent, or equal to it?
 *
 * The root "/" is treated as the parent of everything.  For any other
 * parent, child must begin with parent and continue with either '/' or
 * the end of the string, so "/ab" is not a child of "/a".
 */
bool is_child(const char *child, const char *parent)
{
	size_t plen;

	/* / should really be "" for this algorithm to work, but that's a
	 * usability nightmare. */
	if (strcmp(parent, "/") == 0)
		return true;

	plen = strlen(parent);
	if (strncmp(child, parent, plen) != 0)
		return false;

	return child[plen] == '\0' || child[plen] == '/';
}
381 /* If it fails, returns NULL and sets errno. */
382 static struct node *read_node(struct connection *conn, const char *name)
383 {
384 TDB_DATA key, data;
385 u32 *p;
386 struct node *node;
388 key.dptr = (void *)name;
389 key.dsize = strlen(name);
390 data = tdb_fetch(tdb_context(conn), key);
392 if (data.dptr == NULL) {
393 if (tdb_error(tdb_context(conn)) == TDB_ERR_NOEXIST)
394 errno = ENOENT;
395 else
396 errno = EIO;
397 return NULL;
398 }
400 node = talloc(name, struct node);
401 node->name = talloc_strdup(node, name);
402 node->parent = NULL;
403 node->tdb = tdb_context(conn);
404 talloc_steal(node, data.dptr);
406 /* Datalen, childlen, number of permissions */
407 p = (u32 *)data.dptr;
408 node->num_perms = p[0];
409 node->datalen = p[1];
410 node->childlen = p[2];
412 /* Permissions are struct xs_permissions. */
413 node->perms = (void *)&p[3];
414 /* Data is binary blob (usually ascii, no nul). */
415 node->data = node->perms + node->num_perms;
416 /* Children is strings, nul separated. */
417 node->children = node->data + node->datalen;
419 return node;
420 }
422 static bool write_node(struct connection *conn, const struct node *node)
423 {
424 TDB_DATA key, data;
425 void *p;
427 key.dptr = (void *)node->name;
428 key.dsize = strlen(node->name);
430 data.dsize = 3*sizeof(u32)
431 + node->num_perms*sizeof(node->perms[0])
432 + node->datalen + node->childlen;
433 data.dptr = talloc_size(node, data.dsize);
434 ((u32 *)data.dptr)[0] = node->num_perms;
435 ((u32 *)data.dptr)[1] = node->datalen;
436 ((u32 *)data.dptr)[2] = node->childlen;
437 p = data.dptr + 3 * sizeof(u32);
439 memcpy(p, node->perms, node->num_perms*sizeof(node->perms[0]));
440 p += node->num_perms*sizeof(node->perms[0]);
441 memcpy(p, node->data, node->datalen);
442 p += node->datalen;
443 memcpy(p, node->children, node->childlen);
445 /* TDB should set errno, but doesn't even set ecode AFAICT. */
446 if (tdb_store(tdb_context(conn), key, data, TDB_REPLACE) != 0) {
447 errno = ENOSPC;
448 return false;
449 }
450 return true;
451 }
453 static enum xs_perm_type perm_for_conn(struct connection *conn,
454 struct xs_permissions *perms,
455 unsigned int num)
456 {
457 unsigned int i;
458 enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
460 if (!conn->can_write)
461 mask &= ~XS_PERM_WRITE;
463 /* Owners and tools get it all... */
464 if (!conn->id || perms[0].id == conn->id)
465 return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
467 for (i = 1; i < num; i++)
468 if (perms[i].id == conn->id)
469 return perms[i].perms & mask;
471 return perms[0].perms & mask;
472 }
/*
 * Return the parent path of node as a new string allocated as a talloc
 * child of node.  The search for the last '/' skips the first character,
 * so "/" and any top-level name such as "/a" both yield "/".
 */
static char *get_parent(const char *node)
{
	const char *last = strrchr(node + 1, '/');

	if (last == NULL)
		return talloc_strdup(node, "/");

	return talloc_asprintf(node, "%.*s", (int)(last - node), node);
}
482 /* What do parents say? */
483 static enum xs_perm_type ask_parents(struct connection *conn, const char *name)
484 {
485 struct node *node;
487 do {
488 name = get_parent(name);
489 node = read_node(conn, name);
490 if (node)
491 break;
492 } while (!streq(name, "/"));
494 /* No permission at root? We're in trouble. */
495 if (!node)
496 corrupt(conn, "No permissions file at root");
498 return perm_for_conn(conn, node->perms, node->num_perms);
499 }
501 /* We have a weird permissions system. You can allow someone into a
502 * specific node without allowing it in the parents. If it's going to
503 * fail, however, we don't want the errno to indicate any information
504 * about the node. */
505 static int errno_from_parents(struct connection *conn, const char *node,
506 int errnum, enum xs_perm_type perm)
507 {
508 /* We always tell them about memory failures. */
509 if (errnum == ENOMEM)
510 return errnum;
512 if (ask_parents(conn, node) & perm)
513 return errnum;
514 return EACCES;
515 }
517 /* If it fails, returns NULL and sets errno. */
518 struct node *get_node(struct connection *conn,
519 const char *name,
520 enum xs_perm_type perm)
521 {
522 struct node *node;
524 if (!name || !is_valid_nodename(name)) {
525 errno = EINVAL;
526 return NULL;
527 }
528 node = read_node(conn, name);
529 /* If we don't have permission, we don't have node. */
530 if (node) {
531 if ((perm_for_conn(conn, node->perms, node->num_perms) & perm)
532 != perm)
533 node = NULL;
534 }
535 /* Clean up errno if they weren't supposed to know. */
536 if (!node)
537 errno = errno_from_parents(conn, name, errno, perm);
538 return node;
539 }
541 static struct buffered_data *new_buffer(void *ctx)
542 {
543 struct buffered_data *data;
545 data = talloc(ctx, struct buffered_data);
546 data->inhdr = true;
547 data->used = 0;
548 data->buffer = NULL;
550 return data;
551 }
553 /* Return length of string (including nul) at this offset. */
554 static unsigned int get_string(const struct buffered_data *data,
555 unsigned int offset)
556 {
557 const char *nul;
559 if (offset >= data->used)
560 return 0;
562 nul = memchr(data->buffer + offset, 0, data->used - offset);
563 if (!nul)
564 return 0;
566 return nul - (data->buffer + offset) + 1;
567 }
569 /* Break input into vectors, return the number, fill in up to num of them. */
570 unsigned int get_strings(struct buffered_data *data,
571 char *vec[], unsigned int num)
572 {
573 unsigned int off, i, len;
575 off = i = 0;
576 while ((len = get_string(data, off)) != 0) {
577 if (i < num)
578 vec[i] = data->buffer + off;
579 i++;
580 off += len;
581 }
582 return i;
583 }
585 void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
586 const void *data, unsigned int len)
587 {
588 struct buffered_data *bdata;
590 /* When data gets freed, we want list entry is destroyed (so
591 * list entry is a child). */
592 bdata = new_buffer(conn);
593 bdata->buffer = talloc_array(bdata, char, len);
595 bdata->hdr.msg.type = type;
596 bdata->hdr.msg.len = len;
597 memcpy(bdata->buffer, data, len);
599 /* There might be an event going out now. Queue behind it. */
600 if (conn->out) {
601 assert(conn->out->hdr.msg.type == XS_WATCH_EVENT);
602 assert(!conn->waiting_reply);
603 conn->waiting_reply = bdata;
604 } else
605 conn->out = bdata;
606 conn->state = BUSY;
607 }
609 /* Some routines (write, mkdir, etc) just need a non-error return */
610 void send_ack(struct connection *conn, enum xsd_sockmsg_type type)
611 {
612 send_reply(conn, type, "OK", sizeof("OK"));
613 }
615 void send_error(struct connection *conn, int error)
616 {
617 unsigned int i;
619 for (i = 0; error != xsd_errors[i].errnum; i++) {
620 if (i == ARRAY_SIZE(xsd_errors) - 1) {
621 eprintf("xenstored: error %i untranslatable", error);
622 i = 0; /* EINVAL */
623 break;
624 }
625 }
626 send_reply(conn, XS_ERROR, xsd_errors[i].errstring,
627 strlen(xsd_errors[i].errstring) + 1);
628 }
/*
 * True when node consists only of ASCII letters, digits and the
 * characters '-', '/', '_' and '@'.  The empty string qualifies.
 */
static bool valid_chars(const char *node)
{
	static const char allowed[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789-/_@";

	/* The first character outside the allowed set must be the nul. */
	return node[strspn(node, allowed)] == '\0';
}
/*
 * A node name is valid when it starts with '/', does not end in '/'
 * unless it is exactly "/", contains no "//", and uses only the
 * characters accepted by valid_chars().
 */
bool is_valid_nodename(const char *node)
{
	return strstarts(node, "/")
		&& (streq(node, "/") || !strends(node, "/"))
		&& strstr(node, "//") == NULL
		&& valid_chars(node);
}
656 /* We expect one arg in the input: return NULL otherwise. */
657 static const char *onearg(struct buffered_data *in)
658 {
659 if (!in->used || get_string(in, 0) != in->used)
660 return NULL;
661 return in->buffer;
662 }
664 static char *perms_to_strings(const void *ctx,
665 struct xs_permissions *perms, unsigned int num,
666 unsigned int *len)
667 {
668 unsigned int i;
669 char *strings = NULL;
670 char buffer[MAX_STRLEN(domid_t) + 1];
672 for (*len = 0, i = 0; i < num; i++) {
673 if (!xs_perm_to_string(&perms[i], buffer))
674 return NULL;
676 strings = talloc_realloc(ctx, strings, char,
677 *len + strlen(buffer) + 1);
678 strcpy(strings + *len, buffer);
679 *len += strlen(buffer) + 1;
680 }
681 return strings;
682 }
/*
 * Turn a relative node name into an absolute one.
 *
 * NULL and names that already start with '/' are returned as they are.
 * Otherwise, when the connection has an implicit path, a new string
 * "<implicit path>/<node>" allocated as a talloc child of node is
 * returned; without one, node itself is returned.
 */
char *canonicalize(struct connection *conn, const char *node)
{
	const char *prefix;

	if (node == NULL || strstarts(node, "/"))
		return (char *)node;

	prefix = get_implicit_path(conn);
	return prefix ? talloc_asprintf(node, "%s/%s", prefix, node)
		      : (char *)node;
}
/*
 * An event node name must be non-NULL and begin with '@'.  Returns true
 * for such a name; otherwise sets errno to EINVAL and returns false.
 */
bool check_event_node(const char *node)
{
	if (node && strstarts(node, "@"))
		return true;

	errno = EINVAL;
	return false;
}
705 static void send_directory(struct connection *conn, const char *name)
706 {
707 struct node *node;
709 name = canonicalize(conn, name);
710 node = get_node(conn, name, XS_PERM_READ);
711 if (!node) {
712 send_error(conn, errno);
713 return;
714 }
716 send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
717 }
719 static void do_read(struct connection *conn, const char *name)
720 {
721 struct node *node;
723 name = canonicalize(conn, name);
724 node = get_node(conn, name, XS_PERM_READ);
725 if (!node) {
726 send_error(conn, errno);
727 return;
728 }
730 send_reply(conn, XS_READ, node->data, node->datalen);
731 }
733 static void delete_node_single(struct connection *conn, struct node *node)
734 {
735 TDB_DATA key;
737 key.dptr = (void *)node->name;
738 key.dsize = strlen(node->name);
740 if (tdb_delete(tdb_context(conn), key) != 0)
741 corrupt(conn, "Could not delete '%s'", node->name);
742 }
/*
 * Return the part of name after its last '/'.  The result points into
 * name itself.  Must not be "/"; name has to contain a '/'.
 */
static char *basename(const char *name)
{
	char *last_slash = strrchr(name, '/');

	return last_slash + 1;
}
750 static struct node *construct_node(struct connection *conn, const char *name)
751 {
752 const char *base;
753 unsigned int baselen;
754 struct node *parent, *node;
755 char *children, *parentname = get_parent(name);
757 /* If parent doesn't exist, create it. */
758 parent = read_node(conn, parentname);
759 if (!parent)
760 parent = construct_node(conn, parentname);
761 if (!parent)
762 return NULL;
764 /* Add child to parent. */
765 base = basename(name);
766 baselen = strlen(base) + 1;
767 children = talloc_array(name, char, parent->childlen + baselen);
768 memcpy(children, parent->children, parent->childlen);
769 memcpy(children + parent->childlen, base, baselen);
770 parent->children = children;
771 parent->childlen += baselen;
773 /* Allocate node */
774 node = talloc(name, struct node);
775 node->tdb = tdb_context(conn);
776 node->name = talloc_strdup(node, name);
778 /* Inherit permissions, except domains own what they create */
779 node->num_perms = parent->num_perms;
780 node->perms = talloc_memdup(node, parent->perms,
781 node->num_perms * sizeof(node->perms[0]));
782 if (conn->id)
783 node->perms[0].id = conn->id;
785 /* No children, no data */
786 node->children = node->data = NULL;
787 node->childlen = node->datalen = 0;
788 node->parent = parent;
789 return node;
790 }
792 static int destroy_node(void *_node)
793 {
794 struct node *node = _node;
795 TDB_DATA key;
797 if (streq(node->name, "/"))
798 corrupt(NULL, "Destroying root node!");
800 key.dptr = (void *)node->name;
801 key.dsize = strlen(node->name);
803 tdb_delete(node->tdb, key);
804 return 0;
805 }
807 /* Be careful: create heirarchy, put entry in existing parent *last*.
808 * This helps fsck if we die during this. */
809 static struct node *create_node(struct connection *conn,
810 const char *name,
811 void *data, unsigned int datalen)
812 {
813 struct node *node, *i;
815 node = construct_node(conn, name);
816 if (!node)
817 return NULL;
819 node->data = data;
820 node->datalen = datalen;
822 /* We write out the nodes down, setting destructor in case
823 * something goes wrong. */
824 for (i = node; i; i = i->parent) {
825 if (!write_node(conn, i))
826 return NULL;
827 talloc_set_destructor(i, destroy_node);
828 }
830 /* OK, now remove destructors so they stay around */
831 for (i = node; i; i = i->parent)
832 talloc_set_destructor(i, NULL);
833 return node;
834 }
836 /* path, data... */
837 static void do_write(struct connection *conn, struct buffered_data *in)
838 {
839 unsigned int offset, datalen;
840 struct node *node;
841 char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
842 char *name;
844 /* Extra "strings" can be created by binary data. */
845 if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) {
846 send_error(conn, EINVAL);
847 return;
848 }
850 offset = strlen(vec[0]) + 1;
851 datalen = in->used - offset;
853 name = canonicalize(conn, vec[0]);
854 node = get_node(conn, name, XS_PERM_WRITE);
855 if (!node) {
856 /* No permissions, invalid input? */
857 if (errno != ENOENT) {
858 send_error(conn, errno);
859 return;
860 }
861 node = create_node(conn, name, in->buffer + offset, datalen);
862 if (!node) {
863 send_error(conn, errno);
864 return;
865 }
866 } else {
867 node->data = in->buffer + offset;
868 node->datalen = datalen;
869 if (!write_node(conn, node)){
870 send_error(conn, errno);
871 return;
872 }
873 }
875 add_change_node(conn->transaction, name, false);
876 fire_watches(conn, name, false);
877 send_ack(conn, XS_WRITE);
878 }
880 static void do_mkdir(struct connection *conn, const char *name)
881 {
882 struct node *node;
884 name = canonicalize(conn, name);
885 node = get_node(conn, name, XS_PERM_WRITE);
887 /* If it already exists, fine. */
888 if (!node) {
889 /* No permissions? */
890 if (errno != ENOENT) {
891 send_error(conn, errno);
892 return;
893 }
894 node = create_node(conn, name, NULL, 0);
895 if (!node) {
896 send_error(conn, errno);
897 return;
898 }
899 add_change_node(conn->transaction, name, false);
900 fire_watches(conn, name, false);
901 }
902 send_ack(conn, XS_MKDIR);
903 }
905 static void delete_node(struct connection *conn, struct node *node)
906 {
907 unsigned int i;
909 /* Delete self, then delete children. If something goes wrong,
910 * consistency check will clean up this way. */
911 delete_node_single(conn, node);
913 /* Delete children, too. */
914 for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
915 struct node *child;
917 child = read_node(conn,
918 talloc_asprintf(node, "%s/%s", node->name,
919 node->children + i));
920 if (!child)
921 corrupt(conn, "No child '%s' found", child);
922 delete_node(conn, child);
923 }
924 }
/*
 * Delete memory using memmove: remove len bytes starting at offset off
 * from a region of total bytes by sliding the tail down over them.  The
 * bytes left over at the end of the region are not cleared.
 *
 * Arithmetic is done on byte pointers; arithmetic on void * is only a
 * compiler extension.
 */
static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
{
	unsigned char *base = mem;

	memmove(base + off, base + off + len, total - off - len);
}
932 static bool delete_child(struct connection *conn,
933 struct node *node, const char *childname)
934 {
935 unsigned int i;
937 for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
938 if (streq(node->children+i, childname)) {
939 memdel(node->children, i, strlen(childname) + 1,
940 node->childlen);
941 node->childlen -= strlen(childname) + 1;
942 return write_node(conn, node);
943 }
944 }
945 corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
946 }
948 static void do_rm(struct connection *conn, const char *name)
949 {
950 struct node *node, *parent;
952 name = canonicalize(conn, name);
953 node = get_node(conn, name, XS_PERM_WRITE);
954 if (!node) {
955 /* Didn't exist already? Fine, if parent exists. */
956 if (errno == ENOENT) {
957 node = read_node(conn, get_parent(name));
958 if (node) {
959 send_ack(conn, XS_RM);
960 return;
961 }
962 /* Restore errno, just in case. */
963 errno = ENOENT;
964 }
965 send_error(conn, errno);
966 return;
967 }
969 if (streq(name, "/")) {
970 send_error(conn, EINVAL);
971 return;
972 }
974 /* Delete from parent first, then if something explodes fsck cleans. */
975 parent = read_node(conn, get_parent(name));
976 if (!parent) {
977 send_error(conn, EINVAL);
978 return;
979 }
981 if (!delete_child(conn, parent, basename(name))) {
982 send_error(conn, EINVAL);
983 return;
984 }
986 delete_node(conn, node);
987 add_change_node(conn->transaction, name, true);
988 fire_watches(conn, name, true);
989 send_ack(conn, XS_RM);
990 }
992 static void do_get_perms(struct connection *conn, const char *name)
993 {
994 struct node *node;
995 char *strings;
996 unsigned int len;
998 name = canonicalize(conn, name);
999 node = get_node(conn, name, XS_PERM_READ);
1000 if (!node) {
1001 send_error(conn, errno);
1002 return;
1005 strings = perms_to_strings(node, node->perms, node->num_perms, &len);
1006 if (!strings)
1007 send_error(conn, errno);
1008 else
1009 send_reply(conn, XS_GET_PERMS, strings, len);
1012 static void do_set_perms(struct connection *conn, struct buffered_data *in)
1014 unsigned int num;
1015 char *name, *permstr;
1016 struct node *node;
1018 num = xs_count_strings(in->buffer, in->used);
1019 if (num < 2) {
1020 send_error(conn, EINVAL);
1021 return;
1024 /* First arg is node name. */
1025 name = canonicalize(conn, in->buffer);
1026 permstr = in->buffer + strlen(in->buffer) + 1;
1027 num--;
1029 /* We must own node to do this (tools can do this too). */
1030 node = get_node(conn, name, XS_PERM_WRITE|XS_PERM_OWNER);
1031 if (!node) {
1032 send_error(conn, errno);
1033 return;
1036 node->perms = talloc_array(node, struct xs_permissions, num);
1037 node->num_perms = num;
1038 if (!xs_strings_to_perms(node->perms, num, permstr)) {
1039 send_error(conn, errno);
1040 return;
1042 if (!write_node(conn, node)) {
1043 send_error(conn, errno);
1044 return;
1047 add_change_node(conn->transaction, name, false);
1048 fire_watches(conn, name, false);
1049 send_ack(conn, XS_SET_PERMS);
1052 /* Process "in" for conn: "in" will vanish after this conversation, so
1053 * we can talloc off it for temporary variables. May free "conn".
1054 */
1055 static void process_message(struct connection *conn, struct buffered_data *in)
1057 switch (in->hdr.msg.type) {
1058 case XS_DIRECTORY:
1059 send_directory(conn, onearg(in));
1060 break;
1062 case XS_READ:
1063 do_read(conn, onearg(in));
1064 break;
1066 case XS_WRITE:
1067 do_write(conn, in);
1068 break;
1070 case XS_MKDIR:
1071 do_mkdir(conn, onearg(in));
1072 break;
1074 case XS_RM:
1075 do_rm(conn, onearg(in));
1076 break;
1078 case XS_GET_PERMS:
1079 do_get_perms(conn, onearg(in));
1080 break;
1082 case XS_SET_PERMS:
1083 do_set_perms(conn, in);
1084 break;
1086 case XS_SHUTDOWN:
1087 /* FIXME: Implement gentle shutdown too. */
1088 /* Only tools can do this. */
1089 if (conn->id != 0 || !conn->can_write) {
1090 send_error(conn, EACCES);
1091 break;
1093 send_ack(conn, XS_SHUTDOWN);
1094 /* Everything hangs off auto-free context, freed at exit. */
1095 exit(0);
1097 case XS_DEBUG:
1098 if (streq(in->buffer, "print"))
1099 xprintf("debug: %s", in->buffer + get_string(in, 0));
1100 #ifdef TESTING
1101 /* For testing, we allow them to set id. */
1102 if (streq(in->buffer, "setid")) {
1103 conn->id = atoi(in->buffer + get_string(in, 0));
1104 send_ack(conn, XS_DEBUG);
1105 } else if (streq(in->buffer, "failtest")) {
1106 if (get_string(in, 0) < in->used)
1107 srandom(atoi(in->buffer + get_string(in, 0)));
1108 send_ack(conn, XS_DEBUG);
1109 failtest = true;
1111 #endif /* TESTING */
1112 break;
1114 case XS_WATCH:
1115 do_watch(conn, in);
1116 break;
1118 case XS_WATCH_ACK:
1119 do_watch_ack(conn, onearg(in));
1120 break;
1122 case XS_UNWATCH:
1123 do_unwatch(conn, in);
1124 break;
1126 case XS_TRANSACTION_START:
1127 do_transaction_start(conn, in);
1128 break;
1130 case XS_TRANSACTION_END:
1131 do_transaction_end(conn, onearg(in));
1132 break;
1134 case XS_INTRODUCE:
1135 do_introduce(conn, in);
1136 break;
1138 case XS_RELEASE:
1139 do_release(conn, onearg(in));
1140 break;
1142 case XS_GET_DOMAIN_PATH:
1143 do_get_domain_path(conn, onearg(in));
1144 break;
1146 case XS_WATCH_EVENT:
1147 default:
1148 eprintf("Client unknown operation %i", in->hdr.msg.type);
1149 send_error(conn, ENOSYS);
/*
 * talloc failure hook.  data points at the jmp_buf registered together
 * with this handler; control jumps back to the matching setjmp(), which
 * then returns 1.  This function never returns normally.
 */
static int out_of_mem(void *data)
{
	jmp_buf *resume = data;

	longjmp(*resume, 1);
}
1158 static void consider_message(struct connection *conn)
1160 /*
1161 * 'volatile' qualifier prevents register allocation which fixes:
1162 * warning: variable 'xxx' might be clobbered by 'longjmp' or 'vfork'
1163 */
1164 struct buffered_data *volatile in = NULL;
1165 enum xsd_sockmsg_type volatile type = conn->in->hdr.msg.type;
1166 jmp_buf talloc_fail;
1168 assert(conn->state == OK);
1170 /* For simplicity, we kill the connection on OOM. */
1171 talloc_set_fail_handler(out_of_mem, &talloc_fail);
1172 if (setjmp(talloc_fail)) {
1173 /* Free in before conn, in case it needs something. */
1174 talloc_free(in);
1175 talloc_free(conn);
1176 goto end;
1179 if (verbose)
1180 xprintf("Got message %s len %i from %p\n",
1181 sockmsg_string(type), conn->in->hdr.msg.len, conn);
1183 /* We might get a command while waiting for an ack: this means
1184 * the other end discarded it: we will re-transmit. */
1185 if (type != XS_WATCH_ACK)
1186 conn->waiting_for_ack = NULL;
1188 /* Careful: process_message may free connection. We detach
1189 * "in" beforehand and allocate the new buffer to avoid
1190 * touching conn after process_message.
1191 */
1192 in = talloc_steal(talloc_autofree_context(), conn->in);
1193 conn->in = new_buffer(conn);
1194 process_message(conn, in);
1196 talloc_free(in);
1197 end:
1198 talloc_set_fail_handler(NULL, NULL);
1199 if (talloc_total_blocks(NULL)
1200 != talloc_total_blocks(talloc_autofree_context()) + 1) {
1201 talloc_report_full(NULL, stderr);
1202 abort();
1206 /* Errors in reading or allocating here mean we get out of sync, so we
1207 * drop the whole client connection. */
1208 static void handle_input(struct connection *conn)
1210 int bytes;
1211 struct buffered_data *in;
1213 assert(conn->state == OK);
1214 in = conn->in;
1216 /* Not finished header yet? */
1217 if (in->inhdr) {
1218 bytes = conn->read(conn, in->hdr.raw + in->used,
1219 sizeof(in->hdr) - in->used);
1220 if (bytes <= 0)
1221 goto bad_client;
1222 in->used += bytes;
1223 if (in->used != sizeof(in->hdr))
1224 return;
1226 if (in->hdr.msg.len > PATH_MAX) {
1227 #ifndef TESTING
1228 syslog(LOG_DAEMON, "Client tried to feed us %i",
1229 in->hdr.msg.len);
1230 #endif
1231 goto bad_client;
1234 in->buffer = talloc_array(in, char, in->hdr.msg.len);
1235 if (!in->buffer)
1236 goto bad_client;
1237 in->used = 0;
1238 in->inhdr = false;
1239 return;
1242 bytes = conn->read(conn, in->buffer + in->used,
1243 in->hdr.msg.len - in->used);
1244 if (bytes < 0)
1245 goto bad_client;
1247 in->used += bytes;
1248 if (in->used != in->hdr.msg.len)
1249 return;
1251 trace_io(conn, "IN ", in);
1252 consider_message(conn);
1253 return;
1255 bad_client:
1256 /* Kill it. */
1257 talloc_free(conn);
/*
 * Push pending output for 'conn' via write_message(); if that reports
 * failure the connection is freed.
 */
static void handle_output(struct connection *conn)
{
	if (write_message(conn))
		return;

	talloc_free(conn);
}
1266 struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
1268 /*
1269 * 'volatile' qualifier prevents register allocation which fixes:
1270 * warning: variable 'xxx' might be clobbered by 'longjmp' or 'vfork'
1271 */
1272 struct connection *volatile new;
1273 jmp_buf talloc_fail;
1275 new = talloc(talloc_autofree_context(), struct connection);
1276 if (!new)
1277 return NULL;
1279 new->state = OK;
1280 new->out = new->waiting_reply = NULL;
1281 new->waiting_for_ack = NULL;
1282 new->fd = -1;
1283 new->id = 0;
1284 new->domain = NULL;
1285 new->transaction = NULL;
1286 new->write = write;
1287 new->read = read;
1288 new->can_write = true;
1289 INIT_LIST_HEAD(&new->watches);
1291 talloc_set_fail_handler(out_of_mem, &talloc_fail);
1292 if (setjmp(talloc_fail)) {
1293 talloc_free(new);
1294 return NULL;
1296 new->in = new_buffer(new);
1297 talloc_set_fail_handler(NULL, NULL);
1299 list_add_tail(&new->list, &connections);
1300 talloc_set_destructor(new, destroy_conn);
1301 trace_create(new, "connection");
1302 return new;
1305 static int writefd(struct connection *conn, const void *data, unsigned int len)
1307 return write(conn->fd, data, len);
1310 static int readfd(struct connection *conn, void *data, unsigned int len)
1312 return read(conn->fd, data, len);
1315 static void accept_connection(int sock, bool canwrite)
1317 int fd;
1318 struct connection *conn;
1320 fd = accept(sock, NULL, NULL);
1321 if (fd < 0)
1322 return;
1324 conn = new_connection(writefd, readfd);
1325 if (conn) {
1326 conn->fd = fd;
1327 conn->can_write = canwrite;
1328 } else
1329 close(fd);
#ifdef TESTING
/* Valgrind can check our writes better if we don't use mmap */
#define TDB_FLAGS TDB_NOMMAP

/*
 * Debugger aid: print a summary of every entry on the global 'connections'
 * list to stdout (state, id, input progress, queued output), followed by
 * that connection's watches.
 */
void dump_connection(void)
{
	struct connection *conn;

	list_for_each_entry(conn, &connections, list) {
		const char *state;

		if (conn->state == OK)
			state = "OK";
		else if (conn->state == BUSY)
			state = "BUSY";
		else
			state = "INVALID";

		printf("Connection %p:\n", conn);
		printf(" state = %s\n", state);

		if (conn->id)
			printf(" id = %i\n", conn->id);

		/* Only mention input once something has been received. */
		if (!conn->in->inhdr || conn->in->used)
			printf(" got %i bytes of %s\n",
			       conn->in->used,
			       conn->in->inhdr ? "header" : "data");

		if (conn->out)
			printf(" sending message %s (%s) out\n",
			       sockmsg_string(conn->out->hdr.msg.type),
			       conn->out->buffer);

		if (conn->waiting_reply)
			printf(" ... and behind is queued %s (%s)\n",
			       sockmsg_string(conn->waiting_reply->hdr.msg.type),
			       conn->waiting_reply->buffer);
#if 0
		if (conn->transaction)
			dump_transaction(conn);
		if (conn->domain)
			dump_domain(conn);
#endif
		dump_watches(conn);
	}
}
#else
#define TDB_FLAGS 0
#endif
1372 /* We create initial nodes manually. */
1373 static void manual_node(const char *name, const char *child)
1375 struct node *node;
1376 struct xs_permissions perms = { .id = 0, .perms = XS_PERM_READ };
1378 node = talloc(NULL, struct node);
1379 node->name = name;
1380 node->perms = &perms;
1381 node->num_perms = 1;
1382 node->data = NULL;
1383 node->datalen = 0;
1384 node->children = (char *)child;
1385 if (child)
1386 node->childlen = strlen(child) + 1;
1387 else
1388 node->childlen = 0;
1390 if (!write_node(NULL, node))
1391 barf_perror("Could not create initial node %s", name);
1392 talloc_free(node);
1397 static void setup_structure(void)
1399 char *tdbname;
1400 tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
1401 tdb_ctx = tdb_open(tdbname, 0, TDB_FLAGS, O_RDWR, 0);
1403 if (!tdb_ctx) {
1404 tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
1405 0640);
1406 if (!tdb_ctx)
1407 barf_perror("Could not create tdb file %s", tdbname);
1409 manual_node("/", "tool");
1410 manual_node("/tool", "xenstored");
1411 manual_node("/tool/xenstored", NULL);
1414 /* FIXME: Fsck */
1417 static void write_pidfile(const char *pidfile)
1419 char buf[100];
1420 int len;
1421 int fd;
1423 fd = open(pidfile, O_RDWR | O_CREAT, 0600);
1424 if (fd == -1)
1425 barf_perror("Opening pid file %s", pidfile);
1427 /* We exit silently if daemon already running. */
1428 if (lockf(fd, F_TLOCK, 0) == -1)
1429 exit(0);
1431 len = sprintf(buf, "%d\n", getpid());
1432 write(fd, buf, len);
/*
 * Detach from the invoking process (after Stevens): fork and let the parent
 * exit, start a new session, move to "/" (skipped in TESTING builds, which
 * rely on relative socket paths) and clear the umask.
 */
static void daemonize(void)
{
	/* Separate from our parent via fork, so init inherits us. */
	pid_t pid = fork();

	if (pid < 0)
		barf_perror("Failed to fork daemon");
	if (pid != 0)
		exit(0);

	/* Session leader so ^C doesn't whack us. */
	setsid();

#ifndef TESTING	/* Relative paths for socket names */
	/* Move off any mount points we might be in. */
	chdir("/");
#endif

	/* Discard our parent's old-fashioned umask prejudices. */
	umask(0);
}
/*
 * Long command-line options; each maps onto the short option letter used
 * in main()'s getopt_long() switch.
 */
static struct option options[] = {
	{ "pid-file",   required_argument, NULL, 'F' },
	{ "no-fork",    no_argument,       NULL, 'N' },
	{ "output-pid", no_argument,       NULL, 'P' },
	{ "trace-file", required_argument, NULL, 'T' },
	{ "verbose",    no_argument,       NULL, 'V' },
	{ NULL,         0,                 NULL, 0 }
};
1465 int main(int argc, char *argv[])
1467 int opt, *sock, *ro_sock, event_fd, max;
1468 struct sockaddr_un addr;
1469 fd_set inset, outset;
1470 bool dofork = true;
1471 bool outputpid = false;
1472 const char *pidfile = NULL;
1474 while ((opt = getopt_long(argc, argv, "F:NPT:V", options,
1475 NULL)) != -1) {
1476 switch (opt) {
1477 case 'F':
1478 pidfile = optarg;
1479 break;
1480 case 'N':
1481 dofork = false;
1482 break;
1483 case 'P':
1484 outputpid = true;
1485 break;
1486 case 'T':
1487 tracefd = open(optarg, O_WRONLY|O_CREAT|O_APPEND, 0600);
1488 if (tracefd < 0)
1489 barf_perror("Could not open tracefile %s",
1490 optarg);
1491 write(tracefd, "\n***\n", strlen("\n***\n"));
1492 break;
1493 case 'V':
1494 verbose = true;
1495 break;
1498 if (optind != argc)
1499 barf("%s: No arguments desired", argv[0]);
1501 if (dofork) {
1502 openlog("xenstored", 0, LOG_DAEMON);
1503 daemonize();
1505 if (pidfile)
1506 write_pidfile(pidfile);
1508 talloc_enable_leak_report_full();
1510 /* Create sockets for them to listen to. */
1511 sock = talloc(talloc_autofree_context(), int);
1512 *sock = socket(PF_UNIX, SOCK_STREAM, 0);
1513 if (*sock < 0)
1514 barf_perror("Could not create socket");
1515 ro_sock = talloc(talloc_autofree_context(), int);
1516 *ro_sock = socket(PF_UNIX, SOCK_STREAM, 0);
1517 if (*ro_sock < 0)
1518 barf_perror("Could not create socket");
1519 talloc_set_destructor(sock, destroy_fd);
1520 talloc_set_destructor(ro_sock, destroy_fd);
1522 /* Don't kill us with SIGPIPE. */
1523 signal(SIGPIPE, SIG_IGN);
1525 /* FIXME: Be more sophisticated, don't mug running daemon. */
1526 unlink(xs_daemon_socket());
1527 unlink(xs_daemon_socket_ro());
1529 addr.sun_family = AF_UNIX;
1530 strcpy(addr.sun_path, xs_daemon_socket());
1531 if (bind(*sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
1532 barf_perror("Could not bind socket to %s", xs_daemon_socket());
1533 strcpy(addr.sun_path, xs_daemon_socket_ro());
1534 if (bind(*ro_sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
1535 barf_perror("Could not bind socket to %s",
1536 xs_daemon_socket_ro());
1537 if (chmod(xs_daemon_socket(), 0600) != 0
1538 || chmod(xs_daemon_socket_ro(), 0660) != 0)
1539 barf_perror("Could not chmod sockets");
1541 if (listen(*sock, 1) != 0
1542 || listen(*ro_sock, 1) != 0)
1543 barf_perror("Could not listen on sockets");
1545 /* If we're the first, create .perms file for root. */
1546 setup_structure();
1548 /* Listen to hypervisor. */
1549 event_fd = domain_init();
1551 /* Restore existing connections. */
1552 restore_existing_connections();
1554 if (outputpid) {
1555 printf("%i\n", getpid());
1556 fflush(stdout);
1559 /* close stdin/stdout now we're ready to accept connections */
1560 if (dofork) {
1561 close(STDIN_FILENO);
1562 close(STDOUT_FILENO);
1563 close(STDERR_FILENO);
1566 #ifdef TESTING
1567 signal(SIGUSR1, stop_failtest);
1568 #endif
1570 /* Get ready to listen to the tools. */
1571 max = initialize_set(&inset, &outset, *sock, *ro_sock, event_fd);
1573 /* Main loop. */
1574 /* FIXME: Rewrite so noone can starve. */
1575 for (;;) {
1576 struct connection *i;
1578 if (select(max+1, &inset, &outset, NULL, NULL) < 0) {
1579 if (errno == EINTR)
1580 continue;
1581 barf_perror("Select failed");
1584 if (FD_ISSET(*sock, &inset))
1585 accept_connection(*sock, true);
1587 if (FD_ISSET(*ro_sock, &inset))
1588 accept_connection(*ro_sock, false);
1590 if (FD_ISSET(event_fd, &inset))
1591 handle_event(event_fd);
1593 list_for_each_entry(i, &connections, list) {
1594 if (i->domain)
1595 continue;
1597 /* Operations can delete themselves or others
1598 * (xs_release): list is not safe after input,
1599 * so break. */
1600 if (FD_ISSET(i->fd, &inset)) {
1601 handle_input(i);
1602 break;
1604 if (FD_ISSET(i->fd, &outset)) {
1605 handle_output(i);
1606 break;
1610 /* Handle all possible I/O for domain connections. */
1611 more:
1612 list_for_each_entry(i, &connections, list) {
1613 if (!i->domain)
1614 continue;
1616 if (domain_can_read(i)) {
1617 handle_input(i);
1618 goto more;
1621 if (domain_can_write(i)) {
1622 handle_output(i);
1623 goto more;
1627 max = initialize_set(&inset, &outset, *sock, *ro_sock,
1628 event_fd);