ia64/xen-unstable

view tools/xenstore/xenstored_core.c @ 5867:932fc8a1b38d

# HG changeset patch
# User Rusty Russell <rusty@rustcorp.com.au>
# Node ID a92163adedcfcff0d05c965d09da747f3c8aa13e
# Parent 63ab20781afa311300f3a8e832744292014ea7f6

Remove ill-conceived concept of watches blocking reply on connection which did write/mkdir/rm/setperm etc.
This causes deadlocks in real life, and I can't see a sane way of avoiding them: it is reasonable for someone to ignore watch notifications while doing other actions, and that means that we can do other writes. These writes can block pending other watchers; if one of these is the process blocked awaiting our ack, we deadlock.

diff -r 63ab20781afa -r a92163adedcf tools/xenstore/xenstored_core.c
author cl349@firebug.cl.cam.ac.uk
date Tue Jul 26 13:11:01 2005 +0000 (2005-07-26)
parents a83ac0806d6b
children 71271a3f41a9
line source
1 /*
2 Simple prototype Xen Store Daemon providing simple tree-like database.
3 Copyright (C) 2005 Rusty Russell IBM Corporation
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <sys/socket.h>
23 #include <sys/select.h>
24 #include <sys/un.h>
25 #include <sys/time.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 #include <stdbool.h>
30 #include <stdio.h>
31 #include <stdarg.h>
32 #include <stdlib.h>
33 #include <syslog.h>
34 #include <string.h>
35 #include <errno.h>
36 #include <dirent.h>
37 #include <getopt.h>
38 #include <signal.h>
39 #include <assert.h>
40 #include <setjmp.h>
42 //#define DEBUG
43 #include "utils.h"
44 #include "list.h"
45 #include "talloc.h"
46 #include "xs_lib.h"
47 #include "xenstored.h"
48 #include "xenstored_core.h"
49 #include "xenstored_watch.h"
50 #include "xenstored_transaction.h"
51 #include "xenstored_domain.h"
/* When set, write_message() and consider_message() report each message
 * through xprintf(). */
53 static bool verbose;
/* Every live connection; initialize_set() walks this list and
 * destroy_conn() unlinks from it. */
54 static LIST_HEAD(connections);
/* Descriptor the trace_*() helpers write to; they do nothing while it
 * is negative. */
55 static int tracefd = -1;
57 #ifdef TESTING
/* Failure injection switch for the TESTING build. */
static bool failtest = false;

/* We override talloc's malloc.
 *
 * While failtest is set, an allocation fails (returns NULL) whenever
 * random() % 32 comes out as zero; otherwise this is plain malloc(). */
void *test_malloc(size_t size)
{
	bool inject_failure = failtest && (random() % 32) == 0;

	return inject_failure ? NULL : malloc(size);
}
69 static void stop_failtest(int signum __attribute__((unused)))
70 {
71 failtest = false;
72 }
74 /* Need these before we #define away write_all/mkdir in testing.h */
75 bool test_write_all(int fd, void *contents, unsigned int len);
76 bool test_write_all(int fd, void *contents, unsigned int len)
77 {
78 if (failtest && (random() % 8) == 0) {
79 if (len)
80 len = random() % len;
81 write(fd, contents, len);
82 errno = ENOSPC;
83 return false;
84 }
85 return xs_write_all(fd, contents, len);
86 }
88 int test_mkdir(const char *dir, int perms);
89 int test_mkdir(const char *dir, int perms)
90 {
91 if (failtest && (random() % 8) == 0) {
92 errno = ENOSPC;
93 return -1;
94 }
95 return mkdir(dir, perms);
96 }
97 #endif /* TESTING */
99 #include "xenstored_test.h"
101 /* FIXME: Ideally, this should never be called. Some can be eliminated. */
102 /* Something is horribly wrong: shutdown immediately. */
103 void __attribute__((noreturn)) corrupt(struct connection *conn,
104 const char *fmt, ...)
105 {
106 va_list arglist;
107 char *str;
108 int saved_errno = errno;
110 va_start(arglist, fmt);
111 str = talloc_vasprintf(NULL, fmt, arglist);
112 va_end(arglist);
114 eprintf("xenstored corruption: connection id %i: err %s: %s",
115 conn ? (int)conn->id : -1, strerror(saved_errno), str);
116 #ifdef TESTING
117 /* Allow them to attach debugger. */
118 sleep(30);
119 #endif
120 syslog(LOG_DAEMON,
121 "xenstored corruption: connection id %i: err %s: %s",
122 conn ? (int)conn->id : -1, strerror(saved_errno), str);
123 _exit(2);
124 }
126 static char *sockmsg_string(enum xsd_sockmsg_type type)
127 {
128 switch (type) {
129 case XS_DEBUG: return "DEBUG";
130 case XS_SHUTDOWN: return "SHUTDOWN";
131 case XS_DIRECTORY: return "DIRECTORY";
132 case XS_READ: return "READ";
133 case XS_GET_PERMS: return "GET_PERMS";
134 case XS_WATCH: return "WATCH";
135 case XS_WATCH_ACK: return "WATCH_ACK";
136 case XS_UNWATCH: return "UNWATCH";
137 case XS_TRANSACTION_START: return "TRANSACTION_START";
138 case XS_TRANSACTION_END: return "TRANSACTION_END";
139 case XS_INTRODUCE: return "INTRODUCE";
140 case XS_RELEASE: return "RELEASE";
141 case XS_GETDOMAINPATH: return "GETDOMAINPATH";
142 case XS_WRITE: return "WRITE";
143 case XS_MKDIR: return "MKDIR";
144 case XS_RM: return "RM";
145 case XS_SET_PERMS: return "SET_PERMS";
146 case XS_WATCH_EVENT: return "WATCH_EVENT";
147 case XS_ERROR: return "ERROR";
148 default:
149 return "**UNKNOWN**";
150 }
151 }
153 static void trace_io(const struct connection *conn,
154 const char *prefix,
155 const struct buffered_data *data)
156 {
157 char string[64];
158 unsigned int i;
160 if (tracefd < 0)
161 return;
163 write(tracefd, prefix, strlen(prefix));
164 sprintf(string, " %p ", conn);
165 write(tracefd, string, strlen(string));
166 write(tracefd, sockmsg_string(data->hdr.msg.type),
167 strlen(sockmsg_string(data->hdr.msg.type)));
168 write(tracefd, " (", 2);
169 for (i = 0; i < data->hdr.msg.len; i++) {
170 if (data->buffer[i] == '\0')
171 write(tracefd, " ", 1);
172 else
173 write(tracefd, data->buffer + i, 1);
174 }
175 write(tracefd, ")\n", 2);
176 }
178 void trace_create(const void *data, const char *type)
179 {
180 char string[64];
181 if (tracefd < 0)
182 return;
184 write(tracefd, "CREATE ", strlen("CREATE "));
185 write(tracefd, type, strlen(type));
186 sprintf(string, " %p\n", data);
187 write(tracefd, string, strlen(string));
188 }
190 void trace_destroy(const void *data, const char *type)
191 {
192 char string[64];
193 if (tracefd < 0)
194 return;
196 write(tracefd, "DESTROY ", strlen("DESTROY "));
197 write(tracefd, type, strlen(type));
198 sprintf(string, " %p\n", data);
199 write(tracefd, string, strlen(string));
200 }
202 void trace_watch_timeout(const struct connection *conn, const char *node, const char *token)
203 {
204 char string[64];
205 if (tracefd < 0)
206 return;
207 write(tracefd, "WATCH_TIMEOUT ", strlen("WATCH_TIMEOUT "));
208 sprintf(string, " %p ", conn);
209 write(tracefd, string, strlen(string));
210 write(tracefd, " (", 2);
211 write(tracefd, node, strlen(node));
212 write(tracefd, " ", 1);
213 write(tracefd, token, strlen(token));
214 write(tracefd, ")\n", 2);
215 }
217 static void trace_blocked(const struct connection *conn,
218 const struct buffered_data *data)
219 {
220 char string[64];
222 if (tracefd < 0)
223 return;
225 write(tracefd, "BLOCKED", strlen("BLOCKED"));
226 sprintf(string, " %p (", conn);
227 write(tracefd, string, strlen(string));
228 write(tracefd, sockmsg_string(data->hdr.msg.type),
229 strlen(sockmsg_string(data->hdr.msg.type)));
230 write(tracefd, ")\n", 2);
231 }
233 static bool write_message(struct connection *conn)
234 {
235 int ret;
236 struct buffered_data *out = conn->out;
238 if (out->inhdr) {
239 if (verbose)
240 xprintf("Writing msg %s (%s) out to %p\n",
241 sockmsg_string(out->hdr.msg.type),
242 out->buffer, conn);
243 ret = conn->write(conn, out->hdr.raw + out->used,
244 sizeof(out->hdr) - out->used);
245 if (ret < 0)
246 return false;
248 out->used += ret;
249 if (out->used < sizeof(out->hdr))
250 return true;
252 out->inhdr = false;
253 out->used = 0;
255 /* Second write might block if non-zero. */
256 if (out->hdr.msg.len)
257 return true;
258 }
260 ret = conn->write(conn, out->buffer + out->used,
261 out->hdr.msg.len - out->used);
263 if (ret < 0)
264 return false;
266 out->used += ret;
267 if (out->used != out->hdr.msg.len)
268 return true;
270 trace_io(conn, "OUT", out);
271 conn->out = NULL;
272 talloc_free(out);
274 queue_next_event(conn);
275 return true;
276 }
278 static int destroy_conn(void *_conn)
279 {
280 struct connection *conn = _conn;
282 /* Flush outgoing if possible, but don't block. */
283 if (!conn->domain) {
284 fd_set set;
285 struct timeval none;
287 FD_ZERO(&set);
288 FD_SET(conn->fd, &set);
289 none.tv_sec = none.tv_usec = 0;
291 while (conn->out
292 && select(conn->fd+1, NULL, &set, NULL, &none) == 1)
293 if (!write_message(conn))
294 break;
295 close(conn->fd);
296 }
297 list_del(&conn->list);
298 trace_destroy(conn, "connection");
299 return 0;
300 }
302 static int initialize_set(fd_set *inset, fd_set *outset, int sock, int ro_sock,
303 int event_fd)
304 {
305 struct connection *i;
306 int max;
308 FD_ZERO(inset);
309 FD_ZERO(outset);
310 FD_SET(sock, inset);
311 max = sock;
312 FD_SET(ro_sock, inset);
313 if (ro_sock > max)
314 max = ro_sock;
315 FD_SET(event_fd, inset);
316 if (event_fd > max)
317 max = event_fd;
318 list_for_each_entry(i, &connections, list) {
319 if (i->domain)
320 continue;
321 if (!i->blocked)
322 FD_SET(i->fd, inset);
323 if (i->out)
324 FD_SET(i->fd, outset);
325 if (i->fd > max)
326 max = i->fd;
327 }
328 return max;
329 }
/* Read everything from a talloc_open'ed fd.
 *
 * The buffer is allocated as a talloc child of fd, starting at 4 bytes
 * and doubling each time it fills.  *size receives the number of bytes
 * read.  Returns the buffer, or NULL if read() failed. */
void *read_all(int *fd, unsigned int *size)
{
	unsigned int capacity = 4;
	void *buffer = talloc_size(fd, capacity);
	int got;

	*size = 0;
	for (;;) {
		got = read(*fd, (char *)buffer + *size, capacity - *size);
		if (got <= 0)
			break;

		*size += got;
		if (*size == capacity) {
			capacity *= 2;
			buffer = talloc_realloc_size(fd, buffer, capacity);
		}
	}
	return got < 0 ? NULL : buffer;
}
/* talloc destructor for a descriptor made by talloc_open(): closes it.
 * Always returns 0. */
static int destroy_fd(void *_fd)
{
	close(*(int *)_fd);
	return 0;
}
356 /* Return a pointer to an fd, self-closing and attached to this pathname. */
357 int *talloc_open(const char *pathname, int flags, int mode)
358 {
359 int *fd;
361 fd = talloc(pathname, int);
362 *fd = open(pathname, flags, mode);
363 if (*fd < 0) {
364 int saved_errno = errno;
365 talloc_free(fd);
366 errno = saved_errno;
367 return NULL;
368 }
369 talloc_set_destructor(fd, destroy_fd);
370 return fd;
371 }
/* Is child a subnode of parent, or equal?
 *
 * True when parent is "/", or when child starts with parent and the
 * next character is either '/' or the end of the string. */
bool is_child(const char *child, const char *parent)
{
	unsigned int plen;

	/* / should really be "" for this algorithm to work, but that's a
	 * usability nightmare. */
	if (streq(parent, "/"))
		return true;

	plen = strlen(parent);
	if (strncmp(child, parent, plen) != 0)
		return false;

	switch (child[plen]) {
	case '/':
	case '\0':
		return true;
	default:
		return false;
	}
}
/* Path for a node outside any transaction: the xs_daemon_store() path
 * itself for "/", otherwise that path followed by the node name.
 * Answer never ends in /.  The result is a talloc child of node. */
char *node_dir_outside_transaction(const char *node)
{
	const char *store = xs_daemon_store();

	if (streq(node, "/"))
		return talloc_strdup(node, store);
	return talloc_asprintf(node, "%s%s", store, node);
}
/* Directory backing a node: the transaction's copy when a transaction
 * is given and within_transaction() says the node belongs to it,
 * otherwise the path outside any transaction. */
static char *node_dir(struct transaction *trans, const char *node)
{
	if (trans && within_transaction(trans, node))
		return node_dir_inside_transaction(trans, node);
	return node_dir_outside_transaction(node);
}
/* Path of the ".data" file inside a node's directory (talloc child of
 * node). */
static char *node_datafile(struct transaction *trans, const char *node)
{
	char *dir = node_dir(trans, node);

	return talloc_asprintf(node, "%s/.data", dir);
}
/* Path of the ".perms" file inside a node's directory (talloc child of
 * node). */
static char *node_permfile(struct transaction *trans, const char *node)
{
	char *dir = node_dir(trans, node);

	return talloc_asprintf(node, "%s/.perms", dir);
}
414 struct buffered_data *new_buffer(void *ctx)
415 {
416 struct buffered_data *data;
418 data = talloc(ctx, struct buffered_data);
419 data->inhdr = true;
420 data->used = 0;
421 data->buffer = NULL;
423 return data;
424 }
426 /* Return length of string (including nul) at this offset. */
427 unsigned int get_string(const struct buffered_data *data, unsigned int offset)
428 {
429 const char *nul;
431 if (offset >= data->used)
432 return 0;
434 nul = memchr(data->buffer + offset, 0, data->used - offset);
435 if (!nul)
436 return 0;
438 return nul - (data->buffer + offset) + 1;
439 }
441 /* Break input into vectors, return the number, fill in up to num of them. */
442 unsigned int get_strings(struct buffered_data *data,
443 char *vec[], unsigned int num)
444 {
445 unsigned int off, i, len;
447 off = i = 0;
448 while ((len = get_string(data, off)) != 0) {
449 if (i < num)
450 vec[i] = data->buffer + off;
451 i++;
452 off += len;
453 }
454 return i;
455 }
457 /* Returns "false", meaning "connection is not blocked". */
458 bool send_reply(struct connection *conn, enum xsd_sockmsg_type type,
459 const void *data, unsigned int len)
460 {
461 struct buffered_data *bdata;
463 /* When data gets freed, we want list entry is destroyed (so
464 * list entry is a child). */
465 bdata = new_buffer(conn);
466 bdata->buffer = talloc_array(bdata, char, len);
468 bdata->hdr.msg.type = type;
469 bdata->hdr.msg.len = len;
470 memcpy(bdata->buffer, data, len);
472 /* There might be an event going out now. Queue behind it. */
473 if (conn->out) {
474 assert(conn->out->hdr.msg.type == XS_WATCH_EVENT);
475 assert(!conn->waiting_reply);
476 conn->waiting_reply = bdata;
477 } else
478 conn->out = bdata;
479 return false;
480 }
482 /* Some routines (write, mkdir, etc) just need a non-error return */
483 bool send_ack(struct connection *conn, enum xsd_sockmsg_type type)
484 {
485 return send_reply(conn, type, "OK", sizeof("OK"));
486 }
488 bool send_error(struct connection *conn, int error)
489 {
490 unsigned int i;
492 for (i = 0; error != xsd_errors[i].errnum; i++)
493 if (i == ARRAY_SIZE(xsd_errors) - 1)
494 corrupt(conn, "Unknown error %i (%s)", error,
495 strerror(error));
497 return send_reply(conn, XS_ERROR, xsd_errors[i].errstring,
498 strlen(xsd_errors[i].errstring) + 1);
499 }
/* True when every character of node is a letter, a digit, or one of
 * '-', '/', '_', '@'.  The empty string passes. */
static bool valid_chars(const char *node)
{
	/* Nodes can have lots of crap. */
	static const char allowed[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789-/_@";

	/* The allowed prefix spans the whole string iff it stops at the NUL. */
	return node[strspn(node, allowed)] == '\0';
}
/* A node name is valid when it starts with '/', does not end in '/'
 * (unless it is exactly "/"), contains no "//", and consists only of
 * the characters valid_chars() accepts. */
bool is_valid_nodename(const char *node)
{
	/* Must start in /. */
	if (!strstarts(node, "/"))
		return false;

	/* Cannot end in / (unless it's just "/"). */
	if (!streq(node, "/") && strends(node, "/"))
		return false;

	/* No double //. */
	if (strstr(node, "//") != NULL)
		return false;

	return valid_chars(node);
}
527 /* We expect one arg in the input: return NULL otherwise. */
528 static const char *onearg(struct buffered_data *in)
529 {
530 if (get_string(in, 0) != in->used)
531 return NULL;
532 return in->buffer;
533 }
535 /* If it fails, returns NULL and sets errno. */
536 static struct xs_permissions *get_perms(struct transaction *transaction,
537 const char *node, unsigned int *num)
538 {
539 unsigned int size;
540 char *strings;
541 struct xs_permissions *ret;
542 int *fd;
544 fd = talloc_open(node_permfile(transaction, node), O_RDONLY, 0);
545 if (!fd)
546 return NULL;
547 strings = read_all(fd, &size);
548 if (!strings)
549 return NULL;
551 *num = xs_count_strings(strings, size);
552 ret = talloc_array(node, struct xs_permissions, *num);
553 if (!xs_strings_to_perms(ret, *num, strings))
554 corrupt(NULL, "Permissions corrupt for %s", node);
556 return ret;
557 }
559 static char *perms_to_strings(const char *node,
560 struct xs_permissions *perms, unsigned int num,
561 unsigned int *len)
562 {
563 unsigned int i;
564 char *strings = NULL;
565 char buffer[MAX_STRLEN(domid_t) + 1];
567 for (*len = 0, i = 0; i < num; i++) {
568 if (!xs_perm_to_string(&perms[i], buffer))
569 return NULL;
571 strings = talloc_realloc(node, strings, char,
572 *len + strlen(buffer) + 1);
573 strcpy(strings + *len, buffer);
574 *len += strlen(buffer) + 1;
575 }
576 return strings;
577 }
/* Destroy this, and its children, and its children's children.
 *
 * If path cannot be opened as a directory it is unlinked (an already
 * missing path counts as success).  Otherwise every entry except "."
 * and ".." is destroyed recursively and the directory is removed.  Any
 * other failure goes to corrupt().  Always returns 0. */
int destroy_path(void *path)
{
	struct dirent *entry;
	DIR *dir = opendir(path);

	if (!dir) {
		if (unlink(path) == 0 || errno == ENOENT)
			return 0;
		corrupt(NULL, "Destroying path %s", path);
	}

	while ((entry = readdir(dir)) != NULL) {
		const char *name = entry->d_name;

		if (streq(name, ".") || streq(name, ".."))
			continue;

		char fullpath[strlen(path) + 1 + strlen(name) + 1];

		sprintf(fullpath, "%s/%s", (char *)path, name);
		destroy_path(fullpath);
	}
	closedir(dir);

	if (rmdir(path) != 0)
		corrupt(NULL, "Destroying directory %s", path);
	return 0;
}
604 /* Create a self-destructing temporary file */
605 static char *tempfile(const char *path, void *contents, unsigned int len)
606 {
607 int *fd;
608 char *tmppath = talloc_asprintf(path, "%s.tmp", path);
610 fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640);
611 if (!fd)
612 return NULL;
613 talloc_set_destructor(tmppath, destroy_path);
614 if (!xs_write_all(*fd, contents, len))
615 return NULL;
617 return tmppath;
618 }
/* talloc destructor for a DIR* made by talloc_opendir(): closes it.
 * Always returns 0. */
static int destroy_opendir(void *_dir)
{
	closedir(*(DIR **)_dir);
	return 0;
}
627 /* Return a pointer to a DIR*, self-closing and attached to this pathname. */
628 DIR **talloc_opendir(const char *pathname)
629 {
630 DIR **dir;
632 dir = talloc(pathname, DIR *);
633 *dir = opendir(pathname);
634 if (!*dir) {
635 int saved_errno = errno;
636 talloc_free(dir);
637 errno = saved_errno;
638 return NULL;
639 }
640 talloc_set_destructor(dir, destroy_opendir);
641 return dir;
642 }
/* Rename a temporary file to its real name, which is path with
 * everything from the last '.' onward removed, and clear the talloc
 * destructor on path.
 *
 * We assume rename() doesn't fail on moves in same dir; if it does,
 * corrupt() is called. */
static void commit_tempfile(const char *path)
{
	const char *suffix = strrchr(path, '.');
	unsigned int stem_len = suffix - path;
	char realname[strlen(path) + 1];

	memcpy(realname, path, stem_len);
	realname[stem_len] = '\0';

	if (rename(path, realname) != 0)
		corrupt(NULL, "Committing %s", realname);
	talloc_set_destructor(path, NULL);
}
/* Replace a node's permission file.
 *
 * The new list is serialised, written to a temporary file beside the
 * real one, then renamed into place.  Returns false if serialisation or
 * the temporary file fails. */
static bool set_perms(struct transaction *transaction,
		      const char *node,
		      struct xs_permissions *perms, unsigned int num)
{
	unsigned int len;
	char *strings = perms_to_strings(node, perms, num, &len);
	char *permpath;

	if (!strings)
		return false;

	/* Create then move. */
	permpath = tempfile(node_permfile(transaction, node), strings, len);
	if (permpath)
		commit_tempfile(permpath);

	return permpath != NULL;
}
/* Return the parent of a node name as a talloc child of node.
 *
 * The parent is everything before the last '/' found after the first
 * character; when there is no such '/', the parent is "/". */
static char *get_parent(const char *node)
{
	char *slash = strrchr(node + 1, '/');

	if (!slash)
		return talloc_strdup(node, "/");

	/* The "*" precision must be an int; a bare pointer difference is
	 * a ptrdiff_t and would be misread on 64-bit targets. */
	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
}
685 static enum xs_perm_type perm_for_id(domid_t id,
686 struct xs_permissions *perms,
687 unsigned int num)
688 {
689 unsigned int i;
691 /* Owners and tools get it all... */
692 if (!id || perms[0].id == id)
693 return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_CREATE|XS_PERM_OWNER;
695 for (i = 1; i < num; i++)
696 if (perms[i].id == id)
697 return perms[i].perms;
699 return perms[0].perms;
700 }
702 /* We have a weird permissions system. You can allow someone into a
703 * specific node without allowing it in the parents. If it's going to
704 * fail, however, we don't want the errno to indicate any information
705 * about the node. */
706 static int check_with_parents(struct connection *conn, const char *node,
707 int errnum)
708 {
709 struct xs_permissions *perms;
710 unsigned int num;
712 /* We always tell them about memory failures. */
713 if (errnum == ENOMEM)
714 return errnum;
716 do {
717 node = get_parent(node);
718 perms = get_perms(conn->transaction, node, &num);
719 if (perms)
720 break;
721 } while (!streq(node, "/"));
723 /* No permission at root? We're in trouble. */
724 if (!perms)
725 corrupt(conn, "No permissions file at root");
727 if (!(perm_for_id(conn->id, perms, num) & XS_PERM_READ))
728 return EACCES;
730 return errnum;
731 }
/* Turn a relative node name into an absolute one.
 *
 * NULL and names already starting with '/' come back unchanged.  A
 * relative name is prefixed with get_implicit_path(conn) and a '/'
 * (result is a talloc child of node); if there is no implicit path the
 * name is returned as is. */
char *canonicalize(struct connection *conn, const char *node)
{
	const char *prefix;

	if (!node || strstarts(node, "/"))
		return (char *)node;

	prefix = get_implicit_path(conn);
	return prefix ? talloc_asprintf(node, "%s/%s", prefix, node)
		      : (char *)node;
}
745 bool check_node_perms(struct connection *conn, const char *node,
746 enum xs_perm_type perm)
747 {
748 struct xs_permissions *perms;
749 unsigned int num;
751 if (!node || !is_valid_nodename(node)) {
752 errno = EINVAL;
753 return false;
754 }
756 if (!conn->can_write && (perm & XS_PERM_WRITE)) {
757 errno = EROFS;
758 return false;
759 }
761 perms = get_perms(conn->transaction, node, &num);
762 /* No permissions. If we want to create it and
763 * it doesn't exist, check parent directory. */
764 if (!perms && errno == ENOENT && (perm & XS_PERM_CREATE)) {
765 char *parent = get_parent(node);
766 if (!parent)
767 return false;
769 perms = get_perms(conn->transaction, parent, &num);
770 }
771 if (!perms) {
772 errno = check_with_parents(conn, node, errno);
773 return false;
774 }
776 if (perm_for_id(conn->id, perms, num) & perm)
777 return true;
779 errno = check_with_parents(conn, node, EACCES);
780 return false;
781 }
783 static bool send_directory(struct connection *conn, const char *node)
784 {
785 char *path, *reply = talloc_strdup(node, "");
786 unsigned int reply_len = 0;
787 DIR **dir;
788 struct dirent *dirent;
790 node = canonicalize(conn, node);
791 if (!check_node_perms(conn, node, XS_PERM_READ))
792 return send_error(conn, errno);
794 path = node_dir(conn->transaction, node);
795 dir = talloc_opendir(path);
796 if (!dir)
797 return send_error(conn, errno);
799 while ((dirent = readdir(*dir)) != NULL) {
800 int len = strlen(dirent->d_name) + 1;
802 if (!valid_chars(dirent->d_name))
803 continue;
805 reply = talloc_realloc(path, reply, char, reply_len + len);
806 strcpy(reply + reply_len, dirent->d_name);
807 reply_len += len;
808 }
810 return send_reply(conn, XS_DIRECTORY, reply, reply_len);
811 }
813 static bool do_read(struct connection *conn, const char *node)
814 {
815 char *value;
816 unsigned int size;
817 int *fd;
819 node = canonicalize(conn, node);
820 if (!check_node_perms(conn, node, XS_PERM_READ))
821 return send_error(conn, errno);
823 fd = talloc_open(node_datafile(conn->transaction, node), O_RDONLY, 0);
824 if (!fd) {
825 /* Data file doesn't exist? We call that a directory */
826 if (errno == ENOENT)
827 errno = EISDIR;
828 return send_error(conn, errno);
829 }
831 value = read_all(fd, &size);
832 if (!value)
833 return send_error(conn, errno);
835 return send_reply(conn, XS_READ, value, size);
836 }
838 /* Create a new directory. Optionally put data in it (if data != NULL) */
839 static bool new_directory(struct connection *conn,
840 const char *node, void *data, unsigned int datalen)
841 {
842 struct xs_permissions *perms;
843 char *permstr;
844 unsigned int num, len;
845 int *fd;
846 char *dir = node_dir(conn->transaction, node);
848 if (mkdir(dir, 0750) != 0)
849 return false;
851 /* Set destructor so we clean up if neccesary. */
852 talloc_set_destructor(dir, destroy_path);
854 perms = get_perms(conn->transaction, get_parent(node), &num);
855 /* Domains own what they create. */
856 if (conn->id)
857 perms->id = conn->id;
859 permstr = perms_to_strings(dir, perms, num, &len);
860 fd = talloc_open(node_permfile(conn->transaction, node),
861 O_WRONLY|O_CREAT|O_EXCL, 0640);
862 if (!fd || !xs_write_all(*fd, permstr, len))
863 return false;
865 if (data) {
866 char *datapath = node_datafile(conn->transaction, node);
868 fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640);
869 if (!fd || !xs_write_all(*fd, data, datalen))
870 return false;
871 }
873 /* Finished! */
874 talloc_set_destructor(dir, NULL);
875 return true;
876 }
878 /* path, flags, data... */
879 static bool do_write(struct connection *conn, struct buffered_data *in)
880 {
881 unsigned int offset, datalen;
882 char *vec[2];
883 char *node, *tmppath;
884 enum xs_perm_type mode;
885 struct stat st;
887 /* Extra "strings" can be created by binary data. */
888 if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
889 return send_error(conn, EINVAL);
891 node = canonicalize(conn, vec[0]);
892 if (/*suppress error on write outside transaction*/ 0 &&
893 !within_transaction(conn->transaction, node))
894 return send_error(conn, EROFS);
896 if (transaction_block(conn, node))
897 return true;
899 offset = strlen(vec[0]) + strlen(vec[1]) + 2;
900 datalen = in->used - offset;
902 if (streq(vec[1], XS_WRITE_NONE))
903 mode = XS_PERM_WRITE;
904 else if (streq(vec[1], XS_WRITE_CREATE))
905 mode = XS_PERM_WRITE|XS_PERM_CREATE;
906 else if (streq(vec[1], XS_WRITE_CREATE_EXCL))
907 mode = XS_PERM_WRITE|XS_PERM_CREATE;
908 else
909 return send_error(conn, EINVAL);
911 if (!check_node_perms(conn, node, mode))
912 return send_error(conn, errno);
914 if (lstat(node_dir(conn->transaction, node), &st) != 0) {
915 /* Does not exist... */
916 if (errno != ENOENT)
917 return send_error(conn, errno);
919 /* Not going to create it? */
920 if (!(mode & XS_PERM_CREATE))
921 return send_error(conn, ENOENT);
923 if (!new_directory(conn, node, in->buffer + offset, datalen))
924 return send_error(conn, errno);
925 } else {
926 /* Exists... */
927 if (streq(vec[1], XS_WRITE_CREATE_EXCL))
928 return send_error(conn, EEXIST);
930 tmppath = tempfile(node_datafile(conn->transaction, node),
931 in->buffer + offset, datalen);
932 if (!tmppath)
933 return send_error(conn, errno);
935 commit_tempfile(tmppath);
936 }
938 add_change_node(conn->transaction, node, false);
939 send_ack(conn, XS_WRITE);
940 fire_watches(conn->transaction, node, false);
941 return false;
942 }
944 static bool do_mkdir(struct connection *conn, const char *node)
945 {
946 node = canonicalize(conn, node);
947 if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_CREATE))
948 return send_error(conn, errno);
950 if (!within_transaction(conn->transaction, node))
951 return send_error(conn, EROFS);
953 if (transaction_block(conn, node))
954 return true;
956 if (!new_directory(conn, node, NULL, 0))
957 return send_error(conn, errno);
959 add_change_node(conn->transaction, node, false);
960 send_ack(conn, XS_MKDIR);
961 fire_watches(conn->transaction, node, false);
962 return false;
963 }
965 static bool do_rm(struct connection *conn, const char *node)
966 {
967 char *tmppath, *path;
969 node = canonicalize(conn, node);
970 if (!check_node_perms(conn, node, XS_PERM_WRITE))
971 return send_error(conn, errno);
973 if (!within_transaction(conn->transaction, node))
974 return send_error(conn, EROFS);
976 if (transaction_block(conn, node))
977 return true;
979 if (streq(node, "/"))
980 return send_error(conn, EINVAL);
982 /* We move the directory to temporary name, destructor cleans up. */
983 path = node_dir(conn->transaction, node);
984 tmppath = talloc_asprintf(node, "%s.tmp", path);
985 talloc_set_destructor(tmppath, destroy_path);
987 if (rename(path, tmppath) != 0)
988 return send_error(conn, errno);
990 add_change_node(conn->transaction, node, true);
991 send_ack(conn, XS_RM);
992 fire_watches(conn->transaction, node, true);
993 return false;
994 }
996 static bool do_get_perms(struct connection *conn, const char *node)
997 {
998 struct xs_permissions *perms;
999 char *strings;
1000 unsigned int len, num;
1002 node = canonicalize(conn, node);
1003 if (!check_node_perms(conn, node, XS_PERM_READ))
1004 return send_error(conn, errno);
1006 perms = get_perms(conn->transaction, node, &num);
1007 if (!perms)
1008 return send_error(conn, errno);
1010 strings = perms_to_strings(node, perms, num, &len);
1011 if (!strings)
1012 return send_error(conn, errno);
1014 return send_reply(conn, XS_GET_PERMS, strings, len);
1017 static bool do_set_perms(struct connection *conn, struct buffered_data *in)
1019 unsigned int num;
1020 char *node;
1021 struct xs_permissions *perms;
1023 num = xs_count_strings(in->buffer, in->used);
1024 if (num < 2)
1025 return send_error(conn, EINVAL);
1027 /* First arg is node name. */
1028 node = canonicalize(conn, in->buffer);
1029 in->buffer += strlen(in->buffer) + 1;
1030 num--;
1032 if (!within_transaction(conn->transaction, node))
1033 return send_error(conn, EROFS);
1035 if (transaction_block(conn, node))
1036 return true;
1038 /* We must own node to do this (tools can do this too). */
1039 if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_OWNER))
1040 return send_error(conn, errno);
1042 perms = talloc_array(node, struct xs_permissions, num);
1043 if (!xs_strings_to_perms(perms, num, in->buffer))
1044 return send_error(conn, errno);
1046 if (!set_perms(conn->transaction, node, perms, num))
1047 return send_error(conn, errno);
1048 add_change_node(conn->transaction, node, false);
1049 send_ack(conn, XS_SET_PERMS);
1050 fire_watches(conn->transaction, node, false);
1051 return false;
1054 /* Process "in" for conn: "in" will vanish after this conversation, so
1055 * we can talloc off it for temporary variables. May free "conn".
1056 * Returns true if can't complete due to block.
1057 */
1058 static bool process_message(struct connection *conn, struct buffered_data *in)
1060 switch (in->hdr.msg.type) {
1061 case XS_DIRECTORY:
1062 return send_directory(conn, onearg(in));
1064 case XS_READ:
1065 return do_read(conn, onearg(in));
1067 case XS_WRITE:
1068 return do_write(conn, in);
1070 case XS_MKDIR:
1071 return do_mkdir(conn, onearg(in));
1073 case XS_RM:
1074 return do_rm(conn, onearg(in));
1076 case XS_GET_PERMS:
1077 return do_get_perms(conn, onearg(in));
1079 case XS_SET_PERMS:
1080 return do_set_perms(conn, in);
1082 case XS_SHUTDOWN:
1083 /* FIXME: Implement gentle shutdown too. */
1084 /* Only tools can do this. */
1085 if (conn->id != 0)
1086 return send_error(conn, EACCES);
1087 if (!conn->can_write)
1088 return send_error(conn, EROFS);
1089 send_ack(conn, XS_SHUTDOWN);
1090 /* Everything hangs off auto-free context, freed at exit. */
1091 exit(0);
1093 case XS_DEBUG:
1094 if (streq(in->buffer, "print")) {
1095 xprintf("debug: %s", in->buffer + get_string(in, 0));
1096 return false;
1098 #ifdef TESTING
1099 /* For testing, we allow them to set id. */
1100 if (streq(in->buffer, "setid")) {
1101 conn->id = atoi(in->buffer + get_string(in, 0));
1102 send_ack(conn, XS_DEBUG);
1103 } else if (streq(in->buffer, "failtest")) {
1104 if (get_string(in, 0) < in->used)
1105 srandom(atoi(in->buffer + get_string(in, 0)));
1106 send_ack(conn, XS_DEBUG);
1107 failtest = true;
1109 #endif /* TESTING */
1110 return false;
1112 case XS_WATCH:
1113 return do_watch(conn, in);
1115 case XS_WATCH_ACK:
1116 return do_watch_ack(conn, onearg(in));
1118 case XS_UNWATCH:
1119 return do_unwatch(conn, in);
1121 case XS_TRANSACTION_START:
1122 return do_transaction_start(conn, onearg(in));
1124 case XS_TRANSACTION_END:
1125 return do_transaction_end(conn, onearg(in));
1127 case XS_INTRODUCE:
1128 return do_introduce(conn, in);
1130 case XS_RELEASE:
1131 return do_release(conn, onearg(in));
1133 case XS_GETDOMAINPATH:
1134 return do_get_domain_path(conn, onearg(in));
1136 case XS_WATCH_EVENT:
1137 default:
1138 eprintf("Client unknown operation %i", in->hdr.msg.type);
1139 send_error(conn, ENOSYS);
1140 return false;
/* talloc failure handler: data points at a jmp_buf, and control jumps
 * back to the matching setjmp() with value 1.  Never returns. */
static int out_of_mem(void *data)
{
	longjmp(*(jmp_buf *)data, 1);
}
1149 static void consider_message(struct connection *conn)
1151 struct buffered_data *in = NULL;
1152 enum xsd_sockmsg_type type = conn->in->hdr.msg.type;
1153 jmp_buf talloc_fail;
1155 /* For simplicity, we kill the connection on OOM. */
1156 talloc_set_fail_handler(out_of_mem, &talloc_fail);
1157 if (setjmp(talloc_fail)) {
1158 talloc_free(conn);
1159 goto end;
1162 if (verbose)
1163 xprintf("Got message %s len %i from %p\n",
1164 sockmsg_string(type), conn->in->hdr.msg.len, conn);
1166 /* We might get a command while waiting for an ack: this means
1167 * the other end discarded it: we will re-transmit. */
1168 if (type != XS_WATCH_ACK)
1169 conn->waiting_for_ack = false;
1171 /* Careful: process_message may free connection. We detach
1172 * "in" beforehand and allocate the new buffer to avoid
1173 * touching conn after process_message.
1174 */
1175 in = talloc_steal(talloc_autofree_context(), conn->in);
1176 conn->in = new_buffer(conn);
1177 if (process_message(conn, in)) {
1178 /* Blocked by transaction: queue for re-xmit. */
1179 talloc_free(conn->in);
1180 conn->in = in;
1181 in = NULL;
1182 trace_blocked(conn, conn->in);
1185 end:
1186 talloc_free(in);
1187 talloc_set_fail_handler(NULL, NULL);
1188 if (talloc_total_blocks(NULL)
1189 != talloc_total_blocks(talloc_autofree_context()) + 1)
1190 talloc_report_full(NULL, stderr);
1193 /* Errors in reading or allocating here mean we get out of sync, so we
1194 * drop the whole client connection. */
1195 void handle_input(struct connection *conn)
1197 int bytes;
1198 struct buffered_data *in;
1200 assert(!conn->blocked);
1201 in = conn->in;
1203 /* Not finished header yet? */
1204 if (in->inhdr) {
1205 bytes = conn->read(conn, in->hdr.raw + in->used,
1206 sizeof(in->hdr) - in->used);
1207 if (bytes <= 0)
1208 goto bad_client;
1209 in->used += bytes;
1210 if (in->used != sizeof(in->hdr))
1211 return;
1213 if (in->hdr.msg.len > PATH_MAX) {
1214 syslog(LOG_DAEMON, "Client tried to feed us %i",
1215 in->hdr.msg.len);
1216 goto bad_client;
1219 in->buffer = talloc_array(in, char, in->hdr.msg.len);
1220 if (!in->buffer)
1221 goto bad_client;
1222 in->used = 0;
1223 in->inhdr = false;
1224 return;
1227 bytes = conn->read(conn, in->buffer + in->used,
1228 in->hdr.msg.len - in->used);
1229 if (bytes < 0)
1230 goto bad_client;
1232 in->used += bytes;
1233 if (in->used != in->hdr.msg.len)
1234 return;
1236 trace_io(conn, "IN ", in);
1237 consider_message(conn);
1238 return;
1240 bad_client:
1241 /* Kill it. */
1242 talloc_free(conn);
/*
 * Push pending output for a connection via write_message(); if that
 * reports failure the connection is destroyed.
 */
void handle_output(struct connection *conn)
{
	if (write_message(conn))
		return;

	talloc_free(conn);
}
1251 /* If a transaction has ended, see if we can unblock any connections. */
1252 static void unblock_connections(void)
1254 struct connection *i, *tmp;
1256 list_for_each_entry_safe(i, tmp, &connections, list) {
1257 if (!i->blocked)
1258 continue;
1260 if (!transaction_covering_node(i->blocked)) {
1261 talloc_free(i->blocked);
1262 i->blocked = NULL;
1263 consider_message(i);
1267 /* To balance bias, move first entry to end. */
1268 if (!list_empty(&connections)) {
1269 i = list_top(&connections, struct connection, list);
1270 list_del(&i->list);
1271 list_add_tail(&i->list, &connections);
1275 struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
1277 struct connection *new;
1278 jmp_buf talloc_fail;
1280 new = talloc(talloc_autofree_context(), struct connection);
1281 if (!new)
1282 return NULL;
1284 new->blocked = false;
1285 new->out = new->waiting_reply = NULL;
1286 new->fd = -1;
1287 new->id = 0;
1288 new->domain = NULL;
1289 new->transaction = NULL;
1290 new->write = write;
1291 new->read = read;
1292 new->can_write = true;
1294 talloc_set_fail_handler(out_of_mem, &talloc_fail);
1295 if (setjmp(talloc_fail)) {
1296 talloc_free(new);
1297 return NULL;
1299 new->in = new_buffer(new);
1300 talloc_set_fail_handler(NULL, NULL);
1302 list_add_tail(&new->list, &connections);
1303 talloc_set_destructor(new, destroy_conn);
1304 trace_create(new, "connection");
1305 return new;
1308 static int writefd(struct connection *conn, const void *data, unsigned int len)
1310 return write(conn->fd, data, len);
1313 static int readfd(struct connection *conn, void *data, unsigned int len)
1315 return read(conn->fd, data, len);
1318 static void accept_connection(int sock, bool canwrite)
1320 int fd;
1321 struct connection *conn;
1323 fd = accept(sock, NULL, NULL);
1324 if (fd < 0)
1325 return;
1327 conn = new_connection(writefd, readfd);
1328 if (conn) {
1329 conn->fd = fd;
1330 conn->can_write = canwrite;
1331 } else
1332 close(fd);
/*
 * Calc timespan from now to absolute time.
 *
 * On entry *tv is an absolute time; on return it holds the interval from
 * the current time to that point, or zero if that point is already past.
 */
static void time_relative_to_now(struct timeval *tv)
{
	struct timeval current;
	bool expired;

	gettimeofday(&current, NULL);

	/* Same ordering as timercmp(&current, tv, >). */
	if (current.tv_sec == tv->tv_sec)
		expired = current.tv_usec > tv->tv_usec;
	else
		expired = current.tv_sec > tv->tv_sec;

	if (expired) {
		tv->tv_sec = 0;
		tv->tv_usec = 0;
		return;
	}

	/* Borrow one second when the microsecond part would go negative. */
	if (current.tv_usec > tv->tv_usec) {
		tv->tv_sec--;
		tv->tv_usec += 1000000;
	}
	tv->tv_sec -= current.tv_sec;
	tv->tv_usec -= current.tv_usec;
}
#ifdef TESTING
/*
 * Useful for running under debugger.
 *
 * Prints, for every entry on the global connection list, whichever of the
 * following are set: id, blocked string, waiting_for_ack, partial input
 * progress, the message being sent and the one queued behind it; then
 * calls dump_watches() for that connection.
 */
void dump_connection(void)
{
	struct connection *conn;

	list_for_each_entry(conn, &connections, list) {
		/* %p requires a void * argument. */
		printf("Connection %p:\n", (void *)conn);
		if (conn->id)
			printf(" id = %i\n", conn->id);
		if (conn->blocked)
			printf(" blocked on = %s\n", conn->blocked);
		if (conn->waiting_for_ack)
			printf(" waiting_for_ack TRUE\n");
		/* Only mention input when something has been received. */
		if (!conn->in->inhdr || conn->in->used)
			printf(" got %i bytes of %s\n",
			       conn->in->used,
			       conn->in->inhdr ? "header" : "data");
		if (conn->out)
			printf(" sending message %s (%s) out\n",
			       sockmsg_string(conn->out->hdr.msg.type),
			       conn->out->buffer);
		if (conn->waiting_reply)
			printf(" ... and behind is queued %s (%s)\n",
			       sockmsg_string(conn->waiting_reply->hdr.msg.type),
			       conn->waiting_reply->buffer);
#if 0
		if (conn->transaction)
			dump_transaction(conn);
		if (conn->domain)
			dump_domain(conn);
#endif
		dump_watches(conn);
	}
}
#endif
/* Long command-line options; each maps to the option letter main() switches on. */
static struct option options[] = {
	{ "no-fork",    no_argument,       NULL, 'N' },
	{ "verbose",    no_argument,       NULL, 'V' },
	{ "output-pid", no_argument,       NULL, 'P' },
	{ "trace-file", required_argument, NULL, 'T' },
	{ NULL,         0,                 NULL, 0 }	/* terminator */
};
1395 int main(int argc, char *argv[])
1397 int opt, *sock, *ro_sock, event_fd, max, tmpout;
1398 struct sockaddr_un addr;
1399 fd_set inset, outset;
1400 bool dofork = true;
1401 bool outputpid = false;
1403 while ((opt = getopt_long(argc, argv, "DVT:", options, NULL)) != -1) {
1404 switch (opt) {
1405 case 'N':
1406 dofork = false;
1407 break;
1408 case 'V':
1409 verbose = true;
1410 break;
1411 case 'P':
1412 outputpid = true;
1413 break;
1414 case 'T':
1415 tracefd = open(optarg, O_WRONLY|O_CREAT|O_APPEND, 0600);
1416 if (tracefd < 0)
1417 barf_perror("Could not open tracefile %s",
1418 optarg);
1419 write(tracefd, "\n***\n", strlen("\n***\n"));
1420 break;
1423 if (optind != argc)
1424 barf("%s: No arguments desired", argv[0]);
1426 talloc_enable_leak_report_full();
1428 /* Create sockets for them to listen to. */
1429 sock = talloc(talloc_autofree_context(), int);
1430 *sock = socket(PF_UNIX, SOCK_STREAM, 0);
1431 if (*sock < 0)
1432 barf_perror("Could not create socket");
1433 ro_sock = talloc(talloc_autofree_context(), int);
1434 *ro_sock = socket(PF_UNIX, SOCK_STREAM, 0);
1435 if (*ro_sock < 0)
1436 barf_perror("Could not create socket");
1437 talloc_set_destructor(sock, destroy_fd);
1438 talloc_set_destructor(ro_sock, destroy_fd);
1440 /* Don't kill us with SIGPIPE. */
1441 signal(SIGPIPE, SIG_IGN);
1443 /* FIXME: Be more sophisticated, don't mug running daemon. */
1444 unlink(xs_daemon_socket());
1445 unlink(xs_daemon_socket_ro());
1447 addr.sun_family = AF_UNIX;
1448 strcpy(addr.sun_path, xs_daemon_socket());
1449 if (bind(*sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
1450 barf_perror("Could not bind socket to %s", xs_daemon_socket());
1451 strcpy(addr.sun_path, xs_daemon_socket_ro());
1452 if (bind(*ro_sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
1453 barf_perror("Could not bind socket to %s",
1454 xs_daemon_socket_ro());
1455 if (chmod(xs_daemon_socket(), 0600) != 0
1456 || chmod(xs_daemon_socket_ro(), 0660) != 0)
1457 barf_perror("Could not chmod sockets");
1459 if (listen(*sock, 1) != 0
1460 || listen(*ro_sock, 1) != 0)
1461 barf_perror("Could not listen on sockets");
1463 /* If we're the first, create .perms file for root. */
1464 if (mkdir(xs_daemon_store(), 0750) == 0) {
1465 struct xs_permissions perms;
1466 char *root = talloc_strdup(talloc_autofree_context(), "/");
1468 perms.id = 0;
1469 perms.perms = XS_PERM_READ;
1470 if (!set_perms(NULL, root, &perms, 1))
1471 barf_perror("Could not create permissions in root");
1472 talloc_free(root);
1473 mkdir(xs_daemon_transactions(), 0750);
1474 } else if (errno != EEXIST)
1475 barf_perror("Could not create root %s", xs_daemon_store());
1477 /* Listen to hypervisor. */
1478 event_fd = domain_init();
1480 /* Debugging: daemonize() closes standard fds, so dup here. */
1481 tmpout = dup(STDOUT_FILENO);
1482 if (dofork) {
1483 openlog("xenstored", 0, LOG_DAEMON);
1484 daemonize();
1487 if (outputpid) {
1488 char buffer[20];
1489 sprintf(buffer, "%i\n", getpid());
1490 write(tmpout, buffer, strlen(buffer));
1492 close(tmpout);
1494 #ifdef TESTING
1495 signal(SIGUSR1, stop_failtest);
1496 #endif
1498 /* Get ready to listen to the tools. */
1499 max = initialize_set(&inset, &outset, *sock, *ro_sock, event_fd);
1501 /* Main loop. */
1502 for (;;) {
1503 struct connection *i;
1504 struct timeval *tvp = NULL, tv;
1506 timerclear(&tv);
1507 shortest_transaction_timeout(&tv);
1508 shortest_watch_ack_timeout(&tv);
1509 if (timerisset(&tv)) {
1510 time_relative_to_now(&tv);
1511 tvp = &tv;
1514 if (select(max+1, &inset, &outset, NULL, tvp) < 0) {
1515 if (errno == EINTR)
1516 continue;
1517 barf_perror("Select failed");
1520 if (FD_ISSET(*sock, &inset))
1521 accept_connection(*sock, true);
1523 if (FD_ISSET(*ro_sock, &inset))
1524 accept_connection(*ro_sock, false);
1526 if (FD_ISSET(event_fd, &inset))
1527 handle_event(event_fd);
1529 list_for_each_entry(i, &connections, list) {
1530 if (i->domain)
1531 continue;
1533 /* Operations can delete themselves or others
1534 * (xs_release): list is not safe after input,
1535 * so break. */
1536 if (FD_ISSET(i->fd, &inset)) {
1537 handle_input(i);
1538 break;
1540 if (FD_ISSET(i->fd, &outset)) {
1541 handle_output(i);
1542 break;
1546 /* Flush output for domain connections, */
1547 list_for_each_entry(i, &connections, list)
1548 if (i->domain && i->out)
1549 handle_output(i);
1551 if (tvp) {
1552 check_transaction_timeout();
1553 check_watch_ack_timeout();
1556 /* If transactions ended, we might be able to do more work. */
1557 unblock_connections();
1559 max = initialize_set(&inset, &outset, *sock,*ro_sock,event_fd);