ia64/xen-unstable

view tools/blktap2/daemon/tapdisk-daemon.c @ 19647:1c627434605e

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, and IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work in unusual environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:52:31 2009 +0100 (2009-05-26)
parents
children 56d00bbc21e2
line source
1 /* Copyright (c) 2008, XenSource Inc.
2 * All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * * Neither the name of XenSource Inc. nor the names of its contributors
12 * may be used to endorse or promote products derived from this software
13 * without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/resource.h>

#include <xs.h>
#include "disktypes.h"
#include "tapdisk-dispatch.h"
39 #define TAPDISK_DAEMON_DOMID_WATCH "domid-watch"
40 #define TAPDISK_DAEMON_PIDFILE "/var/run/blktapctrl.pid"
42 typedef struct tapdisk_daemon {
43 char *node;
44 int blktap_fd;
45 uint16_t cookie;
47 struct xs_handle *xsh;
48 struct list_head channels;
49 struct xenbus_watch watch;
50 } tapdisk_daemon_t;
52 static tapdisk_daemon_t tapdisk_daemon;
54 #define tapdisk_daemon_for_each_channel(c, tmp) \
55 list_for_each_entry_safe(c, tmp, &tapdisk_daemon.channels, list)
57 #define MAX(a, b) ((a) >= (b) ? (a) : (b))
59 static void
60 tapdisk_daemon_print_drivers(void)
61 {
62 int i, size;
64 DPRINTF("blktap-daemon: v1.0.2\n");
66 size = sizeof(dtypes) / sizeof(disk_info_t *);
67 for (i = 0; i < size; i++)
68 DPRINTF("Found driver: [%s]\n", dtypes[i]->name);
69 }
71 static int
72 tapdisk_daemon_write_pidfile(long pid)
73 {
74 char buf[100];
75 int len, fd, flags, err;
77 fd = open(TAPDISK_DAEMON_PIDFILE, O_RDWR | O_CREAT, 0600);
78 if (fd == -1) {
79 EPRINTF("Opening pid file failed (%d)\n", errno);
80 return -errno;
81 }
83 /* We exit silently if daemon already running */
84 err = lockf(fd, F_TLOCK, 0);
85 if (err == -1)
86 exit(0);
88 /* Set FD_CLOEXEC, so that tapdisk doesn't get this file descriptor */
89 flags = fcntl(fd, F_GETFD);
90 if (flags == -1) {
91 EPRINTF("F_GETFD failed (%d)\n", errno);
92 return -errno;
93 }
95 flags |= FD_CLOEXEC;
96 err = fcntl(fd, F_SETFD, flags);
97 if (err == -1) {
98 EPRINTF("F_SETFD failed (%d)\n", errno);
99 return -errno;
100 }
102 len = sprintf(buf, "%ld\n", pid);
103 err = write(fd, buf, len);
104 if (err != len) {
105 EPRINTF("Writing pid file failed (%d)\n", errno);
106 return -errno;
107 }
109 return 0;
110 }
112 static int
113 tapdisk_daemon_init(void)
114 {
115 char *devname;
116 int i, err, blktap_major;
118 memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
120 err = asprintf(&devname, "%s/%s0", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME);
121 if (err == -1) {
122 devname = NULL;
123 err = -ENOMEM;
124 goto fail;
125 }
127 err = xc_find_device_number("blktap0");
128 if (err < 0)
129 goto fail;
131 blktap_major = major(err);
132 err = make_blktap_device(devname, blktap_major, 0, S_IFCHR | 0600);
133 if (err)
134 goto fail;
136 tapdisk_daemon.blktap_fd = open(devname, O_RDWR);
137 if (tapdisk_daemon.blktap_fd == -1) {
138 err = -errno;
139 EPRINTF("blktap0 open failed\n");
140 goto fail;
141 }
143 for (i = 0; i < 2; i++) {
144 tapdisk_daemon.xsh = xs_daemon_open();
145 if (!tapdisk_daemon.xsh) {
146 EPRINTF("xs_daemon_open failed -- is xenstore running?\n");
147 sleep(2);
148 } else
149 break;
150 }
152 if (!tapdisk_daemon.xsh) {
153 err = -ENOSYS;
154 goto fail;
155 }
157 INIT_LIST_HEAD(&tapdisk_daemon.channels);
159 free(devname);
160 return 0;
162 fail:
163 if (tapdisk_daemon.blktap_fd > 0)
164 close(tapdisk_daemon.blktap_fd);
165 free(devname);
166 memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
167 EPRINTF("%s: %d\n", __func__, err);
169 return err;
170 }
172 static int
173 tapdisk_daemon_set_node(void)
174 {
175 int err;
176 char *domid;
178 domid = get_dom_domid(tapdisk_daemon.xsh);
179 if (!domid)
180 return -EAGAIN;
182 err = asprintf(&tapdisk_daemon.node,
183 "/local/domain/%s/backend/tap", domid);
184 if (err == -1) {
185 tapdisk_daemon.node = NULL;
186 err = -ENOMEM;
187 goto out;
188 }
190 err = 0;
192 out:
193 free(domid);
194 return err;
195 }
197 static int
198 tapdisk_daemon_get_domid(void)
199 {
200 int err;
201 unsigned int num;
202 char **res, *node, *token, *domid;
204 res = xs_read_watch(tapdisk_daemon.xsh, &num);
205 if (!res)
206 return -EAGAIN;
208 err = 0;
209 node = res[XS_WATCH_PATH];
210 token = res[XS_WATCH_TOKEN];
212 if (strcmp(token, TAPDISK_DAEMON_DOMID_WATCH)) {
213 err = -EINVAL;
214 goto out;
215 }
217 err = tapdisk_daemon_set_node();
219 out:
220 free(res);
221 return err;
222 }
224 static int
225 tapdisk_daemon_wait_for_domid(void)
226 {
227 int err;
228 char *domid;
229 fd_set readfds;
231 err = tapdisk_daemon_set_node();
232 if (!err)
233 return 0;
235 if (!xs_watch(tapdisk_daemon.xsh, "/local/domain",
236 TAPDISK_DAEMON_DOMID_WATCH)) {
237 EPRINTF("unable to set domain id watch\n");
238 return -EINVAL;
239 }
241 do {
242 FD_ZERO(&readfds);
243 FD_SET(xs_fileno(tapdisk_daemon.xsh), &readfds);
245 select(xs_fileno(tapdisk_daemon.xsh) + 1,
246 &readfds, NULL, NULL, NULL);
248 if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), &readfds))
249 err = tapdisk_daemon_get_domid();
250 else
251 err = -EAGAIN;
252 } while (err == -EAGAIN);
254 xs_unwatch(tapdisk_daemon.xsh,
255 "/local/domain", TAPDISK_DAEMON_DOMID_WATCH);
256 return err;
257 }
/*
 * Return 1 when @node is exactly "start-tapdisk", the xenstore entry
 * that signals a new vbd, and 0 otherwise.
 */
static inline int
tapdisk_daemon_new_vbd_event(const char *node)
{
	return strcmp(node, "start-tapdisk") == 0;
}
265 static int
266 tapdisk_daemon_write_uuid(char *path, uint32_t uuid)
267 {
268 int err;
269 char *cpath, uuid_str[12];
271 snprintf(uuid_str, sizeof(uuid_str), "%u", uuid);
273 err = asprintf(&cpath, "%s/tapdisk-uuid", path);
274 if (err == -1)
275 return -ENOMEM;
277 err = xs_write(tapdisk_daemon.xsh, XBT_NULL,
278 cpath, uuid_str, strlen(uuid_str));
279 free(cpath);
281 return (err ? 0 : -errno);
282 }
284 static void
285 tapdisk_daemon_probe(struct xs_handle *xsh,
286 struct xenbus_watch *watch, const char *path)
287 {
288 char *cpath;
289 int len, err;
290 uint32_t cookie;
291 const char *node;
292 tapdisk_channel_t *channel;
294 len = strsep_len(path, '/', 7);
295 if (len < 0)
296 return;
298 node = path + len + 1;
300 if (!tapdisk_daemon_new_vbd_event(node))
301 return;
303 if (!xs_exists(xsh, path))
304 return;
306 cpath = strdup(path);
307 if (!cpath) {
308 EPRINTF("failed to allocate control path for %s\n", path);
309 return;
310 }
311 cpath[len] = '\0';
313 cookie = tapdisk_daemon.cookie++;
314 err = tapdisk_daemon_write_uuid(cpath, cookie);
315 if (err)
316 goto out;
318 DPRINTF("%s: got watch on %s, uuid = %u\n", __func__, path, cookie);
320 err = tapdisk_channel_open(&channel, cpath,
321 tapdisk_daemon.xsh,
322 tapdisk_daemon.blktap_fd,
323 cookie);
324 if (!err)
325 list_add(&channel->list, &tapdisk_daemon.channels);
326 else
327 EPRINTF("failed to open tapdisk channel for %s: %d\n",
328 path, err);
330 out:
331 free(cpath);
332 }
334 static int
335 tapdisk_daemon_start(void)
336 {
337 int err;
339 err = tapdisk_daemon_wait_for_domid();
340 if (err)
341 return err;
343 tapdisk_daemon.watch.node = tapdisk_daemon.node;
344 tapdisk_daemon.watch.callback = tapdisk_daemon_probe;
346 err = register_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
347 if (err)
348 goto fail;
350 ioctl(tapdisk_daemon.blktap_fd,
351 BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
352 ioctl(tapdisk_daemon.blktap_fd, BLKTAP_IOCTL_SENDPID, getpid());
354 return 0;
356 fail:
357 free(tapdisk_daemon.node);
358 tapdisk_daemon.node = NULL;
359 tapdisk_daemon.watch.node = NULL;
360 EPRINTF("%s: %d\n", __func__, err);
361 return err;
362 }
364 static int
365 tapdisk_daemon_stop(void)
366 {
367 unregister_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
369 ioctl(tapdisk_daemon.blktap_fd,
370 BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH);
371 close(tapdisk_daemon.blktap_fd);
373 return 0;
374 }
376 static void
377 tapdisk_daemon_free(void)
378 {
379 free(tapdisk_daemon.node);
380 xs_daemon_close(tapdisk_daemon.xsh);
381 memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
382 }
384 static int
385 tapdisk_daemon_read_message(int fd, tapdisk_message_t *message, int timeout)
386 {
387 fd_set readfds;
388 struct timeval tv;
389 int ret, len, offset;
391 tv.tv_sec = timeout;
392 tv.tv_usec = 0;
393 offset = 0;
394 len = sizeof(tapdisk_message_t);
396 memset(message, 0, sizeof(tapdisk_message_t));
398 while (offset < len) {
399 FD_ZERO(&readfds);
400 FD_SET(fd, &readfds);
402 /* we don't bother reinitializing tv. at worst, it will wait a
403 * bit more time than expected. */
405 ret = select(fd + 1, &readfds, NULL, NULL, &tv);
406 if (ret == -1)
407 break;
408 else if (FD_ISSET(fd, &readfds)) {
409 ret = read(fd, message + offset, len - offset);
410 if (ret <= 0)
411 break;
412 offset += ret;
413 } else
414 break;
415 }
417 return (offset == len ? 0 : -EIO);
418 }
420 static int
421 tapdisk_daemon_receive_message(int fd)
422 {
423 int err;
424 tapdisk_message_t m;
425 tapdisk_channel_t *c, *tmp;
427 err = tapdisk_daemon_read_message(fd, &m, 2);
428 if (err) {
429 EPRINTF("failed reading message on %d: %d\n", fd, err);
430 return err;
431 }
433 tapdisk_daemon_for_each_channel(c, tmp)
434 if (c->cookie == m.cookie && c->read_fd == fd) {
435 DPRINTF("got '%s' message from %d:%d\n",
436 tapdisk_message_name(m.type),
437 c->channel_id, c->cookie);
439 return tapdisk_channel_receive_message(c, &m);
440 }
442 EPRINTF("unrecognized message on %d: '%s' (uuid = %u)\n",
443 fd, tapdisk_message_name(m.type), m.cookie);
445 return -EINVAL;
446 }
448 static int
449 tapdisk_daemon_set_fds(fd_set *readfds)
450 {
451 int max, fd;
452 tapdisk_channel_t *channel, *tmp;
454 max = xs_fileno(tapdisk_daemon.xsh);
456 FD_ZERO(readfds);
457 FD_SET(max, readfds);
459 tapdisk_daemon_for_each_channel(channel, tmp) {
460 fd = channel->read_fd;
461 max = MAX(fd, max);
462 FD_SET(fd, readfds);
463 }
465 return max;
466 }
468 static int
469 tapdisk_daemon_check_fds(fd_set *readfds)
470 {
471 int err;
472 tapdisk_channel_t *channel, *tmp;
474 if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), readfds))
475 xs_fire_next_watch(tapdisk_daemon.xsh);
477 tapdisk_daemon_for_each_channel(channel, tmp)
478 if (FD_ISSET(channel->read_fd, readfds))
479 return tapdisk_daemon_receive_message(channel->read_fd);
481 return 0;
482 }
484 static int
485 tapdisk_daemon_run(void)
486 {
487 int err, max;
488 fd_set readfds;
490 while (1) {
491 max = tapdisk_daemon_set_fds(&readfds);
493 err = select(max + 1, &readfds, NULL, NULL, NULL);
494 if (err < 0)
495 continue;
497 err = tapdisk_daemon_check_fds(&readfds);
498 }
500 return err;
501 }
503 void
504 tapdisk_daemon_find_channel(tapdisk_channel_t *channel)
505 {
506 tapdisk_channel_t *c, *tmp;
508 channel->read_fd = 0;
509 channel->write_fd = 0;
510 channel->tapdisk_pid = 0;
512 /* do we want multiple vbds per tapdisk? */
513 if (!xs_exists(tapdisk_daemon.xsh, channel->share_tapdisk_str)) {
514 channel->shared = 0;
515 return;
516 }
518 channel->shared = 1;
520 /* check if we already have a process started */
521 tapdisk_daemon_for_each_channel(c, tmp)
522 if (c->drivertype == channel->drivertype) {
523 channel->write_fd = c->write_fd;
524 channel->read_fd = c->read_fd;
525 channel->channel_id = c->channel_id;
526 channel->tapdisk_pid = c->tapdisk_pid;
527 return;
528 }
529 }
531 void
532 tapdisk_daemon_close_channel(tapdisk_channel_t *channel)
533 {
534 tapdisk_channel_t *c, *tmp;
536 list_del(&channel->list);
538 tapdisk_daemon_for_each_channel(c, tmp)
539 if (c->channel_id == channel->channel_id)
540 return;
542 close(channel->read_fd);
543 close(channel->write_fd);
544 }
/* When defined, main() lifts the core-dump size limit at startup. */
#define CORE_DUMP

/*
 * Daemon entry point.
 *
 * Detaches from the terminal, optionally enables core dumps, opens
 * syslog, writes the pid file, then initialises, starts and runs the
 * daemon.  Returns 0 on a clean shutdown or the negative error code of
 * the step that failed.
 */
int
main(int argc, char *argv[])
{
	int err;
	char buf[128];

	if (daemon(0, 0)) {
		/* latch errno first: logging may overwrite it */
		err = -errno;
		EPRINTF("daemon() failed (%d)\n", -err);
		return err;
	}

#if defined(CORE_DUMP)
	{
		/* set up core-dumps*/
		struct rlimit rlim;
		rlim.rlim_cur = RLIM_INFINITY;
		rlim.rlim_max = RLIM_INFINITY;
		if (setrlimit(RLIMIT_CORE, &rlim) < 0)
			EPRINTF("setrlimit failed: %d\n", errno);
	}
#endif

	/* buf must outlive the log session: openlog() keeps the pointer */
	snprintf(buf, sizeof(buf), "BLKTAP-DAEMON[%d]", getpid());
	openlog(buf, LOG_CONS | LOG_ODELAY, LOG_DAEMON);

	err = tapdisk_daemon_write_pidfile(getpid());
	if (err)
		goto out;

	tapdisk_daemon_print_drivers();

	err = tapdisk_daemon_init();
	if (err)
		goto out;

	err = tapdisk_daemon_start();
	if (err)
		goto out;

	tapdisk_daemon_run();

	tapdisk_daemon_stop();
	tapdisk_daemon_free();

	err = 0;

out:
	if (err)
		EPRINTF("failed to start %s: %d\n", argv[0], err);
	closelog();
	return err;
}