ia64/xen-unstable

view tools/blktap2/drivers/tapdisk-server.c @ 19647:1c627434605e

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work in unusual environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:52:31 2009 +0100 (2009-05-26)
parents
children
line source
1 /*
2 * Copyright (c) 2008, XenSource Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of XenSource Inc. nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include <stdio.h>
29 #include <fcntl.h>
30 #include <errno.h>
31 #include <unistd.h>
32 #include <stdlib.h>
33 #include <sys/ioctl.h>
34 #include <sys/signal.h>
36 #define TAPDISK
37 #include "tapdisk-utils.h"
38 #include "tapdisk-server.h"
39 #include "tapdisk-driver.h"
40 #include "tapdisk-interface.h"
42 #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
43 #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
45 tapdisk_server_t server;
47 #define tapdisk_server_for_each_vbd(vbd, tmp) \
48 list_for_each_entry_safe(vbd, tmp, &server.vbds, next)
50 struct tap_disk *
51 tapdisk_server_find_driver_interface(int type)
52 {
53 int n;
55 n = sizeof(dtypes) / sizeof(struct disk_info_t *);
56 if (type > n)
57 return NULL;
59 return dtypes[type]->drv;
60 }
62 td_image_t *
63 tapdisk_server_get_shared_image(td_image_t *image)
64 {
65 td_vbd_t *vbd, *tmpv;
66 td_image_t *img, *tmpi;
68 if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE))
69 return NULL;
71 tapdisk_server_for_each_vbd(vbd, tmpv)
72 tapdisk_vbd_for_each_image(vbd, img, tmpi)
73 if (img->type == image->type &&
74 !strcmp(img->name, image->name))
75 return img;
77 return NULL;
78 }
80 td_vbd_t *
81 tapdisk_server_get_vbd(uint16_t uuid)
82 {
83 td_vbd_t *vbd, *tmp;
85 tapdisk_server_for_each_vbd(vbd, tmp)
86 if (vbd->uuid == uuid)
87 return vbd;
89 return NULL;
90 }
92 void
93 tapdisk_server_add_vbd(td_vbd_t *vbd)
94 {
95 list_add_tail(&vbd->next, &server.vbds);
96 }
98 void
99 tapdisk_server_remove_vbd(td_vbd_t *vbd)
100 {
101 list_del(&vbd->next);
102 INIT_LIST_HEAD(&vbd->next);
103 tapdisk_server_check_state();
104 }
106 void
107 tapdisk_server_queue_tiocb(struct tiocb *tiocb)
108 {
109 tapdisk_queue_tiocb(&server.aio_queue, tiocb);
110 }
112 void
113 tapdisk_server_debug(void)
114 {
115 td_vbd_t *vbd, *tmp;
117 tapdisk_debug_queue(&server.aio_queue);
119 tapdisk_server_for_each_vbd(vbd, tmp)
120 tapdisk_vbd_debug(vbd);
122 tlog_flush();
123 }
125 void
126 tapdisk_server_check_state(void)
127 {
128 if (list_empty(&server.vbds))
129 server.run = 0;
130 }
132 event_id_t
133 tapdisk_server_register_event(char mode, int fd,
134 int timeout, event_cb_t cb, void *data)
135 {
136 return scheduler_register_event(&server.scheduler,
137 mode, fd, timeout, cb, data);
138 }
140 void
141 tapdisk_server_unregister_event(event_id_t event)
142 {
143 return scheduler_unregister_event(&server.scheduler, event);
144 }
146 void
147 tapdisk_server_set_max_timeout(int seconds)
148 {
149 scheduler_set_max_timeout(&server.scheduler, seconds);
150 }
/* Hook called at the top of each main-loop pass; currently a no-op. */
static void tapdisk_server_assert_locks(void)
{
	/* intentionally empty */
}
158 static void
159 tapdisk_server_set_retry_timeout(void)
160 {
161 td_vbd_t *vbd, *tmp;
163 tapdisk_server_for_each_vbd(vbd, tmp)
164 if (tapdisk_vbd_retry_needed(vbd)) {
165 tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL);
166 return;
167 }
168 }
170 static void
171 tapdisk_server_check_progress(void)
172 {
173 struct timeval now;
174 td_vbd_t *vbd, *tmp;
176 gettimeofday(&now, NULL);
178 tapdisk_server_for_each_vbd(vbd, tmp)
179 tapdisk_vbd_check_progress(vbd);
180 }
182 static void
183 tapdisk_server_submit_tiocbs(void)
184 {
185 tapdisk_submit_all_tiocbs(&server.aio_queue);
186 }
188 static void
189 tapdisk_server_kick_responses(void)
190 {
191 int n;
192 td_vbd_t *vbd, *tmp;
194 tapdisk_server_for_each_vbd(vbd, tmp)
195 tapdisk_vbd_kick(vbd);
196 }
198 static void
199 tapdisk_server_check_vbds(void)
200 {
201 td_vbd_t *vbd, *tmp;
203 tapdisk_server_for_each_vbd(vbd, tmp)
204 tapdisk_vbd_check_state(vbd);
205 }
207 static void
208 tapdisk_server_stop_vbds(void)
209 {
210 td_vbd_t *vbd, *tmp;
212 tapdisk_server_for_each_vbd(vbd, tmp)
213 tapdisk_vbd_kill_queue(vbd);
214 }
216 static void
217 tapdisk_server_send_error(const char *message)
218 {
219 td_vbd_t *vbd, *tmp;
221 tapdisk_server_for_each_vbd(vbd, tmp)
222 tapdisk_ipc_write_error(&vbd->ipc, message);
223 }
225 static void
226 tapdisk_server_read_ipc_message(event_id_t id, char mode, void *private)
227 {
228 tapdisk_ipc_read(&server.ipc);
229 }
231 static void
232 tapdisk_server_aio_queue_event(event_id_t id, char mode, void *private)
233 {
234 tapdisk_complete_tiocbs(&server.aio_queue);
235 }
237 static void
238 tapdisk_server_free_aio_queue(void)
239 {
240 tapdisk_server_unregister_event(server.aio_queue_event_id);
241 tapdisk_free_queue(&server.aio_queue);
242 }
244 static int
245 tapdisk_server_initialize_aio_queue(void)
246 {
247 int err;
248 event_id_t id;
250 err = tapdisk_init_queue(&server.aio_queue,
251 TAPDISK_TIOCBS, 0, NULL);
252 if (err)
253 return err;
255 id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
256 server.aio_queue.poll_fd, 0,
257 tapdisk_server_aio_queue_event,
258 NULL);
259 if (id < 0) {
260 tapdisk_free_queue(&server.aio_queue);
261 return id;
262 }
264 server.aio_queue_event_id = id;
266 return 0;
267 }
269 static void
270 tapdisk_server_close(void)
271 {
272 tapdisk_server_free_aio_queue();
274 if (server.control_event)
275 scheduler_unregister_event(&server.scheduler, server.control_event);
277 if (server.ipc.rfd != -1)
278 close(server.ipc.rfd);
280 if (server.ipc.wfd != -1)
281 close(server.ipc.wfd);
282 }
284 static void
285 __tapdisk_server_run(void)
286 {
287 int ret;
289 while (server.run) {
290 tapdisk_server_assert_locks();
291 tapdisk_server_set_retry_timeout();
292 tapdisk_server_check_progress();
294 ret = scheduler_wait_for_events(&server.scheduler);
295 if (ret < 0)
296 DBG(TLOG_WARN, "server wait returned %d\n", ret);
298 tapdisk_server_check_vbds();
299 tapdisk_server_submit_tiocbs();
300 tapdisk_server_kick_responses();
301 }
302 }
304 static void
305 tapdisk_server_signal_handler(int signal)
306 {
307 td_vbd_t *vbd, *tmp;
308 static int xfsz_error_sent = 0;
310 switch (signal) {
311 case SIGBUS:
312 case SIGINT:
313 tapdisk_server_for_each_vbd(vbd, tmp)
314 tapdisk_vbd_close(vbd);
315 break;
317 case SIGXFSZ:
318 ERR(EFBIG, "received SIGXFSZ");
319 tapdisk_server_stop_vbds();
320 if (xfsz_error_sent)
321 break;
323 tapdisk_server_send_error("received SIGXFSZ, closing queues");
324 xfsz_error_sent = 1;
325 break;
327 case SIGUSR1:
328 tapdisk_server_debug();
329 break;
330 }
331 }
333 int
334 tapdisk_server_initialize(const char *read, const char *write)
335 {
336 int err;
337 event_id_t event_id;
339 event_id = 0;
340 memset(&server, 0, sizeof(tapdisk_server_t));
341 server.ipc.rfd = server.ipc.wfd = -1;
343 INIT_LIST_HEAD(&server.vbds);
345 if (read) {
346 server.ipc.rfd = open(read, O_RDWR | O_NONBLOCK);
347 if (server.ipc.rfd < 0) {
348 err = -errno;
349 EPRINTF("FD open failed %s: %d\n", read, err);
350 goto fail;
351 }
352 }
354 if (write) {
355 server.ipc.wfd = open(write, O_RDWR | O_NONBLOCK);
356 if (server.ipc.wfd < 0) {
357 err = -errno;
358 EPRINTF("FD open failed %s, %d\n", write, err);
359 goto fail;
360 }
361 }
363 scheduler_initialize(&server.scheduler);
365 if (read) {
366 event_id = scheduler_register_event(&server.scheduler,
367 SCHEDULER_POLL_READ_FD,
368 server.ipc.rfd, 0,
369 tapdisk_server_read_ipc_message,
370 NULL);
371 if (event_id < 0) {
372 err = event_id;
373 goto fail;
374 }
375 }
377 err = tapdisk_server_initialize_aio_queue();
378 if (err)
379 goto fail;
381 server.control_event = event_id;
382 server.run = 1;
384 return 0;
386 fail:
387 if (server.ipc.rfd > 0)
388 close(server.ipc.rfd);
389 if (server.ipc.wfd > 0)
390 close(server.ipc.wfd);
391 if (event_id > 0)
392 scheduler_unregister_event(&server.scheduler,
393 server.control_event);
394 return err;
395 }
397 int
398 tapdisk_server_run()
399 {
400 int err;
402 err = tapdisk_set_resource_limits();
403 if (err)
404 return err;
406 signal(SIGBUS, tapdisk_server_signal_handler);
407 signal(SIGINT, tapdisk_server_signal_handler);
408 signal(SIGUSR1, tapdisk_server_signal_handler);
409 signal(SIGXFSZ, tapdisk_server_signal_handler);
411 __tapdisk_server_run();
412 tapdisk_server_close();
414 return 0;
415 }