ia64/xen-unstable

view tools/blktap2/drivers/tapdisk-stream.c @ 19647:1c627434605e

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work on weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:52:31 2009 +0100 (2009-05-26)
parents
children
line source
1 /*
2 * Copyright (c) 2008, XenSource Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of XenSource Inc. nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include <stdio.h>
29 #include <errno.h>
30 #include <fcntl.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <assert.h>
34 #include <unistd.h>
36 #include "list.h"
37 #include "scheduler.h"
38 #include "tapdisk-vbd.h"
39 #include "tapdisk-server.h"
/* Indices into tapdisk_stream_poll.pipe[]: the end that is read and the end that is written. */
41 #define POLL_READ 0
42 #define POLL_WRITE 1
/* Smaller of a and b; note that each argument may be evaluated twice. */
44 #define MIN(a, b) ((a) < (b) ? (a) : (b))
/*
 * Pipe used to trigger the enqueue event: writing to the POLL_WRITE end
 * makes the POLL_READ end readable, which is the descriptor registered
 * with the tapdisk server in tapdisk_stream_register_enqueue_event().
 */
46 struct tapdisk_stream_poll {
/* descriptor pair filled in by pipe(); both are -1 while closed */
47 int pipe[2];
/* non-zero once a wake-up has been written and not yet cleared */
48 int set;
49 };
/* One read request slot; slots live in tapdisk_stream.requests[]. */
51 struct tapdisk_stream_request {
/* first sector covered by this request */
52 uint64_t sec;
/* number of sectors covered by this request */
53 uint32_t secs;
/* issue order; output is written strictly in seqno order */
54 uint64_t seqno;
/* block request that is copied into the vbd's request slot */
55 blkif_request_t blkif_req;
/* link on the stream's free, pending or completed list */
56 struct list_head next;
57 };
/* State of one image-to-output streaming session. */
59 struct tapdisk_stream {
/* vbd obtained from tapdisk_server_get_vbd(); NULL after the image is closed */
60 td_vbd_t *vbd;
/* identifier taken from tapdisk_stream_count when the image is opened */
62 unsigned int id;
/* initialised to -1 and not otherwise used in this file */
63 int in_fd;
/* duplicate of STDOUT_FILENO; sector data is written here */
64 int out_fd;
/* first error recorded for the stream (EIO on a failed read); 0 if none */
66 int err;
/* next sector to be requested */
68 uint64_t cur;
/* first sector of the range being streamed */
69 uint64_t start;
/* one past the last sector of the range being streamed */
70 uint64_t end;
/* sequence number handed to the next issued request */
72 uint64_t started;
/* sequence number of the next request whose data may be written out */
73 uint64_t completed;
/* wake-up pipe and the scheduler event registered on its read end */
75 struct tapdisk_stream_poll poll;
76 event_id_t enqueue_event_id;
/* request slots by state: unused, issued, and finished-but-not-yet-written */
78 struct list_head free_list;
79 struct list_head pending_list;
80 struct list_head completed_list;
/* backing storage for every request slot */
82 struct tapdisk_stream_request requests[MAX_REQUESTS];
83 };
/* Source of stream ids: tapdisk_stream_open_image() takes the current value and increments it. */
85 static unsigned int tapdisk_stream_count;
/* Forward declaration: tapdisk_stream_enqueue() calls this before its definition. */
87 static void tapdisk_stream_close_image(struct tapdisk_stream *);
/*
 * Print the command-line synopsis on stdout and terminate the process
 * with exit status @err.  This function does not return.
 */
static void
usage(const char *app, int err)
{
	fprintf(stdout,
		"usage: %s <-n type:/path/to/image> "
		"[-c sector count] [-s skip sectors]\n",
		app);
	exit(err);
}
97 static inline void
98 tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
99 {
100 p->set = 0;
101 p->pipe[POLL_READ] = p->pipe[POLL_WRITE] = -1;
102 }
104 static int
105 tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
106 {
107 int err;
109 tapdisk_stream_poll_initialize(p);
111 err = pipe(p->pipe);
112 if (err)
113 return -errno;
115 err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK);
116 if (err)
117 goto out;
119 err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK);
120 if (err)
121 goto out;
123 return 0;
125 out:
126 close(p->pipe[POLL_READ]);
127 close(p->pipe[POLL_WRITE]);
128 tapdisk_stream_poll_initialize(p);
129 return -errno;
130 }
132 static void
133 tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
134 {
135 if (p->pipe[POLL_READ] != -1)
136 close(p->pipe[POLL_READ]);
137 if (p->pipe[POLL_WRITE] != -1)
138 close(p->pipe[POLL_WRITE]);
139 tapdisk_stream_poll_initialize(p);
140 }
142 static inline void
143 tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
144 {
145 int dummy;
147 read(p->pipe[POLL_READ], &dummy, sizeof(dummy));
148 p->set = 0;
149 }
151 static inline void
152 tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
153 {
154 int dummy = 0;
156 if (!p->set) {
157 write(p->pipe[POLL_WRITE], &dummy, sizeof(dummy));
158 p->set = 1;
159 }
160 }
162 static inline int
163 tapdisk_stream_stop(struct tapdisk_stream *s)
164 {
165 return (list_empty(&s->pending_list) && (s->cur == s->end || s->err));
166 }
168 static inline void
169 tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
170 {
171 memset(req, 0, sizeof(*req));
172 INIT_LIST_HEAD(&req->next);
173 }
175 static inline int
176 tapdisk_stream_request_idx(struct tapdisk_stream *s,
177 struct tapdisk_stream_request *req)
178 {
179 return (req - s->requests);
180 }
182 static inline struct tapdisk_stream_request *
183 tapdisk_stream_get_request(struct tapdisk_stream *s)
184 {
185 struct tapdisk_stream_request *req;
187 if (list_empty(&s->free_list))
188 return NULL;
190 req = list_entry(s->free_list.next,
191 struct tapdisk_stream_request, next);
193 list_del_init(&req->next);
194 tapdisk_stream_initialize_request(req);
196 return req;
197 }
199 static void
200 tapdisk_stream_print_request(struct tapdisk_stream *s,
201 struct tapdisk_stream_request *sreq)
202 {
203 unsigned long idx = (unsigned long)tapdisk_stream_request_idx(s, sreq);
204 char *buf = (char *)MMAP_VADDR(s->vbd->ring.vstart, idx, 0);
205 write(s->out_fd, buf, sreq->secs << SECTOR_SHIFT);
206 }
208 static void
209 tapdisk_stream_write_data(struct tapdisk_stream *s)
210 {
211 struct tapdisk_stream_request *sreq, *tmp;
213 list_for_each_entry_safe(sreq, tmp, &s->completed_list, next) {
214 if (sreq->seqno != s->completed)
215 break;
217 s->completed++;
218 tapdisk_stream_print_request(s, sreq);
220 list_del_init(&sreq->next);
221 list_add_tail(&sreq->next, &s->free_list);
222 }
223 }
225 static inline void
226 tapdisk_stream_queue_completed(struct tapdisk_stream *s,
227 struct tapdisk_stream_request *sreq)
228 {
229 struct tapdisk_stream_request *itr;
231 list_for_each_entry(itr, &s->completed_list, next)
232 if (sreq->seqno < itr->seqno) {
233 list_add_tail(&sreq->next, &itr->next);
234 return;
235 }
237 list_add_tail(&sreq->next, &s->completed_list);
238 }
240 static void
241 tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
242 {
243 struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
244 struct tapdisk_stream_request *sreq = s->requests + rsp->id;
246 list_del_init(&sreq->next);
248 if (rsp->status == BLKIF_RSP_OKAY)
249 tapdisk_stream_queue_completed(s, sreq);
250 else {
251 s->err = EIO;
252 list_add_tail(&sreq->next, &s->free_list);
253 fprintf(stderr, "error reading sector 0x%"PRIu64"\n", sreq->sec);
254 }
256 tapdisk_stream_write_data(s);
257 tapdisk_stream_poll_set(&s->poll);
258 }
260 static void
261 tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
262 {
263 td_vbd_t *vbd;
264 int i, idx, psize;
265 struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
267 vbd = s->vbd;
268 tapdisk_stream_poll_clear(&s->poll);
270 if (tapdisk_stream_stop(s)) {
271 tapdisk_stream_close_image(s);
272 return;
273 }
275 psize = getpagesize();
277 while (s->cur < s->end && !s->err) {
278 blkif_request_t *breq;
279 td_vbd_request_t *vreq;
280 struct tapdisk_stream_request *sreq;
282 sreq = tapdisk_stream_get_request(s);
283 if (!sreq)
284 break;
286 idx = tapdisk_stream_request_idx(s, sreq);
288 sreq->sec = s->cur;
289 sreq->secs = 0;
290 sreq->seqno = s->started++;
292 breq = &sreq->blkif_req;
293 breq->id = idx;
294 breq->nr_segments = 0;
295 breq->sector_number = sreq->sec;
296 breq->operation = BLKIF_OP_READ;
298 for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
299 uint32_t secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
300 struct blkif_request_segment *seg = breq->seg + i;
302 if (!secs)
303 break;
305 sreq->secs += secs;
306 s->cur += secs;
308 seg->first_sect = 0;
309 seg->last_sect = secs - 1;
310 breq->nr_segments++;
311 }
313 vreq = vbd->request_list + idx;
315 assert(list_empty(&vreq->next));
316 assert(vreq->secs_pending == 0);
318 memcpy(&vreq->req, breq, sizeof(*breq));
319 vbd->received++;
320 vreq->vbd = vbd;
322 tapdisk_vbd_move_request(vreq, &vbd->new_requests);
323 list_add_tail(&sreq->next, &s->pending_list);
324 }
326 tapdisk_vbd_issue_requests(vbd);
327 }
329 static int
330 tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
331 {
332 int err;
334 s->id = tapdisk_stream_count++;
336 err = tapdisk_server_initialize(NULL, NULL);
337 if (err)
338 goto out;
340 err = tapdisk_vbd_initialize(-1, -1, s->id);
341 if (err)
342 goto out;
344 s->vbd = tapdisk_server_get_vbd(s->id);
345 if (!s->vbd) {
346 err = ENODEV;
347 goto out;
348 }
350 tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
352 err = tapdisk_vbd_open_vdi(s->vbd, path, type,
353 TAPDISK_STORAGE_TYPE_DEFAULT,
354 TD_OPEN_RDONLY);
355 if (err)
356 goto out;
358 s->vbd->reopened = 1;
359 err = 0;
361 out:
362 if (err)
363 fprintf(stderr, "failed to open %s: %d\n", path, err);
364 return err;
365 }
367 static void
368 tapdisk_stream_close_image(struct tapdisk_stream *s)
369 {
370 td_vbd_t *vbd;
372 vbd = tapdisk_server_get_vbd(s->id);
373 if (vbd) {
374 tapdisk_vbd_close_vdi(vbd);
375 tapdisk_server_remove_vbd(vbd);
376 free((void *)vbd->ring.vstart);
377 free(vbd->name);
378 free(vbd);
379 s->vbd = NULL;
380 }
381 }
383 static int
384 tapdisk_stream_set_position(struct tapdisk_stream *s,
385 uint64_t count, uint64_t skip)
386 {
387 int err;
388 image_t image;
390 err = tapdisk_vbd_get_image_info(s->vbd, &image);
391 if (err) {
392 fprintf(stderr, "failed getting image size: %d\n", err);
393 return err;
394 }
396 if (count == (uint64_t)-1)
397 count = image.size - skip;
399 if (count + skip > image.size) {
400 fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
401 (uint64_t) (count + skip), (uint64_t) image.size);
402 return -EINVAL;
403 }
405 s->start = skip;
406 s->cur = s->start;
407 s->end = s->start + count;
409 return 0;
410 }
412 static int
413 tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
414 {
415 size_t size;
416 td_ring_t *ring;
417 int err, i, psize;
419 ring = &s->vbd->ring;
420 psize = getpagesize();
421 size = psize * BLKTAP_MMAP_REGION_SIZE;
423 /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
424 err = posix_memalign((void **)&ring->vstart, psize, size);
425 if (err) {
426 fprintf(stderr, "failed to allocate buffers: %d\n", err);
427 ring->vstart = 0;
428 return err;
429 }
431 for (i = 0; i < MAX_REQUESTS; i++) {
432 struct tapdisk_stream_request *req = s->requests + i;
433 tapdisk_stream_initialize_request(req);
434 list_add_tail(&req->next, &s->free_list);
435 }
437 return 0;
438 }
440 static int
441 tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
442 {
443 int err;
444 struct tapdisk_stream_poll *p = &s->poll;
446 err = tapdisk_stream_poll_open(p);
447 if (err)
448 goto out;
450 err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
451 p->pipe[POLL_READ], 0,
452 tapdisk_stream_enqueue, s);
453 if (err < 0)
454 goto out;
456 s->enqueue_event_id = err;
457 err = 0;
459 out:
460 if (err)
461 fprintf(stderr, "failed to register event: %d\n", err);
462 return err;
463 }
465 static void
466 tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
467 {
468 if (s->enqueue_event_id) {
469 tapdisk_server_unregister_event(s->enqueue_event_id);
470 s->enqueue_event_id = 0;
471 }
472 tapdisk_stream_poll_close(&s->poll);
473 }
475 static inline void
476 tapdisk_stream_initialize(struct tapdisk_stream *s)
477 {
478 memset(s, 0, sizeof(*s));
479 s->in_fd = s->out_fd = -1;
480 INIT_LIST_HEAD(&s->free_list);
481 INIT_LIST_HEAD(&s->pending_list);
482 INIT_LIST_HEAD(&s->completed_list);
483 }
485 static int
486 tapdisk_stream_open_fds(struct tapdisk_stream *s)
487 {
488 s->out_fd = dup(STDOUT_FILENO);
489 if (s->out_fd == -1) {
490 fprintf(stderr, "failed to open output: %d\n", errno);
491 return errno;
492 }
494 return 0;
495 }
/*
 * Fully set up a stream: reset @s, open the output descriptor, open the
 * image at @path (of type @type), select @count sectors starting at
 * @skip, allocate the request buffers and register the enqueue event.
 *
 * Returns 0 on success or the error from the first step that fails;
 * later steps are not attempted and nothing is undone here.
 */
static int
tapdisk_stream_open(struct tapdisk_stream *s, const char *path,
		    int type, uint64_t count, uint64_t skip)
{
	int rc;

	tapdisk_stream_initialize(s);

	rc = tapdisk_stream_open_fds(s);
	if (!rc)
		rc = tapdisk_stream_open_image(s, path, type);
	if (!rc)
		rc = tapdisk_stream_set_position(s, count, skip);
	if (!rc)
		rc = tapdisk_stream_initialize_requests(s);
	if (!rc)
		rc = tapdisk_stream_register_enqueue_event(s);

	return rc;
}
528 static void
529 tapdisk_stream_release(struct tapdisk_stream *s)
530 {
531 close(s->out_fd);
532 tapdisk_stream_close_image(s);
533 tapdisk_stream_unregister_enqueue_event(s);
534 }
536 static int
537 tapdisk_stream_run(struct tapdisk_stream *s)
538 {
539 tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
540 tapdisk_server_run();
541 return s->err;
542 }
544 int
545 main(int argc, char *argv[])
546 {
547 int c, err, type;
548 char *params, *path;
549 uint64_t count, skip;
550 struct tapdisk_stream stream;
552 err = 0;
553 skip = 0;
554 count = (uint64_t)-1;
555 params = NULL;
557 while ((c = getopt(argc, argv, "n:c:s:h")) != -1) {
558 switch (c) {
559 case 'n':
560 params = optarg;
561 break;
562 case 'c':
563 count = strtoull(optarg, NULL, 10);
564 break;
565 case 's':
566 skip = strtoull(optarg, NULL, 10);
567 break;
568 default:
569 err = EINVAL;
570 case 'h':
571 usage(argv[0], err);
572 }
573 }
575 if (!params)
576 usage(argv[0], EINVAL);
578 err = tapdisk_parse_disk_type(params, &path, &type);
579 if (err) {
580 fprintf(stderr, "invalid argument %s: %d\n", params, err);
581 return err;
582 }
584 tapdisk_start_logging("tapdisk-stream");
586 err = tapdisk_stream_open(&stream, path, type, count, skip);
587 if (err)
588 goto out;
590 err = tapdisk_stream_run(&stream);
591 if (err)
592 goto out;
594 err = 0;
596 out:
597 tapdisk_stream_release(&stream);
598 tapdisk_stop_logging();
599 return err;
600 }