ia64/xen-unstable

view tools/blktap2/drivers/tapdisk-ring.c @ 19647:1c627434605e

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work on weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:52:31 2009 +0100 (2009-05-26)
parents
children
line source
1 /*
2 * Copyright (c) 2008, XenSource Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of XenSource Inc. nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include <errno.h>
30 #include "tapdisk-ring.h"
32 static int
33 tapdisk_uring_create_ctlfd(td_uring_t *ring)
34 {
35 int fd, err;
36 struct sockaddr_un saddr;
38 if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_family)) >=
39 sizeof(saddr.sun_family))
40 return -ENAMETOOLONG;
42 fd = socket(AF_UNIX, SOCK_STREAM, 0);
43 if (fd == -1)
44 return -errno;
46 memset(&saddr, 0, sizeof(struct sockaddr_un));
47 saddr.sun_family = AF_UNIX;
48 memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
50 err = unlink(ring->ctlfd_path);
51 if (err == -1 && errno != ENOENT) {
52 err = -errno;
53 goto fail;
54 }
56 err = bind(fd, &saddr, sizeof(struct sockaddr_un));
57 if (err == -1) {
58 err = -errno;
59 goto fail;
60 }
62 err = listen(fd, 1);
63 if (err == -1) {
64 err = -errno;
65 goto fail;
66 }
68 ring->ctlfd = fd;
69 return 0;
71 fail:
72 close(fd);
73 return err;
74 }
76 static void
77 tapdisk_uring_destroy_ctlfd(td_uring_t *ring)
78 {
79 if (ring->ctlfd) {
80 close(ring->ctlfd);
81 ring->ctlfd = 0;
82 }
84 if (ring->ctlfd_path) {
85 unlink(ring->ctlfd_path);
86 free(ring->ctlfd_path);
87 ring->ctlfd_path = NULL;
88 }
89 }
91 static int
92 tapdisk_uring_connect_ctlfd(td_uring_t *ring)
93 {
94 int fd, err;
95 struct sockaddr_un saddr;
97 if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
98 sizeof(saddr.sun_path))
99 return -ENAMETOOLONG;
101 fd = socket(AF_UNIX, SOCK_STREAM, 0);
102 if (fd == -1)
103 return -errno;
105 memset(&saddr, 0, sizeof(struct sockaddr_un));
106 saddr.sun_family = AF_UNIX;
107 memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
109 err = connect(fd, &saddr, sizeof(saddr));
110 if (err == -1) {
111 err = -errno;
112 goto fail;
113 }
115 ring->ctlfd = fd;
116 return 0;
118 fail:
119 close(fd);
120 return err;
121 }
123 static void
124 tapdisk_uring_disconnect_ctlfd(td_uring_t *ring)
125 {
126 if (ring->ctlfd)
127 close(ring->ctlfd);
128 free(ring->ctlfd_path);
129 ring->ctlfd_path = NULL;
130 }
132 static int
133 tapdisk_uring_create_shmem(td_uring_t *ring)
134 {
135 int fd, err;
137 fd = shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750);
138 if (fd == -1)
139 return -errno;
141 err = ftruncate(fd, ring->shmem_size);
142 if (err == -1) {
143 err = -errno;
144 goto out;
145 }
147 ring->shmem = mmap(NULL, ring->shmem_size,
148 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
149 if (ring->shmem == MAP_FAILED) {
150 ring->shmem = NULL;
151 err = -errno;
152 goto out;
153 }
155 err = 0;
157 out:
158 close(fd);
159 return err;
160 }
162 static void
163 tapdisk_uring_destroy_shmem(td_uring_t *ring)
164 {
165 if (ring->shmem) {
166 munmap(ring->shmem, ring->shmem_size);
167 ring->shmem = NULL;
168 }
170 if (ring->shmem_path) {
171 shm_unlink(ring->shmem_path);
172 free(ring->shmem_path);
173 ring->shmem_path = NULL;
174 }
175 }
177 static int
178 tapdisk_uring_connect_shmem(td_uring_t *ring)
179 {
180 int fd, err;
181 td_uring_header_t header, *p;
183 fd = shm_open(ring->shmem_path, O_RDWR);
184 if (fd == -1)
185 return -errno;
187 p = mmap(NULL, sizeof(td_uring_header_t),
188 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
189 if (p == MAP_FAILED) {
190 err = -errno;
191 goto out;
192 }
194 memcpy(&header, p, sizeof(td_uring_header_t));
195 munmap(p, sizeof(td_uring_header_t));
197 if (memcmp(header.cookie,
198 TAPDISK_URING_COOKIE, sizeof(header.cookie))) {
199 err = -EINVAL;
200 goto out;
201 }
203 if (header.version != TD_URING_CURRENT_VERSION) {
204 err = -EINVAL;
205 goto out;
206 }
208 ring->ring_size = header.ring_size;
209 ring->data_size = header.data_size;
210 ring->shmem_size = header.shmem_size;
212 ring->shmem = mmap(NULL, ring->shmem_size,
213 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
214 if (ring->shmem == MAP_FAILED) {
215 rint->shmem = NULL;
216 err = -errno;
217 goto out;
218 }
220 err = 0;
222 out:
223 close(fd);
224 return err;
225 }
227 static void
228 tapdisk_uring_disconnect_shmem(td_uring_t *ring)
229 {
230 if (ring->shmem)
231 munmap(ring->shmem, ring->shmem_size);
232 free(ring->shmem_path);
233 ring->shmem_path = NULL;
234 }
236 int
237 tapdisk_uring_create(td_uring_t *ring, const char *location,
238 uint32_t ring_size, uint32_t data_size)
239 {
240 int fd, err;
242 memset(ring, 0, sizeof(td_uring_t));
244 ring->ring_size = ring_size;
245 ring->data_size = data_size;
246 ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t);
248 err = asprintf(&ring->shmem_path, "%s.shm", location);
249 if (err == -1) {
250 ring->shmem_path = NULL;
251 err = -errno;
252 goto fail;
253 }
255 err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
256 if (err == -1) {
257 ring->ctlfd_path = NULL;
258 err = -errno;
259 goto fail;
260 }
262 err = tapdisk_uring_create_ctlfd(ring);
263 if (err)
264 goto fail;
266 err = tapdisk_uring_create_shmem(ring);
267 if (err)
268 goto fail;
270 ring->ring_area = (unsigned long)ring->shmem + sizeof(td_uring_header_t);
271 ring->data_area = (unsigned long)ring->ring_area + ring->ring_size;
273 return 0;
275 fail:
276 tapdisk_uring_destroy(ring);
277 return err;
278 }
280 int
281 tapdisk_uring_destroy(td_uring_t *ring)
282 {
283 tapdisk_uring_destroy_shmem(ring);
284 tapdisk_uring_destroy_ctlfd(ring);
285 return 0;
286 }
288 int
289 tapdisk_uring_connect(td_uring_t *ring, const char *location)
290 {
291 int fd, err;
293 memset(ring, 0, sizeof(td_uring_t));
295 err = asprintf(&ring->shmem_path, "%s.shm", location);
296 if (err == -1) {
297 ring->shmem_path = NULL;
298 err = -errno;
299 goto fail;
300 }
302 err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
303 if (err == -1) {
304 ring->ctlfd_path = NULL;
305 err = -errno;
306 goto fail;
307 }
309 err = tapdisk_uring_connect_ctlfd(ring);
310 if (err)
311 goto fail;
313 err = tapdisk_uring_connect_shmem(ring);
314 if (err)
315 goto fail;
317 err = 0;
319 fail:
320 }
322 int
323 tapdisk_uring_disconnect(td_uring_t *ring)
324 {
325 tapdisk_uring_disconnect_shmem(ring);
326 tapdisk_uring_disconnect_ctlfd(ring);
327 return 0;
328 }
330 static int
331 tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout)
332 {
333 fd_set readfds;
334 int ret, len, offset;
335 struct timeval tv, *t;
337 t = NULL;
338 offset = 0;
339 len = sizeof(td_uring_message_t);
341 if (timeout) {
342 tv.tv_sec = timeout;
343 tv.tv_usec = 0;
344 t = &tv;
345 }
347 while (offset < len) {
348 FD_ZERO(&readfds);
349 FD_SET(fd, &readfds);
351 /* we don't bother reinitializing tv. at worst, it will wait a
352 * bit more time than expected. */
354 ret = select(fd + 1, &readfds, NULL, NULL, t);
355 if (ret == -1)
356 break;
357 else if (FD_ISSET(fd, &readfds)) {
358 ret = read(fd, message + offset, len - offset);
359 if (ret <= 0)
360 break;
361 offset += ret;
362 } else
363 break;
364 }
366 if (offset != len)
367 return -EIO;
369 return 0;
370 }
372 static int
373 tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout)
374 {
375 fd_set writefds;
376 int ret, len, offset;
377 struct timeval tv, *t;
379 t = NULL;
380 offset = 0;
381 len = sizeof(td_uring_message_t);
383 if (timeout) {
384 tv.tv_sec = timeout;
385 tv.tv_usec = 0;
386 t = &tv;
387 }
389 while (offset < len) {
390 FD_ZERO(&writefds);
391 FD_SET(fd, &writefds);
393 /* we don't bother reinitializing tv. at worst, it will wait a
394 * bit more time than expected. */
396 ret = select(fd + 1, NULL, &writefds, NULL, t);
397 if (ret == -1)
398 break;
399 else if (FD_ISSET(fd, &writefds)) {
400 ret = write(fd, message + offset, len - offset);
401 if (ret <= 0)
402 break;
403 offset += ret;
404 } else
405 break;
406 }
408 if (offset != len)
409 return -EIO;
411 return 0;
412 }
414 int
415 tapdisk_uring_poll(td_uring_t *ring)
416 {
417 int err;
418 td_uring_message_t message;
420 err = tapdisk_uring_read_message(ring->ctlfd, &message, 1);
421 if (err)
422 return err;
424 if (message.type != TAPDISK_URING_MESSAGE_KICK)
425 return -EINVAL;
427 return 0;
428 }
430 int
431 tapdisk_uring_kick(td_uring_t *ring)
432 {
433 td_uring_message_t message;
435 memset(&message, 0, sizeof(td_uring_message_t));
436 message.type = TAPDISK_URING_MESSAGE_KICK;
438 return tapdisk_uring_write_message(ring->ctlfd, &message, 1);
439 }