ia64/xen-unstable

view tools/blktap2/drivers/tapdisk-ipc.c @ 19647:1c627434605e

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image
format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:52:31 2009 +0100 (2009-05-26)
parents
children
line source
1 /*
2 * Copyright (c) 2008, XenSource Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of XenSource Inc. nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include <stdio.h>
29 #include <errno.h>
30 #include <stdlib.h>
31 #include <unistd.h>
32 #include <string.h>
34 #include "tapdisk.h"
35 #include "tapdisk-ipc.h"
36 #include "tapdisk-vbd.h"
37 #include "tapdisk-server.h"
39 static int
40 tapdisk_ipc_write_message(int fd, tapdisk_message_t *message, int timeout)
41 {
42 fd_set writefds;
43 int ret, len, offset;
44 struct timeval tv, *t;
46 t = NULL;
47 offset = 0;
48 len = sizeof(tapdisk_message_t);
50 if (timeout) {
51 tv.tv_sec = timeout;
52 tv.tv_usec = 0;
53 t = &tv;
54 }
56 DPRINTF("sending '%s' message (uuid = %u)\n",
57 tapdisk_message_name(message->type), message->cookie);
59 while (offset < len) {
60 FD_ZERO(&writefds);
61 FD_SET(fd, &writefds);
63 /* we don't bother reinitializing tv. at worst, it will wait a
64 * bit more time than expected. */
66 ret = select(fd + 1, NULL, &writefds, NULL, t);
67 if (ret == -1)
68 break;
69 else if (FD_ISSET(fd, &writefds)) {
70 ret = write(fd, message + offset, len - offset);
71 if (ret <= 0)
72 break;
73 offset += ret;
74 } else
75 break;
76 }
78 if (offset != len) {
79 EPRINTF("failure writing message\n");
80 return -EIO;
81 }
83 return 0;
84 }
86 int
87 tapdisk_ipc_write(td_ipc_t *ipc, int type)
88 {
89 tapdisk_message_t message;
91 if (ipc->wfd == -1)
92 return 0;
94 memset(&message, 0, sizeof(tapdisk_message_t));
95 message.type = type;
96 message.cookie = ipc->uuid;
98 return tapdisk_ipc_write_message(ipc->wfd, &message, 2);
99 }
101 int
102 tapdisk_ipc_write_error(td_ipc_t *ipc, const char *text)
103 {
104 tapdisk_message_t message;
106 memset(&message, 0, sizeof(message));
107 message.type = TAPDISK_MESSAGE_RUNTIME_ERROR;
108 message.cookie = ipc->uuid;
109 snprintf(message.u.string.text, sizeof(message.u.string.text), "%s", text);
111 return tapdisk_ipc_write_message(ipc->wfd, &message, 2);
112 }
114 static int
115 tapdisk_ipc_read_message(int fd, tapdisk_message_t *message, int timeout)
116 {
117 fd_set readfds;
118 int ret, len, offset;
119 struct timeval tv, *t;
121 t = NULL;
122 offset = 0;
123 len = sizeof(tapdisk_message_t);
125 if (timeout) {
126 tv.tv_sec = timeout;
127 tv.tv_usec = 0;
128 t = &tv;
129 }
131 memset(message, 0, sizeof(tapdisk_message_t));
133 while (offset < len) {
134 FD_ZERO(&readfds);
135 FD_SET(fd, &readfds);
137 /* we don't bother reinitializing tv. at worst, it will wait a
138 * bit more time than expected. */
140 ret = select(fd + 1, &readfds, NULL, NULL, t);
141 if (ret == -1)
142 break;
143 else if (FD_ISSET(fd, &readfds)) {
144 ret = read(fd, message + offset, len - offset);
145 if (ret <= 0)
146 break;
147 offset += ret;
148 } else
149 break;
150 }
152 if (offset != len) {
153 EPRINTF("failure reading message\n");
154 return -EIO;
155 }
157 DPRINTF("received '%s' message (uuid = %u)\n",
158 tapdisk_message_name(message->type), message->cookie);
160 return 0;
161 }
163 int
164 tapdisk_ipc_read(td_ipc_t *ipc)
165 {
166 int err;
167 td_vbd_t *vbd;
168 td_uuid_t uuid;
169 tapdisk_message_t message;
171 err = tapdisk_ipc_read_message(ipc->rfd, &message, 2);
172 if (err) {
173 tapdisk_server_check_state();
174 return err;
175 }
177 uuid = message.cookie;
178 vbd = tapdisk_server_get_vbd(uuid);
180 if (!vbd && message.type != TAPDISK_MESSAGE_PID) {
181 EPRINTF("received message for non-existing vbd: %u\n", uuid);
182 err = -EINVAL;
183 goto fail;
184 }
186 switch (message.type) {
187 case TAPDISK_MESSAGE_PID:
188 err = tapdisk_vbd_initialize(ipc->rfd, ipc->wfd, uuid);
190 memset(&message, 0, sizeof(tapdisk_message_t));
191 message.cookie = uuid;
193 if (!err) {
194 message.type = TAPDISK_MESSAGE_PID_RSP;
195 message.u.tapdisk_pid = getpid();
196 } else
197 message.type = TAPDISK_MESSAGE_ERROR;
199 return tapdisk_ipc_write_message(ipc->wfd, &message, 0);
201 case TAPDISK_MESSAGE_OPEN:
202 {
203 image_t image;
204 char *devname;
205 td_flag_t flags;
207 flags = 0;
209 if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY)
210 flags |= TD_OPEN_RDONLY;
211 if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED)
212 flags |= TD_OPEN_SHAREABLE;
213 if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE)
214 flags |= TD_OPEN_ADD_CACHE;
215 if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_VHD_INDEX)
216 flags |= TD_OPEN_VHD_INDEX;
217 if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY)
218 flags |= TD_OPEN_LOG_DIRTY;
220 err = asprintf(&devname, "%s/%s%d",
221 BLKTAP_DEV_DIR, BLKTAP_DEV_NAME,
222 message.u.params.devnum);
223 if (err == -1)
224 goto fail;
226 err = tapdisk_vbd_open(vbd,
227 message.u.params.path,
228 message.drivertype,
229 message.u.params.storage,
230 devname, flags);
231 free(devname);
232 if (err)
233 goto fail;
235 err = tapdisk_vbd_get_image_info(vbd, &image);
236 if (err)
237 goto fail;
239 memset(&message, 0, sizeof(tapdisk_message_t));
240 message.cookie = uuid;
241 message.u.image.sectors = image.size;
242 message.u.image.sector_size = image.secsize;
243 message.u.image.info = image.info;
244 message.type = TAPDISK_MESSAGE_OPEN_RSP;
246 return tapdisk_ipc_write_message(ipc->wfd, &message, 0);
247 }
249 case TAPDISK_MESSAGE_PAUSE:
250 tapdisk_vbd_pause(vbd);
251 return 0; /* response written asynchronously */
253 case TAPDISK_MESSAGE_RESUME:
254 tapdisk_vbd_resume(vbd,
255 message.u.params.path,
256 message.drivertype);
257 return 0; /* response written asynchronously */
259 case TAPDISK_MESSAGE_CLOSE:
260 tapdisk_vbd_close(vbd);
261 return 0; /* response written asynchronously */
263 case TAPDISK_MESSAGE_EXIT:
264 return 0;
265 }
267 err = -EINVAL;
268 EPRINTF("received unrecognized message %s, uuid = %d\n",
269 tapdisk_message_name(message.type), uuid);
271 fail:
272 memset(&message, 0, sizeof(tapdisk_message_t));
273 message.cookie = uuid;
274 message.type = TAPDISK_MESSAGE_ERROR;
275 tapdisk_ipc_write_message(ipc->wfd, &message, 2);
276 tapdisk_server_check_state();
278 return -err;
279 }