ia64/xen-unstable: tools/blktap/drivers/block-vmdk.c @ 15783:c93e2a822d6f

/* block-vmdk.c
 *
 * VMware Disk format implementation.
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 * This is largely the same as the vmdk driver in Qemu, I've just twisted it
 * to match our interfaces. The original (BSDish) copyright message appears
 * below:
 */

/*
 * Block driver for the VMDK format
 *
 * Copyright (c) 2004 Fabrice Bellard
 * Copyright (c) 2005 Filip Navara
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/statvfs.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <string.h>
#include "tapdisk.h"
#include "bswap.h"

/* Free a pointer and clear it, so a later double-free becomes a no-op.
 * Wrapped in do { ... } while (0) with no trailing semicolon so the
 * macro behaves as a single statement. */
#define safer_free(_x)                          \
        do {                                    \
                if ((_x) != NULL) {             \
                        free(_x);               \
                        (_x) = NULL;            \
                }                               \
        } while (0)

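/* Usage sketch (illustrative, not from the original): the do/while(0)
 * wrapper keeps the macro safe in unbraced conditionals, e.g.
 *     if (err)
 *             safer_free(buf);
 *     else
 *             reuse(buf);      (reuse() is a made-up stand-in)
 * which would mis-parse if the expansion ended with a stray semicolon.
 */
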
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')

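/*
 * On disk a VMDK image begins with the ASCII bytes "COWD" (VMDK3) or
 * "KDMV" (VMDK4). tdvmdk_open() reads those four bytes into a uint32_t
 * and applies be32_to_cpu(), so the result compares equal to the
 * constants above regardless of host endianness. For example, on a
 * little-endian host the raw bytes 'K','D','M','V' load as
 * ('V' << 24 | 'M' << 16 | 'D' << 8 | 'K'), and the byte swap turns
 * that into VMDK4_MAGIC.
 */
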
typedef struct {
        uint32_t version;
        uint32_t flags;
        uint32_t disk_sectors;
        uint32_t granularity;
        uint32_t l1dir_offset;
        uint32_t l1dir_size;
        uint32_t file_sectors;
        uint32_t cylinders;
        uint32_t heads;
        uint32_t sectors_per_track;
} VMDK3Header;

typedef struct {
        uint32_t version;
        uint32_t flags;
        int64_t capacity;
        int64_t granularity;
        int64_t desc_offset;
        int64_t desc_size;
        int32_t num_gtes_per_gte;
        int64_t rgd_offset;
        int64_t gd_offset;
        int64_t grain_offset;
        char filler[1];
        char check_bytes[4];
} __attribute__((packed)) VMDK4Header;

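/*
 * Illustrative sanity check (an addition, not part of the original
 * driver): every multi-byte field above is little-endian on disk, and
 * with the packed attribute the struct is exactly 73 bytes, matching
 * the on-disk layout that tdvmdk_open() reads wholesale. A negative
 * array size turns accidental padding into a compile error.
 */
typedef char vmdk4_header_size_check[sizeof(VMDK4Header) == 73 ? 1 : -1];
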
#define L2_CACHE_SIZE 16

struct tdvmdk_state {
        int fd;
        int poll_pipe[2]; /* dummy fd for polling on */

        unsigned int l1_size;
        int64_t l1_table_offset;
        int64_t l1_backup_table_offset;
        uint32_t l1_entry_sectors;
        unsigned int l2_size;

        uint32_t *l1_table;
        uint32_t *l1_backup_table;
        uint32_t *l2_cache;
        uint32_t l2_cache_offsets[L2_CACHE_SIZE];
        uint32_t l2_cache_counts[L2_CACHE_SIZE];

        unsigned int cluster_sectors;
};

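/*
 * The metadata forms a two-level lookup, much like a page table:
 * l1_table[i] holds the sector offset of an L2 table of l2_size
 * entries, and each L2 entry names the cluster backing cluster_sectors
 * sectors, so one L1 entry spans l1_entry_sectors =
 * l2_size * cluster_sectors sectors. With typical VMDK4 values
 * (illustrative, not mandated by the format) of granularity = 128
 * sectors (64 KiB grains) and l2_size = 512, each L1 entry covers
 * 65536 sectors (32 MiB), so a 1 GiB image (2097152 sectors) needs
 * l1_size = 32 entries.
 */
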
static inline void init_fds(struct disk_driver *dd)
{
        int i;
        struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;

        for (i = 0; i < MAX_IOFD; i++)
                dd->io_fd[i] = 0;

        dd->io_fd[0] = prv->poll_pipe[0];
}

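/*
 * The pipe trick above: tapdisk polls the fds a driver exposes, but
 * this driver completes every request synchronously inside its
 * queue_read/queue_write handlers, so it hands back the read end of an
 * otherwise unused pipe, which never becomes readable. (This reading
 * of tapdisk's dispatch model is an inference from the "won't fire"
 * comment below, not stated in this file.)
 */
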
/* Open the disk file and initialize driver state. */
static int tdvmdk_open(struct disk_driver *dd,
                       const char *name, td_flag_t flags)
{
        int ret, fd;
        int l1_size, i, o_flags;
        uint32_t magic;
        struct td_state *s = dd->td_state;
        struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;

        /* Set up a pipe so that we can hand back a poll fd that won't fire. */
        ret = pipe(prv->poll_pipe);
        if (ret != 0)
                return -1;

        /* Open the file. */
        o_flags = O_DIRECT | O_LARGEFILE |
                ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
        fd = open(name, o_flags);

        if ((fd == -1) && (errno == EINVAL)) {
                /* Maybe O_DIRECT isn't supported. */
                o_flags &= ~O_DIRECT;
                fd = open(name, o_flags);
                if (fd != -1)
                        DPRINTF("WARNING: Accessing image without "
                                "O_DIRECT! (%s)\n", name);
        } else if (fd != -1)
                DPRINTF("open(%s) with O_DIRECT\n", name);

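        /* Note (an observation, not in the original): O_DIRECT I/O on
         * Linux requires buffers, offsets and lengths aligned to the
         * logical sector size; the read/write paths below assume
         * tapdisk supplies suitably aligned buffers. */
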
        if (fd == -1) {
                DPRINTF("Unable to open [%s]!\n", name);
                ret = 0 - errno;
                return -1;
        }

        prv->fd = fd;

        /* Grok the vmdk header. */
        if ((ret = read(fd, &magic, sizeof(magic))) != sizeof(magic))
                goto fail;
        magic = be32_to_cpu(magic);
        if (magic == VMDK3_MAGIC) {
                VMDK3Header header;
                if (read(fd, &header, sizeof(header)) != sizeof(header))
                        goto fail;
                prv->cluster_sectors = le32_to_cpu(header.granularity);
                prv->l2_size = 1 << 9;
                prv->l1_size = 1 << 6;
                s->size = le32_to_cpu(header.disk_sectors);
                prv->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
                prv->l1_backup_table_offset = 0;
                prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
        } else if (magic == VMDK4_MAGIC) {
                VMDK4Header header;

                if (read(fd, &header, sizeof(header)) != sizeof(header))
                        goto fail;
                /* capacity and granularity are 64-bit fields on disk. */
                s->size = le64_to_cpu(header.capacity);
                prv->cluster_sectors = le64_to_cpu(header.granularity);
                prv->l2_size = le32_to_cpu(header.num_gtes_per_gte);
                prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
                if (prv->l1_entry_sectors <= 0)
                        goto fail;
                prv->l1_size = (s->size + prv->l1_entry_sectors - 1)
                        / prv->l1_entry_sectors;
                prv->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
                prv->l1_backup_table_offset =
                        le64_to_cpu(header.gd_offset) << 9;
        } else {
                goto fail;
        }
        /* Read the L1 table. */
        l1_size = prv->l1_size * sizeof(uint32_t);
        prv->l1_table = malloc(l1_size);
        if (!prv->l1_table)
                goto fail;
        if (lseek(fd, prv->l1_table_offset, SEEK_SET) == -1)
                goto fail;
        if (read(fd, prv->l1_table, l1_size) != l1_size)
                goto fail;
        for (i = 0; i < prv->l1_size; i++) {
                le32_to_cpus(&prv->l1_table[i]);
        }

        if (prv->l1_backup_table_offset) {
                prv->l1_backup_table = malloc(l1_size);
                if (!prv->l1_backup_table)
                        goto fail;
                if (lseek(fd, prv->l1_backup_table_offset, SEEK_SET) == -1)
                        goto fail;
                if (read(fd, prv->l1_backup_table, l1_size) != l1_size)
                        goto fail;
                for (i = 0; i < prv->l1_size; i++) {
                        le32_to_cpus(&prv->l1_backup_table[i]);
                }
        }

        prv->l2_cache = malloc(prv->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
        if (!prv->l2_cache)
                goto fail;
        init_fds(dd);
        DPRINTF("VMDK File opened successfully\n");
        return 0;

fail:
        DPRINTF("VMDK File open failed.\n");
        /* safer_free() NULLs the pointers so a later td_close cannot
         * double-free them. */
        safer_free(prv->l1_backup_table);
        safer_free(prv->l1_table);
        safer_free(prv->l2_cache);
        close(fd);
        return -1;
}

static uint64_t get_cluster_offset(struct tdvmdk_state *prv,
                                   uint64_t offset, int allocate)
{
        unsigned int l1_index, l2_offset, l2_index;
        int min_index, i, j;
        uint32_t min_count, *l2_table, tmp;
        uint64_t cluster_offset;

        /* Walk the two-level table: L1 entry -> L2 table -> cluster. */
        l1_index = (offset >> 9) / prv->l1_entry_sectors;
        if (l1_index >= prv->l1_size)
                return 0;
        l2_offset = prv->l1_table[l1_index];
        if (!l2_offset)
                return 0;
        for (i = 0; i < L2_CACHE_SIZE; i++) {
                if (l2_offset == prv->l2_cache_offsets[i]) {
                        /* Increment the hit count. */
                        if (++prv->l2_cache_counts[i] == 0xffffffff) {
                                /* Age all entries so counts never wrap. */
                                for (j = 0; j < L2_CACHE_SIZE; j++) {
                                        prv->l2_cache_counts[j] >>= 1;
                                }
                        }
                        l2_table = prv->l2_cache + (i * prv->l2_size);
                        goto found;
                }
        }
        /* Not found: load a new entry over the least used one. */
        min_index = 0;
        min_count = 0xffffffff;
        for (i = 0; i < L2_CACHE_SIZE; i++) {
                if (prv->l2_cache_counts[i] < min_count) {
                        min_count = prv->l2_cache_counts[i];
                        min_index = i;
                }
        }
        l2_table = prv->l2_cache + (min_index * prv->l2_size);
        lseek(prv->fd, (int64_t)l2_offset * 512, SEEK_SET);
        if (read(prv->fd, l2_table, prv->l2_size * sizeof(uint32_t)) !=
            prv->l2_size * sizeof(uint32_t))
                return 0;
        prv->l2_cache_offsets[min_index] = l2_offset;
        prv->l2_cache_counts[min_index] = 1;
found:
        l2_index = ((offset >> 9) / prv->cluster_sectors) % prv->l2_size;
        cluster_offset = le32_to_cpu(l2_table[l2_index]);
        if (!cluster_offset) {
                if (!allocate)
                        return 0;
                /* Allocate a new cluster at the end of the file. */
                cluster_offset = lseek(prv->fd, 0, SEEK_END);
                if (ftruncate(prv->fd, cluster_offset +
                              (prv->cluster_sectors << 9)))
                        return 0;
                cluster_offset >>= 9;
                /* Update the L2 table. */
                tmp = cpu_to_le32(cluster_offset);
                l2_table[l2_index] = tmp;
                lseek(prv->fd, ((int64_t)l2_offset * 512) +
                      (l2_index * sizeof(tmp)), SEEK_SET);
                if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
                        return 0;
                /* Update the backup L2 table. */
                if (prv->l1_backup_table_offset != 0) {
                        l2_offset = prv->l1_backup_table[l1_index];
                        lseek(prv->fd, ((int64_t)l2_offset * 512) +
                              (l2_index * sizeof(tmp)), SEEK_SET);
                        if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
                                return 0;
                }
        }
        cluster_offset <<= 9;
        return cluster_offset;
}

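/*
 * Worked example for get_cluster_offset() (illustrative numbers): with
 * cluster_sectors = 128 and l2_size = 512, byte offset 104857600
 * (100 MiB) is sector 204800, so l1_index = 204800 / 65536 = 3 and
 * l2_index = (204800 / 128) % 512 = 64; the offset falls at byte 0 of
 * its 64 KiB grain.
 */
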
static int tdvmdk_queue_read(struct disk_driver *dd, uint64_t sector,
                             int nb_sectors, char *buf, td_callback_t cb,
                             int id, void *private)
{
        struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
        int index_in_cluster, n;
        uint64_t cluster_offset;
        int ret = 0;

        while (nb_sectors > 0) {
                cluster_offset = get_cluster_offset(prv, sector << 9, 0);
                index_in_cluster = sector % prv->cluster_sectors;
                n = prv->cluster_sectors - index_in_cluster;
                if (n > nb_sectors)
                        n = nb_sectors;
                if (!cluster_offset) {
                        /* Unallocated cluster: reads return zeroes. */
                        memset(buf, 0, 512 * n);
                } else {
                        lseek(prv->fd, cluster_offset + index_in_cluster * 512,
                              SEEK_SET);
                        ret = read(prv->fd, buf, n * 512);
                        if (ret != n * 512) {
                                ret = -1;
                                goto done;
                        }
                }
                nb_sectors -= n;
                sector += n;
                buf += n * 512;
        }
done:
        return cb(dd, ret == -1 ? -1 : 0, sector, nb_sectors, id, private);
}

static int tdvmdk_queue_write(struct disk_driver *dd, uint64_t sector,
                              int nb_sectors, char *buf, td_callback_t cb,
                              int id, void *private)
{
        struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
        int index_in_cluster, n;
        uint64_t cluster_offset;
        int ret = 0;

        while (nb_sectors > 0) {
                /* Unlike the read path's '%', this masking assumes
                 * cluster_sectors is a power of two. */
                index_in_cluster = sector & (prv->cluster_sectors - 1);
                n = prv->cluster_sectors - index_in_cluster;
                if (n > nb_sectors)
                        n = nb_sectors;
                cluster_offset = get_cluster_offset(prv, sector << 9, 1);
                if (!cluster_offset) {
                        ret = -1;
                        goto done;
                }
                lseek(prv->fd, cluster_offset + index_in_cluster * 512,
                      SEEK_SET);
                ret = write(prv->fd, buf, n * 512);
                if (ret != n * 512) {
                        ret = -1;
                        goto done;
                }
                nb_sectors -= n;
                sector += n;
                buf += n * 512;
        }
done:
        return cb(dd, ret == -1 ? -1 : 0, sector, nb_sectors, id, private);
}

static int tdvmdk_submit(struct disk_driver *dd)
{
        return 0;
}

static int tdvmdk_close(struct disk_driver *dd)
{
        struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;

        safer_free(prv->l1_table);
        safer_free(prv->l1_backup_table);
        safer_free(prv->l2_cache);
        close(prv->fd);
        close(prv->poll_pipe[0]);
        close(prv->poll_pipe[1]);
        return 0;
}

static int tdvmdk_do_callbacks(struct disk_driver *dd, int sid)
{
        /* Always ask for a kick. */
        return 1;
}

static int tdvmdk_get_parent_id(struct disk_driver *dd, struct disk_id *id)
{
        return TD_NO_PARENT;
}

static int tdvmdk_validate_parent(struct disk_driver *dd,
                                  struct disk_driver *parent, td_flag_t flags)
{
        return -EINVAL;
}

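/*
 * Entry-point table: tapdisk drives this backend entirely through the
 * function pointers below (the surrounding registration machinery
 * lives outside this file, in tapdisk.h, and is assumed here rather
 * than shown).
 */
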
struct tap_disk tapdisk_vmdk = {
        .disk_type          = "tapdisk_vmdk",
        .private_data_size  = sizeof(struct tdvmdk_state),
        .td_open            = tdvmdk_open,
        .td_queue_read      = tdvmdk_queue_read,
        .td_queue_write     = tdvmdk_queue_write,
        .td_submit          = tdvmdk_submit,
        .td_close           = tdvmdk_close,
        .td_do_callbacks    = tdvmdk_do_callbacks,
        .td_get_parent_id   = tdvmdk_get_parent_id,
        .td_validate_parent = tdvmdk_validate_parent
};