ia64/xen-unstable

tools/blktap/drivers/block-qcow.c @ 14105:e9bd3267ff23

[TAPDISK] honor read-only attributes when creating tap-based VBDs
Signed-off-by: Jake Wires <jwires@xensource.com>
author Jake Wires <jwires@xensource.com>
date Fri Feb 23 17:26:07 2007 -0800 (2007-02-23)
parents 8407279d3751
children d64c0af015dd
/* block-qcow.c
 *
 * Asynchronous Qemu copy-on-write disk implementation.
 * Code based on the Qemu implementation
 * (see copyright notice below)
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 */

/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/statvfs.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <string.h>
#include <zlib.h>
#include <inttypes.h>
#include <libaio.h>
#include <openssl/md5.h>
#include "bswap.h"
#include "aes.h"
#include "tapdisk.h"
#if 1
#define ASSERT(_p) \
	if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
	__LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p) ((void)0)
#endif

#define ROUNDUP(l, s) \
({ \
	(uint64_t)( \
	(l + (s - 1)) - ((l + (s - 1)) % s)); \
})
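
/*
 * For illustration: ROUNDUP(l, s) rounds l up to the next multiple of s,
 * e.g. ROUNDUP(4097, 4096) == 8192 while ROUNDUP(4096, 4096) == 4096.
 */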

/******AIO DEFINES******/
#define REQUEST_ASYNC_FD 1
#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)

struct pending_aio {
	td_callback_t cb;
	int id;
	void *private;
	int nb_sectors;
	char *buf;
	uint64_t sector;
};

#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

#define ZERO_TEST(_b) (_b | 0x00)

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE 0x00
#define QCOW_CRYPT_AES  0x01

#define QCOW_OFLAG_COMPRESSED (1LL << 63)
#define SPARSE_FILE 0x01

#ifndef O_BINARY
#define O_BINARY 0
#endif

typedef struct QCowHeader {
	uint32_t magic;
	uint32_t version;
	uint64_t backing_file_offset;
	uint32_t backing_file_size;
	uint32_t mtime;
	uint64_t size; /* in bytes */
	uint8_t cluster_bits;
	uint8_t l2_bits;
	uint32_t crypt_method;
	uint64_t l1_table_offset;
} QCowHeader;

/*Extended header for Xen enhancements*/
typedef struct QCowHeader_ext {
	uint32_t xmagic;
	uint32_t cksum;
	uint32_t min_cluster_alloc;
	uint32_t flags;
} QCowHeader_ext;

#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/

struct tdqcow_state {
	int fd;                        /*Main Qcow file descriptor */
	uint64_t fd_end;               /*Store a local record of file length */
	char *name;                    /*Record of the filename*/
	uint32_t backing_file_size;
	uint64_t backing_file_offset;
	int encrypted;                 /*File contents are encrypted or plain*/
	int cluster_bits;              /*Determines length of cluster as
					*indicated by file hdr*/
	int cluster_size;              /*Length of cluster*/
	int cluster_sectors;           /*Number of sectors per cluster*/
	int cluster_alloc;             /*Blktap fix for allocating full
					*extents*/
	int min_cluster_alloc;         /*Blktap historical extent alloc*/
	int sparse;                    /*Indicates whether to preserve sparseness*/
	int l2_bits;                   /*Size of L2 table entry*/
	int l2_size;                   /*Full table size*/
	int l1_size;                   /*L1 table size*/
	uint64_t cluster_offset_mask;
	uint64_t l1_table_offset;      /*L1 table offset from beginning of
					*file*/
	uint64_t *l1_table;            /*L1 table entries*/
	uint64_t *l2_cache;            /*We maintain a cache of size
					*L2_CACHE_SIZE of most read entries*/
	uint64_t l2_cache_offsets[L2_CACHE_SIZE];     /*L2 cache entries*/
	uint32_t l2_cache_counts[L2_CACHE_SIZE];      /*Cache access record*/
	uint8_t *cluster_cache;
	uint8_t *cluster_data;
	uint8_t *sector_lock;          /*Locking bitmap for AIO reads/writes*/
	uint64_t cluster_cache_offset;
	uint32_t crypt_method;         /*current crypt method, 0 if no
					*key yet */
	uint32_t crypt_method_header;
	AES_KEY aes_encrypt_key;       /*AES key*/
	AES_KEY aes_decrypt_key;       /*AES key*/

	/* libaio state */
	io_context_t       aio_ctx;
	struct iocb        iocb_list  [MAX_AIO_REQS];
	struct iocb       *iocb_free  [MAX_AIO_REQS];
	struct pending_aio pending_aio[MAX_AIO_REQS];
	int                iocb_free_count;
	struct iocb       *iocb_queue[MAX_AIO_REQS];
	int                iocb_queued;
	int                poll_fd;    /* NB: we require aio_poll support */
	struct io_event    aio_events[MAX_AIO_REQS];
};

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);

static int init_aio_state(struct disk_driver *dd)
{
	int i;
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;

	/*Initialize Locking bitmap*/
	s->sector_lock = calloc(1, bs->size);

	if (!s->sector_lock) {
		DPRINTF("Failed to allocate sector lock\n");
		goto fail;
	}

	/* Initialize AIO */
	s->iocb_free_count = MAX_AIO_REQS;
	s->iocb_queued     = 0;

	/*Signal kernel to create Poll FD for Async completion events*/
	s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
	s->poll_fd = io_setup(MAX_AIO_REQS, &s->aio_ctx);

	if (s->poll_fd < 0) {
		if (s->poll_fd == -EAGAIN) {
			DPRINTF("Couldn't setup AIO context. If you are "
				"trying to concurrently use a large number "
				"of blktap-based disks, you may need to "
				"increase the system-wide aio request limit. "
				"(e.g. 'echo 1048576 > /proc/sys/fs/"
				"aio-max-nr')\n");
		} else {
			DPRINTF("Couldn't get fd for AIO poll support. This "
				"is probably because your kernel does not "
				"have the aio-poll patch applied.\n");
		}
		goto fail;
	}

	for (i = 0; i < MAX_AIO_REQS; i++)
		s->iocb_free[i] = &s->iocb_list[i];

	DPRINTF("AIO state initialised\n");

	return 0;

 fail:
	return -1;
}

static uint32_t gen_cksum(char *ptr, int len)
{
	unsigned char *md;
	uint32_t ret;

	md = malloc(MD5_DIGEST_LENGTH);

	if (!md) return 0;

	if (MD5((unsigned char *)ptr, len, md) != md) {
		free(md);
		return 0;
	}

	memcpy(&ret, md, sizeof(uint32_t));
	free(md);
	return ret;
}

static int get_filesize(char *filename, uint64_t *size, struct stat *st)
{
	int fd;
	QCowHeader header;

	/*Set to the backing file size*/
	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return -1;
	/* treat a short read (or -1) as failure */
	if (read(fd, &header, sizeof(header)) != sizeof(header)) {
		close(fd);
		return -1;
	}
	close(fd);

	be32_to_cpus(&header.magic);
	be64_to_cpus(&header.size);
	if (header.magic == QCOW_MAGIC) {
		*size = header.size >> SECTOR_SHIFT;
		return 0;
	}

	if (S_ISBLK(st->st_mode)) {
		fd = open(filename, O_RDONLY);
		if (fd < 0)
			return -1;
		if (ioctl(fd, BLKGETSIZE, size) != 0) {
			printf("Unable to get Block device size\n");
			close(fd);
			return -1;
		}
		close(fd);
	} else *size = (st->st_size >> SECTOR_SHIFT);
	return 0;
}

static int qcow_set_key(struct tdqcow_state *s, const char *key)
{
	uint8_t keybuf[16];
	int len, i;

	memset(keybuf, 0, 16);
	len = strlen(key);
	if (len > 16)
		len = 16;
	/* XXX: we could compress the chars to 7 bits to increase
	   entropy */
	for (i = 0; i < len; i++) {
		keybuf[i] = key[i];
	}
	s->crypt_method = s->crypt_method_header;

	if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
		return -1;
	if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
		return -1;
#if 0
	/* test */
	{
		uint8_t in[16];
		uint8_t out[16];
		uint8_t tmp[16];
		for (i = 0; i < 16; i++)
			in[i] = i;
		AES_encrypt(in, tmp, &s->aes_encrypt_key);
		AES_decrypt(tmp, out, &s->aes_decrypt_key);
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", tmp[i]);
		DPRINTF("\n");
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", out[i]);
		DPRINTF("\n");
	}
#endif
	return 0;
}

static int async_read(struct tdqcow_state *s, int size,
		      uint64_t offset, char *buf, td_callback_t cb,
		      int id, uint64_t sector, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size/512;
	pio->buf = buf;
	pio->sector = sector;

	io_prep_pread(io, s->fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

static int async_write(struct tdqcow_state *s, int size,
		       uint64_t offset, char *buf, td_callback_t cb,
		       int id, uint64_t sector, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size/512;
	pio->buf = buf;
	pio->sector = sector;

	io_prep_pwrite(io, s->fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

/*TODO: Fix sector span!*/
static int aio_can_lock(struct tdqcow_state *s, uint64_t sector)
{
	return (s->sector_lock[sector] ? 0 : 1);
}

static int aio_lock(struct tdqcow_state *s, uint64_t sector)
{
	return ++s->sector_lock[sector];
}

static void aio_unlock(struct tdqcow_state *s, uint64_t sector)
{
	if (!s->sector_lock[sector]) return;

	--s->sector_lock[sector];
	return;
}

/*
 * The crypt function is compatible with the linux cryptoloop
 * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 * supported.
 */
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
			    uint8_t *out_buf, const uint8_t *in_buf,
			    int nb_sectors, int enc,
			    const AES_KEY *key)
{
	union {
		uint64_t ll[2];
		uint8_t b[16];
	} ivec;
	int i;

	for (i = 0; i < nb_sectors; i++) {
		ivec.ll[0] = cpu_to_le64(sector_num);
		ivec.ll[1] = 0;
		AES_cbc_encrypt(in_buf, out_buf, 512, key,
				ivec.b, enc);
		sector_num++;
		in_buf += 512;
		out_buf += 512;
	}
}
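
/*
 * For illustration: each 512-byte sector above gets its own CBC IV,
 * the little-endian sector number in the low eight bytes and zero in
 * the high eight, so sector 5 is chained from
 * IV = 05 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00.
 * The per-sector IV keeps sectors independently decryptable and
 * matches the cryptoloop scheme noted above.
 */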

static int qtruncate(int fd, off_t length, int sparse)
{
	int ret, i;
	int current = 0, rem = 0;
	uint64_t sectors;
	struct stat st;
	char *buf;

	/* If length is greater than the current file len
	 * we synchronously write zeroes to the end of the
	 * file, otherwise we truncate the length down
	 */
	ret = fstat(fd, &st);
	if (ret == -1)
		return -1;
	if (S_ISBLK(st.st_mode))
		return 0;

	sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	rem     = st.st_size % DEFAULT_SECTOR_SIZE;

	/* If we are extending this file, we write zeros to the end --
	 * this tries to ensure that the extents allocated wind up being
	 * contiguous on disk.
	 */
	if (st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
		/*We are extending the file*/
		if ((ret = posix_memalign((void **)&buf,
					  512, DEFAULT_SECTOR_SIZE))) {
			DPRINTF("posix_memalign failed: %d\n", ret);
			return -1;
		}
		memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
		if (lseek(fd, 0, SEEK_END) == -1) {
			DPRINTF("Lseek EOF failed (%d), internal error\n",
				errno);
			free(buf);
			return -1;
		}
		if (rem) {
			/* zero-fill the tail of the partial sector first */
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE - rem);
			if (ret != DEFAULT_SECTOR_SIZE - rem) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		for (i = current; i < sectors; i++ ) {
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
			if (ret != DEFAULT_SECTOR_SIZE) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		free(buf);
	} else if (sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
		if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE) == -1) {
			DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
			return -1;
		}
	return 0;
}
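
/*
 * For illustration, assuming DEFAULT_SECTOR_SIZE is 512 (defined in the
 * tapdisk headers): qtruncate(fd, 2048, sparse) on a 700-byte regular
 * file writes 324 zero bytes to complete the partial second sector,
 * then two full 512-byte zero sectors, leaving a 2048-byte file;
 * shrinking only happens when 'sparse' is nonzero.
 */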

/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 */
static uint64_t get_cluster_offset(struct tdqcow_state *s,
				   uint64_t offset, int allocate,
				   int compressed_size,
				   int n_start, int n_end)
{
	int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
	char *tmp_ptr, *tmp_ptr2, *l2_ptr, *l1_ptr;
	uint64_t l2_offset, *l2_table, cluster_offset, tmp;
	uint32_t min_count;
	int new_l2_table;

	/*Check L1 table for the extent offset*/
	l1_index = offset >> (s->l2_bits + s->cluster_bits);
	l2_offset = s->l1_table[l1_index];
	new_l2_table = 0;
	if (!l2_offset) {
		if (!allocate)
			return 0;
		/*
		 * allocating a new l2 entry + extent
		 * at the end of the file, we must also
		 * update the L1 entry safely.
		 */
		l2_offset = s->fd_end;

		/* round to cluster size */
		l2_offset = (l2_offset + s->cluster_size - 1)
			& ~(s->cluster_size - 1);

		/* update the L1 entry */
		s->l1_table[l1_index] = l2_offset;
		tmp = cpu_to_be64(l2_offset);

		/*Truncate file for L2 table
		 *(initialised to zero in case we crash)*/
		if (qtruncate(s->fd,
			      l2_offset + (s->l2_size * sizeof(uint64_t)),
			      s->sparse) != 0) {
			DPRINTF("ERROR truncating file\n");
			return 0;
		}
		s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));

		/*Update the L1 table entry on disk
		 * (for O_DIRECT we write 4KByte blocks)*/
		l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
		l1_ptr = (char *)s->l1_table + (l1_sector << 12);

		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L1 table\n");
			return 0;
		}
		memcpy(tmp_ptr, l1_ptr, 4096);

		/*
		 * Issue non-asynchronous L1 write.
		 * For safety, we must ensure that
		 * entry is written before blocks.
		 */
		lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
		if (write(s->fd, tmp_ptr, 4096) != 4096) {
			free(tmp_ptr);
			return 0;
		}
		free(tmp_ptr);

		new_l2_table = 1;
		goto cache_miss;
	} else if (s->min_cluster_alloc == s->l2_size) {
		/*Fast-track the request*/
		cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
		l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
		return cluster_offset + (l2_index * s->cluster_size);
	}

	/*Check to see if L2 entry is already cached*/
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (l2_offset == s->l2_cache_offsets[i]) {
			/* increment the hit count */
			if (++s->l2_cache_counts[i] == 0xffffffff) {
				for (j = 0; j < L2_CACHE_SIZE; j++) {
					s->l2_cache_counts[j] >>= 1;
				}
			}
			l2_table = s->l2_cache + (i << s->l2_bits);
			goto found;
		}
	}

cache_miss:
	/* not found: load a new entry in the least used one */
	min_index = 0;
	min_count = 0xffffffff;
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (s->l2_cache_counts[i] < min_count) {
			min_count = s->l2_cache_counts[i];
			min_index = i;
		}
	}
	l2_table = s->l2_cache + (min_index << s->l2_bits);

	/*If extent pre-allocated, read table from disk,
	 *otherwise write new table to disk*/
	if (new_l2_table) {
		/*Should we allocate the whole extent? Adjustable parameter.*/
		if (s->cluster_alloc == s->l2_size) {
			cluster_offset = l2_offset +
				(s->l2_size * sizeof(uint64_t));
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			if (qtruncate(s->fd, cluster_offset +
				      (s->cluster_size * s->l2_size),
				      s->sparse) != 0) {
				DPRINTF("ERROR truncating file\n");
				return 0;
			}
			s->fd_end = cluster_offset +
				(s->cluster_size * s->l2_size);
			for (i = 0; i < s->l2_size; i++) {
				l2_table[i] = cpu_to_be64(cluster_offset +
							  (i * s->cluster_size));
			}
		} else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));

		lseek(s->fd, l2_offset, SEEK_SET);
		if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	} else {
		lseek(s->fd, l2_offset, SEEK_SET);
		if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	}

	/*Update the cache entries*/
	s->l2_cache_offsets[min_index] = l2_offset;
	s->l2_cache_counts[min_index] = 1;

found:
	/*The extent is split into 's->l2_size' blocks of
	 *size 's->cluster_size'*/
	l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
	cluster_offset = be64_to_cpu(l2_table[l2_index]);

	if (!cluster_offset ||
	    ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
		if (!allocate)
			return 0;

		if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
		    (n_end - n_start) < s->cluster_sectors) {
			/* cluster is already allocated but compressed, we must
			   decompress it in the case it is not completely
			   overwritten */
			if (decompress_cluster(s, cluster_offset) < 0)
				return 0;
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			/* write the cluster content - not asynchronous */
			lseek(s->fd, cluster_offset, SEEK_SET);
			if (write(s->fd, s->cluster_cache, s->cluster_size) !=
			    s->cluster_size)
				return 0;
		} else {
			/* allocate a new cluster */
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			if (allocate == 1) {
				/* round to cluster size */
				cluster_offset =
					(cluster_offset + s->cluster_size - 1)
					& ~(s->cluster_size - 1);
				if (qtruncate(s->fd, cluster_offset +
					      s->cluster_size, s->sparse) != 0) {
					DPRINTF("ERROR truncating file\n");
					return 0;
				}
				s->fd_end = (cluster_offset + s->cluster_size);
				/* if encrypted, we must initialize the cluster
				   content which won't be written */
				if (s->crypt_method &&
				    (n_end - n_start) < s->cluster_sectors) {
					uint64_t start_sect;
					start_sect = (offset &
						      ~(s->cluster_size - 1))
						>> 9;
					memset(s->cluster_data + 512,
					       0xaa, 512);
					for (i = 0; i < s->cluster_sectors; i++) {
						if (i < n_start || i >= n_end) {
							encrypt_sectors(s, start_sect + i,
									s->cluster_data,
									s->cluster_data + 512, 1, 1,
									&s->aes_encrypt_key);
							lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
							if (write(s->fd, s->cluster_data, 512) != 512)
								return 0;
						}
					}
				}
			} else {
				cluster_offset |= QCOW_OFLAG_COMPRESSED |
					(uint64_t)compressed_size
						<< (63 - s->cluster_bits);
			}
		}
		/* update L2 table */
		tmp = cpu_to_be64(cluster_offset);
		l2_table[l2_index] = tmp;

		/*For IO_DIRECT we write 4KByte blocks*/
		l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
		l2_ptr = (char *)l2_table + (l2_sector << 12);

		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L2 table\n");
			return 0;
		}
		memcpy(tmp_ptr2, l2_ptr, 4096);
		lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
		write(s->fd, tmp_ptr2, 4096);
		free(tmp_ptr2);
	}
	return cluster_offset;
}
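
/*
 * For illustration: a guest byte offset is decomposed above as
 *   l1_index = offset >> (l2_bits + cluster_bits)
 *   l2_index = (offset >> cluster_bits) & (l2_size - 1)
 * With the standalone-image defaults written by qcow_create()
 * (cluster_bits = 12, l2_bits = 9), offset 0x12345678 yields
 * l1_index = 0x91 and l2_index = 0x145, and the final byte address is
 * the L2 entry for that cluster plus (offset & 0xfff) within the
 * 4 KB cluster.
 */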

static void init_cluster_cache(struct disk_driver *dd)
{
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	uint32_t count = 0;
	int i, cluster_entries;

	cluster_entries = s->cluster_size / 512;
	DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
		cluster_entries, s->cluster_size);

	for (i = 0; i < bs->size; i += cluster_entries) {
		if (get_cluster_offset(s, i << 9, 0, 0, 0, 1)) count++;
		if (count >= L2_CACHE_SIZE) return;
	}
	DPRINTF("Finished cluster initialisation, added %d entries\n", count);
	return;
}

static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
			     int nb_sectors, int *pnum)
{
	int index_in_cluster, n;
	uint64_t cluster_offset;

	cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
	index_in_cluster = sector_num & (s->cluster_sectors - 1);
	n = s->cluster_sectors - index_in_cluster;
	if (n > nb_sectors)
		n = nb_sectors;
	*pnum = n;
	return (cluster_offset != 0);
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
			     const uint8_t *buf, int buf_size)
{
	z_stream strm1, *strm = &strm1;
	int ret, out_len;

	memset(strm, 0, sizeof(*strm));

	strm->next_in = (uint8_t *)buf;
	strm->avail_in = buf_size;
	strm->next_out = out_buf;
	strm->avail_out = out_buf_size;

	ret = inflateInit2(strm, -12);
	if (ret != Z_OK)
		return -1;
	ret = inflate(strm, Z_FINISH);
	out_len = strm->next_out - out_buf;
	if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
	     (out_len != out_buf_size) ) {
		inflateEnd(strm);
		return -1;
	}
	inflateEnd(strm);
	return 0;
}
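
/*
 * For illustration: the -12 passed to inflateInit2() selects raw
 * deflate data (no zlib header or checksum) with a 4 KB window; it
 * mirrors the -12 windowBits handed to deflateInit2() in
 * qcow_compress_cluster() below, so both sides agree on the format.
 */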

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
{
	int ret, csize;
	uint64_t coffset;

	coffset = cluster_offset & s->cluster_offset_mask;
	if (s->cluster_cache_offset != coffset) {
		csize = cluster_offset >> (63 - s->cluster_bits);
		csize &= (s->cluster_size - 1);
		lseek(s->fd, coffset, SEEK_SET);
		ret = read(s->fd, s->cluster_data, csize);
		if (ret != csize)
			return -1;
		if (decompress_buffer(s->cluster_cache, s->cluster_size,
				      s->cluster_data, csize) < 0) {
			return -1;
		}
		s->cluster_cache_offset = coffset;
	}
	return 0;
}

static inline void init_fds(struct disk_driver *dd)
{
	int i;
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;

	for (i = 0; i < MAX_IOFD; i++)
		dd->io_fd[i] = 0;

	dd->io_fd[0] = s->poll_fd;
}

/* Open the disk file and initialize qcow state. */
int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
{
	int fd, len, i, shift, ret, size, l1_table_size, o_flags;
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	char *buf;
	QCowHeader *header;
	QCowHeader_ext *exthdr;
	uint32_t cksum;
	uint64_t final_cluster = 0;

	DPRINTF("QCOW: Opening %s\n", name);

	o_flags = O_DIRECT | O_LARGEFILE |
		((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
	fd = open(name, o_flags);
	if (fd < 0) {
		DPRINTF("Unable to open %s (%d)\n", name, 0 - errno);
		return -1;
	}

	s->fd = fd;
	asprintf(&s->name, "%s", name);

	ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);

	ret = posix_memalign((void **)&buf, 512, 512);
	if (ret != 0) goto fail;

	if (read(fd, buf, 512) != 512)
		goto fail;

	header = (QCowHeader *)buf;
	be32_to_cpus(&header->magic);
	be32_to_cpus(&header->version);
	be64_to_cpus(&header->backing_file_offset);
	be32_to_cpus(&header->backing_file_size);
	be32_to_cpus(&header->mtime);
	be64_to_cpus(&header->size);
	be32_to_cpus(&header->crypt_method);
	be64_to_cpus(&header->l1_table_offset);

	if (header->magic != QCOW_MAGIC || header->version > QCOW_VERSION)
		goto fail;
	if (header->size <= 1 || header->cluster_bits < 9)
		goto fail;
	if (header->crypt_method > QCOW_CRYPT_AES)
		goto fail;
	s->crypt_method_header = header->crypt_method;
	if (s->crypt_method_header)
		s->encrypted = 1;
	s->cluster_bits = header->cluster_bits;
	s->cluster_size = 1 << s->cluster_bits;
	s->cluster_sectors = 1 << (s->cluster_bits - 9);
	s->l2_bits = header->l2_bits;
	s->l2_size = 1 << s->l2_bits;
	s->cluster_alloc = s->l2_size;
	bs->size = header->size / 512;
	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
	s->backing_file_offset = header->backing_file_offset;
	s->backing_file_size   = header->backing_file_size;

	/* read the level 1 table */
	shift = s->cluster_bits + s->l2_bits;
	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;

	s->l1_table_offset = header->l1_table_offset;

	/*allocate a 4Kbyte multiple of memory*/
	l1_table_size = s->l1_size * sizeof(uint64_t);
	if (l1_table_size % 4096 > 0) {
		l1_table_size = ((l1_table_size >> 12) + 1) << 12;
	}
	ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
	if (ret != 0) goto fail;

	memset(s->l1_table, 0x00, l1_table_size);

	DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
		(long long)s->l1_table_offset,
		(int) (s->l1_size * sizeof(uint64_t)),
		l1_table_size);

	lseek(fd, s->l1_table_offset, SEEK_SET);
	if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
		goto fail;

	for (i = 0; i < s->l1_size; i++) {
		//be64_to_cpus(&s->l1_table[i]);
		//DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
		if (s->l1_table[i] > final_cluster)
			final_cluster = s->l1_table[i];
	}

	/* alloc L2 cache */
	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
	if (ret != 0) goto fail;

	size = s->cluster_size;
	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
	if (ret != 0) goto fail;

	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
	if (ret != 0) goto fail;
	s->cluster_cache_offset = -1;

	if (s->backing_file_offset != 0)
		s->cluster_alloc = 1; /*Cannot use pre-alloc*/

	bs->sector_size = 512;
	bs->info = 0;

	/*Detect min_cluster_alloc*/
	s->min_cluster_alloc = 1; /*Default*/
	if (s->backing_file_offset == 0 && s->l1_table_offset % 4096 == 0) {
		/*We test to see if the xen magic # exists*/
		exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
		be32_to_cpus(&exthdr->xmagic);
		if (exthdr->xmagic != XEN_MAGIC)
			goto end_xenhdr;

		/*Finally check the L1 table cksum*/
		be32_to_cpus(&exthdr->cksum);
		cksum = gen_cksum((char *)s->l1_table,
				  s->l1_size * sizeof(uint64_t));
		if (exthdr->cksum != cksum)
			goto end_xenhdr;

		be32_to_cpus(&exthdr->min_cluster_alloc);
		be32_to_cpus(&exthdr->flags);
		s->sparse = (exthdr->flags & SPARSE_FILE);
		s->min_cluster_alloc = exthdr->min_cluster_alloc;
	}

 end_xenhdr:
	if (init_aio_state(dd) != 0) {
		DPRINTF("Unable to initialise AIO state\n");
		goto fail;
	}
	init_fds(dd);
	s->fd_end = (final_cluster == 0 ? (s->l1_table_offset + l1_table_size) :
		     (final_cluster + s->cluster_size));

	return 0;

 fail:
	DPRINTF("QCOW Open failed\n");
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(fd);
	return -1;
}

int tdqcow_queue_read(struct disk_driver *dd, uint64_t sector,
		      int nb_sectors, char *buf, td_callback_t cb,
		      int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	int ret = 0, index_in_cluster, n, i, rsp = 0;
	uint64_t cluster_offset, sec, nr_secs;

	sec     = sector;
	nr_secs = nb_sectors;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		cluster_offset =
			get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

		if (!cluster_offset) {
			aio_unlock(s, sector);
			ret = cb(dd, BLK_NOT_ALLOCATED,
				 sector, n, id, private);
			if (ret == -EBUSY) {
				/* mark remainder of request
				 * as busy and try again later */
				return cb(dd, -EBUSY, sector + n,
					  nb_sectors - n, id, private);
			} else
				rsp += ret;
		} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
			aio_unlock(s, sector);
			if (decompress_cluster(s, cluster_offset) < 0) {
				rsp += cb(dd, -EIO, sector,
					  nb_sectors, id, private);
				goto done;
			}
			memcpy(buf, s->cluster_cache + index_in_cluster * 512,
			       512 * n);
			rsp += cb(dd, 0, sector, n, id, private);
		} else {
			async_read(s, n * 512,
				   (cluster_offset + index_in_cluster * 512),
				   buf, cb, id, sector, private);
		}
		nb_sectors -= n;
		sector += n;
		buf += n * 512;
	}
done:
	return rsp;
}

int tdqcow_queue_write(struct disk_driver *dd, uint64_t sector,
		       int nb_sectors, char *buf, td_callback_t cb,
		       int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	int ret = 0, index_in_cluster, n, i;
	uint64_t cluster_offset, sec, nr_secs;

	sec     = sector;
	nr_secs = nb_sectors;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

		cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
						    index_in_cluster,
						    index_in_cluster + n);
		if (!cluster_offset) {
			DPRINTF("Ooops, no write cluster offset!\n");
			return cb(dd, -EIO, sector, nb_sectors, id, private);
		}

		if (s->crypt_method) {
			encrypt_sectors(s, sector, s->cluster_data,
					(unsigned char *)buf, n, 1,
					&s->aes_encrypt_key);
			async_write(s, n * 512,
				    (cluster_offset + index_in_cluster*512),
				    (char *)s->cluster_data, cb, id, sector,
				    private);
		} else {
			async_write(s, n * 512,
				    (cluster_offset + index_in_cluster*512),
				    buf, cb, id, sector, private);
		}

		nb_sectors -= n;
		sector += n;
		buf += n * 512;
	}
	s->cluster_cache_offset = -1; /* disable compressed cache */

	return 0;
}

int tdqcow_submit(struct disk_driver *dd)
{
	int ret;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (!prv->iocb_queued)
		return 0;

	ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);

	/* XXX: TODO: Handle error conditions here. */

	/* Success case: */
	prv->iocb_queued = 0;

	return 0;
}

int tdqcow_close(struct disk_driver *dd)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	uint32_t cksum, out;
	int fd, offset;

	/*Update the hdr cksum*/
	if (s->min_cluster_alloc == s->l2_size) {
		cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
		printf("Writing cksum: %d", cksum);
		fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
		offset = sizeof(QCowHeader) + sizeof(uint32_t);
		lseek(fd, offset, SEEK_SET);
		out = cpu_to_be32(cksum);
		write(fd, &out, sizeof(uint32_t));
		close(fd);
	}

	io_destroy(s->aio_ctx);
	free(s->name);
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(s->fd);
	return 0;
}

int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
{
	int ret, i, rsp = 0;
	struct io_event *ep;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (sid > MAX_IOFD) return 1;

	/* Non-blocking test for completed io. */
	ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
			   NULL);

	for (ep = prv->aio_events, i = ret; i-- > 0; ep++) {
		struct iocb        *io  = ep->obj;
		struct pending_aio *pio;

		pio = &prv->pending_aio[(long)io->data];

		aio_unlock(prv, pio->sector);

		if (prv->crypt_method)
			encrypt_sectors(prv, pio->sector,
					(unsigned char *)pio->buf,
					(unsigned char *)pio->buf,
					pio->nb_sectors, 0,
					&prv->aes_decrypt_key);

		rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
			       pio->sector, pio->nb_sectors,
			       pio->id, pio->private);

		prv->iocb_free[prv->iocb_free_count++] = io;
	}
	return rsp;
}

int qcow_create(const char *filename, uint64_t total_size,
		const char *backing_file, int sparse)
{
	int fd, header_size, backing_filename_len, l1_size, i;
	int shift, length, adjust, flags = 0, ret = 0;
	QCowHeader header;
	QCowHeader_ext exthdr;
	char backing_filename[1024], *ptr;
	uint64_t tmp, size, total_length;
	struct stat st;

	DPRINTF("Qcow_create: size %llu\n", (long long unsigned)total_size);

	fd = open(filename,
		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
		  0644);
	if (fd < 0)
		return -1;

	memset(&header, 0, sizeof(header));
	header.magic = cpu_to_be32(QCOW_MAGIC);
	header.version = cpu_to_be32(QCOW_VERSION);

	/*Create extended header fields*/
	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);

	header_size = sizeof(header) + sizeof(QCowHeader_ext);
	backing_filename_len = 0;
	size = (total_size >> SECTOR_SHIFT);
	if (backing_file) {
		if (strcmp(backing_file, "fat:")) {
			const char *p;
			/* XXX: this is a hack: we do not attempt to
			 *check for URL like syntax */
			p = strchr(backing_file, ':');
			if (p && (p - backing_file) >= 2) {
				/* URL like but exclude "c:" like filenames */
				strncpy(backing_filename, backing_file,
					sizeof(backing_filename));
			} else {
				realpath(backing_file, backing_filename);
				if (stat(backing_filename, &st) != 0) {
					return -1;
				}
			}

			header.backing_file_offset = cpu_to_be64(header_size);
			backing_filename_len = strlen(backing_filename);
			header.backing_file_size = cpu_to_be32(
				backing_filename_len);
			header_size += backing_filename_len;

			/*Set to the backing file size*/
			if (get_filesize(backing_filename, &size, &st)) {
				return -1;
			}
			DPRINTF("Backing file size detected: %lld sectors"
				"(total %lld [%lld MB])\n",
				(long long)size,
				(long long)(size << SECTOR_SHIFT),
				(long long)(size >> 11));
		} else {
			backing_file = NULL;
			DPRINTF("Setting file size: %lld (total %lld)\n",
				(long long) total_size,
				(long long) (total_size << SECTOR_SHIFT));
		}
		header.mtime = cpu_to_be32(st.st_mtime);
		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
					    unmodified sectors */
		header.l2_bits = 12; /* 32 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1);
	} else {
		DPRINTF("Setting file size: %lld sectors"
			"(total %lld [%lld MB])\n",
			(long long) size,
			(long long) (size << SECTOR_SHIFT),
			(long long) (size >> 11));
		header.cluster_bits = 12; /* 4 KB clusters */
		header.l2_bits = 9; /* 4 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
	}

	/*Set the header size value*/
	header.size = cpu_to_be64(size * 512);

	header_size = (header_size + 7) & ~7;
	if (header_size % 4096 > 0) {
		header_size = ((header_size >> 12) + 1) << 12;
	}

	shift = header.cluster_bits + header.l2_bits;
	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;

	header.l1_table_offset = cpu_to_be64(header_size);
	DPRINTF("L1 Table offset: %d, size %d\n",
		header_size,
		(int)(l1_size * sizeof(uint64_t)));
	header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);

	ptr = calloc(1, l1_size * sizeof(uint64_t));
	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
	printf("Created cksum: %d\n", exthdr.cksum);
	free(ptr);

	/*adjust file length to system page size boundary*/
	length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
			 getpagesize());
	if (qtruncate(fd, length, 0) != 0) {
		DPRINTF("ERROR truncating file\n");
		return -1;
	}

	if (sparse == 0) {
		/*Filesize is length + l1_size * (1 << s->l2_bits) + (size * 512)*/
		total_length = length + (l1_size * (1 << 9)) + (size * 512);
		if (qtruncate(fd, total_length, 0) != 0) {
			DPRINTF("ERROR truncating file\n");
			return -1;
		}
		printf("File truncated to length %"PRIu64"\n", total_length);
	} else
		flags = SPARSE_FILE;

	exthdr.flags = cpu_to_be32(flags);

	/* write all the data */
	lseek(fd, 0, SEEK_SET);
	ret += write(fd, &header, sizeof(header));
	ret += write(fd, &exthdr, sizeof(exthdr));
	if (backing_file)
		ret += write(fd, backing_filename, backing_filename_len);

	lseek(fd, header_size, SEEK_SET);
	tmp = 0;
	for (i = 0; i < l1_size; i++) {
		ret += write(fd, &tmp, sizeof(tmp));
	}

	close(fd);

	return 0;
}
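
#if 0
/*
 * Usage sketch (illustrative, not part of the original source): create
 * a sparse 1 GB standalone image at a hypothetical path. total_size is
 * in bytes; a nonzero 'sparse' sets the SPARSE_FILE flag in the
 * extended header instead of preallocating the data area.
 */
static void example_create(void)
{
	qcow_create("/tmp/disk.qcow", (uint64_t)1 << 30, NULL, 1);
}
#endif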

int qcow_make_empty(struct tdqcow_state *s)
{
	uint32_t l1_length = s->l1_size * sizeof(uint64_t);

	memset(s->l1_table, 0, l1_length);
	lseek(s->fd, s->l1_table_offset, SEEK_SET);
	if (write(s->fd, s->l1_table, l1_length) < 0)
		return -1;
	if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse) != 0) {
		DPRINTF("ERROR truncating file\n");
		return -1;
	}

	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));

	return 0;
}

int qcow_get_cluster_size(struct tdqcow_state *s)
{
	return s->cluster_size;
}

/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
			  const uint8_t *buf)
{
	z_stream strm;
	int ret, out_len;
	uint8_t *out_buf;
	uint64_t cluster_offset;

	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!out_buf)
		return -1;

	/* best compression, small window, no zlib header */
	memset(&strm, 0, sizeof(strm));
	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			   Z_DEFLATED, -12,
			   9, Z_DEFAULT_STRATEGY);
	if (ret != 0) {
		free(out_buf);
		return -1;
	}

	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	strm.avail_out = s->cluster_size;
	strm.next_out = out_buf;

	ret = deflate(&strm, Z_FINISH);
	if (ret != Z_STREAM_END && ret != Z_OK) {
		free(out_buf);
		deflateEnd(&strm);
		return -1;
	}
	out_len = strm.next_out - out_buf;

	deflateEnd(&strm);

	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
						    out_len, 0, 0);
		cluster_offset &= s->cluster_offset_mask;
		lseek(s->fd, cluster_offset, SEEK_SET);
		if (write(s->fd, out_buf, out_len) != out_len) {
			free(out_buf);
			return -1;
		}
	}

	free(out_buf);
	return 0;
}

int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
{
	off_t off;
	char *buf, *filename;
	int len, secs, err = -EINVAL;
	struct tdqcow_state *child = (struct tdqcow_state *)dd->private;

	if (!child->backing_file_offset)
		return TD_NO_PARENT;

	/* read the backing file name */
	len  = child->backing_file_size;
	off  = child->backing_file_offset - (child->backing_file_offset % 512);
	secs = (len + (child->backing_file_offset - off) + 511) >> 9;

	if (posix_memalign((void **)&buf, 512, secs << 9))
		return -1;

	if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
		goto out;

	if (read(child->fd, buf, secs << 9) != secs << 9)
		goto out;
	filename      = buf + (child->backing_file_offset - off);
	filename[len] = '\0';

	id->name       = strdup(filename);
	id->drivertype = DISK_TYPE_QCOW;
	err            = 0;
 out:
	free(buf);
	return err;
}

int tdqcow_validate_parent(struct disk_driver *child,
			   struct disk_driver *parent, td_flag_t flags)
{
	struct stat stats;
	uint64_t psize, csize;
	struct tdqcow_state *c = (struct tdqcow_state *)child->private;
	struct tdqcow_state *p = (struct tdqcow_state *)parent->private;

	if (stat(p->name, &stats))
		return -EINVAL;
	if (get_filesize(p->name, &psize, &stats))
		return -EINVAL;

	if (stat(c->name, &stats))
		return -EINVAL;
	if (get_filesize(c->name, &csize, &stats))
		return -EINVAL;

	if (csize != psize)
		return -EINVAL;

	return 0;
}

struct tap_disk tapdisk_qcow = {
	.disk_type          = "tapdisk_qcow",
	.private_data_size  = sizeof(struct tdqcow_state),
	.td_open            = tdqcow_open,
	.td_queue_read      = tdqcow_queue_read,
	.td_queue_write     = tdqcow_queue_write,
	.td_submit          = tdqcow_submit,
	.td_close           = tdqcow_close,
	.td_do_callbacks    = tdqcow_do_callbacks,
	.td_get_parent_id   = tdqcow_get_parent_id,
	.td_validate_parent = tdqcow_validate_parent
};
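
#if 0
/*
 * Usage sketch (illustrative only, not part of the driver): roughly
 * how tapdisk exercises the operations above. The disk_driver setup
 * and callback bookkeeping are abbreviated, and 'read_done' is a
 * hypothetical completion callback.
 */
static int read_done(struct disk_driver *dd, int res, uint64_t sector,
		     int nb_sectors, int id, void *private)
{
	DPRINTF("read of %d sectors at %llu completed: res %d\n",
		nb_sectors, (unsigned long long)sector, res);
	return 0;
}

static void example_io(struct disk_driver *dd, char *buf)
{
	/* queue an 8-sector (4 KB) read at sector 0 ... */
	tapdisk_qcow.td_queue_read(dd, 0, 8, buf, read_done, 0, NULL);
	/* ... batch the queued iocbs into the kernel via io_submit ... */
	tapdisk_qcow.td_submit(dd);
	/* ... and reap completions when the poll fd becomes readable */
	tapdisk_qcow.td_do_callbacks(dd, 0);
}
#endif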