ia64/xen-unstable

view tools/blktap/drivers/block-qcow.c @ 18654:0a09de68c541

blktap: Handle qcow backing files correctly.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Oct 20 15:08:24 2008 +0100 (2008-10-20)
parents 86e3027df9d9
children 008505c3c65a
line source
1 /* block-qcow.c
2 *
3 * Asynchronous Qemu copy-on-write disk implementation.
4 * Code based on the Qemu implementation
5 * (see copyright notice below)
6 *
7 * (c) 2006 Andrew Warfield and Julian Chesterfield
8 *
9 */
11 /*
12 * Block driver for the QCOW format
13 *
14 * Copyright (c) 2004 Fabrice Bellard
15 *
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this software and associated documentation files(the "Software"), to deal
18 * in the Software without restriction, including without limitation the rights
19 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
20 * copies of the Software, and to permit persons to whom the Software is
21 * furnished to do so, subject to the following conditions:
22 */
24 #include <errno.h>
25 #include <fcntl.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <unistd.h>
29 #include <sys/statvfs.h>
30 #include <sys/stat.h>
31 #include <sys/ioctl.h>
32 #include <string.h>
33 #include <zlib.h>
34 #include <inttypes.h>
35 #include <libaio.h>
36 #include "bswap.h"
37 #include "aes.h"
38 #include "tapdisk.h"
39 #include "tapaio.h"
40 #include "blk.h"
42 /* *BSD has no O_LARGEFILE */
43 #ifndef O_LARGEFILE
44 #define O_LARGEFILE 0
45 #endif
#if 1
/*
 * Crash-on-failure assertion: log the failed predicate, then deliberately
 * dereference NULL so the process dies at the fault site.  Wrapped in
 * do { } while (0) so it behaves as a single statement (safe in
 * unbraced if/else bodies, unlike the bare-if form).
 */
#define ASSERT(_p)							\
	do {								\
		if (!(_p)) {						\
			DPRINTF("Assertion '%s' failed, line %d, file %s", \
				#_p, __LINE__, __FILE__);		\
			*(int*)0 = 0; /* intentional crash */		\
		}							\
	} while (0)
#else
#define ASSERT(_p) ((void)0)
#endif

/*
 * Round 'l' up to the nearest multiple of 's'.  All macro arguments are
 * parenthesised so expressions like ROUNDUP(a + b, c) expand correctly.
 */
#define ROUNDUP(l, s)							\
({									\
	(uint64_t)(((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s)));	\
})

#undef IOCB_IDX
/* Index of iocb '_io' within state '_s''s iocb_list array. */
#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

/* Identity on the byte value; used to force a read of '_b'. */
#define ZERO_TEST(_b) ((_b) | 0x00)
66 /**************************************************************/
67 /* QEMU COW block driver with compression and encryption support */
69 #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
70 #define XEN_MAGIC (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
71 #define QCOW_VERSION 1
73 #define QCOW_CRYPT_NONE 0x00
74 #define QCOW_CRYPT_AES 0x01
76 #define QCOW_OFLAG_COMPRESSED (1LL << 63)
77 #define SPARSE_FILE 0x01
78 #define EXTHDR_L1_BIG_ENDIAN 0x02
80 #ifndef O_BINARY
81 #define O_BINARY 0
82 #endif
/*
 * On-disk QCOW (version 1) header.  All multi-byte fields are stored
 * big endian in the file; field order is part of the format and must
 * not be changed.
 */
typedef struct QCowHeader {
	uint32_t magic;			/* QCOW_MAGIC */
	uint32_t version;		/* QCOW_VERSION (1) */
	uint64_t backing_file_offset;	/* file offset of backing path, 0 if none */
	uint32_t backing_file_size;	/* length of backing path in bytes */
	uint32_t mtime;
	uint64_t size;			/* virtual disk size in bytes */
	uint8_t  cluster_bits;		/* log2 of cluster size */
	uint8_t  l2_bits;		/* log2 of L2 table entries */
	uint32_t crypt_method;		/* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
	uint64_t l1_table_offset;	/* file offset of the L1 table */
} QCowHeader;
/*
 * Xen extension header, stored immediately after QCowHeader for images
 * created by blktap.  Presence is signalled by xmagic == XEN_MAGIC.
 */
typedef struct QCowHeader_ext {
	uint32_t xmagic;		/* XEN_MAGIC */
	uint32_t cksum;			/* checksum over the L1 table */
	uint32_t min_cluster_alloc;	/* historical extent-allocation unit */
	uint32_t flags;			/* SPARSE_FILE | EXTHDR_L1_BIG_ENDIAN */
} QCowHeader_ext;
105 #define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/
107 struct tdqcow_state {
108 int fd; /*Main Qcow file descriptor */
109 uint64_t fd_end; /*Store a local record of file length */
110 char *name; /*Record of the filename*/
111 uint32_t backing_file_size;
112 uint64_t backing_file_offset;
113 int encrypted; /*File contents are encrypted or plain*/
114 int cluster_bits; /*Determines length of cluster as
115 *indicated by file hdr*/
116 int cluster_size; /*Length of cluster*/
117 int cluster_sectors; /*Number of sectors per cluster*/
118 int cluster_alloc; /*Blktap fix for allocating full
119 *extents*/
120 int min_cluster_alloc; /*Blktap historical extent alloc*/
121 int sparse; /*Indicates whether to preserve sparseness*/
122 int l2_bits; /*Size of L2 table entry*/
123 int l2_size; /*Full table size*/
124 int l1_size; /*L1 table size*/
125 uint64_t cluster_offset_mask;
126 uint64_t l1_table_offset; /*L1 table offset from beginning of
127 *file*/
128 uint64_t *l1_table; /*L1 table entries*/
129 uint64_t *l2_cache; /*We maintain a cache of size
130 *L2_CACHE_SIZE of most read entries*/
131 uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/
132 uint32_t l2_cache_counts[L2_CACHE_SIZE]; /*Cache access record*/
133 uint8_t *cluster_cache;
134 uint8_t *cluster_data;
135 uint64_t cluster_cache_offset; /**/
136 uint32_t crypt_method; /*current crypt method, 0 if no
137 *key yet */
138 uint32_t crypt_method_header; /**/
139 AES_KEY aes_encrypt_key; /*AES key*/
140 AES_KEY aes_decrypt_key; /*AES key*/
142 /* libaio state */
143 tap_aio_context_t aio;
144 };
146 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
148 #ifdef USE_GCRYPT
150 #include <gcrypt.h>
152 static uint32_t gen_cksum(char *ptr, int len)
153 {
154 int i;
155 uint32_t md[4];
157 /* Convert L1 table to big endian */
158 for(i = 0; i < len / sizeof(uint64_t); i++) {
159 cpu_to_be64s(&((uint64_t*) ptr)[i]);
160 }
162 /* Generate checksum */
163 gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
165 /* Convert L1 table back to native endianess */
166 for(i = 0; i < len / sizeof(uint64_t); i++) {
167 be64_to_cpus(&((uint64_t*) ptr)[i]);
168 }
170 return md[0];
171 }
173 #else /* use libcrypto */
175 #include <openssl/md5.h>
177 static uint32_t gen_cksum(char *ptr, int len)
178 {
179 int i;
180 unsigned char *md;
181 uint32_t ret;
183 md = malloc(MD5_DIGEST_LENGTH);
184 if(!md) return 0;
186 /* Convert L1 table to big endian */
187 for(i = 0; i < len / sizeof(uint64_t); i++) {
188 cpu_to_be64s(&((uint64_t*) ptr)[i]);
189 }
191 /* Generate checksum */
192 if (MD5((unsigned char *)ptr, len, md) != md)
193 ret = 0;
194 else
195 memcpy(&ret, md, sizeof(uint32_t));
197 /* Convert L1 table back to native endianess */
198 for(i = 0; i < len / sizeof(uint64_t); i++) {
199 be64_to_cpus(&((uint64_t*) ptr)[i]);
200 }
202 free(md);
203 return ret;
204 }
206 #endif
208 static int get_filesize(char *filename, uint64_t *size, struct stat *st)
209 {
210 int fd;
211 QCowHeader header;
213 /*Set to the backing file size*/
214 fd = open(filename, O_RDONLY);
215 if (fd < 0)
216 return -1;
217 if (read(fd, &header, sizeof(header)) < sizeof(header)) {
218 close(fd);
219 return -1;
220 }
221 close(fd);
223 be32_to_cpus(&header.magic);
224 be64_to_cpus(&header.size);
225 if (header.magic == QCOW_MAGIC) {
226 *size = header.size >> SECTOR_SHIFT;
227 return 0;
228 }
230 if(S_ISBLK(st->st_mode)) {
231 fd = open(filename, O_RDONLY);
232 if (fd < 0)
233 return -1;
234 if (blk_getimagesize(fd, size) != 0) {
235 close(fd);
236 return -1;
237 }
238 close(fd);
239 } else *size = (st->st_size >> SECTOR_SHIFT);
240 return 0;
241 }
243 static int qcow_set_key(struct tdqcow_state *s, const char *key)
244 {
245 uint8_t keybuf[16];
246 int len, i;
248 memset(keybuf, 0, 16);
249 len = strlen(key);
250 if (len > 16)
251 len = 16;
252 /* XXX: we could compress the chars to 7 bits to increase
253 entropy */
254 for (i = 0; i < len; i++) {
255 keybuf[i] = key[i];
256 }
257 s->crypt_method = s->crypt_method_header;
259 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
260 return -1;
261 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
262 return -1;
263 #if 0
264 /* test */
265 {
266 uint8_t in[16];
267 uint8_t out[16];
268 uint8_t tmp[16];
269 for (i=0; i<16; i++)
270 in[i] = i;
271 AES_encrypt(in, tmp, &s->aes_encrypt_key);
272 AES_decrypt(tmp, out, &s->aes_decrypt_key);
273 for (i = 0; i < 16; i++)
274 DPRINTF(" %02x", tmp[i]);
275 DPRINTF("\n");
276 for (i = 0; i < 16; i++)
277 DPRINTF(" %02x", out[i]);
278 DPRINTF("\n");
279 }
280 #endif
281 return 0;
282 }
284 /*
285 * The crypt function is compatible with the linux cryptoloop
286 * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
287 * supported .
288 */
289 static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
290 uint8_t *out_buf, const uint8_t *in_buf,
291 int nb_sectors, int enc,
292 const AES_KEY *key)
293 {
294 union {
295 uint64_t ll[2];
296 uint8_t b[16];
297 } ivec;
298 int i;
300 for (i = 0; i < nb_sectors; i++) {
301 ivec.ll[0] = cpu_to_le64(sector_num);
302 ivec.ll[1] = 0;
303 AES_cbc_encrypt(in_buf, out_buf, 512, key,
304 ivec.b, enc);
305 sector_num++;
306 in_buf += 512;
307 out_buf += 512;
308 }
309 }
311 static int qtruncate(int fd, off_t length, int sparse)
312 {
313 int ret, i;
314 int current = 0, rem = 0;
315 uint64_t sectors;
316 struct stat st;
317 char *buf;
319 /* If length is greater than the current file len
320 * we synchronously write zeroes to the end of the
321 * file, otherwise we truncate the length down
322 */
323 ret = fstat(fd, &st);
324 if (ret == -1)
325 return -1;
326 if (S_ISBLK(st.st_mode))
327 return 0;
329 sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
330 current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
331 rem = st.st_size % DEFAULT_SECTOR_SIZE;
333 /* If we are extending this file, we write zeros to the end --
334 * this tries to ensure that the extents allocated wind up being
335 * contiguous on disk.
336 */
337 if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
338 /*We are extending the file*/
339 if ((ret = posix_memalign((void **)&buf,
340 512, DEFAULT_SECTOR_SIZE))) {
341 DPRINTF("posix_memalign failed: %d\n", ret);
342 return -1;
343 }
344 memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
345 if (lseek(fd, 0, SEEK_END)==-1) {
346 DPRINTF("Lseek EOF failed (%d), internal error\n",
347 errno);
348 free(buf);
349 return -1;
350 }
351 if (rem) {
352 ret = write(fd, buf, rem);
353 if (ret != rem) {
354 DPRINTF("write failed: ret = %d, err = %s\n",
355 ret, strerror(errno));
356 free(buf);
357 return -1;
358 }
359 }
360 for (i = current; i < sectors; i++ ) {
361 ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
362 if (ret != DEFAULT_SECTOR_SIZE) {
363 DPRINTF("write failed: ret = %d, err = %s\n",
364 ret, strerror(errno));
365 free(buf);
366 return -1;
367 }
368 }
369 free(buf);
370 } else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
371 if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
372 DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
373 return -1;
374 }
375 return 0;
376 }
379 /* 'allocate' is:
380 *
381 * 0 to not allocate.
382 *
383 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
384 * 'n_end')
385 *
386 * 2 to allocate a compressed cluster of size
387 * 'compressed_size'. 'compressed_size' must be > 0 and <
388 * cluster_size
389 *
390 * return 0 if not allocated.
391 */
392 static uint64_t get_cluster_offset(struct tdqcow_state *s,
393 uint64_t offset, int allocate,
394 int compressed_size,
395 int n_start, int n_end)
396 {
397 int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
398 char *tmp_ptr2, *l2_ptr, *l1_ptr;
399 uint64_t *tmp_ptr;
400 uint64_t l2_offset, *l2_table, cluster_offset, tmp;
401 uint32_t min_count;
402 int new_l2_table;
404 /*Check L1 table for the extent offset*/
405 l1_index = offset >> (s->l2_bits + s->cluster_bits);
406 l2_offset = s->l1_table[l1_index];
407 new_l2_table = 0;
408 if (!l2_offset) {
409 if (!allocate)
410 return 0;
411 /*
412 * allocating a new l2 entry + extent
413 * at the end of the file, we must also
414 * update the L1 entry safely.
415 */
416 l2_offset = s->fd_end;
418 /* round to cluster size */
419 l2_offset = (l2_offset + s->cluster_size - 1)
420 & ~(s->cluster_size - 1);
422 /* update the L1 entry */
423 s->l1_table[l1_index] = l2_offset;
424 tmp = cpu_to_be64(l2_offset);
426 /*Truncate file for L2 table
427 *(initialised to zero in case we crash)*/
428 if (qtruncate(s->fd,
429 l2_offset + (s->l2_size * sizeof(uint64_t)),
430 s->sparse) != 0) {
431 DPRINTF("ERROR truncating file\n");
432 return 0;
433 }
434 s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
436 /*Update the L1 table entry on disk
437 * (for O_DIRECT we write 4KByte blocks)*/
438 l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
439 l1_ptr = (char *)s->l1_table + (l1_sector << 12);
441 if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
442 DPRINTF("ERROR allocating memory for L1 table\n");
443 }
444 memcpy(tmp_ptr, l1_ptr, 4096);
446 /* Convert block to write to big endian */
447 for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
448 cpu_to_be64s(&tmp_ptr[i]);
449 }
451 /*
452 * Issue non-asynchronous L1 write.
453 * For safety, we must ensure that
454 * entry is written before blocks.
455 */
456 lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
457 if (write(s->fd, tmp_ptr, 4096) != 4096) {
458 free(tmp_ptr);
459 return 0;
460 }
461 free(tmp_ptr);
463 new_l2_table = 1;
464 goto cache_miss;
465 } else if (s->min_cluster_alloc == s->l2_size) {
466 /*Fast-track the request*/
467 cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
468 l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
469 return cluster_offset + (l2_index * s->cluster_size);
470 }
472 /*Check to see if L2 entry is already cached*/
473 for (i = 0; i < L2_CACHE_SIZE; i++) {
474 if (l2_offset == s->l2_cache_offsets[i]) {
475 /* increment the hit count */
476 if (++s->l2_cache_counts[i] == 0xffffffff) {
477 for (j = 0; j < L2_CACHE_SIZE; j++) {
478 s->l2_cache_counts[j] >>= 1;
479 }
480 }
481 l2_table = s->l2_cache + (i << s->l2_bits);
482 goto found;
483 }
484 }
486 cache_miss:
487 /* not found: load a new entry in the least used one */
488 min_index = 0;
489 min_count = 0xffffffff;
490 for (i = 0; i < L2_CACHE_SIZE; i++) {
491 if (s->l2_cache_counts[i] < min_count) {
492 min_count = s->l2_cache_counts[i];
493 min_index = i;
494 }
495 }
496 l2_table = s->l2_cache + (min_index << s->l2_bits);
498 /*If extent pre-allocated, read table from disk,
499 *otherwise write new table to disk*/
500 if (new_l2_table) {
501 /*Should we allocate the whole extent? Adjustable parameter.*/
502 if (s->cluster_alloc == s->l2_size) {
503 cluster_offset = l2_offset +
504 (s->l2_size * sizeof(uint64_t));
505 cluster_offset = (cluster_offset + s->cluster_size - 1)
506 & ~(s->cluster_size - 1);
507 if (qtruncate(s->fd, cluster_offset +
508 (s->cluster_size * s->l2_size),
509 s->sparse) != 0) {
510 DPRINTF("ERROR truncating file\n");
511 return 0;
512 }
513 s->fd_end = cluster_offset +
514 (s->cluster_size * s->l2_size);
515 for (i = 0; i < s->l2_size; i++) {
516 l2_table[i] = cpu_to_be64(cluster_offset +
517 (i*s->cluster_size));
518 }
519 } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
521 lseek(s->fd, l2_offset, SEEK_SET);
522 if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
523 s->l2_size * sizeof(uint64_t))
524 return 0;
525 } else {
526 lseek(s->fd, l2_offset, SEEK_SET);
527 if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
528 s->l2_size * sizeof(uint64_t))
529 return 0;
530 }
532 /*Update the cache entries*/
533 s->l2_cache_offsets[min_index] = l2_offset;
534 s->l2_cache_counts[min_index] = 1;
536 found:
537 /*The extent is split into 's->l2_size' blocks of
538 *size 's->cluster_size'*/
539 l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
540 cluster_offset = be64_to_cpu(l2_table[l2_index]);
542 if (!cluster_offset ||
543 ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
544 if (!allocate)
545 return 0;
547 if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
548 (n_end - n_start) < s->cluster_sectors) {
549 /* cluster is already allocated but compressed, we must
550 decompress it in the case it is not completely
551 overwritten */
552 if (decompress_cluster(s, cluster_offset) < 0)
553 return 0;
554 cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
555 cluster_offset = (cluster_offset + s->cluster_size - 1)
556 & ~(s->cluster_size - 1);
557 /* write the cluster content - not asynchronous */
558 lseek(s->fd, cluster_offset, SEEK_SET);
559 if (write(s->fd, s->cluster_cache, s->cluster_size) !=
560 s->cluster_size)
561 return -1;
562 } else {
563 /* allocate a new cluster */
564 cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
565 if (allocate == 1) {
566 /* round to cluster size */
567 cluster_offset =
568 (cluster_offset + s->cluster_size - 1)
569 & ~(s->cluster_size - 1);
570 if (qtruncate(s->fd, cluster_offset +
571 s->cluster_size, s->sparse)!=0) {
572 DPRINTF("ERROR truncating file\n");
573 return 0;
574 }
575 s->fd_end = (cluster_offset + s->cluster_size);
576 /* if encrypted, we must initialize the cluster
577 content which won't be written */
578 if (s->crypt_method &&
579 (n_end - n_start) < s->cluster_sectors) {
580 uint64_t start_sect;
581 start_sect = (offset &
582 ~(s->cluster_size - 1))
583 >> 9;
584 memset(s->cluster_data + 512,
585 0xaa, 512);
586 for (i = 0; i < s->cluster_sectors;i++)
587 {
588 if (i < n_start || i >= n_end)
589 {
590 encrypt_sectors(s, start_sect + i,
591 s->cluster_data,
592 s->cluster_data + 512, 1, 1,
593 &s->aes_encrypt_key);
594 lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
595 if (write(s->fd, s->cluster_data, 512) != 512)
596 return -1;
597 }
598 }
599 }
600 } else {
601 cluster_offset |= QCOW_OFLAG_COMPRESSED |
602 (uint64_t)compressed_size
603 << (63 - s->cluster_bits);
604 }
605 }
606 /* update L2 table */
607 tmp = cpu_to_be64(cluster_offset);
608 l2_table[l2_index] = tmp;
610 /*For IO_DIRECT we write 4KByte blocks*/
611 l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
612 l2_ptr = (char *)l2_table + (l2_sector << 12);
614 if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
615 DPRINTF("ERROR allocating memory for L1 table\n");
616 }
617 memcpy(tmp_ptr2, l2_ptr, 4096);
618 lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
619 if (write(s->fd, tmp_ptr2, 4096) != 4096) {
620 free(tmp_ptr2);
621 return -1;
622 }
623 free(tmp_ptr2);
624 }
625 return cluster_offset;
626 }
628 static void init_cluster_cache(struct disk_driver *dd)
629 {
630 struct td_state *bs = dd->td_state;
631 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
632 uint32_t count = 0;
633 int i, cluster_entries;
635 cluster_entries = s->cluster_size / 512;
636 DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
637 cluster_entries, s->cluster_size);
639 for (i = 0; i < bs->size; i += cluster_entries) {
640 if (get_cluster_offset(s, i << 9, 0, 0, 0, 1)) count++;
641 if (count >= L2_CACHE_SIZE) return;
642 }
643 DPRINTF("Finished cluster initialisation, added %d entries\n", count);
644 return;
645 }
647 static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
648 int nb_sectors, int *pnum)
649 {
650 int index_in_cluster, n;
651 uint64_t cluster_offset;
653 cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
654 index_in_cluster = sector_num & (s->cluster_sectors - 1);
655 n = s->cluster_sectors - index_in_cluster;
656 if (n > nb_sectors)
657 n = nb_sectors;
658 *pnum = n;
659 return (cluster_offset != 0);
660 }
662 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
663 const uint8_t *buf, int buf_size)
664 {
665 z_stream strm1, *strm = &strm1;
666 int ret, out_len;
668 memset(strm, 0, sizeof(*strm));
670 strm->next_in = (uint8_t *)buf;
671 strm->avail_in = buf_size;
672 strm->next_out = out_buf;
673 strm->avail_out = out_buf_size;
675 ret = inflateInit2(strm, -12);
676 if (ret != Z_OK)
677 return -1;
678 ret = inflate(strm, Z_FINISH);
679 out_len = strm->next_out - out_buf;
680 if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
681 (out_len != out_buf_size) ) {
682 inflateEnd(strm);
683 return -1;
684 }
685 inflateEnd(strm);
686 return 0;
687 }
689 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
690 {
691 int ret, csize;
692 uint64_t coffset;
694 coffset = cluster_offset & s->cluster_offset_mask;
695 if (s->cluster_cache_offset != coffset) {
696 csize = cluster_offset >> (63 - s->cluster_bits);
697 csize &= (s->cluster_size - 1);
698 lseek(s->fd, coffset, SEEK_SET);
699 ret = read(s->fd, s->cluster_data, csize);
700 if (ret != csize)
701 return -1;
702 if (decompress_buffer(s->cluster_cache, s->cluster_size,
703 s->cluster_data, csize) < 0) {
704 return -1;
705 }
706 s->cluster_cache_offset = coffset;
707 }
708 return 0;
709 }
711 static inline void init_fds(struct disk_driver *dd)
712 {
713 int i;
714 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
716 for(i = 0; i < MAX_IOFD; i++)
717 dd->io_fd[i] = 0;
719 dd->io_fd[0] = s->aio.aio_ctx.pollfd;
720 }
722 /* Open the disk file and initialize qcow state. */
723 static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
724 {
725 int fd, len, i, shift, ret, size, l1_table_size, o_flags;
726 int max_aio_reqs;
727 struct td_state *bs = dd->td_state;
728 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
729 char *buf;
730 QCowHeader *header;
731 QCowHeader_ext *exthdr;
732 uint32_t cksum;
733 uint64_t final_cluster = 0;
735 DPRINTF("QCOW: Opening %s\n",name);
737 /* Since we don't handle O_DIRECT correctly, don't use it */
738 o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
739 fd = open(name, o_flags);
740 if (fd < 0) {
741 DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
742 return -1;
743 }
745 s->fd = fd;
746 if (asprintf(&s->name,"%s", name) == -1) {
747 close(fd);
748 return -1;
749 }
751 ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);
753 ret = posix_memalign((void **)&buf, 512, 512);
754 if (ret != 0) goto fail;
756 if (read(fd, buf, 512) != 512)
757 goto fail;
759 header = (QCowHeader *)buf;
760 be32_to_cpus(&header->magic);
761 be32_to_cpus(&header->version);
762 be64_to_cpus(&header->backing_file_offset);
763 be32_to_cpus(&header->backing_file_size);
764 be32_to_cpus(&header->mtime);
765 be64_to_cpus(&header->size);
766 be32_to_cpus(&header->crypt_method);
767 be64_to_cpus(&header->l1_table_offset);
769 if (header->magic != QCOW_MAGIC)
770 goto fail;
772 switch (header->version) {
773 case QCOW_VERSION:
774 break;
775 case 2:
776 close(fd);
777 dd->drv = &tapdisk_qcow2;
778 return dd->drv->td_open(dd, name, flags);
779 default:
780 goto fail;
781 }
783 if (header->size <= 1 || header->cluster_bits < 9)
784 goto fail;
785 if (header->crypt_method > QCOW_CRYPT_AES)
786 goto fail;
787 s->crypt_method_header = header->crypt_method;
788 if (s->crypt_method_header)
789 s->encrypted = 1;
790 s->cluster_bits = header->cluster_bits;
791 s->cluster_size = 1 << s->cluster_bits;
792 s->cluster_sectors = 1 << (s->cluster_bits - 9);
793 s->l2_bits = header->l2_bits;
794 s->l2_size = 1 << s->l2_bits;
795 s->cluster_alloc = s->l2_size;
796 bs->size = header->size / 512;
797 s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
798 s->backing_file_offset = header->backing_file_offset;
799 s->backing_file_size = header->backing_file_size;
801 /* read the level 1 table */
802 shift = s->cluster_bits + s->l2_bits;
803 s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
805 s->l1_table_offset = header->l1_table_offset;
807 /*allocate a 4Kbyte multiple of memory*/
808 l1_table_size = s->l1_size * sizeof(uint64_t);
809 if (l1_table_size % 4096 > 0) {
810 l1_table_size = ((l1_table_size >> 12) + 1) << 12;
811 }
812 ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
813 if (ret != 0) goto fail;
815 memset(s->l1_table, 0x00, l1_table_size);
817 DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
818 (long long)s->l1_table_offset,
819 (int) (s->l1_size * sizeof(uint64_t)),
820 l1_table_size);
822 lseek(fd, s->l1_table_offset, SEEK_SET);
823 if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
824 goto fail;
826 for(i = 0; i < s->l1_size; i++) {
827 be64_to_cpus(&s->l1_table[i]);
828 //DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
829 if (s->l1_table[i] > final_cluster)
830 final_cluster = s->l1_table[i];
831 }
833 /* alloc L2 cache */
834 size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
835 ret = posix_memalign((void **)&s->l2_cache, 4096, size);
836 if(ret != 0) goto fail;
838 size = s->cluster_size;
839 ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
840 if(ret != 0) goto fail;
842 ret = posix_memalign((void **)&s->cluster_data, 4096, size);
843 if(ret != 0) goto fail;
844 s->cluster_cache_offset = -1;
846 if (s->backing_file_offset != 0)
847 s->cluster_alloc = 1; /*Cannot use pre-alloc*/
849 bs->sector_size = 512;
850 bs->info = 0;
852 /*Detect min_cluster_alloc*/
853 s->min_cluster_alloc = 1; /*Default*/
854 if (s->backing_file_offset == 0 && s->l1_table_offset % 4096 == 0) {
855 /*We test to see if the xen magic # exists*/
856 exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
857 be32_to_cpus(&exthdr->xmagic);
858 if(exthdr->xmagic != XEN_MAGIC)
859 goto end_xenhdr;
861 /* Try to detect old tapdisk images. They have to be fixed because
862 * they don't use big endian but native endianess for the L1 table */
863 if ((exthdr->flags & EXTHDR_L1_BIG_ENDIAN) == 0) {
865 /*
866 The image is broken. Fix it. The L1 table has already been
867 byte-swapped, so we can write it to the image file as it is
868 currently in memory. Then swap it back to native endianess
869 for operation.
870 */
872 DPRINTF("qcow: Converting image to big endian L1 table\n");
874 lseek(fd, s->l1_table_offset, SEEK_SET);
875 if (write(fd, s->l1_table, l1_table_size) != l1_table_size) {
876 DPRINTF("qcow: Failed to write new L1 table\n");
877 goto fail;
878 }
880 for(i = 0;i < s->l1_size; i++) {
881 cpu_to_be64s(&s->l1_table[i]);
882 }
884 /* Write the big endian flag to the extended header */
885 exthdr->flags |= EXTHDR_L1_BIG_ENDIAN;
887 if (write(fd, buf, 512) != 512) {
888 DPRINTF("qcow: Failed to write extended header\n");
889 goto fail;
890 }
891 }
893 /*Finally check the L1 table cksum*/
894 be32_to_cpus(&exthdr->cksum);
895 cksum = gen_cksum((char *)s->l1_table,
896 s->l1_size * sizeof(uint64_t));
897 if(exthdr->cksum != cksum)
898 goto end_xenhdr;
900 be32_to_cpus(&exthdr->min_cluster_alloc);
901 be32_to_cpus(&exthdr->flags);
902 s->sparse = (exthdr->flags & SPARSE_FILE);
903 s->min_cluster_alloc = exthdr->min_cluster_alloc;
904 }
906 end_xenhdr:
908 /* A segment (i.e. a page) can span multiple clusters */
909 max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
910 MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
912 if (tap_aio_init(&s->aio, bs->size, max_aio_reqs)!=0) {
913 DPRINTF("Unable to initialise AIO state\n");
914 tap_aio_free(&s->aio);
915 goto fail;
916 }
917 init_fds(dd);
919 if (!final_cluster)
920 s->fd_end = s->l1_table_offset + l1_table_size;
921 else {
922 s->fd_end = lseek(fd, 0, SEEK_END);
923 if (s->fd_end == (off_t)-1)
924 goto fail;
925 }
927 return 0;
929 fail:
930 DPRINTF("QCOW Open failed\n");
931 tap_aio_free(&s->aio);
932 free(s->l1_table);
933 free(s->l2_cache);
934 free(s->cluster_cache);
935 free(s->cluster_data);
936 close(fd);
937 return -1;
938 }
940 static int tdqcow_queue_read(struct disk_driver *dd, uint64_t sector,
941 int nb_sectors, char *buf, td_callback_t cb,
942 int id, void *private)
943 {
944 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
945 int ret = 0, index_in_cluster, n, i, rsp = 0;
946 uint64_t cluster_offset, sec, nr_secs;
948 sec = sector;
949 nr_secs = nb_sectors;
951 /*Check we can get a lock*/
952 for (i = 0; i < nb_sectors; i++)
953 if (!tap_aio_can_lock(&s->aio, sector + i))
954 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
956 /*We store a local record of the request*/
957 while (nb_sectors > 0) {
958 cluster_offset =
959 get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
960 index_in_cluster = sector & (s->cluster_sectors - 1);
961 n = s->cluster_sectors - index_in_cluster;
962 if (n > nb_sectors)
963 n = nb_sectors;
965 if (s->aio.iocb_free_count == 0 || !tap_aio_lock(&s->aio, sector))
966 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
968 if(!cluster_offset) {
969 tap_aio_unlock(&s->aio, sector);
970 ret = cb(dd, BLK_NOT_ALLOCATED,
971 sector, n, id, private);
972 if (ret == -EBUSY) {
973 /* mark remainder of request
974 * as busy and try again later */
975 return cb(dd, -EBUSY, sector + n,
976 nb_sectors - n, id, private);
977 } else
978 rsp += ret;
979 } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
980 tap_aio_unlock(&s->aio, sector);
981 if (decompress_cluster(s, cluster_offset) < 0) {
982 rsp += cb(dd, -EIO, sector,
983 nb_sectors, id, private);
984 goto done;
985 }
986 memcpy(buf, s->cluster_cache + index_in_cluster * 512,
987 512 * n);
988 rsp += cb(dd, 0, sector, n, id, private);
989 } else {
990 tap_aio_read(&s->aio, s->fd, n * 512,
991 (cluster_offset + index_in_cluster * 512),
992 buf, cb, id, sector, private);
993 }
994 nb_sectors -= n;
995 sector += n;
996 buf += n * 512;
997 }
998 done:
999 return rsp;
1002 static int tdqcow_queue_write(struct disk_driver *dd, uint64_t sector,
1003 int nb_sectors, char *buf, td_callback_t cb,
1004 int id, void *private)
1006 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
1007 int ret = 0, index_in_cluster, n, i;
1008 uint64_t cluster_offset, sec, nr_secs;
1010 sec = sector;
1011 nr_secs = nb_sectors;
1013 /*Check we can get a lock*/
1014 for (i = 0; i < nb_sectors; i++)
1015 if (!tap_aio_can_lock(&s->aio, sector + i))
1016 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
1018 /*We store a local record of the request*/
1019 while (nb_sectors > 0) {
1020 index_in_cluster = sector & (s->cluster_sectors - 1);
1021 n = s->cluster_sectors - index_in_cluster;
1022 if (n > nb_sectors)
1023 n = nb_sectors;
1025 if (s->aio.iocb_free_count == 0 || !tap_aio_lock(&s->aio, sector))
1026 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
1028 cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
1029 index_in_cluster,
1030 index_in_cluster+n);
1031 if (!cluster_offset) {
1032 DPRINTF("Ooops, no write cluster offset!\n");
1033 tap_aio_unlock(&s->aio, sector);
1034 return cb(dd, -EIO, sector, nb_sectors, id, private);
1037 if (s->crypt_method) {
1038 encrypt_sectors(s, sector, s->cluster_data,
1039 (unsigned char *)buf, n, 1,
1040 &s->aes_encrypt_key);
1041 tap_aio_write(&s->aio, s->fd, n * 512,
1042 (cluster_offset + index_in_cluster*512),
1043 (char *)s->cluster_data, cb, id, sector,
1044 private);
1045 } else {
1046 tap_aio_write(&s->aio, s->fd, n * 512,
1047 (cluster_offset + index_in_cluster*512),
1048 buf, cb, id, sector, private);
1051 nb_sectors -= n;
1052 sector += n;
1053 buf += n * 512;
1055 s->cluster_cache_offset = -1; /* disable compressed cache */
1057 return 0;
1060 static int tdqcow_submit(struct disk_driver *dd)
1062 struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
1064 return tap_aio_submit(&prv->aio);
1067 static int tdqcow_close(struct disk_driver *dd)
1069 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
1070 uint32_t cksum, out;
1071 int fd, offset;
1073 /*Update the hdr cksum*/
1074 if(s->min_cluster_alloc == s->l2_size) {
1075 cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
1076 printf("Writing cksum: %d",cksum);
1077 fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
1078 offset = sizeof(QCowHeader) + sizeof(uint32_t);
1079 lseek(fd, offset, SEEK_SET);
1080 out = cpu_to_be32(cksum);
1081 if (write(fd, &out, sizeof(uint32_t))) ;
1082 close(fd);
1085 io_destroy(s->aio.aio_ctx.aio_ctx);
1086 free(s->name);
1087 free(s->l1_table);
1088 free(s->l2_cache);
1089 free(s->cluster_cache);
1090 free(s->cluster_data);
1091 close(s->fd);
1092 return 0;
1095 static int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
1097 int ret, i, nr_events, rsp = 0,*ptr;
1098 struct io_event *ep;
1099 struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
1101 if (sid > MAX_IOFD) return 1;
1103 nr_events = tap_aio_get_events(&prv->aio.aio_ctx);
1104 repeat:
1105 for (ep = prv->aio.aio_events, i = nr_events; i-- > 0; ep++) {
1106 struct iocb *io = ep->obj;
1107 struct pending_aio *pio;
1109 pio = &prv->aio.pending_aio[(long)io->data];
1111 tap_aio_unlock(&prv->aio, pio->sector);
1113 if (prv->crypt_method)
1114 encrypt_sectors(prv, pio->sector,
1115 (unsigned char *)pio->buf,
1116 (unsigned char *)pio->buf,
1117 pio->nb_sectors, 0,
1118 &prv->aes_decrypt_key);
1120 rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
1121 pio->sector, pio->nb_sectors,
1122 pio->id, pio->private);
1124 prv->aio.iocb_free[prv->aio.iocb_free_count++] = io;
1127 if (nr_events) {
1128 nr_events = tap_aio_more_events(&prv->aio.aio_ctx);
1129 goto repeat;
1132 tap_aio_continue(&prv->aio.aio_ctx);
1134 return rsp;
1137 int qcow_create(const char *filename, uint64_t total_size,
1138 const char *backing_file, int sparse)
1140 int fd, header_size, backing_filename_len, l1_size, i;
1141 int shift, length, adjust, flags = 0, ret = 0;
1142 QCowHeader header;
1143 QCowHeader_ext exthdr;
1144 char backing_filename[PATH_MAX], *ptr;
1145 uint64_t tmp, size, total_length;
1146 struct stat st;
1148 DPRINTF("Qcow_create: size %llu\n",(long long unsigned)total_size);
1150 fd = open(filename,
1151 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1152 0644);
1153 if (fd < 0)
1154 return -1;
1156 memset(&header, 0, sizeof(header));
1157 header.magic = cpu_to_be32(QCOW_MAGIC);
1158 header.version = cpu_to_be32(QCOW_VERSION);
1160 /*Create extended header fields*/
1161 exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
1163 header_size = sizeof(header) + sizeof(QCowHeader_ext);
1164 backing_filename_len = 0;
1165 size = (total_size >> SECTOR_SHIFT);
1166 if (backing_file) {
1167 if (strcmp(backing_file, "fat:")) {
1168 const char *p;
1169 /* XXX: this is a hack: we do not attempt to
1170 *check for URL like syntax */
1171 p = strchr(backing_file, ':');
1172 if (p && (p - backing_file) >= 2) {
1173 /* URL like but exclude "c:" like filenames */
1174 strncpy(backing_filename, backing_file,
1175 sizeof(backing_filename));
1176 } else {
1177 if (realpath(backing_file, backing_filename) == NULL ||
1178 stat(backing_filename, &st) != 0) {
1179 return -1;
1182 header.backing_file_offset = cpu_to_be64(header_size);
1183 backing_filename_len = strlen(backing_filename);
1184 header.backing_file_size = cpu_to_be32(
1185 backing_filename_len);
1186 header_size += backing_filename_len;
1188 /*Set to the backing file size*/
1189 if(get_filesize(backing_filename, &size, &st)) {
1190 return -1;
1192 DPRINTF("Backing file size detected: %lld sectors"
1193 "(total %lld [%lld MB])\n",
1194 (long long)size,
1195 (long long)(size << SECTOR_SHIFT),
1196 (long long)(size >> 11));
1197 } else {
1198 backing_file = NULL;
1199 DPRINTF("Setting file size: %lld (total %lld)\n",
1200 (long long) total_size,
1201 (long long) (total_size << SECTOR_SHIFT));
1203 header.mtime = cpu_to_be32(st.st_mtime);
1204 header.cluster_bits = 9; /* 512 byte cluster to avoid copying
1205 unmodifyed sectors */
1206 header.l2_bits = 12; /* 32 KB L2 tables */
1207 exthdr.min_cluster_alloc = cpu_to_be32(1);
1208 } else {
1209 DPRINTF("Setting file size: %lld sectors"
1210 "(total %lld [%lld MB])\n",
1211 (long long) size,
1212 (long long) (size << SECTOR_SHIFT),
1213 (long long) (size >> 11));
1214 header.cluster_bits = 12; /* 4 KB clusters */
1215 header.l2_bits = 9; /* 4 KB L2 tables */
1216 exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
1218 /*Set the header size value*/
1219 header.size = cpu_to_be64(size * 512);
1221 header_size = (header_size + 7) & ~7;
1222 if (header_size % 4096 > 0) {
1223 header_size = ((header_size >> 12) + 1) << 12;
1226 shift = header.cluster_bits + header.l2_bits;
1227 l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
1229 header.l1_table_offset = cpu_to_be64(header_size);
1230 DPRINTF("L1 Table offset: %d, size %d\n",
1231 header_size,
1232 (int)(l1_size * sizeof(uint64_t)));
1233 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1235 ptr = calloc(1, l1_size * sizeof(uint64_t));
1236 exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
1237 printf("Created cksum: %d\n",exthdr.cksum);
1238 free(ptr);
1240 /*adjust file length to system page size boundary*/
1241 length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
1242 getpagesize());
1243 if (qtruncate(fd, length, 0)!=0) {
1244 DPRINTF("ERROR truncating file\n");
1245 return -1;
1248 if (sparse == 0) {
1249 /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
1250 total_length = length + (l1_size * (1 << 9)) + (size * 512);
1251 if (qtruncate(fd, total_length, 0)!=0) {
1252 DPRINTF("ERROR truncating file\n");
1253 return -1;
1255 printf("File truncated to length %"PRIu64"\n",total_length);
1256 } else
1257 flags = SPARSE_FILE;
1259 exthdr.flags = cpu_to_be32(flags);
1261 /* write all the data */
1262 lseek(fd, 0, SEEK_SET);
1263 ret += write(fd, &header, sizeof(header));
1264 ret += write(fd, &exthdr, sizeof(exthdr));
1265 if (backing_file)
1266 ret += write(fd, backing_filename, backing_filename_len);
1268 lseek(fd, header_size, SEEK_SET);
1269 tmp = 0;
1270 for (i = 0;i < l1_size; i++) {
1271 ret += write(fd, &tmp, sizeof(tmp));
1274 close(fd);
1276 return 0;
1279 static int qcow_make_empty(struct tdqcow_state *s)
1281 uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1283 memset(s->l1_table, 0, l1_length);
1284 lseek(s->fd, s->l1_table_offset, SEEK_SET);
1285 if (write(s->fd, s->l1_table, l1_length) < 0)
1286 return -1;
1287 if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
1288 DPRINTF("ERROR truncating file\n");
1289 return -1;
1292 memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1293 memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1294 memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1296 return 0;
1299 static int qcow_get_cluster_size(struct tdqcow_state *s)
1301 return s->cluster_size;
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
/*
 * Deflate one cluster and write it as a compressed cluster.  If the data
 * does not shrink (or deflate could not finish in one pass) the cluster
 * is left to be written uncompressed.  Returns 0 on success (including
 * the "could not compress" case), -1 on allocation, zlib or write error.
 */
static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
				 const uint8_t *buf)
{
	z_stream strm;
	int ret, out_len;
	uint8_t *out_buf;
	uint64_t cluster_offset;

	/* worst case: deflate can expand; add slack per the zlib guidance */
	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!out_buf)
		return -1;

	/* best compression, small window, no zlib header */
	/* windowBits = -12 selects a raw deflate stream (no header) */
	memset(&strm, 0, sizeof(strm));
	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			   Z_DEFLATED, -12,
			   9, Z_DEFAULT_STRATEGY);
	if (ret != 0) {
		free(out_buf);
		return -1;
	}

	/* cap output at cluster_size: if it doesn't fit, compression is
	 * pointless and we fall through to the uncompressed path below */
	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	strm.avail_out = s->cluster_size;
	strm.next_out = out_buf;

	ret = deflate(&strm, Z_FINISH);
	if (ret != Z_STREAM_END && ret != Z_OK) {
		free(out_buf);
		deflateEnd(&strm);
		return -1;
	}
	out_len = strm.next_out - out_buf;

	deflateEnd(&strm);

	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		/* mode 2 allocates a compressed cluster of out_len bytes */
		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
						    out_len, 0, 0);
		cluster_offset &= s->cluster_offset_mask;
		lseek(s->fd, cluster_offset, SEEK_SET);
		if (write(s->fd, out_buf, out_len) != out_len) {
			free(out_buf);
			return -1;
		}
	}

	free(out_buf);
	return 0;
}
1361 static int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
1363 off_t off;
1364 char *buf, *filename;
1365 int len, secs, err = -EINVAL;
1366 struct tdqcow_state *child = (struct tdqcow_state *)dd->private;
1368 if (!child->backing_file_offset)
1369 return TD_NO_PARENT;
1371 /* read the backing file name */
1372 len = child->backing_file_size;
1373 off = child->backing_file_offset - (child->backing_file_offset % 512);
1374 secs = (len + (child->backing_file_offset - off) + 511) >> 9;
1376 if (posix_memalign((void **)&buf, 512, secs << 9))
1377 return -1;
1379 if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
1380 goto out;
1382 if (read(child->fd, buf, secs << 9) != secs << 9)
1383 goto out;
1384 filename = buf + (child->backing_file_offset - off);
1385 filename[len] = '\0';
1387 id->name = strdup(filename);
1388 id->drivertype = DISK_TYPE_AIO;
1389 err = 0;
1390 out:
1391 free(buf);
1392 return err;
1395 static int tdqcow_validate_parent(struct disk_driver *child,
1396 struct disk_driver *parent, td_flag_t flags)
1398 struct stat stats;
1399 uint64_t psize, csize;
1401 if (stat(parent->name, &stats))
1402 return -EINVAL;
1403 if (get_filesize(parent->name, &psize, &stats))
1404 return -EINVAL;
1406 if (stat(child->name, &stats))
1407 return -EINVAL;
1408 if (get_filesize(child->name, &csize, &stats))
1409 return -EINVAL;
1411 if (csize != psize)
1412 return -EINVAL;
1414 return 0;
/* tapdisk driver registration: the dispatch table through which the
 * blktap core drives the qcow format. */
struct tap_disk tapdisk_qcow = {
	.disk_type = "tapdisk_qcow",
	.private_data_size = sizeof(struct tdqcow_state),
	.td_open = tdqcow_open,
	.td_queue_read = tdqcow_queue_read,
	.td_queue_write = tdqcow_queue_write,
	.td_submit = tdqcow_submit,
	.td_close = tdqcow_close,
	.td_do_callbacks = tdqcow_do_callbacks,
	.td_get_parent_id = tdqcow_get_parent_id,
	.td_validate_parent = tdqcow_validate_parent
};