ia64/xen-unstable

view tools/blktap/drivers/block-qcow.c @ 18698:008505c3c65a

blktap: re-enable O_DIRECT in block_qcow.c

Turns out that only two reads and writes in block-qcow.c need to be
fixed to work correctly with O_DIRECT.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Oct 22 11:55:33 2008 +0100 (2008-10-22)
parents 0a09de68c541
children f989778298d8
line source
1 /* block-qcow.c
2 *
3 * Asynchronous Qemu copy-on-write disk implementation.
4 * Code based on the Qemu implementation
5 * (see copyright notice below)
6 *
7 * (c) 2006 Andrew Warfield and Julian Chesterfield
8 *
9 */
11 /*
12 * Block driver for the QCOW format
13 *
14 * Copyright (c) 2004 Fabrice Bellard
15 *
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this software and associated documentation files(the "Software"), to deal
18 * in the Software without restriction, including without limitation the rights
19 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
20 * copies of the Software, and to permit persons to whom the Software is
21 * furnished to do so, subject to the following conditions:
22 */
24 #include <errno.h>
25 #include <fcntl.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <unistd.h>
29 #include <sys/statvfs.h>
30 #include <sys/stat.h>
31 #include <sys/ioctl.h>
32 #include <string.h>
33 #include <zlib.h>
34 #include <inttypes.h>
35 #include <libaio.h>
36 #include "bswap.h"
37 #include "aes.h"
38 #include "tapdisk.h"
39 #include "tapaio.h"
40 #include "blk.h"
/* *BSD has no O_LARGEFILE */
#ifndef O_LARGEFILE
#define O_LARGEFILE 0
#endif

#if 1
/* Debug assert: log the failing predicate with its location, then crash
 * deliberately through a NULL write so the failure is loud and produces
 * a core dump. */
#define ASSERT(_p) \
    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
    __LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p) ((void)0)
#endif

/* Round 'l' up to the next multiple of 's', as a uint64_t. */
#define ROUNDUP(l, s) \
({ \
    (uint64_t)( \
        (l + (s - 1)) - ((l + (s - 1)) % s)); \
})

#undef IOCB_IDX
/* Index of iocb '_io' within state '_s's iocb_list array. */
#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

/* NOTE(review): identity operation on '_b'; presumably exists to force an
 * evaluation/read of the byte — confirm intent before removing. */
#define ZERO_TEST(_b) (_b | 0x00)

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

/* Magic numbers identifying a qcow image and the Xen extended header. */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE 0x00
#define QCOW_CRYPT_AES  0x01

/* Top bit of an L2 entry marks the cluster as compressed. */
#define QCOW_OFLAG_COMPRESSED (1LL << 63)
/* Extended-header (QCowHeader_ext) flag bits. */
#define SPARSE_FILE 0x01
#define EXTHDR_L1_BIG_ENDIAN 0x02

#ifndef O_BINARY
#define O_BINARY 0
#endif
/* On-disk qcow (version 1) header.  All multi-byte fields are stored
 * big-endian and byte-swapped on load (see tdqcow_open). */
typedef struct QCowHeader {
	uint32_t magic;               /* QCOW_MAGIC */
	uint32_t version;             /* QCOW_VERSION (qcow2 images are handed off) */
	uint64_t backing_file_offset; /* file offset of backing file path; 0 if none */
	uint32_t backing_file_size;   /* length of backing file path */
	uint32_t mtime;
	uint64_t size; /* in bytes */
	uint8_t cluster_bits;         /* log2 of cluster size */
	uint8_t l2_bits;              /* log2 of entries per L2 table */
	uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
	uint64_t l1_table_offset;     /* file offset of the L1 table */
} QCowHeader;
/*Extended header for Xen enhancements*/
/* Stored immediately after QCowHeader; fields are big-endian on disk. */
typedef struct QCowHeader_ext {
	uint32_t xmagic;            /* XEN_MAGIC */
	uint32_t cksum;             /* checksum of the L1 table (see gen_cksum) */
	uint32_t min_cluster_alloc; /* historical blktap extent allocation unit */
	uint32_t flags;             /* SPARSE_FILE | EXTHDR_L1_BIG_ENDIAN */
} QCowHeader_ext;
105 #define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/
/* Per-open-image driver state for a qcow1 image. */
struct tdqcow_state {
	int fd;                        /*Main Qcow file descriptor */
	uint64_t fd_end;               /*Store a local record of file length */
	char *name;                    /*Record of the filename*/
	uint32_t backing_file_size;    /* length of backing file path (from header) */
	uint64_t backing_file_offset;  /* offset of backing file path; 0 if none */
	int encrypted;                 /*File contents are encrypted or plain*/
	int cluster_bits;              /*Determines length of cluster as
					*indicated by file hdr*/
	int cluster_size;              /*Length of cluster*/
	int cluster_sectors;           /*Number of sectors per cluster*/
	int cluster_alloc;             /*Blktap fix for allocating full
					*extents*/
	int min_cluster_alloc;         /*Blktap historical extent alloc*/
	int sparse;                    /*Indicates whether to preserve sparseness*/
	int l2_bits;                   /*Size of L2 table entry*/
	int l2_size;                   /*Full table size*/
	int l1_size;                   /*L1 table size*/
	uint64_t cluster_offset_mask;  /* masks the compressed-size bits off an L2 entry */
	uint64_t l1_table_offset;      /*L1 table offset from beginning of
					*file*/
	uint64_t *l1_table;            /*L1 table entries*/
	uint64_t *l2_cache;            /*We maintain a cache of size
					*L2_CACHE_SIZE of most read entries*/
	uint64_t l2_cache_offsets[L2_CACHE_SIZE];     /*L2 cache entries*/
	uint32_t l2_cache_counts[L2_CACHE_SIZE];      /*Cache access record*/
	uint8_t *cluster_cache;        /* holds the most recently decompressed cluster */
	uint8_t *cluster_data;         /* bounce buffer for compressed/encrypted I/O */
	uint64_t cluster_cache_offset; /* offset cached in cluster_cache; -1 if none */
	uint32_t crypt_method;         /*current crypt method, 0 if no
					*key yet */
	uint32_t crypt_method_header;  /* crypt_method as read from the file header */
	AES_KEY aes_encrypt_key;       /*AES key*/
	AES_KEY aes_decrypt_key;       /*AES key*/

	/* libaio state */
	tap_aio_context_t aio;
};
146 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
148 #ifdef USE_GCRYPT
150 #include <gcrypt.h>
/* Compute a 32-bit checksum of the L1 table: the first 4 bytes of the
 * MD5 digest of the table in its big-endian (on-disk) representation.
 * The table is byte-swapped in place for hashing and restored before
 * returning. */
static uint32_t gen_cksum(char *ptr, int len)
{
	int i;
	uint32_t md[4];

	/* Convert L1 table to big endian */
	for(i = 0; i < len / sizeof(uint64_t); i++) {
		cpu_to_be64s(&((uint64_t*) ptr)[i]);
	}

	/* Generate checksum */
	gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);

	/* Convert L1 table back to native endianness */
	for(i = 0; i < len / sizeof(uint64_t); i++) {
		be64_to_cpus(&((uint64_t*) ptr)[i]);
	}

	return md[0];
}
173 #else /* use libcrypto */
175 #include <openssl/md5.h>
177 static uint32_t gen_cksum(char *ptr, int len)
178 {
179 int i;
180 unsigned char *md;
181 uint32_t ret;
183 md = malloc(MD5_DIGEST_LENGTH);
184 if(!md) return 0;
186 /* Convert L1 table to big endian */
187 for(i = 0; i < len / sizeof(uint64_t); i++) {
188 cpu_to_be64s(&((uint64_t*) ptr)[i]);
189 }
191 /* Generate checksum */
192 if (MD5((unsigned char *)ptr, len, md) != md)
193 ret = 0;
194 else
195 memcpy(&ret, md, sizeof(uint32_t));
197 /* Convert L1 table back to native endianess */
198 for(i = 0; i < len / sizeof(uint64_t); i++) {
199 be64_to_cpus(&((uint64_t*) ptr)[i]);
200 }
202 free(md);
203 return ret;
204 }
206 #endif
208 static int get_filesize(char *filename, uint64_t *size, struct stat *st)
209 {
210 int fd;
211 QCowHeader header;
213 /*Set to the backing file size*/
214 fd = open(filename, O_RDONLY);
215 if (fd < 0)
216 return -1;
217 if (read(fd, &header, sizeof(header)) < sizeof(header)) {
218 close(fd);
219 return -1;
220 }
221 close(fd);
223 be32_to_cpus(&header.magic);
224 be64_to_cpus(&header.size);
225 if (header.magic == QCOW_MAGIC) {
226 *size = header.size >> SECTOR_SHIFT;
227 return 0;
228 }
230 if(S_ISBLK(st->st_mode)) {
231 fd = open(filename, O_RDONLY);
232 if (fd < 0)
233 return -1;
234 if (blk_getimagesize(fd, size) != 0) {
235 close(fd);
236 return -1;
237 }
238 close(fd);
239 } else *size = (st->st_size >> SECTOR_SHIFT);
240 return 0;
241 }
243 static int qcow_set_key(struct tdqcow_state *s, const char *key)
244 {
245 uint8_t keybuf[16];
246 int len, i;
248 memset(keybuf, 0, 16);
249 len = strlen(key);
250 if (len > 16)
251 len = 16;
252 /* XXX: we could compress the chars to 7 bits to increase
253 entropy */
254 for (i = 0; i < len; i++) {
255 keybuf[i] = key[i];
256 }
257 s->crypt_method = s->crypt_method_header;
259 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
260 return -1;
261 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
262 return -1;
263 #if 0
264 /* test */
265 {
266 uint8_t in[16];
267 uint8_t out[16];
268 uint8_t tmp[16];
269 for (i=0; i<16; i++)
270 in[i] = i;
271 AES_encrypt(in, tmp, &s->aes_encrypt_key);
272 AES_decrypt(tmp, out, &s->aes_decrypt_key);
273 for (i = 0; i < 16; i++)
274 DPRINTF(" %02x", tmp[i]);
275 DPRINTF("\n");
276 for (i = 0; i < 16; i++)
277 DPRINTF(" %02x", out[i]);
278 DPRINTF("\n");
279 }
280 #endif
281 return 0;
282 }
284 /*
285 * The crypt function is compatible with the linux cryptoloop
286 * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
287 * supported .
288 */
/* Encrypt (enc=1) or decrypt (enc=0) 'nb_sectors' consecutive 512-byte
 * sectors with AES-CBC.  Each sector is processed independently with
 * the little-endian 64-bit sector number as its IV, so sectors remain
 * individually addressable. */
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
                            uint8_t *out_buf, const uint8_t *in_buf,
                            int nb_sectors, int enc,
                            const AES_KEY *key)
{
	union {
		uint64_t ll[2];
		uint8_t b[16];
	} ivec;
	int i;

	for (i = 0; i < nb_sectors; i++) {
		/* IV = sector number (LE), upper half zero */
		ivec.ll[0] = cpu_to_le64(sector_num);
		ivec.ll[1] = 0;
		AES_cbc_encrypt(in_buf, out_buf, 512, key,
		                ivec.b, enc);
		sector_num++;
		in_buf += 512;
		out_buf += 512;
	}
}
/*
 * Resize the regular file behind 'fd' to 'length' bytes, rounded up to
 * a whole number of DEFAULT_SECTOR_SIZE sectors.  Growth is performed
 * by synchronously appending zero-filled sectors (to encourage
 * contiguous extent allocation on disk); shrinking happens only when
 * 'sparse' is set, via ftruncate().  Block devices are left untouched.
 * Returns 0 on success, -1 on error.
 */
static int qtruncate(int fd, off_t length, int sparse)
{
	int ret, i;
	int current = 0, rem = 0;
	uint64_t sectors;
	struct stat st;
	char *buf;

	/* If length is greater than the current file len
	 * we synchronously write zeroes to the end of the
	 * file, otherwise we truncate the length down
	 */
	ret = fstat(fd, &st);
	if (ret == -1)
		return -1;
	if (S_ISBLK(st.st_mode))
		return 0;

	sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	rem = st.st_size % DEFAULT_SECTOR_SIZE;

	/* If we are extending this file, we write zeros to the end --
	 * this tries to ensure that the extents allocated wind up being
	 * contiguous on disk.
	 */
	if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
		/*We are extending the file*/
		/* 512-aligned buffer: fd may be open with O_DIRECT */
		if ((ret = posix_memalign((void **)&buf,
					  512, DEFAULT_SECTOR_SIZE))) {
			DPRINTF("posix_memalign failed: %d\n", ret);
			return -1;
		}
		memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
		if (lseek(fd, 0, SEEK_END)==-1) {
			DPRINTF("Lseek EOF failed (%d), internal error\n",
				errno);
			free(buf);
			return -1;
		}
		if (rem) {
			/* first pad the trailing partial sector out to a
			 * full sector boundary */
			ret = write(fd, buf, rem);
			if (ret != rem) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		for (i = current; i < sectors; i++ ) {
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
			if (ret != DEFAULT_SECTOR_SIZE) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		free(buf);
	} else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
		if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
			DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
			return -1;
		}
	return 0;
}
379 /* 'allocate' is:
380 *
381 * 0 to not allocate.
382 *
383 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
384 * 'n_end')
385 *
386 * 2 to allocate a compressed cluster of size
387 * 'compressed_size'. 'compressed_size' must be > 0 and <
388 * cluster_size
389 *
390 * return 0 if not allocated.
391 */
392 static uint64_t get_cluster_offset(struct tdqcow_state *s,
393 uint64_t offset, int allocate,
394 int compressed_size,
395 int n_start, int n_end)
396 {
397 int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
398 char *tmp_ptr2, *l2_ptr, *l1_ptr;
399 uint64_t *tmp_ptr;
400 uint64_t l2_offset, *l2_table, cluster_offset, tmp;
401 uint32_t min_count;
402 int new_l2_table;
404 /*Check L1 table for the extent offset*/
405 l1_index = offset >> (s->l2_bits + s->cluster_bits);
406 l2_offset = s->l1_table[l1_index];
407 new_l2_table = 0;
408 if (!l2_offset) {
409 if (!allocate)
410 return 0;
411 /*
412 * allocating a new l2 entry + extent
413 * at the end of the file, we must also
414 * update the L1 entry safely.
415 */
416 l2_offset = s->fd_end;
418 /* round to cluster size */
419 l2_offset = (l2_offset + s->cluster_size - 1)
420 & ~(s->cluster_size - 1);
422 /* update the L1 entry */
423 s->l1_table[l1_index] = l2_offset;
424 tmp = cpu_to_be64(l2_offset);
426 /*Truncate file for L2 table
427 *(initialised to zero in case we crash)*/
428 if (qtruncate(s->fd,
429 l2_offset + (s->l2_size * sizeof(uint64_t)),
430 s->sparse) != 0) {
431 DPRINTF("ERROR truncating file\n");
432 return 0;
433 }
434 s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
436 /*Update the L1 table entry on disk
437 * (for O_DIRECT we write 4KByte blocks)*/
438 l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
439 l1_ptr = (char *)s->l1_table + (l1_sector << 12);
441 if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
442 DPRINTF("ERROR allocating memory for L1 table\n");
443 }
444 memcpy(tmp_ptr, l1_ptr, 4096);
446 /* Convert block to write to big endian */
447 for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
448 cpu_to_be64s(&tmp_ptr[i]);
449 }
451 /*
452 * Issue non-asynchronous L1 write.
453 * For safety, we must ensure that
454 * entry is written before blocks.
455 */
456 lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
457 if (write(s->fd, tmp_ptr, 4096) != 4096) {
458 free(tmp_ptr);
459 return 0;
460 }
461 free(tmp_ptr);
463 new_l2_table = 1;
464 goto cache_miss;
465 } else if (s->min_cluster_alloc == s->l2_size) {
466 /*Fast-track the request*/
467 cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
468 l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
469 return cluster_offset + (l2_index * s->cluster_size);
470 }
472 /*Check to see if L2 entry is already cached*/
473 for (i = 0; i < L2_CACHE_SIZE; i++) {
474 if (l2_offset == s->l2_cache_offsets[i]) {
475 /* increment the hit count */
476 if (++s->l2_cache_counts[i] == 0xffffffff) {
477 for (j = 0; j < L2_CACHE_SIZE; j++) {
478 s->l2_cache_counts[j] >>= 1;
479 }
480 }
481 l2_table = s->l2_cache + (i << s->l2_bits);
482 goto found;
483 }
484 }
486 cache_miss:
487 /* not found: load a new entry in the least used one */
488 min_index = 0;
489 min_count = 0xffffffff;
490 for (i = 0; i < L2_CACHE_SIZE; i++) {
491 if (s->l2_cache_counts[i] < min_count) {
492 min_count = s->l2_cache_counts[i];
493 min_index = i;
494 }
495 }
496 l2_table = s->l2_cache + (min_index << s->l2_bits);
498 /*If extent pre-allocated, read table from disk,
499 *otherwise write new table to disk*/
500 if (new_l2_table) {
501 /*Should we allocate the whole extent? Adjustable parameter.*/
502 if (s->cluster_alloc == s->l2_size) {
503 cluster_offset = l2_offset +
504 (s->l2_size * sizeof(uint64_t));
505 cluster_offset = (cluster_offset + s->cluster_size - 1)
506 & ~(s->cluster_size - 1);
507 if (qtruncate(s->fd, cluster_offset +
508 (s->cluster_size * s->l2_size),
509 s->sparse) != 0) {
510 DPRINTF("ERROR truncating file\n");
511 return 0;
512 }
513 s->fd_end = cluster_offset +
514 (s->cluster_size * s->l2_size);
515 for (i = 0; i < s->l2_size; i++) {
516 l2_table[i] = cpu_to_be64(cluster_offset +
517 (i*s->cluster_size));
518 }
519 } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
521 lseek(s->fd, l2_offset, SEEK_SET);
522 if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
523 s->l2_size * sizeof(uint64_t))
524 return 0;
525 } else {
526 lseek(s->fd, l2_offset, SEEK_SET);
527 if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
528 s->l2_size * sizeof(uint64_t))
529 return 0;
530 }
532 /*Update the cache entries*/
533 s->l2_cache_offsets[min_index] = l2_offset;
534 s->l2_cache_counts[min_index] = 1;
536 found:
537 /*The extent is split into 's->l2_size' blocks of
538 *size 's->cluster_size'*/
539 l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
540 cluster_offset = be64_to_cpu(l2_table[l2_index]);
542 if (!cluster_offset ||
543 ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
544 if (!allocate)
545 return 0;
547 if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
548 (n_end - n_start) < s->cluster_sectors) {
549 /* cluster is already allocated but compressed, we must
550 decompress it in the case it is not completely
551 overwritten */
552 if (decompress_cluster(s, cluster_offset) < 0)
553 return 0;
554 cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
555 cluster_offset = (cluster_offset + s->cluster_size - 1)
556 & ~(s->cluster_size - 1);
557 /* write the cluster content - not asynchronous */
558 lseek(s->fd, cluster_offset, SEEK_SET);
559 if (write(s->fd, s->cluster_cache, s->cluster_size) !=
560 s->cluster_size)
561 return -1;
562 } else {
563 /* allocate a new cluster */
564 cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
565 if (allocate == 1) {
566 /* round to cluster size */
567 cluster_offset =
568 (cluster_offset + s->cluster_size - 1)
569 & ~(s->cluster_size - 1);
570 if (qtruncate(s->fd, cluster_offset +
571 s->cluster_size, s->sparse)!=0) {
572 DPRINTF("ERROR truncating file\n");
573 return 0;
574 }
575 s->fd_end = (cluster_offset + s->cluster_size);
576 /* if encrypted, we must initialize the cluster
577 content which won't be written */
578 if (s->crypt_method &&
579 (n_end - n_start) < s->cluster_sectors) {
580 uint64_t start_sect;
581 start_sect = (offset &
582 ~(s->cluster_size - 1))
583 >> 9;
584 memset(s->cluster_data + 512,
585 0xaa, 512);
586 for (i = 0; i < s->cluster_sectors;i++)
587 {
588 if (i < n_start || i >= n_end)
589 {
590 encrypt_sectors(s, start_sect + i,
591 s->cluster_data,
592 s->cluster_data + 512, 1, 1,
593 &s->aes_encrypt_key);
594 lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
595 if (write(s->fd, s->cluster_data, 512) != 512)
596 return -1;
597 }
598 }
599 }
600 } else {
601 cluster_offset |= QCOW_OFLAG_COMPRESSED |
602 (uint64_t)compressed_size
603 << (63 - s->cluster_bits);
604 }
605 }
606 /* update L2 table */
607 tmp = cpu_to_be64(cluster_offset);
608 l2_table[l2_index] = tmp;
610 /*For IO_DIRECT we write 4KByte blocks*/
611 l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
612 l2_ptr = (char *)l2_table + (l2_sector << 12);
614 if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
615 DPRINTF("ERROR allocating memory for L1 table\n");
616 }
617 memcpy(tmp_ptr2, l2_ptr, 4096);
618 lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
619 if (write(s->fd, tmp_ptr2, 4096) != 4096) {
620 free(tmp_ptr2);
621 return -1;
622 }
623 free(tmp_ptr2);
624 }
625 return cluster_offset;
626 }
628 static void init_cluster_cache(struct disk_driver *dd)
629 {
630 struct td_state *bs = dd->td_state;
631 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
632 uint32_t count = 0;
633 int i, cluster_entries;
635 cluster_entries = s->cluster_size / 512;
636 DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
637 cluster_entries, s->cluster_size);
639 for (i = 0; i < bs->size; i += cluster_entries) {
640 if (get_cluster_offset(s, i << 9, 0, 0, 0, 1)) count++;
641 if (count >= L2_CACHE_SIZE) return;
642 }
643 DPRINTF("Finished cluster initialisation, added %d entries\n", count);
644 return;
645 }
/*
 * Report whether 'sector_num' is allocated in this image.  '*pnum' is
 * set to the number of contiguous sectors starting at sector_num that
 * share the same allocation state, capped at nb_sectors (it never
 * crosses a cluster boundary).  Returns non-zero if allocated.
 */
static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
                             int nb_sectors, int *pnum)
{
	int index_in_cluster, n;
	uint64_t cluster_offset;

	cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
	index_in_cluster = sector_num & (s->cluster_sectors - 1);
	/* sectors remaining in this cluster */
	n = s->cluster_sectors - index_in_cluster;
	if (n > nb_sectors)
		n = nb_sectors;
	*pnum = n;
	return (cluster_offset != 0);
}
/*
 * Inflate 'buf' (a raw deflate stream — windowBits=-12 means no zlib
 * header/trailer) into 'out_buf'.  Succeeds only if the decompressed
 * data is exactly 'out_buf_size' bytes.  Returns 0 on success, -1 on
 * error.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
	z_stream strm1, *strm = &strm1;
	int ret, out_len;

	memset(strm, 0, sizeof(*strm));

	strm->next_in = (uint8_t *)buf;
	strm->avail_in = buf_size;
	strm->next_out = out_buf;
	strm->avail_out = out_buf_size;

	ret = inflateInit2(strm, -12);
	if (ret != Z_OK)
		return -1;
	ret = inflate(strm, Z_FINISH);
	out_len = strm->next_out - out_buf;
	/* Z_BUF_ERROR is tolerated: the input may include trailing padding
	 * beyond the compressed data, so only the output length matters. */
	if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
	     (out_len != out_buf_size) ) {
		inflateEnd(strm);
		return -1;
	}
	inflateEnd(strm);
	return 0;
}
/*
 * Ensure the compressed cluster referenced by L2 entry 'cluster_offset'
 * is decompressed into s->cluster_cache.  The low bits of the entry
 * (cluster_offset_mask) give the file offset of the compressed data;
 * the high bits encode its size.  A repeated request for the cluster
 * already cached is a no-op.  Returns 0 on success, -1 on failure.
 */
static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
{
	int ret, csize;
	uint64_t coffset;

	coffset = cluster_offset & s->cluster_offset_mask;
	if (s->cluster_cache_offset != coffset) {
		/* compressed size lives in the top bits of the L2 entry */
		csize = cluster_offset >> (63 - s->cluster_bits);
		csize &= (s->cluster_size - 1);
		/* NOTE(review): coffset/csize need not be 512-aligned; with
		 * the fd opened O_DIRECT this read's alignment behaviour
		 * should be confirmed (cf. the changeset description). */
		lseek(s->fd, coffset, SEEK_SET);
		ret = read(s->fd, s->cluster_data, csize);
		if (ret != csize)
			return -1;
		if (decompress_buffer(s->cluster_cache, s->cluster_size,
				      s->cluster_data, csize) < 0) {
			return -1;
		}
		s->cluster_cache_offset = coffset;
	}
	return 0;
}
/* Zero the driver's io_fd slots and publish the AIO completion pollfd
 * as this driver's sole pollable descriptor. */
static inline void init_fds(struct disk_driver *dd)
{
	int i;
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;

	for(i = 0; i < MAX_IOFD; i++)
		dd->io_fd[i] = 0;

	dd->io_fd[0] = s->aio.aio_ctx.pollfd;
}
/* Open the disk file and initialize qcow state.
 *
 * Parses and validates the qcow1 header (qcow2 images are handed off to
 * the qcow2 driver), loads the L1 table, allocates the L2/cluster
 * caches, processes the optional Xen extended header (including fixing
 * up old images with native-endian L1 tables), and initialises the AIO
 * context.  The file is opened O_DIRECT, hence all buffers below are
 * allocated with posix_memalign.  Returns 0 on success, -1 on failure.
 */
static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
{
	int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
	int max_aio_reqs;
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	char *buf, *buf2;
	QCowHeader *header;
	QCowHeader_ext *exthdr;
	uint32_t cksum;
	uint64_t final_cluster = 0;

	DPRINTF("QCOW: Opening %s\n",name);

	o_flags = O_DIRECT | O_LARGEFILE |
		((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
	fd = open(name, o_flags);
	if (fd < 0) {
		DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
		return -1;
	}

	s->fd = fd;
	if (asprintf(&s->name,"%s", name) == -1) {
		close(fd);
		return -1;
	}

	/* both headers must fit in the single aligned sector read below */
	ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);

	ret = posix_memalign((void **)&buf, 512, 512);
	if (ret != 0) goto fail;

	if (read(fd, buf, 512) != 512)
		goto fail;

	header = (QCowHeader *)buf;
	/* all header fields are big-endian on disk */
	be32_to_cpus(&header->magic);
	be32_to_cpus(&header->version);
	be64_to_cpus(&header->backing_file_offset);
	be32_to_cpus(&header->backing_file_size);
	be32_to_cpus(&header->mtime);
	be64_to_cpus(&header->size);
	be32_to_cpus(&header->crypt_method);
	be64_to_cpus(&header->l1_table_offset);

	if (header->magic != QCOW_MAGIC)
		goto fail;

	switch (header->version) {
	case QCOW_VERSION:
		break;
	case 2:
		/* qcow2 image: delegate to the qcow2 driver */
		close(fd);
		dd->drv = &tapdisk_qcow2;
		return dd->drv->td_open(dd, name, flags);
	default:
		goto fail;
	}

	if (header->size <= 1 || header->cluster_bits < 9)
		goto fail;
	if (header->crypt_method > QCOW_CRYPT_AES)
		goto fail;
	s->crypt_method_header = header->crypt_method;
	if (s->crypt_method_header)
		s->encrypted = 1;
	s->cluster_bits = header->cluster_bits;
	s->cluster_size = 1 << s->cluster_bits;
	s->cluster_sectors = 1 << (s->cluster_bits - 9);
	s->l2_bits = header->l2_bits;
	s->l2_size = 1 << s->l2_bits;
	s->cluster_alloc = s->l2_size;
	bs->size = header->size / 512;
	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
	s->backing_file_offset = header->backing_file_offset;
	s->backing_file_size = header->backing_file_size;

	/* read the level 1 table */
	shift = s->cluster_bits + s->l2_bits;
	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;

	s->l1_table_offset = header->l1_table_offset;

	/*allocate a 4Kbyte multiple of memory*/
	l1_table_size = s->l1_size * sizeof(uint64_t);
	if (l1_table_size % 4096 > 0) {
		l1_table_size = ((l1_table_size >> 12) + 1) << 12;
	}
	ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
	if (ret != 0) goto fail;

	memset(s->l1_table, 0x00, l1_table_size);

	DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
		(long long)s->l1_table_offset,
		(int) (s->l1_size * sizeof(uint64_t)),
		l1_table_size);

	/* read everything up to and including the L1 table in one chunk,
	 * rounded up to a 512-byte boundary for O_DIRECT */
	lseek(fd, 0, SEEK_SET);
	l1_table_block = l1_table_size + s->l1_table_offset;
	l1_table_block = l1_table_block + 512 - (l1_table_block % 512);
	ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
	if (ret != 0) goto fail;
	if (read(fd, buf2, l1_table_block) != l1_table_block)
		goto fail;
	memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);

	for(i = 0; i < s->l1_size; i++) {
		be64_to_cpus(&s->l1_table[i]);
		//DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
		/* track the highest allocated offset to recover fd_end */
		if (s->l1_table[i] > final_cluster)
			final_cluster = s->l1_table[i];
	}

	/* alloc L2 cache */
	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
	if(ret != 0) goto fail;

	size = s->cluster_size;
	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
	if(ret != 0) goto fail;

	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
	if(ret != 0) goto fail;
	s->cluster_cache_offset = -1;

	if (s->backing_file_offset != 0)
		s->cluster_alloc = 1; /*Cannot use pre-alloc*/

	bs->sector_size = 512;
	bs->info = 0;

	/*Detect min_cluster_alloc*/
	s->min_cluster_alloc = 1; /*Default*/
	if (s->backing_file_offset == 0 && s->l1_table_offset % 4096 == 0) {
		/*We test to see if the xen magic # exists*/
		exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
		be32_to_cpus(&exthdr->xmagic);
		if(exthdr->xmagic != XEN_MAGIC)
			goto end_xenhdr;

		/* Try to detect old tapdisk images. They have to be fixed because
		 * they don't use big endian but native endianness for the L1 table */
		if ((exthdr->flags & EXTHDR_L1_BIG_ENDIAN) == 0) {
			/*
			   The image is broken. Fix it. The L1 table has already been
			   byte-swapped, so we can write it to the image file as it is
			   currently in memory. Then swap it back to native endianness
			   for operation.
			 */

			DPRINTF("qcow: Converting image to big endian L1 table\n");

			memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
			lseek(fd, 0, SEEK_SET);
			if (write(fd, buf2, l1_table_block) != l1_table_block) {
				DPRINTF("qcow: Failed to write new L1 table\n");
				goto fail;
			}

			for(i = 0;i < s->l1_size; i++) {
				cpu_to_be64s(&s->l1_table[i]);
			}

			/* Write the big endian flag to the extended header */
			exthdr->flags |= EXTHDR_L1_BIG_ENDIAN;

			if (write(fd, buf, 512) != 512) {
				DPRINTF("qcow: Failed to write extended header\n");
				goto fail;
			}
		}

		/*Finally check the L1 table cksum*/
		be32_to_cpus(&exthdr->cksum);
		cksum = gen_cksum((char *)s->l1_table,
				  s->l1_size * sizeof(uint64_t));
		if(exthdr->cksum != cksum)
			goto end_xenhdr;

		be32_to_cpus(&exthdr->min_cluster_alloc);
		be32_to_cpus(&exthdr->flags);
		s->sparse = (exthdr->flags & SPARSE_FILE);
		s->min_cluster_alloc = exthdr->min_cluster_alloc;
	}

end_xenhdr:
	/* A segment (i.e. a page) can span multiple clusters */
	max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
		MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;

	if (tap_aio_init(&s->aio, bs->size, max_aio_reqs)!=0) {
		DPRINTF("Unable to initialise AIO state\n");
		tap_aio_free(&s->aio);
		goto fail;
	}
	init_fds(dd);

	if (!final_cluster)
		s->fd_end = l1_table_block;
	else {
		s->fd_end = lseek(fd, 0, SEEK_END);
		if (s->fd_end == (off_t)-1)
			goto fail;
	}

	return 0;

fail:
	DPRINTF("QCOW Open failed\n");
	tap_aio_free(&s->aio);
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(fd);
	return -1;
}
946 static int tdqcow_queue_read(struct disk_driver *dd, uint64_t sector,
947 int nb_sectors, char *buf, td_callback_t cb,
948 int id, void *private)
949 {
950 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
951 int ret = 0, index_in_cluster, n, i, rsp = 0;
952 uint64_t cluster_offset, sec, nr_secs;
954 sec = sector;
955 nr_secs = nb_sectors;
957 /*Check we can get a lock*/
958 for (i = 0; i < nb_sectors; i++)
959 if (!tap_aio_can_lock(&s->aio, sector + i))
960 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
962 /*We store a local record of the request*/
963 while (nb_sectors > 0) {
964 cluster_offset =
965 get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
966 index_in_cluster = sector & (s->cluster_sectors - 1);
967 n = s->cluster_sectors - index_in_cluster;
968 if (n > nb_sectors)
969 n = nb_sectors;
971 if (s->aio.iocb_free_count == 0 || !tap_aio_lock(&s->aio, sector))
972 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
974 if(!cluster_offset) {
975 tap_aio_unlock(&s->aio, sector);
976 ret = cb(dd, BLK_NOT_ALLOCATED,
977 sector, n, id, private);
978 if (ret == -EBUSY) {
979 /* mark remainder of request
980 * as busy and try again later */
981 return cb(dd, -EBUSY, sector + n,
982 nb_sectors - n, id, private);
983 } else
984 rsp += ret;
985 } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
986 tap_aio_unlock(&s->aio, sector);
987 if (decompress_cluster(s, cluster_offset) < 0) {
988 rsp += cb(dd, -EIO, sector,
989 nb_sectors, id, private);
990 goto done;
991 }
992 memcpy(buf, s->cluster_cache + index_in_cluster * 512,
993 512 * n);
994 rsp += cb(dd, 0, sector, n, id, private);
995 } else {
996 tap_aio_read(&s->aio, s->fd, n * 512,
997 (cluster_offset + index_in_cluster * 512),
998 buf, cb, id, sector, private);
999 }
1000 nb_sectors -= n;
1001 sector += n;
1002 buf += n * 512;
1004 done:
1005 return rsp;
1008 static int tdqcow_queue_write(struct disk_driver *dd, uint64_t sector,
1009 int nb_sectors, char *buf, td_callback_t cb,
1010 int id, void *private)
1012 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
1013 int ret = 0, index_in_cluster, n, i;
1014 uint64_t cluster_offset, sec, nr_secs;
1016 sec = sector;
1017 nr_secs = nb_sectors;
1019 /*Check we can get a lock*/
1020 for (i = 0; i < nb_sectors; i++)
1021 if (!tap_aio_can_lock(&s->aio, sector + i))
1022 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
1024 /*We store a local record of the request*/
1025 while (nb_sectors > 0) {
1026 index_in_cluster = sector & (s->cluster_sectors - 1);
1027 n = s->cluster_sectors - index_in_cluster;
1028 if (n > nb_sectors)
1029 n = nb_sectors;
1031 if (s->aio.iocb_free_count == 0 || !tap_aio_lock(&s->aio, sector))
1032 return cb(dd, -EBUSY, sector, nb_sectors, id, private);
1034 cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
1035 index_in_cluster,
1036 index_in_cluster+n);
1037 if (!cluster_offset) {
1038 DPRINTF("Ooops, no write cluster offset!\n");
1039 tap_aio_unlock(&s->aio, sector);
1040 return cb(dd, -EIO, sector, nb_sectors, id, private);
1043 if (s->crypt_method) {
1044 encrypt_sectors(s, sector, s->cluster_data,
1045 (unsigned char *)buf, n, 1,
1046 &s->aes_encrypt_key);
1047 tap_aio_write(&s->aio, s->fd, n * 512,
1048 (cluster_offset + index_in_cluster*512),
1049 (char *)s->cluster_data, cb, id, sector,
1050 private);
1051 } else {
1052 tap_aio_write(&s->aio, s->fd, n * 512,
1053 (cluster_offset + index_in_cluster*512),
1054 buf, cb, id, sector, private);
1057 nb_sectors -= n;
1058 sector += n;
1059 buf += n * 512;
1061 s->cluster_cache_offset = -1; /* disable compressed cache */
1063 return 0;
1066 static int tdqcow_submit(struct disk_driver *dd)
1068 struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
1070 return tap_aio_submit(&prv->aio);
1073 static int tdqcow_close(struct disk_driver *dd)
1075 struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
1076 uint32_t cksum, out;
1077 int fd, offset;
1079 /*Update the hdr cksum*/
1080 if(s->min_cluster_alloc == s->l2_size) {
1081 cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
1082 printf("Writing cksum: %d",cksum);
1083 fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
1084 offset = sizeof(QCowHeader) + sizeof(uint32_t);
1085 lseek(fd, offset, SEEK_SET);
1086 out = cpu_to_be32(cksum);
1087 if (write(fd, &out, sizeof(uint32_t))) ;
1088 close(fd);
1091 io_destroy(s->aio.aio_ctx.aio_ctx);
1092 free(s->name);
1093 free(s->l1_table);
1094 free(s->l2_cache);
1095 free(s->cluster_cache);
1096 free(s->cluster_data);
1097 close(s->fd);
1098 return 0;
/*
 * Reap completed AIO events and run the per-request callbacks.
 *
 * Loops over the current batch of io_events, releasing the sector lock
 * taken at submission, decrypting the transferred data in place when the
 * image is encrypted, invoking the recorded callback (0 on a full-length
 * transfer, 1 otherwise), and recycling the iocb.  Keeps draining while
 * more events are pending, then rearms the event channel.
 *
 * Returns the accumulated callback responses, or 1 when sid is out of
 * range.  (The unused locals `ret` and `ptr` from the original
 * declaration have been removed.)
 */
static int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
{
	int i, nr_events, rsp = 0;
	struct io_event *ep;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (sid > MAX_IOFD) return 1;

	nr_events = tap_aio_get_events(&prv->aio.aio_ctx);
repeat:
	for (ep = prv->aio.aio_events, i = nr_events; i-- > 0; ep++) {
		struct iocb *io = ep->obj;
		struct pending_aio *pio;

		pio = &prv->aio.pending_aio[(long)io->data];

		tap_aio_unlock(&prv->aio, pio->sector);

		/* NOTE(review): decryption runs for every completed request;
		 * presumably writes were encrypted into a bounce buffer at
		 * submission so only read payloads are affected — confirm
		 * against the queue_write path. */
		if (prv->crypt_method)
			encrypt_sectors(prv, pio->sector,
					(unsigned char *)pio->buf,
					(unsigned char *)pio->buf,
					pio->nb_sectors, 0,
					&prv->aes_decrypt_key);

		/* report success only on a full-length transfer */
		rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
			       pio->sector, pio->nb_sectors,
			       pio->id, pio->private);

		prv->aio.iocb_free[prv->aio.iocb_free_count++] = io;
	}

	if (nr_events) {
		nr_events = tap_aio_more_events(&prv->aio.aio_ctx);
		goto repeat;
	}

	tap_aio_continue(&prv->aio.aio_ctx);

	return rsp;
}
1143 int qcow_create(const char *filename, uint64_t total_size,
1144 const char *backing_file, int sparse)
1146 int fd, header_size, backing_filename_len, l1_size, i;
1147 int shift, length, adjust, flags = 0, ret = 0;
1148 QCowHeader header;
1149 QCowHeader_ext exthdr;
1150 char backing_filename[PATH_MAX], *ptr;
1151 uint64_t tmp, size, total_length;
1152 struct stat st;
1154 DPRINTF("Qcow_create: size %llu\n",(long long unsigned)total_size);
1156 fd = open(filename,
1157 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1158 0644);
1159 if (fd < 0)
1160 return -1;
1162 memset(&header, 0, sizeof(header));
1163 header.magic = cpu_to_be32(QCOW_MAGIC);
1164 header.version = cpu_to_be32(QCOW_VERSION);
1166 /*Create extended header fields*/
1167 exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
1169 header_size = sizeof(header) + sizeof(QCowHeader_ext);
1170 backing_filename_len = 0;
1171 size = (total_size >> SECTOR_SHIFT);
1172 if (backing_file) {
1173 if (strcmp(backing_file, "fat:")) {
1174 const char *p;
1175 /* XXX: this is a hack: we do not attempt to
1176 *check for URL like syntax */
1177 p = strchr(backing_file, ':');
1178 if (p && (p - backing_file) >= 2) {
1179 /* URL like but exclude "c:" like filenames */
1180 strncpy(backing_filename, backing_file,
1181 sizeof(backing_filename));
1182 } else {
1183 if (realpath(backing_file, backing_filename) == NULL ||
1184 stat(backing_filename, &st) != 0) {
1185 return -1;
1188 header.backing_file_offset = cpu_to_be64(header_size);
1189 backing_filename_len = strlen(backing_filename);
1190 header.backing_file_size = cpu_to_be32(
1191 backing_filename_len);
1192 header_size += backing_filename_len;
1194 /*Set to the backing file size*/
1195 if(get_filesize(backing_filename, &size, &st)) {
1196 return -1;
1198 DPRINTF("Backing file size detected: %lld sectors"
1199 "(total %lld [%lld MB])\n",
1200 (long long)size,
1201 (long long)(size << SECTOR_SHIFT),
1202 (long long)(size >> 11));
1203 } else {
1204 backing_file = NULL;
1205 DPRINTF("Setting file size: %lld (total %lld)\n",
1206 (long long) total_size,
1207 (long long) (total_size << SECTOR_SHIFT));
1209 header.mtime = cpu_to_be32(st.st_mtime);
1210 header.cluster_bits = 9; /* 512 byte cluster to avoid copying
1211 unmodifyed sectors */
1212 header.l2_bits = 12; /* 32 KB L2 tables */
1213 exthdr.min_cluster_alloc = cpu_to_be32(1);
1214 } else {
1215 DPRINTF("Setting file size: %lld sectors"
1216 "(total %lld [%lld MB])\n",
1217 (long long) size,
1218 (long long) (size << SECTOR_SHIFT),
1219 (long long) (size >> 11));
1220 header.cluster_bits = 12; /* 4 KB clusters */
1221 header.l2_bits = 9; /* 4 KB L2 tables */
1222 exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
1224 /*Set the header size value*/
1225 header.size = cpu_to_be64(size * 512);
1227 header_size = (header_size + 7) & ~7;
1228 if (header_size % 4096 > 0) {
1229 header_size = ((header_size >> 12) + 1) << 12;
1232 shift = header.cluster_bits + header.l2_bits;
1233 l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
1235 header.l1_table_offset = cpu_to_be64(header_size);
1236 DPRINTF("L1 Table offset: %d, size %d\n",
1237 header_size,
1238 (int)(l1_size * sizeof(uint64_t)));
1239 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1241 ptr = calloc(1, l1_size * sizeof(uint64_t));
1242 exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
1243 printf("Created cksum: %d\n",exthdr.cksum);
1244 free(ptr);
1246 /*adjust file length to system page size boundary*/
1247 length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
1248 getpagesize());
1249 if (qtruncate(fd, length, 0)!=0) {
1250 DPRINTF("ERROR truncating file\n");
1251 return -1;
1254 if (sparse == 0) {
1255 /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
1256 total_length = length + (l1_size * (1 << 9)) + (size * 512);
1257 if (qtruncate(fd, total_length, 0)!=0) {
1258 DPRINTF("ERROR truncating file\n");
1259 return -1;
1261 printf("File truncated to length %"PRIu64"\n",total_length);
1262 } else
1263 flags = SPARSE_FILE;
1265 exthdr.flags = cpu_to_be32(flags);
1267 /* write all the data */
1268 lseek(fd, 0, SEEK_SET);
1269 ret += write(fd, &header, sizeof(header));
1270 ret += write(fd, &exthdr, sizeof(exthdr));
1271 if (backing_file)
1272 ret += write(fd, backing_filename, backing_filename_len);
1274 lseek(fd, header_size, SEEK_SET);
1275 tmp = 0;
1276 for (i = 0;i < l1_size; i++) {
1277 ret += write(fd, &tmp, sizeof(tmp));
1280 close(fd);
1282 return 0;
1285 static int qcow_make_empty(struct tdqcow_state *s)
1287 uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1289 memset(s->l1_table, 0, l1_length);
1290 lseek(s->fd, s->l1_table_offset, SEEK_SET);
1291 if (write(s->fd, s->l1_table, l1_length) < 0)
1292 return -1;
1293 if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
1294 DPRINTF("ERROR truncating file\n");
1295 return -1;
1298 memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1299 memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1300 memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1302 return 0;
1305 static int qcow_get_cluster_size(struct tdqcow_state *s)
1307 return s->cluster_size;
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
/*
 * Deflate one cluster and store it at a compressed-cluster offset.
 * If the data does not shrink below cluster_size it is left alone
 * (the fall-back write of a normal cluster is still commented out,
 * as in the original).  Returns 0 on success, -1 on error.
 */
static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
				 const uint8_t *buf)
{
	z_stream strm;
	int zret, clen;
	uint8_t *zbuf;
	uint64_t coffset;

	/* worst-case deflate expansion: input + 0.1% + 128 bytes */
	zbuf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!zbuf)
		return -1;

	/* best compression, small window, no zlib header */
	memset(&strm, 0, sizeof(strm));
	zret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			    Z_DEFLATED, -12,
			    9, Z_DEFAULT_STRATEGY);
	if (zret != 0) {
		free(zbuf);
		return -1;
	}

	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	/* cap output at cluster_size: anything larger is not worth
	   storing compressed */
	strm.avail_out = s->cluster_size;
	strm.next_out = zbuf;

	zret = deflate(&strm, Z_FINISH);
	if (zret != Z_STREAM_END && zret != Z_OK) {
		free(zbuf);
		deflateEnd(&strm);
		return -1;
	}
	clen = strm.next_out - zbuf;

	deflateEnd(&strm);

	if (zret != Z_STREAM_END || clen >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		coffset = get_cluster_offset(s, sector_num << 9, 2,
					     clen, 0, 0);
		coffset &= s->cluster_offset_mask;
		lseek(s->fd, coffset, SEEK_SET);
		if (write(s->fd, zbuf, clen) != clen) {
			free(zbuf);
			return -1;
		}
	}

	free(zbuf);
	return 0;
}
1367 static int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
1369 off_t off;
1370 char *buf, *filename;
1371 int len, secs, err = -EINVAL;
1372 struct tdqcow_state *child = (struct tdqcow_state *)dd->private;
1374 if (!child->backing_file_offset)
1375 return TD_NO_PARENT;
1377 /* read the backing file name */
1378 len = child->backing_file_size;
1379 off = child->backing_file_offset - (child->backing_file_offset % 512);
1380 secs = (len + (child->backing_file_offset - off) + 511) >> 9;
1382 if (posix_memalign((void **)&buf, 512, secs << 9))
1383 return -1;
1385 if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
1386 goto out;
1388 if (read(child->fd, buf, secs << 9) != secs << 9)
1389 goto out;
1390 filename = buf + (child->backing_file_offset - off);
1391 filename[len] = '\0';
1393 id->name = strdup(filename);
1394 id->drivertype = DISK_TYPE_AIO;
1395 err = 0;
1396 out:
1397 free(buf);
1398 return err;
1401 static int tdqcow_validate_parent(struct disk_driver *child,
1402 struct disk_driver *parent, td_flag_t flags)
1404 struct stat stats;
1405 uint64_t psize, csize;
1407 if (stat(parent->name, &stats))
1408 return -EINVAL;
1409 if (get_filesize(parent->name, &psize, &stats))
1410 return -EINVAL;
1412 if (stat(child->name, &stats))
1413 return -EINVAL;
1414 if (get_filesize(child->name, &csize, &stats))
1415 return -EINVAL;
1417 if (csize != psize)
1418 return -EINVAL;
1420 return 0;
1423 struct tap_disk tapdisk_qcow = {
1424 .disk_type = "tapdisk_qcow",
1425 .private_data_size = sizeof(struct tdqcow_state),
1426 .td_open = tdqcow_open,
1427 .td_queue_read = tdqcow_queue_read,
1428 .td_queue_write = tdqcow_queue_write,
1429 .td_submit = tdqcow_submit,
1430 .td_close = tdqcow_close,
1431 .td_do_callbacks = tdqcow_do_callbacks,
1432 .td_get_parent_id = tdqcow_get_parent_id,
1433 .td_validate_parent = tdqcow_validate_parent
1434 };