ia64/xen-unstable: tools/blktap/drivers/block-qcow.c @ 14799:380db34694d7

[TAPDISK] Fix qcow initialization bug.

fd_end was being incorrectly initialized upon open, leading to data
corruption.

Signed-off-by: Jake Wires <jwires@xensource.com>
Author: Jake Wires <jwires@xensource.com>
Date:   Tue Apr 10 11:40:43 2007 -0700
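The fix in brief: get_cluster_offset() treats fd_end as the allocation
frontier when it places new L2 tables and data clusters, so on open it
must point at the true end of the image. The corrected initialization,
as it appears in tdqcow_open() below, uses the first free byte past the
L1 table for a fresh image with no data clusters, and the physical end
of file otherwise:

	if (!final_cluster)
		s->fd_end = s->l1_table_offset + l1_table_size;
	else {
		s->fd_end = lseek64(fd, 0, SEEK_END);
		if (s->fd_end == (off64_t)-1)
			goto fail;
	}

A stale or too-small fd_end hands out cluster offsets that overlap live
data, which is the corruption this changeset addresses.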
/* block-qcow.c
 *
 * Asynchronous Qemu copy-on-write disk implementation.
 * Code based on the Qemu implementation
 * (see copyright notice below)
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 */

/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/statvfs.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <string.h>
#include <zlib.h>
#include <inttypes.h>
#include <libaio.h>
#include <openssl/md5.h>
#include "bswap.h"
#include "aes.h"
#include "tapdisk.h"
#if 1
#define ASSERT(_p) \
	if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
	__LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p) ((void)0)
#endif

#define ROUNDUP(l, s) \
({ \
	(uint64_t)( \
		(l + (s - 1)) - ((l + (s - 1)) % s)); \
})

/******AIO DEFINES******/
#define REQUEST_ASYNC_FD 1
#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)

struct pending_aio {
	td_callback_t cb;
	int id;
	void *private;
	int nb_sectors;
	char *buf;
	uint64_t sector;
};

#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

#define ZERO_TEST(_b) (_b | 0x00)
/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE 0x00
#define QCOW_CRYPT_AES  0x01

#define QCOW_OFLAG_COMPRESSED (1LL << 63)
#define SPARSE_FILE 0x01

#ifndef O_BINARY
#define O_BINARY 0
#endif

typedef struct QCowHeader {
	uint32_t magic;
	uint32_t version;
	uint64_t backing_file_offset;
	uint32_t backing_file_size;
	uint32_t mtime;
	uint64_t size; /* in bytes */
	uint8_t cluster_bits;
	uint8_t l2_bits;
	uint32_t crypt_method;
	uint64_t l1_table_offset;
} QCowHeader;

/*Extended header for Xen enhancements*/
typedef struct QCowHeader_ext {
	uint32_t xmagic;
	uint32_t cksum;
	uint32_t min_cluster_alloc;
	uint32_t flags;
} QCowHeader_ext;
#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/

struct tdqcow_state {
	int fd;                        /*Main Qcow file descriptor */
	uint64_t fd_end;               /*Store a local record of file length */
	char *name;                    /*Record of the filename*/
	uint32_t backing_file_size;
	uint64_t backing_file_offset;
	int encrypted;                 /*File contents are encrypted or plain*/
	int cluster_bits;              /*Determines length of cluster as
					*indicated by file hdr*/
	int cluster_size;              /*Length of cluster*/
	int cluster_sectors;           /*Number of sectors per cluster*/
	int cluster_alloc;             /*Blktap fix for allocating full
					*extents*/
	int min_cluster_alloc;         /*Blktap historical extent alloc*/
	int sparse;                    /*Indicates whether to preserve sparseness*/
	int l2_bits;                   /*Size of L2 table entry*/
	int l2_size;                   /*Full table size*/
	int l1_size;                   /*L1 table size*/
	uint64_t cluster_offset_mask;
	uint64_t l1_table_offset;      /*L1 table offset from beginning of
					*file*/
	uint64_t *l1_table;            /*L1 table entries*/
	uint64_t *l2_cache;            /*We maintain a cache of size
					*L2_CACHE_SIZE of most read entries*/
	uint64_t l2_cache_offsets[L2_CACHE_SIZE];  /*L2 cache entries*/
	uint32_t l2_cache_counts[L2_CACHE_SIZE];   /*Cache access record*/
	uint8_t *cluster_cache;
	uint8_t *cluster_data;
	uint8_t *sector_lock;          /*Locking bitmap for AIO reads/writes*/
	uint64_t cluster_cache_offset; /*Offset of currently cached
					*decompressed cluster*/
	uint32_t crypt_method;         /*current crypt method, 0 if no
					*key yet */
	uint32_t crypt_method_header;  /*crypt method from the file header*/
	AES_KEY aes_encrypt_key;       /*AES key*/
	AES_KEY aes_decrypt_key;       /*AES key*/
	/* libaio state */
	io_context_t       aio_ctx;
	struct iocb        iocb_list  [MAX_AIO_REQS];
	struct iocb       *iocb_free  [MAX_AIO_REQS];
	struct pending_aio pending_aio[MAX_AIO_REQS];
	int                iocb_free_count;
	struct iocb       *iocb_queue[MAX_AIO_REQS];
	int                iocb_queued;
	int                poll_fd;    /* NB: we require aio_poll support */
	struct io_event    aio_events[MAX_AIO_REQS];
};

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
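/*
 * Allocate per-disk AIO state: a sector-lock map sized to the disk,
 * the iocb free list, and a kernel AIO context whose completions are
 * delivered via a pollable fd (requires the aio-poll kernel patch).
 */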
static int init_aio_state(struct disk_driver *dd)
{
	int i;
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	long ioidx;

	/*Initialize Locking bitmap*/
	s->sector_lock = calloc(1, bs->size);

	if (!s->sector_lock) {
		DPRINTF("Failed to allocate sector lock\n");
		goto fail;
	}

	/* Initialize AIO */
	s->iocb_free_count = MAX_AIO_REQS;
	s->iocb_queued     = 0;

	/*Signal kernel to create Poll FD for Async completion events*/
	s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
	s->poll_fd = io_setup(MAX_AIO_REQS, &s->aio_ctx);

	if (s->poll_fd < 0) {
		if (s->poll_fd == -EAGAIN) {
			DPRINTF("Couldn't setup AIO context. If you are "
				"trying to concurrently use a large number "
				"of blktap-based disks, you may need to "
				"increase the system-wide aio request limit. "
				"(e.g. 'echo 1048576 > /proc/sys/fs/"
				"aio-max-nr')\n");
		} else {
			DPRINTF("Couldn't get fd for AIO poll support. This "
				"is probably because your kernel does not "
				"have the aio-poll patch applied.\n");
		}
		goto fail;
	}

	for (i=0;i<MAX_AIO_REQS;i++)
		s->iocb_free[i] = &s->iocb_list[i];

	DPRINTF("AIO state initialised\n");

	return 0;

fail:
	return -1;
}
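/*
 * Fold an MD5 digest of 'ptr' down to its first 32 bits; used as the
 * L1-table checksum in the Xen extended header. Returns 0 on failure,
 * so 0 doubles as "no valid checksum".
 */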
static uint32_t gen_cksum(char *ptr, int len)
{
	unsigned char *md;
	uint32_t ret;

	md = malloc(MD5_DIGEST_LENGTH);

	if(!md) return 0;

	if (MD5((unsigned char *)ptr, len, md) != md) {
		free(md);
		return 0;
	}

	memcpy(&ret, md, sizeof(uint32_t));
	free(md);
	return ret;
}
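/*
 * Report a file's size in 512-byte sectors: the virtual size from the
 * header for qcow images, BLKGETSIZE for block devices, st_size for
 * anything else.
 */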
static int get_filesize(char *filename, uint64_t *size, struct stat *st)
{
	int fd;
	QCowHeader header;

	/*Set to the backing file size*/
	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return -1;
	/* cast: read() returns ssize_t; avoid the unsigned promotion
	 * that would hide a -1 error return */
	if (read(fd, &header, sizeof(header)) < (ssize_t)sizeof(header)) {
		close(fd);
		return -1;
	}
	close(fd);

	be32_to_cpus(&header.magic);
	be64_to_cpus(&header.size);
	if (header.magic == QCOW_MAGIC) {
		*size = header.size >> SECTOR_SHIFT;
		return 0;
	}

	if(S_ISBLK(st->st_mode)) {
		fd = open(filename, O_RDONLY);
		if (fd < 0)
			return -1;
		if (ioctl(fd,BLKGETSIZE,size)!=0) {
			printf("Unable to get Block device size\n");
			close(fd);
			return -1;
		}
		close(fd);
	} else *size = (st->st_size >> SECTOR_SHIFT);
	return 0;
}
static int qcow_set_key(struct tdqcow_state *s, const char *key)
{
	uint8_t keybuf[16];
	int len, i;

	memset(keybuf, 0, 16);
	len = strlen(key);
	if (len > 16)
		len = 16;
	/* XXX: we could compress the chars to 7 bits to increase
	   entropy */
	for (i = 0; i < len; i++) {
		keybuf[i] = key[i];
	}
	s->crypt_method = s->crypt_method_header;

	if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
		return -1;
	if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
		return -1;
#if 0
	/* test */
	{
		uint8_t in[16];
		uint8_t out[16];
		uint8_t tmp[16];
		for (i=0; i<16; i++)
			in[i] = i;
		AES_encrypt(in, tmp, &s->aes_encrypt_key);
		AES_decrypt(tmp, out, &s->aes_decrypt_key);
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", tmp[i]);
		DPRINTF("\n");
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", out[i]);
		DPRINTF("\n");
	}
#endif
	return 0;
}
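/*
 * Stage a single pread/pwrite on the next free iocb, recording the
 * request details in the matching pending_aio slot for completion
 * handling. Nothing reaches the kernel until tdqcow_submit().
 */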
static int async_read(struct tdqcow_state *s, int size,
		      uint64_t offset, char *buf, td_callback_t cb,
		      int id, uint64_t sector, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size/512;
	pio->buf = buf;
	pio->sector = sector;

	io_prep_pread(io, s->fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

static int async_write(struct tdqcow_state *s, int size,
		       uint64_t offset, char *buf, td_callback_t cb,
		       int id, uint64_t sector, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size/512;
	pio->buf = buf;
	pio->sector = sector;

	io_prep_pwrite(io, s->fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}
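/*
 * Per-sector reference counts serialising in-flight AIO: the queue
 * paths only issue a request once every sector it touches is free.
 */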
/*TODO: Fix sector span!*/
static int aio_can_lock(struct tdqcow_state *s, uint64_t sector)
{
	return (s->sector_lock[sector] ? 0 : 1);
}

static int aio_lock(struct tdqcow_state *s, uint64_t sector)
{
	return ++s->sector_lock[sector];
}

static void aio_unlock(struct tdqcow_state *s, uint64_t sector)
{
	if (!s->sector_lock[sector]) return;

	--s->sector_lock[sector];
	return;
}
/*
 * The crypt function is compatible with the linux cryptoloop
 * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 * supported.
 */
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
			    uint8_t *out_buf, const uint8_t *in_buf,
			    int nb_sectors, int enc,
			    const AES_KEY *key)
{
	union {
		uint64_t ll[2];
		uint8_t b[16];
	} ivec;
	int i;

	for (i = 0; i < nb_sectors; i++) {
		ivec.ll[0] = cpu_to_le64(sector_num);
		ivec.ll[1] = 0;
		AES_cbc_encrypt(in_buf, out_buf, 512, key,
				ivec.b, enc);
		sector_num++;
		in_buf += 512;
		out_buf += 512;
	}
}
static int qtruncate(int fd, off_t length, int sparse)
{
	int ret, i;
	int current = 0, rem = 0;
	uint64_t sectors;
	struct stat st;
	char *buf;

	/* If length is greater than the current file len
	 * we synchronously write zeroes to the end of the
	 * file, otherwise we truncate the length down
	 */
	ret = fstat(fd, &st);
	if (ret == -1)
		return -1;
	if (S_ISBLK(st.st_mode))
		return 0;

	sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	rem     = st.st_size % DEFAULT_SECTOR_SIZE;

	/* If we are extending this file, we write zeros to the end --
	 * this tries to ensure that the extents allocated wind up being
	 * contiguous on disk.
	 */
	if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
		/*We are extending the file*/
		if ((ret = posix_memalign((void **)&buf,
					  512, DEFAULT_SECTOR_SIZE))) {
			DPRINTF("posix_memalign failed: %d\n", ret);
			return -1;
		}
		memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
		if (lseek(fd, 0, SEEK_END)==-1) {
			DPRINTF("Lseek EOF failed (%d), internal error\n",
				errno);
			free(buf);
			return -1;
		}
		if (rem) {
			ret = write(fd, buf, rem);
			if (ret != rem) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		for (i = current; i < sectors; i++ ) {
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
			if (ret != DEFAULT_SECTOR_SIZE) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		free(buf);
	} else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
		if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
			DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
			return -1;
		}
	return 0;
}
/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 */
static uint64_t get_cluster_offset(struct tdqcow_state *s,
				   uint64_t offset, int allocate,
				   int compressed_size,
				   int n_start, int n_end)
{
	int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
	char *tmp_ptr, *tmp_ptr2, *l2_ptr, *l1_ptr;
	uint64_t l2_offset, *l2_table, cluster_offset, tmp;
	uint32_t min_count;
	int new_l2_table;

	/*Check L1 table for the extent offset*/
	l1_index = offset >> (s->l2_bits + s->cluster_bits);
	l2_offset = s->l1_table[l1_index];
	new_l2_table = 0;
	if (!l2_offset) {
		if (!allocate)
			return 0;
		/*
		 * allocating a new l2 entry + extent
		 * at the end of the file, we must also
		 * update the L1 entry safely.
		 */
		l2_offset = s->fd_end;

		/* round to cluster size */
		l2_offset = (l2_offset + s->cluster_size - 1)
			& ~(s->cluster_size - 1);

		/* update the L1 entry */
		s->l1_table[l1_index] = l2_offset;
		tmp = cpu_to_be64(l2_offset);

		/*Truncate file for L2 table
		 *(initialised to zero in case we crash)*/
		if (qtruncate(s->fd,
			      l2_offset + (s->l2_size * sizeof(uint64_t)),
			      s->sparse) != 0) {
			DPRINTF("ERROR truncating file\n");
			return 0;
		}
		s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));

		/*Update the L1 table entry on disk
		 * (for O_DIRECT we write 4KByte blocks)*/
		l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
		l1_ptr = (char *)s->l1_table + (l1_sector << 12);

		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L1 table\n");
			return 0;
		}
		memcpy(tmp_ptr, l1_ptr, 4096);

		/*
		 * Issue non-asynchronous L1 write.
		 * For safety, we must ensure that
		 * entry is written before blocks.
		 */
		lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
		if (write(s->fd, tmp_ptr, 4096) != 4096) {
			free(tmp_ptr);
			return 0;
		}
		free(tmp_ptr);

		new_l2_table = 1;
		goto cache_miss;
	} else if (s->min_cluster_alloc == s->l2_size) {
		/*Fast-track the request*/
		cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
		l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
		return cluster_offset + (l2_index * s->cluster_size);
	}
	/*Check to see if L2 entry is already cached*/
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (l2_offset == s->l2_cache_offsets[i]) {
			/* increment the hit count */
			if (++s->l2_cache_counts[i] == 0xffffffff) {
				for (j = 0; j < L2_CACHE_SIZE; j++) {
					s->l2_cache_counts[j] >>= 1;
				}
			}
			l2_table = s->l2_cache + (i << s->l2_bits);
			goto found;
		}
	}

cache_miss:
	/* not found: load a new entry in the least used one */
	min_index = 0;
	min_count = 0xffffffff;
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (s->l2_cache_counts[i] < min_count) {
			min_count = s->l2_cache_counts[i];
			min_index = i;
		}
	}
	l2_table = s->l2_cache + (min_index << s->l2_bits);

	/*If extent pre-allocated, read table from disk,
	 *otherwise write new table to disk*/
	if (new_l2_table) {
		/*Should we allocate the whole extent? Adjustable parameter.*/
		if (s->cluster_alloc == s->l2_size) {
			cluster_offset = l2_offset +
				(s->l2_size * sizeof(uint64_t));
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			if (qtruncate(s->fd, cluster_offset +
				      (s->cluster_size * s->l2_size),
				      s->sparse) != 0) {
				DPRINTF("ERROR truncating file\n");
				return 0;
			}
			s->fd_end = cluster_offset +
				(s->cluster_size * s->l2_size);
			for (i = 0; i < s->l2_size; i++) {
				l2_table[i] = cpu_to_be64(cluster_offset +
							  (i*s->cluster_size));
			}
		} else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));

		lseek(s->fd, l2_offset, SEEK_SET);
		if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	} else {
		lseek(s->fd, l2_offset, SEEK_SET);
		if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	}

	/*Update the cache entries*/
	s->l2_cache_offsets[min_index] = l2_offset;
	s->l2_cache_counts[min_index] = 1;

found:
	/*The extent is split into 's->l2_size' blocks of
	 *size 's->cluster_size'*/
	l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
	cluster_offset = be64_to_cpu(l2_table[l2_index]);

	if (!cluster_offset ||
	    ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
		if (!allocate)
			return 0;

		if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
		    (n_end - n_start) < s->cluster_sectors) {
			/* cluster is already allocated but compressed, we must
			   decompress it in the case it is not completely
			   overwritten */
			if (decompress_cluster(s, cluster_offset) < 0)
				return 0;
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			/* write the cluster content - not asynchronous */
			lseek(s->fd, cluster_offset, SEEK_SET);
			if (write(s->fd, s->cluster_cache, s->cluster_size) !=
			    s->cluster_size)
				return -1;
		} else {
			/* allocate a new cluster */
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			if (allocate == 1) {
				/* round to cluster size */
				cluster_offset =
					(cluster_offset + s->cluster_size - 1)
					& ~(s->cluster_size - 1);
				if (qtruncate(s->fd, cluster_offset +
					      s->cluster_size, s->sparse)!=0) {
					DPRINTF("ERROR truncating file\n");
					return 0;
				}
				s->fd_end = (cluster_offset + s->cluster_size);
				/* if encrypted, we must initialize the cluster
				   content which won't be written */
				if (s->crypt_method &&
				    (n_end - n_start) < s->cluster_sectors) {
					uint64_t start_sect;
					start_sect = (offset &
						      ~(s->cluster_size - 1))
							>> 9;
					memset(s->cluster_data + 512,
					       0xaa, 512);
					for (i = 0; i < s->cluster_sectors;i++)
					{
						if (i < n_start || i >= n_end)
						{
							encrypt_sectors(s, start_sect + i,
									s->cluster_data,
									s->cluster_data + 512, 1, 1,
									&s->aes_encrypt_key);
							lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
							if (write(s->fd, s->cluster_data, 512) != 512)
								return -1;
						}
					}
				}
			} else {
				cluster_offset |= QCOW_OFLAG_COMPRESSED |
					(uint64_t)compressed_size
						<< (63 - s->cluster_bits);
			}
		}
		/* update L2 table */
		tmp = cpu_to_be64(cluster_offset);
		l2_table[l2_index] = tmp;

		/*For IO_DIRECT we write 4KByte blocks*/
		l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
		l2_ptr = (char *)l2_table + (l2_sector << 12);

		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L2 table\n");
			return 0;
		}
		memcpy(tmp_ptr2, l2_ptr, 4096);
		lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
		write(s->fd, tmp_ptr2, 4096);
		free(tmp_ptr2);
	}
	return cluster_offset;
}
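/*
 * Warm the L2 cache at open time: probe cluster offsets across the
 * disk until up to L2_CACHE_SIZE table entries have been faulted in.
 */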
static void init_cluster_cache(struct disk_driver *dd)
{
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	uint32_t count = 0;
	int i, cluster_entries;

	cluster_entries = s->cluster_size / 512;
	DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
		cluster_entries, s->cluster_size);

	for (i = 0; i < bs->size; i += cluster_entries) {
		/* cast before shifting to avoid int overflow on big disks */
		if (get_cluster_offset(s, (uint64_t)i << 9, 0, 0, 0, 1))
			count++;
		if (count >= L2_CACHE_SIZE) return;
	}
	DPRINTF("Finished cluster initialisation, added %d entries\n", count);
	return;
}
static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
			     int nb_sectors, int *pnum)
{
	int index_in_cluster, n;
	uint64_t cluster_offset;

	cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
	index_in_cluster = sector_num & (s->cluster_sectors - 1);
	n = s->cluster_sectors - index_in_cluster;
	if (n > nb_sectors)
		n = nb_sectors;
	*pnum = n;
	return (cluster_offset != 0);
}
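/*
 * Helpers for QCOW_OFLAG_COMPRESSED clusters: inflate a raw zlib
 * stream into the one-entry cluster_cache, keyed by
 * cluster_cache_offset so repeated reads avoid re-inflating.
 */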
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
			     const uint8_t *buf, int buf_size)
{
	z_stream strm1, *strm = &strm1;
	int ret, out_len;

	memset(strm, 0, sizeof(*strm));

	strm->next_in = (uint8_t *)buf;
	strm->avail_in = buf_size;
	strm->next_out = out_buf;
	strm->avail_out = out_buf_size;

	ret = inflateInit2(strm, -12);
	if (ret != Z_OK)
		return -1;
	ret = inflate(strm, Z_FINISH);
	out_len = strm->next_out - out_buf;
	if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
	     (out_len != out_buf_size) ) {
		inflateEnd(strm);
		return -1;
	}
	inflateEnd(strm);
	return 0;
}

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
{
	int ret, csize;
	uint64_t coffset;

	coffset = cluster_offset & s->cluster_offset_mask;
	if (s->cluster_cache_offset != coffset) {
		csize = cluster_offset >> (63 - s->cluster_bits);
		csize &= (s->cluster_size - 1);
		lseek(s->fd, coffset, SEEK_SET);
		ret = read(s->fd, s->cluster_data, csize);
		if (ret != csize)
			return -1;
		if (decompress_buffer(s->cluster_cache, s->cluster_size,
				      s->cluster_data, csize) < 0) {
			return -1;
		}
		s->cluster_cache_offset = coffset;
	}
	return 0;
}
static inline void init_fds(struct disk_driver *dd)
{
	int i;
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;

	for(i = 0; i < MAX_IOFD; i++)
		dd->io_fd[i] = 0;

	dd->io_fd[0] = s->poll_fd;
}
/* Open the disk file and initialize qcow state. */
int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
{
	int fd, len, i, shift, ret, size, l1_table_size, o_flags;
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	char *buf;
	QCowHeader *header;
	QCowHeader_ext *exthdr;
	uint32_t cksum;
	uint64_t final_cluster = 0;

	DPRINTF("QCOW: Opening %s\n",name);

	o_flags = O_DIRECT | O_LARGEFILE |
		((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
	fd = open(name, o_flags);
	if (fd < 0) {
		DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
		return -1;
	}

	s->fd = fd;
	asprintf(&s->name,"%s", name);

	ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);

	ret = posix_memalign((void **)&buf, 512, 512);
	if (ret != 0) goto fail;

	if (read(fd, buf, 512) != 512)
		goto fail;

	header = (QCowHeader *)buf;
	be32_to_cpus(&header->magic);
	be32_to_cpus(&header->version);
	be64_to_cpus(&header->backing_file_offset);
	be32_to_cpus(&header->backing_file_size);
	be32_to_cpus(&header->mtime);
	be64_to_cpus(&header->size);
	be32_to_cpus(&header->crypt_method);
	be64_to_cpus(&header->l1_table_offset);

	if (header->magic != QCOW_MAGIC || header->version > QCOW_VERSION)
		goto fail;
	if (header->size <= 1 || header->cluster_bits < 9)
		goto fail;
	if (header->crypt_method > QCOW_CRYPT_AES)
		goto fail;
	s->crypt_method_header = header->crypt_method;
	if (s->crypt_method_header)
		s->encrypted = 1;
	s->cluster_bits = header->cluster_bits;
	s->cluster_size = 1 << s->cluster_bits;
	s->cluster_sectors = 1 << (s->cluster_bits - 9);
	s->l2_bits = header->l2_bits;
	s->l2_size = 1 << s->l2_bits;
	s->cluster_alloc = s->l2_size;
	bs->size = header->size / 512;
	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
	s->backing_file_offset = header->backing_file_offset;
	s->backing_file_size   = header->backing_file_size;

	/* read the level 1 table */
	shift = s->cluster_bits + s->l2_bits;
	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;

	s->l1_table_offset = header->l1_table_offset;

	/*allocate a 4Kbyte multiple of memory*/
	l1_table_size = s->l1_size * sizeof(uint64_t);
	if (l1_table_size % 4096 > 0) {
		l1_table_size = ((l1_table_size >> 12) + 1) << 12;
	}
	ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
	if (ret != 0) goto fail;

	memset(s->l1_table, 0x00, l1_table_size);

	DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
		(long long)s->l1_table_offset,
		(int) (s->l1_size * sizeof(uint64_t)),
		l1_table_size);
	lseek(fd, s->l1_table_offset, SEEK_SET);
	if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
		goto fail;

	for(i = 0; i < s->l1_size; i++) {
		//be64_to_cpus(&s->l1_table[i]);
		//DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
		if (s->l1_table[i] > final_cluster)
			final_cluster = s->l1_table[i];
	}

	/* alloc L2 cache */
	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
	if(ret != 0) goto fail;

	size = s->cluster_size;
	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
	if(ret != 0) goto fail;

	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
	if(ret != 0) goto fail;
	s->cluster_cache_offset = -1;

	if (s->backing_file_offset != 0)
		s->cluster_alloc = 1; /*Cannot use pre-alloc*/

	bs->sector_size = 512;
	bs->info = 0;

	/*Detect min_cluster_alloc*/
	s->min_cluster_alloc = 1; /*Default*/
	if (s->backing_file_offset == 0 && s->l1_table_offset % 4096 == 0) {
		/*We test to see if the xen magic # exists*/
		exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
		be32_to_cpus(&exthdr->xmagic);
		if(exthdr->xmagic != XEN_MAGIC)
			goto end_xenhdr;

		/*Finally check the L1 table cksum*/
		be32_to_cpus(&exthdr->cksum);
		cksum = gen_cksum((char *)s->l1_table,
				  s->l1_size * sizeof(uint64_t));
		if(exthdr->cksum != cksum)
			goto end_xenhdr;

		be32_to_cpus(&exthdr->min_cluster_alloc);
		be32_to_cpus(&exthdr->flags);
		s->sparse = (exthdr->flags & SPARSE_FILE);
		s->min_cluster_alloc = exthdr->min_cluster_alloc;
	}

end_xenhdr:
	if (init_aio_state(dd)!=0) {
		DPRINTF("Unable to initialise AIO state\n");
		goto fail;
	}
	init_fds(dd);

	/* A fresh image has no data clusters yet, so the first free byte
	 * is right after the L1 table; otherwise use the real end of
	 * file as the allocation frontier. */
	if (!final_cluster)
		s->fd_end = s->l1_table_offset + l1_table_size;
	else {
		s->fd_end = lseek64(fd, 0, SEEK_END);
		if (s->fd_end == (off64_t)-1)
			goto fail;
	}

	return 0;

fail:
	DPRINTF("QCOW Open failed\n");
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(fd);
	return -1;
}
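/*
 * Read path: split the request into per-cluster chunks. Unallocated
 * clusters are reported as BLK_NOT_ALLOCATED so a parent image can
 * service them, compressed clusters are inflated synchronously, and
 * normal clusters are queued as AIO.
 */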
int tdqcow_queue_read(struct disk_driver *dd, uint64_t sector,
		      int nb_sectors, char *buf, td_callback_t cb,
		      int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	int ret = 0, index_in_cluster, n, i, rsp = 0;
	uint64_t cluster_offset, sec, nr_secs;

	sec     = sector;
	nr_secs = nb_sectors;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		cluster_offset =
			get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

		if(!cluster_offset) {
			aio_unlock(s, sector);
			ret = cb(dd, BLK_NOT_ALLOCATED,
				 sector, n, id, private);
			if (ret == -EBUSY) {
				/* mark remainder of request
				 * as busy and try again later */
				return cb(dd, -EBUSY, sector + n,
					  nb_sectors - n, id, private);
			} else
				rsp += ret;
		} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
			aio_unlock(s, sector);
			if (decompress_cluster(s, cluster_offset) < 0) {
				rsp += cb(dd, -EIO, sector,
					  nb_sectors, id, private);
				goto done;
			}
			memcpy(buf, s->cluster_cache + index_in_cluster * 512,
			       512 * n);
			rsp += cb(dd, 0, sector, n, id, private);
		} else {
			async_read(s, n * 512,
				   (cluster_offset + index_in_cluster * 512),
				   buf, cb, id, sector, private);
		}
		nb_sectors -= n;
		sector     += n;
		buf        += n * 512;
	}
done:
	return rsp;
}
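/*
 * Write path: split the request into per-cluster chunks, allocating
 * clusters via get_cluster_offset() and encrypting through
 * cluster_data first when a key is set.
 */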
int tdqcow_queue_write(struct disk_driver *dd, uint64_t sector,
		       int nb_sectors, char *buf, td_callback_t cb,
		       int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	int ret = 0, index_in_cluster, n, i;
	uint64_t cluster_offset, sec, nr_secs;

	sec     = sector;
	nr_secs = nb_sectors;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

		cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
						    index_in_cluster,
						    index_in_cluster+n);
		if (!cluster_offset) {
			DPRINTF("Ooops, no write cluster offset!\n");
			aio_unlock(s, sector);
			return cb(dd, -EIO, sector, nb_sectors, id, private);
		}

		if (s->crypt_method) {
			encrypt_sectors(s, sector, s->cluster_data,
					(unsigned char *)buf, n, 1,
					&s->aes_encrypt_key);
			async_write(s, n * 512,
				    (cluster_offset + index_in_cluster*512),
				    (char *)s->cluster_data, cb, id, sector,
				    private);
		} else {
			async_write(s, n * 512,
				    (cluster_offset + index_in_cluster*512),
				    buf, cb, id, sector, private);
		}

		nb_sectors -= n;
		sector     += n;
		buf        += n * 512;
	}
	s->cluster_cache_offset = -1; /* disable compressed cache */

	return 0;
}
int tdqcow_submit(struct disk_driver *dd)
{
	int ret;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (!prv->iocb_queued)
		return 0;

	ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);

	/* XXX: TODO: Handle error conditions here. */

	/* Success case: */
	prv->iocb_queued = 0;

	return 0;
}
int tdqcow_close(struct disk_driver *dd)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	uint32_t cksum, out;
	int fd, offset;

	/*Update the hdr cksum*/
	if(s->min_cluster_alloc == s->l2_size) {
		cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
		printf("Writing cksum: %u\n", cksum);
		fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
		offset = sizeof(QCowHeader) + sizeof(uint32_t);
		lseek(fd, offset, SEEK_SET);
		out = cpu_to_be32(cksum);
		write(fd, &out, sizeof(uint32_t));
		close(fd);
	}

	io_destroy(s->aio_ctx);
	free(s->name);
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(s->fd);
	return 0;
}
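/*
 * Reap completed AIO events without blocking, decrypting read data in
 * place when the image is encrypted, then fire the stored callbacks
 * and recycle the iocbs.
 */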
int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
{
	int ret, i, rsp = 0, *ptr;
	struct io_event *ep;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (sid > MAX_IOFD) return 1;

	/* Non-blocking test for completed io. */
	ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
			   NULL);

	for (ep = prv->aio_events, i = ret; i-- > 0; ep++) {
		struct iocb        *io = ep->obj;
		struct pending_aio *pio;

		pio = &prv->pending_aio[(long)io->data];

		aio_unlock(prv, pio->sector);

		if (prv->crypt_method)
			encrypt_sectors(prv, pio->sector,
					(unsigned char *)pio->buf,
					(unsigned char *)pio->buf,
					pio->nb_sectors, 0,
					&prv->aes_decrypt_key);

		rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
			       pio->sector, pio->nb_sectors,
			       pio->id, pio->private);

		prv->iocb_free[prv->iocb_free_count++] = io;
	}
	return rsp;
}
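/*
 * Create a new image: QCOW header, Xen extended header (magic, L1
 * checksum, min_cluster_alloc, sparseness flag), optional backing
 * file name, and a zeroed L1 table. With a backing file, 512-byte
 * clusters avoid copying unmodified sectors; otherwise 4 KB clusters
 * are used.
 */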
int qcow_create(const char *filename, uint64_t total_size,
		const char *backing_file, int sparse)
{
	int fd, header_size, backing_filename_len, l1_size, i;
	int shift, length, adjust, flags = 0, ret = 0;
	QCowHeader header;
	QCowHeader_ext exthdr;
	char backing_filename[1024], *ptr;
	uint64_t tmp, size, total_length;
	struct stat st;

	DPRINTF("Qcow_create: size %llu\n",(long long unsigned)total_size);

	fd = open(filename,
		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
		  0644);
	if (fd < 0)
		return -1;

	memset(&header, 0, sizeof(header));
	header.magic = cpu_to_be32(QCOW_MAGIC);
	header.version = cpu_to_be32(QCOW_VERSION);

	/*Create extended header fields*/
	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);

	header_size = sizeof(header) + sizeof(QCowHeader_ext);
	backing_filename_len = 0;
	size = (total_size >> SECTOR_SHIFT);
	if (backing_file) {
		if (strcmp(backing_file, "fat:")) {
			const char *p;
			/* XXX: this is a hack: we do not attempt to
			 *check for URL like syntax */
			p = strchr(backing_file, ':');
			if (p && (p - backing_file) >= 2) {
				/* URL like but exclude "c:" like filenames */
				strncpy(backing_filename, backing_file,
					sizeof(backing_filename));
			} else {
				realpath(backing_file, backing_filename);
				if (stat(backing_filename, &st) != 0) {
					return -1;
				}
			}
			header.backing_file_offset = cpu_to_be64(header_size);
			backing_filename_len = strlen(backing_filename);
			header.backing_file_size = cpu_to_be32(
				backing_filename_len);
			header_size += backing_filename_len;

			/*Set to the backing file size*/
			if(get_filesize(backing_filename, &size, &st)) {
				return -1;
			}
			DPRINTF("Backing file size detected: %lld sectors "
				"(total %lld [%lld MB])\n",
				(long long)size,
				(long long)(size << SECTOR_SHIFT),
				(long long)(size >> 11));
		} else {
			backing_file = NULL;
			DPRINTF("Setting file size: %lld (total %lld)\n",
				(long long) total_size,
				(long long) (total_size << SECTOR_SHIFT));
		}
		header.mtime = cpu_to_be32(st.st_mtime);
		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
					    unmodified sectors */
		header.l2_bits = 12; /* 32 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1);
	} else {
		DPRINTF("Setting file size: %lld sectors "
			"(total %lld [%lld MB])\n",
			(long long) size,
			(long long) (size << SECTOR_SHIFT),
			(long long) (size >> 11));
		header.cluster_bits = 12; /* 4 KB clusters */
		header.l2_bits = 9; /* 4 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
	}
	/*Set the header size value*/
	header.size = cpu_to_be64(size * 512);

	header_size = (header_size + 7) & ~7;
	if (header_size % 4096 > 0) {
		header_size = ((header_size >> 12) + 1) << 12;
	}

	shift = header.cluster_bits + header.l2_bits;
	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;

	header.l1_table_offset = cpu_to_be64(header_size);
	DPRINTF("L1 Table offset: %d, size %d\n",
		header_size,
		(int)(l1_size * sizeof(uint64_t)));
	header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);

	ptr = calloc(1, l1_size * sizeof(uint64_t));
	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
	printf("Created cksum: %d\n",exthdr.cksum);
	free(ptr);

	/*adjust file length to system page size boundary*/
	length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
		getpagesize());
	if (qtruncate(fd, length, 0)!=0) {
		DPRINTF("ERROR truncating file\n");
		return -1;
	}

	if (sparse == 0) {
		/*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
		total_length = length + (l1_size * (1 << 9)) + (size * 512);
		if (qtruncate(fd, total_length, 0)!=0) {
			DPRINTF("ERROR truncating file\n");
			return -1;
		}
		printf("File truncated to length %"PRIu64"\n",total_length);
	} else
		flags = SPARSE_FILE;

	exthdr.flags = cpu_to_be32(flags);

	/* write all the data */
	lseek(fd, 0, SEEK_SET);
	ret += write(fd, &header, sizeof(header));
	ret += write(fd, &exthdr, sizeof(exthdr));
	if (backing_file)
		ret += write(fd, backing_filename, backing_filename_len);

	lseek(fd, header_size, SEEK_SET);
	tmp = 0;
	for (i = 0;i < l1_size; i++) {
		ret += write(fd, &tmp, sizeof(tmp));
	}

	close(fd);

	return 0;
}
int qcow_make_empty(struct tdqcow_state *s)
{
	uint32_t l1_length = s->l1_size * sizeof(uint64_t);

	memset(s->l1_table, 0, l1_length);
	lseek(s->fd, s->l1_table_offset, SEEK_SET);
	if (write(s->fd, s->l1_table, l1_length) < 0)
		return -1;
	if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
		DPRINTF("ERROR truncating file\n");
		return -1;
	}

	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));

	return 0;
}
int qcow_get_cluster_size(struct tdqcow_state *s)
{
	return s->cluster_size;
}
/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
			  const uint8_t *buf)
{
	z_stream strm;
	int ret, out_len;
	uint8_t *out_buf;
	uint64_t cluster_offset;

	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!out_buf)
		return -1;

	/* best compression, small window, no zlib header */
	memset(&strm, 0, sizeof(strm));
	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			   Z_DEFLATED, -12,
			   9, Z_DEFAULT_STRATEGY);
	if (ret != 0) {
		free(out_buf);
		return -1;
	}

	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	strm.avail_out = s->cluster_size;
	strm.next_out = out_buf;

	ret = deflate(&strm, Z_FINISH);
	if (ret != Z_STREAM_END && ret != Z_OK) {
		free(out_buf);
		deflateEnd(&strm);
		return -1;
	}
	out_len = strm.next_out - out_buf;

	deflateEnd(&strm);

	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
						    out_len, 0, 0);
		cluster_offset &= s->cluster_offset_mask;
		lseek(s->fd, cluster_offset, SEEK_SET);
		if (write(s->fd, out_buf, out_len) != out_len) {
			free(out_buf);
			return -1;
		}
	}

	free(out_buf);
	return 0;
}
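/*
 * Recover the backing file name from the image (read 512-byte
 * aligned, since the fd is O_DIRECT) and return it as this disk's
 * parent; TD_NO_PARENT if none is recorded.
 */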
int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
{
	off_t off;
	char *buf, *filename;
	int len, secs, err = -EINVAL;
	struct tdqcow_state *child = (struct tdqcow_state *)dd->private;

	if (!child->backing_file_offset)
		return TD_NO_PARENT;

	/* read the backing file name */
	len  = child->backing_file_size;
	off  = child->backing_file_offset - (child->backing_file_offset % 512);
	secs = (len + (child->backing_file_offset - off) + 511) >> 9;

	if (posix_memalign((void **)&buf, 512, secs << 9))
		return -1;

	if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
		goto out;

	if (read(child->fd, buf, secs << 9) != secs << 9)
		goto out;
	filename      = buf + (child->backing_file_offset - off);
	filename[len] = '\0';

	id->name       = strdup(filename);
	id->drivertype = DISK_TYPE_QCOW;
	err            = 0;
out:
	free(buf);
	return err;
}
int tdqcow_validate_parent(struct disk_driver *child,
			   struct disk_driver *parent, td_flag_t flags)
{
	struct stat stats;
	uint64_t psize, csize;
	struct tdqcow_state *c = (struct tdqcow_state *)child->private;
	struct tdqcow_state *p = (struct tdqcow_state *)parent->private;

	if (stat(p->name, &stats))
		return -EINVAL;
	if (get_filesize(p->name, &psize, &stats))
		return -EINVAL;

	if (stat(c->name, &stats))
		return -EINVAL;
	if (get_filesize(c->name, &csize, &stats))
		return -EINVAL;

	if (csize != psize)
		return -EINVAL;

	return 0;
}
struct tap_disk tapdisk_qcow = {
	.disk_type          = "tapdisk_qcow",
	.private_data_size  = sizeof(struct tdqcow_state),
	.td_open            = tdqcow_open,
	.td_queue_read      = tdqcow_queue_read,
	.td_queue_write     = tdqcow_queue_write,
	.td_submit          = tdqcow_submit,
	.td_close           = tdqcow_close,
	.td_do_callbacks    = tdqcow_do_callbacks,
	.td_get_parent_id   = tdqcow_get_parent_id,
	.td_validate_parent = tdqcow_validate_parent
};