ia64/xen-unstable: tools/blktap/drivers/block-qcow.c @ 13044:a05fefbeb19f

Add sparseness flag to qcow-create.

For environments where space must be guaranteed in advance,
use the -p flag to remove sparseness from the qcow file.

Signed-off-by: Julian Chesterfield <julian@xensource.com>
author Julian Chesterfield <julian@xensource.com>
date Thu Dec 14 20:23:07 2006 +0000 (2006-12-14)
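
As a rough illustration (not part of the change itself): the sparseness flag
ends up in the 'flags' argument of qcow_create() defined in this file, so the
two modes correspond to calls like the following, where the filename and the
1 GiB size are made-up values:

    /* sparse image: clusters are materialised lazily as they are written */
    qcow_create("disk.qcow", 1ULL << 30, NULL, QCOW_SPARSE_FILE);

    /* preallocated (qcow-create -p): the file is zero-extended to its full
     * length up front, so the space is guaranteed */
    qcow_create("disk.qcow", 1ULL << 30, NULL, 0);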
/* block-qcow.c
 *
 * Asynchronous Qemu copy-on-write disk implementation.
 * Code based on the Qemu implementation
 * (see copyright notice below)
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 */

/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/statvfs.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <string.h>
#include <zlib.h>
#include <inttypes.h>
#include <libaio.h>
#include <openssl/md5.h>
#include "bswap.h"
#include "aes.h"
#include "tapdisk.h"

#if 1
#define ASSERT(_p) \
	if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
	__LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p) ((void)0)
#endif

/******AIO DEFINES******/
#define REQUEST_ASYNC_FD 1
#define MAX_QCOW_IDS  0xFFFF
#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)

struct pending_aio {
	td_callback_t cb;
	int id;
	void *private;
	int nb_sectors;
	char *buf;
	uint64_t sector;
	int qcow_idx;
};

#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

#define ZERO_TEST(_b) (_b | 0x00)

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE  0x00
#define QCOW_CRYPT_AES   0x01
#define QCOW_SPARSE_FILE 0x02

#define QCOW_OFLAG_COMPRESSED (1LL << 63)

#ifndef O_BINARY
#define O_BINARY 0
#endif

typedef struct QCowHeader {
	uint32_t magic;
	uint32_t version;
	uint64_t backing_file_offset;
	uint32_t backing_file_size;
	uint32_t mtime;
	uint64_t size; /* in bytes */
	uint8_t  cluster_bits;
	uint8_t  l2_bits;
	uint32_t crypt_method;
	uint64_t l1_table_offset;
} QCowHeader;

/*Extended header for Xen enhancements*/
typedef struct QCowHeader_ext {
	uint32_t xmagic;
	uint32_t cksum;
	uint32_t min_cluster_alloc;
	uint32_t flags;
} QCowHeader_ext;

#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/

struct tdqcow_state {
	int fd;                   /*Main Qcow file descriptor */
	uint64_t fd_end;          /*Store a local record of file length */
	int bfd;                  /*Backing file descriptor*/
	char *name;               /*Record of the filename*/
	int poll_pipe[2];         /*dummy fd for polling on */
	int encrypted;            /*File contents are encrypted or plain*/
	int cluster_bits;         /*Determines length of cluster as
				   *indicated by file hdr*/
	int cluster_size;         /*Length of cluster*/
	int cluster_sectors;      /*Number of sectors per cluster*/
	int cluster_alloc;        /*Blktap fix for allocating full
				   *extents*/
	int min_cluster_alloc;    /*Blktap historical extent alloc*/
	int sparse;               /*Indicates whether to preserve sparseness*/
	int l2_bits;              /*Size of L2 table entry*/
	int l2_size;              /*Full table size*/
	int l1_size;              /*L1 table size*/
	uint64_t cluster_offset_mask;
	uint64_t l1_table_offset; /*L1 table offset from beginning of
				   *file*/
	uint64_t *l1_table;       /*L1 table entries*/
	uint64_t *l2_cache;       /*We maintain a cache of size
				   *L2_CACHE_SIZE of most read entries*/
	uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/
	uint32_t l2_cache_counts[L2_CACHE_SIZE];  /*Cache access record*/
	uint8_t *cluster_cache;
	uint8_t *cluster_data;
	uint8_t *sector_lock;     /*Locking bitmap for AIO reads/writes*/
	uint64_t cluster_cache_offset;
	uint32_t crypt_method;    /*current crypt method, 0 if no
				   *key yet */
	uint32_t crypt_method_header;
	AES_KEY aes_encrypt_key;  /*AES key*/
	AES_KEY aes_decrypt_key;  /*AES key*/
	/* libaio state */
	io_context_t       aio_ctx;
	int                nr_reqs[MAX_QCOW_IDS];
	struct iocb        iocb_list[MAX_AIO_REQS];
	struct iocb       *iocb_free[MAX_AIO_REQS];
	struct pending_aio pending_aio[MAX_AIO_REQS];
	int                iocb_free_count;
	struct iocb       *iocb_queue[MAX_AIO_REQS];
	int                iocb_queued;
	int                poll_fd; /* NB: we require aio_poll support */
	struct io_event    aio_events[MAX_AIO_REQS];
};

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);

static int init_aio_state(struct td_state *bs)
{
	int i;
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;

	/*Initialize Locking bitmap*/
	s->sector_lock = calloc(1, bs->size);
	if (!s->sector_lock) {
		DPRINTF("Failed to allocate sector lock\n");
		goto fail;
	}

	/* Initialize AIO */
	s->iocb_free_count = MAX_AIO_REQS;
	s->iocb_queued     = 0;

	/*Signal kernel to create Poll FD for Async completion events*/
	s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
	s->poll_fd = io_setup(MAX_AIO_REQS, &s->aio_ctx);

	if (s->poll_fd < 0) {
		if (s->poll_fd == -EAGAIN) {
			DPRINTF("Couldn't setup AIO context. If you are "
				"trying to concurrently use a large number "
				"of blktap-based disks, you may need to "
				"increase the system-wide aio request limit. "
				"(e.g. 'echo 1048576 > /proc/sys/fs/"
				"aio-max-nr')\n");
		} else {
			DPRINTF("Couldn't get fd for AIO poll support. This "
				"is probably because your kernel does not "
				"have the aio-poll patch applied.\n");
		}
		goto fail;
	}

	for (i = 0; i < MAX_AIO_REQS; i++)
		s->iocb_free[i] = &s->iocb_list[i];
	for (i = 0; i < MAX_QCOW_IDS; i++)
		s->nr_reqs[i] = 0;
	DPRINTF("AIO state initialised\n");

	return 0;

fail:
	return -1;
}

/*
 * Test if block is zero.
 * Return:
 *  1 for TRUE
 *  0 for FALSE
 */
static inline int IS_ZERO(char *buf, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		/*if not zero, return false*/
		if (ZERO_TEST(*(buf + i))) return 0;
	}
	return 1;
}

static uint32_t gen_cksum(char *ptr, int len)
{
	unsigned char *md;
	uint32_t ret;

	md = malloc(MD5_DIGEST_LENGTH);
	if (!md) return 0;

	if (MD5((unsigned char *)ptr, len, md) != md) {
		free(md);
		return 0;
	}

	memcpy(&ret, md, sizeof(uint32_t));
	free(md);
	return ret;
}

static int get_filesize(char *filename, uint64_t *size, struct stat *st)
{
	int blockfd;

	/*Set to the backing file size*/
	if (S_ISBLK(st->st_mode)) {
		blockfd = open(filename, O_RDONLY);
		if (blockfd < 0)
			return -1;
		if (ioctl(blockfd, BLKGETSIZE, size) != 0) {
			printf("Unable to get Block device size\n");
			close(blockfd);
			return -1;
		}
		close(blockfd);
	} else *size = (st->st_size >> SECTOR_SHIFT);
	return 0;
}

static int qcow_set_key(struct td_state *bs, const char *key)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	uint8_t keybuf[16];
	int len, i;

	memset(keybuf, 0, 16);
	len = strlen(key);
	if (len > 16)
		len = 16;
	/* XXX: we could compress the chars to 7 bits to increase
	   entropy */
	for (i = 0; i < len; i++) {
		keybuf[i] = key[i];
	}
	s->crypt_method = s->crypt_method_header;

	if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
		return -1;
	if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
		return -1;
#if 0
	/* test */
	{
		uint8_t in[16];
		uint8_t out[16];
		uint8_t tmp[16];
		for (i = 0; i < 16; i++)
			in[i] = i;
		AES_encrypt(in, tmp, &s->aes_encrypt_key);
		AES_decrypt(tmp, out, &s->aes_decrypt_key);
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", tmp[i]);
		DPRINTF("\n");
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", out[i]);
		DPRINTF("\n");
	}
#endif
	return 0;
}

static int async_read(struct tdqcow_state *s, int fd, int size,
		      uint64_t offset,
		      char *buf, td_callback_t cb,
		      int id, uint64_t sector, int qcow_idx, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size/512;
	pio->buf = buf;
	pio->sector = sector;
	pio->qcow_idx = qcow_idx;

	io_prep_pread(io, fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

static int async_write(struct tdqcow_state *s, int fd, int size,
		       uint64_t offset,
		       char *buf, td_callback_t cb,
		       int id, uint64_t sector, int qcow_idx, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size/512;
	pio->buf = buf;
	pio->sector = sector;
	pio->qcow_idx = qcow_idx;

	io_prep_pwrite(io, fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

/*TODO: Fix sector span!*/
static int aio_can_lock(struct tdqcow_state *s, uint64_t sector)
{
	return (s->sector_lock[sector] ? 0 : 1);
}

static int aio_lock(struct tdqcow_state *s, uint64_t sector)
{
	return ++s->sector_lock[sector];
}

static void aio_unlock(struct tdqcow_state *s, uint64_t sector)
{
	if (!s->sector_lock[sector]) return;

	--s->sector_lock[sector];
	return;
}

/*TODO - Use a freelist*/
static int get_free_idx(struct tdqcow_state *s)
{
	int i;

	for (i = 0; i < MAX_QCOW_IDS; i++) {
		if (s->nr_reqs[i] == 0) return i;
	}
	return -1;
}

/*
 * The crypt function is compatible with the linux cryptoloop
 * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 * supported.
 */
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
			    uint8_t *out_buf, const uint8_t *in_buf,
			    int nb_sectors, int enc,
			    const AES_KEY *key)
{
	union {
		uint64_t ll[2];
		uint8_t b[16];
	} ivec;
	int i;

	for (i = 0; i < nb_sectors; i++) {
		ivec.ll[0] = cpu_to_le64(sector_num);
		ivec.ll[1] = 0;
		AES_cbc_encrypt(in_buf, out_buf, 512, key,
				ivec.b, enc);
		sector_num++;
		in_buf += 512;
		out_buf += 512;
	}
}

static int qtruncate(int fd, off_t length, int sparse)
{
	int ret, i;
	int sectors = length/DEFAULT_SECTOR_SIZE;
	struct stat st;
	char buf[DEFAULT_SECTOR_SIZE];

	/* If length is greater than the current file len
	 * we synchronously write zeroes to the end of the
	 * file, otherwise we truncate the length down
	 */
	memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
	ret = fstat(fd, &st);
	if ((ret == -1) || S_ISBLK(st.st_mode))
		return -1;

	if (st.st_size < length) {
		/*We are extending the file*/
		lseek(fd, 0, SEEK_END);
		for (i = 0; i < sectors; i++) {
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
			if (ret != DEFAULT_SECTOR_SIZE)
				return -1;
		}
	} else if (sparse && (st.st_size > length))
		ftruncate(fd, length);

	return 1;
}

/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 */
static uint64_t get_cluster_offset(struct td_state *bs,
				   uint64_t offset, int allocate,
				   int compressed_size,
				   int n_start, int n_end)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
	char *tmp_ptr, *tmp_ptr2, *l2_ptr, *l1_ptr;
	uint64_t l2_offset, *l2_table, cluster_offset, tmp;
	uint32_t min_count;
	int new_l2_table;

	/*Check L1 table for the extent offset*/
	l1_index = offset >> (s->l2_bits + s->cluster_bits);
	l2_offset = s->l1_table[l1_index];
	new_l2_table = 0;
	if (!l2_offset) {
		if (!allocate)
			return 0;
		/*
		 * allocating a new l2 entry + extent
		 * at the end of the file, we must also
		 * update the L1 entry safely.
		 */
		l2_offset = s->fd_end;

		/* round to cluster size */
		l2_offset = (l2_offset + s->cluster_size - 1)
			& ~(s->cluster_size - 1);

		/* update the L1 entry */
		s->l1_table[l1_index] = l2_offset;
		tmp = cpu_to_be64(l2_offset);

		/*Truncate file for L2 table
		 *(initialised to zero in case we crash)*/
		qtruncate(s->fd, l2_offset + (s->l2_size * sizeof(uint64_t)),
			  s->sparse);
		s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));

		/*Update the L1 table entry on disk
		 * (for O_DIRECT we write 4KByte blocks)*/
		l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
		l1_ptr = (char *)s->l1_table + (l1_sector << 12);

		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L1 table\n");
			return 0;
		}
		memcpy(tmp_ptr, l1_ptr, 4096);

		/*
		 * Issue non-asynchronous L1 write.
		 * For safety, we must ensure that
		 * entry is written before blocks.
		 */
		lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
		if (write(s->fd, tmp_ptr, 4096) != 4096)
			return 0;
		free(tmp_ptr);

		new_l2_table = 1;
		goto cache_miss;
	} else if (s->min_cluster_alloc == s->l2_size) {
		/*Fast-track the request*/
		cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
		l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
		return cluster_offset + (l2_index * s->cluster_size);
	}

	/*Check to see if L2 entry is already cached*/
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (l2_offset == s->l2_cache_offsets[i]) {
			/* increment the hit count */
			if (++s->l2_cache_counts[i] == 0xffffffff) {
				for (j = 0; j < L2_CACHE_SIZE; j++) {
					s->l2_cache_counts[j] >>= 1;
				}
			}
			l2_table = s->l2_cache + (i << s->l2_bits);
			goto found;
		}
	}

cache_miss:
	/* not found: load a new entry in the least used one */
	min_index = 0;
	min_count = 0xffffffff;
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (s->l2_cache_counts[i] < min_count) {
			min_count = s->l2_cache_counts[i];
			min_index = i;
		}
	}
	l2_table = s->l2_cache + (min_index << s->l2_bits);

	/*If extent pre-allocated, read table from disk,
	 *otherwise write new table to disk*/
	if (new_l2_table) {
		/*Should we allocate the whole extent? Adjustable parameter.*/
		if (s->cluster_alloc == s->l2_size) {
			cluster_offset = l2_offset +
				(s->l2_size * sizeof(uint64_t));
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			qtruncate(s->fd, cluster_offset +
				  (s->cluster_size * s->l2_size), s->sparse);
			s->fd_end = cluster_offset +
				(s->cluster_size * s->l2_size);
			for (i = 0; i < s->l2_size; i++) {
				l2_table[i] = cpu_to_be64(cluster_offset +
							  (i*s->cluster_size));
			}
		} else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));

		lseek(s->fd, l2_offset, SEEK_SET);
		if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	} else {
		lseek(s->fd, l2_offset, SEEK_SET);
		if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	}

	/*Update the cache entries*/
	s->l2_cache_offsets[min_index] = l2_offset;
	s->l2_cache_counts[min_index] = 1;

found:
	/*The extent is split into 's->l2_size' blocks of
	 *size 's->cluster_size'*/
	l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
	cluster_offset = be64_to_cpu(l2_table[l2_index]);

	if (!cluster_offset ||
	    ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
		if (!allocate)
			return 0;

		if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
		    (n_end - n_start) < s->cluster_sectors) {
			/* cluster is already allocated but compressed, we must
			   decompress it in the case it is not completely
			   overwritten */
			if (decompress_cluster(s, cluster_offset) < 0)
				return 0;
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			/* write the cluster content - not asynchronous */
			lseek(s->fd, cluster_offset, SEEK_SET);
			if (write(s->fd, s->cluster_cache, s->cluster_size) !=
			    s->cluster_size)
				return -1;
		} else {
			/* allocate a new cluster */
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			if (allocate == 1) {
				/* round to cluster size */
				cluster_offset =
					(cluster_offset + s->cluster_size - 1)
					& ~(s->cluster_size - 1);
				qtruncate(s->fd, cluster_offset +
					  s->cluster_size, s->sparse);
				s->fd_end = (cluster_offset + s->cluster_size);
				/* if encrypted, we must initialize the cluster
				   content which won't be written */
				if (s->crypt_method &&
				    (n_end - n_start) < s->cluster_sectors) {
					uint64_t start_sect;
					start_sect = (offset &
						      ~(s->cluster_size - 1))
						>> 9;
					memset(s->cluster_data + 512,
					       0xaa, 512);
					for (i = 0; i < s->cluster_sectors; i++)
					{
						if (i < n_start || i >= n_end)
						{
							encrypt_sectors(s, start_sect + i,
									s->cluster_data,
									s->cluster_data + 512, 1, 1,
									&s->aes_encrypt_key);
							lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
							if (write(s->fd, s->cluster_data, 512) != 512)
								return -1;
						}
					}
				}
			} else {
				cluster_offset |= QCOW_OFLAG_COMPRESSED |
					(uint64_t)compressed_size
					<< (63 - s->cluster_bits);
			}
		}
		/* update L2 table */
		tmp = cpu_to_be64(cluster_offset);
		l2_table[l2_index] = tmp;

		/*For O_DIRECT we write 4KByte blocks*/
		l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
		l2_ptr = (char *)l2_table + (l2_sector << 12);

		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L2 table\n");
			return 0;
		}
		memcpy(tmp_ptr2, l2_ptr, 4096);
		lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
		write(s->fd, tmp_ptr2, 4096);
		free(tmp_ptr2);
	}
	return cluster_offset;
}
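
/*
 * Worked example of the lookup above (illustrative only; it assumes the
 * defaults that qcow_create() below picks for images without a backing
 * file, cluster_bits = 12 and l2_bits = 9, and an arbitrary offset):
 *
 *   offset = 0x12345678
 *   l1_index           = offset >> (9 + 12)     = 0x91   (2 MB extent)
 *   l2_index           = (offset >> 12) & 0x1ff = 0x145  (4 KB cluster)
 *   intra-cluster byte = offset & 0xfff         = 0x678
 */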

static void init_cluster_cache(struct td_state *bs)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	uint32_t count = 0;
	uint64_t i;
	int cluster_entries;

	cluster_entries = s->cluster_size / 512;
	DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
		cluster_entries, s->cluster_size);

	for (i = 0; i < bs->size; i += cluster_entries) {
		if (get_cluster_offset(bs, i << 9, 0, 0, 0, 1)) count++;
		if (count >= L2_CACHE_SIZE) return;
	}
	DPRINTF("Finished cluster initialisation, added %d entries\n", count);
	return;
}

static int qcow_is_allocated(struct td_state *bs, int64_t sector_num,
			     int nb_sectors, int *pnum)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;

	int index_in_cluster, n;
	uint64_t cluster_offset;

	cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
	index_in_cluster = sector_num & (s->cluster_sectors - 1);
	n = s->cluster_sectors - index_in_cluster;
	if (n > nb_sectors)
		n = nb_sectors;
	*pnum = n;
	return (cluster_offset != 0);
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
			     const uint8_t *buf, int buf_size)
{
	z_stream strm1, *strm = &strm1;
	int ret, out_len;

	memset(strm, 0, sizeof(*strm));

	strm->next_in = (uint8_t *)buf;
	strm->avail_in = buf_size;
	strm->next_out = out_buf;
	strm->avail_out = out_buf_size;

	ret = inflateInit2(strm, -12);
	if (ret != Z_OK)
		return -1;
	ret = inflate(strm, Z_FINISH);
	out_len = strm->next_out - out_buf;
	if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
	     (out_len != out_buf_size) ) {
		inflateEnd(strm);
		return -1;
	}
	inflateEnd(strm);
	return 0;
}

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
{
	int ret, csize;
	uint64_t coffset;

	coffset = cluster_offset & s->cluster_offset_mask;
	if (s->cluster_cache_offset != coffset) {
		csize = cluster_offset >> (63 - s->cluster_bits);
		csize &= (s->cluster_size - 1);
		lseek(s->fd, coffset, SEEK_SET);
		ret = read(s->fd, s->cluster_data, csize);
		if (ret != csize)
			return -1;
		if (decompress_buffer(s->cluster_cache, s->cluster_size,
				      s->cluster_data, csize) < 0) {
			return -1;
		}
		s->cluster_cache_offset = coffset;
	}
	return 0;
}
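
/*
 * Note on the compressed-cluster encoding decoded above (a sketch of the
 * bit layout implied by QCOW_OFLAG_COMPRESSED and cluster_offset_mask,
 * shown for cluster_bits = 12): bit 63 flags the cluster as compressed,
 * bits 62..51 carry the compressed byte count, and bits 50..0 hold the
 * file offset of the compressed data, i.e.
 *
 *   csize   = (cluster_offset >> (63 - cluster_bits)) & (cluster_size - 1)
 *   coffset =  cluster_offset & ((1ULL << (63 - cluster_bits)) - 1)
 */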

/* Open the disk file and initialize qcow state. */
int tdqcow_open (struct td_state *bs, const char *name)
{
	int fd, len, i, shift, ret, size, l1_table_size;
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	char *buf;
	QCowHeader *header;
	QCowHeader_ext *exthdr;
	uint32_t cksum;
	uint64_t final_cluster = 0;

	DPRINTF("QCOW: Opening %s\n", name);
	/* set up a pipe so that we can hand back a poll fd that won't fire.*/
	ret = pipe(s->poll_pipe);
	if (ret != 0)
		return (0 - errno);

	fd = open(name, O_RDWR | O_DIRECT | O_LARGEFILE);
	if (fd < 0) {
		DPRINTF("Unable to open %s (%d)\n", name, 0 - errno);
		return -1;
	}

	s->fd = fd;
	asprintf(&s->name, "%s", name);

	ASSERT(sizeof(QCowHeader) < 512);

	ret = posix_memalign((void **)&buf, 512, 512);
	if (ret != 0) goto fail;

	if (read(fd, buf, 512) != 512)
		goto fail;

	header = (QCowHeader *)buf;
	be32_to_cpus(&header->magic);
	be32_to_cpus(&header->version);
	be64_to_cpus(&header->backing_file_offset);
	be32_to_cpus(&header->backing_file_size);
	be32_to_cpus(&header->mtime);
	be64_to_cpus(&header->size);
	be32_to_cpus(&header->crypt_method);
	be64_to_cpus(&header->l1_table_offset);

	if (header->magic != QCOW_MAGIC || header->version > QCOW_VERSION)
		goto fail;
	if (header->size <= 1 || header->cluster_bits < 9)
		goto fail;
	if (header->crypt_method > QCOW_CRYPT_AES)
		goto fail;
	s->crypt_method_header = header->crypt_method;
	if (s->crypt_method_header)
		s->encrypted = 1;
	s->cluster_bits = header->cluster_bits;
	s->cluster_size = 1 << s->cluster_bits;
	s->cluster_sectors = 1 << (s->cluster_bits - 9);
	s->l2_bits = header->l2_bits;
	s->l2_size = 1 << s->l2_bits;
	s->cluster_alloc = s->l2_size;
	bs->size = header->size / 512;
	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;

	/* read the level 1 table */
	shift = s->cluster_bits + s->l2_bits;
	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;

	s->l1_table_offset = header->l1_table_offset;

	/*allocate a 4Kbyte multiple of memory*/
	l1_table_size = s->l1_size * sizeof(uint64_t);
	if (l1_table_size % 4096 > 0) {
		l1_table_size = ((l1_table_size >> 12) + 1) << 12;
	}
	ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
	if (ret != 0) goto fail;

	memset(s->l1_table, 0x00, l1_table_size);

	DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
		(long long)s->l1_table_offset,
		(int) (s->l1_size * sizeof(uint64_t)),
		l1_table_size);

	lseek(fd, s->l1_table_offset, SEEK_SET);
	if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
		goto fail;

	for (i = 0; i < s->l1_size; i++) {
		//be64_to_cpus(&s->l1_table[i]);
		//DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
		if (s->l1_table[i] > final_cluster)
			final_cluster = s->l1_table[i];
	}

	/* alloc L2 cache */
	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
	if (ret != 0) goto fail;

	size = s->cluster_size;
	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
	if (ret != 0) goto fail;

	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
	if (ret != 0) goto fail;
	s->cluster_cache_offset = -1;

	/* read the backing file name */
	s->bfd = -1;
	if (header->backing_file_offset != 0) {
		DPRINTF("Reading backing file data\n");
		len = header->backing_file_size;
		if (len > 1023)
			len = 1023;

		/*TODO - Fix read size for O_DIRECT and use original fd!*/
		fd = open(name, O_RDONLY | O_LARGEFILE);

		lseek(fd, header->backing_file_offset, SEEK_SET);
		if (read(fd, bs->backing_file, len) != len)
			goto fail;
		bs->backing_file[len] = '\0';
		close(fd);
		/***********************************/

		/*Open backing file*/
		fd = open(bs->backing_file, O_RDONLY | O_DIRECT | O_LARGEFILE);
		if (fd < 0) {
			DPRINTF("Unable to open backing file: %s\n",
				bs->backing_file);
			goto fail;
		}
		s->bfd = fd;
		s->cluster_alloc = 1; /*Cannot use pre-alloc*/
	}

	bs->sector_size = 512;
	bs->info = 0;

	/*Detect min_cluster_alloc*/
	s->min_cluster_alloc = 1; /*Default*/
	if (s->bfd == -1 && (s->l1_table_offset % 4096 == 0)) {
		/*We test to see if the xen magic # exists*/
		exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
		be32_to_cpus(&exthdr->xmagic);
		if (exthdr->xmagic != XEN_MAGIC)
			goto end_xenhdr;

		/*Finally check the L1 table cksum*/
		be32_to_cpus(&exthdr->cksum);
		cksum = gen_cksum((char *)s->l1_table,
				  s->l1_size * sizeof(uint64_t));
		if (exthdr->cksum != cksum) {
			goto end_xenhdr;
		}

		be32_to_cpus(&exthdr->min_cluster_alloc);
		be32_to_cpus(&exthdr->flags);
		if (exthdr->flags & QCOW_SPARSE_FILE)
			s->sparse = 1;
		s->min_cluster_alloc = exthdr->min_cluster_alloc;
	}

end_xenhdr:
	if (init_aio_state(bs) != 0) {
		DPRINTF("Unable to initialise AIO state\n");
		goto fail;
	}
	s->fd_end = (final_cluster == 0 ?
		     (s->l1_table_offset + l1_table_size) :
		     (final_cluster + s->cluster_size));

	return 0;

fail:
	DPRINTF("QCOW Open failed\n");
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(fd);
	return -1;
}

int tdqcow_queue_read(struct td_state *bs, uint64_t sector,
		      int nb_sectors, char *buf, td_callback_t cb,
		      int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	int ret = 0, index_in_cluster, n, i, qcow_idx, asubmit = 0;
	uint64_t cluster_offset;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i)) {
			DPRINTF("AIO_CAN_LOCK failed [%llu]\n",
				(long long)(sector + i));
			return -EBUSY;
		}

	/*We store a local record of the request*/
	qcow_idx = get_free_idx(s);
	while (nb_sectors > 0) {
		cluster_offset =
			get_cluster_offset(bs, sector << 9, 0, 0, 0, 0);
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector)) {
			DPRINTF("AIO_LOCK or iocb_free_count (%d) failed "
				"[%llu]\n", s->iocb_free_count,
				(long long) sector);
			return -ENOMEM;
		}

		if (!cluster_offset && (s->bfd > 0)) {
			s->nr_reqs[qcow_idx]++;
			asubmit += async_read(s, s->bfd, n * 512, sector << 9,
					      buf, cb, id, sector,
					      qcow_idx, private);
		} else if (!cluster_offset) {
			memset(buf, 0, 512 * n);
			aio_unlock(s, sector);
		} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
			if (decompress_cluster(s, cluster_offset) < 0) {
				ret = -1;
				goto done;
			}
			memcpy(buf, s->cluster_cache + index_in_cluster * 512,
			       512 * n);
		} else {
			s->nr_reqs[qcow_idx]++;
			asubmit += async_read(s, s->fd, n * 512,
					      (cluster_offset +
					       index_in_cluster * 512),
					      buf, cb, id, sector,
					      qcow_idx, private);
		}
		nb_sectors -= n;
		sector += n;
		buf += n * 512;
	}
done:
	/*Callback if no async requests outstanding*/
	if (!asubmit) return cb(bs, ret == -1 ? -1 : 0, id, private);

	return 0;
}

int tdqcow_queue_write(struct td_state *bs, uint64_t sector,
		       int nb_sectors, char *buf, td_callback_t cb,
		       int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	int ret = 0, index_in_cluster, n, i, qcow_idx, asubmit = 0;
	uint64_t cluster_offset;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i)) {
			DPRINTF("AIO_CAN_LOCK failed [%llu]\n",
				(long long) (sector + i));
			return -EBUSY;
		}

	/*We store a local record of the request*/
	qcow_idx = get_free_idx(s);
	while (nb_sectors > 0) {
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector)) {
			DPRINTF("AIO_LOCK or iocb_free_count (%d) failed "
				"[%llu]\n", s->iocb_free_count,
				(long long) sector);
			return -ENOMEM;
		}

		if (!IS_ZERO(buf, n * 512)) {
			cluster_offset = get_cluster_offset(bs, sector << 9,
							    1, 0,
							    index_in_cluster,
							    index_in_cluster+n);
			if (!cluster_offset) {
				DPRINTF("Ooops, no write cluster offset!\n");
				ret = -1;
				goto done;
			}

			if (s->crypt_method) {
				encrypt_sectors(s, sector, s->cluster_data,
						(unsigned char *)buf, n, 1,
						&s->aes_encrypt_key);
				s->nr_reqs[qcow_idx]++;
				asubmit += async_write(s, s->fd, n * 512,
						       (cluster_offset +
							index_in_cluster*512),
						       (char *)s->cluster_data,
						       cb, id, sector,
						       qcow_idx, private);
			} else {
				s->nr_reqs[qcow_idx]++;
				asubmit += async_write(s, s->fd, n * 512,
						       (cluster_offset +
							index_in_cluster*512),
						       buf, cb, id, sector,
						       qcow_idx, private);
			}
		} else {
			/*Write data contains zeros, but we must check to see
			  if cluster already allocated*/
			cluster_offset = get_cluster_offset(bs, sector << 9,
							    0, 0,
							    index_in_cluster,
							    index_in_cluster+n);
			if (cluster_offset) {
				if (s->crypt_method) {
					encrypt_sectors(s, sector,
							s->cluster_data,
							(unsigned char *)buf,
							n, 1,
							&s->aes_encrypt_key);
					s->nr_reqs[qcow_idx]++;
					asubmit += async_write(s, s->fd,
							       n * 512,
							       (cluster_offset +
								index_in_cluster * 512),
							       (char *)s->cluster_data,
							       cb, id, sector,
							       qcow_idx, private);
				} else {
					s->nr_reqs[qcow_idx]++;
					asubmit += async_write(s, s->fd, n*512,
							       cluster_offset + index_in_cluster * 512,
							       buf, cb, id, sector,
							       qcow_idx, private);
				}
			}
			else aio_unlock(s, sector);
		}
		nb_sectors -= n;
		sector += n;
		buf += n * 512;
	}
	s->cluster_cache_offset = -1; /* disable compressed cache */

done:
	/*Callback if no async requests outstanding*/
	if (!asubmit) return cb(bs, ret == -1 ? -1 : 0, id, private);

	return 0;
}

int tdqcow_submit(struct td_state *bs)
{
	int ret;
	struct tdqcow_state *prv = (struct tdqcow_state *)bs->private;

	ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);

	/* XXX: TODO: Handle error conditions here. */

	/* Success case: */
	prv->iocb_queued = 0;

	return ret;
}

int *tdqcow_get_fd(struct td_state *bs)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	int *fds, i;

	fds = malloc(sizeof(int) * MAX_IOFD);
	/*initialise the FD array*/
	for (i = 0; i < MAX_IOFD; i++) fds[i] = 0;

	fds[0] = s->poll_fd;
	return fds;
}

int tdqcow_close(struct td_state *bs)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	uint32_t cksum, out;
	int fd, offset;

	/*Update the hdr cksum*/
	if (s->min_cluster_alloc == s->l2_size) {
		cksum = gen_cksum((char *)s->l1_table,
				  s->l1_size * sizeof(uint64_t));
		printf("Writing cksum: %d\n", cksum);
		fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
		offset = sizeof(QCowHeader) + sizeof(uint32_t);
		lseek(fd, offset, SEEK_SET);
		out = cpu_to_be32(cksum);
		write(fd, &out, sizeof(uint32_t));
		close(fd);
	}

	free(s->name);
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(s->fd);
	return 0;
}

int tdqcow_do_callbacks(struct td_state *s, int sid)
{
	int ret, i, rsp = 0;
	struct io_event *ep;
	struct tdqcow_state *prv = (struct tdqcow_state *)s->private;

	if (sid > MAX_IOFD) return 1;

	/* Non-blocking test for completed io. */
	ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
			   NULL);

	for (ep = prv->aio_events, i = ret; i-- > 0; ep++) {
		struct iocb *io = ep->obj;
		struct pending_aio *pio;

		pio = &prv->pending_aio[(long)io->data];

		aio_unlock(prv, pio->sector);
		if (pio->id >= 0) {
			if (prv->crypt_method)
				encrypt_sectors(prv, pio->sector,
						(unsigned char *)pio->buf,
						(unsigned char *)pio->buf,
						pio->nb_sectors, 0,
						&prv->aes_decrypt_key);
			prv->nr_reqs[pio->qcow_idx]--;
			if (prv->nr_reqs[pio->qcow_idx] == 0)
				rsp += pio->cb(s, ep->res == io->u.c.nbytes ? 0 : 1,
					       pio->id, pio->private);
		} else if (pio->id == -2) free(pio->buf);

		prv->iocb_free[prv->iocb_free_count++] = io;
	}
	return rsp;
}

int qcow_create(const char *filename, uint64_t total_size,
		const char *backing_file, int flags)
{
	int fd, header_size, backing_filename_len, l1_size, i;
	int shift, length, ret = 0;
	QCowHeader header;
	QCowHeader_ext exthdr;
	char backing_filename[1024], *ptr;
	uint64_t tmp, size, total_length;
	struct stat st;

	DPRINTF("Qcow_create: size %llu\n", (long long unsigned)total_size);

	fd = open(filename,
		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
		  0644);
	if (fd < 0)
		return -1;

	memset(&header, 0, sizeof(header));
	header.magic = cpu_to_be32(QCOW_MAGIC);
	header.version = cpu_to_be32(QCOW_VERSION);

	/*Create extended header fields*/
	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);

	header_size = sizeof(header) + sizeof(QCowHeader_ext);
	backing_filename_len = 0;
	size = (total_size >> SECTOR_SHIFT);
	if (backing_file) {
		if (strcmp(backing_file, "fat:")) {
			const char *p;
			/* XXX: this is a hack: we do not attempt to
			 *check for URL like syntax */
			p = strchr(backing_file, ':');
			if (p && (p - backing_file) >= 2) {
				/* URL like but exclude "c:" like filenames */
				strncpy(backing_filename, backing_file,
					sizeof(backing_filename));
			} else {
				realpath(backing_file, backing_filename);
				if (stat(backing_filename, &st) != 0) {
					return -1;
				}
			}
			header.backing_file_offset = cpu_to_be64(header_size);
			backing_filename_len = strlen(backing_filename);
			header.backing_file_size = cpu_to_be32(
				backing_filename_len);
			header_size += backing_filename_len;

			/*Set to the backing file size*/
			if (get_filesize(backing_filename, &size, &st)) {
				return -1;
			}
			DPRINTF("Backing file size detected: %lld sectors "
				"(total %lld [%lld MB])\n",
				(long long)size,
				(long long)(size << SECTOR_SHIFT),
				(long long)(size >> 11));
		} else {
			backing_file = NULL;
			DPRINTF("Setting file size: %lld (total %lld)\n",
				(long long) total_size,
				(long long) (total_size << SECTOR_SHIFT));
		}
		header.mtime = cpu_to_be32(st.st_mtime);
		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
					    unmodified sectors */
		header.l2_bits = 12; /* 32 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1);
	} else {
		DPRINTF("Setting file size: %lld sectors "
			"(total %lld [%lld MB])\n",
			(long long) size,
			(long long) (size << SECTOR_SHIFT),
			(long long) (size >> 11));
		header.cluster_bits = 12; /* 4 KB clusters */
		header.l2_bits = 9; /* 4 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
	}
	/*Set the header size value*/
	header.size = cpu_to_be64(size * 512);

	header_size = (header_size + 7) & ~7;
	if (header_size % 4096 > 0) {
		header_size = ((header_size >> 12) + 1) << 12;
	}

	shift = header.cluster_bits + header.l2_bits;
	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;

	header.l1_table_offset = cpu_to_be64(header_size);
	DPRINTF("L1 Table offset: %d, size %d\n",
		header_size,
		(int)(l1_size * sizeof(uint64_t)));
	if (flags & QCOW_CRYPT_AES) {
		header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
	} else {
		header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
	}

	ptr = calloc(1, l1_size * sizeof(uint64_t));
	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
	printf("Created cksum: %d\n", exthdr.cksum);
	free(ptr);

	/*adjust file length to 4 KByte boundary*/
	length = header_size + l1_size * sizeof(uint64_t);
	if (length % 4096 > 0) {
		length = ((length >> 12) + 1) << 12;
		qtruncate(fd, length, 0);
		DPRINTF("Adjusted filelength to %d for 4 "
			"Kbyte alignment\n", length);
	}

	if (!(flags & QCOW_SPARSE_FILE)) {
		/*Filesize is length + l1_size * (1 << s->l2_bits) + (size*512)*/
		total_length = length + (l1_size * (1 << 9)) + (size * 512);
		qtruncate(fd, total_length, 0);
		printf("File truncated to length %llu\n",
		       (long long unsigned)total_length);
	}
	exthdr.flags = cpu_to_be32(flags);

	/* write all the data */
	lseek(fd, 0, SEEK_SET);
	ret += write(fd, &header, sizeof(header));
	ret += write(fd, &exthdr, sizeof(exthdr));
	if (backing_file) {
		ret += write(fd, backing_filename, backing_filename_len);
	}
	lseek(fd, header_size, SEEK_SET);
	tmp = 0;
	for (i = 0; i < l1_size; i++) {
		ret += write(fd, &tmp, sizeof(tmp));
	}

	close(fd);

	return 0;
}
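
/*
 * Illustrative on-disk layout produced by the function above for a 1 GiB
 * image with no backing file (numbers derived from the code, assuming a
 * typical build where QCowHeader is 48 bytes with 8-byte alignment):
 *
 *   byte 0     QCowHeader + QCowHeader_ext (64 bytes), padded to 4096
 *   byte 4096  L1 table: l1_size = 512 entries of 8 bytes = 4096 bytes
 *   byte 8192  first data cluster; without QCOW_SPARSE_FILE the file is
 *              zero-extended to 8192 + 512*(1 << 9) + 2097152*512 =
 *              1074012160 bytes, so the virtual size is allocated up front
 */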

int qcow_make_empty(struct td_state *bs)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	uint32_t l1_length = s->l1_size * sizeof(uint64_t);

	memset(s->l1_table, 0, l1_length);
	lseek(s->fd, s->l1_table_offset, SEEK_SET);
	if (write(s->fd, s->l1_table, l1_length) < 0)
		return -1;
	qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse);

	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));

	return 0;
}

int qcow_get_cluster_size(struct td_state *bs)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;

	return s->cluster_size;
}

/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
int qcow_compress_cluster(struct td_state *bs, int64_t sector_num,
			  const uint8_t *buf)
{
	struct tdqcow_state *s = (struct tdqcow_state *)bs->private;
	z_stream strm;
	int ret, out_len;
	uint8_t *out_buf;
	uint64_t cluster_offset;

	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!out_buf)
		return -1;

	/* best compression, small window, no zlib header */
	memset(&strm, 0, sizeof(strm));
	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			   Z_DEFLATED, -12,
			   9, Z_DEFAULT_STRATEGY);
	if (ret != 0) {
		free(out_buf);
		return -1;
	}

	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	strm.avail_out = s->cluster_size;
	strm.next_out = out_buf;

	ret = deflate(&strm, Z_FINISH);
	if (ret != Z_STREAM_END && ret != Z_OK) {
		free(out_buf);
		deflateEnd(&strm);
		return -1;
	}
	out_len = strm.next_out - out_buf;

	deflateEnd(&strm);

	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
						    out_len, 0, 0);
		cluster_offset &= s->cluster_offset_mask;
		lseek(s->fd, cluster_offset, SEEK_SET);
		if (write(s->fd, out_buf, out_len) != out_len) {
			free(out_buf);
			return -1;
		}
	}

	free(out_buf);
	return 0;
}

struct tap_disk tapdisk_qcow = {
	"tapdisk_qcow",
	sizeof(struct tdqcow_state),
	tdqcow_open,
	tdqcow_queue_read,
	tdqcow_queue_write,
	tdqcow_submit,
	tdqcow_get_fd,
	tdqcow_close,
	tdqcow_do_callbacks,
};