tools/blktap/drivers/block-qcow.c @ 15783:c93e2a822d6f
/* block-qcow.c
 *
 * Asynchronous Qemu copy-on-write disk implementation.
 * Code based on the Qemu implementation
 * (see copyright notice below)
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 */

/*
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 */

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/statvfs.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <string.h>
#include <zlib.h>
#include <inttypes.h>
#include <libaio.h>
#include <openssl/md5.h>
#include "bswap.h"
#include "aes.h"
#include "tapdisk.h"
#include "tapaio.h"

#if 1
#define ASSERT(_p) \
	if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
	__LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p) ((void)0)
#endif

#define ROUNDUP(l, s) \
({ \
	(uint64_t)( \
		(l + (s - 1)) - ((l + (s - 1)) % s)); \
})
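
/*
 * Worked example (illustrative): ROUNDUP(5000, 4096) evaluates to
 * (5000 + 4095) - ((5000 + 4095) % 4096) = 9095 - 903 = 8192,
 * i.e. the smallest multiple of s that is >= l.
 */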

struct pending_aio {
	td_callback_t cb;
	int id;
	void *private;
	int nb_sectors;
	char *buf;
	uint64_t sector;
};

#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

#define ZERO_TEST(_b) (_b | 0x00)

/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
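/* Stored big-endian, these magics appear as the byte strings "QFI\xfb"
 * and "XEN\xfb" at the start of the image. */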
#define QCOW_VERSION 1

#define QCOW_CRYPT_NONE 0x00
#define QCOW_CRYPT_AES  0x01

#define QCOW_OFLAG_COMPRESSED (1LL << 63)
#define SPARSE_FILE 0x01

#ifndef O_BINARY
#define O_BINARY 0
#endif

typedef struct QCowHeader {
	uint32_t magic;
	uint32_t version;
	uint64_t backing_file_offset;
	uint32_t backing_file_size;
	uint32_t mtime;
	uint64_t size; /* in bytes */
	uint8_t  cluster_bits;
	uint8_t  l2_bits;
	uint32_t crypt_method;
	uint64_t l1_table_offset;
} QCowHeader;
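/* Multi-byte header fields are stored big-endian on disk; tdqcow_open()
 * converts them with be32_to_cpus()/be64_to_cpus() after reading. */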

/*Extended header for Xen enhancements*/
typedef struct QCowHeader_ext {
	uint32_t xmagic;
	uint32_t cksum;
	uint32_t min_cluster_alloc;
	uint32_t flags;
} QCowHeader_ext;
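/* 'cksum' holds the first 32 bits of an MD5 digest of the L1 table
 * (see gen_cksum()); tdqcow_open() checks it before trusting the Xen
 * extension fields, and tdqcow_close() rewrites it. */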

#define L2_CACHE_SIZE 16 /*Fixed allocation in Qemu*/

struct tdqcow_state {
	int fd;                   /*Main Qcow file descriptor */
	uint64_t fd_end;          /*Store a local record of file length */
	char *name;               /*Record of the filename*/
	uint32_t backing_file_size;
	uint64_t backing_file_offset;
	int encrypted;            /*File contents are encrypted or plain*/
	int cluster_bits;         /*Determines length of cluster as
				   *indicated by file hdr*/
	int cluster_size;         /*Length of cluster*/
	int cluster_sectors;      /*Number of sectors per cluster*/
	int cluster_alloc;        /*Blktap fix for allocating full
				   *extents*/
	int min_cluster_alloc;    /*Blktap historical extent alloc*/
	int sparse;               /*Indicates whether to preserve sparseness*/
	int l2_bits;              /*Size of L2 table entry*/
	int l2_size;              /*Full table size*/
	int l1_size;              /*L1 table size*/
	uint64_t cluster_offset_mask;
	uint64_t l1_table_offset; /*L1 table offset from beginning of
				   *file*/
	uint64_t *l1_table;       /*L1 table entries*/
	uint64_t *l2_cache;       /*We maintain a cache of size
				   *L2_CACHE_SIZE of most read entries*/
	uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /*L2 cache entries*/
	uint32_t l2_cache_counts[L2_CACHE_SIZE];  /*Cache access record*/
	uint8_t *cluster_cache;
	uint8_t *cluster_data;
	uint8_t *sector_lock;     /*Per-sector lock counts (one byte per
				   *sector) for AIO reads/writes*/
	uint64_t cluster_cache_offset;
	uint32_t crypt_method;    /*current crypt method, 0 if no
				   *key yet */
	uint32_t crypt_method_header;
	AES_KEY aes_encrypt_key;  /*AES key*/
	AES_KEY aes_decrypt_key;  /*AES key*/
	/* libaio state */
	tap_aio_context_t   aio_ctx;
	int                 max_aio_reqs;
	struct iocb        *iocb_list;
	struct iocb       **iocb_free;
	struct pending_aio *pending_aio;
	int                 iocb_free_count;
	struct iocb       **iocb_queue;
	int                 iocb_queued;
	struct io_event    *aio_events;
};

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);

static void free_aio_state(struct disk_driver *dd)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;

	if (s->sector_lock)
		free(s->sector_lock);
	if (s->iocb_list)
		free(s->iocb_list);
	if (s->pending_aio)
		free(s->pending_aio);
	if (s->aio_events)
		free(s->aio_events);
	if (s->iocb_free)
		free(s->iocb_free);
	if (s->iocb_queue)
		free(s->iocb_queue);
}

static int init_aio_state(struct disk_driver *dd)
{
	int i, ret;
	struct td_state *bs = dd->td_state;
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	long ioidx;

	s->iocb_list   = NULL;
	s->pending_aio = NULL;
	s->aio_events  = NULL;
	s->iocb_free   = NULL;
	s->iocb_queue  = NULL;

	/*Initialize per-sector locks*/
	s->sector_lock = calloc(1, bs->size);

	if (!s->sector_lock) {
		DPRINTF("Failed to allocate sector lock\n");
		goto fail;
	}

	/* A segment (i.e. a page) can span multiple clusters */
	s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
		MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
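	/*
	 * Illustrative arithmetic: with 4 KB pages and the common 4 KB
	 * cluster size, this is ((4096/4096) + 1) = 2 iocbs per segment,
	 * i.e. 2 * MAX_SEGMENTS_PER_REQ * MAX_REQUESTS outstanding iocbs
	 * in the worst case (the constants come from the blktap headers).
	 */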

	/* Initialize AIO */
	s->iocb_free_count = s->max_aio_reqs;
	s->iocb_queued     = 0;

	if (!(s->iocb_list = malloc(sizeof(struct iocb) * s->max_aio_reqs)) ||
	    !(s->pending_aio = malloc(sizeof(struct pending_aio) * s->max_aio_reqs)) ||
	    !(s->aio_events = malloc(sizeof(struct io_event) * s->max_aio_reqs)) ||
	    !(s->iocb_free = malloc(sizeof(struct iocb *) * s->max_aio_reqs)) ||
	    !(s->iocb_queue = malloc(sizeof(struct iocb *) * s->max_aio_reqs))) {
		DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
			s->max_aio_reqs);
		goto fail;
	}

	ret = tap_aio_setup(&s->aio_ctx, s->aio_events, s->max_aio_reqs);
	if (ret < 0) {
		if (ret == -EAGAIN) {
			DPRINTF("Couldn't setup AIO context. If you are "
				"trying to concurrently use a large number "
				"of blktap-based disks, you may need to "
				"increase the system-wide aio request limit. "
				"(e.g. 'echo 1048576 > /proc/sys/fs/"
				"aio-max-nr')\n");
		} else {
			DPRINTF("Couldn't setup AIO context.\n");
		}
		goto fail;
	}

	for (i = 0; i < s->max_aio_reqs; i++)
		s->iocb_free[i] = &s->iocb_list[i];

	DPRINTF("AIO state initialised\n");

	return 0;

 fail:
	return -1;
}

static uint32_t gen_cksum(char *ptr, int len)
{
	unsigned char *md;
	uint32_t ret;

	md = malloc(MD5_DIGEST_LENGTH);
	if (!md)
		return 0;

	if (MD5((unsigned char *)ptr, len, md) != md) {
		free(md);
		return 0;
	}

	memcpy(&ret, md, sizeof(uint32_t));
	free(md);
	return ret;
}

static int get_filesize(char *filename, uint64_t *size, struct stat *st)
{
	int fd;
	QCowHeader header;

	/*Set to the backing file size*/
	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return -1;
	if (read(fd, &header, sizeof(header)) < sizeof(header)) {
		close(fd);
		return -1;
	}
	close(fd);

	be32_to_cpus(&header.magic);
	be64_to_cpus(&header.size);
	if (header.magic == QCOW_MAGIC) {
		*size = header.size >> SECTOR_SHIFT;
		return 0;
	}

	if (S_ISBLK(st->st_mode)) {
		fd = open(filename, O_RDONLY);
		if (fd < 0)
			return -1;
		if (ioctl(fd, BLKGETSIZE, size) != 0) {
			printf("Unable to get Block device size\n");
			close(fd);
			return -1;
		}
		close(fd);
	} else
		*size = (st->st_size >> SECTOR_SHIFT);
	return 0;
}

static int qcow_set_key(struct tdqcow_state *s, const char *key)
{
	uint8_t keybuf[16];
	int len, i;

	memset(keybuf, 0, 16);
	len = strlen(key);
	if (len > 16)
		len = 16;
	/* XXX: we could compress the chars to 7 bits to increase
	   entropy */
	for (i = 0; i < len; i++) {
		keybuf[i] = key[i];
	}
	s->crypt_method = s->crypt_method_header;

	if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
		return -1;
	if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
		return -1;
#if 0
	/* test */
	{
		uint8_t in[16];
		uint8_t out[16];
		uint8_t tmp[16];
		for (i = 0; i < 16; i++)
			in[i] = i;
		AES_encrypt(in, tmp, &s->aes_encrypt_key);
		AES_decrypt(tmp, out, &s->aes_decrypt_key);
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", tmp[i]);
		DPRINTF("\n");
		for (i = 0; i < 16; i++)
			DPRINTF(" %02x", out[i]);
		DPRINTF("\n");
	}
#endif
	return 0;
}

static int async_read(struct tdqcow_state *s, int size,
		      uint64_t offset, char *buf, td_callback_t cb,
		      int id, uint64_t sector, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size / 512;
	pio->buf = buf;
	pio->sector = sector;

	io_prep_pread(io, s->fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

static int async_write(struct tdqcow_state *s, int size,
		       uint64_t offset, char *buf, td_callback_t cb,
		       int id, uint64_t sector, void *private)
{
	struct iocb *io;
	struct pending_aio *pio;
	long ioidx;

	io = s->iocb_free[--s->iocb_free_count];

	ioidx = IOCB_IDX(s, io);
	pio = &s->pending_aio[ioidx];
	pio->cb = cb;
	pio->id = id;
	pio->private = private;
	pio->nb_sectors = size / 512;
	pio->buf = buf;
	pio->sector = sector;

	io_prep_pwrite(io, s->fd, buf, size, offset);
	io->data = (void *)ioidx;

	s->iocb_queue[s->iocb_queued++] = io;

	return 1;
}

/*TODO: Fix sector span!*/
static int aio_can_lock(struct tdqcow_state *s, uint64_t sector)
{
	return (s->sector_lock[sector] ? 0 : 1);
}

static int aio_lock(struct tdqcow_state *s, uint64_t sector)
{
	return ++s->sector_lock[sector];
}

static void aio_unlock(struct tdqcow_state *s, uint64_t sector)
{
	if (!s->sector_lock[sector]) return;

	--s->sector_lock[sector];
	return;
}

/*
 * The crypt function is compatible with the linux cryptoloop
 * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
 * supported.
 */
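/*
 * Illustrative note: each 512-byte sector is run through AES-CBC with
 * the IV set to the little-endian 64-bit sector number zero-padded to
 * 16 bytes (see ivec below), so sectors can be en/decrypted
 * independently of one another.
 */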
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
			    uint8_t *out_buf, const uint8_t *in_buf,
			    int nb_sectors, int enc,
			    const AES_KEY *key)
{
	union {
		uint64_t ll[2];
		uint8_t b[16];
	} ivec;
	int i;

	for (i = 0; i < nb_sectors; i++) {
		ivec.ll[0] = cpu_to_le64(sector_num);
		ivec.ll[1] = 0;
		AES_cbc_encrypt(in_buf, out_buf, 512, key,
				ivec.b, enc);
		sector_num++;
		in_buf += 512;
		out_buf += 512;
	}
}

static int qtruncate(int fd, off_t length, int sparse)
{
	int ret, i;
	int current = 0, rem = 0;
	uint64_t sectors;
	struct stat st;
	char *buf;

	/* If length is greater than the current file len
	 * we synchronously write zeroes to the end of the
	 * file, otherwise we truncate the length down
	 */
	ret = fstat(fd, &st);
	if (ret == -1)
		return -1;
	if (S_ISBLK(st.st_mode))
		return 0;

	sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	rem     = st.st_size % DEFAULT_SECTOR_SIZE;

	/* If we are extending this file, we write zeros to the end --
	 * this tries to ensure that the extents allocated wind up being
	 * contiguous on disk.
	 */
	if (st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
		/*We are extending the file*/
		if ((ret = posix_memalign((void **)&buf,
					  512, DEFAULT_SECTOR_SIZE))) {
			DPRINTF("posix_memalign failed: %d\n", ret);
			return -1;
		}
		memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
		if (lseek(fd, 0, SEEK_END) == -1) {
			DPRINTF("Lseek EOF failed (%d), internal error\n",
				errno);
			free(buf);
			return -1;
		}
		if (rem) {
			ret = write(fd, buf, rem);
			if (ret != rem) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		for (i = current; i < sectors; i++) {
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
			if (ret != DEFAULT_SECTOR_SIZE) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		free(buf);
	} else if (sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
		if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE) == -1) {
			DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
			return -1;
		}
	return 0;
}

/* 'allocate' is:
 *
 * 0 to not allocate.
 *
 * 1 to allocate a normal cluster (for sector indexes 'n_start' to
 * 'n_end')
 *
 * 2 to allocate a compressed cluster of size
 * 'compressed_size'. 'compressed_size' must be > 0 and <
 * cluster_size
 *
 * return 0 if not allocated.
 */
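/*
 * Worked example of the two-level lookup (illustrative, assuming the
 * common cluster_bits = 12, l2_bits = 9 layout): for a byte offset,
 *   l1_index = offset >> (l2_bits + cluster_bits) = offset >> 21,
 *   l2_index = (offset >> cluster_bits) & (l2_size - 1)
 *            = (offset >> 12) & 511,
 * so offset 0x500000 (5 MB) maps to l1_index 2, l2_index 256.
 */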
static uint64_t get_cluster_offset(struct tdqcow_state *s,
				   uint64_t offset, int allocate,
				   int compressed_size,
				   int n_start, int n_end)
{
	int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
	char *tmp_ptr, *tmp_ptr2, *l2_ptr, *l1_ptr;
	uint64_t l2_offset, *l2_table, cluster_offset, tmp;
	uint32_t min_count;
	int new_l2_table;

	/*Check L1 table for the extent offset*/
	l1_index = offset >> (s->l2_bits + s->cluster_bits);
	l2_offset = s->l1_table[l1_index];
	new_l2_table = 0;
	if (!l2_offset) {
		if (!allocate)
			return 0;
		/*
		 * allocating a new l2 entry + extent
		 * at the end of the file, we must also
		 * update the L1 entry safely.
		 */
		l2_offset = s->fd_end;

		/* round to cluster size */
		l2_offset = (l2_offset + s->cluster_size - 1)
			& ~(s->cluster_size - 1);

		/* update the L1 entry */
		s->l1_table[l1_index] = l2_offset;
		tmp = cpu_to_be64(l2_offset);

		/*Truncate file for L2 table
		 *(initialised to zero in case we crash)*/
		if (qtruncate(s->fd,
			      l2_offset + (s->l2_size * sizeof(uint64_t)),
			      s->sparse) != 0) {
			DPRINTF("ERROR truncating file\n");
			return 0;
		}
		s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));

		/*Update the L1 table entry on disk
		 * (for O_DIRECT we write 4KByte blocks)*/
		l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
		l1_ptr = (char *)s->l1_table + (l1_sector << 12);

		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L1 table\n");
			return 0; /* tmp_ptr is unusable on failure */
		}
		memcpy(tmp_ptr, l1_ptr, 4096);

		/*
		 * Issue non-asynchronous L1 write.
		 * For safety, we must ensure that
		 * entry is written before blocks.
		 */
		lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
		if (write(s->fd, tmp_ptr, 4096) != 4096) {
			free(tmp_ptr);
			return 0;
		}
		free(tmp_ptr);

		new_l2_table = 1;
		goto cache_miss;
	} else if (s->min_cluster_alloc == s->l2_size) {
		/*Fast-track the request*/
		cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
		l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
		return cluster_offset + (l2_index * s->cluster_size);
	}

	/*Check to see if L2 entry is already cached*/
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (l2_offset == s->l2_cache_offsets[i]) {
			/* increment the hit count */
			if (++s->l2_cache_counts[i] == 0xffffffff) {
				for (j = 0; j < L2_CACHE_SIZE; j++) {
					s->l2_cache_counts[j] >>= 1;
				}
			}
			l2_table = s->l2_cache + (i << s->l2_bits);
			goto found;
		}
	}

cache_miss:
	/* not found: load a new entry in the least used one */
	min_index = 0;
	min_count = 0xffffffff;
	for (i = 0; i < L2_CACHE_SIZE; i++) {
		if (s->l2_cache_counts[i] < min_count) {
			min_count = s->l2_cache_counts[i];
			min_index = i;
		}
	}
	l2_table = s->l2_cache + (min_index << s->l2_bits);

	/*If extent pre-allocated, read table from disk,
	 *otherwise write new table to disk*/
	if (new_l2_table) {
		/*Should we allocate the whole extent? Adjustable parameter.*/
		if (s->cluster_alloc == s->l2_size) {
			cluster_offset = l2_offset +
				(s->l2_size * sizeof(uint64_t));
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			if (qtruncate(s->fd, cluster_offset +
				      (s->cluster_size * s->l2_size),
				      s->sparse) != 0) {
				DPRINTF("ERROR truncating file\n");
				return 0;
			}
			s->fd_end = cluster_offset +
				(s->cluster_size * s->l2_size);
			for (i = 0; i < s->l2_size; i++) {
				l2_table[i] = cpu_to_be64(cluster_offset +
							  (i * s->cluster_size));
			}
		} else
			memset(l2_table, 0, s->l2_size * sizeof(uint64_t));

		lseek(s->fd, l2_offset, SEEK_SET);
		if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	} else {
		lseek(s->fd, l2_offset, SEEK_SET);
		if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
		    s->l2_size * sizeof(uint64_t))
			return 0;
	}

	/*Update the cache entries*/
	s->l2_cache_offsets[min_index] = l2_offset;
	s->l2_cache_counts[min_index] = 1;

found:
	/*The extent is split into 's->l2_size' blocks of
	 *size 's->cluster_size'*/
	l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
	cluster_offset = be64_to_cpu(l2_table[l2_index]);

	if (!cluster_offset ||
	    ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
		if (!allocate)
			return 0;

		if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
		    (n_end - n_start) < s->cluster_sectors) {
			/* cluster is already allocated but compressed, we must
			   decompress it in the case it is not completely
			   overwritten */
			if (decompress_cluster(s, cluster_offset) < 0)
				return 0;
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			cluster_offset = (cluster_offset + s->cluster_size - 1)
				& ~(s->cluster_size - 1);
			/* write the cluster content - not asynchronous */
			lseek(s->fd, cluster_offset, SEEK_SET);
			if (write(s->fd, s->cluster_cache, s->cluster_size) !=
			    s->cluster_size)
				return -1;
		} else {
			/* allocate a new cluster */
			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
			if (allocate == 1) {
				/* round to cluster size */
				cluster_offset =
					(cluster_offset + s->cluster_size - 1)
					& ~(s->cluster_size - 1);
				if (qtruncate(s->fd, cluster_offset +
					      s->cluster_size, s->sparse) != 0) {
					DPRINTF("ERROR truncating file\n");
					return 0;
				}
				s->fd_end = (cluster_offset + s->cluster_size);
				/* if encrypted, we must initialize the cluster
				   content which won't be written */
				if (s->crypt_method &&
				    (n_end - n_start) < s->cluster_sectors) {
					uint64_t start_sect;
					start_sect = (offset &
						      ~(s->cluster_size - 1))
						>> 9;
					memset(s->cluster_data + 512,
					       0xaa, 512);
					for (i = 0; i < s->cluster_sectors; i++) {
						if (i < n_start || i >= n_end) {
							encrypt_sectors(s, start_sect + i,
									s->cluster_data,
									s->cluster_data + 512, 1, 1,
									&s->aes_encrypt_key);
							lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
							if (write(s->fd, s->cluster_data, 512) != 512)
								return -1;
						}
					}
				}
			} else {
				cluster_offset |= QCOW_OFLAG_COMPRESSED |
					(uint64_t)compressed_size
					<< (63 - s->cluster_bits);
			}
		}
		/* update L2 table */
		tmp = cpu_to_be64(cluster_offset);
		l2_table[l2_index] = tmp;

		/*For IO_DIRECT we write 4KByte blocks*/
		l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
		l2_ptr = (char *)l2_table + (l2_sector << 12);

		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
			DPRINTF("ERROR allocating memory for L2 table\n");
			return 0; /* tmp_ptr2 is unusable on failure */
		}
		memcpy(tmp_ptr2, l2_ptr, 4096);
		lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
		if (write(s->fd, tmp_ptr2, 4096) != 4096) {
			free(tmp_ptr2);
			return -1;
		}
		free(tmp_ptr2);
	}
	return cluster_offset;
}

static void init_cluster_cache(struct disk_driver *dd)
{
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	uint32_t count = 0;
	int i, cluster_entries;

	cluster_entries = s->cluster_size / 512;
	DPRINTF("Initialising Cluster cache, %d sectors per cluster (%d cluster size)\n",
		cluster_entries, s->cluster_size);

	for (i = 0; i < bs->size; i += cluster_entries) {
		if (get_cluster_offset(s, i << 9, 0, 0, 0, 1)) count++;
		if (count >= L2_CACHE_SIZE) return;
	}
	DPRINTF("Finished cluster initialisation, added %d entries\n", count);
	return;
}

static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
			     int nb_sectors, int *pnum)
{
	int index_in_cluster, n;
	uint64_t cluster_offset;

	cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
	index_in_cluster = sector_num & (s->cluster_sectors - 1);
	n = s->cluster_sectors - index_in_cluster;
	if (n > nb_sectors)
		n = nb_sectors;
	*pnum = n;
	return (cluster_offset != 0);
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
			     const uint8_t *buf, int buf_size)
{
	z_stream strm1, *strm = &strm1;
	int ret, out_len;

	memset(strm, 0, sizeof(*strm));

	strm->next_in = (uint8_t *)buf;
	strm->avail_in = buf_size;
	strm->next_out = out_buf;
	strm->avail_out = out_buf_size;

	ret = inflateInit2(strm, -12);
	if (ret != Z_OK)
		return -1;
	ret = inflate(strm, Z_FINISH);
	out_len = strm->next_out - out_buf;
	if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
	    (out_len != out_buf_size)) {
		inflateEnd(strm);
		return -1;
	}
	inflateEnd(strm);
	return 0;
}

static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
{
	int ret, csize;
	uint64_t coffset;

	coffset = cluster_offset & s->cluster_offset_mask;
	if (s->cluster_cache_offset != coffset) {
		csize = cluster_offset >> (63 - s->cluster_bits);
		csize &= (s->cluster_size - 1);
		lseek(s->fd, coffset, SEEK_SET);
		ret = read(s->fd, s->cluster_data, csize);
		if (ret != csize)
			return -1;
		if (decompress_buffer(s->cluster_cache, s->cluster_size,
				      s->cluster_data, csize) < 0) {
			return -1;
		}
		s->cluster_cache_offset = coffset;
	}
	return 0;
}
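
/*
 * Illustrative note on the compressed L2 entry encoding decoded above:
 * bit 63 is QCOW_OFLAG_COMPRESSED, the cluster_bits bits below it carry
 * the compressed byte count, and the low (63 - cluster_bits) bits
 * (cluster_offset_mask) give the file offset of the compressed data,
 * matching the encoding written by get_cluster_offset() for allocate == 2.
 */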

static inline void init_fds(struct disk_driver *dd)
{
	int i;
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;

	for (i = 0; i < MAX_IOFD; i++)
		dd->io_fd[i] = 0;

	dd->io_fd[0] = s->aio_ctx.pollfd;
}

/* Open the disk file and initialize qcow state. */
int tdqcow_open(struct disk_driver *dd, const char *name, td_flag_t flags)
{
	int fd, len, i, shift, ret, size, l1_table_size, o_flags;
	struct td_state     *bs = dd->td_state;
	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
	char *buf;
	QCowHeader *header;
	QCowHeader_ext *exthdr;
	uint32_t cksum;
	uint64_t final_cluster = 0;

	DPRINTF("QCOW: Opening %s\n", name);

	o_flags = O_DIRECT | O_LARGEFILE |
		((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
	fd = open(name, o_flags);
	if (fd < 0) {
		DPRINTF("Unable to open %s (%d)\n", name, 0 - errno);
		return -1;
	}

	s->fd = fd;
	if (asprintf(&s->name, "%s", name) == -1) {
		close(fd);
		return -1;
	}

	ASSERT(sizeof(QCowHeader) + sizeof(QCowHeader_ext) < 512);

	ret = posix_memalign((void **)&buf, 512, 512);
	if (ret != 0) goto fail;

	if (read(fd, buf, 512) != 512)
		goto fail;

	header = (QCowHeader *)buf;
	be32_to_cpus(&header->magic);
	be32_to_cpus(&header->version);
	be64_to_cpus(&header->backing_file_offset);
	be32_to_cpus(&header->backing_file_size);
	be32_to_cpus(&header->mtime);
	be64_to_cpus(&header->size);
	be32_to_cpus(&header->crypt_method);
	be64_to_cpus(&header->l1_table_offset);

	if (header->magic != QCOW_MAGIC || header->version > QCOW_VERSION)
		goto fail;
	if (header->size <= 1 || header->cluster_bits < 9)
		goto fail;
	if (header->crypt_method > QCOW_CRYPT_AES)
		goto fail;
	s->crypt_method_header = header->crypt_method;
	if (s->crypt_method_header)
		s->encrypted = 1;
	s->cluster_bits = header->cluster_bits;
	s->cluster_size = 1 << s->cluster_bits;
	s->cluster_sectors = 1 << (s->cluster_bits - 9);
	s->l2_bits = header->l2_bits;
	s->l2_size = 1 << s->l2_bits;
	s->cluster_alloc = s->l2_size;
	bs->size = header->size / 512;
	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
	s->backing_file_offset = header->backing_file_offset;
	s->backing_file_size   = header->backing_file_size;

	/* read the level 1 table */
	shift = s->cluster_bits + s->l2_bits;
	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;

	s->l1_table_offset = header->l1_table_offset;

	/*allocate a 4Kbyte multiple of memory*/
	l1_table_size = s->l1_size * sizeof(uint64_t);
	if (l1_table_size % 4096 > 0) {
		l1_table_size = ((l1_table_size >> 12) + 1) << 12;
	}
	ret = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
	if (ret != 0) goto fail;

	memset(s->l1_table, 0x00, l1_table_size);

	DPRINTF("L1 Table offset detected: %llu, size %d (%d)\n",
		(long long)s->l1_table_offset,
		(int)(s->l1_size * sizeof(uint64_t)),
		l1_table_size);

	lseek(fd, s->l1_table_offset, SEEK_SET);
	if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
		goto fail;

	for (i = 0; i < s->l1_size; i++) {
		//be64_to_cpus(&s->l1_table[i]);
		//DPRINTF("L1[%d] => %llu\n", i, s->l1_table[i]);
		if (s->l1_table[i] > final_cluster)
			final_cluster = s->l1_table[i];
	}

	/* alloc L2 cache */
	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
	if (ret != 0) goto fail;

	size = s->cluster_size;
	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
	if (ret != 0) goto fail;

	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
	if (ret != 0) goto fail;
	s->cluster_cache_offset = -1;

	if (s->backing_file_offset != 0)
		s->cluster_alloc = 1; /*Cannot use pre-alloc*/

	bs->sector_size = 512;
	bs->info = 0;

	/*Detect min_cluster_alloc*/
	s->min_cluster_alloc = 1; /*Default*/
	if (s->backing_file_offset == 0 && s->l1_table_offset % 4096 == 0) {
		/*We test to see if the xen magic # exists*/
		exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
		be32_to_cpus(&exthdr->xmagic);
		if (exthdr->xmagic != XEN_MAGIC)
			goto end_xenhdr;

		/*Finally check the L1 table cksum*/
		be32_to_cpus(&exthdr->cksum);
		cksum = gen_cksum((char *)s->l1_table,
				  s->l1_size * sizeof(uint64_t));
		if (exthdr->cksum != cksum)
			goto end_xenhdr;

		be32_to_cpus(&exthdr->min_cluster_alloc);
		be32_to_cpus(&exthdr->flags);
		s->sparse = (exthdr->flags & SPARSE_FILE);
		s->min_cluster_alloc = exthdr->min_cluster_alloc;
	}

 end_xenhdr:
	if (init_aio_state(dd) != 0) {
		DPRINTF("Unable to initialise AIO state\n");
		free_aio_state(dd);
		goto fail;
	}
	init_fds(dd);

	if (!final_cluster)
		s->fd_end = s->l1_table_offset + l1_table_size;
	else {
		s->fd_end = lseek64(fd, 0, SEEK_END);
		if (s->fd_end == (off64_t)-1)
			goto fail;
	}

	return 0;

fail:
	DPRINTF("QCOW Open failed\n");
	free_aio_state(dd);
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(fd);
	return -1;
}

int tdqcow_queue_read(struct disk_driver *dd, uint64_t sector,
		      int nb_sectors, char *buf, td_callback_t cb,
		      int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	int ret = 0, index_in_cluster, n, i, rsp = 0;
	uint64_t cluster_offset, sec, nr_secs;

	sec     = sector;
	nr_secs = nb_sectors;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		cluster_offset =
			get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

		if (!cluster_offset) {
			aio_unlock(s, sector);
			ret = cb(dd, BLK_NOT_ALLOCATED,
				 sector, n, id, private);
			if (ret == -EBUSY) {
				/* mark remainder of request
				 * as busy and try again later */
				return cb(dd, -EBUSY, sector + n,
					  nb_sectors - n, id, private);
			} else
				rsp += ret;
		} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
			aio_unlock(s, sector);
			if (decompress_cluster(s, cluster_offset) < 0) {
				rsp += cb(dd, -EIO, sector,
					  nb_sectors, id, private);
				goto done;
			}
			memcpy(buf, s->cluster_cache + index_in_cluster * 512,
			       512 * n);
			rsp += cb(dd, 0, sector, n, id, private);
		} else {
			async_read(s, n * 512,
				   (cluster_offset + index_in_cluster * 512),
				   buf, cb, id, sector, private);
		}
		nb_sectors -= n;
		sector     += n;
		buf        += n * 512;
	}
done:
	return rsp;
}

int tdqcow_queue_write(struct disk_driver *dd, uint64_t sector,
		       int nb_sectors, char *buf, td_callback_t cb,
		       int id, void *private)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	int ret = 0, index_in_cluster, n, i;
	uint64_t cluster_offset, sec, nr_secs;

	sec     = sector;
	nr_secs = nb_sectors;

	/*Check we can get a lock*/
	for (i = 0; i < nb_sectors; i++)
		if (!aio_can_lock(s, sector + i))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->iocb_free_count == 0 || !aio_lock(s, sector))
			return cb(dd, -EBUSY, sector, nb_sectors, id, private);

		cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
						    index_in_cluster,
						    index_in_cluster + n);
		if (!cluster_offset) {
			DPRINTF("Ooops, no write cluster offset!\n");
			aio_unlock(s, sector);
			return cb(dd, -EIO, sector, nb_sectors, id, private);
		}

		if (s->crypt_method) {
			encrypt_sectors(s, sector, s->cluster_data,
					(unsigned char *)buf, n, 1,
					&s->aes_encrypt_key);
			async_write(s, n * 512,
				    (cluster_offset + index_in_cluster * 512),
				    (char *)s->cluster_data, cb, id, sector,
				    private);
		} else {
			async_write(s, n * 512,
				    (cluster_offset + index_in_cluster * 512),
				    buf, cb, id, sector, private);
		}

		nb_sectors -= n;
		sector     += n;
		buf        += n * 512;
	}
	s->cluster_cache_offset = -1; /* disable compressed cache */

	return 0;
}

int tdqcow_submit(struct disk_driver *dd)
{
	int ret;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (!prv->iocb_queued)
		return 0;

	ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued, prv->iocb_queue);

	/* XXX: TODO: Handle error conditions here. */

	/* Success case: */
	prv->iocb_queued = 0;

	return 0;
}

int tdqcow_close(struct disk_driver *dd)
{
	struct tdqcow_state *s = (struct tdqcow_state *)dd->private;
	uint32_t cksum, out;
	int fd, offset;

	/*Update the hdr cksum*/
	if (s->min_cluster_alloc == s->l2_size) {
		cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
		printf("Writing cksum: %d", cksum);
		fd = open(s->name, O_WRONLY | O_LARGEFILE); /*Open without O_DIRECT*/
		offset = sizeof(QCowHeader) + sizeof(uint32_t);
		lseek(fd, offset, SEEK_SET);
		out = cpu_to_be32(cksum);
		if (write(fd, &out, sizeof(uint32_t)))
			; /* best-effort update; result deliberately ignored */
		close(fd);
	}

	io_destroy(s->aio_ctx.aio_ctx);
	free(s->name);
	free(s->l1_table);
	free(s->l2_cache);
	free(s->cluster_cache);
	free(s->cluster_data);
	close(s->fd);
	return 0;
}

int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
{
	int ret, i, nr_events, rsp = 0, *ptr;
	struct io_event *ep;
	struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;

	if (sid > MAX_IOFD) return 1;

	nr_events = tap_aio_get_events(&prv->aio_ctx);
repeat:
	for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) {
		struct iocb        *io  = ep->obj;
		struct pending_aio *pio;

		pio = &prv->pending_aio[(long)io->data];

		aio_unlock(prv, pio->sector);

		if (prv->crypt_method)
			encrypt_sectors(prv, pio->sector,
					(unsigned char *)pio->buf,
					(unsigned char *)pio->buf,
					pio->nb_sectors, 0,
					&prv->aes_decrypt_key);

		rsp += pio->cb(dd, ep->res == io->u.c.nbytes ? 0 : 1,
			       pio->sector, pio->nb_sectors,
			       pio->id, pio->private);

		prv->iocb_free[prv->iocb_free_count++] = io;
	}

	if (nr_events) {
		nr_events = tap_aio_more_events(&prv->aio_ctx);
		goto repeat;
	}

	tap_aio_continue(&prv->aio_ctx);

	return rsp;
}

int qcow_create(const char *filename, uint64_t total_size,
		const char *backing_file, int sparse)
{
	int fd, header_size, backing_filename_len, l1_size, i;
	int shift, length, adjust, flags = 0, ret = 0;
	QCowHeader header;
	QCowHeader_ext exthdr;
	char backing_filename[1024], *ptr;
	uint64_t tmp, size, total_length;
	struct stat st;

	DPRINTF("Qcow_create: size %llu\n", (long long unsigned)total_size);

	fd = open(filename,
		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
		  0644);
	if (fd < 0)
		return -1;

	memset(&header, 0, sizeof(header));
	header.magic = cpu_to_be32(QCOW_MAGIC);
	header.version = cpu_to_be32(QCOW_VERSION);

	/*Create extended header fields*/
	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);

	header_size = sizeof(header) + sizeof(QCowHeader_ext);
	backing_filename_len = 0;
	size = (total_size >> SECTOR_SHIFT);
	if (backing_file) {
		if (strcmp(backing_file, "fat:")) {
			const char *p;
			/* XXX: this is a hack: we do not attempt to
			 *check for URL like syntax */
			p = strchr(backing_file, ':');
			if (p && (p - backing_file) >= 2) {
				/* URL like but exclude "c:" like filenames */
				strncpy(backing_filename, backing_file,
					sizeof(backing_filename));
			} else {
				if (realpath(backing_file, backing_filename) == NULL ||
				    stat(backing_filename, &st) != 0) {
					return -1;
				}
			}

			header.backing_file_offset = cpu_to_be64(header_size);
			backing_filename_len = strlen(backing_filename);
			header.backing_file_size = cpu_to_be32(
				backing_filename_len);
			header_size += backing_filename_len;

			/*Set to the backing file size*/
			if (get_filesize(backing_filename, &size, &st)) {
				return -1;
			}
			DPRINTF("Backing file size detected: %lld sectors"
				"(total %lld [%lld MB])\n",
				(long long)size,
				(long long)(size << SECTOR_SHIFT),
				(long long)(size >> 11));
		} else {
			backing_file = NULL;
			DPRINTF("Setting file size: %lld (total %lld)\n",
				(long long)total_size,
				(long long)(total_size << SECTOR_SHIFT));
		}
		header.mtime = cpu_to_be32(st.st_mtime);
		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
					    unmodified sectors */
		header.l2_bits = 12; /* 32 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1);
	} else {
		DPRINTF("Setting file size: %lld sectors"
			"(total %lld [%lld MB])\n",
			(long long)size,
			(long long)(size << SECTOR_SHIFT),
			(long long)(size >> 11));
		header.cluster_bits = 12; /* 4 KB clusters */
		header.l2_bits = 9; /* 4 KB L2 tables */
		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
	}
	/*Set the header size value*/
	header.size = cpu_to_be64(size * 512);

	header_size = (header_size + 7) & ~7;
	if (header_size % 4096 > 0) {
		header_size = ((header_size >> 12) + 1) << 12;
	}

	shift = header.cluster_bits + header.l2_bits;
	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;

	header.l1_table_offset = cpu_to_be64(header_size);
	DPRINTF("L1 Table offset: %d, size %d\n",
		header_size,
		(int)(l1_size * sizeof(uint64_t)));
	header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);

	ptr = calloc(1, l1_size * sizeof(uint64_t));
	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
	printf("Created cksum: %d\n", exthdr.cksum);
	free(ptr);

	/*adjust file length to system page size boundary*/
	length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
			 getpagesize());
	if (qtruncate(fd, length, 0) != 0) {
		DPRINTF("ERROR truncating file\n");
		return -1;
	}

	if (sparse == 0) {
		/*Filesize is length + l1_size * (1 << s->l2_bits) + (size * 512)*/
		total_length = length + (l1_size * (1 << 9)) + (size * 512);
		if (qtruncate(fd, total_length, 0) != 0) {
			DPRINTF("ERROR truncating file\n");
			return -1;
		}
		printf("File truncated to length %"PRIu64"\n", total_length);
	} else
		flags = SPARSE_FILE;

	exthdr.flags = cpu_to_be32(flags);

	/* write all the data */
	lseek(fd, 0, SEEK_SET);
	ret += write(fd, &header, sizeof(header));
	ret += write(fd, &exthdr, sizeof(exthdr));
	if (backing_file)
		ret += write(fd, backing_filename, backing_filename_len);

	lseek(fd, header_size, SEEK_SET);
	tmp = 0;
	for (i = 0; i < l1_size; i++) {
		ret += write(fd, &tmp, sizeof(tmp));
	}

	close(fd);

	return 0;
}

int qcow_make_empty(struct tdqcow_state *s)
{
	uint32_t l1_length = s->l1_size * sizeof(uint64_t);

	memset(s->l1_table, 0, l1_length);
	lseek(s->fd, s->l1_table_offset, SEEK_SET);
	if (write(s->fd, s->l1_table, l1_length) < 0)
		return -1;
	if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse) != 0) {
		DPRINTF("ERROR truncating file\n");
		return -1;
	}

	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));

	return 0;
}

int qcow_get_cluster_size(struct tdqcow_state *s)
{
	return s->cluster_size;
}

/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
			  const uint8_t *buf)
{
	z_stream strm;
	int ret, out_len;
	uint8_t *out_buf;
	uint64_t cluster_offset;

	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!out_buf)
		return -1;

	/* best compression, small window, no zlib header */
	memset(&strm, 0, sizeof(strm));
	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			   Z_DEFLATED, -12,
			   9, Z_DEFAULT_STRATEGY);
	if (ret != 0) {
		free(out_buf);
		return -1;
	}

	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	strm.avail_out = s->cluster_size;
	strm.next_out = out_buf;

	ret = deflate(&strm, Z_FINISH);
	if (ret != Z_STREAM_END && ret != Z_OK) {
		free(out_buf);
		deflateEnd(&strm);
		return -1;
	}
	out_len = strm.next_out - out_buf;

	deflateEnd(&strm);

	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
						    out_len, 0, 0);
		cluster_offset &= s->cluster_offset_mask;
		lseek(s->fd, cluster_offset, SEEK_SET);
		if (write(s->fd, out_buf, out_len) != out_len) {
			free(out_buf);
			return -1;
		}
	}

	free(out_buf);
	return 0;
}

int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
{
	off_t off;
	char *buf, *filename;
	int len, secs, err = -EINVAL;
	struct tdqcow_state *child = (struct tdqcow_state *)dd->private;

	if (!child->backing_file_offset)
		return TD_NO_PARENT;

	/* read the backing file name */
	len  = child->backing_file_size;
	off  = child->backing_file_offset - (child->backing_file_offset % 512);
	secs = (len + (child->backing_file_offset - off) + 511) >> 9;
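	/*
	 * Worked example (illustrative): with backing_file_offset = 104
	 * and len = 20, off = 0 and secs = (20 + 104 + 511) >> 9 = 1,
	 * i.e. one aligned 512-byte sector covers the whole name --
	 * needed because the file was opened with O_DIRECT.
	 */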

	if (posix_memalign((void **)&buf, 512, secs << 9))
		return -1;

	if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
		goto out;

	if (read(child->fd, buf, secs << 9) != secs << 9)
		goto out;
	filename      = buf + (child->backing_file_offset - off);
	filename[len] = '\0';

	id->name       = strdup(filename);
	id->drivertype = DISK_TYPE_QCOW;
	err            = 0;
 out:
	free(buf);
	return err;
}

int tdqcow_validate_parent(struct disk_driver *child,
			   struct disk_driver *parent, td_flag_t flags)
{
	struct stat stats;
	uint64_t psize, csize;
	struct tdqcow_state *c = (struct tdqcow_state *)child->private;
	struct tdqcow_state *p = (struct tdqcow_state *)parent->private;

	if (stat(p->name, &stats))
		return -EINVAL;
	if (get_filesize(p->name, &psize, &stats))
		return -EINVAL;

	if (stat(c->name, &stats))
		return -EINVAL;
	if (get_filesize(c->name, &csize, &stats))
		return -EINVAL;

	if (csize != psize)
		return -EINVAL;

	return 0;
}

struct tap_disk tapdisk_qcow = {
	.disk_type          = "tapdisk_qcow",
	.private_data_size  = sizeof(struct tdqcow_state),
	.td_open            = tdqcow_open,
	.td_queue_read      = tdqcow_queue_read,
	.td_queue_write     = tdqcow_queue_write,
	.td_submit          = tdqcow_submit,
	.td_close           = tdqcow_close,
	.td_do_callbacks    = tdqcow_do_callbacks,
	.td_get_parent_id   = tdqcow_get_parent_id,
	.td_validate_parent = tdqcow_validate_parent
};
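
/*
 * Sketch of the driver lifecycle as the tapdisk daemon drives this
 * vtable (illustrative): td_open prepares the qcow state and AIO
 * context, each td_queue_read/td_queue_write call stages iocbs via
 * async_read()/async_write(), td_submit pushes the staged iocbs to the
 * kernel with io_submit(), and td_do_callbacks reaps completions and
 * fires the per-request callbacks recorded in struct pending_aio.
 */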