xenbits.xensource.com Git - people/dstodden/blktap.git/commitdiff
Import libvhdio from the XenClient fork of blktap (required for transfervm VHD tests)
author Andrei Lifchits <andrei.lifchits@citrix.com>
Thu, 1 Apr 2010 02:23:32 +0000 (19:23 -0700)
committer Andrei Lifchits <andrei.lifchits@citrix.com>
Thu, 1 Apr 2010 02:23:32 +0000 (19:23 -0700)
13 files changed:
Makefile
include/libvhd.h
include/partition.h [new file with mode: 0644]
include/vhd.h
mk/Makefile
mk/blktap.spec.in
part/Makefile [new file with mode: 0644]
part/part-util.c [new file with mode: 0644]
part/partition.c [new file with mode: 0644]
part/vhdpartx [new file with mode: 0644]
vhd/lib/Makefile
vhd/lib/libvhd.c
vhd/lib/libvhdio.c [new file with mode: 0644]

index d26d35ffcaf774416df34420f2c1948a3b04dfb5..8132d6bbb0393266d3d0b1642dbce884a9974a8f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ include $(BLKTAP_ROOT)/Rules.mk
 SUBDIRS-y :=
 SUBDIRS-y += include
 SUBDIRS-y += lvm
+SUBDIRS-y += part
 SUBDIRS-y += vhd
 SUBDIRS-y += drivers
 SUBDIRS-y += daemon
index 4164568c719538e230f2ab10c816dd2ee4c62157..5368c432008bcd8df983d89afe7128908d59d8fc 100644 (file)
@@ -12,6 +12,7 @@
 #include <uuid/uuid.h>
 
 #include "vhd.h"
+#include "list.h"
 
 #if BYTE_ORDER == LITTLE_ENDIAN
   #define BE16_IN(foo)             (*(foo)) = bswap_16(*(foo))
 #define VHD_OPEN_FAST              0x00004
 #define VHD_OPEN_STRICT            0x00008
 #define VHD_OPEN_IGNORE_DISABLED   0x00010
+#define VHD_OPEN_CACHED            0x00020
+#define VHD_OPEN_IO_WRITE_SPARSE   0x00040
 
-#define VHD_FLAG_CREAT_PARENT_RAW        0x00001
+#define VHD_FLAG_CREAT_FILE_SIZE_FIXED   0x00001
+#define VHD_FLAG_CREAT_PARENT_RAW        0x00002
 
 #define vhd_flag_set(word, flag)         ((word) |= (flag))
 #define vhd_flag_clear(word, flag)       ((word) &= ~(flag))
@@ -114,6 +118,8 @@ struct vhd_context {
        vhd_footer_t               footer;
        vhd_bat_t                  bat;
        vhd_batmap_t               batmap;
+
+       struct list_head           next;
 };
 
 static inline int
@@ -241,6 +247,8 @@ int vhd_snapshot(const char *snapshot, uint64_t bytes, const char *parent,
 
 int vhd_hidden(vhd_context_t *, int *);
 int vhd_chain_depth(vhd_context_t *, int *);
+int vhd_marker(vhd_context_t *, char *);   
+int vhd_set_marker(vhd_context_t *, char); 
 
 off64_t vhd_position(vhd_context_t *);
 int vhd_seek(vhd_context_t *, off64_t, int);
@@ -268,6 +276,7 @@ int vhd_batmap_test(vhd_context_t *, vhd_batmap_t *, uint32_t);
 void vhd_batmap_set(vhd_context_t *, vhd_batmap_t *, uint32_t);
 void vhd_batmap_clear(vhd_context_t *, vhd_batmap_t *, uint32_t);
 
+int vhd_file_size_fixed(vhd_context_t *);
 int vhd_get_phys_size(vhd_context_t *, off64_t *);
 int vhd_set_phys_size(vhd_context_t *, off64_t);
 int vhd_set_virt_size(vhd_context_t *, uint64_t);
@@ -312,5 +321,7 @@ int vhd_write_block(vhd_context_t *, uint32_t block, char *data);
 
 int vhd_io_read(vhd_context_t *, char *, uint64_t, uint32_t);
 int vhd_io_write(vhd_context_t *, char *, uint64_t, uint32_t);
+int vhd_io_read_bytes(vhd_context_t *, char *, size_t, uint64_t);
+int vhd_io_write_bytes(vhd_context_t *, char *, size_t, uint64_t);
 
 #endif
diff --git a/include/partition.h b/include/partition.h
new file mode 100644 (file)
index 0000000..e241779
--- /dev/null
@@ -0,0 +1,45 @@
+#ifndef _PARTITION_H_
+#define _PARTITION_H_
+
+#include <inttypes.h>
+
+#define PARTITION_BOOTABLE            0x80
+#define PARTITION_NON_BOOTABLE        0x00
+
+#define MBR_SIGNATURE                 0xAA55
+#define MBR_START_SECTOR              0x80
+
+struct partition_geometry {           /* disk geometry consumed by lba_to_chs() */
+       unsigned char                 heads;
+       unsigned char                 sectors;
+       unsigned int                  cylinders;   /* part-util saturates this to ~0 when the count does not fit */
+};
+/* Packed CHS triple: chs[0] = head, chs[1] = sector in bits 0-5 plus cylinder bits 8-9 in bits 6-7, chs[2] = cylinder bits 0-7. */
+struct partition_chs {
+       uint8_t                       chs[3];
+} __attribute__((__packed__));
+/* One 16-byte primary partition slot; lba and blocks are byte-swapped by partition_table_in()/partition_table_out(). */
+struct primary_partition {
+       uint8_t                       status;      /* PARTITION_BOOTABLE or PARTITION_NON_BOOTABLE */
+       struct partition_chs          chs_first;
+       uint8_t                       type;        /* part-util counts a slot as used when this is non-zero */
+       struct partition_chs          chs_last;
+       uint32_t                      lba;         /* first sector of the partition */
+       uint32_t                      blocks;      /* length in sectors */
+} __attribute__((__packed__));
+/* 512-byte boot sector image: 0x1b8 code bytes, 4-byte disk signature, 2 pad bytes, four 16-byte slots, 2-byte signature. */
+struct partition_table {
+       uint8_t                       code[0x1b8];
+       uint32_t                      disk_signature;
+       uint8_t                       pad[0x2];
+       struct primary_partition      partitions[4];
+       uint16_t                      mbr_signature;   /* must equal MBR_SIGNATURE for partition_table_validate() to pass */
+} __attribute__((__packed__));
+
+void partition_table_in(struct partition_table *);
+void partition_table_out(struct partition_table *);
+int partition_table_validate(struct partition_table *);
+void partition_table_dump(struct partition_table *);
+struct partition_chs lba_to_chs(struct partition_geometry *, uint64_t);
+
+#endif
index 8d0270cae4425fa18700034f73076e50c370a199..e38b9d2826dc3a5d38546f7e1e43563ed015b2a0 100644 (file)
@@ -154,6 +154,7 @@ struct dd_batmap_hdr {
   u32    batmap_size;     /* batmap size in sectors                       */
   u32    batmap_version;  /* version of batmap                            */
   u32    checksum;        /* batmap checksum -- 1's complement of batmap  */
+  char   marker;          /* generic marker field                         */
 };
 
 static const char VHD_BATMAP_COOKIE[9] = "tdbatmap";
index 2fd813485e41c4de9e72ce565c85be62eb7c6f41..835c371c80514e322986e19240321a1f7753efed 100644 (file)
@@ -56,6 +56,7 @@ PROPRIETARY_SOURCE += drivers/tapdisk-filter.c
 PROPRIETARY_SOURCE += drivers/tapdisk-vbd.c
 PROPRIETARY_SOURCE += drivers/tapdisk-diff.c
 PROPRIETARY_SOURCE += vhd/lib/libvhd.c
+PROPRIETARY_SOURCE += vhd/lib/libvhdio.c
 PROPRIETARY_SOURCE += vhd/lib/libvhd-journal.c
 PROPRIETARY_SOURCE += vhd/lib/libvhd-index.c
 PROPRIETARY_SOURCE += vhd/lib/vhd-util-coalesce.c
index 7f6476e0168ce602cc2ef9f457c2d5b884be87d9..6daf0965c4a6e09a0163c43786fbfebbf1f43c58 100644 (file)
@@ -33,6 +33,7 @@ Source25: vhd-util-scan.o
 Source26: vhd-util-check.o
 Source27: vhd-util-revert.o
 Source28: tapdisk-diff.o
+Source29: libvhdio.o
 
 Patch0: %{name}-development.patch
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
@@ -81,6 +82,7 @@ cp %{SOURCE25} vhd/lib
 cp %{SOURCE26} vhd/lib
 cp %{SOURCE27} vhd/lib
 cp %{SOURCE28} drivers
+cp %{SOURCE29} vhd/lib
 
 %build
 %{__make} USE_SYSTEM_LIBRARIES=y
@@ -97,6 +99,7 @@ rm -rf $RPM_BUILD_ROOT
 %doc
 %{_libdir}/libblktap.so.*
 %{_libdir}/libvhd.so.*
+%{_libdir}/libvhdio.so.*
 %{_sbindir}/blktapctrl
 %{_sbindir}/tapdisk
 %{_sbindir}/tapdisk2
@@ -108,6 +111,8 @@ rm -rf $RPM_BUILD_ROOT
 %{_sbindir}/tapdisk-client
 %{_sbindir}/tapdisk-stream
 %{_sbindir}/tapdisk-diff
+%{_sbindir}/part-util
+%{_sbindir}/vhdpartx
 
 %files devel
 %defattr(-,root,root,-)
@@ -117,5 +122,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/libblktap.so
 %{_libdir}/libvhd.a
 %{_libdir}/libvhd.so
+%{_libdir}/libvhdio.a
+%{_libdir}/libvhdio.so
 
 %changelog
diff --git a/part/Makefile b/part/Makefile
new file mode 100644 (file)
index 0000000..308930f
--- /dev/null
@@ -0,0 +1,35 @@
+BLKTAP_ROOT := ../
+include $(BLKTAP_ROOT)/Rules.mk
+# Program built in this directory and the directory both tools are installed into.
+IBIN               = part-util
+INST_DIR           = /usr/sbin
+# Shell script shipped as-is next to $(IBIN): installed below, never built.
+VHDPARTX           = vhdpartx
+# NOTE(review): ':=' discards whatever CFLAGS value Rules.mk may have set; confirm that is intended.
+CFLAGS            := -g -O0
+CFLAGS            += -Werror
+CFLAGS            += -Wno-unused
+CFLAGS            += -I../include
+CFLAGS            += -D_GNU_SOURCE
+
+# Get gcc to generate the dependencies for us.
+CFLAGS            += -Wp,-MD,.$(@F).d
+DEPS               = .*.d
+
+all: build
+
+build: $(IBIN)
+
+part-util: part-util.o partition.o
+       $(CC) $(CFLAGS) -o $@ $^
+# Installs the compiled tool and the vhdpartx script under $(DESTDIR)$(INST_DIR).
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(VHDPARTX) $(DESTDIR)$(INST_DIR)
+
+clean:
+       rm -rf *.o *~ $(DEPS) $(IBIN)
+
+.PHONY: all build clean install
+# Pull in the generated .d dependency files when they exist; the leading '-' ignores missing ones.
+-include $(DEPS)
diff --git a/part/part-util.c b/part/part-util.c
new file mode 100644 (file)
index 0000000..9cf31f6
--- /dev/null
@@ -0,0 +1,364 @@
+#include <time.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <endian.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <byteswap.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+
+#include "partition.h"
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+  #define cpu_to_le32(x) (x)
+  #define cpu_to_le64(x) (x)
+#else
+  #define cpu_to_le32(x) bswap_32(x)
+  #define cpu_to_le64(x) bswap_64(x)
+#endif
+
+/*
+ * Print the one-line option summary for @app on stdout.
+ */
+static void
+usage(const char *app)
+{
+       fprintf(stdout,
+               "usage: %s <-i image> [-d dump] [-c count] [-f format] "
+               "[-t type] [-s sig <part>]\n",
+               app);
+}
+
+/*
+ * Split a packed CHS triple into head, sector and cylinder.
+ *
+ * Byte 0 is the head, the low six bits of byte 1 are the sector, and the
+ * top two bits of byte 1 supply cylinder bits 8-9 above byte 2.
+ */
+static void
+chs_unpack(struct partition_chs *c,
+          uint8_t *head, uint8_t *sector, uint16_t *cylinder)
+{
+       const uint8_t *raw = c->chs;
+       uint16_t cyl_high  = (raw[1] & 0xc0) << 2;
+
+       *head     = raw[0];
+       *sector   = raw[1] & 0x3f;
+       *cylinder = cyl_high + raw[2];
+}
+
+/*
+ * Print the disk and MBR signatures followed by every field of the four
+ * primary partition slots, one value per line, in hexadecimal.
+ */
+void
+partition_table_dump(struct partition_table *pt)
+{
+       int idx;
+
+       printf("disk signature   0x%08x\n", pt->disk_signature);
+       printf("mbr signature    0x%04x\n", pt->mbr_signature);
+       printf("\n");
+
+       for (idx = 0; idx < 4; idx++) {
+               struct primary_partition *part = &pt->partitions[idx];
+               uint8_t hd, sec;
+               uint16_t cyl;
+
+               printf("  %d status       0x%02x\n", idx, part->status);
+
+               /* start address */
+               chs_unpack(&part->chs_first, &hd, &sec, &cyl);
+               printf("  %d s cylinder   0x%04x\n", idx, cyl);
+               printf("  %d s sector     0x%01x\n", idx, sec);
+               printf("  %d s head       0x%01x\n", idx, hd);
+
+               printf("  %d type         0x%01x\n", idx, part->type);
+
+               /* end address */
+               chs_unpack(&part->chs_last, &hd, &sec, &cyl);
+               printf("  %d e cylinder   0x%04x\n", idx, cyl);
+               printf("  %d e sector     0x%01x\n", idx, sec);
+               printf("  %d e head       0x%01x\n", idx, hd);
+
+               printf("  %d lba          0x%08x\n", idx, part->lba);
+               printf("  %d blocks       0x%08x\n", idx, part->blocks);
+
+               printf("\n");
+       }
+}
+
+/*
+ * Read the partition table at the start of @image and print it.
+ *
+ * Returns 0 on success.  On failure returns 1 with errno describing the
+ * cause: the open()/read() error, EIO for a short read, or EINVAL when the
+ * table does not validate.
+ */
+static int
+dump_partitions(const char *image)
+{
+       int fd, err;
+       ssize_t n;
+       struct partition_table pt;
+
+       fd = open(image, O_RDONLY);
+       if (fd == -1)
+               return 1;       /* errno from open(); nothing to close */
+
+       n = read(fd, &pt, sizeof(pt));
+       if (n != (ssize_t)sizeof(pt)) {
+               /* a short read leaves errno untouched, so report EIO */
+               err = (n < 0) ? errno : EIO;
+               goto fail;
+       }
+
+       partition_table_in(&pt);
+       if (partition_table_validate(&pt)) {
+               printf("table invalid\n");
+               err = EINVAL;
+               goto fail;
+       }
+
+       partition_table_dump(&pt);
+       close(fd);
+       return 0;
+
+fail:
+       /* close() may overwrite errno; restore the error the caller reports */
+       close(fd);
+       errno = err;
+       return 1;
+}
+
+/*
+ * Print, as one line of hex, the little-endian disk signature (4 bytes)
+ * followed by the little-endian byte offset of partition @part (8 bytes).
+ *
+ * @part is 1-based; a value outside 1..4 only sets errno to EINVAL and
+ * prints nothing.
+ */
+static void
+__dump_signature(struct partition_table *pt, int part)
+{
+       const uint8_t *bytes;
+       uint32_t sig;
+       uint64_t off;
+       size_t i;
+
+       if (part < 1 || part > 4) {
+               errno = EINVAL;
+               return;
+       }
+
+       sig = pt->disk_signature;
+       off = (uint64_t)pt->partitions[part - 1].lba << 9;
+
+       sig = cpu_to_le32(sig);
+       off = cpu_to_le64(off);
+
+       bytes = (const uint8_t *)&sig;
+       for (i = 0; i < sizeof(sig); i++)
+               printf("%02x", bytes[i]);
+
+       bytes = (const uint8_t *)&off;
+       for (i = 0; i < sizeof(off); i++)
+               printf("%02x", bytes[i]);
+
+       printf("\n");
+}
+
+/*
+ * Read the partition table of @image and print the signature line for
+ * partition @part (1-based, 1..4).
+ *
+ * Returns 0 on success.  On failure returns 1 with errno describing the
+ * cause: EINVAL for an out-of-range @part or an invalid table, the
+ * open()/read() error, or EIO for a short read.
+ */
+static int
+dump_signature(const char *image, int part)
+{
+       int fd, err;
+       ssize_t n;
+       struct partition_table pt;
+
+       /*
+        * Reject a bad partition number here: __dump_signature() cannot
+        * report it, so the caller would otherwise see success.
+        */
+       if (part < 1 || part > 4) {
+               errno = EINVAL;
+               return 1;
+       }
+
+       fd = open(image, O_RDONLY);
+       if (fd == -1)
+               return 1;       /* errno from open(); nothing to close */
+
+       n = read(fd, &pt, sizeof(pt));
+       if (n != (ssize_t)sizeof(pt)) {
+               /* a short read leaves errno untouched, so report EIO */
+               err = (n < 0) ? errno : EIO;
+               goto fail;
+       }
+
+       partition_table_in(&pt);
+       if (partition_table_validate(&pt)) {
+               printf("table invalid\n");
+               err = EINVAL;
+               goto fail;
+       }
+
+       __dump_signature(&pt, part);
+       close(fd);
+       return 0;
+
+fail:
+       /* close() may overwrite errno; restore the error the caller reports */
+       close(fd);
+       errno = err;
+       return 1;
+}
+
+/*
+ * Store in *@count the number of primary partition slots of @image whose
+ * type byte is non-zero.  An image whose table does not validate counts
+ * as having zero partitions and is not an error.
+ *
+ * Returns 0 on success.  On failure returns 1 with errno set to the
+ * open()/read() error, or EIO for a short read; *@count is then untouched.
+ */
+static int
+count_partitions(const char *image, int *count)
+{
+       int i, fd, err;
+       ssize_t n;
+       struct partition_table pt;
+
+       fd = open(image, O_RDONLY);
+       if (fd == -1)
+               return 1;       /* errno from open(); nothing to close */
+
+       n = read(fd, &pt, sizeof(pt));
+       if (n != (ssize_t)sizeof(pt)) {
+               /* a short read leaves errno untouched, so report EIO */
+               err = (n < 0) ? errno : EIO;
+               close(fd);
+               errno = err;    /* close() may have overwritten it */
+               return 1;
+       }
+
+       close(fd);
+
+       *count = 0;
+       partition_table_in(&pt);
+       if (partition_table_validate(&pt))
+               return 0;
+
+       for (i = 0; i < 4; i++)
+               if (pt.partitions[i].type)
+                       (*count)++;
+
+       return 0;
+}
+
+/*
+ * Write a new MBR to block device @image holding a single bootable primary
+ * partition of type @type.  The partition starts at LBA "sectors per
+ * track" and ends on the last sector of the last whole cylinder.  @pt is
+ * filled in and left in on-disk byte order (partition_table_out() is
+ * applied before writing).
+ *
+ * Returns 0 on success.  On failure returns 1 with errno describing the
+ * cause: the open()/ioctl()/write() error, EIO for a short write, or
+ * EINVAL when the device reports a geometry that cannot be partitioned.
+ */
+static int
+format_partition(const char *image, int type, struct partition_table *pt)
+{
+       uint64_t lend;
+       uint32_t start, end;
+       int err, sec_size, fd;
+       ssize_t n;
+       unsigned int cylinders;
+       struct hd_geometry geo;
+       struct primary_partition *pp;
+       struct partition_geometry pgeo;
+       unsigned long long bytes, llcyls, cyl_secs;
+
+       memset(pt, 0, sizeof(*pt));
+       pp = pt->partitions;
+
+       srandom(time(NULL));
+
+       fd = open(image, O_RDWR);
+       if (fd == -1)
+               return 1;       /* errno from open(); nothing to close */
+
+       if (ioctl(fd, HDIO_GETGEO, &geo) ||
+           ioctl(fd, BLKGETSIZE64, &bytes) ||
+           ioctl(fd, BLKSSZGET, &sec_size)) {
+               err = errno;
+               goto fail;
+       }
+
+       /* 512-byte sectors per cylinder; zero would divide by zero below */
+       cyl_secs = (sec_size >> 9) * geo.heads * geo.sectors;
+       if (!cyl_secs) {
+               err = EINVAL;
+               goto fail;
+       }
+
+       /* a device smaller than one cylinder would underflow lend below */
+       llcyls = (bytes >> 9) / cyl_secs;
+       if (!llcyls) {
+               err = EINVAL;
+               goto fail;
+       }
+
+       /* saturate values that do not fit their on-disk field */
+       cylinders = llcyls;
+       if (cylinders != llcyls)
+               cylinders = ~0;
+
+       pgeo.heads          = geo.heads;
+       pgeo.sectors        = geo.sectors;
+       pgeo.cylinders      = cylinders;
+
+       start               = pgeo.sectors;
+       lend                = geo.heads * geo.sectors * llcyls - 1;
+
+       end = lend;
+       if (end != lend)
+               end = ~0;
+
+       pp->status          = PARTITION_BOOTABLE;
+       pp->type            = type;
+       pp->lba             = start;
+       pp->blocks          = end - start + 1;
+       pp->chs_first       = lba_to_chs(&pgeo, start);
+       pp->chs_last        = lba_to_chs(&pgeo, lend);
+
+       pt->mbr_signature   = MBR_SIGNATURE;
+       pt->disk_signature  = random();
+
+       partition_table_out(pt);
+       n = write(fd, pt, sizeof(*pt));
+       if (n != (ssize_t)sizeof(*pt)) {
+               /* a short write leaves errno untouched, so report EIO */
+               err = (n < 0) ? errno : EIO;
+               goto fail;
+       }
+
+       close(fd);
+       return 0;
+
+fail:
+       /* close() may overwrite errno; restore the error the caller reports */
+       close(fd);
+       errno = err;
+       return 1;
+}
+
+/*
+ * part-util entry point.
+ *
+ *   -i image   image or device to operate on (required)
+ *   -f         write a new single-partition table (needs -t)
+ *   -t type    partition type for -f; decimal, or hex with a 0x prefix
+ *   -c         print the number of partitions in use
+ *   -s part    print the signature line for partition <part> (1..4)
+ *   -d         dump the partition table
+ *   -h         print usage
+ *
+ * Requested actions run in the order format, count, signature, dump.
+ * Returns 0 on success and 1 on a usage error or the first failing action.
+ */
+int
+main(int argc, char *argv[])
+{
+       char *image;
+       struct partition_table pt;
+       int ret, c, type, count, dump, format, signature;
+
+       ret       = 1;
+       format    = 0;
+       count     = 0;
+       dump      = 0;
+       type      = 0;
+       signature = -1;         /* -1 means "-s not given" */
+       image     = NULL;
+
+       while ((c = getopt(argc, argv, "i:fdt:cs:h")) != -1) {
+               switch (c) {
+               case 'i':
+                       image = optarg;
+                       break;
+               case 'c':
+                       count = 1;
+                       break;
+               case 's':
+                       signature = atoi(optarg);
+                       break;
+               case 'f':
+                       format = 1;
+                       break;
+               case 't': {
+                       int base = (!strncasecmp(optarg, "0x", 2) ? 16 : 10);
+                       type = strtol(optarg, NULL, base);
+                       break;
+               }
+               case 'd':
+                       dump = 1;
+                       break;
+               case 'h':
+                       usage(argv[0]);
+                       ret = 0;
+                       goto out;
+               default:
+                       /* getopt() has already reported the bad option */
+                       errno = EINVAL;
+                       usage(argv[0]);
+                       goto out;
+               }
+       }
+
+       /*
+        * signature defaults to -1, so it must be compared against -1:
+        * "!signature" is false for -1 and would let a command line with
+        * no action at all pass this check.
+        */
+       if (!image || (!format && !count && signature == -1 && !dump)) {
+               errno = EINVAL;
+               usage(argv[0]);
+               goto out;
+       }
+
+       if (format) {
+               if (!type) {
+                       errno = EINVAL;
+                       perror("type required");
+                       goto out;
+               }
+
+               if (format_partition(image, type, &pt)) {
+                       perror("formatting partition");
+                       goto out;
+               }
+
+               __dump_signature(&pt, 1);
+       }
+
+       if (count) {
+               if (count_partitions(image, &count)) {
+                       perror("counting partitions");
+                       goto out;
+               }
+               printf("%d\n", count);
+       }
+
+       if (signature != -1) {
+               if (dump_signature(image, signature)) {
+                       perror("dumping signature");
+                       goto out;
+               }
+       }
+
+       if (dump) {
+               if (dump_partitions(image)) {
+                       perror("dumping partitions");
+                       goto out;
+               }
+       }
+
+       ret = 0;
+
+out:
+       return ret;
+}
diff --git a/part/partition.c b/part/partition.c
new file mode 100644 (file)
index 0000000..477a11f
--- /dev/null
@@ -0,0 +1,107 @@
+#include <errno.h>
+#include <endian.h>
+#include <byteswap.h>
+
+#include "partition.h"
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+  #define le16_to_cpu(x) (x)
+  #define le32_to_cpu(x) (x)
+  #define cpu_to_le16(x) (x)
+  #define cpu_to_le32(x) (x)
+#else
+  #define le16_to_cpu(x) bswap_16(x)
+  #define le32_to_cpu(x) bswap_32(x)
+  #define cpu_to_le16(x) bswap_16(x)
+  #define cpu_to_le32(x) bswap_32(x)
+#endif
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a)[0])
+
+/* Convert the multi-byte fields of one partition slot from on-disk (little-endian) to CPU byte order, in place. */
+void
+primary_partition_in(struct primary_partition *p)
+{
+       uint32_t first  = p->lba;
+       uint32_t length = p->blocks;
+
+       p->lba    = le32_to_cpu(first);
+       p->blocks = le32_to_cpu(length);
+}
+
+/* Convert the multi-byte fields of one partition slot from CPU to on-disk (little-endian) byte order, in place. */
+void
+primary_partition_out(struct primary_partition *p)
+{
+       uint32_t first  = p->lba;
+       uint32_t length = p->blocks;
+
+       p->lba    = cpu_to_le32(first);
+       p->blocks = cpu_to_le32(length);
+}
+
+/*
+ * Convert a partition table just read from disk to CPU byte order, in
+ * place: both signatures, then every primary partition slot.
+ */
+void
+partition_table_in(struct partition_table *pt)
+{
+       struct primary_partition *slot, *last;
+
+       pt->disk_signature = le32_to_cpu(pt->disk_signature);
+       pt->mbr_signature  = le16_to_cpu(pt->mbr_signature);
+
+       last = pt->partitions + ARRAY_SIZE(pt->partitions);
+       for (slot = pt->partitions; slot != last; slot++)
+               primary_partition_in(slot);
+}
+
+/*
+ * Convert a partition table to on-disk byte order, in place, ready to be
+ * written: both signatures, then every primary partition slot.
+ */
+void
+partition_table_out(struct partition_table *pt)
+{
+       struct primary_partition *slot, *last;
+
+       pt->disk_signature = cpu_to_le32(pt->disk_signature);
+       pt->mbr_signature  = cpu_to_le16(pt->mbr_signature);
+
+       last = pt->partitions + ARRAY_SIZE(pt->partitions);
+       for (slot = pt->partitions; slot != last; slot++)
+               primary_partition_out(slot);
+}
+
+/*
+ * Check one partition slot.  Returns 0 when the status byte is either
+ * PARTITION_BOOTABLE or PARTITION_NON_BOOTABLE, EINVAL otherwise.
+ */
+int
+primary_partition_validate(struct primary_partition *p)
+{
+       switch (p->status) {
+       case PARTITION_BOOTABLE:
+       case PARTITION_NON_BOOTABLE:
+               return 0;
+       default:
+               return EINVAL;
+       }
+}
+
+/*
+ * Check a partition table that is already in CPU byte order.  Returns 0
+ * when the MBR signature matches and every slot validates; otherwise
+ * EINVAL or the first slot's error.
+ */
+int
+partition_table_validate(struct partition_table *pt)
+{
+       struct primary_partition *slot, *last;
+       int err;
+
+       if (pt->mbr_signature != MBR_SIGNATURE)
+               return EINVAL;
+
+       last = pt->partitions + ARRAY_SIZE(pt->partitions);
+       for (slot = pt->partitions; slot != last; slot++) {
+               err = primary_partition_validate(slot);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Translate a linear block address into a packed CHS triple for geometry
+ * @geo.  Addresses at or beyond cylinder 0x3ff are clamped to cylinder
+ * 0x3ff, the last head and sector number geo->sectors.
+ */
+struct partition_chs
+lba_to_chs(struct partition_geometry *geo, uint64_t lba)
+{
+       struct partition_chs c;
+       uint64_t cyl;
+
+       if (lba < 0x3ff * geo->sectors * geo->heads) {
+               uint64_t track = lba / geo->sectors;
+
+               c.chs[1] = lba % geo->sectors + 1;      /* sectors are 1-based */
+               c.chs[0] = track % geo->heads;
+               cyl      = track / geo->heads;
+       } else {
+               c.chs[0] = geo->heads - 1;
+               c.chs[1] = geo->sectors;
+               cyl      = 0x3ff;
+       }
+
+       /* low eight cylinder bits in byte 2, bits 8-9 in the top of byte 1 */
+       c.chs[2]  = cyl & 0xff;
+       c.chs[1] |= (cyl >> 2) & 0xc0;
+
+       return c;
+}
diff --git a/part/vhdpartx b/part/vhdpartx
new file mode 100644 (file)
index 0000000..9241bbc
--- /dev/null
@@ -0,0 +1,109 @@
+#!/bin/bash
+#
+# vhdpartx -- add, delete or list per-partition symlinks for a VHD.
+#
+# The number of partitions is obtained by running part-util with the
+# libvhdio library preloaded.  Partition N of <vhd> is represented by a
+# symlink named "<vhd>N" that points back at the VHD file itself.
+#
+# This script needs bash: it uses [[ ]] and 'local'.
+
+set -e
+
+PARTUTIL=/usr/sbin/part-util
+LIBVHDIO=/usr/lib/libvhdio.so.1.0
+
+# Print a message and exit with status 1.
+die()
+{
+    echo "$@"
+    exit 1
+}
+
+usage()
+{
+    echo "usage: $0 [-a | -d | -l] vhd [lib]"
+    echo "-a add partition mappings"
+    echo "-d del partition mappings"
+    echo "-l list partition mappings"
+    exit 1
+}
+
+# Set $add/$del/$list, $vhd and $lib from the command line.  Exactly one
+# of -a, -d, -l must be given: $count gains one "1" per action flag.
+parse_args()
+{
+    part_util=$PARTUTIL
+
+    while [ $# -ge 1 ]; do
+        case "$1" in
+            -a) add="TRUE" && count="1$count";;
+            -d) del="TRUE" && count="1$count";;
+            -l) list="TRUE" && count="1$count";;
+            *) if [ -z "$vhd" ]; then vhd=$1;
+               elif [ -z "$lib" ]; then lib=$1;
+               else usage;
+               fi;;
+        esac
+        shift
+    done
+
+    [[ -z "$lib" ]] && lib=$LIBVHDIO
+    [[ -z "$vhd" || "$count" != "1" ]] && usage
+    return 0
+}
+
+# screen-scraping of fdisk... not used
+fdisk_read_partitions()
+{
+    # 'local var=$(...)' is deliberate: the status of 'local' hides a
+    # failing fdisk/grep from 'set -e'.
+    local data=$(LD_PRELOAD="$lib" fdisk -l "$vhd")
+
+    local none=$(echo $data | grep "This doesn't look like a partition table")
+    [[ -n "$none" ]] && partitions=0 && return 0
+
+    partitions=4
+    while [[ "$partitions" != "0" ]]; do
+        local hit=$(echo $data | grep "${vhd}$partitions")
+        [[ -n "$hit" ]] && break
+        # $(( )) rather than 'let': 'let' returns 1 when the result is 0,
+        # which would abort the script under 'set -e'.
+        partitions=$((partitions - 1))
+    done
+}
+
+part_util_read_partitions()
+{
+    partitions=$(LD_PRELOAD="$lib" "$part_util" -c -i "$vhd")
+}
+
+list_mappings()
+{
+    local parts=1
+    while [[ $parts -le $partitions ]]; do
+        echo "${vhd}${parts}"
+        parts=$((parts + 1))
+    done
+}
+
+add_mappings()
+{
+    local parts=1
+    local path=$(realpath "$vhd")
+    while [[ $parts -le $partitions ]]; do
+        # relative link: "<vhd>N" -> "<vhd>" inside the same directory
+        [[ -e "${path}${parts}" ]] || ln -s "$(basename "$path")" "${path}${parts}"
+        parts=$((parts + 1))
+    done
+}
+
+del_mappings()
+{
+    local parts=1
+    while [[ $parts -le $partitions ]]; do
+        # only ever remove symlinks, never regular files
+        [[ -L "${vhd}${parts}" ]] && rm -f "${vhd}${parts}"
+        parts=$((parts + 1))
+    done
+}
+
+main()
+{
+    parse_args "$@"
+    [[ -x "$part_util" ]] || die "can't find part-util"
+    [[ -r "$vhd" && -r "$lib" ]] || die "can't find vhd or lib"
+
+    part_util_read_partitions
+
+    [[ -n "$add" ]] && add_mappings
+    [[ -n "$del" ]] && del_mappings
+    [[ -n "$list" ]] && list_mappings
+
+    return 0
+}
+
+main "$@"
index c82f487f09cecabdde48cfa3ea7f3ecf0eb24066..2352040e31a25fc69892824cdcf07890d42c9a06 100644 (file)
@@ -7,11 +7,12 @@ LIBVHD-SONAME    = libvhd.so.$(LIBVHD-MAJOR)
 
 LVM-UTIL-OBJ    := $(BLKTAP_ROOT)lvm/lvm-util.o
 
-LIBVHD-BUILD    := libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) libvhd.a
+LIBVHD          := libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+LIBVHDIO        := libvhdio.a libvhdio.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+LIBVHD-BUILD    := $(LIBVHD) $(LIBVHDIO)
 
 INST-DIR         = /usr/lib
 
-CFLAGS          += -Werror
 CFLAGS          += -Wno-unused
 CFLAGS          += -I../../include
 CFLAGS          += -D_GNU_SOURCE
@@ -50,8 +51,11 @@ ifndef TRANSFERVM
 LIB-OBJS        += $(LVM-UTIL-OBJ)
 endif
 
-LIBVHD           = libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
-
+LIBVHD-IO-LIBS  := -ldl -L./ -lvhd
+LIBVHD-IO-OBJS  := libvhdio.o $(BLKTAP_ROOT)/part/partition.o
+LIBVHD-IO        = libvhdio.a libvhdio.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+LIBVHD-IO-SONAME = libvhdio.so.$(LIBVHD-MAJOR)
 all: build
 
 build: $(LIBVHD-BUILD)
@@ -59,24 +63,39 @@ build: $(LIBVHD-BUILD)
 libvhd.so: libvhd.so.$(LIBVHD-MAJOR)
        ln -sf $^ $@
 
+libvhdio.so: libvhdio.so.$(LIBVHD-MAJOR)
+       ln -sf $^ $@
+
 libvhd.so.$(LIBVHD-MAJOR): libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
        ln -sf $^ $@
 
+libvhdio.so.$(LIBVHD-MAJOR): libvhdio.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+       ln -sf $^ $@
+
 libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR): $(LIB-OBJS)
        $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG),$(LIBVHD-SONAME) $(SHLIB_CFLAGS) \
                -o $@ $(LIBS) $^
 
+libvhdio.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR): $(LIBVHD-IO-OBJS) libvhd.so
+       $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG),$(LIBVHD-IO-SONAME) $(SHLIB_CFLAGS) \
+               -o $@ $(LIBVHD-IO-LIBS) $^
+
 libvhd.a: $(LIB-OBJS)
        $(AR) rc $@ $^
 
+libvhdio.a: $(LIBVHD-IO-OBJS)
+       $(AR) rc $@ $^
+
 install: all
        $(INSTALL_DIR) -p $(DESTDIR)$(INST-DIR)
-       $(INSTALL_DATA) $(LIBVHD) $(DESTDIR)$(INST-DIR)
+       $(INSTALL_DATA) $(LIBVHD) $(LIBVHD-IO) $(DESTDIR)$(INST-DIR)
        ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhd.so.$(LIBVHD-MAJOR)
        ln -sf libvhd.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhd.so
+       ln -sf libvhdio.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhdio.so.$(LIBVHD-MAJOR)
+       ln -sf libvhdio.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhdio.so
 
 clean:
-       rm -rf *.a *.so* *.o *~ $(DEPS) $(LIBVHD)
+       rm -rf *.a *.so* *.o *~ $(DEPS) $(LIBVHD) $(LIBVHD-IO)
 
 .PHONY: all build clean install libvhd
 
index df28a1294d4ec54f1da5d861bbb9b573c63df7a6..761058f00a3ea1cd4be4a1ea8ba619c3065c98ca 100644 (file)
@@ -58,6 +58,11 @@ const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = {
 int TEST_FAIL[NUM_FAIL_TESTS];
 #endif // ENABLE_FAILURE_TESTING
 
+static void vhd_cache_init(vhd_context_t *);
+static int vhd_cache_enabled(vhd_context_t *);
+static int vhd_cache_load(vhd_context_t *);
+static int vhd_cache_unload(vhd_context_t *);
+static vhd_context_t * vhd_cache_get_parent(vhd_context_t *);
 
 static inline int
 old_test_bit(volatile char *addr, int nr)
@@ -2033,6 +2038,44 @@ vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
        return err;
 }
 
+static int
+vhd_write_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       size_t size;
+       off64_t off;
+       char *buf = NULL;
+       /* Writes batmap->header, zero-padded to 'size' bytes, at the offset reported by vhd_batmap_header_offset(); returns 0 or an error value. */
+       err = vhd_batmap_header_offset(ctx, &off);
+       if (err)
+               goto out;
+       /* NOTE(review): size is derived from sizeof(*batmap), while only sizeof(batmap->header) bytes are copied below. */
+       size = vhd_bytes_padded(sizeof(*batmap));
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               err = -err;     /* posix_memalign() returns a positive errno value */
+               buf = NULL;
+               goto out;
+       }
+       /* NOTE(review): this modifies the caller's batmap (presumably to on-disk byte order) and is not undone here; confirm callers expect that. */
+       vhd_batmap_header_out(batmap);
+       memset(buf, 0, size);
+       memcpy(buf, &batmap->header, sizeof(batmap->header));
+
+       err = vhd_write(ctx, buf, size);
+
+out:
+       if (err)
+               VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
+       free(buf);      /* buf is still NULL on the early-exit paths */
+       return err;
+}
+
 int
 vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
 {
@@ -2204,6 +2247,71 @@ namedup(char **dup, const char *name)
        return 0;
 }
 
+#define vwrite (ssize_t (*)(int, void *, size_t))write
+#define vpwrite (ssize_t (*)(int, void *, size_t, off_t))pwrite
+
+static ssize_t
+vhd_atomic_pio(ssize_t (*f) (int, void *, size_t, off_t),
+              int fd, void *_s, size_t n, off_t off)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+       struct stat st;
+       /* Re-issues the positional transfer f until n bytes are done, the end of the file is reached, or a hard error occurs. */
+       memset(&st, 0, sizeof(st));
+       /* st stays zeroed until the first short transfer; st_size == 0 doubles as "not fetched yet". */
+       for (;;) {
+               res = (f) (fd, s + pos, n - pos, off + pos);
+               switch (res) {
+               case -1:
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;       /* transient: retry the same range */
+                       else
+                               return 0;       /* NOTE(review): hard errors return 0, not -1; errno is left as set by f */
+                       break;
+               case 0:
+                       errno = EPIPE;          /* zero-byte transfer: report what was done so far */
+                       return pos;
+               }
+
+               if (pos + res == n)
+                       return n;
+               /* short transfer: fetch the file size so a transfer ending at end of file can be told apart */
+               if (!st.st_size)
+                       if (fstat(fd, &st) == -1)
+                               return -1;
+
+               if (off + pos + res == st.st_size)
+                       return pos + res;       /* short only because the file ends here */
+               /* keep the part of res selected by the sector mask (assumes VHD_SECTOR_SIZE is a power of two); the rest is transferred again */
+               pos += (res & ~(VHD_SECTOR_SIZE - 1));
+       }
+
+       return -1;      /* not reached: the loop above only leaves via return */
+}
+
+/*
+ * read()/write()-style front end for vhd_atomic_pio(): runs the transfer
+ * at the descriptor's current offset using the positional counterpart of
+ * @f, then advances the file offset by the number of bytes transferred.
+ *
+ * Returns the vhd_atomic_pio() result, or -1 if the offset cannot be
+ * queried or updated.
+ */
+static ssize_t
+vhd_atomic_io(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       ssize_t (*positional) (int, void *, size_t, off_t);
+       off64_t start;
+       ssize_t done;
+
+       start = lseek64(fd, 0, SEEK_CUR);
+       if (start == (off_t)-1)
+               return -1;
+
+       if (f == read)
+               positional = pread;
+       else
+               positional = vpwrite;
+
+       done = vhd_atomic_pio(positional, fd, _s, n, start);
+       if (done > 0 && lseek64(fd, start + done, SEEK_SET) == (off64_t)-1)
+               return -1;
+
+       return done;
+}
+
 int
 vhd_seek(vhd_context_t *ctx, off64_t offset, int whence)
 {
@@ -2232,7 +2340,7 @@ vhd_read(vhd_context_t *ctx, void *buf, size_t size)
 
        errno = 0;
 
-       ret = read(ctx->fd, buf, size);
+       ret = vhd_atomic_io(read, ctx->fd, buf, size);
        if (ret == size)
                return 0;
 
@@ -2249,7 +2357,7 @@ vhd_write(vhd_context_t *ctx, void *buf, size_t size)
 
        errno = 0;
 
-       ret = write(ctx->fd, buf, size);
+       ret = vhd_atomic_io(vwrite, ctx->fd, buf, size);
        if (ret == size)
                return 0;
 
@@ -2259,6 +2367,40 @@ vhd_write(vhd_context_t *ctx, void *buf, size_t size)
        return (errno ? -errno : -EIO);
 }
 
+/*
+ * Read exactly @size bytes at @offset from the VHD file descriptor.
+ * Returns 0 when the full amount was read, otherwise logs the failure
+ * and returns -errno, or -EIO when errno is not set.
+ */
+static int
+vhd_pread(vhd_context_t *ctx, void *buf, size_t size, off64_t offset)
+{
+       ssize_t ret;
+
+       errno = 0;
+       ret   = vhd_atomic_pio(pread, ctx->fd, buf, size, offset);
+
+       if (ret != size) {
+               VHDLOG("%s: pread of %zu returned %zd, errno: %d\n",
+                      ctx->file, size, ret, -errno);
+
+               return (errno ? -errno : -EIO);
+       }
+
+       return 0;
+}
+
+/*
+ * Write exactly @size bytes at @offset to the VHD file descriptor.
+ * Returns 0 when the full amount was written, otherwise logs the failure
+ * and returns -errno, or -EIO when errno is not set.
+ */
+static int
+vhd_pwrite(vhd_context_t *ctx, void *buf, size_t size, off64_t offset)
+{
+       ssize_t ret;
+
+       errno = 0;
+       ret   = vhd_atomic_pio(vpwrite, ctx->fd, buf, size, offset);
+
+       if (ret != size) {
+               VHDLOG("%s: pwrite of %zu returned %zd, errno: %d\n",
+                      ctx->file, size, ret, -errno);
+
+               return (errno ? -errno : -EIO);
+       }
+
+       return 0;
+}
+
 int
 vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
 {
@@ -2334,12 +2476,14 @@ out:
 int
 vhd_open(vhd_context_t *ctx, const char *file, int flags)
 {
-       int err, oflags, i;
+       int i, err, oflags;
 
        if (flags & VHD_OPEN_STRICT)
                vhd_flag_clear(flags, VHD_OPEN_FAST);
 
        memset(ctx, 0, sizeof(vhd_context_t));
+       vhd_cache_init(ctx);
+
        ctx->fd     = -1;
        ctx->oflags = flags;
 
@@ -2347,7 +2491,9 @@ vhd_open(vhd_context_t *ctx, const char *file, int flags)
        if (err)
                return err;
 
-       oflags = O_DIRECT | O_LARGEFILE;
+       oflags = O_LARGEFILE;
+       if (!(flags & VHD_OPEN_CACHED))
+               oflags |= O_DIRECT;
        if (flags & VHD_OPEN_RDONLY)
                oflags |= O_RDONLY;
        if (flags & VHD_OPEN_RDWR)
@@ -2396,6 +2542,12 @@ vhd_open(vhd_context_t *ctx, const char *file, int flags)
                ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
        }
 
+       err = vhd_cache_load(ctx);
+       if (err) {
+               VHDLOG("failed to load cache: %d\n", err);
+               goto fail;
+       }
+
        return 0;
 
 fail:
@@ -2409,8 +2561,13 @@ fail:
 void
 vhd_close(vhd_context_t *ctx)
 {
-       if (ctx->file)
+       vhd_cache_unload(ctx);
+
+       if (ctx->file) {
+               fsync(ctx->fd);
                close(ctx->fd);
+       }
+
        free(ctx->file);
        free(ctx->bat.bat);
        free(ctx->batmap.map);
@@ -3170,6 +3327,16 @@ __vhd_io_dynamic_read(vhd_context_t *ctx,
                }
 
                if (vhd->footer.type == HD_TYPE_DIFF) {
+                       vhd_context_t *p;
+                       p = vhd_cache_get_parent(vhd);
+                       if (p) {
+                               vhd = p;
+                               err = vhd_get_bat(vhd);
+                               if (err)
+                                       goto out;
+                               continue;
+                       }
+
                        err = vhd_parent_locator_get(vhd, &next);
                        if (err)
                                goto close;
@@ -3200,7 +3367,7 @@ __vhd_io_dynamic_read(vhd_context_t *ctx,
        }
 
 close:
-       if (vhd != ctx)
+       if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED))
                vhd_close(vhd);
 out:
        free(map);
@@ -3239,7 +3406,7 @@ __vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
        char *buf;
        size_t size;
        off64_t off, max;
-       int i, err, gap, spp;
+       int i, err, gap, spp, secs;
 
        spp = getpagesize() >> VHD_SECTOR_SHIFT;
 
@@ -3261,7 +3428,11 @@ __vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
        if (err)
                return err;
 
-       size = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + gap);
+       secs = ctx->bm_secs + gap;
+       if (!vhd_flag_test(ctx->oflags, VHD_OPEN_IO_WRITE_SPARSE))
+               secs += ctx->spb;
+
+       size = vhd_sectors_to_bytes(secs);
        buf  = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return -errno;
@@ -3386,3 +3557,783 @@ vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
 
        return __vhd_io_dynamic_write(ctx, buf, sec, secs);
 }
+
+/*
+ * Reset the list node that links @ctx to its cached parents so that it
+ * describes an empty list.
+ */
+static inline void
+vhd_cache_init(vhd_context_t *ctx)
+{
+       struct list_head *link = &ctx->next;
+
+       INIT_LIST_HEAD(link);
+}
+
+/*
+ * Returns non-zero when @ctx carries the VHD_OPEN_CACHED open flag.
+ */
+static inline int
+vhd_cache_enabled(vhd_context_t *ctx)
+{
+       int cached;
+
+       cached = vhd_flag_test(ctx->oflags, VHD_OPEN_CACHED);
+       return cached;
+}
+
+/*
+ * Open every VHD ancestor of @ctx and link it after its child on the
+ * list headed at @ctx->next, so later reads can walk the chain without
+ * reopening files.  Does nothing unless @ctx was opened with
+ * VHD_OPEN_CACHED.  The walk stops at the first image that is not a
+ * differencing disk, or whose parent is a raw file.
+ *
+ * Returns 0 on success or a non-zero error; on error every parent that
+ * was already linked is closed and freed again.
+ */
+static int
+vhd_cache_load(vhd_context_t *ctx)
+{
+       char *next;
+       int err, pflags;
+       vhd_context_t *vhd;
+
+       err    = 0;
+       pflags = ctx->oflags;
+       vhd    = ctx;
+       next   = NULL;
+
+       /* parents are opened read-only and must not build caches themselves */
+       vhd_flag_set(pflags, VHD_OPEN_RDONLY);
+       vhd_flag_clear(pflags, VHD_OPEN_CACHED);
+
+       if (!vhd_cache_enabled(vhd))
+               goto done;
+
+       while (vhd->footer.type == HD_TYPE_DIFF) {
+               vhd_context_t *parent;
+
+               if (vhd_parent_raw(vhd))
+                       goto done;
+
+               err = vhd_parent_locator_get(vhd, &next);
+               if (err)
+                       goto out;
+
+               parent = calloc(1, sizeof(*parent));
+               if (!parent) {
+                       /*
+                        * err is 0 here from the call above; without this
+                        * the allocation failure would be reported as
+                        * success with a truncated chain.
+                        */
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = vhd_open(parent, next, pflags);
+               if (err) {
+                       free(parent);
+                       goto out;
+               }
+
+               /* parent was opened uncached: drop O_DIRECT, then mark it */
+               fcntl(parent->fd, F_SETFL,
+                     fcntl(parent->fd, F_GETFL) & ~O_DIRECT);
+               vhd_flag_set(parent->oflags, VHD_OPEN_CACHED);
+               list_add(&parent->next, &vhd->next);
+
+               free(next);
+               next = NULL;
+               vhd  = parent;
+       }
+
+done:
+       err = 0;
+out:
+       free(next);
+       if (err) {
+               /*
+                * Unload from the head of the chain.  All images share one
+                * ring, so unloading from the last opened link would walk
+                * round to @ctx itself and close and free the caller's
+                * context.
+                */
+               vhd_cache_unload(ctx);
+       }
+
+       return err;
+}
+
+/*
+ * Close and free every context linked on @ctx's cache list, then leave
+ * the list empty.  Does nothing when caching is not enabled on @ctx.
+ * Always returns 0.
+ */
+static int
+vhd_cache_unload(vhd_context_t *ctx)
+{
+       vhd_context_t *entry, *following;
+
+       if (vhd_cache_enabled(ctx)) {
+               list_for_each_entry_safe(entry, following, &ctx->next, next) {
+                       /* unlink first so closing @entry finds an empty list */
+                       list_del_init(&entry->next);
+                       vhd_close(entry);
+                       free(entry);
+               }
+
+               INIT_LIST_HEAD(&ctx->next);
+       }
+
+       return 0;
+}
+
+/*
+ * Returns the cached parent of @ctx, or NULL when @ctx has none:
+ * caching is disabled, nothing is linked, @ctx is not a differencing
+ * disk, or its parent is a raw file.
+ *
+ * vhd_cache_load() links each parent directly after its child, so the
+ * whole chain forms a single ring and the successor of the last cached
+ * image is the chain head.  vhd_cache_load() never caches a raw parent,
+ * so an image with one is the last link; returning its successor would
+ * hand the head back as a "parent" and make readers loop forever.
+ */
+static vhd_context_t *
+vhd_cache_get_parent(vhd_context_t *ctx)
+{
+       if (!vhd_cache_enabled(ctx))
+               return NULL;
+
+       if (list_empty(&ctx->next))
+               return NULL;
+
+       if (ctx->footer.type != HD_TYPE_DIFF || vhd_parent_raw(ctx))
+               return NULL;
+
+       return list_entry(ctx->next.next, vhd_context_t, next);
+}
+
+typedef struct vhd_block_vector vhd_block_vector_t;
+typedef struct vhd_block_vector_entry vhd_block_vector_entry_t;
+
+/*
+ * One contiguous byte span inside a block's data area, together with
+ * the memory it is read into.
+ */
+struct vhd_block_vector_entry {
+       uint64_t                   off;       /* byte offset from block */
+       uint32_t                   bytes;     /* size in bytes */
+       char                      *buf;       /* destination buffer */
+};
+
+/*
+ * A list of spans to read from a single vhd block.  @array is allocated
+ * by vhd_block_vector_init() and freed by its callers.
+ */
+struct vhd_block_vector {
+       uint32_t                   block;     /* logical block in vhd */
+       int                        entries;   /* number of vector entries */
+       vhd_block_vector_entry_t  *array;     /* vector list */
+};
+
+/**
+ * @vec: block vector describing read
+ *
+ * Performs one vhd_pread() per entry of @vec, each relative to the
+ * start of the data area of @vec->block.  Returns -ERANGE when the
+ * block number lies outside the BAT, -EINVAL when the block is not
+ * allocated, otherwise the first non-zero result of vhd_get_bat() or
+ * vhd_pread(), or 0.
+ */
+static int
+vhd_block_vector_read(vhd_context_t *ctx, vhd_block_vector_t *vec)
+{
+       int idx, rc;
+       uint32_t bat_entry;
+       off64_t data_start;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (vec->block >= ctx->bat.entries)
+               return -ERANGE;
+
+       bat_entry = ctx->bat.bat[vec->block];
+       if (bat_entry == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* block data follows the block's sector bitmap */
+       data_start = vhd_sectors_to_bytes(bat_entry + ctx->bm_secs);
+
+       for (idx = 0; idx < vec->entries; idx++) {
+               vhd_block_vector_entry_t *span = &vec->array[idx];
+
+               rc = vhd_pread(ctx, span->buf, span->bytes,
+                              data_start + span->off);
+               if (rc)
+                       return rc;
+       }
+
+       return rc;
+}
+
+/**
+ * @vec: block vector to initialize
+ * @block: vhd block number
+ * @map: optional bitmap of sectors to map (relative to beginning of block)
+ * @buf: destination buffer
+ * @blk_start: byte offset relative to beginning of block
+ * @blk_end: byte offset relative to beginning of block
+ *
+ * initializes @vec to describe a read into a contiguous buffer
+ * of potentially non-contiguous byte ranges in a given vhd block.
+ * only sectors with corresponding bits set in @map (if it is not NULL)
+ * will be mapped; bits corresponding to unmapped sectors will be cleared.
+ * first and last sector maps may be smaller than vhd sector size.
+ *
+ * returns 0 on success, the error from vhd_read_bitmap(), or -ENOMEM.
+ * @vec->array is allocated here with room for one entry per sector of a
+ * block and is not freed on any path; callers in this file free it.
+ */
+static int
+vhd_block_vector_init(vhd_context_t *ctx,
+                     vhd_block_vector_t *vec, uint32_t block, char *map,
+                     char *buf, uint64_t blk_start, uint64_t blk_end)
+{
+       int err, sec;
+       char *bitmap;
+       /* NOTE(review): blk is declared but never used in this function */
+       uint32_t blk, first_sec, last_sec;
+
+       bitmap = NULL;
+       memset(vec, 0, sizeof(*vec));
+
+       first_sec = blk_start >> VHD_SECTOR_SHIFT;
+       last_sec  = secs_round_up_no_zero(blk_end);
+
+       err = vhd_read_bitmap(ctx, block, &bitmap);
+       if (err)
+               goto out;
+
+       vec->array = calloc(ctx->spb, sizeof(vhd_block_vector_entry_t));
+       if (!vec->array) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (sec = first_sec; sec < last_sec; sec++) {
+               uint32_t cnt;
+               vhd_block_vector_entry_t *v;
+
+               /*
+                * bytes of the request that fall in this sector: up to
+                * the next sector boundary, capped at the end of the request
+                */
+               cnt = VHD_SECTOR_SIZE - (blk_start & (VHD_SECTOR_SIZE - 1));
+               if (cnt > blk_end - blk_start)
+                       cnt = blk_end - blk_start;
+
+               /* caller did not ask for this sector: skip it, keep cursors moving */
+               if (map && !test_bit(map, sec))
+                       goto next;
+
+               if (vhd_bitmap_test(ctx, bitmap, sec)) {
+                       /* extend the previous entry when this span continues it */
+                       if (vec->entries > 0) {
+                               v = vec->array + vec->entries - 1;
+                               if (v->off + v->bytes == blk_start) {
+                                       v->bytes += cnt;
+                                       goto next;
+                               }
+                       }
+
+                       v        = vec->array + vec->entries;
+                       v->off   = blk_start;
+                       v->bytes = cnt;
+                       v->buf   = buf;
+
+                       vec->entries++;
+
+               } else if (map) {
+                       /* sector holds no data in this block: report that via @map */
+                       clear_bit(map, sec);
+               }
+
+       next:
+               /* advance block offset and destination together */
+               blk_start += cnt;
+               buf       += cnt;
+       }
+
+       vec->block = block;
+
+out:
+       free(bitmap);
+       return err;
+}
+
+/**
+ * @block: vhd block number
+ * @buf: buffer to place data in
+ * @start: byte offset into block from which to start reading
+ * @end: byte offset in block at which to stop reading
+ *
+ * reads whatever data exists in [@start, @end) of @block into @buf.
+ * the first and last sectors may be read partially when @start and
+ * @end are not multiples of the vhd sector size.
+ */
+static int
+vhd_block_vector_read_allocated(vhd_context_t *ctx, uint32_t block,
+                               char *buf, uint64_t start, uint64_t end)
+{
+       int rc;
+       vhd_block_vector_t spans;
+
+       spans.array = NULL;
+
+       rc = vhd_block_vector_init(ctx, &spans, block, NULL, buf, start, end);
+       if (!rc)
+               rc = vhd_block_vector_read(ctx, &spans);
+
+       /* the span array belongs to us whether or not init succeeded */
+       free(spans.array);
+       return rc;
+}
+
+/**
+ * @block: vhd block number
+ * @map: bitmap of sectors in block which should be read
+ * @buf: buffer to place data in
+ * @start: byte offset into block from which to start reading
+ * @end: byte offset in block at which to stop reading
+ *
+ * for each sector of @block whose bit is set in @map, reads the data
+ * into @buf if it exists and clears the bit in @map if it does not.
+ * the first and last sectors may be read partially when @start and
+ * @end are not multiples of the vhd sector size.
+ */
+static int
+vhd_block_vector_read_allocated_selective(vhd_context_t *ctx,
+                                         uint32_t block, char *map, char *buf,
+                                         uint64_t start, uint64_t end)
+{
+       int rc;
+       vhd_block_vector_t spans;
+
+       spans.array = NULL;
+
+       rc = vhd_block_vector_init(ctx, &spans, block, map, buf, start, end);
+       if (!rc)
+               rc = vhd_block_vector_read(ctx, &spans);
+
+       /* the span array belongs to us whether or not init succeeded */
+       free(spans.array);
+       return rc;
+}
+
+/**
+ * @map: bitmap of sectors which have already been read
+ * @buf: destination buffer
+ * @size: size in bytes to read
+ * @off: byte offset in virtual disk to read
+ *
+ * reads @size bytes into @buf, starting at @off, skipping sectors
+ * which have corresponding bits set in @map
+ *
+ * bit 0 of @map corresponds to the sector containing @off.  on return,
+ * bits are additionally set for every sector this image supplied.
+ * returns 0 on success, -ENOMEM, or the error from the block read.
+ * the BAT is indexed directly, so it must already be loaded in @ctx.
+ */
+static int
+__vhd_io_dynamic_read_link_bytes(vhd_context_t *ctx, char *map,
+                                char *buf, size_t size, uint64_t off)
+{
+       char *blkmap;
+       int i, err, cnt, map_off;
+       off64_t blk_off, blk_size;
+       uint32_t blk, bytes, first_sec, last_sec;
+
+       /* scratch bitmap with one bit per sector of a block */
+       blkmap = malloc((ctx->spb + 7) >> 3);
+       if (!blkmap) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       map_off  = 0;
+       blk_size = vhd_sectors_to_bytes(ctx->spb);
+
+       /* one iteration per block touched by the request */
+       do {
+               blk     = off / blk_size;
+               blk_off = off % blk_size;
+               bytes   = MIN(blk_size - blk_off, size);
+
+               /* sector range within this block */
+               first_sec = blk_off >> VHD_SECTOR_SHIFT;
+               last_sec  = secs_round_up_no_zero(blk_off + bytes);
+
+               /* block not allocated in this image: nothing to contribute */
+               if (ctx->bat.bat[blk] == DD_BLK_UNUSED)
+                       goto next;
+
+               memset(blkmap, 0, (ctx->spb + 7) >> 3);
+
+               /* request only the sectors that are still missing */
+               for (i = 0; i < (last_sec - first_sec); i++)
+                       if (!test_bit(map, map_off + i))
+                               set_bit(blkmap, first_sec + i);
+
+               err = vhd_block_vector_read_allocated_selective(ctx, blk,
+                                                               blkmap, buf,
+                                                               blk_off,
+                                                               blk_off +
+                                                               bytes);
+               if (err)
+                       goto out;
+
+               /* bits still set in blkmap are sectors that were found here */
+               for (i = 0; i < (last_sec - first_sec); i++)
+                       if (test_bit(blkmap, first_sec + i))
+                               set_bit(map, map_off + i);
+
+       next:
+               size    -= bytes;
+               off     += bytes;
+               map_off += (last_sec - first_sec);
+               buf     += bytes;
+
+       } while (size);
+
+       err = 0;
+out:
+       free(blkmap);
+       return err;
+}
+
+/**
+ * @filename: raw (non-vhd) parent image
+ * @map: bitmap of sectors which have already been read
+ * @buf: destination buffer
+ * @size: size in bytes to read
+ * @off: byte offset in virtual disk to read
+ *
+ * fills every sector of the request whose bit is clear in @map directly
+ * from @filename, reading each run of consecutive missing sectors with
+ * a single pread().  bit 0 of @map corresponds to the sector containing
+ * @off.  sectors that were read have their bit set in @map so the
+ * caller does not treat them as absent and zero them afterwards.
+ *
+ * returns 0 on success, -errno if open() or pread() fails, or -EIO on
+ * a short read.
+ */
+static int
+__raw_read_link_bytes(const char *filename,
+                     char *map, char *buf, size_t size, uint64_t off)
+{
+       int fd, err;
+       uint32_t i, first_sec, last_sec;
+
+       fd = open(filename, O_RDONLY | O_LARGEFILE);
+       if (fd == -1) {
+               /* capture errno before logging can overwrite it */
+               err = -errno;
+               VHDLOG("%s: failed to open: %d\n", filename, err);
+               return err;
+       }
+
+       first_sec = off >> VHD_SECTOR_SHIFT;
+       last_sec  = secs_round_up_no_zero(off + size);
+
+       for (i = first_sec; i < last_sec; i++) {
+               ssize_t ret;
+               uint32_t j, secs;
+               uint64_t coff, cend;
+
+               if (test_bit(map, i - first_sec))
+                       continue;
+
+               /* extend the run over all consecutive missing sectors */
+               secs = 0;
+               while (i + secs < last_sec &&
+                      !test_bit(map, i + secs - first_sec))
+                       secs++;
+
+               /*
+                * clamp the run to the requested byte range: the first
+                * sector may start before @off and the last may end
+                * after @off + @size.
+                */
+               coff = vhd_sectors_to_bytes(i);
+               cend = vhd_sectors_to_bytes(i + secs);
+
+               if (coff < off)
+                       coff = off;
+               if (cend > off + size)
+                       cend = off + size;
+
+               ret = pread(fd, buf + (coff - off), cend - coff, coff);
+               if (ret < 0) {
+                       err = -errno;
+                       goto close;
+               }
+               if ((uint64_t)ret != cend - coff) {
+                       err = -EIO;
+                       goto close;
+               }
+
+               for (j = 0; j < secs; j++)
+                       set_bit(map, i + j - first_sec);
+
+               i += secs - 1;
+       }
+
+       err = 0;
+
+close:
+       close(fd);
+       return err;
+}
+
+/**
+ * @buf: destination buffer
+ * @size: number of bytes to read
+ * @off: byte offset in virtual disk to read
+ *
+ * reads @size bytes at @off by walking the image chain from @ctx
+ * towards its ancestors.  each image fills in the sectors not yet found
+ * in a younger one, until every sector is found or the chain ends.
+ * cached parents are used when available; otherwise each parent is
+ * opened read-only into a local context and closed again.  a raw parent
+ * is read with __raw_read_link_bytes().  on success, sectors still not
+ * found anywhere are zero-filled.
+ *
+ * returns 0 on success or the first error encountered.
+ */
+static int
+__vhd_io_dynamic_read_bytes(vhd_context_t *ctx,
+                           char *buf, size_t size, uint64_t off)
+{
+       int err;
+       char *next, *map;
+       vhd_context_t parent, *vhd;
+       uint32_t i, done, first_sec, last_sec;
+
+       err  = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       first_sec = off >> VHD_SECTOR_SHIFT;
+       last_sec  = secs_round_up_no_zero(off + size);
+
+       vhd  = ctx;
+       next = NULL;
+       /* one bit per sector of the request, set once the sector is found */
+       map  = calloc(1, ((last_sec - first_sec) + 7) >> 3);
+       if (!map) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (;;) {
+               err = __vhd_io_dynamic_read_link_bytes(vhd, map,
+                                                      buf, size, off);
+               if (err)
+                       goto close;
+
+               for (done = 0, i = 0; i < (last_sec - first_sec); i++)
+                       if (test_bit(map, i))
+                               done++;
+
+               if (done == last_sec - first_sec) {
+                       err = 0;
+                       goto close;
+               }
+
+               if (vhd->footer.type == HD_TYPE_DIFF) {
+                       vhd_context_t *p;
+                       p = vhd_cache_get_parent(vhd);
+                       if (p) {
+                               vhd = p;
+                               err = vhd_get_bat(vhd);
+                               if (err)
+                                       goto out;
+                               continue;
+                       }
+
+                       err = vhd_parent_locator_get(vhd, &next);
+                       if (err)
+                               goto close;
+
+                       if (vhd_parent_raw(vhd)) {
+                               err = __raw_read_link_bytes(next, map,
+                                                           buf, size, off);
+                               goto close;
+                       }
+               } else {
+                       err = 0;
+                       goto close;
+               }
+
+               /* step to the uncached parent, reusing the local context */
+               if (vhd != ctx)
+                       vhd_close(vhd);
+               vhd = &parent;
+
+               err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
+               if (err)
+                       goto out;
+
+               err = vhd_get_bat(vhd);
+               if (err)
+                       goto close;
+
+               free(next);
+               next = NULL;
+       }
+
+close:
+       if (!err) {
+               /*
+                * clear any regions not present on disk.  each span is
+                * clamped to its own sector and to the request, so an
+                * unaligned first sector neither spills into the next
+                * sector's data nor runs past the end of @buf.
+                */
+               for (i = first_sec; i < last_sec; i++) {
+                       uint64_t coff, cend;
+
+                       if (test_bit(map, i - first_sec))
+                               continue;
+
+                       coff = vhd_sectors_to_bytes(i);
+                       cend = vhd_sectors_to_bytes(i + 1);
+
+                       if (coff < off)
+                               coff = off;
+                       if (cend > off + size)
+                               cend = off + size;
+
+                       memset(buf + (coff - off), 0, cend - coff);
+               }
+       }
+
+       /* cached parents belong to the cache; only close what we opened */
+       if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED))
+               vhd_close(vhd);
+out:
+       free(map);
+       free(next);
+       return err;
+}
+
+/**
+ * @buf: destination buffer
+ * @size: number of bytes to read
+ * @off: byte offset in virtual disk to read
+ *
+ * byte-granular read from the virtual disk.  returns -ERANGE when the
+ * request extends beyond the footer's current size, otherwise the
+ * result of the read.
+ */
+int
+vhd_io_read_bytes(vhd_context_t *ctx, char *buf, size_t size, uint64_t off)
+{
+       /* written so that off + size cannot wrap past the limit */
+       if (off > ctx->footer.curr_size ||
+           size > ctx->footer.curr_size - off)
+               return -ERANGE;
+
+       /* non-dynamic images are read straight from the file */
+       if (!vhd_type_dynamic(ctx))
+               return vhd_pread(ctx, buf, size, off);
+
+       return __vhd_io_dynamic_read_bytes(ctx, buf, size, off);
+}
+
+/**
+ * @buf: source buffer
+ * @size: number of bytes to write; must be a multiple of the sector size
+ * @off: byte offset in virtual disk; must be a multiple of the sector size
+ *
+ * writes sector-aligned data block by block, allocating blocks that
+ * are not yet present and updating the per-block sector bitmap and,
+ * if the image has one, the batmap.  returns -EINVAL for unaligned
+ * arguments.  the footer is rewritten on every path that reaches the
+ * block loop, including failures; the first error wins, otherwise the
+ * result of the footer write is returned.
+ */
+static int
+__vhd_io_dynamic_write_bytes_aligned(vhd_context_t *ctx,
+                                    char *buf, size_t size, uint64_t off)
+{
+       char *map;
+       int i, err, ret;
+       uint64_t blk_off, blk_size, blk_start;
+       uint32_t blk, bytes, first_sec, last_sec;
+
+       if (off & (VHD_SECTOR_SIZE - 1) || size & (VHD_SECTOR_SIZE - 1))
+               return -EINVAL;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       if (vhd_has_batmap(ctx)) {
+               err = vhd_get_batmap(ctx);
+               if (err)
+                       return err;
+       }
+
+       map      = NULL;
+       blk_size = vhd_sectors_to_bytes(ctx->spb);
+
+       /* one iteration per block touched by the request */
+       do {
+               blk     = off / blk_size;
+               blk_off = off % blk_size;
+               bytes   = MIN(blk_size - blk_off, size);
+
+               first_sec = blk_off >> VHD_SECTOR_SHIFT;
+               last_sec  = secs_round_up_no_zero(blk_off + bytes);
+
+               blk_start = ctx->bat.bat[blk];
+               if (blk_start == DD_BLK_UNUSED) {
+                       err = __vhd_io_allocate_block(ctx, blk);
+                       if (err)
+                               goto fail;
+
+                       /* re-read the BAT entry filled in by the allocation */
+                       blk_start = ctx->bat.bat[blk];
+               }
+
+               /* data area starts after the block's sector bitmap */
+               blk_start = vhd_sectors_to_bytes(blk_start + ctx->bm_secs);
+
+               err = vhd_pwrite(ctx, buf, bytes, blk_start + blk_off);
+               if (err)
+                       goto fail;
+
+               /* block already marked in the batmap: no bitmap update needed */
+               if (vhd_has_batmap(ctx) &&
+                   vhd_batmap_test(ctx, &ctx->batmap, blk))
+                       goto next;
+
+               err = vhd_read_bitmap(ctx, blk, &map);
+               if (err) {
+                       map = NULL;
+                       goto fail;
+               }
+
+               for (i = first_sec; i < last_sec; i++)
+                       vhd_bitmap_set(ctx, map, i);
+
+               err = vhd_write_bitmap(ctx, blk, map);
+               if (err)
+                       goto fail;
+
+               if (vhd_has_batmap(ctx)) {
+                       /* mark the block in the batmap only once every sector is set */
+                       for (i = 0; i < ctx->spb; i++)
+                               if (!vhd_bitmap_test(ctx, map, i)) {
+                                       free(map);
+                                       map = NULL;
+                                       goto next;
+                               }
+
+                       vhd_batmap_set(ctx, &ctx->batmap, blk);
+                       err = vhd_write_batmap(ctx, &ctx->batmap);
+                       if (err)
+                               goto fail;
+               }
+
+               free(map);
+               map = NULL;
+
+       next:
+               size   -= bytes;
+               off    += bytes;
+               buf    += bytes;
+
+       } while (size);
+
+       err = 0;
+
+out:
+       ret = vhd_write_footer(ctx, &ctx->footer);
+       return (err ? err : ret);
+
+fail:
+       free(map);
+       goto out;
+}
+
+/**
+ * @buf: source buffer
+ * @size: number of bytes to write
+ * @off: byte offset in virtual disk to write
+ *
+ * byte-granular write built on the sector-aligned writer.  a partial
+ * first and/or last sector is handled by read-modify-write through a
+ * one-sector bounce buffer; the remaining middle part, if any, is
+ * written directly from @buf.  returns 0 on success, -ENOMEM, or the
+ * first error from the reads and writes issued.
+ */
+static int
+__vhd_io_dynamic_write_bytes(vhd_context_t *ctx,
+                            char *buf, size_t size, uint64_t off)
+{
+       int err;
+       char *tmp;
+       uint32_t first_sec, last_sec, first_sec_off, last_sec_off;
+
+       err = 0;
+       tmp = NULL;
+
+       first_sec = off >> VHD_SECTOR_SHIFT;
+       last_sec  = secs_round_up_no_zero(off + size);
+
+       /* non-zero when the request starts / ends inside a sector */
+       first_sec_off = off & (VHD_SECTOR_SIZE - 1);
+       last_sec_off  = (off + size) & (VHD_SECTOR_SIZE - 1);
+
+       if (first_sec_off || last_sec_off) {
+               tmp = malloc(VHD_SECTOR_SIZE);
+               if (!tmp) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               if (first_sec_off) {
+                       /* bytes of the request that land in the first sector */
+                       uint32_t new = VHD_SECTOR_SIZE - first_sec_off;
+                       if (new > size)
+                               new = size;
+
+                       err = vhd_io_read_bytes(
+                               ctx, tmp, VHD_SECTOR_SIZE,
+                               vhd_sectors_to_bytes(first_sec));
+                       if (err)
+                               goto out;
+
+                       memcpy(tmp + first_sec_off, buf, new);
+
+                       err = __vhd_io_dynamic_write_bytes_aligned(
+                               ctx, tmp, VHD_SECTOR_SIZE,
+                               vhd_sectors_to_bytes(first_sec));
+                       if (err)
+                               goto out;
+
+                       buf  += new;
+                       off  += new;
+                       size -= new;
+               }
+
+               /*
+                * a partial tail needs its own pass unless the request lies
+                * within a single sector that the head pass above has
+                * already written
+                */
+               if (last_sec_off &&
+                   (last_sec - first_sec > 1 || !first_sec_off)) {
+                       uint32_t new = last_sec_off;
+
+                       err = vhd_io_read_bytes(
+                               ctx, tmp, VHD_SECTOR_SIZE,
+                               vhd_sectors_to_bytes(last_sec - 1));
+                       if (err)
+                               goto out;
+
+                       /* the tail bytes are the last @new bytes still pending */
+                       memcpy(tmp, buf + size - new, new);
+
+                       err = __vhd_io_dynamic_write_bytes_aligned(
+                               ctx, tmp, VHD_SECTOR_SIZE,
+                               vhd_sectors_to_bytes(last_sec - 1));
+                       if (err)
+                               goto out;
+
+                       size -= new;
+               }
+       }
+
+       /* whatever is left is sector-aligned at both ends */
+       if (size)
+               err = __vhd_io_dynamic_write_bytes_aligned(ctx, buf, size, off);
+
+out:
+       free(tmp);
+       return err;
+}
+
+/**
+ * @buf: source buffer
+ * @size: number of bytes to write
+ * @off: byte offset in virtual disk to write
+ *
+ * byte-granular write to the virtual disk.  returns -ERANGE when the
+ * request extends beyond the footer's current size, otherwise the
+ * result of the write.
+ */
+int
+vhd_io_write_bytes(vhd_context_t *ctx, char *buf, size_t size, uint64_t off)
+{
+       /* written so that off + size cannot wrap past the limit */
+       if (off > ctx->footer.curr_size ||
+           size > ctx->footer.curr_size - off)
+               return -ERANGE;
+
+       /* non-dynamic images are written straight to the file */
+       if (!vhd_type_dynamic(ctx))
+               return vhd_pwrite(ctx, buf, size, off);
+
+       return __vhd_io_dynamic_write_bytes(ctx, buf, size, off);
+}
+
+/*
+ * Fetch the marker byte stored in the batmap header into *@marker.
+ * *@marker is zeroed first, so it is 0 on every failure path.
+ * Returns -ENOSYS when the image has no batmap, otherwise the result
+ * of reading the batmap header.
+ */
+int
+vhd_marker(vhd_context_t *ctx, char *marker)
+{
+       int rc;
+       vhd_batmap_t hdr;
+
+       *marker = 0;
+
+       if (!vhd_has_batmap(ctx))
+               return -ENOSYS;
+
+       rc = vhd_read_batmap_header(ctx, &hdr);
+       if (!rc)
+               *marker = hdr.header.marker;
+
+       return rc;
+}
+
+/*
+ * Store @marker in the batmap header: the header is read, its marker
+ * field replaced, and the header written back.  Returns -ENOSYS when
+ * the image has no batmap, otherwise the first error from the read or
+ * the result of the write.
+ */
+int
+vhd_set_marker(vhd_context_t *ctx, char marker)
+{
+       int rc;
+       vhd_batmap_t hdr;
+
+       if (!vhd_has_batmap(ctx))
+               return -ENOSYS;
+
+       rc = vhd_read_batmap_header(ctx, &hdr);
+       if (rc)
+               return rc;
+
+       hdr.header.marker = marker;
+       rc = vhd_write_batmap_header(ctx, &hdr);
+
+       return rc;
+}
diff --git a/vhd/lib/libvhdio.c b/vhd/lib/libvhdio.c
new file mode 100644 (file)
index 0000000..eaa076d
--- /dev/null
@@ -0,0 +1,1638 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * XenSource proprietary code.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#ifdef _LARGEFILE_SOURCE
+#undef _LARGEFILE_SOURCE
+#endif
+
+#ifdef _LARGEFILE64_SOURCE
+#undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64
+#undef _FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 32
+#endif
+
+#ifdef _LARGEFILE_SOURCE
+#undef _LARGEFILE_SOURCE
+#endif
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <malloc.h>
+#include <sys/stat.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+
+#define _FCNTL_H
+#include <bits/fcntl.h>
+
+#include "libvhd.h"
+#include "partition.h"
+
+/* number of elements in a true array (not valid for pointers) */
+#define _ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0]))
+
+/*
+ * Make sure the library has been initialized, then, if the function
+ * pointer @func is still NULL, fill it in from the function table entry
+ * called @name.
+ */
+#define __RESOLVE(func, name)                                          \
+       do {                                                            \
+               if (!_libvhd_io_initialized)                            \
+                       _libvhd_io_init();                              \
+               if (!(func))                                            \
+                       (func) = _get_std_fn((name));                   \
+       } while (0)
+
+/* resolve @func using the name of the enclosing function as the key */
+#define _RESOLVE(func) __RESOLVE((func), __func__)
+
+/* names of the environment variables consulted during initialization */
+#define LIBVHD_IO_DEBUG "LIBVHD_IO_DEBUG"
+#define LIBVHD_IO_DUMP  "LIBVHD_IO_DUMP"
+#define LIBVHD_IO_TEST  "LIBVHD_IO_TEST"
+
+static int libvhdio_logging;
+static FILE *libvhdio_log;
+/*
+ * printf-style message to the log stream, flushed immediately.
+ * Emits nothing unless logging is switched on and the stream is open.
+ */
+#define LOG(_f, _a...)                                                 \
+       do {                                                            \
+               if (libvhdio_logging && libvhdio_log) {                 \
+                       fprintf(libvhdio_log, _f, ##_a);                \
+                       fflush(libvhdio_log);                           \
+               }                                                       \
+       } while (0)
+
+static int libvhdio_dump;
+/*
+ * Write @_size raw bytes of @_buf to the log stream when dumping is
+ * switched on.  The surrounding quotes go through LOG() and so appear
+ * only when logging is enabled as well.
+ */
+#define DUMP(_buf, _size)                                              \
+       do {                                                            \
+               if (libvhdio_log && libvhdio_dump) {                    \
+                       int i;                                          \
+                       LOG("'");                                       \
+                       for (i = 0; i < (_size); i++)                   \
+                               fputc(((char *)(_buf))[i],              \
+                                     libvhdio_log);                    \
+                       LOG("'\n");                                     \
+               }                                                       \
+       } while (0)
+
+/* one entry of the table of standard functions, looked up by name */
+struct _function {
+       const char                    *name;
+       void                          *fn;
+};
+
+/*
+ * An open vhd image kept on a list.
+ * NOTE(review): refcnt and ino suggest one object is shared by several
+ * descriptors of the same file -- confirm against the open/close paths.
+ */
+struct vhd_object {
+       vhd_context_t                  vhd;
+       int                            refcnt;
+       uint64_t                       ino;
+       struct list_head               next;
+};
+
+/* a sector range within a vhd object */
+struct vhd_partition {
+       struct vhd_object             *vhd_obj;
+       int                            partition;
+       int                            flags;
+       off64_t                        start;     /* in sectors */
+       off64_t                        end;       /* in sectors */
+       off64_t                        size;      /* in sectors */
+};
+
+/*
+ * State stored per slot of the descriptor map.
+ * NOTE(review): off presumably is the current file position and users
+ * a count of descriptors sharing this context -- confirm.
+ */
+struct vhd_fd_context {
+       struct vhd_partition           vhd_part;
+       off64_t                        off;
+       int                            users;
+};
+
+typedef struct vhd_object vhd_object_t;
+typedef struct vhd_partition vhd_partition_t;
+typedef struct vhd_fd_context vhd_fd_context_t;
+/* signatures of the standard functions called through resolved pointers */
+typedef int (*_std_open_t)(const char *, int, int);
+typedef int (*_std_close_t)(int);
+typedef FILE *(*_std_fopen_t)(const char *, const char *);
+
+/*
+ * Table of standard functions, keyed by symbol name.  _get_std_fn()
+ * looks entries up by name and returns the stored pointer; all
+ * pointers start out NULL here.
+ *
+ * NOTE(review): the #ifdef guards below test for preprocessor macros.
+ * If __open_2, __open64_2 and dup3 are ordinary functions rather than
+ * macros on the target libc, those entries are never compiled in --
+ * confirm that this is intended.
+ */
+static struct _function _function_table[] = {
+       { .name = "open",             .fn = NULL },
+       { .name = "open64",           .fn = NULL },
+#ifdef __open_2
+       { .name = "__open_2",         .fn = NULL },
+#endif // __open_2
+#ifdef __open64_2
+       { .name = "__open64_2",       .fn = NULL },
+#endif // __open64_2
+       { .name = "close",            .fn = NULL },
+       { .name = "dup",              .fn = NULL },
+       { .name = "dup2",             .fn = NULL },
+#ifdef dup3
+       { .name = "dup3",             .fn = NULL },
+#endif // dup3
+       { .name = "lseek",            .fn = NULL },
+       { .name = "lseek64",          .fn = NULL },
+       { .name = "read",             .fn = NULL },
+       { .name = "write",            .fn = NULL },
+       { .name = "pread",            .fn = NULL },
+       { .name = "pread64",          .fn = NULL },
+       { .name = "pwrite",           .fn = NULL },
+       { .name = "pwrite64",         .fn = NULL },
+       { .name = "fsync",            .fn = NULL },
+       { .name = "__xstat",          .fn = NULL },
+       { .name = "__xstat64",        .fn = NULL },
+       { .name = "__fxstat",         .fn = NULL },
+       { .name = "__fxstat64",       .fn = NULL },
+       { .name = "__lxstat",         .fn = NULL },
+       { .name = "__lxstat64",       .fn = NULL },
+       { .name = "ioctl",            .fn = NULL },
+       { .name = "fcntl",            .fn = NULL },
+
+       { .name = "fopen",            .fn = NULL },
+       { .name = "fopen64",          .fn = NULL },
+       { .name = "_IO_getc",         .fn = NULL },
+       { .name = "fread",            .fn = NULL },
+
+       { .name = "posix_memalign",   .fn = NULL },
+};
+
+/* cleared by _libvhd_io_reset() while it reopens images, then set again */
+static int _libvhd_io_interpose = 1;
+/* list head initialized by _init_vhd_objs() */
+static struct list_head _vhd_objects;
+/* table of per-descriptor contexts, indexed by user fd, and its length */
+static vhd_fd_context_t **_vhd_map;
+static int _vhd_map_size;
+
+/* tested by __RESOLVE() to decide whether _libvhd_io_init() must run */
+static int _libvhd_io_initialized;
+static void _libvhd_io_init(void) __attribute__((constructor));
+
+/* set from the SIGCONT handler, cleared by _libvhd_io_reset() */
+static volatile sig_atomic_t _libvhd_io_reset_vhds;
+
+/*
+ * Look up the next definition of @name after this library in symbol
+ * search order and return its address.  A missing symbol is fatal: the
+ * failure is logged and the process exits with status 1.
+ */
+static void *
+_load_std_fn(const char *name)
+{
+       void *fn;
+       char *msg;
+
+       LOG("loading %s\n", name);
+
+       /*
+        * dlerror() reports the most recent error since it was last
+        * called; clear it so that a stale message from an unrelated
+        * dl* call is not mistaken for a failure of this lookup.
+        */
+       dlerror();
+
+       fn  = dlsym(RTLD_NEXT, name);
+       msg = dlerror();
+       if (!fn || msg) {
+               /* msg is NULL when dlsym returned NULL without an error */
+               LOG("dlsym '%s' failed: %s\n", name,
+                   msg ? msg : "symbol not found");
+               exit(1);
+       }
+
+       return fn;
+}
+
+/*
+ * Return the function pointer stored in the table entry called @name,
+ * or NULL when the table has no such entry.
+ */
+static void *
+_get_std_fn(const char *name)
+{
+       struct _function *entry, *end;
+
+       entry = _function_table;
+       end   = _function_table + _ARRAY_SIZE(_function_table);
+
+       while (entry != end) {
+               if (strcmp(name, entry->name) == 0)
+                       return entry->fn;
+               entry++;
+       }
+
+       return NULL;
+}
+
+/*
+ * Open the log stream on a duplicate of stderr, obtained through the
+ * real dup(), and switch on logging and dumping according to the
+ * LIBVHD_IO_DEBUG and LIBVHD_IO_DUMP environment variables.
+ */
+static void
+_init_vhd_log(void)
+{
+       int fd;
+       int (*real_dup)(int);
+
+       real_dup = _load_std_fn("dup");
+       fd       = real_dup(STDERR_FILENO);
+
+       libvhdio_log = fdopen(fd, "a");
+
+       if (getenv(LIBVHD_IO_DEBUG) != NULL) {
+               libvhdio_logging = 1;
+               libvhd_set_log_level(1);
+       }
+
+       if (getenv(LIBVHD_IO_DUMP) != NULL)
+               libvhdio_dump = 1;
+}
+
+/*
+ * Allocate the zero-filled descriptor map with one slot per descriptor
+ * the process may have open.  Allocation failure is fatal.
+ */
+static void
+_init_vhd_map(void)
+{
+       _vhd_map_size = sysconf(_SC_OPEN_MAX);
+       _vhd_map      = calloc(_vhd_map_size, sizeof(*_vhd_map));
+       if (_vhd_map)
+               return;
+
+       LOG("failed to init vhd map\n");
+       exit(1);
+}
+
+/* Start with an empty list of vhd objects. */
+static void
+_init_vhd_objs(void)
+{
+       struct list_head *head = &_vhd_objects;
+
+       INIT_LIST_HEAD(head);
+}
+
+/*
+ * For every occupied slot of the descriptor map: close the vhd, rename
+ * its file to "<file>.<time>.vhd", create a snapshot at the original
+ * path with the renamed file as parent, and reopen the vhd on that
+ * snapshot with its previous open flags.  Interposition is switched
+ * off for the duration.  Any failure terminates the process.
+ *
+ * NOTE(review): slots are processed independently.  If two slots can
+ * refer to the same vhd object, that image is reset once per slot, and
+ * the parent name has only one-second resolution -- confirm the two
+ * cannot collide.
+ */
+static void
+_libvhd_io_reset(void)
+{
+       int i, err;
+
+       if (!_libvhd_io_interpose)
+               return;
+
+       /* acknowledge the request flagged by the signal handler */
+       _libvhd_io_reset_vhds = 0;
+
+       if (!_vhd_map)
+               return;
+
+       _libvhd_io_interpose = 0;
+
+       for (i = 0; i < _vhd_map_size; i++) {
+               int flags;
+               vhd_context_t *vhd;
+               char *child, *parent;
+               vhd_fd_context_t *vhd_fd = _vhd_map[i];
+
+               if (!vhd_fd)
+                       continue;
+
+               vhd = &vhd_fd->vhd_part.vhd_obj->vhd;
+
+               /* remember flags and path before closing the context */
+               flags = vhd->oflags;
+               child = strdup(vhd->file);
+               if (!child)
+                       exit(ENOMEM);
+
+               LOG("resetting vhd fd %d user fd %d\n", vhd->fd, i);
+               vhd_close(vhd);
+
+               if (asprintf(&parent, "%s.%d.vhd",
+                            child, (int)time(NULL)) == -1)
+                       exit(ENOMEM);
+
+               /* the current image becomes the parent of the new snapshot */
+               if (rename(child, parent))
+                       exit(errno);
+
+               err = vhd_snapshot(child, 0, parent, 0, 0);
+               if (err) {
+                       LOG("snapshot of %s failed on reset: %d\n",
+                           child, err);
+                       exit(1);
+               }
+
+               /* reopen in place so the map slot keeps pointing at a live vhd */
+               err = vhd_open(vhd, child, flags);
+               if (err) {
+                       LOG("opening new snapshot %s failed on reset: %d\n",
+                           child, err);
+                       exit(1);
+               }
+
+               LOG("snapshot %s %s vhd fd %d user fd %d\n",
+                   child, parent, vhd->fd, i);
+
+               free(child);
+               free(parent);
+       }
+
+       _libvhd_io_interpose = 1;
+}
+
+/*
+ * signal handler: only records that a reset was requested.  @signo is
+ * not used.
+ */
+static void
+_libvhd_io_continue(int signo)
+{
+       _libvhd_io_reset_vhds = 1;
+}
+
+/*
+ * when LIBVHD_IO_TEST is present in the environment, install
+ * _libvhd_io_continue as the SIGCONT handler.  exits on failure.
+ */
+static void
+_init_vhd_test(void)
+{
+       struct sigaction sa;
+
+       if (!getenv(LIBVHD_IO_TEST))
+               return;
+
+       /* no flags, empty mask */
+       memset(&sa, 0, sizeof(sa));
+       if (sigemptyset(&sa.sa_mask))
+               exit(1);
+       sa.sa_handler = _libvhd_io_continue;
+
+       if (sigaction(SIGCONT, &sa, NULL)) {
+               LOG("failed to set signal handler: %d\n", errno);
+               exit(1);
+       }
+
+       LOG("testing enabled\n");
+}
+
+/*
+ * one-time setup (declared as a constructor): logging, fd map, object
+ * list, test hook, then resolve every entry of _function_table.
+ */
+static void
+_libvhd_io_init(void)
+{
+       int slot;
+
+       if (_libvhd_io_initialized)
+               return;
+
+       _init_vhd_log();
+       _init_vhd_map();
+       _init_vhd_objs();
+       _init_vhd_test();
+
+       slot = 0;
+       while (slot < _ARRAY_SIZE(_function_table)) {
+               _function_table[slot].fn =
+                       _load_std_fn(_function_table[slot].name);
+               slot++;
+       }
+
+       LOG("\n");
+       _libvhd_io_initialized = 1;
+}
+
+/*
+ * return a referenced vhd object for @path, opening it if no object with
+ * the same inode number is on the list yet.  @flags are the caller's
+ * open(2) flags.  returns NULL with errno set on failure.
+ *
+ * interposition is switched off for the duration of the call and
+ * unconditionally switched back on at exit.
+ */
+static vhd_object_t *
+_libvhd_io_get_vhd(const char *path, int flags)
+{
+       struct stat64 st;
+       int err, vhd_flags;
+       vhd_object_t *tmp, *obj = NULL;
+
+       _libvhd_io_interpose = 0;
+
+       if (stat64(path, &st))
+               goto out;
+
+       /*
+        * objects are matched on inode number only.
+        * NOTE(review): the device is not compared -- confirm that files
+        * on different filesystems cannot be confused here.
+        */
+       list_for_each_entry(tmp, &_vhd_objects, next)
+               if (tmp->ino == st.st_ino) {
+                       obj = tmp;
+                       /* a RDONLY object cannot serve a writer */
+                       if (flags & (O_RDWR | O_WRONLY) &&
+                           obj->vhd.oflags & VHD_OPEN_RDONLY) {
+                               errno = EACCES;
+                               obj = NULL;
+                       }
+                       goto out;
+               }
+
+       vhd_flags = VHD_OPEN_CACHED;
+
+       /*
+        * we open RDWR whenever we can since vhd objects may be shared and
+        * we don't have a clean way to switch RDONLY vhds to RDWR.  we'll
+        * only open RDONLY when the caller did not ask for write access
+        * and we lack permission to open RDWR.
+        */
+       if (access(path, W_OK) == -1) {
+               if (errno != EACCES)
+                       goto out;
+
+               if (flags & (O_WRONLY | O_RDWR))
+                       goto out;
+
+               vhd_flags |= VHD_OPEN_RDONLY;
+       } else {
+               vhd_flags |= VHD_OPEN_RDWR;
+       }
+
+       obj = malloc(sizeof(*obj));
+       if (!obj) {
+               errno = ENOMEM;
+               goto out;
+       }
+
+       INIT_LIST_HEAD(&obj->next);
+       obj->refcnt = 0;
+       obj->ino = st.st_ino;
+
+       err = vhd_open(&obj->vhd, path, vhd_flags);
+       if (err) {
+               free(obj);
+               obj = NULL;
+               /* stored as returned; _libvhd_io_vhd_open strips the sign */
+               errno = err;
+               goto out;
+       }
+
+       list_add(&obj->next, &_vhd_objects);
+
+out:
+       _libvhd_io_interpose = 1;
+       /* both new and shared objects gain one reference here */
+       if (obj) {
+               obj->refcnt++;
+               LOG("%s: %s 0x%llx 0x%x\n",
+                   __func__, path, obj->ino, obj->refcnt);
+       }
+       return obj;
+}
+
+/*
+ * drop one reference on @obj; the last reference closes the vhd, takes
+ * the object off the list and frees it.
+ */
+static void
+_libvhd_io_put_vhd(vhd_object_t *obj)
+{
+       LOG("%s: 0x%llx 0x%x\n", __func__, obj->ino, obj->refcnt - 1);
+
+       obj->refcnt--;
+       if (obj->refcnt != 0)
+               return;
+
+       vhd_close(&obj->vhd);
+       list_del(&obj->next);
+       free(obj);
+}
+
+/*
+ * return the vhd context mapped at fd @idx, or NULL if @idx is not a
+ * vhd fd.  a pending reset request is processed first.
+ *
+ * @idx comes straight from callers of read(), ioctl(), fileno(), etc.,
+ * so it may be negative or beyond the table; such values are simply
+ * "not a vhd" rather than an out-of-bounds access.
+ */
+static inline vhd_fd_context_t *
+_libvhd_io_map_get(int idx)
+{
+       if (_libvhd_io_reset_vhds)
+               _libvhd_io_reset();
+
+       if (!_vhd_map || idx < 0 || idx >= _vhd_map_size)
+               return NULL;
+
+       return _vhd_map[idx];
+}
+
+/*
+ * map fd @idx to @vhd_fd and take a user reference on it.
+ *
+ * - an @idx outside the table is logged and ignored instead of being
+ *   written out of bounds.
+ * - mapping a slot to the context it already holds is a no-op, so the
+ *   user count is not inflated.
+ * - a different context already in the slot is released first, so that
+ *   its user count (and its vhd object) are not leaked.
+ */
+static inline void
+_libvhd_io_map_set(int idx, vhd_fd_context_t *vhd_fd)
+{
+       vhd_fd_context_t *prev;
+
+       if (!_vhd_map || idx < 0 || idx >= _vhd_map_size) {
+               LOG("cannot map 0x%x: outside of vhd map\n", idx);
+               return;
+       }
+
+       prev = _vhd_map[idx];
+       if (prev == vhd_fd)
+               return;
+
+       if (prev && --prev->users == 0) {
+               _libvhd_io_put_vhd(prev->vhd_part.vhd_obj);
+               free(prev);
+       }
+
+       vhd_fd->users++;
+       _vhd_map[idx] = vhd_fd;
+       LOG("mapping 0x%x to %s (0x%x users)\n",
+           idx, vhd_fd->vhd_part.vhd_obj->vhd.file, vhd_fd->users);
+}
+
+/*
+ * unmap fd @idx.  the context loses one user; when the last user goes,
+ * its vhd object reference is dropped and the context is freed.
+ * out-of-range and unmapped indices are ignored.
+ */
+static inline void
+_libvhd_io_map_clear(int idx)
+{
+       vhd_fd_context_t *ctx;
+
+       if (idx < 0 || idx >= _vhd_map_size)
+               return;
+
+       ctx = _vhd_map[idx];
+       _vhd_map[idx] = NULL;
+       if (!ctx)
+               return;
+
+       ctx->users--;
+       if (ctx->users != 0)
+               return;
+
+       _libvhd_io_put_vhd(ctx->vhd_part.vhd_obj);
+       free(ctx);
+}
+
+/*
+ * read @size bytes at byte offset @off of the vhd behind @vhd_part into
+ * @buf, with interposition disabled around the libvhd call.
+ * returns 0 on success; on failure returns 1 with errno set to the
+ * negated libvhd result.
+ */
+static int
+_libvhd_io_read_bytes(vhd_partition_t *vhd_part,
+                     void *buf, size_t size, uint64_t off)
+{
+       int err;
+       vhd_context_t *ctx = &vhd_part->vhd_obj->vhd;
+
+       _libvhd_io_interpose = 0;
+       err = vhd_io_read_bytes(ctx, buf, size, off);
+       _libvhd_io_interpose = 1;
+
+       if (!err) {
+               LOG("vhd_io_read_bytes %s %p 0x%x 0x%llx\n",
+                   ctx->file, buf, size, off);
+               DUMP(buf, size);
+               return 0;
+       }
+
+       LOG("vhd_io_read_bytes %s %p 0x%x 0x%llx failed: %d\n",
+           ctx->file, buf, size, off, err);
+       errno = -err;
+       return 1;
+}
+
+/*
+ * write @size bytes from @buf at byte offset @off of the vhd behind
+ * @vhd_part, with interposition disabled around the libvhd call.
+ * returns 0 on success; on failure returns 1 with errno set to the
+ * negated libvhd result.
+ */
+static int
+_libvhd_io_write_bytes(vhd_partition_t *vhd_part,
+                      const void *buf, size_t size, uint64_t off)
+{
+       int err;
+       vhd_context_t *ctx = &vhd_part->vhd_obj->vhd;
+
+       _libvhd_io_interpose = 0;
+       err = vhd_io_write_bytes(ctx, (void *)buf, size, off);
+       _libvhd_io_interpose = 1;
+
+       if (!err) {
+               LOG("vhd_io_write_bytes %s %p 0x%x 0x%llx\n",
+                   ctx->file, buf, size, off);
+               DUMP(buf, size);
+               return 0;
+       }
+
+       LOG("vhd_io_write_bytes %s %p 0x%x 0x%llx failed: %d\n",
+           ctx->file, buf, size, off, err);
+       errno = -err;
+       return 1;
+}
+
+/*
+ * symlink pathnames like *.vhd[1-4] are treated specially
+ */
+/*
+ * decide how @path should be treated.
+ *
+ * @partition: set to 1-4 when @path is a symlink whose name ends in
+ *             ".vhd" followed by that single digit, otherwise 0.
+ * @skip:      set to 1 when @path is not a symlink and is smaller than
+ *             one sector, otherwise 0.
+ *
+ * returns 0 on success, the lstat errno if @path cannot be examined, or
+ * EINVAL when a symlink carries any other suffix after ".vhd".
+ */
+static int
+_libvhd_io_guess_partition(const char *path, int *partition, int *skip)
+{
+       int err;
+       struct stat64 st;
+       const char *sfx, *next;
+
+       *skip      = 0;
+       *partition = 0;
+
+       _libvhd_io_interpose = 0;
+       err = lstat64(path, &st);
+       _libvhd_io_interpose = 1;
+
+       if (err == -1)
+               return errno;
+
+       if ((st.st_mode & __S_IFMT) != __S_IFLNK) {
+               if (st.st_size < VHD_SECTOR_SIZE)
+                       *skip = 1;
+               return 0;
+       }
+
+       /*
+        * use the LAST ".vhd" in the name: a name may contain the
+        * extension more than once (the reset path creates files named
+        * "<file>.<time>.vhd"), and only what trails the final one is the
+        * partition suffix.
+        */
+       sfx = NULL;
+       for (next = strstr(path, ".vhd"); next;
+            next = strstr(next + 1, ".vhd"))
+               sfx = next;
+       if (!sfx)
+               return 0;
+
+       sfx += strlen(".vhd");
+       if (sfx[0] == '\0')
+               return 0;
+       if (sfx[1] != '\0')
+               return EINVAL;
+       if (sfx[0] < '1' || sfx[0] > '4')
+               return EINVAL;
+
+       *partition = sfx[0] - '0';
+       return 0;
+}
+
+/*
+ * fill in the sector range of @vhd_part for primary partition
+ * @partition (1-4), or for the whole disk when @partition is 0.
+ *
+ * returns 0 on success, ENOENT for an out-of-range or empty partition,
+ * the errno of a failed read of the partition table, or whatever
+ * posix_memalign / partition_table_validate report.
+ */
+static int
+_libvhd_io_init_partition(vhd_partition_t *vhd_part, int partition)
+{
+       int err;
+       vhd_context_t *vhd;
+       struct partition_table *pt;
+       struct primary_partition *p;
+
+       if (partition < 0 || partition > 4)
+               return ENOENT;
+
+       vhd = &vhd_part->vhd_obj->vhd;
+
+       if (!partition) {
+               vhd_part->partition = 0;
+               vhd_part->start     = 0;
+               vhd_part->end       = (vhd->footer.curr_size >> VHD_SECTOR_SHIFT);
+               vhd_part->size      = vhd_part->end;
+               return 0;
+       }
+
+       err = posix_memalign((void **)&pt, VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
+       if (err)
+               return err;
+
+       /* read exactly as much as was allocated above */
+       if (_libvhd_io_read_bytes(vhd_part, (char *)pt, VHD_SECTOR_SIZE, 0)) {
+               /*
+                * the helper only returns 1 and leaves the reason in
+                * errno; fetch it before LOG gets a chance to change it.
+                */
+               err = (errno > 0 ? errno : EIO);
+               LOG("reading partition failed: %d\n", err);
+               goto out;
+       }
+
+       partition_table_in(pt);
+       err = partition_table_validate(pt);
+       if (err) {
+               LOG("bad partition table read\n");
+               goto out;
+       }
+
+       p = pt->partitions + (partition - 1);
+       if (!p->lba || !p->blocks) {
+               err = ENOENT;
+               goto out;
+       }
+
+       vhd_part->partition = partition;
+       vhd_part->start     = p->lba;
+       vhd_part->end       = p->lba + p->blocks;
+       vhd_part->size      = p->blocks;
+       err                 = 0;
+
+       LOG("%s: opening %s partition 0x%x start 0x%08llx end 0x%08llx\n",
+           __func__, vhd->file, partition, vhd_part->start, vhd_part->end);
+
+out:
+       free(pt);
+       return err;
+}
+
+/*
+ * initialize @vhd_part for @path: work out the partition, get a
+ * reference on the vhd object and load the partition bounds.
+ * returns 0 on success or a positive error number; on failure
+ * @vhd_part holds no object reference.
+ */
+static int
+_libvhd_io_vhd_open(vhd_partition_t *vhd_part, const char *path, int flags)
+{
+       int rc, skip, partition;
+       vhd_object_t *obj;
+
+       memset(vhd_part, 0, sizeof(*vhd_part));
+       vhd_part->flags = flags;
+
+       rc = _libvhd_io_guess_partition(path, &partition, &skip);
+       if (rc)
+               return rc;
+
+       if (skip)
+               return EINVAL;
+
+       LOG("%s: attempting vhd_open of %s\n", __func__, path);
+
+       obj = _libvhd_io_get_vhd(path, flags);
+       vhd_part->vhd_obj = obj;
+       rc = (obj ? 0 : errno);
+
+       if (rc == 0) {
+               rc = _libvhd_io_init_partition(vhd_part, partition);
+               if (rc) {
+                       /* give the reference back and forget the object */
+                       _libvhd_io_put_vhd(vhd_part->vhd_obj);
+                       memset(vhd_part, 0, sizeof(*vhd_part));
+               }
+       }
+
+       /* errors may arrive negated; always hand back a positive code */
+       return (rc < 0 ? -rc : rc);
+}
+
+/*
+ * common implementation of the open() family.
+ *
+ * tries to open @pathname as a vhd.  on success a placeholder fd is
+ * obtained by opening /dev/null with @_std_open and mapped to the new
+ * vhd context.  when the vhd open fails with EINVAL or ENOENT the call
+ * is handed to @_std_open unchanged; any other failure returns -1 with
+ * errno set.
+ */
+static int
+_libvhd_io_open(const char *pathname,
+               int flags, mode_t mode, _std_open_t _std_open)
+{
+       int err, fd;
+       vhd_fd_context_t *vhd_fd;
+
+       errno    = 0;
+       vhd_fd   = NULL;
+
+       vhd_fd = calloc(1, sizeof(*vhd_fd));
+       if (!vhd_fd) {
+               err = ENOMEM;
+               goto fail;
+       }
+
+       err = _libvhd_io_vhd_open(&vhd_fd->vhd_part, pathname, flags);
+       if (err) {
+               /* not something we handle: fall back to a plain open */
+               if (err == EINVAL || err == ENOENT)
+                       goto std_open;
+
+               LOG("%s: vhd_open of %s failed: %d\n",
+                   __func__, pathname, err);
+               goto fail;
+       }
+
+       /* these flags are refused for vhds */
+#ifdef O_CLOEXEC
+       if (flags & (O_APPEND | O_ASYNC | O_CLOEXEC |
+                    O_DIRECTORY | O_NONBLOCK)) {
+#else
+       if (flags & (O_APPEND | O_ASYNC | O_DIRECTORY | O_NONBLOCK)) {
+#endif //O_CLOEXEC
+               LOG("%s: invalid flags for vhd_open: 0x%x\n", __func__, flags);
+               err = EINVAL;
+               goto fail;
+       }
+
+       /* placeholder descriptor whose number is handed to the caller */
+       fd = _std_open("/dev/null", O_RDONLY, 0);
+       if (fd == -1) {
+               err = errno;
+               goto fail;
+       }
+
+       _libvhd_io_map_set(fd, vhd_fd);
+       return fd;
+
+std_open:
+       free(vhd_fd);
+       return _std_open(pathname, flags, mode);
+
+fail:
+       /* drop the object reference if the vhd open got that far */
+       if (vhd_fd && vhd_fd->vhd_part.vhd_obj)
+               _libvhd_io_put_vhd(vhd_fd->vhd_part.vhd_obj);
+       free(vhd_fd);
+       errno = err;
+       return -1;
+}
+
+/*
+ * drop any vhd mapping for @fd, then close @fd with @_std_close.
+ */
+static int
+_libvhd_io_close(int fd, _std_close_t _std_close)
+{
+       _libvhd_io_map_clear(fd);
+       return _std_close(fd);
+}
+
+/*
+ * common implementation of fopen()/fopen64() for modes without 'w':
+ * translate @mode to open flags, open through _libvhd_io_open and wrap
+ * the resulting fd in a stream.  vhd fds are always wrapped with mode
+ * "r"; other fds keep the caller's @mode.
+ */
+static FILE *
+_libvhd_io_fopen(const char *path, const char *mode)
+{
+       FILE *stream;
+       int fd, flags, update;
+       const char *fdmode;
+       static _std_open_t _std_open64;
+
+       __RESOLVE(_std_open64, "open64");
+
+       flags  = 0;
+       update = (strchr(mode, '+') != NULL);
+
+       if (strchr(mode, 'a'))
+               flags |= O_APPEND | (update ? O_RDWR : O_WRONLY);
+       if (strchr(mode, 'r'))
+               flags |= (update ? O_RDWR : O_RDONLY);
+       if (strchr(mode, 'w')) {
+               errno = EINVAL;
+               return NULL;
+       }
+
+       fd = _libvhd_io_open(path, flags, 0, _std_open64);
+       if (fd == -1)
+               return NULL;
+
+       fdmode = (_libvhd_io_map_get(fd) ? "r" : mode);
+
+       stream = fdopen(fd, fdmode);
+       if (!stream) {
+               /* keep the fdopen error across the close */
+               int saved = errno;
+               close(fd);
+               errno = saved;
+       }
+
+       return stream;
+}
+
+/*
+ * read up to @count bytes at partition-relative byte @offset.
+ * the request is clipped to the end of the partition; an @offset at or
+ * past the end reads 0 bytes.  returns the number of bytes read, or -1
+ * (EPERM when the O_WRONLY bit is set in the open flags).
+ */
+static ssize_t
+_libvhd_io_pread(vhd_partition_t *vhd_part,
+                void *buf, size_t count, off64_t offset)
+{
+       off64_t limit = vhd_part->size << VHD_SECTOR_SHIFT;
+
+       if (vhd_part->flags & O_WRONLY) {
+               errno = EPERM;
+               return (ssize_t)-1;
+       }
+
+       if (offset >= limit)
+               return 0;
+
+       count   = MIN(count, limit - offset);
+       /* translate to an offset within the whole vhd */
+       offset += (vhd_part->start << VHD_SECTOR_SHIFT);
+
+       if (_libvhd_io_read_bytes(vhd_part, buf, count, offset))
+               return (ssize_t)-1;
+
+       return count;
+}
+
+/*
+ * write up to @count bytes at partition-relative byte @offset.
+ * the request is clipped to the end of the partition; an @offset at or
+ * past the end writes 0 bytes.  returns the number of bytes written, or
+ * -1 (EPERM when the descriptor was opened read-only).
+ */
+static ssize_t
+_libvhd_io_pwrite(vhd_partition_t *vhd_part,
+                 const void *buf, size_t count, off64_t offset)
+{
+       ssize_t ret;
+       off64_t psize;
+
+       ret   = (ssize_t)-1;
+       psize = vhd_part->size << VHD_SECTOR_SHIFT;
+
+       /*
+        * O_RDONLY is 0, so it cannot be tested as a bit: compare the
+        * access mode instead.
+        */
+       if ((vhd_part->flags & O_ACCMODE) == O_RDONLY) {
+               errno = EPERM;
+               goto out;
+       }
+
+       if (offset >= psize) {
+               ret = 0;
+               goto out;
+       }
+
+       count   = MIN(count, psize - offset);
+       /* translate to an offset within the whole vhd */
+       offset += (vhd_part->start << VHD_SECTOR_SHIFT);
+
+       if (_libvhd_io_write_bytes(vhd_part, buf, count, offset))
+               goto out;
+
+       ret = count;
+
+out:
+       return ret;
+}
+
+/*
+ * stat the file backing @vhd_part and rewrite the result so that it
+ * describes a block device: zero size and block count, page-sized
+ * block size, __S_IFBLK in place of __S_IFREG.
+ */
+static int
+_libvhd_io_fstat(int version, vhd_partition_t *vhd_part, struct stat *stats)
+{
+       int rc;
+       static int (*_std___fxstat)(int, int, struct stat *);
+
+       __RESOLVE(_std___fxstat, "__fxstat");
+
+       rc = _std___fxstat(version, vhd_part->vhd_obj->vhd.fd, stats);
+       if (rc != 0)
+               return rc;
+
+       stats->st_size    = 0;
+       stats->st_blocks  = 0;
+       stats->st_blksize = getpagesize();
+       stats->st_mode    = (stats->st_mode & ~__S_IFREG) | __S_IFBLK;
+
+       return 0;
+}
+
+/*
+ * 64-bit variant: stat the file backing @vhd_part and rewrite the
+ * result so that it describes a block device: zero size and block
+ * count, page-sized block size, __S_IFBLK in place of __S_IFREG.
+ */
+static int
+_libvhd_io_fstat64(int version,
+                  vhd_partition_t *vhd_part, struct stat64 *stats)
+{
+       int rc;
+       static int (*_std___fxstat64)(int, int, struct stat64 *);
+
+       __RESOLVE(_std___fxstat64, "__fxstat64");
+
+       rc = _std___fxstat64(version, vhd_part->vhd_obj->vhd.fd, stats);
+       if (rc != 0)
+               return rc;
+
+       stats->st_size    = 0;
+       stats->st_blocks  = 0;
+       stats->st_blksize = getpagesize();
+       stats->st_mode    = (stats->st_mode & ~__S_IFREG) | __S_IFBLK;
+
+       return 0;
+}
+
+/*
+ * stat @path as a vhd: take a temporary reference on the vhd, emulate
+ * the block-device stat, release the reference.  returns -1 with errno
+ * set when @path cannot be opened as a vhd.
+ */
+static int
+_libvhd_io_stat(int version, const char *path, struct stat *stats)
+{
+       int rc;
+       vhd_partition_t part;
+
+       rc = _libvhd_io_vhd_open(&part, path, O_RDONLY);
+       if (rc) {
+               errno = (rc > 0 ? rc : -rc);
+               return -1;
+       }
+
+       rc = _libvhd_io_fstat(version, &part, stats);
+       _libvhd_io_put_vhd(part.vhd_obj);
+
+       return rc;
+}
+
+/*
+ * 64-bit variant: stat @path as a vhd: take a temporary reference on
+ * the vhd, emulate the block-device stat, release the reference.
+ * returns -1 with errno set when @path cannot be opened as a vhd.
+ */
+static int
+_libvhd_io_stat64(int version, const char *path, struct stat64 *stats)
+{
+       int rc;
+       vhd_partition_t part;
+
+       rc = _libvhd_io_vhd_open(&part, path, O_RDONLY);
+       if (rc) {
+               errno = (rc > 0 ? rc : -rc);
+               return -1;
+       }
+
+       rc = _libvhd_io_fstat64(version, &part, stats);
+       _libvhd_io_put_vhd(part.vhd_obj);
+
+       return rc;
+}
+
+/*
+ * interposed open(): routed through _libvhd_io_open unless
+ * interposition is off.  @_mode is only honoured with O_CREAT.
+ */
+int
+open(const char *pathname, int flags, mode_t _mode)
+{
+       int ret;
+       mode_t mode = 0;
+       static _std_open_t _std_open;
+
+       _RESOLVE(_std_open);
+       if (flags & O_CREAT)
+               mode = _mode;
+
+       if (_libvhd_io_interpose) {
+               ret = _libvhd_io_open(pathname, flags, mode, _std_open);
+               LOG("%s %s 0x%x 0x%x: 0x%x\n",
+                   __func__, pathname, flags, mode, ret);
+               return ret;
+       }
+
+       return _std_open(pathname, flags, mode);
+}
+
+/*
+ * interposed open64(): routed through _libvhd_io_open unless
+ * interposition is off.  @_mode is only honoured with O_CREAT.
+ */
+int
+open64(const char *pathname, int flags, mode_t _mode)
+{
+       int ret;
+       mode_t mode = 0;
+       static _std_open_t _std_open64;
+
+       _RESOLVE(_std_open64);
+       if (flags & O_CREAT)
+               mode = _mode;
+
+       if (_libvhd_io_interpose) {
+               ret = _libvhd_io_open(pathname, flags, mode, _std_open64);
+               LOG("%s %s 0x%x 0x%x: 0x%x\n",
+                   __func__, pathname, flags, mode, ret);
+               return ret;
+       }
+
+       return _std_open64(pathname, flags, mode);
+}
+
+/*
+ * interposed __open_2(): routed through _libvhd_io_open unless
+ * interposition is off.  @_mode is only honoured with O_CREAT.
+ */
+int
+__open_2(const char *pathname, int flags, mode_t _mode)
+{
+       int ret;
+       mode_t mode = 0;
+       static _std_open_t _std___open_2;
+
+       _RESOLVE(_std___open_2);
+       if (flags & O_CREAT)
+               mode = _mode;
+
+       if (_libvhd_io_interpose) {
+               ret = _libvhd_io_open(pathname, flags, mode, _std___open_2);
+               LOG("%s %s 0x%x 0x%x: 0x%x\n",
+                   __func__, pathname, flags, mode, ret);
+               return ret;
+       }
+
+       return _std___open_2(pathname, flags, mode);
+}
+
+/*
+ * interposed __open64_2(): routed through _libvhd_io_open unless
+ * interposition is off.  @_mode is only honoured with O_CREAT.
+ */
+int
+__open64_2(const char *pathname, int flags, mode_t _mode)
+{
+       int ret;
+       mode_t mode = 0;
+       static _std_open_t _std___open64_2;
+
+       _RESOLVE(_std___open64_2);
+       if (flags & O_CREAT)
+               mode = _mode;
+
+       if (_libvhd_io_interpose) {
+               ret = _libvhd_io_open(pathname, flags, mode, _std___open64_2);
+               LOG("%s %s 0x%x 0x%x: 0x%x\n",
+                   __func__, pathname, flags, mode, ret);
+               return ret;
+       }
+
+       return _std___open64_2(pathname, flags, mode);
+}
+
+/*
+ * interposed close(): always goes through _libvhd_io_close so that any
+ * vhd mapping for @fd is dropped before the real close.
+ */
+int
+close(int fd)
+{
+       static _std_close_t _std_close;
+
+       _RESOLVE(_std_close);
+
+       LOG("%s 0x%x\n", __func__, fd);
+
+       return _libvhd_io_close(fd, _std_close);
+}
+
+/*
+ * interposed dup(): duplicate @oldfd and, when it is a vhd fd, map the
+ * new descriptor to the same vhd context.
+ */
+int
+dup(int oldfd)
+{
+       int copy;
+       vhd_fd_context_t *ctx;
+       static int (*_std_dup)(int);
+
+       _RESOLVE(_std_dup);
+       ctx = _libvhd_io_map_get(oldfd);
+
+       LOG("%s 0x%x\n", __func__, oldfd);
+
+       copy = _std_dup(oldfd);
+       if (copy == -1 || !ctx)
+               return copy;
+
+       _libvhd_io_map_set(copy, ctx);
+       return copy;
+}
+
+/*
+ * interposed dup2(): duplicate @oldfd onto @newfd and bring the vhd
+ * map in line with the result.
+ *
+ * dup2 implicitly closes whatever @newfd referred to, so a vhd mapping
+ * held by @newfd is dropped first; otherwise a plain file duplicated
+ * onto a former vhd fd would keep being treated as that vhd.  when
+ * @oldfd == @newfd nothing changes and the map is left alone.
+ */
+int
+dup2(int oldfd, int newfd)
+{
+       int ret;
+       vhd_fd_context_t *vhd_fd;
+       static int (*_std_dup2)(int, int);
+
+       _RESOLVE(_std_dup2);
+       vhd_fd = _libvhd_io_map_get(oldfd);
+
+       LOG("%s 0x%x 0x%x\n", __func__, oldfd, newfd);
+
+       ret = _std_dup2(oldfd, newfd);
+       if (ret != -1 && ret != oldfd) {
+               _libvhd_io_map_clear(ret);
+               if (vhd_fd)
+                       _libvhd_io_map_set(ret, vhd_fd);
+       }
+
+       return ret;
+}
+
+/*
+ * interposed dup3(): duplicate @oldfd onto @newfd and bring the vhd
+ * map in line with the result.
+ *
+ * as with dup2, the descriptor previously at @newfd is implicitly
+ * closed, so any vhd mapping it held is dropped before the new one is
+ * installed.
+ */
+int
+dup3(int oldfd, int newfd, int flags)
+{
+       int ret;
+       vhd_fd_context_t *vhd_fd;
+       static int (*_std_dup3)(int, int, int);
+
+       _RESOLVE(_std_dup3);
+       vhd_fd = _libvhd_io_map_get(oldfd);
+
+       LOG("%s 0x%x 0x%x 0x%x\n", __func__, oldfd, newfd, flags);
+
+       /*
+        * TODO: handle O_CLOEXEC...
+        */
+       ret = _std_dup3(oldfd, newfd, flags);
+       if (ret != -1 && ret != oldfd) {
+               _libvhd_io_map_clear(ret);
+               if (vhd_fd)
+                       _libvhd_io_map_set(ret, vhd_fd);
+       }
+
+       return ret;
+}
+
+/*
+ * interposed lseek(): for vhd fds the position is kept in the vhd
+ * context and must stay within [0, partition size in bytes]; other fds
+ * go to the real lseek.
+ */
+off_t
+lseek(int fd, off_t offset, int whence)
+{
+       off_t target;
+       vhd_fd_context_t *vhd_fd;
+       static off_t (*_std_lseek)(int, off_t, int);
+
+       _RESOLVE(_std_lseek);
+       vhd_fd = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x 0x%lx 0x%x\n", __func__, fd, offset, whence);
+
+       if (!vhd_fd)
+               return _std_lseek(fd, offset, whence);
+
+       if (whence == SEEK_SET) {
+               target = offset;
+       } else if (whence == SEEK_CUR) {
+               target = vhd_fd->off + offset;
+       } else if (whence == SEEK_END) {
+               target = (vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) + offset;
+       } else {
+               errno = EINVAL;
+               return (off_t)-1;
+       }
+
+       if (target < 0 ||
+           target > vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) {
+               errno = EINVAL;
+               return (off_t)-1;
+       }
+
+       vhd_fd->off = target;
+       return vhd_fd->off;
+}
+
+/*
+ * interposed lseek64(): for vhd fds the position is kept in the vhd
+ * context and must stay within [0, partition size in bytes]; other fds
+ * go to the real lseek64.
+ */
+off64_t
+lseek64(int fd, off64_t offset, int whence)
+{
+       off64_t target;
+       vhd_fd_context_t *vhd_fd;
+       static off64_t (*_std_lseek64)(int, off64_t, int);
+
+       _RESOLVE(_std_lseek64);
+       vhd_fd = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x 0x%llx 0x%x\n", __func__, fd, offset, whence);
+
+       if (!vhd_fd)
+               return _std_lseek64(fd, offset, whence);
+
+       if (whence == SEEK_SET) {
+               target = offset;
+       } else if (whence == SEEK_CUR) {
+               target = vhd_fd->off + offset;
+       } else if (whence == SEEK_END) {
+               target = (vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) + offset;
+       } else {
+               errno = EINVAL;
+               return (off64_t)-1;
+       }
+
+       if (target < 0 ||
+           target > vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) {
+               errno = EINVAL;
+               return (off64_t)-1;
+       }
+
+       vhd_fd->off = target;
+       return vhd_fd->off;
+}
+
+/*
+ * interposed read(): vhd fds are read at the position kept in the vhd
+ * context, which then advances by the number of bytes actually read;
+ * other fds go to the real read.
+ */
+ssize_t
+read(int fd, void *buf, size_t count)
+{
+       ssize_t ret;
+       vhd_fd_context_t *vhd_fd;
+       static ssize_t (*_std_read)(int, void *, size_t);
+
+       _RESOLVE(_std_read);
+       vhd_fd = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x %p 0x%x\n", __func__, fd, buf, count);
+
+       if (!vhd_fd)
+               return _std_read(fd, buf, count);
+
+       ret = _libvhd_io_pread(&vhd_fd->vhd_part, buf, count, vhd_fd->off);
+       /*
+        * the transfer may have been clipped at the end of the partition
+        * (or be 0 at EOF), so advance by what was read, not by what was
+        * asked for.
+        */
+       if (ret > 0)
+               vhd_fd->off += ret;
+
+       return ret;
+}
+
+/*
+ * interposed write(): vhd fds are written at the position kept in the
+ * vhd context, which then advances by the number of bytes actually
+ * written; other fds go to the real write.
+ */
+ssize_t
+write(int fd, const void *buf, size_t count)
+{
+       ssize_t ret;
+       vhd_fd_context_t *vhd_fd;
+       static ssize_t (*_std_write)(int, const void *, size_t);
+
+       _RESOLVE(_std_write);
+       vhd_fd = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x %p 0x%x\n", __func__, fd, buf, count);
+
+       if (!vhd_fd)
+               return _std_write(fd, buf, count);
+
+       ret = _libvhd_io_pwrite(&vhd_fd->vhd_part, buf, count, vhd_fd->off);
+       /*
+        * the transfer may have been clipped at the end of the partition,
+        * so advance by what was written, not by what was asked for.
+        */
+       if (ret > 0)
+               vhd_fd->off += ret;
+
+       return ret;
+}
+
+/*
+ * interposed pread(): vhd fds are served by _libvhd_io_pread, all
+ * others by the real pread.
+ */
+ssize_t
+pread(int fd, void *buf, size_t count, off_t offset)
+{
+       vhd_fd_context_t *ctx;
+       static ssize_t (*_std_pread)(int, void *, size_t, off_t);
+
+       _RESOLVE(_std_pread);
+       ctx = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x %p 0x%x 0x%lx\n", __func__, fd, buf, count, offset);
+
+       if (ctx)
+               return _libvhd_io_pread(&ctx->vhd_part, buf, count, offset);
+
+       return _std_pread(fd, buf, count, offset);
+}
+
+/*
+ * interposed pread64(): vhd fds are served by _libvhd_io_pread, all
+ * others by the real pread64.
+ */
+ssize_t
+pread64(int fd, void *buf, size_t count, off64_t offset)
+{
+       vhd_fd_context_t *ctx;
+       static ssize_t (*_std_pread64)(int, void *, size_t, off64_t);
+
+       _RESOLVE(_std_pread64);
+       ctx = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x %p 0x%x 0x%llx\n", __func__, fd, buf, count, offset);
+
+       if (ctx)
+               return _libvhd_io_pread(&ctx->vhd_part, buf, count, offset);
+
+       return _std_pread64(fd, buf, count, offset);
+}
+
+/*
+ * interposed pwrite(): vhd fds are served by _libvhd_io_pwrite, all
+ * others by the real pwrite.
+ */
+ssize_t
+pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+       vhd_fd_context_t *ctx;
+       static ssize_t (*_std_pwrite)(int, const void *, size_t, off_t);
+
+       _RESOLVE(_std_pwrite);
+       ctx = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x %p 0x%x, 0x%lx\n", __func__, fd, buf, count, offset);
+
+       if (ctx)
+               return _libvhd_io_pwrite(&ctx->vhd_part, buf, count, offset);
+
+       return _std_pwrite(fd, buf, count, offset);
+}
+
+/*
+ * interposed pwrite64(): vhd fds are served by _libvhd_io_pwrite, all
+ * others by the real pwrite64.
+ */
+ssize_t
+pwrite64(int fd, const void *buf, size_t count, off64_t offset)
+{
+       vhd_fd_context_t *ctx;
+       static ssize_t (*_std_pwrite64)(int, const void *, size_t, off64_t);
+
+       _RESOLVE(_std_pwrite64);
+       ctx = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x %p 0x%x, 0x%llx\n", __func__, fd, buf, count, offset);
+
+       if (ctx)
+               return _libvhd_io_pwrite(&ctx->vhd_part, buf, count, offset);
+
+       return _std_pwrite64(fd, buf, count, offset);
+}
+
+/*
+ * interposed fsync(): for a vhd fd, sync the fd of the underlying vhd
+ * file instead of the placeholder descriptor.
+ */
+int
+fsync(int fd)
+{
+       vhd_fd_context_t *ctx;
+       static int (*_std_fsync)(int);
+
+       _RESOLVE(_std_fsync);
+       ctx = _libvhd_io_map_get(fd);
+       if (ctx) {
+               LOG("%s 0x%x\n", __func__, fd);
+               return _std_fsync(ctx->vhd_part.vhd_obj->vhd.fd);
+       }
+
+       return _std_fsync(fd);
+}
+
+/*
+ * interposed __xstat(): try the vhd stat first and fall back to the
+ * real __xstat when that fails or interposition is off.
+ */
+int
+__xstat(int version, const char *path, struct stat *buf)
+{
+       static int (*_std___xstat)(int, const char *, struct stat *);
+
+       _RESOLVE(_std___xstat);
+       if (_libvhd_io_interpose) {
+               LOG("%s 0x%x %s %p\n", __func__, version, path, buf);
+
+               if (!_libvhd_io_stat(version, path, buf))
+                       return 0;
+       }
+
+       return _std___xstat(version, path, buf);
+}
+
+/*
+ * interposed __xstat64(): try the vhd stat first and fall back to the
+ * real __xstat64 when that fails or interposition is off.
+ */
+int
+__xstat64(int version, const char *path, struct stat64 *buf)
+{
+       static int (*_std___xstat64)(int, const char *, struct stat64 *);
+
+       _RESOLVE(_std___xstat64);
+       if (_libvhd_io_interpose) {
+               LOG("%s 0x%x %s %p\n", __func__, version, path, buf);
+
+               if (!_libvhd_io_stat64(version, path, buf))
+                       return 0;
+       }
+
+       return _std___xstat64(version, path, buf);
+}
+
+/*
+ * interposed __fxstat(): vhd fds get the emulated block-device stat,
+ * all others the real __fxstat.
+ */
+int
+__fxstat(int version, int fd, struct stat *buf)
+{
+       vhd_fd_context_t *ctx;
+       static int (*_std___fxstat)(int, int, struct stat *);
+
+       _RESOLVE(_std___fxstat);
+       ctx = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x 0x%x %p\n", __func__, version, fd, buf);
+
+       if (!ctx)
+               return _std___fxstat(version, fd, buf);
+
+       return _libvhd_io_fstat(version, &ctx->vhd_part, buf);
+}
+
+/*
+ * interposed __fxstat64(): vhd fds get the emulated block-device stat,
+ * all others the real __fxstat64.
+ */
+int
+__fxstat64(int version, int fd, struct stat64 *buf)
+{
+       vhd_fd_context_t *ctx;
+       static int (*_std___fxstat64)(int, int, struct stat64 *);
+
+       _RESOLVE(_std___fxstat64);
+       ctx = _libvhd_io_map_get(fd);
+
+       LOG("%s 0x%x 0x%x %p\n", __func__, version, fd, buf);
+
+       if (!ctx)
+               return _std___fxstat64(version, fd, buf);
+
+       return _libvhd_io_fstat64(version, &ctx->vhd_part, buf);
+}
+
+/*
+ * NB: symlinks to vhds will be stat'ed rather than lstat'ed.
+ */
+int
+__lxstat(int version, const char *path, struct stat *buf)
+{
+       static int (*_std___lxstat)(int, const char *, struct stat *);
+
+       _RESOLVE(_std___lxstat);
+       if (_libvhd_io_interpose) {
+               LOG("%s 0x%x %s %p\n", __func__, version, path, buf);
+
+               /* vhd stat first; the real lstat only if that fails */
+               if (!_libvhd_io_stat(version, path, buf))
+                       return 0;
+       }
+
+       return _std___lxstat(version, path, buf);
+}
+
+/*
+ * NB: symlinks to vhds will be stat'ed rather than lstat'ed.
+ */
+int
+__lxstat64(int version, const char *path, struct stat64 *buf)
+{
+       static int (*_std___lxstat64)(int, const char *, struct stat64 *);
+
+       _RESOLVE(_std___lxstat64);
+       if (_libvhd_io_interpose) {
+               LOG("%s 0x%x %s %p\n", __func__, version, path, buf);
+
+               /* vhd stat first; the real lstat only if that fails */
+               if (!_libvhd_io_stat64(version, path, buf))
+                       return 0;
+       }
+
+       return _std___lxstat64(version, path, buf);
+}
+
+int
+ioctl(int fd, int request, char *argp)
+{
+       vhd_fd_context_t *vhd_fd;
+       static int (*_std_ioctl)(int, int, char *);
+
+       _RESOLVE(_std_ioctl);
+       vhd_fd = _libvhd_io_map_get(fd);
+       if (!vhd_fd)
+               return _std_ioctl(fd, request, argp);
+
+       LOG("%s 0x%x 0x%x %p\n", __func__, fd, request, argp);
+
+#ifdef BLKGETSIZE64
+       if (request == BLKGETSIZE64) {
+               uint64_t *size = (uint64_t *)argp;
+               *size = vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT;
+               return 0;
+       }
+#endif
+#ifdef BLKGETSIZE
+       if (request == BLKGETSIZE) {
+               unsigned long *size = (unsigned long *)argp;
+               *size = vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT;
+               return 0;
+       }
+#endif
+#ifdef BLKSSZGET
+       if (request == BLKSSZGET) {
+               int *sec_size = (int *)argp;
+               *sec_size = VHD_SECTOR_SIZE;
+               return 0;
+       }
+#endif
+#ifdef HDIO_GETGEO
+       if (request == HDIO_GETGEO) {
+               vhd_context_t *vhd = &vhd_fd->vhd_part.vhd_obj->vhd;
+               struct hd_geometry *geo = (struct hd_geometry *)argp;
+               geo->heads = GEOM_GET_HEADS(vhd->footer.geometry);
+               geo->sectors = GEOM_GET_SPT(vhd->footer.geometry);
+               geo->cylinders = GEOM_GET_CYLS(vhd->footer.geometry);
+               geo->start = vhd_fd->vhd_part.start;
+               return 0;
+       }
+#endif
+
+       return _std_ioctl(fd, request, argp);
+}
+
+/*
+ * interposed fcntl(): for a vhd fd the command is applied to the fd of
+ * the underlying vhd file.  the optional argument is fetched according
+ * to @cmd (none, long, struct flock * or struct flock64 *); commands
+ * not listed below fail with EINVAL.
+ */
+int
+fcntl(int fd, int cmd, ...)
+{
+       int real_fd;
+       va_list args;
+       vhd_fd_context_t *vhd_fd;
+       static int (*_std_fcntl)(int, int, ...);
+
+       _RESOLVE(_std_fcntl);
+
+       /* redirect to the vhd's own descriptor when fd is a vhd fd */
+       real_fd = fd;
+       vhd_fd = _libvhd_io_map_get(fd);
+       if (vhd_fd)
+               real_fd = vhd_fd->vhd_part.vhd_obj->vhd.fd;
+
+       LOG("%s 0x%x 0x%x\n", __func__, fd, cmd);
+
+       switch (cmd) {
+       /* commands without an argument */
+       case F_GETFD:
+       case F_GETFL:
+       case F_GETOWN:
+       case F_GETSIG:
+       case F_GETLEASE:
+               LOG("%s 0x%x void\n", __func__, real_fd);
+               return _std_fcntl(real_fd, cmd);
+
+       /*
+        * commands with an integer argument.
+        * NOTE(review): F_DUPFD on a vhd fd duplicates real_fd and the
+        * result is not entered into the vhd map -- confirm intended.
+        */
+       case F_DUPFD:
+#ifdef F_DUPFD_CLOEXEC
+       case F_DUPFD_CLOEXEC:
+#endif // F_DUPFD_CLOEXEC
+       case F_SETFD:
+       case F_SETFL:
+       case F_SETOWN:
+       case F_SETSIG:
+       case F_SETLEASE:
+       case F_NOTIFY:
+       {
+               long arg;
+               va_start(args, cmd);
+               arg = va_arg(args, long);
+               va_end(args);
+               LOG("%s 0x%x long 0x%lx\n", __func__, real_fd, arg);
+               return _std_fcntl(real_fd, cmd, arg);
+       }
+
+       /* record locking, struct flock */
+       case F_SETLK:
+       case F_SETLKW:
+       case F_GETLK:
+       {
+               struct flock *flk;
+               va_start(args, cmd);
+               flk = va_arg(args, struct flock *);
+               va_end(args);
+               LOG("%s 0x%x lock %p\n", __func__, real_fd, flk);
+               return _std_fcntl(real_fd, cmd, flk);
+       }
+
+       /* record locking, struct flock64 */
+       case F_SETLK64:
+       case F_SETLKW64:
+       case F_GETLK64:
+       {
+               struct flock64 *flk;
+               va_start(args, cmd);
+               flk = va_arg(args, struct flock64 *);
+               va_end(args);
+               LOG("%s 0x%x lock64 %p (%p)\n",
+                   __func__, real_fd, flk, _std_fcntl);
+               return _std_fcntl(real_fd, cmd, flk);
+       }
+
+       default:
+               LOG("%s unrecognized cmd\n", __func__);
+               errno = EINVAL;
+               return -1;
+       }
+}
+
+/*
+ * fopen() interposer.
+ *
+ * The request goes to _libvhd_io_fopen() only when interposition is
+ * enabled (_libvhd_io_interpose is non-zero) and the mode string contains
+ * no 'w'.  In every other case it is handed, untouched, to the function
+ * pointer filled in by _RESOLVE.
+ *
+ * Returns the stream produced by whichever implementation was used.
+ */
+FILE *
+fopen(const char *path, const char *mode)
+{
+       static _std_fopen_t _std_fopen;
+       FILE *stream;
+
+       _RESOLVE(_std_fopen);
+
+       if (_libvhd_io_interpose && !strchr(mode, 'w')) {
+               stream = _libvhd_io_fopen(path, mode);
+
+               LOG("%s %s %s: 0x%x\n", __func__, path, mode,
+                   (stream ? fileno(stream) : -1));
+
+               return stream;
+       }
+
+       return _std_fopen(path, mode);
+}
+
+/*
+ * fopen64() interposer.
+ *
+ * The request goes to _libvhd_io_fopen() only when interposition is
+ * enabled (_libvhd_io_interpose is non-zero) and the mode string contains
+ * no 'w'.  In every other case it is handed, untouched, to the function
+ * pointer filled in by _RESOLVE.
+ *
+ * Returns the stream produced by whichever implementation was used.
+ */
+FILE *
+fopen64(const char *path, const char *mode)
+{
+       static _std_fopen_t _std_fopen64;
+       FILE *stream;
+
+       _RESOLVE(_std_fopen64);
+
+       if (_libvhd_io_interpose && !strchr(mode, 'w')) {
+               stream = _libvhd_io_fopen(path, mode);
+
+               LOG("%s %s %s: 0x%x\n", __func__, path, mode,
+                   (stream ? fileno(stream) : -1));
+
+               return stream;
+       }
+
+       return _std_fopen64(path, mode);
+}
+
+/*
+ * _IO_getc() interposer.
+ *
+ * Streams whose descriptor is not in the libvhdio fd map are passed to the
+ * function pointer filled in by _RESOLVE.  For mapped streams one byte is
+ * read through _libvhd_io_pread() at the context's current offset, and the
+ * offset is advanced by the number of bytes read.
+ *
+ * Returns the byte as an unsigned char converted to int, or EOF when the
+ * read yields no data (end of data or error).
+ */
+int
+_IO_getc(FILE *f)
+{
+       int cnt;
+       unsigned char c;
+       vhd_fd_context_t *vhd_fd;
+       static int (*_std__IO_getc)(FILE *);
+
+       _RESOLVE(_std__IO_getc);
+       vhd_fd = _libvhd_io_map_get(fileno(f));
+       if (!vhd_fd)
+               return _std__IO_getc(f);
+
+       LOG("%s %p (0x%x)\n", __func__, f, fileno(f));
+       cnt = _libvhd_io_pread(&vhd_fd->vhd_part, &c, sizeof(c), vhd_fd->off);
+
+       /* nothing was stored in c: never hand back its indeterminate value */
+       if (cnt <= 0)
+               return EOF;
+
+       vhd_fd->off += cnt;
+
+       return (int)c;
+}
+
+/*
+ * _IO_getc_unlocked() interposer.
+ *
+ * Any macro of the same name is dropped first so that a real function can
+ * be defined; the function simply forwards to _IO_getc().
+ */
+#undef _IO_getc_unlocked
+int
+_IO_getc_unlocked(FILE *f)
+{
+       int ch;
+
+       ch = _IO_getc(f);
+
+       return ch;
+}
+
+/*
+ * fread() interposer.
+ *
+ * Streams whose descriptor is not in the libvhdio fd map are passed to the
+ * function pointer filled in by _RESOLVE.  For mapped streams up to
+ * n * size bytes are read through _libvhd_io_pread() at the context's
+ * current offset, and the offset is advanced by the number of bytes read.
+ *
+ * Returns the number of complete items read.  This is 0 when size or n is
+ * zero, and 0 when the read returns no data or fails, so a negative byte
+ * count is never converted into a huge item count.
+ */
+size_t
+fread(void *buf, size_t size, size_t n, FILE *f)
+{
+       ssize_t cnt;
+       vhd_fd_context_t *vhd_fd;
+       static size_t (*_std_fread)(void *, size_t, size_t, FILE *);
+
+       _RESOLVE(_std_fread);
+       vhd_fd = _libvhd_io_map_get(fileno(f));
+       if (!vhd_fd)
+               return _std_fread(buf, size, n, f);
+
+       /* size and n are size_t, so they need the z length modifier */
+       LOG("%s %p 0x%zx 0x%zx %p (0x%x)\n",
+           __func__, buf, size, n, f, fileno(f));
+
+       /* nothing to transfer; also keeps the division below well defined */
+       if (!size || !n)
+               return 0;
+
+       cnt = _libvhd_io_pread(&vhd_fd->vhd_part, buf, n * size, vhd_fd->off);
+       if (cnt <= 0)
+               return 0;
+
+       vhd_fd->off += cnt;
+
+       return (size_t)cnt / size;
+}
+
+/*
+ * fread_unlocked() interposer.
+ *
+ * Any macro of the same name is dropped first so that a real function can
+ * be defined; the function simply forwards to fread().
+ */
+#undef fread_unlocked
+size_t
+fread_unlocked(void *buf, size_t size, size_t n, FILE *f)
+{
+       size_t items;
+
+       items = fread(buf, size, n, f);
+
+       return items;
+}
+
+/*
+ * Preloading this library into bash is troublesome: bash carries its own
+ * malloc(), memalign() and free(), but no posix_memalign(), which causes
+ * problems when libvhd free()s memory obtained from posix_memalign().
+ * posix_memalign() is therefore provided here, built on memalign().
+ *
+ * Returns 0 on success and stores the allocation in *memptr.  Returns
+ * EINVAL when alignment is zero, is not a multiple of sizeof(void *), or
+ * is not a power-of-two multiple of it.  Returns ENOMEM when memalign()
+ * fails, in which case *memptr is set to NULL.
+ */
+#define _libvhd_power_of_2(x) (((x) & ((x) - 1)) == 0)
+int
+posix_memalign(void **memptr, size_t alignment, size_t size)
+{
+       void *mem;
+
+       if (alignment == 0)
+               return EINVAL;
+
+       if (alignment % sizeof(void *) != 0)
+               return EINVAL;
+
+       if (!_libvhd_power_of_2(alignment / sizeof(void *)))
+               return EINVAL;
+
+       mem = memalign(alignment, size);
+       *memptr = mem;
+
+       return mem ? 0 : ENOMEM;
+}