]> xenbits.xensource.com Git - people/liuw/libxenctrl-split/xen.git/commitdiff
blktap2: a completely rewritten blktap implementation
authorKeir Fraser <keir.fraser@citrix.com>
Tue, 26 May 2009 10:52:31 +0000 (11:52 +0100)
committerKeir Fraser <keir.fraser@citrix.com>
Tue, 26 May 2009 10:52:31 +0000 (11:52 +0100)
Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
   the linux dom0 command line, rather than being spawned in response
   to XenStore events.  This is handy for debugging, makes blktap
   generally easier to work with, and is a step toward a generic
   user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
   request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management.  No
   allocations on the block data path, IO retry logic to protect
   guests from transient block device failures.  This has been tested
   and is known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support.  The VHD code in this release has been rigorously
   tested, and represents a very mature implementation of the VHD
   image format.

* No more duplication of mechanism with blkback.  The blktap kernel
   module has changed dramatically from the original blktap.  Blkback
   is now always used to talk to Xen guests, blktap just presents a
   Linux gendisk that blkback can export.  This is done while
   preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
108 files changed:
.hgignore
tools/Makefile
tools/blktap2/Makefile [new file with mode: 0644]
tools/blktap2/README [new file with mode: 0644]
tools/blktap2/daemon/Makefile [new file with mode: 0644]
tools/blktap2/daemon/lib/Makefile [new file with mode: 0644]
tools/blktap2/daemon/lib/xs_api.c [new file with mode: 0644]
tools/blktap2/daemon/lib/xs_api.h [new file with mode: 0644]
tools/blktap2/daemon/tapdisk-channel.c [new file with mode: 0644]
tools/blktap2/daemon/tapdisk-daemon.c [new file with mode: 0644]
tools/blktap2/daemon/tapdisk-dispatch-common.c [new file with mode: 0644]
tools/blktap2/daemon/tapdisk-dispatch.h [new file with mode: 0644]
tools/blktap2/drivers/Makefile [new file with mode: 0644]
tools/blktap2/drivers/aes.c [new file with mode: 0644]
tools/blktap2/drivers/aes.h [new file with mode: 0644]
tools/blktap2/drivers/atomicio.c [new file with mode: 0644]
tools/blktap2/drivers/blk.h [new file with mode: 0644]
tools/blktap2/drivers/blk_linux.c [new file with mode: 0644]
tools/blktap2/drivers/blktap2.h [new file with mode: 0644]
tools/blktap2/drivers/block-aio.c [new file with mode: 0644]
tools/blktap2/drivers/block-cache.c [new file with mode: 0644]
tools/blktap2/drivers/block-log.c [new file with mode: 0644]
tools/blktap2/drivers/block-qcow.c [new file with mode: 0644]
tools/blktap2/drivers/block-ram.c [new file with mode: 0644]
tools/blktap2/drivers/block-vhd.c [new file with mode: 0644]
tools/blktap2/drivers/bswap.h [new file with mode: 0644]
tools/blktap2/drivers/check_gcrypt [new file with mode: 0644]
tools/blktap2/drivers/disktypes.h [new file with mode: 0644]
tools/blktap2/drivers/img2qcow.c [new file with mode: 0644]
tools/blktap2/drivers/io-optimize.c [new file with mode: 0644]
tools/blktap2/drivers/io-optimize.h [new file with mode: 0644]
tools/blktap2/drivers/lock.c [new file with mode: 0644]
tools/blktap2/drivers/lock.h [new file with mode: 0644]
tools/blktap2/drivers/log.h [new file with mode: 0644]
tools/blktap2/drivers/profile.h [new file with mode: 0644]
tools/blktap2/drivers/qcow-create.c [new file with mode: 0644]
tools/blktap2/drivers/qcow.h [new file with mode: 0644]
tools/blktap2/drivers/qcow2raw.c [new file with mode: 0644]
tools/blktap2/drivers/scheduler.c [new file with mode: 0644]
tools/blktap2/drivers/scheduler.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-client.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-diff.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-driver.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-driver.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-filter.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-filter.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-image.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-image.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-interface.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-interface.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-ipc.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-ipc.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-log.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-log.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-queue.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-queue.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-ring.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-ring.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-server.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-server.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-stream.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-utils.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-utils.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-vbd.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk-vbd.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk.c [new file with mode: 0644]
tools/blktap2/drivers/tapdisk.h [new file with mode: 0644]
tools/blktap2/drivers/tapdisk2.c [new file with mode: 0644]
tools/blktap2/drivers/td.c [new file with mode: 0644]
tools/blktap2/drivers/xmsnap [new file with mode: 0644]
tools/blktap2/include/Makefile [new file with mode: 0644]
tools/blktap2/include/atomicio.h [new file with mode: 0644]
tools/blktap2/include/blktaplib.h [new file with mode: 0644]
tools/blktap2/include/libvhd-journal.h [new file with mode: 0644]
tools/blktap2/include/libvhd.h [new file with mode: 0644]
tools/blktap2/include/list.h [new file with mode: 0644]
tools/blktap2/include/lvm-util.h [new file with mode: 0644]
tools/blktap2/include/relative-path.h [new file with mode: 0644]
tools/blktap2/include/tapdisk-message.h [new file with mode: 0644]
tools/blktap2/include/vhd-util.h [new file with mode: 0644]
tools/blktap2/include/vhd.h [new file with mode: 0644]
tools/blktap2/lvm/Makefile [new file with mode: 0644]
tools/blktap2/lvm/lvm-util.c [new file with mode: 0644]
tools/blktap2/vhd/Makefile [new file with mode: 0644]
tools/blktap2/vhd/lib/Makefile [new file with mode: 0644]
tools/blktap2/vhd/lib/atomicio.c [new file with mode: 0644]
tools/blktap2/vhd/lib/libvhd-journal.c [new file with mode: 0644]
tools/blktap2/vhd/lib/libvhd.c [new file with mode: 0644]
tools/blktap2/vhd/lib/relative-path.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-check.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-coalesce.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-create.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-fill.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-modify.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-query.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-read.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-repair.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-resize.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-revert.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-scan.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-set-field.c [new file with mode: 0644]
tools/blktap2/vhd/lib/vhd-util-snapshot.c [new file with mode: 0644]
tools/blktap2/vhd/vhd-update.c [new file with mode: 0644]
tools/blktap2/vhd/vhd-util.c [new file with mode: 0644]
tools/check/check_uuid_devel [new file with mode: 0755]
tools/python/xen/xend/XendDomainInfo.py
tools/python/xen/xend/server/BlktapController.py
tools/python/xen/xend/server/DevController.py

index 1b798d15f433d1352895019dfa8ed1162a9760e3..966c180e384bbe13683c45b0e9bc4b939119d23a 100644 (file)
--- a/.hgignore
+++ b/.hgignore
 ^stubdom/lwip/
 ^stubdom/ioemu/
 ^tools/.*/build/lib.*/.*\.py$
-^tools/blktap/Makefile\.smh$
+^tools/blktap2/daemon/blktapctrl$
+^tools/blktap2/drivers/img2qcow$
+^tools/blktap2/drivers/lock-util$
+^tools/blktap2/drivers/qcow-create$
+^tools/blktap2/drivers/qcow2raw$
+^tools/blktap2/drivers/tapdisk$
+^tools/blktap2/drivers/tapdisk-client$
+^tools/blktap2/drivers/tapdisk-diff$
+^tools/blktap2/drivers/tapdisk-stream$
+^tools/blktap2/drivers/tapdisk2$
+^tools/blktap2/drivers/td-util$
+^tools/blktap2/vhd/vhd-update$
+^tools/blktap2/vhd/vhd-util$
 ^tools/blktap/drivers/blktapctrl$
 ^tools/blktap/drivers/img2qcow$
 ^tools/blktap/drivers/qcow-create$
index 3209f2f8bd5891ef443c232941d411ad454c8924..dff96a5c76d87891174ad76fd5d047281b8fc426 100644 (file)
@@ -22,6 +22,7 @@ SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-y += xenstat
 SUBDIRS-$(CONFIG_Linux) += libaio
 SUBDIRS-$(CONFIG_Linux) += blktap
+SUBDIRS-$(CONFIG_Linux) += blktap2
 SUBDIRS-y += libfsimage
 SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
 SUBDIRS-$(CONFIG_Linux) += fs-back
diff --git a/tools/blktap2/Makefile b/tools/blktap2/Makefile
new file mode 100644 (file)
index 0000000..20a9451
--- /dev/null
@@ -0,0 +1,34 @@
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += $(CFLAGS_libxenctrl)
+LDFLAGS += $(LDFLAGS_libxenctrl)
+
+SUBDIRS-y :=
+SUBDIRS-y += include
+SUBDIRS-y += lvm
+SUBDIRS-y += vhd
+SUBDIRS-y += drivers
+SUBDIRS-y += daemon
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build:
+       @set -e; for subdir in $(SUBDIRS-y); do \
+       $(MAKE) -C $$subdir all;       \
+               done
+
+.PHONY: install
+install:
+       @set -e; for subdir in $(SUBDIRS-y); do \
+               $(MAKE) -C $$subdir install; \
+       done
+
+.PHONY: clean
+clean:
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) TAGS
+       @set -e; for subdir in $(SUBDIRS-y); do \
+       $(MAKE) -C $$subdir clean;       \
+               done
diff --git a/tools/blktap2/README b/tools/blktap2/README
new file mode 100644 (file)
index 0000000..5e41080
--- /dev/null
@@ -0,0 +1,122 @@
+Blktap Userspace Tools + Library
+================================
+
+Andrew Warfield and Julian Chesterfield
+16th June 2006
+
+{firstname.lastname}@cl.cam.ac.uk
+
+The blktap userspace toolkit provides a user-level disk I/O
+interface. The blktap mechanism involves a kernel driver that acts
+similarly to the existing Xen/Linux blkback driver, and a set of
+associated user-level libraries.  Using these tools, blktap allows
+virtual block devices presented to VMs to be implemented in userspace
+and to be backed by raw partitions, files, network, etc.
+
+The key benefit of blktap is that it makes it easy and fast to write
+arbitrary block backends, and that these user-level backends actually
+perform very well.  Specifically:
+
+- Metadata disk formats such as Copy-on-Write, encrypted disks, sparse
+  formats and other compression features can be easily implemented.
+
+- Accessing file-based images from userspace avoids problems related
+  to flushing dirty pages which are present in the Linux loopback
+  driver.  (Specifically, doing a large number of writes to an
+  NFS-backed image doesn't result in the OOM killer going berserk.)
+
+- Per-disk handler processes enable easier userspace policing of block
+  resources, and process-granularity QoS techniques (disk scheduling
+  and related tools) may be trivially applied to block devices.
+
+- It's very easy to take advantage of userspace facilities such as
+  networking libraries, compression utilities, peer-to-peer
+  file-sharing systems and so on to build more complex block backends.
+
+- Crashes are contained -- incremental development/debugging is very
+  fast.
+
+How it works (in one paragraph):
+
+Working in conjunction with the kernel blktap driver, all disk I/O
+requests from VMs are passed to the userspace daemon (using a shared
+memory interface) through a character device. Each active disk is
+mapped to an individual device node, allowing per-disk processes to
+implement individual block devices where desired.  The userspace
+drivers are implemented using asynchronous (Linux libaio),
+O_DIRECT-based calls to preserve the unbuffered, batched and
+asynchronous request dispatch achieved with the existing blkback
+code.  We provide a simple, asynchronous virtual disk interface that
+makes it quite easy to add new disk implementations.
+
+As of June 2006 the current supported disk formats are:
+
+ - Raw Images (both on partitions and in image files)
+ - File-backed Qcow disks
+ - Standalone sparse Qcow disks
+ - Fast shareable RAM disk between VMs (requires some form of cluster-based 
+   filesystem support e.g. OCFS2 in the guest kernel)
+ - Some VMDK images - your mileage may vary
+
+Raw and QCow images have asynchronous backends and so should perform
+fairly well.  VMDK is based directly on the qemu vmdk driver, which is
+synchronous (a.k.a. slow).
+
+Build and Installation Instructions
+===================================
+
+Make sure to configure the blktap backend driver in your dom0 kernel.  It
+will cooperate fine with the existing backend driver, so you can
+experiment with tap disks without breaking existing VM configs.
+
+To build the tools separately, "make && make install" in 
+tools/blktap.
+
+
+Using the Tools
+===============
+
+Prepare the image for booting. For qcow files use the qcow utilities
+installed earlier. e.g. qcow-create generates a blank standalone image
+or a file-backed CoW image. img2qcow takes an existing image or
+partition and creates a sparse, standalone qcow-based file.
+
+The userspace disk agent is configured to start automatically via xend
+(alternatively you can start it manually => 'blktapctrl')
+
+Customise the VM config file to use the 'tap' handler, followed by the
+driver type. e.g. for a raw image such as a file or partition:
+
+disk = ['tap:aio:<FILENAME>,sda1,w']
+
+e.g. for a qcow image:
+
+disk = ['tap:qcow:<FILENAME>,sda1,w']
+
+
+Mounting images in Dom0 using the blktap driver
+===============================================
+Tap (and blkback) disks are also mountable in Dom0 without requiring an
+active VM to attach. You will need to build a xenlinux Dom0 kernel that
+includes the blkfront driver (e.g. the default 'make world' or 
+'make kernels' build). Simply use the xm command-line tool to activate
+the backend disks, and blkfront will generate a virtual block device that
+can be accessed in the same way as a loop device or partition:
+
+e.g. for a raw image file <FILENAME> that would normally be mounted using
+the loopback driver (such as 'mount -o loop <FILENAME> /mnt/disk'), do the
+following:
+
+xm block-attach 0 tap:aio:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk        <--- don't use loop driver
+
+In this way, you can use any of the userspace device-type drivers built
+with the blktap userspace toolkit to open and mount disks such as qcow
+or vmdk images:
+
+xm block-attach 0 tap:qcow:<FILENAME> /dev/xvda1 w 0
+mount /dev/xvda1 /mnt/disk
+
+
+
diff --git a/tools/blktap2/daemon/Makefile b/tools/blktap2/daemon/Makefile
new file mode 100644 (file)
index 0000000..a7869b6
--- /dev/null
@@ -0,0 +1,55 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+IBIN          = blktapctrl
+INST_DIR      = $(SBINDIR)
+
+LIBDIR        = lib
+
+LIBS         := -lxenstore
+LIBS         += -Llib
+LIBS         += -lblktap
+LIBS         += -lxenctrl
+
+ifneq ($(USE_SYSTEM_LIBRARIES),y)
+INCLUDES     += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+LIBS         += -L $(XEN_LIBXC) -L $(XEN_XENSTORE)
+endif
+
+OBJS         := tapdisk-dispatch-common.o
+OBJS         += tapdisk-channel.o
+
+CFLAGS       += -Werror
+CFLAGS       += -Wno-unused
+CFLAGS       += -fno-strict-aliasing -fPIC
+CFLAGS       += -Ilib -I../include -I../drivers -I../../include $(INCLUDES)
+CFLAGS       += -D_GNU_SOURCE
+CFLAGS       += -g
+
+# Get gcc to generate the dependencies for us.
+CFLAGS       += -Wp,-MD,.$(@F).d
+DEPS          = .*.d
+
+all: libblktap $(IBIN)
+
+blktapctrl: tapdisk-daemon.c $(OBJS)
+       $(CC) $(CFLAGS) -o blktapctrl tapdisk-daemon.c $(LIBS) $(OBJS)
+
+libblktap:
+       @set -e
+       $(MAKE) -C $(LIBDIR) all
+
+install: all
+       $(MAKE) -C $(LIBDIR) install
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean:
+       $(MAKE) -C $(LIBDIR) clean
+       rm -rf *.o *~ $(IBIN) $(DEPS) xen TAGS
+
+.PHONY: all clean install blktapctrl libblktap
+
+-include $(DEPS)
+
diff --git a/tools/blktap2/daemon/lib/Makefile b/tools/blktap2/daemon/lib/Makefile
new file mode 100644 (file)
index 0000000..e4e289a
--- /dev/null
@@ -0,0 +1,69 @@
+XEN_ROOT=../../../../
+BLKTAP_ROOT := ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+MAJOR    = 3.1
+MINOR    = 0
+SONAME   = libblktap.so.$(MAJOR)
+
+BLKTAP_INSTALL_DIR = /usr/sbin
+
+LIBS     := -lxenstore
+
+ifneq ($(USE_SYSTEM_LIBRARIES),y)
+INCLUDES += -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
+LIBS     += -L$(XEN_XENSTORE)
+endif
+
+SRCS     :=
+SRCS     += xs_api.c
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+CFLAGS   += -fno-strict-aliasing -fPIC
+# get asprintf():
+CFLAGS   += -D _GNU_SOURCE
+CFLAGS   += -g
+CFLAGS   += -I../../include -I../../../include/ $(INCLUDES) 
+
+
+# Get gcc to generate the dependencies for us.
+CFLAGS  += -Wp,-MD,.$(@F).d
+DEPS     = .*.d
+
+OBJS     = $(patsubst %.c,%.o,$(SRCS))
+IBINS   :=
+
+LIB      = libblktap.a libblktap.so.$(MAJOR).$(MINOR)
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build: libblktap.a
+
+.PHONY: libblktap
+libblktap: libblktap.a
+
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(LIBDIR)
+       $(INSTALL_DATA) $(LIB) $(DESTDIR)$(LIBDIR)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libblktap.so.$(MAJOR)
+       ln -sf libblktap.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libblktap.so
+
+clean:
+       rm -rf *.a *.so* *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS
+
+libblktap.a: $(OBJS) 
+       $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,$(SONAME) $(SHLIB_CFLAGS) \
+             -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
+       ln -sf libblktap.so.$(MAJOR) libblktap.so
+       $(AR) rc $@ libblktap.so
+
+.PHONY: TAGS all build clean install libblktap
+
+TAGS:
+       etags -t $(SRCS) *.h
+
+-include $(DEPS)
+
diff --git a/tools/blktap2/daemon/lib/xs_api.c b/tools/blktap2/daemon/lib/xs_api.c
new file mode 100644 (file)
index 0000000..2a7d6ac
--- /dev/null
@@ -0,0 +1,323 @@
+/*
+ * xs_api.c
+ * 
+ * blocktap interface functions to xenstore
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <xs.h>
+
+#include "xs_api.h"
+#include "blktaplib.h"
+
+#define DOMNAME "Domain-0"
+#define BASE_DEV_VAL 2048
+
+static LIST_HEAD(watches);
+/*
+ * xs_gather: read several nodes below @dir inside one xenstore transaction.
+ *
+ * The variadic tail is a sequence of (name, fmt, result) triples terminated
+ * by a NULL name:
+ *   name   - node name; the path read is "<dir>/<name>"
+ *   fmt    - sscanf() format used to parse the value into @result, or NULL
+ *   result - destination for the parsed value; when fmt is NULL it is
+ *            treated as a char ** and receives the string returned by
+ *            xs_read(), which is not freed here
+ *
+ * Returns 0 on success or a positive errno value: ENOMEM if the transaction
+ * cannot be started or the path cannot be allocated, ENOENT if a node cannot
+ * be read, EINVAL if sscanf() converts nothing, or errno from a failed
+ * xs_transaction_end().  Processing stops at the first failing triple.
+ *
+ * If ending the transaction fails with EAGAIN while ret is still 0, the
+ * whole read sequence is restarted from the "again" label.
+ *
+ * NOTE(review): on such a restart, strings already stored through a
+ * NULL-fmt triple are overwritten without being freed.
+ * NOTE(review): only a sscanf() result of 0 is treated as an error; EOF is
+ * not checked.
+ * NOTE(review): the locals e, num and i are declared but never used.
+ */
+int
+xs_gather(struct xs_handle *xs, const char *dir, ...)
+{
+       va_list ap;
+       const char *name;
+       char *path, **e;
+       int ret = 0, num,i;
+       unsigned int len;
+       xs_transaction_t xth;
+
+again:
+       if ((xth = xs_transaction_start(xs)) == XBT_NULL) {
+               DPRINTF("unable to start xs trasanction\n");
+               ret = ENOMEM;
+               return ret;
+       }
+
+       va_start(ap, dir);
+       while ((ret == 0) && (name = va_arg(ap, char *)) != NULL) {
+               char *p;
+               const char *fmt = va_arg(ap, char *);
+               void *result = va_arg(ap, void *);
+               
+               if (asprintf(&path, "%s/%s", dir, name) == -1) {
+                       EPRINTF("allocation error in xs_gather!\n");
+                       ret = ENOMEM;
+                       break;
+               }
+
+               p = xs_read(xs, xth, path, &len);
+               free(path);
+
+               if (!p) {
+                       ret = ENOENT;
+                       break;
+               }
+
+               if (fmt) {
+                       if (sscanf(p, fmt, result) == 0)
+                               ret = EINVAL;
+                       free(p);
+               } else
+                       *(char **)result = p; /* hand the raw string to the caller */
+       }
+
+       va_end(ap);
+
+       /* ret is passed as the third argument, so a non-zero ret ends the
+        * transaction differently from a clean run (presumably as an abort
+        * flag -- confirm against xs.h) */
+       if (!xs_transaction_end(xs, xth, ret)) {
+               if (ret == 0 && errno == EAGAIN)
+                       goto again;
+               else
+                       ret = errno;
+       }
+
+       return ret;
+}
+
+/*
+ * xs_printf: format a value and write it to "<dir>/<node>" in one call.
+ *
+ * The value is built with vasprintf() from @fmt and the variadic arguments,
+ * then written with xs_write() outside any transaction (XBT_NULL).  The
+ * terminating NUL is included in the written length (strlen(buf) + 1).
+ *
+ * Returns 0 if either the value or the path cannot be allocated; otherwise
+ * returns whatever xs_write() returned.  Note that this is not a
+ * "-errno or 0" convention: 0 is the failure value on the allocation paths.
+ */
+int
+xs_printf(struct xs_handle *h, const char *dir,
+         const char *node, const char *fmt, ...)
+{
+       int ret;
+       va_list ap;
+       char *buf, *path;
+
+       va_start(ap, fmt);
+       ret = vasprintf(&buf, fmt, ap);
+       va_end(ap);
+
+       if (ret == -1)
+               return 0;
+
+       ret = asprintf(&path, "%s/%s", dir, node);
+       if (ret == -1) {
+               free(buf);
+               return 0;
+       }
+
+       ret = xs_write(h, XBT_NULL, path, buf, strlen(buf)+1);
+
+       free(buf);
+       free(path);
+
+       return ret;
+}
+/*
+ * xs_exists: test whether @path can be listed in xenstore.
+ *
+ * Opens a transaction, calls xs_directory() on @path and ends the
+ * transaction again (its result is not checked).
+ *
+ * Returns 1 if xs_directory() returned a listing, 0 if it returned NULL or
+ * if the transaction could not be started.
+ */
+int
+xs_exists(struct xs_handle *h, const char *path)
+{
+       char **d;
+       unsigned int num;
+       xs_transaction_t xth;
+
+       if ((xth = xs_transaction_start(h)) == XBT_NULL) {
+               EPRINTF("unable to start xs trasanction\n");
+               return 0;
+       }
+
+       d = xs_directory(h, xth, path, &num);
+       xs_transaction_end(h, xth, 0);
+       if (!d)
+               return 0;
+
+       free(d); /* only existence matters; the listing itself is discarded */
+       return 1;
+}
+
+
+
+/**
+ * get_dom_domid: look up the domid string of the domain named DOMNAME
+ * ("Domain-0").
+ *
+ * Within one transaction, lists "/local/domain", reads each entry's "name"
+ * node and, for the first entry whose name equals DOMNAME, reads its "domid"
+ * node.  This assumes that the domain name we are looking for is unique.
+ *
+ * Returns the string obtained from xs_read() for the domid node, or NULL if
+ * the transaction cannot be started, the directory cannot be listed, a path
+ * allocation fails, or no matching domain yields a domid.  The returned
+ * string is not freed here (presumably the caller must free() it, like the
+ * other xs_read() results in this function -- confirm).
+ */
+char *
+get_dom_domid(struct xs_handle *h)
+{
+       int i;
+       xs_transaction_t xth;
+       unsigned int num, len;
+       char *val, *path, *domid, **e;
+
+       e     = NULL;
+       domid = NULL;
+
+       if ((xth = xs_transaction_start(h)) == XBT_NULL) {
+               EPRINTF("unable to start xs trasanction\n");
+               return NULL;
+       }
+
+       e = xs_directory(h, xth, "/local/domain", &num);
+       if (e == NULL)
+               goto done;
+
+       /* stop at the first entry that produced a domid */
+       for (i = 0; (i < num) && (domid == NULL); i++) {
+               if (asprintf(&path, "/local/domain/%s/name", e[i]) == -1)
+                       break;
+
+               val = xs_read(h, xth, path, &len);
+               free(path);
+               if (val == NULL)
+                       continue;
+
+               if (strcmp(val, DOMNAME) == 0) {
+                       /* match! */
+                       if (asprintf(&path, 
+                                    "/local/domain/%s/domid", e[i]) == -1) {
+                               free(val);
+                               break;
+                       }
+                       domid = xs_read(h, xth, path, &len);
+                       free(path);
+               }
+               free(val);
+       }
+
+ done:
+       xs_transaction_end(h, xth, 0);
+       free(e); /* e is NULL when the listing failed */
+       return domid;
+}
+
+/*
+ * find_watch: map a watch token back to its registered xenbus_watch.
+ *
+ * A token has the form "<addr>:<nonce>" in hexadecimal, as produced by
+ * register_xenbus_watch().  A little paranoia: we don't just trust the
+ * token.  The parsed address is never dereferenced directly; it is only
+ * compared against the entries on the watches list, and the entry's nonce
+ * must match as well.
+ *
+ * Returns the matching list entry, or NULL if the token cannot be parsed
+ * (logged) or no registered watch matches.
+ */
+static struct xenbus_watch *find_watch(const char *token)
+{
+       int ret;
+       long nonce;
+       unsigned long addr;
+       struct xenbus_watch *i, *cmp;
+
+       ret = sscanf(token, "%lX:%lX", &addr, &nonce);
+       if (ret != 2) {
+               EPRINTF("invalid watch token %s\n", token);
+               return NULL;
+       }
+
+       cmp = (struct xenbus_watch *)addr;
+       list_for_each_entry(i, &watches, list)
+               if (i == cmp && i->nonce == nonce)
+                       return i;
+
+       return NULL;
+}
+
+/*
+ * register_xenbus_watch: register a callback to watch watch->node.
+ *
+ * Stamps the watch with a nonce (the current time), builds the token
+ * "<watch address>:<nonce>" in hex, asks xenstore to watch watch->node with
+ * that token and, on success, links the watch into the watches list.
+ *
+ * Unlike xs_watch(), whose zero result is treated as failure below, this
+ * function returns 0 on success and -EINVAL on failure (token already
+ * registered, or xs_watch() failed).
+ */
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+       /* Pointer in ascii is the token: two hex digits per byte of the
+        * pointer and of the long nonce, plus ':' and the terminating NUL. */
+       char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+       /* 1-second granularity should suffice here */
+       watch->nonce = time(NULL);
+
+       sprintf(token, "%lX:%lX", (long)watch, watch->nonce);
+       if (find_watch(token)) {
+               EPRINTF("watch collision!\n");
+               return -EINVAL;
+       }
+
+       if (!xs_watch(h, watch->node, token)) {
+               EPRINTF("unable to set watch!\n");
+               return -EINVAL;
+       }
+
+       list_add(&watch->list, &watches);
+
+       return 0;
+}
+/*
+ * unregister_xenbus_watch: remove a watch added by register_xenbus_watch().
+ *
+ * Rebuilds the token from the watch address and its stored nonce and
+ * verifies that it is on the watches list.  A failing xs_unwatch() is only
+ * logged; the watch is unlinked from the list regardless.
+ *
+ * Returns 0 once the watch has been unlinked, or -EINVAL if it was not
+ * registered.
+ */
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+       char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+       sprintf(token, "%lX:%lX", (long)watch, watch->nonce);
+       if (!find_watch(token)) {
+               EPRINTF("no such watch!\n");
+               return -EINVAL;
+       }
+
+       if (!xs_unwatch(h, watch->node, token))
+               EPRINTF("XENBUS Failed to release watch %s\n", watch->node);
+
+       list_del(&watch->list);
+
+       return 0;
+}
+
+/*
+ * reregister_xenbus_watches: re-register callbacks to all watches.
+ *
+ * Walks the watches list and issues xs_watch() again on @h for every entry,
+ * reusing each watch's existing nonce so the tokens stay the same.  The
+ * result of xs_watch() is ignored, so failures are not reported.
+ */
+void reregister_xenbus_watches(struct xs_handle *h)
+{
+       struct xenbus_watch *watch;
+       char token[(sizeof(watch) + sizeof(long)) * 2 + 2];
+
+       list_for_each_entry(watch, &watches, list) {
+               sprintf(token, "%lX:%lX", (long)watch, watch->nonce);
+               xs_watch(h, watch->node, token);
+       }
+}
+
+/*
+ * xs_fire_next_watch: read one watch event and dispatch it.
+ * (based on watch_thread())
+ *
+ * Reads the next event with xs_read_watch(), resolves its token through
+ * find_watch() and, if a registered watch matches, invokes its callback
+ * with the handle, the watch and the path reported by the event.  Events
+ * whose token matches no registered watch are dropped silently.
+ *
+ * Returns 1 after an event has been consumed (whether or not a callback
+ * ran), or -EAGAIN if xs_read_watch() returned NULL.
+ */
+int xs_fire_next_watch(struct xs_handle *h)
+{
+       unsigned int num;
+       struct xenbus_watch *w;
+       char **res, *token, *node = NULL;
+
+       res = xs_read_watch(h, &num);
+       if (res == NULL) 
+               return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
+
+       node  = res[XS_WATCH_PATH];
+       token = res[XS_WATCH_TOKEN];
+       DPRINTF("got watch %s on %s\n", token, node);
+
+       w = find_watch(token);
+       if (w) 
+               w->callback(h, w, node);
+
+       DPRINTF("handled watch %s on %s\n", token, node);
+
+       /* node and token are entries of res, so free it only after last use */
+       free(res);
+
+       return 1;
+}
diff --git a/tools/blktap2/daemon/lib/xs_api.h b/tools/blktap2/daemon/lib/xs_api.h
new file mode 100644 (file)
index 0000000..e6f055a
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * xs_api.h
+ *
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XS_API_H_
+#define _XS_API_H_
+
+#include <xs.h>
+
+#include "list.h"
+/*
+ * A xenstore watch registered through register_xenbus_watch().
+ *
+ * callback is invoked by xs_fire_next_watch() with the xenstore handle, the
+ * watch itself and the path reported by the watch event.
+ */
+struct xenbus_watch
+{
+        struct list_head  list;  /* linkage on the watches list in xs_api.c */
+        char             *node;  /* xenstore path passed to xs_watch() */
+       void             *data;  /* not touched by xs_api.c; presumably for the owner's use */
+       long              nonce; /* set to time(NULL) at registration; part of the token */
+        void (*callback) (struct xs_handle *h, 
+                         struct xenbus_watch *, 
+                         const  char *node);
+};
+
+int xs_gather(struct xs_handle *xs, const char *dir, ...);
+int xs_printf(struct xs_handle *h, const char *dir, const char *node, 
+             const char *fmt, ...) __attribute__((format(printf, 4, 5)));
+int xs_exists(struct xs_handle *h, const char *path);
+char *get_dom_domid(struct xs_handle *h);
+int convert_dev_name_to_num(char *name);
+
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch);
+void reregister_xenbus_watches(struct xs_handle *h);
+int xs_fire_next_watch(struct xs_handle *h);
+
+#endif
diff --git a/tools/blktap2/daemon/tapdisk-channel.c b/tools/blktap2/daemon/tapdisk-channel.c
new file mode 100644 (file)
index 0000000..c2dac3a
--- /dev/null
@@ -0,0 +1,1367 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <sys/wait.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+
+#include <xs.h>
+#include "disktypes.h"
+#include "tapdisk-dispatch.h"
+
+/*
+ * Values stored in channel->state.  Sending a request moves the channel
+ * from IDLE to the matching WAIT_* state; validating the corresponding
+ * response moves it back to IDLE.  CLOSED is set once the shutdown
+ * response has been received.
+ */
+#define TAPDISK_CHANNEL_IDLE          1
+#define TAPDISK_CHANNEL_WAIT_PID      2
+#define TAPDISK_CHANNEL_WAIT_OPEN     3
+#define TAPDISK_CHANNEL_WAIT_PAUSE    4
+#define TAPDISK_CHANNEL_WAIT_RESUME   5
+#define TAPDISK_CHANNEL_WAIT_CLOSE    6
+#define TAPDISK_CHANNEL_CLOSED        7
+
+static void tapdisk_channel_error(tapdisk_channel_t *,
+                                 const char *fmt, ...)
+  __attribute__((format(printf, 2, 3)));
+static void tapdisk_channel_fatal(tapdisk_channel_t *,
+                                 const char *fmt, ...)
+  __attribute__((format(printf, 2, 3)));
+static int tapdisk_channel_parse_params(tapdisk_channel_t *);
+static void tapdisk_channel_pause_event(struct xs_handle *,
+                                       struct xenbus_watch *,
+                                       const char *);
+
+/*
+ * Compare the decimal value stored at channel->uuid_str in xenstore with
+ * the channel's cookie.
+ *
+ * Returns 0 when they match, -EINVAL when they differ, or -errno when the
+ * node cannot be read.
+ */
+static int
+tapdisk_channel_check_uuid(tapdisk_channel_t *channel)
+{
+       char *stored;
+       uint32_t value;
+
+       stored = xs_read(channel->xsh, XBT_NULL, channel->uuid_str, NULL);
+       if (stored == NULL)
+               return -errno;
+
+       value = strtoul(stored, NULL, 10);
+       free(stored);
+
+       return (value != channel->cookie) ? -EINVAL : 0;
+}
+
+/*
+ * Sanity-check a fired watch before acting on it.
+ *
+ * Returns -EINVAL when strsep_len() rejects the path, the error from
+ * tapdisk_channel_check_uuid() when the cookie check fails, -ENOENT when
+ * the watched path no longer exists in xenstore, and 0 otherwise.
+ */
+static inline int
+tapdisk_channel_validate_watch(tapdisk_channel_t *channel, const char *path)
+{
+       int rc;
+
+       if (strsep_len(path, '/', 7) < 0)
+               return -EINVAL;
+
+       rc = tapdisk_channel_check_uuid(channel);
+       if (rc != 0)
+               return rc;
+
+       return xs_exists(channel->xsh, path) ? 0 : -ENOENT;
+}
+
+/*
+ * Check that an incoming response matches the request the channel is
+ * waiting on.
+ *
+ * Returns -EINVAL when a *_RSP message arrives in the wrong state.
+ * Otherwise returns 0 and puts the channel back into the IDLE state,
+ * except for runtime errors, which never touch the state machine.
+ */
+static inline int
+tapdisk_channel_validate_message(tapdisk_channel_t *channel,
+                                tapdisk_message_t *message)
+{
+       int expected;
+
+       switch (message->type) {
+       case TAPDISK_MESSAGE_RUNTIME_ERROR:
+               /*
+                * runtime errors can be received at any time
+                * and should not affect the state machine
+                */
+               return 0;
+
+       case TAPDISK_MESSAGE_PID_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_PID;
+               break;
+
+       case TAPDISK_MESSAGE_OPEN_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_OPEN;
+               break;
+
+       case TAPDISK_MESSAGE_PAUSE_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_PAUSE;
+               break;
+
+       case TAPDISK_MESSAGE_RESUME_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_RESUME;
+               break;
+
+       case TAPDISK_MESSAGE_CLOSE_RSP:
+               expected = TAPDISK_CHANNEL_WAIT_CLOSE;
+               break;
+
+       default:
+               /* any other type is accepted whatever the current state */
+               expected = channel->state;
+               break;
+       }
+
+       if (channel->state != expected)
+               return -EINVAL;
+
+       channel->state = TAPDISK_CHANNEL_IDLE;
+       return 0;
+}
+
+/*
+ * Write one complete tapdisk_message_t to the channel's write fd, using
+ * select() to wait until the fd is writable.
+ *
+ * @channel: channel whose write_fd receives the message
+ * @message: message to send
+ * @timeout: seconds to wait in select()
+ *
+ * On success the channel moves to the WAIT_* state matching the request
+ * type and 0 is returned.  Returns -EIO when the whole message could not
+ * be written (select/write failure or timeout).
+ */
+static int
+tapdisk_channel_send_message(tapdisk_channel_t *channel,
+                            tapdisk_message_t *message, int timeout)
+{
+       fd_set writefds;
+       struct timeval tv;
+       const char *buf;
+       int ret, len, offset;
+
+       tv.tv_sec  = timeout;
+       tv.tv_usec = 0;
+       offset     = 0;
+       len        = sizeof(tapdisk_message_t);
+
+       /*
+        * offset counts bytes, so index the message through a byte
+        * pointer: "message + offset" would step in units of whole
+        * messages and resume a short write from the wrong address.
+        */
+       buf        = (const char *)message;
+
+       DPRINTF("%s: sending '%s' message to %d:%d\n",
+               channel->path, tapdisk_message_name(message->type),
+               channel->channel_id, channel->cookie);
+
+       if (channel->state != TAPDISK_CHANNEL_IDLE &&
+           message->type  != TAPDISK_MESSAGE_CLOSE)
+               EPRINTF("%s: writing message to non-idle channel (%d)\n",
+                       channel->path, channel->state);
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(channel->write_fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(channel->write_fd + 1,
+                            NULL, &writefds, NULL, &tv);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(channel->write_fd, &writefds)) {
+                       ret = write(channel->write_fd,
+                                   buf + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break; /* timed out: fd never became writable */
+       }
+
+       if (offset != len) {
+               EPRINTF("%s: error writing '%s' message to %d:%d\n",
+                       channel->path, tapdisk_message_name(message->type),
+                       channel->channel_id, channel->cookie);
+               return -EIO;
+       }
+
+       /* the request is on the wire: record which response we now expect */
+       switch (message->type) {
+       case TAPDISK_MESSAGE_PID:
+               channel->state = TAPDISK_CHANNEL_WAIT_PID;
+               break;
+
+       case TAPDISK_MESSAGE_OPEN:
+               channel->state = TAPDISK_CHANNEL_WAIT_OPEN;
+               break;
+
+       case TAPDISK_MESSAGE_PAUSE:
+               channel->state = TAPDISK_CHANNEL_WAIT_PAUSE;
+               break;
+
+       case TAPDISK_MESSAGE_RESUME:
+               channel->state = TAPDISK_CHANNEL_WAIT_RESUME;
+               break;
+
+       case TAPDISK_MESSAGE_CLOSE:
+               channel->state = TAPDISK_CHANNEL_WAIT_CLOSE;
+               break;
+
+       default:
+               EPRINTF("%s: unrecognized message type %d\n",
+                       channel->path, message->type);
+       }
+
+       return 0;
+}
+
+/*
+ * Format an error message, log it, and write it to the channel's
+ * "tapdisk-error" xenstore node.  When the message cannot be allocated
+ * the fixed text "tapdisk error" is used instead.
+ */
+static void
+__tapdisk_channel_error(tapdisk_channel_t *channel,
+                       const char *fmt, va_list ap)
+{
+       char *node = NULL, *text = NULL;
+       const char *message;
+
+       if (vasprintf(&text, fmt, ap) == -1) {
+               EPRINTF("failed to allocate error message\n");
+               text = NULL;
+       }
+
+       message = text ? text : "tapdisk error";
+
+       EPRINTF("%s: %s\n", channel->path, message);
+
+       if (asprintf(&node, "%s/tapdisk-error", channel->path) == -1) {
+               EPRINTF("%s: failed to write %s\n", __func__, message);
+               node = NULL;
+       } else {
+               xs_write(channel->xsh, XBT_NULL,
+                        node, message, strlen(message));
+       }
+
+       free(node);
+       free(text);
+}
+
+/*
+ * Report a non-fatal, printf-style error on @channel by forwarding the
+ * arguments to __tapdisk_channel_error().  The channel is left open.
+ */
+static void
+tapdisk_channel_error(tapdisk_channel_t *channel, const char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       __tapdisk_channel_error(channel, fmt, ap);
+       va_end(ap);
+}
+
+/*
+ * Report a printf-style error on @channel by forwarding the arguments to
+ * __tapdisk_channel_error(), then close the channel with
+ * tapdisk_channel_close().
+ */
+static void
+tapdisk_channel_fatal(tapdisk_channel_t *channel, const char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       __tapdisk_channel_error(channel, fmt, ap);
+       va_end(ap);
+
+       tapdisk_channel_close(channel);
+}
+
+/*
+ * Set up the backing block device for @channel:
+ *   1. issue BLKTAP_IOCTL_BACKDEV_SETUP for the channel's minor,
+ *   2. read "<path>/backdev-node" from xenstore as "major:minor",
+ *   3. create the block device node BLKTAP_DEV_DIR/BACKDEV_NAME<minor>,
+ *   4. publish that node's path at "<path>/backdev-path".
+ *
+ * Returns 0 on success or a negative errno value; every failure is
+ * logged once at the "fail" label.
+ */
+static int
+tapdisk_channel_connect_backdev(tapdisk_channel_t *channel)
+{
+       int err, major, minor;
+       char *s, *path, *devname;
+
+       /* start from NULL so the shared cleanup can free() unconditionally */
+       s       = NULL;
+       path    = NULL;
+       devname = NULL;
+
+       err = ioctl(channel->blktap_fd,
+                   BLKTAP_IOCTL_BACKDEV_SETUP, channel->minor);
+       if (err) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = asprintf(&path, "%s/backdev-node", channel->path);
+       if (err == -1) {
+               path = NULL;
+               err  = -ENOMEM;
+               goto fail;
+       }
+
+       s = xs_read(channel->xsh, XBT_NULL, path, NULL);
+       if (!s) {
+               err = -errno;
+               goto fail;
+       }
+
+       /* both numbers must parse, otherwise the node is malformed */
+       err = sscanf(s, "%d:%d", &major, &minor);
+       if (err != 2) {
+               err = -EINVAL;
+               goto fail;
+       }
+
+       err = asprintf(&devname,"%s/%s%d",
+                      BLKTAP_DEV_DIR, BACKDEV_NAME, minor);
+       if (err == -1) {
+               devname = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = make_blktap_device(devname, major, minor, S_IFBLK | 0600);
+       if (err)
+               goto fail;
+
+       /* path is reused for the second xenstore key */
+       free(path);
+       err = asprintf(&path, "%s/backdev-path", channel->path);
+       if (err == -1) {
+               path = NULL;
+               err  = -ENOMEM;
+               goto fail;
+       }
+
+       /* a zero return from xs_write is treated as failure here */
+       err = xs_write(channel->xsh, XBT_NULL, path, devname, strlen(devname));
+       if (err == 0) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = 0;
+ out:
+       free(devname);
+       free(path);
+       free(s);
+       return err;
+
+ fail:
+       EPRINTF("backdev setup failed [%d]\n", err);
+       goto out;
+}
+
+/*
+ * Publish the image geometry ("sectors", "sector-size", "info") under the
+ * channel's xenstore path, connect the backing device and mark the
+ * channel connected.
+ *
+ * Returns 0 on success or a negative errno value.  When the backdev
+ * connection fails, the three keys are removed again in reverse order,
+ * stopping at the first removal that fails.  A failure while writing one
+ * of the keys returns immediately without removing keys already written.
+ */
+static int
+tapdisk_channel_complete_connection(tapdisk_channel_t *channel)
+{
+       int err;
+       char *path;
+
+       if (!xs_printf(channel->xsh, channel->path,
+                      "sectors", "%llu", channel->image.size)) {
+               EPRINTF("ERROR: Failed writing sectors");
+               return -errno;
+       }
+
+       if (!xs_printf(channel->xsh, channel->path,
+                      "sector-size", "%lu", channel->image.secsize)) {
+               EPRINTF("ERROR: Failed writing sector-size");
+               return -errno;
+       }
+
+       if (!xs_printf(channel->xsh, channel->path,
+                      "info", "%u", channel->image.info)) {
+               EPRINTF("ERROR: Failed writing info");
+               return -errno;
+       }
+
+       err = tapdisk_channel_connect_backdev(channel);
+       if (err)
+               goto clean;
+
+       channel->connected = 1;
+       return 0;
+
+ clean:
+       /* roll back the published keys; err keeps the backdev failure code */
+       if (asprintf(&path, "%s/info", channel->path) == -1)
+               return err;
+
+       if (!xs_rm(channel->xsh, XBT_NULL, path))
+               goto clean_out;
+
+       free(path);
+       if (asprintf(&path, "%s/sector-size", channel->path) == -1)
+               return err;
+
+       if (!xs_rm(channel->xsh, XBT_NULL, path))
+               goto clean_out;
+
+       free(path);
+       if (asprintf(&path, "%s/sectors", channel->path) == -1)
+               return err;
+
+       xs_rm(channel->xsh, XBT_NULL, path);
+
+ clean_out:
+       free(path);
+       return err;
+}
+
+/*
+ * Build and send a TAPDISK_MESSAGE_OPEN request describing the channel's
+ * VDI (driver type, storage type, device minor, domain id, path and
+ * flags).
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the VDI path does not fit in
+ * the message, or the error from tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_open_request(tapdisk_channel_t *channel)
+{
+       size_t len;
+       tapdisk_message_t message;
+
+       memset(&message, 0, sizeof(tapdisk_message_t));
+
+       /*
+        * Never copy more than the message can hold; one byte stays
+        * reserved so the zeroed buffer keeps the path NUL-terminated.
+        */
+       len = strlen(channel->vdi_path);
+       if (len >= sizeof(message.u.params.path))
+               return -ENAMETOOLONG;
+
+       message.type              = TAPDISK_MESSAGE_OPEN;
+       message.cookie            = channel->cookie;
+       message.drivertype        = channel->drivertype;
+       message.u.params.storage  = channel->storage;
+       message.u.params.devnum   = channel->minor;
+       message.u.params.domid    = channel->domid;
+       message.u.params.path_len = len;
+       memcpy(message.u.params.path, channel->vdi_path, len);
+
+       if (channel->mode == 'r')
+               message.u.params.flags |= TAPDISK_MESSAGE_FLAG_RDONLY;
+       if (channel->shared)
+               message.u.params.flags |= TAPDISK_MESSAGE_FLAG_SHARED;
+
+       /* TODO: clean this up */
+       if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/add-cache"))
+               message.u.params.flags |= TAPDISK_MESSAGE_FLAG_ADD_CACHE;
+       if (xs_exists(channel->xsh, "/local/domain/0/tapdisk/log-dirty"))
+               message.u.params.flags |= TAPDISK_MESSAGE_FLAG_LOG_DIRTY;
+
+       return tapdisk_channel_send_message(channel, &message, 2);
+}
+
+/*
+ * Handle the response to an open request: record the image geometry,
+ * complete the connection, and replay a pause request that arrived
+ * before the connection was ready.
+ *
+ * Returns 0 on success.  On failure the channel is closed through
+ * tapdisk_channel_fatal() and the error is returned.
+ */
+static int
+tapdisk_channel_receive_open_response(tapdisk_channel_t *channel,
+                                     tapdisk_message_t *message)
+{
+       int rc;
+
+       channel->image.size    = message->u.image.sectors;
+       channel->image.secsize = message->u.image.sector_size;
+       channel->image.info    = message->u.image.info;
+
+       rc = tapdisk_channel_complete_connection(channel);
+       if (rc) {
+               tapdisk_channel_fatal(channel,
+                                     "failure completing connection: %d", rc);
+               return rc;
+       }
+
+       /* nothing deferred: we are done */
+       if (!channel->pause_needed)
+               return 0;
+
+       /* a pause request arrived before the connection completed */
+       DPRINTF("%s: deferred pause request\n", channel->path);
+       tapdisk_channel_pause_event(channel->xsh,
+                                   &channel->pause_watch,
+                                   channel->pause_str);
+       channel->pause_needed = 0;
+
+       return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_CLOSE request for @channel.
+ * Returns the result of tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_shutdown_request(tapdisk_channel_t *channel)
+{
+       tapdisk_message_t request;
+
+       memset(&request, 0, sizeof(request));
+
+       request.cookie     = channel->cookie;
+       request.drivertype = channel->drivertype;
+       request.type       = TAPDISK_MESSAGE_CLOSE;
+
+       return tapdisk_channel_send_message(channel, &request, 2);
+}
+
+/*
+ * Handle the response to a close request: mark the channel closed and no
+ * longer open, then release it with tapdisk_channel_close().
+ * Always returns 0.
+ */
+static int
+tapdisk_channel_receive_shutdown_response(tapdisk_channel_t *channel,
+                                         tapdisk_message_t *message)
+{
+       channel->state = TAPDISK_CHANNEL_CLOSED;
+       channel->open  = 0;
+
+       tapdisk_channel_close(channel);
+
+       return 0;
+}
+
+/*
+ * Report a runtime error message received from tapdisk on @channel.
+ * The channel stays open.  Always returns 0.
+ */
+static int
+tapdisk_channel_receive_runtime_error(tapdisk_channel_t *channel,
+                                     tapdisk_message_t *message)
+{
+       const char *text = message->u.string.text;
+
+       tapdisk_channel_error(channel, "runtime error: %s", text);
+
+       return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_PID request for @channel and, when the send
+ * succeeds, mark the channel open.
+ * Returns the result of tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_pid_request(tapdisk_channel_t *channel)
+{
+       int rc;
+       tapdisk_message_t request;
+
+       memset(&request, 0, sizeof(request));
+
+       request.cookie     = channel->cookie;
+       request.drivertype = channel->drivertype;
+       request.type       = TAPDISK_MESSAGE_PID;
+
+       rc = tapdisk_channel_send_message(channel, &request, 2);
+       if (rc)
+               return rc;
+
+       channel->open = 1;
+       return 0;
+}
+
+/*
+ * Handle the response to a pid request: record the tapdisk pid, raise
+ * the process to PRIO_SPECIAL_IO priority and send the open request.
+ *
+ * Returns 0 on success.  On failure the channel is closed through
+ * tapdisk_channel_fatal() and a negative errno value is returned.
+ */
+static int
+tapdisk_channel_receive_pid_response(tapdisk_channel_t *channel,
+                                    tapdisk_message_t *message)
+{
+       int err;
+
+       channel->tapdisk_pid = message->u.tapdisk_pid;
+
+       DPRINTF("%s: tapdisk pid: %d\n", channel->path, channel->tapdisk_pid);
+
+       if (setpriority(PRIO_PROCESS,
+                       channel->tapdisk_pid, PRIO_SPECIAL_IO)) {
+               /*
+                * setpriority() only returns -1 on failure; the cause is
+                * in errno.  Capture it before anything else can change
+                * it so the report and the return value are meaningful.
+                */
+               err = -errno;
+               tapdisk_channel_fatal(channel,
+                                     "setting tapdisk priority: %d", err);
+               return err;
+       }
+
+       err = tapdisk_channel_send_open_request(channel);
+       if (err) {
+               tapdisk_channel_fatal(channel,
+                                     "sending open request: %d", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_PAUSE request for @channel.
+ * Returns the result of tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_pause_request(tapdisk_channel_t *channel)
+{
+       tapdisk_message_t request;
+
+       DPRINTF("pausing %s\n", channel->path);
+
+       memset(&request, 0, sizeof(request));
+
+       request.cookie     = channel->cookie;
+       request.drivertype = channel->drivertype;
+       request.type       = TAPDISK_MESSAGE_PAUSE;
+
+       return tapdisk_channel_send_message(channel, &request, 2);
+}
+
+/*
+ * Handle the response to a pause request by creating the (empty)
+ * pause-done node in xenstore.
+ *
+ * Returns 0 on success.  On failure the channel is closed through
+ * tapdisk_channel_fatal() and -errno is returned.
+ */
+static int
+tapdisk_channel_receive_pause_response(tapdisk_channel_t *channel,
+                                      tapdisk_message_t *message)
+{
+       int err;
+
+       if (xs_write(channel->xsh, XBT_NULL,
+                    channel->pause_done_str, "", strlen("")))
+               return 0;
+
+       /* capture errno before the error path makes further calls */
+       err = -errno;
+
+       /*
+        * No trailing newline here: the error path appends one for the
+        * log itself and writes the text verbatim to xenstore.
+        */
+       tapdisk_channel_fatal(channel,
+                             "failure receiving pause response: %d", err);
+       return err;
+}
+
+/*
+ * Build and send a TAPDISK_MESSAGE_RESUME request carrying the channel's
+ * current VDI path.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the VDI path does not fit in
+ * the message, or the error from tapdisk_channel_send_message().
+ */
+static int
+tapdisk_channel_send_resume_request(tapdisk_channel_t *channel)
+{
+       size_t len;
+       tapdisk_message_t message;
+
+       memset(&message, 0, sizeof(tapdisk_message_t));
+
+       /*
+        * Never copy more than the message can hold; one byte stays
+        * reserved so the zeroed buffer keeps the path NUL-terminated.
+        */
+       len = strlen(channel->vdi_path);
+       if (len >= sizeof(message.u.params.path))
+               return -ENAMETOOLONG;
+
+       DPRINTF("resuming %s\n", channel->path);
+
+       message.type              = TAPDISK_MESSAGE_RESUME;
+       message.drivertype        = channel->drivertype;
+       message.cookie            = channel->cookie;
+       message.u.params.path_len = len;
+       memcpy(message.u.params.path, channel->vdi_path, len);
+
+       return tapdisk_channel_send_message(channel, &message, 2);
+}
+
+/*
+ * Handle the response to a resume request by removing the pause-done
+ * node from xenstore.
+ *
+ * Returns 0 on success.  On failure the channel is closed through
+ * tapdisk_channel_fatal() and -errno is returned.
+ */
+static int
+tapdisk_channel_receive_resume_response(tapdisk_channel_t *channel,
+                                       tapdisk_message_t *message)
+{
+       int err;
+
+       if (xs_rm(channel->xsh, XBT_NULL, channel->pause_done_str))
+               return 0;
+
+       /* capture errno before the error path makes further calls */
+       err = -errno;
+
+       /* name the resume path so it is not mistaken for a pause failure */
+       tapdisk_channel_fatal(channel,
+                             "failure receiving resume response: %d", err);
+       return err;
+}
+
+/*
+ * Watch callback for the channel's shutdown node.
+ *
+ * Closes the channel outright when its xenstore path has disappeared.
+ * Otherwise validates the watch and sends a shutdown request; only an
+ * -EINVAL validation result is treated as fatal.
+ */
+static void
+tapdisk_channel_shutdown_event(struct xs_handle *xsh,
+                              struct xenbus_watch *watch, const char *path)
+{
+       tapdisk_channel_t *channel = watch->data;
+       int rc;
+
+       DPRINTF("%s: got watch on %s\n", channel->path, path);
+
+       if (!xs_exists(channel->xsh, channel->path)) {
+               tapdisk_channel_close(channel);
+               return;
+       }
+
+       rc = tapdisk_channel_validate_watch(channel, path);
+       if (rc == 0) {
+               tapdisk_channel_send_shutdown_request(channel);
+               return;
+       }
+
+       if (rc == -EINVAL)
+               tapdisk_channel_fatal(channel, "bad shutdown watch");
+}
+
+/*
+ * Watch callback for the channel's pause node.
+ *
+ * Closes the channel when its xenstore path has disappeared.  Otherwise:
+ *   - if the pause node exists, sends a pause request, unless a pause is
+ *     already pending or done (ignored) or the channel is not connected
+ *     yet (deferred via channel->pause_needed);
+ *   - else, if the pause-done node exists, re-reads and re-parses
+ *     "params" and sends a resume request.
+ * Failures along the way are reported with tapdisk_channel_error().
+ */
+static void
+tapdisk_channel_pause_event(struct xs_handle *xsh,
+                           struct xenbus_watch *watch, const char *path)
+{
+       int err, paused;
+       tapdisk_channel_t *channel;
+
+       channel = watch->data;
+
+       DPRINTF("%s: got watch on %s\n", channel->path, path);
+
+       if (!xs_exists(channel->xsh, channel->path)) {
+               tapdisk_channel_close(channel);
+               return;
+       }
+
+       /* NB: The VBD is essentially considered ready since the
+        * backend hotplug event occurred, which is just after
+        * start-tapdisk, not after watch registration. We start
+        * testing xenstore keys with the very first shot, but defer
+        * until after connection completion. */
+
+       err = tapdisk_channel_validate_watch(channel, path);
+       if (err) {
+               if (err == -EINVAL)
+                       tapdisk_channel_fatal(channel, "bad pause watch");
+
+               /* -ENOENT (watched node absent) is not an error here */
+               if (err != -ENOENT)
+                       return;
+
+               err = 0;
+       }
+
+       paused  = xs_exists(xsh, channel->pause_done_str);
+
+       if (xs_exists(xsh, channel->pause_str)) {
+               /*
+                * Duplicate requests are a protocol violation, but
+                * impossible to identify if watch registration and an
+                * actual pause request may fire separately in close
+                * succession. Warn, but do not signal an error.
+                */
+               int pausing = channel->state == TAPDISK_CHANNEL_WAIT_PAUSE;
+               if (pausing || paused) {
+                       DPRINTF("Ignoring pause event for %s vbd %s\n",
+                               pausing ? "pausing" : "paused", channel->path);
+                       goto out;
+               }
+
+               /* defer if tapdisk is not ready yet */
+               if (!channel->connected) {
+                       DPRINTF("%s: deferring pause request\n", path);
+                       channel->pause_needed = 1;
+                       goto out;
+               }
+
+               err = tapdisk_channel_send_pause_request(channel);
+
+       } else if (xs_exists(xsh, channel->pause_done_str)) {
+               /* pause node gone but pause-done present: resume */
+               free(channel->params);
+               channel->params   = NULL;
+               channel->vdi_path = NULL;
+
+               err = xs_gather(channel->xsh, channel->path,
+                               "params", NULL, &channel->params, NULL);
+               if (err) {
+                       EPRINTF("failure re-reading params: %d\n", err);
+                       channel->params = NULL;
+                       goto out;
+               }
+
+               err = tapdisk_channel_parse_params(channel);
+               if (err)
+                       goto out;
+
+               err = tapdisk_channel_send_resume_request(channel);
+               if (err)
+                       goto out;
+       }
+
+       /* NOTE(review): this also discards a pause-request send failure -- confirm intended */
+       err = 0;
+
+out:
+       if (err)
+               tapdisk_channel_error(channel, "pause event failed: %d", err);
+}
+
+/*
+ * Create (or recreate) the control FIFO @devname under BLKTAP_CTRL_DIR
+ * and open it read/write in non-blocking mode.
+ *
+ * Returns the open file descriptor on success or a negative errno value
+ * on failure.
+ */
+static int
+tapdisk_channel_open_control_socket(char *devname)
+{
+       int err, fd;
+
+       /*
+        * On every failure path errno is captured before logging, since
+        * the logging call may itself modify errno.
+        */
+       err = mkdir(BLKTAP_CTRL_DIR, 0755);
+       if (err == -1 && errno != EEXIST) {
+               err = -errno;
+               EPRINTF("Failure creating %s directory: %d\n",
+                       BLKTAP_CTRL_DIR, -err);
+               return err;
+       }
+
+       err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO);
+       if (err && errno == EEXIST) {
+               /*
+                * Remove fifo since it may have data from
+                * its previous use --- earlier invocation
+                * of tapdisk may not have read all messages.
+                */
+               if (unlink(devname)) {
+                       err = -errno;
+                       EPRINTF("ERROR: unlink(%s) failed (%d)\n",
+                               devname, -err);
+                       return err;
+               }
+
+               err = mkfifo(devname, S_IRWXU | S_IRWXG | S_IRWXO);
+       }
+
+       if (err) {
+               err = -errno;
+               EPRINTF("ERROR: pipe failed (%d)\n", -err);
+               return err;
+       }
+
+       fd = open(devname, O_RDWR | O_NONBLOCK);
+       if (fd == -1) {
+               err = -errno;
+               EPRINTF("Failed to open %s\n", devname);
+               return err;
+       }
+
+       return fd;
+}
+
+/*
+ * Obtain a blktap minor for the channel's (domid, busid) pair, look up
+ * the matching major, create the character device node
+ * BLKTAP_DEV_DIR/BLKTAP_DEV_NAME<minor>, and record major/minor in the
+ * channel.
+ *
+ * Returns 0 on success, -EINVAL when either ioctl yields an unusable
+ * number, -ENOMEM when the device name cannot be allocated, or the error
+ * from make_blktap_device().
+ */
+static int
+tapdisk_channel_get_device_number(tapdisk_channel_t *channel)
+{
+       char *devname;
+       domid_translate_t tr;
+       int major, minor, err;
+
+       tr.domid = channel->domid;
+        tr.busid = channel->busid;
+
+       /* NOTE(review): tr is passed to ioctl() by value -- confirm the
+        * driver expects the domid/busid pair packed into the argument */
+       minor = ioctl(channel->blktap_fd, BLKTAP_IOCTL_NEWINTF, tr);
+       if (minor <= 0 || minor > MAX_TAP_DEV) {
+               EPRINTF("invalid dev id: %d\n", minor);
+               return -EINVAL;
+       }
+
+       major = ioctl(channel->blktap_fd, BLKTAP_IOCTL_MAJOR, minor);
+       if (major < 0) {
+               EPRINTF("invalid major id: %d\n", major);
+               return -EINVAL;
+       }
+
+       err = asprintf(&devname, "%s/%s%d",
+                      BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor);
+       if (err == -1) {
+               EPRINTF("get_new_dev: malloc failed\n");
+               return -ENOMEM;
+       }
+
+       err = make_blktap_device(devname, major, minor, S_IFCHR | 0600);
+       free(devname);
+
+       if (err)
+               return err;
+
+       DPRINTF("Received device id %d and major %d, "
+               "sent domid %d and be_id %d\n",
+               minor, major, tr.domid, tr.busid);
+
+       /* only commit the numbers once the device node exists */
+       channel->major = major;
+       channel->minor = minor;
+
+       return 0;
+}
+
+/*
+ * Fork and exec "tapdisk <write_dev> <read_dev>", then wait for the
+ * forked child to exit.
+ *
+ * In the child, every descriptor except stdin/stdout/stderr is closed
+ * before the exec.
+ *
+ * Returns 0 once the child has been reaped, or -errno when fork() or
+ * waitpid() fails.
+ */
+static int
+tapdisk_channel_start_process(tapdisk_channel_t *channel,
+                             char *write_dev, char *read_dev)
+{
+       pid_t child, got;
+       char *argv[] = { "tapdisk", write_dev, read_dev, NULL };
+
+       child = fork();
+       if (child == -1)
+               return -errno;
+
+       if (!child) {
+               int i;
+               /* evaluate the limit once instead of on every iteration */
+               long max_fd = sysconf(_SC_OPEN_MAX);
+
+               for (i = 0 ; i < max_fd ; i++)
+                       if (i != STDIN_FILENO &&
+                           i != STDOUT_FILENO &&
+                           i != STDERR_FILENO)
+                               close(i);
+
+               execvp("tapdisk", argv);
+               _exit(1);
+       }
+
+       /*
+        * Retry only when interrupted by a signal.  Any other failure
+        * (e.g. ECHILD) would never resolve by itself, so looping on it
+        * would hang the daemon forever.
+        */
+       do {
+               got = waitpid(child, NULL, 0);
+       } while (got == -1 && errno == EINTR);
+
+       if (got != child)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Start a new tapdisk process for @channel: allocate a device number,
+ * create and open the two control FIFOs, launch tapdisk and send it a
+ * pid request.
+ *
+ * Returns 0 on success or a negative errno value.  When a step up to and
+ * including the process launch fails, both control descriptors are
+ * closed and reset to -1.
+ */
+static int
+tapdisk_channel_launch_tapdisk(tapdisk_channel_t *channel)
+{
+       int err;
+       char *read_dev, *write_dev;
+
+       read_dev          = NULL;
+       write_dev         = NULL;
+       channel->read_fd  = -1;
+       channel->write_fd = -1;
+
+       err = tapdisk_channel_get_device_number(channel);
+       if (err)
+               return err;
+
+       err = asprintf(&write_dev,
+                      "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, channel->minor);
+       if (err == -1) {
+               err = -ENOMEM;
+               write_dev = NULL;
+               goto fail;
+       }
+
+       err = asprintf(&read_dev,
+                      "%s/tapctrlread%d", BLKTAP_CTRL_DIR, channel->minor);
+       if (err == -1) {
+               err = -ENOMEM;
+               read_dev = NULL;
+               goto fail;
+       }
+
+       channel->write_fd = tapdisk_channel_open_control_socket(write_dev);
+       if (channel->write_fd < 0) {
+               err = channel->write_fd;
+               channel->write_fd = -1;
+               goto fail;
+       }
+
+       channel->read_fd = tapdisk_channel_open_control_socket(read_dev);
+       if (channel->read_fd < 0) {
+               err = channel->read_fd;
+               channel->read_fd = -1;
+               goto fail;
+       }
+
+       err = tapdisk_channel_start_process(channel, write_dev, read_dev);
+       if (err)
+               goto fail;
+
+       channel->open       = 1;
+       channel->channel_id = channel->write_fd;
+
+       free(read_dev);
+       free(write_dev);
+
+       DPRINTF("process launched, channel = %d:%d\n",
+               channel->channel_id, channel->cookie);
+
+       return tapdisk_channel_send_pid_request(channel);
+
+fail:
+       free(read_dev);
+       free(write_dev);
+
+       /*
+        * Reset the descriptors after closing them so that nothing later
+        * closes or uses a stale (possibly reused) descriptor number.
+        */
+       if (channel->read_fd != -1) {
+               close(channel->read_fd);
+               channel->read_fd = -1;
+       }
+       if (channel->write_fd != -1) {
+               close(channel->write_fd);
+               channel->write_fd = -1;
+       }
+       return err;
+}
+
+/*
+ * Connect @channel to a tapdisk process.
+ *
+ * After tapdisk_daemon_find_channel() has run, a channel with no
+ * tapdisk pid launches a new process; otherwise a device number is
+ * allocated and a pid request is sent to the existing process.
+ * Returns 0 on success or a negative error value.
+ */
+static int
+tapdisk_channel_connect(tapdisk_channel_t *channel)
+{
+       int rc;
+
+       tapdisk_daemon_find_channel(channel);
+
+       if (channel->tapdisk_pid) {
+               DPRINTF("%s: process exists: %d, channel = %d:%d\n",
+                       channel->path, channel->tapdisk_pid,
+                       channel->channel_id, channel->cookie);
+
+               rc = tapdisk_channel_get_device_number(channel);
+               if (rc)
+                       return rc;
+
+               return tapdisk_channel_send_pid_request(channel);
+       }
+
+       return tapdisk_channel_launch_tapdisk(channel);
+}
+
+/*
+ * Build the xenstore node names used by @channel (tapdisk-uuid, pause,
+ * pause-done, shutdown-tapdisk) from channel->path and set the fixed
+ * share-tapdisks path.
+ *
+ * Returns 0 on success.  On allocation failure every name is released
+ * and reset to NULL, and -ENOMEM is returned.
+ */
+static int
+tapdisk_channel_init(tapdisk_channel_t *channel)
+{
+       channel->share_tapdisk_str = NULL;
+       channel->shutdown_str      = NULL;
+       channel->pause_done_str    = NULL;
+       channel->pause_str         = NULL;
+       channel->uuid_str          = NULL;
+
+       if (asprintf(&channel->uuid_str,
+                    "%s/tapdisk-uuid", channel->path) == -1) {
+               channel->uuid_str = NULL;
+               goto nomem;
+       }
+
+       if (asprintf(&channel->pause_str,
+                    "%s/pause", channel->path) == -1) {
+               channel->pause_str = NULL;
+               goto nomem;
+       }
+
+       if (asprintf(&channel->pause_done_str,
+                    "%s/pause-done", channel->path) == -1) {
+               channel->pause_done_str = NULL;
+               goto nomem;
+       }
+
+       if (asprintf(&channel->shutdown_str,
+                    "%s/shutdown-tapdisk", channel->path) == -1) {
+               channel->shutdown_str = NULL;
+               goto nomem;
+       }
+
+       /* a string literal: never freed */
+       channel->share_tapdisk_str = "/local/domain/0/tapdisk/share-tapdisks";
+
+       return 0;
+
+nomem:
+       free(channel->shutdown_str);
+       free(channel->pause_done_str);
+       free(channel->pause_str);
+       free(channel->uuid_str);
+       channel->share_tapdisk_str = NULL;
+       channel->shutdown_str      = NULL;
+       channel->pause_done_str    = NULL;
+       channel->pause_str         = NULL;
+       channel->uuid_str          = NULL;
+       return -ENOMEM;
+}
+
+/*
+ * Register the channel's pause and shutdown xenstore watches.
+ *
+ * Returns 0 when both are registered.  On failure, any watch whose node
+ * is still set is unregistered and cleared, and the registration error
+ * is returned.
+ */
+static int
+tapdisk_channel_set_watches(tapdisk_channel_t *channel)
+{
+       int rc;
+
+       /* watch for pause events */
+       channel->pause_watch.data     = channel;
+       channel->pause_watch.callback = tapdisk_channel_pause_event;
+       channel->pause_watch.node     = channel->pause_str;
+       rc = register_xenbus_watch(channel->xsh, &channel->pause_watch);
+       if (rc) {
+               channel->pause_watch.node = NULL;
+               goto undo;
+       }
+
+       /* watch for shutdown events */
+       channel->shutdown_watch.data     = channel;
+       channel->shutdown_watch.callback = tapdisk_channel_shutdown_event;
+       channel->shutdown_watch.node     = channel->shutdown_str;
+       rc = register_xenbus_watch(channel->xsh, &channel->shutdown_watch);
+       if (rc) {
+               channel->shutdown_watch.node = NULL;
+               goto undo;
+       }
+
+       return 0;
+
+undo:
+       /* a non-NULL node marks a watch that still needs unregistering */
+       if (channel->pause_watch.node) {
+               unregister_xenbus_watch(channel->xsh, &channel->pause_watch);
+               channel->pause_watch.node = NULL;
+       }
+       if (channel->shutdown_watch.node) {
+               unregister_xenbus_watch(channel->xsh, &channel->shutdown_watch);
+               channel->shutdown_watch.node = NULL;
+       }
+       return rc;
+}
+
+/*
+ * Set channel->storage from the "<path>/sm-data/storage-type" xenstore
+ * node ("nfs", "ext" or "lvm").  The default storage type is kept when
+ * the node is missing, unreadable or holds any other value.
+ */
+static void
+tapdisk_channel_get_storage_type(tapdisk_channel_t *channel)
+{
+       char *path, *stype;
+
+       channel->storage = TAPDISK_STORAGE_TYPE_DEFAULT;
+
+       if (asprintf(&path, "%s/sm-data/storage-type", channel->path) == -1)
+               return;
+
+       /* the value length is not needed, so no length pointer is passed */
+       stype = xs_read(channel->xsh, XBT_NULL, path, NULL);
+       if (stype) {
+               if (!strcmp(stype, "nfs"))
+                       channel->storage = TAPDISK_STORAGE_TYPE_NFS;
+               else if (!strcmp(stype, "ext"))
+                       channel->storage = TAPDISK_STORAGE_TYPE_EXT;
+               else if (!strcmp(stype, "lvm"))
+                       channel->storage = TAPDISK_STORAGE_TYPE_LVM;
+       }
+
+       free(path);
+       free(stype);
+}
+
+/*
+ * Parse the bus id from channel->path: the text following the 7th '/'
+ * is converted with atoi() and stored in channel->busid.
+ *
+ * Returns 0 on success, or -EINVAL when the path has no 7th '/' or the
+ * trailing component does not fit the local conversion buffer.
+ */
+static int
+tapdisk_channel_get_busid(tapdisk_channel_t *channel)
+{
+       int len;
+       size_t end, idlen;
+       char num[10];
+
+       len = strsep_len(channel->path, '/', 6);
+       end = strlen(channel->path);
+
+       /*
+        * strsep_len() returns the string length when the path holds
+        * exactly six separators; there is no component to parse then.
+        */
+       if (len < 0 || (size_t)len >= end) {
+               EPRINTF("invalid path: %s\n", channel->path);
+               return -EINVAL;
+       }
+
+       /* everything after the separator, which must fit with its NUL */
+       idlen = end - ((size_t)len + 1);
+       if (idlen >= sizeof(num)) {
+               EPRINTF("invalid path: %s\n", channel->path);
+               return -EINVAL;
+       }
+
+       memcpy(num, channel->path + len + 1, idlen);
+       num[idlen] = '\0';
+
+       channel->busid = atoi(num);
+       return 0;
+}
+
+/*
+ * Split channel->params ("<handle>:<vdi path>") and resolve the driver
+ * type.
+ *
+ * channel->vdi_path is pointed at the text after the first ':' (it
+ * aliases channel->params, it is not a separate allocation).  The driver
+ * handle is the text before the ':' unless the "<path>/sm-data/vdi-type"
+ * xenstore node exists, in which case that node's value is used instead.
+ * The handle is then looked up in dtypes[] and the matching idnum is
+ * stored in channel->drivertype.
+ *
+ * Returns 0 on success and -EINVAL on any failure (vdi_path is reset to
+ * NULL in that case).
+ */
+static int
+tapdisk_channel_parse_params(tapdisk_channel_t *channel)
+{
+       int i, size, err;
+       unsigned int len;
+       size_t hlen;
+       char *ptr, *path, handle[10];
+       char *vdi_type;
+       char *vtype;
+
+       path = channel->params;
+       size = sizeof(dtypes) / sizeof(disk_info_t *);
+
+       if (strlen(path) + 1 >= TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+               goto fail;
+
+       ptr = strchr(path, ':');
+       if (!ptr)
+               goto fail;
+
+       /* the prefix must fit in handle[] together with its terminator */
+       hlen = ptr - path;
+       if (hlen >= sizeof(handle))
+               goto fail;
+
+       channel->vdi_path = ptr + 1;
+       memcpy(handle, path, hlen);
+       handle[hlen] = '\0';
+
+       err = asprintf(&vdi_type, "%s/sm-data/vdi-type", channel->path);
+       if (err == -1)
+               goto fail;
+
+       if (xs_exists(channel->xsh, vdi_type)) {
+               vtype = xs_read(channel->xsh, XBT_NULL, vdi_type, &len);
+               free(vdi_type);
+               if (!vtype)
+                       goto fail;
+               if (len >= sizeof(handle) - 1) {
+                       free(vtype);
+                       goto fail;
+               }
+               sprintf(handle, "%s", vtype);
+               free(vtype);
+       } else
+               free(vdi_type); /* was leaked when the node was absent */
+
+       for (i = 0; i < size; i++) {
+               /*
+                * Compare whole strings: the handle may have been replaced
+                * by the vdi-type value, so the length of the params prefix
+                * is not a meaningful comparison bound.
+                */
+               if (strcmp(handle, dtypes[i]->handle))
+                       continue;
+
+               if (dtypes[i]->idnum == -1)
+                       goto fail;
+
+               channel->drivertype = dtypes[i]->idnum;
+               return 0;
+       }
+
+fail:
+       EPRINTF("%s: invalid blktap params: %s\n",
+               channel->path, channel->params);
+       channel->vdi_path = NULL;
+       return -EINVAL;
+}
+
+/*
+ * Read the device's configuration from xenstore and derive the remaining
+ * channel fields from it: frontend path, frontend domain id, params and
+ * mode are gathered, then the params are parsed, the bus id is extracted
+ * from the path and the storage type is looked up.
+ *
+ * Returns 0 on success, or the error reported by xs_gather(),
+ * tapdisk_channel_parse_params() or tapdisk_channel_get_busid().
+ */
+static int
+tapdisk_channel_gather_info(tapdisk_channel_t *channel)
+{
+       int err;
+
+       /*
+        * NOTE(review): "frontend-id" is read with "%li" while
+        * channel->domid is declared unsigned int.  If xs_gather() hands
+        * the format to a scanf-style parser this stores a long through
+        * an int-sized field -- confirm against xs_gather().
+        */
+       err = xs_gather(channel->xsh, channel->path,
+                       "frontend", NULL, &channel->frontpath,
+                       "frontend-id", "%li", &channel->domid,
+                       "params", NULL, &channel->params,
+                       "mode", "%c", &channel->mode, NULL);
+       if (err) {
+               EPRINTF("could not find device info: %d\n", err);
+               return err;
+       }
+
+       err = tapdisk_channel_parse_params(channel);
+       if (err)
+               return err;
+
+       err = tapdisk_channel_get_busid(channel);
+       if (err)
+               return err;
+
+       /* best effort: leaves the default storage type on any failure */
+       tapdisk_channel_get_storage_type(channel);
+
+       return 0;
+}
+
+/*
+ * Check that xenstore describes a fresh start request for this channel:
+ * "<path>/start-tapdisk" must exist, while "<path>/shutdown-request",
+ * channel->shutdown_str and "<path>/shutdown-done" must not.
+ *
+ * Returns 0 if the request is valid, -EINVAL if it is not, and -ENOMEM
+ * if a xenstore path could not be allocated.
+ */
+static int
+tapdisk_channel_verify_start_request(tapdisk_channel_t *channel)
+{
+       char *node;
+
+       if (asprintf(&node, "%s/start-tapdisk", channel->path) == -1)
+               goto oom;
+       if (!xs_exists(channel->xsh, node))
+               goto invalid;
+       free(node);
+
+       if (asprintf(&node, "%s/shutdown-request", channel->path) == -1)
+               goto oom;
+       if (xs_exists(channel->xsh, node) ||
+           xs_exists(channel->xsh, channel->shutdown_str))
+               goto invalid;
+       free(node);
+
+       if (asprintf(&node, "%s/shutdown-done", channel->path) == -1)
+               goto oom;
+       if (xs_exists(channel->xsh, node))
+               goto invalid;
+       free(node);
+
+       return 0;
+
+invalid:
+       free(node);
+       EPRINTF("%s:%s: invalid start request\n", __func__, channel->path);
+       return -EINVAL;
+
+oom:
+       /* node is undefined after a failed asprintf(), so it is not freed */
+       EPRINTF("%s:%s: out of memory\n", __func__, channel->path);
+       return -ENOMEM;
+}
+
+/*
+ * Tear down a channel and free it.
+ *
+ * Sends a shutdown request if the channel is marked open, unregisters
+ * any watch whose node is still set, hands the channel to
+ * tapdisk_daemon_close_channel(), then releases the channel's strings
+ * and the channel itself.  @channel must not be used afterwards.
+ */
+void
+tapdisk_channel_close(tapdisk_channel_t *channel)
+{
+       if (channel->channel_id)
+               DPRINTF("%s: closing channel %d:%d\n",
+                       channel->path, channel->channel_id, channel->cookie);
+
+       if (channel->open)
+               tapdisk_channel_send_shutdown_request(channel);
+
+       if (channel->pause_watch.node) {
+               unregister_xenbus_watch(channel->xsh, &channel->pause_watch);
+               channel->pause_watch.node = NULL;
+       }
+
+       if (channel->shutdown_watch.node) {
+               unregister_xenbus_watch(channel->xsh, &channel->shutdown_watch);
+               channel->shutdown_watch.node = NULL;
+       }
+
+       tapdisk_daemon_close_channel(channel);
+
+       /*
+        * vdi_path is not freed: tapdisk_channel_parse_params() points it
+        * into params.  share_tapdisk_str is assigned a string literal
+        * and is not freed either.
+        */
+       free(channel->params);
+       free(channel->frontpath);
+       free(channel->shutdown_str);
+       free(channel->pause_done_str);
+       free(channel->pause_str);
+       free(channel->uuid_str);
+       free(channel->path);
+       free(channel);
+}
+
+/*
+ * Allocate a channel for the device at @path and bring it up.
+ *
+ * @_channel:  receives the new channel on success; set to NULL first
+ * @path:      xenstore path of the device (copied with strdup)
+ * @xsh:       xenstore handle stored in the channel
+ * @blktap_fd: blktap control fd stored in the channel
+ * @cookie:    identifier stored in the channel
+ *
+ * The steps run in order: init, uuid check, gather info, verify start
+ * request, set watches, connect.  The first failing step aborts the open.
+ *
+ * Returns 0 on success or the failing step's error code.
+ */
+int
+tapdisk_channel_open(tapdisk_channel_t **_channel,
+                    char *path, struct xs_handle *xsh,
+                    int blktap_fd, uint16_t cookie)
+{
+       int err;
+       char *msg;
+       tapdisk_channel_t *channel;
+
+       msg       = NULL;
+       *_channel = NULL;
+
+       channel = calloc(1, sizeof(tapdisk_channel_t));
+       if (!channel)
+               return -ENOMEM;
+
+       channel->xsh       = xsh;
+       channel->blktap_fd = blktap_fd;
+       channel->cookie    = cookie;
+       channel->state     = TAPDISK_CHANNEL_IDLE;
+
+       INIT_LIST_HEAD(&channel->list);
+
+       channel->path = strdup(path);
+       if (!channel->path) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = tapdisk_channel_init(channel);
+       if (err) {
+               msg = "allocating device";
+               goto fail;
+       }
+
+       err = tapdisk_channel_check_uuid(channel);
+       if (err) {
+               msg = "checking uuid";
+               goto fail;
+       }
+
+       err = tapdisk_channel_gather_info(channel);
+       if (err) {
+               msg = "gathering parameters";
+               goto fail;
+       }
+
+       err = tapdisk_channel_verify_start_request(channel);
+       if (err) {
+               msg = "invalid start request";
+               goto fail;
+       }
+
+       err = tapdisk_channel_set_watches(channel);
+       if (err) {
+               msg = "registering xenstore watches";
+               goto fail;
+       }
+
+       err = tapdisk_channel_connect(channel);
+       if (err) {
+               msg = "connecting to tapdisk";
+               goto fail;
+       }
+
+       *_channel = channel;
+       return 0;
+
+fail:
+       /*
+        * "msg ? : ..." is the GNU conditional with omitted middle operand.
+        * NOTE(review): the channel is not freed here; presumably
+        * tapdisk_channel_fatal() (defined outside this view) disposes of
+        * it -- confirm.
+        */
+       tapdisk_channel_fatal(channel, "%s: %d", (msg ? : "failure"), err);
+       return err;
+}
+
+/*
+ * Dispatch a message received from tapdisk to the handler for its type.
+ *
+ * The message is validated first; a validation failure or a type with no
+ * case below is reported through tapdisk_channel_fatal().
+ *
+ * Returns the handler's result, or -EINVAL for an invalid or unexpected
+ * message.
+ */
+int
+tapdisk_channel_receive_message(tapdisk_channel_t *c, tapdisk_message_t *m)
+{
+       int err;
+
+       err = tapdisk_channel_validate_message(c, m);
+       if (err)
+               goto fail;
+
+       switch (m->type) {
+       case TAPDISK_MESSAGE_PID_RSP:
+               return tapdisk_channel_receive_pid_response(c, m);
+
+       case TAPDISK_MESSAGE_OPEN_RSP:
+               return tapdisk_channel_receive_open_response(c, m);
+
+       case TAPDISK_MESSAGE_PAUSE_RSP:
+               return tapdisk_channel_receive_pause_response(c, m);
+
+       case TAPDISK_MESSAGE_RESUME_RSP:
+               return tapdisk_channel_receive_resume_response(c, m);
+
+       case TAPDISK_MESSAGE_CLOSE_RSP:
+               return tapdisk_channel_receive_shutdown_response(c, m);
+
+       case TAPDISK_MESSAGE_RUNTIME_ERROR:
+               return tapdisk_channel_receive_runtime_error(c, m);
+       }
+
+       /* reached by falling out of the switch as well as by the goto */
+fail:
+       tapdisk_channel_fatal(c, "received unexpected message %s in state %d",
+                             tapdisk_message_name(m->type), c->state);
+       return -EINVAL;
+}
diff --git a/tools/blktap2/daemon/tapdisk-daemon.c b/tools/blktap2/daemon/tapdisk-daemon.c
new file mode 100644 (file)
index 0000000..ecfc0f3
--- /dev/null
@@ -0,0 +1,599 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <xs.h>
+#include "disktypes.h"
+#include "tapdisk-dispatch.h"
+
+#define TAPDISK_DAEMON_DOMID_WATCH   "domid-watch"
+#define TAPDISK_DAEMON_PIDFILE       "/var/run/blktapctrl.pid"
+
+/* Process-wide state of the blktap daemon (single static instance). */
+typedef struct tapdisk_daemon {
+       /* xenstore path "/local/domain/<domid>/backend/tap"; heap-allocated */
+       char                         *node;
+       /* fd of the blktap control device opened in tapdisk_daemon_init() */
+       int                           blktap_fd;
+       /* next cookie to hand out; incremented for every probed device */
+       uint16_t                      cookie;
+
+       /* xenstore connection */
+       struct xs_handle             *xsh;
+       /* list of open tapdisk_channel_t, linked through their list member */
+       struct list_head              channels;
+       /* watch on node; its callback is tapdisk_daemon_probe() */
+       struct xenbus_watch           watch;
+} tapdisk_daemon_t;
+
+static tapdisk_daemon_t tapdisk_daemon;
+
+#define tapdisk_daemon_for_each_channel(c, tmp) \
+       list_for_each_entry_safe(c, tmp, &tapdisk_daemon.channels, list)
+
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+
+/* Log the daemon version followed by the name of every dtypes[] entry. */
+static void
+tapdisk_daemon_print_drivers(void)
+{
+       int idx;
+       const int count = sizeof(dtypes) / sizeof(disk_info_t *);
+
+       DPRINTF("blktap-daemon: v1.0.2\n");
+
+       for (idx = 0; idx < count; idx++)
+               DPRINTF("Found driver: [%s]\n", dtypes[idx]->name);
+}
+
+/*
+ * Create and lock the daemon pid file and write @pid into it.
+ *
+ * If the lock cannot be taken the process exits with status 0.  On
+ * success the descriptor is intentionally left open: closing it would
+ * release the lockf() lock.  On failure the descriptor is closed.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+tapdisk_daemon_write_pidfile(long pid)
+{
+       char buf[100];
+       int len, fd, flags, err;
+
+       fd = open(TAPDISK_DAEMON_PIDFILE, O_RDWR | O_CREAT, 0600);
+       if (fd == -1) {
+               /* capture errno before logging, which may overwrite it */
+               err = -errno;
+               EPRINTF("Opening pid file failed (%d)\n", -err);
+               return err;
+       }
+
+       /* We exit silently if daemon already running */
+       if (lockf(fd, F_TLOCK, 0) == -1)
+               exit(0);
+
+       /* Set FD_CLOEXEC, so that tapdisk doesn't get this file descriptor */
+       flags = fcntl(fd, F_GETFD);
+       if (flags == -1) {
+               err = -errno;
+               EPRINTF("F_GETFD failed (%d)\n", -err);
+               goto fail;
+       }
+
+       if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
+               err = -errno;
+               EPRINTF("F_SETFD failed (%d)\n", -err);
+               goto fail;
+       }
+
+       /*
+        * The file is opened without O_TRUNC so that a running daemon's
+        * file is not clobbered before the lock is held; now that we own
+        * the lock, drop any stale contents so a shorter pid does not
+        * leave trailing digits behind.
+        */
+       if (ftruncate(fd, 0) == -1) {
+               err = -errno;
+               EPRINTF("Truncating pid file failed (%d)\n", -err);
+               goto fail;
+       }
+
+       len = snprintf(buf, sizeof(buf), "%ld\n", pid);
+       err = write(fd, buf, len);
+       if (err != len) {
+               /* a short write does not set errno; report it as -EIO */
+               err = (err == -1 ? -errno : -EIO);
+               EPRINTF("Writing pid file failed (%d)\n", -err);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       close(fd);
+       return err;
+}
+
+/*
+ * Initialise the global daemon state: create and open the blktap control
+ * device node, connect to xenstore and set up the empty channel list.
+ *
+ * Returns 0 on success.  On failure the global state is zeroed again and
+ * a non-zero error is returned.
+ */
+static int
+tapdisk_daemon_init(void)
+{
+       char *devname;
+       int i, err, blktap_major;
+
+       memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
+
+       err = asprintf(&devname, "%s/%s0", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME);
+       if (err == -1) {
+               /* devname is undefined after failure; make free() below safe */
+               devname = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = xc_find_device_number("blktap0");
+       if (err < 0)
+               goto fail;
+
+       blktap_major = major(err);
+       err = make_blktap_device(devname, blktap_major, 0, S_IFCHR | 0600);
+       if (err)
+               goto fail;
+
+       tapdisk_daemon.blktap_fd = open(devname, O_RDWR);
+       if (tapdisk_daemon.blktap_fd == -1) {
+               err = -errno;
+               EPRINTF("blktap0 open failed\n");
+               goto fail;
+       }
+
+       /* up to two connection attempts, sleeping 2s after each failure */
+       for (i = 0; i < 2; i++) {
+               tapdisk_daemon.xsh = xs_daemon_open();
+               if (!tapdisk_daemon.xsh) {
+                       EPRINTF("xs_daemon_open failed -- is xenstore running?\n");
+                       sleep(2);
+               } else
+                       break;
+       }
+
+       if (!tapdisk_daemon.xsh) {
+               err = -ENOSYS;
+               goto fail;
+       }
+
+       INIT_LIST_HEAD(&tapdisk_daemon.channels);
+
+       free(devname);
+       return 0;
+
+fail:
+       /*
+        * blktap_fd is 0 (from the memset) until open() is attempted and
+        * -1 if it failed, so "> 0" means a descriptor was obtained.
+        */
+       if (tapdisk_daemon.blktap_fd > 0)
+               close(tapdisk_daemon.blktap_fd);
+       free(devname);
+       memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
+       EPRINTF("%s: %d\n", __func__, err);
+
+       return err;
+}
+
+/*
+ * Build tapdisk_daemon.node ("/local/domain/<domid>/backend/tap") from
+ * the domain id reported by get_dom_domid().
+ *
+ * Returns 0 on success, -EAGAIN if the domain id is not available, or
+ * -ENOMEM if the path cannot be allocated (node is then NULL).
+ */
+static int
+tapdisk_daemon_set_node(void)
+{
+       int rc;
+       char *id;
+
+       id = get_dom_domid(tapdisk_daemon.xsh);
+       if (!id)
+               return -EAGAIN;
+
+       rc = 0;
+       if (asprintf(&tapdisk_daemon.node,
+                    "/local/domain/%s/backend/tap", id) == -1) {
+               tapdisk_daemon.node = NULL;
+               rc = -ENOMEM;
+       }
+
+       free(id);
+       return rc;
+}
+
+/*
+ * Consume one pending xenstore watch event and, if it carries the domid
+ * watch token, try to set the daemon node.
+ *
+ * Returns -EAGAIN when no event could be read, -EINVAL when the event
+ * belongs to another token, otherwise the result of
+ * tapdisk_daemon_set_node().
+ */
+static int
+tapdisk_daemon_get_domid(void)
+{
+       int rc;
+       unsigned int count;
+       char **event;
+
+       event = xs_read_watch(tapdisk_daemon.xsh, &count);
+       if (!event)
+               return -EAGAIN;
+
+       if (strcmp(event[XS_WATCH_TOKEN], TAPDISK_DAEMON_DOMID_WATCH) == 0)
+               rc = tapdisk_daemon_set_node();
+       else
+               rc = -EINVAL;
+
+       free(event);
+       return rc;
+}
+
+/*
+ * Block until tapdisk_daemon.node can be set.
+ *
+ * Tries tapdisk_daemon_set_node() directly first; if that fails, watches
+ * "/local/domain" and retries on every watch event for as long as the
+ * attempt returns -EAGAIN.
+ *
+ * Returns 0 on success, -EINVAL if the watch cannot be set, or the first
+ * error other than -EAGAIN from tapdisk_daemon_get_domid().
+ */
+static int
+tapdisk_daemon_wait_for_domid(void)
+{
+       int err;
+       char *domid;
+       fd_set readfds;
+
+       err = tapdisk_daemon_set_node();
+       if (!err)
+               return 0;
+
+       if (!xs_watch(tapdisk_daemon.xsh, "/local/domain",
+                     TAPDISK_DAEMON_DOMID_WATCH)) {
+               EPRINTF("unable to set domain id watch\n");
+               return -EINVAL;
+       }
+
+       do {
+               FD_ZERO(&readfds);
+               FD_SET(xs_fileno(tapdisk_daemon.xsh), &readfds);
+
+               /* no timeout; the return value of select() is not checked */
+               select(xs_fileno(tapdisk_daemon.xsh) + 1,
+                      &readfds, NULL, NULL, NULL);
+
+               if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), &readfds))
+                       err = tapdisk_daemon_get_domid();
+               else
+                       err = -EAGAIN;
+       } while (err == -EAGAIN);
+
+       xs_unwatch(tapdisk_daemon.xsh,
+                  "/local/domain", TAPDISK_DAEMON_DOMID_WATCH);
+       return err;
+}
+
+/* Return 1 if @node is exactly "start-tapdisk", 0 otherwise. */
+static inline int
+tapdisk_daemon_new_vbd_event(const char *node)
+{
+       return strcmp(node, "start-tapdisk") == 0;
+}
+
+/*
+ * Write @uuid in decimal to the "<path>/tapdisk-uuid" xenstore node.
+ *
+ * Returns 0 on success, -ENOMEM if the node path cannot be allocated,
+ * or -errno if xs_write() reports failure.
+ */
+static int
+tapdisk_daemon_write_uuid(char *path, uint32_t uuid)
+{
+       int ok;
+       char *key, value[12];
+
+       snprintf(value, sizeof(value), "%u", uuid);
+
+       if (asprintf(&key, "%s/tapdisk-uuid", path) == -1)
+               return -ENOMEM;
+
+       ok = xs_write(tapdisk_daemon.xsh, XBT_NULL,
+                     key, value, strlen(value));
+       free(key);
+
+       if (ok)
+               return 0;
+       return -errno;
+}
+
+/*
+ * Watch callback for the backend node: open a channel when a
+ * "start-tapdisk" node appears.
+ *
+ * @path is the xenstore path that fired.  Only paths whose component
+ * after the 8th '/' is "start-tapdisk", and which still exist, are acted
+ * on.  The device path (everything before that component) gets a fresh
+ * cookie written to its "tapdisk-uuid" node and a channel is opened for
+ * it; a successfully opened channel is added to the daemon's list.
+ */
+static void
+tapdisk_daemon_probe(struct xs_handle *xsh,
+                    struct xenbus_watch *watch, const char *path)
+{
+       char *cpath;
+       int len, err;
+       uint32_t cookie;
+       const char *node;
+       tapdisk_channel_t *channel;
+
+       len = strsep_len(path, '/', 7);
+       if (len < 0)
+               return;
+
+       node = path + len + 1;
+
+       if (!tapdisk_daemon_new_vbd_event(node))
+               return;
+
+       if (!xs_exists(xsh, path))
+               return;
+
+       cpath = strdup(path);
+       if (!cpath) {
+               EPRINTF("failed to allocate control path for %s\n", path);
+               return;
+       }
+       /* cut the copy at the separator to leave only the device path */
+       cpath[len] = '\0';
+
+       /* the daemon counter is uint16_t, so cookies wrap at 65536 */
+       cookie = tapdisk_daemon.cookie++;
+       err    = tapdisk_daemon_write_uuid(cpath, cookie);
+       if (err)
+               goto out;
+
+       DPRINTF("%s: got watch on %s, uuid = %u\n", __func__, path, cookie);
+
+       err = tapdisk_channel_open(&channel, cpath,
+                                  tapdisk_daemon.xsh,
+                                  tapdisk_daemon.blktap_fd,
+                                  cookie);
+       if (!err)
+               list_add(&channel->list, &tapdisk_daemon.channels);
+       else
+               EPRINTF("failed to open tapdisk channel for %s: %d\n",
+                       path, err);
+
+out:
+       /* tapdisk_channel_open() keeps its own strdup of the path */
+       free(cpath);
+}
+
+/*
+ * Start servicing devices: wait for the daemon node to be known, register
+ * the probe watch on it, then switch the blktap device to interpose mode
+ * and send it the daemon's pid.
+ *
+ * Returns 0 on success, or the error from waiting for the domain id or
+ * from registering the watch.
+ */
+static int
+tapdisk_daemon_start(void)
+{
+       int err;
+
+       err = tapdisk_daemon_wait_for_domid();
+       if (err)
+               return err;
+
+       tapdisk_daemon.watch.node     = tapdisk_daemon.node;
+       tapdisk_daemon.watch.callback = tapdisk_daemon_probe;
+
+       err = register_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
+       if (err)
+               goto fail;
+
+       /* the results of both ioctls are ignored */
+       ioctl(tapdisk_daemon.blktap_fd,
+             BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+       ioctl(tapdisk_daemon.blktap_fd, BLKTAP_IOCTL_SENDPID, getpid());
+
+       return 0;
+
+fail:
+       free(tapdisk_daemon.node);
+       tapdisk_daemon.node       = NULL;
+       tapdisk_daemon.watch.node = NULL;
+       EPRINTF("%s: %d\n", __func__, err);
+       return err;
+}
+
+/*
+ * Undo tapdisk_daemon_start(): unregister the probe watch, switch the
+ * blktap device back to passthrough mode and close it.  Always returns 0.
+ */
+static int
+tapdisk_daemon_stop(void)
+{
+       unregister_xenbus_watch(tapdisk_daemon.xsh, &tapdisk_daemon.watch);
+
+       ioctl(tapdisk_daemon.blktap_fd,
+             BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH);
+       close(tapdisk_daemon.blktap_fd);
+
+       return 0;
+}
+
+/*
+ * Release the daemon node string, close the xenstore connection and zero
+ * the global daemon state.
+ */
+static void
+tapdisk_daemon_free(void)
+{
+       free(tapdisk_daemon.node);
+       xs_daemon_close(tapdisk_daemon.xsh);
+       memset(&tapdisk_daemon, 0, sizeof(tapdisk_daemon_t));
+}
+
+/*
+ * Read exactly one tapdisk_message_t from @fd into @message.
+ *
+ * @timeout is the select() timeout in seconds.  The message is zeroed
+ * first and then filled by as many read() calls as are needed.
+ *
+ * Returns 0 once the whole message has been read, or -EIO if select()
+ * fails or times out, or read() fails or reports end of file first.
+ */
+static int
+tapdisk_daemon_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set readfds;
+       struct timeval tv;
+       int ret, len, offset;
+       char *buf;
+
+       tv.tv_sec  = timeout;
+       tv.tv_usec = 0;
+       offset     = 0;
+       len        = sizeof(tapdisk_message_t);
+
+       /*
+        * offset counts bytes, so it must be applied to a byte pointer;
+        * adding it to the message pointer itself would advance by whole
+        * messages and write outside the buffer after a partial read.
+        */
+       buf        = (char *)message;
+
+       memset(message, 0, sizeof(tapdisk_message_t));
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, &readfds, NULL, NULL, &tv);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &readfds)) {
+                       ret = read(fd, buf + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       return (offset == len ? 0 : -EIO);
+}
+
+/*
+ * Read one message from @fd (2 second timeout) and deliver it to the
+ * channel whose cookie matches the message and whose read_fd is @fd.
+ *
+ * Returns the read error, the channel handler's result, or -EINVAL when
+ * no channel matches.
+ */
+static int
+tapdisk_daemon_receive_message(int fd)
+{
+       int err;
+       tapdisk_message_t m;
+       tapdisk_channel_t *c, *tmp;
+
+       err = tapdisk_daemon_read_message(fd, &m, 2);
+       if (err) {
+               EPRINTF("failed reading message on %d: %d\n", fd, err);
+               return err;
+       }
+
+       /* both cookie and fd must match: channels may share a read_fd */
+       tapdisk_daemon_for_each_channel(c, tmp)
+               if (c->cookie == m.cookie && c->read_fd == fd) {
+                       DPRINTF("got '%s' message from %d:%d\n",
+                               tapdisk_message_name(m.type),
+                               c->channel_id, c->cookie);
+
+                       return tapdisk_channel_receive_message(c, &m);
+               }
+
+       EPRINTF("unrecognized message on %d: '%s' (uuid = %u)\n",
+               fd, tapdisk_message_name(m.type), m.cookie);
+
+       return -EINVAL;
+}
+
+/*
+ * Fill @readfds with the xenstore fd and the read_fd of every channel.
+ * Returns the highest descriptor placed in the set.
+ */
+static int
+tapdisk_daemon_set_fds(fd_set *readfds)
+{
+       int highest;
+       tapdisk_channel_t *c, *next;
+
+       highest = xs_fileno(tapdisk_daemon.xsh);
+
+       FD_ZERO(readfds);
+       FD_SET(highest, readfds);
+
+       tapdisk_daemon_for_each_channel(c, next) {
+               FD_SET(c->read_fd, readfds);
+               if (c->read_fd > highest)
+                       highest = c->read_fd;
+       }
+
+       return highest;
+}
+
+/*
+ * Service the descriptors flagged in @readfds: fire the next xenstore
+ * watch if the xenstore fd is readable, then handle one message from the
+ * first channel whose read_fd is readable.
+ *
+ * Returns that message's result, or 0 if no channel was readable.
+ */
+static int
+tapdisk_daemon_check_fds(fd_set *readfds)
+{
+       tapdisk_channel_t *c, *next;
+
+       if (FD_ISSET(xs_fileno(tapdisk_daemon.xsh), readfds))
+               xs_fire_next_watch(tapdisk_daemon.xsh);
+
+       tapdisk_daemon_for_each_channel(c, next) {
+               if (!FD_ISSET(c->read_fd, readfds))
+                       continue;
+
+               /* only one channel is serviced per call */
+               return tapdisk_daemon_receive_message(c->read_fd);
+       }
+
+       return 0;
+}
+
+/*
+ * Main event loop: wait for activity on the xenstore fd or any channel
+ * fd and service it.
+ *
+ * The loop has no exit condition, so the return statement below is never
+ * reached; errors from select() and from servicing are not acted on.
+ */
+static int
+tapdisk_daemon_run(void)
+{
+       int err, max;
+       fd_set readfds;
+
+       while (1) {
+               /* rebuilt every pass because the channel list can change */
+               max = tapdisk_daemon_set_fds(&readfds);
+
+               err = select(max + 1, &readfds, NULL, NULL, NULL);
+               if (err < 0)
+                       continue;
+
+               err = tapdisk_daemon_check_fds(&readfds);
+       }
+
+       return err;
+}
+
+/*
+ * Decide whether @channel may reuse an existing tapdisk process.
+ *
+ * read_fd, write_fd and tapdisk_pid are reset to 0 first.  Sharing is
+ * enabled only if the xenstore node named by channel->share_tapdisk_str
+ * exists; in that case the fds, channel_id and pid of the first listed
+ * channel with the same drivertype are copied into @channel.  If none
+ * matches, those fields stay 0 and channel->shared stays 1.
+ */
+void
+tapdisk_daemon_find_channel(tapdisk_channel_t *channel)
+{
+       tapdisk_channel_t *c, *tmp;
+
+       channel->read_fd     = 0;
+       channel->write_fd    = 0;
+       channel->tapdisk_pid = 0;
+
+       /* do we want multiple vbds per tapdisk? */
+       if (!xs_exists(tapdisk_daemon.xsh, channel->share_tapdisk_str)) {
+               channel->shared = 0;
+               return;
+       }
+
+       channel->shared = 1;
+
+       /* check if we already have a process started */
+       tapdisk_daemon_for_each_channel(c, tmp)
+               if (c->drivertype == channel->drivertype) {
+                       channel->write_fd    = c->write_fd;
+                       channel->read_fd     = c->read_fd;
+                       channel->channel_id  = c->channel_id;
+                       channel->tapdisk_pid = c->tapdisk_pid;
+                       return;
+               }
+}
+
+/*
+ * Unlink @channel from the daemon's list and close its fds, unless
+ * another listed channel has the same channel_id, in which case the fds
+ * are left open for that channel.
+ */
+void
+tapdisk_daemon_close_channel(tapdisk_channel_t *channel)
+{
+       tapdisk_channel_t *c, *tmp;
+
+       list_del(&channel->list);
+
+       /* @channel is already unlinked, so only other channels are seen */
+       tapdisk_daemon_for_each_channel(c, tmp)
+               if (c->channel_id == channel->channel_id)
+                       return;
+
+       close(channel->read_fd);
+       close(channel->write_fd);
+}
+
+/*
+ * Daemon entry point: daemonise, raise the core-dump limit, open the
+ * log, take the pid file, then initialise, start and run the daemon.
+ *
+ * Returns 0 on a clean shutdown or the error of the step that failed.
+ */
+int
+main(int argc, char *argv[])
+{
+       int err;
+       char buf[128];
+
+       if (daemon(0, 0)) {
+         EPRINTF("daemon() failed (%d)\n", errno);
+         return -errno;
+       }
+
+       /* CORE_DUMP is defined unconditionally, so this is always built */
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+#include <sys/resource.h>
+       {
+               /* set up core-dumps*/
+               struct rlimit rlim;
+               rlim.rlim_cur = RLIM_INFINITY;
+               rlim.rlim_max = RLIM_INFINITY;
+               if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+                       EPRINTF("setrlimit failed: %d\n", errno);
+       }
+#endif
+
+       snprintf(buf, sizeof(buf), "BLKTAP-DAEMON[%d]", getpid());
+       openlog(buf, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+
+       /* getpid() is the daemonised process here: daemon() already ran */
+       err = tapdisk_daemon_write_pidfile(getpid());
+       if (err)
+               goto out;
+
+       tapdisk_daemon_print_drivers();
+
+       err = tapdisk_daemon_init();
+       if (err)
+               goto out;
+
+       err = tapdisk_daemon_start();
+       if (err)
+               goto out;
+
+       /* tapdisk_daemon_run() loops forever as written */
+       tapdisk_daemon_run();
+
+       tapdisk_daemon_stop();
+       tapdisk_daemon_free();
+
+       err = 0;
+
+out:
+       if (err)
+               EPRINTF("failed to start %s: %d\n", argv[0], err);
+       closelog();
+       return err;
+}
diff --git a/tools/blktap2/daemon/tapdisk-dispatch-common.c b/tools/blktap2/daemon/tapdisk-dispatch-common.c
new file mode 100644 (file)
index 0000000..3d72b7d
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * (c) 2005 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "tapdisk-dispatch.h"
+
+/*
+ * Locate a separator in @str, skipping the first @len occurrences of @c.
+ *
+ * Returns the index of occurrence number @len (counting from 0).  If the
+ * string ends after exactly @len occurrences, the string length is
+ * returned; if it ends with fewer, -ERANGE is returned.
+ */
+int
+strsep_len(const char *str, char c, unsigned int len)
+{
+       const char *p;
+
+       for (p = str; *p != '\0'; p++) {
+               if (*p != c)
+                       continue;
+               if (!len)
+                       return p - str;
+               --len;
+       }
+
+       return len ? -ERANGE : (int)(p - str);
+}
+
+/*
+ * (Re)create the device node @devname with the given major/minor numbers.
+ *
+ * Any existing node is unlinked first and BLKTAP_DEV_DIR is created if
+ * missing.  @perm is passed to mknod() as the mode (file type and
+ * permission bits).
+ *
+ * Returns 0 on success or a negative errno value.  When mknod() fails,
+ * some extra diagnostics about @devname are logged.
+ */
+int
+make_blktap_device(char *devname, int major, int minor, int perm)
+{
+       int err;
+
+       err = unlink(devname);
+       if (err && errno != ENOENT) {
+               /* capture errno before logging, which may overwrite it */
+               err = -errno;
+               EPRINTF("unlink %s failed: %d\n", devname, -err);
+               return err;
+       }
+
+       /* Need to create device */
+       err = mkdir(BLKTAP_DEV_DIR, 0755);
+       if (err && errno != EEXIST) {
+               err = -errno;
+               EPRINTF("Failed to create %s directory\n", BLKTAP_DEV_DIR);
+               return err;
+       }
+
+       err = mknod(devname, perm, makedev(major, minor));
+       if (err) {
+               int ret = -errno;
+               struct stat st;
+
+               EPRINTF("mknod %s failed: %d\n", devname, ret);
+
+               /* diagnostics only; ret keeps the mknod() error */
+               err = lstat(devname, &st);
+               if (err) {
+                       DPRINTF("lstat %s failed: %d\n", devname, -errno);
+                       err = access(devname, F_OK);
+                       if (err)
+                               DPRINTF("access %s failed: %d\n", devname, -errno);
+                       else
+                               DPRINTF("access %s succeeded\n", devname);
+               } else
+                       DPRINTF("lstat %s: %u:%u\n", devname,
+                               (unsigned int)st.st_rdev >> 8,
+                               (unsigned int)st.st_rdev & 0xff);
+
+               return ret;
+       }
+
+       DPRINTF("Created %s device\n", devname);
+       return 0;
+}
diff --git a/tools/blktap2/daemon/tapdisk-dispatch.h b/tools/blktap2/daemon/tapdisk-dispatch.h
new file mode 100644 (file)
index 0000000..bcd1e9d
--- /dev/null
@@ -0,0 +1,95 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_DISPATCH_H_
+#define _TAPDISK_DISPATCH_H_
+
+#include "xs_api.h"
+#include "blktaplib.h"
+#include "tapdisk-message.h"
+
+/*
+ * State of one virtual block device managed by the daemon, including the
+ * descriptors used to talk to its tapdisk process.
+ */
+struct tapdisk_channel {
+       /* initialised to TAPDISK_CHANNEL_IDLE by tapdisk_channel_open() */
+       int                       state;
+
+       /*
+        * read_fd/write_fd/channel_id may be copied from another channel
+        * when tapdisk sharing is enabled (tapdisk_daemon_find_channel());
+        * blktap_fd is the daemon's blktap control descriptor.
+        */
+       int                       read_fd;
+       int                       write_fd;
+       int                       blktap_fd;
+       int                       channel_id;
+
+       /* mode is the single character read from the "mode" xenstore node */
+       char                      mode;
+       char                      shared;
+       char                      open;
+       /* domid comes from "frontend-id"; busid is parsed from path */
+       unsigned int              domid;
+       unsigned int              busid;
+       unsigned int              major;
+       unsigned int              minor;
+       /* one of the TAPDISK_STORAGE_TYPE_* values */
+       unsigned int              storage;
+       /* idnum of the matching dtypes[] entry */
+       unsigned int              drivertype;
+       /* compared with the cookie of incoming messages */
+       uint16_t                  cookie;
+       pid_t                     tapdisk_pid;
+
+       /*
+        * special accounting needed to handle pause
+        * requests received before tapdisk process is ready
+        */
+       char                      connected;
+       char                      pause_needed;
+
+       /*
+        * Strings released by tapdisk_channel_close(), except vdi_path,
+        * which points into params, and share_tapdisk_str, which is
+        * assigned a string literal.
+        */
+       char                     *path;
+       char                     *frontpath;
+       char                     *params;
+       char                     *vdi_path;
+       char                     *uuid_str;
+       char                     *pause_str;
+       char                     *pause_done_str;
+       char                     *shutdown_str;
+       char                     *share_tapdisk_str;
+
+       image_t                   image;
+
+       /* links the channel into the daemon's channel list */
+       struct list_head          list;
+       /* a non-NULL node member is used as the "registered" flag */
+       struct xenbus_watch       pause_watch;
+       struct xenbus_watch       shutdown_watch;
+
+       struct xs_handle         *xsh;
+};
+
+typedef struct tapdisk_channel tapdisk_channel_t;
+
+int strsep_len(const char *str, char c, unsigned int len);
+int make_blktap_device(char *devname, int major, int minor, int perm);
+
+int tapdisk_channel_open(tapdisk_channel_t **,
+                        char *node, struct xs_handle *,
+                        int blktap_fd, uint16_t cookie);
+void tapdisk_channel_close(tapdisk_channel_t *);
+
+void tapdisk_daemon_find_channel(tapdisk_channel_t *);
+void tapdisk_daemon_close_channel(tapdisk_channel_t *);
+
+int tapdisk_channel_receive_message(tapdisk_channel_t *, tapdisk_message_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
new file mode 100644 (file)
index 0000000..90cd6be
--- /dev/null
@@ -0,0 +1,108 @@
+# Builds the tapdisk binaries plus the qcow and lock helper utilities.
+XEN_ROOT=../../../
+BLKTAP_ROOT= ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+LIBVHDDIR  = $(BLKTAP_ROOT)/vhd/lib
+# Defined ahead of its first use in CFLAGS below: if an included makefile ever
+# makes CFLAGS simply expanded (:=), a later definition would leave "-I" empty.
+LIBAIO_DIR = $(XEN_ROOT)/tools/libaio/src
+
+IBIN       = tapdisk tapdisk2 td-util tapdisk-client tapdisk-stream tapdisk-diff
+QCOW_UTIL  = img2qcow qcow-create qcow2raw
+LOCK_UTIL  = lock-util
+INST_DIR   = $(SBINDIR)
+
+CFLAGS    += -Werror -g -O0
+CFLAGS    += -Wno-unused
+CFLAGS    += -fno-strict-aliasing
+CFLAGS    += -I../lib -I../../libxc
+CFLAGS    += -I../include -I../../include
+CFLAGS    += -I $(LIBAIO_DIR)
+CFLAGS    += -D_GNU_SOURCE
+CFLAGS    += -DUSE_NFS_LOCKS
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+LIBS      += -lrt -lz
+
+ifeq ($(shell . ./check_gcrypt $(CC)),yes)
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB += -lgcrypt
+else
+CRYPT_LIB += -lcrypto
+$(warning === libgcrypt not installed: falling back to libcrypto ===)
+endif
+
+LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz
+
+tapdisk tapdisk2 td-util tapdisk-stream tapdisk-diff $(QCOW_UTIL): LIBS += -L$(LIBVHDDIR) -lvhd -luuid
+
+tapdisk tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := $(LIBAIO_DIR)/libaio.a
+tapdisk tapdisk-client tapdisk-stream tapdisk-diff $(QCOW_UTIL): CFLAGS  += -I$(LIBAIO_DIR) -I$(XEN_LIBXC)
+
+ifeq ($(VHD_STATIC),y)
+td-util: CFLAGS += -static
+endif
+
+TAP-OBJS-y  := scheduler.o
+TAP-OBJS-y  += tapdisk-ipc.o
+TAP-OBJS-y  += tapdisk-vbd.o
+TAP-OBJS-y  += tapdisk-image.o
+TAP-OBJS-y  += tapdisk-driver.o
+TAP-OBJS-y  += tapdisk-interface.o
+TAP-OBJS-y  += tapdisk-server.o
+TAP-OBJS-y  += tapdisk-queue.o
+TAP-OBJS-y  += tapdisk-filter.o
+TAP-OBJS-y  += tapdisk-log.o
+TAP-OBJS-y  += tapdisk-utils.o
+TAP-OBJS-y  += io-optimize.o
+TAP-OBJS-y  += lock.o
+TAP-OBJS-$(CONFIG_Linux)  += blk_linux.o
+
+MISC-OBJS-y := atomicio.o
+
+BLK-OBJS-y  := block-aio.o
+BLK-OBJS-y  += block-ram.o
+BLK-OBJS-y  += block-cache.o
+BLK-OBJS-y  += block-vhd.o
+BLK-OBJS-y  += block-log.o
+BLK-OBJS-y  += block-qcow.o
+BLK-OBJS-y  += aes.o
+
+all: $(IBIN) lock-util qcow-util
+
+tapdisk: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk.c
+       $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS)  $(LDFLAGS_img)
+
+tapdisk2: $(TAP-OBJS-y) $(BLK-OBJS-y) $(MISC-OBJS-y) tapdisk2.c
+       $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+tapdisk-client: tapdisk-client.o
+       $(CC) $(CFLAGS) -o $@ $^ $(LIBS)  $(LDFLAGS_img)
+
+tapdisk-stream tapdisk-diff: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+       $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS)  $(LDFLAGS_img)
+
+td-util: td.o tapdisk-utils.o tapdisk-log.o
+       $(CC) $(CFLAGS) -o $@ $^ $(LIBS)  $(LDFLAGS_img)
+
+lock-util: lock.c
+       $(CC) $(CFLAGS) -DUTIL -o lock-util lock.c $(LIBS)
+
+.PHONY: qcow-util
+qcow-util: img2qcow qcow2raw qcow-create
+
+img2qcow qcow2raw qcow-create: %: %.o $(TAP-OBJS-y) $(BLK-OBJS-y)
+       $(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(AIOLIBS) $(LDFLAGS_img)
+
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(LOCK_UTIL) $(QCOW_UTIL) $(DESTDIR)$(INST_DIR)
+
+clean:
+       rm -rf *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) $(QCOW_UTIL)
+
+.PHONY: all clean install
diff --git a/tools/blktap2/drivers/aes.c b/tools/blktap2/drivers/aes.c
new file mode 100644 (file)
index 0000000..ea81ae5
--- /dev/null
@@ -0,0 +1,1319 @@
+/**\r
+ * \r
+ * aes.c - integrated in QEMU by Fabrice Bellard from the OpenSSL project.\r
+ */\r
+/*\r
+ * rijndael-alg-fst.c\r
+ *\r
+ * @version 3.0 (December 2000)\r
+ *\r
+ * Optimised ANSI C code for the Rijndael cipher (now AES)\r
+ *\r
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>\r
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>\r
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>\r
+ *\r
+ * This code is hereby placed in the public domain.\r
+ *\r
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS\r
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\r
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE\r
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\r
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\r
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR\r
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,\r
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE\r
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,\r
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ */\r
+//#include "vl.h"\r
+#include <inttypes.h>\r
+#include <string.h>\r
+#include "aes.h"\r
+\r
+//#define NDEBUG\r
+#include <assert.h>\r
+\r
+typedef uint32_t u32;\r
+typedef uint16_t u16;\r
+typedef uint8_t u8;\r
+\r
+#define MAXKC   (256/32)\r
+#define MAXKB   (256/8)\r
+#define MAXNR   14\r
+\r
+/* This controls loop-unrolling in this file (named aes_core.c upstream in OpenSSL) */\r
+#undef FULL_UNROLL\r
+# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))\r
+# define PUTU32(ct, st) do { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); } while (0) /* big-endian store; do/while keeps it a single statement */\r
+\r
+/*\r
+Te0[x] = S [x].[02, 01, 01, 03];\r
+Te1[x] = S [x].[03, 02, 01, 01];\r
+Te2[x] = S [x].[01, 03, 02, 01];\r
+Te3[x] = S [x].[01, 01, 03, 02];\r
+Te4[x] = S [x].[01, 01, 01, 01];\r
+\r
+Td0[x] = Si[x].[0e, 09, 0d, 0b];\r
+Td1[x] = Si[x].[0b, 0e, 09, 0d];\r
+Td2[x] = Si[x].[0d, 0b, 0e, 09];\r
+Td3[x] = Si[x].[09, 0d, 0b, 0e];\r
+Td4[x] = Si[x].[01, 01, 01, 01];\r
+*/\r
+\r
+static const u32 Te0[256] = {\r
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,\r
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,\r
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,\r
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,\r
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,\r
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,\r
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,\r
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,\r
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,\r
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,\r
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,\r
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,\r
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,\r
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,\r
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,\r
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,\r
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,\r
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,\r
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,\r
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,\r
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,\r
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,\r
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,\r
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,\r
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,\r
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,\r
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,\r
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,\r
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,\r
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,\r
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,\r
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,\r
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,\r
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,\r
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,\r
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,\r
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,\r
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,\r
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,\r
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,\r
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,\r
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,\r
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,\r
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,\r
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,\r
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,\r
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,\r
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,\r
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,\r
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,\r
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,\r
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,\r
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,\r
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,\r
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,\r
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,\r
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,\r
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,\r
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,\r
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,\r
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,\r
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,\r
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,\r
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,\r
+};\r
+static const u32 Te1[256] = {\r
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,\r
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,\r
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,\r
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,\r
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,\r
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,\r
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,\r
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,\r
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,\r
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,\r
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,\r
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,\r
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,\r
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,\r
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,\r
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,\r
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,\r
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,\r
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,\r
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,\r
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,\r
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,\r
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,\r
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,\r
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,\r
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,\r
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,\r
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,\r
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,\r
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,\r
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,\r
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,\r
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,\r
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,\r
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,\r
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,\r
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,\r
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,\r
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,\r
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,\r
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,\r
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,\r
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,\r
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,\r
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,\r
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,\r
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,\r
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,\r
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,\r
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,\r
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,\r
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,\r
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,\r
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,\r
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,\r
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,\r
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,\r
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,\r
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,\r
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,\r
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,\r
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,\r
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,\r
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,\r
+};\r
+static const u32 Te2[256] = {\r
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,\r
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,\r
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,\r
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,\r
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,\r
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,\r
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,\r
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,\r
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,\r
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,\r
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,\r
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,\r
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,\r
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,\r
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,\r
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,\r
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,\r
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,\r
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,\r
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,\r
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,\r
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,\r
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,\r
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,\r
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,\r
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,\r
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,\r
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,\r
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,\r
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,\r
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,\r
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,\r
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,\r
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,\r
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,\r
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,\r
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,\r
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,\r
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,\r
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,\r
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,\r
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,\r
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,\r
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,\r
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,\r
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,\r
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,\r
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,\r
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,\r
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,\r
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,\r
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,\r
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,\r
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,\r
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,\r
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,\r
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,\r
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,\r
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,\r
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,\r
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,\r
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,\r
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,\r
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,\r
+};\r
+static const u32 Te3[256] = {\r
+\r
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,\r
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,\r
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,\r
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,\r
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,\r
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,\r
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,\r
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,\r
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,\r
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,\r
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,\r
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,\r
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,\r
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,\r
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,\r
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,\r
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,\r
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,\r
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,\r
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,\r
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,\r
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,\r
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,\r
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,\r
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,\r
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,\r
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,\r
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,\r
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,\r
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,\r
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,\r
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,\r
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,\r
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,\r
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,\r
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,\r
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,\r
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,\r
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,\r
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,\r
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,\r
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,\r
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,\r
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,\r
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,\r
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,\r
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,\r
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,\r
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,\r
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,\r
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,\r
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,\r
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,\r
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,\r
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,\r
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,\r
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,\r
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,\r
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,\r
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,\r
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,\r
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,\r
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,\r
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,\r
+};\r
+static const u32 Te4[256] = {\r
+    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,\r
+    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,\r
+    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,\r
+    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,\r
+    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,\r
+    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,\r
+    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,\r
+    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,\r
+    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,\r
+    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,\r
+    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,\r
+    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,\r
+    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,\r
+    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,\r
+    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,\r
+    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,\r
+    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,\r
+    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,\r
+    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,\r
+    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,\r
+    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,\r
+    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,\r
+    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,\r
+    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,\r
+    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,\r
+    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,\r
+    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,\r
+    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,\r
+    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,\r
+    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,\r
+    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,\r
+    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,\r
+    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,\r
+    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,\r
+    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,\r
+    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,\r
+    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,\r
+    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,\r
+    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,\r
+    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,\r
+    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,\r
+    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,\r
+    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,\r
+    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,\r
+    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,\r
+    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,\r
+    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,\r
+    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,\r
+    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,\r
+    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,\r
+    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,\r
+    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,\r
+    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,\r
+    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,\r
+    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,\r
+    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,\r
+    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,\r
+    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,\r
+    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,\r
+    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,\r
+    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,\r
+    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,\r
+    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,\r
+    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,\r
+};\r
+static const u32 Td0[256] = {\r
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,\r
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,\r
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,\r
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,\r
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,\r
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,\r
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,\r
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,\r
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,\r
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,\r
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,\r
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,\r
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,\r
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,\r
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,\r
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,\r
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,\r
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,\r
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,\r
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,\r
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,\r
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,\r
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,\r
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,\r
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,\r
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,\r
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,\r
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,\r
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,\r
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,\r
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,\r
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,\r
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,\r
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,\r
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,\r
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,\r
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,\r
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,\r
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,\r
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,\r
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,\r
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,\r
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,\r
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,\r
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,\r
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,\r
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,\r
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,\r
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,\r
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,\r
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,\r
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,\r
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,\r
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,\r
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,\r
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,\r
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,\r
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,\r
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,\r
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,\r
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,\r
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,\r
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,\r
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,\r
+};\r
+static const u32 Td1[256] = {\r
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,\r
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,\r
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,\r
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,\r
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,\r
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,\r
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,\r
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,\r
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,\r
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,\r
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,\r
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,\r
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,\r
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,\r
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,\r
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,\r
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,\r
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,\r
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,\r
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,\r
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,\r
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,\r
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,\r
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,\r
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,\r
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,\r
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,\r
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,\r
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,\r
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,\r
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,\r
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,\r
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,\r
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,\r
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,\r
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,\r
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,\r
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,\r
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,\r
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,\r
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,\r
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,\r
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,\r
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,\r
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,\r
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,\r
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,\r
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,\r
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,\r
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,\r
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,\r
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,\r
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,\r
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,\r
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,\r
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,\r
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,\r
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,\r
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,\r
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,\r
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,\r
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,\r
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,\r
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,\r
+};\r
+static const u32 Td2[256] = { /* decryption table looked up with bits 15..8 of a word ("(x >> 8) & 0xff"); entries appear to be the matching Td1 entries rotated right by 8 bits */\r
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,\r
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,\r
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,\r
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,\r
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,\r
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,\r
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,\r
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,\r
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,\r
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,\r
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,\r
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,\r
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,\r
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,\r
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,\r
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,\r
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,\r
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,\r
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,\r
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,\r
+\r
+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,\r
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,\r
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,\r
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,\r
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,\r
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,\r
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,\r
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,\r
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,\r
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,\r
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,\r
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,\r
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,\r
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,\r
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,\r
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,\r
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,\r
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,\r
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,\r
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,\r
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,\r
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,\r
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,\r
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,\r
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,\r
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,\r
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,\r
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,\r
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,\r
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,\r
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,\r
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,\r
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,\r
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,\r
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,\r
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,\r
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,\r
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,\r
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,\r
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,\r
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,\r
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,\r
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,\r
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,\r
+};\r
+static const u32 Td3[256] = { /* decryption table looked up with the low byte of a word ("x & 0xff"); entries appear to be the matching Td2 entries rotated right by 8 bits */\r
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,\r
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,\r
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,\r
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,\r
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,\r
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,\r
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,\r
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,\r
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,\r
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,\r
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,\r
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,\r
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,\r
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,\r
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,\r
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,\r
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,\r
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,\r
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,\r
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,\r
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,\r
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,\r
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,\r
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,\r
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,\r
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,\r
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,\r
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,\r
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,\r
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,\r
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,\r
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,\r
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,\r
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,\r
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,\r
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,\r
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,\r
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,\r
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,\r
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,\r
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,\r
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,\r
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,\r
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,\r
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,\r
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,\r
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,\r
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,\r
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,\r
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,\r
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,\r
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,\r
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,\r
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,\r
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,\r
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,\r
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,\r
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,\r
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,\r
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,\r
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,\r
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,\r
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,\r
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,\r
+};\r
+static const u32 Td4[256] = { /* every entry repeats one byte in all four byte positions, so a caller can pick any position with a mask; NOTE(review): presumably the inverse S-box for the last decryption round -- confirm at the use site */\r
+    0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,\r
+    0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,\r
+    0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,\r
+    0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,\r
+    0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,\r
+    0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,\r
+    0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,\r
+    0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,\r
+    0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,\r
+    0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,\r
+    0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,\r
+    0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,\r
+    0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,\r
+    0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,\r
+    0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,\r
+    0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,\r
+    0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,\r
+    0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,\r
+    0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,\r
+    0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,\r
+    0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,\r
+    0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,\r
+    0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,\r
+    0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,\r
+    0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,\r
+    0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,\r
+    0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,\r
+    0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,\r
+    0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,\r
+    0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,\r
+    0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,\r
+    0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,\r
+    0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,\r
+    0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,\r
+    0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,\r
+    0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,\r
+    0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,\r
+    0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,\r
+    0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,\r
+    0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,\r
+    0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,\r
+    0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,\r
+    0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,\r
+    0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,\r
+    0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,\r
+    0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,\r
+    0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,\r
+    0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,\r
+    0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,\r
+    0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,\r
+    0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,\r
+    0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,\r
+    0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,\r
+    0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,\r
+    0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,\r
+    0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,\r
+    0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,\r
+    0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,\r
+    0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,\r
+    0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,\r
+    0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,\r
+    0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,\r
+    0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,\r
+    0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,\r
+};\r
+static const u32 rcon[] = { /* round constants, one per key-expansion pass; XORed into the first new word as rcon[i] in AES_set_encrypt_key */\r
+       0x01000000, 0x02000000, 0x04000000, 0x08000000,\r
+       0x10000000, 0x20000000, 0x40000000, 0x80000000,\r
+       0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */\r
+};\r
+\r
+/**\r
+ * Expand the cipher key into the encryption key schedule (key->rd_key) and set key->rounds.\r
+ * Returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not 128, 192 or 256. */\r
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,\r
+                       AES_KEY *key) {\r
+       /* rk: sliding window over key->rd_key; i: number of rcon values used so far */\r
+       u32 *rk;\r
+       int i = 0;\r
+       u32 temp;\r
+       /* validate the arguments before touching *key */\r
+       if (!userKey || !key)\r
+               return -1;\r
+       if (bits != 128 && bits != 192 && bits != 256)\r
+               return -2;\r
+\r
+       rk = key->rd_key;\r
+       /* the round count depends only on the key size */\r
+       if (bits==128)\r
+               key->rounds = 10;\r
+       else if (bits==192)\r
+               key->rounds = 12;\r
+       else\r
+               key->rounds = 14;\r
+       /* every key size starts with the four words at byte offsets 0, 4, 8 and 12 */\r
+       rk[0] = GETU32(userKey     );\r
+       rk[1] = GETU32(userKey +  4);\r
+       rk[2] = GETU32(userKey +  8);\r
+       rk[3] = GETU32(userKey + 12);\r
+       if (bits == 128) { /* 4-word key: each pass appends one 4-word round key */\r
+               while (1) {\r
+                       temp  = rk[3]; /* last word of the previous round key */\r
+                       rk[4] = rk[0] ^ /* the Te4 lookups substitute each byte of temp while rotating it left by one byte */\r
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^\r
+                               rcon[i]; /* round constant for this pass */\r
+                       rk[5] = rk[1] ^ rk[4];\r
+                       rk[6] = rk[2] ^ rk[5];\r
+                       rk[7] = rk[3] ^ rk[6];\r
+                       if (++i == 10) { /* 10 passes -> 44 words = 4 * (rounds + 1) */\r
+                               return 0;\r
+                       }\r
+                       rk += 4; /* slide the window onto the words just written */\r
+               }\r
+       }\r
+       rk[4] = GETU32(userKey + 16); /* 192- and 256-bit keys supply at least two more words */\r
+       rk[5] = GETU32(userKey + 20);\r
+       if (bits == 192) { /* 6-word key: each pass appends up to 6 words */\r
+               while (1) {\r
+                       temp = rk[ 5];\r
+                       rk[ 6] = rk[ 0] ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^\r
+                               rcon[i];\r
+                       rk[ 7] = rk[ 1] ^ rk[ 6];\r
+                       rk[ 8] = rk[ 2] ^ rk[ 7];\r
+                       rk[ 9] = rk[ 3] ^ rk[ 8];\r
+                       if (++i == 8) { /* the 8th pass needs only 4 words: 52 = 4 * (rounds + 1) */\r
+                               return 0;\r
+                       }\r
+                       rk[10] = rk[ 4] ^ rk[ 9];\r
+                       rk[11] = rk[ 5] ^ rk[10];\r
+                       rk += 6;\r
+               }\r
+       }\r
+       rk[6] = GETU32(userKey + 24); /* the last two words of a 256-bit key */\r
+       rk[7] = GETU32(userKey + 28);\r
+       if (bits == 256) { /* always true here: the 128 and 192 cases returned above */\r
+               while (1) {\r
+                       temp = rk[ 7];\r
+                       rk[ 8] = rk[ 0] ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0xff000000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp >> 24)       ] & 0x000000ff) ^\r
+                               rcon[i];\r
+                       rk[ 9] = rk[ 1] ^ rk[ 8];\r
+                       rk[10] = rk[ 2] ^ rk[ 9];\r
+                       rk[11] = rk[ 3] ^ rk[10];\r
+                       if (++i == 7) { /* the 7th pass needs only 4 words: 60 = 4 * (rounds + 1) */\r
+                               return 0;\r
+                       }\r
+                       temp = rk[11]; /* second half of the pass: substitute bytes without rotation or rcon */\r
+                       rk[12] = rk[ 4] ^\r
+                               (Te4[(temp >> 24)       ] & 0xff000000) ^\r
+                               (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^\r
+                               (Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^\r
+                               (Te4[(temp      ) & 0xff] & 0x000000ff);\r
+                       rk[13] = rk[ 5] ^ rk[12];\r
+                       rk[14] = rk[ 6] ^ rk[13];\r
+                       rk[15] = rk[ 7] ^ rk[14];\r
+\r
+                       rk += 8;\r
+               }\r
+       }\r
+       return 0; /* not reached for the key sizes accepted above */\r
+}\r
+\r
+/**\r
+ * Expand the cipher key into the decryption key schedule.\r
+ * Returns 0 on success or the negative status from AES_set_encrypt_key. */\r
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,\r
+                        AES_KEY *key) {\r
+       /* i and j walk the schedule from both ends; temp is swap scratch */\r
+        u32 *rk;\r
+       int i, j, status;\r
+       u32 temp;\r
+\r
+       /* first, start with an encryption schedule */\r
+       status = AES_set_encrypt_key(userKey, bits, key);\r
+       if (status < 0)\r
+               return status;\r
+\r
+       rk = key->rd_key;\r
+\r
+       /* invert the order of the round keys: */\r
+       for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { /* j starts at the last round key; when i meets j the middle key stays in place */\r
+               temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;\r
+               temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;\r
+               temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;\r
+               temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;\r
+       }\r
+       /* apply the inverse MixColumn transform to all round keys but the first and the last: */\r
+       for (i = 1; i < (key->rounds); i++) { /* round keys 1 .. rounds-1 */\r
+               rk += 4; /* step to the next round key */\r
+               rk[0] = /* NOTE(review): Te4[..] & 0xff presumably cancels the substitution built into Td0..Td3, leaving only the inverse MixColumn -- confirm against the table definitions */\r
+                       Td0[Te4[(rk[0] >> 24)       ] & 0xff] ^\r
+                       Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^\r
+                       Td2[Te4[(rk[0] >>  8) & 0xff] & 0xff] ^\r
+                       Td3[Te4[(rk[0]      ) & 0xff] & 0xff];\r
+               rk[1] =\r
+                       Td0[Te4[(rk[1] >> 24)       ] & 0xff] ^\r
+                       Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^\r
+                       Td2[Te4[(rk[1] >>  8) & 0xff] & 0xff] ^\r
+                       Td3[Te4[(rk[1]      ) & 0xff] & 0xff];\r
+               rk[2] =\r
+                       Td0[Te4[(rk[2] >> 24)       ] & 0xff] ^\r
+                       Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^\r
+                       Td2[Te4[(rk[2] >>  8) & 0xff] & 0xff] ^\r
+                       Td3[Te4[(rk[2]      ) & 0xff] & 0xff];\r
+               rk[3] =\r
+                       Td0[Te4[(rk[3] >> 24)       ] & 0xff] ^\r
+                       Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^\r
+                       Td2[Te4[(rk[3] >>  8) & 0xff] & 0xff] ^\r
+                       Td3[Te4[(rk[3]      ) & 0xff] & 0xff];\r
+       }\r
+       return 0;\r
+}\r
+\r
+#ifndef AES_ASM\r
+/*\r
+ * Encrypt a single block: four 32-bit words are read from in (offsets 0, 4, 8, 12) and four are written to out,\r
+ * using key->rd_key and key->rounds. in and out can overlap (all input is read before any output is written).\r
+ */\r
+void AES_encrypt(const unsigned char *in, unsigned char *out,\r
+                const AES_KEY *key) {\r
+       /* s0..s3 and t0..t3 alternate as the current and next state words */\r
+       const u32 *rk;\r
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;\r
+#ifndef FULL_UNROLL\r
+       int r;\r
+#endif /* ?FULL_UNROLL */\r
+\r
+       assert(in && out && key); /* NULL arguments are treated as a caller bug */\r
+       rk = key->rd_key;\r
+\r
+       /*\r
+        * map byte array block to cipher state\r
+        * and add initial round key:\r
+        */\r
+       s0 = GETU32(in     ) ^ rk[0];\r
+       s1 = GETU32(in +  4) ^ rk[1];\r
+       s2 = GETU32(in +  8) ^ rk[2];\r
+       s3 = GETU32(in + 12) ^ rk[3];\r
+#ifdef FULL_UNROLL\r
+       /* round 1: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];\r
+       /* round 2: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];\r
+       /* round 3: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];\r
+       /* round 4: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];\r
+       /* round 5: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];\r
+       /* round 6: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];\r
+       /* round 7: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];\r
+       /* round 8: */\r
+       s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];\r
+       s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];\r
+       s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];\r
+       s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];\r
+       /* round 9: */\r
+       t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];\r
+       t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];\r
+       t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];\r
+       t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];\r
+    if (key->rounds > 10) { /* 12- and 14-round schedules continue here */\r
+        /* round 10: */\r
+        s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];\r
+        s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];\r
+        s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];\r
+        s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];\r
+        /* round 11: */\r
+        t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];\r
+        t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];\r
+        t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];\r
+        t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];\r
+        if (key->rounds > 12) { /* 14-round schedule only */\r
+            /* round 12: */\r
+            s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];\r
+            s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];\r
+            s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];\r
+            s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];\r
+            /* round 13: */\r
+            t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];\r
+            t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];\r
+            t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];\r
+            t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];\r
+        }\r
+    }\r
+    rk += key->rounds << 2; /* point rk at the final round key (word 4 * rounds) */\r
+#else  /* !FULL_UNROLL */\r
+    /*\r
+     * Nr - 1 full rounds:\r
+     */\r
+    r = key->rounds >> 1; /* each loop pass performs up to two rounds */\r
+    for (;;) {\r
+        t0 =\r
+            Te0[(s0 >> 24)       ] ^\r
+            Te1[(s1 >> 16) & 0xff] ^\r
+            Te2[(s2 >>  8) & 0xff] ^\r
+            Te3[(s3      ) & 0xff] ^\r
+            rk[4];\r
+        t1 =\r
+            Te0[(s1 >> 24)       ] ^\r
+            Te1[(s2 >> 16) & 0xff] ^\r
+            Te2[(s3 >>  8) & 0xff] ^\r
+            Te3[(s0      ) & 0xff] ^\r
+            rk[5];\r
+        t2 =\r
+            Te0[(s2 >> 24)       ] ^\r
+            Te1[(s3 >> 16) & 0xff] ^\r
+            Te2[(s0 >>  8) & 0xff] ^\r
+            Te3[(s1      ) & 0xff] ^\r
+            rk[6];\r
+        t3 =\r
+            Te0[(s3 >> 24)       ] ^\r
+            Te1[(s0 >> 16) & 0xff] ^\r
+            Te2[(s1 >>  8) & 0xff] ^\r
+            Te3[(s2      ) & 0xff] ^\r
+            rk[7];\r
+\r
+        rk += 8; /* the second half reads rk[0..3] of the new window; after the last pass rk is the final round key */\r
+        if (--r == 0) { /* leave after rounds - 1 full rounds, with the state in t0..t3 */\r
+            break;\r
+        }\r
+\r
+        s0 =\r
+            Te0[(t0 >> 24)       ] ^\r
+            Te1[(t1 >> 16) & 0xff] ^\r
+            Te2[(t2 >>  8) & 0xff] ^\r
+            Te3[(t3      ) & 0xff] ^\r
+            rk[0];\r
+        s1 =\r
+            Te0[(t1 >> 24)       ] ^\r
+            Te1[(t2 >> 16) & 0xff] ^\r
+            Te2[(t3 >>  8) & 0xff] ^\r
+            Te3[(t0      ) & 0xff] ^\r
+            rk[1];\r
+        s2 =\r
+            Te0[(t2 >> 24)       ] ^\r
+            Te1[(t3 >> 16) & 0xff] ^\r
+            Te2[(t0 >>  8) & 0xff] ^\r
+            Te3[(t1      ) & 0xff] ^\r
+            rk[2];\r
+        s3 =\r
+            Te0[(t3 >> 24)       ] ^\r
+            Te1[(t0 >> 16) & 0xff] ^\r
+            Te2[(t1 >>  8) & 0xff] ^\r
+            Te3[(t2      ) & 0xff] ^\r
+            rk[3];\r
+    }\r
+#endif /* ?FULL_UNROLL */\r
+    /*\r
+        * apply last round and\r
+        * map cipher state to byte array block:\r
+        */\r
+       s0 = /* last round: masked Te4 byte lookups only (no Te0..Te3), then XOR with the final round key */\r
+               (Te4[(t0 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t3      ) & 0xff] & 0x000000ff) ^\r
+               rk[0];\r
+       PUTU32(out     , s0);\r
+       s1 =\r
+               (Te4[(t1 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t0      ) & 0xff] & 0x000000ff) ^\r
+               rk[1];\r
+       PUTU32(out +  4, s1);\r
+       s2 =\r
+               (Te4[(t2 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t1      ) & 0xff] & 0x000000ff) ^\r
+               rk[2];\r
+       PUTU32(out +  8, s2);\r
+       s3 =\r
+               (Te4[(t3 >> 24)       ] & 0xff000000) ^\r
+               (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Te4[(t2      ) & 0xff] & 0x000000ff) ^\r
+               rk[3];\r
+       PUTU32(out + 12, s3);\r
+}\r
+\r
+/*\r
+ * Decrypt a single block\r
+ * in and out can overlap\r
+ */\r
+void AES_decrypt(const unsigned char *in, unsigned char *out,\r
+                const AES_KEY *key) {\r
+\r
+       const u32 *rk;\r
+       u32 s0, s1, s2, s3, t0, t1, t2, t3;\r
+#ifndef FULL_UNROLL\r
+       int r;\r
+#endif /* ?FULL_UNROLL */\r
+\r
+       assert(in && out && key);\r
+       rk = key->rd_key;\r
+\r
+       /*\r
+        * map byte array block to cipher state\r
+        * and add initial round key:\r
+        */\r
+    s0 = GETU32(in     ) ^ rk[0];\r
+    s1 = GETU32(in +  4) ^ rk[1];\r
+    s2 = GETU32(in +  8) ^ rk[2];\r
+    s3 = GETU32(in + 12) ^ rk[3];\r
+#ifdef FULL_UNROLL\r
+    /* round 1: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];\r
+    /* round 2: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];\r
+    /* round 3: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];\r
+    /* round 4: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];\r
+    /* round 5: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];\r
+    /* round 6: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];\r
+    /* round 7: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];\r
+    /* round 8: */\r
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];\r
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];\r
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];\r
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];\r
+    /* round 9: */\r
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];\r
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];\r
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];\r
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];\r
+    if (key->rounds > 10) {\r
+        /* round 10: */\r
+        s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];\r
+        s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];\r
+        s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];\r
+        s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];\r
+        /* round 11: */\r
+        t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];\r
+        t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];\r
+        t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];\r
+        t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];\r
+        if (key->rounds > 12) {\r
+            /* round 12: */\r
+            s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];\r
+            s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];\r
+            s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];\r
+            s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];\r
+            /* round 13: */\r
+            t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];\r
+            t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];\r
+            t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];\r
+            t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];\r
+        }\r
+    }\r
+       rk += key->rounds << 2;\r
+#else  /* !FULL_UNROLL */\r
+    /*\r
+     * Nr - 1 full rounds:\r
+     */\r
+    r = key->rounds >> 1;\r
+    for (;;) {\r
+        t0 =\r
+            Td0[(s0 >> 24)       ] ^\r
+            Td1[(s3 >> 16) & 0xff] ^\r
+            Td2[(s2 >>  8) & 0xff] ^\r
+            Td3[(s1      ) & 0xff] ^\r
+            rk[4];\r
+        t1 =\r
+            Td0[(s1 >> 24)       ] ^\r
+            Td1[(s0 >> 16) & 0xff] ^\r
+            Td2[(s3 >>  8) & 0xff] ^\r
+            Td3[(s2      ) & 0xff] ^\r
+            rk[5];\r
+        t2 =\r
+            Td0[(s2 >> 24)       ] ^\r
+            Td1[(s1 >> 16) & 0xff] ^\r
+            Td2[(s0 >>  8) & 0xff] ^\r
+            Td3[(s3      ) & 0xff] ^\r
+            rk[6];\r
+        t3 =\r
+            Td0[(s3 >> 24)       ] ^\r
+            Td1[(s2 >> 16) & 0xff] ^\r
+            Td2[(s1 >>  8) & 0xff] ^\r
+            Td3[(s0      ) & 0xff] ^\r
+            rk[7];\r
+\r
+        rk += 8;\r
+        if (--r == 0) {\r
+            break;\r
+        }\r
+\r
+        s0 =\r
+            Td0[(t0 >> 24)       ] ^\r
+            Td1[(t3 >> 16) & 0xff] ^\r
+            Td2[(t2 >>  8) & 0xff] ^\r
+            Td3[(t1      ) & 0xff] ^\r
+            rk[0];\r
+        s1 =\r
+            Td0[(t1 >> 24)       ] ^\r
+            Td1[(t0 >> 16) & 0xff] ^\r
+            Td2[(t3 >>  8) & 0xff] ^\r
+            Td3[(t2      ) & 0xff] ^\r
+            rk[1];\r
+        s2 =\r
+            Td0[(t2 >> 24)       ] ^\r
+            Td1[(t1 >> 16) & 0xff] ^\r
+            Td2[(t0 >>  8) & 0xff] ^\r
+            Td3[(t3      ) & 0xff] ^\r
+            rk[2];\r
+        s3 =\r
+            Td0[(t3 >> 24)       ] ^\r
+            Td1[(t2 >> 16) & 0xff] ^\r
+            Td2[(t1 >>  8) & 0xff] ^\r
+            Td3[(t0      ) & 0xff] ^\r
+            rk[3];\r
+    }\r
+#endif /* ?FULL_UNROLL */\r
+    /*\r
+        * apply last round and\r
+        * map cipher state to byte array block:\r
+        */\r
+       s0 =\r
+               (Td4[(t0 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t1      ) & 0xff] & 0x000000ff) ^\r
+               rk[0];\r
+       PUTU32(out     , s0);\r
+       s1 =\r
+               (Td4[(t1 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t2      ) & 0xff] & 0x000000ff) ^\r
+               rk[1];\r
+       PUTU32(out +  4, s1);\r
+       s2 =\r
+               (Td4[(t2 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t3      ) & 0xff] & 0x000000ff) ^\r
+               rk[2];\r
+       PUTU32(out +  8, s2);\r
+       s3 =\r
+               (Td4[(t3 >> 24)       ] & 0xff000000) ^\r
+               (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^\r
+               (Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^\r
+               (Td4[(t0      ) & 0xff] & 0x000000ff) ^\r
+               rk[3];\r
+       PUTU32(out + 12, s3);\r
+}\r
+\r
+#endif /* AES_ASM */\r
+\r
+/*
+ * AES in CBC mode.
+ *
+ * in     - input bytes (plaintext when enc != 0, ciphertext otherwise)
+ * out    - output buffer; may alias `in`, each block is staged through a
+ *          local copy before it is overwritten
+ * length - number of bytes to process
+ * key    - expanded key schedule matching the direction selected by `enc`
+ * ivec   - AES_BLOCK_SIZE-byte chaining value; updated in place so that a
+ *          following call continues the same CBC stream
+ * enc    - non-zero to encrypt, zero to decrypt
+ *
+ * A trailing partial block is handled as follows: when encrypting, the
+ * missing input bytes are treated as zero (the IV bytes pass through
+ * unchanged) and a full AES_BLOCK_SIZE block is written to `out`; when
+ * decrypting, a full AES_BLOCK_SIZE block is read from `in` but only
+ * `length % AES_BLOCK_SIZE` bytes are written to `out`.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc)
+{
+
+       unsigned long n;
+       unsigned long len = length;
+       unsigned char tmp[AES_BLOCK_SIZE];
+       unsigned char saved[AES_BLOCK_SIZE];
+
+       assert(in && out && key && ivec);
+
+       if (enc) {
+               while (len >= AES_BLOCK_SIZE) {
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)
+                               tmp[n] = in[n] ^ ivec[n];
+                       AES_encrypt(tmp, out, key);
+                       /* the ciphertext block is the next chaining value */
+                       memcpy(ivec, out, AES_BLOCK_SIZE);
+                       len -= AES_BLOCK_SIZE;
+                       in += AES_BLOCK_SIZE;
+                       out += AES_BLOCK_SIZE;
+               }
+               if (len) {
+                       for(n=0; n < len; ++n)
+                               tmp[n] = in[n] ^ ivec[n];
+                       /* implicit zero padding: 0 ^ ivec[n] == ivec[n] */
+                       for(n=len; n < AES_BLOCK_SIZE; ++n)
+                               tmp[n] = ivec[n];
+                       AES_encrypt(tmp, tmp, key);
+                       memcpy(out, tmp, AES_BLOCK_SIZE);
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);
+               }
+       } else {
+               while (len >= AES_BLOCK_SIZE) {
+                       /* keep the ciphertext: `out` may alias `in` */
+                       memcpy(tmp, in, AES_BLOCK_SIZE);
+                       AES_decrypt(in, out, key);
+                       for(n=0; n < AES_BLOCK_SIZE; ++n)
+                               out[n] ^= ivec[n];
+                       memcpy(ivec, tmp, AES_BLOCK_SIZE);
+                       len -= AES_BLOCK_SIZE;
+                       in += AES_BLOCK_SIZE;
+                       out += AES_BLOCK_SIZE;
+               }
+               if (len) {
+                       /*
+                        * Keep the ciphertext block separate from the
+                        * decrypted data: the next chaining value must be
+                        * the ciphertext, not the raw decryption output.
+                        */
+                       memcpy(saved, in, AES_BLOCK_SIZE);
+                       AES_decrypt(saved, tmp, key);
+                       for(n=0; n < len; ++n)
+                               out[n] = tmp[n] ^ ivec[n];
+                       memcpy(ivec, saved, AES_BLOCK_SIZE);
+               }
+       }
+}
diff --git a/tools/blktap2/drivers/aes.h b/tools/blktap2/drivers/aes.h
new file mode 100644 (file)
index 0000000..9fb54a9
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef QEMU_AES_H
+#define QEMU_AES_H
+
+#include <stdint.h>
+
+#define AES_MAXNR 14
+#define AES_BLOCK_SIZE 16
+
+/*
+ * Expanded AES key as consumed by AES_encrypt()/AES_decrypt()/
+ * AES_cbc_encrypt().
+ */
+struct aes_key_st {
+    /* round-key words: 4 per round for up to AES_MAXNR rounds, plus 4 more */
+    uint32_t rd_key[4 *(AES_MAXNR + 1)];
+    /* number of rounds for this key; NOTE(review): presumably 10/12/14 -- set by the key-setup routines, confirm in aes.c */
+    int rounds;
+};
+typedef struct aes_key_st AES_KEY;
+
+/*
+ * Key setup: fill *key from userKey.
+ * NOTE(review): presumably `bits` is the key length in bits and the return
+ * value reports success/failure -- confirm against the definitions in aes.c.
+ */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+       AES_KEY *key);
+
+/* Single-block primitives: transform one block from `in` into `out`. */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+       const AES_KEY *key);
+/*
+ * CBC mode over `length` bytes; encrypts when `enc` is non-zero, decrypts
+ * otherwise.  `ivec` (AES_BLOCK_SIZE bytes) is updated in place.
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                    const unsigned long length, const AES_KEY *key,
+                    unsigned char *ivec, const int enc);
+
+#endif
diff --git a/tools/blktap2/drivers/atomicio.c b/tools/blktap2/drivers/atomicio.c
new file mode 100644 (file)
index 0000000..ae0e24b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * Ensure all of the data on the descriptor comes through: call
+ * f(fd, buf, remaining) repeatedly until `n` bytes of `_s` have been
+ * transferred.  f is a read(2)-shaped function (f==read || f==vwrite).
+ *
+ * Returns the number of bytes transferred:
+ *   - n on complete success;
+ *   - a short count (possibly 0) with errno set to EPIPE if f reports
+ *     end-of-file (returns 0) before n bytes were moved;
+ *   - 0 with errno left as set by f on any error other than EINTR/EAGAIN,
+ *     even if some bytes had already been transferred.
+ *
+ * EINTR and EAGAIN are retried immediately (no waiting), so a
+ * non-blocking descriptor makes this loop spin until it becomes ready.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+
+       while (n > pos) {
+               res = (f) (fd, s + pos, n - pos);
+               if (res == -1) {
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return 0;
+               }
+               if (res == 0) {
+                       errno = EPIPE;
+                       return pos;
+               }
+               pos += (size_t)res;
+       }
+       return (pos);
+}
+
+
diff --git a/tools/blktap2/drivers/blk.h b/tools/blktap2/drivers/blk.h
new file mode 100644 (file)
index 0000000..73ca40c
--- /dev/null
@@ -0,0 +1,30 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+int blk_getimagesize(int fd, uint64_t *size);
+int blk_getsectorsize(int fd, uint64_t *sector_size);
diff --git a/tools/blktap2/drivers/blk_linux.c b/tools/blktap2/drivers/blk_linux.c
new file mode 100644 (file)
index 0000000..75ddcc3
--- /dev/null
@@ -0,0 +1,43 @@
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include "tapdisk.h"
+#include "blk.h"
+
+/*
+ * Query the size of the block device open on `fd` with BLKGETSIZE and
+ * store it in *size (the ioctl reports the size in 512-byte units).
+ *
+ * Returns 0 on success.  On ioctl failure returns -EINVAL and leaves
+ * *size set to 0.
+ */
+int blk_getimagesize(int fd, uint64_t *size)
+{
+       int rc;
+       /* BLKGETSIZE writes an unsigned long, which is narrower than
+        * uint64_t on 32-bit targets; never hand it the caller's pointer. */
+       unsigned long sectors = 0;
+
+       *size = 0;
+       rc = ioctl(fd, BLKGETSIZE, &sectors);
+       if (rc) {
+               DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+               return -EINVAL;
+       }
+
+       *size = sectors;
+       return 0;
+}
+
+/*
+ * Store the logical sector size of the block device open on `fd` in
+ * *sector_size.  When BLKSSZGET is unavailable at build time, or the ioctl
+ * fails at run time, DEFAULT_SECTOR_SIZE is reported instead.
+ *
+ * Always returns 0.
+ */
+int blk_getsectorsize(int fd, uint64_t *sector_size)
+{
+#if defined(BLKSSZGET)
+       int rc;
+       /* BLKSSZGET writes an int; passing the uint64_t pointer directly
+        * would update only half of the object. */
+       int ssz = DEFAULT_SECTOR_SIZE;
+
+       *sector_size = DEFAULT_SECTOR_SIZE;
+       rc = ioctl(fd, BLKSSZGET, &ssz);
+       if (rc) {
+               DPRINTF("ERR: BLKSSZGET failed. Falling back to use default sector size");
+               ssz = DEFAULT_SECTOR_SIZE;
+       }
+       *sector_size = (uint64_t)ssz;
+
+       if (*sector_size != DEFAULT_SECTOR_SIZE)
+               DPRINTF("Note: sector size is %"PRIu64" (not %u)\n",
+                       *sector_size, DEFAULT_SECTOR_SIZE);
+#else
+       *sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       return 0;
+}
+
+
diff --git a/tools/blktap2/drivers/blktap2.h b/tools/blktap2/drivers/blktap2.h
new file mode 100644 (file)
index 0000000..38350d2
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _BLKTAP_2_H_
+#define _BLKTAP_2_H_
+
+/* NOTE(review): 10 is presumably the Linux misc character-device major -- confirm with the code that creates the control node. */
+#define MISC_MAJOR_NUMBER              10
+
+/* Size in bytes of blktap2_params.name below. */
+#define BLKTAP2_MAX_MESSAGE_LEN        256
+
+/* Message codes for pause/resume/close requests (RING_MESSAGE namespace). */
+#define BLKTAP2_RING_MESSAGE_PAUSE     1
+#define BLKTAP2_RING_MESSAGE_RESUME    2
+#define BLKTAP2_RING_MESSAGE_CLOSE     3
+
+/* ioctl request numbers understood by the blktap2 devices. */
+#define BLKTAP2_IOCTL_KICK_FE          1
+#define BLKTAP2_IOCTL_ALLOC_TAP        200
+#define BLKTAP2_IOCTL_FREE_TAP         201
+#define BLKTAP2_IOCTL_CREATE_DEVICE    202
+#define BLKTAP2_IOCTL_SET_PARAMS       203
+#define BLKTAP2_IOCTL_PAUSE            204
+#define BLKTAP2_IOCTL_REOPEN           205
+#define BLKTAP2_IOCTL_RESUME           206
+
+/*
+ * Device node names.  The RING and IO entries are path prefixes;
+ * NOTE(review): presumably a minor number is appended by users -- confirm.
+ */
+#define BLKTAP2_CONTROL_NAME           "blktap-control"
+#define BLKTAP2_DIRECTORY              "/dev/xen/blktap-2"
+#define BLKTAP2_CONTROL_DEVICE         BLKTAP2_DIRECTORY"/control"
+#define BLKTAP2_RING_DEVICE            BLKTAP2_DIRECTORY"/blktap"
+#define BLKTAP2_IO_DEVICE              BLKTAP2_DIRECTORY"/tapdev"
+
+/*
+ * Identifies one blktap2 instance.
+ * NOTE(review): presumably filled in by BLKTAP2_IOCTL_ALLOC_TAP with the
+ * ring/device major numbers and the shared minor -- confirm against the
+ * kernel module.
+ */
+struct blktap2_handle {
+       unsigned int                   ring;
+       unsigned int                   device;
+       unsigned int                   minor;
+};
+
+/*
+ * Virtual disk parameters.
+ * NOTE(review): presumably the argument of BLKTAP2_IOCTL_SET_PARAMS, with
+ * capacity expressed in sectors of sector_size bytes -- confirm.
+ */
+struct blktap2_params {
+       char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+       unsigned long long             capacity;
+       unsigned long                  sector_size;
+};
+
+#endif
diff --git a/tools/blktap2/drivers/block-aio.c b/tools/blktap2/drivers/block-aio.c
new file mode 100644 (file)
index 0000000..2c5af14
--- /dev/null
@@ -0,0 +1,272 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <errno.h>
+#include <libaio.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Number of preallocated request slots per open image. */
+#define MAX_AIO_REQS         TAPDISK_DATA_REQUESTS
+
+struct tdaio_state;
+
+/* One in-flight read or write issued by this driver. */
+struct aio_request {
+       td_request_t         treq;    /* copy of the request being serviced */
+       struct tiocb         tiocb;   /* I/O control block handed to td_queue_tiocb() */
+       struct tdaio_state  *state;   /* owning driver state, used to return the slot on completion */
+};
+
+/*
+ * Per-image private state of the aio driver.  Request slots are
+ * preallocated in aio_requests[]; aio_free_list[] is used as a stack of
+ * pointers to the currently unused slots.
+ */
+struct tdaio_state {
+       int                  fd;        /* descriptor of the opened image */
+       td_driver_t         *driver;
+
+       int                  aio_free_count;    /* number of valid entries in aio_free_list */
+       struct aio_request   aio_requests[MAX_AIO_REQS];
+       struct aio_request  *aio_free_list[MAX_AIO_REQS];
+};
+
+/*Get Image size, secsize*/
+static int tdaio_get_image_info(int fd, td_disk_info_t *info)
+{
+       int ret;
+       long size;
+       unsigned long total_size;
+       struct statvfs statBuf;
+       struct stat stat;
+
+       ret = fstat(fd, &stat);
+       if (ret != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               info->size = 0;
+               if (ioctl(fd,BLKGETSIZE,&info->size)!=0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       int arg;
+                       info->sector_size = DEFAULT_SECTOR_SIZE;
+                       ioctl(fd, BLKSSZGET, &info->sector_size);
+                       
+                       if (info->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       info->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               info->size = (stat.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       }
+
+       if (info->size == 0) {          
+               info->size =((uint64_t) 16836057);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize aio state.
+ *
+ * driver - driver instance; driver->data is the struct tdaio_state to
+ *          initialise and driver->info receives the image geometry
+ * name   - path of the image or block device
+ * flags  - TD_OPEN_RDONLY selects a read-only open, otherwise read-write
+ *
+ * The file is opened with O_DIRECT first; if that fails with EINVAL the
+ * open is retried without O_DIRECT.
+ *
+ * Returns 0 on success or a negative errno value.  On failure no
+ * descriptor is left open and prv->fd is -1.
+ */
+int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, fd, ret, o_flags;
+       struct tdaio_state *prv;
+
+       prv = (struct tdaio_state *)driver->data;
+
+       DPRINTF("block-aio open('%s')", name);
+
+       memset(prv, 0, sizeof(struct tdaio_state));
+       /* never leave fd 0 behind for a later tdaio_close() to close */
+       prv->fd = -1;
+
+       /* every request slot starts out on the free stack */
+       prv->aio_free_count = MAX_AIO_REQS;
+       for (i = 0; i < MAX_AIO_REQS; i++)
+               prv->aio_free_list[i] = &prv->aio_requests[i];
+
+       /* Open the file */
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+
+       if ((fd == -1) && (errno == EINVAL)) {
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1) {
+                       DPRINTF("WARNING: Accessing image without "
+                               "O_DIRECT! (%s)\n", name);
+               }
+       } else if (fd != -1) {
+               DPRINTF("open(%s) with O_DIRECT\n", name);
+       }
+
+       if (fd == -1) {
+               /* capture errno before logging can overwrite it */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s] (%d)!\n", name, ret);
+               return ret;
+       }
+
+       ret = tdaio_get_image_info(fd, &driver->info);
+       if (ret) {
+               close(fd);
+               return ret;
+       }
+
+       prv->fd = fd;
+
+       return 0;
+}
+
+/*
+ * Completion callback registered with td_prep_read()/td_prep_write().
+ * `arg` is the struct aio_request that was queued; the request is reported
+ * with status `err` and its slot is then pushed back on the free stack.
+ * `tiocb` is not used.
+ */
+void tdaio_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct aio_request *req = arg;
+       struct tdaio_state *state = req->state;
+
+       td_complete_request(req->treq, err);
+
+       /* release the slot only after the request has been completed */
+       state->aio_free_list[state->aio_free_count] = req;
+       state->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read of treq.secs sectors starting at sector
+ * treq.sec into treq.buf.  If no request slot is free, the request is
+ * completed immediately with -EBUSY.
+ */
+void tdaio_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdaio_state *state = (struct tdaio_state *)driver->data;
+       struct aio_request *req;
+       /* the byte offset is computed in 64 bits; the length stays an int */
+       int nbytes = treq.secs * driver->info.sector_size;
+       uint64_t start = treq.sec * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free stack */
+       state->aio_free_count--;
+       req = state->aio_free_list[state->aio_free_count];
+       req->treq = treq;
+       req->state = state;
+
+       td_prep_read(&req->tiocb, state->fd, treq.buf,
+                    nbytes, start, tdaio_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Queue an asynchronous write of treq.secs sectors from treq.buf starting
+ * at sector treq.sec.  If no request slot is free, the request is
+ * completed immediately with -EBUSY.
+ */
+void tdaio_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdaio_state *state = (struct tdaio_state *)driver->data;
+       struct aio_request *req;
+       /* the byte offset is computed in 64 bits; the length stays an int */
+       int nbytes = treq.secs * driver->info.sector_size;
+       uint64_t start = treq.sec * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a slot off the free stack */
+       state->aio_free_count--;
+       req = state->aio_free_list[state->aio_free_count];
+       req->treq = treq;
+       req->state = state;
+
+       td_prep_write(&req->tiocb, state->fd, treq.buf,
+                     nbytes, start, tdaio_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Close the image descriptor held in the driver's private state.
+ * The result of close() is ignored; always returns 0.
+ */
+int tdaio_close(td_driver_t *driver)
+{
+       struct tdaio_state *state;
+
+       state = (struct tdaio_state *)driver->data;
+       close(state->fd);
+
+       return 0;
+}
+
+/*
+ * This driver never has a parent image: both arguments are ignored and
+ * TD_NO_PARENT is returned unconditionally.
+ */
+int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return TD_NO_PARENT;
+}
+
+/*
+ * Parent validation is rejected unconditionally with -EINVAL; all
+ * arguments are ignored.
+ */
+int tdaio_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       (void)driver;
+       (void)pdriver;
+       (void)flags;
+
+       return -EINVAL;
+}
+
+/*
+ * Driver descriptor for the aio backend: wires the tdaio_* entry points
+ * defined in this file into the tap_disk operations table.
+ */
+struct tap_disk tapdisk_aio = {
+       .disk_type          = "tapdisk_aio",
+       .flags              = 0,
+       /* size of the per-image state that tdaio_open() reads from driver->data */
+       .private_data_size  = sizeof(struct tdaio_state),
+       .td_open            = tdaio_open,
+       .td_close           = tdaio_close,
+       .td_queue_read      = tdaio_queue_read,
+       .td_queue_write     = tdaio_queue_write,
+       .td_get_parent_id   = tdaio_get_parent_id,
+       .td_validate_parent = tdaio_validate_parent,
+       .td_debug           = NULL,      /* no debug hook provided */
+};
diff --git a/tools/blktap2/drivers/block-cache.c b/tools/blktap2/drivers/block-cache.c
new file mode 100644 (file)
index 0000000..1d2f4eb
--- /dev/null
@@ -0,0 +1,787 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+#ifdef DEBUG
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+
+#define RADIX_TREE_PAGE_SHIFT           12 /* 4K pages */
+#define RADIX_TREE_PAGE_SIZE            (1 << RADIX_TREE_PAGE_SHIFT)
+
+#define RADIX_TREE_NODE_SHIFT           9 /* 512B nodes */
+#define RADIX_TREE_NODE_SIZE            (1 << RADIX_TREE_NODE_SHIFT)
+#define RADIX_TREE_NODE_MASK            (RADIX_TREE_NODE_SIZE - 1)
+
+#define BLOCK_CACHE_NODES_PER_PAGE      (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
+
+#define BLOCK_CACHE_MAX_SIZE            (10 << 20) /* 10MB cache */
+#define BLOCK_CACHE_REQUESTS            (TAPDISK_DATA_REQUESTS << 3)
+#define BLOCK_CACHE_PAGE_IDLETIME       60
+
+typedef struct radix_tree               radix_tree_t;
+typedef struct radix_tree_node          radix_tree_node_t;
+typedef struct radix_tree_link          radix_tree_link_t;
+typedef struct radix_tree_leaf          radix_tree_leaf_t;
+typedef struct radix_tree_page          radix_tree_page_t;
+
+typedef struct block_cache              block_cache_t;
+typedef struct block_cache_request      block_cache_request_t;
+typedef struct block_cache_stats        block_cache_stats_t;
+
+/*
+ * One cached data buffer shared by up to BLOCK_CACHE_NODES_PER_PAGE leaves.
+ *   buf    - data buffer; released with free() when the page is freed
+ *   size   - length of @buf in bytes
+ *   sec    - first sector held in @buf
+ *   owners - leaf links that point into @buf; every one of them is
+ *            cleared when the page is removed from the tree
+ */
+struct radix_tree_page {
+       char                           *buf;
+       size_t                          size;
+       uint64_t                        sec;
+       radix_tree_link_t              *owners[BLOCK_CACHE_NODES_PER_PAGE];
+};
+
+/*
+ * Payload of a link in a height-0 node.
+ *   page - the radix_tree_page_t that owns the backing buffer
+ *   buf  - address of this sector's data inside page->buf
+ */
+struct radix_tree_leaf {
+       radix_tree_page_t              *page;
+       char                           *buf;
+};
+
+/*
+ * One slot of a radix tree node.
+ *   time   - tv_sec of the last lookup/insert that walked through this
+ *            slot; compared against BLOCK_CACHE_PAGE_IDLETIME when pruning
+ *   u.next - child node (used in nodes with height > 0)
+ *   u.leaf - cached sector (used in nodes with height == 0)
+ */
+struct radix_tree_link {
+       uint32_t                        time;
+       union {
+               radix_tree_node_t      *next;
+               radix_tree_leaf_t       leaf;
+       } u;
+};
+
+/*
+ * A radix tree node with RADIX_TREE_NODE_SIZE slots.
+ *   height - distance from the leaf level; 0 means the links hold leaves
+ *   links  - slots, indexed by radix_tree_index()
+ */
+struct radix_tree_node {
+       int                             height;
+       radix_tree_link_t               links[RADIX_TREE_NODE_SIZE];
+};
+
+/*
+ * Radix tree mapping sector numbers to cached sector buffers.
+ *   height - height of the root node
+ *   size   - total bytes of page buffers currently held
+ *   nodes  - number of allocated nodes
+ *   root   - root node, NULL once the tree has been destroyed
+ *   cache  - owning block cache (used for its name and statistics)
+ */
+struct radix_tree {
+       int                             height;
+       uint64_t                        size;
+       uint32_t                        nodes;
+       radix_tree_node_t              *root;
+
+       block_cache_t                  *cache;
+};
+
+/*
+ * Bookkeeping for one read that missed the cache and was forwarded.
+ *   err   - first error reported by a completion, 0 if none
+ *   buf   - cache-owned buffer the forwarded read is directed into
+ *   secs  - sectors still outstanding; the request finishes at 0
+ *   treq  - the original request, completed once all sectors are in
+ *   cache - owning block cache
+ */
+struct block_cache_request {
+       int                             err;
+       char                           *buf;
+       uint64_t                        secs;
+       td_request_t                    treq;
+       block_cache_t                  *cache;
+};
+
+/*
+ * Cache counters, all expressed in sectors.
+ *   reads  - sectors requested through block_cache_queue_read()
+ *   hits   - sectors served from the cache
+ *   misses - sectors forwarded because of a miss
+ *   prunes - sectors ejected when pages were freed
+ */
+struct block_cache_stats {
+       uint64_t                        reads;
+       uint64_t                        hits;
+       uint64_t                        misses;
+       uint64_t                        prunes;
+};
+
+/*
+ * Private state of the block cache driver.
+ *   ptype             - not referenced in this part of the file
+ *   name              - copy of the image name made at open time
+ *   sectors           - taken from driver->info.size at open time
+ *   requests          - fixed pool of miss-tracking requests
+ *   request_free_list - stack of pointers to unused pool entries
+ *   requests_free     - number of entries on the free stack
+ *   timeout_id        - id of the periodic prune event
+ *   tree              - radix tree holding the cached sectors
+ *   stats             - hit/miss/prune counters
+ */
+struct block_cache {
+       int                             ptype;
+       char                           *name;
+
+       uint64_t                        sectors;
+
+       block_cache_request_t           requests[BLOCK_CACHE_REQUESTS];
+       block_cache_request_t          *request_free_list[BLOCK_CACHE_REQUESTS];
+       int                             requests_free;
+
+       event_id_t                      timeout_id;
+
+       radix_tree_t                    tree;
+
+       block_cache_stats_t             stats;
+};
+
+/*
+ * Return RADIX_TREE_NODE_SIZE shifted left by @height node-shifts: the
+ * number of sectors that can be indexed beneath a node of that height.
+ */
+static inline uint64_t
+radix_tree_calculate_size(int height)
+{
+       const int shift   = height * RADIX_TREE_NODE_SHIFT;
+       uint64_t  entries = RADIX_TREE_NODE_SIZE;
+
+       return entries << shift;
+}
+
+/*
+ * Return the smallest root height (never less than 1) whose tree can
+ * index @sectors sectors.
+ */
+static inline int
+radix_tree_calculate_height(uint64_t sectors)
+{
+       int level;
+
+       /* start at 1: a root node is always allocated */
+       for (level = 1; sectors > radix_tree_calculate_size(level); level++)
+               ;
+
+       return level;
+}
+
+/*
+ * Return the slot of @node that @sector falls into: the
+ * RADIX_TREE_NODE_SHIFT-bit digit of @sector selected by node->height.
+ */
+static inline int
+radix_tree_index(radix_tree_node_t *node, uint64_t sector)
+{
+       uint64_t digit = sector >> (node->height * RADIX_TREE_NODE_SHIFT);
+
+       return digit & RADIX_TREE_NODE_MASK;
+}
+
+/*
+ * Non-zero when @node sits at height 0, i.e. its links hold leaves
+ * rather than child nodes.  @tree is not examined.
+ */
+static inline int
+radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return !node->height;
+}
+
+/*
+ * Non-zero when @node has the same height as the tree itself, which is
+ * how the root node is recognised.
+ */
+static inline int
+radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return (tree->height == node->height);
+}
+
+/*
+ * Return the memory charged to @tree in bytes: cached page data plus
+ * the space taken by its nodes.
+ */
+static inline uint64_t
+radix_tree_size(radix_tree_t *tree)
+{
+       uint64_t node_bytes = tree->nodes * sizeof(radix_tree_node_t);
+
+       return tree->size + node_bytes;
+}
+
+/*
+ * Zero every field of @link (timestamp and payload).  NULL is accepted
+ * and ignored.
+ */
+static inline void
+radix_tree_clear_link(radix_tree_link_t *link)
+{
+       if (!link)
+               return;
+
+       memset(link, 0, sizeof(*link));
+}
+
+/*
+ * Allocate a zeroed node of the given @height and count it in
+ * tree->nodes.  Returns NULL when the allocation fails.
+ */
+static inline radix_tree_node_t *
+radix_tree_allocate_node(radix_tree_t *tree, int height)
+{
+       radix_tree_node_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh) {
+               fresh->height = height;
+               tree->nodes++;
+       }
+
+       return fresh;
+}
+
+/*
+ * Allocate a node one level below @parent.  Returns NULL on allocation
+ * failure; the caller is responsible for linking the node in.
+ */
+static inline radix_tree_node_t *
+radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
+{
+       int child_height = parent->height - 1;
+
+       return radix_tree_allocate_node(tree, child_height);
+}
+
+/*
+ * Release @node and drop it from the tree's node count.  A NULL @node
+ * is ignored.  The node's links are not inspected.
+ */
+void
+radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       if (node) {
+               free(node);
+               tree->nodes--;
+       }
+}
+
+/*
+ * Wrap @buf (@size bytes, starting at sector @sec) in a new page
+ * descriptor and charge @size to tree->size.  Returns NULL, with the
+ * tree untouched, when the descriptor cannot be allocated.
+ */
+static inline radix_tree_page_t *
+radix_tree_allocate_page(radix_tree_t *tree,
+                        char *buf, uint64_t sec, size_t size)
+{
+       radix_tree_page_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh) {
+               fresh->buf  = buf;
+               fresh->sec  = sec;
+               fresh->size = size;
+
+               tree->size += size;
+       }
+
+       return fresh;
+}
+
+/*
+ * Free @page and its data buffer, updating the tree's byte count and the
+ * cache's prune counter.  Links pointing at the page are NOT touched.
+ */
+static inline void
+radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       int n;
+       size_t nsecs = page->size >> RADIX_TREE_NODE_SHIFT;
+
+       /* one debug line per ejected sector */
+       for (n = 0; n < nsecs; n++)
+               DBG("%s: ejecting sector 0x%llx\n",
+                   tree->cache->name, page->sec + n);
+
+       tree->size -= page->size;
+       tree->cache->stats.prunes += nsecs;
+
+       free(page->buf);
+       free(page);
+}
+
+/*
+ * Drop @page from the tree: every leaf link recorded in page->owners is
+ * cleared, then the page and its buffer are freed.  Only leaves go away
+ * here; the nodes containing them stay and are reaped by a later prune.
+ * A NULL @page is ignored.
+ */
+static void
+radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       radix_tree_link_t **owner;
+
+       if (page == NULL)
+               return;
+
+       for (owner = page->owners;
+            owner < page->owners + BLOCK_CACHE_NODES_PER_PAGE; owner++)
+               radix_tree_clear_link(*owner);
+
+       radix_tree_free_page(tree, page);
+}
+
+/*
+ * Point @link at the sector stored @off bytes into @page and record
+ * @link in the first free slot of page->owners.  Nothing is changed
+ * when the sector would not fit inside the page or when no owner slot
+ * is free.
+ */
+static void
+radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
+                      radix_tree_page_t *page, off_t off)
+{
+       int slot;
+
+       /* the whole sector must lie inside the page buffer */
+       if (off + RADIX_TREE_NODE_SIZE > page->size)
+               return;
+
+       /* look for the first unused owner slot */
+       for (slot = 0; slot < BLOCK_CACHE_NODES_PER_PAGE; slot++)
+               if (!page->owners[slot])
+                       break;
+
+       if (slot == BLOCK_CACHE_NODES_PER_PAGE)
+               return;
+
+       page->owners[slot] = link;
+       link->u.leaf.page  = page;
+       link->u.leaf.buf   = page->buf + off;
+}
+
+/*
+ * Look up the cached data for @sector.
+ *
+ * Walks from the root towards the leaf level, stamping every link on the
+ * way with the current time.  Returns the leaf's buffer (NULL when the
+ * sector is not cached) or NULL as soon as a child node is missing.
+ */
+static char *
+radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
+{
+       struct timeval tv;
+       radix_tree_node_t *cur;
+       radix_tree_link_t *slot;
+
+       gettimeofday(&tv, NULL);
+
+       for (cur = tree->root; ; cur = slot->u.next) {
+               slot       = cur->links + radix_tree_index(cur, sector);
+               slot->time = tv.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, cur))
+                       return slot->u.leaf.buf;
+
+               if (!slot->u.next)
+                       return NULL;
+       }
+}
+
+/*
+ * Insert the sector stored @off bytes into @page as the leaf for
+ * @sector, allocating interior nodes as required.
+ *
+ * Every link on the path is stamped with the current time.  A page that
+ * already occupies the leaf slot is removed first.  Returns the leaf's
+ * buffer on success, or NULL when a node cannot be allocated or the
+ * leaf could not be attached to @page.
+ */
+static char *
+radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
+                   radix_tree_page_t *page, off_t off)
+{
+       int idx;
+       struct timeval now;
+       radix_tree_link_t *link;
+       radix_tree_node_t *node;
+
+       node = tree->root;
+       gettimeofday(&now, NULL);
+
+       do {
+               idx        = radix_tree_index(node, sector);
+               link       = node->links + idx;
+               link->time = now.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, node)) {
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+                       /*
+                        * removing the old page clears all of its owner
+                        * links, this one included, which wipes the
+                        * timestamp set above; stamp it again so the new
+                        * leaf is not treated as idle by the next prune
+                        */
+                       link->time = now.tv_sec;
+                       radix_tree_insert_leaf(tree, link, page, off);
+                       return link->u.leaf.buf;
+               }
+
+               if (!link->u.next) {
+                       link->u.next = radix_tree_allocate_child_node(tree,
+                                                                     node);
+                       if (!link->u.next)
+                               return NULL;
+               }
+
+               node = link->u.next;
+       } while (1);
+}
+
+/*
+ * Cache @sectors consecutive sectors, starting at @sector, whose data
+ * lives in @buf.
+ *
+ * A page descriptor is created for @buf and one leaf is added per
+ * sector.  Returns 0 on success.  On failure returns -ENOMEM; the page
+ * is removed again but @buf itself is left allocated.
+ */
+static int
+radix_tree_add_leaves(radix_tree_t *tree, char *buf,
+                     uint64_t sector, uint64_t sectors)
+{
+       int n = 0;
+       radix_tree_page_t *pg;
+
+       pg = radix_tree_allocate_page(tree, buf, sector,
+                                     sectors << RADIX_TREE_NODE_SHIFT);
+       if (pg == NULL)
+               return -ENOMEM;
+
+       while (n < sectors) {
+               if (!radix_tree_add_leaf(tree, sector + n,
+                                        pg, (n << RADIX_TREE_NODE_SHIFT))) {
+                       /* detach the buffer so removing the page keeps it */
+                       pg->buf = NULL;
+                       radix_tree_remove_page(tree, pg);
+                       return -ENOMEM;
+               }
+               n++;
+       }
+
+       return 0;
+}
+
+/*
+ * Recursively tear down @node and everything below it: pages referenced
+ * by leaf links are removed, child nodes are deleted, each link is
+ * cleared and finally the node itself is freed.  NULL is ignored.
+ */
+static void
+radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       int idx, leaves;
+       radix_tree_link_t *slot;
+
+       if (node == NULL)
+               return;
+
+       /* depends only on node->height, so evaluate it once */
+       leaves = radix_tree_node_contains_leaves(tree, node);
+
+       for (idx = 0; idx < RADIX_TREE_NODE_SIZE; idx++) {
+               slot = &node->links[idx];
+
+               if (leaves)
+                       radix_tree_remove_page(tree, slot->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, slot->u.next);
+
+               radix_tree_clear_link(slot);
+       }
+
+       radix_tree_free_node(tree, node);
+}
+
+/*
+ * Delete the whole tree, root node included, and leave tree->root NULL.
+ */
+static inline void
+radix_tree_destroy(radix_tree_t *tree)
+{
+       radix_tree_node_t *root = tree->root;
+
+       radix_tree_delete_branch(tree, root);
+       tree->root = NULL;
+}
+
+/*
+ * Prune idle entries from the subtree rooted at @node.
+ *
+ * @now is compared with each link's timestamp; a link counts as recent
+ * when fewer than BLOCK_CACHE_PAGE_IDLETIME seconds have passed.
+ *
+ * returns 1 if @node is empty after pruning, 0 otherwise.  An empty
+ * node other than the root is freed here, so the caller must clear its
+ * link to it.  A NULL @node is reported as empty.
+ */
+static int
+radix_tree_prune_branch(radix_tree_t *tree,
+                       radix_tree_node_t *node, uint32_t now)
+{
+       int i, empty;
+       radix_tree_link_t *link;
+
+       empty = 1;
+       if (!node)
+               return empty;
+
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               link = node->links + i;
+
+               /* recently used link: keep it, but still prune beneath it */
+               if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
+                       if (radix_tree_node_contains_leaves(tree, node)) {
+                               empty = 0;
+                               continue;
+                       }
+
+                       /* the child was freed if it came back empty */
+                       if (radix_tree_prune_branch(tree, link->u.next, now))
+                               radix_tree_clear_link(link);
+                       else
+                               empty = 0;
+
+                       continue;
+               }
+
+               /* idle link: drop the page or the entire subtree */
+               if (radix_tree_node_contains_leaves(tree, node))
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, link->u.next);
+
+               radix_tree_clear_link(link);
+       }
+
+       /* the root node is kept even when nothing is left below it */
+       if (empty && !radix_tree_node_is_root(tree, node))
+               radix_tree_free_node(tree, node);
+
+       return empty;
+}
+
+/*
+ * Walk the tree from the root and free whatever has been idle for too
+ * long, logging the cached byte count before and after.  Does nothing
+ * when the tree has no root.
+ */
+static void
+radix_tree_prune(radix_tree_t *tree)
+{
+       struct timeval tv;
+
+       if (tree->root == NULL)
+               return;
+
+       DPRINTF("tree %s has %"PRIu64" bytes\n",
+               tree->cache->name, tree->size);
+
+       gettimeofday(&tv, NULL);
+       radix_tree_prune_branch(tree, tree->root, tv.tv_sec);
+
+       DPRINTF("tree %s now has %"PRIu64" bytes\n",
+               tree->cache->name, tree->size);
+}
+
+/*
+ * Size the tree for @sectors sectors and allocate its root node.
+ * Returns 0 on success or -ENOMEM when the root cannot be allocated.
+ */
+static inline int
+radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
+{
+       int height = radix_tree_calculate_height(sectors);
+
+       tree->height = height;
+       tree->root   = radix_tree_allocate_node(tree, height);
+
+       return tree->root ? 0 : -ENOMEM;
+}
+
+/*
+ * Release everything held by @tree: all nodes and pages are deleted and
+ * tree->root is reset to NULL.
+ */
+static inline void
+radix_tree_free(radix_tree_t *tree)
+{
+       radix_tree_delete_branch(tree, tree->root);
+       tree->root = NULL;
+}
+
+/*
+ * Event callback: prune the tree of the cache passed in @private.
+ * @id and @mode are not used.
+ */
+static void
+block_cache_prune_event(event_id_t id, char mode, void *private)
+{
+       block_cache_t *cache = (block_cache_t *)private;
+
+       (void)id;
+       (void)mode;
+
+       radix_tree_prune(&cache->tree);
+}
+
+/*
+ * Pop an unused request off the cache's free stack.
+ * Returns NULL when the stack is empty.
+ */
+static inline block_cache_request_t *
+block_cache_get_request(block_cache_t *cache)
+{
+       if (cache->requests_free == 0)
+               return NULL;
+
+       cache->requests_free--;
+       return cache->request_free_list[cache->requests_free];
+}
+
+/*
+ * Zero @breq and push it back onto the cache's free stack.
+ */
+static inline void
+block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
+{
+       int top = cache->requests_free;
+
+       memset(breq, 0, sizeof(*breq));
+
+       cache->request_free_list[top] = breq;
+       cache->requests_free          = top + 1;
+}
+
+/*
+ * Open a block cache in front of image @name.
+ *
+ * The cache only works read-only and with a sector size equal to
+ * RADIX_TREE_NODE_SIZE; anything else is rejected with -EINVAL.  On
+ * success the radix tree and request pool are set up and a timer event
+ * is registered that periodically prunes idle pages.
+ *
+ * Returns 0 on success, -ENOMEM when the name or the tree root cannot be
+ * allocated, or the negative id returned by the event registration when
+ * that fails.  On failure everything allocated here is released again.
+ */
+static int
+block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, err;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+
+       if (!td_flag_test(flags, TD_OPEN_RDONLY))
+               return -EINVAL;
+
+       if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
+               return -EINVAL;
+
+       cache = (block_cache_t *)driver->data;
+       err   = tapdisk_namedup(&cache->name, (char *)name);
+       if (err)
+               return -ENOMEM;
+
+       cache->sectors = driver->info.size;
+
+       tree = &cache->tree;
+       err  = radix_tree_initialize(tree, cache->sectors);
+       if (err)
+               goto fail;
+
+       tree->cache = cache;
+       cache->requests_free = BLOCK_CACHE_REQUESTS;
+       for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
+               cache->request_free_list[i] = cache->requests + i;
+
+       cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+                                                         -1, /* dummy fd */
+                                                         BLOCK_CACHE_PAGE_IDLETIME << 1,
+                                                         block_cache_prune_event,
+                                                         cache);
+       if (cache->timeout_id < 0) {
+               /*
+                * err is still 0 from the tree setup; without this the
+                * failure would be reported as success after the state
+                * below has been torn down
+                */
+               err = cache->timeout_id;
+               goto fail;
+       }
+
+       DPRINTF("opening cache for %s, sectors: %"PRIu64", "
+               "tree: %p, height: %d\n",
+               cache->name, cache->sectors, tree, tree->height);
+
+       /* best effort: a failure to lock memory is only logged */
+       if (mlockall(MCL_CURRENT | MCL_FUTURE))
+               DPRINTF("mlockall failed: %d\n", -errno);
+
+       return 0;
+
+fail:
+       free(cache->name);
+       cache->name = NULL; /* do not leave a dangling pointer behind */
+       radix_tree_free(&cache->tree);
+       return err;
+}
+
+/*
+ * Shut the cache down: unregister the prune timer, free the radix tree
+ * and release the name copy.  Always returns 0.
+ */
+static int
+block_cache_close(td_driver_t *driver)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+
+       DPRINTF("closing cache for %s\n", cache->name);
+
+       tapdisk_server_unregister_event(cache->timeout_id);
+       radix_tree_free(&cache->tree);
+       free(cache->name);
+
+       return 0;
+}
+
+/*
+ * Debug checksum of one cached sector (only referenced from a DBG()
+ * message in this part of the file).
+ *
+ * NOTE: the function currently returns 0 unconditionally; the early
+ * return below leaves the summing code unreachable.
+ * NOTE(review): this looks like a deliberate switch-off to avoid the
+ * cost of hashing - confirm before removing either half.
+ */
+static inline uint64_t
+block_cache_hash(block_cache_t *cache, char *buf)
+{
+       int i, n;
+       uint64_t cksm, *data;
+
+       return 0;
+
+       /* unreachable: complement of the sum of the sector's 64-bit words */
+       cksm = 0;
+       data = (uint64_t *)buf;
+       n    = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
+
+       for (i = 0; i < n; i++)
+               cksm += data[i];
+
+       return ~cksm;
+}
+
+/*
+ * Serve @treq entirely from the cache: iov[n] is the cached buffer of
+ * the n-th requested sector.  Each sector is copied into the request
+ * buffer and the request is completed with status 0.
+ */
+static void
+block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
+{
+       int n;
+       off_t dst_off;
+
+       cache->stats.hits += treq.secs;
+
+       n = 0;
+       while (n < treq.secs) {
+               DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
+                   cache->name, treq.sec + n, block_cache_hash(cache, iov[n]));
+
+               dst_off = n << RADIX_TREE_NODE_SHIFT;
+               memcpy(treq.buf + dst_off, iov[n], RADIX_TREE_NODE_SIZE);
+               n++;
+       }
+
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Completion callback for a read forwarded by block_cache_miss().
+ *
+ * @clone may cover only part of the original request; its sectors are
+ * subtracted from the outstanding count and the first error is kept.
+ * Once nothing is outstanding, the data is copied into the original
+ * request's buffer and handed to the radix tree (or the bounce buffer
+ * is freed on error / insertion failure), the original request is
+ * completed and the tracking entry is returned to the pool.
+ */
+static void
+block_cache_populate_cache(td_request_t clone, int err)
+{
+       int n;
+       block_cache_request_t *breq = (block_cache_request_t *)clone.cb_data;
+       block_cache_t *cache        = breq->cache;
+
+       breq->secs -= clone.secs;
+       if (!breq->err)
+               breq->err = err;
+
+       /* other pieces of this request are still in flight */
+       if (breq->secs)
+               return;
+
+       if (breq->err) {
+               free(breq->buf);
+       } else {
+               for (n = 0; n < breq->treq.secs; n++) {
+                       off_t pos = n << RADIX_TREE_NODE_SHIFT;
+                       DBG("%s: populating sec 0x%08llx\n",
+                           cache->name, breq->treq.sec + n);
+                       memcpy(breq->treq.buf + pos,
+                              breq->buf + pos, RADIX_TREE_NODE_SIZE);
+               }
+
+               /* the tree did not take the buffer: release it here */
+               if (radix_tree_add_leaves(&cache->tree, breq->buf,
+                                         breq->treq.sec, breq->treq.secs))
+                       free(breq->buf);
+       }
+
+       td_complete_request(breq->treq, breq->err);
+       block_cache_put_request(cache, breq);
+}
+
+/*
+ * Handle a read that is not (fully) cached by forwarding it.
+ *
+ * When the cache has room, a tracking request is free and a bounce
+ * buffer can be allocated, the forwarded clone reads into that buffer
+ * and completes through block_cache_populate_cache().  Otherwise the
+ * request is forwarded unchanged and bypasses the cache.
+ */
+static void
+block_cache_miss(block_cache_t *cache, td_request_t treq)
+{
+       char *bounce;
+       size_t nbytes;
+       td_request_t fwd;
+       block_cache_request_t *breq;
+
+       DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
+
+       fwd    = treq;
+       nbytes = treq.secs << RADIX_TREE_NODE_SHIFT;
+
+       cache->stats.misses += treq.secs;
+
+       if (radix_tree_size(&cache->tree) + nbytes < BLOCK_CACHE_MAX_SIZE) {
+               breq = block_cache_get_request(cache);
+
+               if (breq) {
+                       if (posix_memalign((void **)&bounce,
+                                          RADIX_TREE_NODE_SIZE, nbytes)) {
+                               /* no buffer: give the entry back */
+                               block_cache_put_request(cache, breq);
+                       } else {
+                               breq->treq  = treq;
+                               breq->secs  = treq.secs;
+                               breq->err   = 0;
+                               breq->buf   = bounce;
+                               breq->cache = cache;
+
+                               fwd.buf     = bounce;
+                               fwd.cb      = block_cache_populate_cache;
+                               fwd.cb_data = breq;
+                       }
+               }
+       }
+
+       td_forward_request(fwd);
+}
+
+/*
+ * Read entry point of the cache driver.
+ *
+ * Requests larger than BLOCK_CACHE_NODES_PER_PAGE sectors are forwarded
+ * untouched.  Otherwise every sector is looked up; if all are cached
+ * the request is served from memory, and the first missing sector turns
+ * the whole request into a miss.
+ */
+static void
+block_cache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       int n;
+       block_cache_t *cache = (block_cache_t *)driver->data;
+       char *found[BLOCK_CACHE_NODES_PER_PAGE];
+
+       cache->stats.reads += treq.secs;
+
+       if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) {
+               td_forward_request(treq);
+               return;
+       }
+
+       for (n = 0; n < treq.secs; n++) {
+               found[n] = radix_tree_find_leaf(&cache->tree, treq.sec + n);
+               if (found[n] == NULL) {
+                       block_cache_miss(cache, treq);
+                       return;
+               }
+       }
+
+       block_cache_hit(cache, treq, found);
+}
+
+/*
+ * Write entry point: the cache does not accept writes, so every write
+ * request is completed immediately with -EPERM.
+ */
+static void
+block_cache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       (void)driver;
+
+       td_complete_request(treq, -EPERM);
+}
+
+/*
+ * Parent lookup hook: always fails with -EINVAL.
+ * Neither argument is examined.
+ */
+static int
+block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       (void)driver;
+       (void)id;
+
+       return -EINVAL;
+}
+
+/*
+ * Check that @pdriver may sit beneath this cache.
+ *
+ * The parent must be flagged TD_DRIVER_RDONLY and carry the same name
+ * as the cache driver.  Returns 0 when both hold, -EINVAL otherwise.
+ * @flags is not examined.
+ */
+static int
+block_cache_validate_parent(td_driver_t *driver,
+                           td_driver_t *pdriver, td_flag_t flags)
+{
+       if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
+               return -EINVAL;
+
+       if (strcmp(driver->name, pdriver->name))
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Debug hook: log the cache's name and its read/hit/miss/prune counters
+ * at warning level.
+ */
+static void
+block_cache_debug(td_driver_t *driver)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+       block_cache_stats_t *st = &cache->stats;
+
+       WARN("BLOCK CACHE %s\n", cache->name);
+       WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n",
+            st->reads, st->hits, st->misses, st->prunes);
+}
+
+/*
+ * Driver descriptor for the "tapdisk_block_cache" disk type: wires the
+ * block_cache_* callbacks into the generic tap_disk interface and
+ * reports the size of the per-driver private state (block_cache_t).
+ */
+struct tap_disk tapdisk_block_cache = {
+       .disk_type                  = "tapdisk_block_cache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(block_cache_t),
+       .td_open                    = block_cache_open,
+       .td_close                   = block_cache_close,
+       .td_queue_read              = block_cache_queue_read,
+       .td_queue_write             = block_cache_queue_write,
+       .td_get_parent_id           = block_cache_get_parent_id,
+       .td_validate_parent         = block_cache_validate_parent,
+       .td_debug                   = block_cache_debug,
+};
diff --git a/tools/blktap2/drivers/block-log.c b/tools/blktap2/drivers/block-log.c
new file mode 100644 (file)
index 0000000..2cc051b
--- /dev/null
@@ -0,0 +1,688 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver to sit on top of another disk and log writes, in order
+ * to synchronize two distinct disks
+ *
+ * On receipt of a control request it can export a list of dirty
+ * sectors in the following format:
+ * struct writerange {
+ *   u64 sector;
+ *   u32 count;
+ * }
+ * terminated by { 0, 0 }
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "log.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_CONNECTIONS 1
+
+/* a file descriptor together with the id of the event registered for it */
+typedef struct poll_fd {
+  int          fd;
+  event_id_t   id;
+} poll_fd_t;
+
+/*
+ * Private state of the write-logging driver.
+ *   size        - sector count of the disk (one bitmap bit per sector)
+ *   writelog    - dirty bitmap allocated by writelog_create()
+ *   ctlpath     - filesystem path of the control socket
+ *   ctl         - listening control socket and its event id
+ *   connected   - number of control connections currently open
+ *   connections - accepted control connections
+ *   shmpath     - name of the POSIX shared memory object
+ *   shm         - mapping of that object (SHMSIZE bytes); dirty ranges
+ *                 are exported into it
+ *   sring/bring - shared ring and its back-end view; re-initialised
+ *                 when the control channel is closed
+ */
+struct tdlog_state {
+  uint64_t     size;
+
+  void*        writelog;
+
+  char*        ctlpath;
+  poll_fd_t    ctl;
+
+  int          connected;
+  poll_fd_t    connections[MAX_CONNECTIONS];
+
+  char*        shmpath;
+  void*        shm;
+
+  log_sring_t* sring;
+  log_back_ring_t bring;
+};
+
+#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+
+#define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG]
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/*
+ * Return 1 if bit @nr of bitmap @bmap is set, 0 otherwise.
+ * @nr is 64 bits wide because callers pass sector numbers, which would
+ * be truncated by an int parameter on large disks.
+ */
+static inline int test_bit(uint64_t nr, void* bmap)
+{
+  return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/*
+ * Clear bit @nr of bitmap @bmap.
+ * @nr is 64 bits wide because callers pass sector numbers, which would
+ * be truncated by an int parameter on large disks.
+ */
+static inline void clear_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Set bit @nr of bitmap @bmap.
+ * @nr is 64 bits wide because callers pass sector numbers, which would
+ * be truncated by an int parameter on large disks.
+ */
+static inline void set_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Return the number of bytes needed for a bitmap of @sz bits.
+ *
+ * The bitmap is read and written in unsigned-long words (BITMAP_ENTRY),
+ * so the size is rounded UP to a whole number of words.  Rounding down
+ * to bytes would put the last bits of a disk whose sector count is not
+ * a multiple of BITS_PER_LONG outside the allocation.
+ */
+static inline int bitmap_size(uint64_t sz)
+{
+  return BITS_TO_LONGS(sz) * sizeof(unsigned long);
+}
+
+/*
+ * Allocate a zeroed dirty bitmap sized for s->size and store it in
+ * s->writelog.  Returns 0 on success, -1 if the allocation fails.
+ */
+static int writelog_create(struct tdlog_state *s)
+{
+  uint64_t nbytes = bitmap_size(s->size);
+
+  BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", nbytes);
+
+  s->writelog = calloc(nbytes, 1);
+  if (s->writelog)
+    return 0;
+
+  BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, nbytes);
+  return -1;
+}
+
+/*
+ * Release the dirty bitmap, if one was allocated.  Always returns 0.
+ */
+static int writelog_free(struct tdlog_state *s)
+{
+  /* free() accepts NULL, so no separate check is needed */
+  free(s->writelog);
+
+  return 0;
+}
+
+/*
+ * Mark @count sectors starting at @sector as dirty in the write log.
+ * No range checking is done here.  Always returns 0.
+ */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+  int done = 0;
+
+  while (done < count) {
+    set_bit(sector + done, s->writelog);
+    done++;
+  }
+
+  return 0;
+}
+
+/*
+ * Clear the dirty bits of sectors [start, end).
+ *
+ * if end is 0 (or lies beyond the disk), clear to end of disk.  An
+ * empty range is a no-op.  Always returns 0.
+ *
+ * Bits are cleared one at a time up to the surrounding word boundaries
+ * and the whole words in between are zeroed with a single memset.
+ */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+  if (!end || end > s->size)
+    end = s->size;
+
+  if (start >= end)
+    return 0;
+
+  /* clear to word boundaries, never stepping outside [start, end) */
+  while (start < end && BITMAP_SHIFT(start))
+    clear_bit(start++, s->writelog);
+  /* end is exclusive: step down first, then clear */
+  while (end > start && BITMAP_SHIFT(end))
+    clear_bit(--end, s->writelog);
+
+  /* start and end are word-aligned bit indices here; convert to bytes */
+  if (end > start)
+    memset((char *)s->writelog + (start >> 3), 0, (end - start) >> 3);
+
+  return 0;
+}
+
+/* Export the dirty bitmap into the shared memory region as a list of
+ * { sector, count } extents, each covering one run of consecutive dirty
+ * sectors.
+ *
+ * returns last block exported (may not be end of disk if shm region
+ * overflows).  On overflow the list is returned WITHOUT the { 0, 0 }
+ * terminator; otherwise the terminator is written after the last
+ * extent. */
+static uint64_t writelog_export(struct tdlog_state* s)
+{
+  struct disk_range* range = s->shm;
+  uint64_t i = 0;
+
+  BDPRINTF("sector count: %"PRIu64, s->size);
+
+  for (i = 0; i < s->size; i++) {
+    if (test_bit(i, s->writelog)) {
+      /* range start */
+      range->sector = i;
+      range->count = 1;
+      /* find end */
+      for (i++; i < s->size && test_bit(i, s->writelog); i++)
+       range->count++;
+
+      BDPRINTF("export: dirty extent %"PRIu64":%u",
+              range->sector, range->count);
+      range++;
+
+      /* out of space in shared memory region */
+      if ((void*)range >= bmend(s->shm)) {
+       BDPRINTF("out of space in shm region at sector %"PRIu64, i);
+       return i;
+      }
+
+      /* undo forloop increment: i already points at the first sector
+       * after the extent */
+      i--;
+    }
+  }
+
+  /* NULL-terminate range list */
+  range->sector = 0;
+  range->count = 0;
+
+  return i;
+}
+
+/* -- communication channel -- */
+
+/* replace the FS special characters ':' and '/' with '_' in up to len
+ * bytes of path, stopping early at the terminating NUL */
+static inline void path_escape(char* path, size_t len) {
+  size_t pos;
+
+  for (pos = 0; pos < len && path[pos] != '\0'; pos++) {
+    if (path[pos] == ':' || path[pos] == '/')
+      path[pos] = '_';
+  }
+}
+
+/*
+ * Build the path BLKTAP_CTRL_DIR "/log_<file>.<ext>", where <file> is
+ * the part of @name from its last '/' onwards with FS special
+ * characters escaped.
+ *
+ * Returns a malloc'ed string the caller must free, or NULL when @name
+ * contains no '/' or the string cannot be allocated.
+ */
+static char* ctl_makepath(const char* name, const char* ext)
+{
+  char *path;
+  char *base = strrchr(name, '/');
+
+  if (base == NULL) {
+    BWPRINTF("invalid name %s\n", name);
+    return NULL;
+  }
+
+  if (asprintf(&path, BLKTAP_CTRL_DIR "/log_%s.%s", base, ext) < 0) {
+    BWPRINTF("could not allocate path");
+    return NULL;
+  }
+
+  /* skip the directory and the 5 characters of "/log_" */
+  path_escape(path + strlen(BLKTAP_CTRL_DIR) + 5, strlen(base));
+
+  return path;
+}
+
+/*
+ * Create and map the shared memory region for device @name.
+ *
+ * The object is called "/log_<name>.wlog" (FS special characters in
+ * <name> escaped), sized to SHMSIZE and mapped read/write into s->shm.
+ *
+ * Returns 0 on success.  On failure returns -1 with s->shm and
+ * s->shmpath reset to NULL; an object created here is unlinked again.
+ */
+static int shmem_open(struct tdlog_state* s, const char* name)
+{
+  int fd;
+
+  /* device name -> path */
+  if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
+    BWPRINTF("could not allocate shm path");
+    /* the pointer is indeterminate after a failed asprintf */
+    s->shmpath = NULL;
+    return -1;
+  }
+
+  /* skip the 5 characters of "/log_" */
+  path_escape(s->shmpath + 5, strlen(name));
+
+  if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
+            strerror(errno));
+    goto err;
+  }
+  if (ftruncate(fd, SHMSIZE) < 0) {
+    BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+    close(fd);
+    goto err_unlink;
+  }
+
+  s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  /* the mapping stays valid after the descriptor is closed */
+  close(fd);
+  if (s->shm == MAP_FAILED) {
+    BWPRINTF("could not mmap write log shm: %s", strerror(errno));
+    goto err_unlink;
+  }
+  return 0;
+
+  err_unlink:
+  /* do not leave the shm object behind when setup fails */
+  shm_unlink(s->shmpath);
+  err:
+  s->shm = NULL;
+  free(s->shmpath);
+  s->shmpath = NULL;
+  return -1;
+}
+
+/*
+ * Undo shmem_open(): unmap the region, unlink the shared memory object
+ * and release the path string.  Safe to call when nothing was opened.
+ * Always returns 0.
+ */
+static int shmem_close(struct tdlog_state* s)
+{
+  if (s->shm) {
+    munmap(s->shm, SHMSIZE);
+    s->shm = NULL;
+  }
+
+  if (s->shmpath) {
+    shm_unlink(s->shmpath);
+    /* the path was allocated by asprintf() and must be freed */
+    free(s->shmpath);
+    s->shmpath = NULL;
+  }
+
+  return 0;
+}
+
+/* control socket */
+
+/*
+ * Create the control socket for device @name.
+ *
+ * A UNIX stream socket is bound to the path produced by ctl_makepath()
+ * (a stale socket file at that path is removed first), put into
+ * listening state and registered with the tapdisk server so that
+ * ctl_accept() runs on incoming connections.
+ *
+ * Returns 0 on success, -1 on failure.  On failure s->ctl.fd is -1 and
+ * s->ctlpath is NULL.
+ */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+  struct sockaddr_un saddr;
+  size_t pathlen;
+
+  if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+    return -1;
+
+  /* sun_path is a fixed-size array; refuse paths that do not fit
+   * together with their terminating NUL */
+  pathlen = strlen(s->ctlpath);
+  if (pathlen >= sizeof(saddr.sun_path)) {
+    BWPRINTF("control socket path %s is too long", s->ctlpath);
+    s->ctl.fd = -1;
+    goto err;
+  }
+
+  if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error opening control socket: %s", strerror(errno));
+    goto err;
+  }
+
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, s->ctlpath, pathlen);
+  if (unlink(s->ctlpath) && errno != ENOENT) {
+    BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+    
+  if (bind(s->ctl.fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+
+  if (listen(s->ctl.fd, 1) < 0) {
+    BWPRINTF("error listening on control socket: %s", strerror(errno));
+    goto err_unlink;
+  }
+
+  s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           s->ctl.fd, 0, ctl_accept, s);
+  if (s->ctl.id < 0) {
+    /* presumably a negative errno value; strerror() needs it positive
+     * (NOTE(review): confirm against tapdisk_server_register_event) */
+    BWPRINTF("error register event handler: %s", strerror(-s->ctl.id));
+    goto err_unlink;
+  }
+
+  return 0;
+
+  err_unlink:
+  /* bind() created the socket file; do not leave it behind */
+  unlink(s->ctlpath);
+  err_sock:
+  close(s->ctl.fd);
+  s->ctl.fd = -1;
+  err:
+  free(s->ctlpath);
+  s->ctlpath = NULL;
+
+  return -1;
+}
+
+/*
+ * Shut down the control channel: drop every accepted connection, close
+ * the listening socket, remove the socket path and reset the shared ring.
+ * Always returns 0.
+ */
+static int ctl_close(struct tdlog_state* s)
+{
+  while (s->connected) {
+    /* live entries occupy [0, connected) (see ctl_accept), so step down
+     * to the last valid slot before indexing */
+    s->connected--;
+    tapdisk_server_unregister_event(s->connections[s->connected].id);
+    close(s->connections[s->connected].fd);
+    s->connections[s->connected].fd = -1;
+    s->connections[s->connected].id = 0;
+  }
+
+  if (s->ctl.fd >= 0) {
+    tapdisk_server_unregister_event(s->ctl.id);
+    close(s->ctl.fd);
+    s->ctl.fd = -1;
+    s->ctl.id = 0;
+  }
+
+  if (s->ctlpath) {
+    unlink(s->ctlpath);
+    free(s->ctlpath);
+    s->ctlpath = NULL;
+  }
+
+  /* XXX this must be fixed once requests are actually in flight */
+  /* could just drain the existing ring here first */
+  if (s->sring) {
+    SHARED_RING_INIT(s->sring);
+    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+  }
+
+  return 0;
+}
+
+/*
+ * Walk the list of open connections and close the one whose descriptor
+ * is `fd`.  Returns 0 if the connection was found and closed, -1 if no
+ * live connection uses that descriptor.
+ */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+  int i;
+
+  /* only slots [0, connected) hold live connections */
+  for (i = 0; i < s->connected; i++) {
+    if (s->connections[i].fd == fd) {
+      tapdisk_server_unregister_event(s->connections[i].id);
+      close(s->connections[i].fd);
+      s->connected--;
+      /* keep the live entries dense: move the last one into the freed
+       * slot, then clear the now-unused tail slot */
+      s->connections[i] = s->connections[s->connected];
+      s->connections[s->connected].fd = -1;
+      s->connections[s->connected].id = 0;
+      return 0;
+    }
+  }
+
+  BWPRINTF("requested to close unknown socket %d", fd);
+  return -1;
+}
+
+/*
+ * Event callback for the listening control socket.  Accepts the pending
+ * connection; if a control session is already active the new connection
+ * is closed immediately, otherwise ctl_request() is registered as its
+ * read handler and it is recorded in s->connections.
+ */
+static void ctl_accept(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state *)private;
+  int fd;
+  event_id_t cid;
+
+  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
+    BWPRINTF("error accepting control connection: %s", strerror(errno));
+    return;
+  }
+
+  /* only one control session at a time */
+  if (s->connected) {
+    BWPRINTF("control session in progress, closing new connection");
+    close(fd);
+    return;
+  }
+
+  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                     fd, 0, ctl_request, s);
+  if (cid < 0) {
+    /* failure is signalled by a negative id; presumably -errno, so negate
+     * it for strerror() -- confirm against the scheduler */
+    BWPRINTF("error registering connection event handler: %s", strerror(-cid));
+    close(fd);
+    return;
+  }
+
+  s->connections[s->connected].fd = fd;
+  s->connections[s->connected].id = cid;
+  s->connected++;
+}
+
+/*
+ * Send the shared-memory parameters to the client on `fd`.
+ * Response format: 4 bytes shmsize, 0-terminated path, CTLRSPLEN_SHMP
+ * bytes in total.  Returns 0 on success, -1 if the write fails.
+ */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+  char msg[CTLRSPLEN_SHMP + 1];
+  uint32_t sz;
+  int rc;
+
+  BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
+          SHMSIZE, s->shmpath);
+
+  /* TMP: sanity-check shm */
+  sz = 0xdeadbeef;
+  memcpy(s->shm, &sz, sizeof(sz));
+
+  /* the full fixed-size buffer goes on the wire; zero it so the bytes
+   * after the path's terminator are not uninitialised stack contents */
+  memset(msg, 0, sizeof(msg));
+
+  sz = SHMSIZE;
+  memcpy(msg, &sz, sizeof(sz));
+  snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
+  if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
+    BWPRINTF("error writing shmpath: %s", strerror(errno));
+    return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * Handle a "peek" request: export the write log via writelog_export()
+ * without clearing it, then acknowledge with "done" on `fd`.
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_peek_writes(struct tdlog_state* s, int fd)
+{
+  int written;
+
+  BDPRINTF("ctl: peeking bitmap");
+
+  writelog_export(s);
+
+  written = write(fd, "done", CTLRSPLEN_PEEK);
+  if (written >= 0)
+    return 0;
+
+  BWPRINTF("error writing peek ack: %s", strerror(errno));
+  return -1;
+}
+
+/*
+ * Handle a "clear" request: reset the write log with
+ * writelog_clear(s, 0, 0), then acknowledge with "done" on `fd`.
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_clear_writes(struct tdlog_state* s, int fd)
+{
+  int written;
+
+  BDPRINTF("ctl: clearing bitmap");
+
+  writelog_clear(s, 0, 0);
+
+  written = write(fd, "done", CTLRSPLEN_CLEAR);
+  if (written >= 0)
+    return 0;
+
+  BWPRINTF("error writing clear ack: %s", strerror(errno));
+  return -1;
+}
+
+/*
+ * Handle a "get" request: export the dirty bitmap and clear it in one
+ * step (export immediately followed by clear, with no request handled
+ * in between), then acknowledge with "done" on `fd`.
+ * Returns 0 on success, -1 if the acknowledgement cannot be written.
+ */
+static int ctl_get_writes(struct tdlog_state* s, int fd)
+{
+  int written;
+
+  BDPRINTF("ctl: getting bitmap");
+
+  writelog_export(s);
+  writelog_clear(s, 0, 0);
+
+  written = write(fd, "done", CTLRSPLEN_GET);
+  if (written >= 0)
+    return 0;
+
+  BWPRINTF("error writing get ack: %s", strerror(errno));
+  return -1;
+}
+
+/*
+ * Handle a "kick": consume every request currently on the shared ring,
+ * echo each one back as a response (testing behaviour -- requests are
+ * not actually submitted yet), push the responses and notify the client
+ * with a LOGCMD_KICK message on `fd`.
+ *
+ * Returns 0 on success, -1 if the notification is not written in full.
+ */
+static int ctl_kick(struct tdlog_state* s, int fd)
+{
+  RING_IDX reqstart, reqend;
+  log_request_t req;
+
+  /* XXX testing */
+  log_response_t rsp;
+  struct log_ctlmsg msg;
+  int rc;
+
+  reqstart = s->bring.req_cons;
+  reqend = s->sring->req_prod;
+
+  BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
+
+  while (reqstart != reqend) {
+    /* XXX actually submit these! */
+    /* copy the request out of shared memory before using it */
+    memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
+    BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
+    s->bring.req_cons = ++reqstart;
+
+    rsp.sector = req.sector;
+    rsp.count = req.count;
+    memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
+          sizeof(rsp));
+    s->bring.rsp_prod_pvt++;
+  }
+
+  RING_PUSH_RESPONSES(&s->bring);
+  memset(&msg, 0, sizeof(msg));
+  memcpy(msg.msg, LOGCMD_KICK, 4);
+  if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error sending notify: %s", strerror(errno));
+    return -1;
+  } else if ((size_t)rc < sizeof(msg)) {
+    /* rc is known to be non-negative here, so the cast is safe */
+    BWPRINTF("short notify write (%d/%zu)", rc, sizeof(msg));
+    return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * Dispatch a control message to its handler, selected by the first four
+ * bytes of msg->msg.  Returns the handler's result, or -1 for an unknown
+ * command.
+ */
+static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
+{
+  /* checked in this order; the first matching command wins */
+  static const struct {
+    const char *cmd;
+    int (*handler)(struct tdlog_state*, int);
+  } handlers[] = {
+    { LOGCMD_SHMP,  ctl_get_shmpath  },
+    { LOGCMD_PEEK,  ctl_peek_writes  },
+    { LOGCMD_CLEAR, ctl_clear_writes },
+    { LOGCMD_GET,   ctl_get_writes   },
+    { LOGCMD_KICK,  ctl_kick         },
+  };
+  unsigned int k;
+
+  for (k = 0; k < sizeof(handlers) / sizeof(handlers[0]); k++)
+    if (strncmp(msg->msg, handlers[k].cmd, 4) == 0)
+      return handlers[k].handler(s, fd);
+
+  BWPRINTF("unknown control request %.4s", msg->msg);
+  return -1;
+}
+
+/*
+ * Map an event id back to the descriptor of the live connection that
+ * registered it.  Returns the fd, or -1 if no live connection has `id`.
+ */
+static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
+{
+  int slot = 0;
+
+  while (slot < s->connected) {
+    if (s->connections[slot].id == id)
+      return s->connections[slot].fd;
+    slot++;
+  }
+
+  BWPRINTF("unrecognized event callback id %d", id);
+  return -1;
+}
+
+/*
+ * Event callback for an accepted control connection.  Reads one
+ * fixed-size log_ctlmsg and hands it to ctl_do_request().  A read error
+ * or EOF closes the connection; a short read is logged and ignored.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state*)private;
+  struct log_ctlmsg msg;
+  int rc, fd = -1;
+
+  fd = ctl_find_connection(s, id);
+  if (fd == -1)
+    return;
+
+  if ((rc = read(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
+            strerror(errno));
+    ctl_close_sock(s, fd);
+    return;
+  } else if (rc == 0) {
+    BDPRINTF("ctl_request: EOF, closing socket");
+    ctl_close_sock(s, fd);
+    return;
+  } else if ((size_t)rc < sizeof(msg)) {
+    /* rc is known to be positive here, so the cast is safe */
+    BWPRINTF("short request received (%d/%zu bytes), ignoring", rc,
+            sizeof(msg));
+    return;
+  }
+
+  ctl_do_request(s, fd, &msg);
+}
+
+/* -- interface -- */
+
+static int tdlog_close(td_driver_t*);
+
+/*
+ * Driver open hook: initialise the private state, create the write log,
+ * the shared-memory segment and the control socket, then set up the
+ * request ring inside the shared memory.
+ *
+ * Returns 0 on success; on failure everything opened so far is torn
+ * down through tdlog_close() and the failing step's code is returned.
+ */
+static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+  int rc;
+
+  memset(s, 0, sizeof(*s));
+  /* 0 is a valid descriptor: mark the control socket as not open so that
+   * an early tdlog_close() -> ctl_close() does not close fd 0 */
+  s->ctl.fd = -1;
+
+  s->size = driver->info.size;
+
+  if ((rc = writelog_create(s))) {
+    tdlog_close(driver);
+    return rc;
+  }
+  if ((rc = shmem_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+  if ((rc = ctl_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+
+  s->sring = (log_sring_t*)sringstart(s->shm);
+  SHARED_RING_INIT(s->sring);
+  BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+
+  BDPRINTF("opened ctl socket");
+
+  return 0;
+}
+
+/*
+ * Driver close hook: release the control channel, the shared memory and
+ * the write log, in that order.  Always returns 0.
+ */
+static int tdlog_close(td_driver_t* driver)
+{
+  struct tdlog_state* state;
+
+  state = (struct tdlog_state*)driver->data;
+
+  ctl_close(state);
+  shmem_close(state);
+  writelog_free(state);
+
+  return 0;
+}
+
+/* Reads are not logged: pass the request straight on via
+ * td_forward_request(). */
+static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
+{
+  (void)driver;
+
+  td_forward_request(treq);
+}
+
+/*
+ * Record the sectors touched by a write in the write log, then pass the
+ * request on via td_forward_request().
+ */
+static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+  writelog_set(s, treq.sec, treq.secs);
+  td_forward_request(treq);
+}
+
+/* Parent-id hook: this driver never fills in `id`; always returns
+ * -EINVAL. */
+static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
+{
+  (void)driver;
+  (void)id;
+
+  return -EINVAL;
+}
+
+/* Parent validation hook: performs no checks and always returns 0. */
+static int tdlog_validate_parent(td_driver_t *driver,
+                                td_driver_t *parent, td_flag_t flags)
+{
+  (void)driver;
+  (void)parent;
+  (void)flags;
+
+  return 0;
+}
+
+/*
+ * Driver descriptor for the write-logging driver: ties the tdlog_*
+ * hooks above to the tap_disk interface and sizes the per-instance
+ * private data as one struct tdlog_state.
+ */
+struct tap_disk tapdisk_log = {
+  .disk_type          = "tapdisk_log",
+  .private_data_size  = sizeof(struct tdlog_state),
+  .flags              = 0,
+  .td_open            = tdlog_open,
+  .td_close           = tdlog_close,
+  .td_queue_read      = tdlog_queue_read,
+  .td_queue_write     = tdlog_queue_write,
+  .td_get_parent_id   = tdlog_get_parent_id,
+  .td_validate_parent = tdlog_validate_parent,
+};
diff --git a/tools/blktap2/drivers/block-qcow.c b/tools/blktap2/drivers/block-qcow.c
new file mode 100644 (file)
index 0000000..1ddd92d
--- /dev/null
@@ -0,0 +1,1517 @@
+/* block-qcow.c
+ *
+ * Asynchronous Qemu copy-on-write disk implementation.
+ * Code based on the Qemu implementation
+ * (see copyright notice below)
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ */
+
+/*
+ * Block driver for the QCOW format
+ * 
+ * Copyright (c) 2004 Fabrice Bellard
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <openssl/md5.h>
+#include "bswap.h"
+#include "aes.h"
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+#include "blk.h"
+#include "atomicio.h"
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE     0
+#endif
+
+/*
+ * Debug assertion.  When enabled (the `#if 1` branch) a failed predicate
+ * is logged through DPRINTF with its text, line and file, and the
+ * process is then crashed on purpose by writing through a NULL pointer.
+ * The disabled variant expands to a no-op.
+ *
+ * NOTE(review): the enabled form expands to a bare `if (...) { ... }`,
+ * so using it directly before an `else` would capture that `else`;
+ * wrapping it in do { } while (0) would avoid this -- check call sites
+ * before changing.
+ */
+#if 1
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+/*
+ * Bookkeeping record for one outstanding asynchronous I/O: a callback
+ * with its private argument, plus an id, buffer, starting sector and
+ * sector count.
+ * NOTE(review): not referenced anywhere in the visible part of this
+ * file -- possibly a leftover from the older blktap driver; confirm
+ * before relying on it.
+ */
+struct pending_aio {
+        td_callback_t cb;
+        int id;
+        void *private;
+       int nb_sectors;
+       char *buf;
+       uint64_t sector;
+};
+
+/* Index of iocb `_io` within `_s`->iocb_list (pointer difference).  Any
+ * earlier definition of IOCB_IDX is dropped first. */
+#undef IOCB_IDX
+#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
+
+/* OR-ing with 0 leaves the value unchanged, so this evaluates to `_b`.
+ * NOTE(review): `_b` is not parenthesised in the expansion; pass only
+ * simple expressions. */
+#define ZERO_TEST(_b) (_b | 0x00)
+
+/*
+ * One in-flight asynchronous request.  Instances come from the
+ * preallocated pool built by init_aio_state() and are returned to the
+ * free list by tdqcow_complete().
+ */
+struct qcow_request {
+       td_request_t         treq;   /* request being served; completed in tdqcow_complete() */
+       struct tiocb         tiocb;  /* I/O control block queued via td_queue_tiocb() */
+       struct tdqcow_state  *state; /* owning driver state (holds the free list) */
+};
+
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
+
+#ifdef USE_GCRYPT
+
+#include <gcrypt.h>
+
+/*
+ * Checksum `len` bytes at `ptr`: compute the MD5 digest with libgcrypt
+ * and return its first 32-bit word.
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+  uint32_t digest[4];
+
+  /* Generate checksum */
+  gcry_md_hash_buffer(GCRY_MD_MD5, digest, ptr, len);
+
+  return digest[0];
+}
+
+#else /* use libcrypto */
+
+#include <openssl/md5.h>
+
+/*
+ * Checksum `len` bytes at `ptr`: compute the MD5 digest with libcrypto
+ * and return its first four bytes as a uint32_t (host byte order).
+ * Returns 0 if MD5() does not report the supplied output buffer.
+ */
+uint32_t gen_cksum(char *ptr, int len)
+{
+  /* the digest is only MD5_DIGEST_LENGTH bytes: keep it on the stack so
+   * the checksum cannot silently become 0 on allocation failure */
+  unsigned char md[MD5_DIGEST_LENGTH];
+  uint32_t ret;
+
+  /* Generate checksum */
+  if (MD5((unsigned char *)ptr, len, md) != md)
+    ret = 0;
+  else
+    memcpy(&ret, md, sizeof(uint32_t));
+
+  return ret;
+}
+
+#endif
+
+
+/* Release the request pool and the free list allocated by
+ * init_aio_state().  The pointers themselves are left untouched. */
+static void free_aio_state(struct tdqcow_state* s)
+{
+       free(s->aio_free_list);
+       free(s->aio_requests);
+}
+
+/*
+ * Allocate the pool of qcow_request structures and the free list of
+ * pointers into it, and mark every request as free.
+ *
+ * Returns 0 on success.  On allocation failure returns -1 with both
+ * pointers freed and set to NULL, so a later free_aio_state() is
+ * harmless.
+ */
+static int init_aio_state(td_driver_t *driver)
+{
+       int i;
+       struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
+
+       // A segment (i.e. a page) can span multiple clusters
+       s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
+               MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
+
+       s->aio_free_count = s->max_aio_reqs;
+
+       /* the free list holds pointers, so size it by its own element type
+        * rather than by the (much larger) request structure */
+       s->aio_requests  = calloc(s->max_aio_reqs, sizeof(*s->aio_requests));
+       s->aio_free_list = calloc(s->max_aio_reqs, sizeof(*s->aio_free_list));
+       if (!s->aio_requests || !s->aio_free_list) {
+               DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
+                       s->max_aio_reqs);
+               free(s->aio_requests);
+               free(s->aio_free_list);
+               s->aio_requests  = NULL;
+               s->aio_free_list = NULL;
+               return -1;
+       }
+
+       for (i = 0; i < s->max_aio_reqs; i++)
+               s->aio_free_list[i] = &s->aio_requests[i];
+
+       DPRINTF("AIO state initialised\n");
+
+       return 0;
+}
+
+/*
+ * Determine the size, in sectors, of the image `filename`.
+ *
+ * If the file starts with a QCOW header, the virtual size recorded in
+ * the header is used.  Otherwise the size comes from the block device
+ * (when `st` describes one) or from st->st_size.  `st` must already
+ * hold the stat information for `filename`.
+ *
+ * Returns 0 with *size set, or -1 if the file cannot be opened, a full
+ * header cannot be read, or the block device size cannot be queried.
+ */
+int get_filesize(char *filename, uint64_t *size, struct stat *st)
+{
+       int fd;
+       ssize_t n;
+       QCowHeader header;
+
+       /*Set to the backing file size*/
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       n = read(fd, &header, sizeof(header));
+       close(fd);
+       /* compare as signed: a -1 error return must not be promoted to a
+        * huge unsigned value and mistaken for a complete read */
+       if (n != (ssize_t)sizeof(header))
+               return -1;
+
+       be32_to_cpus(&header.magic);
+       be64_to_cpus(&header.size);
+       if (header.magic == QCOW_MAGIC) {
+               *size = header.size >> SECTOR_SHIFT;
+               return 0;
+       }
+
+       if(S_ISBLK(st->st_mode)) {
+               fd = open(filename, O_RDONLY);
+               if (fd < 0)
+                       return -1;
+               if (blk_getimagesize(fd, size) != 0) {
+                       printf("Unable to get Block device size\n");
+                       close(fd);
+                       return -1;
+               }
+               close(fd);
+       } else *size = (st->st_size >> SECTOR_SHIFT);
+       return 0;
+}
+
+static int qcow_set_key(struct tdqcow_state *s, const char *key)
+{
+       uint8_t keybuf[16];
+       int len, i;
+       
+       memset(keybuf, 0, 16);
+       len = strlen(key);
+       if (len > 16)
+               len = 16;
+       /* XXX: we could compress the chars to 7 bits to increase
+          entropy */
+       for (i = 0; i < len; i++) {
+               keybuf[i] = key[i];
+       }
+       s->crypt_method = s->crypt_method_header;
+       
+       if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+               return -1;
+       if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+               return -1;
+#if 0
+       /* test */
+       {
+               uint8_t in[16];
+               uint8_t out[16];
+               uint8_t tmp[16];
+               for (i=0; i<16; i++)
+                       in[i] = i;
+               AES_encrypt(in, tmp, &s->aes_encrypt_key);
+               AES_decrypt(tmp, out, &s->aes_decrypt_key);
+               for (i = 0; i < 16; i++)
+                       DPRINTF(" %02x", tmp[i]);
+               DPRINTF("\n");
+               for (i = 0; i < 16; i++)
+                       DPRINTF(" %02x", out[i]);
+               DPRINTF("\n");
+       }
+#endif
+       return 0;
+}
+
+/*
+ * Completion callback for a queued tiocb: completes the original
+ * request with `err`, then returns the qcow_request to the free list.
+ */
+void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct qcow_request *req = (struct qcow_request *)arg;
+       struct tdqcow_state *state = req->state;
+
+       td_complete_request(req->treq, err);
+
+       /* push the request back on top of the free stack */
+       state->aio_free_list[state->aio_free_count] = req;
+       state->aio_free_count++;
+}
+
+/*
+ * Queue an asynchronous read of treq.secs sectors starting at treq.sec
+ * from the image file into treq.buf.  If no qcow_request is free, the
+ * request is completed immediately with -EBUSY.
+ */
+static void async_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+       struct qcow_request *req;
+       uint64_t offset;
+       int size;
+
+       size   = treq.secs * driver->info.sector_size;
+       offset = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a request off the free stack */
+       state->aio_free_count--;
+       req        = state->aio_free_list[state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_read(&req->tiocb, state->fd, treq.buf,
+                    size, offset, tdqcow_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Queue an asynchronous write of treq.secs sectors starting at treq.sec
+ * from treq.buf to the image file.  If no qcow_request is free, the
+ * request is completed immediately with -EBUSY.
+ */
+static void async_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+       struct qcow_request *req;
+       uint64_t offset;
+       int size;
+
+       size   = treq.secs * driver->info.sector_size;
+       offset = treq.sec  * (uint64_t)driver->info.sector_size;
+
+       if (state->aio_free_count == 0) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* pop a request off the free stack */
+       state->aio_free_count--;
+       req        = state->aio_free_list[state->aio_free_count];
+       req->treq  = treq;
+       req->state = state;
+
+       td_prep_write(&req->tiocb, state->fd, treq.buf,
+                     size, offset, tdqcow_complete, req);
+       td_queue_tiocb(driver, &req->tiocb);
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) `nb_sectors` 512-byte
+ * sectors with AES-CBC, using the little-endian sector number as the
+ * per-sector IV.
+ *
+ * The crypt function is compatible with the linux cryptoloop
+ * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ * supported .
+ */
+static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+       union {
+               uint64_t ll[2];
+               uint8_t b[16];
+       } iv;
+       int done = 0;
+
+       while (done < nb_sectors) {
+               /* fresh IV for every sector: low word = sector number */
+               iv.ll[0] = cpu_to_le64(sector_num);
+               iv.ll[1] = 0;
+               AES_cbc_encrypt(in_buf, out_buf, 512, key,
+                               iv.b, enc);
+               in_buf  += 512;
+               out_buf += 512;
+               sector_num++;
+               done++;
+       }
+}
+
+/*
+ * Resize the file behind `fd` to `length` rounded up to a whole number
+ * of sectors.
+ *
+ * Growing is done by synchronously appending zeros (rather than a
+ * sparse ftruncate) so that the allocated extents tend to be contiguous
+ * on disk.  Shrinking is only performed when `sparse` is non-zero.
+ * Block devices are left untouched.
+ *
+ * Returns 0 on success (or when nothing needs doing), -1 on error.
+ */
+int qtruncate(int fd, off_t length, int sparse)
+{
+       int ret, rem, pad;
+       uint64_t i, sectors, current, target;
+       struct stat st;
+       char *buf;
+
+       ret = fstat(fd, &st);
+       if (ret == -1)
+               return -1;
+       if (S_ISBLK(st.st_mode))
+               return 0;
+
+       /* 64-bit sector counts: an int would overflow on very large files */
+       sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+       current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
+       rem     = st.st_size % DEFAULT_SECTOR_SIZE;
+       target  = sectors * DEFAULT_SECTOR_SIZE;
+
+       /* If we are extending this file, we write zeros to the end --
+        * this tries to ensure that the extents allocated wind up being
+        * contiguous on disk.
+        */
+       if ((uint64_t)st.st_size < target) {
+               /*We are extending the file*/
+               if ((ret = posix_memalign((void **)&buf,
+                                         512, DEFAULT_SECTOR_SIZE))) {
+                       DPRINTF("posix_memalign failed: %d\n", ret);
+                       return -1;
+               }
+               memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
+               if (lseek(fd, 0, SEEK_END)==-1) {
+                       DPRINTF("Lseek EOF failed (%d), internal error\n",
+                               errno);
+                       free(buf);
+                       return -1;
+               }
+               if (rem) {
+                       /* complete the trailing partial sector: `current`
+                        * is rounded up, so the file must reach that
+                        * boundary before whole sectors are appended */
+                       pad = DEFAULT_SECTOR_SIZE - rem;
+                       ret = write(fd, buf, pad);
+                       if (ret != pad) {
+                               DPRINTF("write failed: ret = %d, err = %s\n",
+                                       ret, strerror(errno));
+                               free(buf);
+                               return -1;
+                       }
+               }
+               for (i = current; i < sectors; i++ ) {
+                       ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
+                       if (ret != DEFAULT_SECTOR_SIZE) {
+                               DPRINTF("write failed: ret = %d, err = %s\n",
+                                       ret, strerror(errno));
+                               free(buf);
+                               return -1;
+                       }
+               }
+               free(buf);
+       } else if (sparse && ((uint64_t)st.st_size > target)) {
+               if (ftruncate(fd, (off_t)target)==-1) {
+                       DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Translate the guest byte `offset` into the offset of its cluster in
+ * the image file, optionally allocating the cluster (and the L2 table
+ * covering it) when it does not exist yet.
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * return 0 if not allocated, or if any allocation or I/O step failed.
+ *
+ * All metadata updates are done with synchronous lseek()/write() on
+ * s->fd.  L2 tables are cached in s->l2_cache; the least-used slot is
+ * replaced on a miss.
+ */
+static uint64_t get_cluster_offset(struct tdqcow_state *s,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+       int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
+       char *tmp_ptr2, *l2_ptr, *l1_ptr;
+       uint64_t *tmp_ptr;
+       uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+       uint32_t min_count;
+       int new_l2_table;
+
+       /*Check L1 table for the extent offset*/
+       l1_index = offset >> (s->l2_bits + s->cluster_bits);
+       l2_offset = s->l1_table[l1_index];
+       new_l2_table = 0;
+       if (!l2_offset) {
+               if (!allocate)
+                       return 0;
+               /*
+                * allocating a new l2 entry + extent
+                * at the end of the file, we must also
+                * update the L1 entry safely.
+                */
+               l2_offset = s->fd_end;
+
+               /* round to cluster size */
+               l2_offset = (l2_offset + s->cluster_size - 1)
+                       & ~(s->cluster_size - 1);
+
+               /* update the L1 entry */
+               s->l1_table[l1_index] = l2_offset;
+
+               /*Truncate file for L2 table
+                *(initialised to zero in case we crash)*/
+               if (qtruncate(s->fd,
+                             l2_offset + (s->l2_size * sizeof(uint64_t)),
+                             s->sparse) != 0) {
+                       DPRINTF("ERROR truncating file\n");
+                       return 0;
+               }
+               s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
+
+               /*Update the L1 table entry on disk
+                 * (for O_DIRECT we write 4KByte blocks)*/
+               l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
+               l1_ptr = (char *)s->l1_table + (l1_sector << 12);
+
+               if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L1 table\n");
+                       /* tmp_ptr is undefined on failure: bail out
+                        * instead of copying through it */
+                       return 0;
+               }
+               memcpy(tmp_ptr, l1_ptr, 4096);
+
+               /* Convert block to write to big endian */
+               for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
+                       cpu_to_be64s(&tmp_ptr[i]);
+               }
+
+               /*
+                * Issue non-asynchronous L1 write.
+                * For safety, we must ensure that
+                * entry is written before blocks.
+                */
+               lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
+               if (write(s->fd, tmp_ptr, 4096) != 4096) {
+                       free(tmp_ptr);
+                       return 0;
+               }
+               free(tmp_ptr);
+
+               new_l2_table = 1;
+               goto cache_miss;
+       } else if (s->min_cluster_alloc == s->l2_size) {
+               /*Fast-track the request*/
+               cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
+               l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+               return cluster_offset + (l2_index * s->cluster_size);
+       }
+
+       /*Check to see if L2 entry is already cached*/
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (l2_offset == s->l2_cache_offsets[i]) {
+                       /* increment the hit count; halve every counter
+                        * when one saturates so relative order is kept */
+                       if (++s->l2_cache_counts[i] == 0xffffffff) {
+                               for (j = 0; j < L2_CACHE_SIZE; j++) {
+                                       s->l2_cache_counts[j] >>= 1;
+                               }
+                       }
+                       l2_table = s->l2_cache + (i << s->l2_bits);
+                       goto found;
+               }
+       }
+
+cache_miss:
+       /* not found: load a new entry in the least used one */
+       min_index = 0;
+       min_count = 0xffffffff;
+       for (i = 0; i < L2_CACHE_SIZE; i++) {
+               if (s->l2_cache_counts[i] < min_count) {
+                       min_count = s->l2_cache_counts[i];
+                       min_index = i;
+               }
+       }
+       l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+       /*If extent pre-allocated, read table from disk,
+        *otherwise write new table to disk*/
+       if (new_l2_table) {
+               /*Should we allocate the whole extent? Adjustable parameter.*/
+               if (s->cluster_alloc == s->l2_size) {
+                       cluster_offset = l2_offset +
+                               (s->l2_size * sizeof(uint64_t));
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       if (qtruncate(s->fd, cluster_offset +
+                                 (s->cluster_size * s->l2_size),
+                                     s->sparse) != 0) {
+                               DPRINTF("ERROR truncating file\n");
+                               return 0;
+                       }
+                       s->fd_end = cluster_offset +
+                               (s->cluster_size * s->l2_size);
+                       for (i = 0; i < s->l2_size; i++) {
+                               l2_table[i] = cpu_to_be64(cluster_offset +
+                                                         (i*s->cluster_size));
+                       }
+               } else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+
+               lseek(s->fd, l2_offset, SEEK_SET);
+               if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+                  s->l2_size * sizeof(uint64_t))
+                       return 0;
+       } else {
+               lseek(s->fd, l2_offset, SEEK_SET);
+               if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
+                   s->l2_size * sizeof(uint64_t))
+                       return 0;
+       }
+
+       /*Update the cache entries*/
+       s->l2_cache_offsets[min_index] = l2_offset;
+       s->l2_cache_counts[min_index] = 1;
+
+found:
+       /*The extent is split into 's->l2_size' blocks of
+        *size 's->cluster_size'*/
+       l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+       cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+       if (!cluster_offset ||
+           ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
+               if (!allocate)
+                       return 0;
+
+               if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+                   (n_end - n_start) < s->cluster_sectors) {
+                       /* cluster is already allocated but compressed, we must
+                          decompress it in the case it is not completely
+                          overwritten */
+                       if (decompress_cluster(s, cluster_offset) < 0)
+                               return 0;
+                       cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+                       cluster_offset = (cluster_offset + s->cluster_size - 1)
+                               & ~(s->cluster_size - 1);
+                       /* write the cluster content - not asynchronous */
+                       lseek(s->fd, cluster_offset, SEEK_SET);
+                       /* failure is reported as 0: the return type is
+                        * unsigned, so -1 would look like a valid offset */
+                       if (write(s->fd, s->cluster_cache, s->cluster_size) !=
+                           s->cluster_size)
+                           return 0;
+               } else {
+                       /* allocate a new cluster */
+                       cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
+                       if (allocate == 1) {
+                               /* round to cluster size */
+                               cluster_offset =
+                                       (cluster_offset + s->cluster_size - 1)
+                                       & ~(s->cluster_size - 1);
+                               if (qtruncate(s->fd, cluster_offset +
+                                             s->cluster_size, s->sparse)!=0) {
+                                       DPRINTF("ERROR truncating file\n");
+                                       return 0;
+                               }
+                               s->fd_end = (cluster_offset + s->cluster_size);
+                               /* if encrypted, we must initialize the cluster
+                                  content which won't be written */
+                               if (s->crypt_method &&
+                                   (n_end - n_start) < s->cluster_sectors) {
+                                       uint64_t start_sect;
+                                       start_sect = (offset &
+                                                     ~(s->cluster_size - 1))
+                                                             >> 9;
+                                       memset(s->cluster_data + 512,
+                                              0xaa, 512);
+                                       for (i = 0; i < s->cluster_sectors;i++)
+                                       {
+                                               if (i < n_start || i >= n_end)
+                                               {
+                                                       encrypt_sectors(s, start_sect + i,
+                                                                       s->cluster_data,
+                                                                       s->cluster_data + 512, 1, 1,
+                                                                       &s->aes_encrypt_key);
+                                                       lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
+                                                       if (write(s->fd, s->cluster_data, 512) != 512)
+                                                               return 0;
+                                               }
+                                       }
+                               }
+                       } else {
+                               cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                                       (uint64_t)compressed_size
+                                               << (63 - s->cluster_bits);
+                       }
+               }
+               /* update L2 table */
+               tmp = cpu_to_be64(cluster_offset);
+               l2_table[l2_index] = tmp;
+
+               /*For IO_DIRECT we write 4KByte blocks*/
+               l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
+               l2_ptr = (char *)l2_table + (l2_sector << 12);
+
+               if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
+                       DPRINTF("ERROR allocating memory for L2 table\n");
+                       /* tmp_ptr2 is undefined on failure: bail out
+                        * instead of copying through it */
+                       return 0;
+               }
+               memcpy(tmp_ptr2, l2_ptr, 4096);
+               lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
+               if (write(s->fd, tmp_ptr2, 4096) != 4096) {
+                       free(tmp_ptr2);
+                       return 0;
+               }
+               free(tmp_ptr2);
+       }
+       return cluster_offset;
+}
+
+/*
+ * Report whether the cluster containing `sector_num` is allocated.
+ * *pnum receives the number of sectors, starting at `sector_num` and
+ * capped at `nb_sectors`, that lie in the same cluster.
+ * Returns non-zero if allocated, 0 otherwise.
+ */
+static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+       uint64_t cluster_offset;
+       int index_in_cluster, left_in_cluster;
+
+       /* lookup only: allocate == 0 never modifies the image */
+       cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
+
+       index_in_cluster = sector_num & (s->cluster_sectors - 1);
+       left_in_cluster  = s->cluster_sectors - index_in_cluster;
+
+       *pnum = (left_in_cluster > nb_sectors) ? nb_sectors : left_in_cluster;
+       return (cluster_offset != 0);
+}
+
+/*
+ * Inflate buf_size bytes from buf into out_buf.  Succeeds (returns 0)
+ * only when inflate() ends with Z_STREAM_END or Z_BUF_ERROR and exactly
+ * out_buf_size bytes were produced; returns -1 otherwise.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+       z_stream zs;
+       int status, produced;
+
+       memset(&zs, 0, sizeof(zs));
+       zs.next_in   = (uint8_t *)buf;
+       zs.avail_in  = buf_size;
+       zs.next_out  = out_buf;
+       zs.avail_out = out_buf_size;
+
+       /* negative window bits: the stream carries no zlib header */
+       if (inflateInit2(&zs, -12) != Z_OK)
+               return -1;
+
+       status   = inflate(&zs, Z_FINISH);
+       produced = zs.next_out - out_buf;
+       inflateEnd(&zs);
+
+       if (status != Z_STREAM_END && status != Z_BUF_ERROR)
+               return -1;
+       if (produced != out_buf_size)
+               return -1;
+
+       return 0;
+}
+                              
+/*
+ * Make s->cluster_cache hold the decompressed contents of the
+ * compressed cluster described by cluster_offset.  The low bits
+ * (s->cluster_offset_mask) give the file position, the bits above
+ * them the compressed length.  Nothing is read when the cache already
+ * holds that position.  Returns 0 on success, -1 on a short read or a
+ * decompression failure.
+ */
+static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
+{
+       uint64_t file_off = cluster_offset & s->cluster_offset_mask;
+       int nbytes, got;
+
+       /* cache hit: nothing to do */
+       if (s->cluster_cache_offset == file_off)
+               return 0;
+
+       nbytes  = cluster_offset >> (63 - s->cluster_bits);
+       nbytes &= (s->cluster_size - 1);
+
+       lseek(s->fd, file_off, SEEK_SET);
+       got = read(s->fd, s->cluster_data, nbytes);
+       if (got != nbytes)
+               return -1;
+
+       if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                             s->cluster_data, nbytes) < 0)
+               return -1;
+
+       s->cluster_cache_offset = file_off;
+       return 0;
+}
+
+/*
+ * Read the QCowHeader from the start of fd into *header, converting
+ * the magic, version, backing_file_offset, backing_file_size, mtime,
+ * size, crypt_method and l1_table_offset fields from big endian to
+ * host order.
+ *
+ * The read goes through a 512-byte aligned bounce buffer whose size is
+ * sizeof(*header) rounded up to a multiple of 512.  A file shorter
+ * than that is accepted as long as the whole file was read; bytes that
+ * were not read stay zero.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+tdqcow_read_header(int fd, QCowHeader *header)
+{
+       int err;
+       char *buf;
+       struct stat st;
+       ssize_t got;
+       size_t size, expected;
+
+       memset(header, 0, sizeof(*header));
+
+       if (fstat(fd, &st))
+               return -errno;
+
+       if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+               return -errno;
+
+       size = (sizeof(*header) + 511) & ~511;
+       err = posix_memalign((void **)&buf, 512, size);
+       if (err)
+               return -err; /* posix_memalign returns a positive error number */
+
+       /* never let uninitialised heap leak into the header on a short file */
+       memset(buf, 0, size);
+
+       expected = size;
+       if (st.st_size < size)
+               expected = st.st_size;
+
+       errno = 0;
+       got = read(fd, buf, size);
+       if (got < 0 || (size_t)got != expected) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       memcpy(header, buf, sizeof(*header));
+       be32_to_cpus(&header->magic);
+       be32_to_cpus(&header->version);
+       be64_to_cpus(&header->backing_file_offset);
+       be32_to_cpus(&header->backing_file_size);
+       be32_to_cpus(&header->mtime);
+       be64_to_cpus(&header->size);
+       be32_to_cpus(&header->crypt_method);
+       be64_to_cpus(&header->l1_table_offset);
+
+       err = 0;
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Size, allocate and load the L1 table described by header into
+ * s->l1_table, leaving it in native endianness for operation.
+ *
+ * The file is read from offset 0 up to the 4K-rounded end of the L1
+ * table in one go, so the Xen extended header that follows QCowHeader
+ * can be inspected from the same buffer.  When that extended header is
+ * present and lacks EXTHDR_L1_BIG_ENDIAN, the table is converted to big
+ * endian and the whole block is written back.  A matching L1 checksum
+ * enables s->extended, s->sparse and s->min_cluster_alloc.
+ *
+ * Returns 0 on success or a negative errno value; on failure
+ * s->l1_table is freed and reset to NULL.
+ */
+static int
+tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
+{
+       char *buf;
+       struct stat st;
+       size_t expected, copy_len;
+       int i, err, shift;
+       QCowHeader_ext *exthdr;
+       uint32_t l1_table_bytes, l1_table_block, l1_table_size;
+
+       buf         = NULL;
+       s->l1_table = NULL;
+
+       shift = s->cluster_bits + s->l2_bits;
+
+       s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
+       s->l1_table_offset = header->l1_table_offset;
+
+       s->min_cluster_alloc = 1; /* default */
+
+       l1_table_bytes = s->l1_size * sizeof(uint64_t);
+       l1_table_size  = (l1_table_bytes + 4095) & ~4095;
+       l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;
+
+       DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
+               (uint64_t)s->l1_table_offset,
+               (int) (s->l1_size * sizeof(uint64_t)), 
+               l1_table_size);
+
+       err = fstat(s->fd, &st);
+       if (err) {
+               err = -errno;
+               goto out;
+       }
+
+       err = lseek(s->fd, 0, SEEK_SET);
+       if (err == (off_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* posix_memalign reports failure as a positive error number */
+       err = posix_memalign((void **)&buf, 512, l1_table_block);
+       if (err) {
+               buf = NULL;
+               err = -err;
+               goto out;
+       }
+
+       err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
+       if (err) {
+               s->l1_table = NULL;
+               err = -err;
+               goto out;
+       }
+
+       memset(buf, 0, l1_table_block);
+       memset(s->l1_table, 0, l1_table_size);
+
+       expected = l1_table_block;
+       if (st.st_size < l1_table_block)
+               expected = st.st_size;
+
+       errno = 0;
+       err = read(s->fd, buf, l1_table_block);
+       if (err != expected) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       /*
+        * Copy the 4K-rounded table, but never past the end of buf: when
+        * l1_table_offset is not 4K aligned, offset + l1_table_size
+        * exceeds l1_table_block.
+        */
+       copy_len = l1_table_size;
+       if (s->l1_table_offset + copy_len > l1_table_block)
+               copy_len = l1_table_block - s->l1_table_offset;
+       memcpy(s->l1_table, buf + s->l1_table_offset, copy_len);
+
+       exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
+
+       /* check for xen extended header */
+       if (s->l1_table_offset % 4096 == 0 &&
+           be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
+               uint32_t flags = be32_to_cpu(exthdr->flags);
+               uint32_t cksum = be32_to_cpu(exthdr->cksum);
+
+               /*
+                * Try to detect old tapdisk images. They have to be fixed
+                * because they use big endian rather than native endian for
+                * the L1 table.  After this block, the l1 table will
+                * definitely be in BIG endian.
+                */
+               if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
+                       DPRINTF("qcow: converting to big endian L1 table\n");
+
+                       /* convert to big endian */
+                       for (i = 0; i < s->l1_size; i++)
+                               cpu_to_be64s(&s->l1_table[i]);
+
+                       flags |= EXTHDR_L1_BIG_ENDIAN;
+                       exthdr->flags = cpu_to_be32(flags);
+
+                       /* offset is 4K aligned here, so copy_len fits in buf */
+                       memcpy(buf + s->l1_table_offset,
+                              s->l1_table, copy_len);
+
+                       err = lseek(s->fd, 0, SEEK_SET);
+                       if (err == (off_t)-1) {
+                               err = -errno;
+                               goto out;
+                       }
+
+                       err = atomicio(vwrite, s->fd, buf, l1_table_block);
+                       if (err != l1_table_block) {
+                               err = -errno;
+                               goto out;
+                       }
+               }
+
+               /* check the L1 table checksum */
+               if (cksum != gen_cksum((char *)s->l1_table,
+                                      s->l1_size * sizeof(uint64_t)))
+                       DPRINTF("qcow: bad L1 checksum\n");
+               else {
+                       s->extended = 1;
+                       s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
+                       s->min_cluster_alloc =
+                               be32_to_cpu(exthdr->min_cluster_alloc);
+               }
+       }
+
+       /* convert L1 table to native endian for operation */
+       for (i = 0; i < s->l1_size; i++)
+               be64_to_cpus(&s->l1_table[i]);
+
+       err = 0;
+
+out:
+       /* the bounce buffer is only needed inside this function */
+       free(buf);
+       if (err) {
+               free(s->l1_table);
+               s->l1_table = NULL;
+       }
+       return err;
+}
+
+/*
+ * Open the disk file and initialize qcow state.
+ *
+ * The image is opened with O_DIRECT (read-only when flags equals
+ * TD_OPEN_RDONLY), its header is validated (magic, version, size,
+ * cluster_bits, crypt_method), the L1 table is loaded and the L2 and
+ * cluster buffers are allocated with 4K alignment.  s->fd_end is set to
+ * the end of the L1 table when no L1 entry is populated, otherwise to
+ * the current end of file.
+ *
+ * Returns 0 on success and -1 on failure.  On failure everything
+ * acquired here (buffers, name copy, AIO state, file descriptor) is
+ * released.
+ */
+int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int fd, i, ret, size, o_flags;
+       td_disk_info_t *bs = &(driver->info);
+       struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
+       QCowHeader header;
+       uint64_t final_cluster = 0;
+
+       DPRINTF("QCOW: Opening %s\n", name);
+
+       o_flags = O_DIRECT | O_LARGEFILE | 
+               ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+       if (fd < 0) {
+               DPRINTF("Unable to open %s (%d)\n", name, -errno);
+               return -1;
+       }
+
+       s->fd = fd;
+
+       /* the fail path frees these, so make sure they start out NULL */
+       s->l1_table      = NULL;
+       s->l2_cache      = NULL;
+       s->cluster_cache = NULL;
+       s->cluster_data  = NULL;
+
+       s->name = strdup(name);
+       if (!s->name)
+               goto fail;
+
+       if (tdqcow_read_header(fd, &header))
+               goto fail;
+
+       if (header.magic != QCOW_MAGIC)
+               goto fail;
+
+       switch (header.version) {
+       case QCOW_VERSION:
+               break;
+       case 2:
+               /* TODO: Port qcow2 to new blktap framework. */
+               goto fail;
+       default:
+               goto fail;
+       }
+
+       if (header.size <= 1 || header.cluster_bits < 9)
+               goto fail;
+       if (header.crypt_method > QCOW_CRYPT_AES)
+               goto fail;
+       s->crypt_method_header = header.crypt_method;
+       if (s->crypt_method_header)
+               s->encrypted = 1;
+       s->cluster_bits = header.cluster_bits;
+       s->cluster_size = 1 << s->cluster_bits;
+       s->cluster_sectors = 1 << (s->cluster_bits - 9);
+       s->l2_bits = header.l2_bits;
+       s->l2_size = 1 << s->l2_bits;
+       s->cluster_alloc = s->l2_size;
+       bs->size = header.size / 512;
+       s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+       s->backing_file_offset = header.backing_file_offset;
+       s->backing_file_size   = header.backing_file_size;
+
+       /* allocate and load l1 table */
+       if (tdqcow_load_l1_table(s, &header))
+               goto fail;
+
+       /* alloc L2 cache */
+       size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+       ret = posix_memalign((void **)&s->l2_cache, 4096, size);
+       if (ret != 0)
+               goto fail;
+
+       size = s->cluster_size;
+       ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
+       if (ret != 0)
+               goto fail;
+
+       ret = posix_memalign((void **)&s->cluster_data, 4096, size);
+       if (ret != 0)
+               goto fail;
+       s->cluster_cache_offset = -1;
+
+       if (s->backing_file_offset != 0)
+               s->cluster_alloc = 1; /*Cannot use pre-alloc*/
+
+       bs->sector_size = 512;
+       bs->info = 0;
+
+       /* largest L1 entry; zero means no L2 table has been allocated */
+       for (i = 0; i < s->l1_size; i++)
+               if (s->l1_table[i] > final_cluster)
+                       final_cluster = s->l1_table[i];
+
+       if (init_aio_state(driver)!=0) {
+               DPRINTF("Unable to initialise AIO state\n");
+               /* the fail path releases the AIO state, exactly once */
+               goto fail;
+       }
+
+       if (!final_cluster)
+               s->fd_end = s->l1_table_offset +
+                       ((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
+       else {
+               s->fd_end = lseek64(fd, 0, SEEK_END);
+               if (s->fd_end == (off64_t)-1)
+                       goto fail;
+       }
+
+       return 0;
+
+fail:
+       DPRINTF("QCOW Open failed\n");
+
+       free_aio_state(s);
+       free(s->name);
+       s->name = NULL;
+       free(s->l1_table);
+       s->l1_table = NULL;
+       free(s->l2_cache);
+       s->l2_cache = NULL;
+       free(s->cluster_cache);
+       s->cluster_cache = NULL;
+       free(s->cluster_data);
+       s->cluster_data = NULL;
+       close(fd);
+       return -1;
+}
+
+/*
+ * Queue a read request, one cluster-bounded piece at a time.
+ *
+ * For every piece the cluster offset is looked up without allocating:
+ *   - offset 0: the piece is handed to td_forward_request();
+ *   - QCOW_OFLAG_COMPRESSED set: the cluster is decompressed into the
+ *     cluster cache, copied out and completed immediately;
+ *   - otherwise an asynchronous read of the image file is issued.
+ * The request is completed with -EBUSY when no AIO slot is free and
+ * with -EIO when decompression fails; both stop processing.
+ */
+void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+       td_request_t aio_req = treq;
+       uint64_t offset, cur, left;
+       char *dst = treq.buf;
+       int first, count;
+
+       cur  = treq.sec;
+       left = treq.secs;
+
+       for (; left > 0; left -= count, cur += count, dst += count * 512) {
+               offset = get_cluster_offset(s, cur << 9, 0, 0, 0, 0);
+               first  = cur & (s->cluster_sectors - 1);
+               count  = s->cluster_sectors - first;
+               if (count > left)
+                       count = left;
+
+               if (s->aio_free_count == 0) {
+                       td_complete_request(treq, -EBUSY);
+                       return;
+               }
+
+               if (!offset) {
+                       /* no cluster offset for this piece: forward it */
+                       treq.buf  = dst;
+                       treq.sec  = cur;
+                       treq.secs = count;
+                       td_forward_request(treq);
+                       continue;
+               }
+
+               if (!(offset & QCOW_OFLAG_COMPRESSED)) {
+                       /* plain cluster: read it from the image file */
+                       aio_req.buf  = dst;
+                       aio_req.sec  = (offset >> 9) + first;
+                       aio_req.secs = count;
+                       async_read(driver, aio_req);
+                       continue;
+               }
+
+               /* compressed cluster: serve it from the cluster cache */
+               if (decompress_cluster(s, offset) < 0) {
+                       td_complete_request(treq, -EIO);
+                       return;
+               }
+               memcpy(dst, s->cluster_cache + first * 512, 512 * count);
+
+               treq.buf  = dst;
+               treq.sec  = cur;
+               treq.secs = count;
+               td_complete_request(treq, 0);
+       }
+}
+
+/*
+ * Queue a write request, one cluster-bounded piece at a time.
+ *
+ * Each piece gets a cluster offset from get_cluster_offset() (called
+ * with allocate == 1) and is then submitted as an asynchronous write.
+ * The request is completed with -EBUSY when no AIO slot is free and
+ * with -EIO when no cluster offset is returned; both stop processing
+ * before the compressed-cluster cache is invalidated.
+ */
+void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
+       td_request_t aio_req = treq;
+       uint64_t offset, cur, left;
+       char *src = treq.buf;
+       int first, count;
+
+       cur  = treq.sec;
+       left = treq.secs;
+
+       for (; left > 0; left -= count, cur += count, src += count * 512) {
+               first = cur & (s->cluster_sectors - 1);
+               count = s->cluster_sectors - first;
+               if (count > left)
+                       count = left;
+
+               if (s->aio_free_count == 0) {
+                       td_complete_request(treq, -EBUSY);
+                       return;
+               }
+
+               offset = get_cluster_offset(s, cur << 9, 1, 0,
+                                           first, first + count);
+               if (!offset) {
+                       DPRINTF("Ooops, no write cluster offset!\n");
+                       td_complete_request(treq, -EIO);
+                       return;
+               }
+
+               /*
+                * NOTE(review): encrypt_sectors() is handed both
+                * s->cluster_data and src, yet the write below is always
+                * issued from src -- confirm which buffer is meant to
+                * reach the disk.
+                */
+               if (s->crypt_method)
+                       encrypt_sectors(s, cur, s->cluster_data,
+                                       (unsigned char *)src, count, 1,
+                                       &s->aes_encrypt_key);
+
+               aio_req.buf  = src;
+               aio_req.sec  = (offset >> 9) + first;
+               aio_req.secs = count;
+               async_write(driver, aio_req);
+       }
+
+       s->cluster_cache_offset = -1; /* disable compressed cache */
+}
+
+/*
+ * Recompute the checksum of the big-endian L1 table and store it in
+ * the cksum field of the extended header on disk.
+ *
+ * Does nothing when s->extended is not set.  The image is reopened by
+ * name without O_DIRECT so that the 4-byte field can be written at its
+ * unaligned offset.  The in-memory L1 table is converted to big endian
+ * for the checksum and back to native order afterwards.
+ *
+ * Returns 0 on success, otherwise a positive errno value (EIO for a
+ * short write that did not set errno).
+ */
+static int
+tdqcow_update_checksum(struct tdqcow_state *s)
+{
+       int i, fd, err;
+       uint32_t offset, cksum, out;
+       ssize_t written;
+
+       if (!s->extended)
+               return 0;
+
+       fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
+       if (fd == -1) {
+               err = errno;
+               goto out;
+       }
+
+       offset = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
+       if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
+               err = errno;
+               goto out;
+       }
+
+       /* convert to big endian for checksum */
+       for (i = 0; i < s->l1_size; i++)
+               cpu_to_be64s(&s->l1_table[i]);
+
+       cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
+
+       /* and back again... */
+       for (i = 0; i < s->l1_size; i++)
+               be64_to_cpus(&s->l1_table[i]);
+
+       DPRINTF("Writing cksum: %d", cksum);
+
+       out = cpu_to_be32(cksum);
+
+       /* a short write leaves errno untouched; report it as EIO */
+       errno = 0;
+       written = write(fd, &out, sizeof(out));
+       if (written != (ssize_t)sizeof(out)) {
+               err = (errno ? errno : EIO);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       if (err)
+               DPRINTF("failed to update checksum: %d\n", err);
+       if (fd != -1)
+               close(fd);
+       return err;
+}
+               
+/*
+ * Tear down a qcow driver instance: refresh the on-disk L1 checksum,
+ * release the AIO state and every buffer owned by the state, then
+ * close the image file.  Always returns 0.
+ */
+int tdqcow_close(td_driver_t *driver)
+{
+       struct tdqcow_state *state = (struct tdqcow_state *)driver->data;
+
+       /* must run first: it still needs the name and the L1 table */
+       tdqcow_update_checksum(state);
+
+       free_aio_state(state);
+
+       free(state->cluster_data);
+       free(state->cluster_cache);
+       free(state->l2_cache);
+       free(state->l1_table);
+       free(state->name);
+
+       close(state->fd);
+       return 0;
+}
+
+/*
+ * Create a new qcow image file with a Xen extended header.
+ *
+ * filename      path of the image to create (truncated if it exists)
+ * total_size    requested size; shifted right by SECTOR_SHIFT to get
+ *               the size used for the image, and replaced by the size
+ *               of the backing file when one is given
+ * backing_file  optional backing file; "fat:" is treated as no backing
+ *               file, names that look like URLs are stored verbatim,
+ *               anything else is resolved with realpath()
+ * sparse        when zero the file is extended to its full length up
+ *               front, otherwise SPARSE_FILE is recorded in the header
+ *
+ * Returns 0 on success and -1 on failure.  The file descriptor is
+ * closed on every path.
+ */
+int qcow_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int sparse)
+{
+       int fd, header_size, backing_filename_len, l1_size, i;
+       int shift, length, flags = 0;
+       QCowHeader header;
+       QCowHeader_ext exthdr;
+       char backing_filename[PATH_MAX], *ptr;
+       uint64_t tmp, size, total_length;
+       struct stat st;
+
+       DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
+
+       fd = open(filename, 
+                 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+                 0644);
+       if (fd < 0)
+               return -1;
+
+       /*
+        * Both headers are written to disk verbatim and st.st_mtime is
+        * read on paths that never call stat(), so start from zeroes.
+        */
+       memset(&header, 0, sizeof(header));
+       memset(&exthdr, 0, sizeof(exthdr));
+       memset(&st, 0, sizeof(st));
+
+       header.magic = cpu_to_be32(QCOW_MAGIC);
+       header.version = cpu_to_be32(QCOW_VERSION);
+
+       /*Create extended header fields*/
+       exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
+
+       header_size = sizeof(header) + sizeof(QCowHeader_ext);
+       backing_filename_len = 0;
+       size = (total_size >> SECTOR_SHIFT);
+       if (backing_file) {
+               if (strcmp(backing_file, "fat:")) {
+                       const char *p;
+                       /* XXX: this is a hack: we do not attempt to 
+                        *check for URL like syntax */
+                       p = strchr(backing_file, ':');
+                       if (p && (p - backing_file) >= 2) {
+                               /* URL like but exclude "c:" like filenames */
+                               strncpy(backing_filename, backing_file,
+                                       sizeof(backing_filename) - 1);
+                               /* strncpy does not terminate on truncation */
+                               backing_filename[sizeof(backing_filename) - 1] = '\0';
+                       } else {
+                               if (realpath(backing_file, backing_filename) == NULL ||
+                                   stat(backing_filename, &st) != 0)
+                                       goto fail;
+                       }
+                       header.backing_file_offset = cpu_to_be64(header_size);
+                       backing_filename_len = strlen(backing_filename);
+                       header.backing_file_size = cpu_to_be32(
+                               backing_filename_len);
+                       header_size += backing_filename_len;
+                       
+                       /*Set to the backing file size*/
+                       if (get_filesize(backing_filename, &size, &st))
+                               goto fail;
+                       DPRINTF("Backing file size detected: %"PRId64" sectors" 
+                               "(total %"PRId64" [%"PRId64" MB])\n", 
+                               size, 
+                               (uint64_t)(size << SECTOR_SHIFT), 
+                               (uint64_t)(size >> 11));
+               } else {
+                       backing_file = NULL;
+                       DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n", 
+                               total_size, 
+                               (uint64_t) (total_size << SECTOR_SHIFT));
+               }
+               header.mtime = cpu_to_be32(st.st_mtime);
+               header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                           unmodified sectors */
+               header.l2_bits = 12; /* 32 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1);
+       } else {
+               DPRINTF("Setting file size: %"PRId64" sectors" 
+                       "(total %"PRId64" [%"PRId64" MB])\n", 
+                       size, 
+                       (uint64_t) (size << SECTOR_SHIFT), 
+                       (uint64_t) (size >> 11));
+               header.cluster_bits = 12; /* 4 KB clusters */
+               header.l2_bits = 9; /* 4 KB L2 tables */
+               exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
+       }
+       /*Set the header size value*/
+       header.size = cpu_to_be64(size * 512);
+       
+       /* round the header up to 8 bytes, then to a whole 4K block */
+       header_size = (header_size + 7) & ~7;
+       if (header_size % 4096 > 0) {
+               header_size = ((header_size >> 12) + 1) << 12;
+       }
+
+       shift = header.cluster_bits + header.l2_bits;
+       l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
+
+       header.l1_table_offset = cpu_to_be64(header_size);
+       DPRINTF("L1 Table offset: %d, size %d\n",
+               header_size,
+               (int)(l1_size * sizeof(uint64_t)));
+       header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+
+       /* checksum of an all-zero L1 table */
+       ptr = calloc(1, l1_size * sizeof(uint64_t));
+       if (!ptr && l1_size > 0)
+               goto fail;
+       exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
+       printf("Created cksum: %d\n",exthdr.cksum);
+       free(ptr);
+
+       /*adjust file length to system page size boundary*/
+       length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
+               getpagesize());
+       if (qtruncate(fd, length, 0)!=0) {
+               DPRINTF("ERROR truncating file\n");
+               goto fail;
+       }
+
+       if (sparse == 0) {
+               /*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
+               total_length = length + (l1_size * (1 << 9)) + (size * 512);
+               if (qtruncate(fd, total_length, 0)!=0) {
+                       DPRINTF("ERROR truncating file\n");
+                       goto fail;
+               }
+               printf("File truncated to length %"PRIu64"\n",total_length);
+       } else
+               flags = SPARSE_FILE;
+
+       flags |= EXTHDR_L1_BIG_ENDIAN;
+       exthdr.flags = cpu_to_be32(flags);
+       
+       /* write all the data; any failed or short write fails the create */
+       if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
+               goto fail;
+       if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+               goto fail;
+       if (write(fd, &exthdr, sizeof(exthdr)) != (ssize_t)sizeof(exthdr))
+               goto fail;
+       if (backing_file &&
+           write(fd, backing_filename, backing_filename_len) !=
+           backing_filename_len)
+               goto fail;
+
+       if (lseek(fd, header_size, SEEK_SET) == (off_t)-1)
+               goto fail;
+       tmp = 0;
+       for (i = 0;i < l1_size; i++) {
+               if (write(fd, &tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp))
+                       goto fail;
+       }
+
+       close(fd);
+
+       return 0;
+
+fail:
+       close(fd);
+       return -1;
+}
+
+/*
+ * Reset the image to an empty state: zero the L1 table in memory and
+ * on disk, truncate the file right after the L1 table, and clear the
+ * L2 cache together with its offset and count arrays.
+ * Returns 0 on success, -1 when the write or the truncate fails.
+ */
+static int qcow_make_empty(struct tdqcow_state *s)
+{
+       uint32_t l1_bytes = s->l1_size * sizeof(uint64_t);
+       ssize_t rc;
+
+       /* wipe the L1 table and push it out to the file */
+       memset(s->l1_table, 0, l1_bytes);
+       lseek(s->fd, s->l1_table_offset, SEEK_SET);
+       rc = write(s->fd, s->l1_table, l1_bytes);
+       if (rc < 0)
+               return -1;
+
+       /* drop everything stored behind the L1 table */
+       if (qtruncate(s->fd, s->l1_table_offset + l1_bytes, s->sparse) != 0) {
+               DPRINTF("ERROR truncating file\n");
+               return -1;
+       }
+
+       /* no cached L2 table is valid any more */
+       memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+       memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+       memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+
+       return 0;
+}
+
+/* Accessor for the cluster size recorded in the driver state. */
+static int qcow_get_cluster_size(struct tdqcow_state *s)
+{
+       int bytes = s->cluster_size;
+
+       return bytes;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Deflate one cluster worth of data from buf and, when the result is
+ * smaller than a cluster, store it at the offset handed out by
+ * get_cluster_offset() (called with allocate == 2 and the compressed
+ * length).  Data that does not compress is left unwritten here.
+ * Returns 0 on success, -1 on allocation, zlib or write failure.
+ */
+static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
+                          const uint8_t *buf)
+{
+       z_stream zs;
+       uint8_t *packed;
+       uint64_t dest;
+       int status, packed_len, rc = -1;
+
+       packed = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+       if (packed == NULL)
+               return -1;
+
+       /* default compression level, small window, no zlib header */
+       memset(&zs, 0, sizeof(zs));
+       if (deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -12,
+                        9, Z_DEFAULT_STRATEGY) != 0)
+               goto release;
+
+       zs.next_in   = (uint8_t *)buf;
+       zs.avail_in  = s->cluster_size;
+       zs.next_out  = packed;
+       zs.avail_out = s->cluster_size;
+
+       status     = deflate(&zs, Z_FINISH);
+       packed_len = zs.next_out - packed;
+       deflateEnd(&zs);
+
+       if (status != Z_STREAM_END && status != Z_OK)
+               goto release;
+
+       if (status == Z_STREAM_END && packed_len < s->cluster_size) {
+               dest  = get_cluster_offset(s, sector_num << 9, 2,
+                                          packed_len, 0, 0);
+               dest &= s->cluster_offset_mask;
+               lseek(s->fd, dest, SEEK_SET);
+               if (write(s->fd, packed, packed_len) != packed_len)
+                       goto release;
+       }
+       /* else: could not compress, nothing is written by this function */
+
+       rc = 0;
+
+release:
+       free(packed);
+       return rc;
+}
+
+/*
+ * Classify the image at path file by looking at the magic number in
+ * its first sizeof(QCowHeader) bytes: *type becomes DISK_TYPE_QCOW
+ * when it equals QCOW_MAGIC and DISK_TYPE_AIO otherwise.
+ *
+ * Returns 0 on success or a negative errno value (-EIO for a short
+ * read that did not set errno).
+ */
+static int
+tdqcow_get_image_type(const char *file, int *type)
+{
+       int fd, err;
+       ssize_t got;
+       QCowHeader header;
+
+       fd = open(file, O_RDONLY);
+       if (fd == -1)
+               return -errno;
+
+       errno = 0;
+       got = read(fd, &header, sizeof(header));
+       err = errno; /* capture before close() can overwrite it */
+       close(fd);
+       if (got != (ssize_t)sizeof(header))
+               return (err ? -err : -EIO);
+
+       be32_to_cpus(&header.magic);
+       if (header.magic == QCOW_MAGIC)
+               *type = DISK_TYPE_QCOW;
+       else
+               *type = DISK_TYPE_AIO;
+
+       return 0;
+}
+
+/*
+ * Look up the backing (parent) image of a qcow disk.
+ *
+ * The backing file name is read from the child image in whole
+ * 512-byte sectors, starting at the sector that contains
+ * backing_file_offset.  Its type is then probed with
+ * tdqcow_get_image_type() and both are stored in *id; id->name is a
+ * strdup()ed copy.
+ *
+ * Returns 0 on success, TD_NO_PARENT when the image has no backing
+ * file, -ENOMEM when memory cannot be allocated and -EINVAL for every
+ * other failure.
+ */
+int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       off_t off;
+       char *buf, *filename;
+       int len, secs, type, err = -EINVAL;
+       struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
+
+       if (!child->backing_file_offset)
+               return TD_NO_PARENT;
+
+       /* read the backing file name */
+       len  = child->backing_file_size;
+       if (len <= 0)
+               return -EINVAL;
+
+       off  = child->backing_file_offset - (child->backing_file_offset % 512);
+       secs = (len + (child->backing_file_offset - off) + 511) >> 9;
+
+       /*
+        * One spare byte for the terminating NUL: when the name ends
+        * exactly on a sector boundary, filename[len] would otherwise
+        * land one byte past the buffer.
+        */
+       if (posix_memalign((void **)&buf, 512, (secs << 9) + 1))
+               return -ENOMEM;
+
+       if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
+               goto out;
+
+       if (read(child->fd, buf, secs << 9) != secs << 9)
+               goto out;
+       filename       = buf + (child->backing_file_offset - off);
+       filename[len]  = '\0';
+
+       if (tdqcow_get_image_type(filename, &type))
+               goto out;
+
+       id->name = strdup(filename);
+       if (!id->name) {
+               err = -ENOMEM;
+               goto out;
+       }
+       id->drivertype = type;
+       err            = 0;
+ out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Check that a parent image can back this child: both files must be
+ * stat()able and get_filesize() must report the same size for them.
+ * Returns 0 when the sizes match and -EINVAL otherwise.  The flags
+ * argument is not used.
+ */
+int tdqcow_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
+       struct tdqcow_state *parent = (struct tdqcow_state *)pdriver->data;
+       uint64_t parent_size, child_size;
+       struct stat sb;
+
+       if (stat(parent->name, &sb) ||
+           get_filesize(parent->name, &parent_size, &sb))
+               return -EINVAL;
+
+       if (stat(child->name, &sb) ||
+           get_filesize(child->name, &child_size, &sb))
+               return -EINVAL;
+
+       return (child_size == parent_size) ? 0 : -EINVAL;
+}
+
+/*
+ * tap_disk operations table for the qcow driver: wires the tdqcow_*
+ * entry points defined in this file into the corresponding callback
+ * slots.  No debug hook is provided.
+ */
+struct tap_disk tapdisk_qcow = {
+       .disk_type          = "tapdisk_qcow",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdqcow_state),
+
+       .td_open            = tdqcow_open,
+       .td_close           = tdqcow_close,
+       .td_queue_read      = tdqcow_queue_read,
+       .td_queue_write     = tdqcow_queue_write,
+       .td_get_parent_id   = tdqcow_get_parent_id,
+       .td_validate_parent = tdqcow_validate_parent,
+       .td_debug           = NULL,
+};
diff --git a/tools/blktap2/drivers/block-ram.c b/tools/blktap2/drivers/block-ram.c
new file mode 100644 (file)
index 0000000..16b4ec9
--- /dev/null
@@ -0,0 +1,269 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/*
+ * In-memory copy of the disk image.  Allocated and filled by tdram_open();
+ * tdram_queue_read()/tdram_queue_write() copy from/to it.
+ */
+char *img;
+/*
+ * Geometry recorded by get_image_info() for the first open; tdram_open()
+ * hands these values back when the image is already open.
+ */
+long int   disksector_size;
+long int   disksize;
+long int   diskinfo;
+/* Incremented by every tdram_open() call, decremented by tdram_close(). */
+static int connections = 0;
+
+/* Per-driver private state for the ram backend. */
+struct tdram_state {
+        /* descriptor of the backing file; set to -1 for secondary opens */
+        int fd;
+};
+
+/*
+ * Get image size and sector size.
+ *
+ * Fills @info for the image open on @fd and records the same values in the
+ * file-level disksector_size/disksize/diskinfo variables.
+ *
+ *  - Block device: size comes from BLKGETSIZE, sector size from BLKSSZGET
+ *    (when available; DEFAULT_SECTOR_SIZE if the ioctl fails or is absent).
+ *  - Anything else: size is st_size >> SECTOR_SHIFT with the default
+ *    sector size.
+ *  - A resulting size of 0 is replaced by MAX_RAMDISK_SIZE.
+ *
+ * Returns 0 on success, -EINVAL if fstat() or BLKGETSIZE fails.
+ */
+static int get_image_info(int fd, td_disk_info_t *info)
+{
+       struct stat st;
+
+       if (fstat(fd, &st) != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(st.st_mode)) {
+               /*Accessing block device directly*/
+               /*
+                * BLKGETSIZE stores an unsigned long; go through a local of
+                * exactly that type instead of aiming the ioctl at the
+                * (possibly wider) info->size field.
+                */
+               unsigned long sectors = 0;
+
+               info->size = 0;
+               if (ioctl(fd, BLKGETSIZE, &sectors) != 0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+               info->size = sectors;
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       /*
+                        * BLKSSZGET stores an int.  Best effort: if the
+                        * ioctl fails the default sector size is kept.
+                        */
+                       int arg = DEFAULT_SECTOR_SIZE;
+
+                       info->sector_size = DEFAULT_SECTOR_SIZE;
+                       if (ioctl(fd, BLKSSZGET, &arg) == 0)
+                               info->sector_size = arg;
+
+                       if (info->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       info->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               info->size = (st.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       }
+
+       if (info->size == 0) {
+               info->size =((uint64_t) MAX_RAMDISK_SIZE);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       /*Store variables locally*/
+       disksector_size = info->sector_size;
+       disksize        = info->size;
+       diskinfo        = info->info;
+       DPRINTF("Image sector_size: \n\t[%lu]\n",
+               info->sector_size);
+
+       return 0;
+}
+
+/*
+ * Open the disk file and initialize ram state.
+ *
+ * The first open reads the whole image at @name into the global img buffer.
+ * Later opens (connections > 1) only copy the cached geometry into
+ * driver->info and record fd = -1 in the private state.
+ *
+ * Returns 0 on success or a negative errno value.  On failure everything
+ * acquired by this call (descriptor, buffer, connection count) is released
+ * again.
+ * NOTE(review): this assumes the caller does not invoke tdram_close() after
+ * a failed open -- confirm against the tapdisk interface layer.
+ */
+int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       char *p;
+       uint64_t size;
+       int i, fd, err = 0, ret = 0, count = 0, o_flags;
+       struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+       connections++;
+
+       if (connections > 1) {
+               driver->info.sector_size = disksector_size;
+               driver->info.size        = disksize;
+               driver->info.info        = diskinfo;
+               DPRINTF("Image already open, returning parameters:\n");
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(driver->info.size << SECTOR_SHIFT),
+                       (long long unsigned)driver->info.size);
+               DPRINTF("Image sector_size: \n\t[%lu]\n",
+                       driver->info.sector_size);
+
+               prv->fd = -1;
+               return 0;
+       }
+
+       prv->fd = -1;
+
+       /* No connection is using a buffer left over from an earlier session */
+       free(img);
+       img = NULL;
+
+       /*
+        * Open the file.  TD_OPEN_RDONLY is tested as a bit so that other
+        * flag bits set alongside it do not turn the open read-write.
+        */
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+
+       if ((fd == -1) && (errno == EINVAL)) {
+
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1) DPRINTF("WARNING: Accessing image without "
+                                     "O_DIRECT! (%s)\n", name);
+
+       } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
+
+       if (fd == -1) {
+               /* capture errno before logging can disturb it */
+               ret = 0 - errno;
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+
+       prv->fd = fd;
+
+       ret = get_image_info(fd, &driver->info);
+       if (ret)
+               goto fail;
+
+       size = MAX_RAMDISK_SIZE;
+
+       if (driver->info.size > size) {
+               DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+                       (MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20);
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /*
+        * The buffer is sized as info.size << SECTOR_SHIFT while the loop
+        * below reads info.size chunks of info.sector_size bytes.  Any other
+        * sector size could never pass the final byte-count check, and a
+        * larger one would write past the buffer first, so reject it here.
+        */
+       if (driver->info.sector_size != (1 << SECTOR_SHIFT)) {
+               DPRINTF("Unsupported sector size [%ld]\n",
+                       driver->info.sector_size);
+               ret = -EINVAL;
+               goto fail;
+       }
+
+       /*Read the image into memory*/
+       /* posix_memalign() returns the error number; it does not set errno */
+       ret = posix_memalign((void **)&img,
+                            DEFAULT_SECTOR_SIZE,
+                            driver->info.size << SECTOR_SHIFT);
+       if (ret) {
+               DPRINTF("Mem malloc failed\n");
+               img = NULL;
+               ret = -ret;
+               goto fail;
+       }
+       p = img;
+       DPRINTF("Reading %llu bytes.......",
+               (long long unsigned)driver->info.size << SECTOR_SHIFT);
+
+       for (i = 0; i < driver->info.size; i++) {
+               ret = read(prv->fd, p, driver->info.sector_size);
+               if (ret != driver->info.sector_size) {
+                       /* a short read leaves errno unset: report EIO */
+                       err = (ret < 0) ? errno : EIO;
+                       DPRINTF("ret = %d, errno = %d\n", ret, err);
+                       break;
+               }
+               count += ret;
+               p = img + count;
+       }
+       DPRINTF("[%d]\n",count);
+
+       if (err || count != driver->info.size << SECTOR_SHIFT) {
+               ret = err ? -err : -EIO;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       if (fd != -1)
+               close(fd);
+       prv->fd = -1;
+       free(img);
+       img = NULL;
+       connections--;
+       return ret;
+}
+
+/*
+ * Serve a read request straight from the in-memory image: copy
+ * treq.secs sectors starting at sector treq.sec into treq.buf, then
+ * complete the request with status 0.
+ */
+void tdram_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       uint64_t    start  = treq.sec * (uint64_t)driver->info.sector_size;
+       int         nbytes = treq.secs * driver->info.sector_size;
+       const char *src    = img + start;
+
+       memcpy(treq.buf, src, nbytes);
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Serve a write request by copying treq.secs sectors from treq.buf into
+ * the in-memory image at sector treq.sec, then complete the request with
+ * status 0.  Nothing is written back to the backing file.
+ */
+void tdram_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       uint64_t  start  = treq.sec * (uint64_t)driver->info.sector_size;
+       int       nbytes = treq.secs * driver->info.sector_size;
+       char     *dst    = img + start;
+
+       /* We assume that write access is controlled
+        * at a higher level for multiple disks */
+       memcpy(dst, treq.buf, nbytes);
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Drop one connection to the ram disk.
+ *
+ * Closes the backing-file descriptor held by this driver instance (only
+ * the first opener holds one; secondary opens store -1) and, once the last
+ * connection is gone, frees the in-memory image.
+ *
+ * NOTE(review): assumes this is only called for a driver whose tdram_open()
+ * succeeded, so that prv->fd is either a valid descriptor or -1 -- confirm
+ * against the caller.
+ *
+ * Always returns 0.
+ */
+int tdram_close(td_driver_t *driver)
+{
+       struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+       if (prv->fd != -1) {
+               close(prv->fd);
+               prv->fd = -1;
+       }
+
+       connections--;
+
+       if (connections <= 0) {
+               /* last user gone: the image buffer is no longer reachable */
+               free(img);
+               img = NULL;
+               connections = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Parent lookup for the ram backend: unconditionally returns TD_NO_PARENT.
+ * Neither @driver nor @id is examined or modified.
+ */
+int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       return TD_NO_PARENT;
+}
+
+/*
+ * Parent validation for the ram backend: every candidate parent is
+ * rejected with -EINVAL; the arguments are not examined.
+ */
+int tdram_validate_parent(td_driver_t *driver,
+                         td_driver_t *pdriver, td_flag_t flags)
+{
+       return -EINVAL;
+}
+
+/*
+ * Driver descriptor for the ram backend: binds the tdram_* handlers to the
+ * tap_disk operations table and reports the size of the per-driver private
+ * state (struct tdram_state).  No debug hook is provided.
+ */
+struct tap_disk tapdisk_ram = {
+       .disk_type          = "tapdisk_ram",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdram_state),
+       .td_open            = tdram_open,
+       .td_close           = tdram_close,
+       .td_queue_read      = tdram_queue_read,
+       .td_queue_write     = tdram_queue_write,
+       .td_get_parent_id   = tdram_get_parent_id,
+       .td_validate_parent = tdram_validate_parent,
+       .td_debug           = NULL,
+};
diff --git a/tools/blktap2/drivers/block-vhd.c b/tools/blktap2/drivers/block-vhd.c
new file mode 100644 (file)
index 0000000..54431c1
--- /dev/null
@@ -0,0 +1,2321 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * A note on write transactions:
+ * Writes that require updating the BAT or bitmaps cannot be signaled
+ * as complete until all updates have reached disk.  Transactions are
+ * used to ensure proper ordering in these cases.  The two types of
+ * transactions are as follows:
+ *   - Bitmap updates only: data writes that require updates to the same
+ *     bitmap are grouped in a transaction.  Only after all data writes
+ *     in a transaction complete does the bitmap write commence.  Only
+ *     after the bitmap write finishes are the data writes signalled as
+ *     complete.
+ *   - BAT and bitmap updates: data writes are grouped in transactions
+ *     as above, but a special extra write is included in the transaction,
+ *     which zeros out the newly allocated bitmap on disk.  When the data
+ *     writes and the zero-bitmap write complete, the BAT and bitmap writes
+ *     are started in parallel.  The transaction is completed only after both
+ *     the BAT and bitmap writes successfully return.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */
+                       /* e2fsprogs-devel.                            */
+#include <string.h>    /* for memset.                                 */
+#include <libaio.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+/* Sectors per block of the most recently opened image (set in __vhd_open) */
+unsigned int SPB;
+
+/* Selects the DBG/ERR/TRACE implementation below: 1 = DPRINTF, 2 = tlog */
+#define DEBUGGING   2
+/* 1 enables ASSERT(); any other value compiles assertions out */
+#define ASSERTING   1
+/* NOTE(review): not referenced in this part of the file -- usage unverified */
+#define MICROSOFT_COMPAT
+
+/* Upper bound on batmap read attempts in vhd_initialize_bat() */
+#define VHD_BATMAP_MAX_RETRIES 10
+
+/*
+ * Log the request counters of vhd_state @s (queued, completed, returned),
+ * the number of data requests currently taken from the free list, and the
+ * block number of the pending BAT write, at TLOG_DBG level.
+ */
+#define __TRACE(s)                                                     \
+       do {                                                            \
+               DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %"   \
+                   PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: "   \
+                   "%lu, BBLK: 0x%04x\n",                              \
+                   s->vhd.file, s->queued, s->completed, s->returned,  \
+                   VHD_REQS_DATA - s->vreq_free_count,                 \
+                   s->bat.pbw_blk);                                    \
+       } while(0)
+
+/*
+ * Assertion helper: when @_p is false, report the failed expression through
+ * both DPRINTF and the tlog, flush the log, and then force a crash by
+ * writing through a null pointer.
+ *
+ * Wrapped in do { } while (0) so that the macro expands to exactly one
+ * statement and cannot capture the else branch of a surrounding if.
+ */
+#define __ASSERT(_p)                                                   \
+       do {                                                            \
+               if (!(_p)) {                                            \
+                       DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n",      \
+                               __FILE__, __LINE__, #_p);               \
+                       DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n", \
+                           __FILE__, __LINE__, #_p);                   \
+                       tlog_flush();                                   \
+                       *(int*)0 = 0;                                   \
+               }                                                       \
+       } while (0)
+
+/*
+ * Logging front ends, chosen by DEBUGGING:
+ *   1     - DBG and ERR go to DPRINTF, TRACE is a no-op
+ *   2     - DBG goes to tlog_write, ERR to tlog_error, TRACE to __TRACE
+ *   other - all three compile to nothing
+ */
+#if (DEBUGGING == 1)
+  #define DBG(level, _f, _a...)      DPRINTF(_f, ##_a)
+  #define ERR(err, _f, _a...)        DPRINTF("ERROR: %d: " _f, err, ##_a)
+  #define TRACE(s)                   ((void)0)
+#elif (DEBUGGING == 2)
+  #define DBG(level, _f, _a...)      tlog_write(level, _f, ##_a)
+  #define ERR(err, _f, _a...)       tlog_error(err, _f, ##_a)
+  #define TRACE(s)                   __TRACE(s)
+#else
+  #define DBG(level, _f, _a...)      ((void)0)
+  #define ERR(err, _f, _a...)        ((void)0)
+  #define TRACE(s)                   ((void)0)
+#endif
+
+/* ASSERT() is live only when ASSERTING is 1; otherwise it is a no-op */
+#if (ASSERTING == 1)
+  #define ASSERT(_p)                 __ASSERT(_p)
+#else
+  #define ASSERT(_p)                 ((void)0)
+#endif
+
+/******VHD DEFINES******/
+/* number of entries in the per-image bitmap cache (see struct vhd_state) */
+#define VHD_CACHE_SIZE               32
+
+/* request pool sizing: data requests plus cache-size + 2 metadata requests */
+#define VHD_REQS_DATA                TAPDISK_DATA_REQUESTS
+#define VHD_REQS_META                (VHD_CACHE_SIZE + 2)
+#define VHD_REQS_TOTAL               (VHD_REQS_DATA + VHD_REQS_META)
+
+/* operation codes -- presumably the values stored in vhd_request.op */
+#define VHD_OP_BAT_WRITE             0
+#define VHD_OP_DATA_READ             1
+#define VHD_OP_DATA_WRITE            2
+#define VHD_OP_BITMAP_READ           3
+#define VHD_OP_BITMAP_WRITE          4
+#define VHD_OP_ZERO_BM_WRITE         5
+
+/* bitmap/BAT lookup result codes (consumers are outside this excerpt) */
+#define VHD_BM_BAT_LOCKED            0
+#define VHD_BM_BAT_CLEAR             1
+#define VHD_BM_BIT_CLEAR             2
+#define VHD_BM_BIT_SET               3
+#define VHD_BM_NOT_CACHED            4
+#define VHD_BM_READ_PENDING          5
+
+/* open-time flag bits kept in vhd_state.flags (set up by _vhd_open) */
+#define VHD_FLAG_OPEN_RDONLY         1
+#define VHD_FLAG_OPEN_NO_CACHE       2
+#define VHD_FLAG_OPEN_QUIET          4
+#define VHD_FLAG_OPEN_STRICT         8
+#define VHD_FLAG_OPEN_QUERY          16
+#define VHD_FLAG_OPEN_PREALLOCATE    32
+
+/* flag bits named for the BAT state -- presumably vhd_bat_state.status */
+#define VHD_FLAG_BAT_LOCKED          1
+#define VHD_FLAG_BAT_WRITE_STARTED   2
+
+/* flag bits named for cached bitmaps -- presumably vhd_bitmap.status */
+#define VHD_FLAG_BM_UPDATE_BAT       1
+#define VHD_FLAG_BM_WRITE_PENDING    2
+#define VHD_FLAG_BM_READ_PENDING     4
+#define VHD_FLAG_BM_LOCKED           8
+
+/* flag bits named for requests -- presumably vhd_request.flags */
+#define VHD_FLAG_REQ_UPDATE_BAT      1
+#define VHD_FLAG_REQ_UPDATE_BITMAP   2
+#define VHD_FLAG_REQ_QUEUED          4
+#define VHD_FLAG_REQ_FINISHED        8
+
+/* flag bits named for transactions -- presumably vhd_transaction.status */
+#define VHD_FLAG_TX_LIVE             1
+#define VHD_FLAG_TX_UPDATE_BAT       2
+
+/* all flag words above fit in eight bits */
+typedef uint8_t vhd_flag_t;
+
+struct vhd_state;
+struct vhd_request;
+
+/*
+ * List of vhd_request objects identified by its first and last element;
+ * elements are presumably chained through vhd_request.next.
+ */
+struct vhd_req_list {
+       struct vhd_request       *head;
+       struct vhd_request       *tail;
+};
+
+/*
+ * Bookkeeping for one write transaction (see the note on write
+ * transactions in the file header): an error code, progress counters, a
+ * status flag word and the list of requests that belong to it.
+ * NOTE(review): exact meaning of closed/started/finished is defined by
+ * code outside this excerpt.
+ */
+struct vhd_transaction {
+       int                       error;
+       int                       closed;
+       int                       started;
+       int                       finished;
+       vhd_flag_t                status;
+       struct vhd_req_list       requests;
+};
+
+/*
+ * One VHD-level I/O request: wraps the tapdisk request (treq) and its
+ * asynchronous I/O control block (tiocb), and links back to the owning
+ * vhd_state and, when applicable, to the transaction it is part of.
+ */
+struct vhd_request {
+       int                       error;
+       uint8_t                   op;          /* presumably a VHD_OP_* code */
+       vhd_flag_t                flags;       /* presumably VHD_FLAG_REQ_* bits */
+       td_request_t              treq;
+       struct tiocb              tiocb;
+       struct vhd_state         *state;
+       struct vhd_request       *next;
+       struct vhd_transaction   *tx;
+};
+
+/*
+ * In-memory state of the block allocation table: the BAT and batmap read
+ * from the image, the pending BAT write (if any), two embedded requests
+ * used for BAT and zero-bitmap writes, and a sector-sized aligned scratch
+ * buffer (bat_buf, allocated in vhd_initialize_bat).
+ */
+struct vhd_bat_state {
+       vhd_bat_t                 bat;
+       vhd_batmap_t              batmap;
+       vhd_flag_t                status;
+       uint32_t                  pbw_blk;     /* blk num of pending write */
+       uint64_t                  pbw_offset;  /* file offset of same */
+       struct vhd_request        req;         /* for writing bat table */
+       struct vhd_request        zero_req;    /* for initializing bitmaps */
+       char                     *bat_buf;
+};
+
+/*
+ * One entry of the bitmap cache.  Entries live in vhd_state.bitmap_list;
+ * map and shadow are allocated in vhd_initialize_bitmap_cache() and
+ * released in vhd_free_bitmap_cache().
+ */
+struct vhd_bitmap {
+       u32                       blk;         /* presumably the block this
+                                               * bitmap belongs to */
+       u64                       seqno;       /* lru sequence number */
+       vhd_flag_t                status;
+
+       char                     *map;         /* map should only be modified
+                                               * in finish_bitmap_write */
+       char                     *shadow;      /* in-memory bitmap changes are 
+                                               * made to shadow and copied to
+                                               * map only after having been
+                                               * flushed to disk */
+       struct vhd_transaction    tx;          /* transaction data structure
+                                               * encapsulating data, bitmap, 
+                                               * and bat writes */
+       struct vhd_req_list       queue;       /* data writes waiting for next
+                                               * transaction */
+       struct vhd_req_list       waiting;     /* pending requests that cannot
+                                               * be serviced until this bitmap
+                                               * is read from disk */
+       struct vhd_request        req;
+};
+
+/*
+ * Per-image driver state (the driver's private data, zeroed and filled in
+ * by __vhd_open): open flags, the libvhd context, geometry, BAT state, the
+ * bitmap cache with its free list, the pool of data requests with its free
+ * list, and I/O statistics counters.
+ */
+struct vhd_state {
+       vhd_flag_t                flags;
+
+        /* VHD stuff */
+       vhd_context_t             vhd;
+       u32                       spp;         /* sectors per page */
+        u32                       spb;         /* sectors per block */
+        u64                       next_db;     /* pointer to the next 
+                                               * (unallocated) datablock */
+
+       struct vhd_bat_state      bat;
+
+       u64                       bm_lru;      /* lru sequence number */
+       u32                       bm_secs;     /* size of bitmap, in sectors */
+       struct vhd_bitmap        *bitmap[VHD_CACHE_SIZE];
+
+       /* free list over bitmap_list; bm_free_count entries are available */
+       int                       bm_free_count;
+       struct vhd_bitmap        *bitmap_free[VHD_CACHE_SIZE];
+       struct vhd_bitmap         bitmap_list[VHD_CACHE_SIZE];
+
+       /* free list over vreq_list; vreq_free_count entries are available */
+       int                       vreq_free_count;
+       struct vhd_request       *vreq_free[VHD_REQS_DATA];
+       struct vhd_request        vreq_list[VHD_REQS_DATA];
+
+       td_driver_t              *driver;
+
+       /* counters; queued/completed/returned are reported by __TRACE */
+       uint64_t                  queued;
+       uint64_t                  completed;
+       uint64_t                  returned;
+       uint64_t                  reads;
+       uint64_t                  read_size;
+       uint64_t                  writes;
+       uint64_t                  write_size;
+};
+
+/* Flag-word helpers: test, set and clear bits of @flag in @word */
+#define test_vhd_flag(word, flag)  ((word) & (flag))
+#define set_vhd_flag(word, flag)   ((word) |= (flag))
+#define clear_vhd_flag(word, flag) ((word) &= ~(flag))
+
+/* BAT entry for block @blk of vhd_state @s */
+#define bat_entry(s, blk)          ((s)->bat.bat.bat[(blk)])
+
+static void vhd_complete(void *, struct tiocb *, int);
+static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
+
+/*
+ * Process-wide read-only zero buffer shared by all open images:
+ * _vhd_zeros/_vhd_zsize describe the mapping created by vhd_initialize(),
+ * _vhd_master is the state that created it and is the only one allowed to
+ * tear it down in vhd_free().
+ */
+static struct vhd_state  *_vhd_master;
+static unsigned long      _vhd_zsize;
+static char              *_vhd_zeros;
+
+/*
+ * Create the shared zero buffer on first use.
+ *
+ * Does nothing when the buffer already exists.  Otherwise maps two pages of
+ * read-only anonymous memory (plus VHD_BLOCK_SIZE when @s was opened with
+ * VHD_FLAG_OPEN_PREALLOCATE) and records @s as the owner of the mapping.
+ *
+ * NOTE(review): the size is fixed by whichever state gets here first; a
+ * later state with different PREALLOCATE needs reuses the existing mapping.
+ *
+ * Returns 0 on success or -errno from mmap().
+ */
+static int
+vhd_initialize(struct vhd_state *s)
+{
+       int err;
+
+       if (_vhd_zeros)
+               return 0;
+
+       _vhd_zsize = 2 * getpagesize();
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+               _vhd_zsize += VHD_BLOCK_SIZE;
+
+       _vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
+                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (_vhd_zeros == MAP_FAILED) {
+               /* save errno first: logging may overwrite it */
+               err = -errno;
+               EPRINTF("vhd_initialize failed: %d\n", err);
+               _vhd_zeros = NULL;
+               _vhd_zsize = 0;
+               return err;
+       }
+
+       _vhd_master = s;
+       return 0;
+}
+
+/*
+ * Tear down the shared zero buffer, but only when @s is the state that
+ * created it and the buffer actually exists; otherwise do nothing.
+ */
+static void
+vhd_free(struct vhd_state *s)
+{
+       if (_vhd_zeros && _vhd_master == s) {
+               munmap(_vhd_zeros, _vhd_zsize);
+               _vhd_master = NULL;
+               _vhd_zeros  = NULL;
+               _vhd_zsize  = 0;
+       }
+}
+
+/*
+ * Return the shared zero buffer for a caller that needs @size bytes of
+ * zeros.  A request that cannot be satisfied (no buffer, or buffer smaller
+ * than @size) is logged with the calling function's name and trips
+ * ASSERT(0).
+ */
+static char *
+_get_vhd_zeros(const char *func, unsigned long size)
+{
+       int usable = (_vhd_zeros && size <= _vhd_zsize);
+
+       if (!usable) {
+               EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
+                       func, size, _vhd_zsize, _vhd_zeros);
+               ASSERT(0);
+       }
+
+       return _vhd_zeros;
+}
+
+/* convenience wrapper that supplies the caller's function name */
+#define vhd_zeros(size)        _get_vhd_zeros(__func__, size)
+
+/*
+ * Mark block @blk in the batmap and log it as completely full.  A no-op
+ * when no batmap is loaded.
+ */
+static inline void
+set_batmap(struct vhd_state *s, uint32_t blk)
+{
+       if (!s->bat.batmap.map)
+               return;
+
+       vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
+       DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
+}
+
+/*
+ * Query the batmap for block @blk.  Returns the result of
+ * vhd_batmap_test(), or 0 when no batmap is loaded.
+ */
+static inline int
+test_batmap(struct vhd_state *s, uint32_t blk)
+{
+       return s->bat.batmap.map
+               ? vhd_batmap_test(&s->vhd, &s->bat.batmap, blk)
+               : 0;
+}
+
+/*
+ * Overwrite the last 512 bytes of the image file with the byte 0xc7.
+ * Called from __vhd_open for strict, writable opens; fixed-type disks
+ * (HD_TYPE_FIXED) are left untouched.
+ *
+ * Returns 0 on success, a negative errno value on failure, or -EIO when
+ * the write is short.
+ */
+static int
+vhd_kill_footer(struct vhd_state *s)
+{
+       int err;
+       off64_t end;
+       ssize_t n;
+       char *junk;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return 0;
+
+       /* posix_memalign() returns the error number; it does not set errno */
+       err = posix_memalign((void **)&junk, 512, 512);
+       if (err)
+               return -err;
+
+       /* memset() only uses the low byte of its fill argument */
+       memset(junk, 0xc7, 512);
+
+       if ((end = lseek64(s->vhd.fd, 0, SEEK_END)) == -1) {
+               err = errno ? -errno : -EIO;
+               goto out;
+       }
+
+       if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1) {
+               err = errno ? -errno : -EIO;
+               goto out;
+       }
+
+       n = write(s->vhd.fd, junk, 512);
+       if (n != 512) {
+               /* errno is only meaningful when write() itself failed */
+               err = (n < 0 && errno) ? -errno : -EIO;
+               goto out;
+       }
+
+       err = 0;
+
+ out:
+       free(junk);
+       return err;
+}
+
+/*
+ * Compute s->next_db, the sector at which the next data block would be
+ * placed: start at the sector-rounded end of the headers, then move past
+ * every block already present in the BAT (a block occupies bm_secs bitmap
+ * sectors followed by spb data sectors).
+ *
+ * Returns 0 on success or the error from vhd_end_of_headers().
+ */
+static inline int
+find_next_free_block(struct vhd_state *s)
+{
+       int err;
+       off64_t eom;
+       uint32_t i, entry;
+
+       err = vhd_end_of_headers(&s->vhd, &eom);
+       if (err)
+               return err;
+
+       s->next_db = secs_round_up(eom);
+
+       for (i = 0; i < s->bat.bat.entries; i++) {
+               entry = bat_entry(s, i);
+               if (entry != DD_BLK_UNUSED && entry >= s->next_db)
+                       /*
+                        * widen before adding: the sum of three 32-bit
+                        * values can exceed 32 bits, next_db is 64-bit
+                        */
+                       s->next_db = (uint64_t)entry + s->spb + s->bm_secs;
+       }
+
+       return 0;
+}
+
+/*
+ * Release the BAT, the batmap and the BAT scratch buffer, then reset the
+ * whole BAT state to zero.
+ *
+ * The reset must cover all of s->bat (a struct vhd_bat_state), not just its
+ * leading vhd_bat_t member: the open error paths can reach this function
+ * more than once, and any pointer left behind would be freed again.
+ */
+static void
+vhd_free_bat(struct vhd_state *s)
+{
+       free(s->bat.bat.bat);
+       free(s->bat.batmap.map);
+       free(s->bat.bat_buf);
+       memset(&s->bat, 0, sizeof(s->bat));
+}
+
+/*
+ * Load the allocation metadata of a dynamic disk into s->bat.
+ *
+ *  - reads the BAT;
+ *  - for writable opens, computes the next free data block;
+ *  - if the image has a batmap, reads it.  For writable opens a read error
+ *    is fatal; for read-only opens the read is retried up to
+ *    VHD_BATMAP_MAX_RETRIES times and a persistent error is ignored;
+ *  - allocates the sector-sized, sector-aligned BAT scratch buffer.
+ *
+ * Returns 0 on success or a negative error code; on failure after the BAT
+ * was read, everything allocated here is released again.
+ */
+static int
+vhd_initialize_bat(struct vhd_state *s)
+{
+       int err, batmap_required, i;
+
+       /* clear the complete vhd_bat_state, not only its first member */
+       memset(&s->bat, 0, sizeof(s->bat));
+
+       err = vhd_read_bat(&s->vhd, &s->bat.bat);
+       if (err) {
+               EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
+               return err;
+       }
+
+       batmap_required = 1;
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
+               batmap_required = 0;
+       } else {
+               err = find_next_free_block(s);
+               if (err)
+                       goto fail;
+       }
+
+       if (vhd_has_batmap(&s->vhd)) {
+               for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
+                       err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
+                       if (err) {
+                               EPRINTF("%s: reading batmap: %d\n",
+                                               s->vhd.file, err);
+                               if (batmap_required)
+                                       goto fail;
+                       } else {
+                               break;
+                       }
+               }
+               if (err)
+                       EPRINTF("%s: ignoring non-critical batmap error\n",
+                                       s->vhd.file);
+       }
+
+       err = posix_memalign((void **)&s->bat.bat_buf,
+                            VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
+       if (err) {
+               s->bat.bat_buf = NULL;
+               /* posix_memalign() returns a positive errno value */
+               err = -err;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       vhd_free_bat(s);
+       return err;
+}
+
+/*
+ * Release the map and shadow buffers of every bitmap cache entry, empty
+ * the free-list slots, and reset the cache entries to zero.
+ */
+static void
+vhd_free_bitmap_cache(struct vhd_state *s)
+{
+       int idx;
+
+       for (idx = 0; idx < VHD_CACHE_SIZE; idx++) {
+               free(s->bitmap_list[idx].map);
+               free(s->bitmap_list[idx].shadow);
+               s->bitmap_free[idx] = NULL;
+       }
+
+       memset(s->bitmap_list, 0, sizeof(s->bitmap_list));
+}
+
+/*
+ * Set up the bitmap cache: for each of the VHD_CACHE_SIZE entries allocate
+ * a 512-byte-aligned map and shadow buffer of bm_secs sectors, zero both,
+ * and put the entry on the free list.
+ *
+ * Returns 0 on success or a negative errno value; on failure every buffer
+ * allocated so far is released again.
+ */
+static int
+vhd_initialize_bitmap_cache(struct vhd_state *s)
+{
+       int i, err, map_size;
+       struct vhd_bitmap *bm;
+
+       memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
+
+       s->bm_lru        = 0;
+       map_size         = vhd_sectors_to_bytes(s->bm_secs);
+       s->bm_free_count = VHD_CACHE_SIZE;
+
+       for (i = 0; i < VHD_CACHE_SIZE; i++) {
+               bm = s->bitmap_list + i;
+
+               err = posix_memalign((void **)&bm->map, 512, map_size);
+               if (err) {
+                       bm->map = NULL;
+                       goto fail;
+               }
+
+               err = posix_memalign((void **)&bm->shadow, 512, map_size);
+               if (err) {
+                       bm->shadow = NULL;
+                       goto fail;
+               }
+
+               memset(bm->map, 0, map_size);
+               memset(bm->shadow, 0, map_size);
+               s->bitmap_free[i] = bm;
+       }
+
+       return 0;
+
+fail:
+       vhd_free_bitmap_cache(s);
+       /* posix_memalign() returns a positive errno value */
+       return -err;
+}
+
+/*
+ * Prepare the state needed for a dynamic (sparse) disk: read and check the
+ * dynamic-disk header, derive the sector geometry, and -- unless the image
+ * was opened with VHD_FLAG_OPEN_NO_CACHE -- load the BAT and set up the
+ * bitmap cache.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported header version, or the
+ * error of the step that failed.
+ */
+static int
+vhd_initialize_dynamic_disk(struct vhd_state *s)
+{
+       int rc = vhd_get_header(&s->vhd);
+
+       if (rc) {
+               if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+                       EPRINTF("Error reading VHD DD header.\n");
+               return rc;
+       }
+
+       if (s->vhd.header.hdr_ver != 0x00010000) {
+               EPRINTF("unsupported header version! (0x%x)\n",
+                       s->vhd.header.hdr_ver);
+               return -EINVAL;
+       }
+
+       s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
+       s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
+       s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
+
+       /* query-style opens stop here: no BAT, no bitmap cache */
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
+               return 0;
+
+       rc = vhd_initialize_bat(s);
+       if (rc)
+               return rc;
+
+       rc = vhd_initialize_bitmap_cache(s);
+       if (rc)
+               vhd_free_bat(s);
+
+       return rc;
+}
+
+/*
+ * Reject images written by a newer "tap" creator than this code supports.
+ * Images whose creator application does not start with "tap" are accepted
+ * without a version check.
+ *
+ * Returns 0 when the image is acceptable, -EINVAL otherwise (with a warning
+ * unless the image was opened quietly).
+ */
+static int
+vhd_check_version(struct vhd_state *s)
+{
+       int made_by_tap = !strncmp(s->vhd.footer.crtr_app, "tap", 3);
+
+       if (!made_by_tap || s->vhd.footer.crtr_ver <= VHD_CURRENT_VERSION)
+               return 0;
+
+       if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               EPRINTF("WARNING: %s vhd creator version 0x%08x, "
+                       "but only versions up to 0x%08x are "
+                       "supported for IO\n", s->vhd.file,
+                       s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
+
+       return -EINVAL;
+}
+
+/*
+ * Log a one-line summary of a freshly opened image (skipped for quiet
+ * opens): creator application and version, and for dynamic disks also the
+ * number of BAT entries, allocated blocks, blocks flagged in the batmap,
+ * and the next free data block.
+ */
+static void
+vhd_log_open(struct vhd_state *s)
+{
+       char buf[5];
+       uint32_t i, allocated, full;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               return;
+
+       /*
+        * Bound the copy with a precision: a plain "%s" keeps scanning the
+        * source for a terminator even though the output is truncated, and
+        * the creator field is not guaranteed to contain one.
+        */
+       snprintf(buf, sizeof(buf), "%.*s",
+                (int)sizeof(buf) - 1, s->vhd.footer.crtr_app);
+       if (!vhd_type_dynamic(&s->vhd)) {
+               DPRINTF("%s version: %s 0x%08x\n",
+                       s->vhd.file, buf, s->vhd.footer.crtr_ver);
+               return;
+       }
+
+       allocated = 0;
+       full      = 0;
+
+       for (i = 0; i < s->bat.bat.entries; i++) {
+               if (bat_entry(s, i) != DD_BLK_UNUSED)
+                       allocated++;
+               if (test_batmap(s, i))
+                       full++;
+       }
+
+       DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+               s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
+               allocated, full, s->next_db);
+}
+
+/*
+ * Open the VHD image @name into the driver's private vhd_state.
+ *
+ * Steps, in order: zero the state and record flags/driver; make sure the
+ * shared zero buffer exists; open the image through libvhd (a failed open
+ * is retried once after libvhd_set_log_level(1)); check the creator
+ * version; for dynamic disks load header, BAT and bitmap cache; log the
+ * open; populate the free list of data requests; fill in driver->info; and
+ * for strict, writable opens call vhd_kill_footer().
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int
+__vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
+{
+        int i, o_flags, err;
+       struct vhd_state *s;
+
+        DBG(TLOG_INFO, "vhd_open: %s\n", name);
+       if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
+               libvhd_set_log_level(1);
+
+       s = (struct vhd_state *)driver->data;
+       memset(s, 0, sizeof(struct vhd_state));
+
+       s->flags  = flags;
+       s->driver = driver;
+
+       err = vhd_initialize(s);
+       if (err)
+               return err;
+
+       o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ? 
+                  VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
+
+       err = vhd_open(&s->vhd, name, o_flags);
+       if (err) {
+               /* second attempt with libvhd logging switched to level 1 */
+               libvhd_set_log_level(1);
+               err = vhd_open(&s->vhd, name, o_flags);
+               if (err) {
+                       EPRINTF("Unable to open [%s] (%d)!\n", name, err);
+                       /*
+                        * NOTE(review): unlike the fail: path below, this
+                        * return does not call vhd_free(s) -- confirm that
+                        * is intended.
+                        */
+                       return err;
+               }
+       }
+
+       err = vhd_check_version(s);
+       if (err)
+               goto fail;
+
+       /* defaults for non-dynamic disks; overwritten for dynamic ones */
+       s->spb = s->spp = 1;
+
+       if (vhd_type_dynamic(&s->vhd)) {
+               err = vhd_initialize_dynamic_disk(s);
+               if (err)
+                       goto fail;
+       }
+
+       vhd_log_open(s);
+
+       SPB = s->spb;
+
+       /* initially every data request is free */
+       s->vreq_free_count = VHD_REQS_DATA;
+       for (i = 0; i < VHD_REQS_DATA; i++)
+               s->vreq_free[i] = s->vreq_list + i;
+
+       driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
+       driver->info.sector_size = VHD_SECTOR_SIZE;
+       driver->info.info        = 0;
+
+        DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n",
+           driver->info.size, driver->info.sector_size, driver->info.info);
+
+       if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) && 
+           !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
+               err = vhd_kill_footer(s);
+               if (err) {
+                       DPRINTF("ERROR killing footer: %d\n", err);
+                       goto fail;
+               }
+               s->writes++;
+       }
+
+        return 0;
+
+ fail:
+       /*
+        * NOTE(review): vhd_initialize_dynamic_disk() and
+        * vhd_initialize_bat() already call vhd_free_bat() on their own
+        * error paths, so it can run a second time here -- it must be safe
+        * to call repeatedly.
+        */
+       vhd_free_bat(s);
+       vhd_free_bitmap_cache(s);
+       vhd_close(&s->vhd);
+       vhd_free(s);
+       return err;
+}
+
+/*
+ * tapdisk open hook: translate the generic TD_OPEN_* flags into their
+ * VHD_FLAG_OPEN_* counterparts and hand the open off to __vhd_open().
+ */
+static int
+_vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       vhd_flag_t vflags;
+
+       vflags  = 0;
+       vflags |= ((flags & TD_OPEN_RDONLY) ? VHD_FLAG_OPEN_RDONLY : 0);
+       vflags |= ((flags & TD_OPEN_QUIET)  ? VHD_FLAG_OPEN_QUIET  : 0);
+       vflags |= ((flags & TD_OPEN_STRICT) ? VHD_FLAG_OPEN_STRICT : 0);
+
+       /* a query open also implies quiet, read-only and no-cache */
+       if (flags & TD_OPEN_QUERY) {
+               vflags |= VHD_FLAG_OPEN_QUERY;
+               vflags |= VHD_FLAG_OPEN_QUIET;
+               vflags |= VHD_FLAG_OPEN_RDONLY;
+               vflags |= VHD_FLAG_OPEN_NO_CACHE;
+       }
+
+       /* pre-allocate for all but NFS and LVM storage */
+       if (!(driver->storage == TAPDISK_STORAGE_TYPE_NFS ||
+             driver->storage == TAPDISK_STORAGE_TYPE_LVM))
+               vflags |= VHD_FLAG_OPEN_PREALLOCATE;
+
+       return __vhd_open(driver, name, vflags);
+}
+
+/*
+ * Log a one-line summary of the BAT at close time: total entries,
+ * entries that are allocated, entries flagged in the batmap, and the
+ * current value of next_db.  Nothing is logged for quiet opens.
+ */
+static void
+vhd_log_close(struct vhd_state *s)
+{
+       uint32_t blk;
+       uint32_t nr_allocated = 0;
+       uint32_t nr_full      = 0;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
+               return;
+
+       blk = 0;
+       while (blk < s->bat.bat.entries) {
+               if (bat_entry(s, blk) != DD_BLK_UNUSED)
+                       nr_allocated++;
+               if (test_batmap(s, blk))
+                       nr_full++;
+               blk++;
+       }
+
+       DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
+               s->vhd.file, s->bat.bat.entries,
+               nr_allocated, nr_full, s->next_db);
+}
+
+/*
+ * tapdisk close hook.  For writable images the footer (and the batmap,
+ * when the image has one) is written back before all driver state is
+ * released and zeroed.  Write-back failures are logged only; the
+ * function always returns 0.
+ */
+static int
+_vhd_close(td_driver_t *driver)
+{
+       int err, flush;
+       struct vhd_state *s;
+
+       DBG(TLOG_WARN, "vhd_close\n");
+       s = (struct vhd_state *)driver->data;
+
+       /*
+        * never write the footer if tapdisk is read-only; otherwise
+        * write it if:
+        *   - we killed it on open (opened with strict)
+        *   - we've written data since opening
+        */
+       flush = (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY) &&
+                (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes));
+
+       if (flush) {
+               /* lend the cached BAT to the vhd context for the write */
+               memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
+               err = vhd_write_footer(&s->vhd, &s->vhd.footer);
+               memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
+
+               if (err)
+                       EPRINTF("writing %s footer: %d\n", s->vhd.file, err);
+
+               if (vhd_has_batmap(&s->vhd)) {
+                       err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
+                       if (err)
+                               EPRINTF("writing %s batmap: %d\n", s->vhd.file, err);
+               }
+       }
+
+       vhd_log_close(s);
+       vhd_free_bat(s);
+       vhd_free_bitmap_cache(s);
+       vhd_close(&s->vhd);
+       vhd_free(s);
+
+       memset(s, 0, sizeof(struct vhd_state));
+
+       return 0;
+}
+
+/*
+ * Check that parent_driver is an acceptable parent for the VHD image
+ * behind child_driver.
+ *
+ * A non-VHD parent is accepted only when the child is a VHD
+ * differencing disk for which vhd_parent_raw() is true.  A VHD parent
+ * must have a footer uuid equal to the prt_uuid stored in the child's
+ * header.  The flags argument is not examined.
+ *
+ * Returns 0 if the pairing is accepted, -EINVAL otherwise.
+ */
+int
+vhd_validate_parent(td_driver_t *child_driver,
+                   td_driver_t *parent_driver, td_flag_t flags)
+{
+       /* only referenced by the disabled mtime check further down */
+       struct stat stats;
+       struct vhd_state *child  = (struct vhd_state *)child_driver->data;
+       struct vhd_state *parent;
+
+       /* non-VHD parent: only valid beneath a raw-parented diff disk */
+       if (parent_driver->type != DISK_TYPE_VHD) {
+               if (child_driver->type != DISK_TYPE_VHD)
+                       return -EINVAL;
+               if (child->vhd.footer.type != HD_TYPE_DIFF)
+                       return -EINVAL;
+               if (!vhd_parent_raw(&child->vhd))
+                       return -EINVAL;
+               return 0;
+       }
+
+       parent = (struct vhd_state *)parent_driver->data;
+
+       /* 
+        * This check removed because of cases like:
+        *   - parent VHD marked as 'hidden'
+        *   - parent VHD modified during coalesce
+        */
+       /*
+       if (stat(parent->vhd.file, &stats)) {
+               DPRINTF("ERROR stating parent file %s\n", parent->vhd.file);
+               return -errno;
+       }
+
+       if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) {
+               DPRINTF("ERROR: parent file has been modified since "
+                       "snapshot.  Child image no longer valid.\n");
+               return -EINVAL;
+       }
+       */
+
+       /* uuid_compare() is non-zero when the two uuids differ */
+       if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) {
+               DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
+                       "snapshot.  Child image no longer valid.\n",
+                       __func__, child->vhd.file, parent->vhd.file);
+               return -EINVAL;
+       }
+
+       /* TODO: compare sizes */
+       
+       return 0;
+}
+
+/*
+ * Fill in @id with the name and driver type of this image's parent.
+ *
+ * Returns TD_NO_PARENT if the image is not a differencing disk, the
+ * error from vhd_parent_locator_get() if the parent cannot be located,
+ * and 0 on success.  @id is zeroed in every case before being filled.
+ */
+int
+vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+       char *parent_name;
+       int err;
+
+       DBG(TLOG_DBG, "\n");
+       memset(id, 0, sizeof(*id));
+
+       if (s->vhd.footer.type != HD_TYPE_DIFF)
+               return TD_NO_PARENT;
+
+       err = vhd_parent_locator_get(&s->vhd, &parent_name);
+       if (err)
+               return err;
+
+       id->name = parent_name;
+
+       if (!vhd_parent_raw(&s->vhd)) {
+               id->drivertype = DISK_TYPE_VHD;
+               return 0;
+       }
+
+       /* raw parents are opened with the aio driver instead */
+       DPRINTF("VHD: parent is raw\n");
+       id->drivertype = DISK_TYPE_AIO;
+       return 0;
+}
+
+/* reset @list to the empty state (no head, no tail) */
+static inline void
+clear_req_list(struct vhd_req_list *list)
+{
+       list->tail = NULL;
+       list->head = NULL;
+}
+
+/*
+ * Append @e to the end of @list.  e->next is not modified, so the
+ * caller is responsible for having it terminated.
+ */
+static inline void
+add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
+{
+       if (list->head)
+               list->tail->next = e;
+       else
+               list->head = e;
+
+       list->tail = e;
+}
+
+/*
+ * Unlink @e from the singly-linked @list.
+ *
+ * Returns 0 if @e was found and removed, -EINVAL if it is not on the
+ * list (including when the list is empty).  e->next is left untouched.
+ */
+static inline int
+remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
+{
+       struct vhd_request *i = list->head;
+
+       if (list->head == e) {
+               if (list->tail == e)
+                       clear_req_list(list);
+               else
+                       list->head = list->head->next;
+               return 0;
+       }
+
+       /* guard against an empty list: there is no node to walk from */
+       while (i && i->next) {
+               if (i->next == e) {
+                       if (list->tail == e) {
+                               /* removing the last node: predecessor is the new tail */
+                               i->next = NULL;
+                               list->tail = i;
+                       } else
+                               i->next = i->next->next;
+                       return 0;
+               }
+               i = i->next;
+       }
+
+       return -EINVAL;
+}
+
+/* zero @req and bind it to its owning driver state @s */
+static inline void
+init_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+       memset(req, 0, sizeof(*req));
+       req->state = s;
+}
+
+/* reset transaction @tx to an all-zero state */
+static inline void
+init_tx(struct vhd_transaction *tx)
+{
+       memset(tx, 0, sizeof(*tx));
+}
+
+/*
+ * Attach request @r to the open transaction @tx: queue it on the
+ * transaction's request list, bump the started count and mark the
+ * transaction live.  The transaction must not be closed.
+ */
+static inline void
+add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
+{
+       ASSERT(!tx->closed);
+
+       add_to_tail(&tx->requests, r);
+       r->tx = tx;
+       tx->started += 1;
+       set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
+
+       DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
+           "started: %d, finished: %d, status: %u\n",
+           r->treq.sec / SPB, r->treq.sec, tx,
+           tx->started, tx->finished, tx->status);
+}
+
+/* non-zero once every request started in @tx has also finished */
+static inline int
+transaction_completed(struct vhd_transaction *tx)
+{
+       return (tx->finished == tx->started);
+}
+
+/*
+ * Reset the BAT update state: clear the status flags, forget the
+ * pending block and its offset, and detach the embedded BAT request
+ * from any transaction or list.
+ */
+static inline void
+init_bat(struct vhd_state *s)
+{
+       s->bat.status     = 0;
+       s->bat.pbw_blk    = 0;
+       s->bat.pbw_offset = 0;
+
+       s->bat.req.error  = 0;
+       s->bat.req.next   = NULL;
+       s->bat.req.tx     = NULL;
+}
+
+/* mark the BAT as locked by setting VHD_FLAG_BAT_LOCKED in its status */
+static inline void
+lock_bat(struct vhd_state *s)
+{
+       set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/* release the BAT lock by clearing VHD_FLAG_BAT_LOCKED in its status */
+static inline void
+unlock_bat(struct vhd_state *s)
+{
+       clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/* test whether VHD_FLAG_BAT_LOCKED is set in the BAT status */
+static inline int
+bat_locked(struct vhd_state *s)
+{
+       return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
+}
+
+/*
+ * Return @bm to a pristine state: block 0, no status flags, no LRU
+ * sequence number, empty request lists, a fresh transaction, and both
+ * the live map and its shadow copy zeroed over s->bm_secs sectors.
+ */
+static inline void
+init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       bm->status = 0;
+       bm->seqno  = 0;
+       bm->blk    = 0;
+
+       init_tx(&bm->tx);
+
+       clear_req_list(&bm->queue);
+       clear_req_list(&bm->waiting);
+
+       memset(bm->map,    0, vhd_sectors_to_bytes(s->bm_secs));
+       memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
+
+       init_vhd_request(s, &bm->req);
+}
+
+/*
+ * Look up the cached bitmap describing @block.
+ * Returns NULL if no cache slot holds a bitmap for that block.
+ */
+static inline struct vhd_bitmap *
+get_bitmap(struct vhd_state *s, uint32_t block)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cand = s->bitmap[slot];
+
+               if (!cand)
+                       continue;
+               if (cand->blk == block)
+                       return cand;
+       }
+
+       return NULL;
+}
+
+/* set VHD_FLAG_BM_LOCKED on @bm; remove_lru_bitmap() skips locked bitmaps */
+static inline void
+lock_bitmap(struct vhd_bitmap *bm)
+{
+       set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* clear VHD_FLAG_BM_LOCKED on @bm */
+static inline void
+unlock_bitmap(struct vhd_bitmap *bm)
+{
+       clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* test whether VHD_FLAG_BM_LOCKED is set on @bm */
+static inline int
+bitmap_locked(struct vhd_bitmap *bm)
+{
+       return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
+}
+
+/* a bitmap counts as valid whenever no read of it is pending */
+static inline int
+bitmap_valid(struct vhd_bitmap *bm)
+{
+       return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+}
+
+/*
+ * Returns 1 if @bm still has outstanding work attached to it: a
+ * pending read or write of the bitmap itself, a transaction that is
+ * updating the BAT, or any request on its waiting, transaction or
+ * queue lists.  Returns 0 otherwise.
+ */
+static inline int
+bitmap_in_use(struct vhd_bitmap *bm)
+{
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+               return 1;
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING))
+               return 1;
+       if (test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT))
+               return 1;
+       if (bm->waiting.head)
+               return 1;
+       if (bm->tx.requests.head)
+               return 1;
+       if (bm->queue.head)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Returns 1 if every bit in the first s->spb bits of @bm's map is set
+ * (one bit per sector of the block), 0 as soon as a byte with a clear
+ * bit is found.
+ */
+static inline int
+bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int byte, nr_bytes;
+
+       /* eight sectors are tracked per map byte */
+       nr_bytes = s->spb >> 3;
+
+       byte = 0;
+       while (byte < nr_bytes) {
+               if (bm->map[byte] != (char)0xFF)
+                       return 0;
+               byte++;
+       }
+
+       DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
+       return 1;
+}
+
+/*
+ * Evict the least recently used unlocked bitmap from the cache.
+ *
+ * Only bitmaps whose sequence number is strictly below s->bm_lru are
+ * candidates.  The chosen bitmap is removed from its cache slot and
+ * returned; NULL is returned if there is no candidate.
+ */
+static struct vhd_bitmap *
+remove_lru_bitmap(struct vhd_state *s)
+{
+       int slot, victim_slot = 0;
+       u64 oldest = s->bm_lru;
+       struct vhd_bitmap *victim = NULL;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cand = s->bitmap[slot];
+
+               if (!cand)
+                       continue;
+               if (!(cand->seqno < oldest))
+                       continue;
+               if (bitmap_locked(cand))
+                       continue;
+
+               victim      = cand;
+               victim_slot = slot;
+               oldest      = cand->seqno;
+       }
+
+       if (!victim)
+               return NULL;
+
+       s->bitmap[victim_slot] = NULL;
+       ASSERT(!bitmap_in_use(victim));
+
+       return victim;
+}
+
+/*
+ * Obtain a bitmap structure for block @blk, taking one from the free
+ * stack if possible and otherwise evicting the LRU cache entry.
+ *
+ * On success *bitmap points at a freshly initialized bitmap (not yet
+ * installed in the cache) and 0 is returned.  If nothing can be
+ * evicted, *bitmap is NULL and -EBUSY is returned.
+ */
+static int
+alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
+{
+       struct vhd_bitmap *fresh;
+
+       *bitmap = NULL;
+
+       if (s->bm_free_count <= 0) {
+               fresh = remove_lru_bitmap(s);
+               if (!fresh)
+                       return -EBUSY;
+       } else {
+               s->bm_free_count--;
+               fresh = s->bitmap_free[s->bm_free_count];
+       }
+
+       init_vhd_bitmap(s, fresh);
+       fresh->blk = blk;
+
+       *bitmap = fresh;
+       return 0;
+}
+
+/*
+ * Hand out the next LRU sequence number.
+ *
+ * When the counter reaches 0xffffffff, every cached bitmap's sequence
+ * number is halved and the counter restarts from the largest halved
+ * value before being incremented.
+ */
+static inline uint64_t
+__bitmap_lru_seqno(struct vhd_state *s)
+{
+       int slot;
+
+       if (s->bm_lru != 0xffffffff)
+               return ++s->bm_lru;
+
+       s->bm_lru = 0;
+
+       for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
+               struct vhd_bitmap *cached = s->bitmap[slot];
+
+               if (!cached)
+                       continue;
+
+               cached->seqno >>= 1;
+               if (cached->seqno > s->bm_lru)
+                       s->bm_lru = cached->seqno;
+       }
+
+       return ++s->bm_lru;
+}
+
+/* stamp @bm with the next LRU sequence number (most recently used) */
+static inline void
+touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       bm->seqno = __bitmap_lru_seqno(s);
+}
+
+/*
+ * Place @bm in the first empty cache slot and mark it most recently
+ * used.  Asserts if the cache has no empty slot.
+ */
+static inline void
+install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int slot = 0;
+
+       while (slot < VHD_CACHE_SIZE && s->bitmap[slot])
+               slot++;
+
+       if (slot < VHD_CACHE_SIZE) {
+               touch_bitmap(s, bm);
+               s->bitmap[slot] = bm;
+               return;
+       }
+
+       ASSERT(0);
+}
+
+/*
+ * Remove @bm from the bitmap cache and push it onto the free stack.
+ * The asserts require that @bm is currently in the cache, is not
+ * locked and has no outstanding work attached.
+ */
+static inline void
+free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int i;
+
+       /* find the cache slot holding bm; i == VHD_CACHE_SIZE if absent */
+       for (i = 0; i < VHD_CACHE_SIZE; i++)
+               if (s->bitmap[i] == bm)
+                       break;
+
+       ASSERT(!bitmap_locked(bm));
+       ASSERT(!bitmap_in_use(bm));
+       ASSERT(i < VHD_CACHE_SIZE);
+
+       s->bitmap[i] = NULL;
+       s->bitmap_free[s->bm_free_count++] = bm;
+}
+
+/*
+ * Classify logical @sector for an operation of type @op using only
+ * cached metadata (BAT, batmap and bitmap cache).
+ *
+ * Returns one of:
+ *   -EINVAL              sector lies beyond the last BAT entry
+ *   VHD_BM_BIT_SET       sector is present in this image
+ *   VHD_BM_BIT_CLEAR     block is allocated but the sector's bit is clear
+ *   VHD_BM_BAT_CLEAR     block is not allocated
+ *   VHD_BM_BAT_LOCKED    block is not allocated, @op is a data write and
+ *                        the BAT is locked on behalf of a different block
+ *   VHD_BM_NOT_CACHED    block is allocated but its bitmap is not cached
+ *   VHD_BM_READ_PENDING  the bitmap is cached but still being read
+ *
+ * A cache hit also bumps the bitmap's LRU sequence number.
+ */
+static int
+read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
+{
+       u32 blk, sec;
+       uint64_t idx;
+       struct vhd_bitmap *bm;
+
+       /* in fixed disks, every block is present */
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return VHD_BM_BIT_SET;
+
+       /*
+        * range-check the full 64-bit block index before narrowing it;
+        * valid BAT indices are 0 .. max_bat_size - 1
+        */
+       idx = sector / s->spb;
+       if (idx >= s->vhd.header.max_bat_size) {
+               DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
+                       sector, op);
+               return -EINVAL;
+       }
+
+       blk = idx;
+       sec = sector % s->spb;
+
+       if (bat_entry(s, blk) == DD_BLK_UNUSED) {
+               /* only one block allocation may be in flight at a time */
+               if (op == VHD_OP_DATA_WRITE &&
+                   s->bat.pbw_blk != blk && bat_locked(s))
+                       return VHD_BM_BAT_LOCKED;
+
+               return VHD_BM_BAT_CLEAR;
+       }
+
+       /* a batmap hit answers the query without consulting the bitmap */
+       if (test_batmap(s, blk)) {
+               DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
+               return VHD_BM_BIT_SET;
+       }
+
+       bm = get_bitmap(s, blk);
+       if (!bm)
+               return VHD_BM_NOT_CACHED;
+
+       /* bump lru count */
+       touch_bitmap(s, bm);
+
+       if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
+               return VHD_BM_READ_PENDING;
+
+       return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ?
+               VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
+}
+
+/*
+ * Count how many consecutive sectors, starting at @sector and limited
+ * to @nr_secs and to the end of the containing block, have their
+ * bitmap bit equal to @value.
+ *
+ * Fixed disks report the whole request; blocks flagged in the batmap
+ * report everything up to the end of the block.  Otherwise the block's
+ * bitmap must already be cached and valid.
+ */
+static int
+read_bitmap_cache_span(struct vhd_state *s, 
+                      uint64_t sector, int nr_secs, int value)
+{
+       int matched;
+       u32 blk, sec;
+       struct vhd_bitmap *bm;
+
+       /* in fixed disks, every block is present */
+       if (s->vhd.footer.type == HD_TYPE_FIXED)
+               return nr_secs;
+
+       blk = sector / s->spb;
+       sec = sector % s->spb;
+
+       if (test_batmap(s, blk))
+               return MIN(nr_secs, s->spb - sec);
+
+       bm = get_bitmap(s, blk);
+
+       ASSERT(bm && bitmap_valid(bm));
+
+       matched = 0;
+       while (sec < s->spb && matched < nr_secs) {
+               if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
+                       break;
+               sec++;
+               matched++;
+       }
+
+       return matched;
+}
+
+/*
+ * Pop a request off the free stack and initialize it for @s.
+ * Returns NULL when the free stack is empty.
+ */
+static inline struct vhd_request *
+alloc_vhd_request(struct vhd_state *s)
+{
+       struct vhd_request *fresh;
+
+       if (s->vreq_free_count <= 0)
+               return NULL;
+
+       s->vreq_free_count--;
+       fresh = s->vreq_free[s->vreq_free_count];
+
+       /* free requests are kept zeroed by free_vhd_request() */
+       ASSERT(fresh->treq.secs == 0);
+       init_vhd_request(s, fresh);
+
+       return fresh;
+}
+
+/* zero @req and push it back onto the free-request stack of @s */
+static inline void
+free_vhd_request(struct vhd_state *s, struct vhd_request *req)
+{
+       memset(req, 0, sizeof(struct vhd_request));
+       s->vreq_free[s->vreq_free_count++] = req;
+}
+
+/*
+ * Prepare the tiocb embedded in @req for a read of req->treq.secs
+ * sectors from byte @offset of the VHD file into req->treq.buf, with
+ * vhd_complete registered as the callback and @req as its argument,
+ * then queue it on the driver.  Updates the queued/read counters.
+ */
+static inline void
+aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+       struct tiocb *tiocb = &req->tiocb;
+
+       td_prep_read(tiocb, s->vhd.fd, req->treq.buf,
+                    vhd_sectors_to_bytes(req->treq.secs),
+                    offset, vhd_complete, req);
+       td_queue_tiocb(s->driver, tiocb);
+
+       /* statistics; read_size is accumulated in sectors */
+       s->queued++;
+       s->reads++;
+       s->read_size += req->treq.secs;
+       TRACE(s);
+}
+
+/*
+ * Prepare the tiocb embedded in @req for a write of req->treq.secs
+ * sectors from req->treq.buf to byte @offset of the VHD file, with
+ * vhd_complete registered as the callback and @req as its argument,
+ * then queue it on the driver.  Updates the queued/write counters.
+ */
+static inline void
+aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
+{
+       struct tiocb *tiocb = &req->tiocb;
+
+       td_prep_write(tiocb, s->vhd.fd, req->treq.buf,
+                     vhd_sectors_to_bytes(req->treq.secs),
+                     offset, vhd_complete, req);
+       td_queue_tiocb(s->driver, tiocb);
+
+       /* statistics; s->writes also makes _vhd_close() rewrite the footer */
+       s->queued++;
+       s->writes++;
+       s->write_size += req->treq.secs;
+       TRACE(s);
+}
+
+/*
+ * Record @blk as the pending block and choose the sector offset at
+ * which it will live: s->next_db, pushed forward by whatever gap is
+ * needed so that the data region following the bitmap starts on an
+ * s->spp sector boundary.
+ *
+ * Returns s->next_db unchanged; s->next_db itself is not advanced
+ * here.  No BAT write may have been started yet.
+ */
+static inline uint64_t
+reserve_new_block(struct vhd_state *s, uint32_t blk)
+{
+       int gap;
+
+       ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+       /* data region of segment should begin on page boundary */
+       gap = 0;
+       if ((s->next_db + s->bm_secs) % s->spp != 0)
+               gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+
+       s->bat.pbw_offset = s->next_db + gap;
+       s->bat.pbw_blk    = blk;
+
+       return s->next_db;
+}
+
+/*
+ * Queue a write of the 512-byte slice of the on-disk BAT containing
+ * the entry for the pending block (s->bat.pbw_blk), with that entry
+ * set to s->bat.pbw_offset.  The slice is staged in s->bat.bat_buf and
+ * written through the embedded request s->bat.req.
+ *
+ * The BAT must already be locked.  Always returns 0.
+ */
+static int
+schedule_bat_write(struct vhd_state *s)
+{
+       int i;
+       u32 blk;
+       char *buf;
+       u64 offset;
+       struct vhd_request *req;
+
+       ASSERT(bat_locked(s));
+
+       req = &s->bat.req;
+       buf = s->bat.bat_buf;
+       blk = s->bat.pbw_blk;
+
+       init_vhd_request(s, req);
+       /* copy the aligned group of 128 entries (512 bytes) that holds blk */
+       memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
+
+       /* patch the new entry into the staging copy only */
+       ((u32 *)buf)[blk % 128] = s->bat.pbw_offset;
+
+       /* NOTE(review): presumably converts each entry to big-endian for
+        * the on-disk format -- confirm against the BE32_OUT definition */
+       for (i = 0; i < 128; i++)
+               BE32_OUT(&((u32 *)buf)[i]);
+
+       /* byte offset of that group within the table (4 bytes per entry) */
+       offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
+       req->treq.secs = 1;
+       req->treq.buf  = buf;
+       req->op        = VHD_OP_BAT_WRITE;
+       req->next      = NULL;
+
+       aio_write(s, req, offset);
+       set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
+
+       DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
+           "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
+
+       return 0;
+}
+
+/*
+ * Queue a write of zeros that starts at sector @lb_end and runs
+ * through the bitmap sectors of the pending block (s->bat.pbw_blk),
+ * i.e. it covers the span from @lb_end to s->bat.pbw_offset plus
+ * s->bm_secs bitmap sectors.
+ *
+ * The write uses the embedded request s->bat.zero_req, which is added
+ * to @bm's transaction; @bm is locked.
+ */
+static void
+schedule_zero_bm_write(struct vhd_state *s,
+                      struct vhd_bitmap *bm, uint64_t lb_end)
+{
+       uint64_t offset;
+       struct vhd_request *req = &s->bat.zero_req;
+
+       init_vhd_request(s, req);
+
+       offset         = vhd_sectors_to_bytes(lb_end);
+       req->op        = VHD_OP_ZERO_BM_WRITE;
+       req->treq.sec  = s->bat.pbw_blk * s->spb;
+       req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
+       req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
+       req->next      = NULL;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
+           s->bat.pbw_blk, offset);
+
+       /* the request joins the transaction before it is submitted */
+       lock_bitmap(bm);
+       add_to_transaction(&bm->tx, req);
+       aio_write(s, req, offset);
+}
+
+/*
+ * Begin allocating block @blk, whose BAT entry must currently be
+ * unused: make sure an (empty) bitmap for it is in the cache, lock the
+ * BAT, reserve space for the block and queue the zeroed bitmap write.
+ * The bitmap's transaction is flagged VHD_FLAG_TX_UPDATE_BAT.
+ *
+ * If the BAT is already locked, @blk must be the block that is already
+ * pending and nothing further is done.
+ *
+ * Returns 0 on success or the error from alloc_vhd_bitmap().
+ */
+static int
+update_bat(struct vhd_state *s, uint32_t blk)
+{
+       int err;
+       uint64_t lb_end;
+       struct vhd_bitmap *bm;
+
+       ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+       
+       if (bat_locked(s)) {
+               ASSERT(s->bat.pbw_blk == blk);
+               return 0;
+       }
+
+       /* empty bitmap could already be in
+        * cache if earlier bat update failed */
+       bm = get_bitmap(s, blk);
+       if (!bm) {
+               /* install empty bitmap in cache */
+               err = alloc_vhd_bitmap(s, &bm, blk);
+               if (err) 
+                       return err;
+
+               install_bitmap(s, bm);
+       }
+
+       lock_bat(s);
+       /* lb_end is the value of s->next_db returned by the reservation */
+       lb_end = reserve_new_block(s, blk);
+       schedule_zero_bm_write(s, bm, lb_end);
+       set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
+
+       return 0;
+}
+
+/*
+ * Pre-allocating variant of block allocation: write zeros over the
+ * whole of new block @blk (alignment gap, bitmap and data sectors)
+ * with a blocking lseek()/write() at the current end of data, then
+ * lock the BAT and the block's bitmap and queue the BAT update as part
+ * of the bitmap's transaction.
+ *
+ * @blk's BAT entry must currently be unused.  If the BAT is already
+ * locked, @blk must be the pending block; -EBUSY is returned if the
+ * pending BAT request has recorded an error, 0 otherwise.
+ *
+ * Returns 0 on success, a negative errno value if the seek or write
+ * fails (-EIO for a short write), or the error from
+ * alloc_vhd_bitmap().
+ */
+static int
+allocate_block(struct vhd_state *s, uint32_t blk)
+{
+       int err, gap;
+       uint64_t offset, size;
+       struct vhd_bitmap *bm;
+
+       ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
+
+       if (bat_locked(s)) {
+               ASSERT(s->bat.pbw_blk == blk);
+               if (s->bat.req.error)
+                       return -EBUSY;
+               return 0;
+       }
+
+       gap            = 0;
+       s->bat.pbw_blk = blk;
+       /* zeroing starts at the old end of data, before any gap */
+       offset         = vhd_sectors_to_bytes(s->next_db);
+
+       /* data region of segment should begin on page boundary */
+       if ((s->next_db + s->bm_secs) % s->spp) {
+               gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
+               s->next_db += gap;
+       }
+
+       s->bat.pbw_offset = s->next_db;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
+           blk, s->bat.pbw_offset);
+
+       if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
+               /*
+                * capture errno before logging: the logging call may
+                * overwrite it, and returning a clobbered (possibly
+                * zero) value would report this failure as success
+                */
+               err = errno;
+               ERR(err, "lseek failed\n");
+               return -err;
+       }
+
+       size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
+       err  = write(s->vhd.fd, vhd_zeros(size), size);
+       if (err != size) {
+               /* -1 is a failed write; anything else is a short write */
+               err = (err == -1 ? -errno : -EIO);
+               ERR(err, "write failed");
+               return err;
+       }
+
+       /* empty bitmap could already be in
+        * cache if earlier bat update failed */
+       bm = get_bitmap(s, blk);
+       if (!bm) {
+               /* install empty bitmap in cache */
+               err = alloc_vhd_bitmap(s, &bm, blk);
+               if (err)
+                       return err;
+
+               install_bitmap(s, bm);
+       }
+
+       lock_bat(s);
+       lock_bitmap(bm);
+       schedule_bat_write(s);
+       add_to_transaction(&bm->tx, &s->bat.req);
+
+       return 0;
+}
+
+/*
+ * Queue a read of the data described by @treq from this image.
+ *
+ * For fixed disks the logical sector maps directly to a file offset.
+ * Otherwise the block must be allocated and its presence information
+ * (batmap or a valid cached bitmap) must already be available; the
+ * file offset is the block's BAT entry plus the bitmap sectors plus
+ * the sector's position within the block.
+ *
+ * Returns 0 once the read is queued, or -EBUSY if no request
+ * structure is free.
+ */
+static int 
+schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+       u64 offset;
+       u32 blk = 0, sec = 0;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED) {
+               offset = vhd_sectors_to_bytes(treq.sec);
+               goto make_request;
+       }
+
+       blk    = treq.sec / s->spb;
+       sec    = treq.sec % s->spb;
+       bm     = get_bitmap(s, blk);
+       offset = bat_entry(s, blk);
+
+       ASSERT(offset != DD_BLK_UNUSED);
+       ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
+
+       /* skip the block's bitmap, then index to the sector (in sectors) */
+       offset += s->bm_secs + sec;
+       offset  = vhd_sectors_to_bytes(offset);
+
+ make_request:
+       req = alloc_vhd_request(s);
+       if (!req) 
+               return -EBUSY;
+
+       req->treq  = treq;
+       req->flags = flags;
+       req->op    = VHD_OP_DATA_READ;
+       req->next  = NULL;
+
+       aio_read(s, req, offset);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+           "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
+           s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
+           treq.buf);
+
+       return 0;
+}
+
+/*
+ * Queue a write of the data described by @treq to this image.
+ *
+ * @flags selects the metadata work tied to the write:
+ *   VHD_FLAG_REQ_UPDATE_BAT     the block is not yet allocated; start
+ *                               its allocation (allocate_block() for
+ *                               pre-allocating opens, update_bat()
+ *                               otherwise) and write at the pending
+ *                               block's offset
+ *   VHD_FLAG_REQ_UPDATE_BITMAP  the block's bitmap must be updated, so
+ *                               the request joins the bitmap's
+ *                               transaction, or its queue if that
+ *                               transaction is already closed
+ *
+ * Returns 0 once the write is queued, the error from block allocation,
+ * or -EBUSY if no request structure is free.
+ */
+static int
+schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
+{
+       int err;
+       u64 offset;
+       u32 blk = 0, sec = 0;
+       struct vhd_bitmap  *bm = NULL;
+       struct vhd_request *req;
+
+       if (s->vhd.footer.type == HD_TYPE_FIXED) {
+               offset = vhd_sectors_to_bytes(treq.sec);
+               goto make_request;
+       }
+
+       blk    = treq.sec / s->spb;
+       sec    = treq.sec % s->spb;
+       offset = bat_entry(s, blk);
+
+       if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
+               if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
+                       err = allocate_block(s, blk);
+               else
+                       err = update_bat(s, blk);
+
+               if (err)
+                       return err;
+
+               /* BAT entry is still unused; use the reserved offset */
+               offset = s->bat.pbw_offset;
+       }
+
+       /* skip the block's bitmap, then index to the sector (in sectors) */
+       offset += s->bm_secs + sec;
+       offset  = vhd_sectors_to_bytes(offset);
+
+ make_request:
+       req = alloc_vhd_request(s);
+       if (!req)
+               return -EBUSY;
+
+       req->treq  = treq;
+       req->flags = flags;
+       req->op    = VHD_OP_DATA_WRITE;
+       req->next  = NULL;
+
+       if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
+               bm = get_bitmap(s, blk);
+               ASSERT(bm && bitmap_valid(bm));
+               lock_bitmap(bm);
+
+               /* a closed transaction accepts no new members; park the
+                * request on the bitmap's queue instead */
+               if (bm->tx.closed) {
+                       add_to_tail(&bm->queue, req);
+                       set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
+               } else
+                       add_to_transaction(&bm->tx, req);
+       }
+
+       aio_write(s, req, offset);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
+           "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
+           s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
+
+       return 0;
+}
+
+/*
+ * Queue a read of the on-disk bitmap for allocated block @blk into a
+ * newly allocated cache entry.  The entry is locked, installed in the
+ * cache and flagged VHD_FLAG_BM_READ_PENDING.
+ *
+ * The image must be dynamic, @blk must be allocated and its bitmap
+ * must not already be cached.
+ *
+ * Returns 0 once the read is queued, or the error from
+ * alloc_vhd_bitmap().
+ */
+static int 
+schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
+{
+       int err;
+       u64 offset;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req = NULL;
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+
+       /* the BAT entry is the sector at which the block's bitmap starts */
+       offset = bat_entry(s, blk);
+
+       ASSERT(offset != DD_BLK_UNUSED);
+       ASSERT(!get_bitmap(s, blk));
+
+       offset = vhd_sectors_to_bytes(offset);
+
+       err = alloc_vhd_bitmap(s, &bm, blk);
+       if (err)
+               return err;
+
+       /* the read uses the request embedded in the bitmap itself */
+       req = &bm->req;
+       init_vhd_request(s, req);
+
+       req->treq.sec  = blk * s->spb;
+       req->treq.secs = s->bm_secs;
+       req->treq.buf  = bm->map;
+       req->treq.cb   = NULL;
+       req->op        = VHD_OP_BITMAP_READ;
+       req->next      = NULL;
+
+       aio_read(s, req, offset);
+       lock_bitmap(bm);
+       install_bitmap(s, bm);
+       set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
+           "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
+           req->treq.secs, offset);
+
+       return 0;
+}
+
+/*
+ * Queue a write of the shadow copy of block @blk's bitmap to disk.
+ * The bitmap is locked, marked most recently used and flagged
+ * VHD_FLAG_BM_WRITE_PENDING.
+ *
+ * The bitmap must be cached and valid with no write already pending.
+ * If the block's BAT entry is still unused, the block must be the one
+ * the BAT is locked for, and the pending offset is used instead.
+ */
+static void
+schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
+{
+       u64 offset;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       bm     = get_bitmap(s, blk);
+       offset = bat_entry(s, blk);
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+       ASSERT(bm && bitmap_valid(bm) &&
+              !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+       if (offset == DD_BLK_UNUSED) {
+               ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
+               offset = s->bat.pbw_offset;
+       }
+       
+       offset = vhd_sectors_to_bytes(offset);
+
+       /* the write uses the request embedded in the bitmap itself */
+       req = &bm->req;
+       init_vhd_request(s, req);
+
+       req->treq.sec  = blk * s->spb;
+       req->treq.secs = s->bm_secs;
+       /* the shadow buffer, not the live map, is what goes to disk */
+       req->treq.buf  = bm->shadow;
+       req->treq.cb   = NULL;
+       req->op        = VHD_OP_BITMAP_WRITE;
+       req->next      = NULL;
+
+       aio_write(s, req, offset);
+       lock_bitmap(bm);
+       touch_bitmap(s, bm);     /* bump lru count */
+       set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+       DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
+           "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
+           req->treq.secs, offset);
+}
+
+/* 
+ * Park a data request of type @op on the waiting list of the bitmap
+ * for its block; the bitmap must be cached with its read still
+ * pending.  Queued requests will be submitted once the bitmap
+ * describing them is read and the requests are validated. 
+ *
+ * Returns 0 on success, or -EBUSY if no request structure is free.
+ */
+static int
+__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *req;
+
+       ASSERT(vhd_type_dynamic(&s->vhd));
+
+       blk = treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+       req = alloc_vhd_request(s);
+       if (!req)
+               return -EBUSY;
+
+       req->treq = treq;
+       req->op   = op;
+       req->next = NULL;
+
+       /* locked bitmaps are skipped by remove_lru_bitmap() */
+       add_to_tail(&bm->waiting, req);
+       lock_bitmap(bm);
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
+           "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
+
+       TRACE(s);
+       return 0;
+}
+
+/*
+ * tapdisk read hook.  The request is consumed piece by piece; each
+ * piece is a run of sectors within one block that share the state
+ * read_bitmap_cache() reports for the first sector of the run:
+ *
+ *   - block or sector not present here: the piece is passed to
+ *     td_forward_request()
+ *   - sector present: the data is read from this image
+ *   - bitmap not cached / still being read: the piece is parked until
+ *     the bitmap arrives (a bitmap read is started first if needed)
+ *
+ * On any error the whole remainder of the request is completed with
+ * that error and processing stops.
+ */
+static void
+vhd_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
+           s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+       while (treq.secs) {
+               int err;
+               td_request_t clone;
+
+               err   = 0;
+               clone = treq;
+
+               switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
+               case -EINVAL:
+                       err = -EINVAL;
+                       goto fail;
+
+               case VHD_BM_BAT_CLEAR:
+                       /* unallocated block: forward up to the block boundary */
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_BM_BIT_CLEAR:
+                       /* forward the run of sectors whose bits are clear */
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_BM_BIT_SET:
+                       /* read the run of sectors whose bits are set */
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+                       err = schedule_data_read(s, clone, 0);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_NOT_CACHED:
+                       /* fetch the bitmap, then park the piece behind it */
+                       err = schedule_bitmap_read(s, clone.sec / s->spb);
+                       if (err)
+                               goto fail;
+
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_READ_PENDING:
+                       /* bitmap read already in flight: just park the piece */
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_BAT_LOCKED:
+               default:
+                       /* BAT_LOCKED is only ever reported for writes */
+                       ASSERT(0);
+                       break;
+               }
+
+               /* advance past the piece that was just dispatched */
+               treq.sec  += clone.secs;
+               treq.secs -= clone.secs;
+               treq.buf  += vhd_sectors_to_bytes(clone.secs);
+               continue;
+
+       fail:
+               /* fail everything that has not been dispatched yet */
+               clone.secs = treq.secs;
+               td_complete_request(clone, err);
+               break;
+       }
+}
+
+/*
+ * tapdisk write hook.  The request is consumed piece by piece; each
+ * piece is a run of sectors within one block that share the state
+ * read_bitmap_cache() reports for the first sector of the run:
+ *
+ *   - block not allocated: write with both a BAT and a bitmap update
+ *   - sector bit clear: write with a bitmap update
+ *   - sector bit set: plain data write
+ *   - bitmap not cached / still being read: the piece is parked until
+ *     the bitmap arrives (a bitmap read is started first if needed)
+ *   - BAT locked for another block: fail with -EBUSY and mark the
+ *     request as blocked
+ *
+ * On any error the whole remainder of the request is completed with
+ * that error and processing stops.
+ */
+static void
+vhd_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
+           s->vhd.file, treq.sec, treq.secs, treq.sidx);
+
+       while (treq.secs) {
+               int err;
+               uint8_t flags;
+               td_request_t clone;
+
+               err   = 0;
+               flags = 0;
+               clone = treq;
+
+               switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
+               case -EINVAL:
+                       err = -EINVAL;
+                       goto fail;
+
+               case VHD_BM_BAT_LOCKED:
+                       /* another block's allocation is in flight */
+                       err = -EBUSY;
+                       clone.blocked = 1;
+                       goto fail;
+
+               case VHD_BM_BAT_CLEAR:
+                       /* first write to this block: allocate it as well */
+                       flags      = (VHD_FLAG_REQ_UPDATE_BAT |
+                                     VHD_FLAG_REQ_UPDATE_BITMAP);
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err        = schedule_data_write(s, clone, flags);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_BIT_CLEAR:
+                       /* run of sectors not yet marked in the bitmap */
+                       flags      = VHD_FLAG_REQ_UPDATE_BITMAP;
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
+                       err        = schedule_data_write(s, clone, flags);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_BIT_SET:
+                       /* run of sectors already marked: no metadata change */
+                       clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
+                       err = schedule_data_write(s, clone, 0);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_NOT_CACHED:
+                       /* fetch the bitmap, then park the piece behind it */
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = schedule_bitmap_read(s, clone.sec / s->spb);
+                       if (err)
+                               goto fail;
+
+                       err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_BM_READ_PENDING:
+                       /* bitmap read already in flight: just park the piece */
+                       clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
+                       err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               default:
+                       ASSERT(0);
+                       break;
+               }
+
+               /* advance past the piece that was just dispatched */
+               treq.sec  += clone.secs;
+               treq.secs -= clone.secs;
+               treq.buf  += vhd_sectors_to_bytes(clone.secs);
+               continue;
+
+       fail:
+               /* fail everything that has not been dispatched yet */
+               clone.secs = treq.secs;
+               td_complete_request(clone, err);
+               break;
+       }
+}
+
+/*
+ * Complete and release every request on the singly linked list 'list'.
+ *
+ * A non-zero 'error' overrides each request's own error code.  The state
+ * pointer is taken from the first request, and s->returned is bumped once
+ * per request released.
+ */
+static inline void
+signal_completion(struct vhd_request *list, int error)
+{
+       struct vhd_state *s;
+       struct vhd_request *cur, *following;
+
+       if (!list)
+               return;
+
+       s = list->state;
+
+       for (cur = list; cur; cur = following) {
+               int status = error ? error : cur->error;
+
+               /* remember the successor: 'cur' is released below */
+               following = cur->next;
+
+               td_complete_request(cur->treq, status);
+               DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+                   "err: %d\n", cur->treq.sec, cur->treq.sec / s->spb, status);
+               free_vhd_request(s, cur);
+
+               s->returned++;
+               TRACE(s);
+       }
+}
+
+/*
+ * Move every request queued on bm->queue into the bitmap's transaction.
+ *
+ * Does nothing when the queue is empty.  If the block still has no BAT
+ * entry (DD_BLK_UNUSED) the transaction is marked failed with -EIO.
+ * Requests whose data write already finished are counted as finished and,
+ * when they succeeded, their sectors are set in the shadow bitmap.  If
+ * that already completes the transaction, it is finished immediately.
+ */
+static void
+start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       int i;
+       struct vhd_transaction *tx;
+       struct vhd_request *r, *next;
+
+       if (!bm->queue.head)
+               return;
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+       /* detach the queued list; 'r' is non-NULL from here on */
+       r  = bm->queue.head;
+       tx = &bm->tx;
+       clear_req_list(&bm->queue);
+
+       if (bat_entry(s, bm->blk) == DD_BLK_UNUSED)
+               tx->error = -EIO;
+
+       while (r) {
+               next    = r->next;
+               r->next = NULL;
+               clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);
+
+               add_to_transaction(tx, r);
+               if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
+                       /* data write completed while the request was queued */
+                       tx->finished++;
+                       if (!r->error) {
+                               u32 sec = r->treq.sec % s->spb;
+                               for (i = 0; i < r->treq.secs; i++)
+                                       vhd_bitmap_set(&s->vhd,
+                                                      bm->shadow, sec + i);
+                       }
+               }
+               r = next;
+       }
+
+       /* perhaps all the queued writes already completed? */
+       if (tx->started && transaction_completed(tx))
+               finish_data_transaction(s, bm);
+}
+
+/*
+ * Release the BAT lock once the pending BAT write for bm's block is done.
+ *
+ * Returns without doing anything unless the BAT is locked and the pending
+ * BAT write (s->bat.pbw_blk) belongs to this bitmap's block.  The lock is
+ * released when the BAT request recorded no error, or when the bitmap's
+ * transaction is no longer live; otherwise the transaction is only marked
+ * closed and the BAT stays locked.
+ */
+static void
+finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       struct vhd_transaction *tx = &bm->tx;
+
+       if (!bat_locked(s))
+               return;
+
+       if (s->bat.pbw_blk != bm->blk)
+               return;
+
+       if (!s->bat.req.error)
+               goto release;
+
+       if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE))
+               goto release;
+
+       /* BAT write failed while the transaction is still live */
+       tx->closed = 1;
+       return;
+
+ release:
+       DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+       unlock_bat(s);
+       init_bat(s);
+}
+
+/*
+ * Commit or roll back a bitmap transaction and complete its requests.
+ *
+ * 'error' is recorded only if the transaction has no error yet.  Unless
+ * the image was opened with VHD_FLAG_OPEN_PREALLOCATE, a transaction that
+ * still has VHD_FLAG_TX_UPDATE_BAT set is parked in s->bat.req.tx and this
+ * function returns; finish_bat_write() calls back in once the BAT write
+ * has completed.
+ */
+static void
+finish_bitmap_transaction(struct vhd_state *s,
+                         struct vhd_bitmap *bm, int error)
+{
+       int map_size;
+       struct vhd_transaction *tx = &bm->tx;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
+       /* the first error recorded wins */
+       tx->error = (tx->error ? tx->error : error);
+       map_size  = vhd_sectors_to_bytes(s->bm_secs);
+
+       if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
+               if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
+                       /* still waiting for bat write */
+                       ASSERT(bm->blk == s->bat.pbw_blk);
+                       ASSERT(test_vhd_flag(s->bat.status, 
+                                            VHD_FLAG_BAT_WRITE_STARTED));
+                       s->bat.req.tx = tx;
+                       return;
+               }
+       }
+
+       if (tx->error) {
+               /* undo changes to shadow */
+               memcpy(bm->shadow, bm->map, map_size);
+       } else {
+               /* complete atomic write */
+               memcpy(bm->map, bm->shadow, map_size);
+               /* record in the batmap that this block's bitmap is now full */
+               if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
+                       set_batmap(s, bm->blk);
+       }
+
+       /* transaction done; signal completions */
+       signal_completion(tx->requests.head, tx->error);
+       init_tx(tx);
+       /* requests queued meanwhile form the next transaction */
+       start_new_bitmap_transaction(s, bm);
+
+       if (!bitmap_in_use(bm))
+               unlock_bitmap(bm);
+
+       finish_bat_transaction(s, bm);
+}
+
+/*
+ * Close the bitmap's transaction once all of its data writes are done.
+ *
+ * If no error was recorded, the bitmap write for the block is scheduled;
+ * otherwise the bitmap transaction is finished right away so the failure
+ * is propagated to the requests.
+ */
+static void
+finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
+{
+       struct vhd_transaction *tx = &bm->tx;
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
+
+       tx->closed = 1;
+
+       /*
+        * call, then return: a void function must not 'return' an
+        * expression in ISO C
+        */
+       if (!tx->error) {
+               schedule_bitmap_write(s, bm->blk);
+               return;
+       }
+
+       finish_bitmap_transaction(s, bm, 0);
+}
+
+/*
+ * Completion handler for a BAT write (VHD_OP_BAT_WRITE).
+ *
+ * On success the in-memory BAT entry of the pending block is set to the
+ * pending offset and s->next_db is advanced past it; on failure the error
+ * is recorded in the bitmap's transaction.  What happens next depends on
+ * whether the image was opened with VHD_FLAG_OPEN_PREALLOCATE.
+ */
+static void
+finish_bat_write(struct vhd_request *req)
+{
+       struct vhd_bitmap *bm;
+       struct vhd_transaction *tx;
+       struct vhd_state *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       bm = get_bitmap(s, s->bat.pbw_blk);
+
+       DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
+           s->bat.pbw_blk, s->bat.pbw_offset, req->error);
+       ASSERT(bm && bitmap_valid(bm));
+       ASSERT(bat_locked(s) &&
+              test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
+
+       tx = &bm->tx;
+       ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));
+
+       if (!req->error) {
+               bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
+               s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
+       } else
+               tx->error = req->error;
+
+       if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
+               /* the BAT request is counted as a member of the transaction */
+               tx->finished++;
+               remove_from_req_list(&tx->requests, req);
+               if (transaction_completed(tx))
+                       finish_data_transaction(s, bm);
+       } else {
+               clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+               /* set by finish_bitmap_transaction() when it had to wait for us */
+               if (s->bat.req.tx)
+                       finish_bitmap_transaction(s, bm, req->error);
+       }
+
+       finish_bat_transaction(s, bm);
+}
+
+/*
+ * Completion handler for VHD_OP_ZERO_BM_WRITE.
+ *
+ * The request is counted as finished and removed from its transaction.
+ * On failure the BAT is unlocked and re-initialised, and the error is
+ * recorded in the transaction; on success the BAT write is scheduled.
+ * The data transaction is finished if this was its last request.
+ */
+static void
+finish_zero_bm_write(struct vhd_request *req)
+{
+       u32 blk;
+       struct vhd_bitmap *bm;
+       struct vhd_transaction *tx = req->tx;
+       struct vhd_state *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       blk = req->treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+       ASSERT(bat_locked(s));
+       ASSERT(s->bat.pbw_blk == blk);
+       ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+       tx->finished++;
+       remove_from_req_list(&tx->requests, req);
+
+       if (req->error) {
+               /* abandon the BAT update: no BAT write will be issued */
+               unlock_bat(s);
+               init_bat(s);
+               tx->error = req->error;
+               clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
+       } else
+               schedule_bat_write(s);
+
+       if (transaction_completed(tx))
+               finish_data_transaction(s, bm);
+}
+
+/*
+ * Completion handler for a bitmap read (VHD_OP_BITMAP_READ).
+ *
+ * The list of requests waiting on the bitmap is detached and the
+ * read-pending flag cleared.  On success the shadow copy is initialised
+ * from the freshly read map and every waiter is re-issued through
+ * vhd_queue_read() / vhd_queue_write().  On failure the bitmap is unlocked
+ * and freed, and all waiters are completed with the read error.
+ */
+static void
+finish_bitmap_read(struct vhd_request *req)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_request *r, *next;
+       struct vhd_state   *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       blk = req->treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
+       ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
+
+       r = bm->waiting.head;
+       clear_req_list(&bm->waiting);
+       clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
+
+       if (req->error) {
+               int err = req->error;
+
+               unlock_bitmap(bm);
+               free_vhd_bitmap(s, bm);
+               /* bm is gone: complete the waiters and stop here */
+               signal_completion(r, err);
+               return;
+       }
+
+       memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));
+
+       while (r) {
+               struct vhd_request tmp;
+
+               /* snapshot the waiter and release it before re-issuing */
+               tmp  = *r;
+               next =  r->next;
+               free_vhd_request(s, r);
+
+               ASSERT(tmp.op == VHD_OP_DATA_READ ||
+                      tmp.op == VHD_OP_DATA_WRITE);
+
+               if (tmp.op == VHD_OP_DATA_READ)
+                       vhd_queue_read(s->driver, tmp.treq);
+               else if (tmp.op == VHD_OP_DATA_WRITE)
+                       vhd_queue_write(s->driver, tmp.treq);
+
+               r = next;
+       }
+
+       if (!bitmap_in_use(bm))
+               unlock_bitmap(bm);
+}
+
+/*
+ * Completion handler for a bitmap write (VHD_OP_BITMAP_WRITE).
+ *
+ * Clears the bitmap's write-pending flag and finishes the bitmap
+ * transaction with the result of the write.
+ */
+static void
+finish_bitmap_write(struct vhd_request *req)
+{
+       u32 blk;
+       struct vhd_bitmap  *bm;
+       struct vhd_transaction *tx;
+       struct vhd_state *s = req->state;
+
+       s->returned++;
+       TRACE(s);
+
+       blk = req->treq.sec / s->spb;
+       bm  = get_bitmap(s, blk);
+
+       /* validate the bitmap before touching its embedded transaction */
+       ASSERT(bm && bitmap_valid(bm));
+       tx  = &bm->tx;
+
+       DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
+           blk, tx->started, tx->finished);
+       ASSERT(tx->closed);
+       ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
+
+       clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
+
+       finish_bitmap_transaction(s, bm, req->error);
+}
+
+/*
+ * Completion handler for a data read (VHD_OP_DATA_READ): logs the request
+ * and completes it.  Passing 0 to signal_completion() means the request's
+ * own error code is the one reported.
+ */
+static void
+finish_data_read(struct vhd_request *req)
+{
+       struct vhd_state *s = req->state;
+
+       DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", 
+           req->treq.sec, req->treq.sec / s->spb);
+       signal_completion(req, 0);
+}
+
+/*
+ * Completion handler for a data write (VHD_OP_DATA_WRITE).
+ *
+ * The request is always flagged VHD_FLAG_REQ_FINISHED.  If it belongs to
+ * a transaction, the transaction's finished count is bumped, the written
+ * sectors are set in the shadow bitmap on success, and the data
+ * transaction is finished once every member has completed.  A request
+ * with no transaction is completed immediately unless it is still flagged
+ * VHD_FLAG_REQ_QUEUED; in that case start_new_bitmap_transaction()
+ * accounts for it when it picks the request up from the queue.
+ */
+static void
+finish_data_write(struct vhd_request *req)
+{
+       int i;
+       struct vhd_transaction *tx = req->tx;
+       struct vhd_state *s = (struct vhd_state *)req->state;
+
+       set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);
+
+       if (tx) {
+               u32 blk, sec;
+               struct vhd_bitmap *bm;
+
+               blk = req->treq.sec / s->spb;
+               sec = req->treq.sec % s->spb;
+               bm  = get_bitmap(s, blk);
+
+               ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));
+
+               tx->finished++;
+
+               /* "0x%04": the original "0x04%" printed a literal "04" */
+               DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
+                   "tx->started: %d, tx->finished: %d\n", req->treq.sec,
+                   req->treq.sec / s->spb, tx->started, tx->finished);
+
+               if (!req->error)
+                       for (i = 0; i < req->treq.secs; i++)
+                               vhd_bitmap_set(&s->vhd, bm->shadow,  sec + i);
+
+               if (transaction_completed(tx))
+                       finish_data_transaction(s, bm);
+
+       } else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
+               ASSERT(!req->next);
+               DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
+                   req->treq.sec, req->treq.sec / s->spb);
+               signal_completion(req, 0);
+       }
+}
+
+/*
+ * Completion callback for an I/O issued on behalf of a vhd_request.
+ *
+ * arg:   the struct vhd_request the I/O belongs to
+ * tiocb: the completed tiocb; only read for the error log message
+ * err:   result of the I/O, stored in req->error (non-zero is logged)
+ *
+ * Dispatches to the finish_*() handler matching req->op.
+ */
+void
+vhd_complete(void *arg, struct tiocb *tiocb, int err)
+{
+       struct vhd_request *req = (struct vhd_request *)arg;
+       struct vhd_state *s = req->state;
+       struct iocb *io = &tiocb->iocb;
+
+       s->completed++;
+       TRACE(s);
+
+       req->error = err;
+
+       if (req->error)
+               ERR(req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
+                   "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
+                   s->vhd.file, req->op, req->treq.sec, req->treq.secs,
+                   io->u.c.nbytes, req->treq.sec / s->spb,
+                   bat_entry(s, req->treq.sec / s->spb));
+
+       switch (req->op) {
+       case VHD_OP_DATA_READ:
+               finish_data_read(req);
+               break;
+
+       case VHD_OP_DATA_WRITE:
+               finish_data_write(req);
+               break;
+
+       case VHD_OP_BITMAP_READ:
+               finish_bitmap_read(req);
+               break;
+
+       case VHD_OP_BITMAP_WRITE:
+               finish_bitmap_write(req);
+               break;
+
+       case VHD_OP_ZERO_BM_WRITE:
+               finish_zero_bm_write(req);
+               break;
+
+       case VHD_OP_BAT_WRITE:
+               finish_bat_write(req);
+               break;
+
+       default:
+               /* unknown operation code */
+               ASSERT(0);
+               break;
+       }
+}
+
+/*
+ * Dump the driver's state to the log at TLOG_WARN: the request counters
+ * and average I/O sizes, every vhd_request slot with a non-zero sector
+ * count, the queue lengths and transaction state of each cached bitmap,
+ * and the BAT status.
+ */
+void 
+vhd_debug(td_driver_t *driver)
+{
+       int i;
+       struct vhd_state *s = (struct vhd_state *)driver->data;
+
+       DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
+           "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
+           s->returned);
+       /* the ternaries guard the averages against division by zero */
+       DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
+           s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
+       DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
+           s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));
+
+       DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%lu total)\n", VHD_REQS_DATA);
+       for (i = 0; i < VHD_REQS_DATA; i++) {
+               struct vhd_request *r = &s->vreq_list[i];
+               td_request_t *t       = &r->treq;
+               /* only slots with a non-zero sector count are printed */
+               if (t->secs)
+                       DBG(TLOG_WARN, "%d: id: 0x%04"PRIx64", err: %d, op: %d,"
+                           " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
+                           "next: %p, tx: %p\n", i, t->id, r->error, r->op,
+                           t->sec, r->flags, r, r->next, r->tx);
+       }
+
+       DBG(TLOG_WARN, "BITMAP CACHE:\n");
+       for (i = 0; i < VHD_CACHE_SIZE; i++) {
+               /* lengths of the queue, waiting and transaction lists */
+               int qnum = 0, wnum = 0, rnum = 0;
+               struct vhd_bitmap *bm = s->bitmap[i];
+               struct vhd_transaction *tx;
+               struct vhd_request *r;
+
+               if (!bm)
+                       continue;
+
+               tx = &bm->tx;
+               r = bm->queue.head;
+               while (r) {
+                       qnum++;
+                       r = r->next;
+               }
+
+               r = bm->waiting.head;
+               while (r) {
+                       wnum++;
+                       r = r->next;
+               }
+
+               r = tx->requests.head;
+               while (r) {
+                       rnum++;
+                       r = r->next;
+               }
+
+               DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
+                   "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
+                   "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
+                   i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head,
+                   wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error,
+                   tx->started, tx->finished, tx->status, tx->requests.head, rnum);
+       }
+
+       DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
+           "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
+           s->bat.pbw_offset, s->bat.req.tx);
+
+/*
+       for (i = 0; i < s->hdr.max_bat_size; i++)
+               DPRINTF("%d: %u\n", i, s->bat.bat[i]);
+*/
+}
+
+/*
+ * Driver descriptor for the VHD backend: binds this file's entry points
+ * to the tap_disk callbacks and gives the size of the per-image private
+ * state (struct vhd_state).
+ */
+struct tap_disk tapdisk_vhd = {
+       .disk_type          = "tapdisk_vhd",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct vhd_state),
+       .td_open            = _vhd_open,
+       .td_close           = _vhd_close,
+       .td_queue_read      = vhd_queue_read,
+       .td_queue_write     = vhd_queue_write,
+       .td_get_parent_id   = vhd_get_parent_id,
+       .td_validate_parent = vhd_validate_parent,
+       .td_debug           = vhd_debug,
+};
diff --git a/tools/blktap2/drivers/bswap.h b/tools/blktap2/drivers/bswap.h
new file mode 100644 (file)
index 0000000..45016b9
--- /dev/null
@@ -0,0 +1,214 @@
+#ifndef BSWAP_H
+#define BSWAP_H
+
+//#include "config-host.h"
+
+#include <inttypes.h>
+
+#if defined(__NetBSD__)
+#include <sys/endian.h>
+#include <sys/types.h>
+#elif defined(__OpenBSD__)
+#include <machine/endian.h>
+#define bswap_16(x) swap16(x)
+#define bswap_32(x) swap32(x)
+#define bswap_64(x) swap64(x)
+#else
+
+#ifdef HAVE_BYTESWAP_H
+#include <byteswap.h>
+#else
+
+/*
+ * Fallback byte-swap macros, used when HAVE_BYTESWAP_H is not defined.
+ * Each one is a GNU statement expression that copies its argument into a
+ * local first, so the argument is evaluated exactly once.
+ */
+
+/* swap the two bytes of a 16-bit value */
+#define bswap_16(x) \
+({ \
+       uint16_t __x = (x); \
+       ((uint16_t)( \
+               (((uint16_t)(__x) & (uint16_t)0x00ffU) << 8) | \
+               (((uint16_t)(__x) & (uint16_t)0xff00U) >> 8) )); \
+})
+
+/* reverse the four bytes of a 32-bit value */
+#define bswap_32(x) \
+({ \
+       uint32_t __x = (x); \
+       ((uint32_t)( \
+               (((uint32_t)(__x) & (uint32_t)0x000000ffUL) << 24) | \
+               (((uint32_t)(__x) & (uint32_t)0x0000ff00UL) <<  8) | \
+               (((uint32_t)(__x) & (uint32_t)0x00ff0000UL) >>  8) | \
+               (((uint32_t)(__x) & (uint32_t)0xff000000UL) >> 24) )); \
+})
+
+/* reverse the eight bytes of a 64-bit value */
+#define bswap_64(x) \
+({ \
+       uint64_t __x = (x); \
+       ((uint64_t)( \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000000000ffULL) << 56) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000000000ff00ULL) << 40) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00000000ff000000ULL) <<  8) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x000000ff00000000ULL) >>  8) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \
+               (uint64_t)(((uint64_t)(__x) & (uint64_t)0xff00000000000000ULL) >> 56) )); \
+})
+
+#endif /* !HAVE_BYTESWAP_H */
+
+/* typed wrappers around bswap_16/32/64: return the byte-swapped value */
+static inline uint16_t bswap16(uint16_t x)
+{
+    return bswap_16(x);
+}
+
+static inline uint32_t bswap32(uint32_t x) 
+{
+    return bswap_32(x);
+}
+
+static inline uint64_t bswap64(uint64_t x) 
+{
+    return bswap_64(x);
+}
+
+/* in-place variants: byte-swap the value that s points to */
+static inline void bswap16s(uint16_t *s)
+{
+    *s = bswap16(*s);
+}
+
+static inline void bswap32s(uint32_t *s)
+{
+    *s = bswap32(*s);
+}
+
+static inline void bswap64s(uint64_t *s)
+{
+    *s = bswap64(*s);
+}
+
+#endif
+
+/*
+ * Helper macros for CPU_CONVERT: <endian>_bswap(v, size) yields v
+ * converted between CPU order and <endian> order, and <endian>_bswaps(p,
+ * size) converts *p in place.  The conversion matching the host's own
+ * byte order (selected by WORDS_BIGENDIAN) expands to a no-op.  The
+ * in-place forms carry their own trailing semicolon.
+ */
+#if defined(WORDS_BIGENDIAN)
+#define be_bswap(v, size) (v)
+#define le_bswap(v, size) bswap ## size(v)
+#define be_bswaps(v, size)
+#define le_bswaps(p, size) *p = bswap ## size(*p);
+#else
+#define le_bswap(v, size) (v)
+#define be_bswap(v, size) bswap ## size(v)
+#define le_bswaps(v, size)
+#define be_bswaps(p, size) *p = bswap ## size(*p);
+#endif
+
+/*
+ * CPU_CONVERT(endian, size, type) defines six static inline converters
+ * for one byte order and width, e.g. for (be, 32, uint32_t):
+ *
+ *   be32_to_cpu(v)     / cpu_to_be32(v)      convert a value
+ *   be32_to_cpus(p)    / cpu_to_be32s(p)     convert *p in place
+ *   be32_to_cpup(p)                          read *p and convert it
+ *   cpu_to_be32w(p, v)                       convert v and store it in *p
+ */
+#define CPU_CONVERT(endian, size, type)\
+static inline type endian ## size ## _to_cpu(type v)\
+{\
+    return endian ## _bswap(v, size);\
+}\
+\
+static inline type cpu_to_ ## endian ## size(type v)\
+{\
+    return endian ## _bswap(v, size);\
+}\
+\
+static inline void endian ## size ## _to_cpus(type *p)\
+{\
+    endian ## _bswaps(p, size)\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## s(type *p)\
+{\
+    endian ## _bswaps(p, size)\
+}\
+\
+static inline type endian ## size ## _to_cpup(const type *p)\
+{\
+    return endian ## size ## _to_cpu(*p);\
+}\
+\
+static inline void cpu_to_ ## endian ## size ## w(type *p, type v)\
+{\
+     *p = cpu_to_ ## endian ## size(v);\
+}
+
+/* big-endian converters for 16, 32 and 64 bits */
+CPU_CONVERT(be, 16, uint16_t)
+CPU_CONVERT(be, 32, uint32_t)
+CPU_CONVERT(be, 64, uint64_t)
+
+/* little-endian converters for 16, 32 and 64 bits */
+CPU_CONVERT(le, 16, uint16_t)
+CPU_CONVERT(le, 32, uint32_t)
+CPU_CONVERT(le, 64, uint64_t)
+
+/* unaligned versions (optimized for frequent unaligned accesses)*/
+
+#if defined(__i386__) || defined(__powerpc__)
+
+#define cpu_to_le16wu(p, v) cpu_to_le16w(p, v)
+#define cpu_to_le32wu(p, v) cpu_to_le32w(p, v)
+#define le16_to_cpupu(p) le16_to_cpup(p)
+#define le32_to_cpupu(p) le32_to_cpup(p)
+
+#define cpu_to_be16wu(p, v) cpu_to_be16w(p, v)
+#define cpu_to_be32wu(p, v) cpu_to_be32w(p, v)
+
+#else
+
+/* store v at p as two bytes, least-significant first, one byte at a time */
+static inline void cpu_to_le16wu(uint16_t *p, uint16_t v)
+{
+    uint8_t *dst = (uint8_t *)p;
+
+    dst[0] = (uint8_t)(v & 0xff);
+    dst[1] = (uint8_t)(v >> 8);
+}
+
+/* store v at p as four bytes, least-significant first, one byte at a time */
+static inline void cpu_to_le32wu(uint32_t *p, uint32_t v)
+{
+    uint8_t *dst = (uint8_t *)p;
+    int idx;
+
+    for (idx = 0; idx < 4; idx++)
+        dst[idx] = (uint8_t)(v >> (8 * idx));
+}
+
+/* read a little-endian 16-bit value from p one byte at a time */
+static inline uint16_t le16_to_cpupu(const uint16_t *p)
+{
+    const uint8_t *src = (const uint8_t *)p;
+    uint16_t low  = src[0];
+    uint16_t high = src[1];
+
+    return (uint16_t)(low | (high << 8));
+}
+
+/*
+ * Read a little-endian 32-bit value from p one byte at a time.
+ *
+ * Each byte is widened to uint32_t before shifting: a bare uint8_t is
+ * promoted to (signed) int, and shifting a byte >= 0x80 left by 24 would
+ * overflow into the sign bit, which is undefined behaviour.
+ */
+static inline uint32_t le32_to_cpupu(const uint32_t *p)
+{
+    const uint8_t *p1 = (const uint8_t *)p;
+
+    return  (uint32_t)p1[0]        |
+           ((uint32_t)p1[1] <<  8) |
+           ((uint32_t)p1[2] << 16) |
+           ((uint32_t)p1[3] << 24);
+}
+
+/* store v at p as two bytes, most-significant first, one byte at a time */
+static inline void cpu_to_be16wu(uint16_t *p, uint16_t v)
+{
+    uint8_t *dst = (uint8_t *)p;
+
+    dst[0] = (uint8_t)(v >> 8);
+    dst[1] = (uint8_t)(v & 0xff);
+}
+
+/* store v at p as four bytes, most-significant first, one byte at a time */
+static inline void cpu_to_be32wu(uint32_t *p, uint32_t v)
+{
+    uint8_t *dst = (uint8_t *)p;
+    int idx;
+
+    for (idx = 0; idx < 4; idx++)
+        dst[idx] = (uint8_t)(v >> (24 - 8 * idx));
+}
+
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define cpu_to_32wu cpu_to_be32wu
+#else
+#define cpu_to_32wu cpu_to_le32wu
+#endif
+
+#undef le_bswap
+#undef be_bswap
+#undef le_bswaps
+#undef be_bswaps
+
+#endif /* BSWAP_H */
diff --git a/tools/blktap2/drivers/check_gcrypt b/tools/blktap2/drivers/check_gcrypt
new file mode 100644 (file)
index 0000000..154ba24
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void) { return 0; }
+EOF
+
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+  echo "yes"
+else
+  echo "no"
+fi
+
+rm -f .gcrypt*
diff --git a/tools/blktap2/drivers/disktypes.h b/tools/blktap2/drivers/disktypes.h
new file mode 100644 (file)
index 0000000..d0923f1
--- /dev/null
@@ -0,0 +1,184 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DISKTYPES_H__
+#define __DISKTYPES_H__
+
+/* static description of one disk type known to the tools */
+typedef struct disk_info {
+       int  idnum;          /* DISK_TYPE_* id; -1 in the null placeholder */
+       char name[50];       /* e.g. "RAMDISK" */
+       char handle[10];     /* xend handle, e.g. 'ram' */
+       int  single_handler; /* is there a single controller for all */
+                            /* instances of disk type? */
+#ifdef TAPDISK
+       struct tap_disk *drv; /* driver descriptor; only in TAPDISK builds */
+#endif
+} disk_info_t;
+
+/* driver descriptors referenced by the tables below (defined elsewhere) */
+extern struct tap_disk tapdisk_aio;
+/* extern struct tap_disk tapdisk_sync;    */
+/* extern struct tap_disk tapdisk_vmdk;    */
+/* extern struct tap_disk tapdisk_vhdsync; */
+extern struct tap_disk tapdisk_vhd;
+extern struct tap_disk tapdisk_ram;
+ extern struct tap_disk tapdisk_qcow; 
+extern struct tap_disk tapdisk_block_cache;
+extern struct tap_disk tapdisk_log;
+
+#define MAX_DISK_TYPES        20
+
+/*
+ * Disk type ids.  Each value matches the position of that type's entry in
+ * dtypes[] below; 8 is not assigned and is held by a null_disk
+ * placeholder there.
+ */
+#define DISK_TYPE_AIO         0
+#define DISK_TYPE_SYNC        1
+#define DISK_TYPE_VMDK        2
+#define DISK_TYPE_VHDSYNC     3
+#define DISK_TYPE_VHD         4
+#define DISK_TYPE_RAM         5
+#define DISK_TYPE_QCOW        6
+#define DISK_TYPE_BLOCK_CACHE 7
+#define DISK_TYPE_LOG         9
+
+/*Define Individual Disk Parameters here */
+/* placeholder entry: id -1 and no driver */
+static disk_info_t null_disk = {
+       .idnum          = -1,
+       .name           = "null disk",
+       .handle         = "null",
+       .single_handler = 0,
+#ifdef TAPDISK
+       .drv            = 0,
+#endif
+};
+
+/* raw image driven by tapdisk_aio */
+static disk_info_t aio_disk = {
+       .idnum          = DISK_TYPE_AIO,
+       .name           = "raw image (aio)",
+       .handle         = "aio",
+       .single_handler = 0,
+#ifdef TAPDISK
+       .drv            = &tapdisk_aio,
+#endif
+};
+/*
+static disk_info_t sync_disk = {
+       DISK_TYPE_SYNC,
+       "raw image (sync)",
+       "sync",
+       0,
+#ifdef TAPDISK
+       &tapdisk_sync,
+#endif
+};
+
+static disk_info_t vmdk_disk = {
+       DISK_TYPE_VMDK,
+       "vmware image (vmdk)",
+       "vmdk",
+       1,
+#ifdef TAPDISK
+       &tapdisk_vmdk,
+#endif
+};
+
+static disk_info_t vhdsync_disk = {
+       DISK_TYPE_VHDSYNC,
+       "virtual server image (vhd) - synchronous",
+       "vhdsync",
+       1,
+#ifdef TAPDISK
+       &tapdisk_vhdsync,
+#endif
+};
+*/
+
+/* VHD image driven by tapdisk_vhd */
+static disk_info_t vhd_disk = {
+       .idnum          = DISK_TYPE_VHD,
+       .name           = "virtual server image (vhd)",
+       .handle         = "vhd",
+       .single_handler = 0,
+#ifdef TAPDISK
+       .drv            = &tapdisk_vhd,
+#endif
+};
+
+
+/* ramdisk image driven by tapdisk_ram; flagged single_handler */
+static disk_info_t ram_disk = {
+       .idnum          = DISK_TYPE_RAM,
+       .name           = "ramdisk image (ram)",
+       .handle         = "ram",
+       .single_handler = 1,
+#ifdef TAPDISK
+       .drv            = &tapdisk_ram,
+#endif
+};
+
+
+/* qcow image driven by tapdisk_qcow */
+static disk_info_t qcow_disk = {
+       .idnum          = DISK_TYPE_QCOW,
+       .name           = "qcow disk (qcow)",
+       .handle         = "qcow",
+       .single_handler = 0,
+#ifdef TAPDISK
+       .drv            = &tapdisk_qcow,
+#endif
+};
+
+
+/* block cache image driven by tapdisk_block_cache; flagged single_handler */
+static disk_info_t block_cache_disk = {
+       .idnum          = DISK_TYPE_BLOCK_CACHE,
+       .name           = "block cache image (bc)",
+       .handle         = "bc",
+       .single_handler = 1,
+#ifdef TAPDISK
+       .drv            = &tapdisk_block_cache,
+#endif
+};
+
+/* write logger driven by tapdisk_log */
+static disk_info_t log_disk = {
+       .idnum          = DISK_TYPE_LOG,
+       .name           = "write logger (log)",
+       .handle         = "log",
+       .single_handler = 0,
+#ifdef TAPDISK
+       .drv            = &tapdisk_log,
+#endif
+};
+
+/*Main disk info array */
+/*
+ * Table of disk descriptors; each DISK_TYPE_* value is the position of
+ * its entry.  Disabled or unassigned ids hold &null_disk.  The table has
+ * 10 entries, fewer than MAX_DISK_TYPES.
+ */
+static disk_info_t *dtypes[] = {
+       &aio_disk,          /* 0: DISK_TYPE_AIO */
+       &null_disk, /* &sync_disk, */
+       &null_disk, /* &vmdk_disk, */
+        &null_disk, /* &vhdsync_disk, */
+       &vhd_disk,          /* 4: DISK_TYPE_VHD */
+       &ram_disk,          /* 5: DISK_TYPE_RAM */
+       &qcow_disk,         /* 6: DISK_TYPE_QCOW */
+       &block_cache_disk,  /* 7: DISK_TYPE_BLOCK_CACHE */
+       &null_disk,         /* 8: unassigned */
+       &log_disk,          /* 9: DISK_TYPE_LOG */
+};
+
+#endif
diff --git a/tools/blktap2/drivers/img2qcow.c b/tools/blktap2/drivers/img2qcow.c
new file mode 100644 (file)
index 0000000..b12509d
--- /dev/null
@@ -0,0 +1,318 @@
+/* img2qcow.c
+ *
+ * Generates a qcow format disk and fills it from an existing image.
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <zlib.h>
+#include <inttypes.h>
+#include <libaio.h>
+#include <openssl/md5.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+#include "blk.h"
+
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE    0
+#endif
+
+
+#define TAPDISK 1
+/* Size and alignment (bytes) of each buffer read from the source image. */
+#define BLOCK_PROCESSSZ 4096
+/* Id under which the qcow vbd is registered with and looked up from the server. */
+#define QCOW_VBD 0
+/* Percentage represented by one step of the progress bar. */
+#define PROGRESS_QUANT 2
+
+/* main() loop control: keep looping / all source sectors have been queued. */
+static int running = 1, complete = 0;
+/* Completed writes (bumped in send_responses) vs. writes queued by main(). */
+static int returned_events = 0, submit_events = 0;
+/* Not referenced by the code visible in this file. */
+static uint32_t read_idx = 0;
+/* Driver of the first image of qcow_vbd; target of the queued writes. */
+td_driver_t *ddqcow;
+/* The vbd through which the destination qcow file is opened. */
+td_vbd_t* qcow_vbd;
+/* prev: number of progress-bar steps already drawn; written: unused here. */
+static uint64_t prev = 0, written = 0;
+/* Text of the progress bar: '[' + one cell per step + ']' + NUL + slack. */
+static char output[(100/PROGRESS_QUANT) + 5];
+
+/* Global tapdisk server instance; its aio queue and scheduler are used by main(). */
+extern tapdisk_server_t server;
+
+
+/*
+ * Debug helper: hex-dump `length` bytes starting at `ptr` to stderr.
+ *
+ * Output is 16 bytes per row, grouped in pairs, each byte printed as
+ * exactly two hex digits so that e.g. 0x01 0x10 cannot be mistaken for
+ * 0x11 0x00.
+ */
+static void print_bytes(void *ptr, int length)
+{
+       const unsigned char *p = ptr;
+       int k;
+
+       DFPRINTF("Buf dump, length %d:\n", length);
+       for (k = 0; k < length; k++) {
+               DFPRINTF("%02x", p[k]);
+               /* break after every 16th byte, separate every 2nd one */
+               if ((k + 1) % 16 == 0)
+                       DFPRINTF("\n");
+               else if ((k + 1) % 2 == 0)
+                       DFPRINTF(" ");
+       }
+       DFPRINTF("\n");
+}
+
+/*
+ * Advance the textual progress bar on stderr.
+ *
+ * progress: number of sectors processed so far
+ * size:     total number of sectors
+ *
+ * At most one bar step is drawn per call.  Nothing is drawn when the
+ * image is smaller than one step per bar cell (the step size would be
+ * zero) or once the bar is full; both cases used to end in a division by
+ * zero or a write past the end of `output`.
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+       const uint64_t steps = 100 / PROGRESS_QUANT;
+       /* number of sectors represented by one bar step */
+       uint64_t blocks = size / steps;
+
+       if (blocks == 0 || prev >= steps)
+               return;
+
+       if (progress / blocks > prev) {
+               memcpy(output + prev + 1, "=>", 2);
+               prev++;
+               DFPRINTF("\r%s     %"PRIi64"%%",
+                        output, (int64_t)((prev-1)*PROGRESS_QUANT));
+       }
+}
+
+/*
+ * Fill in the size (in sectors) and sector size of the image open on fd.
+ *
+ * Block devices are queried through blk_getimagesize() and
+ * blk_getsectorsize(); anything else is sized from fstat().  The sector
+ * size falls back to DEFAULT_SECTOR_SIZE whenever it cannot be queried,
+ * so driver->sector_size is always initialised on success.
+ *
+ * Returns 0 on success, -EINVAL if the image cannot be examined.
+ */
+static int get_image_info(td_disk_info_t *driver, int fd)
+{
+       int ret;
+       struct stat stat;
+       uint64_t sector_size = DEFAULT_SECTOR_SIZE;
+
+       ret = fstat(fd, &stat);
+       if (ret != 0) {
+               DFPRINTF("ERROR: fstat failed, Couldn't stat image\n");
+               return -EINVAL;
+       }
+
+       /* default; only overridden when the device reports its own value */
+       driver->sector_size = DEFAULT_SECTOR_SIZE;
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               if (blk_getimagesize(fd, &driver->size) != 0)
+                       return -EINVAL;
+
+               DFPRINTF("Image size: \n\tpre sector_shift  [%"PRIu64"]\n\tpost "
+                       "sector_shift [%"PRIu64"]\n",
+                       (uint64_t)(driver->size << SECTOR_SHIFT),
+                       (uint64_t)driver->size);
+
+               /*Get the sector size*/
+               if (!blk_getsectorsize(fd, &sector_size))
+                       driver->sector_size = sector_size;
+
+       } else {
+               /*Local file? try fstat instead*/
+               driver->size = (stat.st_size >> SECTOR_SHIFT);
+               DFPRINTF("Image size: [%"PRIu64"]\n",
+                       (uint64_t)driver->size);
+       }
+
+       return 0;
+}
+
+/*
+ * Completion callback for the writes queued by main().
+ *
+ * treq: the completed request; treq.buf is the buffer main() allocated
+ * err:  negative on failure
+ *
+ * A failed write is fatal: the destination image would be incomplete,
+ * and silently dropping the event (as was done before) left main()
+ * waiting forever for returned_events to catch up with submit_events.
+ */
+void send_responses(td_request_t treq, int err)
+{
+       if (err < 0) {
+               DFPRINTF("AIO FAILURE: res [%d]!\n", err);
+               free(treq.buf);
+               exit(-1);
+       }
+
+       returned_events++;
+
+       free(treq.buf);
+}
+
+/*
+ * img2qcow <QCOW FILENAME> <SRC IMAGE>
+ *
+ * Creates a qcow file sized like the source image, opens it through a
+ * tapdisk vbd and copies the source into it BLOCK_PROCESSSZ bytes at a
+ * time.  Each chunk is queued as a write on the qcow driver and the loop
+ * waits for its completion (send_responses) before reading the next one.
+ *
+ * Exits with a non-zero status on usage, I/O or allocation errors;
+ * returns the tapdisk error code if the qcow vbd cannot be set up and 0
+ * on success.
+ */
+int main(int argc, const char *argv[])
+{
+       int ret = -1, fd, len, err;
+       uint64_t i;
+       char *buf;
+       td_request_t treq;
+       td_disk_info_t info;
+       td_vbd_request_t *vreq;
+
+       if (argc != 3) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, "usage: %s <QCOW FILENAME> <SRC IMAGE>\n",
+                       argv[0]);
+               exit(-1);
+       }
+
+       /*Open image*/
+       fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+       if (fd == -1) {
+               DFPRINTF("Unable to open [%s], (err %d)!\n",argv[2],0 - errno);
+               exit(-1);
+       }
+
+       /* info is only meaningful if this succeeds */
+       if (get_image_info(&info, fd) != 0) {
+               DFPRINTF("Unable to determine size of [%s]\n", argv[2]);
+               exit(-1);
+       }
+
+       /*Create qcow file*/
+       ret = qcow_create(argv[1],info.size<<SECTOR_SHIFT,NULL,0);
+       if (ret < 0) {
+               DFPRINTF("Unable to create QCOW file\n");
+               exit(-1);
+       } else DFPRINTF("Qcow file created: size %"PRIu64" sectors\n",
+                       (uint64_t)info.size);
+
+       /* Open Qcow image*/
+       err = tapdisk_server_initialize(NULL, NULL);
+       if (err) {
+               DPRINTF("img2qcow Couldn't initialize server instance.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_initialize(-1, -1, QCOW_VBD);
+       if (err) {
+               DPRINTF("img2qcow Couldn't initialize qcow vbd.\n");
+               return err;
+       }
+
+       qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+       if (!qcow_vbd) {
+               err = -ENODEV;
+               DPRINTF("img2qcow Couldn't create qcow vbd.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open_vdi(qcow_vbd, argv[1], DISK_TYPE_QCOW,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  0);
+       if (err) {
+               DPRINTF("img2qcow Couldn't open qcow file.\n");
+               return err;
+       }
+
+       ddqcow = (tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+       /*Initialise the output string*/
+       memset(output,0x20,(100/PROGRESS_QUANT)+5);
+       output[0] = '[';
+       output[(100/PROGRESS_QUANT)+2] = ']';
+       output[(100/PROGRESS_QUANT)+3] = '\0';
+       DFPRINTF("%s",output);
+
+       i = 0;
+       while (running) {
+
+               if (!complete) {
+                       /*Read sector from image*/
+                       if (lseek(fd, i*512, SEEK_SET) == (off_t)-1) {
+                               DFPRINTF("Unable to access file offset %"PRIu64"\n",
+                                      (uint64_t)i*512);
+                               exit(-1);
+                       }
+
+                       if( (ret = posix_memalign((void **)&buf,
+                                                 BLOCK_PROCESSSZ,
+                                                 BLOCK_PROCESSSZ)) != 0) {
+                               DFPRINTF("Unable to read memalign buf (%d)\n",ret);
+                               exit(-1);
+                       }
+
+                       /*We attempt to read 4k sized blocks*/
+                       len = read(fd, buf, BLOCK_PROCESSSZ);
+                       if (len < 512) {
+                               DFPRINTF("Unable to read sector %"PRIu64"\n",
+                                        (uint64_t) (i));
+                               /* nothing was queued, so the callback will
+                                * never free this buffer */
+                               free(buf);
+                               complete = 1;
+                               continue;
+                       }
+
+                       len = (len >> 9);
+
+                       /* the request is passed by value: leave no field
+                        * uninitialised */
+                       memset(&treq, 0, sizeof(treq));
+                       treq.op = TD_OP_WRITE;
+                       treq.buf = buf;
+                       treq.sec = i;
+                       treq.secs = len;
+                       treq.image = 0;
+                       treq.cb = send_responses;
+                       treq.cb_data = buf;
+                       treq.id = 0;
+                       treq.sidx = 0;
+
+                       vreq = calloc(1, sizeof(td_vbd_request_t));
+                       if (!vreq) {
+                               DFPRINTF("Unable to allocate vbd request\n");
+                               exit(-1);
+                       }
+                       treq.private = vreq;
+
+                       vreq->submitting = 1;
+                       INIT_LIST_HEAD(&vreq->next);
+                       tapdisk_vbd_move_request(treq.private,
+                                                &qcow_vbd->pending_requests);
+
+                       ddqcow->ops->td_queue_write(ddqcow,treq);
+                       --vreq->submitting;
+
+                       submit_events++;
+
+                       i += len;
+
+                       /* >= rather than ==: never run past the end */
+                       if (i >= info.size)
+                               complete = 1;
+
+                       tapdisk_submit_all_tiocbs(&server.aio_queue);
+                       debug_output(i,info.size);
+               }
+
+               while(returned_events != submit_events) {
+                       ret = scheduler_wait_for_events(&server.scheduler);
+                       if (ret < 0) {
+                               DFPRINTF("server wait returned %d\n", ret);
+                               sleep(2);
+                       }
+               }
+
+               if (complete && (returned_events == submit_events))
+                       running = 0;
+       }
+       memcpy(output+prev+1,"=",1);
+       DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+
+       ddqcow->ops->td_close(ddqcow);
+       free(ddqcow->data);
+       close(fd);
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/io-optimize.c b/tools/blktap2/drivers/io-optimize.c
new file mode 100644 (file)
index 0000000..5d39765
--- /dev/null
@@ -0,0 +1,664 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "io-optimize.h"
+#include "tapdisk-log.h"
+
+/*
+ * DBG(ctx, fmt, ...) -- debug trace used throughout this file.
+ *   DEBUG without TEST: forwarded to tlog_write() at TLOG_DBG level
+ *   TEST:               printed with printf()
+ *   otherwise:          expands to a no-op
+ * The ctx argument is accepted but not used by any of the expansions.
+ */
+#if (!defined(TEST) && defined(DEBUG))
+#define DBG(ctx, f, a...) tlog_write(TLOG_DBG, f, ##a)
+#elif defined(TEST)
+#define DBG(ctx, f, a...) printf(f, ##a)
+#else
+#define DBG(ctx, f, a...) ((void)0)
+#endif
+
+static void print_merged_iocbs(struct opioctx *ctx, 
+                              struct iocb **iocbs, int num_iocbs);
+
+/*
+ * Release the four arrays owned by ctx.  The ctx structure itself is
+ * not freed and its pointers are left untouched.
+ */
+void
+opio_free(struct opioctx *ctx)
+{
+       free(ctx->event_queue);
+       free(ctx->iocb_queue);
+       free(ctx->free_opios);
+       free(ctx->opios);
+}
+
+/*
+ * Initialise ctx with room for num_iocbs merge descriptors.
+ *
+ * Allocates the opio pool, the free list (initially holding every pool
+ * entry) and the scratch iocb/event queues.
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails, in which case
+ * everything is released and ctx is left zeroed.
+ */
+int
+opio_init(struct opioctx *ctx, int num_iocbs)
+{
+       int i;
+
+       memset(ctx, 0, sizeof(struct opioctx));
+
+       ctx->num_opios     = num_iocbs;
+       ctx->free_opio_cnt = num_iocbs;
+       /* calloc(n, size) lets the allocator reject n * size overflow */
+       ctx->opios         = calloc(num_iocbs, sizeof(struct opio));
+       ctx->free_opios    = calloc(num_iocbs, sizeof(struct opio *));
+       ctx->iocb_queue    = calloc(num_iocbs, sizeof(struct iocb *));
+       ctx->event_queue   = calloc(num_iocbs, sizeof(struct io_event));
+
+       if (!ctx->opios || !ctx->free_opios ||
+           !ctx->iocb_queue || !ctx->event_queue)
+               goto fail;
+
+       for (i = 0; i < num_iocbs; i++)
+               ctx->free_opios[i] = &ctx->opios[i];
+
+       return 0;
+
+ fail:
+       opio_free(ctx);
+       /* do not leave dangling pointers behind for a later opio_free() */
+       memset(ctx, 0, sizeof(struct opioctx));
+       return -ENOMEM;
+}
+
+/* Pop a descriptor off the free list; NULL when the pool is exhausted. */
+static inline struct opio *
+alloc_opio(struct opioctx *ctx)
+{
+       if (ctx->free_opio_cnt > 0) {
+               ctx->free_opio_cnt--;
+               return ctx->free_opios[ctx->free_opio_cnt];
+       }
+
+       return NULL;
+}
+
+/* Zero a descriptor and push it back onto the free list. */
+static inline void
+free_opio(struct opioctx *ctx, struct opio *op)
+{
+       int slot = ctx->free_opio_cnt;
+
+       memset(op, 0, sizeof(*op));
+       ctx->free_opios[slot] = op;
+       ctx->free_opio_cnt    = slot + 1;
+}
+
+/*
+ * Put back the buffer, byte count and user data that were saved in op
+ * when its iocb was taken over by the optimizer.
+ */
+static inline void
+restore_iocb(struct opio *op)
+{
+       struct iocb *orig = op->iocb;
+
+       orig->u.c.nbytes = op->nbytes;
+       orig->u.c.buf    = op->buf;
+       orig->data       = op->data;
+}
+
+/*
+ * An iocb counts as optimized when its data pointer points into ctx's
+ * opio pool.  Returns 1 if so, 0 otherwise.
+ */
+static inline int
+iocb_optimized(struct opioctx *ctx, struct iocb *io)
+{
+       unsigned long pool_lo = (unsigned long)ctx->opios;
+       unsigned long pool_hi = pool_lo + (ctx->num_opios * sizeof(struct opio));
+       unsigned long tag     = (unsigned long)io->data;
+
+       if (tag < pool_lo)
+               return 0;
+
+       return (tag < pool_hi);
+}
+
+/* True when r starts at the file offset where l ends. */
+static inline int
+contiguous_sectors(struct iocb *l, struct iocb *r)
+{
+       return (r->u.c.offset == l->u.c.offset + l->u.c.nbytes);
+}
+
+/* True when r's buffer starts at the address where l's buffer ends. */
+static inline int
+contiguous_buffers(struct iocb *l, struct iocb *r)
+{
+       char *l_end = (char *)l->u.c.buf + l->u.c.nbytes;
+
+       return (l_end == (char *)r->u.c.buf);
+}
+
+/*
+ * Two iocbs can be merged only if they target the same file descriptor
+ * and are adjacent both on disk and in memory.
+ */
+static inline int
+contiguous_iocbs(struct iocb *l, struct iocb *r)
+{
+       if (l->aio_fildes != r->aio_fildes)
+               return 0;
+
+       if (!contiguous_sectors(l, r))
+               return 0;
+
+       return contiguous_buffers(l, r);
+}
+
+/* Start a chain consisting of op alone: op is both its head and tail. */
+static inline void
+init_opio_list(struct opio *op)
+{
+       op->list.tail = op;
+       op->list.head = op;
+}
+
+/*
+ * Attach a fresh descriptor to io: the iocb's buffer, size, offset and
+ * user data are saved in the descriptor, and io->data is redirected to
+ * the descriptor.  Returns NULL if no descriptor is available.
+ */
+static struct opio *
+opio_iocb_init(struct opioctx *ctx, struct iocb *io)
+{
+       struct opio *desc = alloc_opio(ctx);
+
+       if (desc) {
+               desc->iocb   = io;
+               desc->offset = io->u.c.offset;
+               desc->nbytes = io->u.c.nbytes;
+               desc->buf    = io->u.c.buf;
+               /* save the caller's data before taking the field over */
+               desc->data   = io->data;
+               io->data     = desc;
+
+               init_opio_list(desc);
+       }
+
+       return desc;
+}
+
+/*
+ * Return the descriptor already attached to io, attaching a new one if
+ * io has not been optimized yet (which may yield NULL).
+ */
+static inline struct opio *
+opio_get(struct opioctx *ctx, struct iocb *io)
+{
+       if (!iocb_optimized(ctx, io))
+               return opio_iocb_init(ctx, io);
+
+       return (struct opio *)io->data;
+}
+
+/*
+ * Append io to the chain headed by head and grow head's byte count by
+ * io's.  Returns 0, or -ENOMEM if a descriptor cannot be obtained for
+ * either iocb.
+ */
+static int
+merge_tail(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+       struct opio *chain, *link;
+
+       chain = opio_get(ctx, head);
+       if (!chain)
+               return -ENOMEM;
+
+       link = opio_get(ctx, io);
+       if (!link)
+               return -ENOMEM;
+
+       link->head             = chain;
+       head->u.c.nbytes      += io->u.c.nbytes;
+       /* hook the new link behind the current tail, then advance the tail */
+       chain->list.tail->next = link;
+       chain->list.tail       = link;
+
+       return 0;
+}
+
+/*
+ * Try to fold io into head.  Returns -EINVAL when the two differ in
+ * opcode or are not contiguous, otherwise the result of merge_tail().
+ */
+static int
+merge(struct opioctx *ctx, struct iocb *head, struct iocb *io)
+{
+       if (head->aio_lio_opcode != io->aio_lio_opcode ||
+           !contiguous_iocbs(head, io))
+               return -EINVAL;
+
+       return merge_tail(ctx, head, io);
+}
+
+int
+io_merge(struct opioctx *ctx, struct iocb **queue, int num)
+{
+       int i, on_queue;
+       struct iocb *io, **q;
+       struct opio *ophead;
+       
+       if (!num)
+               return 0;
+
+       on_queue = 0;
+       q = ctx->iocb_queue;
+       memcpy(q, queue, num * sizeof(struct iocb *));
+
+       for (i = 1; i < num; i++) {
+               io = q[i];
+               if (merge(ctx, queue[on_queue], io) != 0)
+                       queue[++on_queue] = io;
+       }
+
+#if (defined(TEST) || defined(DEBUG))
+       print_merged_iocbs(ctx, queue, on_queue + 1);
+#endif
+
+       return ++on_queue;
+}
+
+/*
+ * Undo the merge headed by io: every iocb of the chain is restored,
+ * stored into queue in chain order, and its descriptor is released.
+ * Returns the number of iocbs written to queue.
+ */
+static int
+expand_iocb(struct opioctx *ctx, struct iocb **queue, struct iocb *io)
+{
+       int count = 0;
+       struct opio *cur, *following;
+
+       for (cur = (struct opio *)io->data; cur; cur = following) {
+               /* free_opio() zeroes cur, so fetch the link first */
+               following = cur->next;
+               restore_iocb(cur);
+               queue[count] = cur->iocb;
+               count++;
+               free_opio(ctx, cur);
+       }
+
+       return count;
+}
+
+/*
+ * Rewrite queue so that entries idx..num-1 appear un-merged, starting at
+ * queue[0]: plain iocbs are copied through, optimized ones are replaced
+ * by the restored members of their chain.
+ *
+ * Returns the number of iocbs now in queue (0 when num is 0).
+ */
+int
+io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num)
+{
+       int src, filled;
+       struct iocb **snapshot;
+
+       if (!num)
+               return 0;
+
+       snapshot = ctx->iocb_queue;
+       memcpy(snapshot, queue, num * sizeof(struct iocb *));
+
+       filled = 0;
+       for (src = idx; src < num; src++) {
+               struct iocb *cur = snapshot[src];
+
+               if (iocb_optimized(ctx, cur))
+                       filled += expand_iocb(ctx, queue + filled, cur);
+               else
+                       queue[filled++] = cur;
+       }
+
+       return filled;
+}
+
+/*
+ * Split the completion event of a merged request into one event per
+ * original iocb.
+ *
+ * ctx:   optimizer context owning the descriptors
+ * event: completion of the merged (head) iocb
+ * queue: output array, filled starting at index idx
+ * idx:   first free slot in queue
+ *
+ * Only the obj and res fields of each output event are written here.
+ * Returns the index following the last event written.
+ */
+static int
+expand_event(struct opioctx *ctx,
+            struct io_event *event, struct io_event *queue, int idx)
+{
+       int err;
+       struct iocb *io;
+       struct io_event *ep;
+       struct opio *ophead, *op, *next;
+
+       io     = event->obj;
+       ophead = (struct opio *)io->data;
+       op     = ophead;
+
+       /* a result equal to the merged byte count is success, a negative
+        * result is passed through, anything else is reported as -EIO */
+       if (event->res == io->u.c.nbytes)
+               err = 0;
+       else if ((int)event->res < 0)
+               err = (int)event->res;
+       else
+               err = -EIO;
+
+       while (op) {
+               /* free_opio() zeroes op, so read its link before that */
+               next    = op->next;
+               ep      = &queue[idx++];
+               ep->obj = op->iocb;
+               /* every member reports the shared error, or its own size */
+               ep->res = (err ? err : op->nbytes);
+               restore_iocb(op);
+               free_opio(ctx, op);
+               op      = next;
+       }
+
+       return idx;
+}
+
+/*
+ * Expand the num completion events in events in place: events of plain
+ * iocbs are kept, events of merged iocbs are replaced by one event per
+ * member of the chain.
+ *
+ * Returns the number of events now in events (0 when num is 0).
+ */
+int
+io_split(struct opioctx *ctx, struct io_event *events, int num)
+{
+       int src, filled;
+       struct io_event *snapshot;
+
+       if (!num)
+               return 0;
+
+       /* work from a copy, events is overwritten as it is expanded */
+       snapshot = ctx->event_queue;
+       memcpy(snapshot, events, num * sizeof(struct io_event));
+
+       filled = 0;
+       for (src = 0; src < num; src++) {
+               struct io_event *cur = &snapshot[src];
+               struct iocb *io      = cur->obj;
+
+               if (iocb_optimized(ctx, io))
+                       filled = expand_event(ctx, cur, events, filled);
+               else
+                       events[filled++] = *cur;
+       }
+
+       return filled;
+}
+
+/******************************************************************************
+debug print functions
+******************************************************************************/
+/* Trace one iocb through DBG(), each line starting with prefix. */
+static inline void
+__print_iocb(struct opioctx *ctx, struct iocb *io, char *prefix)
+{
+       char *type;
+
+       if (io->aio_lio_opcode == IO_CMD_PREAD)
+               type = "read";
+       else
+               type = "write";
+
+       DBG(ctx, "%soff: %08llx, nbytes: %04lx, buf: %p, type: %s, data: %08lx,"
+           " optimized: %d\n", prefix, io->u.c.offset, io->u.c.nbytes, 
+           io->u.c.buf, type, (unsigned long)io->data, 
+           iocb_optimized(ctx, io));
+}
+
+static char *null_prefix = "";
+#define print_iocb(ctx, io) __print_iocb(ctx, io, null_prefix)
+
+/* Trace num_iocbs entries of iocbs, each prefixed with its index. */
+static void
+print_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+       int n;
+       char label[10];
+
+       DBG(ctx, "iocbs:\n");
+       for (n = 0; n < num_iocbs; n++) {
+               snprintf(label, sizeof(label), "%d: ", n);
+               __print_iocb(ctx, iocbs[n], label);
+       }
+}
+
+/*
+ * Trace every iocb of the chain starting at op, indented and numbered
+ * from *cnt, which is advanced once per printed entry.
+ */
+static void
+print_optimized_iocbs(struct opioctx *ctx, struct opio *op, int *cnt)
+{
+       char label[10];
+
+       for (; op; op = op->next) {
+               snprintf(label, sizeof(label), "  %d: ", *cnt);
+               (*cnt)++;
+               __print_iocb(ctx, op->iocb, label);
+       }
+}
+
+/*
+ * Trace a merged queue: every entry is printed, and entries that head a
+ * merged chain are followed by the remaining members of that chain.
+ * A single running counter numbers both kinds of line.
+ */
+static void
+print_merged_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs)
+{
+       int n;
+       int seq = 0;
+       char label[10];
+
+       DBG(ctx, "merged iocbs:\n");
+       for (n = 0; n < num_iocbs; n++) {
+               struct iocb *cur = iocbs[n];
+
+               snprintf(label, sizeof(label), "%d: ", seq++);
+               __print_iocb(ctx, cur, label);
+
+               if (!iocb_optimized(ctx, cur))
+                       continue;
+
+               print_optimized_iocbs(ctx,
+                                     ((struct opio *)cur->data)->next, &seq);
+       }
+}
+
+/* Trace the iocb behind each of the num_events completion events. */
+static void
+print_events(struct opioctx *ctx, struct io_event *events, int num_events)
+{
+       struct io_event *ev;
+       struct io_event *end = events + num_events;
+
+       for (ev = events; ev < end; ev++)
+               print_iocb(ctx, ev->obj);
+}
+/******************************************************************************
+end debug print functions
+******************************************************************************/
+
+#if defined(TEST)
+
+/*
+ * Test-only tagging of iocb->data.  The value stored there is not a real
+ * pointer but a packed word:
+ *   hmask bit      set on the first iocb of a generated segment run
+ *   smask bit      set when each iocb of the run owns its own buffer
+ *   low 28 bits    index of the iocb (as extracted by data_idx)
+ */
+#define hmask 0x80000000UL
+#define smask 0x40000000UL
+#define make_data(idx, is_head, sparse) \
+         (void *)((idx) | ((is_head) ? hmask : 0) | ((sparse) ? smask : 0))
+#define data_idx(data)          (int)((unsigned long)(data) & (0x0fffffff))
+#define data_is_head(data)      (((unsigned long)(data) & hmask) ? 1 : 0)
+#define data_is_sparse(data)    (((unsigned long)(data) & smask) ? 1 : 0)
+
+/* Print the command-line synopsis to stderr and exit with status -1. */
+static void
+usage(void)
+{
+       fputs("usage: io_optimize [-n num_runs] "
+             "[-i num_iocbs] [-s num_secs] [-r random_seed]\n", stderr);
+       exit(-1);
+}
+
+/* Number of successful xalloc() calls and of xfree() calls so far. */
+static int xalloc_cnt, xfree_cnt;
+
+/*
+ * malloc() wrapper that counts allocations; terminates the program with
+ * status ENOMEM when the allocation fails.
+ */
+static inline char *
+xalloc(int size)
+{
+       char *mem = malloc(size);
+
+       if (mem) {
+               xalloc_cnt++;
+               return mem;
+       }
+
+       fprintf(stderr, "xalloc failed\n");
+       exit(ENOMEM);
+}
+
+/* free() wrapper that counts releases, the counterpart of xalloc(). */
+static inline void
+xfree(void *buf)
+{
+       xfree_cnt++;
+       free(buf);
+}
+
+/*
+ * Fill iocbs[0..num_iocbs-1] with a random workload.
+ *
+ * The iocbs are generated in runs of `segs` requests that share an
+ * opcode and cover consecutive file offsets.  Each run either uses one
+ * allocation carved into consecutive pieces, or ("sparse") a separate
+ * allocation per request.  io->data is tagged with make_data() so the
+ * index, head and sparse properties can be recovered later.
+ *
+ * The sequence of random() calls determines the workload, so the order
+ * of the statements below matters for reproducibility with a given seed.
+ */
+static void
+randomize_iocbs(struct iocb **iocbs, int num_iocbs, int num_secs)
+{
+       int i, j;
+
+       i = 0;
+       while (i < num_iocbs) {
+               char *buf;
+               short type;
+               int segs, sparse_mem;
+               uint64_t offset, nbytes;
+               
+               type   = (random() % 10 < 5 ? IO_CMD_PREAD : IO_CMD_PWRITE);
+               offset = ((random() % num_secs) << 9);
+
+               /* 40%: one request of 1-7 sectors; else 1-10 requests of 4096 bytes */
+               if (random() % 10 < 4) {
+                       segs   = 1;
+                       nbytes = (((random() % 7) + 1) << 9);
+               } else {
+                       segs   = (random() % 10) + 1;
+                       nbytes = 4096;
+               }
+
+               /* never generate more requests than there are iocbs left */
+               if (i + segs > num_iocbs)
+                       segs = (num_iocbs - i);
+
+               sparse_mem = (random() % 10 < 2 ? 1 : 0);
+
+               if (sparse_mem)
+                       buf = xalloc(nbytes);
+               else
+                       buf = xalloc(segs * nbytes);
+
+               for (j = 0; j < segs; j++) {
+                       struct iocb *io    = iocbs[i + j];
+                       io->aio_lio_opcode = type;
+                       io->u.c.nbytes     = nbytes;
+                       io->u.c.offset     = offset;
+                       io->u.c.buf        = buf;
+                       offset            += nbytes;
+
+                       io->data = make_data(i + j, (j == 0), sparse_mem);
+
+                       /* sparse: fresh buffer for the next request;
+                        * otherwise step to the next piece of the shared one */
+                       if (j + 1 < segs && sparse_mem)
+                               buf  = xalloc(nbytes);
+                       else
+                               buf += nbytes;
+               }
+
+               i += segs;
+       }
+}
+
+/*
+ * Pretend that a random, non-empty prefix of iocbs completed: one event
+ * is filled in per completed iocb, reporting either the full byte count
+ * or, for roughly one request in five, zero bytes.
+ *
+ * Returns the number of events written.
+ */
+static int
+simulate_io(struct iocb **iocbs, struct io_event *events, int num_iocbs)
+{
+       int n;
+       int completed = num_iocbs;
+
+       if (num_iocbs > 1)
+               completed = (random() % (num_iocbs - 1)) + 1;
+
+       for (n = 0; n < completed; n++) {
+               struct iocb *cur = iocbs[n];
+
+               events[n].obj = cur;
+               events[n].res = (random() % 10 < 8 ? cur->u.c.nbytes : 0);
+       }
+
+       return completed;
+}
+
+/*
+ * Consume num split events: verify that each event's iocb still carries
+ * the index it was tagged with, release its buffer if it owns one, and
+ * clear the iocb.
+ *
+ * Exits with status -1 if an iocb's tag does not match its position in
+ * iocb_list, which means the optimizer failed to restore its data field.
+ */
+static inline void
+process_events(struct opioctx *ctx, 
+              struct iocb *iocb_list, struct io_event *events, int num)
+{
+       int i;
+
+       for (i = 0; i < num; i++) {
+               struct iocb *io = events[i].obj;
+               /* pointer difference is ptrdiff_t; pin the type for printf */
+               long pos = (long)(io - iocb_list);
+
+               print_iocb(ctx, io);
+               if (data_idx(io->data) != pos) {
+                       printf("corrupt data! data_idx = %d, io = %ld\n",
+                              data_idx(io->data), pos);
+                       exit(-1);
+               }
+               /* only run heads and sparse requests own an allocation */
+               if (data_is_head(io->data) || data_is_sparse(io->data))
+                       xfree(io->u.c.buf);
+               memset(io, 0, sizeof(struct iocb));
+       }
+}
+
+/*
+ * Reset the test fixtures: zero all num iocbs and events, and make
+ * iocbs[n] point at iocb_list[n].
+ */
+static inline void
+init_optest(struct iocb *iocb_list, 
+           struct iocb **iocbs, struct io_event *events, int num)
+{
+       int n = 0;
+
+       memset(events, 0, num * sizeof(struct io_event));
+       memset(iocb_list, 0, num * sizeof(struct iocb));
+
+       while (n < num) {
+               iocbs[n] = iocb_list + n;
+               n++;
+       }
+}
+
+/*
+ * Self-test driver (built only with TEST defined).
+ *
+ * Options: -n runs, -i iocbs per run, -s sectors on the simulated disk,
+ * -r random seed, -h help.
+ *
+ * Each run generates a random workload, merges it, then repeatedly
+ * simulates completions, splits them and checks the restored iocbs until
+ * every original request has been accounted for.  Exits with status -1
+ * if the number of buffers released differs from the number allocated.
+ */
+int
+main(int argc, char **argv)
+{
+       uint64_t num_secs;
+       struct opioctx ctx;
+       struct io_event *events;
+       int i, c, num_runs, num_iocbs, seed;
+       struct iocb *iocb_list, **iocbs, **ioqueue;
+
+       num_runs  = 1;
+       num_iocbs = 300;
+       seed      = time(NULL);
+       /* NOTE(review): this is 8192 sectors, i.e. a 4MB disk; the original
+        * comment said 4GB -- confirm which one was intended */
+       num_secs  = ((4ULL << 20) >> 9);
+
+       while ((c = getopt(argc, argv, "n:i:s:r:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       num_runs  = atoi(optarg);
+                       break;
+               case 'i':
+                       num_iocbs = atoi(optarg);
+                       break;
+               case 's':
+                       num_secs  = strtoull(optarg, NULL, 10);
+                       break;
+               case 'r':
+                       seed      = atoi(optarg);
+                       break;
+               case 'h':
+                       usage();
+                       break;
+               case '?':
+                       fprintf(stderr, "Unrecognized option: -%c\n", optopt);
+                       usage();
+                       break;
+               }
+       }
+
+       /* zero sectors would make the offset generator divide by zero, and
+        * a non-positive iocb count cannot be simulated */
+       if (num_iocbs <= 0 || num_runs < 0 || num_secs == 0)
+               usage();
+
+       printf("Running %d tests with %d iocbs on %"PRIu64" sectors, seed = %d\n",
+              num_runs, num_iocbs, num_secs, seed);
+
+       /* the workload is drawn with random(), which is seeded by
+        * srandom(), not srand() */
+       srandom(seed);
+
+       iocb_list = malloc(num_iocbs * sizeof(struct iocb));
+       iocbs     = malloc(num_iocbs * sizeof(struct iocb *));
+       events    = malloc(num_iocbs * sizeof(struct io_event));
+
+       if (!iocb_list || !iocbs || !events || opio_init(&ctx, num_iocbs)) {
+               fprintf(stderr, "initialization failed\n");
+               exit(ENOMEM);
+       }
+
+       for (i = 0; i < num_runs; i++) {
+               int op_rem, op_done, num_split, num_events, num_done;
+
+               ioqueue = iocbs;
+               init_optest(iocb_list, ioqueue, events, num_iocbs);
+               randomize_iocbs(ioqueue, num_iocbs, num_secs);
+               print_iocbs(&ctx, ioqueue, num_iocbs);
+
+               op_done  = 0;
+               num_done = 0;
+               op_rem   = io_merge(&ctx, ioqueue, num_iocbs);
+               print_iocbs(&ctx, ioqueue, op_rem);
+               print_merged_iocbs(&ctx, ioqueue, op_rem);
+
+               while (num_done < num_iocbs) {
+                       DBG(&ctx, "optimized remaining: %d\n", op_rem);
+
+                       DBG(&ctx, "simulating\n");
+                       num_events = simulate_io(ioqueue + op_done, events, op_rem);
+                       print_events(&ctx, events, num_events);
+
+                       DBG(&ctx, "splitting %d\n", num_events);
+                       num_split = io_split(&ctx, events, num_events);
+                       print_events(&ctx, events, num_split);
+
+                       DBG(&ctx, "processing %d\n", num_split);
+                       process_events(&ctx, iocb_list, events, num_split);
+
+                       op_rem   -= num_events;
+                       op_done  += num_events;
+                       num_done += num_split;
+               }
+
+               DBG(&ctx, "run %d: processed: %d, xallocs: %d, xfrees: %d\n", 
+                   i, num_done, xalloc_cnt, xfree_cnt);
+               if (xalloc_cnt != xfree_cnt)
+                       exit(-1);
+               xalloc_cnt = xfree_cnt = 0;
+       }
+
+       free(iocbs);
+       free(events);
+       free(iocb_list);
+       opio_free(&ctx);
+
+       return 0;
+}
+#endif
diff --git a/tools/blktap2/drivers/io-optimize.h b/tools/blktap2/drivers/io-optimize.h
new file mode 100644 (file)
index 0000000..9a0d86b
--- /dev/null
@@ -0,0 +1,68 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __IO_OPTIMIZE_H__
+#define __IO_OPTIMIZE_H__
+
+#include <libaio.h>
+
+struct opio;
+
+/*
+ * Ends of a chain of merged requests.  A freshly initialised descriptor
+ * is both head and tail of its own chain; merging appends at the tail.
+ */
+struct opio_list {
+       struct opio        *head;
+       struct opio        *tail;
+};
+
+/*
+ * Per-iocb descriptor used while an iocb takes part in merging.
+ *
+ * buf, nbytes, offset and data hold the iocb's original values, saved
+ * when the descriptor is attached; buf, nbytes and data are written
+ * back to the iocb when the merge is undone.
+ * iocb points back at the request this descriptor belongs to.
+ * head is set to the descriptor of the chain's first request when this
+ * one is appended to a chain; next links to the following member.
+ * list tracks the ends of the chain and is maintained on the head
+ * descriptor.
+ * NOTE(review): event is not referenced by the optimizer code in
+ * io-optimize.c -- confirm whether it is still needed.
+ */
+struct opio {
+       char               *buf;
+       unsigned long       nbytes;
+       long long           offset;
+       void               *data;
+       struct iocb        *iocb;
+       struct io_event     event;
+       struct opio        *head;
+       struct opio        *next;
+       struct opio_list    list;
+};
+
+/*
+ * Optimizer state, set up by opio_init() and torn down by opio_free().
+ *
+ * num_opios:     capacity of every array below
+ * free_opio_cnt: number of valid entries in free_opios
+ * opios:         pool of descriptors
+ * free_opios:    stack of pointers to currently unused pool entries
+ * iocb_queue:    scratch copy of the caller's iocb array
+ * event_queue:   scratch copy of the caller's event array
+ */
+struct opioctx {
+       int                 num_opios;
+       int                 free_opio_cnt;
+       struct opio        *opios;
+       struct opio       **free_opios;
+       struct iocb       **iocb_queue;
+       struct io_event    *event_queue;
+};
+
+/* Allocate state for up to num_iocbs requests; 0 or -ENOMEM. */
+int opio_init(struct opioctx *ctx, int num_iocbs);
+/* Release the arrays allocated by opio_init(). */
+void opio_free(struct opioctx *ctx);
+/* Merge contiguous requests in place; returns the new queue length. */
+int io_merge(struct opioctx *ctx, struct iocb **queue, int num);
+/* Expand events of merged requests in place; returns the new event count. */
+int io_split(struct opioctx *ctx, struct io_event *events, int num);
+/* Un-merge queue[idx..num-1] to the front of queue; returns the new length. */
+int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num);
+
+#endif
diff --git a/tools/blktap2/drivers/lock.c b/tools/blktap2/drivers/lock.c
new file mode 100644 (file)
index 0000000..107c4b6
--- /dev/null
@@ -0,0 +1,1000 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This module implements a "dot locking" style advisory file locking algorithm.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <dirent.h>
+#include <limits.h>
+#include "lock.h"
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/* format: xenlk.hostname.uuid.<xf><rw>*/
+#define LF_POSTFIX ".xenlk"
+#define LFXL_FORMAT LF_POSTFIX ".%s.%s.x%s"
+#define LFFL_FORMAT LF_POSTFIX ".%s.%s.f%s"
+#define RETRY_MAX 16
+
+#if defined(LOGS)
+#define LOG(format, args...) printf("%d: ", __LINE__); printf(format, ## args)
+#else
+#define LOG(format, args...)
+#endif
+
+/* random wait - up to .5 seconds */
+#define XSLEEP usleep(random() & 0x7ffff)
+
+typedef int (*eval_func)(char *name, int readonly);
+
+/*
+ * Build the exclusive lock file name for fn_to_lock by appending
+ * LF_POSTFIX to it.
+ *
+ * Returns a malloc'd string owned by the caller, or NULL when the
+ * allocation fails.
+ */
+static char *create_lockfn(char *fn_to_lock)
+{
+        size_t base_len = strlen(fn_to_lock);
+        char *result = malloc(base_len + strlen(LF_POSTFIX) + 1);
+
+        if (unlikely(result == NULL))
+                return NULL;
+
+        /* file name first, then the postfix together with its terminator */
+        memcpy(result, fn_to_lock, base_len);
+        strcpy(result + base_len, LF_POSTFIX);
+
+        return result;
+}
+
+/*
+ * Build a per-host, per-uuid lock link name:
+ *     <fn_to_lock><format applied to hostname, uuid, "r"|"w">
+ * where format is LFXL_FORMAT or LFFL_FORMAT.
+ *
+ * Returns a malloc'd string owned by the caller, or NULL if the hostname
+ * cannot be obtained or the allocation fails.
+ */
+static char *create_lockfn_link(char *fn_to_lock, char *format,
+                                char *uuid, int readonly)
+{
+        char hostname[128];
+        char *lockfn_link;
+        size_t fn_len;
+        size_t bufsz;
+
+        /* get hostname */
+        if (unlikely(gethostname(hostname, sizeof(hostname)) == -1)) {
+                return 0;
+        }
+        /* a truncated host name is not guaranteed to be NUL-terminated */
+        hostname[sizeof(hostname) - 1] = '\0';
+
+        /*
+         * allocate string to hold constructed lock file link; the extra 8
+         * bytes cover the separators, the x/f and r/w letters and the NUL
+         */
+        fn_len = strlen(fn_to_lock);
+        bufsz = fn_len + strlen(LF_POSTFIX) +
+                strlen(hostname) + strlen(uuid) + 8;
+        lockfn_link = malloc(bufsz);
+        if (unlikely(!lockfn_link)) {
+                return 0;
+        }
+
+        /* construct lock file link with specific format (bounded write) */
+        memcpy(lockfn_link, fn_to_lock, fn_len);
+        snprintf(lockfn_link + fn_len, bufsz - fn_len, format,
+                 hostname, uuid, readonly ? "r" : "w");
+
+        return lockfn_link;
+}
+
+/*
+ * Obtain a "now" timestamp from the filesystem that holds fn: a scratch
+ * file "<fn>.xenNNNNNNNN.tmp" is created, lstat()ed into *statnow and then
+ * removed.  Callers compare statnow->st_mtime with the st_mtime of lock
+ * files in the same directory.
+ *
+ * Returns LOCK_OK on success, otherwise LOCK_ENOMEM, LOCK_EOPEN or
+ * LOCK_ESTAT; for the latter two *reterrno holds the errno of the failing
+ * call (it is 0 otherwise).
+ */
+static int NFSnormalizedStatTime(char *fn, struct stat *statnow, int *reterrno)
+{
+        int result = LOCK_OK;
+        int uniq;
+        char *buf;
+        int fd;
+        int pid = (int)getpid();
+        int clstat;
+
+        *reterrno = 0;
+
+        /* create file to normalize time */
+        srandom((int)time(0) ^ pid);
+        uniq = random() % 0xffffff;
+        buf = malloc(strlen(fn) + 24);
+        if (unlikely(!buf)) { result = LOCK_ENOMEM; goto finish; }
+
+        strcpy(buf, fn);
+        sprintf(buf + strlen(buf), ".xen%08d.tmp", uniq);
+
+        fd = open(buf, O_WRONLY | O_CREAT, 0644);
+        if (fd == -1) { *reterrno = errno; result = LOCK_EOPEN; goto finish; }
+        clstat = close(fd);
+        if (unlikely(clstat == -1)) {
+                LOG("fail on close\n");
+        }
+        if (lstat(buf, statnow) == -1) {
+                /* capture errno before unlink() has a chance to change it */
+                *reterrno = errno;
+                unlink(buf);
+                result = LOCK_ESTAT;
+                goto finish;
+        }
+        unlink(buf);
+
+finish:
+        /* buf is NULL on the allocation-failure path; free(NULL) is a no-op */
+        free(buf);
+        return result;
+}
+
+/*
+ * eval_func: non-zero when the lock file name ends in 'w' (writer lock).
+ * The readonly argument is not consulted.
+ */
+static int writer_eval(char *name, int readonly)
+{
+        size_t last = strlen(name) - 1;
+
+        return name[last] == 'w';
+}
+
+/*
+ * eval_func: non-zero when the lock file name ends in 'r' (reader lock)
+ * and the caller is not itself asking for a read-only lock.
+ */
+static int reader_eval(char *name, int readonly)
+{
+        /* readers never conflict with other readers */
+        if (readonly)
+                return 0;
+
+        return name[strlen(name) - 1] == 'r';
+}
+
+/*
+ * Scan the directory containing lockfn for lock files belonging to fn that
+ * were placed by someone else, and decide whether one of them conflicts.
+ *
+ * A directory entry is considered when its name starts with the base name
+ * of fn but is not fn itself, not the exclusive lock file (lockfn) and not
+ * our own final lock (lockfn_link).
+ * NOTE(review): this prefix match also covers unrelated files such as
+ * "<fn>.bak" or the "<fn>.xen*.tmp" scratch files; with force set they are
+ * unlinked as well -- consider requiring the LF_POSTFIX "." prefix.
+ *
+ *   force    - when set, every matching entry is unlinked (lock stolen)
+ *              instead of being evaluated
+ *   readonly - passed through to eval
+ *   stole    - set to 1 if at least one entry was unlinked
+ *   eval     - predicate deciding whether an entry name conflicts
+ *   elt      - lease time parsed from the text after the last '.' of the
+ *              first matching file whose contents contain a '.'; reset to
+ *              0 when stealing
+ *   ioerror  - 0, or an errno-style value if scanning failed
+ *
+ * Returns non-zero if a conflicting lock exists or if an error occurred
+ * (errors force a "taken" answer), 0 otherwise.
+ */
+static int lock_holder(char *fn, char *lockfn, char *lockfn_link,
+                       int force, int readonly, int *stole, eval_func eval,
+                       int *elt, int *ioerror)
+{
+        int status = 0;
+        int ustat;
+        DIR *pd = 0;
+        struct dirent *dptr;
+        char *ptr;
+        char *dirname = 0;
+        char *uname = 0;
+        size_t uname_sz = 0;
+        size_t dirlen;
+        int elt_established = 0;
+        int fd;
+        char tmpbuf[4096];
+        char *base_fn;
+        char *base_lockfn;
+        char *base_link;
+        size_t base_fn_len;
+
+        *stole = 0;
+        *ioerror = 0;
+        *elt = 0;
+
+        /* +2: room for "." or "/" plus the terminator even if lockfn is tiny */
+        dirname = malloc(strlen(lockfn) + 2);
+        if (!dirname) {
+                /* fail closed: never report "not held" because we ran out of memory */
+                *ioerror = ENOMEM;
+                goto finish;
+        }
+
+        /* base names are loop invariant, compute them once */
+        ptr = strrchr(fn, '/');
+        base_fn = ptr ? ptr + 1 : fn;
+        base_fn_len = strlen(base_fn);
+        ptr = strrchr(lockfn_link, '/');
+        base_link = ptr ? ptr + 1 : lockfn_link;
+        ptr = strrchr(lockfn, '/');
+        base_lockfn = ptr ? ptr + 1 : lockfn;
+
+        /* get directory (ptr still points at the last '/' of lockfn) */
+        if (!ptr) {
+                strcpy(dirname, ".");
+        } else if (ptr == lockfn) {
+                /* file lives directly in the root directory */
+                strcpy(dirname, "/");
+        } else {
+                size_t numbytes = ptr - lockfn;
+                memcpy(dirname, lockfn, numbytes);
+                dirname[numbytes] = '\0';
+        }
+        dirlen = strlen(dirname);
+
+        pd = opendir(dirname);
+        if (!pd) {
+                *ioerror = errno ? errno : EIO;
+                goto finish;
+        }
+
+        /*
+         * scan through directory entries and use eval function
+         * if we have a match (i.e. reader or writer lock) but
+         * note that if we are forcing, we will remove any and
+         * all locks that appear for target of our lock, regardless
+         * if it a reader/writer owns the lock.
+         */
+        errno = 0;
+        dptr = readdir(pd);
+        if (!dptr) {
+            *ioerror = EIO;
+        }
+        while (dptr) {
+                if (strcmp(dptr->d_name, base_fn) &&
+                    strcmp(dptr->d_name, base_lockfn) &&
+                    strcmp(dptr->d_name, base_link) &&
+                    !strncmp(dptr->d_name, base_fn, base_fn_len)) {
+                        /* size the path buffer for this entry; other
+                           holders' names may be longer than ours */
+                        size_t need = dirlen + 1 + strlen(dptr->d_name) + 1;
+                        if (need > uname_sz) {
+                                char *tmp = realloc(uname, need);
+                                if (!tmp) {
+                                        *ioerror = ENOMEM;
+                                        status = 1;
+                                        goto finish;
+                                }
+                                uname = tmp;
+                                uname_sz = need;
+                        }
+                        strcpy(uname, dirname);
+                        strcat(uname, "/");
+                        strcat(uname, dptr->d_name);
+                        if (!elt_established) {
+                            /* read final lock file and extract lease time */
+                            fd = open(uname, O_RDONLY, 0644);
+                            if (fd == -1) {
+                                    *ioerror = errno;
+                                    status = 1;
+                                    goto finish;
+                            }
+                            memset(tmpbuf, 0, sizeof(tmpbuf));
+                            /* leave the last byte zero so tmpbuf stays a string */
+                            if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+                                    *ioerror = errno;
+                                    status = 1;
+                                    close(fd);
+                                    goto finish;
+                            }
+                            close(fd);
+                            ptr = strrchr(tmpbuf, '.');
+                            if (ptr) {
+                                *elt = atoi(ptr+1);
+                                elt_established = 1;
+                            }
+                        }
+                        if (force) {
+                                ustat = unlink(uname);
+                                if (ustat == -1) {
+                                        LOG("failed to unlink %s\n", uname);
+                                }
+                                *stole = 1;
+                                *elt = 0;
+                        } else {
+                                if ((*eval)(dptr->d_name, readonly)) {
+                                        status = 1;
+                                        goto finish;
+                                }
+                        }
+                }
+                /* readdir() reports errors only through errno, so clear it
+                   first; a stale value must not look like a failure */
+                errno = 0;
+                dptr = readdir(pd);
+                if (!dptr && errno) {
+                    *ioerror = EIO;
+                }
+        }
+
+finish:
+        /* single exit: the directory stream is released on every path */
+        if (pd)
+                closedir(pd);
+        free(dirname);
+        free(uname);
+
+        /* if IO error, force a taken status */
+        return (*ioerror) ? 1 : status;
+}
+
+/*
+ * Take, or reassert, an advisory "dot lock" on fn_to_lock on behalf of uuid.
+ *
+ * Outline (each step is visible below):
+ *   1. create "<fn>.xenlk" with O_EXCL as a short-lived exclusive lock,
+ *      retrying (and, with force, removing a stale one) up to RETRY_MAX
+ *      times;
+ *   2. write our temporary link name into it, hard-link it to that name and
+ *      compare inode numbers to verify the exclusive lock is really ours;
+ *   3. while holding it, either notice that our final lock
+ *      "<fn>.xenlk.<host>.<uuid>.f<r|w>" already exists (reassert) or scan
+ *      for conflicting writer/reader locks via lock_holder();
+ *   4. (re)write the final lock file, then drop the exclusive lock.
+ *
+ *   force      - steal existing locks instead of failing; after a steal the
+ *                call sleeps for *lease_time seconds
+ *   readonly   - non-zero for a reader lock, zero for a writer lock
+ *   lease_time - in: lease to record; out: lease established by an existing
+ *                lock, or -1 when *retstatus is negative
+ *   retstatus  - LOCK_OK (0) on a fresh lock, 1 on a reassert, negative
+ *                LOCK_E* code on failure
+ *
+ * Returns 0 or an errno-style value describing a system-level failure.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstatus)
+{
+        char *lockfn = 0;
+        char *lockfn_xlink = 0;
+        char *lockfn_flink = 0;
+        char *buf = 0;
+        int fd;
+        int status = 0;
+        struct stat stat1, stat2;
+        int retry_attempts = 0;
+        int clstat;
+        int tmpstat;
+        int stealx = 0;
+        int stealw = 0;
+        int stealr = 0;
+        int established_lease_time = 0;
+        char tmpbuf[4096];
+        int ioerr;
+
+        if (!fn_to_lock || !uuid) {
+                *retstatus = LOCK_EBADPARM;
+                return EINVAL;
+        }
+
+        *retstatus = 0;
+
+        /* seed random with time/pid combo */
+        srandom((int)time(0) ^ getpid());
+
+        /* build lock file strings */
+        lockfn = create_lockfn(fn_to_lock);
+        if (unlikely(!lockfn)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+        lockfn_xlink = create_lockfn_link(fn_to_lock, LFXL_FORMAT,
+                                          uuid, readonly);
+        if (unlikely(!lockfn_xlink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+        lockfn_flink = create_lockfn_link(fn_to_lock, LFFL_FORMAT, uuid,
+                                          readonly);
+        if (unlikely(!lockfn_flink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; }
+
+try_again:
+        if (retry_attempts++ > RETRY_MAX) {
+                /*
+                 * out of retries: if the exclusive lock file is older than
+                 * the default lease it is considered abandoned, so remove
+                 * it and start over
+                 */
+                if (*retstatus == LOCK_EXLOCK_OPEN) {
+                        struct stat statnow, stat_exlock;
+                        int diff;
+
+                        if (lstat(lockfn, &stat_exlock) == -1) {
+                                goto finish;
+                        }
+
+                        if (NFSnormalizedStatTime(fn_to_lock, &statnow, &ioerr)) {
+                                goto finish;
+                        }
+
+                        diff = (int)statnow.st_mtime - (int)stat_exlock.st_mtime;
+                        if (diff > DEFAULT_LEASE_TIME_SECS) {
+                                unlink(lockfn);
+                                retry_attempts = 0;
+                                goto try_again;
+                        }
+                }
+                goto finish;
+        }
+
+        /* try to open exclusive lockfile */
+        fd = open(lockfn, O_WRONLY | O_CREAT | O_EXCL, 0644);
+        if (fd == -1) {
+                LOG("Initial lockfile creation failed %s force=%d, errno=%d\n",
+                     lockfn, force, errno);
+                if (errno == EIO) {
+                       *retstatus = LOCK_EXLOCK_OPEN;
+                       status = EIO;
+                       goto finish;
+                }
+                /* already owned? (hostname & uuid match, skip time bits) */
+                errno = 0;
+                fd = open(lockfn, O_RDWR, 0644);
+                if (fd != -1) {
+                        buf = malloc(strlen(lockfn_xlink)+1);
+                        if (!buf) {
+                                clstat = close(fd);
+                                if (unlikely(clstat == -1)) {
+                                        LOG("fail on close\n");
+                                }
+                                *retstatus = LOCK_ENOMEM;
+                                status = ENOMEM;
+                                goto finish;
+                        }
+                        if (read(fd, buf, strlen(lockfn_xlink)) !=
+                           (strlen(lockfn_xlink))) {
+                                clstat = close(fd);
+                                if (unlikely(clstat == -1)) {
+                                        LOG("fail on close\n");
+                                }
+                                free(buf);
+                                goto force_lock;
+                        }
+                        /* compare all but the trailing r/w letter */
+                        if (!strncmp(buf, lockfn_xlink, strlen(lockfn_xlink)-1)) {
+                                LOG("lock owned by us, reasserting\n");
+                                /* our lock, reassert by rewriting below */
+                                if (lseek(fd, 0, SEEK_SET) == -1) {
+                                        clstat = close(fd);
+                                        if (unlikely(clstat == -1)) {
+                                                LOG("fail on close\n");
+                                        }
+                                        /* buf used to leak on this path */
+                                        free(buf);
+                                        goto force_lock;
+                                }
+                                free(buf);
+                                goto skip;
+                        }
+                        free(buf);
+                        clstat = close(fd);
+                        if (unlikely(clstat == -1)) {
+                                LOG("fail on close\n");
+                        }
+                }
+force_lock:
+                if (errno == EIO) {
+                       *retstatus = LOCK_EXLOCK_OPEN;
+                       status = EIO;
+                       goto finish;
+                }
+                if (force) {
+                        /* remove lock file, we are forcing lock, try again */
+                        status = unlink(lockfn);
+                        if (unlikely(status == -1)) {
+                                if (errno == EIO) {
+                                       *retstatus = LOCK_EXLOCK_OPEN;
+                                       status = EIO;
+                                       goto finish;
+                                }
+                                LOG("force removal of %s lockfile failed, "
+                                    "errno=%d, trying again\n", lockfn, errno);
+                        }
+                        stealx = 1;
+                }
+                XSLEEP;
+                *retstatus = LOCK_EXLOCK_OPEN;
+                goto try_again;
+        }
+
+        LOG("lockfile created %s\n", lockfn);
+
+skip:
+        /*
+         * write into the temporary xlock
+         */
+        if (write(fd, lockfn_xlink, strlen(lockfn_xlink)) !=
+                strlen(lockfn_xlink)) {
+                if (errno == EIO) {
+                       *retstatus = LOCK_EXLOCK_WRITE;
+                       status = EIO;
+                       goto finish;
+                }
+                status = errno;
+                clstat = close(fd);
+                if (unlikely(clstat == -1)) {
+                        LOG("fail on close\n");
+                }
+                XSLEEP;
+                *retstatus = LOCK_EXLOCK_WRITE;
+                if (unlink(lockfn) == -1)  {
+                        LOG("removal of %s lockfile failed, "
+                            "errno=%d, trying again\n", lockfn, errno);
+                }
+                goto try_again;
+        }
+        clstat = close(fd);
+        if (unlikely(clstat == -1)) {
+                LOG("fail on close\n");
+        }
+
+        while (retry_attempts++ < RETRY_MAX) {
+                tmpstat = link(lockfn, lockfn_xlink);
+                LOG("linking %s and %s\n", lockfn, lockfn_xlink);
+                if ((tmpstat == -1) && (errno != EEXIST)) {
+                        LOG("link status is %d, errno=%d\n", tmpstat, errno);
+                }
+
+                if ((lstat(lockfn, &stat1) == -1) ||
+                    (lstat(lockfn_xlink, &stat2) == -1)) {
+                        int stat_errno = errno;
+
+                        /* give up, cleanup first */
+                        tmpstat = unlink(lockfn);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing lock file %s", lockfn);
+                        }
+                        tmpstat = unlink(lockfn_xlink);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing linked lock file %s",
+                                    lockfn_xlink);
+                        }
+                        XSLEEP;
+                        /*
+                         * report through *retstatus: leaving it at 0 would
+                         * make the code at finish: believe the exclusive
+                         * lock is held although it could not be verified
+                         */
+                        *retstatus = LOCK_ESTAT;
+                        status = stat_errno;
+                        goto finish;
+                }
+
+                /* compare inodes */
+                if (stat1.st_ino == stat2.st_ino) {
+                        /* success, inodes are the same */
+                        /* should we check that st_nlink's are also 2?? */
+                        *retstatus = LOCK_OK;
+                        status = 0;
+                        tmpstat = unlink(lockfn_xlink);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing linked lock file %s",
+                                    lockfn_xlink);
+                        }
+                        goto finish;
+                } else {
+                        status = errno;
+                        /* try again, cleanup first */
+                        tmpstat = unlink(lockfn);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing lock file %s", lockfn);
+                        }
+                        tmpstat = unlink(lockfn_xlink);
+                        if (unlikely(tmpstat == -1)) {
+                                LOG("error removing linked lock file %s",
+                                    lockfn_xlink);
+                        }
+                        XSLEEP;
+                        *retstatus = LOCK_EINODE;
+                        goto try_again;
+                }
+        }
+
+finish:
+        if (!*retstatus) {
+
+                /* we have exclusive lock */
+
+                status = 0;
+
+                /* fast check, see if we own a final lock and are reasserting */
+                if (!lstat(lockfn_flink, &stat1)) {
+                        char *ptr;
+
+                        /* set the return value to notice this is a reassert */
+                        *retstatus = 1;
+
+                        /* read existing lock file and extract
+                           established lease time */
+                        fd = open(lockfn_flink, O_RDONLY, 0644);
+                        memset(tmpbuf, 0, sizeof(tmpbuf));
+                        if (fd != -1) {
+                                /* keep the last byte zero so tmpbuf stays
+                                   a valid string for strrchr() */
+                                if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+                                        if (errno == EIO) {
+                                                close(fd);
+                                                *retstatus = LOCK_EINODE;
+                                                status = EIO;
+                                                goto skip_scan;
+                                        }
+                                }
+                                close(fd);
+                        }
+                        ptr = strrchr(tmpbuf, '.');
+                        if (ptr) {
+                            *lease_time = atoi(ptr+1);
+                        } else {
+                            *lease_time = 10; /* wkchack */
+                        }
+                        goto skip_scan;
+                } else {
+                       if (errno == EIO) {
+                               *retstatus = LOCK_EINODE;
+                               status = EIO;
+                               goto skip_scan;
+                       }
+                }
+
+                /* we allow exclusive writer, or multiple readers */
+                if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+                                     readonly, &stealw, writer_eval,
+                                     &established_lease_time, &ioerr)) {
+                        if (ioerr) {
+                            *retstatus = LOCK_EREAD;
+                            status = ioerr;
+                            goto skip_scan;
+                        }
+                        *retstatus = LOCK_EHELD_WR;
+                } else if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force,
+                                     readonly, &stealr, reader_eval,
+                                     &established_lease_time, &ioerr)) {
+                        if (ioerr) {
+                            *retstatus = LOCK_EREAD;
+                            status = ioerr;
+                            goto skip_scan;
+                        }
+                        *retstatus = LOCK_EHELD_RD;
+                }
+                if (established_lease_time) *lease_time =
+                                                 established_lease_time;
+        }
+
+skip_scan:
+        if (*retstatus >= 0) {
+                /* update file, changes last modify time */
+                fd = open(lockfn_flink, O_WRONLY | O_CREAT, 0644);
+                if (fd == -1) {
+                        *retstatus = LOCK_EOPEN;
+                        status = errno;
+                } else {
+                        char leasebuf[32];
+                        int failed_write;
+
+                        /* file content: "<final lock name>.<lease seconds>" */
+                        memset(leasebuf, 0, sizeof(leasebuf));
+                        sprintf(leasebuf, ".%d", *lease_time);
+                        failed_write = write(fd, lockfn_flink,
+                                             strlen(lockfn_flink)) !=
+                                       strlen(lockfn_flink);
+                        if (failed_write) status = errno;
+                        failed_write |= write(fd, leasebuf, strlen(leasebuf)) !=
+                                       strlen(leasebuf);
+                        if (failed_write) status = errno;
+
+                        /* close only a descriptor that was really opened */
+                        clstat = close(fd);
+                        if (unlikely(clstat == -1)) {
+                                LOG("fail on close\n");
+                        }
+                        if (failed_write) {
+                                XSLEEP;
+                                *retstatus = LOCK_EUPDATE;
+                                goto try_again;
+                        }
+                }
+        }
+
+        if (!*retstatus && force && (stealx || stealw || stealr)) {
+                struct timeval timeout;
+
+                /* enforce quiet time on steal */
+                timeout.tv_sec = *lease_time;
+                timeout.tv_usec = 0;
+                select(0, 0, 0, 0, &timeout);
+        }
+
+        /*
+         * remove exclusive lock, final read/write locks will hold
+         * NOTE(review): this also runs on failure paths where the exclusive
+         * lock file was not created by this call -- confirm that is intended.
+         */
+        if (lockfn) {
+                tmpstat = unlink(lockfn);
+                if (unlikely(tmpstat == -1)) {
+                        LOG("error removing exclusive lock file %s",
+                            lockfn);
+                }
+        }
+
+        free(lockfn);
+        free(lockfn_xlink);
+        free(lockfn_flink);
+
+        /* set lease time to -1 if error, so no one is apt to use it */
+        if (*retstatus < 0) *lease_time = -1;
+
+        LOG("returning status %d, errno=%d\n", status, errno);
+        return status;
+}
+
+
+/*
+ * Release the final lock held by uuid on fn_to_unlock by unlinking
+ * "<fn>.xenlk.<host>.<uuid>.f<r|w>".
+ *
+ * *status receives LOCK_OK, LOCK_EBADPARM (NULL argument), LOCK_ENOMEM
+ * (link name could not be built) or LOCK_ENOLOCK (unlink failed).
+ * Returns the errno of a failed unlink(), 0 in every other case.
+ */
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *status)
+{
+        char *flink;
+        int saved_errno = 0;
+
+        if (fn_to_unlock == NULL || uuid == NULL) {
+                *status = LOCK_EBADPARM;
+                return 0;
+        }
+
+        flink = create_lockfn_link(fn_to_unlock, LFFL_FORMAT, uuid, readonly);
+        if (unlikely(flink == NULL)) {
+                *status = LOCK_ENOMEM;
+                return 0;
+        }
+
+        if (unlink(flink) != -1) {
+                *status = LOCK_OK;
+        } else {
+                LOG("error removing linked lock file %s", flink);
+                saved_errno = errno;
+                *status = LOCK_ENOLOCK;
+        }
+
+        free(flink);
+        return saved_errno;
+}
+
+/*
+ * Report how long ago the most recently refreshed lock file for fn was
+ * touched.
+ *
+ * "Now" is taken from the filesystem via NFSnormalizedStatTime(); every
+ * directory entry whose name starts with the base name of fn (other than
+ * fn itself) is lstat()ed and the smallest non-negative age in seconds is
+ * kept.
+ *
+ *   ret_lease - smallest age found, or a negative LOCK_E* code
+ *               (LOCK_ENOLOCK when no lock file exists)
+ *   max_lease - lease time parsed from the text after the last '.' in the
+ *               first lock file whose contents contain a '.'; -1 on error
+ *
+ * Returns 0 or the errno of a failing system call.
+ */
+int lock_delta(char *fn, int *ret_lease, int *max_lease)
+{
+        int reterrno = 0;
+        DIR *pd = 0;
+        struct dirent *dptr;
+        char *ptr;
+        int result = INT_MAX;
+        struct stat statbuf, statnow;
+        char *dirname = 0;
+        char *fpath = 0;
+        int elt_established = 0;
+        char *dotptr;
+        char tmpbuf[4096];
+        int fd;
+
+        /* validate fn before touching it (strlen(NULL) would crash) */
+        if (fn)
+                dirname = malloc(strlen(fn) + 2);
+        if (!fn || !dirname) {
+                *ret_lease = LOCK_EBADPARM;
+                *max_lease = -1;
+                return 0;
+        }
+
+        if (NFSnormalizedStatTime(fn, &statnow, &reterrno)) {
+                result = LOCK_ESTAT;
+                goto finish;
+        }
+
+        /* get directory; ptr ends up at the base name of fn */
+        ptr = strrchr(fn, '/');
+        if (!ptr) {
+                strcpy(dirname, ".");
+                ptr = fn;
+        } else if (ptr == fn) {
+                /* file lives directly in the root directory */
+                strcpy(dirname, "/");
+                ptr += 1;
+        } else {
+                size_t numbytes = ptr - fn;
+                memcpy(dirname, fn, numbytes);
+                /* the copy above does not terminate the string */
+                dirname[numbytes] = '\0';
+                ptr += 1;
+        }
+        pd = opendir(dirname);
+        if (!pd) { reterrno = errno; goto finish; }
+
+        dptr = readdir(pd);
+        while (dptr) {
+                if (strcmp(dptr->d_name, ptr) &&
+                    !strncmp(dptr->d_name, ptr,  strlen(ptr))) {
+                        fpath = malloc(strlen(dptr->d_name) +
+                                       strlen(dirname) + 2);
+                        if (!fpath) {
+                            result = LOCK_ENOMEM;
+                            goto finish;
+                        }
+                        strcpy(fpath, dirname);
+                        strcat(fpath, "/");
+                        strcat(fpath, dptr->d_name);
+                        if (lstat(fpath, &statbuf) != -1) {
+                                int diff = (int)statnow.st_mtime -
+                                           (int)statbuf.st_mtime;
+                                /* adjust diff if someone updated the lock
+                                   between now and when we created the "now"
+                                   file
+                                 */
+                                diff = (diff < 0) ? 0 : diff;
+                                result = diff < result ? diff : result;
+                        } else {
+                            /* fpath and pd are released at finish */
+                            reterrno = errno;
+                            goto finish;
+                        }
+
+                        if (!elt_established) {
+                            /* read final lock file and extract lease time */
+                            memset(tmpbuf, 0, sizeof(tmpbuf));
+                            fd = open(fpath, O_RDONLY, 0644);
+                            if (fd != -1) {
+                                /* keep last byte zero: tmpbuf stays a string */
+                                if (read(fd, tmpbuf, sizeof(tmpbuf) - 1) < 0) {
+                                    /* error on read? */
+                                }
+                                close(fd);
+                            }
+                            dotptr = strrchr(tmpbuf, '.');
+                            if (dotptr) {
+                                *max_lease = atoi(dotptr+1);
+                                elt_established = 1;
+                            }
+                        }
+
+                        free(fpath);
+                        fpath = 0;
+                }
+                dptr = readdir(pd);
+        }
+
+finish:
+        /* single exit: everything acquired above is released here */
+        if (pd)
+                closedir(pd);
+        free(fpath);
+        free(dirname);
+
+        /* returns smallest lock time, or error */
+        if (result == INT_MAX) result = LOCK_ENOLOCK;
+
+        /* set lease time to -1 if error, so no one is apt to use it */
+        if ((result < 0) || reterrno) *max_lease = -1;
+        *ret_lease = result;
+        return reterrno;
+}
+
+#if defined(TEST)
+/*
+ * the following is for sanity testing.
+ */
+
+/*
+ * Print the command line summary of the TEST driver to stdout.
+ * prg is the program name shown on the first line.
+ */
+static void usage(char *prg)
+{
+        /* d, t and r each take just a file name (bracket was unbalanced) */
+        printf("usage %s\n"
+               "    [d|t|r] <filename>\n"
+               "    p <filename> [num iterations]\n"
+               "    u <filename> [0|1] [<uniqid>]\n"
+               "    l <filename> [0|1] [0|1] [<uniqid>] [<leasetime>]\n", prg);
+        printf("        p : perf test lock take and reassert\n");
+        printf("        d : delta lock time\n");
+        printf("        t : test the file (after random locks)\n");
+        printf("        r : random lock tests (must ^C)\n");
+        printf("        u : unlock, readonly? uniqID (default is PID)\n");
+        printf("        l : lock, readonly? force?, uniqID (default is PID), lease time\n");
+}
+
+/*
+ * Check a file written by random_locks(): each line is "count pid time"
+ * and count must increase by exactly one per line, starting at 0.  Any
+ * gap is reported through LOG.
+ */
+static void test_file(char *fn)
+{
+        FILE *fptr;
+        int prev_count = 0;
+        int count, pid, stamp;
+
+        fptr = fopen(fn, "r");
+        if (!fptr) {
+                LOG("ERROR on file %s open, errno=%d\n", fn, errno);
+                return;
+        }
+
+        /*
+         * stop on EOF or on the first malformed line; looping on feof()
+         * alone would use uninitialised values and never terminate on a
+         * line fscanf() cannot consume
+         */
+        while (fscanf(fptr, "%d %d %d\n", &count, &pid, &stamp) == 3) {
+                if (prev_count != count) {
+                        LOG("ERROR: prev_count=%d, count=%d, pid=%d, time=%d\n",
+                                    prev_count, count, pid, stamp);
+                }
+                prev_count = count + 1;
+        }
+
+        fclose(fptr);
+}
+
+/*
+ * Stress test: forever take a reader or writer lock on fn at random
+ * intervals.  While holding a writer lock, read the last "count pid time"
+ * line of fn and append a new line with count + 1, so that test_file()
+ * can later detect lost updates.  Only returns if the scratch buffer
+ * cannot be allocated; otherwise the process must be killed.
+ */
+static void random_locks(char *fn)
+{
+        int pid = getpid();
+        int status;
+        int sysstatus;
+        /* 256 data bytes plus a terminator for sscanf() */
+        char *filebuf = malloc(257);
+        int count = 0;
+        int dummy;
+        int clstat;
+        char uuid[12];
+        int readonly;
+        int lease = DEFAULT_LEASE_TIME_SECS;
+        int err;
+
+        if (!filebuf)
+                return;
+
+        /* this will never return, kill to exit */
+
+        srandom((int)time(0) ^ pid);
+
+        LOG("pid: %d using file %s\n", pid, fn);
+        sprintf(uuid, "%08d", pid);
+
+        while (1) {
+                XSLEEP;
+                readonly = random()  & 1;
+                /* lock() reports its result through a pointer to status */
+                sysstatus = lock(fn, uuid, 0, readonly, &lease, &status);
+                (void)sysstatus;
+                if (status == LOCK_OK) {
+                        /* got lock, open, read, modify write close file */
+                        int fd = open(fn, O_RDWR, 0644);
+                        if (fd == -1) {
+                                LOG("pid: %d ERROR on file %s open, errno=%d\n",
+                                    pid, fn, errno);
+                        } else {
+                            if (!readonly) {
+                                /* ugly code to read data in test format */
+                                /* format is "%d %d %d" 'count pid time' */
+                                struct stat statbuf;
+                                int bytes;
+                                status = stat(fn, &statbuf);
+                                if (status != -1) {
+                                        if (statbuf.st_size > 256) {
+                                                lseek(fd, -256, SEEK_END);
+                                        }
+                                        memset(filebuf, 0, 257);
+                                        bytes = read(fd, filebuf, 256);
+                                        if (bytes > 0) {
+                                                /* start before the trailing
+                                                   newline, never below 0 */
+                                                int bw = bytes >= 2 ? bytes-2 : 0;
+                                                while (bw && filebuf[bw]!='\n')
+                                                        bw--;
+                                                if (!bw) bw = -1;
+                                                sscanf(&filebuf[bw+1],
+                                                       "%d %d %d",
+                                                       &count, &dummy, &dummy);
+                                                count += 1;
+                                        }
+                                        lseek(fd, 0, SEEK_END);
+                                        sprintf(filebuf, "%d %d %d\n",
+                                                count, pid, (int)time(0));
+                                        write(fd, filebuf, strlen(filebuf));
+                                } else {
+                                        LOG("pid: %d ERROR on file %s stat, "
+                                            "errno=%d\n", pid, fn, errno);
+                                }
+                            }
+                            clstat = close(fd);
+                            if (unlikely(clstat == -1)) {
+                                    LOG("fail on close\n");
+                            }
+                        }
+                        XSLEEP;
+                        err = unlock(fn, uuid, readonly, &status);
+                        LOG("unlock status is %d (err=%d)\n", status, err);
+                }
+        }
+}
+
+/*
+ * Request the (read-write, unforced) lock on 'fn' 'loops' times in a row
+ * under one pid-derived id, then release it once.
+ *
+ * fn:    path of the file to lock.
+ * loops: number of lock requests to issue.
+ *
+ * Stops early, reporting the iteration and errno, as soon as a request
+ * reports a negative status.
+ */
+static void perf_lock(char *fn, int loops)
+{
+    int status;
+    char buf[12];       /* same size as the uuid buffers used elsewhere */
+    int start = loops;
+    int lease = DEFAULT_LEASE_TIME_SECS;
+
+    sprintf(buf, "%08d", getpid());
+
+    while (loops--) {
+        /* the lock result is delivered through the last argument */
+        lock(fn, buf, 0, 0, &lease, &status);
+        if (status < 0) {
+            printf("failed to get lock at iteration %d errno=%d\n", 
+                   start - loops, errno);
+            return;
+        }
+    }
+    unlock(fn, buf, 0, &status);
+}
+
+/*
+ * Test driver.  argv[1] selects the operation and argv[2] names the file:
+ *   d  print the lock delta and the maximum lease
+ *   t  run test_file()
+ *   r  run random_locks()
+ *   p  run perf_lock(); argv[3] optionally gives the loop count
+ *   l  lock:   [force] [readonly] [uniqid] [lease]
+ *   u  unlock: [readonly uniqid]
+ *
+ * Returns the value of the lock_delta/lock/unlock call that was made, or 0
+ * for the operations that produce none.
+ */
+int main(int argc, char *argv[])
+{
+        int status = 0;     /* stays 0 on the paths that set no status */
+        char *ptr;
+        char uuid[12];
+        int force;
+        int readonly;
+        int max_lease, cur_lease;
+        int intstatus;
+        int lease = DEFAULT_LEASE_TIME_SECS;
+
+        if (argc < 3) {
+                usage(argv[0]);
+                return 0;
+        }
+
+        /* default unique id is derived from the pid */
+        sprintf(uuid, "%08d", getpid());
+        ptr = uuid;
+
+        if (!strcmp(argv[1],"d")) {
+                status = lock_delta(argv[2], &cur_lease, &max_lease);
+
+                printf("lock delta for %s is %d seconds, max lease is %d\n", 
+                       argv[2], cur_lease, max_lease);
+        } else if (!strcmp(argv[1],"t")) {
+                test_file(argv[2]);
+        } else if (!strcmp(argv[1],"r")) {
+                random_locks(argv[2]);
+        } else if (!strcmp(argv[1],"p")) {
+                /* argv[3] only exists when at least four arguments are given */
+                perf_lock(argv[2], argc < 4 ? 100000 : atoi(argv[3]));
+        } else if (!strcmp(argv[1],"l")) {
+                if (argc < 4) force = 0; else force = atoi(argv[3]);
+                if (argc < 5) readonly = 0; else readonly = atoi(argv[4]);
+                if (argc >= 6) ptr = argv[5];
+                if (argc == 7) lease = atoi(argv[6]);
+                /* lock() takes force before readonly */
+                status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+                printf("lock status = %d\n", status);
+        } else if (!strcmp(argv[1],"u") ) {
+                if (argc < 5) readonly = 0; else readonly = atoi(argv[3]);
+                if (argc == 5) ptr = argv[4];
+                status = unlock(argv[2], ptr, readonly, &intstatus);
+                printf("unlock status = %d\n", intstatus);
+        } else {
+                usage(argv[0]);
+        }
+
+        return status;
+}
+#elif defined(UTIL)
+/*
+ * the following is used for the non-library, standalone
+ * utility program, which is meant to be run from a shell
+ */
+
+/*
+ * Print the command synopsis for the standalone utility, followed by a short
+ * description of each sub-command, on stdout.
+ *
+ * prg: program name to show in the synopsis.
+ */
+static void usage(char *prg)
+{
+        /* descriptive lines, emitted verbatim after the synopsis */
+        static const char *const details[] = {
+                "        delta : get time since lock last refreshed\n",
+                "                returns delta time and max lease time in seconds\n",
+                "        unlock: unlock request filename, r|w,  uniqID\n",
+                "                returns status (success is 0)\n",
+                "        lock  : lock request filename,  r|w, force?, uniqID, lease time request\n",
+                "                returns status (success is 0) and established lease time in seconds\n",
+        };
+        unsigned int line;
+
+        printf("usage %s\n"
+               "    delta <filename>\n"
+               "    unlock <filename> <r|w> <uniqid>\n"
+               "    lock <filename> <r|w> <0|1> <uniqid> <leasetime>\n", prg);
+
+        for (line = 0; line < sizeof(details) / sizeof(details[0]); line++)
+                printf("%s", details[line]);
+}
+
+/*
+ * Standalone utility entry point.  Results are printed on stdout as bare
+ * numbers:
+ *   delta <file>                               -> "<cur_lease> <max_lease>"
+ *   lock <file> <r|w> <0|1> <uniqid> <lease>   -> "<status> <lease>"
+ *   unlock <file> <r|w> <uniqid>               -> "<status>"
+ * Any other invocation prints LOCK_EUSAGE ("-h" alone prints the usage text).
+ *
+ * The process exit value is whatever lock_delta/lock/unlock returned, or 0
+ * when none of them was called.
+ */
+int main(int argc, char *argv[])
+{
+        int status = 0;
+        char *ptr;
+        int force;
+        int readonly;
+        /* zeroed so that a defined value is printed even if the callee
+         * returns without filling them in */
+        int cur_lease = 0, max_lease = 0, intstatus = 0;
+        int lease = DEFAULT_LEASE_TIME_SECS;
+
+        if (argc < 3) {
+                if (argc == 2 && !strcmp(argv[1], "-h")) {
+                    usage(argv[0]);
+                } else {
+                    printf("%d\n", LOCK_EUSAGE);
+                }
+                return 0;
+        }
+
+        if (!strcmp(argv[1],"delta") && (argc == 3)) {
+                status = lock_delta(argv[2], &cur_lease, &max_lease);
+                printf("%d %d\n", cur_lease, max_lease);
+        } else if (!strcmp(argv[1],"lock") && (argc == 7)) {
+                /* anything other than "r" is treated as a write lock */
+                readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0;
+                force = atoi(argv[4]);
+                ptr = argv[5];
+                lease = atoi(argv[6]);
+                status = lock(argv[2], ptr, force, readonly, &lease, &intstatus);
+                printf("%d %d\n", intstatus, lease);
+        } else if (!strcmp(argv[1],"unlock") && (argc == 5)) {
+                readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0;
+                ptr = argv[4];
+                status = unlock(argv[2], ptr, readonly, &intstatus);
+                printf("%d\n", intstatus);
+        } else {
+                printf("%d\n", LOCK_EUSAGE);
+        }
+
+        /* this is either 0 or a system defined errno */
+        return status;
+}
+#endif
diff --git a/tools/blktap2/drivers/lock.h b/tools/blktap2/drivers/lock.h
new file mode 100644 (file)
index 0000000..98baaaa
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __LOCK_H__
+#define __LOCK_H__
+
+/* lease time, in seconds, that the callers in lock.c start out with */
+#define DEFAULT_LEASE_TIME_SECS 30
+
+/*
+ * Each call hands back a lock_error code through its last pointer argument
+ * (callers compare it against LOCK_OK).  The int return value is described
+ * by the utility in lock.c as "either 0 or a system defined errno".
+ *
+ * lock():       request the lock on fn_to_lock for owner 'uuid'; 'force' and
+ *               'readonly' are flags, *lease_time is the requested lease in
+ *               seconds and is reported back as the established lease.
+ * unlock():     release the lock held on fn_to_unlock by 'uuid'.
+ * lock_delta(): report the time since the lock was last refreshed and the
+ *               maximum lease time, both in seconds.
+ */
+int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstat);
+int unlock(char *fn_to_unlock, char *uuid, int readonly, int *retstat);
+int lock_delta(char *fn_to_check, int *cur_lease_time, int *max_lease_time);
+
+/* status codes: LOCK_OK on success, a distinct negative value per failure */
+typedef enum {
+    LOCK_OK          =  0,
+    LOCK_EBADPARM    = -1,
+    LOCK_ENOMEM      = -2,
+    LOCK_ESTAT       = -3,
+    LOCK_EHELD_WR    = -4,
+    LOCK_EHELD_RD    = -5,
+    LOCK_EOPEN       = -6,
+    LOCK_EXLOCK_OPEN = -7,
+    LOCK_EXLOCK_WRITE= -8,
+    LOCK_EINODE      = -9,
+    LOCK_EUPDATE     = -10,
+    LOCK_EREAD       = -11,
+    LOCK_EREMOVE     = -12,
+    LOCK_ENOLOCK     = -13,
+    LOCK_EUSAGE      = -14,
+} lock_error;
+
+#endif /* __LOCK_H__ */
diff --git a/tools/blktap2/drivers/log.h b/tools/blktap2/drivers/log.h
new file mode 100644 (file)
index 0000000..8f00df4
--- /dev/null
@@ -0,0 +1,123 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* log.h: API for writelog communication */
+
+#ifndef __LOG_H__
+#define __LOG_H__ 1
+
+#include <inttypes.h>
+
+#include <xen/io/ring.h>
+/* for wmb et al */
+#include <xenctrl.h>
+
+#define LOGCMD_SHMP  "shmp"
+#define LOGCMD_PEEK  "peek"
+#define LOGCMD_CLEAR "clrw"
+#define LOGCMD_GET   "getw"
+#define LOGCMD_KICK  "kick"
+
+#define CTLRSPLEN_SHMP  256
+#define CTLRSPLEN_PEEK  4
+#define CTLRSPLEN_CLEAR 4
+#define CTLRSPLEN_GET   4
+#define CTLRSPLEN_KICK  0
+
+/* shmregion is arbitrarily capped at 8 megs for a minimum of
+ * 64 MB of data per read (if there are no contiguous regions)
+ * In the off-chance that there is more dirty data, multiple
+ * reads must be done */
+#define SHMSIZE (8 * 1024 * 1024)
+#define SRINGSIZE 4096
+
+/* The shared memory region is split up into 3 subregions:
+ * The first half is reserved for the dirty bitmap log.
+ * The second half begins with 1 page for read request descriptors,
+ * followed by a big area for supplying read data.
+ */
+/* The dirty bitmap log sits at the very start of the shared region. */
+static inline void* bmstart(void* shm)
+{
+  void* bitmap = shm;
+
+  return bitmap;
+}
+
+/* End of the dirty bitmap log: the midpoint of the shared region. */
+static inline void* bmend(void* shm)
+{
+  /* byte arithmetic on char*: arithmetic on void* is only a GNU extension */
+  return (char*)shm + SHMSIZE/2;
+}
+
+/* The request ring begins exactly where the bitmap log ends. */
+static inline void* sringstart(void* shm)
+{
+  void* ring = bmend(shm);
+
+  return ring;
+}
+
+/* Start of the read data area: SRINGSIZE bytes past the start of the ring. */
+static inline void* sdatastart(void* shm)
+{
+  /* byte arithmetic on char*: arithmetic on void* is only a GNU extension */
+  return (char*)sringstart(shm) + SRINGSIZE;
+}
+
+/* One past the last byte of the data area, i.e. the end of the region. */
+static inline void* sdataend(void* shm)
+{
+  /* byte arithmetic on char*: arithmetic on void* is only a GNU extension */
+  return (char*)shm + SHMSIZE;
+}
+
+/* format for messages between log client and server */
+/* NOTE(review): msg is exactly as long as the four-character LOGCMD_*
+ * strings, so it presumably carries one of them without a terminating
+ * NUL - confirm against the sender */
+struct log_ctlmsg {
+  char msg[4];
+  char params[16];
+};
+
+/* extent descriptor */
+/* NOTE(review): presumably 'sector' is the first sector of the extent and
+ * 'count' its length in sectors - confirm against the users */
+struct disk_range {
+  uint64_t sector;
+  uint32_t count;
+};
+
+/* dirty write logging space. This is an extent ring at the front,
+ * full of disk_ranges plus a pointer into the data area */
+/* I think I'd rather have the header in front of each data section to
+ * avoid having two separate spaces that can run out, but then I'd either
+ * lose page alignment on the data blocks or spend an entire page on the
+ * header */
+
+/* one ring entry: the same sector/count pair as disk_range plus the position
+ * of the extent's data; 8 + 4 + 4 bytes */
+struct log_extent {
+  uint64_t sector;
+  uint32_t count;
+  uint32_t offset; /* offset from start of data area to start of extent */
+};
+
+/* struct above should be 16 bytes, or 256 extents/page */
+
+typedef struct log_extent log_request_t;
+typedef struct log_extent log_response_t;
+
+DEFINE_RING_TYPES(log, log_request_t, log_response_t);
+
+#define LOG_HEADER_PAGES 4
+
+#endif
diff --git a/tools/blktap2/drivers/profile.h b/tools/blktap2/drivers/profile.h
new file mode 100644 (file)
index 0000000..f628ba2
--- /dev/null
@@ -0,0 +1,191 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __TAP_PROFILE_H__
+#define __TAP_PROFILE_H__
+
+#ifndef _GNU_SOURCE
+  #define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/time.h>
+#include <time.h>
+#include <fcntl.h>
+#include <inttypes.h>
+
+//#define PROFILING
+//#define LOGGING
+
+#define TAPPROF_IN  1
+#define TAPPROF_OUT 2
+
+/* per-function timing slot, one entry of profile_info.pt */
+struct profile_times {
+       /* strdup()ed function name; NULL while the slot is unused */
+       char    *fn_name;
+       /* in:      timestamp taken on entry, reset to 0 on exit
+        * out_sum: sum of (exit - entry) timestamps
+        * cnt:     number of completed entry/exit pairs */
+       uint64_t in, out_sum, cnt;
+};
+
+/* profiler state set up by tp_open() and torn down by tp_close() */
+struct profile_info {
+       FILE                 *log;   /* opened only when LOGGING is defined */
+       int                   size;  /* number of slots in pt */
+       char                 *name;  /* strdup()ed copy of the tap name */
+       unsigned long long    seq;   /* next id handed out by tp_get_id() */
+       struct profile_times *pt;    /* array of 'size' timing slots */
+};
+
+#ifdef PROFILING
+
+/*
+ * Initialise 'prof' with room for 'size' profiled functions.
+ *
+ * tap_name: label copied into prof->name.
+ * log_name: file opened for writing when LOGGING is defined.
+ * size:     number of timing slots to allocate.
+ *
+ * If the slot table cannot be allocated, prof->size is left at 0 so that
+ * tp_close() has nothing to walk.
+ */
+static inline void
+tp_open(struct profile_info *prof, char *tap_name, char *log_name, int size)
+{
+       memset(prof, 0, sizeof(struct profile_info));
+#ifdef LOGGING
+       prof->log  = fopen(log_name, "w");
+#endif
+       prof->name = strdup(tap_name);
+       if (size > 0)
+               prof->pt = calloc(size, sizeof(struct profile_times));
+       /* only advertise slots that really exist */
+       prof->size = prof->pt ? size : 0;
+}
+
+/*
+ * Report the call count and average elapsed time of every recorded function
+ * through syslog, then free the names, the slot table and the tap name and
+ * close the log file (when LOGGING is defined).
+ */
+static inline void
+tp_close(struct profile_info *prof)
+{
+       int i;
+       struct profile_times *pt;
+
+       /* the slot table is NULL if its allocation failed */
+       for (i = 0; prof->pt && i < prof->size; i++) {
+               pt = &prof->pt[i];
+               if (pt->fn_name) {
+                       /* the counters are uint64_t: cast to match %llu */
+                       syslog(LOG_DEBUG, "%s: %s: cnt: %llu, avg time: %llu\n",
+                              prof->name, pt->fn_name,
+                              (unsigned long long)pt->cnt,
+                              (unsigned long long)
+                              ((pt->cnt) ? (pt->out_sum / pt->cnt) : 0));
+                       free(pt->fn_name);
+               }
+       }
+
+#ifdef LOGGING
+       if (prof->log)
+               fclose(prof->log);
+#endif
+       free(prof->name);
+       free(prof->pt);         /* free(NULL) is a no-op */
+}
+
+/* Return the current sequence number of 'prof' and advance it by one. */
+static inline uint64_t
+tp_get_id(struct profile_info *prof)
+{
+       /* uint64_t comes from <inttypes.h>, which this header includes;
+        * no u64 typedef is provided by anything included here */
+       return prof->seq++;
+}
+
+/*
+ * Find the slot index for function 'name': the slot already carrying that
+ * name, or else the first unused slot met while scanning from the front.
+ * When every slot is taken by another name the last index (size - 1) is
+ * returned.
+ */
+static inline int
+tp_fn_id(struct profile_info *prof, const char *name)
+{
+       int slot = 0;
+
+       while (slot < prof->size) {
+               const char *known = prof->pt[slot].fn_name;
+
+               /* an unused slot ends the search just like a name match */
+               if (known == NULL || strcmp(known, name) == 0)
+                       return slot;
+               slot++;
+       }
+
+       return prof->size - 1;
+}
+
+/*
+ * Record the entry timestamp for function 'func', claiming a slot (and
+ * copying the name into it) the first time the function is seen.
+ */
+static inline void
+__tp_in(struct profile_info *prof, const char *func)
+{
+       uint32_t lo, hi;
+       int idx = tp_fn_id(prof, func);
+       struct profile_times *pt;
+
+       /* tp_fn_id() returns size - 1, i.e. -1 for an empty table */
+       if (idx < 0)
+               return;
+       pt = &prof->pt[idx];
+
+       if (!pt->fn_name) 
+               pt->fn_name = strdup(func);
+
+       /* 0x0f 0x31 is rdtsc, which leaves the counter in EDX:EAX.  The "A"
+        * constraint names that register pair only on 32-bit x86, so the
+        * two halves are read explicitly and combined. */
+       asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+       pt->in = ((uint64_t)hi << 32) | lo;
+}
+
+#define tp_in(prof) __tp_in(prof, __func__)
+
+/*
+ * Close the measurement opened by __tp_in() for function 'func': add the
+ * elapsed timestamp difference to the slot's total and count the call.
+ * Does nothing if the function has no slot or no entry timestamp pending.
+ */
+static inline void
+__tp_out(struct profile_info *prof, const char *func)  
+{
+       uint32_t lo, hi;
+       uint64_t now;
+       int idx = tp_fn_id(prof, func);
+       struct profile_times *pt;
+
+       /* tp_fn_id() returns size - 1, i.e. -1 for an empty table */
+       if (idx < 0)
+               return;
+       pt = &prof->pt[idx];
+
+       if (!pt->fn_name || !pt->in)
+               return;
+
+       /* 0x0f 0x31 is rdtsc, which leaves the counter in EDX:EAX.  The "A"
+        * constraint names that register pair only on 32-bit x86, so the
+        * two halves are read explicitly and combined. */
+       asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+       now = ((uint64_t)hi << 32) | lo;
+
+       pt->cnt++;
+       pt->out_sum += (now - pt->in);
+       pt->in       = 0;       /* marks the slot as "not inside" again */
+}
+
+#define tp_out(prof) __tp_out(prof, __func__)
+
+/*
+ * Record an entry (TAPPROF_IN) or exit (any other direction) event for
+ * function 'func' and, when LOGGING is defined and the log file is open,
+ * write a line with the function, direction, request id and timestamp.
+ */
+static inline void
+__tp_log(struct profile_info *prof, uint64_t id, const char *func, int direction)
+{
+       uint32_t lo, hi;
+       long long _time;
+
+       /* 0x0f 0x31 is rdtsc, which leaves the counter in EDX:EAX.  The "A"
+        * constraint names that register pair only on 32-bit x86, so the
+        * two halves are read explicitly and combined. */
+       asm volatile(".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
+       _time = (long long)(((uint64_t)hi << 32) | lo);
+
+       if (direction == TAPPROF_IN)
+               __tp_in(prof, func);
+       else 
+               __tp_out(prof, func);
+
+#ifdef LOGGING
+        if (prof->log)
+               fprintf(prof->log, "%s: %s: %llu, %lld\n", func, 
+                       ((direction == TAPPROF_IN) ? "in" : "out"),
+                       (unsigned long long)id, _time);
+#else
+       /* without LOGGING the timestamp and id are not consumed */
+       (void)_time;
+       (void)id;
+#endif
+}
+
+#define tp_log(prof, id, direction) __tp_log(prof, id, __func__, direction)
+
+#else
+#define tp_open(prof, tname, lname, size)  ((void)0)
+#define tp_close(prof)                     ((void)0)
+#define tp_in(prof)                        ((void)0)
+#define tp_out(prof)                       ((void)0)
+#define tp_log(prof, sec, direction)       ((void)0)
+#endif
+
+#endif
diff --git a/tools/blktap2/drivers/qcow-create.c b/tools/blktap2/drivers/qcow-create.c
new file mode 100644 (file)
index 0000000..6a641af
--- /dev/null
@@ -0,0 +1,121 @@
+/* qcow-create.c
+ *
+ * Generates a qcow format disk.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include "tapdisk.h"
+#include "qcow.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define MAX_NAME_LEN 1000
+
+/* Print the version banner and usage synopsis on stderr, then exit(-1). */
+void help(void)
+{
+       static const char banner[] = "Qcow-utils: v1.0.0\n";
+       static const char synopsis[] =
+               "usage: qcow-create [-h help] [-r reserve] <SIZE(MB)> <FILENAME> "
+               "[<BACKING_FILENAME>]\n";
+
+       fputs(banner, stderr);
+       fputs(synopsis, stderr);
+       exit(-1);
+}
+
+/*
+ * qcow-create [-h] [-r] <SIZE(MB)> <FILENAME> [<BACKING_FILENAME>]
+ *
+ * Parses the options (-r clears the 'sparse' flag passed to qcow_create(),
+ * -h prints the usage text), converts the size from megabytes to bytes and
+ * calls qcow_create() with or without a backing file.
+ *
+ * Returns 0 when the image was created and 1 when qcow_create() failed;
+ * usage and argument errors terminate through help() or exit(-1).
+ */
+int main(int argc, char *argv[])
+{
+       int ret = -1, c, backed = 0;
+       int sparse =  1;
+       uint64_t size;
+       long long mbytes;
+       char *end;
+       char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN];
+
+       for(;;) {
+               c = getopt(argc, argv, "hr");
+               if (c == -1)
+                       break;
+               switch(c) {
+               case 'h':
+                       help();         /* does not return */
+                       break;
+               case 'r':
+                       sparse = 0;
+                       break;
+               default:
+                       fprintf(stderr, "Unknown option\n");
+                       help();
+               }
+       }
+
+       printf("Optind %d, argc %d\n", optind, argc);
+       if ( !(optind == (argc - 2) || optind == (argc - 3)) )
+               help();
+
+       /* reject sizes that are not numbers, are negative, or would overflow
+        * once converted from megabytes to bytes */
+       errno = 0;
+       mbytes = strtoll(argv[optind], &end, 10);
+       if (errno || end == argv[optind] || mbytes < 0 ||
+           mbytes > (long long)(~(uint64_t)0 >> 20)) {
+               fprintf(stderr, "Invalid size: %s\n", argv[optind]);
+               exit(-1);
+       }
+       optind++;
+       size = (uint64_t)mbytes << 20;
+
+       if (snprintf(filename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+               MAX_NAME_LEN) {
+               fprintf(stderr,"Device name too long\n");
+               exit(-1);
+       }
+
+       if (optind != argc) {
+               /*Backing file argument*/
+               backed = 1;
+               if (snprintf(bfilename, MAX_NAME_LEN, "%s",argv[optind++]) >=
+                       MAX_NAME_LEN) {
+                       fprintf(stderr,"Device name too long\n");
+                       exit(-1);
+               }
+       }
+
+       DFPRINTF("Creating file size %"PRIu64", name %s\n",(uint64_t)size, filename);
+       if (!backed)
+               ret = qcow_create(filename,size,NULL,sparse);
+       else
+               ret = qcow_create(filename,size,bfilename,sparse);
+
+       if (ret < 0) {
+               DPRINTF("Unable to create QCOW file\n");
+               /* let callers and scripts see the failure */
+               return 1;
+       }
+
+       DPRINTF("QCOW file successfully created\n");
+       return 0;
+}
diff --git a/tools/blktap2/drivers/qcow.h b/tools/blktap2/drivers/qcow.h
new file mode 100644 (file)
index 0000000..a88f1d5
--- /dev/null
@@ -0,0 +1,131 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _QCOW_H_
+#define _QCOW_H_
+
+#include "aes.h"
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define XEN_MAGIC  (('X' << 24) | ('E' << 16) | ('N' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0x00
+#define QCOW_CRYPT_AES  0x01
+
+/* bit 63; unsigned because shifting 1 into the sign bit of a signed
+ * long long is undefined behaviour */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
+#define SPARSE_FILE 0x01
+#define EXTHDR_L1_BIG_ENDIAN 0x02
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+/*
+ * Round l up to the next multiple of s; the result is a uint64_t.
+ * The arguments are parenthesised so that expressions such as
+ * ROUNDUP(a + b, c + d) expand correctly; each one is still evaluated more
+ * than once, so they must not have side effects.
+ */
+#define ROUNDUP(l, s) \
+    ((uint64_t)( \
+        ((l) + ((s) - 1)) - (((l) + ((s) - 1)) % (s))))
+
+/* QCOW image header.
+ * NOTE(review): presumably mirrors the header stored at the start of the
+ * image file; byte order and packing are not visible here - confirm
+ * against the reader/writer code. */
+typedef struct QCowHeader {
+       uint32_t magic;
+       uint32_t version;
+       uint64_t backing_file_offset;
+       uint32_t backing_file_size;
+       uint32_t mtime;
+       uint64_t size; /* in bytes */
+       uint8_t cluster_bits;
+       uint8_t l2_bits;
+       uint32_t crypt_method;
+       uint64_t l1_table_offset;
+} QCowHeader;
+
+/*Extended header for Xen enhancements*/
+/* NOTE(review): xmagic presumably holds XEN_MAGIC and flags the SPARSE_FILE /
+ * EXTHDR_L1_BIG_ENDIAN bits defined above - confirm against the users */
+typedef struct QCowHeader_ext {
+        uint32_t xmagic;
+        uint32_t cksum;
+        uint32_t min_cluster_alloc;
+        uint32_t flags;
+} QCowHeader_ext;
+
+uint32_t gen_cksum(char *ptr, int len);
+int get_filesize(char *filename, uint64_t *size, struct stat *st);
+int qtruncate(int fd, off_t length, int sparse);
+
+#define L2_CACHE_SIZE 16  /*Fixed allocation in Qemu*/
+
+/* Per-image driver state for an open QCOW file: file bookkeeping, geometry
+ * taken from the header, the L1 table and L2 cache, cluster buffers,
+ * encryption keys and the pool of asynchronous I/O requests. */
+struct tdqcow_state {
+        int fd;                        /*Main Qcow file descriptor */
+       uint64_t fd_end;               /*Store a local record of file length */
+       char *name;                    /*Record of the filename*/
+       uint32_t backing_file_size;
+       uint64_t backing_file_offset;
+       uint8_t extended;              /*File contains extended header*/
+       int encrypted;                 /*File contents are encrypted or plain*/
+       int cluster_bits;              /*Determines length of cluster as 
+                                       *indicated by file hdr*/
+       int cluster_size;              /*Length of cluster*/
+       int cluster_sectors;           /*Number of sectors per cluster*/
+       int cluster_alloc;             /*Blktap fix for allocating full 
+                                       *extents*/
+       int min_cluster_alloc;         /*Blktap historical extent alloc*/
+       int sparse;                    /*Indicates whether to preserve sparseness*/
+       int l2_bits;                   /*Size of L2 table entry*/
+       int l2_size;                   /*Full table size*/
+       int l1_size;                   /*L1 table size*/
+       uint64_t cluster_offset_mask;    
+       uint64_t l1_table_offset;      /*L1 table offset from beginning of 
+                                       *file*/
+       uint64_t *l1_table;            /*L1 table entries*/
+       uint64_t *l2_cache;            /*We maintain a cache of size 
+                                       *L2_CACHE_SIZE of most read entries*/
+       uint64_t l2_cache_offsets[L2_CACHE_SIZE];     /*L2 cache entries*/
+       uint32_t l2_cache_counts[L2_CACHE_SIZE];      /*Cache access record*/
+       uint8_t *cluster_cache;          
+       uint8_t *cluster_data;
+       uint64_t cluster_cache_offset; /**/
+       uint32_t crypt_method;         /*current crypt method, 0 if no 
+                                       *key yet */
+       uint32_t crypt_method_header;  /**/
+       AES_KEY aes_encrypt_key;       /*AES key*/
+       AES_KEY aes_decrypt_key;       /*AES key*/
+
+        /* libaio state */
+       /* NOTE(review): presumably aio_requests is the request pool and
+        * aio_free_list/aio_free_count track its unused entries - confirm */
+       int                  aio_free_count;    
+       int                  max_aio_reqs;
+       struct qcow_request   *aio_requests;
+       struct qcow_request  **aio_free_list;
+
+};
+
+/* Create a QCOW image 'filename' of total_size bytes.  qcow-create passes
+ * backing_file as NULL when there is no backing image, sparse as 0 when -r
+ * was given, and treats a negative return value as failure. */
+int qcow_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int sparse);
+
+#endif //_QCOW_H_
diff --git a/tools/blktap2/drivers/qcow2raw.c b/tools/blktap2/drivers/qcow2raw.c
new file mode 100644 (file)
index 0000000..689e7f5
--- /dev/null
@@ -0,0 +1,449 @@
+/* qcow2raw.c
+ *
+ * Generates raw image data from an existing qcow image
+ *
+ * (c) 2006 Julian Chesterfield and Andrew Warfield
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#include "bswap.h"
+#include "aes.h"
+#include "blk.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "qcow.h"
+
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+/* *BSD has no O_LARGEFILE */
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+#define TAPDISK 1
+#define BLOCK_PROCESSSZ 4096
+#define QCOW_VBD 0
+#define AIO_VBD 1
+#define WINDOW 32
+#define PROGRESS_QUANT 2
+
+static int running = 1, complete = 0; 
+static int returned_read_events = 0, returned_write_events = 0;
+static int submit_events = 0;
+static uint32_t read_idx = 0;
+td_driver_t *ddqcow, *ddaio;
+td_vbd_t* qcow_vbd, *aio_vbd;
+static uint64_t prev = 0, written = 0;
+static char output[(100/PROGRESS_QUANT) + 5];
+
+extern tapdisk_server_t server;
+
+/* Per-chunk bookkeeping handed to the read/write callbacks via treq.cb_data. */
+struct request_info {
+  void* buf;              /* data buffer: filled by the read, reused for the write */
+  uint64_t logical_sec;   /* first sector of the chunk; becomes treq.sec of the write */
+  int pending;            /* sectors still outstanding; callbacks subtract treq.secs */
+};
+
+/*
+ * Debug helper: hex-dump `length` bytes starting at `ptr` to stderr.
+ *
+ * Each byte is printed as two hex digits, with a space after every second
+ * byte and a line break after every 16th.  A NULL `ptr` or a non-positive
+ * `length` prints only the header line.
+ */
+static void print_bytes(void *ptr, int length)
+{
+       const unsigned char *p = ptr;
+       int k;
+
+       DFPRINTF("Buf dump, length %d:\n", length);
+
+       /* Nothing to dump from a NULL buffer. */
+       if (!p)
+               length = 0;
+
+       for (k = 0; k < length; k++) {
+               /* %02x keeps every byte two digits wide so the dump is unambiguous. */
+               DFPRINTF("%02x", p[k]);
+
+               /* Separators are keyed on the number of bytes printed so far. */
+               if ((k + 1) % 16 == 0)
+                       DFPRINTF("\n");
+               else if ((k + 1) % 2 == 0)
+                       DFPRINTF(" ");
+       }
+       DFPRINTF("\n");
+}
+
+/*
+ * Callback with the (id, mode, private) event-callback signature.  None of
+ * the three arguments is used; it only calls tapdisk_complete_tiocbs() on
+ * the global server's AIO queue.
+ */
+void queue_event(event_id_t id, char mode, void *private)
+{
+       tapdisk_complete_tiocbs(&server.aio_queue);
+       return;
+}
+
+/*
+ * Advance the textual progress bar held in `output` and reprint it.
+ *
+ * progress: amount of work done so far.
+ * size:     total amount of work, in the same unit as `progress`.
+ *
+ * The bar has 100/PROGRESS_QUANT steps.  At most one step is added per call,
+ * and only when `progress` has crossed the next step boundary.
+ */
+static void debug_output(uint64_t progress, uint64_t size)
+{
+       const uint64_t steps = 100 / PROGRESS_QUANT;
+       uint64_t blocks = size / steps;
+
+       /* Totals smaller than the step count would otherwise divide by zero. */
+       if (blocks == 0)
+               blocks = 1;
+
+       /*
+        * Once the bar is full, stop: another step would overwrite the closing
+        * ']' and the terminating NUL, and eventually run past output[].
+        */
+       if (prev >= steps)
+               return;
+
+       if (progress / blocks > prev) {
+               memcpy(output + prev + 1, "=>", 2);
+               prev++;
+               DFPRINTF("\r%s     %"PRIu64"%%",
+                        output, (uint64_t)((prev - 1) * PROGRESS_QUANT));
+       }
+}
+
+/*
+ * Completion callback for writes queued on the aio (destination) driver.
+ *
+ * treq: the completed request (or fragment); treq.cb_data carries the
+ *       struct request_info of the chunk it belongs to.
+ * err:  negative on failure.
+ *
+ * Adds the completed sectors to the global counters and, once every sector
+ * of the chunk has been written, frees the chunk's buffer and bookkeeping
+ * and refreshes the progress bar.
+ *
+ * NOTE(review): on failure only a message is printed; returned_write_events
+ * is not advanced, and main() loops until it equals submit_events -- confirm
+ * whether a failed write is meant to stall the tool.
+ */
+static void send_write_responses(td_request_t treq, int err)
+{
+       struct request_info *info = (struct request_info *)treq.cb_data;
+
+       if (err < 0) {
+               DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+               return;
+       }
+
+       returned_write_events += treq.secs;
+       written               += treq.secs;
+
+       /* A chunk may complete in several fragments; act on the last one only. */
+       info->pending -= treq.secs;
+       if (info->pending == 0) {
+               free(info->buf);
+               free(info);
+
+               debug_output(written, ddaio->info.size);
+       }
+}
+
+/*
+ * Completion callback for reads queued on the qcow (source) driver.
+ *
+ * treq: the completed request (or fragment); treq.cb_data carries the
+ *       struct request_info of the chunk it belongs to.
+ * err:  negative on failure (a message is printed and nothing else happens).
+ *
+ * Once every sector of the chunk has been read, the read's vbd request is
+ * handed to tapdisk_vbd_complete_vbd_request() and the same buffer is queued
+ * as a write of BLOCK_PROCESSSZ bytes to the aio image at the same sector.
+ * Exits the process if the write's bookkeeping cannot be allocated.
+ */
+static void send_read_responses(td_request_t treq, int err)
+{
+       struct request_info* req;
+       td_vbd_request_t* vreq;
+
+       if (err < 0)  {
+               DFPRINTF("AIO FAILURE: res [%d]!\n",err);
+               return;
+       }
+       returned_read_events+=treq.secs;
+
+       req= (struct request_info*)treq.cb_data;
+
+       //do nothing until all fragments complete.
+       req->pending-=treq.secs;
+
+       if(req->pending)
+               return;
+
+       //This read is done.
+       tapdisk_vbd_complete_vbd_request(qcow_vbd, treq.private);
+
+       //Allocate the write's vbd request up front; the original
+       //dereferenced the calloc() result without checking it.
+       vreq = calloc(1, sizeof(td_vbd_request_t));
+       if (!vreq) {
+               DFPRINTF("Unable to alloc memory for write request\n");
+               exit(-1);
+       }
+
+       //Start from a zeroed request so no field of the finished read
+       //leaks into the write.
+       memset(&treq, 0, sizeof(treq));
+
+       treq.op      = TD_OP_WRITE;
+       treq.buf     = req->buf;
+       treq.sec     = req->logical_sec;
+       treq.secs    = BLOCK_PROCESSSZ>>9;
+       treq.image   = tapdisk_vbd_first_image(aio_vbd);
+       treq.cb      = send_write_responses;
+       treq.id      = 0;
+       treq.sidx    = 0;
+
+       req->pending = BLOCK_PROCESSSZ>>9;
+       treq.cb_data = req;
+       treq.private = vreq;
+
+       //Put it in the VBD's queue, so we don't lose
+       //track of it.
+       vreq->submitting = 1;
+       INIT_LIST_HEAD(&vreq->next);
+       tapdisk_vbd_move_request(treq.private,
+                                &aio_vbd->pending_requests);
+
+       ddaio->ops->td_queue_write(ddaio,treq);
+       --vreq->submitting;
+
+       tapdisk_submit_all_tiocbs(&server.aio_queue);
+
+       return;
+}
+
+/*
+ * qcow2raw <destination> <qcow source image>
+ *
+ * Copies the contents of a qcow image into a raw file or block device.
+ *
+ *  1. Opens argv[2] read-only as a qcow VBD.
+ *  2. Prepares argv[1]: creates it if it does not exist, otherwise asks for
+ *     confirmation; regular files are truncated to the image size, block
+ *     devices are checked for sufficient capacity.
+ *  3. Opens argv[1] as an aio VBD.
+ *  4. Reads the source in BLOCK_PROCESSSZ chunks; send_read_responses()
+ *     turns every completed read into a write to the destination, and the
+ *     loop waits for each chunk's write to complete before issuing the next.
+ *
+ * Returns 0 on success; setup failures either return the error code or
+ * exit(-1).
+ */
+int main(int argc, const char *argv[])
+{
+       int ret = -1, fd;
+       uint64_t size;
+       uint64_t i;
+       char *buf;
+       struct stat finfo;
+       td_request_t treq;
+       td_vbd_request_t* vreq;
+       struct request_info* req;
+       int err;
+
+       if (argc != 3) {
+               fprintf(stderr, "Qcow-utils: v1.0.0\n");
+               fprintf(stderr, "usage: %s <Dest File descriptor> "
+                       "<Qcow SRC IMAGE>\n", 
+                      argv[0]);
+               exit(-1);
+       }
+
+       err = tapdisk_server_initialize(NULL, NULL);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't initialize server instance.\n");
+               return err;
+       }
+
+       err=tapdisk_vbd_initialize(-1,-1, QCOW_VBD);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't initialize qcow vbd.\n");
+               return err;
+       }
+
+       qcow_vbd = tapdisk_server_get_vbd(QCOW_VBD);
+       if (!qcow_vbd) {
+               err = -ENODEV;
+               DPRINTF("qcow2raw Couldn't create qcow vbd.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open_vdi(qcow_vbd, argv[2], DISK_TYPE_QCOW,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  TD_OPEN_RDONLY);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't open qcow file.\n");
+               return err;
+       }
+
+       ddqcow=(tapdisk_vbd_first_image(qcow_vbd))->driver;
+
+       /*Setup aio destination file*/
+       ret = stat(argv[1],&finfo);
+       if (ret == -1) {
+               /*Check errno*/
+               switch(errno) {
+               case ENOENT:
+                       /*File doesn't exist, create*/
+                       fd = open(argv[1], 
+                                 O_RDWR | O_LARGEFILE | O_CREAT, 0644);
+                       if (fd < 0) {
+                               DFPRINTF("ERROR creating file [%s] "
+                                        "(errno %d)\n",
+                                      argv[1], 0 - errno);
+                               exit(-1);
+                       }
+                       if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+                               DFPRINTF("Unable to create file "
+                                       "[%s] of size %"PRIu64" (errno %d). "
+                                        "Exiting...\n",
+                                       argv[1], 
+                                       (uint64_t)ddqcow->info.size<<9, 
+                                       0 - errno);
+                               close(fd);
+                               exit(-1);
+                       }
+                       close(fd);
+                       break;
+               case  ENXIO:
+                       DFPRINTF("ERROR Device [%s] does not exist\n",argv[1]);
+                       exit(-1);
+               default: 
+                       DFPRINTF("An error occurred opening Device [%s] "
+                                "(errno %d)\n",
+                              argv[1], 0 - errno);
+                       exit(-1);
+               }
+       } else {
+               fprintf(stderr, "WARNING: All existing data in "
+                       "%s will be overwritten.\nDo you wish to continue? "
+                       "(y or n)  ",
+                       argv[1]);
+               if (getchar() != 'y') {
+                       DFPRINTF("Exiting...\n");
+                       exit(-1);
+               }
+
+               /*TODO - Test the existing file or device for adequate space*/
+               fd = open(argv[1], O_RDWR | O_LARGEFILE);
+               if (fd < 0) {
+                       DFPRINTF("ERROR: opening file [%s] (errno %d)\n",
+                              argv[1], 0 - errno);
+                       exit(-1);
+               }
+
+               if (S_ISBLK(finfo.st_mode)) {
+                       if (blk_getimagesize(fd, &size) != 0) {
+                               close(fd);
+                               return -1;
+                       }
+
+                       if (size < ddqcow->info.size<<9) {
+                               DFPRINTF("ERROR: Not enough space on device "
+                                       "%s (%"PRIu64" bytes available, "
+                                       "%"PRIu64" bytes required\n",
+                                       argv[1], size, 
+                                       (uint64_t)ddqcow->info.size<<9);
+                               close(fd);
+                               exit(-1);
+                       }
+               } else {
+                       if (ftruncate(fd, (off_t)ddqcow->info.size<<9) < 0) {
+                               DFPRINTF("Unable to create file "
+                                       "[%s] of size %"PRIu64" (errno %d). "
+                                        "Exiting...\n",
+                                       argv[1], 
+                                       (uint64_t)ddqcow->info.size<<9, 
+                                        0 - errno);
+                               close(fd);
+                               exit(-1);
+                       } else DFPRINTF("File [%s] truncated to length %"PRIu64" "
+                                       "(%"PRIu64")\n", 
+                                      argv[1], 
+                                      (uint64_t)ddqcow->info.size<<9, 
+                                      (uint64_t)ddqcow->info.size);
+               }
+               close(fd);
+       }
+
+       //Now the output file should be there, reopen it as an aio VBD
+       err=tapdisk_vbd_initialize(-1,-1, AIO_VBD);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't initialize aio vbd.\n");
+               return err;
+       }
+
+       aio_vbd = tapdisk_server_get_vbd(AIO_VBD);
+       if (!aio_vbd) {
+               err = -ENODEV;
+               DPRINTF("qcow2raw Couldn't create aio vbd.\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open_vdi(aio_vbd, argv[1], DISK_TYPE_AIO,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  0);
+       if( err ) {
+               DPRINTF("qcow2raw Couldn't open aio file.\n");
+               return err;
+       }
+
+       ddaio=(tapdisk_vbd_first_image(aio_vbd))->driver;
+
+       /*Initialise the output string*/
+       memset(output,0x20,(100/PROGRESS_QUANT)+5);
+       output[0] = '[';
+       output[(100/PROGRESS_QUANT)+2] = ']';
+       output[(100/PROGRESS_QUANT)+3] = '\0';
+       DFPRINTF("%s",output);
+
+       i = 0;
+       while (running) {
+               if (!complete) {
+                       /*Read Pages from qcow image*/
+                       if ( (ret = posix_memalign((void **)&buf, 
+                                                  BLOCK_PROCESSSZ, 
+                                                  BLOCK_PROCESSSZ))
+                            != 0) {
+                               DFPRINTF("Unable to alloc memory (%d)\n",ret);
+                               exit(-1);
+                       }
+
+                       /*
+                        * Allocate the per-chunk bookkeeping before anything
+                        * is queued; both pointers are dereferenced below.
+                        */
+                       req  = calloc(1, sizeof(struct request_info));
+                       vreq = calloc(1, sizeof(td_vbd_request_t));
+                       if (!req || !vreq) {
+                               DFPRINTF("Unable to alloc request state\n");
+                               free(vreq);
+                               free(req);
+                               free(buf);
+                               exit(-1);
+                       }
+
+                       /*Attempt to read 4k sized blocks*/
+                       submit_events+=BLOCK_PROCESSSZ>>9;
+
+                       //Set up the read request; zero it first so fields
+                       //not assigned below are not left indeterminate.
+                       memset(&treq, 0, sizeof(treq));
+                       treq.op      = TD_OP_READ;
+                       treq.buf     = buf;
+                       treq.sec     = i;
+                       treq.secs    = BLOCK_PROCESSSZ>>9;
+                       treq.image   = tapdisk_vbd_first_image(qcow_vbd);
+                       treq.cb      = send_read_responses;
+                       treq.id      = 0;
+                       treq.sidx    = 0;
+
+                       req->buf = buf;
+                       req->logical_sec = i;
+                       req->pending = BLOCK_PROCESSSZ>>9;
+                       treq.cb_data = req;
+
+                       treq.private = vreq;
+
+                       //Put it in the VBD's queue, so we don't lose
+                       //track of it.
+                       vreq->submitting = 1;
+                       INIT_LIST_HEAD(&vreq->next);
+                       tapdisk_vbd_move_request(treq.private, 
+                                                &qcow_vbd->pending_requests);
+
+                       ddqcow->ops->td_queue_read(ddqcow, treq);
+                       --vreq->submitting;
+
+                       i += BLOCK_PROCESSSZ>>9;
+
+                       if (i >= ddqcow->info.size)
+                               complete = 1;
+
+                       tapdisk_submit_all_tiocbs(&server.aio_queue);
+               }
+
+               /* Drain: wait until every submitted sector has been written. */
+               while(returned_write_events != submit_events) {
+                       ret = scheduler_wait_for_events(&server.scheduler);
+                       if (ret < 0) {
+                               DFPRINTF("server wait returned %d\n", ret);
+                               sleep(2);
+                       }
+               }
+               if (complete && (returned_write_events == submit_events)) 
+                       running = 0;
+       }
+
+       /*
+        * Replace the trailing '>' of the bar with '='.  Only touch the slots
+        * between '[' and ']' so the bracket and the NUL stay intact.
+        */
+       if (prev <= (100/PROGRESS_QUANT))
+               memcpy(output+prev+1,"=",1);
+       DFPRINTF("\r%s     100%%\nTRANSFER COMPLETE\n\n", output);
+
+       ddqcow->ops->td_close(ddqcow);
+       ddaio->ops->td_close(ddaio);
+       free(ddqcow->data);
+       free(ddaio->data);
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/scheduler.c b/tools/blktap2/drivers/scheduler.c
new file mode 100644 (file)
index 0000000..6b8d009
--- /dev/null
@@ -0,0 +1,265 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "scheduler.h"
+#include "tapdisk-log.h"
+
+#define DBG(_f, _a...)               tlog_write(TLOG_DBG, _f, ##_a)
+
+#define SCHEDULER_MAX_TIMEOUT        600
+#define SCHEDULER_POLL_FD           (SCHEDULER_POLL_READ_FD |  \
+                                    SCHEDULER_POLL_WRITE_FD |  \
+                                    SCHEDULER_POLL_EXCEPT_FD)
+
+#define MIN(a, b)                   ((a) <= (b) ? (a) : (b))
+#define MAX(a, b)                   ((a) >= (b) ? (a) : (b))
+
+#define scheduler_for_each_event(s, event, tmp)        \
+       list_for_each_entry_safe(event, tmp, &(s)->events, next)
+
+/* One registered event: an fd watch, a periodic timeout, or both. */
+typedef struct event {
+       char                         mode;      /* bitmask of SCHEDULER_POLL_* flags */
+       event_id_t                   id;        /* id returned by scheduler_register_event() */
+
+       int                          fd;        /* descriptor used when an fd flag is set in mode */
+       int                          timeout;   /* interval in seconds, added to tv_sec */
+       int                          deadline;  /* tv_sec value at which the timeout fires */
+
+       event_cb_t                   cb;        /* callback invoked when the event fires */
+       void                        *private;   /* opaque pointer passed back to cb */
+
+       struct list_head             next;      /* link in scheduler_t.events */
+} event_t;
+
+/*
+ * Rebuild the select() inputs from the list of registered events: the three
+ * fd sets, the highest fd seen, and the wait time in seconds.
+ *
+ * The wait time starts at SCHEDULER_MAX_TIMEOUT, is lowered to the nearest
+ * pending deadline (0 if one has already passed), and is finally capped by
+ * s->max_timeout.
+ */
+static void
+scheduler_prepare_events(scheduler_t *s)
+{
+       event_t *ev, *next_ev;
+       struct timeval now;
+       int remaining;
+
+       s->timeout = SCHEDULER_MAX_TIMEOUT;
+       s->max_fd  = 0;
+
+       FD_ZERO(&s->read_fds);
+       FD_ZERO(&s->write_fds);
+       FD_ZERO(&s->except_fds);
+
+       gettimeofday(&now, NULL);
+
+       scheduler_for_each_event(s, ev, next_ev) {
+               if (ev->mode & SCHEDULER_POLL_FD) {
+                       if (ev->mode & SCHEDULER_POLL_READ_FD)
+                               FD_SET(ev->fd, &s->read_fds);
+                       if (ev->mode & SCHEDULER_POLL_WRITE_FD)
+                               FD_SET(ev->fd, &s->write_fds);
+                       if (ev->mode & SCHEDULER_POLL_EXCEPT_FD)
+                               FD_SET(ev->fd, &s->except_fds);
+
+                       /* Track the highest watched fd for select(). */
+                       if (ev->fd > s->max_fd)
+                               s->max_fd = ev->fd;
+               }
+
+               if (!(ev->mode & SCHEDULER_POLL_TIMEOUT))
+                       continue;
+
+               remaining = ev->deadline - now.tv_sec;
+               if (remaining <= 0)
+                       s->timeout = 0;         /* already due: do not block */
+               else if (remaining < s->timeout)
+                       s->timeout = remaining;
+       }
+
+       if (s->max_timeout < s->timeout)
+               s->timeout = s->max_timeout;
+}
+
+/*
+ * Fire one event: invoke its callback with the mode that triggered it.
+ *
+ * For events that carry SCHEDULER_POLL_TIMEOUT the deadline is first pushed
+ * to "now + timeout", whichever mode triggered the call.
+ */
+static void
+scheduler_event_callback(event_t *event, char mode)
+{
+       const int rearm = event->mode & SCHEDULER_POLL_TIMEOUT;
+
+       if (rearm) {
+               struct timeval tv;
+
+               gettimeofday(&tv, NULL);
+               event->deadline = tv.tv_sec + event->timeout;
+       }
+
+       (*event->cb)(event->id, mode, event->private);
+}
+
+/*
+ * Dispatch callbacks for every event whose fd is marked in the fd sets or
+ * whose deadline has passed.  Each event gets at most one callback per scan
+ * (first match among read, write, except, timeout).
+ */
+static void
+scheduler_run_events(scheduler_t *s)
+{
+       struct timeval now;
+       event_t *event, *tmp;
+
+       /* Sampled once; every deadline check below uses this same instant. */
+       gettimeofday(&now, NULL);
+
+ again:
+       s->restart = 0;
+
+       scheduler_for_each_event(s, event, tmp) {
+               /* The fd bit is cleared before the callback runs, so a
+                * rescan after "goto again" does not see it again. */
+               if ((event->mode & SCHEDULER_POLL_READ_FD) &&
+                   FD_ISSET(event->fd, &s->read_fds)) {
+                       FD_CLR(event->fd, &s->read_fds);
+                       scheduler_event_callback(event, SCHEDULER_POLL_READ_FD);
+                       goto next;
+               }
+
+               if ((event->mode & SCHEDULER_POLL_WRITE_FD) &&
+                   FD_ISSET(event->fd, &s->write_fds)) {
+                       FD_CLR(event->fd, &s->write_fds);
+                       scheduler_event_callback(event, SCHEDULER_POLL_WRITE_FD);
+                       goto next;
+               }
+
+               if ((event->mode & SCHEDULER_POLL_EXCEPT_FD) &&
+                   FD_ISSET(event->fd, &s->except_fds)) {
+                       FD_CLR(event->fd, &s->except_fds);
+                       scheduler_event_callback(event, SCHEDULER_POLL_EXCEPT_FD);
+                       goto next;
+               }
+
+               if ((event->mode & SCHEDULER_POLL_TIMEOUT) &&
+                   (event->deadline <= now.tv_sec))
+                   scheduler_event_callback(event, SCHEDULER_POLL_TIMEOUT);
+
+       next:
+               /* scheduler_unregister_event() sets s->restart after freeing
+                * an event; the saved iterators may then be stale, so rescan
+                * from the head.  `event` is not dereferenced here. */
+               if (s->restart)
+                       goto again;
+       }
+}
+
+/*
+ * Register a new event with the scheduler.
+ *
+ * mode:    bitmask of SCHEDULER_POLL_* flags; at least one fd flag or
+ *          SCHEDULER_POLL_TIMEOUT must be set.
+ * fd:      descriptor to watch; only validated and used when an fd flag is
+ *          set in mode.
+ * timeout: interval in seconds; the first deadline is "now + timeout".
+ * cb:      callback to invoke when the event fires; must not be NULL.
+ * private: opaque pointer handed back to cb.
+ *
+ * Returns the id of the new event, -EINVAL for bad arguments, or -ENOMEM
+ * if the event cannot be allocated.
+ */
+int
+scheduler_register_event(scheduler_t *s, char mode, int fd,
+                        int timeout, event_cb_t cb, void *private)
+{
+       event_t *event;
+       struct timeval now;
+
+       if (!cb)
+               return -EINVAL;
+
+       if (!(mode & SCHEDULER_POLL_TIMEOUT) && !(mode & SCHEDULER_POLL_FD))
+               return -EINVAL;
+
+       /*
+        * The fd is later passed to FD_SET()/FD_ISSET(), which are only
+        * defined for 0 <= fd < FD_SETSIZE; reject anything else up front.
+        */
+       if ((mode & SCHEDULER_POLL_FD) && (fd < 0 || fd >= FD_SETSIZE))
+               return -EINVAL;
+
+       event = calloc(1, sizeof(event_t));
+       if (!event)
+               return -ENOMEM;
+
+       gettimeofday(&now, NULL);
+
+       INIT_LIST_HEAD(&event->next);
+
+       event->mode     = mode;
+       event->fd       = fd;
+       event->timeout  = timeout;
+       event->deadline = now.tv_sec + timeout;
+       event->cb       = cb;
+       event->private  = private;
+       event->id       = s->uuid++;
+
+       /* Skip 0: scheduler_unregister_event() ignores a zero id. */
+       if (!s->uuid)
+               s->uuid++;
+
+       list_add_tail(&event->next, &s->events);
+
+       return event->id;
+}
+
+/*
+ * Remove and free the event with the given id.  An id of 0, or an id that
+ * matches no registered event, is ignored.
+ *
+ * Sets s->restart on removal so that a dispatch loop in progress rescans
+ * the list instead of following pointers into the freed event.
+ */
+void
+scheduler_unregister_event(scheduler_t *s, event_id_t id)
+{
+       event_t *cur, *nxt;
+
+       if (id == 0)
+               return;
+
+       scheduler_for_each_event(s, cur, nxt) {
+               if (cur->id != id)
+                       continue;
+
+               list_del(&cur->next);
+               free(cur);
+               s->restart = 1;
+               return;
+       }
+}
+
+/*
+ * Lower the cap on the next wait to `timeout` seconds.  Negative values are
+ * ignored, and the cap is never raised by this call.
+ */
+void
+scheduler_set_max_timeout(scheduler_t *s, int timeout)
+{
+       if (timeout < 0)
+               return;
+
+       if (timeout < s->max_timeout)
+               s->max_timeout = timeout;
+}
+
+/*
+ * Run one scheduler iteration: build the fd sets, block in select() for at
+ * most s->timeout seconds, then dispatch the events that fired.
+ *
+ * restart, timeout and max_timeout are reset after every select() call,
+ * whether or not it succeeded.
+ *
+ * Returns select()'s result; when it is negative no callbacks are run.
+ */
+int
+scheduler_wait_for_events(scheduler_t *s)
+{
+       struct timeval wait;
+       int nready;
+
+       scheduler_prepare_events(s);
+
+       wait.tv_usec = 0;
+       wait.tv_sec  = s->timeout;
+
+       DBG("timeout: %d, max_timeout: %d\n",
+           s->timeout, s->max_timeout);
+
+       nready = select(s->max_fd + 1, &s->read_fds,
+                       &s->write_fds, &s->except_fds, &wait);
+
+       s->max_timeout = SCHEDULER_MAX_TIMEOUT;
+       s->timeout     = SCHEDULER_MAX_TIMEOUT;
+       s->restart     = 0;
+
+       if (nready >= 0)
+               scheduler_run_events(s);
+
+       return nready;
+}
+
+/*
+ * Reset a scheduler to its empty state: all fields zeroed, no registered
+ * events, empty fd sets, and event ids starting at 1.
+ *
+ * NOTE(review): max_timeout is left at 0 by the memset, so the first
+ * scheduler_wait_for_events() call waits 0 seconds -- confirm this is
+ * intended.
+ */
+void
+scheduler_initialize(scheduler_t *s)
+{
+       memset(s, 0, sizeof(scheduler_t));
+
+       INIT_LIST_HEAD(&s->events);
+
+       FD_ZERO(&s->except_fds);
+       FD_ZERO(&s->write_fds);
+       FD_ZERO(&s->read_fds);
+
+       s->uuid = 1;
+}
diff --git a/tools/blktap2/drivers/scheduler.h b/tools/blktap2/drivers/scheduler.h
new file mode 100644 (file)
index 0000000..ea37e8f
--- /dev/null
@@ -0,0 +1,65 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _SCHEDULER_H_
+#define _SCHEDULER_H_
+
+#include <sys/select.h>
+
+#include "list.h"
+
+#define SCHEDULER_POLL_READ_FD       0x1
+#define SCHEDULER_POLL_WRITE_FD      0x2
+#define SCHEDULER_POLL_EXCEPT_FD     0x4
+#define SCHEDULER_POLL_TIMEOUT       0x8
+
+typedef int                          event_id_t;
+typedef void (*event_cb_t)          (event_id_t id, char mode, void *private);
+
+/* State of one select()-based event scheduler. */
+typedef struct scheduler {
+       fd_set                       read_fds;     /* fds passed to select() for reading */
+       fd_set                       write_fds;    /* fds passed to select() for writing */
+       fd_set                       except_fds;   /* fds passed to select() for exceptions */
+
+       struct list_head             events;       /* list of registered events */
+
+       int                          uuid;         /* id handed to the next registered event */
+       int                          max_fd;       /* highest registered fd; select() gets max_fd + 1 */
+       int                          timeout;      /* seconds the next select() may block */
+       int                          restart;      /* set when an event is unregistered */
+       int                          max_timeout;  /* cap on timeout, in seconds */
+} scheduler_t;
+
+void scheduler_initialize(scheduler_t *);
+event_id_t scheduler_register_event(scheduler_t *, char mode,
+                                   int fd, int timeout,
+                                   event_cb_t cb, void *private);
+void scheduler_unregister_event(scheduler_t *,  event_id_t);
+void scheduler_set_max_timeout(scheduler_t *, int);
+int scheduler_wait_for_events(scheduler_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-client.c b/tools/blktap2/drivers/tapdisk-client.c
new file mode 100644 (file)
index 0000000..c85b5fc
--- /dev/null
@@ -0,0 +1,496 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* client harness for tapdisk log */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "log.h"
+
+#define BDPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+#define BWPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a)
+
+/* client-side view of the tapdisk write log shared memory segment */
+struct writelog {
+  char* shmpath;     /* strdup()ed in ctl_get_shmem(), freed in writelog_free() */
+  uint32_t shmsize;  /* segment size as reported by the server */
+  void* shm;         /* mapping created in writelog_map() */
+
+  /* next unprocessed item in the writelog */
+  void* cur;
+  /* requests enqueued on the ring and not yet answered */
+  unsigned int inflight;
+
+  /* pointer to start and end of free data space for requests */
+  void* dhd;
+  void* dtl;
+
+  log_sring_t* sring;      /* shared ring inside shm (sringstart()) */
+  log_front_ring_t fring;  /* front-end ring state initialised over sring */
+};
+
+/* Number of bytes that can still be placed on the data ring.
+ * One byte is always kept unused so that an empty ring (dhd == dtl)
+ * can be told apart from a full one. */
+static inline unsigned int dring_avail(struct writelog* wl)
+{
+  void* head = wl->dhd;
+  void* tail = wl->dtl;
+  void* lo = sdatastart(wl->shm);
+  void* hi = sdataend(wl->shm);
+
+  /* empty ring: the whole data area minus the reserved byte */
+  if (head == tail)
+    return hi - lo - 1;
+
+  /* free space is the single gap between head and tail */
+  if (head < tail)
+    return tail - head - 1;
+
+  /* free space wraps around the end of the data area */
+  return (hi - head) + (tail - lo) - 1;
+}
+
+/* Advance a data ring pointer by len bytes, wrapping around the end of the
+ * data area.  Returns the new position, always inside
+ * [sdatastart(shm), sdataend(shm)). */
+static inline void* dring_advance(struct writelog* wl, void* start, size_t len)
+{
+  void* next;
+  int dsz = sdataend(wl->shm) - sdatastart(wl->shm);
+
+  next = start + (len % dsz);
+  /* sdataend() is one past the last usable byte (dring_avail() counts
+   * sdataend - dhd bytes up to the end), so a pointer that lands exactly
+   * on it must wrap as well */
+  if (next >= sdataend(wl->shm))
+    next -= dsz;
+
+  return next;
+}
+
+/* Print the command line synopsis on stderr. */
+static void usage(void)
+{
+  fputs("usage: tapdisk-client <sock>\n", stderr);
+}
+
+/* Connect to the UNIX stream socket at sockpath.
+ * Returns the connected socket file descriptor, or -1 on error (a message
+ * is printed and no descriptor is left open). */
+static int tdctl_open(const char* sockpath)
+{
+  struct sockaddr_un saddr;
+  size_t pathlen;
+  int fd;
+
+  /* sun_path is a fixed-size array: refuse paths that do not fit together
+   * with their terminating NUL instead of overflowing saddr */
+  pathlen = strlen(sockpath);
+  if (pathlen >= sizeof(saddr.sun_path)) {
+    BWPRINTF("socket path too long: %s", sockpath);
+    return -1;
+  }
+
+  if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error creating socket: %s", strerror(errno));
+    return -1;
+  }
+
+  /* zero-fill first so the copied path is NUL terminated */
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, sockpath, pathlen);
+
+  if (connect(fd, (struct sockaddr*)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error connecting to socket %s: %s", sockpath, strerror(errno));
+    close(fd);
+    return -1;
+  }
+
+  return fd;
+}
+
+/* Send one control message on fd and, when rsplen is non-zero, read a
+ * response of exactly rsplen bytes into rsp.
+ * Returns 0 on success, -1 on an I/O error or a short read/write. */
+static int ctl_talk(int fd, struct log_ctlmsg* msg, char* rsp, int rsplen)
+{
+  int nbytes;
+
+  nbytes = write(fd, msg, sizeof(*msg));
+  if (nbytes < 0) {
+    BWPRINTF("error sending ctl request: %s", strerror(errno));
+    return -1;
+  }
+  if (nbytes < sizeof(*msg)) {
+    BWPRINTF("short ctl write (%d/%zd bytes)", nbytes, sizeof(*msg));
+    return -1;
+  }
+
+  /* caller does not expect a reply */
+  if (rsplen == 0)
+    return 0;
+
+  nbytes = read(fd, rsp, rsplen);
+  if (nbytes < 0) {
+    BWPRINTF("error reading ctl response: %s", strerror(errno));
+    return -1;
+  }
+  if (nbytes < rsplen) {
+    BWPRINTF("short ctl read (%d/%d bytes)", nbytes, rsplen);
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Ask the server for its shared memory parameters (LOGCMD_SHMP) and store
+ * them in wl.  The response carries a uint32_t size followed by the path
+ * string; wl->shmpath receives a strdup()ed copy that writelog_free()
+ * releases.
+ * Returns 0 on success, -1 on error. */
+static int ctl_get_shmem(int fd, struct writelog* wl)
+{
+  struct log_ctlmsg req;
+  char rsp[CTLRSPLEN_SHMP + 1];
+  int rc;
+
+  memset(&req, 0, sizeof(req));
+  /* rsp is one byte longer than the response and zero-filled, so the path
+   * it contains is always NUL terminated */
+  memset(rsp, 0, sizeof(rsp));
+
+  memcpy(req.msg, LOGCMD_SHMP, 4);
+  if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_SHMP)) < 0) {
+    BWPRINTF("error getting shared memory parameters");
+    return -1;
+  }
+
+  memcpy(&wl->shmsize, rsp, sizeof(wl->shmsize));
+  wl->shmpath = strdup(rsp + sizeof(wl->shmsize));
+  if (!wl->shmpath) {
+    /* do not hand a NULL path to printf("%s") or shm_open() later on */
+    BWPRINTF("error copying shared memory path: %s", strerror(errno));
+    return -1;
+  }
+
+  BDPRINTF("shared memory parameters: size: %u, path: %s",
+          wl->shmsize, wl->shmpath);
+
+  return 0;
+}
+
+/* Zero a control message and store the 4-byte command code cmd in it. */
+static void ctlmsg_init(struct log_ctlmsg* msg, const char* cmd)
+{
+  memset(msg, 0, sizeof(struct log_ctlmsg));
+  memcpy(&msg->msg[0], cmd, 4);
+}
+
+/* Send LOGCMD_GET and consume its CTLRSPLEN_GET byte response.
+ * Returns 0 on success, -1 on error. */
+static int ctl_get_writes(int fd)
+{
+  char reply[CTLRSPLEN_GET];
+  struct log_ctlmsg request;
+
+  ctlmsg_init(&request, LOGCMD_GET);
+
+  if (ctl_talk(fd, &request, reply, CTLRSPLEN_GET) >= 0)
+    return 0;
+
+  BWPRINTF("error getting writes");
+  return -1;
+}
+
+/* Send LOGCMD_PEEK and consume its CTLRSPLEN_PEEK byte response.
+ * Returns 0 on success, -1 on error. */
+static int ctl_peek_writes(int fd)
+{
+  char reply[CTLRSPLEN_PEEK];
+  struct log_ctlmsg request;
+
+  ctlmsg_init(&request, LOGCMD_PEEK);
+
+  if (ctl_talk(fd, &request, reply, CTLRSPLEN_PEEK) >= 0)
+    return 0;
+
+  BWPRINTF("error peeking writes");
+  return -1;
+}
+
+/* Submit pending requests: send LOGCMD_KICK without waiting for a reply.
+ * Returns 0 on success, -1 on error. */
+static int ctl_kick(int fd)
+{
+  struct log_ctlmsg request;
+
+  ctlmsg_init(&request, LOGCMD_KICK);
+
+  /* no response buffer: ctl_talk() returns right after the write */
+  if (ctl_talk(fd, &request, NULL, 0) >= 0)
+    return 0;
+
+  BWPRINTF("error kicking ring");
+  return -1;
+}
+
+/* Send LOGCMD_CLEAR and consume its CTLRSPLEN_CLEAR byte response.
+ * Returns 0 on success, -1 on error. */
+static int ctl_clear_writes(int fd)
+{
+  char reply[CTLRSPLEN_CLEAR];
+  struct log_ctlmsg request;
+
+  ctlmsg_init(&request, LOGCMD_CLEAR);
+
+  if (ctl_talk(fd, &request, reply, CTLRSPLEN_CLEAR) >= 0)
+    return 0;
+
+  BWPRINTF("error clearing writes");
+  return -1;
+}
+
+/* Map the shared memory object named wl->shmpath (wl->shmsize bytes) and
+ * initialise the cursor, the data ring pointers and the front ring in wl.
+ * Returns 0 on success.  On failure returns -1 with errno describing the
+ * failing call and wl->shm set to NULL when the mapping failed. */
+static int writelog_map(struct writelog* wl)
+{
+  int fd;
+  int err;
+
+  if ((fd = shm_open(wl->shmpath, O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory at %s: %s", wl->shmpath,
+            strerror(errno));
+    return -1;
+  }
+
+  wl->shm = mmap(NULL, wl->shmsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  /* remember mmap's errno: close() below may overwrite it */
+  err = errno;
+  close(fd);
+  if (wl->shm == MAP_FAILED) {
+    BWPRINTF("could not mmap write log shm: %s", strerror(err));
+    /* keep writelog_free() from calling munmap() on MAP_FAILED */
+    wl->shm = NULL;
+    errno = err;
+    return -1;
+  }
+  wl->cur = wl->shm;
+  wl->inflight = 0;
+  wl->dhd = wl->dtl = sdatastart(wl->shm);
+
+  BDPRINTF("shm cookie: 0x%x, data size: %u", *((uint32_t*)wl->shm),
+          dring_avail(wl));
+
+  wl->sring = sringstart(wl->shm);
+  /* need some thought about what to do on reconnect */
+  FRONT_RING_INIT(&wl->fring, wl->sring, SRINGSIZE);
+
+  return 0;
+}
+
+/* Print every dirty extent stored at the start of the shared memory area.
+ * The walk stops at the first extent with a zero count or at bmend().
+ * Always returns 0. */
+static int writelog_dump(struct writelog* wl)
+{
+  struct disk_range* extent = wl->shm;
+
+  while ((void*)extent < bmend(wl->shm) && extent->count) {
+    BDPRINTF("dirty extent: %"PRIu64":%u",
+            extent->sector, extent->count);
+    extent++;
+  }
+
+  return 0;
+}
+
+/* walk dirty map (starting at wl->cur) and enqueue read requests.
+ * The walk stops at the first extent with a zero count, at the end of the
+ * bitmap area, or when the request ring is full; wl->cur is left at the
+ * first extent that was not enqueued.
+ * returns:  0 when entire bitmap has been enqueued,
+ *           1 when the ring is full
+ *          -1 on error (no error path exists at present)
+ */
+static int writelog_enqueue_requests(struct writelog* wl)
+{
+  struct disk_range* range;
+  void* end = bmend(wl->shm);
+  log_request_t* req;
+
+  for (range = wl->cur; (void*)range < end; range++) {
+    if (!range->count)
+      break;
+
+    if (RING_FULL(&wl->fring))
+      break;
+
+    /* insert range into request stream */
+    /* 1. get next request slot from ring */
+    /* 2. ensure enough shm space is available */
+
+    BDPRINTF("enqueueing dirty extent: %"PRIu64":%u (ring space: %d/%d)",
+            range->sector, range->count, RING_FREE_REQUESTS(&wl->fring),
+            RING_SIZE(&wl->fring));
+
+    req = RING_GET_REQUEST(&wl->fring, wl->fring.req_prod_pvt);
+
+    req->sector = range->sector;
+    req->count = range->count;
+    /* ... */
+    req->offset = 0;
+
+    wl->fring.req_prod_pvt++;
+    wl->inflight++;
+  }
+
+  wl->cur = range;
+
+  /* range may have run off the end of the bitmap; only an extent that is
+   * still inside it can mean "ring full, more work pending" */
+  if ((void*)range < end && range->count)
+    return 1;
+
+  return 0;
+}
+
+/* Consume every response between the front ring's consumer index and the
+ * shared ring's producer index, logging each one and decrementing
+ * wl->inflight.  Always returns 0. */
+static int writelog_dequeue_responses(struct writelog* wl)
+{
+  RING_IDX idx, prod;
+  log_response_t rsp;
+
+  idx = wl->fring.rsp_cons;
+  prod = wl->sring->rsp_prod;
+
+  BDPRINTF("ring kicked (start = %u, end = %u)", idx, prod);
+
+  for (; idx != prod; wl->inflight--) {
+    /* work on a private copy of the ring slot */
+    memcpy(&rsp, RING_GET_RESPONSE(&wl->fring, idx), sizeof(rsp));
+    BDPRINTF("ctl: read response %"PRIu64":%u", rsp.sector, rsp.count);
+    idx++;
+    wl->fring.rsp_cons = idx;
+  }
+
+  return 0;
+}
+
+/* Release the resources held by wl: the shared memory path string and the
+ * mapping.  Both members are reset to NULL.  Always returns 0. */
+static int writelog_free(struct writelog* wl)
+{
+  /* free(NULL) is a no-op, so no guard is needed */
+  free(wl->shmpath);
+  wl->shmpath = NULL;
+
+  if (!wl->shm)
+    return 0;
+
+  munmap(wl->shm, wl->shmsize);
+  wl->shm = NULL;
+
+  return 0;
+}
+
+/* Request the write log from the server, using the peek command when peek
+ * is non-zero and the get command otherwise, then rewind wl->cur to the
+ * start of the shared memory area.
+ * Returns 0 on success or the negative result of the control request. */
+int get_writes(struct writelog* wl, int fd, int peek)
+{
+  int rc = peek ? ctl_peek_writes(fd) : ctl_get_writes(fd);
+
+  if (rc < 0)
+    return rc;
+
+  wl->cur = wl->shm;
+  return 0;
+}
+
+/* Block on the control socket until a complete LOGCMD_KICK message arrives,
+ * then drain the response ring.
+ * Returns 0 on success, -1 on read error, EOF, short message, unexpected
+ * command or dequeue failure. */
+int await_responses(struct writelog* wl, int fd)
+{
+  struct log_ctlmsg msg;
+  int rc;
+
+  /* sit on socket waiting for kick */
+  rc = read(fd, &msg, sizeof(msg));
+  if (rc < 0) {
+    BWPRINTF("error reading from control socket: %s", strerror(errno));
+    return -1;
+  }
+  if (rc == 0) {
+    BWPRINTF("EOF on control socket");
+    return -1;
+  }
+  if (rc < sizeof(msg)) {
+    BWPRINTF("short reply (%d/%d bytes)", rc, (int) sizeof(msg));
+    return -1;
+  }
+
+  if (strncmp(msg.msg, LOGCMD_KICK, 4) != 0) {
+    BWPRINTF("Unknown message received: %.4s", msg.msg);
+    return -1;
+  }
+
+  return (writelog_dequeue_responses(wl) < 0) ? -1 : 0;
+}
+
+/* read_loop:
+ * 1. extract dirty bitmap
+ * 2. feed as much as possible onto ring
+ * 3. kick
+ * 4. as responses come back, feed more of the dirty bitmap
+ *    into the ring
+ * 5. when entire bitmap has been queued, go to 1?
+ *
+ * Returns -1 if a control request or waiting for responses fails,
+ * otherwise the last result of writelog_enqueue_requests().
+ */
+int read_loop(struct writelog* wl, int fd)
+{
+  int rc;
+
+  /* third argument 1: peek at the bitmap rather than "get" it */
+  if (get_writes(wl, fd, 1) < 0)
+    return -1;
+  writelog_dump(wl);
+
+  do {
+    rc = writelog_enqueue_requests(wl);
+
+    /* publish the private producer index when any ring slot is in use */
+    if (RING_FREE_REQUESTS(&wl->fring) < RING_SIZE(&wl->fring))
+      RING_PUSH_REQUESTS(&wl->fring);
+    /* the kick is sent on every iteration, pushed requests or not */
+    if (ctl_kick(fd) < 0)
+      return -1;
+
+    /* collect responses */
+    if (wl->inflight && await_responses(wl, fd) < 0)
+      return -1;
+  } while (rc > 0); /* rc == 1: the ring filled up before the bitmap ended */
+
+  return rc;
+}
+
+/* tapdisk-client <sock> [cmd]
+ * cmd is selected by the first character of the optional second argument:
+ *   p  peek at the write log and dump it (default)
+ *   c  clear the write log
+ *   g  get the write log and dump it
+ *   r  run read_loop()
+ * Returns 0 on success, 1 on usage or runtime error. */
+int main(int argc, char* argv[])
+{
+  int fd;
+  int rc = 1;
+  struct writelog wl;
+  char cmd;
+
+  if (argc < 2) {
+    usage();
+    return 1;
+  }
+
+  cmd = (argc < 3) ? 'p' : argv[2][0];
+
+  /* start from a known state so the cleanup path below is always safe */
+  memset(&wl, 0, sizeof(wl));
+
+  fd = tdctl_open(argv[1]);
+  if (fd < 0)
+    return 1;
+
+  if (ctl_get_shmem(fd, &wl) < 0)
+    goto out;
+
+  if (writelog_map(&wl) < 0) {
+    BWPRINTF("Error mapping write log: %s", strerror(errno));
+    /* nothing was mapped: make sure writelog_free() skips munmap() */
+    wl.shm = NULL;
+    goto out;
+  }
+
+  switch (cmd) {
+  case 'p':
+    if (get_writes(&wl, fd, 1) < 0)
+      goto out;
+    writelog_dump(&wl);
+    break;
+  case 'c':
+    if (ctl_clear_writes(fd) < 0)
+      goto out;
+    break;
+  case 'g':
+    if (get_writes(&wl, fd, 0) < 0)
+      goto out;
+    writelog_dump(&wl);
+    break;
+  case 'r':
+    if (read_loop(&wl, fd) < 0)
+      goto out;
+    break;
+  default:
+    usage();
+    goto out;
+  }
+
+  rc = 0;
+
+out:
+  /* release the mapping, the path string and the socket on every path */
+  writelog_free(&wl);
+  close(fd);
+
+  return rc;
+}
diff --git a/tools/blktap2/drivers/tapdisk-diff.c b/tools/blktap2/drivers/tapdisk-diff.c
new file mode 100644 (file)
index 0000000..0f31c57
--- /dev/null
@@ -0,0 +1,797 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "libvhd.h"
+
+#define POLL_READ                        0
+#define POLL_WRITE                       1
+
+#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT)
+
+/* pipe used to trigger a stream's enqueue event from within the process */
+struct tapdisk_stream_poll {
+       /* indexed by POLL_READ / POLL_WRITE; both are -1 while closed */
+       int                              pipe[2];
+       /* non-zero once tapdisk_stream_poll_set() has written to the pipe
+        * and until tapdisk_stream_poll_clear() resets it */
+       int                              set;
+};
+
+/* one read request issued on a stream */
+struct tapdisk_stream_request {
+       uint64_t                         sec;       /* first sector of the request */
+       uint32_t                         secs;      /* number of sectors requested */
+       uint64_t                         seqno;     /* used to order and pair requests of both streams */
+       blkif_request_t                  blkif_req; /* request copied into the vbd request slot */
+       struct list_head                 next;      /* link on the free, pending or completed list */
+};
+
+/* state of one of the two images being compared */
+struct tapdisk_stream {
+       td_vbd_t                        *vbd;
+
+       /* taken from tapdisk_stream_count; also used to look up the vbd */
+       unsigned int                     id;
+
+       /* first error recorded for this stream (EIO, EINVAL), 0 if none */
+       int                              err;
+
+       /* read position and bounds; end is set from image.size
+        * (presumably in sectors -- confirm) */
+       uint64_t                         cur;
+       uint64_t                         start;
+       uint64_t                         end;
+
+       /* counters: started supplies request sequence numbers, completed
+        * counts requests that have been compared */
+       uint64_t                         started;
+       uint64_t                         completed;
+
+       struct tapdisk_stream_poll       poll;
+       event_id_t                       enqueue_event_id;
+
+       /* every entry of requests[] is on exactly one of these lists */
+       struct list_head                 free_list;
+       struct list_head                 pending_list;
+       struct list_head                 completed_list;
+
+       struct tapdisk_stream_request    requests[MAX_REQUESTS];
+};
+
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+static char *program;
+static struct tapdisk_stream stream1, stream2;
+static vhd_context_t vhd1;
+
+/*
+ * Print the command line synopsis on @stream (stdout for -h, stderr for
+ * usage errors).
+ */
+static void
+usage(FILE *stream)
+{
+       fprintf(stream,
+               "usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n",
+               program);
+}
+
+/*
+ * Open the VHD at @path read-only into @vhd and load its BAT.
+ * Returns 0 on success; otherwise the error from libvhd, with @vhd closed
+ * again if it had been opened.
+ */
+static int
+open_vhd(const char *path, vhd_context_t *vhd)
+{
+       int err = vhd_open(vhd, path, VHD_OPEN_RDONLY);
+
+       if (err) {
+               printf("error opening %s: %d\n", path, err);
+               return err;
+       }
+
+       err = vhd_get_bat(vhd);
+       if (!err)
+               return 0;
+
+       printf("error reading BAT for %s: %d\n", path, err);
+       vhd_close(vhd);
+       return err;
+}
+
+/* Reset @p to the "closed" state: both pipe ends -1, flag cleared. */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+       p->pipe[POLL_WRITE] = -1;
+       p->pipe[POLL_READ]  = -1;
+       p->set              = 0;
+}
+
+/*
+ * Create the pipe of @p and switch both ends to non-blocking mode.
+ * Returns 0 on success or -errno of the failing call; on failure no
+ * descriptor is left open and @p is back in the closed state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+       int err;
+
+       tapdisk_stream_poll_initialize(p);
+
+       err = pipe(p->pipe);
+       if (err)
+               return -errno;
+
+       err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK);
+       if (err)
+               goto out;
+
+       err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK);
+       if (err)
+               goto out;
+
+       return 0;
+
+out:
+       /* capture errno now: close() below may overwrite it */
+       err = errno;
+       close(p->pipe[POLL_READ]);
+       close(p->pipe[POLL_WRITE]);
+       tapdisk_stream_poll_initialize(p);
+       return -err;
+}
+
+/* Close whichever pipe ends of @p are open and reset @p. */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+       int rfd = p->pipe[POLL_READ];
+       int wfd = p->pipe[POLL_WRITE];
+
+       if (rfd != -1)
+               close(rfd);
+       if (wfd != -1)
+               close(wfd);
+
+       tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume the wake-up token from the pipe (if any) and clear the flag.
+ * The result of read() is ignored.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+       int token;
+
+       read(p->pipe[POLL_READ], &token, sizeof(token));
+       p->set = 0;
+}
+
+/*
+ * Make the read end of the pipe readable by writing one token to it,
+ * unless a token is already outstanding.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+       int token = 0;
+
+       if (p->set)
+               return;
+
+       write(p->pipe[POLL_WRITE], &token, sizeof(token));
+       p->set = 1;
+}
+
+/*
+ * Return 1 when @s has nothing left to do: it reached its end or hit an
+ * error, and both its pending and completed lists are empty; 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+       if (s->cur != s->end && !s->err)
+               return 0;
+
+       if (!list_empty(&s->pending_list))
+               return 0;
+
+       return list_empty(&s->completed_list) ? 1 : 0;
+}
+
+/* Zero @req and make its list link an empty, self-referencing node. */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+       memset(req, 0, sizeof(struct tapdisk_stream_request));
+       INIT_LIST_HEAD(&req->next);
+}
+
+/* Return the index of @req within the requests[] array of @s. */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+                          struct tapdisk_stream_request *req)
+{
+       struct tapdisk_stream_request *base = s->requests;
+
+       return (req - base);
+}
+
+/*
+ * Take the first request off the free list of @s and reinitialise it.
+ * Returns NULL when the free list is empty.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+       struct list_head *first;
+       struct tapdisk_stream_request *req;
+
+       if (list_empty(&s->free_list))
+               return NULL;
+
+       first = s->free_list.next;
+       req   = list_entry(first, struct tapdisk_stream_request, next);
+
+       list_del_init(&req->next);
+       tapdisk_stream_initialize_request(req);
+
+       return req;
+}
+
+/*
+ * Insert @sreq into the completed list of @s, keeping the list sorted by
+ * ascending sequence number.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+                              struct tapdisk_stream_request *sreq)
+{
+       struct tapdisk_stream_request *pos;
+       /* node to insert in front of; the list head means "append" */
+       struct list_head *before = &s->completed_list;
+
+       list_for_each_entry(pos, &s->completed_list, next) {
+               if (sreq->seqno < pos->seqno) {
+                       before = &pos->next;
+                       break;
+               }
+       }
+
+       list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Compare the data read for @sreq1 (stream1) and @sreq2 (stream2).  Both
+ * requests must carry the same sequence number and sector count.
+ * Returns the memcmp() result: 0 when the buffers are identical.
+ */
+static int 
+tapdisk_result_compare(struct tapdisk_stream_request *sreq1,
+               struct tapdisk_stream_request  *sreq2)
+{
+       unsigned long slot1, slot2;
+       char *data1, *data2;
+
+       assert(sreq1->seqno == sreq2->seqno);
+       assert(sreq1->secs == sreq2->secs);
+
+       /* the request index selects the buffer inside each stream's ring */
+       slot1 = (unsigned long)tapdisk_stream_request_idx(&stream1, sreq1);
+       data1 = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, slot1, 0);
+
+       slot2 = (unsigned long)tapdisk_stream_request_idx(&stream2, sreq2);
+       data2 = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, slot2, 0);
+
+       return memcmp(data1, data2, sreq1->secs << SECTOR_SHIFT);
+}
+
+/*
+ * Walk the completed lists of stream1 and stream2 in parallel and compare
+ * every pair of requests with equal sequence numbers.  Compared requests
+ * are moved to the free list of their stream; a request whose counterpart
+ * has not completed yet stays where it is.
+ * Returns 0 if all compared pairs matched, otherwise the non-zero result
+ * of the first mismatching tapdisk_result_compare().
+ */
+static int
+tapdisk_stream_process_data(void)
+{
+       struct tapdisk_stream_request *sreq1, *sreq2, *tmp1, *tmp2;
+       int advance_both;
+       int result = 0;
+
+       /* sreqN is the current entry, tmpN its successor, fetched up front
+        * because the current entry may be unlinked inside the loop */
+       sreq1 = list_entry(stream1.completed_list.next,
+                       struct tapdisk_stream_request, next);
+       sreq2 = list_entry(stream2.completed_list.next,
+                       struct tapdisk_stream_request, next);
+       tmp1 = list_entry(sreq1->next.next,
+                       struct tapdisk_stream_request, next);
+       tmp2 = list_entry(sreq2->next.next,
+                       struct tapdisk_stream_request, next);
+       /* stop at the first mismatch or when either list is exhausted */
+       while (result == 0 &&
+                       &sreq1->next != &stream1.completed_list &&
+                       &sreq2->next != &stream2.completed_list) {
+               //printf("checking: %llu|%llu\n", sreq1->seqno, sreq2->seqno);
+               advance_both = 1;
+               /* stream1 is behind: step only stream1 */
+               if (sreq1->seqno < sreq2->seqno) {
+                       advance_both = 0;
+                       goto advance1;
+               }
+               /* stream2 is behind: step only stream2 */
+               if (sreq1->seqno > sreq2->seqno)
+                       goto advance2;
+
+               result = tapdisk_result_compare(sreq1, sreq2);
+
+               stream1.completed++;
+               stream2.completed++;
+               
+               /* recycle both requests */
+               list_del_init(&sreq1->next);
+               list_add_tail(&sreq1->next, &stream1.free_list);
+               list_del_init(&sreq2->next);
+               list_add_tail(&sreq2->next, &stream2.free_list);
+
+advance1:
+               sreq1 = tmp1;
+               tmp1 = list_entry(tmp1->next.next, 
+                               struct tapdisk_stream_request, next);
+               if (!advance_both)
+                       continue;
+advance2:
+               sreq2 = tmp2;
+               tmp2 = list_entry(tmp2->next.next, 
+                               struct tapdisk_stream_request, next);
+       }
+
+       return result;
+}
+
+/*
+ * Completion callback installed with tapdisk_vbd_set_callback().
+ * @arg is the stream, @rsp->id the index of the finished request.  A
+ * successful request is queued for comparison, a failed one is recycled
+ * and marks the stream with EIO.  Afterwards completed data of both
+ * streams is compared and both enqueue events are triggered.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+       struct tapdisk_stream *stream = arg;
+       struct tapdisk_stream_request *done = &stream->requests[rsp->id];
+
+       /* take it off the pending list */
+       list_del_init(&done->next);
+
+       if (rsp->status != BLKIF_RSP_OKAY) {
+               stream->err = EIO;
+               list_add_tail(&done->next, &stream->free_list);
+               fprintf(stderr, "error reading sector 0x%"PRIx64"\n", done->sec);
+       } else {
+               tapdisk_stream_queue_completed(stream, done);
+       }
+
+       if (tapdisk_stream_process_data() != 0) {
+               fprintf(stderr, "mismatch at sector 0x%"PRIx64"\n",
+                               done->sec);
+               stream1.err = EINVAL;
+               stream2.err = EINVAL;
+       }
+
+       tapdisk_stream_poll_set(&stream1.poll);
+       tapdisk_stream_poll_set(&stream2.poll);
+}
+
+/*
+ * Queue on @s a read request that mirrors request @r of the other stream:
+ * same sectors, same sequence number, same segment layout.
+ * Returns 0 on success, 1 when @s has no free request left.
+ * The request is only moved to the vbd's new_requests list; the caller
+ * issues it.
+ */
+static inline int
+tapdisk_stream_enqueue_copy(struct tapdisk_stream *s, 
+               struct tapdisk_stream_request *r)
+{
+       td_vbd_t *vbd;
+       blkif_request_t *breq;
+       td_vbd_request_t *vreq;
+       struct tapdisk_stream_request *sreq;
+       int idx;
+
+       /* NOTE(review): the vbd is taken from stream2, not from s; the only
+        * caller in this file passes &stream2, so both agree */
+       vbd = stream2.vbd;
+       sreq = tapdisk_stream_get_request(s);
+       if (!sreq)
+               return 1;
+
+       idx                 = tapdisk_stream_request_idx(s, sreq);
+
+       sreq->sec           = r->sec;
+       sreq->secs          = r->secs;
+       sreq->seqno         = r->seqno;
+
+       breq                = &sreq->blkif_req;
+       breq->id            = idx;
+       breq->nr_segments   = r->blkif_req.nr_segments;
+       breq->sector_number = r->blkif_req.sector_number;
+       breq->operation     = BLKIF_OP_READ;
+
+       /* copy the sector range of every segment */
+       for (int i = 0; i < r->blkif_req.nr_segments; i++) {
+               struct blkif_request_segment *seg = breq->seg + i;
+               seg->first_sect = r->blkif_req.seg[i].first_sect;
+               seg->last_sect  = r->blkif_req.seg[i].last_sect;
+       }
+       s->cur += sreq->secs;
+
+       /* the vbd request slot with the same index must be idle */
+       vreq = vbd->request_list + idx;
+       assert(list_empty(&vreq->next));
+       assert(vreq->secs_pending == 0);
+
+       memcpy(&vreq->req, breq, sizeof(*breq));
+       vbd->received++;
+       vreq->vbd = vbd;
+
+       tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+       list_add_tail(&sreq->next, &s->pending_list);
+
+       return 0;
+}
+
+/*
+ * Queue read requests on stream1 for as long as sectors remain, no error
+ * is recorded and free requests are available, then issue them.  Blocks
+ * marked DD_BLK_UNUSED in the BAT of vhd1 are skipped.
+ */
+static void
+tapdisk_stream_enqueue1(void)
+{
+       td_vbd_t *vbd;
+       int i, idx, psize, blk;
+       struct tapdisk_stream *s = &stream1;
+
+       vbd = s->vbd;
+       psize = getpagesize();
+
+       while (s->cur < s->end && !s->err) {
+               blkif_request_t *breq;
+               td_vbd_request_t *vreq;
+               struct tapdisk_stream_request *sreq;
+
+               /* skip any blocks that are not present in this image */
+               blk = s->cur >> SPB_SHIFT;
+               while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) {
+                       //printf("skipping block %d\n", blk);
+                       blk++;
+                       s->cur = blk << SPB_SHIFT;
+               }
+
+               if (s->cur >= s->end)
+                       break;
+
+               /* out of request slots: try again on the next event */
+               sreq = tapdisk_stream_get_request(s);
+               if (!sreq)
+                       break;
+
+               idx                 = tapdisk_stream_request_idx(s, sreq);
+
+               sreq->sec           = s->cur;
+               sreq->secs          = 0;
+               sreq->seqno         = s->started++;
+
+               breq                = &sreq->blkif_req;
+               breq->id            = idx;
+               breq->nr_segments   = 0;
+               breq->sector_number = sreq->sec;
+               breq->operation     = BLKIF_OP_READ;
+
+               /* add segments of at most one page each; a request never
+                * extends past the image end or the current block */
+               for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+                       uint32_t secs;
+                       struct blkif_request_segment *seg = breq->seg + i;
+
+                       secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+                       secs = MIN(((blk + 1) << SPB_SHIFT) - s->cur, secs);
+                       if (!secs)
+                               break;
+
+                       sreq->secs += secs;
+                       s->cur     += secs;
+
+                       seg->first_sect = 0;
+                       seg->last_sect  = secs - 1;
+                       breq->nr_segments++;
+               }
+
+               /* the vbd request slot with the same index must be idle */
+               vreq = vbd->request_list + idx;
+
+               assert(list_empty(&vreq->next));
+               assert(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, breq, sizeof(*breq));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+               list_add_tail(&sreq->next, &s->pending_list);
+       }
+
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Replay on stream2 the requests stream1 has completed or still has
+ * pending, skipping those that start before stream2's current position.
+ * When every one of them could be queued, stream2 catches up with
+ * stream1's position.  Queued requests are issued in either case.
+ */
+static void
+tapdisk_stream_enqueue2(void)
+{
+       struct tapdisk_stream *s = &stream2;
+       td_vbd_t *vbd = s->vbd;
+       struct tapdisk_stream_request *itr;
+       int exhausted = 0;      /* set when stream2 ran out of free requests */
+
+       /* issue the same requests that we issued on stream1 */
+       list_for_each_entry(itr, &stream1.completed_list, next) {
+               if (itr->sec >= s->cur &&
+                   tapdisk_stream_enqueue_copy(s, itr)) {
+                       exhausted = 1;
+                       break;
+               }
+       }
+
+       if (!exhausted) {
+               list_for_each_entry(itr, &stream1.pending_list, next) {
+                       if (itr->sec >= s->cur &&
+                           tapdisk_stream_enqueue_copy(s, itr)) {
+                               exhausted = 1;
+                               break;
+                       }
+               }
+       }
+
+       if (!exhausted)
+               stream2.cur = stream1.cur;
+
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/* Return 1 when both streams have stopped, 0 otherwise. */
+static inline int
+tapdisk_diff_done(void)
+{
+       if (!tapdisk_stream_stop(&stream1))
+               return 0;
+
+       return tapdisk_stream_stop(&stream2) ? 1 : 0;
+}
+
+/* Close the images of both streams, stream1 first. */
+static void
+tapdisk_diff_stop(void)
+{
+       struct tapdisk_stream *streams[] = { &stream1, &stream2 };
+       unsigned int n;
+
+       for (n = 0; n < sizeof(streams) / sizeof(streams[0]); n++)
+               tapdisk_stream_close_image(streams[n]);
+}
+
+/*
+ * Scheduler callback for a stream's enqueue event; @arg is the stream.
+ * Clears the wake-up token, queues more work on that stream and closes
+ * both images once the whole comparison has finished.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+       struct tapdisk_stream *s = arg;
+
+       tapdisk_stream_poll_clear(&s->poll);
+
+       if (!tapdisk_diff_done()) {
+               if (s == &stream2)
+                       tapdisk_stream_enqueue2();
+               else if (s == &stream1)
+                       tapdisk_stream_enqueue1();
+               else
+                       assert(0);
+
+               // we have to check again for the case when stream1 had no
+               // blocks at all
+               if (!tapdisk_diff_done())
+                       return;
+       }
+
+       tapdisk_diff_stop();
+}
+
+/*
+ * Create a vbd for @s, open the image @path of the given @type read-only
+ * on it and initialise the stream's sector range from the image size.
+ * Returns 0 on success, a non-zero error code otherwise (a message is
+ * printed on stderr).
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+       image_t image;
+       int err;
+
+       s->id = tapdisk_stream_count++;
+
+       err = tapdisk_vbd_initialize(-1, -1, s->id);
+       if (err)
+               goto fail;
+
+       s->vbd = tapdisk_server_get_vbd(s->id);
+       if (!s->vbd) {
+               err = ENODEV;
+               goto fail;
+       }
+
+       /* completions are delivered to tapdisk_stream_dequeue() */
+       tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+       err = tapdisk_vbd_open_vdi(s->vbd, path, type,
+                                  TAPDISK_STORAGE_TYPE_DEFAULT,
+                                  TD_OPEN_RDONLY);
+       if (err)
+               goto fail;
+
+       s->vbd->reopened = 1;
+
+       err = tapdisk_vbd_get_image_info(s->vbd, &image);
+       if (err) {
+               fprintf(stderr, "failed getting image size: %d\n", err);
+               return err;
+       }
+
+       s->start = 0;
+       s->cur   = 0;
+       s->end   = image.size;
+
+       return 0;
+
+fail:
+       fprintf(stderr, "failed to open image %s: %d\n", path, err);
+       return err;
+}
+
+/*
+ * Close and free the vbd registered under the id of @s, including the
+ * ring buffers allocated for it.  Does nothing if no such vbd exists.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+       td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+       if (!vbd)
+               return;
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_server_remove_vbd(vbd);
+
+       free((void *)vbd->ring.vstart);
+       free(vbd->name);
+       free(vbd);
+
+       s->vbd = NULL;
+}
+
+/*
+ * Allocate the page-aligned data buffers for the vbd of @s and put every
+ * request of @s on its free list.
+ * Returns 0 on success or the posix_memalign() error code.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+       td_ring_t *ring = &s->vbd->ring;
+       int psize = getpagesize();
+       size_t size;
+       int err, i;
+
+       size = psize * BLKTAP_MMAP_REGION_SIZE;
+
+       /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+       err = posix_memalign((void **)&ring->vstart, psize, size);
+       if (err) {
+               fprintf(stderr, "failed to allocate buffers: %d\n", err);
+               ring->vstart = 0;
+               return err;
+       }
+
+       for (i = 0; i < MAX_REQUESTS; i++) {
+               tapdisk_stream_initialize_request(&s->requests[i]);
+               list_add_tail(&s->requests[i].next, &s->free_list);
+       }
+
+       return 0;
+}
+
+/*
+ * Open the wake-up pipe of @s and register tapdisk_stream_enqueue() as
+ * the handler for its read end.
+ * Returns 0 on success; otherwise the error from opening the pipe or
+ * registering the event, after printing a message on stderr.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_poll *p = &s->poll;
+       int err;
+
+       err = tapdisk_stream_poll_open(p);
+       if (!err) {
+               err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                   p->pipe[POLL_READ], 0,
+                                                   tapdisk_stream_enqueue, s);
+               if (err >= 0) {
+                       /* a non-negative result is the event id */
+                       s->enqueue_event_id = err;
+                       return 0;
+               }
+       }
+
+       fprintf(stderr, "failed to register event: %d\n", err);
+       return err;
+}
+
+/*
+ * Unregister the enqueue event of @s (an id of 0 counts as "not
+ * registered") and close its wake-up pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+       event_id_t id = s->enqueue_event_id;
+
+       if (id) {
+               tapdisk_server_unregister_event(id);
+               s->enqueue_event_id = 0;
+       }
+
+       tapdisk_stream_poll_close(&s->poll);
+}
+
+/* Zero @s and set up its three (empty) request lists. */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+       memset(s, 0, sizeof(struct tapdisk_stream));
+
+       INIT_LIST_HEAD(&s->completed_list);
+       INIT_LIST_HEAD(&s->pending_list);
+       INIT_LIST_HEAD(&s->free_list);
+}
+
+/*
+ * Set up stream @s for the image described by @arg: parse the argument,
+ * open the image, allocate request buffers, register the enqueue event
+ * and run the enqueue handler once.
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *arg)
+{
+       char *path;
+       int type;
+       int err;
+
+       err = tapdisk_parse_disk_type(arg, &path, &type);
+       if (err)
+               return err;
+
+       tapdisk_stream_initialize(s);
+
+       /* each step runs only if the previous one succeeded */
+       err = tapdisk_stream_open_image(s, path, type);
+       if (!err)
+               err = tapdisk_stream_initialize_requests(s);
+       if (!err)
+               err = tapdisk_stream_register_enqueue_event(s);
+       if (err)
+               return err;
+
+       tapdisk_stream_enqueue(s->enqueue_event_id,
+                              SCHEDULER_POLL_READ_FD, s);
+
+       return 0;
+}
+
+/*
+ * Tear down stream @s: close its image (if still open), then unregister
+ * its enqueue event and close the wake-up pipe.
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+       tapdisk_stream_close_image(s);
+       tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Run the enqueue handler for @s once, enter the tapdisk server loop and
+ * return the error recorded on @s afterwards.
+ * NOTE(review): no caller is visible in this file; main() calls
+ * tapdisk_server_run() directly -- confirm whether this is still needed.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+       tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s);
+       tapdisk_server_run();
+       return s->err;
+}
+
+/*
+ * tapdisk-diff -n type:/path/to/image -m type:/path/to/image
+ *
+ * Compares the data of the two images.  The first image (-n) must be a
+ * VHD; its BAT is used to skip unallocated blocks.
+ * Returns 0 when no error or mismatch was recorded, a non-zero error code
+ * otherwise, and 1 on a usage error.
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err, type1;
+       const char *arg1 = NULL, *arg2 = NULL;
+       char *path1;
+
+       err    = 0;
+
+       program = basename(argv[0]);
+       
+       while ((c = getopt(argc, argv, "n:m:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       arg1 = optarg;
+                       break;
+               case 'm':
+                       arg2 = optarg;
+                       break;
+               case 'h':
+                       usage(stdout);
+                       return 0;
+               default:
+                       goto fail_usage;
+               }
+       }
+
+       /* both images are mandatory */
+       if (!arg1 || !arg2)
+               goto fail_usage;
+
+       err = tapdisk_parse_disk_type(arg1, &path1, &type1);
+       if (err)
+               return err;
+       if (type1 != DISK_TYPE_VHD) {
+               printf("error: first VDI is not VHD\n");
+               return EINVAL;
+       }
+
+       /* vhd1 supplies the BAT consulted by tapdisk_stream_enqueue1() */
+       err = open_vhd(path1, &vhd1);
+       if (err)
+               return err;
+
+       tapdisk_start_logging("tapdisk-diff");
+
+       err = tapdisk_server_initialize(NULL, NULL);
+       if (err)
+               goto out;
+
+       err = tapdisk_stream_open(&stream1, arg1);
+       if (err) {
+               fprintf(stderr, "Failed to open %s: %s\n", 
+                       arg1, strerror(-err));
+               goto out;
+       }
+
+       err = tapdisk_stream_open(&stream2, arg2);
+       if (err) {
+               fprintf(stderr, "Failed to open %s: %s\n", 
+                       arg2, strerror(-err));
+               goto out1;
+       }
+
+       if (stream1.end != stream2.end) {
+               fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n",
+                               stream1.end, stream2.end);
+               err = EINVAL;
+               goto out2;
+       }
+
+       tapdisk_server_run();
+       
+       /* cleanup labels unwind in reverse order of setup */
+out2:
+       tapdisk_stream_release(&stream2);
+out1:
+       tapdisk_stream_release(&stream1);
+out:
+       vhd_close(&vhd1);
+       tapdisk_stop_logging();
+
+       /* GNU "?:" shorthand: err if non-zero, else the error on stream1 */
+       return err ? : stream1.err;
+
+fail_usage:
+       usage(stderr);
+       return 1;
+}
diff --git a/tools/blktap2/drivers/tapdisk-driver.c b/tools/blktap2/drivers/tapdisk-driver.c
new file mode 100644 (file)
index 0000000..ca5629a
--- /dev/null
@@ -0,0 +1,100 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdlib.h>
+
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+
+/*
+ * Allocate and initialise a driver handle for the given disk type.
+ *
+ * @type:    driver type, resolved through
+ *           tapdisk_server_find_driver_interface()
+ * @name:    image name; a private copy is stored in the handle
+ * @flags:   open flags; TD_OPEN_RDONLY marks the handle TD_DRIVER_RDONLY
+ * @storage: stored verbatim in the handle
+ *
+ * Returns the new handle, or NULL if the type is unknown or an
+ * allocation fails.  The handle is released with tapdisk_driver_free().
+ */
+td_driver_t *
+tapdisk_driver_allocate(int type, char *name, td_flag_t flags, int storage)
+{
+       struct tap_disk *iface;
+       td_driver_t *handle;
+
+       iface = tapdisk_server_find_driver_interface(type);
+       if (iface == NULL)
+               return NULL;
+
+       handle = calloc(1, sizeof(*handle));
+       if (handle == NULL)
+               return NULL;
+
+       if (tapdisk_namedup(&handle->name, name) != 0)
+               goto fail;
+
+       handle->type    = type;
+       handle->storage = storage;
+       handle->ops     = iface;
+
+       /* zeroed per-driver private area, sized by the driver itself */
+       handle->data = calloc(1, iface->private_data_size);
+       if (handle->data == NULL)
+               goto fail;
+
+       if (td_flag_test(flags, TD_OPEN_RDONLY))
+               td_flag_set(handle->state, TD_DRIVER_RDONLY);
+
+       return handle;
+
+fail:
+       free(handle->name);
+       free(handle->data);
+       free(handle);
+       return NULL;
+}
+
+/*
+ * Release a driver handle created by tapdisk_driver_allocate().
+ *
+ * A NULL handle is ignored, and a handle whose reference count is still
+ * non-zero is left untouched.  Freeing a handle that is still flagged
+ * TD_DRIVER_OPEN is logged but proceeds anyway.
+ */
+void
+tapdisk_driver_free(td_driver_t *driver)
+{
+       if (driver == NULL || driver->refcnt != 0)
+               return;
+
+       if (td_flag_test(driver->state, TD_DRIVER_OPEN)) {
+               EPRINTF("freeing open driver %s (state 0x%08x)\n",
+                       driver->name, driver->state);
+       }
+
+       free(driver->name);
+       free(driver->data);
+       free(driver);
+}
+
+/*
+ * Hand @tiocb to tapdisk_server_queue_tiocb().
+ * @driver is part of the signature but is not used by this function.
+ */
+void
+tapdisk_driver_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+       tapdisk_server_queue_tiocb(tiocb);
+}
+
+/*
+ * Invoke the driver's optional td_debug hook, if the driver provides one.
+ * @driver must be non-NULL.
+ */
+void
+tapdisk_driver_debug(td_driver_t *driver)
+{
+       struct tap_disk *iface = driver->ops;
+
+       if (iface->td_debug == NULL)
+               return;
+
+       iface->td_debug(driver);
+}
diff --git a/tools/blktap2/drivers/tapdisk-driver.h b/tools/blktap2/drivers/tapdisk-driver.h
new file mode 100644 (file)
index 0000000..de0a9be
--- /dev/null
@@ -0,0 +1,62 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_DRIVER_H_
+#define _TAPDISK_DRIVER_H_
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-queue.h"
+
+#define TD_DRIVER_OPEN               0x0001
+#define TD_DRIVER_RDONLY             0x0002
+
+/*
+ * Per-driver state.  Several images may point at one handle; refcnt
+ * tracks how many have opened or loaded it.
+ */
+struct td_driver_handle {
+       /* driver type used to look up ops in tapdisk_driver_allocate() */
+       int                          type;
+       /* private copy of the image name, freed in tapdisk_driver_free() */
+       char                        *name;
+
+       /* storage value supplied by the caller of tapdisk_driver_allocate() */
+       int                          storage;
+
+       /* bumped by td_open()/td_load(), dropped by td_close() */
+       int                          refcnt;
+       /* TD_DRIVER_* flag bits */
+       td_flag_t                    state;
+
+       /* disk information; copied into the image by td_open()/td_load() */
+       td_disk_info_t               info;
+
+       /* zeroed driver-private area of ops->private_data_size bytes */
+       void                        *data;
+       /* driver interface (td_open, td_close, td_queue_read, ...) */
+       struct tap_disk             *ops;
+
+       /* list linkage; the owning list is not visible in this header */
+       struct list_head             next;
+};
+
+td_driver_t *tapdisk_driver_allocate(int, char *, td_flag_t, int);
+void tapdisk_driver_free(td_driver_t *);
+
+void tapdisk_driver_queue_tiocb(td_driver_t *, struct tiocb *);
+
+void tapdisk_driver_debug(td_driver_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-filter.c b/tools/blktap2/drivers/tapdisk-filter.c
new file mode 100644 (file)
index 0000000..fc018ea
--- /dev/null
@@ -0,0 +1,271 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+#include <syslog.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+#include "tapdisk-filter.h"
+
+#define RSEED      7
+#define PRE_CHECK  0
+#define POST_CHECK 1
+
+#define WRITE_INTEGRITY   "buffer integrity failure after write"
+#define READ_INTEGRITY    "disk integrity failure after read"
+
+#define DBG(f, a...) tlog_write(TLOG_WARN, f, ##a)
+
+/*
+ * simulate IO errors by knocking request size to zero before
+ * submitting and restoring original size before returning
+ */
+/*
+ * Take a free fiocb slot, stash the request's real byte count and data
+ * cookie in it, then zero the byte count and point io->data at the slot.
+ * Does nothing when no slot is free.
+ */
+static inline void
+inject_fault(struct tfilter *filter, struct iocb *io)
+{
+       struct fiocb *slot;
+
+       if (filter->ffree == 0)
+               return;
+
+       filter->ffree--;
+       slot = filter->flist[filter->ffree];
+
+       /* remember the original values so recover_fault() can restore them */
+       slot->data  = io->data;
+       slot->bytes = io->u.c.nbytes;
+
+       io->data       = slot;
+       io->u.c.nbytes = 0;
+}
+
+/*
+ * Return non-zero when io->data points into the filter's fiocb array,
+ * i.e. when inject_fault() replaced the request's data cookie.
+ */
+static inline int
+fault_injected(struct tfilter *filter, struct iocb *io)
+{
+       unsigned long first  = (unsigned long)filter->fiocbs;
+       unsigned long limit  = first + (filter->iocbs * sizeof(struct fiocb));
+       unsigned long cookie = (unsigned long)io->data;
+
+       if (cookie < first)
+               return 0;
+
+       return (cookie < limit);
+}
+
+/*
+ * Undo inject_fault(): restore the request's saved byte count and data
+ * cookie, clear the fiocb slot and push it back on the free list.
+ * Must only be called when fault_injected() is true for @io.
+ */
+static inline void
+recover_fault(struct tfilter *filter, struct iocb *io)
+{
+       struct fiocb *slot = io->data;
+
+       io->data       = slot->data;
+       io->u.c.nbytes = slot->bytes;
+
+       memset(slot, 0, sizeof(*slot));
+
+       filter->flist[filter->ffree] = slot;
+       filter->ffree++;
+}
+
+/*
+ * Sum the 64 consecutive 64-bit words of the 512-byte sector at @buf.
+ * The sum wraps modulo 2^64.
+ */
+static inline uint64_t
+chksum(char *buf)
+{
+       const uint64_t *word = (const uint64_t *)buf;
+       const uint64_t *stop = word + (512 / sizeof(uint64_t));
+       uint64_t total       = 0;
+
+       while (word < stop)
+               total += *word++;
+
+       return total;
+}
+
+/*
+ * Compare the checksum of the sector data in @buf against the value
+ * recorded for sector @sec and log a warning, prefixed with @type, on
+ * mismatch.  Sectors with no recorded entry (timestamp seconds == 0)
+ * are skipped.
+ *
+ * The caller must ensure @sec is below filter->secs.
+ */
+static inline void
+check_hash(struct tfilter *filter, uint64_t sec, char *buf, char *type)
+{
+       uint64_t sum;
+       struct dhash *hash;
+
+       hash = filter->dhash + sec;
+       if (!hash->time.tv_sec)
+               return;
+
+       /*
+        * Checksum the buffer once, so the value compared is exactly the
+        * value reported below.
+        */
+       sum = chksum(buf);
+       if (hash->hash != sum) {
+               struct timeval now;
+               gettimeofday(&now, NULL);
+               DBG("%s: hash table: 0x%020" PRIx64 " at %012lu.%06lu, "
+                   "from disk: 0x%020" PRIx64 " at %012lu.%06lu\n",
+                   type, hash->hash, hash->time.tv_sec,
+                   hash->time.tv_usec, sum, now.tv_sec, now.tv_usec);
+       }
+}
+
+/*
+ * Record the checksum of the sector data in @buf, together with the
+ * current time, as the expected contents of sector @sec.
+ * The caller must ensure @sec is below filter->secs.
+ */
+static inline void
+insert_hash(struct tfilter *filter, uint64_t sec, char *buf)
+{
+       struct dhash *entry = &filter->dhash[sec];
+
+       entry->hash = chksum(buf);
+       gettimeofday(&entry->time, NULL);
+}
+
+/*
+ * Apply the integrity hook to a single sector.
+ *
+ * @type: PRE_CHECK (before submission) or POST_CHECK (after completion)
+ * @rw:   non-zero for a write, zero for a read
+ * @sec:  sector number; sectors at or beyond filter->secs are ignored
+ * @buf:  the sector's data
+ *
+ * Writes record the checksum before submission and verify the buffer
+ * after completion.  Reads are only handled after completion: the data
+ * is verified against the recorded checksum, which is then refreshed.
+ */
+static void
+check_sector(struct tfilter *filter, int type, int rw, uint64_t sec, char *buf)
+{
+       if (sec >= filter->secs)
+               return;
+
+       if (rw) {
+               if (type == PRE_CHECK)
+                       insert_hash(filter, sec, buf);
+               else
+                       check_hash(filter, sec, buf, WRITE_INTEGRITY);
+       } else if (type == POST_CHECK) {
+               check_hash(filter, sec, buf, READ_INTEGRITY);
+               insert_hash(filter, sec, buf);
+       }
+}
+
+/*
+ * Run check_sector() over the request's buffer in 512-byte steps.
+ *
+ * @type: PRE_CHECK or POST_CHECK, forwarded to check_sector()
+ * @io:   the request; IO_CMD_PWRITE is treated as a write, any other
+ *        opcode as a read
+ */
+static void
+check_data(struct tfilter *filter, int type, struct iocb *io)
+{
+       int rw;
+       uint64_t i;
+       char *base;
+
+       rw   = (io->aio_lio_opcode == IO_CMD_PWRITE);
+       /* use a char pointer: arithmetic on void * is a GNU extension */
+       base = io->u.c.buf;
+
+       for (i = 0; i < io->u.c.nbytes; i += 512) {
+               uint64_t sec = (io->u.c.offset + i) >> 9;
+               check_sector(filter, type, rw, sec, base + i);
+       }
+}
+
+/*
+ * Create a debugging filter.
+ *
+ * @mode:  bitmask of TD_INJECT_FAULTS / TD_CHECK_INTEGRITY; 0 disables
+ *         filtering and makes this function return NULL
+ * @iocbs: number of fault slots to allocate for TD_INJECT_FAULTS
+ * @secs:  number of sectors to track for TD_CHECK_INTEGRITY
+ *
+ * If the tables for a requested mode cannot be allocated, that mode bit
+ * is cleared and the filter is still returned.  Returns NULL when @mode
+ * is 0 or the filter itself cannot be allocated.  Release the result
+ * with tapdisk_free_tfilter().
+ */
+struct tfilter *
+tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs)
+{
+       int i;
+       struct tfilter *filter;
+
+       if (!mode)
+               return NULL;
+
+       filter = calloc(1, sizeof(struct tfilter));
+       if (!filter)
+               return NULL;
+
+       filter->mode  = mode;
+       filter->secs  = secs;
+       filter->iocbs = iocbs;
+
+       if (filter->mode & TD_INJECT_FAULTS) {
+               filter->fiocbs = calloc(iocbs, sizeof(struct fiocb));
+               filter->flist  = calloc(iocbs, sizeof(struct fiocb *));
+               if (!filter->fiocbs || !filter->flist) {
+                       /* do not keep a half-built table around */
+                       free(filter->fiocbs);
+                       free(filter->flist);
+                       filter->fiocbs = NULL;
+                       filter->flist  = NULL;
+                       filter->mode  &= ~TD_INJECT_FAULTS;
+               } else {
+                       /*
+                        * tapdisk_filter_iocbs() draws from random(),
+                        * whose seeding function is srandom(), not srand().
+                        */
+                       srandom(RSEED);
+                       filter->ffree = iocbs;
+                       for (i = 0; i < iocbs; i++)
+                               filter->flist[i] = filter->fiocbs + i;
+               }
+       }
+
+       if (filter->mode & TD_CHECK_INTEGRITY) {
+               filter->dhash = calloc(secs, sizeof(struct dhash));
+               if (!filter->dhash)
+                       filter->mode &= ~TD_CHECK_INTEGRITY;
+       }
+
+       syslog(LOG_WARNING, "WARNING: "
+              "FILTERING IN MODE 0x%04x\n", filter->mode);
+
+       return filter;
+}
+
+/*
+ * Release a filter created by tapdisk_init_tfilter() together with its
+ * tables.  A NULL filter is ignored.
+ */
+void
+tapdisk_free_tfilter(struct tfilter *filter)
+{
+       if (filter != NULL) {
+               free(filter->fiocbs);
+               free(filter->flist);
+               free(filter->dhash);
+               free(filter);
+       }
+}
+
+/*
+ * Pre-submission hook: for each of the @num requests in @iocbs, either
+ * inject a fault (TD_INJECT_FAULTS) or record the integrity checksums of
+ * the data (TD_CHECK_INTEGRITY).  A NULL @filter makes this a no-op.
+ */
+void
+tapdisk_filter_iocbs(struct tfilter *filter, struct iocb **iocbs, int num)
+{
+       int i;
+
+       if (!filter)
+               return;
+
+       for (i = 0; i < num; i++) {
+               struct iocb *io = iocbs[i];
+
+               if (filter->mode & TD_INJECT_FAULTS) {
+                       /*
+                        * Only divert the request when a fault slot is
+                        * actually free.  Otherwise inject_fault() would do
+                        * nothing, and skipping the integrity pre-check
+                        * would leave a stale checksum for the completion
+                        * hook to compare against.
+                        *
+                        * NOTE(review): "<=" selects 0..TD_FAULT_RATE, i.e.
+                        * TD_FAULT_RATE + 1 values out of 100 -- confirm
+                        * the intended rate.
+                        */
+                       if ((random() % 100) <= TD_FAULT_RATE &&
+                           filter->ffree) {
+                               inject_fault(filter, io);
+                               continue;
+                       }
+               }
+
+               if (filter->mode & TD_CHECK_INTEGRITY)
+                       check_data(filter, PRE_CHECK, io);
+       }
+}
+
+/*
+ * Post-completion hook: for each of the @num completion events, undo an
+ * injected fault if there was one; otherwise run the integrity
+ * post-check when TD_CHECK_INTEGRITY is enabled.  A NULL @filter makes
+ * this a no-op.
+ */
+void
+tapdisk_filter_events(struct tfilter *filter, struct io_event *events, int num)
+{
+       int idx;
+
+       if (filter == NULL)
+               return;
+
+       for (idx = 0; idx < num; idx++) {
+               struct iocb *cb = events[idx].obj;
+
+               if ((filter->mode & TD_INJECT_FAULTS) &&
+                   fault_injected(filter, cb)) {
+                       recover_fault(filter, cb);
+                       continue;
+               }
+
+               if (filter->mode & TD_CHECK_INTEGRITY)
+                       check_data(filter, POST_CHECK, cb);
+       }
+}
diff --git a/tools/blktap2/drivers/tapdisk-filter.h b/tools/blktap2/drivers/tapdisk-filter.h
new file mode 100644 (file)
index 0000000..c4e977e
--- /dev/null
@@ -0,0 +1,67 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef TAPDISK_FILTER_H
+#define TAPDISK_FILTER_H
+
+#include <libaio.h>
+#include <inttypes.h>
+#include <time.h>
+
+#define TD_INJECT_FAULTS     0x00001  /* simulate random IO failures */
+#define TD_CHECK_INTEGRITY   0x00002  /* check data integrity */
+
+#define TD_FAULT_RATE        5
+
+/*
+ * Expected contents of one sector, as tracked by the integrity checker.
+ * NOTE(review): struct timeval is normally declared by <sys/time.h>;
+ * this header only includes <time.h> -- confirm it is self-contained.
+ */
+struct dhash {
+       /* checksum of the sector's data (see chksum() in tapdisk-filter.c) */
+       uint64_t             hash;
+       /* when the checksum was recorded; tv_sec == 0 means "no entry" */
+       struct timeval       time;
+};
+
+/* Saved request fields for an iocb that had a fault injected into it. */
+struct fiocb {
+       /* original io->u.c.nbytes, restored by recover_fault() */
+       size_t               bytes;
+       /* original io->data cookie, restored by recover_fault() */
+       void                *data;
+};
+
+/* State of a debugging filter created by tapdisk_init_tfilter(). */
+struct tfilter {
+       /* enabled TD_INJECT_FAULTS / TD_CHECK_INTEGRITY bits */
+       int                  mode;
+       /* number of entries in dhash */
+       uint64_t             secs;
+       /* number of entries in fiocbs and flist */
+       int                  iocbs;
+
+       /* per-sector checksum table (TD_CHECK_INTEGRITY), or NULL */
+       struct dhash        *dhash;
+
+       /* number of free slots currently held in flist */
+       int                  ffree;
+       /* backing array of fault slots (TD_INJECT_FAULTS), or NULL */
+       struct fiocb        *fiocbs;
+       /* stack of pointers to free entries of fiocbs */
+       struct fiocb       **flist;
+};
+
+struct tfilter *tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs);
+void tapdisk_free_tfilter(struct tfilter *);
+void tapdisk_filter_iocbs(struct tfilter *, struct iocb **, int);
+void tapdisk_filter_events(struct tfilter *, struct io_event *, int);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-image.c b/tools/blktap2/drivers/tapdisk-image.c
new file mode 100644 (file)
index 0000000..6da7f48
--- /dev/null
@@ -0,0 +1,160 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * Allocate an image handle.
+ *
+ * @file:    image name; a private copy is stored in the handle
+ * @type, @storage, @flags, @private: stored verbatim in the handle
+ *
+ * The handle's list node is initialised empty and no driver is
+ * attached.  Returns NULL on allocation failure.  Release the handle
+ * with tapdisk_image_free().
+ */
+td_image_t *
+tapdisk_image_allocate(char *file, int type, int storage,
+                      td_flag_t flags, void *private)
+{
+       td_image_t *img;
+
+       img = calloc(1, sizeof(*img));
+       if (img == NULL)
+               return NULL;
+
+       if (tapdisk_namedup(&img->name, file) != 0) {
+               free(img);
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&img->next);
+       img->private = private;
+       img->storage = storage;
+       img->flags   = flags;
+       img->type    = type;
+
+       return img;
+}
+
+/*
+ * Unlink @image from whatever list it is on and release it.
+ * The attached driver is passed to tapdisk_driver_free(), which only
+ * frees it once its reference count has dropped to zero.
+ * A NULL @image is ignored.
+ */
+void
+tapdisk_image_free(td_image_t *image)
+{
+       if (image != NULL) {
+               list_del(&image->next);
+
+               free(image->name);
+               tapdisk_driver_free(image->driver);
+               free(image);
+       }
+}
+
+/*
+ * Validate an internal request against @image.
+ *
+ * Returns 0 if the request is acceptable, -ENODEV if the image has no
+ * driver, and -EINVAL (after logging) if the operation is neither read
+ * nor write, is a write to a read-only image, has a non-positive sector
+ * count, or extends past the end of the disk.
+ */
+int
+tapdisk_image_check_td_request(td_image_t *image, td_request_t treq)
+{
+       td_driver_t *drv = image->driver;
+       td_disk_info_t *info;
+       int rdonly;
+
+       if (drv == NULL)
+               return -ENODEV;
+
+       info   = &drv->info;
+       rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+       if (treq.op == TD_OP_WRITE) {
+               if (rdonly)
+                       goto fail;
+       } else if (treq.op != TD_OP_READ) {
+               goto fail;
+       }
+
+       if (treq.secs <= 0)
+               goto fail;
+
+       if (treq.sec + treq.secs > info->size)
+               goto fail;
+
+       return 0;
+
+fail:
+       ERR(-EINVAL, "bad td request on %s (%s, %"PRIu64"): %d at %"PRIu64,
+           image->name, (rdonly ? "ro" : "rw"), info->size, treq.op,
+           treq.sec + treq.secs);
+       return -EINVAL;
+}
+
+/*
+ * Validate a block-ring request against @image.
+ *
+ * Returns 0 if the request is acceptable, -ENODEV if the image has no
+ * driver, and -EINVAL (after logging) when:
+ *   - the operation is neither BLKIF_OP_READ nor BLKIF_OP_WRITE,
+ *   - it is a write to a read-only image,
+ *   - the segment count is 0 or exceeds MAX_SEGMENTS_PER_REQ,
+ *   - a segment's last sector lies outside the page or before its
+ *     first sector,
+ *   - the request as a whole extends past the end of the disk.
+ */
+int
+tapdisk_image_check_ring_request(td_image_t *image, blkif_request_t *req)
+{
+       td_driver_t *driver;
+       td_disk_info_t *info;
+       int i, psize, rdonly;
+       uint64_t total;
+
+       driver = image->driver;
+       if (!driver)
+               return -ENODEV;
+
+       total  = 0;
+       info   = &driver->info;
+
+       rdonly = td_flag_test(image->flags, TD_OPEN_RDONLY);
+
+       if (req->operation != BLKIF_OP_READ &&
+           req->operation != BLKIF_OP_WRITE)
+               goto fail;
+
+       if (req->operation == BLKIF_OP_WRITE && rdonly)
+               goto fail;
+
+       if (!req->nr_segments || req->nr_segments > MAX_SEGMENTS_PER_REQ)
+               goto fail;
+
+       psize = getpagesize();
+
+       for (i = 0; i < req->nr_segments; i++) {
+               /*
+                * Reject inverted segments before computing the length:
+                * the count is unsigned, so a "<= 0" test afterwards
+                * cannot see a negative difference.
+                */
+               if (req->seg[i].last_sect >= psize >> 9 ||
+                   req->seg[i].last_sect < req->seg[i].first_sect)
+                       goto fail;
+
+               total += req->seg[i].last_sect - req->seg[i].first_sect + 1;
+       }
+
+       /*
+        * Bound the whole request, not just its last segment, and phrase
+        * the test so that sector_number + total cannot wrap.
+        */
+       if (total > info->size || req->sector_number > info->size - total)
+               goto fail;
+
+       return 0;
+
+fail:
+       ERR(-EINVAL, "bad request on %s (%s, %"PRIu64"): id: %"PRIu64": %d at %"PRIu64,
+           image->name, (rdonly ? "ro" : "rw"), info->size, req->id,
+           req->operation, req->sector_number + total);
+       return -EINVAL;
+}
diff --git a/tools/blktap2/drivers/tapdisk-image.h b/tools/blktap2/drivers/tapdisk-image.h
new file mode 100644 (file)
index 0000000..8779dff
--- /dev/null
@@ -0,0 +1,55 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_IMAGE_H_
+#define _TAPDISK_IMAGE_H_
+
+#include "tapdisk.h"
+#include <xen/io/blkif.h>
+
+/* One image of a virtual disk, as created by tapdisk_image_allocate(). */
+struct td_image_handle {
+       /* driver type, passed to tapdisk_driver_allocate() by td_open() */
+       int                          type;
+       /* private copy of the image name, freed in tapdisk_image_free() */
+       char                        *name;
+
+       /* open flags; TD_OPEN_RDONLY makes write requests invalid */
+       td_flag_t                    flags;
+       /* storage value supplied by the caller of tapdisk_image_allocate() */
+       int                          storage;
+
+       /* attached driver, or NULL until td_open()/td_load() succeeds */
+       td_driver_t                 *driver;
+       /* copy of driver->info taken by td_open()/td_load() */
+       td_disk_info_t               info;
+
+       /* opaque pointer supplied by the caller of tapdisk_image_allocate() */
+       void                        *private;
+
+       /* list linkage; unlinked by tapdisk_image_free() */
+       struct list_head             next;
+};
+
+td_image_t *tapdisk_image_allocate(char *, int, int, td_flag_t, void *);
+void tapdisk_image_free(td_image_t *);
+
+int tapdisk_image_check_td_request(td_image_t *, td_request_t);
+int tapdisk_image_check_ring_request(td_image_t *, blkif_request_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-interface.c b/tools/blktap2/drivers/tapdisk-interface.c
new file mode 100644 (file)
index 0000000..58366d0
--- /dev/null
@@ -0,0 +1,250 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+
+#include "tapdisk.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+/*
+ * Attach @image to the driver of an already-registered shared image.
+ *
+ * Looks the shared image up with tapdisk_server_get_shared_image(),
+ * takes a reference on its driver and copies the driver's disk info
+ * into @image.
+ *
+ * Returns 0 on success, -ENODEV if no shared image is found, or -EBADF
+ * if the shared image has no driver.
+ */
+int
+td_load(td_image_t *image)
+{
+       td_image_t *shared;
+       td_driver_t *driver;
+
+       shared = tapdisk_server_get_shared_image(image);
+       if (!shared)
+               return -ENODEV;
+
+       driver = shared->driver;
+       if (!driver)
+               return -EBADF;
+
+       driver->refcnt++;
+       image->driver = driver;
+       image->info   = driver->info;
+
+       DPRINTF("loaded shared image %s (%d users, state: 0x%08x, type: %d)\n",
+               driver->name, driver->refcnt, driver->state, driver->type);
+       return 0;
+}
+
+/*
+ * Open @image, allocating a driver for it if none is attached yet.
+ *
+ * If the driver is not already flagged TD_DRIVER_OPEN, its td_open hook
+ * is called; a driver that is already open is reused.  On success the
+ * driver is attached to the image, its disk info is copied into the
+ * image and its reference count is incremented.
+ *
+ * Returns 0 on success, -ENOMEM if no driver could be allocated, or the
+ * error returned by the driver's td_open hook.
+ */
+int
+td_open(td_image_t *image)
+{
+       int err;
+       td_driver_t *driver;
+
+       driver = image->driver;
+       if (!driver) {
+               driver = tapdisk_driver_allocate(image->type,
+                                                image->name,
+                                                image->flags,
+                                                image->storage);
+               if (!driver)
+                       return -ENOMEM;
+       }
+
+       if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) {
+               err = driver->ops->td_open(driver, image->name, image->flags);
+               if (err) {
+                       /*
+                        * image->driver is still NULL only when the driver
+                        * was allocated above, so only that one is freed.
+                        */
+                       if (!image->driver)
+                               tapdisk_driver_free(driver);
+                       return err;
+               }
+
+               td_flag_set(driver->state, TD_DRIVER_OPEN);
+               /* refcnt is bumped below, hence the "+ 1" in the log */
+               DPRINTF("opened image %s (%d users, state: 0x%08x, type: %d)\n",
+                       driver->name, driver->refcnt + 1,
+                       driver->state, driver->type);
+       }
+
+       image->driver = driver;
+       image->info   = driver->info;
+       driver->refcnt++;
+       return 0;
+}
+
+/*
+ * Drop @image's reference on its driver.  When the last reference goes
+ * away and the driver is open, the driver's td_close hook is called and
+ * TD_DRIVER_OPEN is cleared.  The driver itself is not freed here.
+ *
+ * Returns 0, or -ENODEV if the image has no driver.
+ */
+int
+td_close(td_image_t *image)
+{
+       td_driver_t *drv = image->driver;
+
+       if (drv == NULL)
+               return -ENODEV;
+
+       if (--drv->refcnt == 0 &&
+           td_flag_test(drv->state, TD_DRIVER_OPEN)) {
+               drv->ops->td_close(drv);
+               td_flag_clear(drv->state, TD_DRIVER_OPEN);
+       }
+
+       DPRINTF("closed image %s (%d users, state: 0x%08x, type: %d)\n",
+               drv->name, drv->refcnt, drv->state, drv->type);
+
+       return 0;
+}
+
+/*
+ * Ask the image's driver for its parent id, stored through @id.
+ *
+ * Returns the driver hook's result, -ENODEV if the image has no driver,
+ * or -EBADF if the driver is not open.
+ */
+int
+td_get_parent_id(td_image_t *image, td_disk_id_t *id)
+{
+       td_driver_t *drv = image->driver;
+
+       if (drv == NULL)
+               return -ENODEV;
+
+       if (td_flag_test(drv->state, TD_DRIVER_OPEN))
+               return drv->ops->td_get_parent_id(drv, id);
+
+       return -EBADF;
+}
+
+/*
+ * Check that @image and @parent both have open drivers.
+ *
+ * Returns 0 when they do, -ENODEV if either has no driver, or -EBADF if
+ * either driver is not open.
+ */
+int
+td_validate_parent(td_image_t *image, td_image_t *parent)
+{
+       td_driver_t *driver, *pdriver;
+
+       driver  = image->driver;
+       pdriver = parent->driver;
+       if (!driver || !pdriver)
+               return -ENODEV;
+
+       if (!td_flag_test(driver->state, TD_DRIVER_OPEN) ||
+           !td_flag_test(pdriver->state, TD_DRIVER_OPEN))
+               return -EBADF;
+
+       /*
+        * NOTE(review): the unconditional return below makes the driver's
+        * td_validate_parent hook unreachable, so driver-level validation
+        * never runs.  Confirm whether this is deliberate.
+        */
+       return 0;
+       return driver->ops->td_validate_parent(driver, pdriver, 0);
+}
+
+/*
+ * Validate a write request and pass it to the image's driver.
+ *
+ * If the image has no driver (-ENODEV), the driver is not open (-EBADF)
+ * or tapdisk_image_check_td_request() rejects the request, the request
+ * is completed immediately with that error instead.
+ */
+void
+td_queue_write(td_image_t *image, td_request_t treq)
+{
+       td_driver_t *drv = image->driver;
+       int rc;
+
+       if (drv == NULL)
+               rc = -ENODEV;
+       else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+               rc = -EBADF;
+       else
+               rc = tapdisk_image_check_td_request(image, treq);
+
+       if (rc) {
+               td_complete_request(treq, rc);
+               return;
+       }
+
+       drv->ops->td_queue_write(drv, treq);
+}
+
+/*
+ * Validate a read request and pass it to the image's driver.
+ *
+ * If the image has no driver (-ENODEV), the driver is not open (-EBADF)
+ * or tapdisk_image_check_td_request() rejects the request, the request
+ * is completed immediately with that error instead.
+ */
+void
+td_queue_read(td_image_t *image, td_request_t treq)
+{
+       td_driver_t *drv = image->driver;
+       int rc;
+
+       if (drv == NULL)
+               rc = -ENODEV;
+       else if (!td_flag_test(drv->state, TD_DRIVER_OPEN))
+               rc = -EBADF;
+       else
+               rc = tapdisk_image_check_td_request(image, treq);
+
+       if (rc) {
+               td_complete_request(treq, rc);
+               return;
+       }
+
+       drv->ops->td_queue_read(drv, treq);
+}
+
+/* Pass @treq on to tapdisk_vbd_forward_request(). */
+void
+td_forward_request(td_request_t treq)
+{
+       tapdisk_vbd_forward_request(treq);
+}
+
+/* Complete @treq by invoking its callback with result @res. */
+void
+td_complete_request(td_request_t treq, int res)
+{
+       treq.cb(treq, res);
+}
+
+/* Queue @tiocb on behalf of @driver via tapdisk_driver_queue_tiocb(). */
+void
+td_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
+{
+       tapdisk_driver_queue_tiocb(driver, tiocb);
+}
+
+/*
+ * Prepare @tiocb with tapdisk_prep_tiocb(), passing 0 as its third
+ * argument (presumably the read/write selector -- confirm against
+ * tapdisk_prep_tiocb) and forwarding all other arguments unchanged.
+ */
+void
+td_prep_read(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+            long long offset, td_queue_callback_t cb, void *arg)
+{
+       tapdisk_prep_tiocb(tiocb, fd, 0, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Prepare @tiocb with tapdisk_prep_tiocb(), passing 1 as its third
+ * argument (presumably the read/write selector -- confirm against
+ * tapdisk_prep_tiocb) and forwarding all other arguments unchanged.
+ */
+void
+td_prep_write(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
+             long long offset, td_queue_callback_t cb, void *arg)
+{
+       tapdisk_prep_tiocb(tiocb, fd, 1, buf, bytes, offset, cb, arg);
+}
+
+/*
+ * Run the debug hook of the image's driver.  Does nothing when the
+ * image has no driver or the driver is not open.
+ */
+void
+td_debug(td_image_t *image)
+{
+       td_driver_t *drv = image->driver;
+
+       if (drv != NULL && td_flag_test(drv->state, TD_DRIVER_OPEN))
+               tapdisk_driver_debug(drv);
+}
diff --git a/tools/blktap2/drivers/tapdisk-interface.h b/tools/blktap2/drivers/tapdisk-interface.h
new file mode 100644 (file)
index 0000000..1e48e58
--- /dev/null
@@ -0,0 +1,53 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_INTERFACE_H_
+#define _TAPDISK_INTERFACE_H_
+
+#include "tapdisk.h"
+#include "tapdisk-queue.h"
+
+/*
+ * Image-level operations returning an int status.  Their definitions
+ * are not part of this header; judging by the names they cover opening,
+ * loading and closing an image and querying/validating its parent --
+ * confirm against tapdisk-interface.c.
+ */
+int td_open(td_image_t *);
+int td_load(td_image_t *);
+int td_close(td_image_t *);
+int td_get_parent_id(td_image_t *, td_disk_id_t *);
+int td_validate_parent(td_image_t *, td_image_t *);
+
+/*
+ * Request submission, forwarding and completion.  These return nothing;
+ * td_complete_request() takes the result code as its int argument.
+ */
+void td_queue_write(td_image_t *, td_request_t);
+void td_queue_read(td_image_t *, td_request_t);
+void td_forward_request(td_request_t);
+void td_complete_request(td_request_t, int);
+
+/* Debug hook for an image. */
+void td_debug(td_image_t *);
+
+/*
+ * tiocb helpers.  Arguments of td_prep_read()/td_prep_write(), in order:
+ * tiocb, file descriptor, buffer, byte count, offset, completion
+ * callback, callback argument.
+ */
+void td_queue_tiocb(td_driver_t *, struct tiocb *);
+void td_prep_read(struct tiocb *, int, char *, size_t,
+                 long long, td_queue_callback_t, void *);
+void td_prep_write(struct tiocb *, int, char *, size_t,
+                  long long, td_queue_callback_t, void *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-ipc.c b/tools/blktap2/drivers/tapdisk-ipc.c
new file mode 100644 (file)
index 0000000..3cfdb6c
--- /dev/null
@@ -0,0 +1,279 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tapdisk.h"
+#include "tapdisk-ipc.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+
+/*
+ * Write one complete tapdisk_message_t to @fd.
+ *
+ * @timeout is the select() timeout in seconds; 0 means wait without a
+ * time limit.  Returns 0 once the whole message has been written, or
+ * -EIO if select() fails or reports the descriptor as not writable, or
+ * if write() fails or makes no progress, before that point.
+ */
+static int
+tapdisk_ipc_write_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set writefds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(tapdisk_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       DPRINTF("sending '%s' message (uuid = %u)\n",
+               tapdisk_message_name(message->type), message->cookie);
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, NULL, &writefds, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &writefds)) {
+                       /* offset counts bytes, so address the message as a
+                        * byte array: "message + offset" would step in
+                        * units of whole structs after a short write */
+                       ret = write(fd, (char *)message + offset,
+                                   len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("failure writing message\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/*
+ * Send a payload-free message of the given @type, tagged with the uuid
+ * of @ipc, using a 2 second select() timeout.
+ *
+ * Returns 0 without sending anything when @ipc has no write descriptor
+ * (wfd == -1); otherwise returns the result of
+ * tapdisk_ipc_write_message().
+ */
+int
+tapdisk_ipc_write(td_ipc_t *ipc, int type)
+{
+       tapdisk_message_t msg;
+
+       if (ipc->wfd == -1)
+               return 0;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.cookie = ipc->uuid;
+       msg.type   = type;
+
+       return tapdisk_ipc_write_message(ipc->wfd, &msg, 2);
+}
+
+/*
+ * Send a TAPDISK_MESSAGE_RUNTIME_ERROR message carrying @text (truncated
+ * to fit the message's string field), tagged with the uuid of @ipc,
+ * using a 2 second select() timeout.
+ *
+ * Returns 0 without sending anything when @ipc has no write descriptor
+ * (wfd == -1); otherwise returns the result of
+ * tapdisk_ipc_write_message().
+ */
+int
+tapdisk_ipc_write_error(td_ipc_t *ipc, const char *text)
+{
+       tapdisk_message_t message;
+
+       /* same convention as tapdisk_ipc_write(): a negative descriptor
+        * must never reach FD_SET()/select() */
+       if (ipc->wfd == -1)
+               return 0;
+
+       memset(&message, 0, sizeof(message));
+       message.type   = TAPDISK_MESSAGE_RUNTIME_ERROR;
+       message.cookie = ipc->uuid;
+       snprintf(message.u.string.text, sizeof(message.u.string.text), "%s", text);
+
+       return tapdisk_ipc_write_message(ipc->wfd, &message, 2);
+}
+
+/*
+ * Read one complete tapdisk_message_t from @fd into @message.
+ *
+ * @message is zeroed first.  @timeout is the select() timeout in
+ * seconds; 0 means wait without a time limit.  Returns 0 once the whole
+ * message has been read, or -EIO if select() fails or reports the
+ * descriptor as not readable, or if read() fails or returns 0, before
+ * that point.
+ */
+static int
+tapdisk_ipc_read_message(int fd, tapdisk_message_t *message, int timeout)
+{
+       fd_set readfds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(tapdisk_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       memset(message, 0, sizeof(tapdisk_message_t));
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, &readfds, NULL, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &readfds)) {
+                       /* offset counts bytes, so address the message as a
+                        * byte array: "message + offset" would step in
+                        * units of whole structs after a short read */
+                       ret = read(fd, (char *)message + offset,
+                                  len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;
+       }
+
+       if (offset != len) {
+               EPRINTF("failure reading message\n");
+               return -EIO;
+       }
+
+       DPRINTF("received '%s' message (uuid = %u)\n",
+               tapdisk_message_name(message->type), message->cookie);
+
+       return 0;
+}
+
+/*
+ * Read one message from ipc->rfd (2 second timeout) and act on it.
+ *
+ * The message cookie is used as the vbd uuid.  Every message type other
+ * than TAPDISK_MESSAGE_PID must refer to a vbd that
+ * tapdisk_server_get_vbd() can find.  PID and OPEN are answered here on
+ * ipc->wfd; PAUSE, RESUME and CLOSE return 0 immediately and have their
+ * responses written asynchronously; EXIT returns 0 without doing
+ * anything else.
+ *
+ * Returns the error from tapdisk_ipc_read_message() when no message
+ * could be read, the result of writing the response for PID and OPEN,
+ * and 0 for PAUSE, RESUME, CLOSE and EXIT.  On every other failure a
+ * TAPDISK_MESSAGE_ERROR reply is sent and -err is returned.
+ *
+ * NOTE(review): on the fail path err is -EINVAL, -1 (asprintf) or the
+ * value returned by tapdisk_vbd_open()/tapdisk_vbd_get_image_info(), so
+ * "return -err" gives a positive number for the cases set in this
+ * function, unlike the read-failure return above -- confirm what the
+ * callers expect before relying on the sign.
+ */
+int
+tapdisk_ipc_read(td_ipc_t *ipc)
+{
+       int err;
+       td_vbd_t *vbd;
+       td_uuid_t uuid;
+       tapdisk_message_t message;
+
+       err = tapdisk_ipc_read_message(ipc->rfd, &message, 2);
+       if (err) {
+               tapdisk_server_check_state();
+               return err;
+       }
+
+       uuid = message.cookie;
+       vbd  = tapdisk_server_get_vbd(uuid);
+
+       /* PID is the only request allowed before the vbd exists; it is
+        * the one that calls tapdisk_vbd_initialize() below */
+       if (!vbd && message.type != TAPDISK_MESSAGE_PID) {
+               EPRINTF("received message for non-existing vbd: %u\n", uuid);
+               err = -EINVAL;
+               goto fail;
+       }
+
+       switch (message.type) {
+       case TAPDISK_MESSAGE_PID:
+               err = tapdisk_vbd_initialize(ipc->rfd, ipc->wfd, uuid);
+
+               /* the request buffer is reused for the reply */
+               memset(&message, 0, sizeof(tapdisk_message_t));
+               message.cookie = uuid;
+
+               if (!err) {
+                       message.type          = TAPDISK_MESSAGE_PID_RSP;
+                       message.u.tapdisk_pid = getpid();
+               } else
+                       message.type          = TAPDISK_MESSAGE_ERROR;
+
+               return tapdisk_ipc_write_message(ipc->wfd, &message, 0);
+
+       case TAPDISK_MESSAGE_OPEN:
+       {
+               image_t image;
+               char *devname;
+               td_flag_t flags;
+
+               flags = 0;
+
+               /* translate wire-level message flags into TD_OPEN_* flags */
+               if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY)
+                       flags |= TD_OPEN_RDONLY;
+               if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED)
+                       flags |= TD_OPEN_SHAREABLE;
+               if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE)
+                       flags |= TD_OPEN_ADD_CACHE;
+               if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_VHD_INDEX)
+                       flags |= TD_OPEN_VHD_INDEX;
+               if (message.u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY)
+                       flags |= TD_OPEN_LOG_DIRTY;
+
+               err   = asprintf(&devname, "%s/%s%d",
+                                BLKTAP_DEV_DIR, BLKTAP_DEV_NAME,
+                                message.u.params.devnum);
+               if (err == -1)
+                       goto fail;
+
+               err   = tapdisk_vbd_open(vbd,
+                                        message.u.params.path,
+                                        message.drivertype,
+                                        message.u.params.storage,
+                                        devname, flags);
+               /* devname is released here whether or not the open worked */
+               free(devname);
+               if (err)
+                       goto fail;
+
+               err   = tapdisk_vbd_get_image_info(vbd, &image);
+               if (err)
+                       goto fail;
+
+               memset(&message, 0, sizeof(tapdisk_message_t));
+               message.cookie              = uuid;
+               message.u.image.sectors     = image.size;
+               message.u.image.sector_size = image.secsize;
+               message.u.image.info        = image.info;
+               message.type                = TAPDISK_MESSAGE_OPEN_RSP;
+
+               return tapdisk_ipc_write_message(ipc->wfd, &message, 0);
+       }
+
+       case TAPDISK_MESSAGE_PAUSE:
+               tapdisk_vbd_pause(vbd);
+               return 0; /* response written asynchronously */
+
+       case TAPDISK_MESSAGE_RESUME:
+               tapdisk_vbd_resume(vbd,
+                                  message.u.params.path,
+                                  message.drivertype);
+               return 0; /* response written asynchronously */
+
+       case TAPDISK_MESSAGE_CLOSE:
+               tapdisk_vbd_close(vbd);
+               return 0; /* response written asynchronously */
+
+       case TAPDISK_MESSAGE_EXIT:
+               return 0;
+       }
+
+       /* only reached for message types not handled by the switch */
+       err = -EINVAL;
+       EPRINTF("received unrecognized message %s, uuid = %d\n",
+               tapdisk_message_name(message.type), uuid);
+
+fail:
+       memset(&message, 0, sizeof(tapdisk_message_t));
+       message.cookie = uuid;
+       message.type   = TAPDISK_MESSAGE_ERROR;
+       tapdisk_ipc_write_message(ipc->wfd, &message, 2);
+       tapdisk_server_check_state();
+
+       return -err;
+}
diff --git a/tools/blktap2/drivers/tapdisk-ipc.h b/tools/blktap2/drivers/tapdisk-ipc.h
new file mode 100644 (file)
index 0000000..25eb48c
--- /dev/null
@@ -0,0 +1,43 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_IPC_H_
+#define _TAPDISK_IPC_H_
+
+#include "tapdisk-message.h"
+
+/*
+ * One IPC channel: a pair of descriptors plus the uuid that is placed
+ * in the cookie of outgoing messages.
+ */
+typedef struct td_ipc_handle {
+       int                         rfd;   /* descriptor messages are read from */
+       int                         wfd;   /* descriptor messages are written to; -1 if none */
+       td_uuid_t                   uuid;  /* copied into message.cookie when sending */
+} td_ipc_t;
+
+/* Read one message from ipc->rfd and dispatch on its type. */
+int tapdisk_ipc_read(td_ipc_t *ipc);
+/* Send a message of the given type with no payload on ipc->wfd. */
+int tapdisk_ipc_write(td_ipc_t *ipc, int type);
+/* Send a TAPDISK_MESSAGE_RUNTIME_ERROR message carrying the given text. */
+int tapdisk_ipc_write_error(td_ipc_t *ipc, const char *message);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-log.c b/tools/blktap2/drivers/tapdisk-log.c
new file mode 100644 (file)
index 0000000..980affa
--- /dev/null
@@ -0,0 +1,255 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "tapdisk-log.h"
+
+/* upper bound, in bytes, on one formatted log or error entry */
+#define MAX_ENTRY_LEN      512
+/* number of distinct error records kept; further ones are only counted */
+#define MAX_ERROR_MESSAGES 16
+
+/* One recorded error, identified by the (err, func) pair. */
+struct error {
+       int            cnt;   /* how many times this (err, func) pair was reported */
+       int            err;   /* error number, stored as a non-negative value */
+       char          *func;  /* reporting function name; compared by pointer */
+       char           msg[MAX_ENTRY_LEN]; /* text of the first report */
+};
+
+/* Fixed-size table of recorded errors. */
+struct ehandle {
+       int            cnt;     /* number of slots of errors[] in use */
+       int            dropped; /* reports discarded because the table was full */
+       struct error   errors[MAX_ERROR_MESSAGES];
+};
+
+/* State of the in-memory log buffer. */
+struct tlog {
+       char          *p;      /* position in buf where the next entry is written */
+       int            size;   /* size of buf in bytes (rounded up to 512) */
+       uint64_t       cnt;    /* entries written so far; printed in each entry */
+       char          *buf;    /* 512-byte aligned buffer; NULL while the log is closed */
+       int            level;  /* entries with a level above this are ignored */
+       char          *file;   /* name of the file the buffer is flushed to */
+       int            append; /* non-zero: flush when full and append to file */
+};
+
+/* file-scope error table and log state shared by the functions below */
+static struct ehandle tapdisk_err;
+static struct tlog tapdisk_log;
+
+/*
+ * Set up the in-memory log.
+ *
+ * @file:   base name of the log file; ".<pid>" is appended to it
+ * @bytes:  requested buffer size, rounded up to a multiple of 512 and
+ *          never less than one entry (MAX_ENTRY_LEN)
+ * @level:  highest level that will be recorded
+ * @append: non-zero to flush the buffer to the file whenever it fills
+ *          up and to append on flush instead of truncating
+ *
+ * Nothing is reported on failure: the log is simply left closed
+ * (tapdisk_log.buf == NULL), which turns the other log functions into
+ * no-ops.
+ */
+void
+open_tlog(char *file, size_t bytes, int level, int append)
+{
+       /* __tlog_write() needs room for a full entry even right after
+        * wrapping; a zero-sized request would otherwise give a buffer
+        * that the first entry overruns */
+       if (bytes < MAX_ENTRY_LEN)
+               bytes = MAX_ENTRY_LEN;
+
+       tapdisk_log.size = ((bytes + 511) & (~511));
+
+       if (asprintf(&tapdisk_log.file, "%s.%d", file, getpid()) == -1) {
+               /* the pointer is undefined after a failed asprintf() */
+               tapdisk_log.file = NULL;
+               return;
+       }
+
+       if (posix_memalign((void **)&tapdisk_log.buf, 512, tapdisk_log.size)) {
+               free(tapdisk_log.file);
+               tapdisk_log.file = NULL; /* don't keep a dangling pointer */
+               tapdisk_log.buf  = NULL;
+               return;
+       }
+
+       memset(tapdisk_log.buf, 0, tapdisk_log.size);
+
+       tapdisk_log.p      = tapdisk_log.buf;
+       tapdisk_log.level  = level;
+       tapdisk_log.append = append;
+}
+
+/*
+ * Tear down the in-memory log.  In append mode the buffer is flushed to
+ * the log file first; then the buffer and file name are released and
+ * the log state is zeroed.  Does nothing if the log is not open.
+ */
+void
+close_tlog(void)
+{
+       if (tapdisk_log.buf) {
+               if (tapdisk_log.append)
+                       tlog_flush();
+
+               free(tapdisk_log.buf);
+               free(tapdisk_log.file);
+
+               memset(&tapdisk_log, 0, sizeof(tapdisk_log));
+       }
+}
+
+/*
+ * Append one formatted entry to the in-memory log.
+ *
+ * Normally reached through the tlog_write() macro, which supplies
+ * __func__ as @func.  The call is ignored while the log is closed or
+ * when @level is above the configured level.  Each entry is prefixed
+ * with a running sequence number, a timestamp and @func, and is
+ * truncated so that it fits in MAX_ENTRY_LEN bytes including the
+ * terminating NUL.
+ *
+ * When fewer than MAX_ENTRY_LEN bytes are left, writing restarts at the
+ * beginning of the buffer; in append mode the buffer is flushed to the
+ * log file first.
+ */
+void
+__tlog_write(int level, const char *func, const char *fmt, ...)
+{
+       char *buf;
+       va_list ap;
+       struct timeval t;
+       int ret, len, avail;
+
+       if (!tapdisk_log.buf)
+               return;
+
+       if (level > tapdisk_log.level)
+               return;
+
+       avail = tapdisk_log.size - (tapdisk_log.p - tapdisk_log.buf);
+       if (avail < MAX_ENTRY_LEN) {
+               if (tapdisk_log.append)
+                       tlog_flush();
+               tapdisk_log.p = tapdisk_log.buf;
+       }
+
+       buf = tapdisk_log.p;
+       gettimeofday(&t, NULL);
+       len = snprintf(buf, MAX_ENTRY_LEN - 1, "%08"PRIu64":%010ld.%06ld:"
+                      "%s ", tapdisk_log.cnt, t.tv_sec, t.tv_usec, func);
+
+       /* snprintf() returns the length it would have needed, or a
+        * negative value on error.  Clamp it to what is really in the
+        * buffer so that the size passed to vsnprintf() below can never
+        * become negative (and thus huge once converted to size_t). */
+       if (len < 0)
+               len = 0;
+       else if (len > MAX_ENTRY_LEN - 2)
+               len = MAX_ENTRY_LEN - 2;
+
+       va_start(ap, fmt);
+       ret = vsnprintf(buf + len, MAX_ENTRY_LEN - (len + 1), fmt, ap);
+       va_end(ap);
+
+       /* a formatting error contributes no text */
+       if (ret < 0)
+               ret = 0;
+
+       len = (ret < MAX_ENTRY_LEN - (len + 1) ?
+              len + ret : MAX_ENTRY_LEN - 1);
+       buf[len] = '\0';
+
+       tapdisk_log.cnt++;
+       tapdisk_log.p += len;
+}
+
+/*
+ * Record an error in the fixed-size error table.
+ *
+ * Normally reached through the tlog_error() macro, which supplies
+ * __func__ as @func.  @err is stored as a non-negative value.  A report
+ * whose (err, func) pair is already in the table only bumps that
+ * record's counter; note that @func is compared by pointer, not by
+ * string contents.  Once MAX_ERROR_MESSAGES distinct records exist,
+ * further new reports are only counted as dropped.
+ *
+ * The message of a new record is prefixed with a timestamp and @func
+ * and truncated to fit MAX_ENTRY_LEN bytes including the NUL.
+ */
+void
+__tlog_error(int err, const char *func, const char *fmt, ...)
+{
+       va_list ap;
+       int i, len, ret;
+       struct error *e;
+       struct timeval t;
+
+       err = (err > 0 ? err : -err);
+
+       for (i = 0; i < tapdisk_err.cnt; i++) {
+               e = &tapdisk_err.errors[i];
+               if (e->err == err && e->func == func) {
+                       e->cnt++;
+                       return;
+               }
+       }
+
+       if (tapdisk_err.cnt >= MAX_ERROR_MESSAGES) {
+               tapdisk_err.dropped++;
+               return;
+       }
+
+       gettimeofday(&t, NULL);
+       e = &tapdisk_err.errors[tapdisk_err.cnt];
+
+       len = snprintf(e->msg, MAX_ENTRY_LEN - 1, "%010ld.%06ld:%s ",
+                      t.tv_sec, t.tv_usec, func);
+
+       /* snprintf() returns the length it would have needed, or a
+        * negative value on error.  Clamp it to what is really in the
+        * buffer so that the size passed to vsnprintf() below can never
+        * become negative (and thus huge once converted to size_t). */
+       if (len < 0)
+               len = 0;
+       else if (len > MAX_ENTRY_LEN - 2)
+               len = MAX_ENTRY_LEN - 2;
+
+       va_start(ap, fmt);
+       ret = vsnprintf(e->msg + len, MAX_ENTRY_LEN - (len + 1), fmt, ap);
+       va_end(ap);
+
+       /* a formatting error contributes no text */
+       if (ret < 0)
+               ret = 0;
+
+       len = (ret < MAX_ENTRY_LEN - (len + 1) ?
+              len + ret : MAX_ENTRY_LEN - 1);
+       e->msg[len] = '\0';
+
+       e->cnt++;
+       e->err  = err;
+       e->func = (char *)func;
+       tapdisk_err.cnt++;
+}
+
+/*
+ * Report every recorded error, and the number of dropped reports if
+ * any, to syslog at LOG_INFO.
+ */
+void
+tlog_print_errors(void)
+{
+       struct error *e   = tapdisk_err.errors;
+       struct error *end = tapdisk_err.errors + tapdisk_err.cnt;
+
+       for (; e < end; e++)
+               syslog(LOG_INFO, "TAPDISK ERROR: errno %d at %s (cnt = %d): "
+                      "%s\n", e->err, e->func, e->cnt, e->msg);
+
+       if (!tapdisk_err.dropped)
+               return;
+
+       syslog(LOG_INFO, "TAPDISK ERROR: %d other error messages "
+              "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Copy every recorded error, and the number of dropped reports if any,
+ * into the in-memory log at TLOG_WARN level.
+ */
+void
+tlog_flush_errors(void)
+{
+       int idx = 0;
+
+       while (idx < tapdisk_err.cnt) {
+               struct error *e = &tapdisk_err.errors[idx];
+
+               tlog_write(TLOG_WARN, "TAPDISK ERROR: errno %d at %s "
+                          "(cnt = %d): %s\n", e->err, e->func, e->cnt,
+                          e->msg);
+               idx++;
+       }
+
+       if (!tapdisk_err.dropped)
+               return;
+
+       tlog_write(TLOG_WARN, "TAPDISK ERROR: %d other error messages "
+                  "dropped\n", tapdisk_err.dropped);
+}
+
+/*
+ * Write the buffered log data to @fd, padded with newlines to a
+ * multiple of 512 bytes (the file is opened O_DIRECT -- presumably the
+ * reason for the fixed granularity), and rewind the buffer.
+ */
+static void
+tlog_write_pending(int fd)
+{
+       size_t size, wsize;
+
+       size  = tapdisk_log.p - tapdisk_log.buf;
+       wsize = ((size + 511) & (~511));
+
+       memset(tapdisk_log.buf + size, '\n', wsize - size);
+
+       /* best effort: there is nowhere to report a failed write, and
+        * the buffer is recycled either way */
+       write(fd, tapdisk_log.buf, wsize);
+
+       tapdisk_log.p = tapdisk_log.buf;
+}
+
+/*
+ * Write the in-memory log, followed by the recorded errors, to the log
+ * file.  The file is appended to in append mode and truncated
+ * otherwise.  Does nothing if the log is not open or the file cannot be
+ * opened.
+ */
+void
+tlog_flush(void)
+{
+       static int flushing;
+       int fd, flags;
+
+       /* tlog_flush_errors() logs through __tlog_write(), which calls
+        * tlog_flush() again when the buffer is nearly full in append
+        * mode.  Refuse to nest so this cannot recurse without bound. */
+       if (!tapdisk_log.buf || flushing)
+               return;
+
+       flags = O_CREAT | O_WRONLY | O_DIRECT | O_NONBLOCK;
+       if (!tapdisk_log.append)
+               flags |= O_TRUNC;
+
+       fd = open(tapdisk_log.file, flags, 0644);
+       if (fd == -1)
+               return;
+
+       if (tapdisk_log.append)
+               if (lseek64(fd, 0, SEEK_END) == (loff_t)-1)
+                       goto out;
+
+       flushing = 1;
+
+       /* get the pending entries out first, so that dumping the errors
+        * into the buffer cannot wrap over data not yet written */
+       tlog_write_pending(fd);
+
+       tlog_flush_errors();
+       tlog_write_pending(fd);
+
+       flushing = 0;
+
+out:
+       close(fd);
+}
diff --git a/tools/blktap2/drivers/tapdisk-log.h b/tools/blktap2/drivers/tapdisk-log.h
new file mode 100644 (file)
index 0000000..ae2a408
--- /dev/null
@@ -0,0 +1,51 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_LOG_H_
+#define _TAPDISK_LOG_H_
+
+/* log levels: an entry is recorded when its level is <= the level
+ * given to open_tlog() */
+#define TLOG_WARN       0
+#define TLOG_INFO       1
+#define TLOG_DBG        2
+
+/* set up / tear down the in-memory log buffer */
+void open_tlog(char *file, size_t bytes, int level, int append);
+void close_tlog(void);
+/* write the buffered log to the log file */
+void tlog_flush(void);
+/* send the recorded errors to syslog */
+void tlog_print_errors(void);
+
+/* printf-style back ends for the macros below; the format string is
+ * argument 3 and the variadic arguments start at 4 */
+void __tlog_write(int level, const char *func, const char *fmt, ...)
+  __attribute__((format(printf, 3, 4)));
+void __tlog_error(int err, const char *func, const char *fmt, ...)
+  __attribute__((format(printf, 3, 4)));
+
+/* log a message at _level, tagged with the calling function's name */
+#define tlog_write(_level, _f, _a...)                  \
+       __tlog_write(_level, __func__, _f, ##_a)
+
+/* record error _err, tagged with the calling function's name */
+#define tlog_error(_err, _f, _a...)                    \
+       __tlog_error(_err, __func__, _f, ##_a)
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-queue.c b/tools/blktap2/drivers/tapdisk-queue.c
new file mode 100644 (file)
index 0000000..5461d41
--- /dev/null
@@ -0,0 +1,441 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libaio.h>
+
+#include "tapdisk.h"
+#include "tapdisk-log.h"
+#include "tapdisk-queue.h"
+#include "tapdisk-filter.h"
+#include "atomicio.h"
+
+/* local shorthands for the tapdisk-log macros */
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * We used a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD 1
+
+/*
+ * Append @tiocb's iocb to the array of queued iocbs.  If something is
+ * already queued, the tiocb owning the last queued iocb is linked to
+ * @tiocb through its next pointer.
+ */
+static inline void
+queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+       if (queue->queued) {
+               struct iocb *last = queue->iocbs[queue->queued - 1];
+
+               ((struct tiocb *)last->data)->next = tiocb;
+       }
+
+       queue->iocbs[queue->queued] = &tiocb->iocb;
+       queue->queued++;
+}
+
+/*
+ * Return 1 if the deferred list holds at least one tiocb, 0 otherwise.
+ */
+static inline int
+deferred_tiocbs(struct tqueue *queue)
+{
+       return queue->deferred.head ? 1 : 0;
+}
+
+/*
+ * Append @tiocb to the tail of the deferred list and update the
+ * deferral counters.
+ */
+static inline void
+defer_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+       struct tlist *pending = &queue->deferred;
+
+       if (pending->head)
+               pending->tail->next = tiocb;
+       else
+               pending->head = tiocb;
+       pending->tail = tiocb;
+
+       queue->tiocbs_deferred++;
+       queue->deferrals++;
+}
+
+/*
+ * Move the tiocb at the head of the deferred list, if there is one,
+ * onto the queue.
+ */
+static inline void
+queue_deferred_tiocb(struct tqueue *queue)
+{
+       struct tlist *pending = &queue->deferred;
+       struct tiocb *first   = pending->head;
+
+       if (!first)
+               return;
+
+       /* unlink the head; an emptied list has no tail either */
+       pending->head = first->next;
+       if (!pending->head)
+               pending->tail = NULL;
+
+       queue_tiocb(queue, first);
+       queue->tiocbs_deferred--;
+}
+
+/*
+ * Move deferred tiocbs onto the queue until the queue is full or the
+ * deferred list is empty.
+ */
+static inline void
+queue_deferred_tiocbs(struct tqueue *queue)
+{
+       for (;;) {
+               if (tapdisk_queue_full(queue) || !deferred_tiocbs(queue))
+                       break;
+               queue_deferred_tiocb(queue);
+       }
+}
+
+/*
+ * Invoke @tiocb's completion callback with an error code derived from
+ * @res: 0 if @res equals the number of bytes requested, @res itself if
+ * it is negative when viewed as an int, and -EIO otherwise.
+ *
+ * td_complete may queue more tiocbs
+ */
+static void
+complete_tiocb(struct tqueue *queue, struct tiocb *tiocb, unsigned long res)
+{
+       int status = -EIO;
+
+       if (res == tiocb->iocb.u.c.nbytes)
+               status = 0;
+       else if ((int)res < 0)
+               status = (int)res;
+
+       tiocb->cb(tiocb->arg, tiocb, status);
+}
+
+/*
+ * Complete every queued tiocb with error @err and empty the queue.
+ *
+ * Returns the number of iocbs that were queued on entry (0 if the queue
+ * was already empty).
+ */
+static int
+cancel_tiocbs(struct tqueue *queue, int err)
+{
+       int queued;
+       struct tiocb *tiocb, *next;
+
+       if (!queue->queued)
+               return 0;
+
+       /* 
+        * td_complete may queue more tiocbs, which
+        * will overwrite the contents of queue->iocbs.
+        * use a private linked list to keep track
+        * of the tiocbs we're cancelling. 
+        */
+       tiocb  = (struct tiocb *)queue->iocbs[0]->data;
+       queued = queue->queued;
+       queue->queued = 0;
+
+       for (; tiocb != NULL; tiocb = next) {
+               /* fetch the link before completing: the callback may
+                * requeue this tiocb, after which its next pointer
+                * describes the new queue, not the list being cancelled */
+               next = tiocb->next;
+               complete_tiocb(queue, tiocb, err);
+       }
+
+       return queued;
+}
+
+/*
+ * Handle a submission in which only @succeeded of @total iocbs were
+ * accepted: record the error, rebuild the queue from the iocbs that
+ * were not submitted and complete all of them with @err.
+ *
+ * Returns the value of cancel_tiocbs(), i.e. the number of iocbs that
+ * were failed.
+ */
+static int
+fail_tiocbs(struct tqueue *queue, int succeeded, int total, int err)
+{
+       ERR(err, "io_submit error: %d of %d failed",
+           total - succeeded, total);
+
+       /* take any non-submitted, merged iocbs 
+        * off of the queue, split them, and fail them */
+       queue->queued = io_expand_iocbs(&queue->opioctx,
+                                       queue->iocbs, succeeded, total);
+
+       return cancel_tiocbs(queue, err);
+}
+
+/*
+ * Carry out the transfer described by @iocb synchronously: seek to the
+ * iocb's offset, then move the requested number of bytes with vwrite
+ * (for IO_CMD_PWRITE) or read (for anything else) through atomicio().
+ *
+ * Returns the byte count on success, or -errno if the seek fails or
+ * atomicio() transfers a different number of bytes.
+ */
+static inline ssize_t
+iocb_rw(struct iocb *iocb)
+{
+       ssize_t (*xfer)(int, void *, size_t);
+       int desc         = iocb->aio_fildes;
+       char *data       = iocb->u.c.buf;
+       long long where  = iocb->u.c.offset;
+       size_t count     = iocb->u.c.nbytes;
+
+       if (iocb->aio_lio_opcode == IO_CMD_PWRITE)
+               xfer = vwrite;
+       else
+               xfer = read;
+
+       if (lseek64(desc, where, SEEK_SET) == (off64_t)-1)
+               return -errno;
+
+       if (atomicio(xfer, desc, data, count) != count)
+               return -errno;
+
+       return count;
+}
+
+/*
+ * Process everything on the queue synchronously.
+ *
+ * The queued iocbs are passed through the filter and merged, each
+ * merged iocb is executed with iocb_rw() and its result stored in the
+ * matching slot of queue->aio_events, the events are split again and
+ * filtered, and finally every resulting tiocb is completed.  Deferred
+ * tiocbs are then moved onto the queue.
+ *
+ * Returns the number of events after splitting, or 0 if nothing was
+ * queued.
+ */
+static int
+io_synchronous_rw(struct tqueue *queue)
+{
+       int i, merged, split;
+       struct iocb *iocb;
+       struct tiocb *tiocb;
+       struct io_event *ep;
+
+       if (!queue->queued)
+               return 0;
+
+       tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+       merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+
+       /* the queue is marked empty before any I/O or callback runs */
+       queue->queued = 0;
+
+       for (i = 0; i < merged; i++) {
+               ep      = queue->aio_events + i;
+               iocb    = queue->iocbs[i];
+               ep->obj = iocb;
+               ep->res = iocb_rw(iocb);
+       }
+
+       split = io_split(&queue->opioctx, queue->aio_events, merged);
+       tapdisk_filter_events(queue->filter, queue->aio_events, split);
+
+       /* walk the events only; completion callbacks may queue new tiocbs */
+       for (i = split, ep = queue->aio_events; i-- > 0; ep++) {
+               iocb  = ep->obj;
+               tiocb = (struct tiocb *)iocb->data;
+               complete_tiocb(queue, tiocb, ep->res);
+       }
+
+       queue_deferred_tiocbs(queue);
+
+       return split;
+}
+
+/*
+ * Initialise @queue for up to @size outstanding iocbs.
+ *
+ * @sync:   non-zero for synchronous operation, in which case poll_fd is
+ *          the read end of a dummy pipe; zero to set up an AIO context,
+ *          in which case poll_fd is the value returned by io_setup()
+ * @filter: stored in the queue as-is
+ *
+ * Returns 0 on success.  On failure returns -errno from pipe(), the
+ * negative value returned by io_setup(), -ENOMEM if an allocation
+ * fails, or the error from opio_init(); in the last two cases the queue
+ * is released with tapdisk_free_queue() first.
+ */
+int
+tapdisk_init_queue(struct tqueue *queue, int size,
+                  int sync, struct tfilter *filter)
+{
+       int i, err;
+
+       memset(queue, 0, sizeof(struct tqueue));
+
+       queue->size   = size;
+       queue->sync   = sync;
+       queue->filter = filter;
+
+       if (sync) {
+               /* set up a pipe so we can return
+                * a poll fd that won't fire. */
+               if (pipe(queue->dummy_pipe))
+                       return -errno;
+               queue->poll_fd = queue->dummy_pipe[0];
+       } else {
+               /* passing REQUEST_ASYNC_FD as the context asks the
+                * (patched) kernel to return a pollable fd */
+               queue->aio_ctx = (io_context_t)REQUEST_ASYNC_FD;
+               queue->poll_fd = io_setup(size, &queue->aio_ctx);
+
+               if (queue->poll_fd < 0) {
+                       if (queue->poll_fd == -EAGAIN)
+                               DPRINTF("Couldn't setup AIO context.  If you "
+                                       "are trying to concurrently use a "
+                                       "large number of blktap-based disks, "
+                                       "you may need to increase the "
+                                       "system-wide aio request limit. "
+                                       "(e.g. 'echo 1048576 > /proc/sys/fs/"
+                                       "aio-max-nr')\n");
+                       else
+                               DPRINTF("Couldn't get fd for AIO poll "
+                                       "support.  This is probably because "
+                                       "your kernel does not have the "
+                                       "aio-poll patch applied.\n");
+                       return queue->poll_fd;
+               }
+       }
+
+       /* err is preset so that either allocation failing reports ENOMEM */
+       err               = -ENOMEM;
+       queue->iocbs      = calloc(size, sizeof(struct iocb *));
+       queue->aio_events = calloc(size, sizeof(struct io_event));
+       if (!queue->iocbs || !queue->aio_events)
+               goto fail;
+
+       err = opio_init(&queue->opioctx, size);
+       if (err)
+               goto fail;
+
+       return 0;
+
+ fail:
+       tapdisk_free_queue(queue);
+       return err;
+}
+
+/*
+ * Release everything tapdisk_init_queue() acquired: the AIO context
+ * (async queues) or the dummy pipe (sync queues), the iocb and event
+ * arrays, and the opio context.
+ */
+void
+tapdisk_free_queue(struct tqueue *queue)
+{
+       if (!queue->sync)
+               io_destroy(queue->aio_ctx);
+       else {
+               close(queue->dummy_pipe[0]);
+               close(queue->dummy_pipe[1]);
+       }
+
+       free(queue->iocbs);
+       free(queue->aio_events);
+       opio_free(&queue->opioctx);
+}
+
+/*
+ * Log the queue counters and, if any tiocbs are deferred, one line per
+ * deferred request (direction, byte count, offset).
+ */
+void
+tapdisk_debug_queue(struct tqueue *queue)
+{
+       struct tiocb *cur = queue->deferred.head;
+
+       WARN("TAPDISK QUEUE:\n");
+       WARN("size: %d, sync: %d, queued: %d, iocbs_pending: %d, "
+            "tiocbs_pending: %d, tiocbs_deferred: %d, deferrals: %"PRIx64"\n",
+            queue->size, queue->sync, queue->queued, queue->iocbs_pending,
+            queue->tiocbs_pending, queue->tiocbs_deferred, queue->deferrals);
+
+       if (cur == NULL)
+               return;
+
+       WARN("deferred:\n");
+       while (cur != NULL) {
+               struct iocb *io = &cur->iocb;
+               const char *dir = "read";
+
+               if (io->aio_lio_opcode == IO_CMD_PWRITE)
+                       dir = "write";
+
+               WARN("%s of %lu bytes at %lld\n",
+                    dir, io->u.c.nbytes, io->u.c.offset);
+
+               cur = cur->next;
+       }
+}
+
+/*
+ * Prepare @tiocb for a later tapdisk_queue_tiocb().
+ *
+ * @fd, @buf, @size, @offset describe the transfer; a non-zero @rw
+ * prepares a write (io_prep_pwrite), zero a read (io_prep_pread).
+ * @cb and @arg are stored in the tiocb, and the embedded iocb's data
+ * pointer is set to the tiocb itself so the completion paths can
+ * recover it from an io_event.
+ */
+void
+tapdisk_prep_tiocb(struct tiocb *tiocb, int fd, int rw, char *buf, size_t size,
+                  long long offset, td_queue_callback_t cb, void *arg)
+{
+       struct iocb *iocb = &tiocb->iocb;
+
+       if (rw)
+               io_prep_pwrite(iocb, fd, buf, size, offset);
+       else
+               io_prep_pread(iocb, fd, buf, size, offset);
+
+       /* back-pointer read by the completion loops via iocb->data */
+       iocb->data  = tiocb;
+       tiocb->cb   = cb;
+       tiocb->arg  = arg;
+       tiocb->next = NULL;
+}
+
+/*
+ * Add @tiocb to @queue: defer it when the queue is already full,
+ * otherwise queue it for the next submission.
+ */
+void
+tapdisk_queue_tiocb(struct tqueue *queue, struct tiocb *tiocb)
+{
+       if (tapdisk_queue_full(queue)) {
+               defer_tiocb(queue, tiocb);
+               return;
+       }
+
+       queue_tiocb(queue, tiocb);
+}
+
+/*
+ * Submit everything currently queued.
+ *
+ * Sync queues are handed to io_synchronous_rw().  Otherwise the queued
+ * iocbs are filtered, merged and passed to io_submit(); a negative or
+ * short submission is converted into an error and given to
+ * fail_tiocbs().  Returns the number of iocbs accepted by io_submit()
+ * (0 if it failed outright).
+ *
+ * NB: fail_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_submit_tiocbs(struct tqueue *queue)
+{
+       int merged, submitted, err = 0;
+
+       if (!queue->queued)
+               return 0;
+
+       if (queue->sync)
+               return io_synchronous_rw(queue);
+
+       tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued);
+       merged    = io_merge(&queue->opioctx, queue->iocbs, queue->queued);
+       submitted = io_submit(queue->aio_ctx, merged, queue->iocbs);
+
+       DBG("queued: %d, merged: %d, submitted: %d\n",
+           queue->queued, merged, submitted);
+
+       if (submitted < 0) {
+               /* io_submit failed outright: nothing is in flight */
+               err = submitted;
+               submitted = 0;
+       } else if (submitted < merged)
+               /* partial submission: the remainder is failed with EIO */
+               err = -EIO;
+
+       /* count every queued tiocb as pending first; failed ones are
+        * subtracted again below */
+       queue->iocbs_pending  += submitted;
+       queue->tiocbs_pending += queue->queued;
+       queue->queued          = 0;
+
+       if (err)
+               /* presumably fail_tiocbs returns the number of tiocbs it
+                * failed -- confirm against its definition */
+               queue->tiocbs_pending -= 
+                       fail_tiocbs(queue, submitted, merged, err);
+
+       return submitted;
+}
+
+/*
+ * Keep calling tapdisk_submit_tiocbs() until the queue reports empty
+ * (a submission may itself queue further tiocbs).  Returns the total
+ * number submitted across all rounds.
+ */
+int
+tapdisk_submit_all_tiocbs(struct tqueue *queue)
+{
+       int total = 0;
+
+       for (;;) {
+               total += tapdisk_submit_tiocbs(queue);
+               if (tapdisk_queue_empty(queue))
+                       break;
+       }
+
+       return total;
+}
+
+/*
+ * Reap finished AIO requests without blocking.
+ *
+ * Polls io_getevents() (min_nr 0, no timeout) for up to queue->size
+ * events, splits merged requests back into per-tiocb events, updates
+ * the pending counters, passes each event to complete_tiocb() and then
+ * calls queue_deferred_tiocbs().  Returns the number of tiocb events
+ * processed.
+ *
+ * NB: this may cause additional tiocbs to be queued; call submit
+ * afterwards.
+ */
+int
+tapdisk_complete_tiocbs(struct tqueue *queue)
+{
+       int i, ret, split;
+       struct iocb *iocb;
+       struct tiocb *tiocb;
+       struct io_event *ep;
+
+       ret   = io_getevents(queue->aio_ctx, 0,
+                            queue->size, queue->aio_events, NULL);
+       if (ret < 0) {
+               /* io_getevents reports failure as a negative value; treat
+                * it as "no events" so that it is neither fed to io_split
+                * as a count nor subtracted from iocbs_pending (which
+                * would inflate the counter). */
+               DBG("io_getevents failed: %d\n", ret);
+               ret = 0;
+       }
+
+       split = io_split(&queue->opioctx, queue->aio_events, ret);
+       tapdisk_filter_events(queue->filter, queue->aio_events, split);
+
+       DBG("events: %d, tiocbs: %d\n", ret, split);
+
+       queue->iocbs_pending  -= ret;
+       queue->tiocbs_pending -= split;
+
+       for (i = split, ep = queue->aio_events; i-- > 0; ep++) {
+               iocb  = ep->obj;
+               /* iocb->data was pointed at the owning tiocb when prepared */
+               tiocb = (struct tiocb *)iocb->data;
+               complete_tiocb(queue, tiocb, ep->res);
+       }
+
+       queue_deferred_tiocbs(queue);
+
+       return split;
+}
+
+/*
+ * Cancel tiocbs on @queue by delegating to cancel_tiocbs() with -EIO
+ * as the error code, and return whatever it returns.
+ *
+ * NB: cancel_tiocbs may queue more tiocbs
+ */
+int
+tapdisk_cancel_tiocbs(struct tqueue *queue)
+{
+       return cancel_tiocbs(queue, -EIO);
+}
+
+/*
+ * Repeat tapdisk_cancel_tiocbs() until the queue reports empty (a
+ * cancellation may itself queue further tiocbs).  Returns the sum of
+ * the per-round return values.
+ */
+int
+tapdisk_cancel_all_tiocbs(struct tqueue *queue)
+{
+       int total = 0;
+
+       for (;;) {
+               total += tapdisk_cancel_tiocbs(queue);
+               if (tapdisk_queue_empty(queue))
+                       break;
+       }
+
+       return total;
+}
diff --git a/tools/blktap2/drivers/tapdisk-queue.h b/tools/blktap2/drivers/tapdisk-queue.h
new file mode 100644 (file)
index 0000000..40ff886
--- /dev/null
@@ -0,0 +1,113 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef TAPDISK_QUEUE_H
+#define TAPDISK_QUEUE_H
+
+#include <libaio.h>
+
+#include "io-optimize.h"
+
+struct tiocb;
+struct tfilter;
+
+typedef void (*td_queue_callback_t)(void *arg, struct tiocb *, int err);
+
+
+/*
+ * A tapdisk I/O control block: a libaio iocb plus the completion
+ * callback and its argument.
+ */
+struct tiocb {
+       td_queue_callback_t   cb;    /* completion callback */
+       void                 *arg;   /* opaque argument stored with cb */
+
+       struct iocb           iocb;  /* underlying libaio request */
+       struct tiocb         *next;  /* singly-linked list link */
+};
+
+/* singly-linked list of tiocbs, chained through tiocb->next */
+struct tlist {
+       struct tiocb         *head;
+       struct tiocb         *tail;
+};
+
+/* state of one tapdisk request queue */
+struct tqueue {
+       int                   size;   /* capacity passed to tapdisk_init_queue */
+       int                   sync;   /* non-zero: synchronous I/O, no AIO */
+
+       int                   poll_fd;        /* fd callers poll for completions */
+       io_context_t          aio_ctx;        /* libaio context (async queues) */
+       struct opioctx        opioctx;        /* request merge/split context */
+       int                   dummy_pipe[2];  /* sync queues: source of a
+                                              * poll_fd that never fires */
+
+       int                   queued;         /* entries in iocbs[] awaiting submit */
+       struct iocb         **iocbs;
+       struct io_event      *aio_events;
+
+       /* number of iocbs pending in the aio layer */
+       int                   iocbs_pending;
+
+       /* number of tiocbs pending in the queue --
+        * this is likely to be larger than iocbs_pending
+        * due to request coalescing */
+       int                   tiocbs_pending;
+
+       /* tiocbs may be deferred if the queue is full.
+        * tapdisk_complete_tiocbs will ensure deferred
+        * tiocbs are queued as slots become available. */
+       struct tlist          deferred;
+       int                   tiocbs_deferred;
+
+       /* optional tapdisk filter */
+       struct tfilter       *filter;
+
+       /* deferral counter, reported by tapdisk_debug_queue */
+       uint64_t              deferrals;
+};
+
+/*
+ * Interface for request producer (i.e., tapdisk)
+ * NB: the following functions may cause additional tiocbs to be queued:
+ *        - tapdisk_submit_tiocbs
+ *        - tapdisk_cancel_tiocbs
+ *        - tapdisk_complete_tiocbs
+ * The *_all_tiocbs variants will handle the first two cases;
+ * be sure to call submit after calling complete in the third case.
+ */
+/* number of tiocbs queued and awaiting submission */
+#define tapdisk_queue_count(q) ((q)->queued)
+/* true when nothing is awaiting submission */
+#define tapdisk_queue_empty(q) ((q)->queued == 0)
+/* true when pending plus queued tiocbs have reached the queue size */
+#define tapdisk_queue_full(q)  \
+       (((q)->tiocbs_pending + (q)->queued) >= (q)->size)
+int tapdisk_init_queue(struct tqueue *, int size, int sync, struct tfilter *);
+void tapdisk_free_queue(struct tqueue *);
+void tapdisk_debug_queue(struct tqueue *);
+void tapdisk_queue_tiocb(struct tqueue *, struct tiocb *);
+int tapdisk_submit_tiocbs(struct tqueue *);
+int tapdisk_submit_all_tiocbs(struct tqueue *);
+int tapdisk_complete_tiocbs(struct tqueue *);
+int tapdisk_cancel_tiocbs(struct tqueue *);
+int tapdisk_cancel_all_tiocbs(struct tqueue *);
+void tapdisk_prep_tiocb(struct tiocb *, int, int, char *, size_t,
+                       long long, td_queue_callback_t, void *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-ring.c b/tools/blktap2/drivers/tapdisk-ring.c
new file mode 100644 (file)
index 0000000..a5d40cb
--- /dev/null
@@ -0,0 +1,439 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+
+#include "tapdisk-ring.h"
+
+/*
+ * Create the listening UNIX-domain control socket at ring->ctlfd_path.
+ *
+ * Any existing file at that path is unlinked first.  On success the
+ * listening descriptor is stored in ring->ctlfd and 0 is returned; on
+ * failure a negative errno value is returned and the socket is closed.
+ */
+static int
+tapdisk_uring_create_ctlfd(td_uring_t *ring)
+{
+       int fd, err;
+       struct sockaddr_un saddr;
+
+       /* the path must fit in sun_path with room for the terminating
+        * NUL; bounding by sun_family (a few bytes) rejected almost
+        * every path */
+       if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >=
+           sizeof(saddr.sun_path))
+               return -ENAMETOOLONG;
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (fd == -1)
+               return -errno;
+
+       /* zero-fill so the copied path is NUL-terminated */
+       memset(&saddr, 0, sizeof(struct sockaddr_un));
+       saddr.sun_family = AF_UNIX;
+       memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path));
+
+       /* remove a stale socket; a missing file is not an error */
+       err = unlink(ring->ctlfd_path);
+       if (err == -1 && errno != ENOENT) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = bind(fd, (struct sockaddr *)&saddr, sizeof(struct sockaddr_un));
+       if (err == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = listen(fd, 1);
+       if (err == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       ring->ctlfd = fd;
+       return 0;
+
+fail:
+       close(fd);
+       return err;
+}
+
+/*
+ * Tear down the creator's side of the control socket: close the
+ * descriptor, unlink the socket file and free the path string.
+ */
+static void
+tapdisk_uring_destroy_ctlfd(td_uring_t *ring)
+{
+       /* 0 doubles as "no descriptor" (the ring is memset to 0 on create) */
+       if (ring->ctlfd) {
+               close(ring->ctlfd);
+               ring->ctlfd = 0;
+       }
+
+       if (ring->ctlfd_path) {
+               unlink(ring->ctlfd_path);
+               free(ring->ctlfd_path);
+               ring->ctlfd_path = NULL;
+       }
+}
+
+/*
+ * Connect to the UNIX-domain control socket at ring->ctlfd_path.
+ * Stores the connected descriptor in ring->ctlfd and returns 0, or
+ * returns a negative errno value (-ENAMETOOLONG if the path does not
+ * fit in sun_path).
+ */
+static int
+tapdisk_uring_connect_ctlfd(td_uring_t *ring)
+{
+       int sock, rc;
+       size_t pathlen;
+       struct sockaddr_un addr;
+
+       pathlen = strnlen(ring->ctlfd_path, sizeof(addr.sun_path));
+       if (pathlen >= sizeof(addr.sun_path))
+               return -ENAMETOOLONG;
+
+       sock = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (sock == -1)
+               return -errno;
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       memcpy(addr.sun_path, ring->ctlfd_path, pathlen);
+
+       rc = connect(sock, &addr, sizeof(addr));
+       if (rc == -1) {
+               rc = -errno;
+               close(sock);
+               return rc;
+       }
+
+       ring->ctlfd = sock;
+       return 0;
+}
+
+/*
+ * Drop the connector's side of the control socket: close the
+ * descriptor and free the path string.  The socket file itself is left
+ * in place.
+ */
+static void
+tapdisk_uring_disconnect_ctlfd(td_uring_t *ring)
+{
+       /* NOTE(review): unlike the destroy path, ctlfd is not reset to 0
+        * after close -- confirm no caller reuses the ring afterwards */
+       if (ring->ctlfd)
+               close(ring->ctlfd);
+       free(ring->ctlfd_path);
+       ring->ctlfd_path = NULL;
+}
+
+/*
+ * Create (or open) the POSIX shared-memory object ring->shmem_path,
+ * size it to ring->shmem_size and map it read/write into ring->shmem.
+ * Returns 0 or a negative errno value; the descriptor is always closed
+ * before returning.
+ */
+static int
+tapdisk_uring_create_shmem(td_uring_t *ring)
+{
+       int shm_fd;
+       int rc = 0;
+
+       shm_fd = shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750);
+       if (shm_fd == -1)
+               return -errno;
+
+       if (ftruncate(shm_fd, ring->shmem_size) == -1) {
+               rc = -errno;
+               goto done;
+       }
+
+       ring->shmem = mmap(NULL, ring->shmem_size,
+                          PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+       if (ring->shmem == MAP_FAILED) {
+               rc = -errno;
+               ring->shmem = NULL;
+       }
+
+done:
+       close(shm_fd);
+       return rc;
+}
+
+/*
+ * Tear down the creator's side of the shared memory: unmap the region,
+ * unlink the shared-memory object and free the path string.
+ */
+static void
+tapdisk_uring_destroy_shmem(td_uring_t *ring)
+{
+       if (ring->shmem) {
+               munmap(ring->shmem, ring->shmem_size);
+               ring->shmem = NULL;
+       }
+
+       if (ring->shmem_path) {
+               shm_unlink(ring->shmem_path);
+               free(ring->shmem_path);
+               ring->shmem_path = NULL;
+       }
+}
+
+/*
+ * Attach to an existing shared-memory ring at ring->shmem_path.
+ *
+ * The header is mapped and copied first so that its cookie and version
+ * can be validated and the ring/data/shmem sizes read; the whole region
+ * is then mapped read/write into ring->shmem.  Returns 0, -EINVAL for a
+ * bad cookie or version, or a negative errno value.  The descriptor is
+ * always closed before returning.
+ */
+static int
+tapdisk_uring_connect_shmem(td_uring_t *ring)
+{
+       int fd, err;
+       td_uring_header_t header, *p;
+
+       /* shm_open() takes a mode argument even without O_CREAT
+        * (it is ignored in that case) */
+       fd = shm_open(ring->shmem_path, O_RDWR, 0);
+       if (fd == -1)
+               return -errno;
+
+       /* map only the header to learn the size of the full region */
+       p = mmap(NULL, sizeof(td_uring_header_t),
+                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (p == MAP_FAILED) {
+               err = -errno;
+               goto out;
+       }
+
+       memcpy(&header, p, sizeof(td_uring_header_t));
+       munmap(p, sizeof(td_uring_header_t));
+
+       if (memcmp(header.cookie,
+                  TAPDISK_URING_COOKIE, sizeof(header.cookie))) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (header.version != TD_URING_CURRENT_VERSION) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       ring->ring_size  = header.ring_size;
+       ring->data_size  = header.data_size;
+       ring->shmem_size = header.shmem_size;
+
+       ring->shmem = mmap(NULL, ring->shmem_size,
+                          PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (ring->shmem == MAP_FAILED) {
+               /* capture errno before touching anything else */
+               err = -errno;
+               ring->shmem = NULL;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       close(fd);
+       return err;
+}
+
+/*
+ * Drop the connector's mapping of the shared memory and free the path
+ * string.  The shared-memory object itself is not unlinked.
+ */
+static void
+tapdisk_uring_disconnect_shmem(td_uring_t *ring)
+{
+       /* NOTE(review): ring->shmem is not reset to NULL after munmap,
+        * unlike the destroy path -- confirm the ring is not reused */
+       if (ring->shmem)
+               munmap(ring->shmem, ring->shmem_size);
+       free(ring->shmem_path);
+       ring->shmem_path = NULL;
+}
+
+/*
+ * Create a user-space ring rooted at @location.
+ *
+ * Two objects are created: the shared-memory region "<location>.shm"
+ * (header, then @ring_size bytes of ring, then @data_size bytes of
+ * data) and the listening control socket "<location>.cfd".  @ring is
+ * fully re-initialized.  Returns 0 on success; on failure everything
+ * acquired so far is released via tapdisk_uring_destroy() and a
+ * negative error code is returned.
+ */
+int
+tapdisk_uring_create(td_uring_t *ring, const char *location,
+                   uint32_t ring_size, uint32_t data_size)
+{
+       int err;
+
+       memset(ring, 0, sizeof(td_uring_t));
+
+       ring->ring_size  = ring_size;
+       ring->data_size  = data_size;
+       ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t);
+
+       /* asprintf() leaves the pointer undefined on failure and is not
+        * documented to set errno, so reset the pointer and report
+        * -ENOMEM rather than a possibly-zero -errno */
+       err = asprintf(&ring->shmem_path, "%s.shm", location);
+       if (err == -1) {
+               ring->shmem_path = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+       if (err == -1) {
+               ring->ctlfd_path = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = tapdisk_uring_create_ctlfd(ring);
+       if (err)
+               goto fail;
+
+       err = tapdisk_uring_create_shmem(ring);
+       if (err)
+               goto fail;
+
+       /* ring_area/data_area are pointers: do byte arithmetic on the
+        * mapping instead of assigning integers to them */
+       ring->ring_area = (char *)ring->shmem + sizeof(td_uring_header_t);
+       ring->data_area = (char *)ring->ring_area + ring->ring_size;
+
+       return 0;
+
+fail:
+       tapdisk_uring_destroy(ring);
+       return err;
+}
+
+/*
+ * Destroy a ring made by tapdisk_uring_create(): release the shared
+ * memory first, then the control socket.  Always returns 0.
+ */
+int
+tapdisk_uring_destroy(td_uring_t *ring)
+{
+       tapdisk_uring_destroy_shmem(ring);
+       tapdisk_uring_destroy_ctlfd(ring);
+       return 0;
+}
+
+/*
+ * Connect to a ring previously created at @location.
+ *
+ * Connects to the control socket "<location>.cfd", then attaches to
+ * the shared memory "<location>.shm" and derives the ring and data
+ * areas from the sizes read out of its header.  @ring is fully
+ * re-initialized.  Returns 0 on success; on failure everything
+ * acquired so far is released via tapdisk_uring_disconnect() and a
+ * negative error code is returned.
+ */
+int
+tapdisk_uring_connect(td_uring_t *ring, const char *location)
+{
+       int err;
+
+       memset(ring, 0, sizeof(td_uring_t));
+
+       /* asprintf() leaves the pointer undefined on failure and is not
+        * documented to set errno */
+       err = asprintf(&ring->shmem_path, "%s.shm", location);
+       if (err == -1) {
+               ring->shmem_path = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = asprintf(&ring->ctlfd_path, "%s.cfd", location);
+       if (err == -1) {
+               ring->ctlfd_path = NULL;
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = tapdisk_uring_connect_ctlfd(ring);
+       if (err)
+               goto fail;
+
+       err = tapdisk_uring_connect_shmem(ring);
+       if (err)
+               goto fail;
+
+       /* same layout as on the creating side: header, ring, data */
+       ring->ring_area = (char *)ring->shmem + sizeof(td_uring_header_t);
+       ring->data_area = (char *)ring->ring_area + ring->ring_size;
+
+       return 0;
+
+fail:
+       tapdisk_uring_disconnect(ring);
+       return err;
+}
+
+/*
+ * Disconnect from a ring joined with tapdisk_uring_connect(): drop the
+ * shared-memory mapping first, then the control socket.  Always
+ * returns 0.
+ */
+int
+tapdisk_uring_disconnect(td_uring_t *ring)
+{
+       tapdisk_uring_disconnect_shmem(ring);
+       tapdisk_uring_disconnect_ctlfd(ring);
+       return 0;
+}
+
+/*
+ * Read exactly one td_uring_message_t from @fd into @message.
+ *
+ * @timeout is in seconds; 0 means wait indefinitely.  Returns 0 once
+ * the whole message has been read, or -EIO on select/read failure,
+ * end-of-file or timeout.
+ */
+static int
+tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout)
+{
+       fd_set readfds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(td_uring_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       while (offset < len) {
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, &readfds, NULL, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &readfds)) {
+                       /* offset counts bytes: advance through the message
+                        * as a byte buffer, not in whole-struct steps */
+                       ret = read(fd, (char *)message + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;  /* timed out */
+       }
+
+       if (offset != len)
+               return -EIO;
+
+       return 0;
+}
+
+/*
+ * Write exactly one td_uring_message_t from @message to @fd.
+ *
+ * @timeout is in seconds; 0 means wait indefinitely.  Returns 0 once
+ * the whole message has been written, or -EIO on select/write failure
+ * or timeout.
+ */
+static int
+tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout)
+{
+       fd_set writefds;
+       int ret, len, offset;
+       struct timeval tv, *t;
+
+       t      = NULL;
+       offset = 0;
+       len    = sizeof(td_uring_message_t);
+
+       if (timeout) {
+               tv.tv_sec  = timeout;
+               tv.tv_usec = 0;
+               t = &tv;
+       }
+
+       while (offset < len) {
+               FD_ZERO(&writefds);
+               FD_SET(fd, &writefds);
+
+               /* we don't bother reinitializing tv. at worst, it will wait a
+                * bit more time than expected. */
+
+               ret = select(fd + 1, NULL, &writefds, NULL, t);
+               if (ret == -1)
+                       break;
+               else if (FD_ISSET(fd, &writefds)) {
+                       /* offset counts bytes: advance through the message
+                        * as a byte buffer, not in whole-struct steps */
+                       ret = write(fd, (char *)message + offset, len - offset);
+                       if (ret <= 0)
+                               break;
+                       offset += ret;
+               } else
+                       break;  /* timed out */
+       }
+
+       if (offset != len)
+               return -EIO;
+
+       return 0;
+}
+
+/*
+ * Wait (with a 1 second timeout) for a message on the control socket.
+ * Returns 0 if a KICK message arrived, -EINVAL for any other message
+ * type, or the error from reading the message.
+ */
+int
+tapdisk_uring_poll(td_uring_t *ring)
+{
+       int err;
+       td_uring_message_t message;
+
+       /* the helper defined in this file is tapdisk_ring_read_message */
+       err = tapdisk_ring_read_message(ring->ctlfd, &message, 1);
+       if (err)
+               return err;
+
+       if (message.type != TAPDISK_URING_MESSAGE_KICK)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Send a KICK message over the control socket (1 second timeout).
+ * Returns 0 on success or the error from writing the message.
+ */
+int
+tapdisk_uring_kick(td_uring_t *ring)
+{
+       td_uring_message_t message;
+
+       memset(&message, 0, sizeof(td_uring_message_t));
+       message.type = TAPDISK_URING_MESSAGE_KICK;
+
+       /* the helper defined in this file is tapdisk_ring_write_message */
+       return tapdisk_ring_write_message(ring->ctlfd, &message, 1);
+}
diff --git a/tools/blktap2/drivers/tapdisk-ring.h b/tools/blktap2/drivers/tapdisk-ring.h
new file mode 100644 (file)
index 0000000..a70ee10
--- /dev/null
@@ -0,0 +1,87 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_RING_H_
+#define _TAPDISK_RING_H_
+
+#include <inttypes.h>
+
+#include <xenctrl.h>
+#include <xen/io/ring.h>
+
+typedef struct td_uring             td_uring_t;
+typedef struct td_uring_header      td_uring_header_t;
+typedef struct td_uring_request     td_uring_request_t;
+typedef struct td_uring_response    td_uring_response_t;
+
+/*
+ * Per-process handle on a user-space ring: a control socket plus a
+ * shared-memory region laid out as header, ring area, data area.
+ */
+struct td_uring {
+       int                         ctlfd;       /* control socket */
+
+       char                       *shmem_path;  /* heap-allocated, "<location>.shm" */
+       char                       *ctlfd_path;  /* heap-allocated, "<location>.cfd" */
+
+       void                       *shmem;       /* start of the mapping */
+       void                       *ring_area;   /* shmem + sizeof(header) */
+       void                       *data_area;   /* ring_area + ring_size */
+
+       /* sizes in bytes, as used by tapdisk-ring.c when creating the
+        * region or read back from td_uring_header when connecting */
+       uint32_t                    ring_size;
+       uint32_t                    data_size;
+       uint32_t                    shmem_size;
+};
+
+/*
+ * Header stored at the start of the shared-memory region; a connecting
+ * process validates cookie and version and reads the sizes from it.
+ * NOTE(review): the members add up to 8 + 4*4 + 4064 = 4088 bytes, not
+ * 4096 -- confirm whether a page-sized header was intended.
+ */
+struct td_uring_header {
+       char                        cookie[8];      /* compared with TAPDISK_URING_COOKIE */
+       uint32_t                    version;        /* compared with TD_URING_CURRENT_VERSION */
+       uint32_t                    shmem_size;     /* size of the whole region */
+       uint32_t                    ring_size;
+       uint32_t                    data_size;
+       char                        reserved[4064];
+};
+
+/*
+ * Request entry placed on the ring.
+ * NOTE(review): field meanings below are inferred from the names only
+ * -- confirm against the producer/consumer code.
+ */
+struct td_uring_request {
+       uint8_t                     op;      /* presumably the operation code */
+       uint64_t                    id;      /* presumably echoed back in the response */
+       uint64_t                    sec;     /* presumably the starting sector */
+       uint32_t                    secs;    /* presumably the sector count */
+       uint32_t                    offset;  /* presumably an offset into the data area */
+};
+
+/*
+ * Response entry placed on the ring.
+ * NOTE(review): field meanings are inferred from the names only --
+ * confirm against the producer/consumer code.
+ */
+struct td_uring_response {
+       uint8_t                     op;      /* presumably the operation being answered */
+       uint64_t                    id;      /* presumably the id of the matching request */
+       uint8_t                     status;  /* presumably the completion status */
+};
+
+DEFINE_RING_TYPES(td_uring, td_uring_request_t, td_uring_response_t);
+
+int tapdisk_uring_create(td_uring_t *, const char *location,
+                       uint32_t ring_size, uint32_t data_size);
+int tapdisk_uring_destroy(td_uring_t *);
+
+int tapdisk_uring_connect(td_uring_t *, const char *location);
+int tapdisk_uring_disconnect(td_uring_t *);
+
+int tapdisk_uring_poll(td_uring_t *);
+int tapdisk_uring_kick(td_uring_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-server.c b/tools/blktap2/drivers/tapdisk-server.c
new file mode 100644 (file)
index 0000000..c6a3de5
--- /dev/null
@@ -0,0 +1,415 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/signal.h>
+
+#define TAPDISK
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define DBG(_level, _f, _a...)       tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...)         tlog_error(_err, _f, ##_a)
+
+ tapdisk_server_t server;
+
+#define tapdisk_server_for_each_vbd(vbd, tmp)                          \
+       list_for_each_entry_safe(vbd, tmp, &server.vbds, next)
+
+/*
+ * Look up the driver interface for disk type @type in the dtypes
+ * table.  Returns NULL when @type is not a valid index.
+ */
+struct tap_disk *
+tapdisk_server_find_driver_interface(int type)
+{
+       int n;
+
+       n = sizeof(dtypes) / sizeof(dtypes[0]);
+
+       /* valid indices are 0 .. n-1; the old "type > n" test let
+        * type == n and negative values index outside the table */
+       if (type < 0 || type >= n)
+               return NULL;
+
+       return dtypes[type]->drv;
+}
+
+/*
+ * Find an already-open image that @image could share.
+ *
+ * Returns NULL unless @image carries TD_OPEN_SHAREABLE.  Otherwise
+ * every image of every registered VBD is scanned and the first one
+ * with the same type and name is returned, or NULL if none matches.
+ */
+td_image_t *
+tapdisk_server_get_shared_image(td_image_t *image)
+{
+       td_vbd_t *vbd, *tmpv;
+       td_image_t *img, *tmpi;
+
+       if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE))
+               return NULL;
+
+       tapdisk_server_for_each_vbd(vbd, tmpv)
+               tapdisk_vbd_for_each_image(vbd, img, tmpi)
+                       if (img->type == image->type &&
+                           !strcmp(img->name, image->name))
+                               return img;
+
+       return NULL;
+}
+
+/*
+ * Return the registered VBD whose uuid equals @uuid, or NULL if there
+ * is none.
+ */
+td_vbd_t *
+tapdisk_server_get_vbd(uint16_t uuid)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               if (vbd->uuid == uuid)
+                       return vbd;
+
+       return NULL;
+}
+
+/* append @vbd to the tail of the server's VBD list */
+void
+tapdisk_server_add_vbd(td_vbd_t *vbd)
+{
+       list_add_tail(&vbd->next, &server.vbds);
+}
+
+/*
+ * Unlink @vbd from the server's VBD list, re-initialize its list node,
+ * and re-evaluate the server state (the server stops running once the
+ * list is empty).
+ */
+void
+tapdisk_server_remove_vbd(td_vbd_t *vbd)
+{
+       list_del(&vbd->next);
+       INIT_LIST_HEAD(&vbd->next);
+       tapdisk_server_check_state();
+}
+
+/* queue @tiocb on the server-wide AIO queue */
+void
+tapdisk_server_queue_tiocb(struct tiocb *tiocb)
+{
+       tapdisk_queue_tiocb(&server.aio_queue, tiocb);
+}
+
+/*
+ * Dump debugging state: the server AIO queue first, then every
+ * registered VBD, and finally flush the log.
+ */
+void
+tapdisk_server_debug(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_debug_queue(&server.aio_queue);
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_debug(vbd);
+
+       tlog_flush();
+}
+
+/*
+ * Clear server.run once no VBDs remain registered; the main loop runs
+ * only while server.run is set.
+ */
+void
+tapdisk_server_check_state(void)
+{
+       if (list_empty(&server.vbds))
+               server.run = 0;
+}
+
+/*
+ * Register an event with the server's scheduler; all arguments are
+ * forwarded unchanged to scheduler_register_event() and its result is
+ * returned (callers in this file treat a negative id as failure).
+ */
+event_id_t
+tapdisk_server_register_event(char mode, int fd,
+                             int timeout, event_cb_t cb, void *data)
+{
+       return scheduler_register_event(&server.scheduler,
+                                       mode, fd, timeout, cb, data);
+}
+
+/* remove @event from the server's scheduler */
+void
+tapdisk_server_unregister_event(event_id_t event)
+{
+       /* a void function must not "return expr;" in ISO C */
+       scheduler_unregister_event(&server.scheduler, event);
+}
+
+/* forward @seconds to the server scheduler's maximum timeout setting */
+void
+tapdisk_server_set_max_timeout(int seconds)
+{
+       scheduler_set_max_timeout(&server.scheduler, seconds);
+}
+
+/* intentionally empty placeholder, called once per main-loop iteration */
+static void
+tapdisk_server_assert_locks(void)
+{
+
+}
+
+/*
+ * If any VBD reports that a retry is needed, set the scheduler's
+ * maximum timeout to TD_VBD_RETRY_INTERVAL.  One such VBD is enough,
+ * so the scan stops at the first hit.
+ */
+static void
+tapdisk_server_set_retry_timeout(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               if (tapdisk_vbd_retry_needed(vbd)) {
+                       tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL);
+                       return;
+               }
+}
+
+/* run the progress check on every registered VBD */
+static void
+tapdisk_server_check_progress(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       /* the timestamp previously fetched here was never used */
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_check_progress(vbd);
+}
+
+/* submit everything queued on the server AIO queue; the count is ignored */
+static void
+tapdisk_server_submit_tiocbs(void)
+{
+       tapdisk_submit_all_tiocbs(&server.aio_queue);
+}
+
+/* call tapdisk_vbd_kick() on every registered VBD */
+static void
+tapdisk_server_kick_responses(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_kick(vbd);
+}
+
+/* call tapdisk_vbd_check_state() on every registered VBD */
+static void
+tapdisk_server_check_vbds(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_check_state(vbd);
+}
+
+/* call tapdisk_vbd_kill_queue() on every registered VBD */
+static void
+tapdisk_server_stop_vbds(void)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_vbd_kill_queue(vbd);
+}
+
+/* write the error @message to the IPC channel of every registered VBD */
+static void
+tapdisk_server_send_error(const char *message)
+{
+       td_vbd_t *vbd, *tmp;
+
+       tapdisk_server_for_each_vbd(vbd, tmp)
+               tapdisk_ipc_write_error(&vbd->ipc, message);
+}
+
+/*
+ * Scheduler callback for the server's IPC read descriptor: reads from
+ * the server IPC channel.  None of the callback arguments are used.
+ */
+static void
+tapdisk_server_read_ipc_message(event_id_t id, char mode, void *private)
+{
+       (void)id;
+       (void)mode;
+       (void)private;
+
+       tapdisk_ipc_read(&server.ipc);
+}
+
+/*
+ * Scheduler callback for the AIO queue's poll descriptor: completes
+ * the tiocbs on the server's AIO queue.  None of the callback
+ * arguments are used.
+ */
+static void
+tapdisk_server_aio_queue_event(event_id_t id, char mode, void *private)
+{
+       (void)id;
+       (void)mode;
+       (void)private;
+
+       tapdisk_complete_tiocbs(&server.aio_queue);
+}
+
+/*
+ * Tear down the server's AIO queue: first drop the scheduler event that
+ * watches it, then free the queue itself.
+ */
+static void
+tapdisk_server_free_aio_queue(void)
+{
+       struct tqueue *queue = &server.aio_queue;
+
+       tapdisk_server_unregister_event(server.aio_queue_event_id);
+       tapdisk_free_queue(queue);
+}
+
+/*
+ * Create the server's AIO queue (TAPDISK_TIOCBS entries) and register a
+ * read event on its poll descriptor.
+ *
+ * Returns 0 on success.  On failure returns the error from
+ * tapdisk_init_queue(), or the negative value returned by the event
+ * registration (in which case the queue is freed again).
+ */
+static int
+tapdisk_server_initialize_aio_queue(void)
+{
+       int rc;
+       event_id_t event;
+       struct tqueue *queue = &server.aio_queue;
+
+       rc = tapdisk_init_queue(queue, TAPDISK_TIOCBS, 0, NULL);
+       if (rc)
+               return rc;
+
+       event = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                             queue->poll_fd, 0,
+                                             tapdisk_server_aio_queue_event,
+                                             NULL);
+       if (event >= 0) {
+               server.aio_queue_event_id = event;
+               return 0;
+       }
+
+       /* registration failed: undo the queue setup */
+       tapdisk_free_queue(queue);
+       return event;
+}
+
+/*
+ * Release the server's resources: the AIO queue, the control event (if
+ * one was registered) and whichever IPC descriptors are open.
+ */
+static void
+tapdisk_server_close(void)
+{
+       td_ipc_t *ipc = &server.ipc;
+
+       tapdisk_server_free_aio_queue();
+
+       /* a control event id of 0 means none was registered */
+       if (server.control_event != 0)
+               scheduler_unregister_event(&server.scheduler,
+                                          server.control_event);
+
+       if (ipc->rfd != -1)
+               close(ipc->rfd);
+       if (ipc->wfd != -1)
+               close(ipc->wfd);
+}
+
+/*
+ * Server main loop.  Runs until server.run becomes zero.
+ *
+ * Each iteration: run the (currently empty) lock assertion, adjust the
+ * scheduler timeout if any VBD needs a retry, check VBD progress, block
+ * in the scheduler, then check VBD state, submit queued tiocbs and kick
+ * responses.
+ */
+static void
+__tapdisk_server_run(void)
+{
+       int ret;
+
+       while (server.run) {
+               tapdisk_server_assert_locks();
+               tapdisk_server_set_retry_timeout();
+               tapdisk_server_check_progress();
+
+               ret = scheduler_wait_for_events(&server.scheduler);
+               /* a failed wait is only logged; the loop keeps going */
+               if (ret < 0)
+                       DBG(TLOG_WARN, "server wait returned %d\n", ret);
+
+               tapdisk_server_check_vbds();
+               tapdisk_server_submit_tiocbs();
+               tapdisk_server_kick_responses();
+       }
+}
+
+/*
+ * Handler installed by tapdisk_server_run() for SIGBUS, SIGINT, SIGUSR1
+ * and SIGXFSZ.
+ *
+ *   SIGBUS / SIGINT: close every VBD.
+ *   SIGXFSZ:         log the condition, kill every VBD queue and, the
+ *                    first time only, send an error over each VBD's IPC.
+ *   SIGUSR1:         call tapdisk_server_debug().
+ *
+ * NOTE(review): all of this work happens directly in signal context;
+ * whether the helpers called here are async-signal-safe cannot be told
+ * from this file -- confirm.
+ */
+static void
+tapdisk_server_signal_handler(int signal)
+{
+       td_vbd_t *vbd, *tmp;
+       /* latched so the SIGXFSZ error message is sent at most once */
+       static int xfsz_error_sent = 0;
+
+       switch (signal) {
+       case SIGBUS:
+       case SIGINT:
+               tapdisk_server_for_each_vbd(vbd, tmp)
+                       tapdisk_vbd_close(vbd);
+               break;
+
+       case SIGXFSZ:
+               ERR(EFBIG, "received SIGXFSZ");
+               tapdisk_server_stop_vbds();
+               if (xfsz_error_sent)
+                       break;
+
+               tapdisk_server_send_error("received SIGXFSZ, closing queues");
+               xfsz_error_sent = 1;
+               break;
+
+       case SIGUSR1:
+               tapdisk_server_debug();
+               break;
+       }
+}
+
+/*
+ * Reset the global server state and bring it up.
+ *
+ * @read:  path opened as the IPC read channel, or NULL for none.
+ * @write: path opened as the IPC write channel, or NULL for none.
+ *
+ * Opens the requested IPC paths (O_RDWR | O_NONBLOCK), initializes the
+ * scheduler, registers a read event on the IPC read descriptor when
+ * @read was given, and creates the AIO queue.
+ *
+ * Returns 0 on success, otherwise a negative value (-errno from open(),
+ * or the error reported by the scheduler / AIO queue setup).  On
+ * failure everything acquired here is released again.
+ */
+int
+tapdisk_server_initialize(const char *read, const char *write)
+{
+       int err;
+       event_id_t event_id;
+
+       event_id = 0;
+       memset(&server, 0, sizeof(tapdisk_server_t));
+       server.ipc.rfd = server.ipc.wfd = -1;
+
+       INIT_LIST_HEAD(&server.vbds);
+
+       if (read) {
+               server.ipc.rfd = open(read, O_RDWR | O_NONBLOCK);
+               if (server.ipc.rfd < 0) {
+                       err = -errno;
+                       EPRINTF("FD open failed %s: %d\n", read, err);
+                       goto fail;
+               }
+       }
+
+       if (write) {
+               server.ipc.wfd = open(write, O_RDWR | O_NONBLOCK);
+               if (server.ipc.wfd < 0) {
+                       err = -errno;
+                       EPRINTF("FD open failed %s, %d\n", write, err);
+                       goto fail;
+               }
+       }
+
+       scheduler_initialize(&server.scheduler);
+
+       if (read) {
+               event_id = scheduler_register_event(&server.scheduler,
+                                                   SCHEDULER_POLL_READ_FD,
+                                                   server.ipc.rfd, 0,
+                                                   tapdisk_server_read_ipc_message,
+                                                   NULL);
+               if (event_id < 0) {
+                       err = event_id;
+                       goto fail;
+               }
+       }
+
+       err = tapdisk_server_initialize_aio_queue();
+       if (err)
+               goto fail;
+
+       server.control_event = event_id;
+       server.run = 1;
+
+       return 0;
+
+fail:
+       /*
+        * server.control_event has not been assigned yet on any path that
+        * reaches here, so the event to drop is the local event_id.  It is
+        * only positive once registration succeeded.
+        */
+       if (event_id > 0)
+               scheduler_unregister_event(&server.scheduler, event_id);
+
+       /* -1 is the "not open" marker; descriptor 0 is a valid fd */
+       if (server.ipc.rfd != -1) {
+               close(server.ipc.rfd);
+               server.ipc.rfd = -1;
+       }
+       if (server.ipc.wfd != -1) {
+               close(server.ipc.wfd);
+               server.ipc.wfd = -1;
+       }
+
+       return err;
+}
+
+/*
+ * Run the server until its main loop exits, then release its
+ * resources.
+ *
+ * Resource limits are applied first; if that fails the error is
+ * returned and the loop is never entered.  The same handler is
+ * installed for SIGBUS, SIGINT, SIGUSR1 and SIGXFSZ.
+ *
+ * Returns 0 once the loop has finished, or the error from
+ * tapdisk_set_resource_limits().
+ */
+int
+tapdisk_server_run(void)
+{
+       static const int handled[] = { SIGBUS, SIGINT, SIGUSR1, SIGXFSZ };
+       unsigned int i;
+       int err;
+
+       err = tapdisk_set_resource_limits();
+       if (err)
+               return err;
+
+       for (i = 0; i < sizeof(handled) / sizeof(handled[0]); i++)
+               signal(handled[i], tapdisk_server_signal_handler);
+
+       __tapdisk_server_run();
+       tapdisk_server_close();
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-server.h b/tools/blktap2/drivers/tapdisk-server.h
new file mode 100644 (file)
index 0000000..09a4e13
--- /dev/null
@@ -0,0 +1,65 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_SERVER_H_
+#define _TAPDISK_SERVER_H_
+
+#include "tapdisk-vbd.h"
+#include "tapdisk-queue.h"
+
+struct tap_disk *tapdisk_server_find_driver_interface(int);
+
+td_image_t *tapdisk_server_get_shared_image(td_image_t *);
+
+td_vbd_t *tapdisk_server_get_vbd(td_uuid_t);
+void tapdisk_server_add_vbd(td_vbd_t *);
+void tapdisk_server_remove_vbd(td_vbd_t *);
+
+void tapdisk_server_queue_tiocb(struct tiocb *);
+
+void tapdisk_server_check_state(void);
+
+event_id_t tapdisk_server_register_event(char, int, int, event_cb_t, void *);
+void tapdisk_server_unregister_event(event_id_t);
+void tapdisk_server_set_max_timeout(int);
+
+int tapdisk_server_initialize(const char *, const char *);
+int tapdisk_server_run(void);
+
+#define TAPDISK_TIOCBS              (TAPDISK_DATA_REQUESTS + 50)
+
+/*
+ * State of the tapdisk server; tapdisk-server.c keeps one global
+ * instance of it.
+ */
+typedef struct tapdisk_server {
+       /* main loop keeps iterating while this is non-zero */
+       int                          run;
+       /* server IPC channel (read/write descriptors, -1 when not open) */
+       td_ipc_t                     ipc;
+       /* list of VBDs attached to the server */
+       struct list_head             vbds;
+       /* event scheduler driving the main loop */
+       scheduler_t                  scheduler;
+       /* scheduler event for the IPC read descriptor; 0 when none */
+       event_id_t                   control_event;
+       /* AIO request queue */
+       struct tqueue                aio_queue;
+       /* scheduler event watching the AIO queue's poll descriptor */
+       event_id_t                   aio_queue_event_id;
+} tapdisk_server_t;
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-stream.c b/tools/blktap2/drivers/tapdisk-stream.c
new file mode 100644 (file)
index 0000000..8fa9d9e
--- /dev/null
@@ -0,0 +1,600 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+
+#define POLL_READ                        0
+#define POLL_WRITE                       1
+
+#define MIN(a, b)                        ((a) < (b) ? (a) : (b))
+
+/*
+ * Pipe used to wake the enqueue callback through the scheduler.
+ */
+struct tapdisk_stream_poll {
+       /* pipe descriptors, indexed by POLL_READ / POLL_WRITE; -1 if closed */
+       int                              pipe[2];
+       /* non-zero once a wake-up has been written and not yet cleared */
+       int                              set;
+};
+
+/*
+ * One in-flight read issued by the stream.
+ */
+struct tapdisk_stream_request {
+       /* first sector covered by the request */
+       uint64_t                         sec;
+       /* number of sectors covered by the request */
+       uint32_t                         secs;
+       /* issue order; output is written in increasing seqno order */
+       uint64_t                         seqno;
+       /* block request that is copied into the VBD's request slot */
+       blkif_request_t                  blkif_req;
+       /* links the request into the free, pending or completed list */
+       struct list_head                 next;
+};
+
+/*
+ * State of one image being streamed to the output descriptor.
+ */
+struct tapdisk_stream {
+       /* VBD the reads are issued against */
+       td_vbd_t                        *vbd;
+
+       /* identifier used to look the VBD up in the server */
+       unsigned int                     id;
+       /* initialized to -1; not otherwise used in this file's visible code */
+       int                              in_fd;
+       /* duplicate of stdout that the image data is written to */
+       int                              out_fd;
+
+       /* non-zero once an error occurred; stops further reads */
+       int                              err;
+
+       /* next sector to request */
+       uint64_t                         cur;
+       /* first sector of the range being streamed */
+       uint64_t                         start;
+       /* one past the last sector of the range */
+       uint64_t                         end;
+
+       /* number of requests issued so far (source of seqno values) */
+       uint64_t                         started;
+       /* seqno of the next request whose data may be written out */
+       uint64_t                         completed;
+
+       /* wake-up pipe for the enqueue callback */
+       struct tapdisk_stream_poll       poll;
+       /* scheduler event watching the read end of the pipe; 0 if none */
+       event_id_t                       enqueue_event_id;
+
+       /* requests available for reuse */
+       struct list_head                 free_list;
+       /* requests issued and not yet answered */
+       struct list_head                 pending_list;
+       /* answered requests, kept sorted by seqno, awaiting output */
+       struct list_head                 completed_list;
+
+       /* backing storage for all requests; index doubles as request id */
+       struct tapdisk_stream_request    requests[MAX_REQUESTS];
+};
+
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+/*
+ * Print the command-line synopsis to stdout and terminate the process
+ * with exit status `err`.  Does not return.
+ */
+static void
+usage(const char *app, int err)
+{
+       printf("usage: %s <-n type:/path/to/image> [-c sector count] "
+              "[-s skip sectors]\n",
+              app);
+       exit(err);
+}
+
+/*
+ * Put a poll structure into its "closed" state: both pipe descriptors
+ * are -1 and no wake-up is pending.
+ */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+       p->pipe[POLL_READ]  = -1;
+       p->pipe[POLL_WRITE] = -1;
+       p->set              = 0;
+}
+
+/*
+ * Create the wake-up pipe and make both ends non-blocking.
+ *
+ * Returns 0 on success or -errno of the failing pipe()/fcntl() call.
+ * On failure the structure is left in its closed state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+       int err;
+
+       tapdisk_stream_poll_initialize(p);
+
+       if (pipe(p->pipe))
+               return -errno;
+
+       if (fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK) ||
+           fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK)) {
+               /* capture errno now: close() below may overwrite it */
+               err = -errno;
+               close(p->pipe[POLL_READ]);
+               close(p->pipe[POLL_WRITE]);
+               tapdisk_stream_poll_initialize(p);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Close whichever ends of the wake-up pipe are open (read end first)
+ * and return the structure to its closed state.
+ */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+       int end;
+
+       for (end = POLL_READ; end <= POLL_WRITE; end++) {
+               if (p->pipe[end] == -1)
+                       continue;
+               close(p->pipe[end]);
+       }
+
+       tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one wake-up token from the pipe and mark the poll as not
+ * set.  The result of the read is deliberately not examined.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+       int token;
+       int rfd = p->pipe[POLL_READ];
+
+       read(rfd, &token, sizeof(token));
+       p->set = 0;
+}
+
+/*
+ * Request a wake-up of the enqueue callback by writing one token to
+ * the pipe, unless a wake-up is already pending.
+ *
+ * The poll is only marked as set when the token was actually written;
+ * if the write fails, a later call tries again instead of assuming a
+ * wake-up that will never arrive.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+       int dummy = 0;
+       ssize_t n;
+
+       if (p->set)
+               return;
+
+       n = write(p->pipe[POLL_WRITE], &dummy, sizeof(dummy));
+       if (n == (ssize_t)sizeof(dummy))
+               p->set = 1;
+}
+
+/*
+ * Tell whether the stream is finished: nothing is pending and either
+ * the whole range has been requested or an error was recorded.
+ * Returns 1 if so, 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+       if (!list_empty(&s->pending_list))
+               return 0;
+
+       return (s->cur == s->end || s->err);
+}
+
+/*
+ * Zero a request and make its list link an empty, self-referencing
+ * node.
+ */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+       memset(req, 0, sizeof(struct tapdisk_stream_request));
+       INIT_LIST_HEAD(&req->next);
+}
+
+/*
+ * Return the position of `req` within the stream's requests[] array.
+ */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+                          struct tapdisk_stream_request *req)
+{
+       return (int)(req - &s->requests[0]);
+}
+
+/*
+ * Take the first request off the free list and reset it.
+ * Returns NULL when the free list is empty.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_request *req = NULL;
+
+       if (!list_empty(&s->free_list)) {
+               req = list_entry(s->free_list.next,
+                                struct tapdisk_stream_request, next);
+               list_del_init(&req->next);
+               tapdisk_stream_initialize_request(req);
+       }
+
+       return req;
+}
+
+/*
+ * Write the data of a completed request to the output descriptor.
+ *
+ * The data is taken from the ring buffer slot that belongs to the
+ * request's index.  write() may accept fewer bytes than asked for, so
+ * the call is repeated until the whole request has been written.  On a
+ * write failure s->err is set (if not already set) so that the stream
+ * stops and the failure is reported through the exit status instead of
+ * producing silently truncated output.
+ */
+static void
+tapdisk_stream_print_request(struct tapdisk_stream *s,
+                            struct tapdisk_stream_request *sreq)
+{
+       unsigned long idx = (unsigned long)tapdisk_stream_request_idx(s, sreq);
+       char *buf = (char *)MMAP_VADDR(s->vbd->ring.vstart, idx, 0);
+       size_t left = (size_t)sreq->secs << SECTOR_SHIFT;
+
+       while (left) {
+               ssize_t n = write(s->out_fd, buf, left);
+
+               if (n > 0) {
+                       buf  += n;
+                       left -= n;
+                       continue;
+               }
+
+               /* interrupted before anything was written: just retry */
+               if (n < 0 && errno == EINTR)
+                       continue;
+
+               {
+                       /* read errno before fprintf() can change it */
+                       int err = (n < 0 ? errno : EIO);
+
+                       if (!s->err)
+                               s->err = err;
+                       fprintf(stderr, "error writing sector 0x%"PRIx64": %d\n",
+                               sreq->sec, err);
+               }
+               return;
+       }
+}
+
+/*
+ * Flush completed requests to the output in sequence order.
+ *
+ * The completed list is sorted by seqno, so only its head can be the
+ * next one due.  Requests are written and recycled onto the free list
+ * until the head is not the expected seqno (a gap) or the list is
+ * empty.
+ */
+static void
+tapdisk_stream_write_data(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_request *head;
+
+       while (!list_empty(&s->completed_list)) {
+               head = list_entry(s->completed_list.next,
+                                 struct tapdisk_stream_request, next);
+
+               if (head->seqno != s->completed)
+                       break;
+
+               s->completed++;
+               tapdisk_stream_print_request(s, head);
+
+               list_del_init(&head->next);
+               list_add_tail(&head->next, &s->free_list);
+       }
+}
+
+/*
+ * Insert a request into the completed list, keeping the list ordered
+ * by ascending seqno: it goes in front of the first entry with a larger
+ * seqno, or at the tail if there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+                              struct tapdisk_stream_request *sreq)
+{
+       struct tapdisk_stream_request *cur;
+       struct list_head *before = &s->completed_list;
+
+       list_for_each_entry(cur, &s->completed_list, next) {
+               if (sreq->seqno < cur->seqno) {
+                       before = &cur->next;
+                       break;
+               }
+       }
+
+       /* adding at the "tail" of a node inserts directly before it */
+       list_add_tail(&sreq->next, before);
+}
+
+/*
+ * VBD response callback (installed with tapdisk_vbd_set_callback()).
+ *
+ * @arg: the struct tapdisk_stream the response belongs to.
+ * @rsp: response; rsp->id is the index of the request in s->requests.
+ *
+ * A successful request is moved to the completed list.  A failed one
+ * records EIO in s->err, is returned to the free list and is reported
+ * on stderr.  In both cases any data that is now in sequence is written
+ * out and the enqueue callback is woken up.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+       struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+       struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+       /* take the request off the pending list */
+       list_del_init(&sreq->next);
+
+       if (rsp->status == BLKIF_RSP_OKAY)
+               tapdisk_stream_queue_completed(s, sreq);
+       else {
+               s->err = EIO;
+               list_add_tail(&sreq->next, &s->free_list);
+               /* the "0x" prefix needs a hexadecimal conversion */
+               fprintf(stderr, "error reading sector 0x%"PRIx64"\n", sreq->sec);
+       }
+
+       tapdisk_stream_write_data(s);
+       tapdisk_stream_poll_set(&s->poll);
+}
+
+/*
+ * Scheduler callback (also called once directly from
+ * tapdisk_stream_run()): issue as many read requests as there are free
+ * request slots.
+ *
+ * @id, @mode: unused.
+ * @arg:       the struct tapdisk_stream to work on.
+ *
+ * If the stream is finished the image is closed instead.  Otherwise
+ * each free request is filled with up to
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST segments of at most one page each,
+ * copied into the VBD request slot with the same index, and queued on
+ * the VBD's new_requests list; finally the VBD is asked to issue them.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+       td_vbd_t *vbd;
+       int i, idx, psize;
+       struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+       vbd = s->vbd;
+       /* consume the wake-up that got us here */
+       tapdisk_stream_poll_clear(&s->poll);
+
+       if (tapdisk_stream_stop(s)) {
+               tapdisk_stream_close_image(s);
+               return;
+       }
+
+       psize = getpagesize();
+
+       while (s->cur < s->end && !s->err) {
+               blkif_request_t *breq;
+               td_vbd_request_t *vreq;
+               struct tapdisk_stream_request *sreq;
+
+               /* no free slot: wait for responses to recycle some */
+               sreq = tapdisk_stream_get_request(s);
+               if (!sreq)
+                       break;
+
+               idx                 = tapdisk_stream_request_idx(s, sreq);
+
+               sreq->sec           = s->cur;
+               sreq->secs          = 0;
+               sreq->seqno         = s->started++;
+
+               breq                = &sreq->blkif_req;
+               breq->id            = idx;
+               breq->nr_segments   = 0;
+               breq->sector_number = sreq->sec;
+               breq->operation     = BLKIF_OP_READ;
+
+               for (i = 0; i < BLKIF_MAX_SEGMENTS_PER_REQUEST; i++) {
+                       /* one segment covers at most one page of sectors */
+                       uint32_t secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+                       struct blkif_request_segment *seg = breq->seg + i;
+
+                       /* range exhausted */
+                       if (!secs)
+                               break;
+
+                       sreq->secs += secs;
+                       s->cur     += secs;
+
+                       seg->first_sect = 0;
+                       seg->last_sect  = secs - 1;
+                       breq->nr_segments++;
+               }
+
+               /* the VBD request slot shares the stream request's index */
+               vreq = vbd->request_list + idx;
+
+               assert(list_empty(&vreq->next));
+               assert(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, breq, sizeof(*breq));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+               list_add_tail(&sreq->next, &s->pending_list);
+       }
+
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Bring up the server without IPC channels, create a VBD for this
+ * stream and open the image read-only on it.
+ *
+ * @path: image path.
+ * @type: disk type identifier.
+ *
+ * Returns 0 on success.  On failure the error is reported on stderr and
+ * returned: the value from the failing helper, or ENODEV when the VBD
+ * cannot be looked up after initialization.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *path, int type)
+{
+       int rc;
+
+       s->id = tapdisk_stream_count++;
+
+       rc = tapdisk_server_initialize(NULL, NULL);
+       if (!rc)
+               rc = tapdisk_vbd_initialize(-1, -1, s->id);
+       if (rc)
+               goto fail;
+
+       s->vbd = tapdisk_server_get_vbd(s->id);
+       if (!s->vbd) {
+               rc = ENODEV;
+               goto fail;
+       }
+
+       /* responses for this VBD are delivered to the stream */
+       tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s);
+
+       rc = tapdisk_vbd_open_vdi(s->vbd, path, type,
+                                 TAPDISK_STORAGE_TYPE_DEFAULT,
+                                 TD_OPEN_RDONLY);
+       if (rc)
+               goto fail;
+
+       s->vbd->reopened = 1;
+       return 0;
+
+fail:
+       fprintf(stderr, "failed to open %s: %d\n", path, rc);
+       return rc;
+}
+
+/*
+ * Close the stream's image and dispose of its VBD: close the VDI,
+ * detach the VBD from the server, then free the ring buffer, the name
+ * and the VBD itself.  Does nothing if the server no longer knows a VBD
+ * with the stream's id.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+       td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+       if (!vbd)
+               return;
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_server_remove_vbd(vbd);
+
+       free((void *)vbd->ring.vstart);
+       free(vbd->name);
+       free(vbd);
+
+       s->vbd = NULL;
+}
+
+/*
+ * Select the sector range to stream.
+ *
+ * @count: number of sectors to stream, or (uint64_t)-1 for "everything
+ *         from @skip to the end of the image".
+ * @skip:  number of sectors to skip at the start of the image.
+ *
+ * Returns 0 on success, the error from tapdisk_vbd_get_image_info(), or
+ * -EINVAL when the requested range does not fit inside the image.
+ *
+ * The bounds are checked without ever computing an unchecked
+ * `count + skip` or `size - skip`, both of which can wrap around in
+ * unsigned arithmetic and would let an out-of-range request through.
+ */
+static int
+tapdisk_stream_set_position(struct tapdisk_stream *s,
+                           uint64_t count, uint64_t skip)
+{
+       int err;
+       image_t image;
+       uint64_t size;
+
+       err = tapdisk_vbd_get_image_info(s->vbd, &image);
+       if (err) {
+               fprintf(stderr, "failed getting image size: %d\n", err);
+               return err;
+       }
+
+       size = (uint64_t)image.size;
+
+       /* must come first: size - skip below would wrap otherwise */
+       if (skip > size) {
+               fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+                       skip, size);
+               return -EINVAL;
+       }
+
+       if (count == (uint64_t)-1)
+               count = size - skip;
+
+       /* equivalent to count + skip > size, but cannot overflow */
+       if (count > size - skip) {
+               fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+                       (uint64_t) (count + skip), size);
+               return -EINVAL;
+       }
+
+       s->start = skip;
+       s->cur   = s->start;
+       s->end   = s->start + count;
+
+       return 0;
+}
+
+/*
+ * Allocate the page-aligned data buffer the VBD will read into and put
+ * every stream request on the free list.
+ *
+ * Returns 0 on success or the error code from posix_memalign(), in
+ * which case ring->vstart is reset to 0.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+       int rc, idx, pagesz;
+       size_t bytes;
+       td_ring_t *ring = &s->vbd->ring;
+
+       pagesz = getpagesize();
+       bytes  = pagesz * BLKTAP_MMAP_REGION_SIZE;
+
+       /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */
+       rc = posix_memalign((void **)&ring->vstart, pagesz, bytes);
+       if (rc) {
+               fprintf(stderr, "failed to allocate buffers: %d\n", rc);
+               ring->vstart = 0;
+               return rc;
+       }
+
+       for (idx = 0; idx < MAX_REQUESTS; idx++) {
+               tapdisk_stream_initialize_request(&s->requests[idx]);
+               list_add_tail(&s->requests[idx].next, &s->free_list);
+       }
+
+       return 0;
+}
+
+/*
+ * Open the wake-up pipe and register tapdisk_stream_enqueue() as the
+ * read callback on it.
+ *
+ * Returns 0 on success.  On failure the error (from opening the pipe or
+ * from the event registration) is reported on stderr and returned.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+       int rc;
+
+       rc = tapdisk_stream_poll_open(&s->poll);
+       if (!rc) {
+               rc = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                  s->poll.pipe[POLL_READ], 0,
+                                                  tapdisk_stream_enqueue, s);
+               if (rc >= 0) {
+                       /* a non-negative result is the event id */
+                       s->enqueue_event_id = rc;
+                       return 0;
+               }
+       }
+
+       fprintf(stderr, "failed to register event: %d\n", rc);
+       return rc;
+}
+
+/*
+ * Undo tapdisk_stream_register_enqueue_event(): drop the scheduler
+ * event if one is registered, then close the wake-up pipe.
+ */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+       event_id_t id = s->enqueue_event_id;
+
+       if (id != 0) {
+               tapdisk_server_unregister_event(id);
+               s->enqueue_event_id = 0;
+       }
+
+       tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Put a stream into a well-defined empty state: all fields zero, every
+ * descriptor marked as not open (-1) and all lists empty.
+ *
+ * The wake-up pipe must be marked closed here as well.  memset() alone
+ * leaves both pipe descriptors at 0, and if opening the stream fails
+ * before the pipe is created, the release path would then close
+ * descriptor 0 (stdin).
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+       memset(s, 0, sizeof(*s));
+       s->in_fd = s->out_fd = -1;
+       tapdisk_stream_poll_initialize(&s->poll);
+       INIT_LIST_HEAD(&s->free_list);
+       INIT_LIST_HEAD(&s->pending_list);
+       INIT_LIST_HEAD(&s->completed_list);
+}
+
+/*
+ * Duplicate stdout as the stream's output descriptor.
+ *
+ * Returns 0 on success or the (positive) errno of the failed dup().
+ */
+static int
+tapdisk_stream_open_fds(struct tapdisk_stream *s)
+{
+       int err;
+
+       s->out_fd = dup(STDOUT_FILENO);
+       if (s->out_fd == -1) {
+               /*
+                * Capture errno first: fprintf() may change it, which
+                * could even turn this failure into a return value of 0.
+                */
+               err = errno;
+               fprintf(stderr, "failed to open output: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Fully set up a stream: reset it, open the output descriptor, open the
+ * image, select the sector range, allocate the requests and register
+ * the enqueue event -- in that order, stopping at the first failure.
+ *
+ * Returns 0 on success or the error from the step that failed.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *path,
+                   int type, uint64_t count, uint64_t skip)
+{
+       int rc;
+
+       tapdisk_stream_initialize(s);
+
+       rc = tapdisk_stream_open_fds(s);
+       if (!rc)
+               rc = tapdisk_stream_open_image(s, path, type);
+       if (!rc)
+               rc = tapdisk_stream_set_position(s, count, skip);
+       if (!rc)
+               rc = tapdisk_stream_initialize_requests(s);
+       if (!rc)
+               rc = tapdisk_stream_register_enqueue_event(s);
+
+       return rc;
+}
+
+/*
+ * Release everything a stream holds: the output descriptor, the image
+ * and VBD, and the enqueue event with its pipe.
+ *
+ * The output descriptor is only closed when it is actually open, and is
+ * marked closed afterwards so a repeated call cannot close an unrelated
+ * descriptor that reused the number.
+ */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+       if (s->out_fd != -1) {
+               close(s->out_fd);
+               s->out_fd = -1;
+       }
+
+       tapdisk_stream_close_image(s);
+       tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Stream the image: issue the first batch of reads by calling the
+ * enqueue callback directly, then hand control to the server loop.
+ *
+ * Returns the stream's error state once the server loop has returned
+ * (0 if no error was recorded).
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+       event_id_t id = s->enqueue_event_id;
+
+       tapdisk_stream_enqueue(id, SCHEDULER_POLL_READ_FD, s);
+       tapdisk_server_run();
+
+       return s->err;
+}
+
+/*
+ * tapdisk-stream entry point.
+ *
+ * Options:
+ *   -n type:/path/to/image   image to stream (required)
+ *   -c count                 number of sectors to stream
+ *   -s skip                  number of sectors to skip first
+ *   -h                       print usage and exit
+ *
+ * Returns 0 on success, otherwise the error code of the step that
+ * failed.
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err, type;
+       char *params, *path;
+       uint64_t count, skip;
+       struct tapdisk_stream stream;
+
+       err    = 0;
+       skip   = 0;
+       /* (uint64_t)-1 means "up to the end of the image" */
+       count  = (uint64_t)-1;
+       params = NULL;
+
+       while ((c = getopt(argc, argv, "n:c:s:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       params = optarg;
+                       break;
+               case 'c':
+                       /* NOTE(review): the number is not validated */
+                       count = strtoull(optarg, NULL, 10);
+                       break;
+               case 's':
+                       /* NOTE(review): the number is not validated */
+                       skip = strtoull(optarg, NULL, 10);
+                       break;
+               default:
+                       err = EINVAL;
+                       /* deliberate fall through: usage() exits with err */
+               case 'h':
+                       usage(argv[0], err);
+               }
+       }
+
+       if (!params)
+               usage(argv[0], EINVAL);
+
+       err = tapdisk_parse_disk_type(params, &path, &type);
+       if (err) {
+               fprintf(stderr, "invalid argument %s: %d\n", params, err);
+               return err;
+       }
+
+       tapdisk_start_logging("tapdisk-stream");
+
+       err = tapdisk_stream_open(&stream, path, type, count, skip);
+       if (err)
+               goto out;
+
+       err = tapdisk_stream_run(&stream);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       /* reached on success and on failure alike */
+       tapdisk_stream_release(&stream);
+       tapdisk_stop_logging();
+       return err;
+}
diff --git a/tools/blktap2/drivers/tapdisk-utils.c b/tools/blktap2/drivers/tapdisk-utils.c
new file mode 100644 (file)
index 0000000..560f3bf
--- /dev/null
@@ -0,0 +1,199 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/fs.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+
+#include "tapdisk.h"
+#include "disktypes.h"
+#include "blktaplib.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+/*
+ * Start logging for the process: open syslog with the identity
+ * "<name>[<pid>]" and open the tapdisk log at /tmp/tapdisk.log.
+ *
+ * The identity lives in a static buffer so that it stays valid after
+ * this function returns (presumably because openlog() keeps the pointer
+ * it is given -- see syslog(3)).
+ */
+void
+tapdisk_start_logging(const char *name)
+{
+       static char ident[128];
+       int pid = getpid();
+
+       snprintf(ident, sizeof(ident), "%s[%d]", name, pid);
+       openlog(ident, LOG_CONS | LOG_ODELAY, LOG_DAEMON);
+       open_tlog("/tmp/tapdisk.log", (64 << 10), TLOG_WARN, 0);
+}
+
+/*
+ * Counterpart of tapdisk_start_logging(): close syslog and the tapdisk
+ * log.
+ */
+void
+tapdisk_stop_logging(void)
+{
+       closelog();
+       close_tlog();
+}
+
+int
+tapdisk_set_resource_limits(void)
+{
+       int err;
+       struct rlimit rlim;
+
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+
+       err = setrlimit(RLIMIT_MEMLOCK, &rlim);
+       if (err == -1) {
+               EPRINTF("RLIMIT_MEMLOCK failed: %d\n", errno);
+               return -errno;
+       }
+
+       err = mlockall(MCL_CURRENT | MCL_FUTURE);
+       if (err == -1) {
+               EPRINTF("mlockall failed: %d\n", errno);
+               return -errno;
+       }
+
+#define CORE_DUMP
+#if defined(CORE_DUMP)
+       err = setrlimit(RLIMIT_CORE, &rlim);
+       if (err == -1)
+               EPRINTF("RLIMIT_CORE failed: %d\n", errno);
+#endif
+
+       return 0;
+}
+
+/*
+ * Duplicate `name` into a newly allocated string stored in *dup.
+ *
+ * Returns 0 on success, -ENAMETOOLONG if the name is MAX_NAME_LEN
+ * characters or longer, or -ENOMEM if the allocation fails.  *dup is
+ * NULL whenever an error is returned.
+ */
+int
+tapdisk_namedup(char **dup, const char *name)
+{
+       char *copy;
+
+       *dup = NULL;
+
+       if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+               return -ENAMETOOLONG;
+
+       copy = strdup(name);
+       if (!copy)
+               return -ENOMEM;
+
+       *dup = copy;
+       return 0;
+}
+
+/*
+ * Split a "handle:path" disk specification.
+ *
+ * @params: the specification, e.g. "<handle>:/path/to/image".
+ * @_path:  set to the part after the first ':' (points into @params).
+ * @_type:  set to the idnum of the matching entry in dtypes[].
+ *
+ * Returns 0 on success, -ENAMETOOLONG if the specification or its
+ * handle is too long, -EINVAL if there is no ':' or the handle is
+ * empty, and -ENODEV if no usable driver matches the handle.
+ */
+int
+tapdisk_parse_disk_type(const char *params, char **_path, int *_type)
+{
+       int i, size;
+       size_t handle_len;
+       char *ptr, *path, handle[10];
+
+       if (strlen(params) + 1 >= MAX_NAME_LEN)
+               return -ENAMETOOLONG;
+
+       ptr = strchr(params, ':');
+       if (!ptr)
+               return -EINVAL;
+
+       path = ptr + 1;
+
+       handle_len = ptr - params;
+
+       /* an empty handle would compare equal to every driver below */
+       if (!handle_len)
+               return -EINVAL;
+
+       /* >= rather than >: one byte is needed for the terminating NUL */
+       if (handle_len >= sizeof(handle))
+               return -ENAMETOOLONG;
+
+       memcpy(handle, params, handle_len);
+       handle[handle_len] = '\0';
+
+       size = sizeof(dtypes) / sizeof(disk_info_t *);
+       for (i = 0; i < size; i++) {
+               /*
+                * NOTE(review): only the first handle_len characters are
+                * compared, so a handle that is a prefix of a driver's
+                * handle matches too -- confirm whether that is intended.
+                */
+               if (strncmp(handle, dtypes[i]->handle, handle_len))
+                       continue;
+
+               if (dtypes[i]->idnum == -1)
+                       return -ENODEV;
+
+               *_type = dtypes[i]->idnum;
+               *_path = path;
+
+               return 0;
+       }
+
+       return -ENODEV;
+}
+
+/*Get Image size, secsize*/
+int
+tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size)
+{
+       int ret;
+       struct stat stat;
+       uint64_t sectors;
+       uint32_t sector_size;
+
+       sectors       = 0;
+       sector_size   = 0;
+       *_sectors     = 0;
+       *_sector_size = 0;
+
+       if (fstat(fd, &stat)) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /*Accessing block device directly*/
+               if (ioctl(fd, BLKGETSIZE, &sectors)) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+
+               /*Get the sector size*/
+#if defined(BLKSSZGET)
+               {
+                       int arg;
+                       sector_size = DEFAULT_SECTOR_SIZE;
+                       ioctl(fd, BLKSSZGET, &sector_size);
+
+                       if (sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %u (not %d)\n",
+                                       sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#else
+               sector_size = DEFAULT_SECTOR_SIZE;
+#endif
+
+       } else {
+               /*Local file? try fstat instead*/
+               sectors     = (stat.st_size >> SECTOR_SHIFT);
+               sector_size = DEFAULT_SECTOR_SIZE;
+       }
+
+       if (sectors == 0) {             
+               sectors     = 16836057ULL;
+               sector_size = DEFAULT_SECTOR_SIZE;
+       }
+
+       return 0;
+}
diff --git a/tools/blktap2/drivers/tapdisk-utils.h b/tools/blktap2/drivers/tapdisk-utils.h
new file mode 100644 (file)
index 0000000..216c902
--- /dev/null
@@ -0,0 +1,42 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _TAPDISK_UTILS_H_
+#define _TAPDISK_UTILS_H_
+
+#include <inttypes.h>
+
+/* NOTE(review): presumably an upper bound on image name length -- confirm at users */
+#define MAX_NAME_LEN                 1000
+
+/* logging control; NOTE(review): meaning of the string argument not visible here */
+void tapdisk_start_logging(const char *);
+void tapdisk_stop_logging(void);
+int tapdisk_set_resource_limits(void);
+/* (destination pointer, source string); callers treat non-zero as failure */
+int tapdisk_namedup(char **, const char *);
+/* presumably (disk spec, out path, out driver type); returns 0 or -ENODEV */
+int tapdisk_parse_disk_type(const char *, char **, int *);
+/* (fd, out sectors, out sector size); returns 0 or -EINVAL */
+int tapdisk_get_image_size(int, uint64_t *, uint32_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
new file mode 100644 (file)
index 0000000..1eaaee9
--- /dev/null
@@ -0,0 +1,1758 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <regex.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+#include "libvhd.h"
+#include "tapdisk-image.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-vbd.h"
+#include "blktap2.h"
+
+/* shorthands forwarding a level (DBG) or error code (ERR) plus printf-style args to tlog */
+#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
+
+/*
+ * ASSERT(p): when @p is false, log the failed expression together with its
+ * line and file, then terminate the process.
+ *
+ * abort() is used instead of a store through a NULL pointer: the latter is
+ * undefined behaviour and the compiler is free to drop it, which would let
+ * execution continue past a failed assertion.
+ */
+#if 1
+#define ASSERT(p)                                                      \
+       do {                                                            \
+               if (!(p)) {                                             \
+                       DPRINTF("Assertion '%s' failed, line %d, "      \
+                               "file %s", #p, __LINE__, __FILE__);     \
+                       abort();                                        \
+               }                                                       \
+       } while (0)
+#else
+#define ASSERT(p) ((void)0)
+#endif
+
+
+/* number of attempts made by the open/reopen/resume retry loops in this file */
+#define TD_VBD_EIO_RETRIES          10
+/* argument passed to sleep() between those attempts (seconds) */
+#define TD_VBD_EIO_SLEEP            1
+/* NOTE(review): not referenced in the visible part of this file -- confirm unit and use */
+#define TD_VBD_WATCHDOG_TIMEOUT     10
+
+static void tapdisk_vbd_ring_event(event_id_t, char, void *);
+static void tapdisk_vbd_callback(void *, blkif_response_t *);
+
+/* 
+ * initialization
+ */
+
+/* Zero a request slot and initialise its list link. */
+static inline void
+tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq)
+{
+       memset(vreq, 0, sizeof(*vreq));
+       INIT_LIST_HEAD(&vreq->next);
+}
+
+/*
+ * Allocate a vbd for @uuid, wire it to the IPC descriptors @rfd/@wfd and
+ * register it with the tapdisk server.
+ *
+ * Returns 0 on success, -EEXIST if a vbd with this uuid is already
+ * registered, -ENOMEM if the allocation fails.
+ */
+int
+tapdisk_vbd_initialize(int rfd, int wfd, uint16_t uuid)
+{
+       int slot;
+       td_vbd_t *self;
+
+       if (tapdisk_server_get_vbd(uuid)) {
+               EPRINTF("duplicate vbds! %u\n", uuid);
+               return -EEXIST;
+       }
+
+       self = calloc(1, sizeof(*self));
+       if (self == NULL) {
+               EPRINTF("failed to allocate tapdisk state\n");
+               return -ENOMEM;
+       }
+
+       /* no ring device is mapped yet */
+       self->ring.fd  = -1;
+
+       self->uuid     = uuid;
+       self->ipc.uuid = uuid;
+       self->ipc.rfd  = rfd;
+       self->ipc.wfd  = wfd;
+
+       /* default blktap ring completion */
+       self->argument = self;
+       self->callback = tapdisk_vbd_callback;
+
+       INIT_LIST_HEAD(&self->next);
+       INIT_LIST_HEAD(&self->images);
+       INIT_LIST_HEAD(&self->new_requests);
+       INIT_LIST_HEAD(&self->pending_requests);
+       INIT_LIST_HEAD(&self->failed_requests);
+       INIT_LIST_HEAD(&self->completed_requests);
+       gettimeofday(&self->ts, NULL);
+
+       for (slot = 0; slot < MAX_REQUESTS; slot++)
+               tapdisk_vbd_initialize_vreq(&self->request_list[slot]);
+
+       tapdisk_server_add_vbd(self);
+
+       return 0;
+}
+
+/* Replace the completion callback of @vbd and the argument handed to it. */
+void
+tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument)
+{
+       vbd->argument = argument;
+       vbd->callback = callback;
+}
+
+/*
+ * Log every image in the chain and validate each one against the image
+ * that follows it.  Returns 0, or the first td_validate_parent() error.
+ */
+static int
+tapdisk_vbd_validate_chain(td_vbd_t *vbd)
+{
+       td_image_t *cur, *nxt;
+       int rc;
+
+       DPRINTF("VBD CHAIN:\n");
+
+       tapdisk_vbd_for_each_image(vbd, cur, nxt) {
+               DPRINTF("%s: %d\n", cur->name, cur->type);
+
+               /* the last image has nothing after it to validate against */
+               if (tapdisk_vbd_is_last_image(vbd, cur))
+                       break;
+
+               rc = td_validate_parent(cur, tapdisk_vbd_next_image(cur));
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+/*
+ * Close and free every image of @vbd, reset the image list and flag the
+ * vbd as TD_VBD_CLOSED.
+ */
+void
+tapdisk_vbd_close_vdi(td_vbd_t *vbd)
+{
+       td_image_t *cur, *nxt;
+
+       tapdisk_vbd_for_each_image(vbd, cur, nxt) {
+               td_close(cur);
+               tapdisk_image_free(cur);
+       }
+
+       /* every entry was just freed: start over with an empty list */
+       INIT_LIST_HEAD(&vbd->images);
+       td_flag_set(vbd->state, TD_VBD_CLOSED);
+}
+
+/*
+ * Put a DISK_TYPE_BLOCK_CACHE image in front of the first image in the
+ * chain that is opened both read-only and shareable.
+ *
+ * An existing cache is reused when td_load() finds one; otherwise a new
+ * driver is allocated, seeded with the target's driver info, and opened.
+ *
+ * Returns 0 on success or when no image qualifies, -ENOMEM on allocation
+ * failure, -ENODEV if the target has no driver, or the td_open() error.
+ */
+static int
+tapdisk_vbd_add_block_cache(td_vbd_t *vbd)
+{
+       int err;
+       td_image_t *cache, *image, *target, *tmp;
+
+       target = NULL;
+
+       tapdisk_vbd_for_each_image(vbd, image, tmp)
+               if (td_flag_test(image->flags, TD_OPEN_RDONLY) &&
+                   td_flag_test(image->flags, TD_OPEN_SHAREABLE)) {
+                       target = image;
+                       break;
+               }
+
+       if (!target)
+               return 0;
+
+       cache = tapdisk_image_allocate(target->name,
+                                      DISK_TYPE_BLOCK_CACHE,
+                                      target->storage,
+                                      target->flags,
+                                      target->private);
+       if (!cache)
+               return -ENOMEM;
+
+       /* try to load existing cache */
+       err = td_load(cache);
+       if (!err)
+               goto done;
+
+       /* hack driver to send open() correct image size */
+       if (!target->driver) {
+               err = -ENODEV;
+               goto fail;
+       }
+
+       cache->driver = tapdisk_driver_allocate(cache->type,
+                                               cache->name,
+                                               cache->flags,
+                                               cache->storage);
+       if (!cache->driver) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       cache->driver->info = target->driver->info;
+
+       /* try to open new cache */
+       err = td_open(cache);
+       if (!err)
+               goto done;
+
+fail:
+       /*
+        * give up: release the cache image allocated above.  The target
+        * is still linked into vbd->images and must not be freed here.
+        */
+       tapdisk_image_free(cache);
+       return err;
+
+done:
+       /* insert cache before image */
+       list_add(&cache->next, target->next.prev);
+       return 0;
+}
+
+/*
+ * Open a DISK_TYPE_LOG image that mirrors the first image of the chain
+ * and link it at the head of vbd->images.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the td_open()
+ * error; the log image is freed on any failure.
+ */
+static int
+tapdisk_vbd_add_dirty_log(td_vbd_t *vbd)
+{
+       int err;
+       td_driver_t *driver;
+       td_image_t *log, *parent;
+
+       parent = tapdisk_vbd_first_image(vbd);
+
+       log = tapdisk_image_allocate(parent->name, DISK_TYPE_LOG,
+                                    parent->storage, parent->flags, vbd);
+       if (!log)
+               return -ENOMEM;
+
+       err    = -ENOMEM;
+       driver = tapdisk_driver_allocate(log->type, log->name,
+                                        log->flags, log->storage);
+       if (driver) {
+               /* the log driver reuses the parent's driver info */
+               driver->info = parent->driver->info;
+               log->driver  = driver;
+
+               err = td_open(log);
+               if (!err) {
+                       list_add(&log->next, &vbd->images);
+                       return 0;
+               }
+       }
+
+       tapdisk_image_free(log);
+       return err;
+}
+
+/*
+ * LVHD hack: have to rescan LVM metadata on pool
+ * slaves to register lvchanges made on master.  FIXME.
+ *
+ * Runs "lvchange -an <name>" followed by "lvchange -ay --refresh <name>"
+ * through system(3).
+ *
+ * Returns 0 on success, a negative errno if a command line could not be
+ * built, or the raw (non-zero) system(3) status of the activate command.
+ *
+ * NOTE(review): @name is pasted unquoted into a shell command line; only
+ * trusted volume paths may be passed in.
+ */
+static int
+tapdisk_vbd_reactivate_volume(const char *name)
+{
+       int err;
+       char *cmd;
+
+       DPRINTF("reactivating %s\n", name);
+
+       err = asprintf(&cmd, "lvchange -an %s", name);
+       if (err == -1) {
+               /*
+                * Sample errno before logging can disturb it, and never
+                * report success: asprintf need not set errno on failure.
+                */
+               err = errno ? -errno : -ENOMEM;
+               EPRINTF("failed to deactivate %s\n", name);
+               return err;
+       }
+
+       /*
+        * The result of the deactivation is deliberately ignored.
+        * Assume that LV deactivation failed because the LV is open,
+        * in which case the LVM information should be up-to-date and
+        * we don't need this step anyways. If the failure is due to a
+        * non-existent LV, the next command (lvchange -ay) will catch it.
+        * If we want to be more prudent/paranoid, we can instead check
+        * whether the LV is currently open (a bit more work).
+        */
+       err = system(cmd);
+       free(cmd);
+
+       err = asprintf(&cmd, "lvchange -ay --refresh %s", name);
+       if (err == -1) {
+               err = errno ? -errno : -ENOMEM;
+               EPRINTF("failed to activate %s\n", name);
+               return err;
+       }
+
+       err = system(cmd);
+       if (err)
+               EPRINTF("%s failed: %d\n", cmd, err);
+       free(cmd);
+       return err;
+}
+
+/*
+ * For LVM-backed vbds, reactivate the volume named by vbd->name and then
+ * follow the VHD parent locators, reactivating each ancestor in turn.
+ *
+ * @resume: non-zero forces the walk even if it already ran once, and
+ *          limits it to the first two volumes (child and parent).
+ *
+ * Returns 0 when there is nothing to do or on success (vbd->reactivated
+ * is then set), otherwise the first error encountered.
+ */
+static int
+tapdisk_vbd_reactivate_volumes(td_vbd_t *vbd, int resume)
+{
+       int i, cnt, err;
+       char *name, *new;
+       vhd_context_t vhd;
+       vhd_parent_locator_t *loc;
+
+       new  = NULL;
+       name = NULL;
+
+       if (vbd->storage != TAPDISK_STORAGE_TYPE_LVM)
+               return 0;
+
+       if (!resume && vbd->reactivated)
+               return 0;
+
+       /* work on a private copy: name is replaced by each parent path below */
+       name = strdup(vbd->name);
+       if (!name) {
+               EPRINTF("%s: nomem\n", vbd->name);
+               return -ENOMEM;
+       }
+
+       for (cnt = 0; 1; cnt++) {
+
+               /* only need to reactivate child and parent during resume */
+               if (resume && cnt == 2)
+                       break;
+
+               err = tapdisk_vbd_reactivate_volume(name);
+               if (err)
+                       goto fail;
+
+               /* names without "VHD" are not opened as VHDs: end of the walk */
+               if (!strstr(name, "VHD"))
+                       break;
+
+               /* retry the open, raising the libvhd log level after a failure */
+               for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+                       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+                       if (!err)
+                               break;
+
+                       libvhd_set_log_level(1);
+                       sleep(TD_VBD_EIO_SLEEP);
+               }
+               libvhd_set_log_level(0);
+               if (err)
+                       goto fail;
+
+               /* only differencing disks have a parent to follow */
+               if (vhd.footer.type != HD_TYPE_DIFF) {
+                       vhd_close(&vhd);
+                       break;
+               }
+
+               /* pick the first PLAT_CODE_MACX entry among the 8 locator slots */
+               loc = NULL;
+               for (i = 0; i < 8; i++)
+                       if (vhd.header.loc[i].code == PLAT_CODE_MACX) {
+                               loc = vhd.header.loc + i;
+                               break;
+                       }
+
+               if (!loc) {
+                       vhd_close(&vhd);
+                       err = -EINVAL;
+                       goto fail;
+               }
+
+               free(name);
+               err = vhd_parent_locator_read(&vhd, loc, &name);
+               vhd_close(&vhd);
+
+               if (err) {
+                       /* old name already freed above: avoid a double free at out: */
+                       name = NULL;
+                       goto fail;
+               }
+
+               /*
+                * vhd_parent_locator_read returns path relative to child:
+                * ./VG_XenStorage--<sr-uuid>-VHD--<vdi-uuid>
+                * we have to convert this to absolute path for lvm
+                *
+                * NOTE(review): name + 2 assumes the leading "./" is always
+                * present; nothing here checks the length or prefix.
+                */
+               err = asprintf(&new, "/dev/mapper/%s", name + 2);
+               if (err == -1) {
+                       err  = -errno;
+                       goto fail;
+               }
+
+               free(name);
+               name = new;
+       }
+
+       err = 0;
+       vbd->reactivated = 1;
+
+out:
+       free(name);
+       return err;
+
+fail:
+       EPRINTF("failed to reactivate %s: %d\n", vbd->name, err);
+       goto out;
+}
+
+/*
+ * LVHD hack:
+ * raw volumes are named /dev/<sr-vg-name>-<sr-uuid>/LV-<sr-uuid>
+ * vhd volumes are named /dev/<sr-vg-name>-<sr-uuid>/VHD-<sr-uuid>
+ *
+ * a live snapshot of a raw volume will result in the writeable volume's
+ * name changing from the raw to vhd format, but this change will not be
+ * reflected by xenstore.  hence this mess.
+ *
+ * If vbd->name cannot be reactivated and has the raw ("LV") form, build
+ * the matching "VHD" name; when that path exists, vbd->name is replaced
+ * by it and vbd->type becomes DISK_TYPE_VHD.
+ *
+ * Returns 0 on success or for non-LVM storage, -EINVAL if the name does
+ * not parse, -ENOMEM on allocation failure, or -errno from access().
+ */
+static int
+tapdisk_vbd_check_file(td_vbd_t *vbd)
+{
+       int i, err, compiled;
+       regex_t re;
+       size_t len, max;
+       regmatch_t matches[4];
+       char *new, *out_pos, *tok, error[256];
+
+       compiled = 0;
+
+       if (vbd->storage != TAPDISK_STORAGE_TYPE_LVM)
+               return 0;
+
+       err = tapdisk_vbd_reactivate_volume(vbd->name);
+       if (!err)
+               return 0;
+       else
+               DPRINTF("reactivating %s failed\n", vbd->name);
+
+#define HEX   "[A-Za-z0-9]"
+#define UUID  HEX"\\{8\\}-"HEX"\\{4\\}-"HEX"\\{4\\}-"HEX"\\{4\\}-"HEX"\\{12\\}"
+#define VG    "VG_"HEX"\\+"
+#define TYPE  "\\(LV\\|VHD\\)"
+#define RE    "\\(/dev/"VG"-"UUID"/\\)"TYPE"\\(-"UUID"\\)"
+
+       err = regcomp(&re, RE, 0);
+       if (err)
+               goto regerr;
+
+       /* from here on re owns memory that regfree() must release */
+       compiled = 1;
+
+#undef HEX
+#undef UUID
+#undef VG
+#undef TYPE
+#undef RE
+
+       err = regexec(&re, vbd->name, 4, matches, 0);
+       if (err)
+               goto regerr;
+
+       /* room for the three tokens, "VHD" in place of "LV", and the NUL */
+       max = strlen("VHD") + 1;
+       for (i = 1; i < 4; i++) {
+               if (matches[i].rm_so == -1 || matches[i].rm_eo == -1) {
+                       EPRINTF("%s: failed to tokenize name\n", vbd->name);
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               max += matches[i].rm_eo - matches[i].rm_so;
+       }
+
+       new = malloc(max);
+       if (!new) {
+               EPRINTF("%s: failed to allocate new name\n", vbd->name);
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* rebuild: <prefix> + "VHD" + <-uuid> */
+       out_pos = new;
+       for (i = 1; i < 4; i++) {
+               tok = vbd->name + matches[i].rm_so;
+               len = matches[i].rm_eo - matches[i].rm_so;
+
+               if (i == 2) {
+                       /* only a raw ("LV") name can be converted */
+                       if (memcmp(tok, "LV", len)) {
+                               EPRINTF("%s: bad name format\n", vbd->name);
+                               free(new);
+                               err = -EINVAL;
+                               goto out;
+                       }
+
+                       out_pos += sprintf(out_pos, "VHD");
+                       continue;
+               }
+
+               memcpy(out_pos, tok, len);
+               out_pos += len;
+       }
+
+       *out_pos = '\0';
+
+       err = tapdisk_vbd_reactivate_volume(new);
+       if (err)
+               DPRINTF("reactivating %s failed\n", new);
+
+       err = access(new, F_OK);
+       if (err == -1) {
+               EPRINTF("neither %s nor %s accessible\n",
+                       vbd->name, new);
+               err = -errno;
+               free(new);
+               goto out;
+       }
+
+       DPRINTF("couldn't find %s, trying %s\n", vbd->name, new);
+
+       err = 0;
+       free(vbd->name);
+       vbd->name = new;
+       vbd->type = DISK_TYPE_VHD;
+
+out:
+       /* regfree() is only defined for a successfully compiled regex */
+       if (compiled)
+               regfree(&re);
+       return err;
+
+regerr:
+       regerror(err, &re, error, sizeof(error));
+       EPRINTF("%s: regex failed: %s\n", vbd->name, error);
+       err = -EINVAL;
+       goto out;
+}
+
+/*
+ * Build the image chain of @vbd: open vbd->name, then each parent reported
+ * by td_get_parent_id() until TD_NO_PARENT, optionally add the dirty log
+ * and block cache, and validate the result.
+ *
+ * @extra_flags: OR-ed into the open flags of every image in the chain.
+ *
+ * Returns 0 on success (TD_VBD_CLOSED is cleared).  On failure whatever
+ * was opened so far is closed again and the error is returned.
+ */
+static int
+__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
+{
+       char *file;
+       int err, type;
+       td_flag_t flags;
+       td_disk_id_t id;
+       td_image_t *image, *tmp;
+       /* NOTE(review): filter is never used in this function */
+       struct tfilter *filter = NULL;
+
+       err = tapdisk_vbd_reactivate_volumes(vbd, 0);
+       if (err)
+               return err;
+
+       /* the leaf image is never opened shareable; parents get the flag below */
+       flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
+       file  = vbd->name;
+       type  = vbd->type;
+
+       for (;;) {
+               err   = -ENOMEM;
+               image = tapdisk_image_allocate(file, type,
+                                              vbd->storage, flags, vbd);
+
+               /*
+                * parent names come from id.name and are released here;
+                * presumably tapdisk_image_allocate keeps its own copy --
+                * TODO confirm.  vbd->name itself is never freed here.
+                */
+               if (file != vbd->name) {
+                       free(file);
+                       file = NULL;
+               }
+
+               if (!image)
+                       goto fail;
+
+               /* -ENODEV from td_load falls back to a fresh td_open */
+               err = td_load(image);
+               if (err) {
+                       if (err != -ENODEV)
+                               goto fail;
+
+                       err = td_open(image);
+                       if (err)
+                               goto fail;
+               }
+
+               err = td_get_parent_id(image, &id);
+               if (err && err != TD_NO_PARENT) {
+                       td_close(image);
+                       goto fail;
+               }
+
+               if (!image->storage)
+                       image->storage = vbd->storage;
+
+               /* the chain owns the image now: keep fail: from freeing it */
+               tapdisk_vbd_add_image(vbd, image);
+               image = NULL;
+
+               if (err == TD_NO_PARENT)
+                       break;
+
+               file   = id.name;
+               type   = id.drivertype;
+               flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
+       }
+
+       if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
+               err = tapdisk_vbd_add_dirty_log(vbd);
+               if (err)
+                       goto fail;
+       }
+
+       if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) {
+               err = tapdisk_vbd_add_block_cache(vbd);
+               if (err)
+                       goto fail;
+       }               
+
+       err = tapdisk_vbd_validate_chain(vbd);
+       if (err)
+               goto fail;
+
+       td_flag_clear(vbd->state, TD_VBD_CLOSED);
+
+       return 0;
+
+fail:
+       /* free the image that never made it into the chain, then the chain */
+       if (image)
+               tapdisk_image_free(image);
+
+       tapdisk_vbd_close_vdi(vbd);
+
+       return err;
+}
+
+/*
+ * Record name, driver type, storage type and flags on @vbd and open its
+ * image chain, retrying while the open reports -EIO.
+ *
+ * Returns 0 on success, -EINVAL for an unknown driver type, or the error
+ * from tapdisk_namedup() / the chain open.  vbd->name is released again
+ * when the chain cannot be opened.
+ */
+int
+tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
+                    uint16_t drivertype, uint16_t storage, td_flag_t flags)
+{
+       int attempt, err;
+       struct tap_disk *ops;
+
+       ops = tapdisk_server_find_driver_interface(drivertype);
+       if (ops == NULL)
+               return -EINVAL;
+       DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n",
+               ops->disk_type, vbd->uuid, path, flags);
+
+       err = tapdisk_namedup(&vbd->name, path);
+       if (err)
+               return err;
+
+       vbd->type    = drivertype;
+       vbd->storage = storage;
+       vbd->flags   = flags;
+
+       /* only -EIO is worth another attempt */
+       attempt = 0;
+       do {
+               err = __tapdisk_vbd_open_vdi(vbd, 0);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       } while (++attempt < TD_VBD_EIO_RETRIES);
+
+       if (!err)
+               return 0;
+
+       free(vbd->name);
+       vbd->name = NULL;
+       return err;
+}
+
+/*
+ * Register tapdisk_vbd_ring_event() as the read-poll handler for the ring
+ * descriptor.  Returns 0 and stores the event id, or the negative id
+ * returned by the registration.
+ */
+static int
+tapdisk_vbd_register_event_watches(td_vbd_t *vbd)
+{
+       event_id_t id;
+
+       id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                          vbd->ring.fd, 0,
+                                          tapdisk_vbd_ring_event, vbd);
+       if (!(id < 0)) {
+               vbd->ring_event_id = id;
+               return 0;
+       }
+
+       return id;
+}
+
+/* Unregister the ring event, if one was registered (id is non-zero). */
+static void
+tapdisk_vbd_unregister_events(td_vbd_t *vbd)
+{
+       if (!vbd->ring_event_id)
+               return;
+
+       tapdisk_server_unregister_event(vbd->ring_event_id);
+}
+
+/*
+ * Open the blktap ring device @devname, map its shared region, initialise
+ * the back ring on it and switch the device to BLKTAP_MODE_INTERPOSE.
+ *
+ * Returns 0 on success or -errno from open()/mmap(); on failure the ring
+ * is left with fd == -1 and mem == NULL.
+ */
+static int
+tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname)
+{
+       td_ring_t *ring = &vbd->ring;
+       int psize = getpagesize();
+       int err;
+
+       ring->fd = open(devname, O_RDWR);
+       if (ring->fd == -1) {
+               err = -errno;
+               EPRINTF("failed to open %s: %d\n", devname, err);
+               goto fail;
+       }
+
+       ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE,
+                        PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0);
+       if (ring->mem == MAP_FAILED) {
+               err = -errno;
+               EPRINTF("failed to mmap %s: %d\n", devname, err);
+               goto fail;
+       }
+
+       /* the shared ring sits at the start of the mapping ... */
+       ring->sring = (blkif_sring_t *)((unsigned long)ring->mem);
+       BACK_RING_INIT(&ring->fe_ring, ring->sring, psize);
+
+       /* ... and vstart is set BLKTAP_RING_PAGES pages into it */
+       ring->vstart =
+               (unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize);
+
+       ioctl(ring->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
+
+       return 0;
+
+fail:
+       if (ring->mem && ring->mem != MAP_FAILED)
+               munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
+       if (ring->fd != -1)
+               close(ring->fd);
+       ring->mem = NULL;
+       ring->fd  = -1;
+       return err;
+}
+
+/*
+ * Undo tapdisk_vbd_map_device(): close the ring descriptor and unmap the
+ * shared region.
+ *
+ * The ring fields are reset afterwards so that a second call is a no-op
+ * (the open failure path and tapdisk_vbd_shutdown() may both end up
+ * here), and so that tapdisk_vbd_kick() sees sring == NULL instead of a
+ * pointer into unmapped memory.
+ *
+ * Always returns 0.
+ */
+static int
+tapdisk_vbd_unmap_device(td_vbd_t *vbd)
+{
+       int psize;
+       td_ring_t *ring;
+
+       ring  = &vbd->ring;
+       psize = getpagesize();
+
+       if (ring->fd != -1) {
+               close(ring->fd);
+               ring->fd = -1;
+       }
+
+       /* compare against NULL/MAP_FAILED rather than ordering a pointer with 0 */
+       if (ring->mem && ring->mem != MAP_FAILED)
+               munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
+
+       ring->mem   = NULL;
+       ring->sring = NULL;
+
+       return 0;
+}
+
+/*
+ * Bring a vbd fully up: open the image chain for @name, map the ring
+ * device @ring and register the ring event.
+ *
+ * Returns 0 on success.  If any step fails everything is torn down again,
+ * vbd->name is released and the error of the failing step is returned.
+ */
+int
+tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type,
+                uint16_t storage, const char *ring, td_flag_t flags)
+{
+       int err;
+
+       /* each step only runs if the previous one succeeded */
+       err = tapdisk_vbd_open_vdi(vbd, name, type, storage, flags);
+       if (!err)
+               err = tapdisk_vbd_map_device(vbd, ring);
+       if (!err)
+               err = tapdisk_vbd_register_event_watches(vbd);
+       if (!err)
+               return 0;
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_vbd_unmap_device(vbd);
+       tapdisk_vbd_unregister_events(vbd);
+       free(vbd->name);
+       vbd->name = NULL;
+       return err;
+}
+
+/*
+ * Count the entries on each of the four request lists of @vbd and store
+ * the totals through @new, @pending, @failed and @completed.
+ */
+static void
+tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new,
+                       int *pending, int *failed, int *completed)
+{
+       td_vbd_request_t *cur, *nxt;
+       int tally[4] = { 0, 0, 0, 0 };
+
+       tapdisk_vbd_for_each_request(cur, nxt, &vbd->new_requests)
+               tally[0]++;
+
+       tapdisk_vbd_for_each_request(cur, nxt, &vbd->pending_requests)
+               tally[1]++;
+
+       tapdisk_vbd_for_each_request(cur, nxt, &vbd->failed_requests)
+               tally[2]++;
+
+       tapdisk_vbd_for_each_request(cur, nxt, &vbd->completed_requests)
+               tally[3]++;
+
+       *new       = tally[0];
+       *pending   = tally[1];
+       *failed    = tally[2];
+       *completed = tally[3];
+}
+
+/*
+ * Tear down @vbd and free it.
+ *
+ * Refuses with -EAGAIN while requests are still pending.  Otherwise it
+ * pushes any unsent responses, logs the final queue statistics, closes
+ * the images, answers the close request over IPC, releases the ring and
+ * frees the vbd.  @vbd must not be used after a return value of 0.
+ */
+static int
+tapdisk_vbd_shutdown(td_vbd_t *vbd)
+{
+       int new, pending, failed, completed;
+
+       if (!list_empty(&vbd->pending_requests))
+               return -EAGAIN;
+
+       tapdisk_vbd_kick(vbd);
+       tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
+
+       DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+               "failed: 0x%02x, completed: 0x%02x\n", 
+               vbd->name, vbd->state, new, pending, failed, completed);
+       /* time_t/suseconds_t are not guaranteed to be long: cast for %ld */
+       DPRINTF("last activity: %010ld.%06ld, errors: 0x%04"PRIx64", "
+               "retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+               "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+               (long)vbd->ts.tv_sec, (long)vbd->ts.tv_usec,
+               vbd->errors, vbd->retries, vbd->received, vbd->returned,
+               vbd->kicked);
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_CLOSE_RSP);
+       tapdisk_vbd_unregister_events(vbd);
+       tapdisk_vbd_unmap_device(vbd);
+       tapdisk_server_remove_vbd(vbd);
+       free(vbd->name);
+       free(vbd);
+
+       tlog_print_errors();
+
+       return 0;
+}
+
+/*
+ * Close @vbd if it is idle.
+ *
+ * The vbd is shut down immediately unless requests are pending in the aio
+ * layer, or the queue is still active and has new, failed or completed
+ * requests left.  In those cases TD_VBD_SHUTDOWN_REQUESTED is set and
+ * -EAGAIN is returned.
+ */
+int
+tapdisk_vbd_close(td_vbd_t *vbd)
+{
+       int busy;
+
+       /* don't close if any requests are pending in the aio layer */
+       busy = !list_empty(&vbd->pending_requests);
+
+       /*
+        * if the queue is still active and we have more
+        * requests, try to complete them before closing.
+        */
+       if (!busy && tapdisk_vbd_queue_ready(vbd))
+               busy = (!list_empty(&vbd->new_requests) ||
+                       !list_empty(&vbd->failed_requests) ||
+                       !list_empty(&vbd->completed_requests));
+
+       if (!busy)
+               return tapdisk_vbd_shutdown(vbd);
+
+       td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
+       DBG(TLOG_WARN, "%s: requests pending\n", vbd->name);
+       return -EAGAIN;
+}
+
+/*
+ * control operations
+ */
+
+/*
+ * Log the state, queue depths and counters of @vbd at TLOG_WARN, then
+ * call td_debug() on every image in its chain.
+ */
+void
+tapdisk_vbd_debug(td_vbd_t *vbd)
+{
+       td_image_t *image, *tmp;
+       int new, pending, failed, completed;
+
+       tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
+
+       /* time_t/suseconds_t are not guaranteed to be long: cast for %ld */
+       DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
+           "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06ld, "
+           "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
+           "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
+           vbd->name, vbd->state, new, pending, failed, completed,
+           (long)vbd->ts.tv_sec, (long)vbd->ts.tv_usec,
+           vbd->errors, vbd->retries,
+           vbd->received, vbd->returned, vbd->kicked);
+
+       tapdisk_vbd_for_each_image(vbd, image, tmp)
+               td_debug(image);
+}
+
+/*
+ * Dump the vbd debug state and flush the log, once: TD_VBD_LOG_DROPPED
+ * records that it has been done.
+ */
+static void
+tapdisk_vbd_drop_log(td_vbd_t *vbd)
+{
+       if (!td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) {
+               tapdisk_vbd_debug(vbd);
+               tlog_flush();
+               td_flag_set(vbd->state, TD_VBD_LOG_DROPPED);
+       }
+}
+
+/*
+ * Fill @img with size, sector size and info of the first image in the
+ * chain.  @img is zeroed first; returns -EINVAL if the chain is empty.
+ */
+int
+tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img)
+{
+       td_image_t *head;
+
+       memset(img, 0, sizeof(*img));
+
+       if (list_empty(&vbd->images))
+               return -EINVAL;
+
+       head = tapdisk_vbd_first_image(vbd);
+
+       img->info    = head->info.info;
+       img->secsize = head->info.sector_size;
+       img->size    = head->info.size;
+
+       return 0;
+}
+
+/*
+ * Return non-zero when none of the DEAD, CLOSED, QUIESCED or
+ * QUIESCE_REQUESTED state flags is set on @vbd.
+ */
+int
+tapdisk_vbd_queue_ready(td_vbd_t *vbd)
+{
+       return !(td_flag_test(vbd->state, TD_VBD_DEAD) ||
+                td_flag_test(vbd->state, TD_VBD_CLOSED) ||
+                td_flag_test(vbd->state, TD_VBD_QUIESCED) ||
+                td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED));
+}
+
+/* Return the result of testing TD_VBD_RETRY_NEEDED in the vbd state. */
+int
+tapdisk_vbd_retry_needed(td_vbd_t *vbd)
+{
+       return td_flag_test(vbd->state, TD_VBD_RETRY_NEEDED);
+}
+
+/* Stub: performs no locking and always returns 0. */
+int
+tapdisk_vbd_lock(td_vbd_t *vbd)
+{
+       return 0;
+}
+
+/*
+ * Quiesce the request queue of @vbd.
+ *
+ * With requests still pending only TD_VBD_QUIESCE_REQUESTED is set and
+ * -EAGAIN is returned; otherwise the vbd is marked TD_VBD_QUIESCED and 0
+ * is returned.
+ */
+int
+tapdisk_vbd_quiesce_queue(td_vbd_t *vbd)
+{
+       if (list_empty(&vbd->pending_requests)) {
+               td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+               td_flag_set(vbd->state, TD_VBD_QUIESCED);
+               return 0;
+       }
+
+       td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+       return -EAGAIN;
+}
+
+/* Clear both quiesce flags on @vbd; always returns 0. */
+int
+tapdisk_vbd_start_queue(td_vbd_t *vbd)
+{
+       td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
+       td_flag_clear(vbd->state, TD_VBD_QUIESCED);
+       return 0;
+}
+
+/*
+ * Quiesce the queue (the result is ignored, so a pending quiesce request
+ * is acceptable) and mark the vbd TD_VBD_DEAD.  Always returns 0.
+ */
+int
+tapdisk_vbd_kill_queue(td_vbd_t *vbd)
+{
+       tapdisk_vbd_quiesce_queue(vbd);
+       td_flag_set(vbd->state, TD_VBD_DEAD);
+       return 0;
+}
+
+/*
+ * Open @image and, unless it is the last image of the chain, validate it
+ * against the image that follows it.  The image is closed again when the
+ * validation fails.  Returns 0 or the td_open()/validation error.
+ */
+static int
+tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image)
+{
+       int err;
+
+       err = td_open(image);
+       if (err)
+               return err;
+
+       if (tapdisk_vbd_is_last_image(vbd, image))
+               return 0;
+
+       err = td_validate_parent(image, tapdisk_vbd_next_image(image));
+       if (err)
+               td_close(image);
+
+       return err;
+}
+
+/*
+ * Close @image and open it again, retrying while the open reports -EIO.
+ * If the image cannot be reopened the vbd is flagged TD_VBD_CLOSED and
+ * the error is returned.
+ */
+static int
+tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t *image)
+{
+       int attempt, err;
+
+       td_close(image);
+
+       attempt = 0;
+       do {
+               err = tapdisk_vbd_open_image(vbd, image);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       } while (++attempt < TD_VBD_EIO_RETRIES);
+
+       if (err)
+               td_flag_set(vbd->state, TD_VBD_CLOSED);
+
+       return err;
+}
+
+/*
+ * Pause @vbd: request the pause, quiesce the queue and, once that
+ * succeeds, close the images, mark the vbd TD_VBD_PAUSED and send
+ * TAPDISK_MESSAGE_PAUSE_RSP.
+ *
+ * Returns 0 on success or the quiesce error, in which case
+ * TD_VBD_PAUSE_REQUESTED stays set.
+ */
+int
+tapdisk_vbd_pause(td_vbd_t *vbd)
+{
+       int rc;
+
+       td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+       rc = tapdisk_vbd_quiesce_queue(vbd);
+       if (rc == 0) {
+               tapdisk_vbd_close_vdi(vbd);
+
+               td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+               td_flag_set(vbd->state, TD_VBD_PAUSED);
+               tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_PAUSE_RSP);
+       }
+
+       return rc;
+}
+
+/*
+ * Resume a paused vbd on @path using driver @drivertype.
+ *
+ * The new name is installed, then up to TD_VBD_EIO_RETRIES attempts are
+ * made to check the file, reactivate the volumes and reopen the image
+ * chain with TD_OPEN_STRICT.  The outcome is reported over IPC
+ * (TAPDISK_MESSAGE_RESUME_RSP or TAPDISK_MESSAGE_ERROR).
+ *
+ * Returns 0 on success, -EINVAL if the vbd is not paused or the name
+ * cannot be copied, or the error of the last failed attempt.
+ */
+int
+tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype)
+{
+       int i, err;
+       char *name;
+
+       if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+               EPRINTF("resume request for unpaused vbd %s\n", vbd->name);
+               tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR);
+               return -EINVAL;
+       }
+
+       /*
+        * Copy before releasing the old name: @path may alias vbd->name,
+        * and a failed copy must not leave vbd->name dangling or NULL.
+        */
+       name = strdup(path);
+       if (!name) {
+               EPRINTF("copying new vbd %s name failed\n", path);
+               tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR);
+               return -EINVAL;
+       }
+       free(vbd->name);
+       vbd->name = name;
+       vbd->type = drivertype;
+
+       for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+               err = tapdisk_vbd_check_file(vbd);
+               if (err)
+                       goto retry;
+
+               err = tapdisk_vbd_reactivate_volumes(vbd, 1);
+               if (err) {
+                       EPRINTF("failed to reactivate %s: %d\n",
+                               vbd->name, err);
+                       goto retry;
+               }
+
+               err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+               if (!err)
+                       break;
+
+       retry:
+               sleep(TD_VBD_EIO_SLEEP);
+       }
+
+       if (err) {
+               tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_ERROR);
+               return err;
+       }
+
+       tapdisk_vbd_start_queue(vbd);
+       td_flag_clear(vbd->state, TD_VBD_PAUSED);
+       td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+       tapdisk_ipc_write(&vbd->ipc, TAPDISK_MESSAGE_RESUME_RSP);
+
+       return 0;
+}
+
+/*
+ * Push any responses that have been written to the ring privately but
+ * not yet published, then issue BLKTAP_IOCTL_KICK_FE on the ring fd.
+ *
+ * Returns the number of responses pushed; 0 if the ring is not set up
+ * or there was nothing to push.
+ */
+int
+tapdisk_vbd_kick(td_vbd_t *vbd)
+{
+       td_ring_t *ring = &vbd->ring;
+       int count;
+
+       if (!ring->sring)
+               return 0;
+
+       /* private producer index minus the published one */
+       count = (ring->fe_ring.rsp_prod_pvt - ring->fe_ring.sring->rsp_prod);
+       if (count == 0)
+               return 0;
+
+       vbd->kicked += count;
+       RING_PUSH_RESPONSES(&ring->fe_ring);
+       ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0);
+
+       DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: "
+           "0x%08"PRIx64"\n", count, vbd->received, vbd->returned, vbd->kicked);
+
+       return count;
+}
+
+/*
+ * Copy @rsp into the ring slot at the private response producer index
+ * and advance that index.  Nothing is published here.
+ */
+static inline void
+tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp)
+{
+       td_ring_t *ring = &vbd->ring;
+       blkif_response_t *slot;
+
+       slot = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt);
+       memcpy(slot, rsp, sizeof(*slot));
+       ring->fe_ring.rsp_prod_pvt += 1;
+}
+
+/*
+ * Response callback taking the vbd as its opaque argument: writes @rsp
+ * to that vbd's ring.
+ */
+static void
+tapdisk_vbd_callback(void *arg, blkif_response_t *rsp)
+{
+       tapdisk_vbd_write_response_to_ring((td_vbd_t *)arg, rsp);
+}
+
+/*
+ * Turn a finished request into a blkif response and pass it to the
+ * vbd's registered callback.
+ *
+ * The response is built in place on top of vreq->req, so the request
+ * is copied to a local first and the fields that are still needed are
+ * taken from that copy.
+ */
+static void
+tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+       blkif_request_t tmp;
+       blkif_response_t *rsp;
+
+       /* snapshot the request before its storage is reused below */
+       tmp = vreq->req;
+       rsp = (blkif_response_t *)&vreq->req;
+
+       rsp->id = tmp.id;
+       rsp->operation = tmp.operation;
+       rsp->status = vreq->status;
+
+       DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n",
+           (int)tmp.id, tmp.sector_number, vreq->status);
+
+       if (rsp->status != BLKIF_RSP_OKAY)
+               ERR(EIO, "returning BLKIF_RSP %d", rsp->status);
+
+       vbd->returned++;
+       vbd->callback(vbd->argument, rsp);
+}
+
+/*
+ * Drive the vbd's request lists and deferred state changes:
+ *
+ *  1. failed requests that have used up TD_VBD_MAX_RETRIES are passed
+ *     to tapdisk_vbd_complete_vbd_request();
+ *  2. new and failed requests are (re)issued;
+ *  3. every completed request is turned into a response, unlinked and
+ *     re-initialized;
+ *  4. pending quiesce, pause and shutdown requests are acted upon.
+ */
+void
+tapdisk_vbd_check_state(td_vbd_t *vbd)
+{
+       td_vbd_request_t *vreq, *tmp;
+
+       tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests)
+               if (vreq->num_retries >= TD_VBD_MAX_RETRIES)
+                       tapdisk_vbd_complete_vbd_request(vbd, vreq);
+
+       if (!list_empty(&vbd->new_requests) ||
+           !list_empty(&vbd->failed_requests))
+               tapdisk_vbd_issue_requests(vbd);
+
+       tapdisk_vbd_for_each_request(vreq, tmp, &vbd->completed_requests) {
+               tapdisk_vbd_make_response(vbd, vreq);
+               list_del(&vreq->next);
+               tapdisk_vbd_initialize_vreq(vreq);
+       }
+
+       if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
+               tapdisk_vbd_quiesce_queue(vbd);
+
+       if (td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED))
+               tapdisk_vbd_pause(vbd);
+
+       /* NOTE(review): vbd is presumably unusable after close -- confirm */
+       if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+               tapdisk_vbd_close(vbd);
+}
+
+/*
+ * Watchdog for pending requests.  Does nothing when no request is
+ * pending.  Otherwise compares the current time against vbd->ts: once
+ * TD_VBD_WATCHDOG_TIMEOUT seconds have elapsed a warning is logged and
+ * tapdisk_vbd_drop_log() is called; before that, the server's maximum
+ * timeout is set to the time remaining.
+ */
+void
+tapdisk_vbd_check_progress(td_vbd_t *vbd)
+{
+       struct timeval now;
+       int idle;
+
+       if (list_empty(&vbd->pending_requests))
+               return;
+
+       gettimeofday(&now, NULL);
+       idle = now.tv_sec - vbd->ts.tv_sec;
+
+       if (idle < TD_VBD_WATCHDOG_TIMEOUT) {
+               tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - idle);
+               return;
+       }
+
+       DBG(TLOG_WARN, "%s: watchdog timeout: pending requests "
+           "idle for %d seconds\n", vbd->name, idle);
+       tapdisk_vbd_drop_log(vbd);
+}
+
+/*
+ * request submission 
+ */
+
+/*
+ * Check that the vbd can accept a request and, on first use, reopen the
+ * first image with TD_OPEN_STRICT set.
+ *
+ * Returns -ENOSYS when the vbd has no images, -EAGAIN when the queue is
+ * not ready, the error from tapdisk_vbd_lock() when locking is enabled
+ * and fails, and 0 otherwise.
+ */
+static int
+tapdisk_vbd_check_queue(td_vbd_t *vbd)
+{
+       int err;
+       td_image_t *image;
+
+       if (list_empty(&vbd->images))
+               return -ENOSYS;
+
+       if (!tapdisk_vbd_queue_ready(vbd))
+               return -EAGAIN;
+
+       /* one-time reopen; retried on later calls until it succeeds */
+       if (!vbd->reopened) {
+               if (td_flag_test(vbd->state, TD_VBD_LOCKING)) {
+                       err = tapdisk_vbd_lock(vbd);
+                       if (err)
+                               return err;
+               }
+
+               image = tapdisk_vbd_first_image(vbd);
+               td_flag_set(image->flags, TD_OPEN_STRICT);
+
+               /* a failed reopen is only logged; 0 is still returned */
+               if (tapdisk_vbd_close_and_reopen_image(vbd, image))
+                       EPRINTF("reopening disks failed\n");
+               else {
+                       DPRINTF("reopening disks succeeded\n");
+                       vbd->reopened = 1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Retire @vreq once it is neither being submitted nor has sectors
+ * outstanding.  A request that ended in BLKIF_RSP_ERROR, still has
+ * retries left, and belongs to a vbd that is neither dead nor shutting
+ * down goes to the failed list; everything else goes to the completed
+ * list.  Requests still in flight are left untouched.
+ */
+void
+tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+       int retry;
+
+       if (vreq->submitting || vreq->secs_pending)
+               return;
+
+       retry = (vreq->status == BLKIF_RSP_ERROR &&
+                vreq->num_retries < TD_VBD_MAX_RETRIES &&
+                !td_flag_test(vbd->state, TD_VBD_DEAD) &&
+                !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED));
+
+       if (retry)
+               tapdisk_vbd_move_request(vreq, &vbd->failed_requests);
+       else
+               tapdisk_vbd_move_request(vreq, &vbd->completed_requests);
+}
+
+/*
+ * Account for the completion of one tapdisk request belonging to @vreq.
+ *
+ * @res is the result of the request; its sign is normalized so that any
+ * non-zero value is treated as a negative error.  The sector counts of
+ * both the vbd and the vbd request are reduced by treq.secs, and the
+ * vbd request is then offered to tapdisk_vbd_complete_vbd_request().
+ */
+static void
+__tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq,
+                                 td_request_t treq, int res)
+{
+       int err;
+
+       /* accept either sign from the caller, keep errors negative */
+       err = (res <= 0 ? res : -res);
+       vbd->secs_pending  -= treq.secs;
+       vreq->secs_pending -= treq.secs;
+
+       vreq->blocked = treq.blocked;
+
+       if (err) {
+               vreq->status = BLKIF_RSP_ERROR;
+               /* keep the first error recorded for this request */
+               vreq->error  = (vreq->error ? : err);
+               /* -EBUSY is neither counted nor logged */
+               if (err != -EBUSY) {
+                       vbd->errors++;
+                       ERR(err, "req %"PRIu64": %s 0x%04x secs to "
+                           "0x%08"PRIx64, vreq->req.id,
+                           (treq.op == TD_OP_WRITE ? "write" : "read"),
+                           treq.secs, treq.sec);
+               }
+       }
+
+       tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Pass @treq on from @image to the next image in the vbd's image list.
+ *
+ * If @image is the last one, the buffer is zero-filled and the request
+ * completed successfully.  Otherwise the request is queued on the next
+ * image; any part that lies beyond that image's size is zero-filled and
+ * completed immediately instead.
+ *
+ * vreq->submitting is raised for the duration of the call so that
+ * completions triggered from within do not retire the vbd request
+ * early; the final check is made at the end.
+ */
+static void
+__tapdisk_vbd_reissue_td_request(td_vbd_t *vbd,
+                                td_image_t *image, td_request_t treq)
+{
+       td_image_t *parent;
+       td_vbd_request_t *vreq;
+
+       vreq = (td_vbd_request_t *)treq.private;
+       gettimeofday(&vreq->last_try, NULL);
+
+       vreq->submitting++;
+
+       /* nothing below this image: the data reads back as zeros */
+       if (tapdisk_vbd_is_last_image(vbd, image)) {
+               memset(treq.buf, 0, treq.secs << SECTOR_SHIFT);
+               td_complete_request(treq, 0);
+               goto done;
+       }
+
+       parent     = tapdisk_vbd_next_image(image);
+       treq.image = parent;
+
+       /* return zeros for requests that extend beyond end of parent image */
+       if (treq.sec + treq.secs > parent->info.size) {
+               td_request_t clone  = treq;
+
+               if (parent->info.size > treq.sec) {
+                       /* split: treq keeps the head, clone the tail */
+                       int secs    = parent->info.size - treq.sec;
+                       clone.sec  += secs;
+                       clone.secs -= secs;
+                       clone.buf  += (secs << SECTOR_SHIFT);
+                       treq.secs   = secs;
+               } else
+                       /* entirely past the end: clone is the whole request */
+                       treq.secs   = 0;
+
+               memset(clone.buf, 0, clone.secs << SECTOR_SHIFT);
+               td_complete_request(clone, 0);
+
+               if (!treq.secs)
+                       goto done;
+       }
+
+       switch (treq.op) {
+       case TD_OP_WRITE:
+               td_queue_write(parent, treq);
+               break;
+
+       case TD_OP_READ:
+               td_queue_read(parent, treq);
+               break;
+       }
+
+done:
+       vreq->submitting--;
+       if (!vreq->secs_pending)
+               tapdisk_vbd_complete_vbd_request(vbd, vreq);
+}
+
+/*
+ * Forward @treq past its current image.  The vbd's activity timestamp
+ * is refreshed; if the queue is ready the request is reissued to the
+ * next image, otherwise it is completed with -EIO.
+ */
+void
+tapdisk_vbd_forward_request(td_request_t treq)
+{
+       td_image_t *image      = treq.image;
+       td_vbd_t *vbd          = (td_vbd_t *)image->private;
+       td_vbd_request_t *vreq = (td_vbd_request_t *)treq.private;
+
+       gettimeofday(&vbd->ts, NULL);
+
+       if (!tapdisk_vbd_queue_ready(vbd)) {
+               __tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EIO);
+               return;
+       }
+
+       __tapdisk_vbd_reissue_td_request(vbd, image, treq);
+}
+
+/*
+ * Completion callback installed on every tapdisk request issued by the
+ * vbd.  Refreshes the vbd's activity timestamp, logs the completion and
+ * hands the result to __tapdisk_vbd_complete_td_request().
+ */
+static void
+tapdisk_vbd_complete_td_request(td_request_t treq, int res)
+{
+       td_image_t *image      = treq.image;
+       td_vbd_t *vbd          = (td_vbd_t *)image->private;
+       td_vbd_request_t *vreq = (td_vbd_request_t *)treq.private;
+
+       gettimeofday(&vbd->ts, NULL);
+       DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" "
+           "secs 0x%04x buf %p op %d res %d\n", image->name,
+           (int)treq.id, treq.sidx, treq.sec, treq.secs,
+           treq.buf, (int)vreq->req.operation, res);
+
+       __tapdisk_vbd_complete_td_request(vbd, vreq, treq, res);
+}
+
+/*
+ * Submit one ring request to the first image of the vbd.
+ *
+ * The request is moved to the pending list, the queue and the request
+ * itself are validated, and one tapdisk request per segment is queued
+ * for reading or writing.  vreq->submitting is held at 1 while segments
+ * are being queued so that early completions cannot retire the request
+ * before all of its segments have been accounted for.
+ *
+ * Returns 0 when the request was queued.  If no sectors are left
+ * outstanding on exit, the request is retired here and the return value
+ * is the submission error or, failing that, vreq->error.
+ */
+static int
+tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
+{
+       char *page;
+       td_ring_t *ring;
+       td_image_t *image;
+       td_request_t treq;
+       uint64_t sector_nr;
+       blkif_request_t *req;
+       int i, err, id, nsects;
+
+       req       = &vreq->req;
+       id        = req->id;
+       ring      = &vbd->ring;
+       sector_nr = req->sector_number;
+       image     = tapdisk_vbd_first_image(vbd);
+
+       vreq->submitting = 1;
+       gettimeofday(&vbd->ts, NULL);
+       gettimeofday(&vreq->last_try, NULL);
+       tapdisk_vbd_move_request(vreq, &vbd->pending_requests);
+
+       err = tapdisk_vbd_check_queue(vbd);
+       if (err)
+               goto fail;
+
+       err = tapdisk_image_check_ring_request(image, req);
+       if (err)
+               goto fail;
+
+       for (i = 0; i < req->nr_segments; i++) {
+               /* sector range within this segment's page is inclusive */
+               nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1;
+               page   = (char *)MMAP_VADDR(ring->vstart, 
+                                          (unsigned long)req->id, i);
+               page  += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+               treq.id             = id;
+               treq.sidx           = i;
+               treq.blocked        = 0;
+               treq.buf            = page;
+               treq.sec            = sector_nr;
+               treq.secs           = nsects;
+               treq.image          = image;
+               treq.cb             = tapdisk_vbd_complete_td_request;
+               treq.cb_data        = NULL;
+               treq.private        = vreq;
+
+               DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x "
+                   "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs,
+                   treq.buf, (int)req->operation);
+
+               /* account before queueing: completion subtracts these */
+               vreq->secs_pending += nsects;
+               vbd->secs_pending  += nsects;
+
+               /*
+                * NOTE(review): any other operation queues nothing while
+                * the sectors above stay counted -- presumably ruled out
+                * by tapdisk_image_check_ring_request(); confirm.
+                */
+               switch (req->operation) {
+               case BLKIF_OP_WRITE:
+                       treq.op = TD_OP_WRITE;
+                       td_queue_write(image, treq);
+                       break;
+
+               case BLKIF_OP_READ:
+                       treq.op = TD_OP_READ;
+                       td_queue_read(image, treq);
+                       break;
+               }
+
+               sector_nr += nsects;
+       }
+
+       err = 0;
+
+out:
+       vreq->submitting--;
+       if (!vreq->secs_pending) {
+               err = (err ? : vreq->error);
+               tapdisk_vbd_complete_vbd_request(vbd, vreq);
+       }
+
+       return err;
+
+fail:
+       vreq->status = BLKIF_RSP_ERROR;
+       goto out;
+}
+
+/*
+ * Walk the failed list and retry whatever is due.
+ *
+ * For each failed request with no sectors outstanding:
+ *  - if shutdown has been requested, or the request has used up
+ *    TD_VBD_MAX_RETRIES, it is handed to
+ *    tapdisk_vbd_complete_vbd_request();
+ *  - unless its error was -EBUSY, it is skipped until
+ *    TD_VBD_RETRY_INTERVAL seconds have passed since the last try;
+ *  - otherwise its error state is reset and it is issued again.
+ *
+ * TD_VBD_RETRY_NEEDED is set afterwards iff the failed list is still
+ * non-empty.
+ *
+ * Returns 0, or the first error from tapdisk_vbd_issue_request(), which
+ * also stops the walk.
+ */
+static int
+tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd)
+{
+       int err;
+       struct timeval now;
+       td_vbd_request_t *vreq, *tmp;
+
+       err = 0;
+       gettimeofday(&now, NULL);
+
+       tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) {
+               if (vreq->secs_pending)
+                       continue;
+
+               if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
+                       goto fail;
+
+               /* -EBUSY failures are retried without waiting */
+               if (vreq->error != -EBUSY &&
+                   now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL)
+                       continue;
+
+               if (vreq->num_retries >= TD_VBD_MAX_RETRIES) {
+               fail:
+                       DBG(TLOG_INFO, "req %"PRIu64" retried %d times\n",
+                           vreq->req.id, vreq->num_retries);
+                       tapdisk_vbd_complete_vbd_request(vbd, vreq);
+                       continue;
+               }
+
+               /*
+                * never fail due to too many retries if we are blocked on a
+                * dependency
+                */
+               if (vreq->blocked) {
+                       vreq->blocked = 0;
+               } else {
+                       vbd->retries++;
+                       vreq->num_retries++;
+               }
+               vreq->error  = 0;
+               vreq->status = BLKIF_RSP_OKAY;
+               DBG(TLOG_DBG, "retry #%d of req %"PRIu64", "
+                   "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries,
+                   vreq->req.id, vreq->req.sector_number,
+                   vreq->req.nr_segments);
+
+               err = tapdisk_vbd_issue_request(vbd, vreq);
+               if (err)
+                       break;
+       }
+
+       if (list_empty(&vbd->failed_requests))
+               td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED);
+       else
+               td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED);
+
+       return err;
+}
+
+/*
+ * Issue every request on the new list, in order.  Stops at, and
+ * returns, the first error from tapdisk_vbd_issue_request(); returns 0
+ * if all requests were issued (or the list was empty).
+ */
+static int
+tapdisk_vbd_issue_new_requests(td_vbd_t *vbd)
+{
+       td_vbd_request_t *vreq, *next;
+       int err = 0;
+
+       tapdisk_vbd_for_each_request(vreq, next, &vbd->new_requests) {
+               err = tapdisk_vbd_issue_request(vbd, vreq);
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
+/*
+ * Fail everything that has not been submitted: all requests on the new
+ * list, then all on the failed list, get status BLKIF_RSP_ERROR and are
+ * moved to the completed list.  Always returns 0.
+ */
+static int
+tapdisk_vbd_kill_requests(td_vbd_t *vbd)
+{
+       struct list_head *queues[2];
+       td_vbd_request_t *vreq, *next;
+       int q;
+
+       queues[0] = &vbd->new_requests;
+       queues[1] = &vbd->failed_requests;
+
+       for (q = 0; q < 2; q++) {
+               tapdisk_vbd_for_each_request(vreq, next, queues[q]) {
+                       vreq->status = BLKIF_RSP_ERROR;
+                       tapdisk_vbd_move_request(vreq,
+                                                &vbd->completed_requests);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Issue outstanding requests: failed ones first, then new ones.
+ *
+ * A vbd flagged TD_VBD_DEAD has its unsubmitted requests killed
+ * instead.  Returns -EAGAIN when the queue is not ready, otherwise the
+ * first error encountered or 0.
+ */
+int
+tapdisk_vbd_issue_requests(td_vbd_t *vbd)
+{
+       int rc;
+
+       if (td_flag_test(vbd->state, TD_VBD_DEAD))
+               return tapdisk_vbd_kill_requests(vbd);
+
+       if (!tapdisk_vbd_queue_ready(vbd))
+               return -EAGAIN;
+
+       rc = tapdisk_vbd_reissue_failed_requests(vbd);
+       if (!rc)
+               rc = tapdisk_vbd_issue_new_requests(vbd);
+
+       return rc;
+}
+
+/*
+ * Consume every request currently published on the ring.  Each one is
+ * copied into the vbd's request_list slot selected by the request id
+ * and put on the new list.  Does nothing if the ring is not set up.
+ */
+static void
+tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd)
+{
+       int idx;
+       RING_IDX rp, rc;
+       td_ring_t *ring;
+       blkif_request_t *req;
+       td_vbd_request_t *vreq;
+
+       ring = &vbd->ring;
+       if (!ring->sring)
+               return;
+
+       /* read the producer index once, before reading any requests */
+       rp   = ring->fe_ring.sring->req_prod;
+       xen_rmb();
+
+       for (rc = ring->fe_ring.req_cons; rc != rp; rc++) {
+               req = RING_GET_REQUEST(&ring->fe_ring, rc);
+               ++ring->fe_ring.req_cons;
+
+               /*
+                * NOTE(review): idx is taken from the ring and used as an
+                * index without a range check -- presumably the producer
+                * guarantees id < MAX_REQUESTS; confirm.
+                */
+               idx  = req->id;
+               vreq = &vbd->request_list[idx];
+
+               /* the slot must be idle: unlinked, nothing outstanding */
+               ASSERT(list_empty(&vreq->next));
+               ASSERT(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, req, sizeof(blkif_request_t));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+
+               DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx);
+       }
+}
+
+/*
+ * Handle a pause message received through the ring.
+ *
+ * No-op (returns 0) when the vbd is already paused.  Otherwise the
+ * pause is marked as requested, the queue is quiesced, the VDI is
+ * closed and BLKTAP2_IOCTL_PAUSE is issued on the ring fd; only when
+ * the ioctl succeeds does the state change to TD_VBD_PAUSED.
+ *
+ * Returns 0 on success, the quiesce error, or the ioctl's return value.
+ */
+static int
+tapdisk_vbd_pause_ring(td_vbd_t *vbd)
+{
+       int rc;
+
+       if (td_flag_test(vbd->state, TD_VBD_PAUSED))
+               return 0;
+
+       td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
+
+       rc = tapdisk_vbd_quiesce_queue(vbd);
+       if (rc) {
+               EPRINTF("%s: ring pause request on active queue\n", vbd->name);
+               return rc;
+       }
+
+       tapdisk_vbd_close_vdi(vbd);
+
+       rc = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 0);
+       if (!rc) {
+               td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
+               td_flag_set(vbd->state, TD_VBD_PAUSED);
+               return rc;
+       }
+
+       EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno);
+       return rc;
+}
+
+/*
+ * Handle a resume message received through the ring.
+ *
+ * The vbd must be TD_VBD_PAUSED (-EINVAL otherwise).  The resume string
+ * is fetched with BLKTAP2_IOCTL_REOPEN and parsed into a path and a
+ * disk type, which become the vbd's new name and type.  The queue is
+ * restarted, volumes are reactivated and the VDI is opened strictly,
+ * retrying the open up to TD_VBD_EIO_RETRIES times while it reports
+ * -EIO.
+ *
+ * On success the image parameters are passed back with
+ * BLKTAP2_IOCTL_SET_PARAMS and TD_VBD_PAUSED is cleared.  Once the
+ * REOPEN ioctl has succeeded, BLKTAP2_IOCTL_RESUME is always issued
+ * with the final result.
+ *
+ * Returns 0 on success or a non-zero error.
+ */
+static int
+tapdisk_vbd_resume_ring(td_vbd_t *vbd)
+{
+       int i, err, type;
+       char *path, *name, message[BLKTAP2_MAX_MESSAGE_LEN];
+
+       memset(message, 0, sizeof(message));
+
+       if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
+               EPRINTF("%s: resume message for unpaused vbd\n", vbd->name);
+               return -EINVAL;
+       }
+
+       err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message);
+       if (err) {
+               EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno);
+               return err;
+       }
+
+       err = tapdisk_parse_disk_type(message, &path, &type);
+       if (err) {
+               EPRINTF("%s: invalid resume string %s\n", vbd->name, message);
+               goto out;
+       }
+
+       /*
+        * Duplicate before releasing the old name so that vbd->name is
+        * never left NULL when the allocation fails.
+        */
+       name = strdup(path);
+       if (!name) {
+               EPRINTF("resume malloc failed\n");
+               err = -ENOMEM;
+               goto out;
+       }
+
+       free(vbd->name);
+       vbd->name = name;
+       vbd->type = type;
+
+       tapdisk_vbd_start_queue(vbd);
+
+       err = tapdisk_vbd_reactivate_volumes(vbd, 1);
+       if (err) {
+               EPRINTF("failed to reactivate %s, %d\n", vbd->name, err);
+               goto out;
+       }
+
+       for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+               err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
+               if (err != -EIO)
+                       break;
+
+               sleep(TD_VBD_EIO_SLEEP);
+       }
+
+out:
+       if (!err) {
+               image_t image;
+               struct blktap2_params params;
+
+               memset(&params, 0, sizeof(params));
+               tapdisk_vbd_get_image_info(vbd, &image);
+
+               params.sector_size = image.secsize;
+               params.capacity    = image.size;
+               snprintf(params.name, sizeof(params.name) - 1, "%s", message);
+
+               ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, &params);
+               td_flag_clear(vbd->state, TD_VBD_PAUSED);
+       }
+
+       /* report the outcome, success or failure */
+       ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err);
+       return err;
+}
+
+/*
+ * Dispatch on the message byte found in pad[0] of the shared ring:
+ * 0 means no message; pause, resume and close are routed to their
+ * handlers.  Returns the handler's result, or -EINVAL when the ring is
+ * not set up or the message is not recognized.
+ */
+static int
+tapdisk_vbd_check_ring_message(td_vbd_t *vbd)
+{
+       int err;
+
+       if (!vbd->ring.sring)
+               return -EINVAL;
+
+       switch (vbd->ring.sring->pad[0]) {
+       case 0:
+               err = 0;
+               break;
+
+       case BLKTAP2_RING_MESSAGE_PAUSE:
+               err = tapdisk_vbd_pause_ring(vbd);
+               break;
+
+       case BLKTAP2_RING_MESSAGE_RESUME:
+               err = tapdisk_vbd_resume_ring(vbd);
+               break;
+
+       case BLKTAP2_RING_MESSAGE_CLOSE:
+               err = tapdisk_vbd_close(vbd);
+               break;
+
+       default:
+               err = -EINVAL;
+               break;
+       }
+
+       return err;
+}
+
+/*
+ * Event handler for the ring; @private is the vbd.  Pulls the requests
+ * published on the ring, issues them, and finally processes any ring
+ * message.  @id and @mode are not used.
+ */
+static void
+tapdisk_vbd_ring_event(event_id_t id, char mode, void *private)
+{
+       td_vbd_t *vbd = (td_vbd_t *)private;
+
+       tapdisk_vbd_pull_ring_requests(vbd);
+       tapdisk_vbd_issue_requests(vbd);
+
+       /* must stay last: the vbd may be destroyed by this call */
+       tapdisk_vbd_check_ring_message(vbd);
+}
+
+/*
+ * Return the image at the head of the vbd's image list.  No emptiness
+ * check is made here.
+ */
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd)
+{
+       struct list_head *head = vbd->images.next;
+
+       return list_entry(head, td_image_t, next);
+}
diff --git a/tools/blktap2/drivers/tapdisk-vbd.h b/tools/blktap2/drivers/tapdisk-vbd.h
new file mode 100644 (file)
index 0000000..ecb22a0
--- /dev/null
@@ -0,0 +1,193 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_VBD_H_
+#define _TAPDISK_VBD_H_
+
+#include <sys/time.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#include "tapdisk.h"
+#include "scheduler.h"
+#include "tapdisk-ipc.h"
+#include "tapdisk-image.h"
+
+/* retry policy for failed requests: attempt limit, seconds between tries */
+#define TD_VBD_MAX_RETRIES          100
+#define TD_VBD_RETRY_INTERVAL       1
+
+/* bit flags kept in td_vbd_handle.state */
+#define TD_VBD_DEAD                 0x0001
+#define TD_VBD_CLOSED               0x0002
+#define TD_VBD_QUIESCE_REQUESTED    0x0004
+#define TD_VBD_QUIESCED             0x0008
+#define TD_VBD_PAUSE_REQUESTED      0x0010
+#define TD_VBD_PAUSED               0x0020
+#define TD_VBD_SHUTDOWN_REQUESTED   0x0040
+#define TD_VBD_LOCKING              0x0080
+#define TD_VBD_RETRY_NEEDED         0x0100
+#define TD_VBD_LOG_DROPPED          0x0200
+
+typedef struct td_ring              td_ring_t;
+typedef struct td_vbd_request       td_vbd_request_t;
+typedef struct td_vbd_handle        td_vbd_t;
+/* response callback: receives the registered argument and the response */
+typedef void (*td_vbd_cb_t)        (void *, blkif_response_t *);
+
+/* Per-vbd block ring state. */
+struct td_ring {
+       /* descriptor the BLKTAP/BLKTAP2 ioctls are issued on */
+       int                         fd;
+       char                       *mem;
+       /* shared ring; NULL while the ring is not set up */
+       blkif_sring_t              *sring;
+       /* ring bookkeeping: request consumer / response producer indices */
+       blkif_back_ring_t           fe_ring;
+       /* base address passed to MMAP_VADDR() to locate segment pages */
+       unsigned long               vstart;
+};
+
+/* One ring request as tracked by the vbd. */
+struct td_vbd_request {
+       /* copy of the ring request; later overwritten by the response */
+       blkif_request_t             req;
+       /* BLKIF_RSP_* status reported back */
+       int16_t                     status;
+
+       /* first error recorded for this request (negative errno) */
+       int                         error;
+       int                         blocked; /* blocked on a dependency */
+       /* non-zero while the request is in the middle of being queued */
+       int                         submitting;
+       /* sectors queued but not yet completed */
+       int                         secs_pending;
+       int                         num_retries;
+       /* time of the most recent (re)submission */
+       struct timeval              last_try;
+
+       td_vbd_t                   *vbd;
+       /* linkage on one of the vbd's request lists */
+       struct list_head            next;
+};
+
+/* A virtual block device: image chain, request queues and ring. */
+struct td_vbd_handle {
+       /* heap-allocated; replaced with a copy of the new path on resume */
+       char                       *name;
+
+       td_uuid_t                   uuid;
+       /* driver type */
+       int                         type;
+
+       int                         storage;
+
+       /* set to 1 after the first image has been reopened successfully */
+       uint8_t                     reopened;
+       uint8_t                     reactivated;
+       td_flag_t                   flags;
+       /* TD_VBD_* state bits */
+       td_flag_t                   state;
+
+       td_ipc_t                    ipc;
+
+       /* list of td_image_t, linked through their 'next' member */
+       struct list_head            images;
+
+       /* a request sits on at most one of these lists at a time */
+       struct list_head            new_requests;
+       struct list_head            pending_requests;
+       struct list_head            failed_requests;
+       struct list_head            completed_requests;
+
+       /* request slots, indexed by the id of the ring request */
+       td_vbd_request_t            request_list[MAX_REQUESTS];
+
+       td_ring_t                   ring;
+       event_id_t                  ring_event_id;
+
+       /* invoked as callback(argument, response) for each response */
+       td_vbd_cb_t                 callback;
+       void                       *argument;
+
+       struct list_head            next;
+
+       /* time of the last request activity; read by the watchdog */
+       struct timeval              ts;
+
+       /* counters */
+       uint64_t                    received;
+       uint64_t                    returned;
+       uint64_t                    kicked;
+       uint64_t                    secs_pending;
+       uint64_t                    retries;
+       uint64_t                    errors;
+};
+
+/*
+ * Iterate over the requests on @list.  Built on the _safe list
+ * iterator, with @tmp as its scratch cursor.
+ */
+#define tapdisk_vbd_for_each_request(vreq, tmp, list)                  \
+       list_for_each_entry_safe((vreq), (tmp), (list), next)
+
+/* Iterate over the images of @vbd, with @tmp as the scratch cursor. */
+#define tapdisk_vbd_for_each_image(vbd, image, tmp)                    \
+       list_for_each_entry_safe((image), (tmp), &(vbd)->images, next)
+
+/* Unlink @vreq from the list it is on and append it to @dest. */
+static inline void
+tapdisk_vbd_move_request(td_vbd_request_t *vreq, struct list_head *dest)
+{
+       struct list_head *entry = &vreq->next;
+
+       list_del(entry);
+       INIT_LIST_HEAD(entry);
+       list_add_tail(entry, dest);
+}
+
+/* Append @image to the end of the vbd's image list. */
+static inline void
+tapdisk_vbd_add_image(td_vbd_t *vbd, td_image_t *image)
+{
+       struct list_head *chain = &vbd->images;
+
+       list_add_tail(&image->next, chain);
+}
+
+/* Non-zero iff @image is the final entry of the vbd's image list. */
+static inline int
+tapdisk_vbd_is_last_image(td_vbd_t *vbd, td_image_t *image)
+{
+       int last;
+
+       last = list_is_last(&image->next, &vbd->images);
+       return last;
+}
+
+td_image_t *
+tapdisk_vbd_first_image(td_vbd_t *vbd);
+
+/*
+ * Return the image at the tail of the vbd's image list.  No emptiness
+ * check is made here.
+ */
+static inline td_image_t *
+tapdisk_vbd_last_image(td_vbd_t *vbd)
+{
+       struct list_head *tail = vbd->images.prev;
+
+       return list_entry(tail, td_image_t, next);
+}
+
+/*
+ * Return the list entry following @image.  The caller is expected to
+ * have checked that @image is not the last one.
+ */
+static inline td_image_t *
+tapdisk_vbd_next_image(td_image_t *image)
+{
+       struct list_head *succ = image->next.next;
+
+       return list_entry(succ, td_image_t, next);
+}
+
+int tapdisk_vbd_initialize(int, int, td_uuid_t);
+void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *);
+int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t,
+                    uint16_t, const char *, td_flag_t);
+int tapdisk_vbd_close(td_vbd_t *);
+
+int tapdisk_vbd_open_vdi(td_vbd_t *, const char *,
+                        uint16_t, uint16_t, td_flag_t);
+void tapdisk_vbd_close_vdi(td_vbd_t *);
+
+void tapdisk_vbd_forward_request(td_request_t);
+
+int tapdisk_vbd_get_image_info(td_vbd_t *, image_t *);
+int tapdisk_vbd_queue_ready(td_vbd_t *);
+int tapdisk_vbd_retry_needed(td_vbd_t *);
+int tapdisk_vbd_quiesce_queue(td_vbd_t *);
+int tapdisk_vbd_start_queue(td_vbd_t *);
+int tapdisk_vbd_issue_requests(td_vbd_t *);
+int tapdisk_vbd_kill_queue(td_vbd_t *);
+int tapdisk_vbd_pause(td_vbd_t *);
+int tapdisk_vbd_resume(td_vbd_t *, const char *, uint16_t);
+int tapdisk_vbd_kick(td_vbd_t *);
+void tapdisk_vbd_check_state(td_vbd_t *);
+void tapdisk_vbd_check_progress(td_vbd_t *);
+void tapdisk_vbd_debug(td_vbd_t *);
+
+void tapdisk_vbd_complete_vbd_request(td_vbd_t *, td_vbd_request_t *);
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk.c b/tools/blktap2/drivers/tapdisk.c
new file mode 100644 (file)
index 0000000..db1366a
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+
+/* Print the version banner and usage line to stderr, then exit(EINVAL). */
+static void
+usage(void)
+{
+       fputs("blktap-utils: v2.0.0\n"
+             "usage: tapdisk <READ fifo> <WRITE fifo>\n", stderr);
+       exit(EINVAL);
+}
+
+/*
+ * tapdisk entry point.
+ *
+ * Expects exactly two arguments, the read fifo and the write fifo.
+ * Detaches from the terminal, starts logging, initializes the tapdisk
+ * server on the two fifos and runs it.
+ *
+ * Returns the server's result, the initialization error, or errno if
+ * daemonizing failed.
+ */
+int
+main(int argc, char *argv[])
+{
+       int err;
+
+       if (argc != 3)
+               usage();
+
+       /* do not carry on in the foreground if detaching failed */
+       if (daemon(0, 0)) {
+               err = errno;
+               fprintf(stderr, "failed to daemonize: %d\n", err);
+               return err;
+       }
+
+       tapdisk_start_logging("TAPDISK");
+
+       err = tapdisk_server_initialize(argv[1], argv[2]);
+       if (err) {
+               EPRINTF("failed to initialize tapdisk server: %d\n", err);
+               goto out;
+       }
+
+       err = tapdisk_server_run();
+
+out:
+       tapdisk_stop_logging();
+       return err;
+}
diff --git a/tools/blktap2/drivers/tapdisk.h b/tools/blktap2/drivers/tapdisk.h
new file mode 100644 (file)
index 0000000..487c50f
--- /dev/null
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 
+ * Some notes on the tap_disk interface:
+ * 
+ * tap_disk aims to provide a generic interface to easily implement new 
+ * types of image accessors.  The structure-of-function-calls is similar
+ * to disk interfaces used in qemu/denali/etc, with the significant 
+ * difference being the expectation of asynchronous rather than synchronous 
+ * I/O.  The asynchronous interface is intended to allow lots of requests to
+ * be pipelined through a disk, without the disk requiring any of its own
+ * threads of control.  As such, a batch of requests is delivered to the disk
+ * using:
+ * 
+ *    td_queue_[read,write]()
+ * 
+ * and passing in a completion callback, which the disk is responsible for 
+ * tracking.  Disks should transform these requests as necessary and return
+ * the resulting iocbs to tapdisk using td_prep_[read,write]() and 
+ * td_queue_tiocb().
+ *
+ * NOTE: tapdisk uses the number of sectors submitted per request as a 
+ * ref count.  Plugins must use the callback function to communicate the
+ * completion -- or error -- of every sector submitted to them.
+ *
+ * td_get_parent_id returns:
+ *     0 if parent id successfully retrieved
+ *     TD_NO_PARENT if no parent exists
+ *     -errno on error
+ */
+
+#ifndef _TAPDISK_H_
+#define _TAPDISK_H_
+
+#include <time.h>
+#include <stdint.h>
+
+#include "list.h"
+#include "blktaplib.h"
+#include "disktypes.h"
+#include "tapdisk-log.h"
+#include "tapdisk-utils.h"
+
+/* Request geometry; note that 1 << SECTOR_SHIFT == DEFAULT_SECTOR_SIZE. */
+#define MAX_SEGMENTS_PER_REQ         11
+#define SECTOR_SHIFT                 9
+#define DEFAULT_SECTOR_SIZE          512
+
+/* MAX_REQUESTS is not defined in this file; it must come from an include. */
+#define TAPDISK_DATA_REQUESTS       (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+
+//#define BLK_NOT_ALLOCATED            (-99)
+/* Returned by td_get_parent_id when no parent exists. */
+#define TD_NO_PARENT                 1
+
+/*
+ * NOTE(review): presumably counted in 512-byte sectors
+ * (1024000 * 512 bytes = 500MB) -- confirm against the ramdisk driver.
+ */
+#define MAX_RAMDISK_SIZE             1024000 /*500MB disk limit*/
+
+/* Operation codes; presumably the values carried in td_request.op. */
+#define TD_OP_READ                   0
+#define TD_OP_WRITE                  1
+
+/* Open flags: each value is a distinct single bit, so they can be OR-ed. */
+#define TD_OPEN_QUIET                0x00001
+#define TD_OPEN_QUERY                0x00002
+#define TD_OPEN_RDONLY               0x00004
+#define TD_OPEN_STRICT               0x00008
+#define TD_OPEN_SHAREABLE            0x00010
+#define TD_OPEN_ADD_CACHE            0x00020
+#define TD_OPEN_VHD_INDEX            0x00040
+#define TD_OPEN_LOG_DIRTY            0x00080
+
+/* Create flags: distinct single bits as well. */
+#define TD_CREATE_SPARSE             0x00001
+#define TD_CREATE_MULTITYPE          0x00002
+
+/*
+ * Bit helpers.  td_flag_set/td_flag_clear modify 'word' in place;
+ * td_flag_test yields the masked bits (non-zero if any bit of 'flag' is set).
+ */
+#define td_flag_set(word, flag)      ((word) |= (flag))
+#define td_flag_clear(word, flag)    ((word) &= ~(flag))
+#define td_flag_test(word, flag)     ((word) & (flag))
+
+/*
+ * Short aliases: fixed-width scalars first, then the structures.
+ * td_driver_handle and td_image_handle are only forward-declared here;
+ * their definitions are not part of this header.
+ */
+typedef uint16_t                     td_uuid_t;
+typedef uint32_t                     td_flag_t;
+typedef uint64_t                     td_sector_t;
+typedef struct td_disk_id            td_disk_id_t;
+typedef struct td_disk_info          td_disk_info_t;
+typedef struct td_request            td_request_t;
+typedef struct td_driver_handle      td_driver_t;
+typedef struct td_image_handle       td_image_t;
+
+/* 
+ * Prototype of the callback to activate as requests complete.
+ * The request is passed by value; the int is presumably the completion
+ * status (0 or an error code) -- confirm against the callers.
+ */
+typedef void (*td_callback_t)(td_request_t, int);
+
+/*
+ * Identifies a disk image by name and driver type.  This is the type
+ * td_get_parent_id fills in (see struct tap_disk).
+ */
+struct td_disk_id {
+       char                        *name;
+       int                          drivertype;
+};
+
+/*
+ * Basic description of a disk.
+ * NOTE(review): 'size' is a td_sector_t, so presumably a sector count
+ * rather than bytes; the meaning of the 'info' bits is not visible here.
+ */
+struct td_disk_info {
+       td_sector_t                  size;
+        long                         sector_size;
+       uint32_t                     info;
+};
+
+/*
+ * A single I/O request.  It is handed to drivers and to completion
+ * callbacks by value (see td_callback_t and struct tap_disk).
+ */
+struct td_request {
+       /* presumably TD_OP_READ or TD_OP_WRITE -- confirm */
+       int                          op;
+       /* data buffer for the transfer */
+       char                        *buf;
+       /* presumably the first sector of the request */
+       td_sector_t                  sec;
+       /* presumably the number of sectors covered by the request */
+       int                          secs;
+
+       uint8_t                      blocked; /* blocked on a dependency */
+
+       /* image this request is associated with */
+       td_image_t                  *image;
+
+       /* completion callback and the opaque pointer kept alongside it */
+       td_callback_t                cb;
+       void                        *cb_data;
+
+       /* bookkeeping fields; their exact meaning is not visible in this file */
+       uint64_t                     id;
+       int                          sidx;
+       void                        *private;
+};
+
+/* 
+ * Structure describing the interface to a virtual disk implementation.
+ * See note at the top of this file describing this interface.
+ */
+struct tap_disk {
+       /* name of the disk type this driver implements */
+       const char                  *disk_type;
+       td_flag_t                    flags;
+       /* presumably the size of the per-driver private state -- confirm */
+       int                          private_data_size;
+       /* open/close the image named by the string argument */
+       int (*td_open)               (td_driver_t *, const char *, td_flag_t);
+       int (*td_close)              (td_driver_t *);
+       /* 0 on success, TD_NO_PARENT if no parent exists, -errno on error */
+       int (*td_get_parent_id)      (td_driver_t *, td_disk_id_t *);
+       int (*td_validate_parent)    (td_driver_t *, td_driver_t *, td_flag_t);
+       /* asynchronous I/O entry points; the request is passed by value */
+       void (*td_queue_read)        (td_driver_t *, td_request_t);
+       void (*td_queue_write)       (td_driver_t *, td_request_t);
+       void (*td_debug)             (td_driver_t *);
+};
+
+#endif
diff --git a/tools/blktap2/drivers/tapdisk2.c b/tools/blktap2/drivers/tapdisk2.c
new file mode 100644 (file)
index 0000000..45b27ec
--- /dev/null
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+
+#include "tapdisk.h"
+#include "blktap2.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-server.h"
+
+/* Identifier of the single VBD this process creates and looks up. */
+#define TAPDISK2_VBD 0
+
+/*
+ * Write "<_err>: <formatted message>" to child_out and flush it.
+ * Does nothing while child_out is NULL.
+ */
+#define cprintf(_err, _f, _a...)                                       \
+       do {                                                            \
+               if (child_out) {                                        \
+                       fprintf(child_out, "%d: " _f, _err, ##_a);      \
+                       fflush(child_out);                              \
+               }                                                       \
+       } while (0)
+
+/* Log the message with EPRINTF and also forward it through cprintf. */
+#define CHILD_ERR(_err, _f, _a...)                                     \
+       do {                                                            \
+               EPRINTF(_f, ##_a);                                      \
+               cprintf(_err, _f, ##_a);                                \
+       } while (0)
+
+/* Pipe created in main(): the child writes to [1], the parent reads [0]. */
+static int channel[2];
+/* Stream over the child's stdout; set up by tapdisk2_set_child_fds(). */
+static FILE *child_out;
+/* Filled in by the BLKTAP2_IOCTL_ALLOC_TAP ioctl (ring, device, minor). */
+static struct blktap2_handle handle;
+
+/*
+ * Make sure BLKTAP2_DIRECTORY exists, creating every missing path
+ * component (mode 0755) along the way.
+ *
+ * Returns 0 if the directory is already accessible or was created,
+ * -ENOMEM if the path could not be duplicated, or -errno of the mkdir
+ * that failed.
+ */
+static int
+tapdisk2_prepare_directory(void)
+{
+       int err;
+       char *ptr, *name, *start;
+
+       err = access(BLKTAP2_DIRECTORY, W_OK | R_OK);
+       if (!err)
+               return 0;
+
+       name = strdup(BLKTAP2_DIRECTORY);
+       if (!name)
+               return -ENOMEM;
+
+       start = name;
+
+       for (;;) {
+               /* temporarily cut the path after the current component */
+               ptr = strchr(start + 1, '/');
+               if (ptr)
+                       *ptr = '\0';
+
+               if (mkdir(name, 0755) == 0 || errno == EEXIST) {
+                       /*
+                        * An already existing component is not an error;
+                        * reset err so that EEXIST on the final component
+                        * does not leak mkdir's -1 to the caller.
+                        */
+                       err = 0;
+               } else {
+                       err = -errno;
+                       CHILD_ERR(err, "failed to create directory %s: %d\n",
+                                 name, err);
+                       break;
+               }
+
+               if (!ptr)
+                       break;
+
+               /* restore the separator and move on to the next component */
+               *ptr = '/';
+               start = ptr + 1;
+       }
+
+       free(name);
+       return err;
+}
+
+/*
+ * (Re)create the device node 'devname' with the given major/minor
+ * numbers.  'perm' is passed to mknod() unchanged, so it carries both
+ * the node type (S_IFCHR/S_IFBLK) and the permission bits.
+ *
+ * An existing node of the same name is unlinked first.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+tapdisk2_make_device(char *devname, int major, int minor, int perm)
+{
+       int err;
+
+       err = tapdisk2_prepare_directory();
+       if (err)
+               return err;
+
+       /*
+        * errno is captured right after each failing call: CHILD_ERR logs
+        * and writes to a stream, either of which may overwrite it.
+        */
+       if (!access(devname, F_OK))
+               if (unlink(devname)) {
+                       err = errno;
+                       CHILD_ERR(err, "error unlinking %s: %d\n",
+                                 devname, err);
+                       return -err;
+               }
+
+       if (mknod(devname, perm, makedev(major, minor))) {
+               err = errno;
+               CHILD_ERR(err, "mknod %s failed: %d\n", devname, -err);
+               return -err;
+       }
+
+       DPRINTF("Created %s device\n", devname);
+       return 0;
+}
+
+/*
+ * Make sure the blktap2 control device node is usable.
+ *
+ * If BLKTAP2_CONTROL_DEVICE is not accessible for reading and writing,
+ * look up the minor number registered as BLKTAP2_CONTROL_NAME in
+ * /proc/misc and create the node with it.
+ *
+ * Returns 0 on success, -ENOSYS if the name is not listed in
+ * /proc/misc, or another negative errno value.
+ */
+static int
+tapdisk2_check_environment(void)
+{
+       FILE *f;
+       int err, minor;
+       char name[256];
+
+       if (!access(BLKTAP2_CONTROL_DEVICE, R_OK | W_OK))
+               return 0;
+
+       memset(name, 0, sizeof(name));
+
+       f = fopen("/proc/misc", "r");
+       if (!f) {
+               /* save errno before logging can overwrite it */
+               err = errno;
+               CHILD_ERR(err, "failed to open /proc/misc: %d\n", err);
+               return -err;
+       }
+
+       /*
+        * The field width excludes the terminating NUL, so it has to be
+        * one less than sizeof(name) to stay inside the buffer.
+        */
+       while (fscanf(f, "%d %255s", &minor, name) == 2)
+               if (!strcmp(name, BLKTAP2_CONTROL_NAME)) {
+                       err = tapdisk2_make_device(BLKTAP2_CONTROL_DEVICE,
+                                                  MISC_MAJOR_NUMBER,
+                                                  minor, S_IFCHR | 0600);
+                       goto out;
+               }
+
+       err = -ENOSYS;
+       CHILD_ERR(err, "didn't find %s in /proc/misc\n", BLKTAP2_CONTROL_NAME);
+
+out:
+       fclose(f);
+       return err;
+}
+
+/*
+ * Ask the control device to release the tap identified by handle.minor.
+ *
+ * Best effort: failures are reported through CHILD_ERR but not
+ * propagated, since this runs on cleanup paths.
+ */
+static void
+tapdisk2_free_device(void)
+{
+       int fd, err;
+
+       fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+       if (fd == -1) {
+               err = errno;
+               CHILD_ERR(err, "failed to open control device: %d\n", err);
+               return;
+       }
+
+       /* previously the ioctl result was stored and then silently dropped */
+       if (ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, handle.minor) == -1) {
+               err = errno;
+               CHILD_ERR(err, "failed to free device %d: %d\n",
+                         handle.minor, err);
+       }
+
+       close(fd);
+}
+
+/*
+ * Allocate a new tap through the control device and create its ring
+ * (character) and IO (block) device nodes.
+ *
+ * On success the global 'handle' describes the new tap and 0 is
+ * returned.  If a node cannot be created, the tap is freed again and a
+ * negative errno value is returned.
+ */
+static int
+tapdisk2_prepare_device(void)
+{
+       char *name;
+       int fd, err;
+
+       fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY);
+       if (fd == -1) {
+               err = errno;
+               CHILD_ERR(err, "failed to open control device: %d\n", err);
+               return -err;
+       }
+
+       /* capture errno before close() or the logging can overwrite it */
+       err = (ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &handle) == -1) ? errno : 0;
+       close(fd);
+       if (err) {
+               CHILD_ERR(err, "failed to allocate new device: %d\n", err);
+               return -err;
+       }
+
+       err = asprintf(&name, "%s%d", BLKTAP2_RING_DEVICE, handle.minor);
+       if (err == -1) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = tapdisk2_make_device(name, handle.ring,
+                                  handle.minor, S_IFCHR | 0600);
+       free(name);
+       if (err) {
+               CHILD_ERR(err, "creating ring device for %d failed: %d\n",
+                         handle.minor, err);
+               goto fail;
+       }
+
+       err = asprintf(&name, "%s%d", BLKTAP2_IO_DEVICE, handle.minor);
+       if (err == -1) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       err = tapdisk2_make_device(name, handle.device,
+                                  handle.minor, S_IFBLK | 0600);
+       free(name);
+       if (err) {
+               CHILD_ERR(err, "creating IO device for %d failed: %d\n",
+                         handle.minor, err);
+               goto fail;
+       }
+
+       DPRINTF("new interface: ring: %u, device: %u, minor: %u\n",
+               handle.ring, handle.device, handle.minor);
+
+       return 0;
+
+fail:
+       /* the tap was allocated above; give it back before bailing out */
+       tapdisk2_free_device();
+       return err;
+}
+
+/*
+ * Open the image at 'path' (driver type 'type') on the TAPDISK2_VBD vbd
+ * and ask the ring device to create the block device for it.
+ *
+ * 'name' is copied into the device parameters passed to the
+ * BLKTAP2_IOCTL_CREATE_DEVICE ioctl.
+ *
+ * Returns 0 on success or an error code; failures are reported through
+ * CHILD_ERR, except for a failing tapdisk_vbd_initialize().
+ */
+static int
+tapdisk2_open_device(int type, const char *path, const char *name)
+{
+       int err;
+       td_vbd_t *vbd;
+       image_t image;
+       char *devname;
+       struct blktap2_params params;
+
+       err = tapdisk_vbd_initialize(-1, -1, TAPDISK2_VBD);
+       if (err)
+               return err;
+
+       vbd = tapdisk_server_get_vbd(TAPDISK2_VBD);
+       if (!vbd) {
+               err = -ENODEV;
+               CHILD_ERR(err, "couldn't find vbd\n");
+               return err;
+       }
+
+       /* path of the ring device node belonging to the allocated minor */
+       err = asprintf(&devname, "%s%d", BLKTAP2_RING_DEVICE, handle.minor);
+       if (err == -1) {
+               err = -ENOMEM;
+               CHILD_ERR(err, "couldn't allocate ring\n");
+               return err;
+       }
+
+       err = tapdisk_vbd_open(vbd, path, type,
+                              TAPDISK_STORAGE_TYPE_DEFAULT,
+                              devname, 0);
+       free(devname);
+       if (err) {
+               CHILD_ERR(err, "vbd open failed: %d\n", err);
+               return err;
+       }
+
+       memset(&params, 0, sizeof(params));
+       tapdisk_vbd_get_image_info(vbd, &image);
+
+       params.capacity    = image.size;
+       params.sector_size = image.secsize;
+       /* params was zeroed above, so the name stays NUL-terminated */
+       snprintf(params.name, sizeof(params.name) - 1, "%s", name);
+
+       err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_CREATE_DEVICE, &params);
+       if (err) {
+               err = -errno;
+               CHILD_ERR(err, "create device failed: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Route the child's stdout to the write end of the pipe shared with the
+ * parent, wrap it in child_out and close every other file descriptor.
+ *
+ * Returns 0 on success or a (positive) errno value.
+ */
+static int
+tapdisk2_set_child_fds(void)
+{
+       int i, err;
+       long max_fd;
+
+       if (dup2(channel[1], STDOUT_FILENO) == -1) {
+               /* save errno before logging can overwrite it */
+               err = errno;
+               CHILD_ERR(err, "failed duping pipe: %d\n", err);
+               return err;
+       }
+
+       child_out = fdopen(STDOUT_FILENO, "w");
+       if (!child_out) {
+               err = errno;
+               CHILD_ERR(err, "failed setting child_out: %d\n", err);
+               return err;
+       }
+
+       /* the limit does not change while we loop; query it only once */
+       max_fd = sysconf(_SC_OPEN_MAX);
+       for (i = 0; i < max_fd; i++)
+               if (i != STDOUT_FILENO)
+                       close(i);
+
+       return 0;
+}
+
+/*
+ * Child side of tapdisk2: set up the environment, allocate a tap,
+ * open the image described by 'params' on it, report the resulting IO
+ * device path to the parent and then run the tapdisk server.
+ *
+ * Returns 0 or the error of the step that failed.  A tap that was
+ * already allocated is freed again on failure.
+ */
+static int
+tapdisk2_create_device(const char *params)
+{
+       char *path;
+       int err, type;
+
+       chdir("/");
+       tapdisk_start_logging("tapdisk2");
+
+       err = tapdisk2_set_child_fds();
+       if (err)
+               goto out;
+
+       err = tapdisk2_check_environment();
+       if (err)
+               goto out;
+
+       err = tapdisk_parse_disk_type(params, &path, &type);
+       if (err) {
+               /* let the parent know why no device is coming */
+               CHILD_ERR(err, "failed to parse disk type of %s: %d\n",
+                         params, err);
+               goto out;
+       }
+
+       err = tapdisk2_prepare_device();
+       if (err)
+               goto out;
+
+       err = tapdisk_server_initialize(NULL, NULL);
+       if (err) {
+               CHILD_ERR(err, "failed to initialize tapdisk server: %d\n",
+                         err);
+               goto fail;
+       }
+
+       err = tapdisk2_open_device(type, path, params);
+       if (err)
+               goto fail;
+
+       cprintf(0, "%s%d\n", BLKTAP2_IO_DEVICE, handle.minor);
+
+       /*
+        * Done talking to the parent.  Close the stream (and with it
+        * STDOUT_FILENO) and clear child_out, so that a later CHILD_ERR
+        * only logs instead of writing to a descriptor that is closed or
+        * has been reused for something else in the meantime.
+        */
+       fclose(child_out);
+       child_out = NULL;
+
+       err = tapdisk_server_run();
+       if (err)
+               goto fail;
+
+       err = 0;
+
+out:
+       tapdisk_stop_logging();
+       return err;
+
+fail:
+       tapdisk2_free_device();
+       goto out;
+}
+
+/*
+ * Parent side of tapdisk2: read the child's single-line
+ * "<code>: <message>" report from the pipe and print the message.
+ *
+ * Returns the absolute value of the reported code, EINVAL if the
+ * report could not be parsed, or errno if the pipe could not be
+ * wrapped in a stream.
+ */
+static int
+tapdisk2_wait_for_device(void)
+{
+       int err, ret;
+       char msg[1024];
+       FILE *parent_in;
+
+       close(channel[1]);
+       parent_in = fdopen(channel[0], "r");
+       if (!parent_in) {
+               /* save errno: printf may overwrite it */
+               ret = errno;
+               printf("failed to connect to child: %d\n", ret);
+               close(channel[0]);
+               return ret;
+       }
+
+       memset(msg, 0, sizeof(msg));
+       if (fscanf(parent_in, "%d: %1023[^\n]", &err, msg) != 2) {
+               printf("unrecognized child response\n");
+               ret = EINVAL;
+       } else {
+               printf("%s\n", msg);
+               ret = (err >= 0 ? err : -err);
+       }
+
+       fclose(parent_in);
+       return ret;
+}
+
+/*
+ * Print the command-line synopsis for 'app' to stderr and terminate
+ * the process with exit status 'err'.  Does not return.
+ */
+static void
+usage(const char *app, int err)
+{
+       fprintf(stderr, "usage: %s <-n file>\n", app);
+       exit(err);
+}
+
+/*
+ * tapdisk2 entry point.
+ *
+ * Parses "-n <params>" (and -h), creates the pipe shared between parent
+ * and child and forks.  The child creates and serves the device; the
+ * parent waits for the child's report and returns its status.
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err;
+       char *params;
+
+       params = NULL;
+
+       while ((c = getopt(argc, argv, "n:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       params = optarg;
+                       break;
+               case 'h':
+                       usage(argv[0], 0);
+                       /* not reached: usage() exits */
+               default:
+                       usage(argv[0], EINVAL);
+               }
+       }
+
+       if (!params || optind != argc)
+               usage(argv[0], EINVAL);
+
+       if (pipe(channel) == -1) {
+               /* save errno: printf may overwrite it */
+               err = errno;
+               printf("pipe failed: %d\n", err);
+               return err;
+       }
+
+       switch (fork()) {
+       case -1:
+               err = errno;
+               printf("fork failed: %d\n", err);
+               return err;
+       case 0:
+               return tapdisk2_create_device(params);
+       default:
+               return tapdisk2_wait_for_device();
+       }
+}
diff --git a/tools/blktap2/drivers/td.c b/tools/blktap2/drivers/td.c
new file mode 100644 (file)
index 0000000..f920acd
--- /dev/null
@@ -0,0 +1,691 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+#include "tapdisk-utils.h"
+
+/* Debug output helper: prints to stdout while the '#if 1' branch is active. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Known VDI fields.  TD_FIELD_INVALID doubles as the number of valid
+ * entries and is used as the size of td_vdi_fields.
+ */
+typedef enum {
+       TD_FIELD_HIDDEN  = 0,
+       TD_FIELD_INVALID = 1
+} td_field_t;
+
+/* Maps a field name as typed by the user to its identifier. */
+struct vdi_field {
+       char       *name;
+       td_field_t  id;
+};
+
+/* Table searched by get_field(). */
+static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = {
+       { .id = TD_FIELD_HIDDEN, .name = "hidden" }
+};
+
+/*
+ * Supported td-util commands.  The enumerators are consecutive, so
+ * TD_CMD_INVALID equals the number of enabled commands and is used as
+ * the size of the 'commands' table.  Commented-out entries are disabled
+ * commands.
+ */
+typedef enum {
+       TD_CMD_CREATE    = 0,
+       TD_CMD_SNAPSHOT,
+/*     TD_CMD_COALESCE,       */
+       TD_CMD_QUERY,
+/*     TD_CMD_RESIZE,         */
+       TD_CMD_SET,
+/*     TD_CMD_REPAIR,         */
+/*     TD_CMD_FILL,           */
+/*     TD_CMD_READ,           */
+       TD_CMD_INVALID,
+} td_command_t;
+
+/* One command-table entry: identifier, name and the needs_type flag. */
+struct command {
+       td_command_t  id;
+       char         *name;
+       int           needs_type;
+};
+
+/* Table searched by get_command(); listed in td_command_t order. */
+struct command commands[TD_CMD_INVALID] = {
+       { .id = TD_CMD_CREATE,   .name = "create",   .needs_type = 1 },
+       { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 },
+/*     { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 },    */
+       { .id = TD_CMD_QUERY,    .name = "query",    .needs_type = 1 },
+/*     { .id = TD_CMD_RESIZE,   .name = "resize",   .needs_type = 1 },    */
+       { .id = TD_CMD_SET,      .name = "set",      .needs_type = 1 },
+/*     { .id = TD_CMD_REPAIR,   .name = "repair",   .needs_type = 1 },    */
+/*     { .id = TD_CMD_FILL,     .name = "fill",     .needs_type = 1 },    */
+/*     { .id = TD_CMD_READ,     .name = "read",     .needs_type = 1 },    */
+};
+
+/*
+ * Disk types td-util knows about.  TD_TYPE_INVALID equals the number of
+ * valid types and is used as the size of td_disk_types.
+ */
+typedef enum {
+       TD_TYPE_VHD         = 0,
+       TD_TYPE_AIO,
+       TD_TYPE_INVALID,
+} td_disk_t;
+
+/* Type names, indexed by td_disk_t; searched by get_driver_type(). */
+const char *td_disk_types[TD_TYPE_INVALID] = {
+       "vhd",
+       "aio",
+};
+
+#define print_commands()                                               \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "COMMAND := { ");                       \
+               fprintf(stderr, "%s", commands[0].name);                \
+               for (i = 1; i < TD_CMD_INVALID; i++)                    \
+                       fprintf(stderr, " | %s", commands[i].name);     \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+#define print_disk_types()                                             \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "TYPE := { ");                          \
+               fprintf(stderr, "%s", td_disk_types[0]);                \
+               for (i = 1; i < TD_TYPE_INVALID; i++)                   \
+                       fprintf(stderr, " | %s", td_disk_types[i]);     \
+               fprintf(stderr, " }\n");                                \
+       } while (0);
+
+#define print_field_names()                                            \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "FIELD := { ");                         \
+               fprintf(stderr, "%s", td_vdi_fields[0].name);           \
+               for (i = 1; i < TD_FIELD_INVALID; i++)                  \
+                       fprintf(stderr, " | %s", td_vdi_fields[i].name); \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+/*
+ * Print the td-util banner, the general synopsis and the lists of
+ * commands and disk types to stderr, then exit with status -1.
+ */
+void
+help(void)
+{
+       fprintf(stderr,
+               "Tapdisk Utilities: v1.0.0\n"
+               "usage: td-util COMMAND [TYPE] [OPTIONS]\n");
+       print_commands();
+       print_disk_types();
+       exit(-1);
+}
+
+/*
+ * Look up a command by name.
+ *
+ * Returns the matching entry of the 'commands' table, or NULL if no
+ * entry has exactly that name.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *cmd;
+
+       for (cmd = commands; cmd < commands + TD_CMD_INVALID; cmd++) {
+               if (strcmp(command, cmd->name) == 0)
+                       return cmd;
+       }
+
+       return NULL;
+}
+
+/*
+ * Look up a VDI field by name.
+ *
+ * Returns the matching entry of td_vdi_fields, or NULL if no entry has
+ * exactly that name.
+ */
+struct vdi_field *
+get_field(char *field)
+{
+       struct vdi_field *entry;
+
+       for (entry = td_vdi_fields;
+            entry < td_vdi_fields + TD_FIELD_INVALID; entry++) {
+               if (strcmp(field, entry->name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Translate a disk type name into its td_disk_types index.
+ *
+ * Returns the index on success, -ENAMETOOLONG if the name is 25
+ * characters or longer, or -TD_TYPE_INVALID if the name is unknown.
+ */
+int
+get_driver_type(char *type)
+{
+       int idx = 0;
+
+       if (strnlen(type, 25) >= 25)
+               return -ENAMETOOLONG;
+
+       while (idx < TD_TYPE_INVALID) {
+               if (strcmp(type, td_disk_types[idx]) == 0)
+                       return idx;
+               idx++;
+       }
+
+       return -TD_TYPE_INVALID;
+}
+
+/*
+ * td-util create: create a new image of the given type.
+ *
+ * Options:  -r  reserve space (non-sparse)
+ *           -b  file is fixed size
+ *           -h  help
+ * followed by <SIZE(MB)> <FILENAME>.
+ *
+ * VHD images are delegated to vhd_util_create().  Every other type is
+ * created as a zero-filled file of SIZE megabytes, which is only
+ * possible for non-sparse (-r) images.
+ *
+ * Returns 0 on success or a positive errno value.
+ */
+int
+td_create(int type, int argc, char *argv[])
+{
+       const size_t mb = 1 << 20;
+       long long mbs;
+       long pagesize;
+       uint64_t size, i;
+       char *name, *end;
+       void *buf;
+       int c, fd, err, sparse = 1, fixedsize = 0;
+
+       while ((c = getopt(argc, argv, "hrb")) != -1) {
+               switch(c) {
+               case 'r':
+                       sparse = 0;
+                       break;
+               case 'b':
+                       fixedsize = 1;
+                       break;
+               default:
+                       /* getopt returned '?'; optopt is the real culprit */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 2))
+               goto usage;
+
+       /*
+        * SIZE is a number of megabytes.  Parse it strictly: atoi() gave
+        * no way to detect garbage, and a negative value would have
+        * turned into an enormous unsigned size.
+        */
+       errno = 0;
+       mbs = strtoll(argv[optind], &end, 10);
+       if (errno || end == argv[optind] || *end != '\0' || mbs < 0) {
+               fprintf(stderr, "Invalid size %s\n", argv[optind]);
+               goto usage;
+       }
+
+       size = (uint64_t)mbs;
+       optind++;
+       name = argv[optind];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (type == TD_TYPE_VHD) {
+               int cargc = 0;
+               char sbuf[32], *cargv[10];
+
+               /* vhd_util_create takes the size in MB as a string */
+               memset(cargv, 0, sizeof(cargv));
+               snprintf(sbuf, sizeof(sbuf), "%"PRIu64, size);
+               cargv[cargc++] = "create";
+               cargv[cargc++] = "-n";
+               cargv[cargc++] = name;
+               cargv[cargc++] = "-s";
+               cargv[cargc++] = sbuf;
+               if (!sparse)
+                       cargv[cargc++] = "-r";
+               if (fixedsize)
+                       cargv[cargc++] = "-b";
+
+               return vhd_util_create(cargc, cargv);
+       }
+
+       /* generic create */
+       if (sparse) {
+               fprintf(stderr, "Cannot create sparse %s image\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       /*
+        * The file is opened with O_DIRECT, which requires the user
+        * buffer to be suitably aligned; a plain calloc() buffer gives no
+        * such guarantee.  Page alignment is sufficient.
+        */
+       pagesize = sysconf(_SC_PAGESIZE);
+       if (pagesize <= 0)
+               pagesize = 4096;
+
+       if (posix_memalign(&buf, (size_t)pagesize, mb))
+               return ENOMEM;
+       memset(buf, 0, mb);
+
+       fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644);
+       if (fd == -1) {
+               /* save errno before free() gets a chance to change it */
+               err = errno;
+               free(buf);
+               return err;
+       }
+
+       /* write the image one zero-filled megabyte at a time */
+       for (i = 0; i < size; i++)
+               if (write(fd, buf, mb) != (ssize_t)mb) {
+                       close(fd);
+                       unlink(name);
+                       free(buf);
+                       return EIO;
+               }
+
+       close(fd);
+       free(buf);
+       return 0;
+
+ usage:
+       fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] "
+               "[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n",
+               td_disk_types[type]);
+       return EINVAL;
+}
+
+/*
+ * td-util snapshot: create a VHD snapshot of an existing image.
+ *
+ * Options:  -b  file is fixed size
+ *           -m  parent is a raw image
+ *           -l  snapshot depth limit
+ *           -h  help
+ * followed by <FILENAME> <BACKING_FILENAME>.
+ *
+ * Only the VHD type is supported; the work is delegated to
+ * vhd_util_snapshot().
+ *
+ * Returns 0 on success (and for -h) or a positive errno value.
+ */
+int
+td_snapshot(int type, int argc, char *argv[])
+{
+       char *cargv[10];
+       int c, err, cargc;
+       struct stat stats;
+       char *name, *backing, *limit = NULL;
+       int fixedsize = 0, rawparent = 0;
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot create snapshot of %s image type\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "hbml:")) != -1) {
+               switch(c) {
+               case 'b':
+                       fixedsize = 1;
+                       break;
+               case 'm':
+                       rawparent = 1;
+                       break;
+               case 'l':
+                       limit = optarg;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 2)) {
+               err = EINVAL;
+               goto usage;
+       }
+
+       name    = argv[optind++];
+       backing = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN ||
+           strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (stat(backing, &stats) == -1) {
+               /* save errno: fprintf may overwrite it */
+               err = errno;
+               fprintf(stderr, "File %s not found\n", backing);
+               return err;
+       }
+
+       /* at most 9 entries are used, so cargv stays NULL-terminated */
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "snapshot";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       cargv[cargc++] = "-p";
+       cargv[cargc++] = backing;
+       if (fixedsize)
+               cargv[cargc++] = "-b";
+       if (rawparent)
+               cargv[cargc++] = "-m";
+       if (limit) {
+               cargv[cargc++] = "-l";
+               cargv[cargc++] = limit;
+       }
+       return vhd_util_snapshot(cargc, cargv);
+
+ usage:
+       fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] "
+               "[-b file_is_fixed_size] [-l snapshot depth limit] "
+               "<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]);
+       return err;
+}
+
+/*
+ * td-util coalesce: coalesce a VHD image.
+ *
+ * Takes a single <FILENAME> argument (plus -h for help).  Only the VHD
+ * type is supported; the work is delegated to vhd_util_coalesce().
+ *
+ * Returns 0 on success, EINVAL/ENAMETOOLONG for bad arguments, or the
+ * error returned by vhd_util_coalesce().
+ */
+int
+td_coalesce(int type, int argc, char *argv[])
+{
+       int c, ret, cargc;
+       /* one spare slot so the vector stays NULL-terminated */
+       char *name, *cargv[4];
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot coalesce %s image type\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "h")) != -1) {
+               switch(c) {
+               default:
+                       /* getopt returned '?'; optopt is the real culprit */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 1))
+               goto usage;
+
+       name = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "coalesce";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       ret = vhd_util_coalesce(cargc, cargv);
+       if (ret)
+               printf("coalesce failed: %d\n", ret);
+
+       return ret;
+
+ usage:
+       fprintf(stderr, "usage: td-util coalesce %s [-h help] "
+               "<FILENAME>\n", td_disk_types[type]);
+       return EINVAL;
+}
+
+int
+td_query(int type, int argc, char *argv[])
+{
+       char *name;
+       int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0;
+       /* Print the properties selected by -v/-p/-f/-d for image <FILENAME>. */
+       while ((c = getopt(argc, argv, "hvpfd")) != -1) {
+               switch(c) {
+               case 'v':
+                       size = 1;
+                       break;
+               case 'p':
+                       parent = 1;
+                       break;
+               case 'f':
+                       fields = 1;
+                       break;
+               case 'd':
+                       depth = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = EINVAL;
+                       goto usage;
+               }
+       }
+       /* exactly one positional argument (the image name) must remain */
+       if (optind != (argc - 1)) {
+               err = EINVAL;
+               goto usage;
+       }
+
+       name = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (type == TD_TYPE_VHD) {
+               vhd_context_t vhd;
+
+               err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+               if (err) {
+                       printf("failed opening %s: %d\n", name, err);
+                       return err;
+               }
+
+               if (size)
+                       printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+               if (parent) {
+                       if (vhd.footer.type != HD_TYPE_DIFF)
+                               printf("%s has no parent\n", name);
+                       else {
+                               char *pname;
+
+                               err = vhd_parent_locator_get(&vhd, &pname);
+                               if (err)
+                                       printf("failed getting parent: %d\n",
+                                              err);
+                               else {
+                                       printf("%s\n", pname);
+                                       free(pname);
+                               }
+                       }
+               }
+
+               if (fields) {
+                       int ret, hidden;
+
+                       ret = vhd_hidden(&vhd, &hidden);
+                       if (ret) {
+                               printf("failed checking 'hidden' field: %d\n",
+                                      ret);
+                               err = (err ? : ret); /* keep the first error */
+                       } else
+                               printf("%s: %d\n",
+                                      td_vdi_fields[TD_FIELD_HIDDEN].name,
+                                      hidden);
+               }
+
+               if (depth) {
+                       int ret, length;
+
+                       ret = vhd_chain_depth(&vhd, &length);
+                       if (ret)
+                               printf("error checking chain depth: %d\n", ret);
+                       else
+                               printf("chain depth: %d\n", length);
+
+                       err = (err ? : ret); /* keep the first error */
+               }
+
+               vhd_close(&vhd);
+
+       } else if (type == TD_TYPE_AIO) {
+               if (size) {
+                       int fd;
+                       uint64_t secs;
+                       uint32_t ssize;
+
+                       fd = open(name, O_RDONLY | O_LARGEFILE);
+                       if (fd == -1) {
+                               err = errno; /* printf() may clobber errno */
+                               printf("failed opening %s: %d\n", name, err);
+                               return -err;
+                       }
+                       /* the descriptor is only needed for the size probe */
+                       err = tapdisk_get_image_size(fd, &secs, &ssize);
+                       close(fd);
+                       if (err) {
+                               printf("failed getting size for %s: %d\n",
+                                      name, err);
+                               return err;
+                       }
+
+                       printf("%"PRIu64"\n", secs >> 11);
+               }
+
+               if (parent)
+                       printf("%s has no parent\n", name);
+
+               if (fields) {
+                       int i;
+
+                       for (i = 0; i < TD_FIELD_INVALID; i++)
+                               printf("%s: 0\n", td_vdi_fields[i].name);
+               }
+       }
+       /* err is still 0 unless one of the queries above recorded a failure */
+       return err;
+ usage:
+       fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] "
+               "[-p parent] [-f fields] [-d depth] <FILENAME>\n",
+               td_disk_types[type]);
+       return err;
+}
+
+int
+td_set_field(int type, int argc, char *argv[])
+{
+       int c, cargc;
+       struct vdi_field *field;
+       char *name, *value, *cargv[7];
+       /* Set <FIELD> of a VHD image to <VALUE> via vhd_util_set_field(). */
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot set fields of %s images\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "h")) != -1) {
+               switch(c) {
+               default: /* getopt() returned '?': the offender is in optopt */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+               case 'h': /* unknown options fall through to the usage text */
+                       goto usage;
+               }
+       }
+       /* three positional arguments must remain: FILENAME, FIELD, VALUE */
+       if (optind != (argc - 3))
+               goto usage;
+
+       name  = argv[optind++];
+       /* TD_FIELD_HIDDEN is the only field this command accepts */
+       field = get_field(argv[optind]);
+       if (!field || field->id != TD_FIELD_HIDDEN) {
+               fprintf(stderr, "Invalid field %s\n", argv[optind]);
+               goto usage;
+       }
+
+       value = argv[++optind];
+       /* build the argument vector: "set -n <FILENAME> -f <FIELD> -v <VALUE>" */
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "set";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       cargv[cargc++] = "-f";
+       cargv[cargc++] = field->name;
+       cargv[cargc++] = "-v";
+       cargv[cargc++] = value;
+       return vhd_util_set_field(cargc, cargv);
+
+ usage:
+       fprintf(stderr, "usage: td-util set %s [-h help] "
+               "<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]);
+       print_field_names();
+       return EINVAL;
+}
+
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, type = -1, ret = 0;
+       /* td-util entry point: "td-util COMMAND [TYPE] [args...]". */
+#ifdef CORE_DUMP
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+       /* NOTE(review): argv[1] is used below, so help() presumably exits */
+       if (argc < 2)
+               help();
+
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+               help();
+       }
+
+       if (cmd->needs_type) {
+               if (argc < 3) {
+                       fprintf(stderr, "td-util %s requires a TYPE\n",
+                               cmd->name);
+                       print_disk_types();
+                       exit(-1);
+               }
+
+               type = get_driver_type(argv[2]);
+               if (type < 0) {
+                       fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]);
+                       print_disk_types();
+                       exit(-1);
+               }
+               --cargc; /* the TYPE word is not forwarded to the handler */
+       }
+       /* cargv = command name, then whatever followed COMMAND [TYPE] */
+       cargv = malloc(sizeof(char *) * cargc);
+       if (!cargv)
+               exit(ENOMEM);
+
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++)
+               cargv[i] = argv[i + (argc - cargc)];
+
+       switch(cmd->id) {
+       case TD_CMD_CREATE:
+               ret = td_create(type, cargc, cargv);
+               break;
+       case TD_CMD_SNAPSHOT:
+               ret = td_snapshot(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_COALESCE:
+               ret = td_coalesce(type, cargc, cargv);
+               break;
+*/
+       case TD_CMD_QUERY:
+               ret = td_query(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_RESIZE:
+               ret = td_resize(type, cargc, cargv);
+               break;
+*/
+       case TD_CMD_SET:
+               ret = td_set_field(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_REPAIR:
+               ret = td_repair(type, cargc, cargv);
+               break;
+       case TD_CMD_FILL:
+               ret = td_fill(type, cargc, cargv);
+               break;
+       case TD_CMD_READ:
+               ret = td_read(type, cargc, cargv);
+               break;
+*/
+       default:
+       case TD_CMD_INVALID:
+               ret = EINVAL;
+               break;
+       }
+
+       free(cargv);
+       /* handlers may return negative codes; report their magnitude */
+       return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/blktap2/drivers/xmsnap b/tools/blktap2/drivers/xmsnap
new file mode 100644 (file)
index 0000000..f14351b
--- /dev/null
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Snapshot a VM's disk: image -> image.snapN, new qcow image on top.
+usage () { echo "USAGE: xmsnap <VM ID> <Backing File>"; }
+
+#
+# Check Usage
+#
+if [ -n "$1" ]
+then
+       vmid=$1
+else
+       usage
+       exit 1
+fi
+
+if [ -n "$2" ]
+then
+       target=$2
+else
+       usage
+       exit 1
+fi
+
+if [ -e "$target" ]
+then
+    echo "Creating snapshot of file $target for VM $vmid."
+else
+    usage
+    echo "File $target not found."
+    exit 1
+fi
+
+#
+# Find the snapshot name
+#
+directory=`dirname "$target"`
+target=`basename "$target"`
+
+maxidx=0
+if [ -e "$directory/${target}.snap1" ]
+then
+       for idx in "$directory/${target}".snap*
+       do
+           idx=${idx#"$directory/${target}.snap"}
+           case "$idx" in ''|*[!0-9]*) continue ;; esac  # skip non-numeric suffixes
+           if [ "$idx" -gt "$maxidx" ]; then
+               maxidx=$idx
+           fi
+       done
+fi
+
+snap=${target}.snap`expr $maxidx + 1`
+
+#
+# Pause VM
+#
+xm pause "$vmid"
+if [ "$?" -ne "0" ]; then
+  exit 1
+fi
+
+#
+# Snap and reposition the files.  The VM is paused from here on, so every
+# failure path must still reach the unpause at the bottom.
+#
+status=0
+if mv "$directory/$target" "$directory/$snap"; then
+  if ! qcow-create 0 "$directory/$target" "$directory/$snap"; then
+    echo "qcow-create failed; restoring $target." >&2
+    mv -f "$directory/$snap" "$directory/$target"
+    status=1
+  fi
+else
+  status=1
+fi
+# Unpause
+xm unpause "$vmid" || status=1
+exit $status
\ No newline at end of file
diff --git a/tools/blktap2/include/Makefile b/tools/blktap2/include/Makefile
new file mode 100644 (file)
index 0000000..7267eac
--- /dev/null
@@ -0,0 +1,14 @@
+XEN_ROOT := ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+.PHONY: all
+all:
+
+.PHONY: install
+install:
+       $(INSTALL_DIR) -p $(DESTDIR)$(INCLUDEDIR)
+
+
+.PHONY: clean
+clean:
+       @:
diff --git a/tools/blktap2/include/atomicio.h b/tools/blktap2/include/atomicio.h
new file mode 100644 (file)
index 0000000..7eccf20
--- /dev/null
@@ -0,0 +1,33 @@
+/*     $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $       */
+
+/*
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Ensure all of data on socket comes through. f==read || f==vwrite
+ */
+size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t);
+
+#define vwrite (ssize_t (*)(int, void *, size_t))write
diff --git a/tools/blktap2/include/blktaplib.h b/tools/blktap2/include/blktaplib.h
new file mode 100644 (file)
index 0000000..1824afa
--- /dev/null
@@ -0,0 +1,249 @@
+/* blktaplib.h
+ *
+ * Blktap library userspace code.
+ *
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BLKTAPLIB_H__
+#define __BLKTAPLIB_H__
+
+#include <syslog.h>
+#include <xenctrl.h>
+#include <xen/io/blkif.h>
+
+#if 1
+#define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+
+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, XC_PAGE_SIZE)
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLK_RING_SIZE
+
+/* blktap IOCTLs: These must correspond with the blktap driver ioctls */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_SENDPID        4
+#define BLKTAP_IOCTL_NEWINTF        5
+#define BLKTAP_IOCTL_MINOR          6
+#define BLKTAP_IOCTL_MAJOR          7
+#define BLKTAP_QUERY_ALLOC_REQS      8
+#define BLKTAP_IOCTL_FREEINTF       9
+#define BLKTAP_IOCTL_PRINT_IDXS      100 
+#define BLKTAP_IOCTL_BACKDEV_SETUP   200
+
+#define PRIO_SPECIAL_IO             -9999 
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+       return (
+               ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
+               ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+               ( arg == BLKTAP_MODE_INTERPOSE    ) );
+}
+
+#define MAX_REQUESTS            BLK_RING_SIZE
+
+#define BLKTAP_IOCTL_KICK       1
+#define MAX_PENDING_REQS       BLK_RING_SIZE
+#define BLKTAP_DEV_DIR          "/dev/xen"
+#define BLKTAP_DEV_NAME         "blktap"
+#define BACKDEV_NAME            "backdev"
+#define BLKTAP_DEV_MINOR        0
+#define BLKTAP_CTRL_DIR         "/var/run/tap"
+
+extern int blktap_major;
+
+#define BLKTAP_RING_PAGES       1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
+
+struct blkif;
+struct blkif_info;
+
+typedef struct {
+       blkif_request_t  req;
+       int              submitting;
+       int              secs_pending;
+        int16_t          status;
+       int              num_retries;
+       struct timeval   last_try;
+} pending_req_t;
+
+typedef struct blkif {
+       domid_t domid;
+       long int handle;
+       
+       long int pdev;
+       long int readonly;
+       
+       enum { DISCONNECTED, DISCONNECTING, CONNECTED } state;
+       
+       struct blkif_ops *ops;
+       struct blkif *hash_next;
+       
+       void *prv;  /* device-specific data */
+       struct blkif_info *info; /*Image parameter passing */
+       pending_req_t pending_list[MAX_REQUESTS];
+       int devnum;
+       int fds[2];
+       int be_id;
+       char *backend_path;
+       int major;
+       int minor;
+       pid_t tappid;
+       int drivertype;
+       uint16_t cookie;
+       int err;
+} blkif_t;
+
+typedef struct blkif_info {
+       char *params;
+       int   readonly;
+       int   storage;
+} blkif_info_t;
+
+typedef struct tapdev_info {
+       int fd;
+       char *mem;
+       blkif_sring_t *sring;
+       blkif_back_ring_t  fe_ring;
+       unsigned long vstart;
+       blkif_t *blkif;
+} tapdev_info_t;
+
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;
+} domid_translate_t ;
+
+typedef struct image {
+       unsigned long long size;
+       unsigned long secsize;
+       unsigned int info;
+} image_t;
+
+typedef struct msg_hdr {
+       uint16_t   type;
+       uint16_t   len;
+       uint16_t   drivertype;
+       uint16_t   cookie;
+} msg_hdr_t;
+
+typedef struct msg_params {
+       uint8_t    readonly;
+       int        path_off;
+       int        path_len;
+       int        storage;
+} msg_params_t;
+
+typedef struct msg_newdev {
+       uint8_t     devnum;
+       uint16_t    domid;
+} msg_newdev_t;
+
+typedef struct msg_pid {
+       pid_t     pid;
+} msg_pid_t;
+
+typedef struct msg_cp {
+       int       cp_uuid_off;
+       int       cp_uuid_len;
+       int       cp_drivertype;
+} msg_cp_t;
+
+typedef struct msg_lock {
+       int       ro;
+       int       enforce;
+       int       uuid_off;
+       int       uuid_len;
+} msg_lock_t;
+
+#define READ 0
+#define WRITE 1
+
+/*Control Messages between manager and tapdev*/
+#define CTLMSG_PARAMS          1
+#define CTLMSG_IMG             2
+#define CTLMSG_IMG_FAIL        3
+#define CTLMSG_NEWDEV          4
+#define CTLMSG_NEWDEV_RSP      5
+#define CTLMSG_NEWDEV_FAIL     6
+#define CTLMSG_CLOSE           7
+#define CTLMSG_CLOSE_RSP       8
+#define CTLMSG_PID             9
+#define CTLMSG_PID_RSP         10
+#define CTLMSG_CHECKPOINT      11
+#define CTLMSG_CHECKPOINT_RSP  12
+#define CTLMSG_LOCK            13
+#define CTLMSG_LOCK_RSP        14
+#define CTLMSG_PAUSE           15
+#define CTLMSG_PAUSE_RSP       16
+#define CTLMSG_RESUME          17
+#define CTLMSG_RESUME_RSP      18
+
+#define TAPDISK_STORAGE_TYPE_NFS       1
+#define TAPDISK_STORAGE_TYPE_EXT       2
+#define TAPDISK_STORAGE_TYPE_LVM       3
+#define TAPDISK_STORAGE_TYPE_DEFAULT   TAPDISK_STORAGE_TYPE_EXT
+
+/* Arbitrary values, must match the underlying driver... */
+#define MAX_TAP_DEV 256
+
+/* Accessing attached data page mappings */
+#define MMAP_PAGES                                                    \
+    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_vstart,_req,_seg)                                 \
+    ((_vstart) +                                                      \
+     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +      \
+     ((_seg) * getpagesize()))
+
+/* Defines that are only used by library clients */
+
+#ifndef __COMPILING_BLKTAP_LIB
+
+static char *blkif_op_name[] = {
+       [BLKIF_OP_READ]       = "READ",
+       [BLKIF_OP_WRITE]      = "WRITE",
+};
+
+#endif /* __COMPILING_BLKTAP_LIB */
+
+#endif /* __BLKTAPLIB_H__ */
diff --git a/tools/blktap2/include/libvhd-journal.h b/tools/blktap2/include/libvhd-journal.h
new file mode 100644 (file)
index 0000000..2f32ff0
--- /dev/null
@@ -0,0 +1,68 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_JOURNAL_H_
+#define _VHD_JOURNAL_H_
+
+#include <inttypes.h>
+
+#include "libvhd.h"
+
+#define VHD_JOURNAL_METADATA       0x01
+#define VHD_JOURNAL_DATA           0x02
+
+#define VHD_JOURNAL_HEADER_COOKIE  "vjournal"
+#define VHD_JOURNAL_ENTRY_COOKIE   0xaaaa12344321aaaa
+
+typedef struct vhd_journal_header {
+       char                       cookie[8];
+       uuid_t                     uuid;
+       uint64_t                   vhd_footer_offset;
+       uint32_t                   journal_data_entries;
+       uint32_t                   journal_metadata_entries;
+       uint64_t                   journal_data_offset;
+       uint64_t                   journal_metadata_offset;
+       uint64_t                   journal_eof;
+       char                       pad[448];
+} vhd_journal_header_t;
+
+typedef struct vhd_journal {
+       char                      *jname;
+       int                        jfd;
+       int                        is_block; /* is jfd a block device */
+       vhd_journal_header_t       header;
+       vhd_context_t              vhd;
+} vhd_journal_t;
+
+int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode);
+int vhd_journal_commit(vhd_journal_t *);
+int vhd_journal_revert(vhd_journal_t *);
+int vhd_journal_close(vhd_journal_t *);
+int vhd_journal_remove(vhd_journal_t *);
+
+#endif
diff --git a/tools/blktap2/include/libvhd.h b/tools/blktap2/include/libvhd.h
new file mode 100644 (file)
index 0000000..b128eba
--- /dev/null
@@ -0,0 +1,308 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_LIB_H_
+#define _VHD_LIB_H_
+
+#include <string.h>
+#include <endian.h>
+#include <byteswap.h>
+#include <uuid/uuid.h>
+
+#include "vhd.h"
+
+#if BYTE_ORDER == LITTLE_ENDIAN  /* BEnn_IN/OUT byte-swap *foo in place */
+  #define BE16_IN(foo)             (*(foo)) = bswap_16(*(foo))
+  #define BE32_IN(foo)             (*(foo)) = bswap_32(*(foo))
+  #define BE64_IN(foo)             (*(foo)) = bswap_64(*(foo))
+  #define BE16_OUT(foo)            (*(foo)) = bswap_16(*(foo))
+  #define BE32_OUT(foo)            (*(foo)) = bswap_32(*(foo))
+  #define BE64_OUT(foo)            (*(foo)) = bswap_64(*(foo))
+#else                            /* otherwise all six expand to nothing */
+  #define BE16_IN(foo)
+  #define BE32_IN(foo)
+  #define BE64_IN(foo)
+  #define BE16_OUT(foo)
+  #define BE32_OUT(foo)
+  #define BE64_OUT(foo)
+#endif
+
+#define MIN(a, b)                  (((a) < (b)) ? (a) : (b))
+#define MAX(a, b)                  (((a) > (b)) ? (a) : (b))
+
+#define VHD_MAX_NAME_LEN           1024
+
+#define VHD_BLOCK_SHIFT            21
+#define VHD_BLOCK_SIZE             (1ULL << VHD_BLOCK_SHIFT)
+
+#define UTF_16                     "UTF-16"
+#define UTF_16LE                   "UTF-16LE"
+#define UTF_16BE                   "UTF-16BE"
+
+#define VHD_OPEN_RDONLY            0x00001
+#define VHD_OPEN_RDWR              0x00002
+#define VHD_OPEN_FAST              0x00004
+#define VHD_OPEN_STRICT            0x00008
+#define VHD_OPEN_IGNORE_DISABLED   0x00010
+
+#define VHD_FLAG_CREAT_PARENT_RAW        0x00001
+
+#define vhd_flag_set(word, flag)         ((word) |= (flag))
+#define vhd_flag_clear(word, flag)       ((word) &= ~(flag))
+#define vhd_flag_test(word, flag)        ((word) & (flag))
+
+
+#define ENABLE_FAILURE_TESTING
+#define FAIL_REPARENT_BEGIN        0
+#define FAIL_REPARENT_LOCATOR      1
+#define FAIL_REPARENT_END          2
+#define FAIL_RESIZE_BEGIN          3
+#define FAIL_RESIZE_DATA_MOVED     4
+#define FAIL_RESIZE_METADATA_MOVED 5
+#define FAIL_RESIZE_END            6
+#define NUM_FAIL_TESTS             7
+
+#ifdef ENABLE_FAILURE_TESTING
+#define TEST_FAIL_AT(point) \
+       if (TEST_FAIL[point]) { \
+               printf("Failing at %s\n", ENV_VAR_FAIL[point]); exit(EINVAL); }
+#define TEST_FAIL_EXTERN_VARS              \
+       extern const char* ENV_VAR_FAIL[]; \
+       extern int TEST_FAIL[];
+#else
+#define TEST_FAIL_AT(point)
+#define TEST_FAIL_EXTERN_VARS
+#endif // ENABLE_FAILURE_TESTING
+
+
+static const char                  VHD_POISON_COOKIE[] = "v_poison";
+
+typedef struct hd_ftr              vhd_footer_t;
+typedef struct dd_hdr              vhd_header_t;
+typedef struct vhd_bat             vhd_bat_t;
+typedef struct vhd_batmap          vhd_batmap_t;
+typedef struct dd_batmap_hdr       vhd_batmap_header_t;
+typedef struct prt_loc             vhd_parent_locator_t;
+typedef struct vhd_context         vhd_context_t;
+typedef uint32_t                   vhd_flag_creat_t;
+
+struct vhd_bat {
+       uint32_t                   spb;
+       uint32_t                   entries;
+       uint32_t                  *bat;
+};
+
+struct vhd_batmap {
+       vhd_batmap_header_t        header;
+       char                      *map;
+};
+
+struct vhd_context {
+       int                        fd;
+       char                      *file;
+       int                        oflags;
+       int                        is_block;
+
+       uint32_t                   spb;
+       uint32_t                   bm_secs;
+
+       vhd_header_t               header;
+       vhd_footer_t               footer;
+       vhd_bat_t                  bat;
+       vhd_batmap_t               batmap;
+};
+
+static inline uint32_t
+secs_round_up(uint64_t bytes)
+{
+       return ((bytes + (VHD_SECTOR_SIZE - 1)) >> VHD_SECTOR_SHIFT);
+}
+
+static inline uint32_t
+secs_round_up_no_zero(uint64_t bytes)
+{
+       return (secs_round_up(bytes) ? : 1);
+}
+
+static inline uint64_t
+vhd_sectors_to_bytes(uint64_t sectors)
+{
+       return sectors << VHD_SECTOR_SHIFT;
+}
+
+static inline uint64_t
+vhd_bytes_padded(uint64_t bytes)
+{
+       return vhd_sectors_to_bytes(secs_round_up_no_zero(bytes));
+}
+
+static inline int
+vhd_type_dynamic(vhd_context_t *ctx)
+{
+       return (ctx->footer.type == HD_TYPE_DYNAMIC ||
+               ctx->footer.type == HD_TYPE_DIFF);
+}
+
+static inline int
+vhd_creator_tapdisk(vhd_context_t *ctx)
+{
+       return !strncmp(ctx->footer.crtr_app, "tap", 3);
+}
+
+static inline int
+vhd_disabled(vhd_context_t *ctx)
+{
+       return (!memcmp(ctx->footer.cookie,
+                       VHD_POISON_COOKIE, sizeof(ctx->footer.cookie)));
+}
+
+static inline size_t
+vhd_parent_locator_size(vhd_parent_locator_t *loc)
+{
+       /*
+        * MICROSOFT_COMPAT
+        * data_space *should* be in sectors,
+        * but sometimes we find it in bytes
+        */
+       if (loc->data_space < 512)
+               return vhd_sectors_to_bytes(loc->data_space);
+       else if (loc->data_space % 512 == 0)
+               return loc->data_space;
+       else
+               return 0;
+}
+
+static inline int
+vhd_parent_raw(vhd_context_t *ctx)
+{
+       return uuid_is_null(ctx->header.prt_uuid);
+}
+
+void libvhd_set_log_level(int);
+
+int vhd_test_file_fixed(const char *, int *);
+
+uint32_t vhd_time(time_t time);
+size_t vhd_time_to_string(uint32_t timestamp, char *target);
+uint32_t vhd_chs(uint64_t size);
+
+uint32_t vhd_checksum_footer(vhd_footer_t *);
+uint32_t vhd_checksum_header(vhd_header_t *);
+uint32_t vhd_checksum_batmap(vhd_batmap_t *);
+
+void vhd_footer_in(vhd_footer_t *);
+void vhd_footer_out(vhd_footer_t *);
+void vhd_header_in(vhd_header_t *);
+void vhd_header_out(vhd_header_t *);
+void vhd_bat_in(vhd_bat_t *);
+void vhd_bat_out(vhd_bat_t *);
+void vhd_batmap_header_in(vhd_batmap_t *);
+void vhd_batmap_header_out(vhd_batmap_t *);
+
+int vhd_validate_footer(vhd_footer_t *footer);
+int vhd_validate_header(vhd_header_t *header);
+int vhd_validate_batmap_header(vhd_batmap_t *batmap);
+int vhd_validate_batmap(vhd_batmap_t *batmap);
+int vhd_validate_platform_code(uint32_t code);
+
+int vhd_open(vhd_context_t *, const char *file, int flags);
+void vhd_close(vhd_context_t *);
+int vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t);
+/* vhd_snapshot: the bytes parameter is optional and can be 0 if the snapshot 
+ * is to have the same size as the (first non-empty) parent */
+int vhd_snapshot(const char *snapshot, uint64_t bytes, const char *parent,
+               vhd_flag_creat_t);
+
+int vhd_hidden(vhd_context_t *, int *);
+int vhd_chain_depth(vhd_context_t *, int *);
+
+off64_t vhd_position(vhd_context_t *);
+int vhd_seek(vhd_context_t *, off64_t, int);
+int vhd_read(vhd_context_t *, void *, size_t);
+int vhd_write(vhd_context_t *, void *, size_t);
+
+int vhd_offset(vhd_context_t *, uint32_t, uint32_t *);
+
+int vhd_end_of_headers(vhd_context_t *ctx, off64_t *off);
+int vhd_end_of_data(vhd_context_t *ctx, off64_t *off);
+int vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *off);
+
+int vhd_get_header(vhd_context_t *);
+int vhd_get_footer(vhd_context_t *);
+int vhd_get_bat(vhd_context_t *);
+int vhd_get_batmap(vhd_context_t *);
+
+void vhd_put_header(vhd_context_t *);
+void vhd_put_footer(vhd_context_t *);
+void vhd_put_bat(vhd_context_t *);
+void vhd_put_batmap(vhd_context_t *);
+
+int vhd_has_batmap(vhd_context_t *);
+int vhd_batmap_test(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_set(vhd_context_t *, vhd_batmap_t *, uint32_t);
+void vhd_batmap_clear(vhd_context_t *, vhd_batmap_t *, uint32_t);
+
+int vhd_get_phys_size(vhd_context_t *, off64_t *);
+int vhd_set_phys_size(vhd_context_t *, off64_t);
+
+int vhd_bitmap_test(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_set(vhd_context_t *, char *, uint32_t);
+void vhd_bitmap_clear(vhd_context_t *, char *, uint32_t);
+
+int vhd_parent_locator_count(vhd_context_t *);
+int vhd_parent_locator_get(vhd_context_t *, char **);
+int vhd_parent_locator_read(vhd_context_t *, vhd_parent_locator_t *, char **);
+int vhd_find_parent(vhd_context_t *, const char *, char **);
+int vhd_parent_locator_write_at(vhd_context_t *, const char *,
+                               off64_t, uint32_t, size_t,
+                               vhd_parent_locator_t *);
+
+int vhd_header_decode_parent(vhd_context_t *, vhd_header_t *, char **);
+int vhd_change_parent(vhd_context_t *, char *parent_path, int raw);
+
+int vhd_read_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_read_footer_at(vhd_context_t *, vhd_footer_t *, off64_t);
+int vhd_read_footer_strict(vhd_context_t *, vhd_footer_t *);
+int vhd_read_header(vhd_context_t *, vhd_header_t *);
+int vhd_read_header_at(vhd_context_t *, vhd_header_t *, off64_t);
+int vhd_read_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_read_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_read_bitmap(vhd_context_t *, uint32_t block, char **bufp);
+int vhd_read_block(vhd_context_t *, uint32_t block, char **bufp);
+
+int vhd_write_footer(vhd_context_t *, vhd_footer_t *);
+int vhd_write_footer_at(vhd_context_t *, vhd_footer_t *, off64_t);
+int vhd_write_header(vhd_context_t *, vhd_header_t *);
+int vhd_write_header_at(vhd_context_t *, vhd_header_t *, off64_t);
+int vhd_write_bat(vhd_context_t *, vhd_bat_t *);
+int vhd_write_batmap(vhd_context_t *, vhd_batmap_t *);
+int vhd_write_bitmap(vhd_context_t *, uint32_t block, char *bitmap);
+int vhd_write_block(vhd_context_t *, uint32_t block, char *data);
+
+int vhd_io_read(vhd_context_t *, char *, uint64_t, uint32_t);
+int vhd_io_write(vhd_context_t *, char *, uint64_t, uint32_t);
+
+#endif
diff --git a/tools/blktap2/include/list.h b/tools/blktap2/include/list.h
new file mode 100644 (file)
index 0000000..03a524b
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * list.h
+ * 
+ * This is a subset of linux's list.h intended to be used in user-space.
+ * 
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+/* Sentinel values stored in a node's next/prev links by list_del(). */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/*
+ * Link pair for a circular doubly-linked list.  It is embedded as a member
+ * of the object being listed; list_entry() maps it back to that object.
+ */
+struct list_head {
+        struct list_head *next, *prev;
+};
+/* Static initializer: both links refer to the head itself (empty list). */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+/* Define a list head variable @name that starts out empty. */
+#define LIST_HEAD(name) \
+        struct list_head name = LIST_HEAD_INIT(name)
+
+/* Reset @list to the empty state: both links refer back to @list. */
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+       list->next = list->prev = list;
+}
+
+/*
+ * Link @new between @prev and @next.  Both neighbours are supplied by the
+ * caller; nothing here verifies that they are actually adjacent.
+ */
+static inline void __list_add(struct list_head *new,
+                              struct list_head *prev,
+                              struct list_head *next)
+{
+        next->prev = new;
+        new->next = next;
+        new->prev = prev;
+        prev->next = new;
+}
+
+/* Insert @new directly after @head, i.e. at the front of the list. */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+        struct list_head *first = head->next;
+
+        __list_add(new, head, first);
+}
+
+/* Insert @new directly before @head, i.e. at the back of the list. */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       struct list_head *last = head->prev;
+
+       __list_add(new, last, head);
+}
+
+/* Join @prev and @next to each other, dropping whatever sat between them. */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+        prev->next = next;
+        next->prev = prev;
+}
+
+/*
+ * Unlink @entry from its list and overwrite its links with the
+ * LIST_POISON values, so @entry is left in a non-list state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+        struct list_head *before = entry->prev;
+        struct list_head *after = entry->next;
+
+        __list_del(before, after);
+        entry->prev = LIST_POISON2;
+        entry->next = LIST_POISON1;
+}
+
+/* Unlink @entry from its list and leave it as an empty list of its own. */
+static inline void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+
+       /* same effect as INIT_LIST_HEAD(entry) */
+       entry->next = entry;
+       entry->prev = entry;
+}
+
+/* Return non-zero when @head's next link refers back to @head itself. */
+static inline int list_empty(const struct list_head *head)
+{
+       const struct list_head *first = head->next;
+
+       return first == head;
+}
+
+/* Return non-zero when the node following @list is the list head @head. */
+static inline int list_is_last(const struct list_head *list,
+                              const struct list_head *head)
+{
+       const struct list_head *following = list->next;
+
+       return following == head;
+}
+
+/*
+ * Given @ptr, the address of the @member field inside an object of type
+ * @type, yield a pointer to the enclosing object by subtracting the
+ * member's offset from @ptr.
+ */
+#define list_entry(ptr, type, member)                                   \
+        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/*
+ * Visit every entry on the list headed by @head.  @pos is the cursor (a
+ * pointer to the containing type) and @member names its list_head field.
+ * The next entry is read from @pos after the loop body has run.
+ */
+#define list_for_each_entry(pos, head, member)                          \
+        for (pos = list_entry((head)->next, typeof(*pos), member);      \
+             &pos->member != (head);                                    \
+             pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/*
+ * Like list_for_each_entry, but the following entry is cached in @n before
+ * the loop body runs, so advancing does not read @pos again after the body.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+       for (pos = list_entry((head)->next, typeof(*pos), member),      \
+              n = list_entry(pos->member.next, typeof(*pos), member);  \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#endif /* __LIST_H__ */
diff --git a/tools/blktap2/include/lvm-util.h b/tools/blktap2/include/lvm-util.h
new file mode 100644 (file)
index 0000000..95f3320
--- /dev/null
@@ -0,0 +1,71 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LVM_UTIL_H_
+#define _LVM_UTIL_H_
+
+#include <inttypes.h>
+
+#define MAX_NAME_SIZE            256
+
+#define LVM_SEG_TYPE_LINEAR      1
+#define LVM_SEG_TYPE_UNKNOWN     2
+
+/*
+ * Description of one logical-volume segment (struct lv embeds one as
+ * first_segment).
+ * NOTE(review): type is presumably one of LVM_SEG_TYPE_*, and pe_start /
+ * pe_size are presumably expressed in physical extents -- confirm against
+ * lvm-util.c.
+ */
+struct lv_segment {
+       uint8_t                  type;
+       char                     device[MAX_NAME_SIZE];
+       uint64_t                 pe_start;
+       uint64_t                 pe_size;
+};
+
+/*
+ * Logical volume record: name, size, a segment count and a copy of its
+ * first segment only.
+ * NOTE(review): the unit of size is not visible here -- confirm against
+ * lvm-util.c.
+ */
+struct lv {
+       char                     name[MAX_NAME_SIZE];
+       uint64_t                 size;
+       uint32_t                 segments;
+       struct lv_segment        first_segment;
+};
+
+/*
+ * Physical volume record: a name plus a start value.
+ * NOTE(review): the meaning and unit of start are not visible here --
+ * confirm against lvm-util.c.
+ */
+struct pv {
+       char                     name[MAX_NAME_SIZE];
+       uint64_t                 start;
+};
+
+/*
+ * Volume group record, passed to lvm_scan_vg() and lvm_free_vg().
+ * NOTE(review): pv_cnt / lv_cnt are presumably the element counts of the
+ * pvs / lvs arrays, which lvm_free_vg() presumably releases -- confirm
+ * against lvm-util.c.
+ */
+struct vg {
+       char                     name[MAX_NAME_SIZE];
+       uint64_t                 extent_size;
+
+       int                      pv_cnt;
+       struct pv               *pvs;
+
+       int                      lv_cnt;
+       struct lv               *lvs;
+};
+
+int lvm_scan_vg(const char *vg_name, struct vg *vg);
+void lvm_free_vg(struct vg *vg);
+
+#endif
diff --git a/tools/blktap2/include/relative-path.h b/tools/blktap2/include/relative-path.h
new file mode 100644 (file)
index 0000000..d78f94d
--- /dev/null
@@ -0,0 +1,43 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _RELATIVE_PATH_H_
+#define _RELATIVE_PATH_H_
+
+#include <syslog.h>
+
+#define DELIMITER    '/'
+#define MAX_NAME_LEN 1000
+
+/*
+ * Log a printf-style message through syslog() at LOG_ERR, prefixed with
+ * "tap-err:<calling function>: ".  @_f must be a string literal because it
+ * is pasted onto the prefix.
+ */
+#define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a)
+
+/*
+ * returns a relative path from @src to @dest
+ * result should be freed
+ */
+char *relative_path_to(char *src, char *dest, int *err);
+
+#endif
diff --git a/tools/blktap2/include/tapdisk-message.h b/tools/blktap2/include/tapdisk-message.h
new file mode 100644 (file)
index 0000000..1a86dcb
--- /dev/null
@@ -0,0 +1,141 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _TAPDISK_MESSAGE_H_
+#define _TAPDISK_MESSAGE_H_
+
+#include <inttypes.h>
+
+#define TAPDISK_MESSAGE_MAX_PATH_LENGTH  256
+#define TAPDISK_MESSAGE_STRING_LENGTH    256
+
+#define TAPDISK_MESSAGE_FLAG_SHARED      0x01
+#define TAPDISK_MESSAGE_FLAG_RDONLY      0x02
+#define TAPDISK_MESSAGE_FLAG_ADD_CACHE   0x04
+#define TAPDISK_MESSAGE_FLAG_VHD_INDEX   0x08
+#define TAPDISK_MESSAGE_FLAG_LOG_DIRTY   0x10
+
+typedef struct tapdisk_message           tapdisk_message_t;
+typedef uint8_t                          tapdisk_message_flag_t;
+typedef struct tapdisk_message_image     tapdisk_message_image_t;
+typedef struct tapdisk_message_params    tapdisk_message_params_t;
+typedef struct tapdisk_message_string    tapdisk_message_string_t;
+
+/*
+ * Payload carried in tapdisk_message.u.params.
+ * NOTE(review): flags is presumably a mask of TAPDISK_MESSAGE_FLAG_* and
+ * path_len presumably the number of bytes used in path -- confirm against
+ * the code that fills this in.
+ */
+struct tapdisk_message_params {
+       tapdisk_message_flag_t           flags;
+       
+       uint8_t                          storage;
+       uint32_t                         devnum;
+       uint32_t                         domid;
+       uint16_t                         path_len;
+       char                             path[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+};
+
+/*
+ * Payload carried in tapdisk_message.u.image: sector count, sector size
+ * and an info word.
+ * NOTE(review): the meaning of info is not visible in this header.
+ */
+struct tapdisk_message_image {
+       uint64_t                         sectors;
+       uint32_t                         sector_size;
+       uint32_t                         info;
+};
+
+/* Payload carried in tapdisk_message.u.string: a fixed-size text buffer. */
+struct tapdisk_message_string {
+       char                             text[TAPDISK_MESSAGE_STRING_LENGTH];
+};
+
+/*
+ * Message envelope: a type/cookie/drivertype header followed by a union of
+ * the possible payloads.
+ * NOTE(review): type is presumably an enum tapdisk_message_id value that
+ * selects the live member of u -- confirm against senders and receivers.
+ * NOTE(review): pid_t is used below but this header only includes
+ * <inttypes.h>; includers presumably pull in <sys/types.h> first -- confirm.
+ */
+struct tapdisk_message {
+       uint16_t                         type;
+       uint16_t                         cookie;
+       uint16_t                         drivertype;
+
+       union {
+               pid_t                    tapdisk_pid;
+               tapdisk_message_image_t  image;
+               tapdisk_message_params_t params;
+               tapdisk_message_string_t string;
+       } u;
+};
+
+/* Message identifiers; numbering starts at 1, so 0 is not a valid id. */
+enum tapdisk_message_id {
+       TAPDISK_MESSAGE_ERROR = 1,
+       TAPDISK_MESSAGE_RUNTIME_ERROR,
+       TAPDISK_MESSAGE_PID,
+       TAPDISK_MESSAGE_PID_RSP,
+       TAPDISK_MESSAGE_OPEN,
+       TAPDISK_MESSAGE_OPEN_RSP,
+       TAPDISK_MESSAGE_PAUSE,
+       TAPDISK_MESSAGE_PAUSE_RSP,
+       TAPDISK_MESSAGE_RESUME,
+       TAPDISK_MESSAGE_RESUME_RSP,
+       TAPDISK_MESSAGE_CLOSE,
+       TAPDISK_MESSAGE_CLOSE_RSP,
+       TAPDISK_MESSAGE_EXIT,
+};
+
+/*
+ * Map a message id to a short human-readable name.
+ *
+ * Every enumerator above has its own case, including
+ * TAPDISK_MESSAGE_RUNTIME_ERROR; any other value yields "unknown".
+ * The returned pointer refers to a string literal: callers must neither
+ * modify nor free it.
+ */
+static inline char *
+tapdisk_message_name(enum tapdisk_message_id id)
+{
+       switch (id) {
+       case TAPDISK_MESSAGE_ERROR:
+               return "error";
+
+       case TAPDISK_MESSAGE_RUNTIME_ERROR:
+               return "runtime error";
+
+       case TAPDISK_MESSAGE_PID:
+               return "pid";
+
+       case TAPDISK_MESSAGE_PID_RSP:
+               return "pid response";
+
+       case TAPDISK_MESSAGE_OPEN:
+               return "open";
+
+       case TAPDISK_MESSAGE_OPEN_RSP:
+               return "open response";
+
+       case TAPDISK_MESSAGE_PAUSE:
+               return "pause";
+
+       case TAPDISK_MESSAGE_PAUSE_RSP:
+               return "pause response";
+
+       case TAPDISK_MESSAGE_RESUME:
+               return "resume";
+
+       case TAPDISK_MESSAGE_RESUME_RSP:
+               return "resume response";
+
+       case TAPDISK_MESSAGE_CLOSE:
+               return "close";
+
+       case TAPDISK_MESSAGE_CLOSE_RSP:
+               return "close response";
+
+       case TAPDISK_MESSAGE_EXIT:
+               return "exit";
+
+       default:
+               return "unknown";
+       }
+}
+
+#endif
diff --git a/tools/blktap2/include/vhd-util.h b/tools/blktap2/include/vhd-util.h
new file mode 100644 (file)
index 0000000..11f077e
--- /dev/null
@@ -0,0 +1,44 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _VHD_UTIL_H_
+#define _VHD_UTIL_H_
+
+int vhd_util_create(int argc, char **argv);
+int vhd_util_snapshot(int argc, char **argv);
+int vhd_util_query(int argc, char **argv);
+int vhd_util_read(int argc, char **argv);
+int vhd_util_set_field(int argc, char **argv);
+int vhd_util_repair(int argc, char **argv);
+int vhd_util_fill(int argc, char **argv);
+int vhd_util_resize(int argc, char **argv);
+int vhd_util_coalesce(int argc, char **argv);
+int vhd_util_modify(int argc, char **argv);
+int vhd_util_scan(int argc, char **argv);
+int vhd_util_check(int argc, char **argv);
+int vhd_util_revert(int argc, char **argv);
+
+#endif
diff --git a/tools/blktap2/include/vhd.h b/tools/blktap2/include/vhd.h
new file mode 100644 (file)
index 0000000..4da5f86
--- /dev/null
@@ -0,0 +1,221 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef __VHD_H__
+#define __VHD_H__
+
+#include <asm/types.h>
+#include <uuid/uuid.h>
+#include <inttypes.h>
+
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define DEBUG 1
+
+/* ---------------------------------------------------------------------- */
+/* General definitions.                                                   */
+/* ---------------------------------------------------------------------- */
+
+#define VHD_SECTOR_SIZE  512
+#define VHD_SECTOR_SHIFT   9
+
+/* ---------------------------------------------------------------------- */
+/* This is the generic disk footer, used by all disks.                    */
+/* ---------------------------------------------------------------------- */
+
+/*
+ * NOTE(review): assuming uuid_t is 16 bytes, the declared field sizes total
+ * 512 bytes, matching the footer size in the layout diagram at the end of
+ * this header -- confirm that the target ABI inserts no padding.
+ */
+struct hd_ftr {
+  char   cookie[8];       /* Identifies original creator of the disk      */
+  u32    features;        /* Feature Support -- see below                 */
+  u32    ff_version;      /* (major,minor) version of disk file           */
+  u64    data_offset;     /* Abs. offset from SOF to next structure       */
+  u32    timestamp;       /* Creation time.  secs since 1/1/2000GMT       */
+  char   crtr_app[4];     /* Creator application                          */
+  u32    crtr_ver;        /* Creator version (major,minor)                */
+  u32    crtr_os;         /* Creator host OS                              */
+  u64    orig_size;       /* Size at creation (bytes)                     */
+  u64    curr_size;       /* Current size of disk (bytes)                 */
+  u32    geometry;        /* Disk geometry                                */
+  u32    type;            /* Disk type                                    */
+  u32    checksum;        /* 1's comp sum of this struct.                 */
+  uuid_t uuid;            /* Unique disk ID, used for naming parents      */
+  char   saved;           /* one-bit -- is this disk/VM in a saved state? */
+  char   hidden;          /* tapdisk-specific field: is this vdi hidden?  */
+  char   reserved[426];   /* padding                                      */
+};
+
+/* VHD cookie string. */
+static const char HD_COOKIE[9]  =  "conectix";
+
+/* Feature fields in hd_ftr */
+#define HD_NO_FEATURES     0x00000000
+#define HD_TEMPORARY       0x00000001 /* disk can be deleted on shutdown */
+#define HD_RESERVED        0x00000002 /* NOTE: must always be set        */
+
+/* Version field in hd_ftr */
+#define HD_FF_VERSION      0x00010000
+
+/* Known creator OS type fields in hd_ftr.crtr_os */
+#define HD_CR_OS_WINDOWS   0x5769326B /* (Wi2k) */
+#define HD_CR_OS_MACINTOSH 0x4D616320 /* (Mac ) */
+
+/*
+ * version 0.1:  little endian bitmaps
+ * version 1.1:  big endian bitmaps; batmap
+ * version 1.2:  libvhd
+ * version 1.3:  batmap version bump to 1.2
+ */
+#define VHD_VERSION(major, minor)  (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_CURRENT_VERSION        VHD_VERSION(1, 3)
+
+/* Disk geometry accessor macros. */
+/* Geometry is a triple of (cylinders (2 bytes), heads (1 byte), and
+ * sectors-per-track (1 byte))
+ */
+#define GEOM_GET_CYLS(_g)  (((_g) >> 16) & 0xffff)
+#define GEOM_GET_HEADS(_g) (((_g) >> 8)  & 0xff)
+#define GEOM_GET_SPT(_g)   ((_g) & 0xff)
+
+#define GEOM_ENCODE(_c, _h, _s) (((_c) << 16) | ((_h) << 8) | (_s))
+
+/* type field in hd_ftr */
+#define HD_TYPE_NONE       0
+#define HD_TYPE_FIXED      2  /* fixed-allocation disk */
+#define HD_TYPE_DYNAMIC    3  /* dynamic disk */
+#define HD_TYPE_DIFF       4  /* differencing disk */
+
+/* String table for hd_ftr.type, indexed by the HD_TYPE_* value.  It has
+ * seven entries; HD_TYPE_MAX below is the highest valid index. */
+static const char *HD_TYPE_STR[7] = {
+        "None",                    /* 0 */
+        "Reserved (deprecated)",   /* 1 */
+        "Fixed hard disk",         /* 2 */
+        "Dynamic hard disk",       /* 3 */
+        "Differencing hard disk",  /* 4 */
+        "Reserved (deprecated)",   /* 5 */
+        "Reserved (deprecated)"    /* 6 */
+};
+
+#define HD_TYPE_MAX 6
+
+/* Parent locator entry; struct dd_hdr embeds eight of these in loc[]. */
+struct prt_loc {
+  u32    code;            /* Platform code -- see defines below.          */
+  u32    data_space;      /* Number of 512-byte sectors to store locator  */
+  u32    data_len;        /* Actual length of parent locator in bytes     */
+  u32    res;             /* Must be zero                                 */
+  u64    data_offset;     /* Absolute offset of locator data (bytes)      */
+};
+
+/* Platform Codes */
+#define PLAT_CODE_NONE  0x0
+#define PLAT_CODE_WI2R  0x57693272  /* deprecated                         */
+#define PLAT_CODE_WI2K  0x5769326B  /* deprecated                         */
+#define PLAT_CODE_W2RU  0x57327275  /* Windows relative path (UTF-16)     */
+#define PLAT_CODE_W2KU  0x57326B75  /* Windows absolute path (UTF-16)     */
+#define PLAT_CODE_MAC   0x4D616320  /* MacOS alias stored as a blob.      */
+#define PLAT_CODE_MACX  0x4D616358  /* File URL (UTF-8), see RFC 2396.    */
+
+/* ---------------------------------------------------------------------- */
+/* This is the dynamic disk header.                                       */
+/* ---------------------------------------------------------------------- */
+
+/*
+ * NOTE(review): assuming uuid_t is 16 bytes, the declared field sizes total
+ * 1024 bytes, matching the sparse drive header size in the layout diagram
+ * at the end of this header -- confirm that the target ABI inserts no
+ * padding.
+ */
+struct dd_hdr {
+  char   cookie[8];       /* Should contain "cxsparse"                    */
+  u64    data_offset;     /* Byte offset of next record. (Unused) 0xffs   */
+  u64    table_offset;    /* Absolute offset to the BAT.                  */
+  u32    hdr_ver;         /* Version of the dd_hdr (major,minor)          */
+  u32    max_bat_size;    /* Maximum number of entries in the BAT         */
+  u32    block_size;      /* Block size in bytes. Must be power of 2.     */
+  u32    checksum;        /* Header checksum.  1's comp of all fields.    */
+  uuid_t prt_uuid;        /* ID of the parent disk.                       */
+  u32    prt_ts;          /* Modification time of the parent disk         */
+  u32    res1;            /* Reserved.                                    */
+  char   prt_name[512];   /* Parent unicode name.                         */
+  struct prt_loc loc[8];  /* Parent locator entries.                      */
+  char   res2[256];       /* Reserved.                                    */
+};
+
+/* VHD cookie string. */
+static const char DD_COOKIE[9]  =  "cxsparse";
+
+/* Version field in dd_hdr */
+#define DD_VERSION 0x00010000
+
+/* Default blocksize is 2 meg. */
+#define DD_BLOCKSIZE_DEFAULT 0x00200000
+
+#define DD_BLK_UNUSED 0xFFFFFFFF
+
+/* Header describing the batmap: where it lives, its size and version. */
+struct dd_batmap_hdr {
+  char   cookie[8];       /* should contain "tdbatmap"                    */
+  u64    batmap_offset;   /* byte offset to batmap                        */
+  u32    batmap_size;     /* batmap size in sectors                       */
+  u32    batmap_version;  /* version of batmap                            */
+  u32    checksum;        /* batmap checksum -- 1's complement of batmap  */
+};
+
+static const char VHD_BATMAP_COOKIE[9] = "tdbatmap";
+
+/*
+ * version 1.1: signed char checksum
+ */
+#define VHD_BATMAP_VERSION(major, minor)  (((major) << 16) | ((minor) & 0x0000FFFF))
+#define VHD_BATMAP_CURRENT_VERSION        VHD_BATMAP_VERSION(1, 2)
+
+/* Layout of a dynamic disk:
+ *
+ * +-------------------------------------------------+
+ * | Mirror image of HD footer (hd_ftr) (512 bytes)  |
+ * +-------------------------------------------------+
+ * | Sparse drive header (dd_hdr) (1024 bytes)       |
+ * +-------------------------------------------------+
+ * | BAT (Block allocation table)                    |
+ * |   - Array of absolute sector offsets into the   |
+ * |     file (u32).                                 |
+ * |   - Rounded up to a sector boundary.            |
+ * |   - Unused entries are marked as 0xFFFFFFFF     |
+ * |   - max entries in dd_hdr->max_bat_size         |
+ * +-------------------------------------------------+
+ * | Data Block 0                                    |
+ * | Bitmap (padded to 512 byte sector boundary)     |
+ * |   - each bit indicates whether the associated   |
+ * |     sector within this block is used.           |
+ * | Data                                            |
+ * |   - power-of-two multiple of sectors.           |
+ * |   - default 2MB (4096 * 512)                    |
+ * |   - Any entries with zero in bitmap should be   |
+ * |     zero on disk                                |
+ * +-------------------------------------------------+
+ * | Data Block 1                                    |
+ * +-------------------------------------------------+
+ * | ...                                             |
+ * +-------------------------------------------------+
+ * | Data Block n                                    |
+ * +-------------------------------------------------+
+ * | HD Footer (511 bytes)                           |
+ * +-------------------------------------------------+
+ */
+
+#endif
diff --git a/tools/blktap2/lvm/Makefile b/tools/blktap2/lvm/Makefile
new file mode 100644 (file)
index 0000000..3a726d7
--- /dev/null
@@ -0,0 +1,38 @@
+XEN_ROOT = ../../../
+BLKTAP_ROOT := ../
+include $(XEN_ROOT)/tools/Rules.mk
+
+ifeq ($(LVM_UTIL_TEST),y)
+TEST              := lvm-util
+endif
+
+CFLAGS            += -Werror
+CFLAGS            += -Wno-unused
+CFLAGS            += -I../include
+CFLAGS            += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+# Get gcc to generate the dependencies for us.
+CFLAGS            += -Wp,-MD,.$(@F).d
+DEPS               = .*.d
+
+LVM-OBJS          := lvm-util.o
+
+all: build
+
+build: $(TEST) $(LVM-OBJS)
+
+install: all
+
+# Standalone test binary, only part of "build" when LVM_UTIL_TEST=y.
+# NOTE(review): the recipe compiles lvm-util.c directly with -DLVM_UTIL and
+# does not link the lvm-util.o prerequisite.  lvm-util is also listed in
+# .PHONY, so the recipe runs every time the target is requested.
+lvm-util: lvm-util.o
+       $(CC) $(CFLAGS) -DLVM_UTIL -o lvm-util lvm-util.c
+
+# NOTE(review): IBIN is not set in this Makefile, and the lvm-util binary
+# itself is not in the rm list -- confirm whether Rules.mk provides IBIN.
+clean:
+       rm -rf *.o *~ $(DEPS) $(IBIN)
+
+.PHONY: all build clean install lvm-util
+
+-include $(DEPS)
diff --git a/tools/blktap2/lvm/lvm-util.c b/tools/blktap2/lvm/lvm-util.c
new file mode 100644 (file)
index 0000000..b456e04
--- /dev/null
@@ -0,0 +1,349 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lvm-util.h"
+
+/* sscanf conversion used for name fields: at most 255 characters plus NUL */
+#define _NAME "%255s"
+/* scratch buffer holding the line most recently read from a command's output */
+static char line[1024];
+
+/*
+ * Clear the shared line buffer and read the remainder of the current line
+ * (up to 1023 characters, newline excluded) from @scan into it.
+ * Returns 0 when text was read, non-zero otherwise.
+ */
+static inline int
+lvm_read_line(FILE *scan)
+{
+       int matched;
+
+       memset(line, 0, sizeof(line));
+       matched = fscanf(scan, "%1023[^\n]", line);
+
+       return (matched == 1 ? 0 : 1);
+}
+
+/*
+ * Consume the run of newline characters at the current position of @scan
+ * (they are stored into the shared line buffer as a side effect).
+ * Returns 0 when at least one newline was consumed, non-zero otherwise.
+ */
+static inline int
+lvm_next_line(FILE *scan)
+{
+       int matched;
+
+       matched = fscanf(scan, "%1023[\n]", line);
+
+       return (matched == 1 ? 0 : 1);
+}
+
+/*
+ * Copy the NUL-terminated string @src into @dst.
+ * Fails with -ENAMETOOLONG, leaving @dst untouched, when @src holds
+ * @size or more characters; returns 0 on success.
+ */
+static int
+lvm_copy_name(char *dst, const char *src, size_t size)
+{
+       size_t len;
+
+       len = strnlen(src, size);
+       if (len >= size)
+               return -ENAMETOOLONG;
+
+       /* len < size, so the terminator is part of the copied range */
+       memcpy(dst, src, len + 1);
+       return 0;
+}
+
+/*
+ * Record physical volume @name, whose first extent starts at byte @start,
+ * in the first free slot of vg->pvs.
+ *
+ * The table is allocated on the first call with room for @pvs entries and
+ * its size is remembered in vg->pv_cnt; later calls must report the same
+ * count, otherwise a larger value would index past the allocation.
+ *
+ * Returns 0 on success, -EINVAL for a non-positive or inconsistent @pvs,
+ * -EEXIST if @name is already recorded, -ENOMEM if allocation fails or the
+ * table is full, or -ENAMETOOLONG if @name does not fit.
+ */
+static int
+lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start)
+{
+       int i, err;
+       struct pv *pv;
+
+       if (pvs <= 0)
+               return -EINVAL;
+
+       if (!vg->pvs) {
+               vg->pvs = calloc(pvs, sizeof(struct pv));
+               if (!vg->pvs)
+                       return -ENOMEM;
+               vg->pv_cnt = pvs;
+       } else if (pvs != vg->pv_cnt)
+               return -EINVAL;
+
+       /* find the first unused slot, rejecting duplicates on the way */
+       for (i = 0; i < pvs; i++) {
+               pv = vg->pvs + i;
+
+               if (!pv->name[0])
+                       break;
+
+               if (!strcmp(pv->name, name))
+                       return -EEXIST;
+       }
+
+       if (i == pvs)
+               return -ENOMEM;
+
+       pv  = vg->pvs + i;
+       err = lvm_copy_name(pv->name, name, sizeof(pv->name) - 1);
+       if (err)
+               return err;
+
+       pv->start = start;
+       return 0;
+}
+
+/*
+ * Populate @vg with the name, extent size, PV table and an empty,
+ * correctly sized LV table of volume group @vgname by parsing the output
+ * of /usr/sbin/vgs (one row per physical volume).
+ *
+ * Returns 0 on success or a negative errno value; on failure @vg is
+ * released and zeroed via lvm_free_vg().
+ *
+ * NOTE(review): @vgname is interpolated into a shell command line without
+ * quoting; callers must not pass untrusted names.
+ */
+static int
+lvm_open_vg(const char *vgname, struct vg *vg)
+{
+       FILE *scan;
+       int i, err, pvs, lvs;
+       char *cmd, pvname[256];
+       uint64_t size, pv_start;
+
+       /* these stay at zero if the command produces no output at all */
+       pvs  = 0;
+       lvs  = 0;
+       size = 0;
+
+       memset(vg, 0, sizeof(*vg));
+
+       err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b "
+                      "--options=vg_name,vg_extent_size,lv_count,pv_count,"
+                      "pv_name,pe_start --unbuffered 2> /dev/null", vgname);
+       if (err == -1)
+               return -ENOMEM;
+
+       errno = 0;
+       scan  = popen(cmd, "r");
+       if (!scan) {
+               /* always hand back a negative errno value */
+               err = (errno ? -errno : -ENOMEM);
+               goto out;
+       }
+
+       for (;;) {
+               if (lvm_read_line(scan))
+                       break;
+
+               err = -EINVAL;
+               if (sscanf(line, _NAME" %"SCNu64" %d %d "_NAME" %"SCNu64,
+                          vg->name, &size, &lvs, &pvs, pvname, &pv_start) != 6)
+                       goto out;
+
+               if (strcmp(vg->name, vgname))
+                       goto out;
+
+               err = lvm_parse_pv(vg, pvname, pvs, pv_start);
+               if (err)
+                       goto out;
+
+               if (lvm_next_line(scan))
+                       break;
+       }
+
+       /* no row parsed, or the rows describe a different volume group */
+       err = -EINVAL;
+       if (!vg->name[0] || strcmp(vg->name, vgname))
+               goto out;
+
+       /* every advertised PV must have been seen */
+       for (i = 0; i < pvs; i++)
+               if (!vg->pvs[i].name[0])
+                       goto out;
+
+       err = -ENOMEM;
+       vg->lvs = calloc(lvs, sizeof(struct lv));
+       if (!vg->lvs)
+               goto out;
+
+       err             = 0;
+       vg->lv_cnt      = lvs;
+       vg->pv_cnt      = pvs;
+       vg->extent_size = size;
+
+out:
+       if (scan)
+               pclose(scan);
+       if (err)
+               lvm_free_vg(vg);
+       free(cmd);
+       return err;
+}
+
+/*
+ * Parse a "devices" column of the form "<pv-name>(<first-extent>)" and
+ * fill in seg->device and seg->pe_start (the byte offset of the segment:
+ * first-extent * extent size + the start offset recorded for that PV).
+ *
+ * @devices is modified in place: ',', '(' and ')' become spaces.
+ * Returns 0 on success, -EINVAL if the column cannot be parsed or names a
+ * PV that is not part of @vg.
+ */
+static int
+lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices)
+{
+       int i, found;
+       char *p;
+       uint64_t start, pe_start;
+
+       /* single pass; no strlen() re-evaluation per character */
+       for (p = devices; *p; p++)
+               if (strchr(",()", *p))
+                       *p = ' ';
+
+       if (sscanf(devices, _NAME" %"SCNu64, seg->device, &start) != 2)
+               return -EINVAL;
+
+       found    = 0;
+       pe_start = 0;
+       for (i = 0; i < vg->pv_cnt; i++)
+               if (!strcmp(vg->pvs[i].name, seg->device)) {
+                       pe_start = vg->pvs[i].start;
+                       found    = 1;
+                       break;
+               }
+
+       if (!found)
+               return -EINVAL;
+
+       seg->pe_start = (start * vg->extent_size) + pe_start;
+       return 0;
+}
+
+/*
+ * Fill vg->lvs by parsing the output of /usr/sbin/lvs for vg->name.
+ *
+ * Only rows whose seg_start is 0 (the first segment of a volume) create
+ * an entry; other rows are skipped.  If the command yields fewer first
+ * segments than vg->lv_cnt, lv_cnt is reduced to the number found.
+ *
+ * Returns 0 on success or a negative errno value.
+ *
+ * NOTE(review): vg->name is interpolated into a shell command line
+ * without quoting.
+ */
+static int
+lvm_scan_lvs(struct vg *vg)
+{
+       char *cmd;
+       FILE *scan;
+       int i, err;
+
+       err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b "
+                      "--options=lv_name,lv_size,segtype,seg_count,seg_start,"
+                      "seg_size,devices --unbuffered 2> /dev/null", vg->name);
+       if (err == -1)
+               return -ENOMEM;
+
+       errno = 0;
+       scan  = popen(cmd, "r");
+       if (!scan) {
+               err = (errno ? -errno : -ENOMEM);
+               goto out;
+       }
+
+       for (i = 0;;) {
+               unsigned int segs; /* matches the %u conversion below */
+               struct lv *lv;
+               struct lv_segment seg;
+               uint64_t size, seg_start;
+               char type[32], name[256], devices[1024];
+
+               if (i >= vg->lv_cnt)
+                       break;
+
+               if (lvm_read_line(scan)) {
+                       /* ran out of rows early: keep what was found */
+                       vg->lv_cnt = i;
+                       break;
+               }
+
+               err = -EINVAL;
+               lv  = vg->lvs + i;
+
+               if (sscanf(line, _NAME" %"SCNu64" %31s %u %"SCNu64" %"SCNu64" %1023s",
+                          name, &size, type, &segs, &seg_start,
+                          &seg.pe_size, devices) != 7)
+                       goto out;
+
+               /* only the first segment of each volume is recorded */
+               if (seg_start)
+                       goto next;
+
+               if (!strcmp(type, "linear"))
+                       seg.type = LVM_SEG_TYPE_LINEAR;
+               else
+                       seg.type = LVM_SEG_TYPE_UNKNOWN;
+
+               if (lvm_parse_lv_devices(vg, &seg, devices))
+                       goto out;
+
+               i++;
+               lv->size          = size;
+               lv->segments      = segs;
+               lv->first_segment = seg;
+
+               err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1);
+               if (err)
+                       goto out;
+               err = -EINVAL;
+
+       next:
+               if (lvm_next_line(scan))
+                       goto out;
+       }
+
+       err = 0;
+
+out:
+       if (scan)
+               pclose(scan);
+       free(cmd);
+       return err;
+}
+
+/*
+ * Release the LV and PV tables owned by @vg and reset the whole
+ * structure to zero.
+ */
+void
+lvm_free_vg(struct vg *vg)
+{
+       void *lv_table = vg->lvs;
+       void *pv_table = vg->pvs;
+
+       memset(vg, 0, sizeof(*vg));
+
+       free(pv_table);
+       free(lv_table);
+}
+
+/*
+ * Scan volume group @vg_name into @vg: first the group and its physical
+ * volumes, then its logical volumes.
+ * Returns 0 on success; on failure returns the error from the failing
+ * step, with @vg released.
+ */
+int
+lvm_scan_vg(const char *vg_name, struct vg *vg)
+{
+       int err;
+
+       memset(vg, 0, sizeof(*vg));
+
+       err = lvm_open_vg(vg_name, vg);
+       if (!err) {
+               err = lvm_scan_lvs(vg);
+               if (err)
+                       lvm_free_vg(vg);
+       }
+
+       return err;
+}
+
+#ifdef LVM_UTIL
+/* Print the command-line synopsis and terminate with status EINVAL. */
+static int
+usage(void)
+{
+       puts("usage: lvm-util <vgname>");
+       exit(EINVAL);
+}
+
+/*
+ * Test driver: scan the volume group named by argv[1] and print its
+ * extent size, physical volumes and logical volumes.
+ * Exits with the (positive) error code when the scan fails.
+ */
+int
+main(int argc, char **argv)
+{
+       int n, err;
+       struct vg vg;
+
+       if (argc != 2)
+               usage();
+
+       err = lvm_scan_vg(argv[1], &vg);
+       if (err) {
+               printf("scan failed: %d\n", err);
+               if (err < 0)
+                       err = -err;
+               return err;
+       }
+
+       printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n",
+              vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt);
+
+       for (n = 0; n < vg.pv_cnt; n++) {
+               struct pv *pv = &vg.pvs[n];
+
+               printf("pv %s: start %"PRIu64"\n", pv->name, pv->start);
+       }
+
+       for (n = 0; n < vg.lv_cnt; n++) {
+               struct lv *lv          = &vg.lvs[n];
+               struct lv_segment *seg = &lv->first_segment;
+
+               printf("lv %s: size: %"PRIu64", segments: %u, type: %u, "
+                      "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n",
+                      lv->name, lv->size, lv->segments, seg->type,
+                      seg->device, seg->pe_start, seg->pe_size);
+       }
+
+       lvm_free_vg(&vg);
+       return 0;
+}
+#endif
diff --git a/tools/blktap2/vhd/Makefile b/tools/blktap2/vhd/Makefile
new file mode 100644 (file)
index 0000000..099a0ba
--- /dev/null
@@ -0,0 +1,55 @@
+XEN_ROOT=../../../
+BLKTAP_ROOT := ../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Command-line tools built here and installed into INST_DIR.
+IBIN               = vhd-util vhd-update
+INST_DIR           = $(SBINDIR)
+
+# Sub-directory holding the libvhd sources (also used as the -L path below).
+LIBDIR             = lib
+
+CFLAGS            += -Werror
+CFLAGS            += -Wno-unused
+CFLAGS            += -I../include
+CFLAGS            += -D_GNU_SOURCE
+
+ifeq ($(CONFIG_X86_64),y)
+CFLAGS            += -fPIC
+endif
+
+# Set VHD_STATIC=y to link the tools statically.
+ifeq ($(VHD_STATIC),y)
+CFLAGS            += -static
+endif
+
+LIBS              := -L$(LIBDIR) -lvhd
+LIBS              += -luuid
+
+# Get gcc to generate the dependencies for us.
+CFLAGS            += -Wp,-MD,.$(@F).d
+DEPS               = .*.d
+
+all: build
+
+build: libvhd $(IBIN)
+
+# Build the library first; the tools link against it.
+# NOTE(review): each recipe line runs in its own shell, so "set -e" here
+# presumably has no effect on the $(MAKE) line that follows -- confirm.
+libvhd:
+       @set -e
+       $(MAKE) -C $(LIBDIR) all
+
+vhd-util: vhd-util.o
+       $(CC) $(CFLAGS) -o vhd-util vhd-util.o $(LIBS)
+
+vhd-update: vhd-update.o
+       $(CC) $(CFLAGS) -o vhd-update vhd-update.o $(LIBS)
+
+install: all
+       $(MAKE) -C $(LIBDIR) install
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+
+clean:
+       $(MAKE) -C $(LIBDIR) clean
+       rm -rf *.o *~ $(DEPS) $(IBIN)
+
+# The tool targets are phony, so they are relinked on every build.
+.PHONY: all build clean install libvhd vhd-util vhd-update
+
+-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/Makefile b/tools/blktap2/vhd/lib/Makefile
new file mode 100644 (file)
index 0000000..e26ef86
--- /dev/null
@@ -0,0 +1,73 @@
+XEN_ROOT=../../../../
+BLKTAP_ROOT := ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+LIBVHD-MAJOR     = 1.0
+LIBVHD-MINOR     = 0
+LIBVHD-SONAME    = libvhd.so.$(LIBVHD-MAJOR)
+
+# Object built by the lvm directory and linked into the library.
+LVM-UTIL-OBJ    := $(BLKTAP_ROOT)lvm/lvm-util.o
+
+LIBVHD-BUILD    := libvhd.a
+
+INST-DIR         = $(LIBDIR)
+
+CFLAGS          += -Werror
+CFLAGS          += -Wno-unused
+CFLAGS          += -I../../include
+CFLAGS          += -D_GNU_SOURCE
+CFLAGS          += -fPIC
+CFLAGS          += -g
+
+LIBS            := -luuid
+
+# Get gcc to generate the dependencies for us.
+CFLAGS          += -Wp,-MD,.$(@F).d
+DEPS             = .*.d
+
+LIB-SRCS        := libvhd.c
+LIB-SRCS        += libvhd-journal.c
+LIB-SRCS        += vhd-util-coalesce.c
+LIB-SRCS        += vhd-util-create.c
+LIB-SRCS        += vhd-util-fill.c
+LIB-SRCS        += vhd-util-modify.c
+LIB-SRCS        += vhd-util-query.c
+LIB-SRCS        += vhd-util-read.c
+LIB-SRCS        += vhd-util-repair.c
+LIB-SRCS        += vhd-util-resize.c
+LIB-SRCS        += vhd-util-revert.c
+LIB-SRCS        += vhd-util-set-field.c
+LIB-SRCS        += vhd-util-snapshot.c
+LIB-SRCS        += vhd-util-scan.c
+LIB-SRCS        += vhd-util-check.c
+LIB-SRCS        += relative-path.c
+LIB-SRCS        += atomicio.c
+
+LIB-OBJS         = $(patsubst %.c,%.o,$(LIB-SRCS))
+LIB-OBJS        += $(LVM-UTIL-OBJ)
+
+LIBVHD           = libvhd.a libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR)
+
+all: build
+
+build: $(LIBVHD-BUILD)
+
+# This single rule produces both the shared object (plus its symlinks) and
+# the static archive.  Libraries are listed after the objects that use them
+# so the link also works with --as-needed or static libraries.
+libvhd.a: $(LIB-OBJS)
+       $(CC) $(CFLAGS) -Wl,$(SONAME_LDFLAG),$(LIBVHD-SONAME) $(SHLIB_CFLAGS) \
+               -o libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $^ $(LIBS)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) libvhd.so.$(LIBVHD-MAJOR)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR) libvhd.so
+       $(AR) rc $@ $^
+
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(INST-DIR)
+       $(INSTALL_DATA) $(LIBVHD) $(DESTDIR)$(INST-DIR)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR).$(LIBVHD-MINOR) $(DESTDIR)$(INST-DIR)/libvhd.so.$(LIBVHD-MAJOR)
+       ln -sf libvhd.so.$(LIBVHD-MAJOR) $(DESTDIR)$(INST-DIR)/libvhd.so
+
+clean:
+       rm -rf *.a *.so* *.o *~ $(DEPS) $(LIBVHD)
+
+.PHONY: all build clean install libvhd
+
+-include $(DEPS)
diff --git a/tools/blktap2/vhd/lib/atomicio.c b/tools/blktap2/vhd/lib/atomicio.c
new file mode 100644 (file)
index 0000000..ae0e24b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * Ensure all of the data comes through: call f (read, or write via the
+ * vwrite cast) repeatedly on fd until n bytes of _s have been transferred.
+ *
+ * Returns n when everything was transferred.  A call interrupted with
+ * EINTR or EAGAIN is retried.  Any other failure of f returns 0 with
+ * errno left as f set it.  If f reports 0 bytes, errno is set to EPIPE
+ * and the number of bytes transferred so far is returned.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+
+       while (n > pos) {
+               res = (f) (fd, s + pos, n - pos);
+               switch (res) {
+               case -1:
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return 0;
+               case 0:
+                       errno = EPIPE;
+                       return pos;
+               default:
+                       pos += (size_t)res;
+               }
+       }
+       return (pos);
+}
+
diff --git a/tools/blktap2/vhd/lib/libvhd-journal.c b/tools/blktap2/vhd/lib/libvhd-journal.c
new file mode 100644 (file)
index 0000000..c52affe
--- /dev/null
@@ -0,0 +1,1534 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "atomicio.h"
+#include "libvhd-journal.h"
+
+/*
+ * Journal entry types: which piece of the VHD an entry preserves.
+ * FOOTER_P is the footer at the end of the file, FOOTER_C the copy at
+ * offset 0; BATMAP_H / BATMAP_M are the batmap header and the map itself.
+ */
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P  1
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C  2
+#define VHD_JOURNAL_ENTRY_TYPE_HEADER    3
+#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR   4
+#define VHD_JOURNAL_ENTRY_TYPE_BAT       5
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H  6
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M  7
+#define VHD_JOURNAL_ENTRY_TYPE_DATA      8
+
+/*
+ * Descriptor written to the journal immediately before each saved blob.
+ * NOTE(review): the checksum covers sizeof(vhd_journal_entry_t) bytes, so
+ * any compiler-inserted padding is part of the on-disk record.
+ */
+typedef struct vhd_journal_entry {
+       uint64_t                         cookie;   /* must equal VHD_JOURNAL_ENTRY_COOKIE */
+       uint32_t                         type;     /* VHD_JOURNAL_ENTRY_TYPE_* */
+       uint32_t                         size;     /* blob size in bytes; non-zero multiple of VHD_SECTOR_SIZE */
+       uint64_t                         offset;   /* offset in the VHD the blob was copied from */
+       uint32_t                         checksum; /* see vhd_journal_checksum_entry() */
+} vhd_journal_entry_t;
+
+/*
+ * Reposition the journal file descriptor.
+ * Returns 0 on success or -errno if lseek64() fails.
+ */
+static inline int
+vhd_journal_seek(vhd_journal_t *j, off64_t offset, int whence)
+{
+       if (lseek64(j->jfd, offset, whence) == (off64_t)-1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Report the current offset of the journal file descriptor, or
+ * (off64_t)-1 with errno set if it cannot be determined.
+ */
+static inline off64_t
+vhd_journal_position(vhd_journal_t *j)
+{
+       off64_t cur;
+
+       cur = lseek64(j->jfd, 0, SEEK_CUR);
+       return cur;
+}
+
+/*
+ * Read exactly @size bytes from the journal into @buf.
+ * Returns 0 on success; on a short or failed read returns -errno, or
+ * -EIO when errno was left at 0.
+ */
+static inline int
+vhd_journal_read(vhd_journal_t *j, void *buf, size_t size)
+{
+       size_t done;
+
+       errno = 0;
+       done  = atomicio(read, j->jfd, buf, size);
+
+       if (done == size)
+               return 0;
+
+       return (errno ? -errno : -EIO);
+}
+
+/*
+ * Write exactly @size bytes from @buf to the journal.
+ * Returns 0 on success; on a short or failed write returns -errno, or
+ * -EIO when errno was left at 0.
+ */
+static inline int
+vhd_journal_write(vhd_journal_t *j, void *buf, size_t size)
+{
+       size_t done;
+
+       errno = 0;
+       done  = atomicio(vwrite, j->jfd, buf, size);
+
+       if (done == size)
+               return 0;
+
+       return (errno ? -errno : -EIO);
+}
+
+/*
+ * Set the length of the journal file to @length bytes.
+ * Returns 0 on success or -errno if ftruncate() fails.
+ */
+static inline int
+vhd_journal_truncate(vhd_journal_t *j, off64_t length)
+{
+       if (ftruncate(j->jfd, length) == -1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Flush the journal's data to stable storage with fdatasync().
+ * Returns 0 on success or -errno on failure.
+ */
+static inline int
+vhd_journal_sync(vhd_journal_t *j)
+{
+       if (fdatasync(j->jfd))
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Convert the journal header's offset and entry-count fields, in place,
+ * from on-disk byte order (presumably big-endian, per the BE*_IN macro
+ * names) to host order.
+ * NOTE(review): journal_eof is not converted here -- confirm intentional.
+ */
+static inline void
+vhd_journal_header_in(vhd_journal_header_t *header)
+{
+       /* 64-bit offsets */
+       BE64_IN(&header->vhd_footer_offset);
+       BE64_IN(&header->journal_data_offset);
+       BE64_IN(&header->journal_metadata_offset);
+
+       /* 32-bit entry counts */
+       BE32_IN(&header->journal_data_entries);
+       BE32_IN(&header->journal_metadata_entries);
+}
+
+/*
+ * Convert the journal header's offset and entry-count fields, in place,
+ * from host order to on-disk byte order (presumably big-endian, per the
+ * BE*_OUT macro names).
+ * NOTE(review): journal_eof is not converted here -- confirm intentional.
+ */
+static inline void
+vhd_journal_header_out(vhd_journal_header_t *header)
+{
+       /* 64-bit offsets */
+       BE64_OUT(&header->vhd_footer_offset);
+       BE64_OUT(&header->journal_data_offset);
+       BE64_OUT(&header->journal_metadata_offset);
+
+       /* 32-bit entry counts */
+       BE32_OUT(&header->journal_data_entries);
+       BE32_OUT(&header->journal_metadata_entries);
+}
+
+/*
+ * Sanity-check a journal header held in host byte order.
+ *
+ * Checks the cookie, verifies that the journal can be positioned at the
+ * header's journal_eof, and requires the data and metadata offsets not to
+ * lie beyond journal_eof.  All checks are made against @header itself
+ * rather than j->header, so a header read into a separate buffer is
+ * validated too.
+ *
+ * Side effect: leaves the journal positioned at header->journal_eof.
+ * Returns 0 if the header looks valid, -EINVAL or -errno otherwise.
+ */
+static int
+vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int err;
+       off64_t eof;
+
+       if (memcmp(header->cookie,
+                  VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie)))
+               return -EINVAL;
+
+       err = vhd_journal_seek(j, header->journal_eof, SEEK_SET);
+       if (err)
+               return err;
+
+       eof = vhd_journal_position(j);
+       if (eof == (off64_t)-1)
+               return -errno;
+
+       if (header->journal_data_offset > header->journal_eof)
+               return -EINVAL;
+
+       if (header->journal_metadata_offset > header->journal_eof)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Load the journal header from offset 0 of the journal file into
+ * @header, convert it to host byte order and validate it.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int err;
+
+       err = vhd_journal_seek(j, 0, SEEK_SET);
+       if (!err)
+               err = vhd_journal_read(j, header, sizeof(vhd_journal_header_t));
+       if (err)
+               return err;
+
+       vhd_journal_header_in(header);
+
+       return vhd_journal_validate_header(j, header);
+}
+
+/*
+ * Validate @header and write it to offset 0 of the journal file.
+ * The conversion to on-disk byte order is done on a private copy, so
+ * @header itself stays in host order.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int err;
+       vhd_journal_header_t disk;
+
+       memcpy(&disk, header, sizeof(disk));
+
+       err = vhd_journal_validate_header(j, &disk);
+       if (err)
+               return err;
+
+       vhd_journal_header_out(&disk);
+
+       err = vhd_journal_seek(j, 0, SEEK_SET);
+       if (err)
+               return err;
+
+       return vhd_journal_write(j, &disk, sizeof(disk));
+}
+
+/*
+ * Initialise j->header for a fresh journal and write it out.
+ * The header records the VHD's uuid, the journal cookie, the offset of
+ * the VHD footer (end of file minus one footer) and an end-of-journal
+ * mark placed directly after the header itself.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_journal_header(vhd_journal_t *j)
+{
+       int err;
+       off64_t end;
+       vhd_context_t *vhd = &j->vhd;
+       vhd_journal_header_t *hdr = &j->header;
+
+       memset(hdr, 0, sizeof(vhd_journal_header_t));
+
+       /* locate the end of the VHD file */
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               return err;
+
+       end = vhd_position(vhd);
+       if (end == (off64_t)-1)
+               return -errno;
+
+       err = vhd_get_footer(vhd);
+       if (err)
+               return err;
+
+       uuid_copy(hdr->uuid, vhd->footer.uuid);
+       memcpy(hdr->cookie,
+              VHD_JOURNAL_HEADER_COOKIE, sizeof(hdr->cookie));
+       hdr->vhd_footer_offset = end - sizeof(vhd_footer_t);
+       hdr->journal_eof       = sizeof(vhd_journal_header_t);
+
+       return vhd_journal_write_header(j, hdr);
+}
+
+/*
+ * Convert every field of a journal entry, in place, from on-disk byte
+ * order (presumably big-endian, per the BE*_IN macro names) to host order.
+ */
+static void
+vhd_journal_entry_in(vhd_journal_entry_t *entry)
+{
+       /* fields in declaration order */
+       BE64_IN(&entry->cookie);
+       BE32_IN(&entry->type);
+       BE32_IN(&entry->size);
+       BE64_IN(&entry->offset);
+       BE32_IN(&entry->checksum);
+}
+
+/*
+ * Convert every field of a journal entry, in place, from host order to
+ * on-disk byte order (presumably big-endian, per the BE*_OUT macro names).
+ */
+static void
+vhd_journal_entry_out(vhd_journal_entry_t *entry)
+{
+       /* fields in declaration order */
+       BE64_OUT(&entry->cookie);
+       BE32_OUT(&entry->type);
+       BE32_OUT(&entry->size);
+       BE64_OUT(&entry->offset);
+       BE32_OUT(&entry->checksum);
+}
+
+/*
+ * Compute the checksum of a journal entry and its @size-byte payload:
+ * the one's complement of the byte-wise sum of the whole entry structure
+ * (with its checksum field treated as zero) followed by @buf.
+ * The entry's stored checksum is restored before returning.
+ */
+static uint32_t
+vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size)
+{
+       size_t left;
+       uint32_t sum, saved;
+       const unsigned char *p;
+
+       /* the checksum field must not contribute to its own value */
+       saved           = entry->checksum;
+       entry->checksum = 0;
+
+       sum = 0;
+
+       p = (const unsigned char *)entry;
+       for (left = sizeof(vhd_journal_entry_t); left; left--)
+               sum += *p++;
+
+       p = (const unsigned char *)buf;
+       for (left = size; left; left--)
+               sum += *p++;
+
+       entry->checksum = saved;
+       return ~sum;
+}
+
+/*
+ * Check a host-order journal entry: its size must be a non-zero multiple
+ * of VHD_SECTOR_SIZE and its cookie must be VHD_JOURNAL_ENTRY_COOKIE.
+ * Returns 0 if valid, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry(vhd_journal_entry_t *entry)
+{
+       if (entry->size == 0 ||
+           (entry->size & (VHD_SECTOR_SIZE - 1)) ||
+           entry->cookie != VHD_JOURNAL_ENTRY_COOKIE)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Read one entry descriptor from the journal's current position,
+ * convert it to host byte order and validate it.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+       int err = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t));
+
+       if (!err) {
+               vhd_journal_entry_in(entry);
+               err = vhd_journal_validate_entry(entry);
+       }
+
+       return err;
+}
+
+/*
+ * Validate @entry and write it at the journal's current position.
+ * The conversion to on-disk byte order is done on a private copy, so
+ * @entry itself stays in host order.
+ * Returns 0 on success or a negative errno value, including any error
+ * reported by the write itself.
+ */
+static int
+vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+       int err;
+       vhd_journal_entry_t e;
+
+       err = vhd_journal_validate_entry(entry);
+       if (err)
+               return err;
+
+       memcpy(&e, entry, sizeof(vhd_journal_entry_t));
+       vhd_journal_entry_out(&e);
+
+       err = vhd_journal_write(j, &e, sizeof(vhd_journal_entry_t));
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/*
+ * Verify that @buf (entry->size bytes) matches the checksum stored in
+ * @entry.  Returns 0 on a match, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf)
+{
+       uint32_t expected = vhd_journal_checksum_entry(entry, buf, entry->size);
+
+       return (expected == entry->checksum ? 0 : -EINVAL);
+}
+
+/*
+ * Append one record to the journal: an entry descriptor followed by the
+ * @size bytes at @buf, which were copied from @offset in the VHD.
+ *
+ * On success the in-memory header is advanced (entry count, first-entry
+ * offset for the data or metadata class, journal_eof) and rewritten.
+ * On failure the header changes are rolled back and, unless the journal
+ * is a block device, the file is truncated back to the old journal_eof.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_update(vhd_journal_t *j, off64_t offset,
+                  char *buf, size_t size, uint32_t type)
+{
+       int err;
+       uint64_t *off, off_bak;
+       uint32_t *entries;
+       vhd_journal_entry_t entry;
+
+       /*
+        * Zero the whole descriptor first: the checksum and the on-disk
+        * image cover every byte of the struct, including any padding.
+        */
+       memset(&entry, 0, sizeof(entry));
+
+       entry.type     = type;
+       entry.size     = size;
+       entry.offset   = offset;
+       entry.cookie   = VHD_JOURNAL_ENTRY_COOKIE;
+       entry.checksum = vhd_journal_checksum_entry(&entry, buf, size);
+
+       err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+       if (err)
+               return err;
+
+       err = vhd_journal_write_entry(j, &entry);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_write(j, buf, size);
+       if (err)
+               goto fail;
+
+       /* data and metadata records are tracked separately in the header */
+       if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) {
+               off     = &j->header.journal_data_offset;
+               entries = &j->header.journal_data_entries;
+       } else {
+               off     = &j->header.journal_metadata_offset;
+               entries = &j->header.journal_metadata_entries;
+       }
+
+       /* the first record of a class fixes that class's start offset */
+       off_bak = *off;
+       if (!(*entries)++)
+               *off = j->header.journal_eof;
+       j->header.journal_eof += (size + sizeof(vhd_journal_entry_t));
+
+       err = vhd_journal_write_header(j, &j->header);
+       if (err) {
+               /* undo the in-memory header changes */
+               if (!--(*entries))
+                       *off = off_bak;
+               j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t));
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       if (!j->is_block)
+               vhd_journal_truncate(j, j->header.journal_eof);
+       return err;
+}
+
+/*
+ * Journal the VHD footer found at the end of the file and, for dynamic
+ * VHDs, the footer copy at offset 0 as well.  Footers are stored in
+ * their on-disk byte order.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_footer(vhd_journal_t *j)
+{
+       int err;
+       off64_t end, footer_off;
+       vhd_footer_t footer;
+       vhd_context_t *vhd = &j->vhd;
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               return err;
+
+       end = vhd_position(vhd);
+       if (end == (off64_t)-1)
+               return -errno;
+
+       /* primary footer: the last sizeof(vhd_footer_t) bytes of the file */
+       footer_off = end - sizeof(vhd_footer_t);
+
+       err = vhd_read_footer_at(vhd, &footer, footer_off);
+       if (err)
+               return err;
+
+       vhd_footer_out(&footer);
+       err = vhd_journal_update(j, footer_off, (char *)&footer,
+                                sizeof(vhd_footer_t),
+                                VHD_JOURNAL_ENTRY_TYPE_FOOTER_P);
+       if (err)
+               return err;
+
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       /* footer copy at the start of the file */
+       err = vhd_read_footer_at(vhd, &footer, 0);
+       if (err)
+               return err;
+
+       vhd_footer_out(&footer);
+       return vhd_journal_update(j, 0, (char *)&footer,
+                                 sizeof(vhd_footer_t),
+                                 VHD_JOURNAL_ENTRY_TYPE_FOOTER_C);
+}
+
+/*
+ * Journal the VHD header, stored in its on-disk byte order, at the
+ * offset named by the footer's data_offset field.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_header(vhd_journal_t *j)
+{
+       int err;
+       vhd_header_t header;
+       vhd_context_t *vhd = &j->vhd;
+
+       err = vhd_read_header(vhd, &header);
+       if (err)
+               return err;
+
+       vhd_header_out(&header);
+
+       return vhd_journal_update(j, vhd->footer.data_offset,
+                                 (char *)&header, sizeof(vhd_header_t),
+                                 VHD_JOURNAL_ENTRY_TYPE_HEADER);
+}
+
+/*
+ * Journal the data of every parent locator in the VHD header whose
+ * platform code is not PLAT_CODE_NONE.
+ * Stops at the first failure.  Returns 0 on success or a negative errno
+ * value.
+ */
+static int
+vhd_journal_add_locators(vhd_journal_t *j)
+{
+       int idx, count, err;
+       vhd_context_t *vhd = &j->vhd;
+
+       err = vhd_get_header(vhd);
+       if (err)
+               return err;
+
+       count = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+       for (idx = 0; idx < count; idx++) {
+               char *data;
+               off64_t where;
+               size_t len;
+               vhd_parent_locator_t *loc = &vhd->header.loc[idx];
+
+               err = vhd_validate_platform_code(loc->code);
+               if (err)
+                       return err;
+
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               where = loc->data_offset;
+               len   = vhd_parent_locator_size(loc);
+
+               /* posix_memalign() reports failure as a positive errno */
+               err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, len);
+               if (err)
+                       return -err;
+
+               err = vhd_seek(vhd, where, SEEK_SET);
+               if (!err)
+                       err = vhd_read(vhd, data, len);
+               if (!err)
+                       err = vhd_journal_update(j, where, data, len,
+                                                VHD_JOURNAL_ENTRY_TYPE_LOCATOR);
+
+               free(data);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Journal the block allocation table, stored in its on-disk byte order,
+ * at the header's table_offset.  The journalled size is the table's
+ * byte size as rounded by vhd_bytes_padded().
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_bat(vhd_journal_t *j)
+{
+       int err;
+       size_t len;
+       off64_t where;
+       vhd_bat_t bat;
+       vhd_context_t *vhd = &j->vhd;
+
+       err = vhd_get_header(vhd);
+       if (err)
+               return err;
+
+       err = vhd_read_bat(vhd, &bat);
+       if (err)
+               return err;
+
+       /* sizes are taken before the table is converted for output */
+       where = vhd->header.table_offset;
+       len   = vhd_bytes_padded(bat.entries * sizeof(uint32_t));
+
+       vhd_bat_out(&bat);
+       err = vhd_journal_update(j, where, (char *)bat.bat, len,
+                                VHD_JOURNAL_ENTRY_TYPE_BAT);
+
+       free(bat.bat);
+       return err;
+}
+
+/*
+ * Journal the batmap as two records: its header (in on-disk byte order)
+ * and then the map itself, at the offset and size the header gives.
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_journal_add_batmap(vhd_journal_t *j)
+{
+       int err;
+       size_t len;
+       off64_t where;
+       vhd_batmap_t batmap;
+       vhd_context_t *vhd = &j->vhd;
+
+       err = vhd_batmap_header_offset(vhd, &where);
+       if (err)
+               return err;
+
+       err = vhd_read_batmap(vhd, &batmap);
+       if (err)
+               return err;
+
+       len = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+       vhd_batmap_header_out(&batmap);
+       err = vhd_journal_update(j, where, (char *)&batmap.header, len,
+                                VHD_JOURNAL_ENTRY_TYPE_BATMAP_H);
+       if (!err) {
+               /* back to host order before reading the map's location */
+               vhd_batmap_header_in(&batmap);
+               where = batmap.header.batmap_offset;
+               len   = vhd_sectors_to_bytes(batmap.header.batmap_size);
+
+               err = vhd_journal_update(j, where, batmap.map, len,
+                                        VHD_JOURNAL_ENTRY_TYPE_BATMAP_M);
+       }
+
+       free(batmap.map);
+       return err;
+}
+
+/*
+ * Journal every metadata structure of the VHD.
+ *
+ * The footer is always recorded.  For dynamic VHDs the header, parent
+ * locators, BAT and (when present) batmap follow, after which
+ * journal_data_offset is set to the current journal_eof and the journal
+ * header is rewritten.  Returns 0 on success or the first error.
+ */
+static int
+vhd_journal_add_metadata(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       int rc;
+
+       rc = vhd_journal_add_footer(j);
+       if (rc || !vhd_type_dynamic(vhd))
+               return rc;
+
+       if ((rc = vhd_journal_add_header(j)) != 0 ||
+           (rc = vhd_journal_add_locators(j)) != 0 ||
+           (rc = vhd_journal_add_bat(j)) != 0)
+               return rc;
+
+       if (vhd_has_batmap(vhd)) {
+               rc = vhd_journal_add_batmap(j);
+               if (rc)
+                       return rc;
+       }
+
+       /* data entries start where the metadata entries end */
+       j->header.journal_data_offset = j->header.journal_eof;
+       return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Read the next journal entry, which must be a footer of the given
+ * entry type, into @footer and validate it.
+ *
+ * Returns -EINVAL when the entry type or size does not match, the
+ * error from the journal read helpers, or the result of
+ * vhd_validate_footer().
+ */
+static int
+__vhd_journal_read_footer(vhd_journal_t *j,
+                         vhd_footer_t *footer, uint32_t type)
+{
+       vhd_journal_entry_t ent;
+       int rc;
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc)
+               return rc;
+
+       if (ent.type != type || ent.size != sizeof(vhd_footer_t))
+               return -EINVAL;
+
+       rc = vhd_journal_read(j, footer, ent.size);
+       if (rc)
+               return rc;
+
+       vhd_footer_in(footer);
+       return vhd_validate_footer(footer);
+}
+
+/* Read the next journal entry as a VHD_JOURNAL_ENTRY_TYPE_FOOTER_P footer. */
+static int
+vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       const uint32_t type = VHD_JOURNAL_ENTRY_TYPE_FOOTER_P;
+
+       return __vhd_journal_read_footer(j, footer, type);
+}
+
+/* Read the next journal entry as a VHD_JOURNAL_ENTRY_TYPE_FOOTER_C footer. */
+static int
+vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       const uint32_t type = VHD_JOURNAL_ENTRY_TYPE_FOOTER_C;
+
+       return __vhd_journal_read_footer(j, footer, type);
+}
+
+/*
+ * Read the next journal entry, which must be a VHD header, into @header
+ * and validate it.
+ *
+ * Returns -EINVAL on a type or size mismatch, the error from the journal
+ * read helpers, or the result of vhd_validate_header().
+ */
+static int
+vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header)
+{
+       vhd_journal_entry_t ent;
+       int rc;
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc)
+               return rc;
+
+       if (ent.type != VHD_JOURNAL_ENTRY_TYPE_HEADER ||
+           ent.size != sizeof(vhd_header_t))
+               return -EINVAL;
+
+       rc = vhd_journal_read(j, header, ent.size);
+       if (rc)
+               return rc;
+
+       vhd_header_in(header);
+       return vhd_validate_header(header);
+}
+
+/*
+ * Read consecutive VHD_JOURNAL_ENTRY_TYPE_LOCATOR entries from the journal.
+ *
+ * On success *locators is a calloc'ed array of sector-aligned buffers (one
+ * per locator entry, in journal order) and *locs is their count; the
+ * caller frees each buffer and the array.  Reading stops at the first
+ * entry of another type, and the journal position is rewound to the start
+ * of that entry.  On failure everything allocated here is released,
+ * *locators is NULL, *locs is 0 and a negative errno value is returned.
+ */
+static int
+vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs)
+{
+       int err, n, _locs;
+       char **_locators, *buf;
+       off64_t pos;
+       vhd_journal_entry_t entry;
+
+       _locs     = 0;
+       *locs     = 0;
+       *locators = NULL;
+
+       /* at most one journalled locator per header locator slot */
+       n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t);
+       _locators = calloc(n, sizeof(char *));
+       if (!_locators)
+               return -ENOMEM;
+
+       for (;;) {
+               /* buf is only non-NULL while it is not yet owned by _locators */
+               buf = NULL;
+
+               pos = vhd_journal_position(j);
+               if (pos == (off64_t)-1) {
+                       err = -errno;
+                       goto fail;
+               }
+
+               err = vhd_journal_read_entry(j, &entry);
+               if (err)
+                       goto fail;
+
+               if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) {
+                       /* not ours: rewind so the caller can re-read it */
+                       err = vhd_journal_seek(j, pos, SEEK_SET);
+                       if (err)
+                               goto fail;
+                       break;
+               }
+
+               if (_locs >= n) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+
+               /* posix_memalign reports failure as a positive error number */
+               err = posix_memalign((void **)&buf,
+                                    VHD_SECTOR_SIZE, entry.size);
+               if (err) {
+                       err = -err;
+                       buf = NULL;
+                       goto fail;
+               }
+
+               err = vhd_journal_read(j, buf, entry.size);
+               if (err)
+                       goto fail;
+
+               _locators[_locs++] = buf;
+       }
+
+       *locs     = _locs;
+       *locators = _locators;
+
+       return 0;
+
+fail:
+       /* release a buffer that was allocated but never stored */
+       free(buf);
+       for (n = 0; n < _locs; n++)
+               free(_locators[n]);
+       free(_locators);
+       return err;
+}
+
+/*
+ * Read the journalled block allocation table into @bat.
+ *
+ * The next journal entry must be of type VHD_JOURNAL_ENTRY_TYPE_BAT, have
+ * the padded size implied by header.max_bat_size and carry the header's
+ * table_offset; otherwise -EINVAL is returned.  On success bat->bat is a
+ * newly allocated sector-aligned buffer owned by the caller; on a read
+ * failure it is freed and reset to NULL.
+ */
+static int
+vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_journal_entry_t ent;
+       size_t size;
+       int rc;
+
+       size = vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t));
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc)
+               return rc;
+
+       if (ent.type != VHD_JOURNAL_ENTRY_TYPE_BAT ||
+           ent.size != size ||
+           ent.offset != vhd->header.table_offset)
+               return -EINVAL;
+
+       /* posix_memalign reports failure as a positive error number */
+       rc = posix_memalign((void **)&bat->bat, VHD_SECTOR_SIZE, size);
+       if (rc)
+               return -rc;
+
+       rc = vhd_journal_read(j, bat->bat, ent.size);
+       if (rc) {
+               free(bat->bat);
+               bat->bat = NULL;
+               return rc;
+       }
+
+       bat->spb     = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+       bat->entries = vhd->header.max_bat_size;
+       vhd_bat_in(bat);
+
+       return 0;
+}
+
+/*
+ * Read the journalled batmap header into batmap->header and validate it.
+ *
+ * The next journal entry must be of type VHD_JOURNAL_ENTRY_TYPE_BATMAP_H
+ * and have the padded size of struct dd_batmap_hdr, else -EINVAL is
+ * returned.  The entry is read through a temporary sector-aligned bounce
+ * buffer that is always released before returning.  Returns a negative
+ * errno value on failure, or the result of vhd_validate_batmap_header().
+ */
+static int
+vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int err;
+       char *buf;
+       size_t size;
+       vhd_journal_entry_t entry;
+
+       size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+       err  = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H)
+               return -EINVAL;
+
+       if (entry.size != size)
+               return -EINVAL;
+
+       /* posix_memalign returns a positive error number; negate it to
+        * match the negative-errno convention used by the siblings */
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err)
+               return -err;
+
+       err = vhd_journal_read(j, buf, entry.size);
+       if (err) {
+               free(buf);
+               return err;
+       }
+
+       memcpy(&batmap->header, buf, sizeof(batmap->header));
+
+       /* the bounce buffer is no longer needed once the header is copied */
+       free(buf);
+
+       vhd_batmap_header_in(batmap);
+       return vhd_validate_batmap_header(batmap);
+}
+
+/*
+ * Read the journalled batmap bitmap into batmap->map.
+ *
+ * batmap->header must already be populated: the next journal entry has
+ * to be of type VHD_JOURNAL_ENTRY_TYPE_BATMAP_M and match the header's
+ * batmap_size and batmap_offset, else -EINVAL is returned.  On success
+ * batmap->map is a newly allocated sector-aligned buffer owned by the
+ * caller; on a read failure it is freed and reset to NULL.
+ */
+static int
+vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       vhd_journal_entry_t ent;
+       int rc;
+
+       rc = vhd_journal_read_entry(j, &ent);
+       if (rc)
+               return rc;
+
+       if (ent.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M ||
+           ent.size != vhd_sectors_to_bytes(batmap->header.batmap_size) ||
+           ent.offset != batmap->header.batmap_offset)
+               return -EINVAL;
+
+       /* posix_memalign reports failure as a positive error number */
+       rc = posix_memalign((void **)&batmap->map,
+                           VHD_SECTOR_SIZE, ent.size);
+       if (rc)
+               return -rc;
+
+       rc = vhd_journal_read(j, batmap->map, ent.size);
+       if (!rc)
+               return 0;
+
+       free(batmap->map);
+       batmap->map = NULL;
+       return rc;
+}
+
+/*
+ * Read the journalled batmap (header entry followed by map entry) into
+ * @batmap and validate it.  If validation fails the map buffer is freed
+ * and batmap->map is reset to NULL.  Returns 0 or the first error.
+ */
+static int
+vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int rc;
+
+       rc = vhd_journal_read_batmap_header(j, batmap);
+       if (rc)
+               return rc;
+
+       rc = vhd_journal_read_batmap_map(j, batmap);
+       if (rc)
+               return rc;
+
+       rc = vhd_validate_batmap(batmap);
+       if (!rc)
+               return 0;
+
+       free(batmap->map);
+       batmap->map = NULL;
+       return rc;
+}
+
+/* Write @footer to the VHD at the footer offset recorded in the journal. */
+static int
+vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_footer_at(vhd, footer, j->header.vhd_footer_offset);
+}
+
+/* Write @footer to the VHD at offset 0 (the footer copy's location). */
+static int
+vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_footer_at(vhd, footer, 0);
+}
+
+/*
+ * Write @header to the VHD at the offset given by the in-memory footer's
+ * data_offset field.
+ */
+static int
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header)
+{
+       vhd_context_t *vhd = &j->vhd;
+       off64_t where = vhd->footer.data_offset;
+
+       return vhd_write_header_at(vhd, header, where);
+}
+
+/*
+ * Write journalled parent locator buffers back into the VHD.
+ *
+ * Header locator slots are walked in order; each slot whose code is not
+ * PLAT_CODE_NONE consumes the next buffer from @locators and is written
+ * at that slot's data_offset.  Stops once @locs buffers have been used.
+ * Returns 0 on success or the first seek/write error.
+ */
+static int
+vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_parent_locator_t *slot;
+       int slots, idx, used, rc;
+       size_t len;
+
+       used  = 0;
+       slots = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+       for (idx = 0; idx < slots && used < locs; idx++) {
+               slot = &vhd->header.loc[idx];
+               if (slot->code == PLAT_CODE_NONE)
+                       continue;
+
+               rc = vhd_seek(vhd, slot->data_offset, SEEK_SET);
+               if (rc)
+                       return rc;
+
+               len = vhd_parent_locator_size(slot);
+               rc  = vhd_write(vhd, locators[used++], len);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+/* Write @bat back to the journal's VHD via vhd_write_bat(). */
+static int
+vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_bat(vhd, bat);
+}
+
+/* Write @batmap back to the journal's VHD via vhd_write_batmap(). */
+static int
+vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_batmap(vhd, batmap);
+}
+
+/*
+ * Restore all VHD metadata from the journal.
+ *
+ * Phase 1 reads every metadata entry from the journal (footer; and for
+ * dynamic VHDs the footer copy, header, parent locators, BAT and optional
+ * batmap) into j->vhd and checks that the journal position then equals
+ * journal_data_offset.  Phase 2 writes those structures back to the VHD.
+ * Finally, for non-block-device VHDs, the file is truncated to end right
+ * after the restored footer.
+ *
+ * Returns 0 on success or a negative errno value.  Locator buffers read
+ * here are always released before returning; vhd->bat and vhd->batmap
+ * buffers are left in j->vhd for the caller to release.
+ */
+static int
+vhd_journal_restore_metadata(vhd_journal_t *j)
+{
+       off64_t off;
+       char **locators;
+       vhd_footer_t copy;
+       vhd_context_t *vhd;
+       int i, locs, hlocs, err;
+
+       vhd      = &j->vhd;
+       locs     = 0;
+       hlocs    = 0;
+       locators = NULL;
+
+       /* metadata entries start right after the journal header */
+       err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET);
+       if (err)
+               return err;
+
+       err  = vhd_journal_read_footer(j, &vhd->footer);
+       if (err)
+               return err;
+
+       if (!vhd_type_dynamic(vhd))
+               goto restore;
+
+       err  = vhd_journal_read_footer_copy(j, &copy);
+       if (err)
+               return err;
+
+       err  = vhd_journal_read_header(j, &vhd->header);
+       if (err)
+               return err;
+
+       /* count the locator slots that are in use */
+       for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) {
+               /* err is 0 here, so report the failure explicitly */
+               if (vhd_validate_platform_code(vhd->header.loc[i].code))
+                       return -EINVAL;
+
+               if (vhd->header.loc[i].code != PLAT_CODE_NONE)
+                       hlocs++;
+       }
+
+       if (hlocs) {
+               err  = vhd_journal_read_locators(j, &locators, &locs);
+               if (err)
+                       return err;
+
+               if (hlocs != locs) {
+                       err = -EINVAL;
+                       goto out;
+               }
+       }
+
+       err  = vhd_journal_read_bat(j, &vhd->bat);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(vhd)) {
+               err  = vhd_journal_read_batmap(j, &vhd->batmap);
+               if (err)
+                       goto out;
+       }
+
+restore:
+       /* leave through 'out' from here on so locator buffers are freed */
+       off  = vhd_journal_position(j);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* all metadata consumed: we must now be at the first data entry */
+       if (j->header.journal_data_offset != off) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err  = vhd_journal_restore_footer(j, &vhd->footer);
+       if (err)
+               goto out;
+
+       if (!vhd_type_dynamic(vhd))
+               goto out;
+
+       err  = vhd_journal_restore_footer_copy(j, &copy);
+       if (err)
+               goto out;
+
+       err  = vhd_journal_restore_header(j, &vhd->header);
+       if (err)
+               goto out;
+
+       if (locs) {
+               err = vhd_journal_restore_locators(j, locators, locs);
+               if (err)
+                       goto out;
+       }
+
+       err  = vhd_journal_restore_bat(j, &vhd->bat);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(vhd)) {
+               err  = vhd_journal_restore_batmap(j, &vhd->batmap);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+out:
+       if (locators) {
+               for (i = 0; i < locs; i++)
+                       free(locators[i]);
+               free(locators);
+       }
+
+       /* ftruncate() returns -1 and sets errno; convert to -errno */
+       if (!err && !vhd->is_block)
+               if (ftruncate(vhd->fd,
+                             j->header.vhd_footer_offset +
+                             sizeof(vhd_footer_t)))
+                       err = -errno;
+
+       return err;
+}
+
+/*
+ * Mark the VHD as disabled: overwrite the footer cookie with
+ * VHD_POISON_COOKIE, recompute the footer checksum and write the footer
+ * back.  Returns 0 on success or the error from the footer helpers.
+ */
+static int
+vhd_journal_disable_vhd(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_footer_t *ftr = &vhd->footer;
+       int rc;
+
+       rc = vhd_get_footer(vhd);
+       if (rc)
+               return rc;
+
+       memcpy(&ftr->cookie, VHD_POISON_COOKIE, sizeof(ftr->cookie));
+       ftr->checksum = vhd_checksum_footer(ftr);
+
+       rc = vhd_write_footer(vhd, ftr);
+       return rc ? rc : 0;
+}
+
+/*
+ * Undo vhd_journal_disable_vhd(): if the VHD is currently disabled,
+ * restore HD_COOKIE in the footer, recompute the checksum and write the
+ * footer back.  A VHD that is not disabled is left untouched.
+ * Returns 0 on success or the error from the footer helpers.
+ */
+static int
+vhd_journal_enable_vhd(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_footer_t *ftr = &vhd->footer;
+       int rc;
+
+       rc = vhd_get_footer(vhd);
+       if (rc)
+               return rc;
+
+       if (vhd_disabled(vhd)) {
+               memcpy(&ftr->cookie, HD_COOKIE, sizeof(ftr->cookie));
+               ftr->checksum = vhd_checksum_footer(ftr);
+
+               rc = vhd_write_footer(vhd, ftr);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+/*
+ * Release a journal context without removing the journal: close the
+ * journal descriptor, close the VHD and free the journal name.
+ * Always returns 0.
+ *
+ * NOTE(review): the descriptor test is "non-zero", not "!= -1", so a
+ * descriptor of -1 is still passed to close() and a descriptor of 0 is
+ * never closed -- confirm this is intended.
+ */
+int
+vhd_journal_close(vhd_journal_t *j)
+{
+       int fd = j->jfd;
+
+       if (fd != 0)
+               close(fd);
+
+       vhd_close(&j->vhd);
+       free(j->jname);
+
+       return 0;
+}
+
+/*
+ * Re-enable the VHD and dispose of the journal.
+ *
+ * If re-enabling the VHD fails its error is returned and nothing is
+ * released.  Otherwise the journal descriptor is closed, the journal
+ * file is unlinked unless it is a block device, the VHD is closed and
+ * the journal name is freed; 0 is returned.
+ */
+int
+vhd_journal_remove(vhd_journal_t *j)
+{
+       int rc;
+
+       rc = vhd_journal_enable_vhd(j);
+       if (rc)
+               return rc;
+
+       if (j->jfd != 0) {
+               close(j->jfd);
+               /* a block-device journal cannot be unlinked */
+               if (!j->is_block)
+                       unlink(j->jname);
+       }
+
+       vhd_close(&j->vhd);
+       free(j->jname);
+
+       return 0;
+}
+
+/*
+ * Open an existing journal @jfile for the VHD @file.
+ *
+ * The journal header is read and the journalled metadata is written
+ * back to the VHD through a raw O_DIRECT descriptor; the VHD is then
+ * reopened with vhd_open(), its BAT (and batmap, if any) loaded, and it
+ * is marked disabled.  On any failure the context is torn down with
+ * vhd_journal_close() and a negative errno value is returned.
+ */
+int
+vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile)
+{
+       vhd_context_t *vhd;
+       int rc;
+
+       memset(j, 0, sizeof(vhd_journal_t));
+
+       vhd    = &j->vhd;
+       j->jfd = -1;
+
+       j->jname = strdup(jfile);
+       if (!j->jname)
+               return -ENOMEM;
+
+       j->jfd = open(j->jname, O_LARGEFILE | O_RDWR);
+       if (j->jfd == -1) {
+               rc = -errno;
+               goto fail;
+       }
+
+       rc = vhd_test_file_fixed(j->jname, &j->is_block);
+       if (rc)
+               goto fail;
+
+       /* raw descriptor: metadata is restored before the VHD is parsed */
+       vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT);
+       if (vhd->fd == -1) {
+               rc = -errno;
+               goto fail;
+       }
+
+       rc = vhd_test_file_fixed(file, &vhd->is_block);
+       if (rc)
+               goto fail;
+
+       rc = vhd_journal_read_journal_header(j, &j->header);
+       if (rc)
+               goto fail;
+
+       rc = vhd_journal_restore_metadata(j);
+       if (rc)
+               goto fail;
+
+       /* drop the raw descriptor and the buffers filled by the restore */
+       close(vhd->fd);
+       free(vhd->bat.bat);
+       free(vhd->batmap.map);
+
+       rc = vhd_open(vhd, file, VHD_OPEN_RDWR);
+       if (rc)
+               goto fail;
+
+       rc = vhd_get_bat(vhd);
+       if (rc)
+               goto fail;
+
+       if (vhd_has_batmap(vhd)) {
+               rc = vhd_get_batmap(vhd);
+               if (rc)
+                       goto fail;
+       }
+
+       rc = vhd_journal_disable_vhd(j);
+       if (rc)
+               goto fail;
+
+       return 0;
+
+fail:
+       vhd_journal_close(j);
+       return rc;
+}
+
+/*
+ * Create a new journal @jfile for the VHD @file.
+ *
+ * An existing @jfile is only accepted when it is a block device
+ * (-EEXIST otherwise); a regular journal file is created fresh.  The VHD
+ * is opened strictly, its BAT and optional batmap are loaded, the
+ * journal header and all VHD metadata are written to the journal, the
+ * VHD is marked disabled and the journal is synced.
+ *
+ * Returns 0 on success or a negative errno value.  Failures before the
+ * VHD is open (fail1) close and unlink the journal and zero @j; later
+ * failures (fail2) go through vhd_journal_remove().
+ */
+int
+vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile)
+{
+       int err;
+
+       memset(j, 0, sizeof(vhd_journal_t));
+       j->jfd = -1;
+
+       j->jname = strdup(jfile);
+       if (j->jname == NULL) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
+       if (access(j->jname, F_OK) == 0) {
+               err = vhd_test_file_fixed(j->jname, &j->is_block);
+               if (err)
+                       goto fail1;
+
+               /* never clobber an existing regular file */
+               if (!j->is_block) {
+                       err = -EEXIST;
+                       goto fail1;
+               }
+       }
+
+       if (j->is_block)
+               j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644);
+       else
+               j->jfd = open(j->jname,
+                             O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644);
+       if (j->jfd == -1) {
+               err = -errno;
+               goto fail1;
+       }
+
+       err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT);
+       if (err)
+               goto fail1;
+
+       err = vhd_get_bat(&j->vhd);
+       if (err)
+               goto fail2;
+
+       if (vhd_has_batmap(&j->vhd)) {
+               err = vhd_get_batmap(&j->vhd);
+               if (err)
+                       goto fail2;
+       }
+
+       err = vhd_journal_add_journal_header(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_add_metadata(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_disable_vhd(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_sync(j);
+       if (err)
+               goto fail2;
+
+       return 0;
+
+fail1:
+       /* the VHD is not open yet: only journal resources to release */
+       if (j->jfd != -1) {
+               close(j->jfd);
+               if (!j->is_block)
+                       unlink(j->jname);
+       }
+       free(j->jname);
+       memset(j, 0, sizeof(vhd_journal_t));
+
+       return err;
+
+fail2:
+       vhd_journal_remove(j);
+       return err;
+}
+
+/*
+ * Journal the current contents of one allocated block of a dynamic VHD.
+ *
+ * @mode selects what is saved: VHD_JOURNAL_METADATA journals the
+ * block's bitmap, VHD_JOURNAL_DATA journals the block's data sectors
+ * (which follow the bitmap on disk).  Both are recorded as
+ * VHD_JOURNAL_ENTRY_TYPE_DATA entries.
+ *
+ * Returns -EINVAL for non-dynamic VHDs, -ERANGE when @block is outside
+ * the BAT, 0 without journalling anything when the block is unallocated,
+ * otherwise the first helper error or the result of vhd_journal_sync().
+ */
+int
+vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode)
+{
+       vhd_context_t *vhd = &j->vhd;
+       char *data = NULL;
+       uint64_t sector;
+       off64_t off;
+       size_t size;
+       int rc;
+
+       if (!vhd_type_dynamic(vhd))
+               return -EINVAL;
+
+       rc = vhd_get_bat(vhd);
+       if (rc)
+               return rc;
+
+       if (block >= vhd->bat.entries)
+               return -ERANGE;
+
+       sector = vhd->bat.bat[block];
+       if (sector == DD_BLK_UNUSED)
+               return 0;
+
+       off = vhd_sectors_to_bytes(sector);
+
+       if (mode & VHD_JOURNAL_METADATA) {
+               size = vhd_sectors_to_bytes(vhd->bm_secs);
+
+               rc = vhd_read_bitmap(vhd, block, &data);
+               if (rc)
+                       return rc;
+
+               rc = vhd_journal_update(j, off, data, size,
+                                       VHD_JOURNAL_ENTRY_TYPE_DATA);
+               free(data);
+               if (rc)
+                       return rc;
+       }
+
+       if (mode & VHD_JOURNAL_DATA) {
+               /* the data sectors sit right behind the block's bitmap */
+               off += vhd_sectors_to_bytes(vhd->bm_secs);
+               size = vhd_sectors_to_bytes(vhd->spb);
+
+               rc = vhd_read_block(vhd, block, &data);
+               if (rc)
+                       return rc;
+
+               rc = vhd_journal_update(j, off, data, size,
+                                       VHD_JOURNAL_ENTRY_TYPE_DATA);
+               free(data);
+               if (rc)
+                       return rc;
+       }
+
+       return vhd_journal_sync(j);
+}
+
+/*
+ * commit indicates the transaction completed
+ * successfully and we can remove the undo log
+ *
+ * The entry counts and offsets in the journal header are zeroed and the
+ * header rewritten; a journal that is not a block device is then
+ * truncated to just the header.  Returns 0 on success, the header write
+ * error, or -errno when the truncate fails.
+ */
+int
+vhd_journal_commit(vhd_journal_t *j)
+{
+       int rc;
+
+       j->header.journal_data_entries     = 0;
+       j->header.journal_metadata_entries = 0;
+       j->header.journal_data_offset      = 0;
+       j->header.journal_metadata_offset  = 0;
+
+       rc = vhd_journal_write_header(j, &j->header);
+       if (rc)
+               return rc;
+
+       /* a block device cannot be truncated; nothing more to do */
+       if (j->is_block)
+               return 0;
+
+       rc = vhd_journal_truncate(j, sizeof(vhd_journal_header_t));
+       if (rc)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * revert indicates the transaction failed
+ * and we should revert any changes via the undo log
+ *
+ * The VHD is closed and reopened as a raw O_DIRECT descriptor, the
+ * journalled metadata is restored, and the VHD is reopened through
+ * vhd_open().  Every journalled data entry is then validated and written
+ * back at its recorded offset.  Finally a non-block-device VHD is
+ * truncated to end right after its footer and the journal is synced.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_journal_revert(vhd_journal_t *j)
+{
+       int i, err;
+       char *buf, *file;
+       vhd_context_t *vhd;
+       vhd_journal_entry_t entry;
+
+       err  = 0;
+       vhd  = &j->vhd;
+       buf  = NULL;
+
+       /* vhd_close() is about to run; keep our own copy of the path */
+       file = strdup(vhd->file);
+       if (!file)
+               return -ENOMEM;
+
+       vhd_close(&j->vhd);
+       j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE);
+       if (j->vhd.fd == -1) {
+               /* capture errno before free() gets a chance to change it */
+               err = -errno;
+               free(file);
+               return err;
+       }
+
+       err = vhd_test_file_fixed(file, &vhd->is_block);
+       if (err) {
+               free(file);
+               return err;
+       }
+
+       err  = vhd_journal_restore_metadata(j);
+       if (err) {
+               free(file);
+               return err;
+       }
+
+       /* drop the raw descriptor and the buffers filled by the restore */
+       close(vhd->fd);
+       free(vhd->bat.bat);
+       free(vhd->batmap.map);
+
+       err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+       free(file);
+       if (err)
+               return err;
+
+       err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET);
+       if (err)
+               return err;
+
+       /* replay each saved data entry back to its original location */
+       for (i = 0; i < j->header.journal_data_entries; i++) {
+               err = vhd_journal_read_entry(j, &entry);
+               if (err)
+                       goto end;
+
+               /* posix_memalign reports failure as a positive number */
+               err = posix_memalign((void **)&buf,
+                                    VHD_SECTOR_SIZE, entry.size);
+               if (err) {
+                       err = -err;
+                       buf = NULL;
+                       goto end;
+               }
+
+               err = vhd_journal_read(j, buf, entry.size);
+               if (err)
+                       goto end;
+
+               err = vhd_journal_validate_entry_data(&entry, buf);
+               if (err)
+                       goto end;
+
+               err = vhd_seek(vhd, entry.offset, SEEK_SET);
+               if (err)
+                       goto end;
+
+               err = vhd_write(vhd, buf, entry.size);
+               if (err)
+                       goto end;
+
+               err = 0;
+
+       end:
+               free(buf);
+               buf = NULL;
+               if (err)
+                       break;
+       }
+
+       if (err)
+               return err;
+
+       if (!vhd->is_block) {
+               err = ftruncate(vhd->fd, j->header.vhd_footer_offset +
+                               sizeof(vhd_footer_t));
+               if (err)
+                       return -errno;
+       }
+
+       return vhd_journal_sync(j);
+}
diff --git a/tools/blktap2/vhd/lib/libvhd.c b/tools/blktap2/vhd/lib/libvhd.c
new file mode 100644 (file)
index 0000000..1af30ad
--- /dev/null
@@ -0,0 +1,3328 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <libgen.h>
+#include <iconv.h>
+#include <sys/mman.h>
+
+#include "libvhd.h"
+#include "relative-path.h"
+
+static int libvhd_dbg = 0;
+
+/*
+ * Enable libvhd debug logging when @level is non-zero.
+ * A zero @level leaves the current setting unchanged, so logging can be
+ * switched on through this function but not off.
+ */
+void
+libvhd_set_log_level(int level)
+{
+       if (!level)
+               return;
+
+       libvhd_dbg = 1;
+}
+
+#define VHDLOG(_f, _a...)                                              \
+       do {                                                            \
+               if (libvhd_dbg)                                         \
+                       syslog(LOG_INFO, "libvhd::%s: "_f,              \
+                              __func__, ##_a);                         \
+       } while (0)
+
+#define BIT_MASK 0x80
+
+#ifdef ENABLE_FAILURE_TESTING
+const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = {
+       "VHD_UTIL_TEST_FAIL_REPARENT_BEGIN",
+       "VHD_UTIL_TEST_FAIL_REPARENT_LOCATOR",
+       "VHD_UTIL_TEST_FAIL_REPARENT_END",
+       "VHD_UTIL_TEST_FAIL_RESIZE_BEGIN",
+       "VHD_UTIL_TEST_FAIL_RESIZE_DATA_MOVED",
+       "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED",
+       "VHD_UTIL_TEST_FAIL_RESIZE_END"
+};
+int TEST_FAIL[NUM_FAIL_TESTS];
+#endif // ENABLE_FAILURE_TESTING
+
+/*
+ * Return 1 if bit @nr of the bitmap at @addr is set, 0 otherwise.
+ * Bits are numbered from the most significant bit of each byte, the
+ * same numbering set_bit() and clear_bit() use.
+ *
+ * The byte is masked rather than shifted left, because left-shifting a
+ * negative promoted char is undefined behaviour.
+ */
+static inline int
+test_bit (volatile char *addr, int nr)
+{
+       return (addr[nr >> 3] & (BIT_MASK >> (nr & 7))) != 0;
+}
+
+/*
+ * Set bit @nr of the bitmap at @addr.  Bits are numbered from the most
+ * significant bit (BIT_MASK) of each byte.
+ */
+static inline void
+set_bit (volatile char *addr, int nr)
+{
+       volatile char *byte = addr + (nr >> 3);
+
+       *byte |= (BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Clear bit @nr of the bitmap at @addr.  Bits are numbered from the most
+ * significant bit (BIT_MASK) of each byte.
+ */
+static inline void
+clear_bit (volatile char *addr, int nr)
+{
+       volatile char *byte = addr + (nr >> 3);
+
+       *byte &= ~(BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Return bit @nr of the bitmap at @addr using the "old" layout: the
+ * bitmap is treated as an array of 32-bit words, with bits numbered
+ * from the least significant bit of each word.
+ */
+static inline int
+old_test_bit(volatile char *addr, int nr)
+{
+       uint32_t word = ((uint32_t *)addr)[nr >> 5];
+
+       return (word >> (nr & 31)) & 1;
+}
+
+/*
+ * Set bit @nr of the bitmap at @addr using the "old" layout: an array
+ * of 32-bit words with bits numbered from the least significant bit.
+ *
+ * The mask is built from an unsigned 1 so that bit 31 does not overflow
+ * a signed int.
+ */
+static inline void
+old_set_bit(volatile char *addr, int nr)
+{
+       ((uint32_t *)addr)[nr >> 5] |= (1U << (nr & 31));
+}
+
+/*
+ * Clear bit @nr of the bitmap at @addr using the "old" layout: an array
+ * of 32-bit words with bits numbered from the least significant bit.
+ *
+ * The mask is built from an unsigned 1 so that bit 31 does not overflow
+ * a signed int.
+ */
+static inline void
+old_clear_bit(volatile char *addr, int nr)
+{
+       ((uint32_t *)addr)[nr >> 5] &= ~(1U << (nr & 31));
+}
+
+/*
+ * Apply BE32_IN/BE64_IN in place to every multi-byte integer field of
+ * @footer (judging by the macro names, big-endian to host byte order).
+ */
+void
+vhd_footer_in(vhd_footer_t *footer)
+{
+       /* 64-bit fields */
+       BE64_IN(&footer->data_offset);
+       BE64_IN(&footer->orig_size);
+       BE64_IN(&footer->curr_size);
+
+       /* 32-bit fields */
+       BE32_IN(&footer->features);
+       BE32_IN(&footer->ff_version);
+       BE32_IN(&footer->timestamp);
+       BE32_IN(&footer->crtr_ver);
+       BE32_IN(&footer->crtr_os);
+       BE32_IN(&footer->geometry);
+       BE32_IN(&footer->type);
+       BE32_IN(&footer->checksum);
+}
+
+/*
+ * Apply BE32_OUT/BE64_OUT in place to every multi-byte integer field of
+ * @footer (judging by the macro names, host to big-endian byte order).
+ */
+void
+vhd_footer_out(vhd_footer_t *footer)
+{
+       /* 64-bit fields */
+       BE64_OUT(&footer->data_offset);
+       BE64_OUT(&footer->orig_size);
+       BE64_OUT(&footer->curr_size);
+
+       /* 32-bit fields */
+       BE32_OUT(&footer->features);
+       BE32_OUT(&footer->ff_version);
+       BE32_OUT(&footer->timestamp);
+       BE32_OUT(&footer->crtr_ver);
+       BE32_OUT(&footer->crtr_os);
+       BE32_OUT(&footer->geometry);
+       BE32_OUT(&footer->type);
+       BE32_OUT(&footer->checksum);
+}
+
+/*
+ * Apply BE32_IN/BE64_IN in place to the integer fields of @header and
+ * of every parent locator slot it contains (judging by the macro names,
+ * big-endian to host byte order).
+ */
+void
+vhd_header_in(vhd_header_t *header)
+{
+       vhd_parent_locator_t *loc, *end;
+
+       BE64_IN(&header->data_offset);
+       BE64_IN(&header->table_offset);
+       BE32_IN(&header->hdr_ver);
+       BE32_IN(&header->max_bat_size);
+       BE32_IN(&header->block_size);
+       BE32_IN(&header->checksum);
+       BE32_IN(&header->prt_ts);
+
+       end = header->loc +
+             sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+       for (loc = header->loc; loc < end; loc++) {
+               BE32_IN(&loc->code);
+               BE32_IN(&loc->data_space);
+               BE32_IN(&loc->data_len);
+               BE64_IN(&loc->data_offset);
+       }
+}
+
+/*
+ * Apply BE32_OUT/BE64_OUT in place to the integer fields of @header and
+ * of every parent locator slot it contains (judging by the macro names,
+ * host to big-endian byte order).
+ */
+void
+vhd_header_out(vhd_header_t *header)
+{
+       vhd_parent_locator_t *loc, *end;
+
+       BE64_OUT(&header->data_offset);
+       BE64_OUT(&header->table_offset);
+       BE32_OUT(&header->hdr_ver);
+       BE32_OUT(&header->max_bat_size);
+       BE32_OUT(&header->block_size);
+       BE32_OUT(&header->checksum);
+       BE32_OUT(&header->prt_ts);
+
+       end = header->loc +
+             sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+
+       for (loc = header->loc; loc < end; loc++) {
+               BE32_OUT(&loc->code);
+               BE32_OUT(&loc->data_space);
+               BE32_OUT(&loc->data_len);
+               BE64_OUT(&loc->data_offset);
+       }
+}
+
+/*
+ * Apply BE32_IN/BE64_IN in place to the integer fields of
+ * batmap->header (judging by the macro names, big-endian to host order).
+ */
+void
+vhd_batmap_header_in(vhd_batmap_t *batmap)
+{
+       BE32_IN(&batmap->header.batmap_size);
+       BE32_IN(&batmap->header.batmap_version);
+       BE32_IN(&batmap->header.checksum);
+       BE64_IN(&batmap->header.batmap_offset);
+}
+
+/*
+ * Apply BE32_OUT/BE64_OUT in place to the integer fields of
+ * batmap->header (judging by the macro names, host to big-endian order).
+ */
+void
+vhd_batmap_header_out(vhd_batmap_t *batmap)
+{
+       BE32_OUT(&batmap->header.batmap_size);
+       BE32_OUT(&batmap->header.batmap_version);
+       BE32_OUT(&batmap->header.checksum);
+       BE64_OUT(&batmap->header.batmap_offset);
+}
+
+/* Apply BE32_IN in place to each of the bat->entries entries of @bat. */
+void
+vhd_bat_in(vhd_bat_t *bat)
+{
+       int idx = 0;
+
+       while (idx < bat->entries) {
+               BE32_IN(&bat->bat[idx]);
+               idx++;
+       }
+}
+
+/* Apply BE32_OUT in place to each of the bat->entries entries of @bat. */
+void
+vhd_bat_out(vhd_bat_t *bat)
+{
+       int idx = 0;
+
+       while (idx < bat->entries) {
+               BE32_OUT(&bat->bat[idx]);
+               idx++;
+       }
+}
+
+/*
+ * Compute the checksum of @footer: the one's complement of the sum of
+ * all its bytes, taken with the checksum field treated as zero.  The
+ * stored checksum field is restored before returning.
+ */
+uint32_t
+vhd_checksum_footer(vhd_footer_t *footer)
+{
+       const unsigned char *p, *end;
+       uint32_t sum, saved;
+
+       /* the checksum field itself must not contribute to the sum */
+       saved            = footer->checksum;
+       footer->checksum = 0;
+
+       sum = 0;
+       p   = (const unsigned char *)footer;
+       end = p + sizeof(vhd_footer_t);
+       while (p < end)
+               sum += (uint32_t)*p++;
+
+       footer->checksum = saved;
+       return ~sum;
+}
+
+/*
+ * Validate the cookie and checksum of @footer.
+ *
+ * The cookie must be HD_COOKIE or VHD_POISON_COOKIE.  A checksum
+ * mismatch is tolerated for hidden VHDs whose creator application starts
+ * with "tap" and whose creator version is 0.1 or 1.1, provided the
+ * checksum matches when the hidden field is treated as zero.
+ * Returns 0 when valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_footer(vhd_footer_t *footer)
+{
+       int csize, known;
+       uint32_t checksum;
+
+       csize = sizeof(footer->cookie);
+       known = !memcmp(footer->cookie, HD_COOKIE, csize) ||
+               !memcmp(footer->cookie, VHD_POISON_COOKIE, csize);
+       if (!known) {
+               char buf[9];
+               memcpy(buf, footer->cookie, 8);
+               buf[8]= '\0';
+               VHDLOG("invalid footer cookie: %s\n", buf);
+               return -EINVAL;
+       }
+
+       checksum = vhd_checksum_footer(footer);
+       if (checksum == footer->checksum)
+               return 0;
+
+       /*
+        * early td-util did not re-calculate
+        * checksum when marking vhds 'hidden'
+        */
+       if (footer->hidden &&
+           !strncmp(footer->crtr_app, "tap", 3) &&
+           (footer->crtr_ver == VHD_VERSION(0, 1) ||
+            footer->crtr_ver == VHD_VERSION(1, 1))) {
+               char saved = footer->hidden;
+
+               /* retry with the hidden flag cleared, then put it back */
+               footer->hidden = 0;
+               checksum = vhd_checksum_footer(footer);
+               footer->hidden = saved;
+
+               if (checksum == footer->checksum)
+                       return 0;
+       }
+
+       VHDLOG("invalid footer checksum: "
+              "footer = 0x%08x, calculated = 0x%08x\n",
+              footer->checksum, checksum);
+       return -EINVAL;
+}
+
+/*
+ * Compute the checksum of @header: the one's complement of the sum of
+ * all its bytes, taken with the checksum field treated as zero.  The
+ * stored checksum field is restored before returning.
+ */
+uint32_t
+vhd_checksum_header(vhd_header_t *header)
+{
+       const unsigned char *p, *end;
+       uint32_t sum, saved;
+
+       /* the checksum field itself must not contribute to the sum */
+       saved            = header->checksum;
+       header->checksum = 0;
+
+       sum = 0;
+       p   = (const unsigned char *)header;
+       end = p + sizeof(vhd_header_t);
+       while (p < end)
+               sum += (uint32_t)*p++;
+
+       header->checksum = saved;
+       return ~sum;
+}
+
+/*
+ * Sanity-check an in-memory dynamic-disk header: cookie, header version,
+ * data_offset, every parent locator platform code and the checksum.
+ * Returns 0 if all checks pass, -EINVAL at the first failing check.
+ */
+int
+vhd_validate_header(vhd_header_t *header)
+{
+       int idx, nloc;
+       uint32_t computed;
+
+       if (memcmp(header->cookie, DD_COOKIE, 8) != 0) {
+               char cookie[9];
+
+               /* cookie is not NUL-terminated; copy it out for logging */
+               memcpy(cookie, header->cookie, 8);
+               cookie[8] = '\0';
+               VHDLOG("invalid header cookie: %s\n", cookie);
+               return -EINVAL;
+       }
+
+       if (header->hdr_ver != 0x00010000) {
+               VHDLOG("invalid header version 0x%08x\n", header->hdr_ver);
+               return -EINVAL;
+       }
+
+       if (header->data_offset != 0xFFFFFFFFFFFFFFFF) {
+               VHDLOG("invalid header data_offset 0x%016"PRIx64"\n",
+                      header->data_offset);
+               return -EINVAL;
+       }
+
+       nloc = sizeof(header->loc) / sizeof(vhd_parent_locator_t);
+       for (idx = 0; idx < nloc; idx++) {
+               if (vhd_validate_platform_code(header->loc[idx].code) != 0)
+                       return -EINVAL;
+       }
+
+       computed = vhd_checksum_header(header);
+       if (computed == header->checksum)
+               return 0;
+
+       VHDLOG("invalid header checksum: "
+              "header = 0x%08x, calculated = 0x%08x\n",
+              header->checksum, computed);
+       return -EINVAL;
+}
+
+/*
+ * A BAT is considered valid as soon as its table pointer is non-NULL.
+ * Returns 0 when loaded, -EINVAL otherwise.
+ */
+static inline int
+vhd_validate_bat(vhd_bat_t *bat)
+{
+       return bat->bat ? 0 : -EINVAL;
+}
+
+/*
+ * Compute the batmap checksum: the one's complement of the byte sum of
+ * the map, whose length is batmap_size sectors converted to bytes.
+ *
+ * Batmaps of version 1.1 are summed through plain 'char' (no cast to
+ * unsigned char), every other version through 'unsigned char'; the two
+ * forms are kept apart so each version keeps its original arithmetic.
+ */
+uint32_t
+vhd_checksum_batmap(vhd_batmap_t *batmap)
+{
+       int pos, nbytes, plain_char;
+       char *map;
+       uint32_t sum;
+
+       sum        = 0;
+       map        = batmap->map;
+       nbytes     = vhd_sectors_to_bytes(batmap->header.batmap_size);
+       plain_char = (batmap->header.batmap_version ==
+                     VHD_BATMAP_VERSION(1, 1));
+
+       for (pos = 0; pos < nbytes; pos++)
+               sum += plain_char ? (uint32_t)map[pos]
+                                 : (uint32_t)(unsigned char)map[pos];
+
+       return ~sum;
+}
+
+/*
+ * Check the batmap header cookie and make sure its version is not newer
+ * than VHD_BATMAP_CURRENT_VERSION.  Returns 0 if valid, -EINVAL if not.
+ */
+int
+vhd_validate_batmap_header(vhd_batmap_t *batmap)
+{
+       if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, 8) != 0 ||
+           batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Validate a loaded batmap: the map must be present and its checksum
+ * must equal the one recorded in the batmap header.
+ * Returns 0 if valid, -EINVAL otherwise.
+ */
+int
+vhd_validate_batmap(vhd_batmap_t *batmap)
+{
+       if (batmap->map == NULL)
+               return -EINVAL;
+
+       return (vhd_checksum_batmap(batmap) == batmap->header.checksum)
+               ? 0 : -EINVAL;
+}
+
+/*
+ * Compute the file offset of the batmap header: it starts right after
+ * the BAT, i.e. at table_offset plus the padded size of max_bat_size
+ * 32-bit entries.  Stores the offset in *_off and always returns 0.
+ */
+int
+vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *_off)
+{
+       size_t  bat_bytes;
+       off64_t pos;
+
+       *_off = 0;
+
+       bat_bytes = ctx->header.max_bat_size * sizeof(uint32_t);
+       pos       = ctx->header.table_offset;
+       pos      += vhd_bytes_padded(bat_bytes);
+
+       *_off = pos;
+       return 0;
+}
+
+/*
+ * Check that a parent locator platform code is one of the known
+ * PLAT_CODE_* values.  Returns 0 if so; otherwise logs the code and
+ * returns -EINVAL.
+ */
+int
+vhd_validate_platform_code(uint32_t code)
+{
+       switch (code) {
+       case PLAT_CODE_NONE:
+       case PLAT_CODE_WI2R:
+       case PLAT_CODE_WI2K:
+       case PLAT_CODE_W2RU:
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_MAC:
+       case PLAT_CODE_MACX:
+               return 0;
+       default:
+               break;
+       }
+
+       VHDLOG("invalid parent locator code %u\n", code);
+       return -EINVAL;
+}
+
+/*
+ * Number of parent locator slots in the header's locator array.
+ */
+int
+vhd_parent_locator_count(vhd_context_t *ctx)
+{
+       return (sizeof(ctx->header.loc) / sizeof(ctx->header.loc[0]));
+}
+
+/*
+ * Report the 'hidden' flag of an image through *hidden.
+ *
+ * For dynamic images created by tapdisk version 0.1 or 1.1 the flag is
+ * taken from the footer copy read at offset 0 of the file; for all other
+ * images it is taken from the footer already held in ctx.
+ * Returns 0 on success or the error from vhd_read_footer_at().
+ */
+int
+vhd_hidden(vhd_context_t *ctx, int *hidden)
+{
+       int err;
+       vhd_footer_t copy;
+
+       *hidden = 0;
+
+       if (!(vhd_type_dynamic(ctx) && vhd_creator_tapdisk(ctx) &&
+             (ctx->footer.crtr_ver == VHD_VERSION(0, 1) ||
+              ctx->footer.crtr_ver == VHD_VERSION(1, 1)))) {
+               *hidden = ctx->footer.hidden;
+               return 0;
+       }
+
+       err = vhd_read_footer_at(ctx, &copy, 0);
+       if (err) {
+               VHDLOG("error reading backup footer of %s: %d\n",
+                      ctx->file, err);
+               return err;
+       }
+
+       *hidden = copy.hidden;
+       return 0;
+}
+
+/*
+ * Count the images in the chain that starts at ctx and follows parent
+ * locators through successive differencing (HD_TYPE_DIFF) images.
+ *
+ * On success returns 0 and stores the count in *depth; a parent for which
+ * vhd_parent_raw() is true is counted as one extra link and ends the walk.
+ * On failure returns the error from vhd_parent_locator_get() or
+ * vhd_open() and leaves *depth at 0.
+ *
+ * The caller's ctx is never closed here; only images opened while walking
+ * the chain are.
+ */
+int
+vhd_chain_depth(vhd_context_t *ctx, int *depth)
+{
+       char *file;
+       int err, cnt;
+       vhd_context_t vhd, *cur;
+
+       err    = 0;
+       cnt    = 0;
+       *depth = 0;
+       file   = NULL;
+       cur    = ctx;
+
+       for (;;) {
+               /* count the image we are currently looking at */
+               cnt++;
+
+               /* anything that is not a differencing image ends the chain */
+               if (cur->footer.type != HD_TYPE_DIFF)
+                       break;
+
+               /* a raw parent cannot be opened as a VHD: count it and stop */
+               if (vhd_parent_raw(cur)) {
+                       cnt++;
+                       break;
+               }
+
+               /* the parent path must be resolved before cur is closed */
+               err = vhd_parent_locator_get(cur, &file);
+               if (err) {
+                       file = NULL;
+                       break;
+               }
+
+               /* 'vhd' is reused for every link, so release the previous one */
+               if (cur != ctx) {
+                       vhd_close(cur);
+                       cur = NULL;
+               }
+
+               /* on failure cur stays NULL, so the cleanup below skips it */
+               err = vhd_open(&vhd, file, VHD_OPEN_RDONLY);
+               if (err)
+                       break;
+
+               cur = &vhd;
+               free(file);
+               file = NULL;
+       }
+
+       /* file is non-NULL only when vhd_open() failed */
+       free(file);
+       if (cur && cur != ctx)
+               vhd_close(cur);
+
+       if (!err)
+               *depth = cnt;
+
+       return err;
+}
+
+/*
+ * Return the batmap bit for @block.
+ * Yields 0 when the image has no batmap, when no map is loaded, or when
+ * @block is at or beyond batmap_size << (VHD_SECTOR_SHIFT + 3).
+ */
+int
+vhd_batmap_test(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+       if (!vhd_has_batmap(ctx) || !batmap->map ||
+           block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+               return 0;
+
+       return test_bit(batmap->map, block);
+}
+
+/*
+ * Set the batmap bit for @block.
+ * Does nothing when the image has no batmap, when no map is loaded, or
+ * when @block is at or beyond batmap_size << (VHD_SECTOR_SHIFT + 3).
+ */
+void
+vhd_batmap_set(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+       if (!vhd_has_batmap(ctx) || !batmap->map ||
+           block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+               return;
+
+       set_bit(batmap->map, block);
+}
+
+/*
+ * Clear the batmap bit for @block.
+ * Does nothing when the image has no batmap, when no map is loaded, or
+ * when @block is at or beyond batmap_size << (VHD_SECTOR_SHIFT + 3).
+ */
+void
+vhd_batmap_clear(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block)
+{
+       if (!vhd_has_batmap(ctx) || !batmap->map ||
+           block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3)))
+               return;
+
+       clear_bit(batmap->map, block);
+}
+
+/*
+ * Test bit @block of a block bitmap.
+ * Images created by tapdisk with creator version 0x00000001 go through
+ * old_test_bit(); every other image uses test_bit().
+ */
+int
+vhd_bitmap_test(vhd_context_t *ctx, char *map, uint32_t block)
+{
+       int legacy;
+
+       legacy = (vhd_creator_tapdisk(ctx) &&
+                 ctx->footer.crtr_ver == 0x00000001);
+
+       return legacy ? old_test_bit(map, block) : test_bit(map, block);
+}
+
+/*
+ * Set bit @block of a block bitmap.
+ * Images created by tapdisk with creator version 0x00000001 go through
+ * old_set_bit(); every other image uses set_bit().
+ *
+ * The helpers are called as plain statements: a 'return <expr>;' is not
+ * permitted by ISO C in a function returning void.
+ */
+void
+vhd_bitmap_set(vhd_context_t *ctx, char *map, uint32_t block)
+{
+       if (vhd_creator_tapdisk(ctx) &&
+           ctx->footer.crtr_ver == 0x00000001)
+               old_set_bit(map, block);
+       else
+               set_bit(map, block);
+}
+
+/*
+ * Clear bit @block of a block bitmap.
+ * Images created by tapdisk with creator version 0x00000001 go through
+ * old_clear_bit(); every other image uses clear_bit().
+ *
+ * The helpers are called as plain statements: a 'return <expr>;' is not
+ * permitted by ISO C in a function returning void.
+ */
+void
+vhd_bitmap_clear(vhd_context_t *ctx, char *map, uint32_t block)
+{
+       if (vhd_creator_tapdisk(ctx) &&
+           ctx->footer.crtr_ver == 0x00000001)
+               old_clear_bit(map, block);
+       else
+               clear_bit(map, block);
+}
+
+/*
+ * Returns, through *end, the absolute offset of the first byte of the
+ * file which is not vhd metadata.
+ *
+ * For non-dynamic images *end is set to 0.  Otherwise the result is the
+ * largest end offset among: the dynamic header, the BAT, the batmap
+ * header and batmap (when the image has a batmap), and every parent
+ * locator whose code is not PLAT_CODE_NONE.
+ *
+ * Returns 0 on success or the error from vhd_get_batmap() /
+ * vhd_batmap_header_offset(); *end is left at 0 on error.
+ */
+int
+vhd_end_of_headers(vhd_context_t *ctx, off64_t *end)
+{
+       int err, i, n;
+       uint32_t bat_bytes;
+       off64_t eom, bat_end;
+       vhd_parent_locator_t *loc;
+
+       *end = 0;
+
+       if (!vhd_type_dynamic(ctx))
+               return 0;
+
+       /* end of the dynamic header, which lives at footer.data_offset */
+       eom       = ctx->footer.data_offset + sizeof(vhd_header_t);
+
+       /* end of the BAT: max_bat_size 32-bit entries, padded */
+       bat_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+       bat_end   = ctx->header.table_offset + bat_bytes;
+
+       eom       = MAX(eom, bat_end);
+
+       if (vhd_has_batmap(ctx)) {
+               off64_t hdr_end, hdr_secs, map_end, map_secs;
+
+               /* make sure ctx->batmap.header is populated */
+               err = vhd_get_batmap(ctx);
+               if (err)
+                       return err;
+
+               /* batmap header: offset plus its size in whole sectors */
+               hdr_secs = secs_round_up_no_zero(sizeof(vhd_batmap_header_t));
+               err      = vhd_batmap_header_offset(ctx, &hdr_end);
+               if (err)
+                       return err;
+
+               hdr_end += vhd_sectors_to_bytes(hdr_secs);
+               eom      = MAX(eom, hdr_end);
+
+               /* the map itself: batmap_size is converted from sectors */
+               map_secs = ctx->batmap.header.batmap_size;
+               map_end  = (ctx->batmap.header.batmap_offset +
+                           vhd_sectors_to_bytes(map_secs));
+               eom      = MAX(eom, map_end);
+       }
+
+       /* parent locators */
+       n = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
+
+       for (i = 0; i < n; i++) {
+               off64_t loc_end;
+
+               /* unused locator slots occupy no space in the file */
+               loc = &ctx->header.loc[i];
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               loc_end = loc->data_offset + vhd_parent_locator_size(loc);
+               eom     = MAX(eom, loc_end);
+       }
+
+       *end = eom;
+       return 0;
+}
+
+/*
+ * Returns, through *end, the byte offset at which the image's data ends.
+ *
+ * Non-dynamic images: the current file size minus sizeof(vhd_footer_t).
+ * Dynamic images: the larger of the end of the metadata (as reported by
+ * vhd_end_of_headers()) and the end of the highest block referenced by
+ * the BAT.
+ *
+ * Returns 0 on success or a negative error from the seek, position,
+ * vhd_end_of_headers() or vhd_get_bat() calls.
+ */
+int
+vhd_end_of_data(vhd_context_t *ctx, off64_t *end)
+{
+       int i, err;
+       off64_t max;
+       uint64_t blk;
+
+       if (!vhd_type_dynamic(ctx)) {
+               err = vhd_seek(ctx, 0, SEEK_END);
+               if (err)
+                       return err;
+
+               max = vhd_position(ctx);
+               if (max == (off64_t)-1)
+                       return -errno;
+
+               /* everything before the trailing footer is data */
+               *end = max - sizeof(vhd_footer_t);
+               return 0;
+       }
+
+       err = vhd_end_of_headers(ctx, &max);
+       if (err)
+               return err;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       /* from here on max is tracked in sectors, like the BAT entries */
+       max >>= VHD_SECTOR_SHIFT;
+
+       for (i = 0; i < ctx->bat.entries; i++) {
+               blk = ctx->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       /* end of block = start + data sectors + bitmap sectors */
+                       blk += ctx->spb + ctx->bm_secs;
+                       max  = MAX(blk, max);
+               }
+       }
+
+       *end = vhd_sectors_to_bytes(max);
+       return 0;
+}
+
+/*
+ * Convert a time_t into a VHD timestamp: the number of seconds elapsed
+ * since Jan 1, 2000 00:00:00 as resolved by mktime().
+ */
+uint32_t
+vhd_time(time_t time)
+{
+       struct tm y2k;
+       time_t vhd_epoch;
+
+       memset(&y2k, 0, sizeof(y2k));
+       y2k.tm_mday = 1;    /* 1st ...            */
+       y2k.tm_mon  = 0;    /* ... of January ... */
+       y2k.tm_year = 100;  /* ... 1900 + 100     */
+
+       vhd_epoch = mktime(&y2k);
+
+       return (uint32_t)(time - vhd_epoch);
+}
+
+/*
+ * Render a VHD timestamp as text for printing.
+ * As with ctime_r, target must be at least 26 bytes long.
+ * The newline added by ctime_r is stripped; the resulting string length
+ * is returned.
+ */
+size_t
+vhd_time_to_string(uint32_t timestamp, char *target)
+{
+       struct tm y2k;
+       time_t vhd_epoch, when;
+
+       /* VHD uses an epoch of 12:00AM, Jan 1, 2000.         */
+       /* Need to adjust this to the expected epoch of 1970. */
+       memset(&y2k, 0, sizeof(y2k));
+       y2k.tm_mday = 1;
+       y2k.tm_mon  = 0;
+       y2k.tm_year = 100;
+
+       vhd_epoch = mktime(&y2k);
+       when      = vhd_epoch + (time_t)timestamp;
+       ctime_r(&when, target);
+
+       /* cut the string at the newline ctime_r appends (if any) */
+       target[strcspn(target, "\n")] = '\0';
+
+       return (strlen(target));
+}
+
+/*
+ * Compute the encoded CHS geometry for a disk of @size bytes, following
+ * the algorithm given in the vhd specs.
+ *
+ * The sector count is kept in a 64-bit local until it has been clamped
+ * to 65535 * 16 * 255, so that very large sizes saturate at the maximum
+ * geometry instead of being truncated to 32 bits before the clamp.
+ *
+ * Returns GEOM_ENCODE(cylinders, heads, sectors-per-track).
+ */
+uint32_t
+vhd_chs(uint64_t size)
+{
+       uint64_t secs;
+       uint32_t cylinders, heads, spt, cth;
+
+       secs = secs_round_up_no_zero(size);
+
+       if (secs > 65535 * 16 * 255)
+               secs = 65535 * 16 * 255;
+
+       if (secs >= 65535 * 16 * 63) {
+               spt   = 255;
+               cth   = (uint32_t)(secs / spt);
+               heads = 16;
+       } else {
+               /* try 17 sectors per track first, then widen as needed */
+               spt   = 17;
+               cth   = (uint32_t)(secs / spt);
+               heads = (cth + 1023) / 1024;
+
+               if (heads < 4)
+                       heads = 4;
+
+               if (cth >= (heads * 1024) || heads > 16) {
+                       spt   = 31;
+                       cth   = (uint32_t)(secs / spt);
+                       heads = 16;
+               }
+
+               if (cth >= heads * 1024) {
+                       spt   = 63;
+                       cth   = (uint32_t)(secs / spt);
+                       heads = 16;
+               }
+       }
+
+       cylinders = cth / heads;
+
+       return GEOM_ENCODE(cylinders, heads, spt);
+}
+
+/*
+ * Make sure ctx->footer holds a valid footer: if the cached one does not
+ * validate, read it again from the file.
+ * Returns 0 on success or the error from vhd_read_footer().
+ */
+int
+vhd_get_footer(vhd_context_t *ctx)
+{
+       if (vhd_validate_footer(&ctx->footer) != 0)
+               return vhd_read_footer(ctx, &ctx->footer);
+
+       return 0;
+}
+
+/*
+ * Make sure ctx->header holds a valid header: if the cached one does not
+ * validate, read it again from the file.
+ * Returns -EINVAL for non-dynamic images, 0 on success, or the error from
+ * vhd_read_header().
+ */
+int
+vhd_get_header(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       if (vhd_validate_header(&ctx->header) != 0)
+               return vhd_read_header(ctx, &ctx->header);
+
+       return 0;
+}
+
+/*
+ * Make sure ctx->bat is loaded: if it is not, drop whatever is cached
+ * and read the BAT from the file.
+ * Returns -EINVAL for non-dynamic images, 0 on success, or the error from
+ * vhd_read_bat().
+ */
+int
+vhd_get_bat(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       if (vhd_validate_bat(&ctx->bat) != 0) {
+               vhd_put_bat(ctx);
+               return vhd_read_bat(ctx, &ctx->bat);
+       }
+
+       return 0;
+}
+
+/*
+ * Make sure ctx->batmap is loaded and valid: if it is not, drop whatever
+ * is cached and read the batmap from the file.
+ * Returns -EINVAL when the image has no batmap, 0 on success, or the
+ * error from vhd_read_batmap().
+ */
+int
+vhd_get_batmap(vhd_context_t *ctx)
+{
+       if (!vhd_has_batmap(ctx))
+               return -EINVAL;
+
+       if (vhd_validate_batmap(&ctx->batmap) != 0) {
+               vhd_put_batmap(ctx);
+               return vhd_read_batmap(ctx, &ctx->batmap);
+       }
+
+       return 0;
+}
+
+/*
+ * Forget the cached footer by zeroing ctx->footer.
+ */
+void
+vhd_put_footer(vhd_context_t *ctx)
+{
+       memset(&ctx->footer, 0, sizeof(ctx->footer));
+}
+
+/*
+ * Forget the cached header by zeroing ctx->header.
+ */
+void
+vhd_put_header(vhd_context_t *ctx)
+{
+       memset(&ctx->header, 0, sizeof(ctx->header));
+}
+
+/*
+ * Release the cached BAT of a dynamic image: free the table and zero
+ * ctx->bat.  Non-dynamic images are left untouched.
+ */
+void
+vhd_put_bat(vhd_context_t *ctx)
+{
+       if (vhd_type_dynamic(ctx)) {
+               free(ctx->bat.bat);
+               memset(&ctx->bat, 0, sizeof(ctx->bat));
+       }
+}
+
+/*
+ * Release the cached batmap: free the map and zero ctx->batmap.
+ * Nothing is done for non-dynamic images or images without a batmap.
+ */
+void
+vhd_put_batmap(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx) || !vhd_has_batmap(ctx))
+               return;
+
+       free(ctx->batmap.map);
+       memset(&ctx->batmap, 0, sizeof(ctx->batmap));
+}
+
+/*
+ * look for 511 byte footer at end of file
+ *
+ * Seeks to 511 bytes before end-of-file, reads into a zero-filled,
+ * sector-aligned buffer of sizeof(vhd_footer_t) bytes, converts the
+ * result with vhd_footer_in() and validates it.
+ *
+ * Returns 0 if a valid footer was found, otherwise a negative error
+ * (seek/position/allocation error, or the vhd_validate_footer() result).
+ * @footer is only written once the read has been attempted.
+ */
+int
+vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+       int err;
+       char *buf;
+       off64_t eof;
+
+       buf = NULL;
+
+       err = vhd_seek(ctx, 0, SEEK_END);
+       if (err)
+               goto out;
+
+       eof = vhd_position(ctx);
+       if (eof == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       err = vhd_seek(ctx, eof - 511, SEEK_SET);
+       if (err)
+               goto out;
+
+       /* posix_memalign returns a positive errno value, not -1/errno */
+       err = posix_memalign((void **)&buf,
+                            VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+       if (err) {
+               buf = NULL;
+               err = -err;
+               goto out;
+       }
+
+       /* bytes the short read does not fill stay zero */
+       memset(buf, 0, sizeof(vhd_footer_t));
+
+       /*
+        * expecting short read here, so the return value of vhd_read()
+        * is deliberately ignored; validation below decides the outcome
+        */
+       vhd_read(ctx, buf, sizeof(vhd_footer_t));
+
+       memcpy(footer, buf, sizeof(vhd_footer_t));
+
+       vhd_footer_in(footer);
+       err = vhd_validate_footer(footer);
+
+out:
+       if (err)
+               VHDLOG("%s: failed reading short footer: %d\n",
+                      ctx->file, err);
+       free(buf);
+       return err;
+}
+
+/*
+ * Read a footer from byte offset @off of the file into @footer, convert
+ * it with vhd_footer_in() and validate it.
+ *
+ * Returns 0 on success, otherwise a negative error from the seek, the
+ * allocation, the read or vhd_validate_footer(); failures are logged.
+ */
+int
+vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
+{
+       int rc;
+       char *sector = NULL;
+
+       rc = vhd_seek(ctx, off, SEEK_SET);
+       if (rc)
+               goto done;
+
+       /* posix_memalign returns a positive errno value on failure */
+       rc = posix_memalign((void **)&sector,
+                           VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+       if (rc) {
+               sector = NULL;
+               rc     = -rc;
+               goto done;
+       }
+
+       rc = vhd_read(ctx, sector, sizeof(vhd_footer_t));
+       if (rc)
+               goto done;
+
+       memcpy(footer, sector, sizeof(vhd_footer_t));
+       vhd_footer_in(footer);
+
+       rc = vhd_validate_footer(footer);
+
+done:
+       if (rc)
+               VHDLOG("%s: reading footer at 0x%08"PRIx64" failed: %d\n",
+                      ctx->file, off, rc);
+       free(sector);
+       return rc;
+}
+
+/*
+ * Locate and read the image footer into @footer.
+ *
+ * Attempts, in order, and moves to the next one only when the previous
+ * attempt returned -EINVAL:
+ *   1. the last 512 bytes of the file,
+ *   2. a 511-byte footer at the end of the file,
+ *   3. the footer copy at offset 0 (skipped if VHD_OPEN_STRICT is set).
+ *
+ * Returns 0 on success or a negative error.
+ */
+int
+vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+       int rc;
+       off64_t eof;
+
+       rc = vhd_seek(ctx, 0, SEEK_END);
+       if (rc)
+               return rc;
+
+       eof = vhd_position(ctx);
+       if (eof == (off64_t)-1)
+               return -errno;
+
+       rc = vhd_read_footer_at(ctx, footer, eof - 512);
+
+       if (rc == -EINVAL)
+               rc = vhd_read_short_footer(ctx, footer);
+
+       if (rc == -EINVAL && !(ctx->oflags & VHD_OPEN_STRICT))
+               rc = vhd_read_footer_at(ctx, footer, 0);
+
+       return rc;
+}
+
+/*
+ * Read a dynamic-disk header from byte offset @off of the file into
+ * @header, convert it with vhd_header_in() and validate it.
+ *
+ * Returns -EINVAL for non-dynamic images, 0 on success, otherwise a
+ * negative error from the seek, the allocation, the read or
+ * vhd_validate_header(); failures are logged.
+ */
+int
+vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
+{
+       int rc;
+       char *raw = NULL;
+
+       if (!vhd_type_dynamic(ctx)) {
+               rc = -EINVAL;
+               goto done;
+       }
+
+       rc = vhd_seek(ctx, off, SEEK_SET);
+       if (rc)
+               goto done;
+
+       /* posix_memalign returns a positive errno value on failure */
+       rc = posix_memalign((void **)&raw,
+                           VHD_SECTOR_SIZE, sizeof(vhd_header_t));
+       if (rc) {
+               raw = NULL;
+               rc  = -rc;
+               goto done;
+       }
+
+       rc = vhd_read(ctx, raw, sizeof(vhd_header_t));
+       if (rc)
+               goto done;
+
+       memcpy(header, raw, sizeof(vhd_header_t));
+       vhd_header_in(header);
+
+       rc = vhd_validate_header(header);
+
+done:
+       if (rc)
+               VHDLOG("%s: reading header at 0x%08"PRIx64" failed: %d\n",
+                      ctx->file, off, rc);
+       free(raw);
+       return rc;
+}
+
+/*
+ * Read the dynamic-disk header from the offset recorded in the footer
+ * (footer.data_offset) into @header.
+ * Returns -EINVAL (after logging) for non-dynamic images, otherwise the
+ * result of vhd_read_header_at().
+ */
+int
+vhd_read_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+       if (vhd_type_dynamic(ctx))
+               return vhd_read_header_at(ctx, header,
+                                         ctx->footer.data_offset);
+
+       VHDLOG("%s is not dynamic!\n", ctx->file);
+       return -EINVAL;
+}
+
+/*
+ * Read the block allocation table of a dynamic image into @bat.
+ *
+ * The table is read from header.table_offset into a freshly allocated,
+ * sector-aligned buffer sized for header.max_bat_size 32-bit entries
+ * (padded), then converted with vhd_bat_in().
+ *
+ * On success returns 0 and @bat owns the buffer (bat->bat).  On failure
+ * returns a negative error, frees the buffer, zeroes @bat and logs.
+ */
+int
+vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+       int err;
+       char *buf;
+       off64_t off;
+       size_t size;
+
+       buf  = NULL;
+
+       if (!vhd_type_dynamic(ctx)) {
+               err = -EINVAL;
+               goto fail;
+       }
+
+       off  = ctx->header.table_offset;
+       size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+
+       /* posix_memalign returns a positive errno value, not -1/errno */
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               buf = NULL;
+               err = -err;
+               goto fail;
+       }
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto fail;
+
+       err = vhd_read(ctx, buf, size);
+       if (err)
+               goto fail;
+
+       bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+       bat->entries = ctx->header.max_bat_size;
+       /* ownership of buf passes to the caller's bat from here on */
+       bat->bat     = (uint32_t *)buf;
+
+       vhd_bat_in(bat);
+
+       return 0;
+
+fail:
+       free(buf);
+       memset(bat, 0, sizeof(vhd_bat_t));
+       VHDLOG("%s: failed to read bat: %d\n", ctx->file, err);
+       return err;
+}
+
+/*
+ * Read the batmap header into batmap->header.
+ *
+ * The header is read from the offset computed by
+ * vhd_batmap_header_offset() through a temporary sector-aligned buffer,
+ * then converted with vhd_batmap_header_in().  The header is NOT
+ * validated here.
+ *
+ * Returns 0 on success; on failure returns a negative error, zeroes
+ * batmap->header and logs.
+ */
+static int
+vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       char *buf;
+       off64_t off;
+       size_t size;
+
+       buf = NULL;
+
+       err = vhd_batmap_header_offset(ctx, &off);
+       if (err)
+               goto fail;
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto fail;
+
+       size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+       /* posix_memalign returns a positive errno value, not -1/errno */
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               buf = NULL;
+               err = -err;
+               goto fail;
+       }
+
+       err = vhd_read(ctx, buf, size);
+       if (err)
+               goto fail;
+
+       /* only the header-sized prefix of the padded read is kept */
+       memcpy(&batmap->header, buf, sizeof(vhd_batmap_header_t));
+       free(buf);
+       buf = NULL;
+
+       vhd_batmap_header_in(batmap);
+
+       return 0;
+
+fail:
+       free(buf);
+       memset(&batmap->header, 0, sizeof(vhd_batmap_header_t));
+       VHDLOG("%s: failed to read batmap header: %d\n", ctx->file, err);
+       return err;
+}
+
+/*
+ * Read the batmap bitmap described by batmap->header into a freshly
+ * allocated, sector-aligned buffer and hand it over as batmap->map.
+ *
+ * The map is header.batmap_size sectors long and is read from
+ * header.batmap_offset.  Returns 0 on success; on failure returns a
+ * negative error, sets batmap->map to NULL and logs.
+ */
+static int
+vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int rc;
+       char *map = NULL;
+       size_t nbytes;
+
+       nbytes = vhd_sectors_to_bytes(batmap->header.batmap_size);
+
+       /* posix_memalign returns a positive errno value on failure */
+       rc = posix_memalign((void **)&map, VHD_SECTOR_SIZE, nbytes);
+       if (rc) {
+               map = NULL;
+               rc  = -rc;
+               goto fail;
+       }
+
+       rc = vhd_seek(ctx, batmap->header.batmap_offset, SEEK_SET);
+       if (rc)
+               goto fail;
+
+       rc = vhd_read(ctx, map, nbytes);
+       if (rc)
+               goto fail;
+
+       batmap->map = map;
+       return 0;
+
+fail:
+       free(map);
+       batmap->map = NULL;
+       VHDLOG("%s: failed to read batmap: %d\n", ctx->file, rc);
+       return rc;
+}
+
+/*
+ * Read and validate the complete batmap (header plus map) into @batmap.
+ *
+ * Returns -EINVAL when the image has no batmap, 0 on success, or the
+ * error of the first step that fails.  If the map was read but fails
+ * validation, it is freed and @batmap is zeroed.
+ */
+int
+vhd_read_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int rc;
+
+       if (!vhd_has_batmap(ctx))
+               return -EINVAL;
+
+       memset(batmap, 0, sizeof(vhd_batmap_t));
+
+       /* each step runs only if every previous one succeeded */
+       rc = vhd_read_batmap_header(ctx, batmap);
+       if (!rc)
+               rc = vhd_validate_batmap_header(batmap);
+       if (!rc)
+               rc = vhd_read_batmap_map(ctx, batmap);
+       if (rc)
+               return rc;
+
+       rc = vhd_validate_batmap(batmap);
+       if (rc) {
+               free(batmap->map);
+               memset(batmap, 0, sizeof(vhd_batmap_t));
+       }
+
+       return rc;
+}
+
+/*
+ * Tell whether the image carries a batmap (1) or not (0).
+ *
+ * Only dynamic images created by tapdisk can have one.  Creator versions
+ * up to 0.1 never do, versions from 1.2 on always do.  For the versions
+ * in between the batmap header is consulted (and read from the file into
+ * ctx->batmap if the cached one does not validate).
+ */
+int
+vhd_has_batmap(vhd_context_t *ctx)
+{
+       if (!vhd_type_dynamic(ctx) || !vhd_creator_tapdisk(ctx))
+               return 0;
+
+       if (ctx->footer.crtr_ver <= VHD_VERSION(0, 1))
+               return 0;
+
+       if (ctx->footer.crtr_ver >= VHD_VERSION(1, 2))
+               return 1;
+
+       /*
+        * VHDs of version 1.1 probably have a batmap, but may not
+        * if they were updated from version 0.1 via vhd-update.
+        */
+       if (vhd_validate_batmap_header(&ctx->batmap) == 0)
+               return 1;
+
+       if (vhd_read_batmap_header(ctx, &ctx->batmap) != 0)
+               return 0;
+
+       return (vhd_validate_batmap_header(&ctx->batmap) == 0);
+}
+
+/*
+ * Is this a block device (with a fixed size)? This affects whether the
+ * file can be truncated and where the footer is written for VHDs.
+ *
+ * Stores 1 in *is_block if @file is a block device, 0 otherwise.
+ * Returns 0 on success or -errno if stat() fails (in which case
+ * *is_block is left untouched).
+ */
+int
+vhd_test_file_fixed(const char *file, int *is_block)
+{
+       struct stat st;
+
+       if (stat(file, &st) == -1)
+               return -errno;
+
+       *is_block = S_ISBLK(st.st_mode) ? 1 : 0;
+       return 0;
+}
+
+/*
+ * Resolve the on-disk location of a parent image.
+ *
+ * @parent is tried first as given when it is an absolute, readable path,
+ * and otherwise relative to the directory containing the child image
+ * (ctx->file).  On success returns 0 and stores a malloc'd path in
+ * *_location, which the caller must free.  On failure returns a negative
+ * error (-EINVAL for a NULL @parent, -ENOMEM, or -errno) and *_location
+ * is NULL.
+ *
+ * All temporaries are released on every path, including success.
+ */
+int
+vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location)
+{
+       int err;
+       char *location, *cpath, *cdir, *path;
+
+       err        = 0;
+       path       = NULL;
+       cpath      = NULL;
+       location   = NULL;
+       *_location = NULL;
+
+       if (!parent)
+               return -EINVAL;
+
+       if (parent[0] == '/') {
+               if (!access(parent, R_OK)) {
+                       path = strdup(parent);
+                       if (!path)
+                               return -ENOMEM;
+                       *_location = path;
+                       return 0;
+               }
+       }
+
+       /* check parent path relative to child's directory */
+       cpath = realpath(ctx->file, NULL);
+       if (!cpath) {
+               err = -errno;
+               goto out;
+       }
+
+       /* dirname() may return a pointer into cpath: use it before freeing */
+       cdir = dirname(cpath);
+       if (asprintf(&location, "%s/%s", cdir, parent) == -1) {
+               err = -errno;
+               location = NULL;
+               goto out;
+       }
+
+       if (!access(location, R_OK)) {
+               path = realpath(location, NULL);
+               if (path) {
+                       /* err is still 0; fall into the common cleanup */
+                       *_location = path;
+                       goto out;
+               }
+       }
+       err = -errno;
+
+out:
+       free(location);
+       free(cpath);
+       return err;
+}
+
+/*
+ * Encode a parent path as a MACX locator: "file://" followed by @name,
+ * converted from ASCII to UTF-8.
+ *
+ * On success returns 0 and stores a malloc'd, NOT NUL-terminated buffer
+ * in *out with its length in *outlen; the caller frees it.  On failure
+ * returns a negative error and *out is NULL.
+ *
+ * All temporaries are released on every path, including the case where
+ * only one of the two working buffers could be allocated.
+ */
+static int
+vhd_macx_encode_location(char *name, char **out, int *outlen)
+{
+       iconv_t cd;
+       int len, err;
+       size_t ibl, obl;
+       char *uri, *urip, *uri_utf8, *uri_utf8p, *ret;
+
+       err     = 0;
+       ret     = NULL;
+       *out    = NULL;
+       *outlen = 0;
+       cd      = (iconv_t)-1;
+       len     = strlen(name) + strlen("file://");
+
+       ibl     = len;
+       obl     = len;
+
+       uri = urip = malloc(ibl + 1);
+       uri_utf8 = uri_utf8p = malloc(obl);
+
+       if (!uri || !uri_utf8) {
+               /* free(NULL) is a no-op, so the common cleanup is safe */
+               err = -ENOMEM;
+               goto out;
+       }
+
+       cd = iconv_open("UTF-8", "ASCII");
+       if (cd == (iconv_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       sprintf(uri, "file://%s", name);
+
+       /* the conversion must consume all input and fill all output */
+       if (iconv(cd, &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
+           ibl || obl) {
+               err = (errno ? -errno : -EIO);
+               goto out;
+       }
+
+       ret = malloc(len);
+       if (!ret) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(ret, uri_utf8, len);
+       *outlen = len;
+       *out    = ret;
+
+ out:
+       free(uri);
+       free(uri_utf8);
+       if (cd != (iconv_t)-1)
+               iconv_close(cd);
+
+       return err;
+}
+
+/*
+ * Encode a parent path as a W2KU/W2RU locator: a backslash-separated
+ * path converted from ASCII to little-endian UTF-16.
+ *
+ * Relative names are prefixed with ".\" (a leading "./" is dropped
+ * first); absolute names are used as given.  Every '/' becomes '\'.
+ *
+ * On success returns 0 and stores a malloc'd, NOT NUL-terminated buffer
+ * in *out with its byte length in *outlen; the caller frees it.  On
+ * failure returns a negative error and *out is NULL.
+ */
+static int
+vhd_w2u_encode_location(char *name, char **out, int *outlen)
+{
+       iconv_t conv;
+       int nbytes, rc;
+       size_t in_left, out_left;
+       char *ascii, *src, *wide, *dst, *p, *result;
+
+       result  = NULL;
+       *out    = NULL;
+       *outlen = 0;
+       conv    = (iconv_t) -1;
+
+       /*
+        * MICROSOFT_COMPAT
+        * relative paths must start with ".\"
+        */
+       if (name[0] == '/')
+               rc = asprintf(&ascii, "%s", name);
+       else {
+               p  = (strncmp(name, "./", 2) == 0) ? name + strlen("./") : name;
+               rc = asprintf(&ascii, ".\\%s", p);
+       }
+
+       if (rc == -1)
+               return -ENOMEM;
+
+       for (p = ascii; *p != '\0'; p++)
+               if (*p == '/')
+                       *p = '\\';
+
+       nbytes   = strlen(ascii);
+       in_left  = nbytes;
+       out_left = nbytes * 2;
+       src      = ascii;
+
+       wide = dst = malloc(out_left);
+       if (!wide) {
+               rc = -ENOMEM;
+               goto done;
+       }
+
+       /*
+        * MICROSOFT_COMPAT
+        * little endian unicode here
+        */
+       conv = iconv_open("UTF-16LE", "ASCII");
+       if (conv == (iconv_t)-1) {
+               rc = -errno;
+               goto done;
+       }
+
+       /* the conversion must consume all input and fill all output */
+       if (iconv(conv, &src, &in_left, &dst, &out_left) == (size_t)-1 ||
+           in_left || out_left) {
+               rc = (errno ? -errno : -EIO);
+               goto done;
+       }
+
+       nbytes *= 2;
+       result  = malloc(nbytes);
+       if (!result) {
+               rc = -ENOMEM;
+               goto done;
+       }
+
+       memcpy(result, wide, nbytes);
+       *outlen = nbytes;
+       *out    = result;
+       rc      = 0;
+
+ done:
+       free(ascii);
+       free(wide);
+       if (conv != (iconv_t)-1)
+               iconv_close(conv);
+
+       return rc;
+}
+
+/*
+ * Decode a MACX locator: convert @len bytes of UTF-8 at @in to ASCII in
+ * the scratch buffer @out (which must hold at least @len + 1 bytes) and
+ * strip the leading "file://".
+ *
+ * Returns a malloc'd copy of the path for the caller to free, or NULL if
+ * the conversion fails, input is left unconverted, or the result does
+ * not start with "file://".
+ *
+ * The conversion descriptor is closed on every path once opened.
+ */
+static char *
+vhd_macx_decode_location(char *in, char *out, int len)
+{
+       iconv_t cd;
+       char *name;
+       size_t ibl, obl, res;
+
+       name = out;
+       ibl  = obl = len;
+
+       cd = iconv_open("ASCII", "UTF-8");
+       if (cd == (iconv_t)-1)
+               return NULL;
+
+       /* close the descriptor before looking at the result */
+       res = iconv(cd, &in, &ibl, &out, &obl);
+       iconv_close(cd);
+
+       if (res == (size_t)-1 || ibl)
+               return NULL;
+
+       /* iconv advanced 'out' past the converted bytes */
+       *out = '\0';
+
+       if (strstr(name, "file://") != name)
+               return NULL;
+
+       name += strlen("file://");
+
+       return strdup(name);
+}
+
+/*
+ * Decode a W2KU/W2RU-style locator: convert @len bytes at @in from the
+ * encoding named by @utf_type to ASCII in the scratch buffer @out (which
+ * must hold at least @len + 1 bytes), turn every '\' into '/' and drop a
+ * leading "C:" / "c:" drive prefix.
+ *
+ * Returns a malloc'd copy of the path for the caller to free, or NULL if
+ * the conversion fails or input is left unconverted.
+ *
+ * The conversion descriptor is closed on every path once opened.
+ */
+static char *
+vhd_w2u_decode_location(char *in, char *out, int len, char *utf_type)
+{
+       iconv_t cd;
+       char *name, *tmp;
+       size_t ibl, obl, res;
+
+       tmp = name = out;
+       ibl = obl  = len;
+
+       cd = iconv_open("ASCII", utf_type);
+       if (cd == (iconv_t)-1)
+               return NULL;
+
+       /* close the descriptor before looking at the result */
+       res = iconv(cd, &in, &ibl, &out, &obl);
+       iconv_close(cd);
+
+       if (res == (size_t)-1 || ibl)
+               return NULL;
+
+       /* iconv advanced 'out' past the converted bytes */
+       *out = '\0';
+
+       /* TODO: spaces */
+       while (tmp != out) {
+               if (*tmp == '\\')
+                       *tmp = '/';
+               tmp++;
+       }
+
+       if (strstr(name, "C:") == name || strstr(name, "c:") == name)
+               name += strlen("c:");
+
+       return strdup(name);
+}
+
+/*
+ * Decode the parent name stored in the header (header->prt_name) into a
+ * malloc'd path returned through *buf.
+ *
+ * Images created by tapdisk version 0.1 are decoded as UTF_16, all
+ * others as UTF_16BE.  Returns 0 on success, -EINVAL if decoding fails.
+ */
+int
+vhd_header_decode_parent(vhd_context_t *ctx, vhd_header_t *header, char **buf)
+{
+       char scratch[512];
+       char *encoding;
+
+       encoding = (vhd_creator_tapdisk(ctx) &&
+                   ctx->footer.crtr_ver == VHD_VERSION(0, 1))
+               ? UTF_16 : UTF_16BE;
+
+       *buf = vhd_w2u_decode_location(header->prt_name, scratch, 512, encoding);
+       if (*buf == NULL)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Read and decode one parent locator of a differencing image.
+ *
+ * Only MACX, W2KU and W2RU locators are supported.  The locator data is
+ * read from loc->data_offset into a sector-aligned buffer whose size
+ * comes from vhd_parent_locator_size(), then loc->data_len bytes of it
+ * are decoded.
+ *
+ * On success returns 0 and stores a malloc'd path in *parent (caller
+ * frees).  On failure returns a negative error, logs the locator fields
+ * and leaves *parent NULL.
+ *
+ * loc->data_len is checked against the size of the buffer actually read,
+ * so a malformed locator cannot make the decoder read past the buffer.
+ */
+int
+vhd_parent_locator_read(vhd_context_t *ctx,
+                       vhd_parent_locator_t *loc, char **parent)
+{
+       int err, size;
+       char *raw, *out, *name;
+
+       raw     = NULL;
+       out     = NULL;
+       name    = NULL;
+       *parent = NULL;
+
+       if (ctx->footer.type != HD_TYPE_DIFF) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       switch (loc->code) {
+       case PLAT_CODE_MACX:
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_W2RU:
+               break;
+       default:
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = vhd_seek(ctx, loc->data_offset, SEEK_SET);
+       if (err)
+               goto out;
+
+       size = vhd_parent_locator_size(loc);
+       if (size <= 0) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* the decoder consumes data_len bytes of a size-byte buffer */
+       if ((uint64_t)loc->data_len > (uint64_t)size) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* posix_memalign returns a positive errno value, not -1/errno */
+       err = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, size);
+       if (err) {
+               raw = NULL;
+               err = -err;
+               goto out;
+       }
+
+       err = vhd_read(ctx, raw, size);
+       if (err)
+               goto out;
+
+       /* scratch space for the decoded text plus its terminating NUL */
+       out = malloc(loc->data_len + 1);
+       if (!out) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       switch (loc->code) {
+       case PLAT_CODE_MACX:
+               name = vhd_macx_decode_location(raw, out, loc->data_len);
+               break;
+       case PLAT_CODE_W2KU:
+       case PLAT_CODE_W2RU:
+               name = vhd_w2u_decode_location(raw, out,
+                                              loc->data_len, UTF_16LE);
+               break;
+       }
+
+       if (!name) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err     = 0;
+       *parent = name;
+
+out:
+       free(raw);
+       free(out);
+
+       if (err) {
+               VHDLOG("%s: error reading parent locator: %d\n",
+                      ctx->file, err);
+               VHDLOG("%s: locator: code %u, space 0x%x, len 0x%x, "
+                      "off 0x%"PRIx64"\n", ctx->file, loc->code, loc->data_space,
+                      loc->data_len, loc->data_offset);
+       }
+
+       return err;
+}
+
+/*
+ * Find the parent of a differencing image.
+ *
+ * Each parent locator in the header is tried in order: it is decoded
+ * with vhd_parent_locator_read() and the resulting name is resolved with
+ * vhd_find_parent().  The first locator that resolves wins.
+ *
+ * On success returns 0 and stores a malloc'd path in *parent (caller
+ * frees).  Returns -EINVAL for non-differencing images; otherwise the
+ * error of the last locator tried.
+ */
+int
+vhd_parent_locator_get(vhd_context_t *ctx, char **parent)
+{
+       int idx, count, rc;
+       char *decoded, *resolved;
+       vhd_parent_locator_t *slot;
+
+       rc      = 0;
+       *parent = NULL;
+
+       if (ctx->footer.type != HD_TYPE_DIFF)
+               return -EINVAL;
+
+       count = vhd_parent_locator_count(ctx);
+       for (idx = 0; idx < count; idx++) {
+               slot = &ctx->header.loc[idx];
+
+               rc = vhd_parent_locator_read(ctx, slot, &decoded);
+               if (rc)
+                       continue;
+
+               rc = vhd_find_parent(ctx, decoded, &resolved);
+               if (rc) {
+                       VHDLOG("%s: couldn't find parent %s (%d)\n",
+                              ctx->file, decoded, rc);
+                       free(decoded);
+                       continue;
+               }
+
+               free(decoded);
+               *parent = resolved;
+               return 0;
+       }
+
+       return rc;
+}
+
+/*
+ * Encode the path of @parent (made relative to this image) for platform
+ * @code and write it, zero-padded to vhd_bytes_padded(), at offset @off.
+ *
+ * @max_bytes: when non-zero, upper bound on the padded size; exceeding it
+ *             yields -ENAMETOOLONG.
+ * @loc:       always cleared on entry; filled in only when the write
+ *             succeeds.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_parent_locator_write_at(vhd_context_t *ctx,
+                           const char *parent, off64_t off, uint32_t code,
+                           size_t max_bytes, vhd_parent_locator_t *loc)
+{
+       struct stat st;
+       int err, enc_len, padded;
+       char *abs_path, *rel_path, *enc, *sectors;
+
+       memset(loc, 0, sizeof(vhd_parent_locator_t));
+
+       if (ctx->footer.type != HD_TYPE_DIFF)
+               return -EINVAL;
+
+       /* only the path-carrying platform codes can be written */
+       if (code != PLAT_CODE_MACX &&
+           code != PLAT_CODE_W2KU &&
+           code != PLAT_CODE_W2RU)
+               return -EINVAL;
+
+       abs_path = NULL;
+       rel_path = NULL;
+       enc      = NULL;
+       sectors  = NULL;
+       enc_len  = 0;
+       padded   = 0;
+
+       abs_path = realpath(parent, NULL);
+       if (!abs_path) {
+               err = -errno;
+               goto done;
+       }
+
+       if (stat(abs_path, &st)) {
+               err = -errno;
+               goto done;
+       }
+
+       /* a parent must be a regular file or a block device */
+       if (!S_ISREG(st.st_mode) && !S_ISBLK(st.st_mode)) {
+               err = -EINVAL;
+               goto done;
+       }
+
+       rel_path = relative_path_to(ctx->file, abs_path, &err);
+       if (!rel_path || err) {
+               if (!err)
+                       err = -EINVAL;
+               goto done;
+       }
+
+       if (code == PLAT_CODE_MACX)
+               err = vhd_macx_encode_location(rel_path, &enc, &enc_len);
+       else
+               err = vhd_w2u_encode_location(rel_path, &enc, &enc_len);
+       if (err)
+               goto done;
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto done;
+
+       padded = vhd_bytes_padded(enc_len);
+       if (max_bytes && padded > max_bytes) {
+               err = -ENAMETOOLONG;
+               goto done;
+       }
+
+       err = posix_memalign((void **)&sectors, VHD_SECTOR_SIZE, padded);
+       if (err) {
+               sectors = NULL;
+               err     = -err;
+               goto done;
+       }
+
+       memset(sectors, 0, padded);
+       memcpy(sectors, enc, enc_len);
+
+       err = vhd_write(ctx, sectors, padded);
+
+done:
+       free(abs_path);
+       free(rel_path);
+       free(enc);
+       free(sectors);
+
+       if (err)
+               return err;
+
+       loc->res         = 0;
+       loc->code        = code;
+       loc->data_len    = enc_len;
+       /*
+        * write number of bytes ('padded') instead of number of sectors
+        * into loc->data_space to be compatible with MSFT, even though
+        * this goes against the specs
+        */
+       loc->data_space  = padded;
+       loc->data_offset = off;
+
+       return 0;
+}
+
+/*
+ * Compute the offset at which a footer ending exactly at end-of-file
+ * starts.  Stores the offset in *off and returns 0, or returns the
+ * negative errno value reported by vhd_seek().
+ */
+static int
+vhd_footer_offset_at_eof(vhd_context_t *ctx, off64_t *off)
+{
+       int err;
+
+       err = vhd_seek(ctx, 0, SEEK_END);
+       if (err) {
+               /*
+                * Propagate vhd_seek()'s own code: raw errno has the
+                * wrong (positive) sign and may have been overwritten by
+                * the logging done inside vhd_seek().
+                */
+               return err;
+       }
+
+       *off = vhd_position(ctx) - sizeof(vhd_footer_t);
+       return 0;
+}
+
+/*
+ * Read the sector bitmap of allocated block @block into a freshly
+ * allocated, sector-aligned buffer returned through *bufp (NULL on
+ * failure).
+ *
+ * Returns 0, -EINVAL (not a dynamic disk, or block unallocated),
+ * -ERANGE (block beyond the BAT) or another negative errno value.
+ */
+int
+vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+       int rc;
+       char *bitmap;
+       size_t nbytes;
+       uint64_t entry;
+
+       bitmap = NULL;
+       *bufp  = NULL;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       entry = ctx->bat.bat[block];
+       if (entry == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* the bitmap sits at the very start of the allocated block */
+       rc = vhd_seek(ctx, vhd_sectors_to_bytes(entry), SEEK_SET);
+       if (rc)
+               return rc;
+
+       nbytes = vhd_bytes_padded(ctx->spb >> 3);
+       rc     = posix_memalign((void **)&bitmap, VHD_SECTOR_SIZE, nbytes);
+       if (rc)
+               return -rc;
+
+       rc = vhd_read(ctx, bitmap, nbytes);
+       if (rc) {
+               free(bitmap);
+               return rc;
+       }
+
+       *bufp = bitmap;
+       return 0;
+}
+
+/*
+ * Read the data area of allocated block @block into a freshly allocated,
+ * sector-aligned buffer returned through *bufp (NULL on failure).  If
+ * the block extends past the footer offset at end-of-file, only the
+ * part before it is read and the remainder of the buffer is zeroed.
+ *
+ * Returns 0, -EINVAL (not a dynamic disk, block unallocated, or block
+ * starting beyond the footer offset), -ERANGE (block beyond the BAT) or
+ * another error code from the helpers.
+ */
+int
+vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp)
+{
+       int err;
+       char *buf;
+       size_t size;
+       uint64_t blk;
+       off64_t end, off;
+
+       buf   = NULL;
+       *bufp = NULL;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       blk  = ctx->bat.bat[block];
+       if (blk == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* data follows the block's sector bitmap */
+       off  = vhd_sectors_to_bytes(blk + ctx->bm_secs);
+       size = vhd_sectors_to_bytes(ctx->spb);
+
+       err  = vhd_footer_offset_at_eof(ctx, &end);
+       if (err)
+               return err;
+
+       /*
+        * A block starting past the footer offset would make
+        * 'end - off' wrap around below and turn the memset into a
+        * wild write; reject it up front.
+        */
+       if (end < off)
+               return -EINVAL;
+
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err)
+               return -err; /* nothing allocated, nothing to free */
+
+       if (end < off + ctx->header.block_size) {
+               /* truncated block: read what exists, zero-fill the rest */
+               size = end - off;
+               memset(buf + size, 0, ctx->header.block_size - size);
+       }
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto fail;
+
+       err  = vhd_read(ctx, buf, size);
+       if (err)
+               goto fail;
+
+       *bufp = buf;
+       return 0;
+
+fail:
+       free(buf);
+       return err;
+}
+
+/*
+ * Write @footer at file offset @off.  The caller's footer is left
+ * untouched: a sector-aligned copy is checksummed, validated, converted
+ * with vhd_footer_out() and written.
+ *
+ * Returns 0 on success or a negative errno value (also logged).
+ */
+int
+vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
+{
+       int rc;
+       vhd_footer_t *copy;
+
+       copy = NULL;
+
+       rc = posix_memalign((void **)&copy,
+                           VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
+       if (rc) {
+               copy = NULL;
+               rc   = -rc;
+               goto done;
+       }
+
+       memcpy(copy, footer, sizeof(vhd_footer_t));
+       copy->checksum = vhd_checksum_footer(copy);
+
+       rc = vhd_validate_footer(copy);
+       if (!rc)
+               rc = vhd_seek(ctx, off, SEEK_SET);
+       if (!rc) {
+               /* convert only once nothing else can fail before the write */
+               vhd_footer_out(copy);
+               rc = vhd_write(ctx, copy, sizeof(vhd_footer_t));
+       }
+
+done:
+       if (rc)
+               VHDLOG("%s: failed writing footer at 0x%08"PRIx64": %d\n",
+                      ctx->file, off, rc);
+       free(copy);
+       return rc;
+}
+
+/*
+ * Write @footer to its primary location: the footer slot at end-of-file
+ * for block devices, or the end of data otherwise.  Dynamic disks also
+ * get a second copy at offset 0.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int
+vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer)
+{
+       int rc;
+       off64_t where;
+
+       rc = ctx->is_block ? vhd_footer_offset_at_eof(ctx, &where)
+                          : vhd_end_of_data(ctx, &where);
+       if (rc)
+               return rc;
+
+       rc = vhd_write_footer_at(ctx, footer, where);
+       if (rc || !vhd_type_dynamic(ctx))
+               return rc;
+
+       /* dynamic disks carry a backup footer at the start of the file */
+       return vhd_write_footer_at(ctx, footer, 0);
+}
+
+/*
+ * Write @header at file offset @off of a dynamic disk.  The caller's
+ * header is left untouched: a sector-aligned copy is checksummed,
+ * validated, converted with vhd_header_out() and written.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic disks, or another
+ * negative errno value (also logged).
+ */
+int
+vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
+{
+       int rc;
+       vhd_header_t *copy;
+
+       copy = NULL;
+
+       if (!vhd_type_dynamic(ctx)) {
+               rc = -EINVAL;
+               goto done;
+       }
+
+       rc = posix_memalign((void **)&copy,
+                           VHD_SECTOR_SIZE, sizeof(vhd_header_t));
+       if (rc) {
+               copy = NULL;
+               rc   = -rc;
+               goto done;
+       }
+
+       memcpy(copy, header, sizeof(vhd_header_t));
+       copy->checksum = vhd_checksum_header(copy);
+
+       rc = vhd_validate_header(copy);
+       if (!rc) {
+               vhd_header_out(copy);
+               rc = vhd_seek(ctx, off, SEEK_SET);
+       }
+       if (!rc)
+               rc = vhd_write(ctx, copy, sizeof(vhd_header_t));
+
+done:
+       if (rc)
+               VHDLOG("%s: failed writing header at 0x%08"PRIx64": %d\n",
+                      ctx->file, off, rc);
+       free(copy);
+       return rc;
+}
+
+/*
+ * Write @header at the location recorded in the footer's data_offset.
+ * Returns -EINVAL for non-dynamic disks, otherwise the result of
+ * vhd_write_header_at().
+ */
+int
+vhd_write_header(vhd_context_t *ctx, vhd_header_t *header)
+{
+       if (vhd_type_dynamic(ctx))
+               return vhd_write_header_at(ctx, header,
+                                          ctx->footer.data_offset);
+
+       return -EINVAL;
+}
+
+/*
+ * Write @bat to the table offset recorded in the header.  The table is
+ * copied into a sector-aligned scratch buffer and converted with
+ * vhd_bat_out() before being written, so @bat itself is not modified.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic disks, or another
+ * negative errno value.
+ */
+int
+vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
+{
+       int rc;
+       size_t nbytes;
+       vhd_bat_t disk;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_validate_bat(&ctx->bat);
+       if (!rc)
+               rc = vhd_validate_bat(bat);
+       if (rc)
+               return rc;
+
+       memset(&disk, 0, sizeof(vhd_bat_t));
+       nbytes = vhd_bytes_padded(bat->entries * sizeof(uint32_t));
+
+       rc = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+       if (rc)
+               return rc;
+
+       rc = posix_memalign((void **)&disk.bat, VHD_SECTOR_SIZE, nbytes);
+       if (rc)
+               return -rc;
+
+       disk.entries = bat->entries;
+       disk.spb     = bat->spb;
+       memcpy(disk.bat, bat->bat, nbytes);
+       vhd_bat_out(&disk);
+
+       rc = vhd_write(ctx, disk.bat, nbytes);
+       free(disk.bat);
+
+       return rc;
+}
+
+/*
+ * Write @batmap to disk: first the map itself at the offset recorded in
+ * its header, then the checksummed batmap header at the offset given by
+ * vhd_batmap_header_offset().  Both are staged through sector-aligned
+ * buffers; @batmap is not modified.
+ *
+ * Returns 0 on success, -EINVAL if the image has no batmap, or another
+ * negative errno value (also logged).
+ */
+int
+vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       off64_t off;
+       vhd_batmap_t b;
+       char *buf, *map;
+       size_t size, map_size;
+
+       buf      = NULL;
+       map      = NULL;
+
+       if (!vhd_has_batmap(ctx)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* work on a local copy so the checksum/conversion stay private */
+       b.header = batmap->header;
+       b.map    = batmap->map;
+
+       b.header.checksum = vhd_checksum_batmap(&b);
+       err = vhd_validate_batmap(&b);
+       if (err)
+               goto out;
+
+       off      = b.header.batmap_offset;
+       map_size = vhd_sectors_to_bytes(b.header.batmap_size);
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = posix_memalign((void **)&map, VHD_SECTOR_SIZE, map_size);
+       if (err) {
+               map = NULL;
+               err = -err;
+               goto out;
+       }
+
+       memcpy(map, b.map, map_size);
+
+       err  = vhd_write(ctx, map, map_size);
+       if (err)
+               goto out;
+
+       err  = vhd_batmap_header_offset(ctx, &off);
+       if (err)
+               goto out;
+
+       size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               err = -err;
+               buf = NULL;
+               goto out;
+       }
+
+       vhd_batmap_header_out(&b);
+       memset(buf, 0, size);
+       memcpy(buf, &b.header, sizeof(vhd_batmap_header_t));
+
+       err  = vhd_write(ctx, buf, size);
+
+out:
+       if (err)
+               VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
+       free(buf);
+       free(map);
+       /* report the failure instead of unconditionally claiming success */
+       return err;
+}
+
+/*
+ * Write the sector bitmap of allocated block @block from @bitmap, which
+ * must be aligned to VHD_SECTOR_SIZE.  ctx->bm_secs sectors are written.
+ *
+ * Returns 0, -EINVAL (not a dynamic disk, misaligned buffer, or block
+ * unallocated), -ERANGE (block beyond the BAT) or another negative errno
+ * value.
+ */
+int
+vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap)
+{
+       int rc;
+       uint64_t entry;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_validate_bat(&ctx->bat);
+       if (rc)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       /* the buffer is handed straight to write(): it must be aligned */
+       if ((unsigned long)bitmap & (VHD_SECTOR_SIZE - 1))
+               return -EINVAL;
+
+       entry = ctx->bat.bat[block];
+       if (entry == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       rc = vhd_seek(ctx, vhd_sectors_to_bytes(entry), SEEK_SET);
+       if (rc)
+               return rc;
+
+       return vhd_write(ctx, bitmap, vhd_sectors_to_bytes(ctx->bm_secs));
+}
+
+/*
+ * Write the data area of allocated block @block from @data, which must
+ * be aligned to VHD_SECTOR_SIZE.  ctx->spb sectors are written directly
+ * after the block's sector bitmap.
+ *
+ * Returns 0, -EINVAL (not a dynamic disk, misaligned buffer, or block
+ * unallocated), -ERANGE (block beyond the BAT) or another negative errno
+ * value.
+ */
+int
+vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data)
+{
+       int err;
+       off64_t off;
+       size_t size;
+       uint64_t blk;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       err = vhd_validate_bat(&ctx->bat);
+       if (err)
+               return err;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       /*
+        * Alignment check: test the low bits of the address, exactly as
+        * vhd_write_bitmap() does.  Masking with the complement instead
+        * rejects practically every valid buffer.
+        */
+       if ((unsigned long)data & (VHD_SECTOR_SIZE - 1))
+               return -EINVAL;
+
+       blk  = ctx->bat.bat[block];
+       if (blk == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       off  = vhd_sectors_to_bytes(blk + ctx->bm_secs);
+       size = vhd_sectors_to_bytes(ctx->spb);
+
+       err  = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               return err;
+
+       return vhd_write(ctx, data, size);
+}
+
+/*
+ * Duplicate @name into *dup.  Returns 0, -ENAMETOOLONG when the name is
+ * MAX_NAME_LEN characters or longer, or -ENOMEM; *dup is NULL on failure.
+ */
+static inline int
+namedup(char **dup, const char *name)
+{
+       *dup = NULL;
+
+       if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN)
+               return -ENAMETOOLONG;
+
+       *dup = strdup(name);
+
+       return *dup ? 0 : -ENOMEM;
+}
+
+/*
+ * Reposition the image's file descriptor with lseek64().
+ * Returns 0 on success or -errno on failure (also logged).
+ */
+int
+vhd_seek(vhd_context_t *ctx, off64_t offset, int whence)
+{
+       int err;
+       off64_t off;
+
+       off = lseek64(ctx->fd, offset, whence);
+       if (off == (off64_t)-1) {
+               /* capture errno before logging can overwrite it */
+               err = -errno;
+               VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n",
+                      ctx->file, offset, whence, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Report the current file offset of the image's descriptor, i.e. the
+ * raw result of lseek64(fd, 0, SEEK_CUR).
+ */
+off64_t
+vhd_position(vhd_context_t *ctx)
+{
+       off64_t here;
+
+       here = lseek64(ctx->fd, 0, SEEK_CUR);
+
+       return here;
+}
+
+/*
+ * Read exactly @size bytes from the current file position into @buf.
+ * Returns 0 only when the full amount was read; otherwise logs and
+ * returns -errno, or -EIO for a short read that left errno unset.
+ */
+int
+vhd_read(vhd_context_t *ctx, void *buf, size_t size)
+{
+       int err;
+       ssize_t ret;
+
+       errno = 0;
+
+       /* ssize_t keeps -1 distinct from a size and matches %zd below */
+       ret = read(ctx->fd, buf, size);
+       if (ret >= 0 && (size_t)ret == size)
+               return 0;
+
+       /* capture errno before logging can overwrite it */
+       err = (errno ? -errno : -EIO);
+
+       VHDLOG("%s: read of %zu returned %zd, errno: %d\n",
+              ctx->file, size, ret, (errno ? err : 0));
+
+       return err;
+}
+
+/*
+ * Write exactly @size bytes from @buf at the current file position.
+ * Returns 0 only when the full amount was written; otherwise logs and
+ * returns -errno, or -EIO for a short write that left errno unset.
+ */
+int
+vhd_write(vhd_context_t *ctx, void *buf, size_t size)
+{
+       int err;
+       ssize_t ret;
+
+       errno = 0;
+
+       /* ssize_t keeps -1 distinct from a size and matches %zd below */
+       ret = write(ctx->fd, buf, size);
+       if (ret >= 0 && (size_t)ret == size)
+               return 0;
+
+       /* capture errno before logging can overwrite it */
+       err = (errno ? -errno : -EIO);
+
+       VHDLOG("%s: write of %zu returned %zd, errno: %d\n",
+              ctx->file, size, ret, (errno ? err : 0));
+
+       return err;
+}
+
+/*
+ * Translate virtual sector @sector into a sector offset within the
+ * image file, stored in *offset.  For dynamic disks an unallocated
+ * block yields DD_BLK_UNUSED; for other disk types the sector number is
+ * passed through unchanged (NOTE(review): assumes fixed-disk data
+ * starts at sector 0 of the file -- confirm).
+ *
+ * Returns 0 on success, -ERANGE when the sector lies beyond the BAT, or
+ * the error from vhd_get_bat().
+ */
+int
+vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
+{
+       int err;
+       uint32_t block;
+
+       if (!vhd_type_dynamic(ctx)) {
+               /*
+                * The return value is a status code: hand the result
+                * back through *offset like the dynamic case does.
+                */
+               *offset = sector;
+               return 0;
+       }
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       block = sector / ctx->spb;
+       if (block >= ctx->bat.entries)
+               return -ERANGE; /* never index past the table */
+
+       if (ctx->bat.bat[block] == DD_BLK_UNUSED)
+               *offset = DD_BLK_UNUSED;
+       else
+               *offset = ctx->bat.bat[block] +
+                       ctx->bm_secs + (sector % ctx->spb);
+
+       return 0;
+}
+
+/*
+ * Fast-path open: pull footer and header from the current file position
+ * with a single read, then validate them.  When the footer's data_offset
+ * does not point directly behind the footer, the header is fetched with
+ * vhd_read_header() instead.  For dynamic disks ctx->spb and
+ * ctx->bm_secs are derived from the header.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_open_fast(vhd_context_t *ctx)
+{
+       int rc;
+       char *raw;
+       size_t total;
+
+       total = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+       rc    = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, total);
+       if (rc) {
+               VHDLOG("failed allocating %s: %d\n", ctx->file, -rc);
+               return -rc;
+       }
+
+       rc = vhd_read(ctx, raw, total);
+       if (rc) {
+               VHDLOG("failed reading %s: %d\n", ctx->file, rc);
+               goto done;
+       }
+
+       memcpy(&ctx->footer, raw, sizeof(vhd_footer_t));
+       vhd_footer_in(&ctx->footer);
+       rc = vhd_validate_footer(&ctx->footer);
+       if (rc || !vhd_type_dynamic(ctx))
+               goto done;
+
+       if (ctx->footer.data_offset == sizeof(vhd_footer_t)) {
+               /* header directly follows the footer: already in 'raw' */
+               memcpy(&ctx->header,
+                      raw + sizeof(vhd_footer_t),
+                      sizeof(vhd_header_t));
+               vhd_header_in(&ctx->header);
+               rc = vhd_validate_header(&ctx->header);
+       } else {
+               rc = vhd_read_header(ctx, &ctx->header);
+       }
+
+       if (!rc) {
+               ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+               ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+       }
+
+done:
+       free(raw);
+       return rc;
+}
+
+/*
+ * Open the VHD image @file and initialise @ctx from it.
+ *
+ * @flags: VHD_OPEN_* bits.  VHD_OPEN_STRICT cancels VHD_OPEN_FAST;
+ *         VHD_OPEN_FAST delegates footer/header loading to
+ *         vhd_open_fast(); VHD_OPEN_IGNORE_DISABLED permits opening an
+ *         image for which vhd_disabled() is true.
+ *
+ * On failure @ctx is zeroed and a negative errno value is returned.
+ */
+int
+vhd_open(vhd_context_t *ctx, const char *file, int flags)
+{
+       int rc, mode;
+
+       if (flags & VHD_OPEN_STRICT)
+               vhd_flag_clear(flags, VHD_OPEN_FAST);
+
+       memset(ctx, 0, sizeof(vhd_context_t));
+       ctx->oflags = flags;
+       ctx->fd     = -1;
+
+       rc = namedup(&ctx->file, file);
+       if (rc)
+               return rc;
+
+       mode = O_DIRECT | O_LARGEFILE;
+       if (flags & VHD_OPEN_RDONLY)
+               mode |= O_RDONLY;
+       if (flags & VHD_OPEN_RDWR)
+               mode |= O_RDWR;
+
+       ctx->fd = open(ctx->file, mode, 0644);
+       if (ctx->fd == -1) {
+               rc = -errno;
+               VHDLOG("failed to open %s: %d\n", ctx->file, rc);
+               goto fail;
+       }
+
+       rc = vhd_test_file_fixed(ctx->file, &ctx->is_block);
+       if (rc)
+               goto fail;
+
+       if (flags & VHD_OPEN_FAST) {
+               rc = vhd_open_fast(ctx);
+               if (rc)
+                       goto fail;
+
+               return 0;
+       }
+
+       rc = vhd_read_footer(ctx, &ctx->footer);
+       if (rc)
+               goto fail;
+
+       if (!(flags & VHD_OPEN_IGNORE_DISABLED) && vhd_disabled(ctx)) {
+               rc = -EINVAL;
+               goto fail;
+       }
+
+       if (!vhd_type_dynamic(ctx))
+               return 0;
+
+       rc = vhd_read_header(ctx, &ctx->header);
+       if (rc)
+               goto fail;
+
+       ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+       ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
+
+       return 0;
+
+fail:
+       if (ctx->fd != -1)
+               close(ctx->fd);
+       free(ctx->file);
+       memset(ctx, 0, sizeof(vhd_context_t));
+       return rc;
+}
+
+/*
+ * Release everything held by @ctx and zero it.  The descriptor is
+ * closed only when ctx->file is set, which this code uses as the
+ * "context is open" marker.
+ */
+void
+vhd_close(vhd_context_t *ctx)
+{
+       if (ctx->file != NULL)
+               close(ctx->fd);
+
+       free(ctx->batmap.map);
+       free(ctx->bat.bat);
+       free(ctx->file);
+
+       memset(ctx, 0, sizeof(vhd_context_t));
+}
+
+/*
+ * Fill ctx->footer for a new image of the given @type and virtual
+ * @size: cookie, version fields, creation timestamp, geometry, creator
+ * application "tap" and a freshly generated UUID.  data_offset starts
+ * out as all-ones.
+ */
+static inline void
+vhd_initialize_footer(vhd_context_t *ctx, int type, uint64_t size)
+{
+       vhd_footer_t *ftr = &ctx->footer;
+
+       memset(ftr, 0, sizeof(vhd_footer_t));
+       memcpy(ftr->cookie, HD_COOKIE, sizeof(ftr->cookie));
+
+       ftr->features    = HD_RESERVED;
+       ftr->ff_version  = HD_FF_VERSION;
+       ftr->timestamp   = vhd_time(time(NULL));
+       ftr->crtr_ver    = VHD_CURRENT_VERSION;
+       ftr->crtr_os     = 0x00000000;
+       ftr->orig_size   = size;
+       ftr->curr_size   = size;
+       ftr->geometry    = vhd_chs(size);
+       ftr->type        = type;
+       ftr->saved       = 0;
+       ftr->data_offset = 0xFFFFFFFFFFFFFFFF;
+
+       strcpy(ftr->crtr_app, "tap");
+       uuid_generate(ftr->uuid);
+}
+
+/*
+ * Store the basename of @parent_path in ctx->header.prt_name, converted
+ * from ASCII to big-endian UTF-16 (MICROSOFT_COMPAT).  The field is
+ * zeroed before the conversion.
+ *
+ * Returns 0 on success, -EINVAL for an empty basename or an incomplete
+ * conversion, -ENOMEM, or the negative errno from iconv_open()/iconv().
+ */
+static int
+vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
+{
+       int err;
+       iconv_t cd;
+       size_t ibl, obl;
+       char *pname, *ppath, *dst;
+
+       err   = 0;
+       pname = NULL;
+       ppath = NULL;
+
+       /*
+        * MICROSOFT_COMPAT
+        * big endian unicode here
+        */
+       cd = iconv_open(UTF_16BE, "ASCII");
+       if (cd == (iconv_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* basename() may modify its argument: work on a private copy */
+       ppath = strdup(parent_path);
+       if (!ppath) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       pname = basename(ppath);
+       if (!strcmp(pname, "")) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       ibl = strlen(pname);
+       obl = sizeof(ctx->header.prt_name);
+       dst = ctx->header.prt_name;
+
+       memset(dst, 0, obl);
+
+       /* make sure a stale errno is not reported for leftover input */
+       errno = 0;
+       if (iconv(cd, &pname, &ibl, &dst, &obl) == (size_t)-1 || ibl)
+               err = (errno ? -errno : -EINVAL);
+
+out:
+       /* only close a descriptor that was actually opened */
+       if (cd != (iconv_t)-1)
+               iconv_close(cd);
+       free(ppath);
+       return err;
+}
+
+/*
+ * Return the size of @name as reported by seeking to its end, or a
+ * negative errno value if the file cannot be opened or sought.
+ */
+static off64_t
+get_file_size(const char *name)
+{
+       int fd, err;
+       off64_t end;
+
+       fd = open(name, O_LARGEFILE | O_RDONLY);
+       if (fd == -1) {
+               /* capture errno before logging can overwrite it */
+               err = errno;
+               VHDLOG("unable to open '%s': %d\n", name, err);
+               return -err;
+       }
+
+       end = lseek64(fd, 0, SEEK_END);
+       if (end == (off64_t)-1)
+               end = -errno; /* read errno before close() can change it */
+
+       close(fd);
+       return end;
+}
+
+/*
+ * Fill ctx->header for a new dynamic or differencing image and point
+ * the footer's data_offset at it.
+ *
+ * For differencing disks the parent's modification time is recorded,
+ * and, unless @raw is set, the parent's UUID as well.  When @size is 0
+ * the virtual size is inherited from the parent (file size for a raw
+ * parent, footer curr_size for a VHD parent).  Footer sizes, geometry
+ * and max_bat_size are then recomputed and the parent name is encoded
+ * into the header.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic disks, or a negative
+ * errno value.
+ */
+static int
+vhd_initialize_header(vhd_context_t *ctx, const char *parent_path,
+               uint64_t size, int raw)
+{
+       int err;
+       struct stat stats;
+       vhd_context_t parent;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       memset(&ctx->header, 0, sizeof(vhd_header_t));
+       memcpy(ctx->header.cookie, DD_COOKIE, sizeof(ctx->header.cookie));
+       ctx->header.data_offset  = (uint64_t)-1;
+       ctx->header.table_offset = VHD_SECTOR_SIZE * 3; /* 1 ftr + 2 hdr */
+       ctx->header.hdr_ver      = DD_VERSION;
+       ctx->header.block_size   = VHD_BLOCK_SIZE;
+       ctx->header.prt_ts       = 0;
+       ctx->header.res1         = 0;
+       ctx->header.max_bat_size = (ctx->footer.curr_size +
+                                   VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+       ctx->footer.data_offset  = VHD_SECTOR_SIZE;
+
+       /* plain dynamic disks have no parent: nothing more to set up */
+       if (ctx->footer.type == HD_TYPE_DYNAMIC)
+               return 0;
+
+       err = stat(parent_path, &stats);
+       if (err == -1)
+               return -errno;
+
+       if (raw) {
+               ctx->header.prt_ts = vhd_time(stats.st_mtime);
+               if (!size) {
+                       off64_t psize = get_file_size(parent_path);
+
+                       /*
+                        * A negative result is an error code, not a
+                        * size: do not let it become a huge unsigned
+                        * virtual disk size.
+                        */
+                       if (psize < 0)
+                               return (int)psize;
+                       size = psize;
+               }
+       }
+       else {
+               err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY);
+               if (err)
+                       return err;
+
+               ctx->header.prt_ts = vhd_time(stats.st_mtime);
+               uuid_copy(ctx->header.prt_uuid, parent.footer.uuid);
+               if (!size)
+                       size = parent.footer.curr_size;
+               vhd_close(&parent);
+       }
+       ctx->footer.orig_size    = size;
+       ctx->footer.curr_size    = size;
+       ctx->footer.geometry     = vhd_chs(size);
+       ctx->header.max_bat_size =
+               (size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+
+       return vhd_initialize_header_parent_name(ctx, parent_path);
+}
+
+/*
+ * Write the three parent locators (MACX, W2KU, W2RU, in that order) of
+ * a differencing disk back to back, starting at the first sector
+ * boundary after the batmap, and record them in ctx->header.loc[0..2].
+ *
+ * Returns 0 on success, -EINVAL for non-differencing disks, or the
+ * first error from vhd_parent_locator_write_at().
+ */
+static int
+vhd_write_parent_locators(vhd_context_t *ctx, const char *parent)
+{
+       static const uint32_t codes[] = {
+               PLAT_CODE_MACX,
+               PLAT_CODE_W2KU,
+               PLAT_CODE_W2RU,
+       };
+       int idx, rc;
+       off64_t pos;
+
+       if (ctx->footer.type != HD_TYPE_DIFF)
+               return -EINVAL;
+
+       pos = ctx->batmap.header.batmap_offset +
+               vhd_sectors_to_bytes(ctx->batmap.header.batmap_size);
+       if (pos & (VHD_SECTOR_SIZE - 1))
+               pos = vhd_bytes_padded(pos);
+
+       for (idx = 0; idx < 3; idx++) {
+               vhd_parent_locator_t *slot = ctx->header.loc + idx;
+
+               rc = vhd_parent_locator_write_at(ctx, parent, pos,
+                                                codes[idx], 0, slot);
+               if (rc)
+                       return rc;
+
+               /* the next locator starts right behind this one */
+               pos += vhd_parent_locator_size(slot);
+       }
+
+       return 0;
+}
+
+/*
+ * Re-point the differencing disk @child at a new parent.
+ *
+ * The header's parent UUID (cleared when @raw is set), parent name and
+ * parent timestamp are updated, every existing MACX/W2KU/W2RU locator
+ * is rewritten in place (bounded by its current size), and finally the
+ * header is written back.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_change_parent(vhd_context_t *child, char *parent_path, int raw)
+{
+       int i, err;
+       char *ppath;
+       struct stat stats;
+       vhd_context_t parent;
+
+       ppath = realpath(parent_path, NULL);
+       if (!ppath) {
+               /* capture errno before logging can overwrite it */
+               err = -errno;
+               VHDLOG("error resolving parent path %s for %s: %d\n",
+                      parent_path, child->file, -err);
+               return err;
+       }
+
+       err = stat(ppath, &stats);
+       if (err == -1) {
+               err = -errno;
+               goto out;
+       }
+
+       if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (raw) {
+               uuid_clear(child->header.prt_uuid);
+       } else {
+               err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
+               if (err) {
+                       VHDLOG("error opening parent %s for %s: %d\n",
+                              ppath, child->file, err);
+                       goto out;
+               }
+               uuid_copy(child->header.prt_uuid, parent.footer.uuid);
+               vhd_close(&parent);
+       }
+
+       /*
+        * The parent name in the header is best-effort: a failure is
+        * logged but does not abort the reparent.
+        */
+       err = vhd_initialize_header_parent_name(child, ppath);
+       if (err)
+               VHDLOG("error encoding parent name %s for %s: %d\n",
+                      ppath, child->file, err);
+       child->header.prt_ts = vhd_time(stats.st_mtime);
+
+       for (i = 0; i < vhd_parent_locator_count(child); i++) {
+               vhd_parent_locator_t *loc = child->header.loc + i;
+               size_t max = vhd_parent_locator_size(loc);
+
+               switch (loc->code) {
+               case PLAT_CODE_MACX:
+               case PLAT_CODE_W2KU:
+               case PLAT_CODE_W2RU:
+                       break;
+               default:
+                       continue;
+               }
+
+               err = vhd_parent_locator_write_at(child, ppath,
+                                                 loc->data_offset,
+                                                 loc->code, max, loc);
+               if (err) {
+                       VHDLOG("error writing parent locator %d for %s: %d\n",
+                              i, child->file, err);
+                       goto out;
+               }
+       }
+
+       TEST_FAIL_AT(FAIL_REPARENT_LOCATOR);
+
+       err = vhd_write_header(child, &child->header);
+       if (err) {
+               VHDLOG("error writing header for %s: %d\n", child->file, err);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(ppath);
+       return err;
+}
+
+/*
+ * Initialise ctx->batmap for a new dynamic disk -- one bit per BAT
+ * entry, rounded up to whole sectors and cleared -- and write it out
+ * with vhd_write_batmap().
+ *
+ * Returns -EINVAL for non-dynamic disks, a negative errno value on
+ * allocation or offset lookup failure, else the result of
+ * vhd_write_batmap().
+ */
+static int
+vhd_create_batmap(vhd_context_t *ctx)
+{
+       off64_t hdr_off;
+       int rc, nbytes;
+       vhd_batmap_header_t *hdr;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       hdr    = &ctx->batmap.header;
+       nbytes = (ctx->header.max_bat_size + 7) >> 3;
+
+       memset(hdr, 0, sizeof(vhd_batmap_header_t));
+       memcpy(hdr->cookie, VHD_BATMAP_COOKIE, sizeof(hdr->cookie));
+
+       rc = vhd_batmap_header_offset(ctx, &hdr_off);
+       if (rc)
+               return rc;
+
+       hdr->batmap_version = VHD_BATMAP_CURRENT_VERSION;
+       hdr->batmap_size    = secs_round_up_no_zero(nbytes);
+       /* the map itself starts right behind its padded header */
+       hdr->batmap_offset  = hdr_off +
+               vhd_bytes_padded(sizeof(vhd_batmap_header_t));
+
+       nbytes = vhd_sectors_to_bytes(hdr->batmap_size);
+
+       rc = posix_memalign((void **)&ctx->batmap.map,
+                           VHD_SECTOR_SIZE, nbytes);
+       if (rc) {
+               ctx->batmap.map = NULL;
+               return -rc;
+       }
+
+       memset(ctx->batmap.map, 0, nbytes);
+
+       return vhd_write_batmap(ctx, &ctx->batmap);
+}
+
+/*
+ * Allocate ctx->bat for a new dynamic disk, mark every entry as
+ * DD_BLK_UNUSED and write the table out with vhd_write_bat().
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic disks, or another
+ * negative errno value.
+ */
+static int
+vhd_create_bat(vhd_context_t *ctx)
+{
+       int err;
+       uint32_t i;
+       size_t size;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
+       err  = posix_memalign((void **)&ctx->bat.bat, VHD_SECTOR_SIZE, size);
+       if (err) {
+               ctx->bat.bat = NULL;
+               /*
+                * posix_memalign() returns a positive errno value;
+                * negate it like every other caller in this file.
+                */
+               return -err;
+       }
+
+       /* zero first so the padding behind the last entry is defined */
+       memset(ctx->bat.bat, 0, size);
+       for (i = 0; i < ctx->header.max_bat_size; i++)
+               ctx->bat.bat[i] = DD_BLK_UNUSED;
+
+       err = vhd_seek(ctx, ctx->header.table_offset, SEEK_SET);
+       if (err)
+               return err;
+
+       ctx->bat.entries = ctx->header.max_bat_size;
+       ctx->bat.spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
+
+       return vhd_write_bat(ctx, &ctx->bat);
+}
+
+/*
+ * Fill the data area of a new fixed disk with zeroes by writing
+ * curr_size >> VHD_BLOCK_SHIFT blocks from the start of the file.  The
+ * zero source is an anonymous read-only mapping of VHD_BLOCK_SIZE bytes.
+ *
+ * Returns 0 on success, -EINVAL for non-fixed disks, or a negative
+ * errno value.
+ */
+static int
+vhd_initialize_fixed_disk(vhd_context_t *ctx)
+{
+       int i, rc;
+       char *zeroes;
+
+       if (ctx->footer.type != HD_TYPE_FIXED)
+               return -EINVAL;
+
+       rc = vhd_seek(ctx, 0, SEEK_SET);
+       if (rc)
+               return rc;
+
+       zeroes = mmap(0, VHD_BLOCK_SIZE, PROT_READ,
+                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (zeroes == MAP_FAILED)
+               return -errno;
+
+       rc = 0;
+       for (i = 0; !rc && i < ctx->footer.curr_size >> VHD_BLOCK_SHIFT; i++)
+               rc = vhd_write(ctx, zeroes, VHD_BLOCK_SIZE);
+
+       munmap(zeroes, VHD_BLOCK_SIZE);
+       return rc;
+}
+
+/*
+ * Report the physical size of the image in *size: the end of data as
+ * given by vhd_end_of_data() plus one footer.  Returns 0 or the error
+ * from vhd_end_of_data().
+ */
+int
+vhd_get_phys_size(vhd_context_t *ctx, off64_t *size)
+{
+       int rc;
+
+       rc = vhd_end_of_data(ctx, size);
+       if (!rc)
+               *size += sizeof(vhd_footer_t);
+
+       return rc;
+}
+
+/*
+ * Set the physical size of the image to @size by writing the footer so
+ * that it ends at that offset.  Sizes below the current physical size
+ * are refused with -EINVAL because they would cut into existing data.
+ */
+int
+vhd_set_phys_size(vhd_context_t *ctx, off64_t size)
+{
+       int rc;
+       off64_t current;
+
+       rc = vhd_get_phys_size(ctx, &current);
+       if (rc)
+               return rc;
+
+       if (size >= current)
+               return vhd_write_footer_at(ctx, &ctx->footer,
+                               size - sizeof(vhd_footer_t));
+
+       // would result in data loss
+       VHDLOG("ERROR: new size (%"PRIu64") < phys size (%"PRIu64")\n",
+                       size, current);
+       return -EINVAL;
+}
+
+/*
+ * Create a new VHD image @name of the given @type.
+ *
+ * @parent: parent image path; required for HD_TYPE_DIFF.
+ * @bytes:  requested virtual size, rounded up to whole VHD blocks.
+ * @flags:  VHD_FLAG_CREAT_PARENT_RAW marks the parent as a raw image.
+ *
+ * Fixed disks are zero-filled.  Dynamic and differencing disks get a
+ * backup footer, header, batmap, BAT and (differencing only) parent
+ * locators.  In all cases a footer is written last, at end-of-file
+ * (inside the last footer-sized slot for block devices).
+ *
+ * On failure the partially written file is unlinked unless the target
+ * was identified as a block device.  Returns 0 or a negative errno
+ * value.
+ */
+static int
+__vhd_create(const char *name, const char *parent, uint64_t bytes, int type,
+               vhd_flag_creat_t flags)
+{
+       int err, is_block;
+       off64_t off;
+       vhd_context_t ctx;
+       uint64_t size, blks;
+
+       switch (type) {
+       case HD_TYPE_DIFF:
+               if (!parent)
+                       return -EINVAL;
+               /* fall through */
+       case HD_TYPE_FIXED:
+       case HD_TYPE_DYNAMIC:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1)
+               return -ENAMETOOLONG;
+
+       memset(&ctx, 0, sizeof(vhd_context_t));
+       blks   = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+       size   = blks << VHD_BLOCK_SHIFT;
+
+       ctx.fd = open(name, O_WRONLY | O_CREAT |
+                     O_TRUNC | O_LARGEFILE | O_DIRECT, 0644);
+       if (ctx.fd == -1)
+               return -errno;
+
+       ctx.file = strdup(name);
+       if (!ctx.file) {
+               /*
+                * vhd_close() only closes the descriptor when ctx.file
+                * is set, so close it here to avoid leaking it.
+                */
+               close(ctx.fd);
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = vhd_test_file_fixed(ctx.file, &ctx.is_block);
+       if (err)
+               goto out;
+
+       vhd_initialize_footer(&ctx, type, size);
+
+       if (type == HD_TYPE_FIXED) {
+               err = vhd_initialize_fixed_disk(&ctx);
+               if (err)
+                       goto out;
+       } else {
+               int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW);
+               err = vhd_initialize_header(&ctx, parent, size, raw);
+               if (err)
+                       goto out;
+
+               err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
+               if (err)
+                       goto out;
+
+               err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+               if (err)
+                       goto out;
+
+               err = vhd_create_batmap(&ctx);
+               if (err)
+                       goto out;
+
+               err = vhd_create_bat(&ctx);
+               if (err)
+                       goto out;
+
+               if (type == HD_TYPE_DIFF) {
+                       err = vhd_write_parent_locators(&ctx, parent);
+                       if (err)
+                               goto out;
+               }
+
+               /* write header again since it may have changed */
+               err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+               if (err)
+                       goto out;
+       }
+
+       err = vhd_seek(&ctx, 0, SEEK_END);
+       if (err)
+               goto out;
+
+       off = vhd_position(&ctx);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* a block device cannot grow: the footer takes its last slot */
+       if (ctx.is_block)
+               off -= sizeof(vhd_footer_t);
+
+       err = vhd_write_footer_at(&ctx, &ctx.footer, off);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       /*
+        * vhd_close() zeroes the context, so remember whether this is a
+        * block device first; otherwise a failed create would unlink
+        * the device node.
+        */
+       is_block = ctx.is_block;
+       vhd_close(&ctx);
+       if (err && !is_block)
+               unlink(name);
+       return err;
+}
+
+/*
+ * Create a parentless VHD image of @type at @name, at least @bytes large.
+ * Forwards to __vhd_create() with a NULL parent, so a request for
+ * HD_TYPE_DIFF is rejected there with -EINVAL.
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t flags)
+{
+       return __vhd_create(name, NULL, bytes, type, flags);
+}
+
+/*
+ * Create a differencing (HD_TYPE_DIFF) VHD at @name whose parent is @parent.
+ * Forwards to __vhd_create(); @flags is passed through unchanged.
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_snapshot(const char *name, uint64_t bytes, const char *parent,
+               vhd_flag_creat_t flags)
+{
+       return __vhd_create(name, parent, bytes, HD_TYPE_DIFF, flags);
+}
+
+/*
+ * Read @secs sectors starting at sector @sec of a fixed-size image into
+ * @buf: seek to the byte offset of @sec, then read the requested bytes.
+ * Returns the first non-zero result of vhd_seek()/vhd_read(), else 0.
+ */
+static int
+__vhd_io_fixed_read(vhd_context_t *ctx,
+                   char *buf, uint64_t sec, uint32_t secs)
+{
+       int rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+
+       if (rc != 0)
+               return rc;
+
+       rc = vhd_read(ctx, buf, vhd_sectors_to_bytes(secs));
+       return rc;
+}
+
+/*
+ * Copy up to @secs sectors from @src to @dst, one sector at a time.
+ *
+ * A sector is copied only if its bit in @map (at @map_off + index) is
+ * still clear and, when @ctx is non-NULL, its bit in @bitmap (at
+ * @bitmap_off + index) is set.  Each copied sector is then marked in @map.
+ * With a NULL @ctx the @bitmap test is skipped entirely.
+ */
+static void
+__vhd_io_dynamic_copy_data(vhd_context_t *ctx,
+                          char *map, int map_off,
+                          char *bitmap, int bitmap_off,
+                          char *dst, char *src, int secs)
+{
+       int n;
+
+       for (n = 0; n < secs;
+            n++, src += VHD_SECTOR_SIZE, dst += VHD_SECTOR_SIZE) {
+               /* already satisfied by an earlier link in the chain */
+               if (test_bit(map, map_off + n))
+                       continue;
+
+               /* this image does not hold the sector */
+               if (ctx && !vhd_bitmap_test(ctx, bitmap, bitmap_off + n))
+                       continue;
+
+               memcpy(dst, src, VHD_SECTOR_SIZE);
+               set_bit(map, map_off + n);
+       }
+}
+
+/*
+ * Read @secs sectors starting at @sector from a single image in a VHD
+ * chain into @buf, block by block.
+ *
+ * @map holds one bit per requested sector; sectors already marked are left
+ * untouched, and sectors copied from this image are marked (see
+ * __vhd_io_dynamic_copy_data()).  Blocks whose BAT entry is DD_BLK_UNUSED
+ * are skipped without any I/O.
+ *
+ * Returns 0 on success or the error from vhd_read_bitmap()/vhd_read_block().
+ */
+static int
+__vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map,
+                          char *buf, uint64_t sector, uint32_t secs)
+{
+       off64_t off;
+       uint32_t blk, sec;
+       int err, cnt, map_off;
+       char *bitmap, *data, *src;
+
+       map_off = 0;
+
+       do {
+               blk    = sector / ctx->spb;
+               sec    = sector % ctx->spb;
+               off    = ctx->bat.bat[blk];
+               data   = NULL;
+               bitmap = NULL;
+
+               /*
+                * Never step past the end of the current block, whether or
+                * not it is allocated: with an unaligned @sector, advancing
+                * by a full block would skip the start of the next block.
+                */
+               cnt = MIN(secs, ctx->spb - sec);
+
+               if (off == DD_BLK_UNUSED)
+                       goto next;
+
+               err = vhd_read_bitmap(ctx, blk, &bitmap);
+               if (err)
+                       return err;
+
+               err = vhd_read_block(ctx, blk, &data);
+               if (err) {
+                       free(bitmap);
+                       return err;
+               }
+
+               src = data + vhd_sectors_to_bytes(sec);
+
+               __vhd_io_dynamic_copy_data(ctx,
+                                          map, map_off,
+                                          bitmap, sec,
+                                          buf, src, cnt);
+
+       next:
+               /* both are NULL when the block was skipped */
+               free(data);
+               free(bitmap);
+
+               secs    -= cnt;
+               sector  += cnt;
+               map_off += cnt;
+               buf     += vhd_sectors_to_bytes(cnt);
+
+       } while (secs);
+
+       return 0;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec from the raw (non-VHD) file
+ * @filename and copy into @buf every sector not yet marked in @map.
+ *
+ * The file is opened with O_DIRECT, so the data is read into a
+ * sector-aligned bounce buffer first.
+ *
+ * Returns 0 on success or a negative errno value (-EIO on a short read
+ * that left errno clear).
+ */
+static int
+__raw_read_link(char *filename,
+               char *map, char *buf, uint64_t sec, uint32_t secs)
+{
+       int fd, err;
+       off64_t off;
+       uint64_t size;
+       ssize_t ret;
+       char *data;
+
+       errno = 0;
+       fd = open(filename, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (fd == -1) {
+               /* save errno first: logging may overwrite it */
+               err = -errno;
+               VHDLOG("%s: failed to open: %d\n", filename, err);
+               return err;
+       }
+
+       off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n",
+                      filename, vhd_sectors_to_bytes(sec), err);
+               goto close;
+       }
+
+       size = vhd_sectors_to_bytes(secs);
+       err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, size);
+       if (err) {
+               /* posix_memalign returns a positive error number */
+               err = -err;
+               goto close;
+       }
+
+       ret = read(fd, data, size);
+       if (ret < 0 || (uint64_t)ret != size) {
+               int saved = errno;
+
+               VHDLOG("%s: reading of %"PRIu64" returned %d, errno: %d\n",
+                               filename, size, (int)ret, -saved);
+               free(data);
+               err = saved ? -saved : -EIO;
+               goto close;
+       }
+
+       /* NULL ctx: no per-sector bitmap to consult for a raw parent */
+       __vhd_io_dynamic_copy_data(NULL, map, 0, NULL, 0, buf, data, secs);
+       free(data);
+       err = 0;
+
+close:
+       close(fd);
+       return err;
+}
+
+/*
+ * Read @secs sectors starting at @sec from a dynamic or differencing image,
+ * following the chain of parents until every sector has been found.
+ *
+ * @buf is zeroed first, so sectors present in no image read back as zeros.
+ * @map tracks which of the requested sectors have been filled so far.
+ * Parent images are opened read-only into the local @parent context and
+ * closed again here; the caller's @ctx is never closed.
+ *
+ * Returns 0 on success or the error from the failing helper.
+ */
+static int
+__vhd_io_dynamic_read(vhd_context_t *ctx,
+                     char *buf, uint64_t sec, uint32_t secs)
+{
+       int err;
+       uint32_t i, done;
+       char *map, *next;
+       vhd_context_t parent, *vhd;
+
+       err  = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       vhd  = ctx;
+       next = NULL;
+       /* NOTE(review): map is indexed per sector by test_bit/set_bit; the
+        * allocation size looks larger than one bit per sector -- confirm */
+       map  = calloc(1, secs << (VHD_SECTOR_SHIFT - 3));
+       if (!map)
+               return -ENOMEM;
+
+       memset(buf, 0, vhd_sectors_to_bytes(secs));
+
+       for (;;) {
+               err = __vhd_io_dynamic_read_link(vhd, map, buf, sec, secs);
+               if (err)
+                       goto close;
+
+               /* count the sectors satisfied so far */
+               for (done = 0, i = 0; i < secs; i++)
+                       if (test_bit(map, i))
+                               done++;
+
+               if (done == secs) {
+                       err = 0;
+                       goto close;
+               }
+
+               if (vhd->footer.type == HD_TYPE_DIFF) {
+                       err = vhd_parent_locator_get(vhd, &next);
+                       if (err)
+                               goto close;
+                       /* a raw parent ends the chain: read it and stop */
+                       if (vhd_parent_raw(vhd)) {
+                               err = __raw_read_link(next, map, buf, sec,
+                                               secs);
+                               goto close;
+                       }
+               } else {
+                       /* no parent: remaining sectors stay zero-filled */
+                       err = 0;
+                       goto close;
+               }
+
+               /* done with this link; reuse @parent for the next one */
+               if (vhd != ctx)
+                       vhd_close(vhd);
+               vhd = &parent;
+
+               err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
+               if (err)
+                       goto out; /* skips vhd_close(): the open failed */
+
+               err = vhd_get_bat(vhd);
+               if (err)
+                       goto close;
+
+               free(next);
+               next = NULL;
+       }
+
+close:
+       if (vhd != ctx)
+               vhd_close(vhd);
+out:
+       free(map);
+       free(next);
+       return err;
+}
+
+/*
+ * Read @secs sectors starting at sector @sec of the image into @buf.
+ * Returns -ERANGE if the request extends past footer.curr_size; otherwise
+ * the result of the dynamic or fixed read path, chosen by image type.
+ */
+int
+vhd_io_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+       if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+               return -ERANGE;
+
+       if (vhd_type_dynamic(ctx))
+               return __vhd_io_dynamic_read(ctx, buf, sec, secs);
+
+       return __vhd_io_fixed_read(ctx, buf, sec, secs);
+}
+
+/*
+ * Write @secs sectors from @buf at sector @sec of a fixed-size image:
+ * seek to the byte offset of @sec, then write the requested bytes.
+ * Returns the first non-zero result of vhd_seek()/vhd_write(), else 0.
+ */
+static int
+__vhd_io_fixed_write(vhd_context_t *ctx,
+                    char *buf, uint64_t sec, uint32_t secs)
+{
+       int rc = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
+
+       if (rc != 0)
+               return rc;
+
+       rc = vhd_write(ctx, buf, vhd_sectors_to_bytes(secs));
+       return rc;
+}
+
+/*
+ * Allocate on-disk space for @block at the current end of data and record
+ * it in the BAT.
+ *
+ * The new extent (optional padding + sector bitmap + data) is written as
+ * zeros taken from a fresh anonymous mapping.  Padding is inserted so that
+ * the data region following the bitmap starts on a page boundary.  The BAT
+ * entry is set to the sector at which the bitmap starts, and the BAT is
+ * written back.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
+{
+       char *buf;
+       size_t size;
+       off64_t off, max;
+       int err, gap, spp;
+
+       /* sectors per page */
+       spp = getpagesize() >> VHD_SECTOR_SHIFT;
+
+       err = vhd_end_of_data(ctx, &max);
+       if (err)
+               return err;
+
+       gap   = 0;
+       off   = max;                    /* byte offset where we start writing */
+       max >>= VHD_SECTOR_SHIFT;       /* same position, in sectors */
+
+       /* data region of segment should begin on page boundary */
+       if ((max + ctx->bm_secs) % spp) {
+               gap  = (spp - ((max + ctx->bm_secs) % spp));
+               max += gap;
+       }
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               return err;
+
+       /* read-only anonymous mapping serves as a source of zeros */
+       size = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + gap);
+       buf  = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (buf == MAP_FAILED)
+               return -errno;
+
+       err = vhd_write(ctx, buf, size);
+       if (err)
+               goto out;
+
+       ctx->bat.bat[block] = max;
+       err = vhd_write_bat(ctx, &ctx->bat);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       munmap(buf, size);
+       return err;
+}
+
+/*
+ * Write @secs sectors from @buf at @sector of a dynamic/differencing image.
+ *
+ * Works block by block: allocates the block if its BAT entry is
+ * DD_BLK_UNUSED, writes the data, then marks the written sectors in the
+ * block's sector bitmap.  When the image has a batmap and the block is
+ * already flagged there, the bitmap update is skipped; when the bitmap
+ * becomes completely set, the block is flagged in the batmap.
+ * The footer is rewritten on the way out through the "out" label.
+ *
+ * Returns 0 on success or a negative errno value.
+ *
+ * NOTE(review): the early "return err" paths inside the loop do not pass
+ * through "out", so the footer is not rewritten there even if an earlier
+ * iteration allocated a block -- confirm whether that is intended.
+ */
+static int
+__vhd_io_dynamic_write(vhd_context_t *ctx,
+                      char *buf, uint64_t sector, uint32_t secs)
+{
+       char *map;
+       off64_t off;
+       uint32_t blk, sec;
+       int i, err, cnt, ret;
+
+       if (vhd_sectors_to_bytes(sector + secs) > ctx->footer.curr_size)
+               return -ERANGE;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       if (vhd_has_batmap(ctx)) {
+               err = vhd_get_batmap(ctx);
+               if (err)
+                       return err;
+       }
+
+       do {
+               blk = sector / ctx->spb;
+               sec = sector % ctx->spb;
+
+               off = ctx->bat.bat[blk];
+               if (off == DD_BLK_UNUSED) {
+                       err = __vhd_io_allocate_block(ctx, blk);
+                       if (err)
+                               return err;
+
+                       off = ctx->bat.bat[blk];
+               }
+
+               /* data follows the block's sector bitmap */
+               off += ctx->bm_secs + sec;
+               err  = vhd_seek(ctx, vhd_sectors_to_bytes(off), SEEK_SET);
+               if (err)
+                       return err;
+
+               /* stay within the current block */
+               cnt = MIN(secs, ctx->spb - sec);
+               err = vhd_write(ctx, buf, vhd_sectors_to_bytes(cnt));
+               if (err)
+                       return err;
+
+               /* block already flagged in the batmap: bitmap is not updated */
+               if (vhd_has_batmap(ctx) &&
+                   vhd_batmap_test(ctx, &ctx->batmap, blk))
+                       goto next;
+
+               err = vhd_read_bitmap(ctx, blk, &map);
+               if (err)
+                       return err;
+
+               for (i = 0; i < cnt; i++)
+                       vhd_bitmap_set(ctx, map, sec + i);
+
+               err = vhd_write_bitmap(ctx, blk, map);
+               if (err)
+                       goto fail;
+
+               if (vhd_has_batmap(ctx)) {
+                       /* flag the block only once every sector bit is set */
+                       for (i = 0; i < ctx->spb; i++)
+                               if (!vhd_bitmap_test(ctx, map, i)) {
+                                       free(map);
+                                       goto next;
+                               }
+
+                       vhd_batmap_set(ctx, &ctx->batmap, blk);
+                       err = vhd_write_batmap(ctx, &ctx->batmap);
+                       if (err)
+                               goto fail;
+               }
+
+               free(map);
+               map = NULL;
+
+       next:
+               secs   -= cnt;
+               sector += cnt;
+               buf    += vhd_sectors_to_bytes(cnt);
+       } while (secs);
+
+       err = 0;
+
+out:
+       /* report the first error; a footer failure only if nothing else failed */
+       ret = vhd_write_footer(ctx, &ctx->footer);
+       return (err ? err : ret);
+
+fail:
+       free(map);
+       goto out;
+}
+
+/*
+ * Write @secs sectors from @buf at sector @sec of the image.
+ * Returns -ERANGE if the request extends past footer.curr_size; otherwise
+ * the result of the dynamic or fixed write path, chosen by image type.
+ */
+int
+vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
+{
+       if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
+               return -ERANGE;
+
+       if (vhd_type_dynamic(ctx))
+               return __vhd_io_dynamic_write(ctx, buf, sec, secs);
+
+       return __vhd_io_fixed_write(ctx, buf, sec, secs);
+}
diff --git a/tools/blktap2/vhd/lib/relative-path.c b/tools/blktap2/vhd/lib/relative-path.c
new file mode 100644 (file)
index 0000000..8b7cb71
--- /dev/null
@@ -0,0 +1,299 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "relative-path.h"
+
+/*
+ * free @ptr and reset it to NULL so a later sfree()/free() of the same
+ * variable is harmless.  @ptr is evaluated twice and must be an lvalue.
+ */
+#define sfree(ptr)         \
+do {                       \
+       free(ptr);         \
+       ptr = NULL;        \
+} while (0)
+
+/*
+ * count the DELIMITER characters in @path
+ * returns 0 for a NULL @path
+ */
+static int
+count_nodes(char *path)
+{
+       int total = 0;
+       char *cur;
+
+       if (path == NULL)
+               return 0;
+
+       cur = path;
+       while (*cur != '\0') {
+               if (*cur == DELIMITER)
+                       total++;
+               cur++;
+       }
+
+       return total;
+}
+
+/*
+ * return a copy of the text between the start of @path and its first
+ * DELIMITER, or NULL
+ * on success @path is moved to that DELIMITER
+ * NULL with @err == 0 means no DELIMITER is left in @path
+ * @err is set to -errno on failure
+ * copy should be freed
+ */
+static char *
+next_node(char **path, int *err)
+{
+       int len;
+       char *begin, *sep, *copy;
+
+       if (!path || !*path) {
+               *err = -EINVAL;
+               return NULL;
+       }
+
+       *err  = 0;
+       begin = *path;
+
+       sep = strchr(begin, DELIMITER);
+       if (!sep)
+               return NULL;
+
+       /* room for the node text plus the terminating NUL */
+       len  = sep - begin + 1;
+       copy = malloc(len);
+       if (!copy) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+
+       /* snprintf truncates at len - 1 characters, i.e. just before sep */
+       if (snprintf(copy, len, "%s", begin) < 0) {
+               free(copy);
+               *err = -EINVAL;
+               return NULL;
+       }
+
+       *path = sep;
+       return copy;
+}
+
+/*
+ * count number of leading nodes in common between @to and @from
+ * returns number of common nodes, or -errno on failure
+ */
+static int
+count_common_nodes(char *to, char *from)
+{
+       int rc, shared;
+       char *lhs, *rhs;
+
+       if (!to || !from)
+               return -EINVAL;
+
+       rc     = 0;
+       shared = 0;
+       lhs    = NULL;
+       rhs    = NULL;
+
+       for (;;) {
+               lhs = next_node(&to, &rc);
+               if (rc || !lhs)
+                       break;
+
+               rhs = next_node(&from, &rc);
+               if (rc || !rhs)
+                       break;
+
+               if (strncmp(lhs, rhs, MAX_NAME_LEN) != 0)
+                       break;
+
+               /* step over the delimiter next_node() stopped on */
+               to++;
+               from++;
+               shared++;
+               sfree(lhs);
+               sfree(rhs);
+       }
+
+       /* release whatever the loop still held when it stopped */
+       sfree(lhs);
+       sfree(rhs);
+
+       return rc ? rc : shared;
+}
+
+/*
+ * construct path of @count '../', './' if @count is zero, or NULL on error
+ * (negative @count, result of MAX_NAME_LEN characters or more, or
+ * allocation failure)
+ * result should be freed
+ */
+static char *
+up_nodes(int count)
+{
+       char *path, *tmp;
+       int i, len, size;
+
+       if (count < 0)
+               return NULL;
+
+       if (!count)
+               return strdup("./");
+
+       len = strlen("../");
+
+       /* same limit as len * count >= MAX_NAME_LEN, without overflow */
+       if (count > (MAX_NAME_LEN - 1) / len)
+               return NULL;
+
+       size = len * count;
+       path = malloc(size + 1);
+       if (!path)
+               return NULL;
+
+       tmp = path;
+       for (i = 0; i < count; i++) {
+               memcpy(tmp, "../", len);
+               tmp += len;
+       }
+       *tmp = '\0';
+
+       return path;
+}
+
+/*
+ * return pointer to the character following the @offset'th DELIMITER of
+ * @from, or NULL if @from is NULL, @offset is zero, or @from holds fewer
+ * than @offset delimiters
+ */
+static char *
+node_offset(char *from, int offset)
+{
+       char *cur;
+
+       if (!from || !offset)
+               return NULL;
+
+       for (cur = from; *cur != '\0'; cur++) {
+               if (*cur != DELIMITER)
+                       continue;
+
+               offset--;
+               if (offset == 0)
+                       return cur + 1;
+       }
+
+       return NULL;
+}
+
+/*
+ * return a relative path from @from to @to
+ * both paths are resolved with realpath() first, so they must exist
+ * on failure NULL is returned and @err is set to -errno
+ * result should be freed
+ */
+char *
+relative_path_to(char *from, char *to, int *err)
+{
+       int from_nodes, common;
+       char *to_absolute, *from_absolute;
+       char *up, *common_target_path, *relative_path;
+
+       *err          = 0;
+       up            = NULL;
+       to_absolute   = NULL;
+       from_absolute = NULL;
+       relative_path = NULL;
+
+       if (!from || !to) {
+               *err = -EINVAL;
+               return NULL;
+       }
+
+       if (strnlen(to, MAX_NAME_LEN)   == MAX_NAME_LEN ||
+           strnlen(from, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               EPRINTF("invalid input; max path length is %d\n",
+                       MAX_NAME_LEN);
+               *err = -ENAMETOOLONG;
+               return NULL;
+       }
+
+       to_absolute = realpath(to, NULL);
+       if (!to_absolute) {
+               /* save errno before logging can overwrite it */
+               *err = -errno;
+               EPRINTF("failed to get absolute path of %s\n", to);
+               goto out;
+       }
+
+       from_absolute = realpath(from, NULL);
+       if (!from_absolute) {
+               *err = -errno;
+               EPRINTF("failed to get absolute path of %s\n", from);
+               goto out;
+       }
+
+       if (strnlen(to_absolute, MAX_NAME_LEN)   == MAX_NAME_LEN ||
+           strnlen(from_absolute, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               EPRINTF("invalid input; max path length is %d\n",
+                       MAX_NAME_LEN);
+               *err = -ENAMETOOLONG;
+               goto out;
+       }
+
+       /* count nodes in source path */
+       from_nodes = count_nodes(from_absolute);
+
+       /* count nodes in common (skip the leading delimiter of each path) */
+       common = count_common_nodes(to_absolute + 1, from_absolute + 1);
+       if (common < 0) {
+               EPRINTF("failed to count common nodes of %s and %s: %d\n",
+                       to_absolute, from_absolute, common);
+               *err = common;
+               goto out;
+       }
+
+       /* move up to common node */
+       up = up_nodes(from_nodes - common - 1);
+       if (!up) {
+               EPRINTF("failed to allocate relative path for %s: %d\n",
+                       from_absolute, -ENOMEM);
+               *err = -ENOMEM;
+               goto out;
+       }
+
+       /* get path from common node to target */
+       common_target_path = node_offset(to_absolute, common + 1);
+       if (!common_target_path) {
+               EPRINTF("failed to find common target path to %s: %d\n",
+                       to_absolute, -EINVAL);
+               *err = -EINVAL;
+               goto out;
+       }
+
+       /* get relative path */
+       if (asprintf(&relative_path, "%s%s", up, common_target_path) == -1) {
+               EPRINTF("failed to construct final path %s%s: %d\n",
+                       up, common_target_path, -ENOMEM);
+               relative_path = NULL;
+               *err = -ENOMEM;
+               goto out;
+       }
+
+out:
+       sfree(up);
+       sfree(to_absolute);
+       sfree(from_absolute);
+
+       return relative_path;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-check.c b/tools/blktap2/vhd/lib/vhd-util-check.c
new file mode 100644 (file)
index 0000000..d7d5880
--- /dev/null
@@ -0,0 +1,977 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+// allow the VHD timestamp to be at most this many seconds into the future to 
+// account for time skew with NFS servers
+#define TIMESTAMP_MAX_SLACK 1800
+
+/*
+ * Return 0 if the @size bytes at @buf are all zero, 1 otherwise.
+ *
+ * Callers use the result as a boolean.  Returning the index of the first
+ * non-zero byte would report a non-zero byte at index 0 as "all zeros".
+ */
+static int
+vhd_util_check_zeros(void *buf, size_t size)
+{
+       size_t i;
+       char *p;
+
+       p = buf;
+       for (i = 0; i < size; i++)
+               if (p[i])
+                       return 1;
+
+       return 0;
+}
+
+/*
+ * Return 1 if every 32-bit word of @footer equals 0xc7c7c7c7, else 0.
+ */
+static int
+vhd_util_check_footer_opened(vhd_footer_t *footer)
+{
+       int idx, words;
+       uint32_t *word;
+
+       word  = (uint32_t *)footer;
+       words = sizeof(*footer) / sizeof(uint32_t);
+
+       idx = 0;
+       while (idx < words) {
+               if (word[idx] != 0xc7c7c7c7)
+                       return 0;
+               idx++;
+       }
+
+       return 1;
+}
+
+/*
+ * Validate the fields of @footer.
+ * Returns NULL if the footer is acceptable, otherwise a static string
+ * describing the first problem found (not to be freed).
+ */
+static char *
+vhd_util_check_validate_footer(vhd_footer_t *footer)
+{
+       int size;
+       uint32_t checksum, now;
+
+       size = sizeof(footer->cookie);
+       if (memcmp(footer->cookie, HD_COOKIE, size))
+               return "invalid cookie";
+
+       checksum = vhd_checksum_footer(footer);
+       if (checksum != footer->checksum) {
+               /*
+                * For hidden footers created by "tap" versions 0.1 and 1.1,
+                * also accept a checksum computed with the hidden field
+                * cleared.
+                */
+               if (footer->hidden &&
+                   !strncmp(footer->crtr_app, "tap", 3) &&
+                   (footer->crtr_ver == VHD_VERSION(0, 1) ||
+                    footer->crtr_ver == VHD_VERSION(1, 1))) {
+                       char tmp = footer->hidden;
+                       footer->hidden = 0;
+                       checksum = vhd_checksum_footer(footer);
+                       footer->hidden = tmp;
+
+                       if (checksum == footer->checksum)
+                               goto ok;
+               }
+
+               return "invalid checksum";
+       }
+
+ok:
+       if (!(footer->features & HD_RESERVED))
+               return "invalid 'reserved' feature";
+
+       if (footer->features & ~(HD_TEMPORARY | HD_RESERVED))
+               return "invalid extra features";
+
+       if (footer->ff_version != HD_FF_VERSION)
+               return "invalid file format version";
+
+       /* images that are neither dynamic nor differencing need all-ones */
+       if (footer->type != HD_TYPE_DYNAMIC &&
+           footer->type != HD_TYPE_DIFF    &&
+           footer->data_offset != ~(0ULL))
+               return "invalid data offset";
+
+       now = vhd_time(time(NULL));
+       if (footer->timestamp > now + TIMESTAMP_MAX_SLACK)
+               return "creation time in future";
+
+       if (!strncmp(footer->crtr_app, "tap", 3) &&
+           footer->crtr_ver > VHD_CURRENT_VERSION)
+               return "unsupported tap creator version";
+
+       if (vhd_chs(footer->curr_size) < footer->geometry)
+               return "geometry too large";
+
+       if (footer->type != HD_TYPE_FIXED   &&
+           footer->type != HD_TYPE_DYNAMIC &&
+           footer->type != HD_TYPE_DIFF)
+               return "invalid type";
+
+       /* saved and hidden must each be 0 or 1 */
+       if (footer->saved && footer->saved != 1)
+               return "invalid 'saved' state";
+
+       if (footer->hidden && footer->hidden != 1)
+               return "invalid 'hidden' state";
+
+       if (vhd_util_check_zeros(footer->reserved,
+                                sizeof(footer->reserved)))
+               return "invalid 'reserved' bits";
+
+       return NULL;
+}
+
+/*
+ * Validate the fields of @header against the file open on @fd.
+ * Returns NULL if the header is acceptable, otherwise a static string
+ * describing the first problem found (not to be freed).
+ * Side effect: leaves the file offset of @fd at end of file.
+ */
+static char *
+vhd_util_check_validate_header(int fd, vhd_header_t *header)
+{
+       off64_t eof;
+       int i, cnt, size;
+       uint32_t checksum;
+
+       size = sizeof(header->cookie);
+       if (memcmp(header->cookie, DD_COOKIE, size))
+               return "invalid cookie";
+
+       checksum = vhd_checksum_header(header);
+       if (checksum != header->checksum)
+               return "invalid checksum";
+
+       if (header->hdr_ver != 0x00010000)
+               return "invalid header version";
+
+       if (header->data_offset != ~(0ULL))
+               return "invalid data offset";
+
+       eof = lseek64(fd, 0, SEEK_END);
+       if (eof == (off64_t)-1)
+               return "error finding eof";
+
+       /*
+        * The table must start at a non-zero, 512-byte-aligned offset and,
+        * with max_bat_size 32-bit entries, end before the trailing footer.
+        */
+       if (header->table_offset <= 0  ||
+           header->table_offset % 512 ||
+           (header->table_offset +
+            (header->max_bat_size * sizeof(uint32_t)) >
+            eof - sizeof(vhd_footer_t)))
+               return "invalid table offset";
+
+       /* count set bits: exactly one means block_size is a power of two */
+       for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++)
+               if ((header->block_size >> i) & 1)
+                       cnt++;
+
+       if (cnt != 1)
+               return "invalid block size";
+
+       if (header->res1)
+               return "invalid reserved bits";
+
+       if (vhd_util_check_zeros(header->res2, sizeof(header->res2)))
+               return "invalid reserved bits";
+
+       return NULL;
+}
+
+/*
+ * Validate the parent-related header fields of @vhd.
+ *
+ * Differencing images: the parent timestamp may not lie more than
+ * TIMESTAMP_MAX_SLACK seconds in the future and the parent name must
+ * decode.  All other images: parent name, locators, uuid and timestamp
+ * must be empty.
+ *
+ * Returns NULL if acceptable, otherwise a static description of the first
+ * problem found (not to be freed).
+ */
+static char *
+vhd_util_check_validate_differencing_header(vhd_context_t *vhd)
+{
+       char *pname;
+       uint32_t current;
+       vhd_header_t *hdr = &vhd->header;
+
+       if (vhd->footer.type != HD_TYPE_DIFF) {
+               if (vhd_util_check_zeros(hdr->prt_name,
+                                        sizeof(hdr->prt_name)))
+                       return "invalid non-null parent name";
+
+               if (vhd_util_check_zeros(hdr->loc, sizeof(hdr->loc)))
+                       return "invalid non-null parent locators";
+
+               if (!uuid_is_null(hdr->prt_uuid))
+                       return "invalid non-null parent uuid";
+
+               if (hdr->prt_ts)
+                       return "invalid non-zero parent timestamp";
+
+               return NULL;
+       }
+
+       current = vhd_time(time(NULL));
+       if (hdr->prt_ts > current + TIMESTAMP_MAX_SLACK)
+               return "parent creation time in future";
+
+       if (vhd_header_decode_parent(vhd, hdr, &pname))
+               return "invalid parent name";
+
+       /* only the decode result matters; the name itself is not needed */
+       free(pname);
+       return NULL;
+}
+
+/*
+ * Validate @batmap against the image open in @vhd.
+ * Returns NULL if the batmap is acceptable, otherwise a static string
+ * describing the first problem found (not to be freed).
+ * Side effect: leaves the file offset of vhd->fd at end of file.
+ */
+static char *
+vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap)
+{
+       int size;
+       off64_t eof;
+       uint32_t checksum;
+
+       size = sizeof(batmap->header.cookie);
+       if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size))
+               return "invalid cookie";
+
+       if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+               return "unsupported batmap version";
+
+       checksum = vhd_checksum_batmap(batmap);
+       if (checksum != batmap->header.checksum)
+               return "invalid checksum";
+
+       if (!batmap->header.batmap_size)
+               return "invalid size zero";
+
+       eof = lseek64(vhd->fd, 0, SEEK_END);
+       if (eof == (off64_t)-1)
+               return "error finding eof";
+
+       /* offset must be non-zero and 512-byte aligned */
+       if (!batmap->header.batmap_offset ||
+           batmap->header.batmap_offset % 512)
+               return "invalid batmap offset";
+
+       /* batmap_size is in sectors; the map must end before the footer */
+       if ((batmap->header.batmap_offset +
+            vhd_sectors_to_bytes(batmap->header.batmap_size)) >
+           eof - sizeof(vhd_footer_t))
+               return "invalid batmap size";
+
+       return NULL;
+}
+
+/*
+ * Validate a single parent locator @loc of the image open in @vhd.
+ * Returns NULL if the locator is acceptable, otherwise a static string
+ * describing the first problem found (not to be freed).
+ */
+static char *
+vhd_util_check_validate_parent_locator(vhd_context_t *vhd,
+                                      vhd_parent_locator_t *loc)
+{
+       off64_t eof;
+
+       if (vhd_validate_platform_code(loc->code))
+               return "invalid platform code";
+
+       /* a PLAT_CODE_NONE locator must be entirely zero */
+       if (loc->code == PLAT_CODE_NONE) {
+               if (vhd_util_check_zeros(loc, sizeof(*loc)))
+                       return "non-zero locator";
+
+               return NULL;
+       }
+
+       if (!loc->data_offset)
+               return "invalid data offset";
+
+       if (!loc->data_space)
+               return "invalid data space";
+
+       if (!loc->data_len)
+               return "invalid data length";
+
+       eof = lseek64(vhd->fd, 0, SEEK_END);
+       if (eof == (off64_t)-1)
+               return "error finding eof";
+
+       /* locator data must end before the trailing footer */
+       if (loc->data_offset + vhd_parent_locator_size(loc) >
+           eof - sizeof(vhd_footer_t))
+               return "invalid size";
+
+       if (loc->res)
+               return "invalid reserved bits";
+
+       return NULL;
+}
+
+/*
+ * Check that the image at @ppath really is the parent recorded in @vhd.
+ *
+ * Raw parents are accepted without inspection.  Otherwise the parent is
+ * opened read-only and its footer uuid is compared with the parent uuid
+ * stored in the header of @vhd.
+ *
+ * Returns NULL if acceptable, otherwise a static description of the
+ * problem (not to be freed).
+ */
+static char *
+vhd_util_check_validate_parent(vhd_context_t *vhd, const char *ppath)
+{
+       char *problem;
+       vhd_context_t pctx;
+
+       if (vhd_parent_raw(vhd))
+               return NULL;
+
+       if (vhd_open(&pctx, ppath,
+                    VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED))
+               return "error opening parent";
+
+       if (uuid_compare(vhd->header.prt_uuid, pctx.footer.uuid))
+               problem = "invalid parent uuid";
+       else
+               problem = NULL;
+
+       vhd_close(&pctx);
+       return problem;
+}
+
+/*
+ * Read and validate the primary footer (last sector of the file) and, for
+ * non-fixed images, the backup footer (first sector of the file).
+ *
+ * @fd:     descriptor of the image being checked
+ * @footer: on success, receives the validated footer
+ * @ignore: when non-zero, an invalid or mismatching primary footer is
+ *          tolerated if vhd_util_check_footer_opened() reports the image as
+ *          opened; the backup footer is used instead
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+vhd_util_check_footer(int fd, vhd_footer_t *footer, int ignore)
+{
+       size_t size;
+       int err, opened;
+       char *msg, *buf;
+       off64_t eof, off;
+       vhd_footer_t primary, backup;
+
+       memset(&primary, 0, sizeof(primary));
+       memset(&backup, 0, sizeof(backup));
+
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(primary));
+       if (err) {
+               printf("error allocating buffer: %d\n", err);
+               return -err;
+       }
+
+       memset(buf, 0, sizeof(primary));
+
+       eof = lseek64(fd, 0, SEEK_END);
+       if (eof == (off64_t)-1) {
+               err = -errno;
+               printf("error calculating end of file: %d\n", err);
+               goto out;
+       }
+
+       /* a file size that is not sector-aligned implies a 511-byte footer */
+       size = ((eof % 512) ? 511 : 512);
+       eof  = lseek64(fd, eof - size, SEEK_SET);
+       if (eof == (off64_t)-1) {
+               err = -errno;
+               printf("error calculating end of file: %d\n", err);
+               goto out;
+       }
+
+       /* clear errno so a short read is not reported with a stale code */
+       errno = 0;
+       err = read(fd, buf, 512);
+       if (err != size) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading primary footer: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&primary, buf, sizeof(primary));
+       opened = vhd_util_check_footer_opened(&primary);
+       vhd_footer_in(&primary);
+
+       msg = vhd_util_check_validate_footer(&primary);
+       if (msg) {
+               if (opened && ignore)
+                       goto check_backup;
+
+               err = -EINVAL;
+               printf("primary footer invalid: %s\n", msg);
+               goto out;
+       }
+
+       /*
+        * No backup footer is checked for fixed images; hand the validated
+        * primary footer back to the caller, which inspects footer->type.
+        */
+       if (primary.type == HD_TYPE_FIXED)
+               goto ok;
+
+check_backup:
+       off = lseek64(fd, 0, SEEK_SET);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               printf("error seeking to backup footer: %d\n", err);
+               goto out;
+       }
+
+       size = 512;
+       memset(buf, 0, sizeof(primary));
+
+       errno = 0;
+       err = read(fd, buf, size);
+       if (err != size) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading backup footer: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&backup, buf, sizeof(backup));
+       vhd_footer_in(&backup);
+
+       msg = vhd_util_check_validate_footer(&backup);
+       if (msg) {
+               err = -EINVAL;
+               printf("backup footer invalid: %s\n", msg);
+               goto out;
+       }
+
+       if (memcmp(&primary, &backup, sizeof(primary))) {
+               if (opened && ignore) {
+                       memcpy(&primary, &backup, sizeof(primary));
+                       goto ok;
+               }
+
+               /*
+                * Footers written by creator "tap" versions 0.1 and 1.1
+                * are accepted when they differ only in the hidden field.
+                */
+               if (backup.hidden &&
+                   !strncmp(backup.crtr_app, "tap", 3) &&
+                   (backup.crtr_ver == VHD_VERSION(0, 1) ||
+                    backup.crtr_ver == VHD_VERSION(1, 1))) {
+                       /* keep the full int: a char could truncate to 0 */
+                       int cmp;
+                       char tmp = backup.hidden;
+
+                       backup.hidden = 0;
+                       cmp = memcmp(&primary, &backup, sizeof(primary));
+                       backup.hidden = tmp;
+                       if (!cmp)
+                               goto ok;
+               }
+
+               err = -EINVAL;
+               printf("primary and backup footers do not match\n");
+               goto out;
+       }
+
+ok:
+       err = 0;
+       memcpy(footer, &primary, sizeof(primary));
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Read the sparse header located at footer->data_offset and validate it.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+vhd_util_check_header(int fd, vhd_footer_t *footer)
+{
+       int err;
+       off64_t off;
+       char *msg, *buf;
+       vhd_header_t header;
+
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, sizeof(header));
+       if (err) {
+               printf("error allocating header: %d\n", err);
+               /* posix_memalign() returns a positive errno value */
+               return -err;
+       }
+
+       off = footer->data_offset;
+       off = lseek64(fd, off, SEEK_SET);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               printf("error seeking to header: %d\n", err);
+               goto out;
+       }
+
+       /* clear errno so a short read is not reported with a stale code */
+       errno = 0;
+       err = read(fd, buf, sizeof(header));
+       if (err != sizeof(header)) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading header: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&header, buf, sizeof(header));
+       vhd_header_in(&header);
+
+       msg = vhd_util_check_validate_header(fd, &header);
+       if (msg) {
+               err = -EINVAL;
+               printf("header is invalid: %s\n", msg);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Validate the differencing-specific header fields of @vhd, printing the
+ * reason on failure.  Returns 0 if valid, -EINVAL otherwise.
+ */
+static int
+vhd_util_check_differencing_header(vhd_context_t *vhd)
+{
+       char *reason = vhd_util_check_validate_differencing_header(vhd);
+
+       if (reason == NULL)
+               return 0;
+
+       printf("differencing header is invalid: %s\n", reason);
+       return -EINVAL;
+}
+
+/*
+ * Verify that every allocated BAT entry lies between the end of the
+ * metadata and the footer, and that no two allocated blocks overlap.
+ *
+ * Returns 0 if the BAT is consistent, a negative errno value otherwise.
+ */
+static int
+vhd_util_check_bat(vhd_context_t *vhd)
+{
+       off64_t eof, eoh, block_size;
+       int i, j, err;
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err) {
+               printf("error calculating eof: %d\n", err);
+               return err;
+       }
+
+       eof = vhd_position(vhd);
+       if (eof == (off64_t)-1) {
+               /* capture errno first: printf() is allowed to modify it */
+               err = -errno;
+               printf("error calculating eof: %d\n", err);
+               return err;
+       }
+
+       /* adjust eof for vhds with short footers */
+       if (eof % 512) {
+               if (eof % 512 != 511) {
+                       printf("invalid file size: 0x%"PRIx64"\n", eof);
+                       return -EINVAL;
+               }
+
+               eof++;
+       }
+
+       err = vhd_get_bat(vhd);
+       if (err) {
+               printf("error reading bat: %d\n", err);
+               return err;
+       }
+
+       err = vhd_end_of_headers(vhd, &eoh);
+       if (err) {
+               printf("error calculating end of metadata: %d\n", err);
+               return err;
+       }
+
+       /* from here on eof, eoh and block_size are counted in sectors */
+       eof  -= sizeof(vhd_footer_t);
+       eof >>= VHD_SECTOR_SHIFT;
+       eoh >>= VHD_SECTOR_SHIFT;
+       block_size = (off64_t)vhd->spb + vhd->bm_secs;
+
+       for (i = 0; i < vhd->header.max_bat_size; i++) {
+               uint32_t off = vhd->bat.bat[i];
+               off64_t end;
+
+               if (off == DD_BLK_UNUSED)
+                       continue;
+
+               /*
+                * Compute block ends in 64 bits: a corrupt entry close to
+                * UINT32_MAX would otherwise wrap and slip past the checks.
+                */
+               end = (off64_t)off + block_size;
+
+               if (off < eoh) {
+                       printf("block %d (offset 0x%x) clobbers headers\n",
+                              i, off);
+                       return -EINVAL;
+               }
+
+               if (end > eof) {
+                       printf("block %d (offset 0x%x) clobbers footer\n",
+                              i, off);
+                       return -EINVAL;
+               }
+
+               for (j = 0; j < vhd->header.max_bat_size; j++) {
+                       uint32_t joff = vhd->bat.bat[j];
+                       off64_t jend;
+
+                       if (i == j)
+                               continue;
+
+                       if (joff == DD_BLK_UNUSED)
+                               continue;
+
+                       jend = (off64_t)joff + block_size;
+
+                       if (off == joff)
+                               err = -EINVAL;
+
+                       if (off > joff && off < jend)
+                               err = -EINVAL;
+
+                       if (end > joff && end < jend)
+                               err = -EINVAL;
+
+                       if (err) {
+                               printf("block %d (offset 0x%x) clobbers "
+                                      "block %d (offset 0x%x)\n",
+                                      i, off, j, joff);
+                               return err;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Load the BAT and batmap, validate the batmap header, and make sure the
+ * batmap never marks a block as full when the BAT shows it unallocated.
+ *
+ * Returns 0 if consistent, a negative errno value otherwise.
+ */
+static int
+vhd_util_check_batmap(vhd_context_t *vhd)
+{
+       int blk, rc;
+       char *reason;
+
+       rc = vhd_get_bat(vhd);
+       if (rc != 0) {
+               printf("error reading bat: %d\n", rc);
+               return rc;
+       }
+
+       rc = vhd_get_batmap(vhd);
+       if (rc != 0) {
+               printf("error reading batmap: %d\n", rc);
+               return rc;
+       }
+
+       reason = vhd_util_check_validate_batmap(vhd, &vhd->batmap);
+       if (reason != NULL) {
+               printf("batmap is invalid: %s\n", reason);
+               return -EINVAL;
+       }
+
+       for (blk = 0; blk < vhd->header.max_bat_size; blk++) {
+               /* a set batmap bit requires an allocated BAT entry */
+               if (vhd_batmap_test(vhd, &vhd->batmap, blk) &&
+                   vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+                       printf("batmap shows unallocated block %d full\n", blk);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Validate every parent locator slot in the header of a differencing VHD.
+ *
+ * Each slot is structurally validated and each platform code may appear at
+ * most once.  Locators with code MACX, W2RU or W2KU are additionally read,
+ * compared against the parent name stored in the header, resolved to a
+ * path, and the file found there is checked with
+ * vhd_util_check_validate_parent().  At least one locator must pass this
+ * full check.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int
+vhd_util_check_parent_locators(vhd_context_t *vhd)
+{
+       int i, n, err;
+       vhd_parent_locator_t *loc;
+       char *msg, *file, *ppath, *location, *pname;
+       int mac, macx, w2ku, w2ru, wi2r, wi2k, found;
+
+       /* per-platform-code occurrence counters, used to detect duplicates */
+       mac      = 0;
+       macx     = 0;
+       w2ku     = 0;
+       w2ru     = 0;
+       wi2r     = 0;
+       wi2k     = 0;
+       found    = 0;
+       pname    = NULL;
+       ppath    = NULL;
+       location = NULL;
+
+       err = vhd_header_decode_parent(vhd, &vhd->header, &pname);
+       if (err) {
+               printf("error decoding parent name: %d\n", err);
+               return err;
+       }
+
+       n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]);
+       for (i = 0; i < n; i++) {
+               /* reset per-iteration strings; 'out' frees whatever is set */
+               ppath    = NULL;
+               location = NULL;
+               loc = vhd->header.loc + i;
+
+               msg = vhd_util_check_validate_parent_locator(vhd, loc);
+               if (msg) {
+                       err = -EINVAL;
+                       printf("invalid parent locator %d: %s\n", i, msg);
+                       goto out;
+               }
+
+               /* unused slot */
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               /* post-increment: non-zero means this code was seen before */
+               switch (loc->code) {
+               case PLAT_CODE_MACX:
+                       if (macx++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_MAC:
+                       if (mac++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_W2KU:
+                       if (w2ku++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_W2RU:
+                       if (w2ru++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_WI2R:
+                       if (wi2r++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_WI2K:
+                       if (wi2k++)
+                               goto dup;
+                       break;
+
+               default:
+                       err = -EINVAL;
+                       printf("invalid  platform code for locator %d\n", i);
+                       goto out;
+               }
+
+               /* only MACX, W2RU and W2KU locators are read and resolved */
+               if (loc->code != PLAT_CODE_MACX &&
+                   loc->code != PLAT_CODE_W2RU &&
+                   loc->code != PLAT_CODE_W2KU)
+                       continue;
+
+               err = vhd_parent_locator_read(vhd, loc, &ppath);
+               if (err) {
+                       printf("error reading parent locator %d: %d\n", i, err);
+                       goto out;
+               }
+
+               /* the locator's file name must match the header's parent name */
+               file = basename(ppath);
+               if (strcmp(pname, file)) {
+                       err = -EINVAL;
+                       printf("parent locator %d name (%s) does not match "
+                              "header name (%s)\n", i, file, pname);
+                       goto out;
+               }
+
+               err = vhd_find_parent(vhd, ppath, &location);
+               if (err) {
+                       printf("error resolving %s: %d\n", ppath, err);
+                       goto out;
+               }
+
+               /*
+                * An unreadable target is reported here only for MACX
+                * locators; for the other codes execution falls through to
+                * vhd_util_check_validate_parent() below.
+                */
+               err = access(location, R_OK);
+               if (err && loc->code == PLAT_CODE_MACX) {
+                       err = -errno;
+                       printf("parent locator %d points to missing file %s "
+                               "(resolved to %s)\n", i, ppath, location);
+                       goto out;
+               }
+
+               msg = vhd_util_check_validate_parent(vhd, location);
+               if (msg) {
+                       err = -EINVAL;
+                       printf("invalid parent %s: %s\n", location, msg);
+                       goto out;
+               }
+
+               /* this locator passed; release its strings before the next */
+               found++;
+               free(ppath);
+               free(location);
+               ppath = NULL;
+               location = NULL;
+
+               continue;
+
+               /* reached only via 'goto dup' from the switch above */
+       dup:
+               printf("duplicate platform code in locator %d: 0x%x\n",
+                      i, loc->code);
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* no locator passed the full check */
+       if (!found) {
+               err = -EINVAL;
+               printf("could not find parent %s\n", pname);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(pname);
+       free(ppath);
+       free(location);
+       return err;
+}
+
+/*
+ * Announce that @name looks invalid and dump its metadata by invoking the
+ * "read" command with the arguments "-p -n <name>".
+ */
+static void
+vhd_util_dump_headers(const char *name)
+{
+       char *args[] = { "read", "-p", "-n", (char *)name };
+
+       printf("%s appears invalid; dumping metadata\n", name);
+       vhd_util_read((int)(sizeof(args) / sizeof(args[0])), args);
+}
+
+/*
+ * Run every applicable consistency check against the image at @name:
+ * footer, then (for dynamic and differencing images) header, differencing
+ * header, BAT, batmap if present, and parent locators for differencing
+ * images.  On failure the image's metadata is dumped.
+ *
+ * @ignore is forwarded to vhd_util_check_footer().
+ *
+ * Returns 0 if no check failed, a negative errno value otherwise.
+ */
+static int
+vhd_util_check_vhd(const char *name, int ignore)
+{
+       int fd, err;
+       vhd_context_t vhd;
+       struct stat stats;
+       vhd_footer_t footer;
+
+       fd = -1;
+       memset(&vhd, 0, sizeof(vhd));
+       /* never inspect an indeterminate footer.type below */
+       memset(&footer, 0, sizeof(footer));
+
+       err = stat(name, &stats);
+       if (err == -1) {
+               /* capture errno first: printf() is allowed to modify it */
+               err = errno;
+               printf("cannot stat %s: %d\n", name, err);
+               return -err;
+       }
+
+       if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+               printf("%s is not a regular file or block device\n", name);
+               return -EINVAL;
+       }
+
+       fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (fd == -1) {
+               err = -errno;
+               printf("error opening %s\n", name);
+               return err;
+       }
+
+       err = vhd_util_check_footer(fd, &footer, ignore);
+       if (err)
+               goto out;
+
+       /* only dynamic and differencing images have further metadata */
+       if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF)
+               goto out;
+
+       err = vhd_util_check_header(fd, &footer);
+       if (err)
+               goto out;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err)
+               goto out;
+
+       err = vhd_util_check_differencing_header(&vhd);
+       if (err)
+               goto out;
+
+       err = vhd_util_check_bat(&vhd);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(&vhd)) {
+               err = vhd_util_check_batmap(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (vhd.footer.type == HD_TYPE_DIFF) {
+               err = vhd_util_check_parent_locators(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+       printf("%s is valid\n", name);
+
+out:
+       if (err)
+               vhd_util_dump_headers(name);
+       if (fd != -1)
+               close(fd);
+       vhd_close(&vhd);
+       return err;
+}
+
+/*
+ * Walk the parent chain of @name, running vhd_util_check_vhd() on each
+ * ancestor.  The walk ends at the first image that is not a differencing
+ * disk or whose parent is raw.
+ *
+ * Returns 0 if every ancestor checked out, a negative errno otherwise.
+ */
+static int
+vhd_util_check_parents(const char *name, int ignore)
+{
+       int rc;
+       vhd_context_t ctx;
+       char *path, *next;
+
+       /* 'path' aliases the caller's string until the first parent is found */
+       path = (char *)name;
+
+       while (1) {
+               rc = vhd_open(&ctx, path,
+                             VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+               if (rc)
+                       break;
+
+               if (ctx.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&ctx)) {
+                       /* end of the chain; rc is still 0 here */
+                       vhd_close(&ctx);
+                       break;
+               }
+
+               rc = vhd_parent_locator_get(&ctx, &next);
+               vhd_close(&ctx);
+
+               if (rc) {
+                       printf("error getting parent: %d\n", rc);
+                       break;
+               }
+
+               /* only paths returned by vhd_parent_locator_get() are freed */
+               if (path != name)
+                       free(path);
+               path = next;
+
+               rc = vhd_util_check_vhd(path, ignore);
+               if (rc)
+                       break;
+       }
+
+       if (rc)
+               printf("error checking parents: %d\n", rc);
+       if (path != name)
+               free(path);
+       return rc;
+}
+
+/*
+ * vhd-util "check" command.
+ *
+ * Options:
+ *   -n <file>  image to check (required)
+ *   -i         set the ignore flag passed to the footer check
+ *   -p         also check the image's parents
+ *   -h         print usage
+ *
+ * Returns 0 on success (or after -h), a negative errno value otherwise.
+ */
+int
+vhd_util_check(int argc, char **argv)
+{
+       char *name;
+       int c, err, ignore, parents;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       ignore  = 0;
+       parents = 0;
+       name    = NULL;
+
+       /* reset getopt state: this may not be the first parse in the process */
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:iph")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'i':
+                       ignore = 1;
+                       break;
+               case 'p':
+                       parents = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       /* a name is mandatory and stray positional arguments are rejected */
+       if (!name || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       err = vhd_util_check_vhd(name, ignore);
+       if (err)
+               goto out;
+
+       if (parents)
+               err = vhd_util_check_parents(name, ignore);
+
+out:
+       return err;
+
+usage:
+       printf("options: -n <file> [-i ignore missing primary footers] "
+              "[-p check parents] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-coalesce.c b/tools/blktap2/vhd/lib/vhd-util-coalesce.c
new file mode 100644 (file)
index 0000000..f6461fc
--- /dev/null
@@ -0,0 +1,218 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Write @secs sectors from @buf to the raw file @fd, starting at sector
+ * @sec.
+ *
+ * Returns 0 if the full amount was written, otherwise a negative errno
+ * value (-EIO for a short write that set no errno).
+ */
+static int
+__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs)
+{
+       int err;
+       off64_t off;
+       ssize_t ret;
+       uint64_t bytes;
+
+       bytes = vhd_sectors_to_bytes(secs);
+
+       off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (off == (off64_t)-1) {
+               /* capture errno first: printf() is allowed to modify it */
+               err = -errno;
+               printf("raw parent: seek(0x%08"PRIx64") failed: %d\n",
+                      vhd_sectors_to_bytes(sec), err);
+               return err;
+       }
+
+       /* clear errno so a short write is not reported with a stale code */
+       errno = 0;
+       ret = write(fd, buf, bytes);
+       if (ret >= 0 && (uint64_t)ret == bytes)
+               return 0;
+
+       err = -errno;
+       printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n",
+              bytes, ret, err);
+       return (err ? err : -EIO);
+}
+
+/*
+ * Copy the sectors of @block that are present in @vhd into its parent.
+ *
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw;
+ * the two cases are told apart by whether parent->file is set.
+ *
+ * Returns 0 on success (including when the block is unallocated in @vhd),
+ * otherwise a non-zero error code from allocation or I/O.
+ */
+static int
+vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent,
+               int parent_fd, uint64_t block)
+{
+       int i, err;
+       char *buf, *map;
+       uint64_t sec, secs;
+
+       buf = NULL;
+       map = NULL;
+       /* first virtual sector covered by this block */
+       sec = block * vhd->spb;
+
+       /* nothing to copy for a block the child never allocated */
+       if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       err = posix_memalign((void **)&buf, 4096, vhd->header.block_size);
+       if (err)
+               return -err;
+
+       /* read the whole block once; writes below are served from buf */
+       err = vhd_io_read(vhd, buf, sec, vhd->spb);
+       if (err)
+               goto done;
+
+       /* batmap bit set: write the entire block without reading the bitmap */
+       if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) {
+               if (parent->file)
+                       err = vhd_io_write(parent, buf, sec, vhd->spb);
+               else
+                       err = __raw_io_write(parent_fd, buf, sec, vhd->spb);
+               goto done;
+       }
+
+       err = vhd_read_bitmap(vhd, block, &map);
+       if (err)
+               goto done;
+
+       for (i = 0; i < vhd->spb; i++) {
+               if (!vhd_bitmap_test(vhd, map, i))
+                       continue;
+
+               /* measure the run of consecutive set sectors starting at i */
+               for (secs = 0; i + secs < vhd->spb; secs++)
+                       if (!vhd_bitmap_test(vhd, map, i + secs))
+                               break;
+
+               if (parent->file)
+                       err = vhd_io_write(parent,
+                                          buf + vhd_sectors_to_bytes(i),
+                                          sec + i, secs);
+               else
+                       err = __raw_io_write(parent_fd,
+                                            buf + vhd_sectors_to_bytes(i),
+                                            sec + i, secs);
+               if (err)
+                       goto done;
+
+               /*
+                * Skip past the run; the loop's i++ then also steps over the
+                * sector that ended it, which is either clear or past the
+                * end of the block.
+                */
+               i += secs;
+       }
+
+       err = 0;
+
+done:
+       free(buf);
+       free(map);
+       return err;
+}
+
+/*
+ * vhd-util "coalesce" command: copy every allocated block of the image
+ * given with -n into its parent, which is opened either as a VHD or, when
+ * vhd_parent_raw() says so, as a raw file.
+ *
+ * Returns 0 on success, -EINVAL on usage errors, or the error code of the
+ * failing operation.
+ */
+int
+vhd_util_coalesce(int argc, char **argv)
+{
+       int err, c;
+       uint64_t i;
+       char *name, *pname;
+       vhd_context_t vhd, parent;
+       int parent_fd = -1;
+
+       name  = NULL;
+       pname = NULL;
+       /* parent.file doubles as the "parent is a VHD" flag during cleanup */
+       parent.file = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       err = vhd_parent_locator_get(&vhd, &pname);
+       if (err) {
+               printf("error finding %s parent: %d\n", name, err);
+               vhd_close(&vhd);
+               return err;
+       }
+
+       if (vhd_parent_raw(&vhd)) {
+               parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644);
+               if (parent_fd == -1) {
+                       err = -errno;
+                       printf("failed to open parent %s: %d\n", pname, err);
+                       /* pname is owned by this function; release it here */
+                       free(pname);
+                       vhd_close(&vhd);
+                       return err;
+               }
+       } else {
+               err = vhd_open(&parent, pname, VHD_OPEN_RDWR);
+               if (err) {
+                       printf("error opening %s: %d\n", pname, err);
+                       free(pname);
+                       vhd_close(&vhd);
+                       return err;
+               }
+       }
+
+       err = vhd_get_bat(&vhd);
+       if (err)
+               goto done;
+
+       if (vhd_has_batmap(&vhd)) {
+               err = vhd_get_batmap(&vhd);
+               if (err)
+                       goto done;
+       }
+
+       for (i = 0; i < vhd.bat.entries; i++) {
+               err = vhd_util_coalesce_block(&vhd, &parent, parent_fd, i);
+               if (err)
+                       goto done;
+       }
+
+       err = 0;
+
+ done:
+       free(pname);
+       vhd_close(&vhd);
+       if (parent.file)
+               vhd_close(&parent);
+       else
+               close(parent_fd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-create.c b/tools/blktap2/vhd/lib/vhd-util-create.c
new file mode 100644 (file)
index 0000000..a9bdf05
--- /dev/null
@@ -0,0 +1,80 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "create" command.
+ *
+ * Options:
+ *   -n <name>  path of the image to create (required)
+ *   -s <size>  size in MB (required)
+ *   -r         create a fixed image instead of a dynamic one
+ *   -h         print usage
+ *
+ * Returns the result of vhd_create(), or -EINVAL on usage errors.
+ */
+int
+vhd_util_create(int argc, char **argv)
+{
+       char *path = NULL;
+       uint64_t mbytes = 0;
+       int opt, reserve = 0, have_size = 0;
+       vhd_flag_creat_t flags = 0;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "n:s:rh")) != -1) {
+               if (opt == 'n') {
+                       path = optarg;
+               } else if (opt == 's') {
+                       have_size = 1;
+                       mbytes = strtoull(optarg, NULL, 10);
+               } else if (opt == 'r') {
+                       reserve = 1;
+               } else {
+                       /* 'h' and anything unrecognised */
+                       goto usage;
+               }
+       }
+
+       /* both a name and a size are mandatory; no positional arguments */
+       if (!have_size || !path || optind != argc)
+               goto usage;
+
+       return vhd_create(path, mbytes << 20,
+                         (reserve ? HD_TYPE_FIXED : HD_TYPE_DYNAMIC),
+                         flags);
+
+usage:
+       printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-fill.c b/tools/blktap2/vhd/lib/vhd-util-fill.c
new file mode 100644 (file)
index 0000000..afbfcce
--- /dev/null
@@ -0,0 +1,105 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "fill" command: for every block slot of the image given with
+ * -n, read the block through the VHD layer and write the same data back.
+ *
+ * Returns 0 on success, -EINVAL on usage errors, or the error code of the
+ * failing operation.
+ */
+int
+vhd_util_fill(int argc, char **argv)
+{
+       int err, opt;
+       char *data = NULL, *path = NULL;
+       vhd_context_t ctx;
+       uint64_t blk, first, count;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "n:h")) != -1) {
+               if (opt != 'n')
+                       goto usage;
+               path = optarg;
+       }
+
+       if (!path || optind != argc)
+               goto usage;
+
+       err = vhd_open(&ctx, path, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", path, err);
+               return err;
+       }
+
+       err = vhd_get_bat(&ctx);
+       if (err)
+               goto done;
+
+       err = posix_memalign((void **)&data, 4096, ctx.header.block_size);
+       if (err) {
+               err = -err;
+               goto done;
+       }
+
+       /* sectors per block */
+       count = ctx.header.block_size >> VHD_SECTOR_SHIFT;
+
+       for (blk = 0; blk < ctx.header.max_bat_size; blk++) {
+               first = blk * count;
+
+               err = vhd_io_read(&ctx, data, first, count);
+               if (!err)
+                       err = vhd_io_write(&ctx, data, first, count);
+               if (err)
+                       break;
+       }
+
+ done:
+       free(data);
+       vhd_close(&ctx);
+       return err;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-modify.c b/tools/blktap2/vhd/lib/vhd-util-modify.c
new file mode 100644 (file)
index 0000000..3b07e31
--- /dev/null
@@ -0,0 +1,132 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT 
+ * affect the VHD disk capacity, only the physical size of the file containing 
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the 
+ * footer in the right location such that resizing the file (manually, as a 
+ * separate step) will produce the correct results. If the new file size is 
+ * greater than the current file size, the file must first be expanded and then 
+ * altered with this operation. If the new size is smaller than the current 
+ * size, the VHD must first be altered with this operation and then the file 
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * vhd_util_modify - change the parent pointer and/or the recorded physical
+ * size of a VHD.
+ *
+ * Options:
+ *   -n name        image to modify (required)
+ *   -s NEW_SIZE    new physical size, decimal, passed to vhd_set_phys_size()
+ *   -p NEW_PARENT  new parent, passed to vhd_change_parent()
+ *   -m             flag forwarded to vhd_change_parent() ("raw" parent)
+ *   -h             print usage
+ *
+ * Returns 0 on success (including when no modification was requested),
+ * -EINVAL on bad usage, otherwise the error of the first failing operation.
+ */
+int
+vhd_util_modify(int argc, char **argv)
+{
+       char *name, *end;
+       vhd_context_t vhd;
+       int err, c, size, parent, parent_raw;
+       off64_t newsize = 0;
+       char *newparent = NULL;
+
+       /* err must start at 0: it is returned as-is when neither -s nor -p
+        * was given */
+       err        = 0;
+       name       = NULL;
+       size       = 0;
+       parent     = 0;
+       parent_raw = 0;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:s:p:mh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 's':
+                       size = 1;
+                       errno = 0;
+                       newsize = strtoll(optarg, &end, 10);
+                       /* reject overflow, empty input and trailing junk:
+                        * a mis-parsed size would misplace the footer */
+                       if (errno || end == optarg || *end != '\0') {
+                               fprintf(stderr, "Invalid size '%s'\n", optarg);
+                               goto usage;
+                       }
+                       break;
+               case 'p':
+                       parent = 1;
+                       newparent = optarg;
+                       break;
+               case 'm':
+                       parent_raw = 1;
+                       break;
+
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (size) {
+               err = vhd_set_phys_size(&vhd, newsize);
+               if (err) {
+                       printf("failed to set physical size to %"PRIu64":"
+                              " %d\n", (uint64_t)newsize, err);
+                       /* do not let a later reparent mask this failure */
+                       goto done;
+               }
+       }
+
+       if (parent) {
+               TEST_FAIL_AT(FAIL_REPARENT_BEGIN);
+               err = vhd_change_parent(&vhd, newparent, parent_raw);
+               if (err) {
+                       printf("failed to set parent to '%s': %d\n",
+                                       newparent, err);
+                       goto done;
+               }
+               TEST_FAIL_AT(FAIL_REPARENT_END);
+       }
+
+done:
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("*** Dangerous operations, use with care ***\n");
+       printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] "
+                       "[-s NEW_SIZE set size] [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-query.c b/tools/blktap2/vhd/lib/vhd-util-query.c
new file mode 100644 (file)
index 0000000..3477a17
--- /dev/null
@@ -0,0 +1,159 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd_util_query - print selected properties of a VHD.
+ *
+ * Options: -n name (required), -v virtual size in MB, -s physical
+ * utilization in bytes, -p parent, -f fields ('hidden'), -d chain depth,
+ * -h usage.
+ *
+ * Returns 0 on success or for -h, -EINVAL on bad usage, otherwise the first
+ * error encountered while answering the requested queries.
+ */
+int
+vhd_util_query(int argc, char **argv)
+{
+       vhd_context_t vhd;
+       off64_t currsize;
+       char *name = NULL;
+       int ret, err, c;
+       int size = 0, physize = 0, parent = 0, fields = 0, depth = 0;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:vspfdh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'v':
+                       size = 1;
+                       break;
+               case 's':
+                       physize = 1;
+                       break;
+               case 'p':
+                       parent = 1;
+                       break;
+               case 'f':
+                       fields = 1;
+                       break;
+               case 'd':
+                       depth = 1;
+                       break;
+               case 'h':
+               default:
+                       /* explicit help is not an error; anything else is */
+                       err = (c == 'h' ? 0 : -EINVAL);
+                       goto usage;
+               }
+       }
+
+       if (optind != argc || !name) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (size)
+               printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+       if (physize) {
+               err = vhd_get_phys_size(&vhd, &currsize);
+               if (!err)
+                       printf("%"PRIu64"\n", currsize);
+               else
+                       printf("failed to get physical size: %d\n", err);
+       }
+
+       if (parent) {
+               ret = 0;
+
+               if (vhd.footer.type == HD_TYPE_DIFF) {
+                       char *pname;
+
+                       ret = vhd_parent_locator_get(&vhd, &pname);
+                       if (!ret) {
+                               printf("%s\n", pname);
+                               free(pname);
+                       } else
+                               printf("query failed\n");
+               } else
+                       printf("%s has no parent\n", name);
+
+               /* the first error seen is the one reported */
+               if (!err)
+                       err = ret;
+       }
+
+       if (fields) {
+               int hidden;
+
+               ret = vhd_hidden(&vhd, &hidden);
+               if (!ret)
+                       printf("hidden: %d\n", hidden);
+               else
+                       printf("error checking 'hidden' field: %d\n", ret);
+
+               if (!err)
+                       err = ret;
+       }
+
+       if (depth) {
+               int length;
+
+               ret = vhd_chain_depth(&vhd, &length);
+               if (!ret)
+                       printf("chain depth: %d\n", length);
+               else
+                       printf("error checking chain depth: %d\n", ret);
+
+               if (!err)
+                       err = ret;
+       }
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-v print virtual size (in MB)] "
+              "[-s print physical utilization (bytes)] [-p print parent] "
+              "[-f print fields] [-d print chain depth] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-read.c b/tools/blktap2/vhd/lib/vhd-util-read.c
new file mode 100644 (file)
index 0000000..7b5246c
--- /dev/null
@@ -0,0 +1,742 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Scratch buffer shared by the number formatters below.  It must hold the
+ * longest rendering of a uint64_t: 20 decimal digits, or "0x" plus 16 hex
+ * digits, plus the terminating NUL.
+ */
+#define nsize     32
+static char nbuf[nsize];
+
+/* Format num as "%#x"-style hex into nbuf and return nbuf. */
+static inline char *
+__xconv(uint64_t num)
+{
+       snprintf(nbuf, nsize, "%#" PRIx64 , num);
+       return nbuf;
+}
+
+/* Format num as unsigned decimal into nbuf and return nbuf. */
+static inline char *
+__dconv(uint64_t num)
+{
+       snprintf(nbuf, nsize, "%" PRIu64, num);
+       return nbuf;
+}
+
+/*
+ * conv(hex, num) - render num in hex when 'hex' is non-zero, else decimal.
+ * The result lives in the single static nbuf, so it must be consumed before
+ * the next conv() call: use at most one conv() per printf().
+ */
+#define conv(hex, num) \
+       ((hex) ? __xconv((uint64_t)(num)) : __dconv((uint64_t)(num)))
+
+/*
+ * vhd_print_header - print a human-readable summary of the VHD header @h.
+ *
+ * @vhd is only used to decode the parent name; @hex selects hex or decimal
+ * rendering for the numeric fields that go through conv().
+ */
+static void
+vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex)
+{
+       int err;
+       uint32_t  cksm;
+       char      uuid[37], time_str[26], cookie[9], *name;
+
+       /* stays NULL if decoding fails, so the free() below is always safe */
+       name = NULL;
+
+       printf("VHD Header Summary:\n-------------------\n");
+
+       /* the on-disk cookie is not NUL-terminated: bound the read to the
+        * 8 characters the local buffer can hold */
+       snprintf(cookie, sizeof(cookie), "%.8s", h->cookie);
+       printf("Cookie              : %s\n", cookie);
+
+       printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset));
+       printf("Table offset        : %s\n", conv(hex, h->table_offset));
+       printf("Header version      : 0x%08x\n", h->hdr_ver);
+       printf("Max BAT size        : %s\n", conv(hex, h->max_bat_size));
+       printf("Block size          : %s ", conv(hex, h->block_size));
+       printf("(%s MB)\n", conv(hex, h->block_size >> 20));
+
+       err = vhd_header_decode_parent(vhd, h, &name);
+       printf("Parent name         : %s\n",
+              (err ? "failed to read name" : name));
+       free(name);
+
+       uuid_unparse(h->prt_uuid, uuid);
+       printf("Parent UUID         : %s\n", uuid);
+
+       vhd_time_to_string(h->prt_ts, time_str);
+       printf("Parent timestamp    : %s\n", time_str);
+
+       cksm = vhd_checksum_header(h);
+       printf("Checksum            : 0x%x|0x%x (%s)\n", h->checksum, cksm,
+               h->checksum == cksm ? "Good!" : "Bad!");
+       printf("\n");
+}
+
+/*
+ * vhd_print_footer - print a human-readable summary of the VHD footer @f.
+ *
+ * @hex selects hex or decimal rendering for the numeric fields that go
+ * through conv().
+ */
+static void
+vhd_print_footer(vhd_footer_t *f, int hex)
+{
+       uint64_t  c, h, s;
+       uint32_t  ff_maj, ff_min, cr_maj, cr_min, cksm;
+       char      time_str[26], creator[5], uuid[37], cookie[9];
+
+       printf("VHD Footer Summary:\n-------------------\n");
+
+       /* the on-disk cookie is not NUL-terminated: bound the read to the
+        * 8 characters the local buffer can hold */
+       snprintf(cookie, sizeof(cookie), "%.8s", f->cookie);
+       printf("Cookie              : %s\n", cookie);
+
+       printf("Features            : (0x%08x) %s%s\n", f->features,
+               (f->features & HD_TEMPORARY) ? "<TEMP>" : "",
+               (f->features & HD_RESERVED)  ? "<RESV>" : "");
+
+       /* versions pack major in the high 16 bits, minor in the low 16 */
+       ff_maj = f->ff_version >> 16;
+       ff_min = f->ff_version & 0xffff;
+       printf("File format version : Major: %u, Minor: %u\n",
+               ff_maj, ff_min);
+
+       printf("Data offset         : %s\n", conv(hex, f->data_offset));
+
+       vhd_time_to_string(f->timestamp, time_str);
+       printf("Timestamp           : %s\n", time_str);
+
+       memcpy(creator, f->crtr_app, 4);
+       creator[4] = '\0';
+       printf("Creator Application : '%s'\n", creator);
+
+       cr_maj = f->crtr_ver >> 16;
+       cr_min = f->crtr_ver & 0xffff;
+       printf("Creator version     : Major: %u, Minor: %u\n",
+               cr_maj, cr_min);
+
+       printf("Creator OS          : %s\n",
+               ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" :
+                ((f->crtr_os == HD_CR_OS_MACINTOSH) ? "Macintosh" :
+                 "Unknown!")));
+
+       printf("Original disk size  : %s MB ", conv(hex, f->orig_size >> 20));
+       printf("(%s Bytes)\n", conv(hex, f->orig_size));
+
+       printf("Current disk size   : %s MB ", conv(hex, f->curr_size >> 20));
+       printf("(%s Bytes)\n", conv(hex, f->curr_size));
+
+       /* geometry: cylinders in bits 31..16, heads in 15..8, sectors 7..0 */
+       c = f->geometry >> 16;
+       h = (f->geometry & 0x0000FF00) >> 8;
+       s = f->geometry & 0x000000FF;
+       printf("Geometry            : Cyl: %s, ", conv(hex, c));
+       printf("Hds: %s, ", conv(hex, h));
+       printf("Sctrs: %s\n", conv(hex, s));
+       /* c*h*s counts 512-byte sectors: >> 11 gives MB, << 9 gives bytes */
+       printf("                    : = %s MB ", conv(hex, (c * h * s) >> 11));
+       printf("(%s Bytes)\n", conv(hex, (c * h * s) << 9));
+
+       /* the format already ends the line; no newline in the fallback */
+       printf("Disk type           : %s\n",
+               f->type <= HD_TYPE_MAX ?
+               HD_TYPE_STR[f->type] : "Unknown type!");
+
+       cksm = vhd_checksum_footer(f);
+       printf("Checksum            : 0x%x|0x%x (%s)\n", f->checksum, cksm,
+               f->checksum == cksm ? "Good!" : "Bad!");
+
+       uuid_unparse(f->uuid, uuid);
+       printf("UUID                : %s\n", uuid);
+
+       printf("Saved state         : %s\n", f->saved == 0 ? "No" : "Yes");
+       printf("Hidden              : %d\n", f->hidden);
+       printf("\n");
+}
+
+/*
+ * code_name - map a parent-locator platform code to its symbolic name.
+ *
+ * Returns a pointer to a string literal (never NULL); unrecognized codes
+ * yield "UNKNOWN".  The result must not be modified or freed.
+ */
+static inline char *
+code_name(uint32_t code)
+{
+       switch(code) {
+       case PLAT_CODE_NONE:
+               return "PLAT_CODE_NONE";
+       case PLAT_CODE_WI2R:
+               return "PLAT_CODE_WI2R";
+       case PLAT_CODE_WI2K:
+               return "PLAT_CODE_WI2K";
+       case PLAT_CODE_W2RU:
+               return "PLAT_CODE_W2RU";
+       case PLAT_CODE_W2KU:
+               return "PLAT_CODE_W2KU";
+       case PLAT_CODE_MAC:
+               return "PLAT_CODE_MAC";
+       case PLAT_CODE_MACX:
+               return "PLAT_CODE_MACX";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+/*
+ * vhd_print_parent - print the parent name decoded from locator @loc.
+ *
+ * Prints a failure message instead when the locator cannot be read.
+ */
+static void
+vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc)
+{
+       int err;
+       char *buf;
+
+       buf = NULL;
+
+       err = vhd_parent_locator_read(vhd, loc, &buf);
+       if (err) {
+               printf("failed to read parent name\n");
+               return;
+       }
+
+       printf("       decoded name : %s\n", buf);
+
+       /* the name buffer is handed to the caller (cf. the free() after
+        * vhd_parent_locator_get() in vhd-util-query.c): release it */
+       free(buf);
+}
+
+/*
+ * vhd_print_parent_locators - dump every in-use parent locator of the
+ * header held in @vhd.  Entries whose code is PLAT_CODE_NONE are skipped.
+ */
+static void
+vhd_print_parent_locators(vhd_context_t *vhd, int hex)
+{
+       int idx, total;
+       vhd_parent_locator_t *entry;
+
+       printf("VHD Parent Locators:\n--------------------\n");
+
+       total = sizeof(vhd->header.loc) / sizeof(struct prt_loc);
+
+       for (idx = 0; idx < total; idx++) {
+               entry = &vhd->header.loc[idx];
+
+               if (entry->code != PLAT_CODE_NONE) {
+                       printf("locator:            : %d\n", idx);
+                       printf("       code         : %s\n",
+                              code_name(entry->code));
+                       printf("       data_space   : %s\n",
+                              conv(hex, entry->data_space));
+                       printf("       data_length  : %s\n",
+                              conv(hex, entry->data_len));
+                       printf("       data_offset  : %s\n",
+                              conv(hex, entry->data_offset));
+                       vhd_print_parent(vhd, entry);
+                       printf("\n");
+               }
+       }
+}
+
+/*
+ * vhd_print_batmap_header - print offset, size, version and checksum status
+ * of the batmap header in @batmap.
+ */
+static void
+vhd_print_batmap_header(vhd_batmap_t *batmap, int hex)
+{
+       uint32_t computed;
+       const char *verdict;
+
+       printf("VHD Batmap Summary:\n-------------------\n");
+       printf("Batmap offset       : %s\n",
+              conv(hex, batmap->header.batmap_offset));
+       printf("Batmap size (secs)  : %s\n",
+              conv(hex, batmap->header.batmap_size));
+       printf("Batmap version      : 0x%08x\n",
+              batmap->header.batmap_version);
+
+       /* compare the stored checksum with a freshly computed one */
+       computed = vhd_checksum_batmap(batmap);
+       if (batmap->header.checksum == computed)
+               verdict = "Good!";
+       else
+               verdict = "Bad!";
+
+       printf("Checksum            : 0x%x|0x%x (%s)\n",
+              batmap->header.checksum, computed, verdict);
+       printf("\n");
+}
+
+/*
+ * check_block_range - verify that @block does not exceed the BAT size.
+ *
+ * Callers in this file pass "first block + count", so a value equal to
+ * max_bat_size is accepted.  Returns 0 when in range, otherwise
+ * complains on stderr and returns -ERANGE.
+ */
+static inline int
+check_block_range(vhd_context_t *vhd, uint64_t block, int hex)
+{
+       if (block <= vhd->header.max_bat_size)
+               return 0;
+
+       fprintf(stderr, "block %s past end of file\n", conv(hex, block));
+       return -ERANGE;
+}
+
+/*
+ * vhd_print_headers - print the footer and, for dynamic images, the header,
+ * the parent locators (differencing disks only) and the batmap header (when
+ * the image has a batmap).
+ *
+ * Returns 0, or the vhd_get_batmap() error if the batmap cannot be loaded.
+ */
+static int
+vhd_print_headers(vhd_context_t *vhd, int hex)
+{
+       int rc;
+
+       vhd_print_footer(&vhd->footer, hex);
+
+       /* nothing beyond the footer to show for non-dynamic images */
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       vhd_print_header(vhd, &vhd->header, hex);
+
+       if (vhd->footer.type == HD_TYPE_DIFF)
+               vhd_print_parent_locators(vhd, hex);
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       rc = vhd_get_batmap(vhd);
+       if (rc) {
+               printf("failed to get batmap header\n");
+               return rc;
+       }
+
+       vhd_print_batmap_header(&vhd->batmap, hex);
+       return 0;
+}
+
+/*
+ * vhd_dump_headers - best-effort dump of footer and header for an image that
+ * could not be opened through vhd_open().
+ *
+ * Opens @name directly, reads footer and header into a scratch context and
+ * prints both.  Returns -errno if the file cannot be opened, 0 otherwise.
+ */
+static int
+vhd_dump_headers(const char *name, int hex)
+{
+       vhd_context_t vhd;
+
+       libvhd_set_log_level(1);
+       memset(&vhd, 0, sizeof(vhd));
+
+       printf("\n%s appears invalid; dumping headers\n\n", name);
+
+       vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY);
+       if (vhd.fd == -1)
+               return -errno;
+
+       /* NOTE(review): strdup() result is not checked here */
+       vhd.file = strdup(name);
+
+       /* read results are not checked: the image is already known to be
+        * suspect and whatever ended up in the structures is printed as-is */
+       vhd_read_footer(&vhd, &vhd.footer);
+       vhd_read_header(&vhd, &vhd.header);
+
+       vhd_print_footer(&vhd.footer, hex);
+       vhd_print_header(&vhd, &vhd.header, hex);
+
+       close(vhd.fd);
+       free(vhd.file);
+
+       return 0;
+}
+
+/*
+ * vhd_print_logical_to_physical - for @count logical sectors starting at
+ * @sector, print the owning block, the sector offset within that block and
+ * the byte offset in the file (or "not allocated").
+ *
+ * Returns 0, or -ERANGE if the range extends past the current disk size.
+ */
+static int
+vhd_print_logical_to_physical(vhd_context_t *vhd,
+                             uint64_t sector, int count, int hex)
+{
+       int i, allocated;
+       uint32_t blk, lsec;
+       uint64_t cur, offset;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               fprintf(stderr, "sector %s past end of file\n",
+                       conv(hex, sector + count));
+               return -ERANGE;
+       }
+
+       for (i = 0; i < count; i++) {
+               cur       = sector + i;
+               blk       = cur / vhd->spb;
+               lsec      = cur % vhd->spb;
+               offset    = vhd->bat.bat[blk];
+               allocated = (offset != DD_BLK_UNUSED);
+
+               if (allocated) {
+                       /* the BAT entry points at the block's bitmap; data
+                        * sectors follow the bm_secs bitmap sectors (one
+                        * sector only for the default block size) */
+                       offset += vhd->bm_secs + lsec;
+                       offset  = vhd_sectors_to_bytes(offset);
+               }
+
+               printf("logical sector %s: ", conv(hex, cur));
+               printf("block number: %s, ", conv(hex, blk));
+               printf("sector offset: %s, ", conv(hex, lsec));
+               printf("file offset: %s\n", (allocated ?
+                       conv(hex, offset) : "not allocated"));
+       }
+
+       return 0;
+}
+
+/*
+ * vhd_print_bat - print the BAT entries of @count blocks starting at @block
+ * as byte offsets, or "not allocated" for unused entries.
+ *
+ * Returns 0, or -ERANGE if the range exceeds the BAT.
+ */
+static int
+vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       int n;
+       uint64_t idx, entry;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       for (n = 0; n < count; n++) {
+               idx   = block + n;
+               entry = vhd->bat.bat[idx];
+
+               printf("block: %s: ", conv(hex, idx));
+
+               if (entry == DD_BLK_UNUSED)
+                       printf("offset: %s\n", "not allocated");
+               else
+                       printf("offset: %s\n",
+                              conv(hex, vhd_sectors_to_bytes(entry)));
+       }
+
+       return 0;
+}
+
+/*
+ * write_full - write all @count bytes of @buf to @fd.
+ *
+ * Short writes are continued and EINTR is retried.  Any other write error
+ * silently ends the transfer (the function has no way to report it).
+ * A NULL @buf is a no-op.
+ */
+static inline void
+write_full(int fd, void* buf, size_t count)
+{
+       ssize_t num_written;
+       /* byte cursor: arithmetic on void * is a GNU extension */
+       char *pos = buf;
+
+       if (!buf)
+               return;
+
+       while (count > 0) {
+               num_written = write(fd, pos, count);
+               if (num_written == -1) {
+                       if (errno == EINTR)
+                               continue;
+                       return;
+               }
+
+               count -= num_written;
+               pos   += num_written;
+       }
+}
+
+/*
+ * vhd_print_bitmap - write the raw bitmap of @count blocks starting at
+ * @block to stdout.  Unallocated blocks only produce a text note.
+ *
+ * Returns 0, -ERANGE if the range exceeds the BAT, or the
+ * vhd_read_bitmap() error that stopped the dump.
+ */
+static int
+vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       int n, rc;
+       char *map;
+       uint64_t idx;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       for (n = 0; n < count; n++) {
+               idx = block + n;
+
+               if (vhd->bat.bat[idx] != DD_BLK_UNUSED) {
+                       rc = vhd_read_bitmap(vhd, idx, &map);
+                       if (rc)
+                               return rc;
+
+                       write_full(STDOUT_FILENO, map,
+                                  vhd_sectors_to_bytes(vhd->bm_secs));
+                       free(map);
+               } else
+                       printf("block %s not allocated\n", conv(hex, idx));
+       }
+
+       return 0;
+}
+
+/*
+ * vhd_test_bitmap - for @count logical sectors starting at @sector, print
+ * whether the sector's bit is set in the bitmap of its block.  Sectors of
+ * unallocated blocks are reported as 0.
+ *
+ * Returns 0, -ERANGE if the range extends past the current disk size, or
+ * the vhd_read_bitmap() error that stopped the scan.
+ */
+static int
+vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex)
+{
+       char *buf;
+       uint64_t cur;
+       int i, err, bit;
+       uint32_t blk, bm_blk, sec;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               printf("sector %s past end of file\n",
+                      conv(hex, sector + count));
+               return -ERANGE;
+       }
+
+       bm_blk = -1;    /* no bitmap cached yet */
+       buf    = NULL;
+
+       for (i = 0; i < count; i++) {
+               cur = sector + i;
+               blk = cur / vhd->spb;
+               sec = cur % vhd->spb;
+
+               /* (re)load the bitmap only when crossing into a new block */
+               if (blk != bm_blk) {
+                       bm_blk = blk;
+                       free(buf);
+                       buf = NULL;
+
+                       if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+                               err = vhd_read_bitmap(vhd, blk, &buf);
+                               if (err)
+                                       goto out;
+                       }
+               }
+
+               if (vhd->bat.bat[blk] == DD_BLK_UNUSED)
+                       bit = 0;
+               else
+                       /* the bitmap is indexed by sector within the block,
+                        * not by block number */
+                       bit = vhd_bitmap_test(vhd, buf, sec);
+
+               printf("block %s: ", conv(hex, blk));
+               printf("sec: %s: %d\n", conv(hex, sec), bit);
+       }
+
+       err = 0;
+ out:
+       free(buf);
+       return err;
+}
+
+/*
+ * vhd_print_batmap - load the batmap and write its raw contents to stdout.
+ *
+ * Returns 0, or the vhd_get_batmap() error.
+ */
+static int
+vhd_print_batmap(vhd_context_t *vhd)
+{
+       int rc;
+       size_t bytes;
+
+       rc = vhd_get_batmap(vhd);
+       if (!rc) {
+               /* batmap_size is expressed in sectors */
+               bytes = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size);
+               write_full(STDOUT_FILENO, vhd->batmap.map, bytes);
+               return 0;
+       }
+
+       printf("failed to read batmap: %d\n", rc);
+       return rc;
+}
+
+/*
+ * vhd_test_batmap - report (on stderr) the batmap bit of @count blocks
+ * starting at @block.
+ *
+ * Returns 0, -ERANGE if the range exceeds the BAT, or the
+ * vhd_get_batmap() error.
+ */
+static int
+vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       int n, rc;
+       uint64_t idx;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       rc = vhd_get_batmap(vhd);
+       if (rc) {
+               fprintf(stderr, "failed to get batmap\n");
+               return rc;
+       }
+
+       n = 0;
+       while (n < count) {
+               idx = block + n;
+               fprintf(stderr, "batmap for block %s: %d\n", conv(hex, idx),
+                       vhd_batmap_test(vhd, &vhd->batmap, idx));
+               n++;
+       }
+
+       return 0;
+}
+
+/*
+ * vhd_print_data - write the raw contents of @count blocks starting at
+ * @block to stdout.  Unallocated blocks only produce a text note.
+ *
+ * Returns 0, -ERANGE if the range exceeds the BAT, or the
+ * vhd_read_block() error that stopped the dump.
+ */
+static int
+vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       int n, rc;
+       char *data;
+       uint64_t idx;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       rc = 0;
+
+       for (n = 0; n < count; n++) {
+               idx = block + n;
+
+               if (vhd->bat.bat[idx] != DD_BLK_UNUSED) {
+                       rc = vhd_read_block(vhd, idx, &data);
+                       if (rc)
+                               return rc;
+
+                       write_full(STDOUT_FILENO, data,
+                                  vhd->header.block_size);
+                       free(data);
+               } else
+                       printf("block %s not allocated\n", conv(hex, idx));
+       }
+
+       return rc;
+}
+
+/*
+ * vhd_read_data - read @count sectors starting at @sec through
+ * vhd_io_read() and write them to stdout, at most VHD_BLOCK_SIZE bytes per
+ * pass.  @hex is unused.
+ *
+ * Returns 0, -ERANGE if the range extends past the current disk size, a
+ * negated posix_memalign() error, or the vhd_io_read() error.
+ */
+static int
+vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, int hex)
+{
+       char *data;
+       uint64_t pos;
+       int rc, chunk, n;
+
+       if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size)
+               return -ERANGE;
+
+       /* bounce buffer: the whole request, capped at one block */
+       chunk = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE);
+       rc    = posix_memalign((void **)&data, VHD_SECTOR_SIZE, chunk);
+       if (rc)
+               return -rc;
+
+       for (pos = sec; count; pos += n, count -= n) {
+               n  = MIN((chunk >> VHD_SECTOR_SHIFT), count);
+               rc = vhd_io_read(vhd, data, pos, n);
+               if (rc)
+                       break;
+
+               write_full(STDOUT_FILENO, data, vhd_sectors_to_bytes(n));
+       }
+
+       free(data);
+       return rc;
+}
+
+/*
+ * vhd_util_read - inspect a VHD: print headers, BAT entries, bitmaps, the
+ * batmap, raw block data, or translate/read logical sectors.
+ *
+ * See the usage text at the bottom for the option list.  -c sets the number
+ * of units (default 1) applied to every block- or sector-based option.
+ * When the image cannot be opened, its footer and header are dumped on a
+ * best-effort basis before the error is returned.
+ *
+ * Returns 0 on success, -EINVAL on bad usage (like the other vhd-util
+ * commands), otherwise the error of the first failing operation.
+ */
+int
+vhd_util_read(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       int c, err, headers, hex;
+       uint64_t bat, bitmap, tbitmap, batmap, tbatmap, data, lsec, count, read;
+
+       /* -1 (all bits set) marks "option not given" */
+       err     = 0;
+       hex     = 0;
+       headers = 0;
+       count   = 1;
+       bat     = -1;
+       bitmap  = -1;
+       tbitmap = -1;
+       batmap  = -1;
+       tbatmap = -1;
+       data    = -1;
+       lsec    = -1;
+       read    = -1;
+       name    = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:pt:b:m:i:aj:d:c:r:xh")) != -1) {
+               switch(c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'p':
+                       headers = 1;
+                       break;
+               case 't':
+                       lsec = strtoull(optarg, NULL, 10);
+                       break;
+               case 'b':
+                       bat = strtoull(optarg, NULL, 10);
+                       break;
+               case 'm':
+                       bitmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'i':
+                       tbitmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'a':
+                       batmap = 1;
+                       break;
+               case 'j':
+                       tbatmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'd':
+                       data = strtoull(optarg, NULL, 10);
+                       break;
+               case 'r':
+                       read = strtoull(optarg, NULL, 10);
+                       break;
+               case 'c':
+                       count = strtoull(optarg, NULL, 10);
+                       break;
+               case 'x':
+                       hex = 1;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err) {
+               printf("Failed to open %s: %d\n", name, err);
+               vhd_dump_headers(name, hex);
+               return err;
+       }
+
+       err = vhd_get_bat(&vhd);
+       if (err) {
+               printf("Failed to get bat for %s: %d\n", name, err);
+               goto out;
+       }
+
+       if (headers) {
+               /* report a header failure like every other operation */
+               err = vhd_print_headers(&vhd, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (lsec != -1) {
+               err = vhd_print_logical_to_physical(&vhd, lsec, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bat != -1) {
+               err = vhd_print_bat(&vhd, bat, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bitmap != -1) {
+               err = vhd_print_bitmap(&vhd, bitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (tbitmap != -1) {
+               err = vhd_test_bitmap(&vhd, tbitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (batmap != -1) {
+               err = vhd_print_batmap(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (tbatmap != -1) {
+               err = vhd_test_batmap(&vhd, tbatmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (data != -1) {
+               err = vhd_print_data(&vhd, data, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (read != -1) {
+               err = vhd_read_data(&vhd, read, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+ out:
+       vhd_close(&vhd);
+       return err;
+
+ usage:
+       printf("options:\n"
+              "-h          help\n"
+              "-n          name\n"
+              "-p          print VHD headers\n"
+              "-t sec      translate logical sector to VHD location\n"
+              "-b blk      print bat entry\n"
+              "-m blk      print bitmap\n"
+              "-i sec      test bitmap for logical sector\n"
+              "-a          print batmap\n"
+              "-j blk      test batmap for block\n"
+              "-d blk      print data\n"
+              "-c num      num units\n"
+              "-r sec      read num sectors at sec\n"
+              "-x          print in hex\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-repair.c b/tools/blktap2/vhd/lib/vhd-util-repair.c
new file mode 100644 (file)
index 0000000..a1d2c45
--- /dev/null
@@ -0,0 +1,84 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "repair" sub-command.
+ *
+ * Opens the VHD named by -n read/write, asks vhd_end_of_data() where the
+ * data ends, and writes the footer held in the opened context at that
+ * offset with vhd_write_footer_at().
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise the error from
+ * vhd_open(), vhd_end_of_data() or vhd_write_footer_at().
+ */
+int
+vhd_util_repair(int argc, char **argv)
+{
+       vhd_context_t vhd;
+       off64_t end;
+       char *path = NULL;
+       int opt, rc;
+
+       if (!argc || !argv)
+               goto bad_usage;
+
+       /* restart option scanning for this argv */
+       optind = 0;
+       for (;;) {
+               opt = getopt(argc, argv, "n:h");
+               if (opt == -1)
+                       break;
+
+               /* -h and anything unrecognized both print the usage */
+               if (opt != 'n')
+                       goto bad_usage;
+
+               path = optarg;
+       }
+
+       if (path == NULL || optind != argc)
+               goto bad_usage;
+
+       rc = vhd_open(&vhd, path, VHD_OPEN_RDWR);
+       if (rc) {
+               printf("error opening %s: %d\n", path, rc);
+               return rc;
+       }
+
+       rc = vhd_end_of_data(&vhd, &end);
+       if (rc)
+               printf("error finding end of data: %d\n", rc);
+       else
+               rc = vhd_write_footer_at(&vhd, &vhd.footer, end);
+
+       vhd_close(&vhd);
+       return rc;
+
+bad_usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-resize.c b/tools/blktap2/vhd/lib/vhd-util-resize.c
new file mode 100644 (file)
index 0000000..0143d7a
--- /dev/null
@@ -0,0 +1,1131 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+
+#include "libvhd-journal.h"
+
+/* Debug printf: active while the condition below is 1; the #else branch
+ * turns DFPRINTF into a no-op expression. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Report an error: syslog it at LOG_INFO with the calling function's name
+ * prepended, then echo the same message through DFPRINTF. */
+#define EPRINTF(_f, _a...)                                     \
+       do {                                                    \
+               syslog(LOG_INFO, "%s: " _f, __func__, ##_a);    \
+               DFPRINTF(_f, _a);                               \
+       } while (0)
+
+/* One BAT slot as handled by the resize code: the block's index in the
+ * BAT paired with the BAT entry stored for that index. */
+typedef struct vhd_block {
+       uint32_t block;
+       uint32_t offset;
+} vhd_block_t;
+
+TEST_FAIL_EXTERN_VARS;
+
+/* Convert a sector count into a count of whole blocks (vhd->spb sectors
+ * each), discarding any partial trailing block. */
+static inline uint32_t
+secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs)
+{
+       uint32_t whole = secs / vhd->spb;
+
+       return whole;
+}
+
+/* Convert a sector count into a count of blocks (vhd->spb sectors each),
+ * rounding a partial trailing block up to a full one. */
+static uint32_t
+secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs)
+{
+       uint32_t whole = secs / vhd->spb;
+
+       /* any remainder still occupies a block of its own */
+       return (secs % vhd->spb) ? whole + 1 : whole;
+}
+
+/*
+ * Shrink a fixed VHD by @secs sectors.
+ *
+ * The new size is footer.curr_size minus @secs converted to bytes.  The
+ * file is truncated to that size, footer.curr_size is updated and the
+ * footer is written back with vhd_write_footer().
+ *
+ * Returns 0 on success, -EINVAL if @secs exceeds the current size or the
+ * result would be no larger than a footer, -errno if ftruncate() fails,
+ * or the error from vhd_write_footer().
+ */
+static int
+vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+       int err;
+       uint64_t bytes, new_eof;
+       vhd_context_t *vhd;
+
+       vhd   = &journal->vhd;
+       bytes = vhd_sectors_to_bytes(secs);
+
+       /* keep the unsigned subtraction below from wrapping around */
+       if (bytes > vhd->footer.curr_size)
+               return -EINVAL;
+
+       new_eof = vhd->footer.curr_size - bytes;
+       if (new_eof <= sizeof(vhd_footer_t))
+               return -EINVAL;
+
+       err = ftruncate(vhd->fd, new_eof);
+       if (err)
+               return -errno;  /* negative, like every other error path */
+
+       vhd->footer.curr_size = new_eof;
+       return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Write @size bytes to the VHD starting at byte offset @off, using a
+ * read-only anonymous mapping as the source buffer for every write
+ * (anonymous mappings are zero-initialized per mmap(2)).
+ *
+ * The buffer is at most VHD_BLOCK_SIZE bytes, so larger requests are
+ * issued as a series of writes.
+ *
+ * Returns 0 on success, -errno if the mapping cannot be created, or the
+ * error from vhd_seek()/vhd_write().
+ */
+static int
+vhd_write_zeros(vhd_journal_t *journal, off64_t off, uint64_t size)
+{
+       vhd_context_t *ctx = &journal->vhd;
+       uint64_t chunk = MIN(size, VHD_BLOCK_SIZE);
+       uint64_t len;
+       char *zeros;
+       int rc;
+
+       rc = vhd_seek(ctx, off, SEEK_SET);
+       if (rc)
+               return rc;
+
+       zeros = mmap(0, chunk, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (zeros == MAP_FAILED)
+               return -errno;
+
+       for (;;) {
+               len = MIN(size, chunk);
+
+               rc = vhd_write(ctx, zeros, len);
+               if (rc)
+                       break;
+
+               size -= len;
+               if (!size)
+                       break;
+       }
+
+       munmap(zeros, chunk);
+
+       return rc;
+}
+
+/*
+ * Grow a fixed VHD by @secs sectors.
+ *
+ * Zero-fills @secs sectors' worth of bytes starting sizeof(vhd_footer_t)
+ * bytes before the current end of file (presumably over the existing
+ * trailing footer -- confirm), adds the same amount to footer.curr_size
+ * and rewrites the footer with vhd_write_footer().
+ *
+ * Returns 0 on success or the error from the first failing step.
+ */
+static int
+vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs)
+{
+       int err;
+       vhd_context_t *vhd;
+       uint64_t size, eof, new_eof;
+
+       size = vhd_sectors_to_bytes(secs);
+       vhd  = &journal->vhd;
+
+       /* find the current end of file */
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               goto out;
+
+       eof = vhd_position(vhd);
+       /* eof is unsigned; the failure value is matched through the cast */
+       if (eof == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), size);
+       if (err)
+               goto out;
+
+       /* NOTE(review): the zero-fill above ends at
+        * eof - sizeof(vhd_footer_t) + size, so this seeks past what was
+        * just written; whether that matters depends on how
+        * vhd_write_footer() positions itself -- confirm */
+       new_eof = eof + size;
+       err = vhd_seek(vhd, new_eof, SEEK_SET);
+       if (err)
+               goto out;
+
+       vhd->footer.curr_size += size;
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       return err;
+}
+
+/*
+ * Resize a fixed VHD to @size, where @size is scaled by 2^20 bytes.
+ *
+ * Compares the footer's current size with the requested one, both in
+ * sectors, and delegates the difference to vhd_fixed_shrink() or
+ * vhd_fixed_grow().  Returns 0 when nothing needs to change, otherwise
+ * the result of the delegate.
+ */
+static int
+vhd_fixed_resize(vhd_journal_t *journal, uint64_t size)
+{
+       uint64_t have, want;
+
+       have = journal->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
+       want = size << (20 - VHD_SECTOR_SHIFT);
+
+       if (have > want)
+               return vhd_fixed_shrink(journal, have - want);
+
+       if (have < want)
+               return vhd_fixed_grow(journal, want - have);
+
+       return 0;
+}
+
+/* Exchange elements @a and @b of @list. */
+static inline void
+swap(vhd_block_t *list, int a, int b)
+{
+       vhd_block_t *x = list + a;
+       vhd_block_t *y = list + b;
+       vhd_block_t saved = *x;
+
+       *x = *y;
+       *y = saved;
+}
+
+/*
+ * Partition list[left..right] around the offset of list[pidx] so that
+ * entries with an offset >= the pivot come first (descending order).
+ * Returns the final index of the pivot element.
+ */
+static int
+partition(vhd_block_t *list, int left, int right, int pidx)
+{
+       const long long pivot = list[pidx].offset;
+       int store = left;
+       int cur = left;
+
+       /* park the pivot at the right edge while the rest is scanned */
+       swap(list, pidx, right);
+
+       while (cur < right) {
+               if (list[cur].offset >= pivot)
+                       swap(list, store++, cur);
+               cur++;
+       }
+
+       /* drop the pivot between the two partitions */
+       swap(list, right, store);
+       return store;
+}
+
+/*
+ * Recursively sort list[left..right] by descending offset, always using
+ * the leftmost element of the range as the pivot.
+ */
+static void
+quicksort(vhd_block_t *list, int left, int right)
+{
+       int split;
+
+       if (left <= right) {
+               split = partition(list, left, right, left);
+               quicksort(list, left, split - 1);
+               quicksort(list, split + 1, right);
+       }
+}
+
+/*
+ * Relocate block @src so that it starts at byte offset @offset.
+ *
+ * The block is first added to the journal, then its bitmap is copied to
+ * @offset and its data immediately after the bitmap.  The in-memory BAT
+ * entry is pointed at the new location (in sectors) and the old bitmap +
+ * data region is overwritten via vhd_write_zeros().  The BAT is only
+ * updated in memory here.
+ *
+ * Returns 0 on success, -EINVAL if @src is not allocated, or the error
+ * from the first failing journal/read/seek/write step.
+ */
+static int
+vhd_move_block(vhd_journal_t *journal, uint32_t src, off64_t offset)
+{
+       int err;
+       char *buf;
+       size_t size;
+       vhd_context_t *vhd;
+       off64_t off, src_off;
+
+       buf     = NULL;
+       vhd     = &journal->vhd;
+       off     = offset;
+       size    = vhd_sectors_to_bytes(vhd->bm_secs);
+       src_off = vhd->bat.bat[src];
+
+       if (src_off == DD_BLK_UNUSED)
+               return -EINVAL;
+       /* BAT entry is in sectors; keep the old location in bytes */
+       src_off = vhd_sectors_to_bytes(src_off);
+
+       err  = vhd_journal_add_block(journal, src,
+                                    VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+       if (err)
+               goto out;
+
+       /* copy the bitmap to the new location */
+       err  = vhd_read_bitmap(vhd, src, &buf);
+       if (err)
+               goto out;
+
+       err  = vhd_seek(vhd, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = vhd_write(vhd, buf, size);
+       if (err)
+               goto out;
+
+       /* done with the bitmap; the data goes right behind it */
+       free(buf);
+       buf   = NULL;
+       off  += size;
+       size  = vhd_sectors_to_bytes(vhd->spb);
+
+       err  = vhd_read_block(vhd, src, &buf);
+       if (err)
+               goto out;
+
+       err  = vhd_seek(vhd, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err  = vhd_write(vhd, buf, size);
+       if (err)
+               goto out;
+
+       /* repoint the in-memory BAT entry, then wipe the old copy */
+       vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT;
+
+       err = vhd_write_zeros(journal, src_off,
+                             vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb));
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Overwrite block @dest with block @src: journal @dest, move @src into
+ * the file location @dest currently occupies, then mark @dest unused in
+ * the in-memory BAT.
+ *
+ * Returns 0 on success or the error from vhd_journal_add_block() /
+ * vhd_move_block().
+ */
+static int
+vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest)
+{
+       vhd_context_t *ctx = &journal->vhd;
+       off64_t target = vhd_sectors_to_bytes(ctx->bat.bat[dest]);
+       int rc;
+
+       rc = vhd_journal_add_block(journal, dest,
+                                  VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+       if (!rc)
+               rc = vhd_move_block(journal, src, target);
+       if (rc)
+               return rc;
+
+       ctx->bat.bat[dest] = DD_BLK_UNUSED;
+
+       return 0;
+}
+
+/*
+ * remove a list of blocks from the vhd file
+ * if a block to be removed:
+ *   - resides at the end of the file: simply clear its bat entry
+ *   - resides elsewhere: move the last block in the file into its position
+ *                        and update the bat to reflect this
+ *
+ * @original_free_list holds @free_cnt entries and is not modified; a
+ * sorted private copy is used instead.  Only the in-memory BAT is
+ * updated here.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * vhd_clobber_block().
+ */
+static int
+vhd_defrag_shrink(vhd_journal_t *journal,
+                 vhd_block_t *original_free_list, int free_cnt)
+{
+       vhd_context_t *vhd;
+       int i, j, free_idx, err;
+       vhd_block_t *blocks, *free_list;
+
+       err       = 0;
+       blocks    = NULL;
+       free_list = NULL;
+       vhd       = &journal->vhd;
+
+       blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t));
+       if (!blocks) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       free_list = malloc(free_cnt * sizeof(vhd_block_t));
+       if (!free_list) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* snapshot the whole BAT as (index, entry) pairs */
+       for (i = 0; i < vhd->bat.entries; i++) {
+               blocks[i].block  = i;
+               blocks[i].offset = vhd->bat.bat[i];
+       }
+
+       memcpy(free_list, original_free_list,
+              free_cnt * sizeof(vhd_block_t));
+
+       /* sort both the to-free list and the bat list
+        * in order of descending file offset */
+       quicksort(free_list, 0, free_cnt - 1);
+       quicksort(blocks, 0, vhd->bat.entries - 1);
+
+       for (i = 0, free_idx = 0;
+            i < vhd->bat.entries && free_idx < free_cnt; i++) {
+               vhd_block_t *b = blocks + i;
+
+               if (b->offset == DD_BLK_UNUSED)
+                       continue;
+
+               /* NOTE(review): the 'continue' below applies to this inner
+                * loop, not the outer one, so after a match control still
+                * falls through to vhd_clobber_block() with a source whose
+                * bat entry was just cleared, and free_list[free_idx++] may
+                * then index one past the end.  Needs fixing before
+                * vhd_dynamic_shrink() is enabled. */
+               for (j = free_idx; j < free_cnt; j++)
+                       if (b->block == free_list[j].block) {
+                               /* the last block in the file is in the list of
+                                * blocks to remove; no need to shuffle the
+                                * data -- just clear the bat entry */
+                               vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED;
+                               free_idx++;
+                               continue;
+                       }
+
+               err = vhd_clobber_block(journal, b->block,
+                                       free_list[free_idx++].block);
+               if (err)
+                       goto out;
+       }
+
+       /* clear any bat entries for blocks we did not shuffle */
+       for (i = free_idx; i < free_cnt; i++)
+               vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED;
+
+out:
+       free(blocks);
+       free(free_list);
+
+       return err;
+}
+
+/*
+ * Drop the last @entries entries from the BAT.
+ *
+ * Shrinks header.max_bat_size, recomputes the footer size and geometry,
+ * zeroes the dropped BAT slots and writes header, footer and BAT.  When
+ * the image has a batmap, the old batmap header is zeroed if its offset
+ * changed, the bits of the dropped blocks are cleared and the batmap is
+ * written.
+ *
+ * Returns 0 on success or the error from the first failing step.
+ */
+static int
+vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries)
+{
+       int i, err;
+       vhd_context_t *vhd;
+       off64_t orig_map_off, new_map_off;
+       uint32_t orig_entries, new_entries;
+
+       vhd          = &journal->vhd;
+       orig_entries = vhd->header.max_bat_size;
+       new_entries  = orig_entries - entries;
+
+       /* remember where the batmap header is before the BAT shrinks;
+        * orig_map_off is only read again on the batmap path below */
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_batmap_header_offset(vhd, &orig_map_off);
+               if (err)
+                       return err;
+       }
+
+       /* update header */
+       vhd->header.max_bat_size = new_entries;
+       err = vhd_write_header(vhd, &vhd->header);
+       if (err)
+               return err;
+
+       /* update footer */
+       vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+       vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               return err;
+
+       /* update bat -- we don't reclaim space, just clear entries */
+       for (i = new_entries; i < orig_entries; i++)
+               vhd->bat.bat[i] = 0;
+
+       err = vhd_write_bat(vhd, &vhd->bat);
+       if (err)
+               return err;
+
+       /* update this after write_bat so the end of the bat is zeroed */
+       vhd->bat.entries = new_entries;
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       /* zero out old batmap header if new header has moved */
+       err = vhd_batmap_header_offset(vhd, &new_map_off);
+       if (err)
+               return err;
+
+       if (orig_map_off != new_map_off) {
+               size_t size;
+
+               size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+               err = vhd_write_zeros(journal, orig_map_off, size);
+               if (err)
+                       return err;
+       }
+
+       /* update batmap -- clear entries for freed blocks */
+       for (i = new_entries; i < orig_entries; i++)
+               vhd_batmap_clear(vhd, &vhd->batmap, i);
+
+       err = vhd_write_batmap(vhd, &vhd->batmap);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/*
+ * Shrink a dynamic VHD by @secs sectors.
+ *
+ * Currently a stub: it prints a notice and returns -ENOSYS.  The code
+ * after that unconditional return is unreachable; it sketches the
+ * intended sequence (collect the allocated blocks among the trailing BAT
+ * entries, defragment them away, clear the BAT entries, truncate the
+ * file).
+ */
+static int
+vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+       off64_t eof;
+       uint32_t blocks;
+       vhd_context_t *vhd;
+       int i, j, err, free_cnt;
+       struct vhd_block *free_list;
+
+       printf("dynamic shrink not fully implemented\n");
+       return -ENOSYS;
+
+       /* ---- unreachable from here on ---- */
+       eof       = 0;
+       free_cnt  = 0;
+       free_list = NULL;
+       vhd       = &journal->vhd;
+
+       /* only whole blocks can be removed */
+       blocks    = secs_to_blocks_down(vhd, secs);
+       if (blocks == 0)
+               return 0;
+
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_get_batmap(vhd);
+               if (err)
+                       return err;
+       }
+
+       free_list = malloc(blocks * sizeof(struct vhd_block));
+       if (!free_list)
+               return -ENOMEM;
+
+       /* walk the last 'blocks' BAT entries, recording the allocated ones */
+       for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) {
+               uint32_t blk = vhd->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       free_list[free_cnt].block  = i;
+                       free_list[free_cnt].offset = blk;
+                       free_cnt++;
+               }
+       }
+
+       if (free_cnt) {
+               err = vhd_defrag_shrink(journal, free_list, free_cnt);
+               if (err)
+                       goto out;
+       }
+
+       err = vhd_clear_bat_entries(journal, blocks);
+       if (err)
+               goto out;
+
+       /* remove data beyond footer */
+       err = vhd_end_of_data(vhd, &eof);
+       if (err)
+               goto out;
+
+       err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t));
+       if (err) {
+               err = -errno;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(free_list);
+       return err;
+}
+
+/*
+ * Find the allocated block with the smallest BAT entry and store its
+ * index and entry in @block.  @block is left all-zero when no block is
+ * allocated.
+ */
+static inline void
+vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block)
+{
+       uint32_t entry;
+       int idx;
+
+       memset(block, 0, sizeof(vhd_block_t));
+
+       for (idx = 0; idx < vhd->bat.entries; idx++) {
+               entry = vhd->bat.bat[idx];
+
+               if (entry == DD_BLK_UNUSED)
+                       continue;
+
+               /* keep the current candidate unless this one is lower */
+               if (block->offset && entry >= block->offset)
+                       continue;
+
+               block->block  = idx;
+               block->offset = entry;
+       }
+}
+
+/*
+ * Return the sector just past the end of the highest allocated block,
+ * i.e. the maximum over all allocated BAT entries of
+ * entry + vhd->spb + vhd->bm_secs.  Returns 0 when no block is allocated.
+ */
+static inline uint32_t
+vhd_next_block_offset(vhd_context_t *vhd)
+{
+       int i;
+       uint32_t blk, end, next;
+
+       next = 0;
+
+       for (i = 0; i < vhd->bat.entries; i++) {
+               blk = vhd->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       end  = blk + vhd->spb + vhd->bm_secs;
+                       next = MAX(next, end);
+               }
+       }
+
+       return next;
+}
+
+/*
+ * Return non-zero when @off lies strictly between @start and
+ * @start + @size (both endpoints excluded), zero otherwise.
+ */
+static inline int
+in_range(off64_t off, off64_t start, off64_t size)
+{
+       if (off <= start)
+               return 0;
+
+       return off < start + size;
+}
+
+/* Bits for the 'mode' argument of vhd_check_for_clobber(): each one
+ * suppresses the overlap check for the named region. */
+#define SKIP_HEADER 0x01
+#define SKIP_BAT    0x02
+#define SKIP_BATMAP 0x04
+#define SKIP_PLOC   0x08
+#define SKIP_DATA   0x10
+
+/* Return the bits of @type that are set in @mode (non-zero means the
+ * corresponding check is to be skipped). */
+static inline int
+skip_check(int mode, int type)
+{
+       int masked = mode & type;
+
+       return masked;
+}
+
+/*
+ * Check whether byte offset @off lands inside a region of a dynamic VHD
+ * that must not be overwritten.
+ *
+ * @mode is a mask of SKIP_* bits naming the regions to leave unchecked.
+ * The regions tested are: the first sector (backup footer, always
+ * checked), the header, the BAT, the batmap, each in-use parent locator,
+ * and the data block with the lowest BAT entry.  Range tests go through
+ * in_range(), which excludes the region's first byte.
+ *
+ * Returns 0 when no checked region contains @off (always 0 for
+ * non-dynamic images); otherwise logs the region name and returns
+ * -EINVAL.
+ */
+static int
+vhd_check_for_clobber(vhd_context_t *vhd, off64_t off, int mode)
+{
+       int i, n;
+       char *msg;
+       size_t size;
+       vhd_block_t fb;
+       vhd_parent_locator_t *loc;
+
+       msg = NULL;
+
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       if (off < VHD_SECTOR_SIZE) {
+               msg = "backup footer";
+               goto fail;
+       }
+
+       if (!skip_check(mode, SKIP_HEADER))
+               if (in_range(off,
+                            vhd->footer.data_offset, sizeof(vhd_header_t))) {
+                       msg = "header";
+                       goto fail;
+               }
+
+       if (!skip_check(mode, SKIP_BAT))
+               if (in_range(off, vhd->header.table_offset,
+                            vhd_bytes_padded(vhd->header.max_bat_size *
+                                             sizeof(uint32_t)))) {
+                       msg = "bat";
+                       goto fail;
+               }
+
+       if (!skip_check(mode, SKIP_BATMAP))
+               if (vhd_has_batmap(vhd) &&
+                   in_range(off, vhd->batmap.header.batmap_offset,
+                            vhd_bytes_padded(vhd->batmap.header.batmap_size))) {
+                       msg = "batmap";
+                       goto fail;
+               }
+
+       if (!skip_check(mode, SKIP_PLOC)) {
+               n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+               for (i = 0; i < n; i++) {
+                       loc = vhd->header.loc + i;
+                       if (loc->code == PLAT_CODE_NONE)
+                               continue;
+
+                       size = vhd_parent_locator_size(loc);
+                       if (in_range(off, loc->data_offset, size)) {
+                               msg = "parent locator";
+                               goto fail;
+                       }
+               }
+       }
+
+       /* only the lowest-placed data block is tested; fb.offset stays 0
+        * when no block is allocated */
+       if (!skip_check(mode, SKIP_DATA)) {
+               vhd_first_data_block(vhd, &fb);
+               if (fb.offset && in_range(off,
+                                         vhd_sectors_to_bytes(fb.offset),
+                                         VHD_BLOCK_SIZE)) {
+                       msg = "data block";
+                       goto fail;
+               }
+       }
+
+       return 0;
+
+fail:
+       EPRINTF("write to 0x%08"PRIx64" would clobber %s\n", off, msg);
+       return -EINVAL;
+}
+
+/*
+ * take any metadata after the bat (@eob) and shift it
+ *
+ * Every in-use parent locator whose data_offset is at or beyond @eob is
+ * read into memory and rewritten @bat_needed + @map_needed bytes further
+ * into the file; the in-memory locator offsets are updated to match.  If
+ * the image has a batmap located after @eob, its in-memory offset is
+ * advanced by @bat_needed.  Neither the header nor the batmap is written
+ * here; both are meant to be written after the new bat.
+ *
+ * Returns 0 on success, -EINVAL if a shifted locator would clobber other
+ * data, -ENOMEM / negative errno on allocation failure, or the error
+ * from vhd_seek()/vhd_read()/vhd_write().  All temporary buffers are
+ * released on every exit path.
+ */
+static int
+vhd_shift_metadata(vhd_journal_t *journal, off64_t eob,
+                  size_t bat_needed, size_t map_needed)
+{
+       int i, n, err;
+       vhd_context_t *vhd;
+       size_t size_needed;
+       char *buf, **locators;
+       vhd_parent_locator_t *loc;
+
+       vhd         = &journal->vhd;
+       size_needed = bat_needed + map_needed;
+
+       n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+       locators = calloc(n, sizeof(char *));
+       if (!locators)
+               return -ENOMEM;
+
+       /* pass 1: read every locator that has to move into memory */
+       for (i = 0; i < n; i++) {
+               size_t size;
+
+               loc = vhd->header.loc + i;
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               if (loc->data_offset < eob)
+                       continue;
+
+               size = vhd_parent_locator_size(loc);
+               err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+               if (err) {
+                       /* posix_memalign reports a positive error number */
+                       err = -err;
+                       goto out;
+               }
+
+               /* record the buffer immediately so the cleanup at 'out'
+                * releases it if the seek or read below fails */
+               locators[i] = buf;
+
+               err  = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+               if (err)
+                       goto out;
+
+               err  = vhd_read(vhd, buf, size);
+               if (err)
+                       goto out;
+       }
+
+       /* pass 2: write each buffered locator at its shifted offset */
+       for (i = 0; i < n; i++) {
+               off64_t off;
+               size_t size;
+
+               if (!locators[i])
+                       continue;
+
+               loc  = vhd->header.loc + i;
+               off  = loc->data_offset + size_needed;
+               size = vhd_parent_locator_size(loc);
+
+               if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) {
+                       EPRINTF("%s: shifting locator %d would clobber data\n",
+                               vhd->file, i);
+                       /* leave through 'out' so nothing is leaked */
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               err  = vhd_seek(vhd, off, SEEK_SET);
+               if (err)
+                       goto out;
+
+               err  = vhd_write(vhd, locators[i], size);
+               if (err)
+                       goto out;
+
+               free(locators[i]);
+               locators[i]      = NULL;
+               loc->data_offset = off;
+
+               /* write the new header after writing the new bat */
+       }
+
+       if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) {
+               vhd->batmap.header.batmap_offset += bat_needed;
+
+               /* write the new batmap after writing the new bat */
+       }
+
+       err = 0;
+
+out:
+       for (i = 0; i < n; i++)
+               free(locators[i]);
+       free(locators);
+
+       return err;
+}
+
+/*
+ * Extend the BAT (and the batmap, if the image has one) by @entries
+ * entries.
+ *
+ * After checking that the enlarged BAT and batmap would not clobber
+ * other regions, the header and footer are rewritten for the new entry
+ * count, a larger BAT is allocated with the new slots set to
+ * DD_BLK_UNUSED and written out, and the same is done for the batmap
+ * with the new bits cleared.  On success the in-memory BAT and batmap
+ * are replaced by the new ones.
+ *
+ * Returns 0 on success, -EINVAL if the new BAT or batmap would clobber
+ * data, a negative errno on allocation failure, or the error from the
+ * first failing write.
+ */
+static int
+vhd_add_bat_entries(vhd_journal_t *journal, int entries)
+{
+       int i, err;
+       off64_t off;
+       vhd_bat_t new_bat;
+       vhd_context_t *vhd;
+       uint32_t new_entries;
+       vhd_batmap_t new_batmap;
+       uint64_t bat_size, new_bat_size, map_size, new_map_size;
+
+       vhd          = &journal->vhd;
+       new_entries  = vhd->header.max_bat_size + entries;
+
+       /* sizes are taken before max_bat_size is updated below */
+       bat_size     = vhd_bytes_padded(vhd->header.max_bat_size *
+                                       sizeof(uint32_t));
+       new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t));
+
+       map_size     = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3);
+       new_map_size = vhd_bytes_padded((new_entries + 7) >> 3);
+
+       off = vhd->header.table_offset + new_bat_size;
+       if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) {
+               EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes "
+                       "at 0x%08"PRIx64" would clobber data\n",
+                       vhd->file, new_bat_size, vhd->header.table_offset);
+               return -EINVAL;
+       }
+
+       if (vhd_has_batmap(vhd)) {
+               off = vhd->batmap.header.batmap_offset + new_map_size;
+               if (vhd_check_for_clobber(vhd, off, 0)) {
+                       EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes"
+                               " at 0x%08"PRIx64" would clobber data\n", vhd->file,
+                               new_map_size, vhd->batmap.header.batmap_offset);
+                       return -EINVAL;
+               }
+       }
+
+       /* update header */
+       vhd->header.max_bat_size = new_entries;
+       err = vhd_write_header(vhd, &vhd->header);
+       if (err)
+               return err;
+
+       /* update footer */
+       vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+       vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+       vhd->footer.checksum  = vhd_checksum_footer(&vhd->footer);
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               return err;
+
+       /* allocate new bat */
+       err = posix_memalign((void **)&new_bat.bat, VHD_SECTOR_SIZE, new_bat_size);
+       if (err)
+               return -err;
+
+       new_bat.spb     = vhd->bat.spb;
+       new_bat.entries = new_entries;
+       memcpy(new_bat.bat, vhd->bat.bat, bat_size);
+       for (i = vhd->bat.entries; i < new_entries; i++)
+               new_bat.bat[i] = DD_BLK_UNUSED;
+
+       /* write new bat */
+       err = vhd_write_bat(vhd, &new_bat);
+       if (err) {
+               free(new_bat.bat);
+               return err;
+       }
+
+       /* update in-memory bat */
+       free(vhd->bat.bat);
+       vhd->bat = new_bat;
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       /* allocate new batmap */
+       err = posix_memalign((void **)&new_batmap.map,
+                            VHD_SECTOR_SIZE, new_map_size);
+       if (err)
+               return -err;    /* positive error number -> negative errno */
+
+       new_batmap.header = vhd->batmap.header;
+       new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size);
+       memcpy(new_batmap.map, vhd->batmap.map, map_size);
+       memset(new_batmap.map + map_size, 0, new_map_size - map_size);
+
+       /* write new batmap */
+       err = vhd_write_batmap(vhd, &new_batmap);
+       if (err) {
+               free(new_batmap.map);
+               return err;
+       }
+
+       /* update in-memory batmap */
+       free(vhd->batmap.map);
+       vhd->batmap = new_batmap;
+
+       return 0;
+}
+
+/*
+ * Grow a dynamic VHD by @secs sectors (rounded up to whole blocks).
+ *
+ * If the padding at the end of the current BAT (and batmap, when
+ * present) has room for the new entries, they are simply added.
+ * Otherwise room is made first: data blocks that sit too close to the
+ * metadata are moved to the end of the file one at a time, then any
+ * metadata located after the BAT is shifted, and finally the entries
+ * are added via vhd_add_bat_entries().
+ *
+ * Returns 0 on success or the error from the first failing step.
+ */
+static int
+vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs)
+{
+       int i, err;
+       off64_t eob, eom;
+       vhd_context_t *vhd;
+       vhd_block_t first_block;
+       uint64_t blocks, size_needed;
+       uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs;
+       uint64_t map_needed, map_size, map_avail, map_bytes, map_secs;
+
+       vhd         = &journal->vhd;
+
+       size_needed = 0;
+       bat_needed  = 0;
+       map_needed  = 0;
+
+       /* number of vhd blocks to add */
+       blocks      = secs_to_blocks_up(vhd, secs);
+
+       /* size in bytes needed for new bat entries */
+       bat_needed  = blocks * sizeof(uint32_t);
+       /* one batmap bit per block, rounded up generously to whole bytes */
+       map_needed  = (blocks >> 3) + 1;
+
+       /* available bytes in current bat */
+       bat_bytes   = vhd->header.max_bat_size * sizeof(uint32_t);
+       bat_secs    = secs_round_up_no_zero(bat_bytes);
+       bat_size    = vhd_sectors_to_bytes(bat_secs);
+       bat_avail   = bat_size - bat_bytes;
+
+       if (vhd_has_batmap(vhd)) {
+               /* available bytes in current batmap */
+               map_bytes   = (vhd->header.max_bat_size + 7) >> 3;
+               map_secs    = vhd->batmap.header.batmap_size;
+               map_size    = vhd_sectors_to_bytes(map_secs);
+               map_avail   = map_size - map_bytes;
+       } else {
+               map_needed  = 0;
+               map_avail   = 0;
+       }
+
+       /* we have enough space already; just extend the bat */
+       if (bat_needed <= bat_avail && map_needed <= map_avail)
+               goto add_entries;
+
+       /* we need to add new sectors to the bat */
+       if (bat_needed > bat_avail) {
+               bat_needed -= bat_avail;
+               bat_needed  = vhd_bytes_padded(bat_needed);
+       } else
+               bat_needed  = 0;
+
+       /* we need to add new sectors to the batmap */
+       if (map_needed > map_avail) {
+               map_needed -= map_avail;
+               map_needed  = vhd_bytes_padded(map_needed);
+       } else
+               map_needed  = 0;
+
+       /* how many additional bytes do we need? */
+       size_needed = bat_needed + map_needed;
+
+       /* calculate space between end of headers and beginning of data */
+       err = vhd_end_of_headers(vhd, &eom);
+       if (err)
+               return err;
+
+       eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs);
+       vhd_first_data_block(vhd, &first_block);
+
+       /* no blocks allocated; just shift post-bat metadata */
+       if (!first_block.offset)
+               goto shift_metadata;
+
+       /*
+        * not enough space --
+        * move vhd data blocks to the end of the file to make room
+        */
+       do {
+               off64_t new_off, bm_size, gap_size;
+
+               new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd));
+
+               /* data region of segment should begin on page boundary */
+               bm_size = vhd_sectors_to_bytes(vhd->bm_secs);
+               if ((new_off + bm_size) % 4096) {
+                       gap_size = 4096 - ((new_off + bm_size) % 4096);
+
+                       err = vhd_write_zeros(journal, new_off, gap_size);
+                       if (err)
+                               return err;
+
+                       new_off += gap_size;
+               }
+
+               err = vhd_move_block(journal, first_block.block, new_off);
+               if (err)
+                       return err;
+
+               /* the lowest block has moved; look up the new lowest one */
+               vhd_first_data_block(vhd, &first_block);
+
+       } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset));
+
+       TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED);
+
+shift_metadata:
+       /* shift any metadata after the bat to make room for new bat sectors */
+       err = vhd_shift_metadata(journal, eob, bat_needed, map_needed);
+       if (err)
+               return err;
+
+       TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED);
+
+add_entries:
+       return vhd_add_bat_entries(journal, blocks);
+}
+
+/*
+ * Resize a dynamic VHD so that its virtual size becomes @size.  @size is
+ * converted to sectors with a shift of (20 - VHD_SECTOR_SHIFT), i.e. it is
+ * expressed in MB.  The header, the BAT and (when the image has one) the
+ * batmap are loaded before the work is handed to the shrink or grow path.
+ *
+ * Returns 0 on success (including when the size is already as requested),
+ * otherwise the error reported by the helper that failed.
+ */
+static int
+vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size)
+{
+       int rc;
+       vhd_context_t *ctx = &journal->vhd;
+       uint64_t have      = ctx->footer.curr_size >> VHD_SECTOR_SHIFT;
+       uint64_t want      = size << (20 - VHD_SECTOR_SHIFT);
+
+       /* already the requested size: nothing to do */
+       if (have == want)
+               return 0;
+
+       /* load metadata; stop at the first helper that fails */
+       rc = vhd_get_header(ctx);
+       if (!rc)
+               rc = vhd_get_bat(ctx);
+       if (!rc && vhd_has_batmap(ctx))
+               rc = vhd_get_batmap(ctx);
+       if (rc)
+               return rc;
+
+       if (have > want)
+               return vhd_dynamic_shrink(journal, have - want);
+
+       return vhd_dynamic_grow(journal, want - have);
+}
+
+/*
+ * Open @name read-only in strict mode and check that vhd_creator_tapdisk()
+ * accepts it.  Returns 0 if it does, -EINVAL if it does not, or the
+ * vhd_open() error when the image cannot be opened.  A diagnostic is
+ * printed on every failure.
+ */
+static int
+vhd_util_resize_check_creator(const char *name)
+{
+       int rc;
+       vhd_context_t vhd;
+
+       rc = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT);
+       if (rc) {
+               printf("error opening %s: %d\n", name, rc);
+               return rc;
+       }
+
+       rc = (vhd_creator_tapdisk(&vhd) ? 0 : -EINVAL);
+       if (rc)
+               printf("%s not created by xen; resize not supported\n", name);
+
+       vhd_close(&vhd);
+       return rc;
+}
+
+/*
+ * vhd-util "resize" entry point.
+ *
+ * Options: -n <vhd name>, -j <journal name>, -s <new size in MB>, -h.
+ * All of -n, -j and -s are mandatory.  The resize runs under a journal:
+ * on failure the journal is reverted, on success it is committed, and the
+ * journal is removed once that revert/commit step itself succeeded.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, otherwise the error from the
+ * failing step (the resize error takes precedence over a journal error).
+ */
+int
+vhd_util_resize(int argc, char **argv)
+{
+       char *name, *jname, *end;
+       uint64_t size;
+       int c, err, jerr;
+       vhd_journal_t journal;
+       vhd_context_t *vhd;
+
+       err   = -EINVAL;        /* cleared once a valid -s has been seen */
+       size  = 0;
+       name  = NULL;
+       jname = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:j:s:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'j':
+                       jname = optarg;
+                       break;
+               case 's':
+                       /*
+                        * Only accept a plain decimal number.  Without
+                        * these checks "-s abc" would silently request a
+                        * resize to 0 MB and "-s -1" would wrap around to
+                        * a huge value.
+                        */
+                       if (*optarg < '0' || *optarg > '9')
+                               goto usage;
+
+                       errno = 0;
+                       size  = strtoull(optarg, &end, 10);
+                       if (errno || *end != '\0')
+                               goto usage;
+
+                       err = 0;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (err || !name || !jname || argc != optind)
+               goto usage;
+
+       err = vhd_util_resize_check_creator(name);
+       if (err)
+               return err;
+
+       libvhd_set_log_level(1);
+       err = vhd_journal_create(&journal, name, jname);
+       if (err) {
+               printf("creating journal failed: %d\n", err);
+               return err;
+       }
+
+       vhd = &journal.vhd;
+
+       err = vhd_get_footer(vhd);
+       if (err)
+               goto out;
+
+       TEST_FAIL_AT(FAIL_RESIZE_BEGIN);
+
+       if (vhd_type_dynamic(vhd))
+               err = vhd_dynamic_resize(&journal, size);
+       else
+               err = vhd_fixed_resize(&journal, size);
+
+       TEST_FAIL_AT(FAIL_RESIZE_END);
+
+out:
+       /* undo everything on failure, make it permanent on success */
+       if (err) {
+               printf("resize failed: %d\n", err);
+               jerr = vhd_journal_revert(&journal);
+       } else
+               jerr = vhd_journal_commit(&journal);
+
+       /* the journal is only removed when the step above succeeded */
+       if (jerr) {
+               printf("closing journal failed: %d\n", jerr);
+               vhd_journal_close(&journal);
+       } else
+               vhd_journal_remove(&journal);
+
+       return (err ? err : jerr);
+
+usage:
+       printf("options: <-n name> <-j journal> <-s size (in MB)> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-revert.c b/tools/blktap2/vhd/lib/vhd-util-revert.c
new file mode 100644 (file)
index 0000000..dab6e8b
--- /dev/null
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Altering operations:
+ *
+ * 1. Change the parent pointer to another file.
+ * 2. Change the size of the file containing the VHD image. This does NOT 
+ * affect the VHD disk capacity, only the physical size of the file containing 
+ * the VHD. Naturally, it is not possible to set the file size to be less than
+ * what the VHD utilizes.
+ * The operation doesn't actually change the file size, but it writes the 
+ * footer in the right location such that resizing the file (manually, as a 
+ * separate step) will produce the correct results. If the new file size is 
+ * greater than the current file size, the file must first be expanded and then 
+ * altered with this operation. If the new size is smaller than the current 
+ * size, the VHD must first be altered with this operation and then the file 
+ * must be shrunk. Failing to resize the file will result in a corrupted VHD.
+*/
+
+#include <errno.h>
+//#include <fcntl.h>
+#include <stdio.h>
+//#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * vhd-util "revert" entry point: open the journal @jname for the image
+ * @name, revert the journal and then remove it.
+ *
+ * Options: -n <vhd name>, -j <journal name>, -h.  Both -n and -j are
+ * mandatory.  Returns 0 on success, -EINVAL on bad usage, otherwise the
+ * error of the journal operation that failed.
+ */
+int
+vhd_util_revert(int argc, char **argv)
+{
+       int opt, rc;
+       vhd_journal_t journal;
+       char *name  = NULL;
+       char *jname = NULL;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "n:j:h")) != -1) {
+               if (opt == 'n')
+                       name = optarg;
+               else if (opt == 'j')
+                       jname = optarg;
+               else
+                       goto usage;     /* 'h' and anything unknown */
+       }
+
+       if (!name || !jname || argc != optind)
+               goto usage;
+
+       libvhd_set_log_level(1);
+       rc = vhd_journal_open(&journal, name, jname);
+       if (rc) {
+               printf("opening journal failed: %d\n", rc);
+               return rc;
+       }
+
+       rc = vhd_journal_revert(&journal);
+       if (rc) {
+               printf("reverting journal failed: %d\n", rc);
+               goto fail;
+       }
+
+       rc = vhd_journal_remove(&journal);
+       if (rc) {
+               printf("removing journal failed: %d\n", rc);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       /* the journal is open but could not be reverted/removed */
+       vhd_journal_close(&journal);
+       return rc;
+
+usage:
+       printf("options: <-n name> <-j journal> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-scan.c b/tools/blktap2/vhd/lib/vhd-util-scan.c
new file mode 100644 (file)
index 0000000..4ecfb52
--- /dev/null
@@ -0,0 +1,1315 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <glob.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+
+#include "list.h"
+#include "libvhd.h"
+#include "lvm-util.h"
+
+/* scan option bits, tested against the file-scope 'flags' variable */
+#define VHD_SCAN_FAST        0x01 /* open with VHD_OPEN_FAST, parent from header */
+#define VHD_SCAN_PRETTY      0x02 /* collect images and print them as a tree */
+#define VHD_SCAN_VOLUME      0x04 /* NOTE(review): not used in this part of the file */
+#define VHD_SCAN_NOFAIL      0x08 /* only referenced from commented-out code here */
+#define VHD_SCAN_VERBOSE     0x10 /* print full names instead of basenames */
+#define VHD_SCAN_PARENTS     0x20 /* NOTE(review): not used in this part of the file */
+
+/* kinds of scan target, stored in struct target.type */
+#define VHD_TYPE_RAW_FILE    0x01
+#define VHD_TYPE_VHD_FILE    0x02
+#define VHD_TYPE_RAW_VOLUME  0x04
+#define VHD_TYPE_VHD_VOLUME  0x08
+
+/* Return 1 if @type is one of the volume target types, 0 otherwise. */
+static inline int
+target_volume(uint8_t type)
+{
+       switch (type) {
+       case VHD_TYPE_RAW_VOLUME:
+       case VHD_TYPE_VHD_VOLUME:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Return 1 if @type is one of the VHD target types, 0 otherwise. */
+static inline int
+target_vhd(uint8_t type)
+{
+       switch (type) {
+       case VHD_TYPE_VHD_FILE:
+       case VHD_TYPE_VHD_VOLUME:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* One thing to scan: a plain file or an LVM logical volume. */
+struct target {
+       char                 name[VHD_MAX_NAME_LEN];   /* file path or LV name */
+       char                 device[VHD_MAX_NAME_LEN]; /* path opened for volume I/O; same as name for files */
+       uint64_t             size;                     /* st_size for files, lv->size for volumes */
+       uint64_t             start;                    /* offset of the target within device; 0 for files */
+       uint64_t             end;                      /* st_size for files, start + pe_size for volumes */
+       uint8_t              type;                     /* one of VHD_TYPE_* */
+};
+
+/* Growable array of targets plus a read cursor. */
+struct iterator {
+       int                  cur;       /* index of the next target iterator_next() returns */
+       int                  cur_size;  /* number of valid entries in targets */
+       int                  max_size;  /* number of entries targets has room for */
+       struct target       *targets;   /* heap array, released by iterator_free() */
+};
+
+/* Scan result for one target, and a node of the tree printed in pretty mode. */
+struct vhd_image {
+       char                *name;          /* image name as printed */
+       char                *parent;        /* parent name, NULL when there is none */
+       uint64_t             capacity;      /* footer curr_size for VHDs, else same as size */
+       off64_t              size;          /* copied from target->size */
+       uint8_t              hidden;        /* hidden flag; set to 1 for non-VHD targets */
+       int                  error;         /* non-zero if scanning this image failed */
+       char                *message;       /* short description of the failing step */
+
+       struct target       *target;        /* target this image was scanned from */
+
+       struct list_head     sibling;       /* link in the parent's children list */
+       struct list_head     children;      /* images whose parent is this image */
+       struct vhd_image    *parent_image;  /* resolved parent, NULL if not found in scan */
+};
+
+/* Storage for the images collected in pretty mode. */
+struct vhd_scan {
+       int                  cur;         /* number of images stored so far */
+       int                  size;        /* number of slots in images */
+
+       int                  lists_cur;   /* number of chunks in lists */
+       int                  lists_size;  /* number of chunk pointers lists has room for */
+
+       struct vhd_image   **images;      /* one pointer per slot, into the chunks */
+       struct vhd_image   **lists;       /* backing chunks of struct vhd_image */
+};
+
+static int flags;               /* VHD_SCAN_* option bits */
+static struct vg vg;            /* volume group whose LVs iterator_add_volume() searches */
+static struct vhd_scan scan;    /* images collected for pretty printing */
+
+/*
+ * Reset the global 'scan' state and allocate room for @cnt images: one
+ * chunk of @cnt struct vhd_image plus a table of @cnt pointers into it.
+ *
+ * Returns 0 on success or -ENOMEM, in which case 'scan' is left zeroed.
+ */
+static int
+vhd_util_scan_pretty_allocate_list(int cnt)
+{
+       int i;
+
+       memset(&scan, 0, sizeof(scan));
+
+       /*
+        * Always reserve at least one slot: the table grows by doubling
+        * scan.size, which can never get away from zero, and calloc(0)
+        * is allowed to return NULL, which would be reported as -ENOMEM.
+        */
+       if (cnt < 1)
+               cnt = 1;
+
+       scan.lists_cur  = 1;
+       scan.lists_size = 10;
+
+       scan.lists = calloc(scan.lists_size, sizeof(struct vhd_image *));
+       if (!scan.lists)
+               goto fail;
+
+       scan.lists[0] = calloc(cnt, sizeof(struct vhd_image));
+       if (!scan.lists[0])
+               goto fail;
+
+       scan.images = calloc(cnt, sizeof(struct vhd_image *));
+       if (!scan.images)
+               goto fail;
+
+       /* every slot initially points into the first chunk */
+       for (i = 0; i < cnt; i++)
+               scan.images[i] = scan.lists[0] + i;
+
+       scan.cur  = 0;
+       scan.size = cnt;
+
+       return 0;
+
+fail:
+       if (scan.lists) {
+               free(scan.lists[0]);
+               free(scan.lists);
+       }
+
+       free(scan.images);
+       memset(&scan, 0, sizeof(scan));
+       return -ENOMEM;
+}
+
+/*
+ * Release every image chunk, the chunk table and the image pointer table,
+ * then zero the global 'scan' state.
+ */
+static void
+vhd_util_scan_pretty_free_list(void)
+{
+       int idx = 0;
+
+       if (scan.lists != NULL) {
+               while (idx < scan.lists_cur)
+                       free(scan.lists[idx++]);
+
+               free(scan.lists);
+       }
+
+       free(scan.images);
+       memset(&scan, 0, sizeof(scan));
+}
+
+/*
+ * Store a copy of @image in the global 'scan' table unless an image with
+ * the same name is already there.  name and parent are duplicated; the
+ * message pointer is shared with @image.
+ *
+ * Returns 0 if the image was added or was already present, -ENOMEM if
+ * memory could not be allocated.  On failure 'scan' is left exactly as it
+ * was before the call.
+ */
+static int
+vhd_util_scan_pretty_add_image(struct vhd_image *image)
+{
+       int i;
+       struct vhd_image *img;
+
+       for (i = 0; i < scan.cur; i++) {
+               img = scan.images[i];
+               if (!strcmp(img->name, image->name))
+                       return 0;
+       }
+
+       if (scan.cur >= scan.size) {
+               int grow, new_size;
+               struct vhd_image *new, **list;
+
+               if (scan.lists_cur >= scan.lists_size) {
+                       list = realloc(scan.lists, scan.lists_size * 2 *
+                                      sizeof(struct vhd_image *));
+                       if (!list)
+                               return -ENOMEM;
+
+                       scan.lists_size *= 2;
+                       scan.lists       = list;
+               }
+
+               /* double the table; make sure we can grow from empty */
+               grow     = (scan.size > 0 ? scan.size : 1);
+               new_size = scan.size + grow;
+
+               new = calloc(grow, sizeof(struct vhd_image));
+               if (!new)
+                       return -ENOMEM;
+
+               list = realloc(scan.images, new_size *
+                              sizeof(struct vhd_image *));
+               if (!list) {
+                       free(new);
+                       return -ENOMEM;
+               }
+
+               /*
+                * Only commit the new capacity once both allocations have
+                * succeeded; otherwise a later call would skip the growth
+                * step and index past the end of the old images array.
+                */
+               scan.images                  = list;
+               scan.lists[scan.lists_cur++] = new;
+               scan.size                    = new_size;
+
+               for (i = 0; i + scan.cur < scan.size; i++)
+                       scan.images[i + scan.cur] = new + i;
+       }
+
+       img = scan.images[scan.cur];
+       INIT_LIST_HEAD(&img->sibling);
+       INIT_LIST_HEAD(&img->children);
+
+       img->capacity = image->capacity;
+       img->size     = image->size;
+       img->hidden   = image->hidden;
+       img->error    = image->error;
+       img->message  = image->message;
+
+       img->name = strdup(image->name);
+       if (!img->name)
+               goto fail;
+
+       if (image->parent) {
+               img->parent = strdup(image->parent);
+               if (!img->parent)
+                       goto fail;
+       }
+
+       scan.cur++;
+       return 0;
+
+fail:
+       /* scan.cur was not advanced, so the slot is simply reused */
+       free(img->name);
+       free(img->parent);
+       memset(img, 0, sizeof(*img));
+       return -ENOMEM;
+}
+
+/*
+ * qsort()/bsearch() comparator over an array of struct vhd_image
+ * pointers: orders the images by name using strcmp().
+ */
+static int
+vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs)
+{
+       struct vhd_image *const *lp = lhs;
+       struct vhd_image *const *rp = rhs;
+
+       return strcmp((*lp)->name, (*rp)->name);
+}
+
+/*
+ * Print one line describing @image, preceded by @tab columns of padding.
+ *
+ * Unless VHD_SCAN_VERBOSE is set, the image and parent names are reduced
+ * to their basename.  In pretty mode a parent that is named but was not
+ * resolved to an image is flagged with " (not found in scan)".
+ */
+static void
+vhd_util_scan_print_image_indent(struct vhd_image *image, int tab)
+{
+       char *pad, *name, *pmsg, *parent;
+
+       /* "%*s" pads this string to 'tab' columns; empty when tab is 0 */
+       pad    = (tab ? " " : "");
+       name   = image->name;
+       parent = (image->parent ? : "none");
+
+       if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image)
+               pmsg = " (not found in scan)";
+       else
+               pmsg = "";
+
+       if (!(flags & VHD_SCAN_VERBOSE)) {
+               name = basename(image->name);
+               if (image->parent)
+                       parent = basename(image->parent);
+       }
+
+       /* NOTE(review): the error line prints image->name, not the shortened 'name' */
+       if (image->error)
+               printf("%*svhd=%s scan-error=%d error-message='%s'\n",
+                      tab, pad, image->name, image->error, image->message);
+       else
+               printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u "
+                      "parent=%s%s\n", tab, pad, name, image->capacity,
+                      image->size, image->hidden, parent, pmsg);
+}
+
+/*
+ * Print @image and, recursively, its children, indenting three columns
+ * per level.  Children that are not hidden are printed before hidden ones.
+ *
+ * The image's name and parent strings are freed and set to NULL afterwards;
+ * vhd_util_scan_pretty_print_images() treats a NULL name as "already
+ * printed".
+ */
+static void
+vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth)
+{
+       struct vhd_image *img, *tmp;
+
+       vhd_util_scan_print_image_indent(image, depth * 3);
+
+       /* first pass: visible children */
+       list_for_each_entry_safe(img, tmp, &image->children, sibling)
+               if (!img->hidden)
+                       vhd_util_scan_pretty_print_tree(img, depth + 1);
+
+       /* second pass: hidden children */
+       list_for_each_entry_safe(img, tmp, &image->children, sibling)
+               if (img->hidden)
+                       vhd_util_scan_pretty_print_tree(img, depth + 1);
+
+       free(image->name);
+       free(image->parent);
+
+       image->name   = NULL;
+       image->parent = NULL;
+}
+
+/*
+ * Print every collected image as a forest of parent/child trees.
+ *
+ * The images are sorted by name, each image is linked to its parent (found
+ * by binary search on the parent name), and the trees are then printed in
+ * three passes: hidden roots, the remaining roots, and finally anything
+ * that has still not been printed.
+ */
+static void
+vhd_util_scan_pretty_print_images(void)
+{
+       int i;
+       struct vhd_image *image, **parentp, *parent, *keyp, key;
+
+       qsort(scan.images, scan.cur, sizeof(scan.images[0]),
+             vhd_util_scan_pretty_image_compare);
+
+       /* link every image to its parent image, if that was scanned too */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (!image->parent) {
+                       image->parent_image = NULL;
+                       continue;
+               }
+
+               /* the comparator only looks at ->name, so a stub key is enough */
+               memset(&key, 0, sizeof(key));
+               key.name = image->parent;
+               keyp     = &key;
+
+               parentp  = bsearch(&keyp, scan.images, scan.cur,
+                                  sizeof(scan.images[0]),
+                                  vhd_util_scan_pretty_image_compare);
+               if (!parentp) {
+                       image->parent_image = NULL;
+                       continue;
+               }
+
+               parent = *parentp;
+               image->parent_image = parent;
+               list_add_tail(&image->sibling, &parent->children);
+       }
+
+       /* pass 1: trees rooted at a hidden image without a resolved parent */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (image->parent_image || !image->hidden)
+                       continue;
+
+               vhd_util_scan_pretty_print_tree(image, 0);
+       }
+
+       /* pass 2: remaining roots; a NULL name marks an image already printed */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (!image->name || image->parent_image)
+                       continue;
+
+               vhd_util_scan_pretty_print_tree(image, 0);
+       }
+
+       /*
+        * pass 3: whatever was not reached from any root
+        * (presumably images caught in a parent cycle -- TODO confirm)
+        */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (!image->name)
+                       continue;
+
+               vhd_util_scan_pretty_print_tree(image, 0);
+       }
+}
+
+/*
+ * Report @image.  In pretty mode an error-free image is queued for the
+ * tree printout instead of being printed; if queueing fails the failure
+ * is recorded in the image, which is then printed right away.
+ */
+static void
+vhd_util_scan_print_image(struct vhd_image *image)
+{
+       if (!image->error && (flags & VHD_SCAN_PRETTY)) {
+               int rc = vhd_util_scan_pretty_add_image(image);
+
+               if (rc == 0)
+                       return;
+
+               /* image->error is still zero on this path */
+               image->error   = rc;
+               image->message = "allocating memory";
+       }
+
+       vhd_util_scan_print_image_indent(image, 0);
+}
+
+/*
+ * Report a scan failure for @file by printing a placeholder image that
+ * carries @err and a generic message.  Returns @err unchanged.
+ */
+static int
+vhd_util_scan_error(const char *file, int err)
+{
+       struct vhd_image failed = {
+               .name    = (char *)file,
+               .error   = err,
+               .message = "failure scanning target",
+       };
+
+       vhd_util_scan_print_image(&failed);
+
+       /*
+       if (flags & VHD_SCAN_NOFAIL)
+               return 0;
+       */
+
+       return err;
+}
+
+/*
+ * Pick the parent locator to read from the first 8 entries of
+ * vhd->header.loc.  Preference order, as implemented by the loop below:
+ * the first PLAT_CODE_MACX entry, else the last PLAT_CODE_W2RU entry,
+ * else the first entry whose code is not PLAT_CODE_NONE.
+ *
+ * Returns NULL if none of the entries is in use.
+ */
+static vhd_parent_locator_t *
+vhd_util_scan_get_parent_locator(vhd_context_t *vhd)
+{
+       int i;
+       vhd_parent_locator_t *loc;
+
+       loc = NULL;
+
+       for (i = 0; i < 8; i++) {
+               /* MACX wins outright */
+               if (vhd->header.loc[i].code == PLAT_CODE_MACX) {
+                       loc = vhd->header.loc + i;
+                       break;
+               }
+
+               /* W2RU replaces any earlier candidate */
+               if (vhd->header.loc[i].code == PLAT_CODE_W2RU)
+                       loc = vhd->header.loc + i;
+
+               /* fallback: first locator that is in use at all */
+               if (!loc && vhd->header.loc[i].code != PLAT_CODE_NONE)
+                       loc = vhd->header.loc + i;
+       }
+
+       return loc;
+}
+
+/*
+ * Copy @src into @dst, a buffer of VHD_MAX_NAME_LEN bytes.
+ * Returns 0 if the string fit, -ENAMETOOLONG if it had to be truncated.
+ */
+static inline int
+copy_name(char *dst, const char *src)
+{
+       int len = snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+
+       return (len < VHD_MAX_NAME_LEN ? 0 : -ENAMETOOLONG);
+}
+
+/*
+ * LVHD stores realpath(parent) in parent locators, so
+ * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name>
+ *
+ * Undo that mangling on the last path component of @src and store the
+ * resulting <lv-name> in @dst.  @dst is assumed to hold VHD_MAX_NAME_LEN
+ * bytes, like the buffers copy_name() writes to.
+ *
+ * Returns 0 on success.  Returns -EINVAL if the component contains no
+ * group/volume separator, or -ENAMETOOLONG if it does not fit; in both
+ * cases @dst receives a (possibly truncated) copy of @src.
+ */
+static int
+vhd_util_scan_extract_volume_name(char *dst, const char *src)
+{
+       char copy[VHD_MAX_NAME_LEN], *c;
+       const char *name, *s;
+
+       name = strrchr(src, '/');
+       if (!name)
+               name = src;
+
+       /* the converted name is never longer than the input */
+       if (strlen(name) >= VHD_MAX_NAME_LEN) {
+               snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+               return -ENAMETOOLONG;
+       }
+
+       /* convert single dashes to slashes, double dashes to single dashes */
+       for (c = copy, s = name; *s != '\0'; s++, c++) {
+               if (*s == '-') {
+                       if (s[1] != '-')
+                               *c = '/';
+                       else {
+                               s++;
+                               *c = '-';
+                       }
+               } else
+                       *c = *s;
+       }
+
+       *c = '\0';
+
+       /*
+        * A separator was found only if the converted name contains a
+        * slash other than the leading one inherited from @src.
+        */
+       c = strrchr(copy, '/');
+       if (!c || c == copy) {
+               /* unrecognized format */
+               snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+               return -EINVAL;
+       }
+
+       /* fits: copy[] itself is bounded by VHD_MAX_NAME_LEN */
+       strcpy(dst, ++c);
+       return 0;
+}
+
+/*
+ * Determine the parent of a VHD that lives on a volume and store it in
+ * image->parent.
+ *
+ * In fast mode the parent name is first decoded from the header; otherwise,
+ * or if that fails, it is read from a parent locator whose data offset is
+ * shifted by the target's start offset within the device.  If the result
+ * looks like a mangled device-mapper path, it is replaced by the bare
+ * volume name.
+ *
+ * Returns 0 on success, -EINVAL if no locator is in use, or the error from
+ * reading the locator or copying the name.
+ */
+static int
+vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       char name[VHD_MAX_NAME_LEN];
+       vhd_parent_locator_t *loc, copy;
+
+       if (flags & VHD_SCAN_FAST) {
+               err = vhd_header_decode_parent(vhd,
+                                              &vhd->header, &image->parent);
+               if (!err)
+                       goto found;
+       }
+
+       loc = vhd_util_scan_get_parent_locator(vhd);
+       if (!loc)
+               return -EINVAL;
+
+       /* work on a copy so the header's locator is left untouched */
+       copy = *loc;
+       copy.data_offset += image->target->start;
+       err = vhd_parent_locator_read(vhd, &copy, &image->parent);
+       if (err)
+               return err;
+
+found:
+       /*
+        * NOTE(review): copy_name() overwrites the string image->parent
+        * points at; this relies on the extracted name not being longer
+        * than the string it replaces.
+        */
+       err = vhd_util_scan_extract_volume_name(name, image->parent);
+       if (!err)
+               return copy_name(image->parent, name);
+
+       /* unrecognized format: keep the parent string as read */
+       return 0;
+}
+
+/*
+ * Fill in image->parent.  Non-VHD targets have no parent; VHDs on volumes
+ * are handled by vhd_util_scan_get_volume_parent().  For VHD files the
+ * name comes from the header in fast mode and from
+ * vhd_parent_locator_get() otherwise; if that fails, the raw string of
+ * the preferred parent locator is read instead.
+ *
+ * Returns 0 on success, -EINVAL if no locator is in use, or the error
+ * from reading the locator.
+ */
+static int
+vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int rc;
+       vhd_parent_locator_t *loc;
+
+       if (!target_vhd(image->target->type)) {
+               image->parent = NULL;
+               return 0;
+       }
+
+       if (target_volume(image->target->type))
+               return vhd_util_scan_get_volume_parent(vhd, image);
+
+       /*
+        * vhd_parent_locator_get checks for the existence of the
+        * parent file. if this call succeeds, all is well; if not,
+        * we'll try to return whatever string we have before failing
+        * outright.
+        */
+       if (flags & VHD_SCAN_FAST)
+               rc = vhd_header_decode_parent(vhd,
+                                             &vhd->header, &image->parent);
+       else
+               rc = vhd_parent_locator_get(vhd, &image->parent);
+
+       if (!rc)
+               return 0;
+
+       loc = vhd_util_scan_get_parent_locator(vhd);
+       if (loc)
+               return vhd_parent_locator_read(vhd, loc, &image->parent);
+
+       return -EINVAL;
+}
+
+/*
+ * Set image->hidden: the value reported by vhd_hidden() for VHD targets,
+ * 1 for everything else.  Returns 0, or the vhd_hidden() error, in which
+ * case image->hidden is left untouched.
+ */
+static int
+vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int hidden = 1;         /* non-VHD targets count as hidden */
+
+       if (target_vhd(image->target->type)) {
+               int rc;
+
+               hidden = 0;
+               rc     = vhd_hidden(vhd, &hidden);
+               if (rc)
+                       return rc;
+       }
+
+       image->hidden = hidden;
+       return 0;
+}
+
+/*
+ * Set image->size from the target and image->capacity from the VHD footer
+ * (VHD targets) or from that same size (other targets).  Always returns 0.
+ */
+static int
+vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image)
+{
+       struct target *tgt = image->target;
+
+       image->size = tgt->size;
+
+       if (!target_vhd(tgt->type))
+               image->capacity = image->size;
+       else
+               image->capacity = vhd->footer.curr_size;
+
+       return 0;
+}
+
+/*
+ * Open the file behind @image as a VHD, read-only and ignoring the
+ * "disabled" state; VHD_OPEN_FAST is added in fast mode.  Targets that
+ * are not VHDs are not opened at all.
+ *
+ * Returns 0 on success.  On failure the error is recorded in @image and
+ * returned, and vhd->file is set to NULL.
+ */
+static int
+vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int rc;
+       int oflags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED;
+
+       if (!target_vhd(image->target->type))
+               return 0;
+
+       if (flags & VHD_SCAN_FAST)
+               oflags |= VHD_OPEN_FAST;
+
+       rc = vhd_open(vhd, image->name, oflags);
+       if (!rc)
+               return 0;
+
+       vhd->file      = NULL;
+       image->message = "opening file";
+       image->error   = rc;
+       return rc;
+}
+
+/*
+ * Read and validate the VHD footer, and for dynamic VHDs the header, of
+ * an image that starts at target->start on an already opened device.
+ *
+ * Footer and header are fetched with a single read into a buffer aligned
+ * to VHD_SECTOR_SIZE.  If the footer says the header does not directly
+ * follow it, the header is read separately from its stated offset.
+ *
+ * Returns 0 on success; on failure the error and a short message are
+ * stored in @image and the error is returned.
+ */
+static int
+vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       char *buf;
+       size_t size;
+       struct target *target;
+
+       buf    = NULL;
+       target = image->target;
+       size   = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+
+       /* posix_memalign() returns a positive errno value, hence the negation */
+       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               buf            = NULL;
+               image->message = "allocating image";
+               image->error   = -err;
+               goto out;
+       }
+
+       err = vhd_seek(vhd, target->start, SEEK_SET);
+       if (err) {
+               image->message = "seeking to headers";
+               image->error   = err;
+               goto out;
+       }
+
+       err = vhd_read(vhd, buf, size);
+       if (err) {
+               image->message = "reading headers";
+               image->error   = err;
+               goto out;
+       }
+
+       /* the footer copy sits at the very start of the image */
+       memcpy(&vhd->footer, buf, sizeof(vhd_footer_t));
+       vhd_footer_in(&vhd->footer);
+       err = vhd_validate_footer(&vhd->footer);
+       if (err) {
+               image->message = "invalid footer";
+               image->error   = err;
+               goto out;
+       }
+
+       /* lvhd vhds should always be dynamic */
+       if (vhd_type_dynamic(vhd)) {
+               /* header elsewhere: read it from there, relative to the target start */
+               if (vhd->footer.data_offset != sizeof(vhd_footer_t))
+                       err = vhd_read_header_at(vhd, &vhd->header,
+                                                vhd->footer.data_offset +
+                                                target->start);
+               else {
+                       /* header directly follows the footer: already in buf */
+                       memcpy(&vhd->header,
+                              buf + sizeof(vhd_footer_t),
+                              sizeof(vhd_header_t));
+                       vhd_header_in(&vhd->header);
+                       err = vhd_validate_header(&vhd->header);
+               }
+
+               if (err) {
+                       image->message = "reading header";
+                       image->error   = err;
+                       goto out;
+               }
+
+               /* sectors per block, and sectors taken by a block's bitmap */
+               vhd->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+               vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3);
+       }
+
+out:
+       free(buf);
+       return image->error;
+}
+
+/*
+ * Set up @vhd by hand for an image stored on a volume: zero the context,
+ * open the target's device read-only with O_DIRECT, and, for VHD targets,
+ * load the footer and header.
+ *
+ * Returns 0 on success; on failure the error and a short message are
+ * stored in @image and the error is returned.
+ * NOTE(review): when reading the headers fails, vhd->fd and vhd->file stay
+ * allocated -- presumably the caller closes the context; confirm.
+ */
+static int
+vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       struct target *target;
+
+       target = image->target;
+       memset(vhd, 0, sizeof(*vhd));
+       vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST;
+
+       /* refuse extents shorter than 4096 bytes */
+       if (target->end - target->start < 4096) {
+               image->message = "device too small";
+               image->error   = -EINVAL;
+               return image->error;
+       }
+
+       vhd->file = strdup(image->name);
+       if (!vhd->file) {
+               image->message = "allocating device";
+               image->error   = -ENOMEM;
+               return image->error;
+       }
+
+       vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (vhd->fd == -1) {
+               free(vhd->file);
+               vhd->file = NULL;
+
+               image->message = "opening device";
+               image->error   = -errno;
+               return image->error;
+       }
+
+       if (target_vhd(target->type))
+               return vhd_util_scan_read_volume_headers(vhd, image);
+
+       return 0;
+}
+
+/*
+ * Choose the name to report for @image and open its target.
+ *
+ * The target name is used as is, except for file targets in pretty mode,
+ * where it is replaced by a realpath() result allocated on the heap.
+ * Volumes are opened with vhd_util_scan_open_volume(), files with
+ * vhd_util_scan_open_file().
+ *
+ * Returns 0 on success; on failure the error is also stored in @image.
+ */
+static int
+vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image)
+{
+       struct target *tgt = image->target;
+       int on_volume      = target_volume(tgt->type);
+
+       image->name = tgt->name;
+
+       if (!on_volume && (flags & VHD_SCAN_PRETTY)) {
+               char *resolved = realpath(tgt->name, NULL);
+
+               if (!resolved) {
+                       image->message = "resolving name";
+                       image->error   = -errno;
+                       return image->error;
+               }
+
+               image->name = resolved;
+       }
+
+       if (on_volume)
+               return vhd_util_scan_open_volume(vhd, image);
+
+       return vhd_util_scan_open_file(vhd, image);
+}
+
+/*
+ * Describe the regular file @file as a scan target of the given @type:
+ * name and device are both the file path, and the extent runs from 0 to
+ * the file size reported by stat().
+ *
+ * Returns 0, -errno if stat() fails, or -ENAMETOOLONG if the path does
+ * not fit.
+ */
+static int
+vhd_util_scan_init_file_target(struct target *target,
+                              const char *file, uint8_t type)
+{
+       int rc;
+       struct stat st;
+
+       if (stat(file, &st) == -1)
+               return -errno;
+
+       rc = copy_name(target->name, file);
+       if (!rc)
+               rc = copy_name(target->device, file);
+       if (rc)
+               return rc;
+
+       target->type  = type;
+       target->start = 0;
+       target->size  = st.st_size;
+       target->end   = st.st_size;
+
+       return 0;
+}
+
+/*
+ * Describe the logical volume @lv as a scan target of the given @type.
+ * Only volumes whose first segment is linear are supported; the extent is
+ * taken from that segment.
+ *
+ * Returns 0, -ENOSYS for other segment types, or -ENAMETOOLONG if a name
+ * does not fit.
+ */
+static int
+vhd_util_scan_init_volume_target(struct target *target,
+                                struct lv *lv, uint8_t type)
+{
+       int rc;
+
+       if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR)
+               return -ENOSYS;
+
+       rc = copy_name(target->name, lv->name);
+       if (!rc)
+               rc = copy_name(target->device, lv->first_segment.device);
+       if (rc)
+               return rc;
+
+       target->type  = type;
+       target->size  = lv->size;
+       target->start = lv->first_segment.pe_start;
+       target->end   = target->start + lv->first_segment.pe_size;
+
+       return 0;
+}
+
+/*
+ * Initialise @itr with a private copy of the @cnt entries of @targets.
+ * Returns 0, or -ENOMEM, in which case @itr is left zeroed.
+ */
+static int
+iterator_init(struct iterator *itr, int cnt, struct target *targets)
+{
+       size_t bytes = sizeof(struct target) * cnt;
+
+       memset(itr, 0, sizeof(*itr));
+
+       itr->targets = malloc(bytes);
+       if (itr->targets == NULL)
+               return -ENOMEM;
+
+       memcpy(itr->targets, targets, bytes);
+
+       /* the array starts out full; the read cursor is at the beginning */
+       itr->cur      = 0;
+       itr->cur_size = cnt;
+       itr->max_size = cnt;
+
+       return 0;
+}
+
+/*
+ * Return the next target and advance the cursor, or NULL once every
+ * stored target has been handed out.
+ */
+static struct target *
+iterator_next(struct iterator *itr)
+{
+       struct target *next = NULL;
+
+       if (itr->cur != itr->cur_size)
+               next = &itr->targets[itr->cur++];
+
+       return next;
+}
+
+/*
+ * Fill @target with the file @parent unless a target with the same
+ * basename is already stored in @itr.
+ *
+ * Returns -EEXIST for such a duplicate, otherwise the result of
+ * vhd_util_scan_init_file_target().
+ */
+static int
+iterator_add_file(struct iterator *itr,
+                 struct target *target, const char *parent, uint8_t type)
+{
+       int i;
+       struct target *t;
+       char *lname, *rname;
+
+       /* duplicates are detected by basename only, not by full path */
+       for (i = 0; i < itr->cur_size; i++) {
+               t = itr->targets + i;
+               lname = basename((char *)t->name);
+               rname = basename((char *)parent);
+
+               if (!strcmp(lname, rname))
+                       return -EEXIST;
+       }
+
+       return vhd_util_scan_init_file_target(target, parent, type);
+}
+
+/*
+ * Initialise @target from the logical volume in the scanned volume group
+ * (the file-scope 'vg') whose name matches @parent, unless a target of
+ * that name is already stored in @itr.
+ *
+ * Returns:
+ *   -EEXIST  if @parent is already in the iterator,
+ *   -ENOENT  if no logical volume matches,
+ *   the fnmatch() error code if matching itself fails,
+ *   otherwise the result of vhd_util_scan_init_volume_target().
+ */
+static int
+iterator_add_volume(struct iterator *itr,
+                   struct target *target, const char *parent, uint8_t type)
+{
+       int i, err;
+       struct lv *lv;
+
+       lv = NULL;
+
+       for (i = 0; i < itr->cur_size; i++)
+               if (!strcmp(parent, itr->targets[i].name))
+                       return -EEXIST;
+
+       for (i = 0; i < vg.lv_cnt; i++) {
+               err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME);
+               if (!err) {
+                       lv = vg.lvs + i;
+                       break;
+               }
+
+               /*
+                * fnmatch() returns FNM_NOMATCH for a plain mismatch and
+                * some other non-zero value on error.  FNM_PATHNAME is a
+                * flag, not a return code, so it must not be used to
+                * classify the result.
+                */
+               if (err != FNM_NOMATCH)
+                       return err;
+       }
+
+       if (!lv)
+               return -ENOENT;
+
+       return vhd_util_scan_init_volume_target(target, lv, type);
+}
+
+/*
+ * Append a target for @parent to @itr, doubling the backing array first
+ * when it is full.
+ *
+ * The new entry is built by iterator_add_volume() when
+ * target_volume(@type) is true and by iterator_add_file() otherwise.
+ * A duplicate (-EEXIST) is not treated as a failure: nothing is added
+ * and 0 is returned.  Any other error is returned as is, with the
+ * unused slot zeroed.
+ */
+static int
+iterator_add(struct iterator *itr, const char *parent, uint8_t type)
+{
+       int rc;
+       struct target *slot;
+
+       if (itr->cur_size == itr->max_size) {
+               struct target *grown;
+
+               grown = realloc(itr->targets,
+                               sizeof(struct target) *
+                               itr->max_size * 2);
+               if (!grown)
+                       return -ENOMEM;
+
+               itr->targets   = grown;
+               itr->max_size *= 2;
+       }
+
+       slot = &itr->targets[itr->cur_size];
+
+       if (target_volume(type))
+               rc = iterator_add_volume(itr, slot, parent, type);
+       else
+               rc = iterator_add_file(itr, slot, parent, type);
+
+       if (!rc) {
+               itr->cur_size++;
+               return 0;
+       }
+
+       /* nothing was added: scrub the slot for the next attempt */
+       memset(slot, 0, sizeof(*slot));
+
+       if (rc == -EEXIST)
+               return 0;
+
+       return rc;
+}
+
+/*
+ * Release the iterator's target array and reset @itr to all zeroes.
+ */
+static void
+iterator_free(struct iterator *itr)
+{
+       struct target *list = itr->targets;
+
+       memset(itr, 0, sizeof(*itr));
+       free(list);
+}
+
+/*
+ * Queue the parent of @image on @itr so that it gets scanned as well.
+ *
+ * The parent's type is raw or VHD according to vhd_parent_raw(@vhd),
+ * and volume or file according to the type of the image's own target.
+ * A failure to queue the parent is reported through
+ * vhd_util_scan_error() and not propagated.
+ */
+static void
+vhd_util_scan_add_parent(struct iterator *itr,
+                        vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       uint8_t type;
+
+       if (vhd_parent_raw(vhd)) {
+               if (target_volume(image->target->type))
+                       type = VHD_TYPE_RAW_VOLUME;
+               else
+                       type = VHD_TYPE_RAW_FILE;
+       } else {
+               if (target_volume(image->target->type))
+                       type = VHD_TYPE_VHD_VOLUME;
+               else
+                       type = VHD_TYPE_VHD_FILE;
+       }
+
+       err = iterator_add(itr, image->parent, type);
+       if (!err)
+               return;
+
+       vhd_util_scan_error(image->parent, err);
+}
+
+/*
+ * Scan each of the @cnt entries in @targets and print what was found.
+ *
+ * The targets are copied into a private iterator so that, when
+ * VHD_SCAN_PARENTS is set, parents discovered along the way can be
+ * appended and scanned by the same loop.
+ *
+ * Return value:
+ *   - with VHD_SCAN_NOFAIL: 0, or -EAGAIN if any target failed;
+ *   - otherwise: the error of the first failing target (the loop stops
+ *     there), or 0.
+ *   - the iterator_init() error if the iterator cannot be set up.
+ */
+static int
+vhd_util_scan_targets(int cnt, struct target *targets)
+{
+       int ret, err;
+       vhd_context_t vhd;
+       struct iterator itr;
+       struct target *target;
+       struct vhd_image image;
+
+       ret = 0;
+       err = 0;
+
+       err = iterator_init(&itr, cnt, targets);
+       if (err)
+               return err;
+
+       while ((target = iterator_next(&itr))) {
+               /* start every target from a clean context and record */
+               memset(&vhd, 0, sizeof(vhd));
+               memset(&image, 0, sizeof(image));
+
+               image.target = target;
+
+               /* each failing step jumps to 'end' so the image is still
+                * printed and cleaned up */
+               err = vhd_util_scan_open(&vhd, &image);
+               if (err) {
+                       ret = -EAGAIN;
+                       goto end;
+               }
+
+               err = vhd_util_scan_get_size(&vhd, &image);
+               if (err) {
+                       ret           = -EAGAIN;
+                       image.message = "getting physical size";
+                       image.error   = err;
+                       goto end;
+               }
+
+               err = vhd_util_scan_get_hidden(&vhd, &image);
+               if (err) {
+                       ret           = -EAGAIN;
+                       image.message = "checking 'hidden' field";
+                       image.error   = err;
+                       goto end;
+               }
+
+               /* only differencing disks are asked for a parent */
+               if (vhd.footer.type == HD_TYPE_DIFF) {
+                       err = vhd_util_scan_get_parent(&vhd, &image);
+                       if (err) {
+                               ret           = -EAGAIN;
+                               image.message = "getting parent";
+                               image.error   = err;
+                               goto end;
+                       }
+               }
+
+       end:
+               vhd_util_scan_print_image(&image);
+
+               /* queue the parent, if one was recorded, for a later pass */
+               if (flags & VHD_SCAN_PARENTS && image.parent)
+                       vhd_util_scan_add_parent(&itr, &vhd, &image);
+
+               if (vhd.file)
+                       vhd_close(&vhd);
+               /* image.name may alias target->name, which is not ours to
+                * free */
+               if (image.name != target->name)
+                       free(image.name);
+               free(image.parent);
+
+               /* stop at the first failure unless told to carry on */
+               if (err && !(flags & VHD_SCAN_NOFAIL))
+                       break;
+       }
+
+       iterator_free(&itr);
+
+       if (flags & VHD_SCAN_NOFAIL)
+               return ret;
+
+       return err;
+}
+
+/*
+ * Scan @targets with the pretty-print list in place: allocate the list,
+ * run the scan, print the collected images, then free the list.
+ *
+ * Returns -ENOMEM if the list cannot be allocated.  Otherwise returns 0
+ * when VHD_SCAN_NOFAIL is set, or the scan result when it is not.
+ */
+static int
+vhd_util_scan_targets_pretty(int cnt, struct target *targets)
+{
+       int rc;
+
+       if (vhd_util_scan_pretty_allocate_list(cnt)) {
+               printf("scan failed: no memory\n");
+               return -ENOMEM;
+       }
+
+       rc = vhd_util_scan_targets(cnt, targets);
+
+       vhd_util_scan_pretty_print_images();
+       vhd_util_scan_pretty_free_list();
+
+       if (flags & VHD_SCAN_NOFAIL)
+               return 0;
+
+       return rc;
+}
+
+/*
+ * Build the list of file targets to scan.
+ *
+ * The list holds every path matched by the glob pattern @filter (if
+ * given), followed by the @cnt explicit @names.  On success *_targets
+ * points to a calloc()ed array the caller must free and *_total holds
+ * its length.
+ *
+ * A target that cannot be initialised is reported through
+ * vhd_util_scan_error().  With VHD_SCAN_NOFAIL set its slot is left
+ * zeroed and processing continues; otherwise the error is returned.
+ *
+ * Returns 0 on success or a negative errno value; on failure *_targets
+ * is NULL and *_total is 0.
+ */
+static int
+vhd_util_scan_find_file_targets(int cnt, char **names,
+                               const char *filter,
+                               struct target **_targets, int *_total)
+{
+       glob_t g;
+       struct target *targets;
+       int i, globs, err, total;
+
+       total     = cnt;
+       globs     = 0;
+       targets   = NULL;
+       *_total   = 0;
+       *_targets = NULL;
+
+       memset(&g, 0, sizeof(g));
+
+       if (filter) {
+               int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0);
+
+               errno = 0;
+               err   = glob(filter, gflags, vhd_util_scan_error, &g);
+
+               switch (err) {
+               case GLOB_NOSPACE:
+                       err = -ENOMEM;
+                       break;
+               case GLOB_ABORTED:
+                       err = -EIO;
+                       break;
+               case GLOB_NOMATCH:
+                       /* no match is only an error if errno says so */
+                       err = -errno;
+                       break;
+               }
+
+               if (err) {
+                       vhd_util_scan_error(filter, err);
+                       /* glob() may have allocated a partial result even
+                        * though it failed: release it on the way out */
+                       goto out;
+               }
+
+               globs  = g.gl_pathc;
+               total += globs;
+       }
+
+       targets = calloc(total, sizeof(struct target));
+       if (!targets) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* globbed paths first ... */
+       for (i = 0; i < globs; i++) {
+               err = vhd_util_scan_init_file_target(targets + i,
+                                                    g.gl_pathv[i],
+                                                    VHD_TYPE_VHD_FILE);
+               if (err) {
+                       vhd_util_scan_error(g.gl_pathv[i], err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       /* ... then the names given explicitly */
+       for (i = 0; i + globs < total; i++) {
+               err = vhd_util_scan_init_file_target(targets + i + globs,
+                                                    names[i],
+                                                    VHD_TYPE_VHD_FILE);
+               if (err) {
+                       vhd_util_scan_error(names[i], err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       err       = 0;
+       *_total   = total;
+       *_targets = targets;
+
+out:
+       if (err)
+               free(targets);
+       if (filter)
+               globfree(&g);
+
+       return err;
+}
+
+/*
+ * Exchange the entries at indices @dst and @src of @lvs.
+ * Does nothing when both indices are the same.
+ */
+static inline void
+swap_volume(struct lv *lvs, int dst, int src)
+{
+       struct lv saved;
+
+       if (dst == src)
+               return;
+
+       saved    = lvs[dst];
+       lvs[dst] = lvs[src];
+       lvs[src] = saved;
+}
+
+/*
+ * Move every volume in @lvs (of length @cnt) whose name matches the
+ * fnmatch() pattern @filter to the front of the array, and store the
+ * number of matches in *_matches.
+ *
+ * Without a filter nothing is moved and *_matches is 0.  An fnmatch()
+ * error is reported through vhd_util_scan_error(); it is returned
+ * immediately unless VHD_SCAN_NOFAIL is set, in which case the volume
+ * is skipped.  *_matches stays 0 on an error return.
+ */
+static int
+vhd_util_scan_sort_volumes(struct lv *lvs, int cnt,
+                          const char *filter, int *_matches)
+{
+       int idx, rc, found;
+
+       found     = 0;
+       *_matches = 0;
+
+       if (!filter)
+               return 0;
+
+       for (idx = 0; idx < cnt; idx++) {
+               rc = fnmatch(filter, lvs[idx].name, FNM_PATHNAME);
+
+               if (rc == 0) {
+                       /* matches are packed at the front, in order */
+                       swap_volume(lvs, found++, idx);
+                       continue;
+               }
+
+               if (rc == FNM_NOMATCH)
+                       continue;
+
+               vhd_util_scan_error(lvs[idx].name, rc);
+               if (!(flags & VHD_SCAN_NOFAIL))
+                       return rc;
+       }
+
+       *_matches = found;
+       return 0;
+}
+
+/*
+ * Build the list of logical-volume targets to scan.
+ *
+ * The volume group @volume is scanned into the file-scope 'vg'.  Its
+ * volumes are then reordered so that those matching @filter come first,
+ * followed by those matching each of the @cnt patterns in @names; these
+ * leading volumes become the targets.
+ *
+ * On success *_targets points to a calloc()ed array the caller must
+ * free, *_total holds its length, and 'vg' stays populated for the
+ * caller to release with lvm_free_vg().
+ *
+ * On failure a non-zero error is returned, *_targets is NULL, *_total
+ * is 0, and the scanned volume group has already been released here.
+ */
+static int
+vhd_util_scan_find_volume_targets(int cnt, char **names,
+                                 const char *volume, const char *filter,
+                                 struct target **_targets, int *_total)
+{
+       struct target *targets;
+       int i, err, total, matches;
+
+       *_total   = 0;
+       *_targets = NULL;
+       targets   = NULL;
+
+       err = lvm_scan_vg(volume, &vg);
+       if (err)
+               return err;
+
+       err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt,
+                                        filter, &matches);
+       if (err)
+               goto out;
+
+       total = matches;
+       for (i = 0; i < cnt; i++) {
+               /* only look at the volumes not selected so far */
+               err = vhd_util_scan_sort_volumes(vg.lvs + total,
+                                                vg.lv_cnt - total,
+                                                names[i], &matches);
+               if (err)
+                       goto out;
+
+               total += matches;
+       }
+
+       targets = calloc(total, sizeof(struct target));
+       if (!targets) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < total; i++) {
+               err = vhd_util_scan_init_volume_target(targets + i,
+                                                      vg.lvs + i,
+                                                      VHD_TYPE_VHD_VOLUME);
+               if (err) {
+                       vhd_util_scan_error(vg.lvs[i].name, err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       err       = 0;
+       *_total   = total;
+       *_targets = targets;
+
+out:
+       if (err) {
+               free(targets);
+               /* the caller bails out on error without touching 'vg',
+                * so the scan result has to be dropped here */
+               lvm_free_vg(&vg);
+       }
+       return err;
+}
+
+/*
+ * Collect the targets to scan: logical volumes of @volume when
+ * VHD_SCAN_VOLUME is set, plain files otherwise.  The result and return
+ * value are those of the helper that was dispatched to.
+ */
+static int
+vhd_util_scan_find_targets(int cnt, char **names,
+                          const char *volume, const char *filter,
+                          struct target **targets, int *total)
+{
+       int rc;
+
+       if (!(flags & VHD_SCAN_VOLUME))
+               rc = vhd_util_scan_find_file_targets(cnt, names,
+                                                    filter, targets, total);
+       else
+               rc = vhd_util_scan_find_volume_targets(cnt, names,
+                                                      volume, filter,
+                                                      targets, total);
+
+       return rc;
+}
+
+/*
+ * Entry point of the 'scan' command.
+ *
+ * Options:
+ *   -m FILTER  pattern selecting the files or volumes to scan
+ *   -f         fast (ignored when -p is given)
+ *   -c         continue on failure; the command then always returns 0
+ *   -l VOLUME  scan the logical volumes of this LVM volume group
+ *   -p         pretty print
+ *   -a         also scan parents
+ *   -v         verbose
+ *   -h         print usage
+ *
+ * Remaining arguments name the targets to scan; a filter or at least
+ * one name is required.
+ *
+ * Returns 0 on success (and always when -c is given and the targets
+ * could be collected), -EINVAL on a usage error, or the error of the
+ * step that failed.
+ */
+int
+vhd_util_scan(int argc, char **argv)
+{
+       int c, err, cnt;
+       char *filter, *volume;
+       struct target *targets;
+
+       cnt     = 0;
+       err     = 0;
+       flags   = 0;
+       filter  = NULL;
+       volume  = NULL;
+       targets = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "m:fcl:pavh")) != -1) {
+               switch (c) {
+               case 'm':
+                       filter = optarg;
+                       break;
+               case 'f':
+                       flags |= VHD_SCAN_FAST;
+                       break;
+               case 'c':
+                       flags |= VHD_SCAN_NOFAIL;
+                       break;
+               case 'l':
+                       volume = optarg;
+                       flags |= VHD_SCAN_VOLUME;
+                       break;
+               case 'p':
+                       flags |= VHD_SCAN_PRETTY;
+                       break;
+               case 'a':
+                       flags |= VHD_SCAN_PARENTS;
+                       break;
+               case 'v':
+                       flags |= VHD_SCAN_VERBOSE;
+                       break;
+               case 'h':
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!filter && argc - optind == 0) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       /* pretty printing takes precedence over the fast flag */
+       if (flags & VHD_SCAN_PRETTY)
+               flags &= ~VHD_SCAN_FAST;
+
+       err = vhd_util_scan_find_targets(argc - optind, argv + optind,
+                                        volume, filter, &targets, &cnt);
+       if (err) {
+               printf("scan failed: %d\n", err);
+               return err;
+       }
+
+       /*
+        * An empty result is not an error, but the (possibly zero-length)
+        * target array and the scanned volume group still have to be
+        * released, so fall through to the common cleanup.
+        */
+       if (cnt) {
+               if (flags & VHD_SCAN_PRETTY)
+                       err = vhd_util_scan_targets_pretty(cnt, targets);
+               else
+                       err = vhd_util_scan_targets(cnt, targets);
+       }
+
+       free(targets);
+       lvm_free_vg(&vg);
+
+       return ((flags & VHD_SCAN_NOFAIL) ? 0 : err);
+
+usage:
+       printf("usage: [OPTIONS] FILES\n"
+              "options: [-m match filter] [-f fast] [-c continue on failure] "
+              "[-l LVM volume] [-p pretty print] [-a scan parents] "
+              "[-v verbose] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-set-field.c b/tools/blktap2/vhd/lib/vhd-util-set-field.c
new file mode 100644 (file)
index 0000000..ac18573
--- /dev/null
@@ -0,0 +1,106 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the 'set' command: write a new value into a field of
+ * a VHD's footer.
+ *
+ * Options (all but -h are required):
+ *   -n NAME   the VHD to modify
+ *   -f FIELD  the field to set; only "hidden" is accepted
+ *   -v VALUE  decimal value in the range 0..255
+ *   -h        print usage
+ *
+ * Returns 0 on success, -EINVAL on any usage error, or the error from
+ * opening the VHD or writing its footer.
+ */
+int
+vhd_util_set_field(int argc, char **argv)
+{
+       long value;
+       int err, c;
+       vhd_context_t vhd;
+       char *name, *field, *end;
+
+       /* err stays -EINVAL until a valid -v has been seen */
+       err   = -EINVAL;
+       value = 0;
+       name  = NULL;
+       field = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:f:v:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'f':
+                       field = optarg;
+                       break;
+               case 'v':
+                       /* reject input strtol() cannot fully convert
+                        * instead of silently writing 0 */
+                       errno = 0;
+                       value = strtol(optarg, &end, 10);
+                       if (errno || end == optarg || *end != '\0') {
+                               printf("invalid value %s\n", optarg);
+                               goto usage;
+                       }
+                       err = 0;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || !field || optind != argc || err)
+               goto usage;
+
+       if (strnlen(field, 25) >= 25) {
+               printf("invalid field\n");
+               goto usage;
+       }
+
+       if (strcmp(field, "hidden")) {
+               printf("invalid field %s\n", field);
+               goto usage;
+       }
+
+       if (value < 0 || value > 255) {
+               printf("invalid value %ld\n", value);
+               goto usage;
+       }
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       vhd.footer.hidden = (char)value;
+
+       err = vhd_write_footer(&vhd, &vhd.footer);
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> <-f field> <-v value> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap2/vhd/lib/vhd-util-snapshot.c b/tools/blktap2/vhd/lib/vhd-util-snapshot.c
new file mode 100644 (file)
index 0000000..75960f9
--- /dev/null
@@ -0,0 +1,216 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Find the image a new snapshot of @name should use as its parent.
+ *
+ * Starting at @name, walk up the chain for as long as the current image
+ * is a differencing disk whose BAT has no allocated blocks: such an
+ * image adds nothing, so its own parent is used instead.  The walk
+ * stops at the first image that is not a differencing disk or has at
+ * least one allocated block, or when the parent is raw.
+ *
+ * On success *result receives a malloc()ed path the caller must free,
+ * and *parent_raw is set to 1 if that path names a raw parent (0
+ * otherwise).  On failure an error code is returned and *result is
+ * NULL.
+ */
+static int
+vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw)
+{
+       int i, err;
+       char *target;
+       vhd_context_t vhd;
+
+       *parent_raw = 0;
+       *result     = NULL;
+
+       target = strdup(name);
+       if (!target)
+               return -ENOMEM;
+
+       for (;;) {
+               err = vhd_open(&vhd, target, VHD_OPEN_RDONLY);
+               if (err) {
+                       /* nothing is open yet, so only the path needs
+                        * releasing */
+                       free(target);
+                       return err;
+               }
+
+               if (vhd.footer.type != HD_TYPE_DIFF)
+                       goto out;
+
+               err = vhd_get_bat(&vhd);
+               if (err)
+                       goto out;
+
+               for (i = 0; i < vhd.bat.entries; i++)
+                       if (vhd.bat.bat[i] != DD_BLK_UNUSED)
+                               goto out;
+
+               /* empty differencing disk: move on to its parent.  Clear
+                * the pointer so a failed lookup cannot lead to the old
+                * path being freed a second time at 'out'. */
+               free(target);
+               target = NULL;
+
+               err = vhd_parent_locator_get(&vhd, &target);
+               if (err)
+                       goto out;
+
+               if (vhd_parent_raw(&vhd)) {
+                       *parent_raw = 1;
+                       goto out;
+               }
+
+               vhd_close(&vhd);
+       }
+
+out:
+       vhd_close(&vhd);
+       if (err)
+               free(target);
+       else
+               *result = target;
+
+       return err;
+}
+
+/*
+ * Store the chain depth of the VHD @name in *depth, as reported by
+ * vhd_chain_depth().  Returns 0 on success, or the error from opening
+ * the image or from computing the depth.
+ */
+static int
+vhd_util_check_depth(const char *name, int *depth)
+{
+       int rc;
+       vhd_context_t ctx;
+
+       rc = vhd_open(&ctx, name, VHD_OPEN_RDONLY);
+       if (!rc) {
+               rc = vhd_chain_depth(&ctx, depth);
+               vhd_close(&ctx);
+       }
+
+       return rc;
+}
+
+/*
+ * Entry point of the 'snapshot' command.
+ *
+ * Options:
+ *   -n NAME    the snapshot to create (required)
+ *   -p PARENT  the image to snapshot (required)
+ *   -l LIMIT   refuse to create a chain deeper than LIMIT
+ *   -m         the parent is a raw image
+ *   -h         print usage
+ *
+ * Unless -m is given, the backing image is chosen by
+ * vhd_util_find_snapshot_target(); when that differs from the supplied
+ * parent, the snapshot takes its size from the supplied parent.
+ *
+ * Returns 0 on success, -EINVAL on a usage error, -ENOSPC when the
+ * depth limit would be exceeded, or the error of the failing step.
+ */
+int
+vhd_util_snapshot(int argc, char **argv)
+{
+       vhd_context_t parent;
+       vhd_flag_creat_t flags = 0;
+       uint64_t size = 0;
+       char *name = NULL, *pname = NULL, *ppath = NULL, *backing = NULL;
+       int c, err, parent_is_raw, max_depth = 0;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       optind = 0;
+       for (;;) {
+               c = getopt(argc, argv, "n:p:l:mh");
+               if (c == -1)
+                       break;
+
+               if (c == 'n') {
+                       name = optarg;
+               } else if (c == 'p') {
+                       pname = optarg;
+               } else if (c == 'l') {
+                       max_depth = strtol(optarg, NULL, 10);
+               } else if (c == 'm') {
+                       vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+               } else if (c == 'h') {
+                       err = 0;
+                       goto usage;
+               } else {
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!name || !pname || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       ppath = realpath(pname, NULL);
+       if (!ppath)
+               return -errno;
+
+       if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+               /* a raw parent is used directly as the backing image */
+               backing = strdup(ppath);
+               if (!backing) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+       } else {
+               err = vhd_util_find_snapshot_target(ppath, &backing,
+                                                   &parent_is_raw);
+               if (err) {
+                       backing = NULL;
+                       goto out;
+               }
+
+               /*
+                * the chain may hold images of different sizes; the
+                * snapshot must have the size of the parent that was
+                * asked for, not of the image we ended up backing onto
+                */
+               if (strcmp(ppath, backing) != 0) {
+                       err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
+                       if (err)
+                               goto out;
+                       size = parent.footer.curr_size;
+                       vhd_close(&parent);
+               }
+
+               if (parent_is_raw)
+                       vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+       }
+
+       /* the depth limit only applies on top of a VHD parent */
+       if (max_depth && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+               int depth;
+
+               err = vhd_util_check_depth(backing, &depth);
+               if (err) {
+                       printf("error checking snapshot depth: %d\n", err);
+                       goto out;
+               }
+
+               if (depth + 1 > max_depth) {
+                       err = -ENOSPC;
+                       printf("snapshot depth exceeded: "
+                              "current depth: %d, limit: %d\n",
+                              depth, max_depth);
+                       goto out;
+               }
+       }
+
+       err = vhd_snapshot(name, size, backing, flags);
+
+out:
+       free(ppath);
+       free(backing);
+
+       return err;
+
+usage:
+       printf("options: <-n name> <-p parent name> [-l snapshot depth limit]"
+              " [-m parent_is_raw] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap2/vhd/vhd-update.c b/tools/blktap2/vhd/vhd-update.c
new file mode 100644 (file)
index 0000000..fbc23cc
--- /dev/null
@@ -0,0 +1,261 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Before updating a VHD file, we create a journal consisting of:
+ *   - all data at the beginning of the file, up to and including the BAT
+ *   - each allocated bitmap (existing at the same offset in the journal as
+ *                            its corresponding bitmap in the original file)
+ * Updates are performed in place by writing appropriately 
+ * transformed versions of journaled bitmaps to the original file.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <endian.h>
+#include <byteswap.h>
+
+#include "atomicio.h"
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Print the command synopsis and terminate the process with exit
+ * status EINVAL.  Does not return.
+ */
+static void
+usage(void)
+{
+       fputs("usage: vhd-update <-n name> [-j existing journal] [-h]\n",
+             stdout);
+       exit(EINVAL);
+}
+
+/*
+ * record the new bitmap ordering by stamping the footer with creator
+ * version 1.1 and writing it back; returns the result of the write
+ */
+static inline int
+update_creator_version(vhd_journal_t *journal)
+{
+       vhd_context_t *vhd = &journal->vhd;
+
+       vhd->footer.crtr_ver = VHD_VERSION(1, 1);
+
+       return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Add every BAT entry of the journalled VHD to the journal as a
+ * VHD_JOURNAL_METADATA block.  Stops at, and returns, the first error;
+ * returns 0 when all entries were added.
+ */
+static int
+journal_bitmaps(vhd_journal_t *journal)
+{
+       int blk, rc;
+
+       rc = 0;
+       for (blk = 0; !rc && blk < journal->vhd.bat.entries; blk++)
+               rc = vhd_journal_add_block(journal, blk,
+                                          VHD_JOURNAL_METADATA);
+
+       return rc;
+}
+
+/*
+ * old-format bitmap lookup: the bitmap is an array of unsigned longs
+ * and bit 0 of a word is its least significant bit.
+ * returns 1 if bit @nr is set in @addr, 0 otherwise
+ */
+static inline int
+old_test_bit(int nr, volatile void * addr)
+{
+       const size_t word_bits = sizeof(unsigned long) * 8;
+       unsigned long word;
+
+       word = ((unsigned long*)addr)[nr / word_bits];
+
+       return (word >> (nr % word_bits)) & 1;
+}
+
+/*
+ * new-format bitmap update: the bitmap is an array of bytes and bit 0
+ * of a byte is its most significant bit (BIT_MASK).
+ * sets bit @nr in @addr
+ */
+#define BIT_MASK 0x80
+static inline void
+new_set_bit (int nr, volatile char *addr)
+{
+       const int byte  = nr >> 3;
+       const int shift = nr & 7;
+
+       addr[byte] |= (BIT_MASK >> shift);
+}
+
+/*
+ * Translate the old-format bitmap @in into the new format in @out.
+ * Both buffers are @bytes long; @out is cleared first and then every
+ * bit that old_test_bit() finds set in @in is set through
+ * new_set_bit().
+ */
+static void
+convert_bitmap(char *in, char *out, int bytes)
+{
+       int bit;
+       const int nbits = bytes << 3;
+
+       memset(out, 0, bytes);
+
+       for (bit = 0; bit < nbits; bit++) {
+               if (!old_test_bit(bit, (void *)in))
+                       continue;
+
+               new_set_bit(bit, out);
+       }
+}
+
+/*
+ * Rewrite the bitmap of every allocated block of the journalled VHD.
+ *
+ * Each bitmap is read, converted to the new bit ordering with
+ * convert_bitmap() and written back in place.  When @rollback is set
+ * the bitmap is written back unchanged instead.
+ *
+ * Returns 0 on success or a negative errno value from the first step
+ * that failed.
+ */
+static int
+update_vhd(vhd_journal_t *journal, int rollback)
+{
+       int i, err;
+       size_t size;
+       char *buf, *converted;
+
+       buf       = NULL;
+       converted = NULL;
+
+       size = vhd_bytes_padded(journal->vhd.spb / 8);
+       err  = posix_memalign((void **)&converted, 512, size);
+       if (err) {
+               /* posix_memalign() reports failure as a positive error
+                * number; negate it to match every other error returned
+                * from here */
+               err       = -err;
+               converted = NULL;
+               goto out;
+       }
+
+       for (i = 0; i < journal->vhd.bat.entries; i++) {
+               /* unallocated blocks have no bitmap to convert */
+               if (journal->vhd.bat.bat[i] == DD_BLK_UNUSED)
+                       continue;
+
+               err = vhd_read_bitmap(&journal->vhd, i, &buf);
+               if (err)
+                       goto out;
+
+               if (rollback)
+                       memcpy(converted, buf, size);
+               else
+                       convert_bitmap(buf, converted, size);
+
+               free(buf);
+               buf = NULL;
+
+               err = vhd_write_bitmap(&journal->vhd, i, converted);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+ out:
+       free(converted);
+       return err;
+}
+
+/*
+ * Create the journal @jfile for the VHD @file via vhd_journal_create().
+ * Prints a message and returns the error on failure, 0 on success.
+ */
+static int
+open_journal(vhd_journal_t *journal, const char *file, const char *jfile)
+{
+       int rc;
+
+       rc = vhd_journal_create(journal, file, jfile);
+       if (rc)
+               printf("error creating journal for %s: %d\n", file, rc);
+
+       return rc;
+}
+
+/*
+ * Finish with the journal: revert it when @err is non-zero, commit it
+ * otherwise.  If that step succeeds the journal is removed; if it
+ * fails the journal is only closed.  Returns the result of the final
+ * remove or close call.
+ */
+static int
+close_journal(vhd_journal_t *journal, int err)
+{
+       int failed;
+
+       failed = err ? vhd_journal_revert(journal)
+                    : vhd_journal_commit(journal);
+
+       if (!failed)
+               return vhd_journal_remove(journal);
+
+       return vhd_journal_close(journal);
+}
+
+/*
+ * vhd-update: convert the block bitmaps of a VHD written by an old
+ * tapdisk (creator version 0.1) to the current bit ordering.
+ *
+ * Options:
+ *   -n NAME     the VHD to update (required)
+ *   -j JOURNAL  journal file; must be readable
+ *   -r          debugging aid: write the bitmaps back untransformed
+ *               (requires -j)
+ *   -h          print usage
+ *
+ * Images that were not created by tapdisk, are not at creator version
+ * 0.1, or are fixed-size are left untouched.
+ *
+ * Returns 0 on success.  If the update fails, that error is returned
+ * even when the journal could be reverted cleanly; otherwise the result
+ * of closing the journal is returned.
+ */
+int
+main(int argc, char **argv)
+{
+       char *file, *jfile;
+       int c, err, ret, rollback;
+       vhd_journal_t journal;
+
+       file     = NULL;
+       jfile    = NULL;
+       rollback = 0;
+
+       while ((c = getopt(argc, argv, "n:j:rh")) != -1) {
+               switch(c) {
+               case 'n':
+                       file = optarg;
+                       break;
+               case 'j':
+                       jfile = optarg;
+                       if (access(jfile, R_OK) == -1) {
+                               /* capture errno before printf() gets a
+                                * chance to overwrite it */
+                               err = -errno;
+                               printf("invalid journal arg %s\n", jfile);
+                               return err;
+                       }
+                       break;
+               case 'r':
+                       /* add a rollback option for debugging which
+                        * pushes journalled bitmaps to original file
+                        * without transforming them */
+                       rollback = 1;
+                       break;
+               default:
+                       usage();
+               }
+       }
+
+       if (!file)
+               usage();
+
+       if (rollback && !jfile) {
+               printf("rollback requires a journal argument\n");
+               usage();
+       }
+
+       err = open_journal(&journal, file, jfile);
+       if (err)
+               return err;
+
+       /* nothing to convert: finish with the journal and report success */
+       if (!vhd_creator_tapdisk(&journal.vhd) ||
+           journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) ||
+           journal.vhd.footer.type == HD_TYPE_FIXED) {
+               err = 0;
+               goto out;
+       }
+
+       err = journal_bitmaps(&journal);
+       if (err) {
+               /* no changes to vhd file yet,
+                * so close the journal and bail */
+               vhd_journal_close(&journal);
+               return err;
+       }
+
+       err = update_vhd(&journal, rollback);
+       if (err) {
+               printf("update failed: %d; saving journal\n", err);
+               goto out;
+       }
+
+       err = update_creator_version(&journal);
+       if (err) {
+               printf("failed to udpate creator version: %d\n", err);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       /* a clean revert must not hide the fact that the update failed */
+       ret = close_journal(&journal, err);
+       return (err ? err : ret);
+}
diff --git a/tools/blktap2/vhd/vhd-util.c b/tools/blktap2/vhd/vhd-util.c
new file mode 100644 (file)
index 0000000..944a59e
--- /dev/null
@@ -0,0 +1,160 @@
+/* Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * DFPRINTF: printf-style debug output helper (GNU named-variadic macro).
+ * The "#if 1" below hard-wires it on, in which case it writes to stdout;
+ * change the condition to 0 to compile every call down to a no-op.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Signature of a sub-command handler: takes (argc, argv), returns an int
+ * that main() turns into the process exit status. */
+typedef int (*vhd_util_func_t) (int, char **);
+
+/*
+ * One entry of the sub-command dispatch table:
+ *   name - keyword compared against argv[1] by get_command()
+ *   func - handler that main() invokes for that keyword
+ */
+struct command {
+       char               *name;
+       vhd_util_func_t     func;
+};
+
+/*
+ * Table of supported sub-commands.  get_command() searches it linearly and
+ * print_commands() lists the names in exactly this order.  The element
+ * count is always derived with sizeof, so adding a row here is enough to
+ * register a new command.
+ */
+struct command commands[] = {
+       { .name = "create",      .func = vhd_util_create        },
+       { .name = "snapshot",    .func = vhd_util_snapshot      },
+       { .name = "query",       .func = vhd_util_query         },
+       { .name = "read",        .func = vhd_util_read          },
+       { .name = "set",         .func = vhd_util_set_field     },
+       { .name = "repair",      .func = vhd_util_repair        },
+       { .name = "resize",      .func = vhd_util_resize        },
+       { .name = "fill",        .func = vhd_util_fill          },
+       { .name = "coalesce",    .func = vhd_util_coalesce      },
+       { .name = "modify",      .func = vhd_util_modify        },
+       { .name = "scan",        .func = vhd_util_scan          },
+       { .name = "check",       .func = vhd_util_check         },
+       { .name = "revert",      .func = vhd_util_revert        },
+};
+
+/*
+ * print_commands: write "COMMAND := { a | b | ... }" to stdout, listing
+ * every name in commands[].  Wrapped in do { } while (0) so it expands to
+ * a single statement; i and n are declared inside that block and do not
+ * leak into the caller.  commands[0] is printed unconditionally, so the
+ * table must never be empty.
+ */
+#define print_commands()                                       \
+       do {                                                    \
+               int i, n;                                       \
+               n = sizeof(commands) / sizeof(struct command);  \
+               printf("COMMAND := { ");                        \
+               printf("%s", commands[0].name);                 \
+               for (i = 1; i < n; i++)                         \
+                       printf(" | %s", commands[i].name);      \
+               printf(" }\n");                                 \
+       } while (0)
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Print the usage line and the list of known commands to stdout, then
+ * terminate the process with a success status.  Never returns.
+ */
+void
+help(void)
+{
+       fputs("usage: vhd-util COMMAND [OPTIONS]\n", stdout);
+       print_commands();
+       exit(EXIT_SUCCESS);
+}
+
+/*
+ * Look up a sub-command by exact name.
+ *
+ * Returns a pointer to the matching commands[] entry, or NULL when the
+ * name is unknown or is 25 characters or longer.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *entry, *end;
+
+       /* refuse over-long input up front instead of comparing it */
+       if (strnlen(command, 25) >= 25)
+               return NULL;
+
+       end = commands + sizeof(commands) / sizeof(commands[0]);
+
+       for (entry = commands; entry != end; entry++) {
+               if (strcmp(command, entry->name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Entry point: dispatch "vhd-util COMMAND [OPTIONS]" to its handler.
+ *
+ * argv[1] selects the command.  The handler receives an argv whose first
+ * element is the command name, followed by the remaining arguments with
+ * every "--debug" removed (each occurrence sets the libvhd log level to 1
+ * instead of being passed on).
+ *
+ * Exit status: the absolute value of the handler's return code; 0 when
+ * invoked without a command (usage is printed); EINVAL for an unknown
+ * command; ENOMEM if the argument vector cannot be allocated.
+ */
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, cnt, ret;
+
+#ifdef CORE_DUMP
+       /* NOTE(review): a system header pulled in at block scope is
+        * fragile; consider moving this include to file scope. */
+       #include <sys/resource.h>
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+       if (argc < 2)
+               help();
+
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               /* help() always exits with status 0; an unknown command
+                * is an error, so print the same text but report failure
+                * to the calling shell or script. */
+               fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+               printf("usage: vhd-util COMMAND [OPTIONS]\n");
+               print_commands();
+               exit(EINVAL);
+       }
+
+       /* cargc >= 1 here, so this is never a zero-sized request */
+       cargv = malloc(sizeof(char *) * cargc);
+       if (!cargv)
+               exit(ENOMEM);
+
+       cnt      = 1;
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++) {
+               /* argc - cargc == 1: skip the program name in argv */
+               char *arg = argv[i + (argc - cargc)];
+
+               if (!strcmp(arg, "--debug")) {
+                       libvhd_set_log_level(1);
+                       continue;
+               }
+
+               cargv[cnt++] = arg;
+       }
+
+#ifdef ENABLE_FAILURE_TESTING
+       /* arm each failure-injection point whose environment variable
+        * is set */
+       for (i = 0; i < NUM_FAIL_TESTS; i++) {
+               TEST_FAIL[i] = 0;
+               if (getenv(ENV_VAR_FAIL[i]))
+                       TEST_FAIL[i] = 1;
+       }
+#endif // ENABLE_FAILURE_TESTING
+
+       ret = cmd->func(cnt, cargv);
+
+       free(cargv);
+
+       /* handlers may return negative errno values; exit codes must be
+        * non-negative */
+       return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/check/check_uuid_devel b/tools/check/check_uuid_devel
new file mode 100755 (executable)
index 0000000..0a90b15
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+# CHECK-BUILD
+
+. ./funcs.sh
+
+has_header uuid/uuid.h || fail "missing uuid headers (package uuid-dev)"
index bd499a728a5bb241cf7f804d3cac9f59c7a1e9fc..88a0cbe259c70a186aee2558195c4f4f98d684db 100644 (file)
@@ -27,6 +27,7 @@ Author: Mike Wray <mike.wray@hp.com>
 import logging
 import time
 import threading
+import thread
 import re
 import copy
 import os
@@ -534,6 +535,25 @@ class XendDomainInfo:
         
         @raise XendError: Failed pausing a domain
         """
+        try:
+            bepath="/local/domain/0/backend/"
+            if(self.domid):
+                
+                dev =  xstransact.List(bepath + 'vbd' + "/%d" % (self.domid,))
+                for x in dev:
+                    path = self.getDeviceController('vbd').readBackend(x, 'params')
+                    if path and path.startswith('/dev/xen/blktap-2'):
+                        #Figure out the sysfs path.
+                        pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$')
+                        ctrlid = pattern.search(path)
+                        ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1)            
+                        #pause the disk
+                        f = open(ctrl + '/pause', 'w')
+                        f.write('pause');
+                        f.close()
+        except Exception, ex:
+            log.warn('Could not pause blktap disk.');
+
         try:
             xc.domain_pause(self.domid)
             self._stateSet(DOM_STATE_PAUSED)
@@ -546,6 +566,26 @@ class XendDomainInfo:
         
         @raise XendError: Failed unpausing a domain
         """
+        try:
+            bepath="/local/domain/0/backend/"
+            if(self.domid):
+                dev =  xstransact.List(bepath + "vbd" + "/%d" % (self.domid,))
+                for x in dev:
+                    path = self.getDeviceController('vbd').readBackend(x, 'params')
+                    if path and path.startswith('/dev/xen/blktap-2'):
+                        #Figure out the sysfs path.
+                        pattern = re.compile('/dev/xen/blktap-2/tapdev(\d+)$')
+                        ctrlid = pattern.search(path)
+                        ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1)
+                        #unpause the disk
+                        if(os.path.exists(ctrl + '/resume')):                  
+                            f = open(ctrl + '/resume', 'w');
+                            f.write('resume');
+                            f.close();
+
+        except Exception, ex:
+            log.warn('Could not unpause blktap disk: %s' % str(ex));
+
         try:
             xc.domain_unpause(self.domid)
             self._stateSet(DOM_STATE_RUNNING)
@@ -1171,6 +1211,15 @@ class XendDomainInfo:
 
         rc = None
         if self.domid is not None:
+            
+            #new blktap implementation may need a sysfs write after everything is torn down.
+            dev = self.getDeviceController(deviceClass).convertToDeviceNumber(devid)
+            path = self.getDeviceController(deviceClass).readBackend(dev, 'params')                
+            if path and path.startswith('/dev/xen/blktap-2'):
+                frontpath = self.getDeviceController(deviceClass).frontendPath(dev)
+                backpath = xstransact.Read(frontpath, "backend")
+                thread.start_new_thread(self.getDeviceController(deviceClass).finishDeviceCleanup, (backpath, path))
+
             rc = self.getDeviceController(deviceClass).destroyDevice(devid, force)
             if not force and rm_cfg:
                 # The backend path, other than the device itself,
index 36c1d0688e5dde2a9161fa995de2b5b50d59301a..4c7f334968085f2428476556e75cf42ba9337433 100644 (file)
@@ -1,5 +1,6 @@
 # Copyright (c) 2005, XenSource Ltd.
-
+import string, re
+import subprocess
 
 from xen.xend.server.blkif import BlkifController
 from xen.xend.XendLogging import log
@@ -7,6 +8,11 @@ from xen.xend.XendLogging import log
 phantomDev = 0;
 phantomId = 0;
 
+TAPDISK_SYSFS   = '/sys/class/blktap2'
+TAPDISK_BINARY  = '/usr/sbin/tapdisk2'
+TAPDISK_DEVICE  = '/dev/xen/blktap-2/tapdev'
+TAPDISK_CONTROL = TAPDISK_SYSFS + '/blktap'
+
 blktap_disk_types = [
     'aio',
     'sync',
@@ -14,10 +20,33 @@ blktap_disk_types = [
     'ram',
     'qcow',
     'qcow2',
-
+    'vhd',
     'ioemu',
     'tapdisk',
     ]
+def doexec(args, inputtext=None):
+    """Run args as a child process and collect its results.
+
+    inputtext, if given, is fed to the child's stdin.  Blocks until the
+    child exits, then returns a (returncode, stdout, stderr) tuple with
+    both output streams captured in full.
+    """
+    pipe = subprocess.PIPE
+    child = subprocess.Popen(args,
+                             stdin=pipe,
+                             stdout=pipe,
+                             stderr=pipe,
+                             close_fds=True)
+    out, err = child.communicate(inputtext)
+    return (child.returncode, out, err)
+
+def parseDeviceString(device):
+    """Split a tap device string into (minor, device, control).
+
+    device must contain '/dev' and end with TAPDISK_DEVICE followed by a
+    decimal minor number (a single trailing newline is tolerated);
+    otherwise an Exception is raised.  minor is returned as a string,
+    device is handed back unchanged, and control is TAPDISK_CONTROL with
+    the minor appended.
+    """
+    if '/dev' not in device:
+        raise Exception('invalid tap device: ' + device)
+
+    match = re.search(TAPDISK_DEVICE + '(\d+)$', device)
+    if match is None:
+        raise Exception('malformed tap device: ' + device)
+
+    minor = match.group(1)
+    return minor, device, TAPDISK_CONTROL + minor
+
+
 
 class BlktapController(BlkifController):
     def __init__(self, vm):
@@ -86,3 +115,24 @@ class BlktapController(BlkifController):
 
         return (devid, back, front)
 
+    def createDevice(self, config):
+        """Create a tap device, spawning tapdisk2 where blktap2 handles it.
+
+        config['uname'] is expected as 'tap:tapdisk:<type>:<file>'; the
+        shorter 'tap:<type>:<file>' form is accepted too and treated as
+        subtype 'tapdisk'.  Disk types the deprecated blktap module still
+        serves are handed straight to BlkifController.createDevice().
+        For everything else TAPDISK_BINARY is run to set the device up,
+        config['uname'] is rewritten to 'phy:<device>' and the device is
+        attached as a vbd.
+
+        Returns whatever BlkifController.createDevice() returns.
+        Raises ValueError for a uname with too few fields, and Exception
+        when tapdisk2 fails or its output is not a tap device path.
+        """
+        uname = config.get('uname', '')
+        fields = uname.split(':', 3)
+        if len(fields) == 4:
+            (typ, subtyp, params, image) = fields
+        elif len(fields) == 3:
+            (typ, params, image) = fields
+            subtyp = 'tapdisk'
+        else:
+            raise ValueError('malformed tap uname: ' + uname)
+
+        # Compare with ==, not "in ('tap')": a parenthesised string is not
+        # a tuple, so "in" would be a substring test and would also accept
+        # '' or 'a'.
+        if typ == 'tap' and subtyp == 'tapdisk' and \
+           params in ('ioemu', 'qcow2', 'vmdk', 'sync'):
+            log.warn('WARNING: using deprecated blktap module')
+            return BlkifController.createDevice(self, config)
+
+        cmd = [ TAPDISK_BINARY, '-n', '%s:%s' % (params, image) ]
+        (rc, stdout, stderr) = doexec(cmd)
+        if rc != 0:
+            # Report the real failure rather than a parse error on
+            # whatever was left on stdout.
+            raise Exception('%s failed (rc %d): %s'
+                            % (TAPDISK_BINARY, rc, stderr.strip()))
+
+        # stdout is parsed as the path of the newly created tap device
+        device = parseDeviceString(stdout)[1]
+
+        #modify the configuration to attach as a vbd, now that the
+        #device is configured.  Then continue to create the device
+        config.update({'uname' : 'phy:' + device.rstrip()})
+        self.deviceClass = 'vbd'
+
+        return BlkifController.createDevice(self, config)
index 6c2bb09ca638419681591e3c7b1ebdb9a58a802c..ed46dd4803e9df8cf2d7961083634df681ccd4bf 100644 (file)
@@ -27,8 +27,8 @@ from xen.xend.server.DevConstants import *
 
 from xen.xend.xenstore.xstransact import xstransact, complete
 from xen.xend.xenstore.xswatch import xswatch
-
-import os
+import xen.xend.server.DevConstants
+import os, re
 
 xoptions = XendOptions.instance()
 
@@ -238,6 +238,34 @@ class DevController:
             # xstransact.Remove(self.devicePath()) ?? Below is the same ?
             self.vm._removeVm("device/%s/%d" % (self.deviceClass, dev))
 
+    # The new blktap (blktap2) implementation requires a write to sysfs
+    # to close out disks.  This function is called from a thread when a
+    # disk is detached from the domain.
+    def finishDeviceCleanup(self, backpath, path):
+        """Perform any device specific cleanup
+
+        Only blktap2 devices need work here: once the backend has been
+        waited for, 'remove' is written to the device's sysfs control
+        node.  Any other path makes this a no-op.
+
+        @backpath backend xenstore path.
+        @path     backend 'params' value of the device, i.e. the tapdev
+                  node such as /dev/xen/blktap-2/tapdev<N>
+
+        Raises ValueError if path is under /dev/xen/blktap-2 but does not
+        end in tapdev<N>.
+        """
+        if not path or not path.startswith('/dev/xen/blktap-2'):
+            return
+
+        # Wait on the backend first; presumably this returns once the
+        # backend has been torn down.
+        self.waitForBackend_destroy(backpath)
+
+        #Figure out the sysfs path.
+        ctrlid = re.search('/dev/xen/blktap-2/tapdev(\d+)$', path)
+        if ctrlid is None:
+            # fail with a clear message instead of an AttributeError
+            # from calling .group() on None
+            raise ValueError('malformed blktap2 device path: ' + path)
+        ctrl = '/sys/class/blktap2/blktap' + ctrlid.group(1)
+
+        #Close out the disk
+        f = open(ctrl + '/remove', 'w')
+        try:
+            f.write('remove')
+        finally:
+            # always release the handle, even if the write fails
+            f.close()
+
     def configurations(self, transaction = None):
         return map(lambda x: self.configuration(x, transaction), self.deviceIDs(transaction))