ia64/xen-unstable

changeset 5656:f8acd354e129

Manual merge.
author kaf24@firebug.cl.cam.ac.uk
date Sun Jul 03 22:36:48 2005 +0000 (2005-07-03)
parents 80d5dd14711e 09067ce92303
children cb6b221bef55 f6e7c967212e
files linux-2.6.11-xen-sparse/arch/xen/Makefile tools/blktap/Makefile tools/blktap/parallax/README tools/blktap/parallax/block-async.c tools/blktap/parallax/block-async.h tools/blktap/parallax/blockstore.c tools/blktap/parallax/blockstore.h tools/blktap/parallax/blockstored.c tools/blktap/parallax/bstest.c tools/blktap/parallax/parallax.c tools/blktap/parallax/radix.c tools/blktap/parallax/radix.h tools/blktap/parallax/requests-async.c tools/blktap/parallax/requests-async.h tools/blktap/parallax/snaplog.c tools/blktap/parallax/snaplog.h tools/blktap/parallax/vdi.c tools/blktap/parallax/vdi.h tools/blktap/parallax/vdi_create.c tools/blktap/parallax/vdi_fill.c tools/blktap/parallax/vdi_list.c tools/blktap/parallax/vdi_snap.c tools/blktap/parallax/vdi_snap_delete.c tools/blktap/parallax/vdi_snap_list.c tools/blktap/parallax/vdi_tree.c tools/blktap/parallax/vdi_unittest.c tools/blktap/parallax/vdi_validate.c tools/ioemu/target-i386-dm/Makefile xen/common/kernel.c xen/include/public/version.h
line diff
     1.1 --- a/tools/blktap/Makefile	Sun Jul 03 22:32:52 2005 +0000
     1.2 +++ b/tools/blktap/Makefile	Sun Jul 03 22:36:48 2005 +0000
     1.3 @@ -2,64 +2,46 @@ MAJOR    = 2.0
     1.4  MINOR    = 0
     1.5  SONAME   = libblktap.so.$(MAJOR)
     1.6  
     1.7 -CC       = gcc
     1.8 -
     1.9  XEN_ROOT = ../..
    1.10  include $(XEN_ROOT)/tools/Rules.mk
    1.11  
    1.12 -BLKTAP_INSTALL_DIR	= /usr/sbin
    1.13 +SUBDIRS :=
    1.14 +SUBDIRS += parallax
    1.15  
    1.16 -INSTALL         = install
    1.17 -INSTALL_PROG    = $(INSTALL) -m0755
    1.18 -INSTALL_DIR     = $(INSTALL) -d -m0755
    1.19 +BLKTAP_INSTALL_DIR = /usr/sbin
    1.20  
    1.21 -INCLUDES += 
    1.22 +INSTALL            = install
    1.23 +INSTALL_PROG       = $(INSTALL) -m0755
    1.24 +INSTALL_DIR        = $(INSTALL) -d -m0755
    1.25 +
    1.26 +INCLUDES += -I. -I $(XEN_LIBXC)
    1.27  
    1.28  LIBS     := -lpthread -lz
    1.29  
    1.30  SRCS     :=
    1.31  SRCS     += blktaplib.c
    1.32  
    1.33 -PLX_SRCS := 
    1.34 -PLX_SRCS += vdi.c 
    1.35 -PLX_SRCS += radix.c 
    1.36 -PLX_SRCS += snaplog.c
    1.37 -PLX_SRCS += blockstore.c 
    1.38 -PLX_SRCS += block-async.c
    1.39 -PLX_SRCS += requests-async.c
    1.40 -VDI_SRCS := $(PLX_SRCS)
    1.41 -PLX_SRCS += parallax.c
    1.42 -
    1.43 -VDI_TOOLS :=
    1.44 -VDI_TOOLS += vdi_create
    1.45 -VDI_TOOLS += vdi_list
    1.46 -VDI_TOOLS += vdi_snap
    1.47 -VDI_TOOLS += vdi_snap_list
    1.48 -VDI_TOOLS += vdi_snap_delete
    1.49 -VDI_TOOLS += vdi_fill
    1.50 -VDI_TOOLS += vdi_tree
    1.51 -VDI_TOOLS += vdi_validate
    1.52 -
    1.53  CFLAGS   += -Wall
    1.54  CFLAGS   += -Werror
    1.55  CFLAGS   += -Wno-unused
    1.56  #CFLAGS   += -O3
    1.57  CFLAGS   += -g3
    1.58  CFLAGS   += -fno-strict-aliasing
    1.59 -CFLAGS   += -I $(XEN_LIBXC)
    1.60 -CFLAGS   += $(INCLUDES) -I.
    1.61  CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
    1.62  # Get gcc to generate the dependencies for us.
    1.63  CFLAGS   += -Wp,-MD,.$(@F).d
    1.64 +CFLAGS   += $(INCLUDES) 
    1.65  DEPS     = .*.d
    1.66  
    1.67  OBJS     = $(patsubst %.c,%.o,$(SRCS))
    1.68 -IBINS    = blkdump parallax $(VDI_TOOLS)
    1.69 +IBINS    = blkdump
    1.70  
    1.71  LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
    1.72  
    1.73 -all: mk-symlinks blkdump $(VDI_TOOLS) parallax blockstored
    1.74 -	$(MAKE) $(LIB)
    1.75 +all: mk-symlinks libblktap.so blkdump
    1.76 +	@set -e; for subdir in $(SUBDIRS); do \
    1.77 +		$(MAKE) -C $$subdir $@;       \
    1.78 +	done
    1.79  
    1.80  LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
    1.81  mk-symlinks:
    1.82 @@ -77,10 +59,16 @@ install: all
    1.83  	$(INSTALL_DIR) -p $(DESTDIR)/usr/include
    1.84  	$(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
    1.85  	$(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
    1.86 -	$(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
    1.87 +	$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
    1.88 +	@set -e; for subdir in $(SUBDIRS); do \
    1.89 +		$(MAKE) -C $$subdir $@;       \
    1.90 +	done
    1.91  
    1.92  clean:
    1.93 -	rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump $(VDI_TOOLS) parallax vdi_unittest
    1.94 +	rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump
    1.95 +	@set -e; for subdir in $(SUBDIRS); do \
    1.96 +		$(MAKE) -C $$subdir $@;       \
    1.97 +	done
    1.98  
    1.99  rpm: all
   1.100  	rm -rf staging
   1.101 @@ -91,52 +79,17 @@ rpm: all
   1.102  	mv staging/i386/*.rpm .
   1.103  	rm -rf staging
   1.104  
   1.105 -libblktap.so:
   1.106 +libblktap.so: $(OBJS)
   1.107 +	$(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o      \
   1.108 +	      libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
   1.109 +	ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
   1.110  	ln -sf libblktap.so.$(MAJOR) $@
   1.111 -libblktap.so.$(MAJOR):
   1.112 -	ln -sf libblktap.so.$(MAJOR).$(MINOR) $@
   1.113 -libblktap.so.$(MAJOR).$(MINOR): $(OBJS)
   1.114 -	$(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ $(LIBS)
   1.115  
   1.116 -blkdump: $(LIB)
   1.117 +blkdump: libblktap.so
   1.118  	$(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
   1.119  
   1.120 -parallax: $(LIB) $(PLX_SRCS)
   1.121 -	$(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L. -lblktap $(LIBS) $(PLX_SRCS) 
   1.122 -
   1.123 -vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
   1.124 -	$(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(LIBS) $(VDI_SRCS)
   1.125 -
   1.126 -vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
   1.127 -	$(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(LIBS) $(VDI_SRCS)
   1.128 -
   1.129 -vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
   1.130 -	$(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(LIBS) $(VDI_SRCS)
   1.131 -
   1.132 -vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
   1.133 -	$(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(LIBS) $(VDI_SRCS)
   1.134 -
   1.135 -vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
   1.136 -	$(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(LIBS) $(VDI_SRCS)
   1.137 +.PHONY: TAGS clean install mk-symlinks rpm
   1.138  
   1.139 -vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
   1.140 -	$(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(LIBS) $(VDI_SRCS)
   1.141 -
   1.142 -vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
   1.143 -	$(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(LIBS) $(VDI_SRCS)
   1.144 -
   1.145 -vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
   1.146 -	$(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(LIBS) $(VDI_SRCS)
   1.147 -
   1.148 -vdi_unittest: $(LIB) vdi_unittest.c $(VDI_SRCS)
   1.149 -	$(CC) $(CFLAGS) -g3 -o vdi_unittest vdi_unittest.c $(LIBS) $(VDI_SRCS)
   1.150 -
   1.151 -blockstored: blockstored.c
   1.152 -	$(CC) $(CFLAGS) -g3 -o blockstored $(LIBS) blockstored.c
   1.153 -bstest: bstest.c blockstore.c
   1.154 -	$(CC) $(CFLAGS) -g3 -o bstest bstest.c $(LIBS) blockstore.c
   1.155 -
   1.156 -.PHONY: TAGS clean install mk-symlinks rpm
   1.157  TAGS:
   1.158  	etags -t $(SRCS) *.h
   1.159  
     2.1 --- a/tools/blktap/README-PARALLAX	Sun Jul 03 22:32:52 2005 +0000
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,177 +0,0 @@
     2.4 -Parallax Quick Overview
     2.5 -March 3, 2005
     2.6 -
     2.7 -This is intended to provide a quick set of instructions to let you
     2.8 -guys play with the current parallax source.  In its current form, the
     2.9 -code will let you run an arbitrary number of VMs off of a single disk
    2.10 -image, doing copy-on-write as they make updates.  Each domain is
    2.11 -assigned a virtual disk image (VDI), which may be based on a snapshot
    2.12 -of an existing image.  All of the VDI and snapshot management should
    2.13 -currently work.
    2.14 -
    2.15 -The current implementation uses a single file as a blockstore for
    2.16 -_everything_; this will soon be replaced by the fancier backend code
    2.17 -and the local cache.  As it stands, Parallax will create
    2.18 -"blockstore.dat" in the directory that you run it from, and use
    2.19 -largefile support to make this grow to unfathomable girth.  So, you
    2.20 -probably want to run the daemon off of a local disk, with a lot of
    2.21 -free space.
    2.22 -
    2.23 -Here's how to get going:
    2.24 -
    2.25 -0. Setup:
    2.26 ----------
    2.27 -
    2.28 -Pick a local directory on a disk with lots of room.  You should be
    2.29 -running from a privileged domain (e.g. dom0) with the blocktap driver
    2.30 -configured in and the block backend NOT configured in.
    2.31 -
    2.32 -For convenience (for the moment) copy all of the vdi tools (vdi_*) and
    2.33 -the parallax daemon from tools/blktap into this directory.
    2.34 -
    2.35 -1. Populate the blockstore:
    2.36 ----------------------------
    2.37 -
    2.38 -First you need to put at least one image into the blockstore.  You
    2.39 -will need a disk image, either as a file or local partition.  My
    2.40 -general approach has been to
    2.41 -
    2.42 -(a) make a really big sparse file with 
    2.43 -
    2.44 -        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
    2.45 -
    2.46 -(b) put a filesystem into it
    2.47 -
    2.48 -        mkfs.ext3 ./image
    2.49 -
    2.50 -(c) mount it using loopback
    2.51 -
    2.52 -        mkdir ./mnt
    2.53 -        mount -o loop ./image ./mnt
    2.54 -
    2.55 -(d) cd into it and untar one of the image files from srg-roots.
    2.56 -
    2.57 -        cd mnt
    2.58 -        tar ...
    2.59 -
    2.60 -NOTE: Beware if your system is FC3.  mkfs is not compatible with old
    2.61 -versions of fedora, and so you don't have much choice but to install
    2.62 -further fc3 images if you have used the fc3 version of mkfs.
    2.63 -
    2.64 -(e) unmount the image
    2.65 -
    2.66 -        cd ..
    2.67 -        umount mnt
    2.68 -
    2.69 -(f) now, create a new VDI to hold the image 
    2.70 -
    2.71 -        ./vdi_create "My new FC3 VDI"
    2.72 -
    2.73 -(g) get the id of the new VDI.
    2.74 -
    2.75 -        ./vdi_list
    2.76 -
    2.77 -        |      0                     My new FC3 VDI
    2.78 -
    2.79 -(0 is the VDI id... create a few more if you want.)
    2.80 -
    2.81 -(h) hoover your image into the new VDI.
    2.82 -
    2.83 -        ./vdi_fill 0 ./image
    2.84 -
    2.85 -This will pull the entire image into the blockstore and set up a
    2.86 -mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
    2.87 -should also work, but vdi_fill has NO notion of sparseness yet, so you
    2.88 -are going to pump a block into the store for each block you read.
    2.89 -
    2.90 -vdi_fill will count up until it is done, and you should be ready to
    2.91 -go.  If you want to be anal, you can use vdi_validate to test the VDI
    2.92 -against the original image.
    2.93 -
    2.94 -2. Create some extra VDIs
    2.95 --------------------------
    2.96 -
    2.97 -VDIs are actually a list of snapshots, and each snapshot is a full
    2.98 -image of mappings.  So, to preserve an immutable copy of a current
    2.99 -VDI, do this:
   2.100 -
   2.101 -(a) Snapshot your new VDI.
   2.102 -
   2.103 -        ./vdi_snap 0
   2.104 -
   2.105 -Snapshotting writes the current radix root to the VDI's snapshot log,
   2.106 -and assigns it a new writable root.
   2.107 -
   2.108 -(b) look at the VDI's snapshot log.
   2.109 -
   2.110 -        ./vdi_snap_list 0
   2.111 -
   2.112 -        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
   2.113 -
   2.114 -The first two columns constitute a snapshot id and represent the
   2.115 -(block, offset) of the snapshot record.  The Date tells you when the
   2.116 -snapshot was made, and 31 is the radix root node of the snapshot.
   2.117 -
   2.118 -(c) Create a new VDI, based on that snapshot, and look at the list.
   2.119 -
   2.120 -        ./vdi_create "FC3 - Copy 1" 16 0
   2.121 -        ./vdi_list
   2.122 -
   2.123 -        |      0                     My new FC3 VDI
   2.124 -        |      1                       FC3 - Copy 1
   2.125 -
   2.126 -NOTE: If you have Graphviz installed on your system, you can use
   2.127 -vdi_tree to generate a postscript of your current set of VDIs and
   2.128 -snapshots.
   2.129 -
   2.130 -
   2.131 -Create as many VDIs as you need for the VMs that you want to run.
   2.132 -
   2.133 -3. Boot some VMs:
   2.134 ------------------
   2.135 -
   2.136 -Parallax currently uses a hack in xend to pass the VDI id, so you need to
   2.137 -modify the disk line of the VM config that is going to mount it.
   2.138 -
   2.139 -(a) set up your vm config, by using the following disk line:
   2.140 -
   2.141 -        disk = ['parallax:1,sda1,w,0' ]
   2.142 -
   2.143 -This example uses VDI 1 (from vdi_list above), presents it as sda1
   2.144 -(writable), and uses dom 0 as the backend.  If you were running the
   2.145 -daemon (and tap driver) in some domain other than 0, you would change
   2.146 -this last parameter.
   2.147 -
   2.148 -NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
   2.149 -
   2.150 -(b) Run parallax in the backend domain.
   2.151 -
   2.152 -        ./parallax
   2.153 -
   2.154 -(c) create your new domain.
   2.155 -
   2.156 -        xm create ...
   2.157 -
   2.158 ----
   2.159 -
   2.160 -That's pretty much all there is to it at the moment.  Hope this is
   2.161 -clear enough to get you going.  Now, a few serious caveats that will
   2.162 -be sorted out in the almost immediate future:
   2.163 -
   2.164 -WARNINGS:
   2.165 ----------
   2.166 -
   2.167 -1. There is NO locking in the VDI tools at the moment, so I'd avoid
   2.168 -running them in parallel, or more importantly, running them while the
   2.169 -daemon is running.
   2.170 -
   2.171 -2. I doubt that xend will be very happy about restarting if you have
   2.172 -parallax-using domains.  So if it dies while there are active parallax
   2.173 -doms, you may need to reboot.
   2.174 -
   2.175 -3. I've turned off write-in-place.  So at the moment, EVERY block
   2.176 -write is a log append on the blockstore.  I've been having some probs
   2.177 -with the radix tree's marking of writable blocks after snapshots and
   2.178 -will sort this out very soon.
   2.179 -
   2.180 -
     3.1 --- a/tools/blktap/block-async.c	Sun Jul 03 22:32:52 2005 +0000
     3.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.3 @@ -1,393 +0,0 @@
     3.4 -/* block-async.c
     3.5 - * 
     3.6 - * Asynchronous block wrappers for parallax.
     3.7 - */
     3.8 - 
     3.9 - 
    3.10 -#include <stdio.h>
    3.11 -#include <stdlib.h>
    3.12 -#include <string.h>
    3.13 -#include <pthread.h>
    3.14 -#include "block-async.h"
    3.15 -#include "blockstore.h"
    3.16 -#include "vdi.h"
    3.17 -
    3.18 -
    3.19 -#if 0
    3.20 -#define DPRINTF(_f, _a...) printf ( _f , ## _a )
    3.21 -#else
    3.22 -#define DPRINTF(_f, _a...) ((void)0)
    3.23 -#endif
    3.24 -
    3.25 -/* We have a queue of outstanding I/O requests implemented as a 
    3.26 - * circular producer-consumer ring with free-running buffers.
    3.27 - * To allow reordering, this ring indirects to indexes in a
    3.28 - * ring of io_structs.
    3.29 - * 
    3.30 - * the block_* calls may either add an entry to this ring and return, 
    3.31 - * or satisfy the request immediately and call the callback directly.
    3.32 - * None of the io calls in parallax should be nested enough to worry 
    3.33 - * about stack problems with this approach.
    3.34 - */
    3.35 -
    3.36 -struct read_args {
    3.37 -    u64 addr;
    3.38 -};
    3.39 -
    3.40 -struct write_args {
    3.41 -    u64   addr;
    3.42 -    char *block;
    3.43 -};
    3.44 -
    3.45 -struct alloc_args {
    3.46 -    char *block;
    3.47 -};
    3.48 - 
    3.49 -struct pending_io_req {
    3.50 -    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
    3.51 -    union {
    3.52 -        struct read_args  r;
    3.53 -        struct write_args w;
    3.54 -        struct alloc_args a;
    3.55 -    } u;
    3.56 -    io_cb_t cb;
    3.57 -    void *param;
    3.58 -};
    3.59 -
    3.60 -void radix_lock_init(struct radix_lock *r)
    3.61 -{
    3.62 -    int i;
    3.63 -    
    3.64 -    pthread_mutex_init(&r->lock, NULL);
    3.65 -    for (i=0; i < 1024; i++) {
    3.66 -        r->lines[i] = 0;
    3.67 -        r->waiters[i] = NULL;
    3.68 -        r->state[i] = ANY;
    3.69 -    }
    3.70 -}
    3.71 -
    3.72 -/* maximum outstanding I/O requests issued asynchronously */
    3.73 -/* must be a power of 2.*/
    3.74 -#define MAX_PENDING_IO 1024
    3.75 -
    3.76 -/* how many threads to concurrently issue I/O to the disk. */
    3.77 -#define IO_POOL_SIZE   10
    3.78 -
    3.79 -static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
    3.80 -static int pending_io_list[MAX_PENDING_IO];
    3.81 -static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
    3.82 -#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
    3.83 -#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
    3.84 -#define PENDING_IO_ENT(_x) \
    3.85 -	(&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
    3.86 -#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
    3.87 -#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
    3.88 -static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
    3.89 -static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
    3.90 -
    3.91 -static void init_pending_io(void)
    3.92 -{
    3.93 -    int i;
    3.94 -	
    3.95 -    for (i=0; i<MAX_PENDING_IO; i++)
    3.96 -        pending_io_list[i] = i;
    3.97 -		
    3.98 -} 
    3.99 -
   3.100 -void block_read(u64 addr, io_cb_t cb, void *param)
   3.101 -{
   3.102 -    struct pending_io_req *req;
   3.103 -    
   3.104 -    pthread_mutex_lock(&pending_io_lock);
   3.105 -    assert(CAN_PRODUCE_PENDING_IO);
   3.106 -    
   3.107 -    req = PENDING_IO_ENT(io_prod++);
   3.108 -    DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req);
   3.109 -    req->op = IO_READ;
   3.110 -    req->u.r.addr = addr;
   3.111 -    req->cb = cb;
   3.112 -    req->param = param;
   3.113 -    
   3.114 -    pthread_cond_signal(&pending_io_cond);
   3.115 -    pthread_mutex_unlock(&pending_io_lock);	
   3.116 -}
   3.117 -
   3.118 -
   3.119 -void block_write(u64 addr, char *block, io_cb_t cb, void *param)
   3.120 -{
   3.121 -    struct pending_io_req *req;
   3.122 -    
   3.123 -    pthread_mutex_lock(&pending_io_lock);
   3.124 -    assert(CAN_PRODUCE_PENDING_IO);
   3.125 -    
   3.126 -    req = PENDING_IO_ENT(io_prod++);
   3.127 -    DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req);
   3.128 -    req->op = IO_WRITE;
   3.129 -    req->u.w.addr  = addr;
   3.130 -    req->u.w.block = block;
   3.131 -    req->cb = cb;
   3.132 -    req->param = param;
   3.133 -    
   3.134 -    pthread_cond_signal(&pending_io_cond);
   3.135 -    pthread_mutex_unlock(&pending_io_lock);	
   3.136 -}
   3.137 -
   3.138 -
   3.139 -void block_alloc(char *block, io_cb_t cb, void *param)
   3.140 -{
   3.141 -    struct pending_io_req *req;
   3.142 -	
   3.143 -    pthread_mutex_lock(&pending_io_lock);
   3.144 -    assert(CAN_PRODUCE_PENDING_IO);
   3.145 -    
   3.146 -    req = PENDING_IO_ENT(io_prod++);
   3.147 -    req->op = IO_ALLOC;
   3.148 -    req->u.a.block = block;
   3.149 -    req->cb = cb;
   3.150 -    req->param = param;
   3.151 -    
   3.152 -    pthread_cond_signal(&pending_io_cond);
   3.153 -    pthread_mutex_unlock(&pending_io_lock);	
   3.154 -}
   3.155 -
   3.156 -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   3.157 -{
   3.158 -    struct io_ret ret;
   3.159 -    pthread_mutex_lock(&r->lock);
   3.160 -    
   3.161 -    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
   3.162 -        r->lines[row]++;
   3.163 -        r->state[row] = READ;
   3.164 -        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
   3.165 -        pthread_mutex_unlock(&r->lock);
   3.166 -        ret.type = IO_INT_T;
   3.167 -        ret.u.i = 0;
   3.168 -        cb(ret, param);
   3.169 -    } else {
   3.170 -        struct radix_wait **rwc;
   3.171 -        struct radix_wait *rw = 
   3.172 -            (struct radix_wait *) malloc (sizeof(struct radix_wait));
   3.173 -        DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
   3.174 -        rw->type  = RLOCK;
   3.175 -        rw->param = param;
   3.176 -        rw->cb    = cb;
   3.177 -        rw->next  = NULL;
   3.178 -        /* append to waiters list. */
   3.179 -        rwc = &r->waiters[row];
   3.180 -        while (*rwc != NULL) rwc = &(*rwc)->next;
   3.181 -        *rwc = rw;
   3.182 -        pthread_mutex_unlock(&r->lock);
   3.183 -        return;
   3.184 -    }
   3.185 -}
   3.186 -
   3.187 -
   3.188 -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   3.189 -{
   3.190 -    struct io_ret ret;
   3.191 -    pthread_mutex_lock(&r->lock);
   3.192 -    
   3.193 -    /* the second check here is redundant -- just here for debugging now. */
   3.194 -    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
   3.195 -        r->state[row] = STOP;
   3.196 -        r->lines[row] = -1;
   3.197 -        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
   3.198 -        pthread_mutex_unlock(&r->lock);
   3.199 -        ret.type = IO_INT_T;
   3.200 -        ret.u.i = 0;
   3.201 -        cb(ret, param);
   3.202 -    } else {
   3.203 -        struct radix_wait **rwc;
   3.204 -        struct radix_wait *rw = 
   3.205 -            (struct radix_wait *) malloc (sizeof(struct radix_wait));
   3.206 -        DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
   3.207 -        rw->type  = WLOCK;
   3.208 -        rw->param = param;
   3.209 -        rw->cb    = cb;
   3.210 -        rw->next  = NULL;
   3.211 -        /* append to waiters list. */
   3.212 -        rwc = &r->waiters[row];
   3.213 -        while (*rwc != NULL) rwc = &(*rwc)->next;
   3.214 -        *rwc = rw;
   3.215 -        pthread_mutex_unlock(&r->lock);
   3.216 -        return;
   3.217 -    }
   3.218 -	
   3.219 -}
   3.220 -
   3.221 -/* called with radix_lock locked and lock count of zero. */
   3.222 -static void wake_waiters(struct radix_lock *r, int row)
   3.223 -{
   3.224 -    struct pending_io_req *req;
   3.225 -    struct radix_wait *rw;
   3.226 -    
   3.227 -    if (r->lines[row] != 0) return;
   3.228 -    if (r->waiters[row] == NULL) return; 
   3.229 -    
   3.230 -    if (r->waiters[row]->type == WLOCK) {
   3.231 -
   3.232 -        rw = r->waiters[row];
   3.233 -        pthread_mutex_lock(&pending_io_lock);
   3.234 -        assert(CAN_PRODUCE_PENDING_IO);
   3.235 -        
   3.236 -        req = PENDING_IO_ENT(io_prod++);
   3.237 -        req->op    = IO_WWAKE;
   3.238 -        req->cb    = rw->cb;
   3.239 -        req->param = rw->param;
   3.240 -        r->lines[row] = -1; /* write lock the row. */
   3.241 -        r->state[row] = STOP;
   3.242 -        r->waiters[row] = rw->next;
   3.243 -        free(rw);
   3.244 -        pthread_mutex_unlock(&pending_io_lock);
   3.245 -    
   3.246 -    } else /* RLOCK */ {
   3.247 -
   3.248 -        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
   3.249 -            rw = r->waiters[row];
   3.250 -            pthread_mutex_lock(&pending_io_lock);
   3.251 -            assert(CAN_PRODUCE_PENDING_IO);
   3.252 -            
   3.253 -            req = PENDING_IO_ENT(io_prod++);
   3.254 -            req->op    = IO_RWAKE;
   3.255 -            req->cb    = rw->cb;
   3.256 -            req->param = rw->param;
   3.257 -            r->lines[row]++; /* read lock the row. */
   3.258 -            r->state[row] = READ; 
   3.259 -            r->waiters[row] = rw->next;
   3.260 -            free(rw);
   3.261 -            pthread_mutex_unlock(&pending_io_lock);
   3.262 -        }
   3.263 -
   3.264 -        if (r->waiters[row] != NULL) /* There is a write queued still */
   3.265 -            r->state[row] = STOP;
   3.266 -    }	
   3.267 -    
   3.268 -    pthread_mutex_lock(&pending_io_lock);
   3.269 -    pthread_cond_signal(&pending_io_cond);
   3.270 -    pthread_mutex_unlock(&pending_io_lock);
   3.271 -}
   3.272 -
   3.273 -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   3.274 -{
   3.275 -    struct io_ret ret;
   3.276 -	
   3.277 -    pthread_mutex_lock(&r->lock);
   3.278 -    assert(r->lines[row] > 0); /* try to catch misuse. */
   3.279 -    r->lines[row]--;
   3.280 -    if (r->lines[row] == 0) {
   3.281 -        r->state[row] = ANY;
   3.282 -        wake_waiters(r, row);
   3.283 -    }
   3.284 -    pthread_mutex_unlock(&r->lock);
   3.285 -    cb(ret, param);
   3.286 -}
   3.287 -
   3.288 -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   3.289 -{
   3.290 -    struct io_ret ret;
   3.291 -    
   3.292 -    pthread_mutex_lock(&r->lock);
   3.293 -    assert(r->lines[row] == -1); /* try to catch misuse. */
   3.294 -    r->lines[row] = 0;
   3.295 -    r->state[row] = ANY;
   3.296 -    wake_waiters(r, row);
   3.297 -    pthread_mutex_unlock(&r->lock);
   3.298 -    cb(ret, param);
   3.299 -}
   3.300 -
   3.301 -/* consumer calls */
   3.302 -static void do_next_io_req(struct pending_io_req *req)
   3.303 -{
   3.304 -    struct io_ret          ret;
   3.305 -    void  *param;
   3.306 -    
   3.307 -    switch (req->op) {
   3.308 -    case IO_READ:
   3.309 -        ret.type = IO_BLOCK_T;
   3.310 -        ret.u.b  = readblock(req->u.r.addr);
   3.311 -        break;
   3.312 -    case IO_WRITE:
   3.313 -        ret.type = IO_INT_T;
   3.314 -        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
   3.315 -        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
   3.316 -        break;
   3.317 -    case IO_ALLOC:
   3.318 -        ret.type = IO_ADDR_T;
   3.319 -        ret.u.a  = allocblock(req->u.a.block);
   3.320 -        break;
   3.321 -    case IO_RWAKE:
   3.322 -        DPRINTF("WAKE DEFERRED RLOCK!\n");
   3.323 -        ret.type = IO_INT_T;
   3.324 -        ret.u.i  = 0;
   3.325 -        break;
   3.326 -    case IO_WWAKE:
   3.327 -        DPRINTF("WAKE DEFERRED WLOCK!\n");
   3.328 -        ret.type = IO_INT_T;
   3.329 -        ret.u.i  = 0;
   3.330 -        break;
   3.331 -    default:
   3.332 -        DPRINTF("Unknown IO operation on pending list!\n");
   3.333 -        return;
   3.334 -    }
   3.335 -    
   3.336 -    param = req->param;
   3.337 -    pthread_mutex_lock(&pending_io_lock);
   3.338 -    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
   3.339 -    pthread_mutex_unlock(&pending_io_lock);
   3.340 -	
   3.341 -    assert(req->cb != NULL);
   3.342 -    req->cb(ret, param);
   3.343 -    
   3.344 -}
   3.345 -
   3.346 -void *io_thread(void *param) 
   3.347 -{
   3.348 -    int tid;
   3.349 -    struct pending_io_req *req;
   3.350 -    
   3.351 -    /* Set this thread's tid. */
   3.352 -    tid = *(int *)param;
   3.353 -    free(param);
   3.354 -    
   3.355 -start:
   3.356 -    pthread_mutex_lock(&pending_io_lock);
   3.357 -    while (io_prod == io_cons) {
   3.358 -        pthread_cond_wait(&pending_io_cond, &pending_io_lock);
   3.359 -    }
   3.360 -    
   3.361 -    if (io_prod == io_cons) {
   3.362 -        /* unnecessary wakeup. */
   3.363 -        pthread_mutex_unlock(&pending_io_lock);
   3.364 -        goto start;
   3.365 -    }
   3.366 -    
   3.367 -    req = PENDING_IO_ENT(io_cons++);
   3.368 -    pthread_mutex_unlock(&pending_io_lock);
   3.369 -	
   3.370 -    do_next_io_req(req);
   3.371 -    
   3.372 -    goto start;
   3.373 -	
   3.374 -}
   3.375 -
   3.376 -static pthread_t io_pool[IO_POOL_SIZE];
   3.377 -void start_io_threads(void)
   3.378 -
   3.379 -{	
   3.380 -    int i, tid=0;
   3.381 -    
   3.382 -    for (i=0; i < IO_POOL_SIZE; i++) {
   3.383 -        int ret, *t;
   3.384 -        t = (int *)malloc(sizeof(int));
   3.385 -        *t = tid++;
   3.386 -        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
   3.387 -        if (ret != 0) printf("Error starting thread %d\n", i);
   3.388 -    }
   3.389 -	
   3.390 -}
   3.391 -
   3.392 -void init_block_async(void)
   3.393 -{
   3.394 -    init_pending_io();
   3.395 -    start_io_threads();
   3.396 -}
     4.1 --- a/tools/blktap/block-async.h	Sun Jul 03 22:32:52 2005 +0000
     4.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.3 @@ -1,69 +0,0 @@
     4.4 -/* block-async.h
     4.5 - * 
     4.6 - * Asynchronous block wrappers for parallax.
     4.7 - */
     4.8 - 
     4.9 -#ifndef _BLOCKASYNC_H_
    4.10 -#define _BLOCKASYNC_H_
    4.11 -
    4.12 -#include <assert.h>
    4.13 -#include <xc.h>
    4.14 -#include "vdi.h"
    4.15 -
    4.16 -struct io_ret
    4.17 -{
    4.18 -    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
    4.19 -    union {
    4.20 -        u64   a;
    4.21 -        char *b;
    4.22 -        int   i;
    4.23 -    } u;
    4.24 -};
    4.25 -
    4.26 -typedef void (*io_cb_t)(struct io_ret r, void *param);
    4.27 -
    4.28 -/* per-vdi lock structures to make sure requests run in a safe order. */
    4.29 -struct radix_wait {
    4.30 -    enum {RLOCK, WLOCK} type;
    4.31 -    io_cb_t  cb;
    4.32 -    void    *param;
    4.33 -    struct radix_wait *next;
    4.34 -};
    4.35 -
    4.36 -struct radix_lock {
    4.37 -    pthread_mutex_t lock;
    4.38 -    int                    lines[1024];
    4.39 -    struct radix_wait     *waiters[1024];
    4.40 -    enum {ANY, READ, STOP} state[1024];
    4.41 -};
    4.42 -void radix_lock_init(struct radix_lock *r);
    4.43 -
    4.44 -void block_read(u64 addr, io_cb_t cb, void *param);
    4.45 -void block_write(u64 addr, char *block, io_cb_t cb, void *param);
    4.46 -void block_alloc(char *block, io_cb_t cb, void *param);
    4.47 -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    4.48 -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    4.49 -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    4.50 -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    4.51 -void init_block_async(void);
    4.52 -
    4.53 -static inline u64 IO_ADDR(struct io_ret r)
    4.54 -{
    4.55 -    assert(r.type == IO_ADDR_T);
    4.56 -    return r.u.a;
    4.57 -}
    4.58 -
    4.59 -static inline char *IO_BLOCK(struct io_ret r)
    4.60 -{
    4.61 -    assert(r.type == IO_BLOCK_T);
    4.62 -    return r.u.b;
    4.63 -}
    4.64 -
    4.65 -static inline int IO_INT(struct io_ret r)
    4.66 -{
    4.67 -    assert(r.type == IO_INT_T);
    4.68 -    return r.u.i;
    4.69 -}
    4.70 -
    4.71 -
    4.72 -#endif //_BLOCKASYNC_H_
     5.1 --- a/tools/blktap/blockstore.c	Sun Jul 03 22:32:52 2005 +0000
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,1350 +0,0 @@
     5.4 -/**************************************************************************
     5.5 - * 
     5.6 - * blockstore.c
     5.7 - *
     5.8 - * Simple block store interface
     5.9 - *
    5.10 - */
    5.11 - 
    5.12 -#include <fcntl.h>
    5.13 -#include <unistd.h>
    5.14 -#include <stdio.h>
    5.15 -#include <stdlib.h>
    5.16 -#include <string.h>
    5.17 -#include <sys/types.h>
    5.18 -#include <sys/stat.h>
    5.19 -#include <sys/time.h>
    5.20 -#include <stdarg.h>
    5.21 -#include "blockstore.h"
    5.22 -#include <pthread.h>
    5.23 -
    5.24 -//#define BLOCKSTORE_REMOTE
    5.25 -//#define BSDEBUG
    5.26 -
    5.27 -#define RETRY_TIMEOUT 1000000 /* microseconds */
    5.28 -
    5.29 -/*****************************************************************************
    5.30 - * Debugging
    5.31 - */
    5.32 -#ifdef BSDEBUG
    5.33 -void DB(char *format, ...)
    5.34 -{
    5.35 -    va_list args;
    5.36 -    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
    5.37 -    va_start(args, format);
    5.38 -    vfprintf(stderr, format, args);
    5.39 -    va_end(args);
    5.40 -}
    5.41 -#else
    5.42 -#define DB(format, ...) (void)0
    5.43 -#endif
    5.44 -
    5.45 -#ifdef BLOCKSTORE_REMOTE
    5.46 -
    5.47 -#include <sys/socket.h>
    5.48 -#include <sys/ioctl.h>
    5.49 -#include <netinet/in.h>
    5.50 -#include <netdb.h>
    5.51 -
    5.52 -/*****************************************************************************
    5.53 - * Network state                                                             *
    5.54 - *****************************************************************************/
    5.55 -
    5.56 -/* The individual disk servers we talks to. These will be referenced by
    5.57 - * an integer index into bsservers[].
    5.58 - */
    5.59 -bsserver_t bsservers[MAX_SERVERS];
    5.60 -
    5.61 -/* The cluster map. This is indexed by an integer cluster number.
    5.62 - */
    5.63 -bscluster_t bsclusters[MAX_CLUSTERS];
    5.64 -
    5.65 -/* Local socket.
    5.66 - */
    5.67 -struct sockaddr_in sin_local;
    5.68 -int bssock = 0;
    5.69 -
    5.70 -/*****************************************************************************
    5.71 - * Notification                                                              *
    5.72 - *****************************************************************************/
    5.73 -
    5.74 -typedef struct pool_thread_t_struct {
    5.75 -    pthread_mutex_t ptmutex;
    5.76 -    pthread_cond_t ptcv;
    5.77 -    int newdata;
    5.78 -} pool_thread_t;
    5.79 -
    5.80 -pool_thread_t pool_thread[READ_POOL_SIZE+1];
    5.81 -
    5.82 -#define RECV_NOTIFY(tid) { \
    5.83 -    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
    5.84 -    pool_thread[tid].newdata = 1; \
    5.85 -    DB("CV Waking %u", tid); \
    5.86 -    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
    5.87 -    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
    5.88 -#define RECV_AWAIT(tid) { \
    5.89 -    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
    5.90 -    if (pool_thread[tid].newdata) { \
    5.91 -        pool_thread[tid].newdata = 0; \
    5.92 -        DB("CV Woken %u", tid); \
    5.93 -    } \
    5.94 -    else { \
    5.95 -        DB("CV Waiting %u", tid); \
    5.96 -        pthread_cond_wait(&(pool_thread[tid].ptcv), \
    5.97 -                          &(pool_thread[tid].ptmutex)); \
    5.98 -    } \
    5.99 -    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
   5.100 -
   5.101 -/*****************************************************************************
   5.102 - * Message queue management                                                  *
   5.103 - *****************************************************************************/
   5.104 -
   5.105 -/* Protects the queue manipulation critcal regions.
   5.106 - */
   5.107 -pthread_mutex_t ptmutex_queue;
   5.108 -#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
   5.109 -#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
   5.110 -
   5.111 -pthread_mutex_t ptmutex_recv;
   5.112 -#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
   5.113 -#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
   5.114 -
   5.115 -/* A message queue entry. We allocate one of these for every request we send.
   5.116 - * Asynchronous reply reception also used one of these.
   5.117 - */
   5.118 -typedef struct bsq_t_struct {
   5.119 -    struct bsq_t_struct *prev;
   5.120 -    struct bsq_t_struct *next;
   5.121 -    int status;
   5.122 -    int server;
   5.123 -    int length;
   5.124 -    struct msghdr msghdr;
   5.125 -    struct iovec iov[2];
   5.126 -    int tid;
   5.127 -    struct timeval tv_sent;
   5.128 -    bshdr_t message;
   5.129 -    void *block;
   5.130 -} bsq_t;
   5.131 -
   5.132 -#define BSQ_STATUS_MATCHED 1
   5.133 -
   5.134 -pthread_mutex_t ptmutex_luid;
   5.135 -#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
   5.136 -#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
   5.137 -
   5.138 -static u64 luid_cnt = 0x1000ULL;
   5.139 -u64 new_luid(void) {
   5.140 -    u64 luid;
   5.141 -    ENTER_LUID_CR;
   5.142 -    luid = luid_cnt++;
   5.143 -    LEAVE_LUID_CR;
   5.144 -    return luid;
   5.145 -}
   5.146 -
   5.147 -/* Queue of outstanding requests.
   5.148 - */
   5.149 -bsq_t *bs_head = NULL;
   5.150 -bsq_t *bs_tail = NULL;
   5.151 -int bs_qlen = 0;
   5.152 -
   5.153 -/*
   5.154 - */
   5.155 -void queuedebug(char *msg) {
   5.156 -    bsq_t *q;
   5.157 -    ENTER_QUEUE_CR;
   5.158 -    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
   5.159 -    for (q = bs_head; q; q = q->next) {
   5.160 -        fprintf(stderr, "  luid=%016llx server=%u\n",
   5.161 -                q->message.luid, q->server);
   5.162 -    }
   5.163 -    LEAVE_QUEUE_CR;
   5.164 -}
   5.165 -
   5.166 -int enqueue(bsq_t *qe) {
   5.167 -    ENTER_QUEUE_CR;
   5.168 -    qe->next = NULL;
   5.169 -    qe->prev = bs_tail;
   5.170 -    if (!bs_head)
   5.171 -        bs_head = qe;
   5.172 -    else
   5.173 -        bs_tail->next = qe;
   5.174 -    bs_tail = qe;
   5.175 -    bs_qlen++;
   5.176 -    LEAVE_QUEUE_CR;
   5.177 -#ifdef BSDEBUG
   5.178 -    queuedebug("enqueue");
   5.179 -#endif
   5.180 -    return 0;
   5.181 -}
   5.182 -
   5.183 -int dequeue(bsq_t *qe) {
   5.184 -    bsq_t *q;
   5.185 -    ENTER_QUEUE_CR;
   5.186 -    for (q = bs_head; q; q = q->next) {
   5.187 -        if (q == qe) {
   5.188 -            if (q->prev)
   5.189 -                q->prev->next = q->next;
   5.190 -            else 
   5.191 -                bs_head = q->next;
   5.192 -            if (q->next)
   5.193 -                q->next->prev = q->prev;
   5.194 -            else
   5.195 -                bs_tail = q->prev;
   5.196 -            bs_qlen--;
   5.197 -            goto found;
   5.198 -        }
   5.199 -    }
   5.200 -
   5.201 -    LEAVE_QUEUE_CR;
   5.202 -#ifdef BSDEBUG
   5.203 -    queuedebug("dequeue not found");
   5.204 -#endif
   5.205 -    return 0;
   5.206 -
   5.207 -    found:
   5.208 -    LEAVE_QUEUE_CR;
   5.209 -#ifdef BSDEBUG
   5.210 -    queuedebug("dequeue not found");
   5.211 -#endif
   5.212 -    return 1;
   5.213 -}
   5.214 -
   5.215 -bsq_t *queuesearch(bsq_t *qe) {
   5.216 -    bsq_t *q;
   5.217 -    ENTER_QUEUE_CR;
   5.218 -    for (q = bs_head; q; q = q->next) {
   5.219 -        if ((qe->server == q->server) &&
   5.220 -            (qe->message.operation == q->message.operation) &&
   5.221 -            (qe->message.luid == q->message.luid)) {
   5.222 -
   5.223 -            if ((q->message.operation == BSOP_READBLOCK) &&
   5.224 -                ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
   5.225 -                q->block = qe->block;
   5.226 -                qe->block = NULL;
   5.227 -            }
   5.228 -            q->length = qe->length;
   5.229 -            q->message.flags = qe->message.flags;
   5.230 -            q->message.id = qe->message.id;
   5.231 -            q->status |= BSQ_STATUS_MATCHED;
   5.232 -
   5.233 -            if (q->prev)
   5.234 -                q->prev->next = q->next;
   5.235 -            else 
   5.236 -                bs_head = q->next;
   5.237 -            if (q->next)
   5.238 -                q->next->prev = q->prev;
   5.239 -            else
   5.240 -                bs_tail = q->prev;
   5.241 -            q->next = NULL;
   5.242 -            q->prev = NULL;
   5.243 -            bs_qlen--;
   5.244 -            goto found;
   5.245 -        }
   5.246 -    }
   5.247 -
   5.248 -    LEAVE_QUEUE_CR;
   5.249 -#ifdef BSDEBUG
   5.250 -    queuedebug("queuesearch not found");
   5.251 -#endif
   5.252 -    return NULL;
   5.253 -
   5.254 -    found:
   5.255 -    LEAVE_QUEUE_CR;
   5.256 -#ifdef BSDEBUG
   5.257 -    queuedebug("queuesearch found");
   5.258 -#endif
   5.259 -    return q;
   5.260 -}
   5.261 -
   5.262 -/*****************************************************************************
   5.263 - * Network communication                                                     *
   5.264 - *****************************************************************************/
   5.265 -
   5.266 -int send_message(bsq_t *qe) {
   5.267 -    int rc;
   5.268 -
   5.269 -    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
   5.270 -    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
   5.271 -    qe->msghdr.msg_iov = qe->iov;
   5.272 -    if (qe->block)
   5.273 -        qe->msghdr.msg_iovlen = 2;
   5.274 -    else
   5.275 -        qe->msghdr.msg_iovlen = 1;
   5.276 -    qe->msghdr.msg_control = NULL;
   5.277 -    qe->msghdr.msg_controllen = 0;
   5.278 -    qe->msghdr.msg_flags = 0;
   5.279 -
   5.280 -    qe->iov[0].iov_base = (void *)&(qe->message);
   5.281 -    qe->iov[0].iov_len = MSGBUFSIZE_ID;
   5.282 -
   5.283 -    if (qe->block) {
   5.284 -        qe->iov[1].iov_base = qe->block;
   5.285 -        qe->iov[1].iov_len = BLOCK_SIZE;
   5.286 -    }
   5.287 -
   5.288 -    qe->message.luid = new_luid();
   5.289 -
   5.290 -    qe->status = 0;
   5.291 -    qe->tid = (int)pthread_getspecific(tid_key);
   5.292 -    if (enqueue(qe) < 0) {
   5.293 -        fprintf(stderr, "Error enqueuing request.\n");
   5.294 -        return -1;
   5.295 -    }
   5.296 -
   5.297 -    gettimeofday(&(qe->tv_sent), NULL);
   5.298 -    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
   5.299 -    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
   5.300 -    //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
   5.301 -    //           (struct sockaddr *)&(bsservers[qe->server].sin),
   5.302 -    //           sizeof(struct sockaddr_in));
   5.303 -    if (rc < 0)
   5.304 -        return rc;
   5.305 -
   5.306 -    return rc;
   5.307 -}
   5.308 -
   5.309 -int recv_message(bsq_t *qe) {
   5.310 -    struct sockaddr_in from;
   5.311 -    //int flen = sizeof(from);
   5.312 -    int rc;
   5.313 -
   5.314 -    qe->msghdr.msg_name = &from;
   5.315 -    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
   5.316 -    qe->msghdr.msg_iov = qe->iov;
   5.317 -    if (qe->block)
   5.318 -        qe->msghdr.msg_iovlen = 2;
   5.319 -    else
   5.320 -        qe->msghdr.msg_iovlen = 1;
   5.321 -    qe->msghdr.msg_control = NULL;
   5.322 -    qe->msghdr.msg_controllen = 0;
   5.323 -    qe->msghdr.msg_flags = 0;
   5.324 -
   5.325 -    qe->iov[0].iov_base = (void *)&(qe->message);
   5.326 -    qe->iov[0].iov_len = MSGBUFSIZE_ID;
   5.327 -    if (qe->block) {
   5.328 -        qe->iov[1].iov_base = qe->block;
   5.329 -        qe->iov[1].iov_len = BLOCK_SIZE;
   5.330 -    }
   5.331 -
   5.332 -    rc = recvmsg(bssock, &(qe->msghdr), 0);
   5.333 -
   5.334 -    //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0,
   5.335 -    //               (struct sockaddr *)&from, &flen);
   5.336 -    return rc;
   5.337 -}
   5.338 -
   5.339 -int get_server_number(struct sockaddr_in *sin) {
   5.340 -    int i;
   5.341 -
   5.342 -#ifdef BSDEBUG2
   5.343 -    fprintf(stderr,
   5.344 -            "get_server_number(%u.%u.%u.%u/%u)\n",
   5.345 -            (unsigned int)sin->sin_addr.s_addr & 0xff,
   5.346 -            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
   5.347 -            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
   5.348 -            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
   5.349 -            (unsigned int)sin->sin_port);
   5.350 -#endif
   5.351 -
   5.352 -    for (i = 0; i < MAX_SERVERS; i++) {
   5.353 -        if (bsservers[i].hostname) {
   5.354 -#ifdef BSDEBUG2
   5.355 -            fprintf(stderr,
   5.356 -                    "get_server_number check %u.%u.%u.%u/%u\n",
   5.357 -                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
   5.358 -                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
   5.359 -                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
   5.360 -                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
   5.361 -                    (unsigned int)bsservers[i].sin.sin_port);
   5.362 -#endif
   5.363 -            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
   5.364 -                (sin->sin_port == bsservers[i].sin.sin_port) &&
   5.365 -                (memcmp((void *)&(sin->sin_addr),
   5.366 -                        (void *)&(bsservers[i].sin.sin_addr),
   5.367 -                        sizeof(struct in_addr)) == 0)) {
   5.368 -                return i;
   5.369 -            }
   5.370 -        }        
   5.371 -    }
   5.372 -
   5.373 -    return -1;
   5.374 -}
   5.375 -
   5.376 -void *rx_buffer = NULL;
   5.377 -bsq_t rx_qe;
   5.378 -bsq_t *recv_any(void) {
   5.379 -    struct sockaddr_in from;
   5.380 -    int rc;
   5.381 -    
   5.382 -    DB("ENTER recv_any\n");
   5.383 -
   5.384 -    rx_qe.msghdr.msg_name = &from;
   5.385 -    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
   5.386 -    rx_qe.msghdr.msg_iov = rx_qe.iov;
   5.387 -    if (!rx_buffer) {
   5.388 -        rx_buffer = malloc(BLOCK_SIZE);
   5.389 -        if (!rx_buffer) {
   5.390 -            perror("recv_any malloc");
   5.391 -            return NULL;
   5.392 -        }
   5.393 -    }
   5.394 -    rx_qe.block = rx_buffer;
   5.395 -    rx_buffer = NULL;
   5.396 -    rx_qe.msghdr.msg_iovlen = 2;
   5.397 -    rx_qe.msghdr.msg_control = NULL;
   5.398 -    rx_qe.msghdr.msg_controllen = 0;
   5.399 -    rx_qe.msghdr.msg_flags = 0;
   5.400 -    
   5.401 -    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
   5.402 -    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
   5.403 -    rx_qe.iov[1].iov_base = rx_qe.block;
   5.404 -    rx_qe.iov[1].iov_len = BLOCK_SIZE;
   5.405 -
   5.406 -    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
   5.407 -    if (rc < 0) {
   5.408 -        perror("recv_any");
   5.409 -        return NULL;
   5.410 -    }
   5.411 -
   5.412 -    rx_qe.length = rc;    
   5.413 -    rx_qe.server = get_server_number(&from);
   5.414 -
   5.415 -    DB("recv_any from %d luid=%016llx len=%u\n",
   5.416 -       rx_qe.server, rx_qe.message.luid, rx_qe.length);
   5.417 -
   5.418 -    return &rx_qe;
   5.419 -}
   5.420 -
   5.421 -void recv_recycle_buffer(bsq_t *q) {
   5.422 -    if (q->block) {
   5.423 -        rx_buffer = q->block;
   5.424 -        q->block = NULL;
   5.425 -    }
   5.426 -}
   5.427 -
   5.428 -// cycle through reading any incoming, searching for a match in the
   5.429 -// queue, until we have all we need.
   5.430 -int wait_recv(bsq_t **reqs, int numreqs) {
   5.431 -    bsq_t *q, *m;
   5.432 -    unsigned int x, i;
   5.433 -    int tid = (int)pthread_getspecific(tid_key);
   5.434 -
   5.435 -    DB("ENTER wait_recv %u\n", numreqs);
   5.436 -
   5.437 -    checkmatch:
   5.438 -    x = 0xffffffff;
   5.439 -    for (i = 0; i < numreqs; i++) {
   5.440 -        x &= reqs[i]->status;
   5.441 -    }
   5.442 -    if ((x & BSQ_STATUS_MATCHED)) {
   5.443 -        DB("LEAVE wait_recv\n");
   5.444 -        return numreqs;
   5.445 -    }
   5.446 -
   5.447 -    RECV_AWAIT(tid);
   5.448 -
   5.449 -    /*
   5.450 -    rxagain:
   5.451 -    ENTER_RECV_CR;
   5.452 -    q = recv_any();
   5.453 -    LEAVE_RECV_CR;
   5.454 -    if (!q)
   5.455 -        return -1;
   5.456 -
   5.457 -    m = queuesearch(q);
   5.458 -    recv_recycle_buffer(q);
   5.459 -    if (!m) {
   5.460 -        fprintf(stderr, "Unmatched RX\n");
   5.461 -        goto rxagain;
   5.462 -    }
   5.463 -    */
   5.464 -
   5.465 -    goto checkmatch;
   5.466 -
   5.467 -}
   5.468 -
   5.469 -/* retry
   5.470 - */
   5.471 -static int retry_count = 0;
   5.472 -int retry(bsq_t *qe)
   5.473 -{
   5.474 -    int rc;
   5.475 -    gettimeofday(&(qe->tv_sent), NULL);
   5.476 -    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
   5.477 -    retry_count++;
   5.478 -    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
   5.479 -    if (rc < 0)
   5.480 -        return rc;
   5.481 -    return 0;
   5.482 -}
   5.483 -
   5.484 -/* queue runner
   5.485 - */
   5.486 -void *queue_runner(void *arg)
   5.487 -{
   5.488 -    for (;;) {
   5.489 -        struct timeval now;
   5.490 -        long long nowus, sus;
   5.491 -        bsq_t *q;
   5.492 -        int r;
   5.493 -
   5.494 -        sleep(1);
   5.495 -
   5.496 -        gettimeofday(&now, NULL);
   5.497 -        nowus = now.tv_usec + now.tv_sec * 1000000;
   5.498 -        ENTER_QUEUE_CR;
   5.499 -        r = retry_count;
   5.500 -        for (q = bs_head; q; q = q->next) {
   5.501 -            sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000;
   5.502 -            if ((nowus - sus) > RETRY_TIMEOUT) {
   5.503 -                if (retry(q) < 0) {
   5.504 -                    fprintf(stderr, "Error on sendmsg retry.\n");
   5.505 -                }
   5.506 -            }
   5.507 -        }
   5.508 -        if (r != retry_count) {
   5.509 -            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
   5.510 -        }
   5.511 -        LEAVE_QUEUE_CR;
   5.512 -    }
   5.513 -}
   5.514 -
   5.515 -/* receive loop
   5.516 - */
   5.517 -void *receive_loop(void *arg)
   5.518 -{
   5.519 -    bsq_t *q, *m;
   5.520 -
   5.521 -    for(;;) {
   5.522 -        q = recv_any();
   5.523 -        if (!q) {
   5.524 -            fprintf(stderr, "recv_any error\n");
   5.525 -        }
   5.526 -        else {
   5.527 -            m = queuesearch(q);
   5.528 -            recv_recycle_buffer(q);
   5.529 -            if (!m) {
   5.530 -                fprintf(stderr, "Unmatched RX\n");
   5.531 -            }
   5.532 -            else {
   5.533 -                DB("RX MATCH");
   5.534 -                RECV_NOTIFY(m->tid);
   5.535 -            }
   5.536 -        }
   5.537 -    }
   5.538 -}
   5.539 -pthread_t pthread_recv;
   5.540 -
   5.541 -/*****************************************************************************
   5.542 - * Reading                                                                   *
   5.543 - *****************************************************************************/
   5.544 -
   5.545 -void *readblock_indiv(int server, u64 id) {
   5.546 -    void *block;
   5.547 -    bsq_t *qe;
   5.548 -    int len, rc;
   5.549 -
   5.550 -    qe = (bsq_t *)malloc(sizeof(bsq_t));
   5.551 -    if (!qe) {
   5.552 -        perror("readblock qe malloc");
   5.553 -        return NULL;
   5.554 -    }
   5.555 -    qe->block = NULL;
   5.556 -    
   5.557 -    /*
   5.558 -    qe->block = malloc(BLOCK_SIZE);
   5.559 -    if (!qe->block) {
   5.560 -        perror("readblock qe malloc");
   5.561 -        free((void *)qe);
   5.562 -        return NULL;
   5.563 -    }
   5.564 -    */
   5.565 -
   5.566 -    qe->server = server;
   5.567 -
   5.568 -    qe->message.operation = BSOP_READBLOCK;
   5.569 -    qe->message.flags = 0;
   5.570 -    qe->message.id = id;
   5.571 -    qe->length = MSGBUFSIZE_ID;
   5.572 -
   5.573 -    if (send_message(qe) < 0) {
   5.574 -        perror("readblock sendto");
   5.575 -        goto err;
   5.576 -    }
   5.577 -    
   5.578 -    /*len = recv_message(qe);
   5.579 -    if (len < 0) {
   5.580 -        perror("readblock recv");
   5.581 -        goto err;
   5.582 -    }*/
   5.583 -
   5.584 -    rc = wait_recv(&qe, 1);
   5.585 -    if (rc < 0) {
   5.586 -        perror("readblock recv");
   5.587 -        goto err;
   5.588 -    }
   5.589 -
   5.590 -    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
   5.591 -        fprintf(stderr, "readblock server error\n");
   5.592 -        goto err;
   5.593 -    }
   5.594 -    if (qe->length < MSGBUFSIZE_BLOCK) {
   5.595 -        fprintf(stderr, "readblock recv short (%u)\n", len);
   5.596 -        goto err;
   5.597 -    }
   5.598 -    /* if ((block = malloc(BLOCK_SIZE)) == NULL) {
   5.599 -        perror("readblock malloc");
   5.600 -        goto err;
   5.601 -    }
   5.602 -    memcpy(block, qe->message.block, BLOCK_SIZE);
   5.603 -    */    
   5.604 -    block = qe->block;
   5.605 -
   5.606 -    free((void *)qe);
   5.607 -    return block;
   5.608 -
   5.609 -    err:
   5.610 -    if (qe->block)
   5.611 -        free(qe->block);
   5.612 -    free((void *)qe);
   5.613 -    return NULL;
   5.614 -}
   5.615 -
   5.616 -/**
   5.617 - * readblock: read a block from disk
   5.618 - *   @id: block id to read
   5.619 - *
   5.620 - *   @return: pointer to block, NULL on error
   5.621 - */
   5.622 -void *readblock(u64 id) {
   5.623 -    int map = (int)BSID_MAP(id);
   5.624 -    u64 xid;
   5.625 -    static int i = CLUSTER_MAX_REPLICAS - 1;
   5.626 -    void *block = NULL;
   5.627 -
   5.628 -    /* special case for the "superblock" just use the first block on the
   5.629 -     * first replica. (extend to blocks < 6 for vdi bug)
   5.630 -     */
   5.631 -    if (id < 6) {
   5.632 -        block = readblock_indiv(bsclusters[map].servers[0], id);
   5.633 -        goto out;
   5.634 -    }
   5.635 -
   5.636 -    i++;
   5.637 -    if (i >= CLUSTER_MAX_REPLICAS)
   5.638 -        i = 0;
   5.639 -    switch (i) {
   5.640 -    case 0:
   5.641 -        xid = BSID_REPLICA0(id);
   5.642 -        break;
   5.643 -    case 1:
   5.644 -        xid = BSID_REPLICA1(id);
   5.645 -        break;
   5.646 -    case 2:
   5.647 -        xid = BSID_REPLICA2(id);
   5.648 -        break;
   5.649 -    }
   5.650 -    
   5.651 -    block = readblock_indiv(bsclusters[map].servers[i], xid);
   5.652 -
   5.653 -    out:
   5.654 -#ifdef BSDEBUG
   5.655 -    if (block)
   5.656 -        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   5.657 -                id,
   5.658 -                (unsigned int)((unsigned char *)block)[0],
   5.659 -                (unsigned int)((unsigned char *)block)[1],
   5.660 -                (unsigned int)((unsigned char *)block)[2],
   5.661 -                (unsigned int)((unsigned char *)block)[3],
   5.662 -                (unsigned int)((unsigned char *)block)[4],
   5.663 -                (unsigned int)((unsigned char *)block)[5],
   5.664 -                (unsigned int)((unsigned char *)block)[6],
   5.665 -                (unsigned int)((unsigned char *)block)[7]);
   5.666 -    else
   5.667 -        fprintf(stderr, "READ:  %016llx NULL\n", id);
   5.668 -#endif
   5.669 -    return block;
   5.670 -}
   5.671 -
   5.672 -/*****************************************************************************
   5.673 - * Writing                                                                   *
   5.674 - *****************************************************************************/
   5.675 -
   5.676 -bsq_t *writeblock_indiv(int server, u64 id, void *block) {
   5.677 -
   5.678 -    bsq_t *qe;
   5.679 -    int len;
   5.680 -
   5.681 -    qe = (bsq_t *)malloc(sizeof(bsq_t));
   5.682 -    if (!qe) {
   5.683 -        perror("writeblock qe malloc");
   5.684 -        goto err;
   5.685 -    }
   5.686 -    qe->server = server;
   5.687 -
   5.688 -    qe->message.operation = BSOP_WRITEBLOCK;
   5.689 -    qe->message.flags = 0;
   5.690 -    qe->message.id = id;
   5.691 -    //memcpy(qe->message.block, block, BLOCK_SIZE);
   5.692 -    qe->block = block;
   5.693 -    qe->length = MSGBUFSIZE_BLOCK;
   5.694 -
   5.695 -    if (send_message(qe) < 0) {
   5.696 -        perror("writeblock sendto");
   5.697 -        goto err;
   5.698 -    }
   5.699 -
   5.700 -    return qe;
   5.701 -
   5.702 -    err:
   5.703 -    free((void *)qe);
   5.704 -    return NULL;
   5.705 -}
   5.706 -    
   5.707 -
   5.708 -/**
   5.709 - * writeblock: write an existing block to disk
   5.710 - *   @id: block id
   5.711 - *   @block: pointer to block
   5.712 - *
   5.713 - *   @return: zero on success, -1 on failure
   5.714 - */
   5.715 -int writeblock(u64 id, void *block) {
   5.716 -    
   5.717 -    int map = (int)BSID_MAP(id);
   5.718 -    int rep0 = bsclusters[map].servers[0];
   5.719 -    int rep1 = bsclusters[map].servers[1];
   5.720 -    int rep2 = bsclusters[map].servers[2];
   5.721 -    bsq_t *reqs[3];
   5.722 -    int rc;
   5.723 -
   5.724 -    reqs[0] = reqs[1] = reqs[2] = NULL;
   5.725 -
   5.726 -#ifdef BSDEBUG
   5.727 -    fprintf(stderr,
   5.728 -            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   5.729 -            id,
   5.730 -            (unsigned int)((unsigned char *)block)[0],
   5.731 -            (unsigned int)((unsigned char *)block)[1],
   5.732 -            (unsigned int)((unsigned char *)block)[2],
   5.733 -            (unsigned int)((unsigned char *)block)[3],
   5.734 -            (unsigned int)((unsigned char *)block)[4],
   5.735 -            (unsigned int)((unsigned char *)block)[5],
   5.736 -            (unsigned int)((unsigned char *)block)[6],
   5.737 -            (unsigned int)((unsigned char *)block)[7]);
   5.738 -#endif
   5.739 -
   5.740 -    /* special case for the "superblock" just use the first block on the
   5.741 -     * first replica. (extend to blocks < 6 for vdi bug)
   5.742 -     */
   5.743 -    if (id < 6) {
   5.744 -        reqs[0] = writeblock_indiv(rep0, id, block);
   5.745 -        if (!reqs[0])
   5.746 -            return -1;
   5.747 -        rc = wait_recv(reqs, 1);
   5.748 -        return rc;
   5.749 -    }
   5.750 -
   5.751 -    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
   5.752 -    if (!reqs[0])
   5.753 -        goto err;
   5.754 -    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
   5.755 -    if (!reqs[1])
   5.756 -        goto err;
   5.757 -    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
   5.758 -    if (!reqs[2])
   5.759 -        goto err;
   5.760 -
   5.761 -    rc = wait_recv(reqs, 3);
   5.762 -    if (rc < 0) {
   5.763 -        perror("writeblock recv");
   5.764 -        goto err;
   5.765 -    }
   5.766 -    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
   5.767 -        fprintf(stderr, "writeblock server0 error\n");
   5.768 -        goto err;
   5.769 -    }
   5.770 -    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
   5.771 -        fprintf(stderr, "writeblock server1 error\n");
   5.772 -        goto err;
   5.773 -    }
   5.774 -    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
   5.775 -        fprintf(stderr, "writeblock server2 error\n");
   5.776 -        goto err;
   5.777 -    }
   5.778 -
   5.779 -
   5.780 -    free((void *)reqs[0]);
   5.781 -    free((void *)reqs[1]);
   5.782 -    free((void *)reqs[2]);
   5.783 -    return 0;
   5.784 -
   5.785 -    err:
   5.786 -    if (reqs[0]) {
   5.787 -        dequeue(reqs[0]);
   5.788 -        free((void *)reqs[0]);
   5.789 -    }
   5.790 -    if (reqs[1]) {
   5.791 -        dequeue(reqs[1]);
   5.792 -        free((void *)reqs[1]);
   5.793 -    }
   5.794 -    if (reqs[2]) {
   5.795 -        dequeue(reqs[2]);
   5.796 -        free((void *)reqs[2]);
   5.797 -    }
   5.798 -    return -1;
   5.799 -}
   5.800 -
   5.801 -/*****************************************************************************
   5.802 - * Allocation                                                                *
   5.803 - *****************************************************************************/
   5.804 -
   5.805 -/**
   5.806 - * allocblock: write a new block to disk
   5.807 - *   @block: pointer to block
   5.808 - *
   5.809 - *   @return: new id of block on disk
   5.810 - */
   5.811 -u64 allocblock(void *block) {
   5.812 -    return allocblock_hint(block, 0);
   5.813 -}
   5.814 -
   5.815 -bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
   5.816 -    bsq_t *qe;
   5.817 -    int len;
   5.818 -
   5.819 -    qe = (bsq_t *)malloc(sizeof(bsq_t));
   5.820 -    if (!qe) {
   5.821 -        perror("allocblock_hint qe malloc");
   5.822 -        goto err;
   5.823 -    }
   5.824 -    qe->server = server;
   5.825 -
   5.826 -    qe->message.operation = BSOP_ALLOCBLOCK;
   5.827 -    qe->message.flags = 0;
   5.828 -    qe->message.id = hint;
   5.829 -    //memcpy(qe->message.block, block, BLOCK_SIZE);
   5.830 -    qe->block = block;
   5.831 -    qe->length = MSGBUFSIZE_BLOCK;
   5.832 -
   5.833 -    if (send_message(qe) < 0) {
   5.834 -        perror("allocblock_hint sendto");
   5.835 -        goto err;
   5.836 -    }
   5.837 -    
   5.838 -    return qe;
   5.839 -
   5.840 -    err:
   5.841 -    free((void *)qe);
   5.842 -    return NULL;
   5.843 -}
   5.844 -
   5.845 -/**
   5.846 - * allocblock_hint: write a new block to disk
   5.847 - *   @block: pointer to block
   5.848 - *   @hint: allocation hint
   5.849 - *
   5.850 - *   @return: new id of block on disk
   5.851 - */
   5.852 -u64 allocblock_hint(void *block, u64 hint) {
   5.853 -    int map = (int)hint;
   5.854 -    int rep0 = bsclusters[map].servers[0];
   5.855 -    int rep1 = bsclusters[map].servers[1];
   5.856 -    int rep2 = bsclusters[map].servers[2];
   5.857 -    bsq_t *reqs[3];
   5.858 -    int rc;
   5.859 -    u64 id0, id1, id2;
   5.860 -
   5.861 -    reqs[0] = reqs[1] = reqs[2] = NULL;
   5.862 -
   5.863 -    DB("ENTER allocblock\n");
   5.864 -
   5.865 -    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
   5.866 -    if (!reqs[0])
   5.867 -        goto err;
   5.868 -    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
   5.869 -    if (!reqs[1])
   5.870 -        goto err;
   5.871 -    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
   5.872 -    if (!reqs[2])
   5.873 -        goto err;
   5.874 -
   5.875 -    rc = wait_recv(reqs, 3);
   5.876 -    if (rc < 0) {
   5.877 -        perror("allocblock recv");
   5.878 -        goto err;
   5.879 -    }
   5.880 -    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
   5.881 -        fprintf(stderr, "allocblock server0 error\n");
   5.882 -        goto err;
   5.883 -    }
   5.884 -    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
   5.885 -        fprintf(stderr, "allocblock server1 error\n");
   5.886 -        goto err;
   5.887 -    }
   5.888 -    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
   5.889 -        fprintf(stderr, "allocblock server2 error\n");
   5.890 -        goto err;
   5.891 -    }
   5.892 -
   5.893 -    id0 = reqs[0]->message.id;
   5.894 -    id1 = reqs[1]->message.id;
   5.895 -    id2 = reqs[2]->message.id;
   5.896 -
   5.897 -#ifdef BSDEBUG
   5.898 -    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   5.899 -            BSID(map, id0, id1, id2),
   5.900 -            (unsigned int)((unsigned char *)block)[0],
   5.901 -            (unsigned int)((unsigned char *)block)[1],
   5.902 -            (unsigned int)((unsigned char *)block)[2],
   5.903 -            (unsigned int)((unsigned char *)block)[3],
   5.904 -            (unsigned int)((unsigned char *)block)[4],
   5.905 -            (unsigned int)((unsigned char *)block)[5],
   5.906 -            (unsigned int)((unsigned char *)block)[6],
   5.907 -            (unsigned int)((unsigned char *)block)[7]);
   5.908 -#endif
   5.909 -    
   5.910 -    free((void *)reqs[0]);
   5.911 -    free((void *)reqs[1]);
   5.912 -    free((void *)reqs[2]);
   5.913 -    return BSID(map, id0, id1, id2);
   5.914 -
   5.915 -    err:
   5.916 -    if (reqs[0]) {
   5.917 -        dequeue(reqs[0]);
   5.918 -        free((void *)reqs[0]);
   5.919 -    }
   5.920 -    if (reqs[1]) {
   5.921 -        dequeue(reqs[1]);
   5.922 -        free((void *)reqs[1]);
   5.923 -    }
   5.924 -    if (reqs[2]) {
   5.925 -        dequeue(reqs[2]);
   5.926 -        free((void *)reqs[2]);
   5.927 -    }
   5.928 -    return 0;
   5.929 -}
   5.930 -
   5.931 -#else /* /BLOCKSTORE_REMOTE */
   5.932 -
   5.933 -/*****************************************************************************
   5.934 - * Local storage version                                                     *
   5.935 - *****************************************************************************/
   5.936 - 
   5.937 -/**
   5.938 - * readblock: read a block from disk
   5.939 - *   @id: block id to read
   5.940 - *
   5.941 - *   @return: pointer to block, NULL on error
   5.942 - */
   5.943 -
   5.944 -void *readblock(u64 id) {
   5.945 -    void *block;
   5.946 -    int block_fp;
   5.947 -   
   5.948 -//printf("readblock(%llu)\n", id); 
   5.949 -    block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
   5.950 -
   5.951 -    if (block_fp < 0) {
   5.952 -        perror("open");
   5.953 -        return NULL;
   5.954 -    }
   5.955 -    
   5.956 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   5.957 -        printf ("%Ld ", id);
   5.958 -        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
   5.959 -        perror("readblock lseek");
   5.960 -        goto err;
   5.961 -    }
   5.962 -    if ((block = malloc(BLOCK_SIZE)) == NULL) {
   5.963 -        perror("readblock malloc");
   5.964 -        goto err;
   5.965 -    }
   5.966 -    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   5.967 -        perror("readblock read");
   5.968 -        free(block);
   5.969 -        goto err;
   5.970 -    }
   5.971 -    close(block_fp);
   5.972 -    return block;
   5.973 -    
   5.974 -err:
   5.975 -    close(block_fp);
   5.976 -    return NULL;
   5.977 -}
   5.978 -
   5.979 -/**
   5.980 - * writeblock: write an existing block to disk
   5.981 - *   @id: block id
   5.982 - *   @block: pointer to block
   5.983 - *
   5.984 - *   @return: zero on success, -1 on failure
   5.985 - */
   5.986 -int writeblock(u64 id, void *block) {
   5.987 -    
   5.988 -    int block_fp;
   5.989 -    
   5.990 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
   5.991 -
   5.992 -    if (block_fp < 0) {
   5.993 -        perror("open");
   5.994 -        return -1;
   5.995 -    }
   5.996 -
   5.997 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   5.998 -        perror("writeblock lseek");
   5.999 -        goto err;
  5.1000 -    }
  5.1001 -    if (write(block_fp, block, BLOCK_SIZE) < 0) {
  5.1002 -        perror("writeblock write");
  5.1003 -        goto err;
  5.1004 -    }
  5.1005 -    close(block_fp);
  5.1006 -    return 0;
  5.1007 -
  5.1008 -err:
  5.1009 -    close(block_fp);
  5.1010 -    return -1;
  5.1011 -}
  5.1012 -
  5.1013 -/**
  5.1014 - * allocblock: write a new block to disk
  5.1015 - *   @block: pointer to block
  5.1016 - *
  5.1017 - *   @return: new id of block on disk
  5.1018 - */
  5.1019 -
  5.1020 -u64 allocblock(void *block) {
  5.1021 -    u64 lb;
  5.1022 -    off64_t pos;
  5.1023 -    int block_fp;
  5.1024 -    
  5.1025 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  5.1026 -
  5.1027 -    if (block_fp < 0) {
  5.1028 -        perror("open");
  5.1029 -        return 0;
  5.1030 -    }
  5.1031 -
  5.1032 -    pos = lseek64(block_fp, 0, SEEK_END);
  5.1033 -    if (pos == (off64_t)-1) {
  5.1034 -        perror("allocblock lseek");
  5.1035 -        goto err;
  5.1036 -    }
  5.1037 -    if (pos % BLOCK_SIZE != 0) {
  5.1038 -        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
  5.1039 -        goto err;
  5.1040 -    }
  5.1041 -    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  5.1042 -        perror("allocblock write");
  5.1043 -        goto err;
  5.1044 -    }
  5.1045 -    lb = pos / BLOCK_SIZE + 1;
  5.1046 -//printf("alloc(%Ld)\n", lb);
  5.1047 -    close(block_fp);
  5.1048 -    return lb;
  5.1049 -    
  5.1050 -err:
  5.1051 -    close(block_fp);
  5.1052 -    return 0;
  5.1053 -    
  5.1054 -}
  5.1055 -
  5.1056 -/**
  5.1057 - * allocblock_hint: write a new block to disk
  5.1058 - *   @block: pointer to block
  5.1059 - *   @hint: allocation hint
  5.1060 - *
  5.1061 - *   @return: new id of block on disk
  5.1062 - */
  5.1063 -u64 allocblock_hint(void *block, u64 hint) {
  5.1064 -    return allocblock(block);
  5.1065 -}
  5.1066 -
  5.1067 -#endif /* BLOCKSTORE_REMOTE */
  5.1068 -
  5.1069 -/*****************************************************************************
  5.1070 - * Memory management                                                         *
  5.1071 - *****************************************************************************/
  5.1072 -
  5.1073 -/**
  5.1074 - * newblock: get a new in-memory block set to zeros
  5.1075 - *
  5.1076 - *   @return: pointer to new block, NULL on error
  5.1077 - */
  5.1078 -void *newblock() {
  5.1079 -    void *block = malloc(BLOCK_SIZE);
  5.1080 -    if (block == NULL) {
  5.1081 -        perror("newblock");
  5.1082 -        return NULL;
  5.1083 -    }
  5.1084 -    memset(block, 0, BLOCK_SIZE);
  5.1085 -    return block;
  5.1086 -}
  5.1087 -
  5.1088 -
  5.1089 -/**
  5.1090 - * freeblock: unallocate an in-memory block
  5.1091 - *   @id: block id (zero if this is only in-memory)
  5.1092 - *   @block: block to be freed
  5.1093 - */
  5.1094 -void freeblock(void *block) {
  5.1095 -    if (block != NULL)
  5.1096 -        free(block);
  5.1097 -}
  5.1098 -
  5.1099 -static freeblock_t *new_freeblock(void)
  5.1100 -{
  5.1101 -    freeblock_t *fb;
  5.1102 -    
  5.1103 -    fb = newblock();
  5.1104 -    
  5.1105 -    if (fb == NULL) return NULL;
  5.1106 -    
  5.1107 -    fb->magic = FREEBLOCK_MAGIC;
  5.1108 -    fb->next  = 0ULL;
  5.1109 -    fb->count = 0ULL;
  5.1110 -    memset(fb->list, 0, sizeof fb->list);
  5.1111 -    
  5.1112 -    return fb;
  5.1113 -}
  5.1114 -
  5.1115 -void releaseblock(u64 id)
  5.1116 -{
  5.1117 -    blockstore_super_t *bs_super;
  5.1118 -    freeblock_t *fl_current;
  5.1119 -    
  5.1120 -    /* get superblock */
  5.1121 -    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  5.1122 -    
  5.1123 -    /* get freeblock_current */
  5.1124 -    if (bs_super->freelist_current == 0ULL) 
  5.1125 -    {
  5.1126 -        fl_current = new_freeblock();
  5.1127 -        bs_super->freelist_current = allocblock(fl_current);
  5.1128 -        writeblock(BLOCKSTORE_SUPER, bs_super);
  5.1129 -    } else {
  5.1130 -        fl_current = readblock(bs_super->freelist_current);
  5.1131 -    }
  5.1132 -    
  5.1133 -    /* if full, chain to superblock and allocate new current */
  5.1134 -    
  5.1135 -    if (fl_current->count == FREEBLOCK_SIZE) {
  5.1136 -        fl_current->next = bs_super->freelist_full;
  5.1137 -        writeblock(bs_super->freelist_current, fl_current);
  5.1138 -        bs_super->freelist_full = bs_super->freelist_current;
  5.1139 -        freeblock(fl_current);
  5.1140 -        fl_current = new_freeblock();
  5.1141 -        bs_super->freelist_current = allocblock(fl_current);
  5.1142 -        writeblock(BLOCKSTORE_SUPER, bs_super);
  5.1143 -    }
  5.1144 -    
  5.1145 -    /* append id to current */
  5.1146 -    fl_current->list[fl_current->count++] = id;
  5.1147 -    writeblock(bs_super->freelist_current, fl_current);
  5.1148 -    
  5.1149 -    freeblock(fl_current);
  5.1150 -    freeblock(bs_super);
  5.1151 -    
  5.1152 -    
  5.1153 -}
  5.1154 -
  5.1155 -/* freelist debug functions: */
  5.1156 -void freelist_count(int print_each)
  5.1157 -{
  5.1158 -    blockstore_super_t *bs_super;
  5.1159 -    freeblock_t *fb;
  5.1160 -    u64 total = 0, next;
  5.1161 -    
  5.1162 -    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  5.1163 -    
  5.1164 -    if (bs_super->freelist_current == 0ULL) {
  5.1165 -        printf("freelist is empty!\n");
  5.1166 -        return;
  5.1167 -    }
  5.1168 -    
  5.1169 -    fb = readblock(bs_super->freelist_current);
  5.1170 -    printf("%Ld entires on current.\n", fb->count);
  5.1171 -    total += fb->count;
  5.1172 -    if (print_each == 1)
  5.1173 -    {
  5.1174 -        int i;
  5.1175 -        for (i=0; i< fb->count; i++)
  5.1176 -            printf("  %Ld\n", fb->list[i]);
  5.1177 -    }
  5.1178 -    
  5.1179 -    freeblock(fb);
  5.1180 -    
  5.1181 -    if (bs_super->freelist_full == 0ULL) {
  5.1182 -        printf("freelist_full is empty!\n");
  5.1183 -        return;
  5.1184 -    }
  5.1185 -    
  5.1186 -    next = bs_super->freelist_full;
  5.1187 -    for (;;) {
  5.1188 -        fb = readblock(next);
  5.1189 -        total += fb->count;
  5.1190 -        if (print_each == 1)
  5.1191 -        {
  5.1192 -            int i;
  5.1193 -            for (i=0; i< fb->count; i++)
  5.1194 -                printf("  %Ld\n", fb->list[i]);
  5.1195 -        }
  5.1196 -        next = fb->next;
  5.1197 -        freeblock(fb);
  5.1198 -        if (next == 0ULL) break;
  5.1199 -    }
  5.1200 -    printf("Total of %Ld ids on freelist.\n", total);
  5.1201 -}
  5.1202 -
  5.1203 -/*****************************************************************************
  5.1204 - * Initialisation                                                            *
  5.1205 - *****************************************************************************/
  5.1206 -
  5.1207 -int __init_blockstore(void)
  5.1208 -{
  5.1209 -    int i;
  5.1210 -    blockstore_super_t *bs_super;
  5.1211 -    u64 ret;
  5.1212 -    int block_fp;
  5.1213 -    
  5.1214 -#ifdef BLOCKSTORE_REMOTE
  5.1215 -    struct hostent *addr;
  5.1216 -
  5.1217 -    pthread_mutex_init(&ptmutex_queue, NULL);
  5.1218 -    pthread_mutex_init(&ptmutex_luid, NULL);
  5.1219 -    pthread_mutex_init(&ptmutex_recv, NULL);
  5.1220 -    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
  5.1221 -    for (i = 0; i <= READ_POOL_SIZE; i++) {
  5.1222 -        pool_thread[i].newdata = 0;
  5.1223 -        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
  5.1224 -        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
  5.1225 -    }
  5.1226 -
  5.1227 -    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
  5.1228 -    bsservers[1].hostname = "planb.cl.cam.ac.uk";
  5.1229 -    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
  5.1230 -    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
  5.1231 -    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
  5.1232 -    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
  5.1233 -    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
  5.1234 -    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
  5.1235 -    bsservers[8].hostname = NULL;
  5.1236 -    bsservers[9].hostname = NULL;
  5.1237 -    bsservers[10].hostname = NULL;
  5.1238 -    bsservers[11].hostname = NULL;
  5.1239 -    bsservers[12].hostname = NULL;
  5.1240 -    bsservers[13].hostname = NULL;
  5.1241 -    bsservers[14].hostname = NULL;
  5.1242 -    bsservers[15].hostname = NULL;
  5.1243 -
  5.1244 -    for (i = 0; i < MAX_SERVERS; i++) {
  5.1245 -        if (!bsservers[i].hostname)
  5.1246 -            continue;
  5.1247 -        addr = gethostbyname(bsservers[i].hostname);
  5.1248 -        if (!addr) {
  5.1249 -            perror("bad hostname");
  5.1250 -            return -1;
  5.1251 -        }
  5.1252 -        bsservers[i].sin.sin_family = addr->h_addrtype;
  5.1253 -        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
  5.1254 -        bsservers[i].sin.sin_addr.s_addr = 
  5.1255 -            ((struct in_addr *)(addr->h_addr))->s_addr;
  5.1256 -    }
  5.1257 -
  5.1258 -    /* Cluster map
  5.1259 -     */
  5.1260 -    bsclusters[0].servers[0] = 0;
  5.1261 -    bsclusters[0].servers[1] = 1;
  5.1262 -    bsclusters[0].servers[2] = 2;
  5.1263 -    bsclusters[1].servers[0] = 1;
  5.1264 -    bsclusters[1].servers[1] = 2;
  5.1265 -    bsclusters[1].servers[2] = 3;
  5.1266 -    bsclusters[2].servers[0] = 2;
  5.1267 -    bsclusters[2].servers[1] = 3;
  5.1268 -    bsclusters[2].servers[2] = 4;
  5.1269 -    bsclusters[3].servers[0] = 3;
  5.1270 -    bsclusters[3].servers[1] = 4;
  5.1271 -    bsclusters[3].servers[2] = 5;
  5.1272 -    bsclusters[4].servers[0] = 4;
  5.1273 -    bsclusters[4].servers[1] = 5;
  5.1274 -    bsclusters[4].servers[2] = 6;
  5.1275 -    bsclusters[5].servers[0] = 5;
  5.1276 -    bsclusters[5].servers[1] = 6;
  5.1277 -    bsclusters[5].servers[2] = 7;
  5.1278 -    bsclusters[6].servers[0] = 6;
  5.1279 -    bsclusters[6].servers[1] = 7;
  5.1280 -    bsclusters[6].servers[2] = 0;
  5.1281 -    bsclusters[7].servers[0] = 7;
  5.1282 -    bsclusters[7].servers[1] = 0;
  5.1283 -    bsclusters[7].servers[2] = 1;
  5.1284 -
  5.1285 -    /* Local socket set up
  5.1286 -     */
  5.1287 -    bssock = socket(AF_INET, SOCK_DGRAM, 0);
  5.1288 -    if (bssock < 0) {
  5.1289 -        perror("Bad socket");
  5.1290 -        return -1;
  5.1291 -    }
  5.1292 -    memset(&sin_local, 0, sizeof(sin_local));
  5.1293 -    sin_local.sin_family = AF_INET;
  5.1294 -    sin_local.sin_port = htons(BLOCKSTORED_PORT);
  5.1295 -    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
  5.1296 -    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
  5.1297 -        perror("bind");
  5.1298 -        close(bssock);
  5.1299 -        return -1;
  5.1300 -    }
  5.1301 -
  5.1302 -    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
  5.1303 -    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
  5.1304 -
  5.1305 -#else /* /BLOCKSTORE_REMOTE */
  5.1306 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  5.1307 -
  5.1308 -    if (block_fp < 0) {
  5.1309 -        perror("open");
  5.1310 -        return -1;
  5.1311 -        exit(-1);
  5.1312 -    }
  5.1313 -    
  5.1314 -    if (lseek(block_fp, 0, SEEK_END) == 0) {
  5.1315 -        bs_super = newblock();
  5.1316 -        bs_super->magic            = BLOCKSTORE_MAGIC;
  5.1317 -        bs_super->freelist_full    = 0LL;
  5.1318 -        bs_super->freelist_current = 0LL;
  5.1319 -        
  5.1320 -        ret = allocblock(bs_super);
  5.1321 -        
  5.1322 -        freeblock(bs_super);
  5.1323 -    } else {
  5.1324 -        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  5.1325 -        if (bs_super->magic != BLOCKSTORE_MAGIC)
  5.1326 -        {
  5.1327 -            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
  5.1328 -            exit(-1);
  5.1329 -        }
  5.1330 -        freeblock(bs_super);
  5.1331 -    }
  5.1332 -        
  5.1333 -    close(block_fp);
  5.1334 -        
  5.1335 -#endif /*  BLOCKSTORE_REMOTE */   
  5.1336 -    return 0;
  5.1337 -}
  5.1338 -
  5.1339 -void __exit_blockstore(void)
  5.1340 -{
  5.1341 -    int i;
  5.1342 -#ifdef BLOCKSTORE_REMOTE
  5.1343 -    pthread_mutex_destroy(&ptmutex_recv);
  5.1344 -    pthread_mutex_destroy(&ptmutex_luid);
  5.1345 -    pthread_mutex_destroy(&ptmutex_queue);
  5.1346 -    /*pthread_mutex_destroy(&ptmutex_notify);
  5.1347 -      pthread_cond_destroy(&ptcv_notify);*/
  5.1348 -    for (i = 0; i <= READ_POOL_SIZE; i++) {
  5.1349 -        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
  5.1350 -        pthread_cond_destroy(&(pool_thread[i].ptcv));
  5.1351 -    }
  5.1352 -#endif
  5.1353 -}
     6.1 --- a/tools/blktap/blockstore.h	Sun Jul 03 22:32:52 2005 +0000
     6.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.3 @@ -1,134 +0,0 @@
     6.4 -/**************************************************************************
     6.5 - * 
     6.6 - * blockstore.h
     6.7 - *
     6.8 - * Simple block store interface
     6.9 - *
    6.10 - */
    6.11 - 
    6.12 -#ifndef __BLOCKSTORE_H__
    6.13 -#define __BLOCKSTORE_H__
    6.14 -
    6.15 -#include <netinet/in.h>
    6.16 -#include <xc.h>
    6.17 -
    6.18 -#define BLOCK_SIZE  4096
    6.19 -#define BLOCK_SHIFT   12
    6.20 -#define BLOCK_MASK  0xfffffffffffff000LL
    6.21 -
    6.22 -/* XXX SMH: where is the below supposed to be defined???? */
    6.23 -#ifndef SECTOR_SHIFT 
    6.24 -#define SECTOR_SHIFT   9 
    6.25 -#endif
    6.26 -
    6.27 -#define FREEBLOCK_SIZE  (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))
    6.28 -#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
    6.29 -
    6.30 -typedef struct {
    6.31 -    u64 magic;
    6.32 -    u64 next;
    6.33 -    u64 count;
    6.34 -    u64 list[FREEBLOCK_SIZE];
    6.35 -} freeblock_t; 
    6.36 -
    6.37 -#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
    6.38 -#define BLOCKSTORE_SUPER 1ULL
    6.39 -
    6.40 -typedef struct {
    6.41 -    u64 magic;
    6.42 -    u64 freelist_full;
    6.43 -    u64 freelist_current;
    6.44 -} blockstore_super_t;
    6.45 -
    6.46 -extern void *newblock();
    6.47 -extern void *readblock(u64 id);
    6.48 -extern u64 allocblock(void *block);
    6.49 -extern u64 allocblock_hint(void *block, u64 hint);
    6.50 -extern int writeblock(u64 id, void *block);
    6.51 -
    6.52 -/* Add this blockid to a freelist, to be recycled by the allocator. */
    6.53 -extern void releaseblock(u64 id);
    6.54 -
    6.55 -/* this is a memory free() operation for block-sized allocations */
    6.56 -extern void freeblock(void *block);
    6.57 -extern int __init_blockstore(void);
    6.58 -
    6.59 -/* debug for freelist. */
    6.60 -void freelist_count(int print_each);
    6.61 -#define ALLOCFAIL (((u64)(-1)))
    6.62 -
    6.63 -/* Distribution
    6.64 - */
    6.65 -#define BLOCKSTORED_PORT 9346
    6.66 -
    6.67 -struct bshdr_t_struct {
    6.68 -    u32            operation;
    6.69 -    u32            flags;
    6.70 -    u64            id;
    6.71 -    u64            luid;
    6.72 -} __attribute__ ((packed));
    6.73 -typedef struct bshdr_t_struct bshdr_t;
    6.74 -
    6.75 -struct bsmsg_t_struct {
    6.76 -    bshdr_t        hdr;
    6.77 -    unsigned char  block[BLOCK_SIZE];
    6.78 -} __attribute__ ((packed));
    6.79 -
    6.80 -typedef struct bsmsg_t_struct bsmsg_t;
    6.81 -
    6.82 -#define MSGBUFSIZE_OP    sizeof(u32)
    6.83 -#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
    6.84 -#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
    6.85 -#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
    6.86 -
    6.87 -#define BSOP_READBLOCK  0x01
    6.88 -#define BSOP_WRITEBLOCK 0x02
    6.89 -#define BSOP_ALLOCBLOCK 0x03
    6.90 -#define BSOP_FREEBLOCK  0x04
    6.91 -
    6.92 -#define BSOP_FLAG_ERROR 0x01
    6.93 -
    6.94 -#define BS_ALLOC_SKIP 10
    6.95 -#define BS_ALLOC_HACK
    6.96 -
    6.97 -/* Remote hosts and cluster map - XXX need to generalise
    6.98 - */
    6.99 -
   6.100 -/*
   6.101 -
   6.102 -  Interim ID format is
   6.103 -
   6.104 -  63 60 59                40 39                20 19                 0
   6.105 -  +----+--------------------+--------------------+--------------------+
   6.106 -  |map | replica 2          | replica 1          | replica 0          |
   6.107 -  +----+--------------------+--------------------+--------------------+
   6.108 -
   6.109 -  The map is an index into a table detailing which machines form the
   6.110 -  cluster.
   6.111 -
   6.112 - */
   6.113 -
   6.114 -#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
   6.115 -#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
   6.116 -#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
   6.117 -#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
   6.118 -
   6.119 -#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
   6.120 -                                         (((u64)(_rep2))<<40) | \
   6.121 -                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
   6.122 -
   6.123 -typedef struct bsserver_t_struct {
   6.124 -    char              *hostname;
   6.125 -    struct sockaddr_in sin;
   6.126 -} bsserver_t;
   6.127 -
   6.128 -#define MAX_SERVERS 16
   6.129 -
   6.130 -#define CLUSTER_MAX_REPLICAS 3
   6.131 -typedef struct bscluster_t_struct {
   6.132 -    int servers[CLUSTER_MAX_REPLICAS];
   6.133 -} bscluster_t;
   6.134 -
   6.135 -#define MAX_CLUSTERS 16
   6.136 -
   6.137 -#endif /* __BLOCKSTORE_H__ */
     7.1 --- a/tools/blktap/blockstored.c	Sun Jul 03 22:32:52 2005 +0000
     7.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.3 @@ -1,276 +0,0 @@
     7.4 -/**************************************************************************
     7.5 - * 
     7.6 - * blockstored.c
     7.7 - *
     7.8 - * Block store daemon.
     7.9 - *
    7.10 - */
    7.11 -
    7.12 -#include <fcntl.h>
    7.13 -#include <unistd.h>
    7.14 -#include <stdio.h>
    7.15 -#include <stdlib.h>
    7.16 -#include <string.h>
    7.17 -#include <sys/types.h>
    7.18 -#include <sys/stat.h>
    7.19 -#include <sys/socket.h>
    7.20 -#include <sys/ioctl.h>
    7.21 -#include <netinet/in.h>
    7.22 -#include <errno.h>
    7.23 -#include "blockstore.h"
    7.24 -
    7.25 -//#define BSDEBUG
    7.26 -
    7.27 -int readblock_into(u64 id, void *block);
    7.28 -
    7.29 -int open_socket(u16 port) {
    7.30 -    
    7.31 -    struct sockaddr_in sn;
    7.32 -    int sock;
    7.33 -
    7.34 -    sock = socket(AF_INET, SOCK_DGRAM, 0);
    7.35 -    if (sock < 0) {
    7.36 -        perror("Bad socket");
    7.37 -        return -1;
    7.38 -    }
    7.39 -    memset(&sn, 0, sizeof(sn));
    7.40 -    sn.sin_family = AF_INET;
    7.41 -    sn.sin_port = htons(port);
    7.42 -    sn.sin_addr.s_addr = htonl(INADDR_ANY);
    7.43 -    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
    7.44 -        perror("bind");
    7.45 -        close(sock);
    7.46 -        return -1;
    7.47 -    }
    7.48 -
    7.49 -    return sock;
    7.50 -}
    7.51 -
    7.52 -static int block_fp = -1;
    7.53 -static int bssock = -1;
    7.54 -
    7.55 -int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
    7.56 -
    7.57 -    int rc;
    7.58 -    
    7.59 -#ifdef BSDEBUG
    7.60 -    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
    7.61 -            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
    7.62 -#endif
    7.63 -    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
    7.64 -    if (rc < 0) {
    7.65 -        perror("send_reply");
    7.66 -        return 1;
    7.67 -    }
    7.68 -
    7.69 -
    7.70 -    return 0;
    7.71 -}
    7.72 -
    7.73 -static bsmsg_t msgbuf;
    7.74 -
    7.75 -void service_loop(void) {
    7.76 -
    7.77 -    for (;;) {
    7.78 -        int rc, len;
    7.79 -        struct sockaddr_in from;
    7.80 -        size_t slen = sizeof(from);
    7.81 -        u64 bid;
    7.82 -
    7.83 -        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
    7.84 -                       (struct sockaddr *)&from, &slen);
    7.85 -
    7.86 -        if (len < 0) {
    7.87 -            perror("recvfrom");
    7.88 -            continue;
    7.89 -        }
    7.90 -
    7.91 -        if (len < MSGBUFSIZE_OP) {
    7.92 -            fprintf(stderr, "Short packet.\n");
    7.93 -            continue;
    7.94 -        }
    7.95 -
    7.96 -#ifdef BSDEBUG
    7.97 -        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
    7.98 -                len, msgbuf.hdr.operation, msgbuf.hdr.id);
    7.99 -#endif
   7.100 -
   7.101 -        switch (msgbuf.hdr.operation) {
   7.102 -        case BSOP_READBLOCK:
   7.103 -            if (len < MSGBUFSIZE_ID) {
   7.104 -                fprintf(stderr, "Short packet (readblock %u).\n", len);
   7.105 -                continue;
   7.106 -            }
   7.107 -            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
   7.108 -            if (rc < 0) {
   7.109 -                fprintf(stderr, "readblock error\n");
   7.110 -                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
   7.111 -                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   7.112 -                continue;
   7.113 -            }
   7.114 -            msgbuf.hdr.flags = 0;
   7.115 -            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
   7.116 -            break;
   7.117 -        case BSOP_WRITEBLOCK:
   7.118 -            if (len < MSGBUFSIZE_BLOCK) {
   7.119 -                fprintf(stderr, "Short packet (writeblock %u).\n", len);
   7.120 -                continue;
   7.121 -            }
   7.122 -            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
   7.123 -            if (rc < 0) {
   7.124 -                fprintf(stderr, "writeblock error\n");
   7.125 -                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
   7.126 -                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   7.127 -                continue;
   7.128 -            }
   7.129 -            msgbuf.hdr.flags = 0;
   7.130 -            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   7.131 -            break;
   7.132 -        case BSOP_ALLOCBLOCK:
   7.133 -            if (len < MSGBUFSIZE_BLOCK) {
   7.134 -                fprintf(stderr, "Short packet (allocblock %u).\n", len);
   7.135 -                continue;
   7.136 -            }
   7.137 -            bid = allocblock(msgbuf.block);
   7.138 -            if (bid == ALLOCFAIL) {
   7.139 -                fprintf(stderr, "allocblock error\n");
   7.140 -                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
   7.141 -                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   7.142 -                continue;
   7.143 -            }
   7.144 -            msgbuf.hdr.id = bid;
   7.145 -            msgbuf.hdr.flags = 0;
   7.146 -            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   7.147 -            break;
   7.148 -        }
   7.149 -
   7.150 -    }
   7.151 -}
   7.152 - 
   7.153 -/**
   7.154 - * readblock: read a block from disk
   7.155 - *   @id: block id to read
   7.156 - *   @block: pointer to buffer to receive block
   7.157 - *
   7.158 - *   @return: 0 if OK, other on error
   7.159 - */
   7.160 -
   7.161 -int readblock_into(u64 id, void *block) {
   7.162 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   7.163 -        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
   7.164 -        perror("readblock lseek");
   7.165 -        return -1;
   7.166 -    }
   7.167 -    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   7.168 -        perror("readblock read");
   7.169 -        return -1;
   7.170 -    }
   7.171 -    return 0;
   7.172 -}
   7.173 -
   7.174 -/**
   7.175 - * writeblock: write an existing block to disk
   7.176 - *   @id: block id
   7.177 - *   @block: pointer to block
   7.178 - *
   7.179 - *   @return: zero on success, -1 on failure
   7.180 - */
   7.181 -int writeblock(u64 id, void *block) {
   7.182 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   7.183 -        perror("writeblock lseek");
   7.184 -        return -1;
   7.185 -    }
   7.186 -    if (write(block_fp, block, BLOCK_SIZE) < 0) {
   7.187 -        perror("writeblock write");
   7.188 -        return -1;
   7.189 -    }
   7.190 -    return 0;
   7.191 -}
   7.192 -
   7.193 -/**
   7.194 - * allocblock: write a new block to disk
   7.195 - *   @block: pointer to block
   7.196 - *
   7.197 - *   @return: new id of block on disk
   7.198 - */
   7.199 -static u64 lastblock = 0;
   7.200 -
   7.201 -u64 allocblock(void *block) {
   7.202 -    u64 lb;
   7.203 -    off64_t pos;
   7.204 -
   7.205 -    retry:
   7.206 -    pos = lseek64(block_fp, 0, SEEK_END);
   7.207 -    if (pos == (off64_t)-1) {
   7.208 -        perror("allocblock lseek");
   7.209 -        return ALLOCFAIL;
   7.210 -    }
   7.211 -    if (pos % BLOCK_SIZE != 0) {
   7.212 -        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
   7.213 -        return ALLOCFAIL;
   7.214 -    }
   7.215 -    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   7.216 -        perror("allocblock write");
   7.217 -        return ALLOCFAIL;
   7.218 -    }
   7.219 -    lb = pos / BLOCK_SIZE + 1;
   7.220 -
   7.221 -#ifdef BS_ALLOC_HACK
   7.222 -    if (lb < BS_ALLOC_SKIP)
   7.223 -        goto retry;
   7.224 -#endif
   7.225 -    
   7.226 -    if (lb <= lastblock)
   7.227 -        printf("[*** %Ld alredy allocated! ***]\n", lb);
   7.228 -    
   7.229 -    lastblock = lb;
   7.230 -    return lb;
   7.231 -}
   7.232 -
   7.233 -/**
   7.234 - * newblock: get a new in-memory block set to zeros
   7.235 - *
   7.236 - *   @return: pointer to new block, NULL on error
   7.237 - */
   7.238 -void *newblock() {
   7.239 -    void *block = malloc(BLOCK_SIZE);
   7.240 -    if (block == NULL) {
   7.241 -        perror("newblock");
   7.242 -        return NULL;
   7.243 -    }
   7.244 -    memset(block, 0, BLOCK_SIZE);
   7.245 -    return block;
   7.246 -}
   7.247 -
   7.248 -
   7.249 -/**
   7.250 - * freeblock: unallocate an in-memory block
   7.251 - *   @id: block id (zero if this is only in-memory)
   7.252 - *   @block: block to be freed
   7.253 - */
   7.254 -void freeblock(void *block) {
   7.255 -    if (block != NULL)
   7.256 -        free(block);
   7.257 -}
   7.258 -
   7.259 -
   7.260 -int main(int argc, char **argv)
   7.261 -{
   7.262 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
   7.263 -
   7.264 -    if (block_fp < 0) {
   7.265 -        perror("open");
   7.266 -        return -1;
   7.267 -    }
   7.268 -
   7.269 -    bssock = open_socket(BLOCKSTORED_PORT);
   7.270 -    if (bssock < 0) {
   7.271 -        return -1;
   7.272 -    }
   7.273 -
   7.274 -    service_loop();
   7.275 -    
   7.276 -    close(bssock);
   7.277 -
   7.278 -    return 0;
   7.279 -}
     8.1 --- a/tools/blktap/bstest.c	Sun Jul 03 22:32:52 2005 +0000
     8.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.3 @@ -1,191 +0,0 @@
     8.4 -/**************************************************************************
     8.5 - * 
     8.6 - * bstest.c
     8.7 - *
     8.8 - * Block store daemon test program.
     8.9 - *
    8.10 - * usage: bstest <host>|X {r|w|a} ID 
    8.11 - *
    8.12 - */
    8.13 -
    8.14 -#include <fcntl.h>
    8.15 -#include <unistd.h>
    8.16 -#include <stdio.h>
    8.17 -#include <stdlib.h>
    8.18 -#include <string.h>
    8.19 -#include <sys/types.h>
    8.20 -#include <sys/stat.h>
    8.21 -#include <sys/socket.h>
    8.22 -#include <sys/ioctl.h>
    8.23 -#include <netinet/in.h>
    8.24 -#include <netdb.h>
    8.25 -#include <errno.h>
    8.26 -#include "blockstore.h"
    8.27 -
    8.28 -int direct(char *host, u32 op, u64 id, int len) {
    8.29 -    struct sockaddr_in sn, peer;
    8.30 -    int sock;
    8.31 -    bsmsg_t msgbuf;
    8.32 -    int rc, slen;
    8.33 -    struct hostent *addr;
    8.34 -
    8.35 -    addr = gethostbyname(host);
    8.36 -    if (!addr) {
    8.37 -        perror("bad hostname");
    8.38 -        exit(1);
    8.39 -    }
    8.40 -    peer.sin_family = addr->h_addrtype;
    8.41 -    peer.sin_port = htons(BLOCKSTORED_PORT);
    8.42 -    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
    8.43 -    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
    8.44 -            (unsigned int)(unsigned char)addr->h_addr[0],
    8.45 -            (unsigned int)(unsigned char)addr->h_addr[1],
    8.46 -            (unsigned int)(unsigned char)addr->h_addr[2],
    8.47 -            (unsigned int)(unsigned char)addr->h_addr[3]);
    8.48 -
    8.49 -    sock = socket(AF_INET, SOCK_DGRAM, 0);
    8.50 -    if (sock < 0) {
    8.51 -        perror("Bad socket");
    8.52 -        exit(1);
    8.53 -    }
    8.54 -    memset(&sn, 0, sizeof(sn));
    8.55 -    sn.sin_family = AF_INET;
    8.56 -    sn.sin_port = htons(BLOCKSTORED_PORT);
    8.57 -    sn.sin_addr.s_addr = htonl(INADDR_ANY);
    8.58 -    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
    8.59 -        perror("bind");
    8.60 -        close(sock);
    8.61 -        exit(1);
    8.62 -    }
    8.63 -
    8.64 -    memset((void *)&msgbuf, 0, sizeof(msgbuf));
    8.65 -    msgbuf.operation = op;
    8.66 -    msgbuf.id = id;
    8.67 -
    8.68 -    rc = sendto(sock, (void *)&msgbuf, len, 0,
    8.69 -                (struct sockaddr *)&peer, sizeof(peer));
    8.70 -    if (rc < 0) {
    8.71 -        perror("sendto");
    8.72 -        exit(1);
    8.73 -    }
    8.74 -
    8.75 -    slen = sizeof(peer);
    8.76 -    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
    8.77 -                   (struct sockaddr *)&peer, &slen);
    8.78 -    if (len < 0) {
    8.79 -        perror("recvfrom");
    8.80 -        exit(1);
    8.81 -    }
    8.82 -
    8.83 -    printf("Reply %u bytes:\n", len);
    8.84 -    if (len >= MSGBUFSIZE_OP)
    8.85 -        printf("  operation: %u\n", msgbuf.operation);
    8.86 -    if (len >= MSGBUFSIZE_FLAGS)
    8.87 -        printf("  flags: 0x%x\n", msgbuf.flags);
    8.88 -    if (len >= MSGBUFSIZE_ID)
    8.89 -        printf("  id: %llu\n", msgbuf.id);
    8.90 -    if (len >= (MSGBUFSIZE_ID + 4))
    8.91 -        printf("  data: %02x %02x %02x %02x...\n",
    8.92 -               (unsigned int)msgbuf.block[0],
    8.93 -               (unsigned int)msgbuf.block[1],
    8.94 -               (unsigned int)msgbuf.block[2],
    8.95 -               (unsigned int)msgbuf.block[3]);
    8.96 -    
    8.97 -    if (sock > 0)
    8.98 -        close(sock);
    8.99 -   
   8.100 -    return 0;
   8.101 -}
   8.102 -
   8.103 -int main (int argc, char **argv) {
   8.104 -
   8.105 -    u32 op = 0;
   8.106 -    u64 id = 0;
   8.107 -    int len = 0, rc;
   8.108 -    void *block;
   8.109 -
   8.110 -    if (argc < 3) {
   8.111 -        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
   8.112 -        return 1;
   8.113 -    }
   8.114 -
   8.115 -    switch (argv[2][0]) {
   8.116 -    case 'r':
   8.117 -    case 'R':
   8.118 -        op = BSOP_READBLOCK;
   8.119 -        len = MSGBUFSIZE_ID;
   8.120 -        break;
   8.121 -    case 'w':
   8.122 -    case 'W':
   8.123 -        op = BSOP_WRITEBLOCK;
   8.124 -        len = MSGBUFSIZE_BLOCK;
   8.125 -        break;
   8.126 -    case 'a':
   8.127 -    case 'A':
   8.128 -        op = BSOP_ALLOCBLOCK;
   8.129 -        len = MSGBUFSIZE_BLOCK;
   8.130 -        break;
   8.131 -    default:
   8.132 -        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
   8.133 -        return 1;
   8.134 -    }
   8.135 -
   8.136 -    if (argc >= 4)
   8.137 -        id = atoll(argv[3]);
   8.138 -
   8.139 -    if (strcmp(argv[1], "X") == 0) {
   8.140 -        rc = __init_blockstore();
   8.141 -        if (rc < 0) {
   8.142 -            fprintf(stderr, "blockstore init failed.\n");
   8.143 -            return 1;
   8.144 -        }
   8.145 -        switch(op) {
   8.146 -        case BSOP_READBLOCK:
   8.147 -            block = readblock(id);
   8.148 -            if (block) {
   8.149 -                printf("data: %02x %02x %02x %02x...\n",
   8.150 -                       (unsigned int)((unsigned char*)block)[0],
   8.151 -                       (unsigned int)((unsigned char*)block)[1],
   8.152 -                       (unsigned int)((unsigned char*)block)[2],
   8.153 -                       (unsigned int)((unsigned char*)block)[3]);
   8.154 -            }
   8.155 -            break;
   8.156 -        case BSOP_WRITEBLOCK:
   8.157 -            block = malloc(BLOCK_SIZE);
   8.158 -            if (!block) {
   8.159 -                perror("bstest malloc");
   8.160 -                return 1;
   8.161 -            }
   8.162 -            memset(block, 0, BLOCK_SIZE);
   8.163 -            rc = writeblock(id, block);
   8.164 -            if (rc != 0) {
   8.165 -                printf("error\n");
   8.166 -            }
   8.167 -            else {
   8.168 -                printf("OK\n");
   8.169 -            }
   8.170 -            break;
   8.171 -        case BSOP_ALLOCBLOCK:
   8.172 -            block = malloc(BLOCK_SIZE);
   8.173 -            if (!block) {
   8.174 -                perror("bstest malloc");
   8.175 -                return 1;
   8.176 -            }
   8.177 -            memset(block, 0, BLOCK_SIZE);
   8.178 -            id = allocblock_hint(block, id);
   8.179 -            if (id == 0) {
   8.180 -                printf("error\n");
   8.181 -            }
   8.182 -            else {
   8.183 -                printf("ID: %llu\n", id);
   8.184 -            }
   8.185 -            break;
   8.186 -        }
   8.187 -    }
   8.188 -    else {
   8.189 -        direct(argv[1], op, id, len);
   8.190 -    }
   8.191 -
   8.192 -
   8.193 -    return 0;
   8.194 -}
     9.1 --- a/tools/blktap/parallax.c	Sun Jul 03 22:32:52 2005 +0000
     9.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.3 @@ -1,611 +0,0 @@
     9.4 -/**************************************************************************
     9.5 - * 
     9.6 - * parallax.c
     9.7 - *
     9.8 - * The Parallax Storage Server
     9.9 - *
    9.10 - */
    9.11 - 
    9.12 -
    9.13 -#include <stdio.h>
    9.14 -#include <stdlib.h>
    9.15 -#include <string.h>
    9.16 -#include <pthread.h>
    9.17 -#include "blktaplib.h"
    9.18 -#include "blockstore.h"
    9.19 -#include "vdi.h"
    9.20 -#include "block-async.h"
    9.21 -#include "requests-async.h"
    9.22 -
    9.23 -#define PARALLAX_DEV     61440
    9.24 -#define SECTS_PER_NODE   8
    9.25 -
    9.26 -
    9.27 -#if 0
    9.28 -#define DPRINTF(_f, _a...) printf ( _f , ## _a )
    9.29 -#else
    9.30 -#define DPRINTF(_f, _a...) ((void)0)
    9.31 -#endif
    9.32 -
    9.33 -/* ------[ session records ]----------------------------------------------- */
    9.34 -
    9.35 -#define BLKIF_HASHSZ 1024
    9.36 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
    9.37 -
    9.38 -#define VDI_HASHSZ 16
    9.39 -#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
    9.40 -
    9.41 -typedef struct blkif {
    9.42 -    domid_t       domid;
    9.43 -    unsigned int  handle;
    9.44 -    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
    9.45 -    vdi_t        *vdi_hash[VDI_HASHSZ];
    9.46 -    struct blkif *hash_next;
    9.47 -} blkif_t;
    9.48 -
    9.49 -static blkif_t      *blkif_hash[BLKIF_HASHSZ];
    9.50 -
    9.51 -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
    9.52 -{
    9.53 -    if ( handle != 0 )
    9.54 -        printf("blktap/parallax don't currently support non-0 dev handles!\n");
    9.55 -    
    9.56 -    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
    9.57 -    while ( (blkif != NULL) && 
    9.58 -            ((blkif->domid != domid) || (blkif->handle != handle)) )
    9.59 -        blkif = blkif->hash_next;
    9.60 -    return blkif;
    9.61 -}
    9.62 -
    9.63 -vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
    9.64 -{
    9.65 -    vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)];
    9.66 -    
    9.67 -    while ((vdi != NULL) && (vdi->vdevice != device))
    9.68 -        vdi = vdi->next;
    9.69 -    
    9.70 -    return vdi;
    9.71 -}
    9.72 -
    9.73 -/* ------[ control message handling ]-------------------------------------- */
    9.74 -
    9.75 -void blkif_create(blkif_be_create_t *create)
    9.76 -{
    9.77 -    domid_t       domid  = create->domid;
    9.78 -    unsigned int  handle = create->blkif_handle;
    9.79 -    blkif_t     **pblkif, *blkif;
    9.80 -
    9.81 -    DPRINTF("parallax (blkif_create): create is %p\n", create); 
    9.82 -    
    9.83 -    if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL )
    9.84 -    {
    9.85 -        DPRINTF("Could not create blkif: out of memory\n");
    9.86 -        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
    9.87 -        return;
    9.88 -    }
    9.89 -
    9.90 -    memset(blkif, 0, sizeof(*blkif));
    9.91 -    blkif->domid  = domid;
    9.92 -    blkif->handle = handle;
    9.93 -    blkif->status = DISCONNECTED;
    9.94 -
    9.95 -    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
    9.96 -    while ( *pblkif != NULL )
    9.97 -    {
    9.98 -        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
    9.99 -        {
   9.100 -            DPRINTF("Could not create blkif: already exists (%d,%d)\n",
   9.101 -                domid, handle);
   9.102 -            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
   9.103 -            free(blkif);
   9.104 -            return;
   9.105 -        }
   9.106 -        pblkif = &(*pblkif)->hash_next;
   9.107 -    }
   9.108 -
   9.109 -    blkif->hash_next = *pblkif;
   9.110 -    *pblkif = blkif;
   9.111 -
   9.112 -    DPRINTF("Successfully created blkif\n");
   9.113 -    create->status = BLKIF_BE_STATUS_OKAY;
   9.114 -}
   9.115 -
   9.116 -void blkif_destroy(blkif_be_destroy_t *destroy)
   9.117 -{
   9.118 -    domid_t       domid  = destroy->domid;
   9.119 -    unsigned int  handle = destroy->blkif_handle;
   9.120 -    blkif_t     **pblkif, *blkif;
   9.121 -
   9.122 -    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
   9.123 -    
   9.124 -    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
   9.125 -    while ( (blkif = *pblkif) != NULL )
   9.126 -    {
   9.127 -        if ( (blkif->domid == domid) && (blkif->handle == handle) )
   9.128 -        {
   9.129 -            if ( blkif->status != DISCONNECTED )
   9.130 -                goto still_connected;
   9.131 -            goto destroy;
   9.132 -        }
   9.133 -        pblkif = &blkif->hash_next;
   9.134 -    }
   9.135 -
   9.136 -    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
   9.137 -    return;
   9.138 -
   9.139 - still_connected:
   9.140 -    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
   9.141 -    return;
   9.142 -
   9.143 - destroy:
   9.144 -    *pblkif = blkif->hash_next;
   9.145 -    free(blkif);
   9.146 -    destroy->status = BLKIF_BE_STATUS_OKAY;
   9.147 -}
   9.148 -
   9.149 -void vbd_create(blkif_be_vbd_create_t *create)
   9.150 -{
   9.151 -    blkif_t            *blkif;
   9.152 -    vdi_t              *vdi, **vdip;
   9.153 -    blkif_vdev_t        vdevice = create->vdevice;
   9.154 -
   9.155 -    DPRINTF("parallax (vbd_create): create=%p\n", create); 
   9.156 -    
   9.157 -    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
   9.158 -    if ( blkif == NULL )
   9.159 -    {
   9.160 -        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
   9.161 -                create->domid, create->blkif_handle); 
   9.162 -        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
   9.163 -        return;
   9.164 -    }
   9.165 -
   9.166 -    /* VDI identifier is in grow->extent.sector_start */
   9.167 -    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
   9.168 -            (unsigned long)create->dev_handle);
   9.169 -
   9.170 -    vdi = vdi_get(create->dev_handle);
   9.171 -    if (vdi == NULL)
   9.172 -    {
   9.173 -        printf("parallax (vbd_create): VDI %lx not found.\n",
   9.174 -               (unsigned long)create->dev_handle);
   9.175 -        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
   9.176 -        return;
   9.177 -    }
   9.178 -    
   9.179 -    vdi->next = NULL;
   9.180 -    vdi->vdevice = vdevice;
   9.181 -    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
   9.182 -    while (*vdip != NULL)
   9.183 -        vdip = &(*vdip)->next;
   9.184 -    *vdip = vdi;
   9.185 -    
   9.186 -    DPRINTF("blkif_create succeeded\n"); 
   9.187 -    create->status = BLKIF_BE_STATUS_OKAY;
   9.188 -}
   9.189 -
   9.190 -void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
   9.191 -{
   9.192 -    blkif_t            *blkif;
   9.193 -    vdi_t              *vdi, **vdip;
   9.194 -    blkif_vdev_t        vdevice = destroy->vdevice;
   9.195 -    
   9.196 -    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
   9.197 -    if ( blkif == NULL )
   9.198 -    {
   9.199 -        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
   9.200 -                destroy->domid, destroy->blkif_handle); 
   9.201 -        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
   9.202 -        return;
   9.203 -    }
   9.204 -
   9.205 -    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
   9.206 -    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
   9.207 -        vdip = &(*vdip)->next;
   9.208 -
   9.209 -    if (*vdip != NULL) 
   9.210 -    {
   9.211 -        vdi = *vdip;
   9.212 -        *vdip = vdi->next;
   9.213 -        vdi_put(vdi);
   9.214 -    }
   9.215 -        
   9.216 -}
   9.217 -
   9.218 -int parallax_control(control_msg_t *msg)
   9.219 -{
   9.220 -    domid_t  domid;
   9.221 -    int      ret;
   9.222 -
   9.223 -    DPRINTF("parallax_control: msg is %p\n", msg); 
   9.224 -    
   9.225 -    if (msg->type != CMSG_BLKIF_BE) 
   9.226 -    {
   9.227 -        printf("Unexpected control message (%d)\n", msg->type);
   9.228 -        return 0;
   9.229 -    }
   9.230 -    
   9.231 -    switch(msg->subtype)
   9.232 -    {
   9.233 -    case CMSG_BLKIF_BE_CREATE:
   9.234 -        if ( msg->length != sizeof(blkif_be_create_t) )
   9.235 -            goto parse_error;
   9.236 -        blkif_create((blkif_be_create_t *)msg->msg);
   9.237 -        break;   
   9.238 -        
   9.239 -    case CMSG_BLKIF_BE_DESTROY:
   9.240 -        if ( msg->length != sizeof(blkif_be_destroy_t) )
   9.241 -            goto parse_error;
   9.242 -        blkif_destroy((blkif_be_destroy_t *)msg->msg);
   9.243 -        break;  
   9.244 -        
   9.245 -    case CMSG_BLKIF_BE_VBD_CREATE:
   9.246 -        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
   9.247 -            goto parse_error;
   9.248 -        vbd_create((blkif_be_vbd_create_t *)msg->msg);
   9.249 -        break;
   9.250 -        
   9.251 -    case CMSG_BLKIF_BE_VBD_DESTROY:
   9.252 -        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
   9.253 -            goto parse_error;
   9.254 -        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
   9.255 -        break;
   9.256 -
   9.257 -    case CMSG_BLKIF_BE_CONNECT:
   9.258 -    case CMSG_BLKIF_BE_DISCONNECT:
   9.259 -        /* we don't manage the device channel, the tap does. */
   9.260 -        break;
   9.261 -
   9.262 -    default:
   9.263 -        goto parse_error;
   9.264 -    }
   9.265 -    return 0;
   9.266 -parse_error:
   9.267 -    printf("Bad control message!\n");
   9.268 -    return 0;
   9.269 -    
   9.270 -}    
   9.271 -
   9.272 -int parallax_probe(blkif_request_t *req, blkif_t *blkif)
   9.273 -{
   9.274 -    blkif_response_t *rsp;
   9.275 -    vdisk_t *img_info;
   9.276 -    vdi_t *vdi;
   9.277 -    int i, nr_vdis = 0; 
   9.278 -
   9.279 -    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
   9.280 -
   9.281 -    /* We expect one buffer only. */
   9.282 -    if ( req->nr_segments != 1 )
   9.283 -      goto err;
   9.284 -
   9.285 -    /* Make sure the buffer is page-sized. */
   9.286 -    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
   9.287 -       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
   9.288 -      goto err;
   9.289 -
   9.290 -    /* fill the list of devices */
   9.291 -    for (i=0; i<VDI_HASHSZ; i++) {
   9.292 -        vdi = blkif->vdi_hash[i];
   9.293 -        while (vdi) {
   9.294 -            img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
   9.295 -            img_info[nr_vdis].device   = vdi->vdevice;
   9.296 -            img_info[nr_vdis].info     = 0;
   9.297 -            /* The -1 here accounts for the LSB in the radix tree */
   9.298 -            img_info[nr_vdis].capacity = 
   9.299 -                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
   9.300 -            nr_vdis++;
   9.301 -            vdi = vdi->next;
   9.302 -        }
   9.303 -    }
   9.304 -
   9.305 -    
   9.306 -    rsp = (blkif_response_t *)req;
   9.307 -    rsp->id = req->id;
   9.308 -    rsp->operation = BLKIF_OP_PROBE;
   9.309 -    rsp->status = nr_vdis; /* number of disks */
   9.310 -
   9.311 -    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
   9.312 -    return  BLKTAP_RESPOND;
   9.313 -err:
   9.314 -    rsp = (blkif_response_t *)req;
   9.315 -    rsp->id = req->id;
   9.316 -    rsp->operation = BLKIF_OP_PROBE;
   9.317 -    rsp->status = BLKIF_RSP_ERROR;
   9.318 -    
   9.319 -    DPRINTF("parallax_probe: send error response\n"); 
   9.320 -    return BLKTAP_RESPOND;  
   9.321 -}
   9.322 -
   9.323 -typedef struct {
   9.324 -    blkif_request_t *req;
   9.325 -    int              count;
   9.326 -    int              error;
   9.327 -    pthread_mutex_t  mutex;
   9.328 -} pending_t;
   9.329 -
   9.330 -#define MAX_REQUESTS 64
   9.331 -pending_t pending_list[MAX_REQUESTS];
   9.332 -
   9.333 -struct cb_param {
   9.334 -    pending_t *pent;
   9.335 -    int       segment;
   9.336 -    u64       sector; 
   9.337 -    u64       vblock; /* for debug printing -- can be removed. */
   9.338 -};
   9.339 -
   9.340 -static void read_cb(struct io_ret r, void *in_param)
   9.341 -{
   9.342 -    struct cb_param *param = (struct cb_param *)in_param;
   9.343 -    pending_t *p = param->pent;
   9.344 -    int segment = param->segment;
   9.345 -    blkif_request_t *req = p->req;
   9.346 -    unsigned long size, offset, start;
   9.347 -    char *dpage, *spage;
   9.348 -	
   9.349 -    spage  = IO_BLOCK(r);
   9.350 -    if (spage == NULL) { p->error++; goto finish; }
   9.351 -    dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
   9.352 -    
   9.353 -    /* Calculate read size and offset within the read block. */
   9.354 -
   9.355 -    offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
   9.356 -    size = ( blkif_last_sect (req->frame_and_sects[segment]) -
   9.357 -             blkif_first_sect(req->frame_and_sects[segment]) + 1
   9.358 -        ) << SECTOR_SHIFT;
   9.359 -    start = blkif_first_sect(req->frame_and_sects[segment]) 
   9.360 -        << SECTOR_SHIFT;
   9.361 -
   9.362 -    DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
   9.363 -            "vblock %llx, "
   9.364 -            "size %lx\n", 
   9.365 -            param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
   9.366 -            blkif_last_sect (p->req->frame_and_sects[segment]),
   9.367 -            param->vblock, size); 
   9.368 -
   9.369 -    memcpy(dpage + start, spage + offset, size);
   9.370 -    freeblock(spage);
   9.371 -    
   9.372 -    /* Done the read.  Now update the pending record. */
   9.373 - finish:
   9.374 -    pthread_mutex_lock(&p->mutex);
   9.375 -    p->count--;
   9.376 -    
   9.377 -    if (p->count == 0) {
   9.378 -    	blkif_response_t *rsp;
   9.379 -    	
   9.380 -        rsp = (blkif_response_t *)req;
   9.381 -        rsp->id = req->id;
   9.382 -        rsp->operation = BLKIF_OP_READ;
   9.383 -    	if (p->error == 0) {
   9.384 -            rsp->status = BLKIF_RSP_OKAY;
   9.385 -    	} else {
   9.386 -            rsp->status = BLKIF_RSP_ERROR;
   9.387 -    	}
   9.388 -        blktap_inject_response(rsp);       
   9.389 -    }
   9.390 -    
   9.391 -    pthread_mutex_unlock(&p->mutex);
   9.392 -	
   9.393 -    free(param); /* TODO: replace with cached alloc/dealloc */
   9.394 -}	
   9.395 -
   9.396 -int parallax_read(blkif_request_t *req, blkif_t *blkif)
   9.397 -{
   9.398 -    blkif_response_t *rsp;
   9.399 -    u64 vblock, gblock;
   9.400 -    vdi_t *vdi;
   9.401 -    u64 sector;
   9.402 -    int i;
   9.403 -    char *dpage, *spage;
   9.404 -    pending_t *pent;
   9.405 -
   9.406 -    vdi = blkif_get_vdi(blkif, req->device);
   9.407 -    
   9.408 -    if ( vdi == NULL )
   9.409 -        goto err;
   9.410 -        
   9.411 -    pent = &pending_list[ID_TO_IDX(req->id)];
   9.412 -    pent->count = req->nr_segments;
   9.413 -    pent->req = req;
   9.414 -    pthread_mutex_init(&pent->mutex, NULL);
   9.415 -    
   9.416 -    for (i = 0; i < req->nr_segments; i++) {
   9.417 -        pthread_t tid;
   9.418 -        int ret;
   9.419 -        struct cb_param *p;
   9.420 -        
   9.421 -        /* Round the requested segment to a block address. */
   9.422 -        sector  = req->sector_number + (8*i);
   9.423 -        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
   9.424 -        
   9.425 -        /* TODO: Replace this call to malloc with a cached allocation */
   9.426 -        p = (struct cb_param *)malloc(sizeof(struct cb_param));
   9.427 -        p->pent = pent;
   9.428 -        p->sector = sector; 
   9.429 -        p->segment = i;     
   9.430 -        p->vblock = vblock; /* dbg */
   9.431 -        
   9.432 -        /* Get that block from the store. */
   9.433 -        vdi_read(vdi, vblock, read_cb, (void *)p);    
   9.434 -    }
   9.435 -    
   9.436 -    return BLKTAP_STOLEN;
   9.437 -
   9.438 -err:
   9.439 -    rsp = (blkif_response_t *)req;
   9.440 -    rsp->id = req->id;
   9.441 -    rsp->operation = BLKIF_OP_READ;
   9.442 -    rsp->status = BLKIF_RSP_ERROR;
   9.443 -    
   9.444 -    return BLKTAP_RESPOND;  
   9.445 -}
   9.446 -
   9.447 -static void write_cb(struct io_ret r, void *in_param)
   9.448 -{
   9.449 -    struct cb_param *param = (struct cb_param *)in_param;
   9.450 -    pending_t *p = param->pent;
   9.451 -    blkif_request_t *req = p->req;
   9.452 -    
   9.453 -    /* catch errors from the block code. */
   9.454 -    if (IO_INT(r) < 0) p->error++;
   9.455 -    
   9.456 -    pthread_mutex_lock(&p->mutex);
   9.457 -    p->count--;
   9.458 -    
   9.459 -    if (p->count == 0) {
   9.460 -    	blkif_response_t *rsp;
   9.461 -    	
   9.462 -        rsp = (blkif_response_t *)req;
   9.463 -        rsp->id = req->id;
   9.464 -        rsp->operation = BLKIF_OP_WRITE;
   9.465 -    	if (p->error == 0) {
   9.466 -            rsp->status = BLKIF_RSP_OKAY;
   9.467 -    	} else {
   9.468 -            rsp->status = BLKIF_RSP_ERROR;
   9.469 -    	}
   9.470 -        blktap_inject_response(rsp);       
   9.471 -    }
   9.472 -    
   9.473 -    pthread_mutex_unlock(&p->mutex);
   9.474 -	
   9.475 -    free(param); /* TODO: replace with cached alloc/dealloc */
   9.476 -}
   9.477 -
   9.478 -int parallax_write(blkif_request_t *req, blkif_t *blkif)
   9.479 -{
   9.480 -    blkif_response_t *rsp;
   9.481 -    u64 sector;
   9.482 -    int i, writable = 0;
   9.483 -    u64 vblock, gblock;
   9.484 -    char *spage;
   9.485 -    unsigned long size, offset, start;
   9.486 -    vdi_t *vdi;
   9.487 -    pending_t *pent;
   9.488 -
   9.489 -    vdi = blkif_get_vdi(blkif, req->device);
   9.490 -    
   9.491 -    if ( vdi == NULL )
   9.492 -        goto err;
   9.493 -        
   9.494 -    pent = &pending_list[ID_TO_IDX(req->id)];
   9.495 -    pent->count = req->nr_segments;
   9.496 -    pent->req = req;
   9.497 -    pthread_mutex_init(&pent->mutex, NULL);
   9.498 -    
   9.499 -    for (i = 0; i < req->nr_segments; i++) {
   9.500 -        struct cb_param *p;
   9.501 -        
   9.502 -        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
   9.503 -        
   9.504 -        /* Round the requested segment to a block address. */
   9.505 -        
   9.506 -        sector  = req->sector_number + (8*i);
   9.507 -        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
   9.508 -        
   9.509 -        /* Calculate read size and offset within the read block. */
   9.510 -        
   9.511 -        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
   9.512 -        size = ( blkif_last_sect (req->frame_and_sects[i]) -
   9.513 -                 blkif_first_sect(req->frame_and_sects[i]) + 1
   9.514 -            ) << SECTOR_SHIFT;
   9.515 -        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
   9.516 -
   9.517 -        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
   9.518 -                "vblock %llx, gblock %llx, "
   9.519 -                "size %lx\n", 
   9.520 -                sector, blkif_first_sect(req->frame_and_sects[i]),
   9.521 -                blkif_last_sect (req->frame_and_sects[i]),
   9.522 -                vblock, gblock, size); 
   9.523 -      
   9.524 -        /* XXX: For now we just freak out if they try to write a   */
   9.525 -        /* non block-sized, block-aligned page.                    */
   9.526 -        
   9.527 -        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
   9.528 -            printf("]\n] STRANGE WRITE!\n]\n");
   9.529 -            goto err;
   9.530 -        }
   9.531 -        
   9.532 -        /* TODO: Replace this call to malloc with a cached allocation */
   9.533 -        p = (struct cb_param *)malloc(sizeof(struct cb_param));
   9.534 -        p->pent = pent;
   9.535 -        p->sector = sector; 
   9.536 -        p->segment = i;     
   9.537 -        p->vblock = vblock; /* dbg */
   9.538 -        
   9.539 -        /* Issue the write to the store. */
   9.540 -        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
   9.541 -    }
   9.542 -
   9.543 -    return BLKTAP_STOLEN;
   9.544 -
   9.545 -err:
   9.546 -    rsp = (blkif_response_t *)req;
   9.547 -    rsp->id = req->id;
   9.548 -    rsp->operation = BLKIF_OP_WRITE;
   9.549 -    rsp->status = BLKIF_RSP_ERROR;
   9.550 -    
   9.551 -    return BLKTAP_RESPOND;  
   9.552 -}
   9.553 -
   9.554 -int parallax_request(blkif_request_t *req)
   9.555 -{
   9.556 -    blkif_response_t *rsp;
   9.557 -    domid_t  dom   = ID_TO_DOM(req->id);
   9.558 -    blkif_t *blkif = blkif_find_by_handle(dom, 0);
   9.559 -    
   9.560 -    if (blkif == NULL)
   9.561 -        goto err;
   9.562 -    
   9.563 -    if ( req->operation == BLKIF_OP_PROBE ) {
   9.564 -        
   9.565 -        return parallax_probe(req, blkif);
   9.566 -        
   9.567 -    } else if ( req->operation == BLKIF_OP_READ ) {
   9.568 -        
   9.569 -        return parallax_read(req, blkif);
   9.570 -        
   9.571 -    } else if ( req->operation == BLKIF_OP_WRITE ) {
   9.572 -        
   9.573 -        return parallax_write(req, blkif);
   9.574 -        
   9.575 -    } else {
   9.576 -        printf("Unknown request message type!\n");
   9.577 -        /* Unknown operation */
   9.578 -        goto err;
   9.579 -    }
   9.580 -    
   9.581 -err:
   9.582 -    rsp = (blkif_response_t *)req;
   9.583 -    rsp->operation = req->operation;
   9.584 -    rsp->id = req->id;
   9.585 -    rsp->status = BLKIF_RSP_ERROR;
   9.586 -    return BLKTAP_RESPOND;  
   9.587 -}
   9.588 -
   9.589 -void __init_parallax(void) 
   9.590 -{
   9.591 -    memset(blkif_hash, 0, sizeof(blkif_hash));
   9.592 -}
   9.593 -
   9.594 -
   9.595 -
   9.596 -int main(int argc, char *argv[])
   9.597 -{
   9.598 -    DPRINTF("parallax: starting.\n"); 
   9.599 -    __init_blockstore();
   9.600 -    DPRINTF("parallax: initialized blockstore...\n"); 
   9.601 -    init_block_async();
   9.602 -    DPRINTF("parallax: initialized async blocks...\n"); 
   9.603 -    __init_vdi();
   9.604 -    DPRINTF("parallax: initialized vdi registry etc...\n"); 
   9.605 -    __init_parallax();
   9.606 -    DPRINTF("parallax: initialized local stuff..\n"); 
   9.607 -
   9.608 -    blktap_register_ctrl_hook("parallax_control", parallax_control);
   9.609 -    blktap_register_request_hook("parallax_request", parallax_request);
   9.610 -    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); 
   9.611 -    blktap_listen();
   9.612 -    
   9.613 -    return 0;
   9.614 -}
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/tools/blktap/parallax/README	Sun Jul 03 22:36:48 2005 +0000
    10.3 @@ -0,0 +1,177 @@
    10.4 +Parallax Quick Overview
    10.5 +March 3, 2005
    10.6 +
    10.7 +This is intended to provide a quick set of instructions to let you
    10.8 +guys play with the current parallax source.  In its current form, the
    10.9 +code will let you run an arbitrary number of VMs off of a single disk
   10.10 +image, doing copy-on-write as they make updates.  Each domain is
   10.11 +assigned a virtual disk image (VDI), which may be based on a snapshot
   10.12 +of an existing image.  All of the VDI and snapshot management should
   10.13 +currently work.
   10.14 +
   10.15 +The current implementation uses a single file as a blockstore for
   10.16 +_everything_; this will soon be replaced by the fancier backend code
   10.17 +and the local cache.  As it stands, Parallax will create
   10.18 +"blockstore.dat" in the directory that you run it from, and use
   10.19 +largefile support to make this grow to unfathomable girth.  So, you
   10.20 +probably want to run the daemon off of a local disk, with a lot of
   10.21 +free space.
   10.22 +
   10.23 +Here's how to get going:
   10.24 +
   10.25 +0. Setup:
   10.26 +---------
   10.27 +
   10.28 +Pick a local directory on a disk with lots of room.  You should be
   10.29 +running from a privileged domain (e.g. dom0) with the blocktap
   10.30 +configured in and block backend NOT.
   10.31 +
   10.32 +For convenience (for the moment) copy all of the vdi tools (vdi_*) and
   10.33 +the parallax daemon from tools/blktap into this directory.
   10.34 +
   10.35 +1. Populate the blockstore:
   10.36 +---------------------------
   10.37 +
   10.38 +First you need to put at least one image into the blockstore.  You
   10.39 +will need a disk image, either as a file or local partition.  My
   10.40 +general approach has been to
   10.41 +
   10.42 +(a) make a really big sparse file with 
   10.43 +
   10.44 +        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
   10.45 +
   10.46 +(b) put a filesystem into it
   10.47 +
   10.48 +        mkfs.ext3 ./image
   10.49 +
   10.50 +(c) mount it using loopback
   10.51 +
   10.52 +        mkdir ./mnt
   10.53 +        mount -o loop ./image ./mnt
   10.54 +
   10.55 +(d) cd into it and untar one of the image files from srg-roots.
   10.56 +
   10.57 +        cd mnt
   10.58 +        tar ...
   10.59 +
   10.60 +NOTE: Beware if your system is FC3.  mkfs is not compatible with old
   10.61 +versions of fedora, and so you don't have much choice but to install
   10.62 +further fc3 images if you have used the fc3 version of mkfs.
   10.63 +
   10.64 +(e) unmount the image
   10.65 +
   10.66 +        cd ..
   10.67 +        umount mnt
   10.68 +
   10.69 +(f) now, create a new VDI to hold the image 
   10.70 +
   10.71 +        ./vdi_create "My new FC3 VDI"
   10.72 +
   10.73 +(g) get the id of the new VDI.
   10.74 +
   10.75 +        ./vdi_list
   10.76 +
   10.77 +        |      0                     My new FC3 VDI
   10.78 +
   10.79 +(0 is the VDI id... create a few more if you want.)
   10.80 +
   10.81 +(h) hoover your image into the new VDI.
   10.82 +
   10.83 +        ./vdi_fill 0 ./image
   10.84 +
   10.85 +This will pull the entire image into the blockstore and set up a
   10.86 +mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
   10.87 +should also work, but vdi_fill has NO notion of sparseness yet, so you
   10.88 +are going to pump a block into the store for each block you read.
   10.89 +
   10.90 +vdi_fill will count up until it is done, and you should be ready to
   10.91 +go.  If you want to be thorough, you can use vdi_validate to test the VDI
   10.92 +against the original image.
   10.93 +
   10.94 +2. Create some extra VDIs
   10.95 +-------------------------
   10.96 +
   10.97 +VDIs are actually a list of snapshots, and each snapshot is a full
   10.98 +image of mappings.  So, to preserve an immutable copy of a current
   10.99 +VDI, do this:
  10.100 +
  10.101 +(a) Snapshot your new VDI.
  10.102 +
  10.103 +        ./vdi_snap 0
  10.104 +
  10.105 +Snapshotting writes the current radix root to the VDI's snapshot log,
  10.106 +and assigns it a new writable root.
  10.107 +
  10.108 +(b) look at the VDI's snapshot log.
  10.109 +
  10.110 +        ./vdi_snap_list 0
  10.111 +
  10.112 +        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
  10.113 +
  10.114 +The first two columns constitute a snapshot id and represent the
  10.115 +(block, offset) of the snapshot record.  The Date tells you when the
  10.116 +snapshot was made, and 31 is the radix root node of the snapshot.
  10.117 +
  10.118 +(c) Create a new VDI, based on that snapshot, and look at the list.
  10.119 +
  10.120 +        ./vdi_create "FC3 - Copy 1" 16 0
  10.121 +        ./vdi_list
  10.122 +
  10.123 +        |      0                     My new FC3 VDI
  10.124 +        |      1                       FC3 - Copy 1
  10.125 +
  10.126 +NOTE: If you have Graphviz installed on your system, you can use
  10.127 +vdi_tree to generate a postscript of your current set of VDIs and
  10.128 +snapshots.
  10.129 +
  10.130 +
  10.131 +Create as many VDIs as you need for the VMs that you want to run.
  10.132 +
  10.133 +3. Boot some VMs:
  10.134 +-----------------
  10.135 +
  10.136 +Parallax currently uses a hack in xend to pass the VDI id, so you need
  10.137 +to modify the disk line of the VM config that is going to mount it.
  10.138 +
  10.139 +(a) set up your vm config, by using the following disk line:
  10.140 +
  10.141 +        disk = ['parallax:1,sda1,w,0' ]
  10.142 +
  10.143 +This example uses VDI 1 (from vdi_list above), presents it as sda1
  10.144 +(writable), and uses dom 0 as the backend.  If you were running the
  10.145 +daemon (and tap driver) in some domain other than 0, you would change
  10.146 +this last parameter.
  10.147 +
  10.148 +NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
  10.149 +
  10.150 +(b) Run parallax in the backend domain.
  10.151 +
  10.152 +        ./parallax
  10.153 +
  10.154 +(c) create your new domain.
  10.155 +
  10.156 +        xm create ...
  10.157 +
  10.158 +---
  10.159 +
  10.160 +That's pretty much all there is to it at the moment.  Hope this is
  10.161 +clear enough to get you going.  Now, a few serious caveats that will
  10.162 +be sorted out in the almost immediate future:
  10.163 +
  10.164 +WARNINGS:
  10.165 +---------
  10.166 +
  10.167 +1. There is NO locking in the VDI tools at the moment, so I'd avoid
  10.168 +running them in parallel, or more importantly, running them while the
  10.169 +daemon is running.
  10.170 +
  10.171 +2. I doubt that xend will be very happy about restarting if you have
  10.172 +parallax-using domains.  So if it dies while there are active parallax
  10.173 +doms, you may need to reboot.
  10.174 +
  10.175 +3. I've turned off write-in-place.  So at the moment, EVERY block
  10.176 +write is a log append on the blockstore.  I've been having some
  10.177 +problems with the radix tree's marking of writable blocks after
  10.178 +snapshots and will sort this out very soon.
  10.179 +
  10.180 +
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/tools/blktap/parallax/block-async.c	Sun Jul 03 22:36:48 2005 +0000
    11.3 @@ -0,0 +1,393 @@
    11.4 +/* block-async.c
    11.5 + * 
    11.6 + * Asynchronous block wrappers for parallax.
    11.7 + */
    11.8 + 
    11.9 + 
   11.10 +#include <stdio.h>
   11.11 +#include <stdlib.h>
   11.12 +#include <string.h>
   11.13 +#include <pthread.h>
   11.14 +#include "block-async.h"
   11.15 +#include "blockstore.h"
   11.16 +#include "vdi.h"
   11.17 +
   11.18 +
   11.19 +#if 0
   11.20 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   11.21 +#else
   11.22 +#define DPRINTF(_f, _a...) ((void)0)
   11.23 +#endif
   11.24 +
   11.25 +/* We have a queue of outstanding I/O requests implemented as a
   11.26 + * circular producer-consumer ring with free-running indexes.
   11.27 + * To allow reordering, this ring indirects to indexes in an
   11.28 + * array of pending_io_req structs.
   11.29 + * 
   11.30 + * the block_* calls may either add an entry to this ring and return, 
   11.31 + * or satisfy the request immediately and call the callback directly.
   11.32 + * None of the io calls in parallax should be nested enough to worry 
   11.33 + * about stack problems with this approach.
   11.34 + */
   11.35 +
   11.36 +struct read_args {      /* arguments of a queued IO_READ request */
   11.37 +    u64 addr;           /* block address handed to readblock() */
   11.38 +};
   11.39 +
   11.40 +struct write_args {     /* arguments of a queued IO_WRITE request */
   11.41 +    u64   addr;         /* block address handed to writeblock() */
   11.42 +    char *block;        /* data buffer handed to writeblock() */
   11.43 +};
   11.44 +
   11.45 +struct alloc_args {     /* arguments of a queued IO_ALLOC request */
   11.46 +    char *block;        /* data buffer handed to allocblock() */
   11.47 +};
   11.48 + 
   11.49 +struct pending_io_req { /* one slot of the pending_io_reqs[] array */
   11.50 +    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op; /* selects the case taken in do_next_io_req() */
   11.51 +    union {             /* per-op arguments; not read for IO_RWAKE/IO_WWAKE */
   11.52 +        struct read_args  r;    /* used when op == IO_READ */
   11.53 +        struct write_args w;    /* used when op == IO_WRITE */
   11.54 +        struct alloc_args a;    /* used when op == IO_ALLOC */
   11.55 +    } u;
   11.56 +    io_cb_t cb;         /* completion callback, invoked by do_next_io_req() */
   11.57 +    void *param;        /* opaque argument passed through to cb */
   11.58 +};
   11.59 +
   11.60 +/* Put every row of a radix_lock into its unlocked state: hold count 0,
   11.61 + * no waiters, state ANY.  Also initialises the mutex that the
   11.62 + * block_*lock()/block_*unlock() calls take.
   11.63 + */
   11.64 +void radix_lock_init(struct radix_lock *r)
   11.65 +{
   11.66 +    /* Take the row count from the struct itself instead of repeating the
   11.67 +     * array length used in its declaration. */
   11.68 +    const int nrows = (int)(sizeof(r->lines) / sizeof(r->lines[0]));
   11.69 +    int i;
   11.70 +
   11.71 +    pthread_mutex_init(&r->lock, NULL);
   11.72 +    for (i = 0; i < nrows; i++) {
   11.73 +        r->lines[i] = 0;
   11.74 +        r->waiters[i] = NULL;
   11.75 +        r->state[i] = ANY;
   11.76 +    }
   11.77 +}
   11.71 +
   11.72 +/* Maximum outstanding I/O requests issued asynchronously (ring capacity). */
   11.73 +/* Must be a power of 2, because PENDING_IO_MASK() wraps with a bit-mask. */
   11.74 +#define MAX_PENDING_IO 1024
   11.75 +
   11.76 +/* How many io_thread workers start_io_threads() creates to issue I/O. */
   11.77 +#define IO_POOL_SIZE   10
   11.78 +
   11.79 +static struct pending_io_req pending_io_reqs[MAX_PENDING_IO]; /* the request slots */
   11.80 +static int pending_io_list[MAX_PENDING_IO]; /* ring of slot indexes into pending_io_reqs[] */
   11.81 +static unsigned long io_prod = 0, io_cons = 0, io_free = 0; /* free-running counts: produced / consumed / slots returned */
   11.82 +#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1)) /* counter -> ring position */
   11.83 +#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs) /* slot pointer -> slot index */
   11.84 +#define PENDING_IO_ENT(_x) \
   11.85 +	(&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]]) /* counter -> slot pointer */
   11.86 +#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod) /* false once every slot is in use */
   11.87 +#define CAN_CONSUME_PENDING_IO (io_cons != io_prod) /* true while produced requests await a consumer */
   11.88 +static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER; /* taken around every update of the counters and pending_io_list[] */
   11.89 +static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER; /* signalled after a request is produced */
   11.90 +
   11.91 +/* Seed the ring: position n initially refers to request slot n. */
   11.92 +static void init_pending_io(void)
   11.93 +{
   11.94 +    int slot = MAX_PENDING_IO;
   11.95 +
   11.96 +    while (slot-- > 0)
   11.97 +        pending_io_list[slot] = slot;
   11.98 +}
   11.99 +
  11.100 +/* Queue an async readblock(addr); an io_thread then runs cb(ret, param). */
  11.101 +void block_read(u64 addr, io_cb_t cb, void *param)
  11.102 +{
  11.103 +    struct pending_io_req *slot;
  11.104 +
  11.105 +    pthread_mutex_lock(&pending_io_lock);
  11.106 +    assert(CAN_PRODUCE_PENDING_IO);   /* the ring must not be full */
  11.107 +    slot = PENDING_IO_ENT(io_prod);
  11.108 +    DPRINTF("Produce (R) %lu (%p)\n", io_prod, slot);
  11.109 +    io_prod++;
  11.110 +    slot->param    = param;
  11.111 +    slot->cb       = cb;
  11.112 +    slot->u.r.addr = addr;
  11.113 +    slot->op       = IO_READ;
  11.114 +    pthread_cond_signal(&pending_io_cond);   /* wake a waiting io_thread */
  11.115 +    pthread_mutex_unlock(&pending_io_lock);
  11.116 +}
  11.117 +
  11.118 +
  11.119 +/* Queue an async writeblock(addr, block); an io_thread then runs
  11.120 + * cb(ret, param).  Only the block pointer is stored, not a copy. */
  11.121 +void block_write(u64 addr, char *block, io_cb_t cb, void *param)
  11.122 +{
  11.123 +    struct pending_io_req *slot;
  11.124 +
  11.125 +    pthread_mutex_lock(&pending_io_lock);
  11.126 +    assert(CAN_PRODUCE_PENDING_IO);   /* the ring must not be full */
  11.127 +    slot = PENDING_IO_ENT(io_prod);
  11.128 +    DPRINTF("Produce (W) %lu (%p)\n", io_prod, slot);
  11.129 +    io_prod++;
  11.130 +    slot->param     = param;
  11.131 +    slot->cb        = cb;
  11.132 +    slot->u.w.block = block;
  11.133 +    slot->u.w.addr  = addr;
  11.134 +    slot->op        = IO_WRITE;
  11.135 +    pthread_cond_signal(&pending_io_cond);   /* wake a waiting io_thread */
  11.136 +    pthread_mutex_unlock(&pending_io_lock);
  11.137 +}
  11.137 +
  11.138 +
  11.139 +/* Queue an async allocblock(block); an io_thread then runs cb(ret, param). */
  11.140 +void block_alloc(char *block, io_cb_t cb, void *param)
  11.141 +{
  11.142 +    struct pending_io_req *slot;
  11.143 +
  11.144 +    pthread_mutex_lock(&pending_io_lock);
  11.145 +    assert(CAN_PRODUCE_PENDING_IO);   /* the ring must not be full */
  11.146 +    slot = PENDING_IO_ENT(io_prod);
  11.147 +    io_prod++;
  11.148 +    slot->param     = param;
  11.149 +    slot->cb        = cb;
  11.150 +    slot->u.a.block = block;
  11.151 +    slot->op        = IO_ALLOC;
  11.152 +    pthread_cond_signal(&pending_io_cond);   /* wake a waiting io_thread */
  11.153 +    pthread_mutex_unlock(&pending_io_lock);
  11.154 +}
  11.155 +
  11.156 +/* Take a shared (read) lock on one row of a radix_lock.
  11.157 + *
  11.158 + * If the row is not write-locked (lines >= 0) and is not in state STOP,
  11.159 + * the hold count is bumped and cb is called at once, from this thread,
  11.160 + * with an IO_INT_T result of 0.  Otherwise a radix_wait record is appended
  11.161 + * to the row's waiter list and cb runs later, once wake_waiters() grants
  11.162 + * the lock.
  11.163 + */
  11.164 +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  11.165 +{
  11.166 +    struct io_ret ret;
  11.167 +    struct radix_wait *rw, **rwc;
  11.168 +
  11.169 +    pthread_mutex_lock(&r->lock);
  11.170 +
  11.171 +    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
  11.172 +        r->lines[row]++;
  11.173 +        r->state[row] = READ;
  11.174 +        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
  11.175 +        pthread_mutex_unlock(&r->lock);
  11.176 +        /* The callback runs with r->lock released. */
  11.177 +        ret.type = IO_INT_T;
  11.178 +        ret.u.i = 0;
  11.179 +        cb(ret, param);
  11.180 +        return;
  11.181 +    }
  11.182 +
  11.183 +    DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
  11.184 +    rw = (struct radix_wait *) malloc (sizeof(struct radix_wait));
  11.185 +    if (rw == NULL) {
  11.186 +        /* Without a wait record this request could never be woken, and
  11.187 +         * there is no way to report failure to the caller: stop loudly
  11.188 +         * rather than dereference NULL. */
  11.189 +        pthread_mutex_unlock(&r->lock);
  11.190 +        perror("block_rlock: malloc");
  11.191 +        abort();
  11.192 +    }
  11.193 +    rw->type  = RLOCK;
  11.194 +    rw->param = param;
  11.195 +    rw->cb    = cb;
  11.196 +    rw->next  = NULL;
  11.197 +    /* append to waiters list. */
  11.198 +    rwc = &r->waiters[row];
  11.199 +    while (*rwc != NULL) rwc = &(*rwc)->next;
  11.200 +    *rwc = rw;
  11.201 +    pthread_mutex_unlock(&r->lock);
  11.202 +}
  11.186 +
  11.187 +
  11.188 +/* Take the exclusive (write) lock on one row of a radix_lock.
  11.189 + *
  11.190 + * If the row is completely idle (state ANY, hold count 0) it is marked
  11.191 + * write-locked (lines = -1, state STOP) and cb is called at once, from
  11.192 + * this thread, with an IO_INT_T result of 0.  Otherwise a radix_wait
  11.193 + * record is appended to the row's waiter list and cb runs later, once
  11.194 + * wake_waiters() grants the lock.
  11.195 + */
  11.196 +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  11.197 +{
  11.198 +    struct io_ret ret;
  11.199 +    struct radix_wait *rw, **rwc;
  11.200 +
  11.201 +    pthread_mutex_lock(&r->lock);
  11.202 +
  11.203 +    /* the second check here is redundant -- just here for debugging now. */
  11.204 +    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
  11.205 +        r->state[row] = STOP;
  11.206 +        r->lines[row] = -1;
  11.207 +        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
  11.208 +        pthread_mutex_unlock(&r->lock);
  11.209 +        /* The callback runs with r->lock released. */
  11.210 +        ret.type = IO_INT_T;
  11.211 +        ret.u.i = 0;
  11.212 +        cb(ret, param);
  11.213 +        return;
  11.214 +    }
  11.215 +
  11.216 +    DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
  11.217 +    rw = (struct radix_wait *) malloc (sizeof(struct radix_wait));
  11.218 +    if (rw == NULL) {
  11.219 +        /* Without a wait record this request could never be woken, and
  11.220 +         * there is no way to report failure to the caller: stop loudly
  11.221 +         * rather than dereference NULL. */
  11.222 +        pthread_mutex_unlock(&r->lock);
  11.223 +        perror("block_wlock: malloc");
  11.224 +        abort();
  11.225 +    }
  11.226 +    rw->type  = WLOCK;
  11.227 +    rw->param = param;
  11.228 +    rw->cb    = cb;
  11.229 +    rw->next  = NULL;
  11.230 +    /* append to waiters list. */
  11.231 +    rwc = &r->waiters[row];
  11.232 +    while (*rwc != NULL) rwc = &(*rwc)->next;
  11.233 +    *rwc = rw;
  11.234 +    pthread_mutex_unlock(&r->lock);
  11.235 +}
  11.220 +
  11.221 +/* Hand a now-idle row to the request(s) at the head of its waiter list.
  11.222 + *
  11.223 + * Called with r->lock held.  Does nothing unless the row's hold count is
  11.224 + * zero and somebody is waiting.  If the first waiter wants a write lock,
  11.225 + * it alone is granted the row (lines = -1, state = STOP).  Otherwise every
  11.226 + * reader up to the first queued writer is granted a shared hold; if a
  11.227 + * writer remains queued the row is left in state STOP so that
  11.228 + * block_rlock() makes later readers queue behind it.
  11.229 + *
  11.230 + * Waiters are not called back here: each grant is queued on the pending
  11.231 + * I/O ring as an IO_WWAKE/IO_RWAKE request, and the callback is run by
  11.232 + * whichever io_thread consumes it.  pending_io_lock is taken once for the
  11.233 + * whole batch, and the condition variable is signalled once per queued
  11.234 + * request so that a batch of readers can be picked up by several workers.
  11.235 + */
  11.236 +static void wake_waiters(struct radix_lock *r, int row)
  11.237 +{
  11.238 +    struct pending_io_req *req;
  11.239 +    struct radix_wait *rw;
  11.240 +    int grant_write;
  11.241 +
  11.242 +    if (r->lines[row] != 0) return;
  11.243 +    if (r->waiters[row] == NULL) return;
  11.244 +
  11.245 +    grant_write = (r->waiters[row]->type == WLOCK);
  11.246 +
  11.247 +    /* r->lock is already held; pending_io_lock nests inside it. */
  11.248 +    pthread_mutex_lock(&pending_io_lock);
  11.249 +    do {
  11.250 +        rw = r->waiters[row];
  11.251 +        assert(CAN_PRODUCE_PENDING_IO);
  11.252 +
  11.253 +        req = PENDING_IO_ENT(io_prod++);
  11.254 +        req->op    = grant_write ? IO_WWAKE : IO_RWAKE;
  11.255 +        req->cb    = rw->cb;
  11.256 +        req->param = rw->param;
  11.257 +
  11.258 +        if (grant_write) {
  11.259 +            r->lines[row] = -1; /* write lock the row. */
  11.260 +            r->state[row] = STOP;
  11.261 +        } else {
  11.262 +            r->lines[row]++; /* read lock the row. */
  11.263 +            r->state[row] = READ;
  11.264 +        }
  11.265 +        r->waiters[row] = rw->next;
  11.266 +        free(rw);
  11.267 +
  11.268 +        pthread_cond_signal(&pending_io_cond);
  11.269 +    } while (!grant_write &&
  11.270 +             (r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK));
  11.271 +    pthread_mutex_unlock(&pending_io_lock);
  11.272 +
  11.273 +    if (!grant_write && (r->waiters[row] != NULL)) /* There is a write queued still */
  11.274 +        r->state[row] = STOP;
  11.275 +}
  11.272 +
  11.273 +/* Drop one shared (read) hold on a row.  When the last reader leaves, the
  11.274 + * row returns to state ANY and wake_waiters() hands it to any queued
  11.275 + * requests.  cb is then called directly, with r->lock released, and
  11.276 + * receives an IO_INT_T result of 0.
  11.277 + */
  11.278 +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  11.279 +{
  11.280 +    struct io_ret ret;
  11.281 +
  11.282 +    pthread_mutex_lock(&r->lock);
  11.283 +    assert(r->lines[row] > 0); /* try to catch misuse. */
  11.284 +    r->lines[row]--;
  11.285 +    if (r->lines[row] == 0) {
  11.286 +        r->state[row] = ANY;
  11.287 +        wake_waiters(r, row);
  11.288 +    }
  11.289 +    pthread_mutex_unlock(&r->lock);
  11.290 +
  11.291 +    /* Give the callback a well-defined result instead of whatever happens
  11.292 +     * to be on the stack; this mirrors what the lock calls report. */
  11.293 +    ret.type = IO_INT_T;
  11.294 +    ret.u.i  = 0;
  11.295 +    cb(ret, param);
  11.296 +}
  11.287 +
  11.288 +/* Release the exclusive (write) lock on a row: the row returns to hold
  11.289 + * count 0 / state ANY and wake_waiters() hands it to any queued requests.
  11.290 + * cb is then called directly, with r->lock released, and receives an
  11.291 + * IO_INT_T result of 0.
  11.292 + */
  11.293 +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  11.294 +{
  11.295 +    struct io_ret ret;
  11.296 +
  11.297 +    pthread_mutex_lock(&r->lock);
  11.298 +    assert(r->lines[row] == -1); /* try to catch misuse. */
  11.299 +    r->lines[row] = 0;
  11.300 +    r->state[row] = ANY;
  11.301 +    wake_waiters(r, row);
  11.302 +    pthread_mutex_unlock(&r->lock);
  11.303 +
  11.304 +    /* Give the callback a well-defined result instead of whatever happens
  11.305 +     * to be on the stack; this mirrors what the lock calls report. */
  11.306 +    ret.type = IO_INT_T;
  11.307 +    ret.u.i  = 0;
  11.308 +    cb(ret, param);
  11.309 +}
  11.300 +
  11.301 +/* consumer calls */
  11.302 +
  11.303 +/* Carry out one request taken off the pending ring, return its slot to
  11.304 + * the free list, then report the outcome through the request's callback.
  11.305 + *
  11.306 + *   IO_READ   -> readblock(),  result type IO_BLOCK_T
  11.307 + *   IO_WRITE  -> writeblock(), result type IO_INT_T
  11.308 + *   IO_ALLOC  -> allocblock(), result type IO_ADDR_T
  11.309 + *   IO_RWAKE / IO_WWAKE -> no I/O, result IO_INT_T 0 (deferred lock grant)
  11.310 + *
  11.311 + * An unknown op is only logged; in that case no callback is made and the
  11.312 + * slot is not returned to the free list.
  11.313 + */
  11.314 +static void do_next_io_req(struct pending_io_req *req)
  11.315 +{
  11.316 +    struct io_ret ret;
  11.317 +    io_cb_t       cb;
  11.318 +    void         *param;
  11.319 +
  11.320 +    switch (req->op) {
  11.321 +    case IO_READ:
  11.322 +        ret.type = IO_BLOCK_T;
  11.323 +        ret.u.b  = readblock(req->u.r.addr);
  11.324 +        break;
  11.325 +    case IO_WRITE:
  11.326 +        ret.type = IO_INT_T;
  11.327 +        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
  11.328 +        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
  11.329 +        break;
  11.330 +    case IO_ALLOC:
  11.331 +        ret.type = IO_ADDR_T;
  11.332 +        ret.u.a  = allocblock(req->u.a.block);
  11.333 +        break;
  11.334 +    case IO_RWAKE:
  11.335 +        DPRINTF("WAKE DEFERRED RLOCK!\n");
  11.336 +        ret.type = IO_INT_T;
  11.337 +        ret.u.i  = 0;
  11.338 +        break;
  11.339 +    case IO_WWAKE:
  11.340 +        DPRINTF("WAKE DEFERRED WLOCK!\n");
  11.341 +        ret.type = IO_INT_T;
  11.342 +        ret.u.i  = 0;
  11.343 +        break;
  11.344 +    default:
  11.345 +        DPRINTF("Unknown IO operation on pending list!\n");
  11.346 +        return;
  11.347 +    }
  11.348 +
  11.349 +    /* Copy out everything still needed from the slot BEFORE giving it
  11.350 +     * back: as soon as io_free advances, a producer may claim the slot and
  11.351 +     * overwrite both cb and param. */
  11.352 +    cb    = req->cb;
  11.353 +    param = req->param;
  11.354 +    assert(cb != NULL);
  11.355 +
  11.356 +    pthread_mutex_lock(&pending_io_lock);
  11.357 +    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
  11.358 +    pthread_mutex_unlock(&pending_io_lock);
  11.359 +
  11.360 +    /* The callback runs without the ring lock, so it may queue new I/O. */
  11.361 +    cb(ret, param);
  11.362 +}
  11.345 +
  11.346 +/* Body of each worker in the I/O pool: forever take the next request off
  11.347 + * the pending ring and run it with do_next_io_req().  Never returns.
  11.348 + *
  11.349 + * param is the heap-allocated thread id passed by start_io_threads(); the
  11.350 + * id itself is not used here, so the allocation is simply released.
  11.351 + */
  11.352 +void *io_thread(void *param)
  11.353 +{
  11.354 +    struct pending_io_req *req;
  11.355 +
  11.356 +    free(param);
  11.357 +
  11.358 +    for (;;) {
  11.359 +        pthread_mutex_lock(&pending_io_lock);
  11.360 +        /* Re-test after every wake-up: another worker may already have
  11.361 +         * taken the request, or the wake-up may have been spurious. */
  11.362 +        while (io_prod == io_cons) {
  11.363 +            pthread_cond_wait(&pending_io_cond, &pending_io_lock);
  11.364 +        }
  11.365 +        req = PENDING_IO_ENT(io_cons++);
  11.366 +        pthread_mutex_unlock(&pending_io_lock);
  11.367 +
  11.368 +        /* Run the request without holding the ring lock. */
  11.369 +        do_next_io_req(req);
  11.370 +    }
  11.371 +
  11.372 +    return NULL; /* not reached */
  11.373 +}
  11.375 +
  11.376 +static pthread_t io_pool[IO_POOL_SIZE];   /* handles of the worker threads */
  11.377 +
  11.378 +/* Create the IO_POOL_SIZE worker threads, each running io_thread().
  11.379 + * Every worker is handed its own heap-allocated id (0, 1, 2, ...), which
  11.380 + * io_thread() frees.  A failure to start a worker is reported on stdout
  11.381 + * and the remaining workers are still started.
  11.382 + */
  11.383 +void start_io_threads(void)
  11.384 +{
  11.385 +    int i;
  11.386 +
  11.387 +    for (i = 0; i < IO_POOL_SIZE; i++) {
  11.388 +        int ret;
  11.389 +        int *t = (int *)malloc(sizeof(int));
  11.390 +
  11.391 +        if (t == NULL) {
  11.392 +            /* No id to hand over, so this worker cannot be started. */
  11.393 +            printf("Error starting thread %d\n", i);
  11.394 +            continue;
  11.395 +        }
  11.396 +        *t = i;
  11.397 +        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
  11.398 +        if (ret != 0) {
  11.399 +            printf("Error starting thread %d\n", i);
  11.400 +            free(t);   /* io_thread never ran, so nobody else frees it */
  11.401 +        }
  11.402 +    }
  11.403 +}
  11.391 +
  11.392 +void init_block_async(void) /* set up the asynchronous block I/O layer */
  11.393 +{
  11.394 +    init_pending_io();  /* seed pending_io_list[] before any request is queued */
  11.395 +    start_io_threads(); /* then launch the io_thread consumers */
  11.396 +}
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/tools/blktap/parallax/block-async.h	Sun Jul 03 22:36:48 2005 +0000
    12.3 @@ -0,0 +1,69 @@
    12.4 +/* block-async.h
    12.5 + * 
    12.6 + * Asynchronous block wrappers for parallax.
    12.7 + */
    12.8 + 
    12.9 +#ifndef _BLOCKASYNC_H_
   12.10 +#define _BLOCKASYNC_H_
   12.11 +
   12.12 +#include <assert.h>
   12.13 +#include <xc.h>
   12.14 +#include "vdi.h"
   12.15 +
   12.16 +struct io_ret           /* tagged result handed to every io_cb_t callback */
   12.17 +{
   12.18 +    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type; /* says which member of u is valid */
   12.19 +    union {
   12.20 +        u64   a;        /* IO_ADDR_T:  read it with IO_ADDR()  */
   12.21 +        char *b;        /* IO_BLOCK_T: read it with IO_BLOCK() */
   12.22 +        int   i;        /* IO_INT_T:   read it with IO_INT()   */
   12.23 +    } u;
   12.24 +};
   12.25 +
   12.26 +typedef void (*io_cb_t)(struct io_ret r, void *param); /* completion callback: the result plus the caller's opaque param */
   12.27 +
   12.28 +/* Per-VDI lock structures to make sure requests run in a safe order. */
   12.29 +struct radix_wait {     /* one deferred lock request, queued on a row */
   12.30 +    enum {RLOCK, WLOCK} type;   /* kind of lock being waited for */
   12.31 +    io_cb_t  cb;                /* called once the lock has been granted */
   12.32 +    void    *param;             /* opaque argument for cb */
   12.33 +    struct radix_wait *next;    /* next waiter on the same row, or NULL */
   12.34 +};
   12.35 +
   12.36 +struct radix_lock {     /* 1024 separately lockable rows behind one mutex */
   12.37 +    pthread_mutex_t lock;       /* NOTE(review): this header does not include <pthread.h> itself -- confirm vdi.h or the includer provides it */
   12.38 +    int                    lines[1024];   /* holders per row: >0 readers, 0 idle, -1 write-locked */
   12.39 +    struct radix_wait     *waiters[1024]; /* per-row list of deferred requests, oldest first */
   12.40 +    enum {ANY, READ, STOP} state[1024];   /* ANY: idle; READ: held by readers; STOP: write-locked, or a writer is queued */
   12.41 +};
   12.42 +void radix_lock_init(struct radix_lock *r); /* reset every row to idle and initialise the mutex */
   12.43 +
   12.44 +void block_read(u64 addr, io_cb_t cb, void *param); /* queue a block read; cb receives an IO_BLOCK_T result */
   12.45 +void block_write(u64 addr, char *block, io_cb_t cb, void *param); /* queue a block write; cb receives an IO_INT_T result */
   12.46 +void block_alloc(char *block, io_cb_t cb, void *param); /* queue a block allocation; cb receives an IO_ADDR_T result */
   12.47 +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param); /* shared lock on row; cb runs at once if granted, else when woken */
   12.48 +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param); /* exclusive lock on row; cb runs at once if granted, else when woken */
   12.49 +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param); /* drop a shared hold, wake waiters if it was the last, then call cb */
   12.50 +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param); /* drop the exclusive lock, wake waiters, then call cb */
   12.51 +void init_block_async(void); /* initialise the pending ring and start the I/O threads */
   12.52 +
   12.53 +/* Unwrap an IO_ADDR_T result; asserts if r carries another type. */
   12.54 +static inline u64 IO_ADDR(struct io_ret r)
   12.55 +{
   12.56 +    assert(IO_ADDR_T == r.type);
   12.57 +    return r.u.a;
   12.58 +}
   12.58 +
   12.59 +/* Unwrap an IO_BLOCK_T result; asserts if r carries another type. */
   12.60 +static inline char *IO_BLOCK(struct io_ret r)
   12.61 +{
   12.62 +    assert(IO_BLOCK_T == r.type);
   12.63 +    return r.u.b;
   12.64 +}
   12.64 +
   12.65 +/* Unwrap an IO_INT_T result; asserts if r carries another type. */
   12.66 +static inline int IO_INT(struct io_ret r)
   12.67 +{
   12.68 +    assert(IO_INT_T == r.type);
   12.69 +    return r.u.i;
   12.70 +}
   12.70 +
   12.71 +
   12.72 +#endif //_BLOCKASYNC_H_
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/tools/blktap/parallax/blockstore.c	Sun Jul 03 22:36:48 2005 +0000
    13.3 @@ -0,0 +1,1350 @@
    13.4 +/**************************************************************************
    13.5 + * 
    13.6 + * blockstore.c
    13.7 + *
    13.8 + * Simple block store interface
    13.9 + *
   13.10 + */
   13.11 + 
   13.12 +#include <fcntl.h>
   13.13 +#include <unistd.h>
   13.14 +#include <stdio.h>
   13.15 +#include <stdlib.h>
   13.16 +#include <string.h>
   13.17 +#include <sys/types.h>
   13.18 +#include <sys/stat.h>
   13.19 +#include <sys/time.h>
   13.20 +#include <stdarg.h>
   13.21 +#include "blockstore.h"
   13.22 +#include <pthread.h>
   13.23 +
   13.24 +//#define BLOCKSTORE_REMOTE
   13.25 +//#define BSDEBUG
   13.26 +
   13.27 +#define RETRY_TIMEOUT 1000000 /* microseconds */
   13.28 +
   13.29 +/*****************************************************************************
   13.30 + * Debugging
   13.31 + */
   13.32 +#ifdef BSDEBUG
   13.33 +/* printf-style debug trace to stderr.  Each message is prefixed with the
   13.34 + * value the calling thread has stored under tid_key, printed as a
   13.35 + * five-digit number.  Compiled to a no-op unless BSDEBUG is defined.
   13.36 + */
   13.37 +void DB(char *format, ...)
   13.38 +{
   13.39 +    va_list args;
   13.40 +    /* The thread-specific value is a void *; go through unsigned long so
   13.41 +     * the pointer is not narrowed directly, and end on unsigned int so the
   13.42 +     * argument matches the %u conversion.
   13.43 +     * NOTE(review): presumably a small thread id -- confirm at the
   13.44 +     * pthread_setspecific() call site. */
   13.45 +    fprintf(stderr, "[%05u] ",
   13.46 +            (unsigned int)(unsigned long)pthread_getspecific(tid_key));
   13.47 +    va_start(args, format);
   13.48 +    vfprintf(stderr, format, args);
   13.49 +    va_end(args);
   13.50 +}
   13.51 +#else
   13.52 +#define DB(format, ...) (void)0
   13.53 +#endif
   13.44 +
   13.45 +#ifdef BLOCKSTORE_REMOTE
   13.46 +
   13.47 +#include <sys/socket.h>
   13.48 +#include <sys/ioctl.h>
   13.49 +#include <netinet/in.h>
   13.50 +#include <netdb.h>
   13.51 +
   13.52 +/*****************************************************************************
   13.53 + * Network state                                                             *
   13.54 + *****************************************************************************/
   13.55 +
   13.56 +/* The individual disk servers we talk to. These will be referenced by
   13.57 + * an integer index into bsservers[].
   13.58 + */
   13.59 +bsserver_t bsservers[MAX_SERVERS];
   13.60 +
   13.61 +/* The cluster map. This is indexed by an integer cluster number.
   13.62 + */
   13.63 +bscluster_t bsclusters[MAX_CLUSTERS];
   13.64 +
   13.65 +/* Local socket.
   13.66 + */
   13.67 +struct sockaddr_in sin_local;
   13.68 +int bssock = 0; /* descriptor passed to sendmsg()/recvmsg() in send_message()/recv_message() */
   13.69 +
   13.70 +/*****************************************************************************
   13.71 + * Notification                                                              *
   13.72 + *****************************************************************************/
   13.73 +
   13.74 +typedef struct pool_thread_t_struct { /* per-thread wake-up channel */
   13.75 +    pthread_mutex_t ptmutex;    /* held while newdata is read or written */
   13.76 +    pthread_cond_t ptcv;        /* signalled by RECV_NOTIFY */
   13.77 +    int newdata;                /* set to 1 by RECV_NOTIFY */
   13.78 +} pool_thread_t;
   13.79 +
   13.80 +pool_thread_t pool_thread[READ_POOL_SIZE+1]; /* indexed by the tid given to RECV_NOTIFY/RECV_AWAIT */
   13.81 +
   13.82 +#define RECV_NOTIFY(tid) { /* flag new data for thread tid and signal its condition variable */ \
   13.83 +    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
   13.84 +    pool_thread[tid].newdata = 1; \
   13.85 +    DB("CV Waking %u", tid); \
   13.86 +    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
   13.87 +    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
   13.88 +/* Block thread tid until RECV_NOTIFY(tid) has flagged new data, then
   13.89 + * consume the flag.  The wait sits in a loop on newdata so that spurious
   13.90 + * wake-ups are ignored, and the flag is cleared on both paths so that one
   13.91 + * notification satisfies exactly one RECV_AWAIT.
   13.92 + */
   13.93 +#define RECV_AWAIT(tid) { \
   13.94 +    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
   13.95 +    if (pool_thread[tid].newdata) { \
   13.96 +        DB("CV Woken %u", tid); \
   13.97 +    } \
   13.98 +    else { \
   13.99 +        DB("CV Waiting %u", tid); \
  13.100 +        while (!pool_thread[tid].newdata) \
  13.101 +            pthread_cond_wait(&(pool_thread[tid].ptcv), \
  13.102 +                              &(pool_thread[tid].ptmutex)); \
  13.103 +    } \
  13.104 +    pool_thread[tid].newdata = 0; \
  13.105 +    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
  13.100 +
  13.101 +/*****************************************************************************
  13.102 + * Message queue management                                                  *
  13.103 + *****************************************************************************/
  13.104 +
  13.105 +/* Protects the queue manipulation critical regions.
  13.106 + */
  13.107 +pthread_mutex_t ptmutex_queue; /* NOTE(review): no initialiser is visible here -- confirm it is initialised before first use */
  13.108 +#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
  13.109 +#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
  13.110 +
  13.111 +pthread_mutex_t ptmutex_recv; /* NOTE(review): presumably serialises receivers; not used in the code visible here -- confirm */
  13.112 +#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
  13.113 +#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
  13.114 +
  13.115 +/* A message queue entry. We allocate one of these for every request we send.
  13.116 + * Asynchronous reply reception also uses one of these.
  13.117 + */
  13.118 +typedef struct bsq_t_struct {
  13.119 +    struct bsq_t_struct *prev;  /* previous entry on the outstanding queue */
  13.120 +    struct bsq_t_struct *next;  /* next entry on the outstanding queue */
  13.121 +    int status;                 /* BSQ_STATUS_* bits; cleared by send_message() */
  13.122 +    int server;                 /* index into bsservers[] */
  13.123 +    int length;                 /* copied from the reply by queuesearch() */
  13.124 +    struct msghdr msghdr;       /* header handed to sendmsg()/recvmsg() */
  13.125 +    struct iovec iov[2];        /* [0] = message header, [1] = optional block */
  13.126 +    int tid;                    /* sender's tid_key value, set by send_message() */
  13.127 +    struct timeval tv_sent;     /* time stamped just before sendmsg() */
  13.128 +    bshdr_t message;            /* message header (luid, operation, flags, id) */
  13.129 +    void *block;                /* optional BLOCK_SIZE payload, NULL if none */
  13.130 +} bsq_t;
  13.131 +
  13.132 +#define BSQ_STATUS_MATCHED 1 /* set in status once queuesearch() pairs the entry with a reply */
  13.133 +
  13.134 +/* Serialises access to luid_cnt. */
  13.135 +pthread_mutex_t ptmutex_luid;
  13.136 +#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
  13.137 +#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
  13.138 +
  13.139 +/* Next id to hand out; the first one issued is 0x1000. */
  13.140 +static u64 luid_cnt = 0x1000ULL;
  13.141 +
  13.142 +/* Return a fresh id: the counter's current value, which is then advanced
  13.143 + * by one.  The read and the update happen under ptmutex_luid.
  13.144 + */
  13.145 +u64 new_luid(void) {
  13.146 +    u64 fresh;
  13.147 +
  13.148 +    ENTER_LUID_CR;
  13.149 +    fresh = luid_cnt;
  13.150 +    luid_cnt = fresh + 1;
  13.151 +    LEAVE_LUID_CR;
  13.152 +
  13.153 +    return fresh;
  13.154 +}
  13.146 +
  13.147 +/* Queue of outstanding requests: a doubly-linked list running from
  13.148 + * bs_head to bs_tail, with bs_qlen counting its entries.
  13.149 + */
  13.150 +bsq_t *bs_head = NULL;
  13.151 +bsq_t *bs_tail = NULL;
  13.152 +int bs_qlen = 0;
  13.153 +
  13.154 +/* Dump the outstanding-request queue to stderr, tagged with msg.
  13.155 + * Takes the queue lock itself; the callers in this file invoke it only
  13.156 + * after LEAVE_QUEUE_CR.
  13.157 + */
  13.158 +void queuedebug(char *msg) {
  13.159 +    bsq_t *q;
  13.160 +    ENTER_QUEUE_CR;
  13.161 +    /* Cast each argument to the exact type its conversion expects. */
  13.162 +    fprintf(stderr, "Q: %s len=%u\n", msg, (unsigned int)bs_qlen);
  13.163 +    for (q = bs_head; q; q = q->next) {
  13.164 +        fprintf(stderr, "  luid=%016llx server=%u\n",
  13.165 +                (unsigned long long)q->message.luid,
  13.166 +                (unsigned int)q->server);
  13.167 +    }
  13.168 +    LEAVE_QUEUE_CR;
  13.169 +}
  13.165 +
  13.166 +/* Append qe to the tail of the outstanding-request queue.
  13.167 + * Always returns 0.
  13.168 + */
  13.169 +int enqueue(bsq_t *qe) {
  13.170 +    ENTER_QUEUE_CR;
  13.171 +    qe->prev = bs_tail;
  13.172 +    qe->next = NULL;
  13.173 +    if (bs_head != NULL)
  13.174 +        bs_tail->next = qe;     /* link behind the current tail */
  13.175 +    else
  13.176 +        bs_head = qe;           /* first element of an empty queue */
  13.177 +    bs_tail = qe;
  13.178 +    ++bs_qlen;
  13.179 +    LEAVE_QUEUE_CR;
  13.180 +#ifdef BSDEBUG
  13.181 +    queuedebug("enqueue");
  13.182 +#endif
  13.183 +    return 0;
  13.184 +}
  13.182 +
  13.183 +/* Remove qe from the outstanding-request queue if it is on it.
  13.184 + * The queue is walked to confirm membership before anything is unlinked.
  13.185 + * Returns 1 if qe was found and removed, 0 if it was not on the queue.
  13.186 + */
  13.187 +int dequeue(bsq_t *qe) {
  13.188 +    bsq_t *q;
  13.189 +    int found = 0;
  13.190 +
  13.191 +    ENTER_QUEUE_CR;
  13.192 +    for (q = bs_head; q; q = q->next) {
  13.193 +        if (q != qe)
  13.194 +            continue;
  13.195 +
  13.196 +        /* Unlink, fixing up head/tail when qe is at either end. */
  13.197 +        if (q->prev)
  13.198 +            q->prev->next = q->next;
  13.199 +        else
  13.200 +            bs_head = q->next;
  13.201 +        if (q->next)
  13.202 +            q->next->prev = q->prev;
  13.203 +        else
  13.204 +            bs_tail = q->prev;
  13.205 +        bs_qlen--;
  13.206 +        found = 1;
  13.207 +        break;
  13.208 +    }
  13.209 +    LEAVE_QUEUE_CR;
  13.210 +
  13.211 +#ifdef BSDEBUG
  13.212 +    /* Report the actual outcome on both paths. */
  13.213 +    queuedebug(found ? "dequeue found" : "dequeue not found");
  13.214 +#endif
  13.215 +    return found;
  13.216 +}
  13.214 +
  13.215 +/* Pair a received reply (qe) with the outstanding request it answers.
  13.216 + *
  13.217 + * A request matches when its server, operation and luid all equal the
  13.218 + * reply's.  The first match takes over the reply's length, flags and id,
  13.219 + * is marked BSQ_STATUS_MATCHED, is unlinked from the queue and is
  13.220 + * returned.  For a BSOP_READBLOCK request whose own flags do not carry
  13.221 + * BSOP_FLAG_ERROR, the reply's block buffer is handed over as well and
  13.222 + * qe->block is cleared.  Returns NULL when nothing matches.
  13.223 + * NOTE(review): the error test reads the request's flags before the
  13.224 + * reply's flags are copied in -- confirm that is the intended side.
  13.225 + */
  13.226 +bsq_t *queuesearch(bsq_t *qe) {
  13.227 +    bsq_t *q;
  13.228 +
  13.229 +    ENTER_QUEUE_CR;
  13.230 +    for (q = bs_head; q != NULL; q = q->next) {
  13.231 +        if (qe->server != q->server)
  13.232 +            continue;
  13.233 +        if (qe->message.operation != q->message.operation)
  13.234 +            continue;
  13.235 +        if (qe->message.luid != q->message.luid)
  13.236 +            continue;
  13.237 +
  13.238 +        /* Hand the received block over to the request. */
  13.239 +        if ((q->message.operation == BSOP_READBLOCK) &&
  13.240 +            ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
  13.241 +            q->block = qe->block;
  13.242 +            qe->block = NULL;
  13.243 +        }
  13.244 +        q->length = qe->length;
  13.245 +        q->message.flags = qe->message.flags;
  13.246 +        q->message.id = qe->message.id;
  13.247 +        q->status |= BSQ_STATUS_MATCHED;
  13.248 +
  13.249 +        /* Unlink the request and detach it completely. */
  13.250 +        if (q->prev)
  13.251 +            q->prev->next = q->next;
  13.252 +        else
  13.253 +            bs_head = q->next;
  13.254 +        if (q->next)
  13.255 +            q->next->prev = q->prev;
  13.256 +        else
  13.257 +            bs_tail = q->prev;
  13.258 +        q->next = NULL;
  13.259 +        q->prev = NULL;
  13.260 +        bs_qlen--;
  13.261 +        break;
  13.262 +    }
  13.263 +    LEAVE_QUEUE_CR;
  13.264 +
  13.265 +    /* q is the matched request here, or NULL if the walk ran off the end. */
  13.266 +#ifdef BSDEBUG
  13.267 +    if (q != NULL)
  13.268 +        queuedebug("queuesearch found");
  13.269 +    else
  13.270 +        queuedebug("queuesearch not found");
  13.271 +#endif
  13.272 +    return q;
  13.273 +}
  13.261 +
  13.262 +/*****************************************************************************
  13.263 + * Network communication                                                     *
  13.264 + *****************************************************************************/
  13.265 +
  13.266 +/* Stamp qe with a fresh luid, put it on the outstanding-request queue and
  13.267 + * transmit it to bsservers[qe->server] with MSG_DONTWAIT.
  13.268 + *
  13.269 + * The datagram is the message header (MSGBUFSIZE_ID bytes) followed, when
  13.270 + * qe->block is set, by BLOCK_SIZE bytes of payload.  Returns the
  13.271 + * sendmsg() result, or -1 if the request could not be queued.
  13.272 + * NOTE(review): when sendmsg() fails, qe is left on the queue -- confirm
  13.273 + * that callers expect to dequeue it themselves.
  13.274 + */
  13.275 +int send_message(bsq_t *qe) {
  13.276 +    struct msghdr *mh = &(qe->msghdr);
  13.277 +
  13.278 +    /* Scatter/gather list: header first, optional block second. */
  13.279 +    qe->iov[0].iov_base = (void *)&(qe->message);
  13.280 +    qe->iov[0].iov_len = MSGBUFSIZE_ID;
  13.281 +    if (qe->block) {
  13.282 +        qe->iov[1].iov_base = qe->block;
  13.283 +        qe->iov[1].iov_len = BLOCK_SIZE;
  13.284 +    }
  13.285 +
  13.286 +    mh->msg_name = (void *)&(bsservers[qe->server].sin);
  13.287 +    mh->msg_namelen = sizeof(struct sockaddr_in);
  13.288 +    mh->msg_iov = qe->iov;
  13.289 +    mh->msg_iovlen = qe->block ? 2 : 1;
  13.290 +    mh->msg_control = NULL;
  13.291 +    mh->msg_controllen = 0;
  13.292 +    mh->msg_flags = 0;
  13.293 +
  13.294 +    qe->message.luid = new_luid();
  13.295 +    qe->status = 0;
  13.296 +    qe->tid = (int)pthread_getspecific(tid_key);
  13.297 +
  13.298 +    /* The request goes on the queue before it goes on the wire. */
  13.299 +    if (enqueue(qe) < 0) {
  13.300 +        fprintf(stderr, "Error enqueuing request.\n");
  13.301 +        return -1;
  13.302 +    }
  13.303 +
  13.304 +    gettimeofday(&(qe->tv_sent), NULL);
  13.305 +    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
  13.306 +    return sendmsg(bssock, mh, MSG_DONTWAIT);
  13.307 +}
  13.308 +
  13.309 +/* Receive one datagram from bssock into qe.  The first MSGBUFSIZE_ID
  13.310 + * bytes land in qe->message; when qe->block is set, up to BLOCK_SIZE
  13.311 + * further bytes land in qe->block.  Returns the recvmsg() result.
  13.312 + * NOTE(review): msg_name is pointed at a local variable, so
  13.313 + * qe->msghdr.msg_name dangles once this function returns -- confirm that
  13.314 + * no caller reads the sender address through it afterwards.
  13.315 + */
  13.316 +int recv_message(bsq_t *qe) {
  13.317 +    struct sockaddr_in from;
  13.318 +    struct msghdr *mh = &(qe->msghdr);
  13.319 +
  13.320 +    /* Scatter/gather list: header first, optional block second. */
  13.321 +    qe->iov[0].iov_base = (void *)&(qe->message);
  13.322 +    qe->iov[0].iov_len = MSGBUFSIZE_ID;
  13.323 +    if (qe->block) {
  13.324 +        qe->iov[1].iov_base = qe->block;
  13.325 +        qe->iov[1].iov_len = BLOCK_SIZE;
  13.326 +    }
  13.327 +
  13.328 +    mh->msg_name = &from;
  13.329 +    mh->msg_namelen = sizeof(struct sockaddr_in);
  13.330 +    mh->msg_iov = qe->iov;
  13.331 +    mh->msg_iovlen = qe->block ? 2 : 1;
  13.332 +    mh->msg_control = NULL;
  13.333 +    mh->msg_controllen = 0;
  13.334 +    mh->msg_flags = 0;
  13.335 +
  13.336 +    return recvmsg(bssock, mh, 0);
  13.337 +}
  13.338 +
  13.339 +/*
  13.340 + * get_server_number: find the bsservers[] slot that matches an address.
  13.341 + *   @sin: address to look up
  13.342 + *
  13.343 + *   @return: index of the first configured server whose family, port and
  13.344 + *            address all equal @sin's, or -1 when none matches
  13.345 + */
  13.346 +int get_server_number(struct sockaddr_in *sin) {
  13.347 +    struct sockaddr_in *cand;
  13.348 +    int slot;
  13.349 +
  13.350 +#ifdef BSDEBUG2
  13.351 +    fprintf(stderr,
  13.352 +            "get_server_number(%u.%u.%u.%u/%u)\n",
  13.353 +            (unsigned int)sin->sin_addr.s_addr & 0xff,
  13.354 +            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
  13.355 +            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
  13.356 +            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
  13.357 +            (unsigned int)sin->sin_port);
  13.358 +#endif
  13.359 +
  13.360 +    for (slot = 0; slot < MAX_SERVERS; slot++) {
  13.361 +        /* slots without a hostname are skipped */
  13.362 +        if (!bsservers[slot].hostname)
  13.363 +            continue;
  13.364 +        cand = &(bsservers[slot].sin);
  13.365 +
  13.366 +#ifdef BSDEBUG2
  13.367 +        fprintf(stderr,
  13.368 +                "get_server_number check %u.%u.%u.%u/%u\n",
  13.369 +                (unsigned int)cand->sin_addr.s_addr & 0xff,
  13.370 +                ((unsigned int)cand->sin_addr.s_addr >> 8) & 0xff,
  13.371 +                ((unsigned int)cand->sin_addr.s_addr >> 16) & 0xff,
  13.372 +                ((unsigned int)cand->sin_addr.s_addr >> 24) & 0xff,
  13.373 +                (unsigned int)cand->sin_port);
  13.374 +#endif
  13.375 +        if (sin->sin_family != cand->sin_family)
  13.376 +            continue;
  13.377 +        if (sin->sin_port != cand->sin_port)
  13.378 +            continue;
  13.379 +        if (memcmp((void *)&(sin->sin_addr), (void *)&(cand->sin_addr),
  13.380 +                   sizeof(struct in_addr)) == 0)
  13.381 +            return slot;
  13.382 +    }
  13.383 +
  13.384 +    return -1;
  13.385 +}
  13.375 +
  13.376 +/* Spare BLOCK_SIZE receive buffer: set by recv_recycle_buffer and consumed
  13.377 + * by recv_any; NULL when recv_any has to allocate a new one. */
  13.378 +void *rx_buffer = NULL;
  13.379 +/* The single queue entry that recv_any receives into and returns. */
  13.380 +bsq_t rx_qe;
  13.381 +
  13.382 +/*
  13.383 + * recv_any: receive the next datagram from bssock into rx_qe.
  13.384 + *
  13.385 + *   @return: &rx_qe, with length set to the received byte count and server
  13.386 + *            set to the result of get_server_number() for the sender;
  13.387 + *            NULL if the buffer cannot be allocated or recvmsg() fails
  13.388 + */
  13.389 +bsq_t *recv_any(void) {
  13.390 +    struct sockaddr_in from;
  13.391 +    int rc;
  13.392 +
  13.393 +    DB("ENTER recv_any\n");
  13.394 +
  13.395 +    if (!rx_buffer) {
  13.396 +        rx_buffer = malloc(BLOCK_SIZE);
  13.397 +        if (!rx_buffer) {
  13.398 +            perror("recv_any malloc");
  13.399 +            return NULL;
  13.400 +        }
  13.401 +    }
  13.402 +    /* move the spare buffer into the queue entry */
  13.403 +    rx_qe.block = rx_buffer;
  13.404 +    rx_buffer = NULL;
  13.405 +
  13.406 +    rx_qe.msghdr.msg_name = &from;
  13.407 +    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
  13.408 +    rx_qe.msghdr.msg_iov = rx_qe.iov;
  13.409 +    rx_qe.msghdr.msg_iovlen = 2;
  13.410 +    rx_qe.msghdr.msg_control = NULL;
  13.411 +    rx_qe.msghdr.msg_controllen = 0;
  13.412 +    rx_qe.msghdr.msg_flags = 0;
  13.413 +
  13.414 +    /* iov[0] is the fixed-size message header, iov[1] the block payload */
  13.415 +    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
  13.416 +    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
  13.417 +    rx_qe.iov[1].iov_base = rx_qe.block;
  13.418 +    rx_qe.iov[1].iov_len = BLOCK_SIZE;
  13.419 +
  13.420 +    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
  13.421 +
  13.422 +    /* 'from' dies with this stack frame: do not leave its address behind */
  13.423 +    rx_qe.msghdr.msg_name = NULL;
  13.424 +    rx_qe.msghdr.msg_namelen = 0;
  13.425 +
  13.426 +    if (rc < 0) {
  13.427 +        perror("recv_any");
  13.428 +        /* keep the buffer for the next call; otherwise every failed
  13.429 +         * receive would leak one BLOCK_SIZE allocation */
  13.430 +        rx_buffer = rx_qe.block;
  13.431 +        rx_qe.block = NULL;
  13.432 +        return NULL;
  13.433 +    }
  13.434 +
  13.435 +    rx_qe.length = rc;
  13.436 +    rx_qe.server = get_server_number(&from);
  13.437 +
  13.438 +    DB("recv_any from %d luid=%016llx len=%u\n",
  13.439 +       rx_qe.server, rx_qe.message.luid, rx_qe.length);
  13.440 +
  13.441 +    return &rx_qe;
  13.442 +}
  13.420 +
  13.421 +/*
  13.422 + * recv_recycle_buffer: take the block buffer still attached to a queue entry
  13.423 + * and keep it in rx_buffer as the spare for the next recv_any call.
  13.424 + *   @q: entry to take the buffer from; its block pointer is cleared.
  13.425 + *       Nothing happens if q->block is already NULL.
  13.426 + */
  13.427 +void recv_recycle_buffer(bsq_t *q) {
  13.428 +    void *spare = q->block;
  13.429 +
  13.430 +    if (!spare)
  13.431 +        return;
  13.432 +
  13.433 +    q->block = NULL;
  13.434 +    rx_buffer = spare;
  13.435 +}
  13.427 +
  13.428 +// wait_recv: wait until every request in reqs[] has been matched.
  13.429 +//   @reqs: array of outstanding requests
  13.430 +//   @numreqs: number of entries in reqs
  13.431 +//
  13.432 +//   @return: numreqs, once BSQ_STATUS_MATCHED is set in the status of
  13.433 +//            every request (this function has no failure return)
  13.434 +int wait_recv(bsq_t **reqs, int numreqs) {
  13.435 +    unsigned int all, n;
  13.436 +    int tid = (int)pthread_getspecific(tid_key);
  13.437 +
  13.438 +    DB("ENTER wait_recv %u\n", numreqs);
  13.439 +
  13.440 +    for (;;) {
  13.441 +        // AND all status words together: a bit survives only if it is
  13.442 +        // set in every request
  13.443 +        all = 0xffffffff;
  13.444 +        for (n = 0; n < numreqs; n++)
  13.445 +            all &= reqs[n]->status;
  13.446 +        if ((all & BSQ_STATUS_MATCHED))
  13.447 +            break;
  13.448 +
  13.449 +        // NOTE(review): presumably blocks until receive_loop issues
  13.450 +        // RECV_NOTIFY for this thread id -- confirm against the macro
  13.451 +        RECV_AWAIT(tid);
  13.452 +    }
  13.453 +
  13.454 +    DB("LEAVE wait_recv\n");
  13.455 +    return numreqs;
  13.456 +}
  13.468 +
  13.469 +/*
  13.470 + * retry: resend a request with the message header it already carries and
  13.471 + * restart its timeout by refreshing tv_sent.
  13.472 + *   @qe: request to resend
  13.473 + *
  13.474 + *   @return: 0 on success, the negative sendmsg() result on failure
  13.475 + */
  13.476 +static int retry_count = 0;    /* number of resends attempted so far */
  13.477 +int retry(bsq_t *qe)
  13.478 +{
  13.479 +    int sent;
  13.480 +
  13.481 +    gettimeofday(&(qe->tv_sent), NULL);
  13.482 +    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
  13.483 +    retry_count++;
  13.484 +
  13.485 +    sent = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
  13.486 +    return (sent < 0) ? sent : 0;
  13.487 +}
  13.483 +
  13.484 +/*
  13.485 + * queue_runner: thread body that resends requests which have timed out.
  13.486 + * Once a second it walks the list at bs_head inside the queue critical
  13.487 + * section and calls retry() on every request whose last send is more than
  13.488 + * RETRY_TIMEOUT microseconds old.  Never returns.
  13.489 + *   @arg: unused
  13.490 + */
  13.491 +void *queue_runner(void *arg)
  13.492 +{
  13.493 +    for (;;) {
  13.494 +        struct timeval now;
  13.495 +        long long nowus, sus;
  13.496 +        bsq_t *q;
  13.497 +        int r;
  13.498 +
  13.499 +        sleep(1);
  13.500 +
  13.501 +        gettimeofday(&now, NULL);
  13.502 +        /* widen BEFORE multiplying: tv_sec * 1000000 overflows when long
  13.503 +         * is 32 bits wide */
  13.504 +        nowus = (long long)now.tv_sec * 1000000LL + now.tv_usec;
  13.505 +
  13.506 +        ENTER_QUEUE_CR;
  13.507 +        r = retry_count;    /* remembered so this pass's resends can be reported */
  13.508 +        for (q = bs_head; q; q = q->next) {
  13.509 +            sus = (long long)q->tv_sent.tv_sec * 1000000LL
  13.510 +                + q->tv_sent.tv_usec;
  13.511 +            if ((nowus - sus) > RETRY_TIMEOUT) {
  13.512 +                if (retry(q) < 0) {
  13.513 +                    fprintf(stderr, "Error on sendmsg retry.\n");
  13.514 +                }
  13.515 +            }
  13.516 +        }
  13.517 +        if (r != retry_count) {
  13.518 +            fprintf(stderr, "RETRIES: %u %u\n",
  13.519 +                    (unsigned int)(retry_count - r),
  13.520 +                    (unsigned int)retry_count);
  13.521 +        }
  13.522 +        LEAVE_QUEUE_CR;
  13.523 +    }
  13.524 +}
  13.514 +
  13.515 +/*
  13.516 + * receive_loop: thread body of the receiver.  Forever: receive a datagram
  13.517 + * with recv_any(), look up the matching request with queuesearch(), hand
  13.518 + * the receive buffer back with recv_recycle_buffer() and, if a request
  13.519 + * matched, notify the thread recorded in that request.  Never returns.
  13.520 + *   @arg: unused
  13.521 + */
  13.522 +void *receive_loop(void *arg)
  13.523 +{
  13.524 +    bsq_t *rx, *match;
  13.525 +
  13.526 +    for (;;) {
  13.527 +        rx = recv_any();
  13.528 +        if (rx == NULL) {
  13.529 +            fprintf(stderr, "recv_any error\n");
  13.530 +            continue;
  13.531 +        }
  13.532 +
  13.533 +        match = queuesearch(rx);
  13.534 +        recv_recycle_buffer(rx);
  13.535 +        if (match == NULL) {
  13.536 +            fprintf(stderr, "Unmatched RX\n");
  13.537 +            continue;
  13.538 +        }
  13.539 +
  13.540 +        DB("RX MATCH");
  13.541 +        /* NOTE(review): match->tid is read after queuesearch() returns;
  13.542 +         * confirm the waiting thread cannot free the request first */
  13.543 +        RECV_NOTIFY(match->tid);
  13.544 +    }
  13.545 +}
  13.546 +pthread_t pthread_recv;
  13.540 +
  13.541 +/*****************************************************************************
  13.542 + * Reading                                                                   *
  13.543 + *****************************************************************************/
  13.544 +
  13.545 +/*
  13.546 + * readblock_indiv: read one block from one server.
  13.547 + *   @server: index of the server to ask
  13.548 + *   @id: block id as understood by that server
  13.549 + *
  13.550 + *   @return: pointer to the block taken from the completed request,
  13.551 + *            NULL on error
  13.552 + */
  13.553 +void *readblock_indiv(int server, u64 id) {
  13.554 +    void *block;
  13.555 +    bsq_t *qe;
  13.556 +
  13.557 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  13.558 +    if (!qe) {
  13.559 +        perror("readblock qe malloc");
  13.560 +        return NULL;
  13.561 +    }
  13.562 +
  13.563 +    /* The request carries no buffer of its own.
  13.564 +     * NOTE(review): the reply's block is presumably attached to qe->block
  13.565 +     * by queuesearch() -- confirm. */
  13.566 +    qe->block = NULL;
  13.567 +    qe->server = server;
  13.568 +
  13.569 +    qe->message.operation = BSOP_READBLOCK;
  13.570 +    qe->message.flags = 0;
  13.571 +    qe->message.id = id;
  13.572 +    qe->length = MSGBUFSIZE_ID;
  13.573 +
  13.574 +    if (send_message(qe) < 0) {
  13.575 +        perror("readblock sendto");
  13.576 +        goto err;
  13.577 +    }
  13.578 +
  13.579 +    if (wait_recv(&qe, 1) < 0) {
  13.580 +        perror("readblock recv");
  13.581 +        goto err;
  13.582 +    }
  13.583 +
  13.584 +    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
  13.585 +        fprintf(stderr, "readblock server error\n");
  13.586 +        goto err;
  13.587 +    }
  13.588 +    if (qe->length < MSGBUFSIZE_BLOCK) {
  13.589 +        /* report the length that was actually tested */
  13.590 +        fprintf(stderr, "readblock recv short (%u)\n",
  13.591 +                (unsigned int)qe->length);
  13.592 +        goto err;
  13.593 +    }
  13.594 +
  13.595 +    block = qe->block;
  13.596 +
  13.597 +    free((void *)qe);
  13.598 +    return block;
  13.599 +
  13.600 +    err:
  13.601 +    /* As in writeblock's error path: take the request off the queue before
  13.602 +     * freeing it, so the queue never keeps a pointer to freed memory. */
  13.603 +    dequeue(qe);
  13.604 +    if (qe->block)
  13.605 +        free(qe->block);
  13.606 +    free((void *)qe);
  13.607 +    return NULL;
  13.608 +}
  13.615 +
  13.616 +/**
  13.617 + * readblock: read a block from disk
  13.618 + *   @id: block id to read
  13.619 + *
  13.620 + *   Ids below 6 are always read from the first replica; other reads rotate
  13.621 + *   round-robin over the replicas of the id's cluster.
  13.622 + *
  13.623 + *   @return: pointer to block, NULL on error
  13.624 + */
  13.625 +void *readblock(u64 id) {
  13.626 +    int map = (int)BSID_MAP(id);
  13.627 +    u64 xid;
  13.628 +    static int last = CLUSTER_MAX_REPLICAS - 1;  /* replica used last time */
  13.629 +    int rep;
  13.630 +    void *block = NULL;
  13.631 +
  13.632 +    /* special case for the "superblock" just use the first block on the
  13.633 +     * first replica. (extend to blocks < 6 for vdi bug)
  13.634 +     */
  13.635 +    if (id < 6) {
  13.636 +        block = readblock_indiv(bsclusters[map].servers[0], id);
  13.637 +        goto out;
  13.638 +    }
  13.639 +
  13.640 +    /* Advance the round-robin in a local copy.  'last' is shared by every
  13.641 +     * caller; working on a range-checked local guarantees that the index
  13.642 +     * used below is valid even if another thread updates 'last' meanwhile. */
  13.643 +    rep = last + 1;
  13.644 +    if (rep < 0 || rep >= CLUSTER_MAX_REPLICAS)
  13.645 +        rep = 0;
  13.646 +    last = rep;
  13.647 +
  13.648 +    switch (rep) {
  13.649 +    case 1:
  13.650 +        xid = BSID_REPLICA1(id);
  13.651 +        break;
  13.652 +    case 2:
  13.653 +        xid = BSID_REPLICA2(id);
  13.654 +        break;
  13.655 +    default:
  13.656 +        /* only BSID_REPLICA0..2 are used here; anything else falls back to
  13.657 +         * replica 0 so that xid is never left uninitialised */
  13.658 +        rep = 0;
  13.659 +        xid = BSID_REPLICA0(id);
  13.660 +        break;
  13.661 +    }
  13.662 +
  13.663 +    block = readblock_indiv(bsclusters[map].servers[rep], xid);
  13.664 +
  13.665 +    out:
  13.666 +#ifdef BSDEBUG
  13.667 +    if (block)
  13.668 +        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  13.669 +                id,
  13.670 +                (unsigned int)((unsigned char *)block)[0],
  13.671 +                (unsigned int)((unsigned char *)block)[1],
  13.672 +                (unsigned int)((unsigned char *)block)[2],
  13.673 +                (unsigned int)((unsigned char *)block)[3],
  13.674 +                (unsigned int)((unsigned char *)block)[4],
  13.675 +                (unsigned int)((unsigned char *)block)[5],
  13.676 +                (unsigned int)((unsigned char *)block)[6],
  13.677 +                (unsigned int)((unsigned char *)block)[7]);
  13.678 +    else
  13.679 +        fprintf(stderr, "READ:  %016llx NULL\n", id);
  13.680 +#endif
  13.681 +    return block;
  13.682 +}
  13.671 +
  13.672 +/*****************************************************************************
  13.673 + * Writing                                                                   *
  13.674 + *****************************************************************************/
  13.675 +
  13.676 +/*
  13.677 + * writeblock_indiv: send a write request for one block to one server.
  13.678 + *   @server: index of the server to write to
  13.679 + *   @id: block id as understood by that server
  13.680 + *   @block: data to write; the request only stores this pointer
  13.681 + *
  13.682 + *   @return: the outstanding request (the caller waits for it and frees
  13.683 + *            it), NULL on error
  13.684 + */
  13.685 +bsq_t *writeblock_indiv(int server, u64 id, void *block) {
  13.686 +    bsq_t *qe;
  13.687 +
  13.688 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  13.689 +    if (!qe) {
  13.690 +        perror("writeblock qe malloc");
  13.691 +        return NULL;
  13.692 +    }
  13.693 +    qe->server = server;
  13.694 +
  13.695 +    qe->message.operation = BSOP_WRITEBLOCK;
  13.696 +    qe->message.flags = 0;
  13.697 +    qe->message.id = id;
  13.698 +    qe->block = block;
  13.699 +    qe->length = MSGBUFSIZE_BLOCK;
  13.700 +
  13.701 +    if (send_message(qe) < 0) {
  13.702 +        perror("writeblock sendto");
  13.703 +        /* send_message() may have queued the request before failing; take
  13.704 +         * it off the queue so no pointer to the freed request remains */
  13.705 +        dequeue(qe);
  13.706 +        free((void *)qe);
  13.707 +        return NULL;
  13.708 +    }
  13.709 +
  13.710 +    return qe;
  13.711 +}
  13.706 +    
  13.707 +
  13.708 +/**
  13.709 + * writeblock: write an existing block to disk
  13.710 + *   @id: block id
  13.711 + *   @block: pointer to block
  13.712 + *
  13.713 + *   Ids below 6 are written to the first replica only; every other id is
  13.714 + *   written to all three replicas of its cluster.
  13.715 + *
  13.716 + *   @return: zero on success, -1 on failure
  13.717 + */
  13.718 +int writeblock(u64 id, void *block) {
  13.719 +
  13.720 +    int map = (int)BSID_MAP(id);
  13.721 +    int rep0 = bsclusters[map].servers[0];
  13.722 +    int rep1 = bsclusters[map].servers[1];
  13.723 +    int rep2 = bsclusters[map].servers[2];
  13.724 +    bsq_t *reqs[3];
  13.725 +    int nreqs, i;
  13.726 +
  13.727 +    reqs[0] = reqs[1] = reqs[2] = NULL;
  13.728 +
  13.729 +#ifdef BSDEBUG
  13.730 +    fprintf(stderr,
  13.731 +            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  13.732 +            id,
  13.733 +            (unsigned int)((unsigned char *)block)[0],
  13.734 +            (unsigned int)((unsigned char *)block)[1],
  13.735 +            (unsigned int)((unsigned char *)block)[2],
  13.736 +            (unsigned int)((unsigned char *)block)[3],
  13.737 +            (unsigned int)((unsigned char *)block)[4],
  13.738 +            (unsigned int)((unsigned char *)block)[5],
  13.739 +            (unsigned int)((unsigned char *)block)[6],
  13.740 +            (unsigned int)((unsigned char *)block)[7]);
  13.741 +#endif
  13.742 +
  13.743 +    if (id < 6) {
  13.744 +        /* special case for the "superblock" just use the first block on
  13.745 +         * the first replica. (extend to blocks < 6 for vdi bug)
  13.746 +         */
  13.747 +        reqs[0] = writeblock_indiv(rep0, id, block);
  13.748 +        if (!reqs[0])
  13.749 +            return -1;
  13.750 +        nreqs = 1;
  13.751 +    } else {
  13.752 +        reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
  13.753 +        if (!reqs[0])
  13.754 +            goto err;
  13.755 +        reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
  13.756 +        if (!reqs[1])
  13.757 +            goto err;
  13.758 +        reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
  13.759 +        if (!reqs[2])
  13.760 +            goto err;
  13.761 +        nreqs = 3;
  13.762 +    }
  13.763 +
  13.764 +    /* Both paths share the checks and cleanup below, so the single-replica
  13.765 +     * path no longer leaks its request, no longer ignores a server error,
  13.766 +     * and returns 0 rather than wait_recv()'s request count. */
  13.767 +    if (wait_recv(reqs, nreqs) < 0) {
  13.768 +        perror("writeblock recv");
  13.769 +        goto err;
  13.770 +    }
  13.771 +    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
  13.772 +        fprintf(stderr, "writeblock server0 error\n");
  13.773 +        goto err;
  13.774 +    }
  13.775 +    if (nreqs > 1 && (reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
  13.776 +        fprintf(stderr, "writeblock server1 error\n");
  13.777 +        goto err;
  13.778 +    }
  13.779 +    if (nreqs > 2 && (reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
  13.780 +        fprintf(stderr, "writeblock server2 error\n");
  13.781 +        goto err;
  13.782 +    }
  13.783 +
  13.784 +    for (i = 0; i < nreqs; i++)
  13.785 +        free((void *)reqs[i]);
  13.786 +    return 0;
  13.787 +
  13.788 +    err:
  13.789 +    /* drop whatever was issued: off the queue first, then free */
  13.790 +    for (i = 0; i < 3; i++) {
  13.791 +        if (reqs[i]) {
  13.792 +            dequeue(reqs[i]);
  13.793 +            free((void *)reqs[i]);
  13.794 +        }
  13.795 +    }
  13.796 +    return -1;
  13.797 +}
  13.800 +
  13.801 +/*****************************************************************************
  13.802 + * Allocation                                                                *
  13.803 + *****************************************************************************/
  13.804 +
  13.805 +/**
  13.806 + * allocblock: write a new block to disk, with no allocation hint
  13.807 + *   @block: pointer to block
  13.808 + *
  13.809 + *   Equivalent to allocblock_hint(block, 0).
  13.810 + *
  13.811 + *   @return: new id of block on disk
  13.812 + */
  13.813 +u64 allocblock(void *block) {
  13.814 +    const u64 no_hint = 0;
  13.815 +
  13.816 +    return allocblock_hint(block, no_hint);
  13.817 +}
  13.814 +
  13.815 +/*
  13.816 + * allocblock_hint_indiv: send an allocate request for one block to one server.
  13.817 + *   @server: index of the server to ask
  13.818 + *   @block: data for the new block; the request only stores this pointer
  13.819 + *   @hint: allocation hint, sent in the id field of the message
  13.820 + *
  13.821 + *   @return: the outstanding request (the caller waits for it and frees
  13.822 + *            it), NULL on error
  13.823 + */
  13.824 +bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
  13.825 +    bsq_t *qe;
  13.826 +
  13.827 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  13.828 +    if (!qe) {
  13.829 +        perror("allocblock_hint qe malloc");
  13.830 +        return NULL;
  13.831 +    }
  13.832 +    qe->server = server;
  13.833 +
  13.834 +    qe->message.operation = BSOP_ALLOCBLOCK;
  13.835 +    qe->message.flags = 0;
  13.836 +    qe->message.id = hint;
  13.837 +    qe->block = block;
  13.838 +    qe->length = MSGBUFSIZE_BLOCK;
  13.839 +
  13.840 +    if (send_message(qe) < 0) {
  13.841 +        perror("allocblock_hint sendto");
  13.842 +        /* send_message() may have queued the request before failing; take
  13.843 +         * it off the queue so no pointer to the freed request remains */
  13.844 +        dequeue(qe);
  13.845 +        free((void *)qe);
  13.846 +        return NULL;
  13.847 +    }
  13.848 +
  13.849 +    return qe;
  13.850 +}
  13.844 +
  13.845 +/**
  13.846 + * allocblock_hint: write a new block to disk
  13.847 + *   @block: pointer to block
  13.848 + *   @hint: allocation hint; also used as the index of the cluster whose
  13.849 + *          three servers receive the request
  13.850 + *
  13.851 + *   @return: new id of block on disk (built with BSID from the three ids
  13.852 + *            the servers returned), 0 on error
  13.853 + */
  13.854 +u64 allocblock_hint(void *block, u64 hint) {
  13.855 +    static const char *const srv_err[3] = {
  13.856 +        "allocblock server0 error\n",
  13.857 +        "allocblock server1 error\n",
  13.858 +        "allocblock server2 error\n",
  13.859 +    };
  13.860 +    int map = (int)hint;
  13.861 +    bsq_t *reqs[3] = { NULL, NULL, NULL };
  13.862 +    u64 ids[3];
  13.863 +    int k;
  13.864 +
  13.865 +    DB("ENTER allocblock\n");
  13.866 +
  13.867 +    /* one request per replica; stop at the first that cannot be issued */
  13.868 +    for (k = 0; k < 3; k++) {
  13.869 +        reqs[k] = allocblock_hint_indiv(bsclusters[map].servers[k],
  13.870 +                                        block, hint);
  13.871 +        if (!reqs[k])
  13.872 +            goto err;
  13.873 +    }
  13.874 +
  13.875 +    if (wait_recv(reqs, 3) < 0) {
  13.876 +        perror("allocblock recv");
  13.877 +        goto err;
  13.878 +    }
  13.879 +
  13.880 +    /* report the first server (in replica order) that flagged an error */
  13.881 +    for (k = 0; k < 3; k++) {
  13.882 +        if ((reqs[k]->message.flags & BSOP_FLAG_ERROR)) {
  13.883 +            fputs(srv_err[k], stderr);
  13.884 +            goto err;
  13.885 +        }
  13.886 +    }
  13.887 +
  13.888 +    for (k = 0; k < 3; k++)
  13.889 +        ids[k] = reqs[k]->message.id;
  13.890 +
  13.891 +#ifdef BSDEBUG
  13.892 +    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  13.893 +            BSID(map, ids[0], ids[1], ids[2]),
  13.894 +            (unsigned int)((unsigned char *)block)[0],
  13.895 +            (unsigned int)((unsigned char *)block)[1],
  13.896 +            (unsigned int)((unsigned char *)block)[2],
  13.897 +            (unsigned int)((unsigned char *)block)[3],
  13.898 +            (unsigned int)((unsigned char *)block)[4],
  13.899 +            (unsigned int)((unsigned char *)block)[5],
  13.900 +            (unsigned int)((unsigned char *)block)[6],
  13.901 +            (unsigned int)((unsigned char *)block)[7]);
  13.902 +#endif
  13.903 +
  13.904 +    for (k = 0; k < 3; k++)
  13.905 +        free((void *)reqs[k]);
  13.906 +    return BSID(map, ids[0], ids[1], ids[2]);
  13.907 +
  13.908 +    err:
  13.909 +    /* drop whatever was issued: off the queue first, then free */
  13.910 +    for (k = 0; k < 3; k++) {
  13.911 +        if (reqs[k]) {
  13.912 +            dequeue(reqs[k]);
  13.913 +            free((void *)reqs[k]);
  13.914 +        }
  13.915 +    }
  13.916 +    return 0;
  13.917 +}
  13.930 +
  13.931 +#else /* /BLOCKSTORE_REMOTE */
  13.932 +
  13.933 +/*****************************************************************************
  13.934 + * Local storage version                                                     *
  13.935 + *****************************************************************************/
  13.936 + 
  13.937 +/**
  13.938 + * readblock: read a block from disk
  13.939 + *   @id: block id to read; block id is stored at byte offset
  13.940 + *        (id - 1) * BLOCK_SIZE of "blockstore.dat"
  13.941 + *
  13.942 + *   @return: pointer to a malloc'ed block, NULL on error
  13.943 + */
  13.944 +void *readblock(u64 id) {
  13.945 +    void *buf = NULL;
  13.946 +    void *result = NULL;
  13.947 +    int fd;
  13.948 +
  13.949 +    fd = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
  13.950 +    if (fd < 0) {
  13.951 +        perror("open");
  13.952 +        return NULL;
  13.953 +    }
  13.954 +
  13.955 +    /* seek, allocate, read -- the first step that fails ends the chain;
  13.956 +     * the file is closed exactly once below on every path */
  13.957 +    if (lseek64(fd, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  13.958 +        printf ("%Ld ", id);
  13.959 +        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
  13.960 +        perror("readblock lseek");
  13.961 +    } else if ((buf = malloc(BLOCK_SIZE)) == NULL) {
  13.962 +        perror("readblock malloc");
  13.963 +    } else if (read(fd, buf, BLOCK_SIZE) != BLOCK_SIZE) {
  13.964 +        perror("readblock read");
  13.965 +        free(buf);
  13.966 +    } else {
  13.967 +        result = buf;
  13.968 +    }
  13.969 +
  13.970 +    close(fd);
  13.971 +    return result;
  13.972 +}
  13.978 +
  13.979 +/**
  13.980 + * writeblock: write an existing block to disk
  13.981 + *   @id: block id; block id is stored at byte offset
  13.982 + *        (id - 1) * BLOCK_SIZE of "blockstore.dat"
  13.983 + *   @block: pointer to block
  13.984 + *
  13.985 + *   @return: zero on success, -1 on failure (including a short write)
  13.986 + */
  13.987 +int writeblock(u64 id, void *block) {
  13.988 +
  13.989 +    ssize_t written;
  13.990 +    int block_fp;
  13.991 +
  13.992 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  13.993 +
  13.994 +    if (block_fp < 0) {
  13.995 +        perror("open");
  13.996 +        return -1;
  13.997 +    }
  13.998 +
  13.999 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
 13.1000 +        perror("writeblock lseek");
 13.1001 +        goto err;
 13.1002 +    }
 13.1003 +
 13.1004 +    /* A short write leaves a partial block on disk, so anything other than
 13.1005 +     * a full BLOCK_SIZE write is a failure (allocblock applies the same
 13.1006 +     * test to its write). */
 13.1007 +    written = write(block_fp, block, BLOCK_SIZE);
 13.1008 +    if (written < 0) {
 13.1009 +        perror("writeblock write");
 13.1010 +        goto err;
 13.1011 +    }
 13.1012 +    if (written != BLOCK_SIZE) {
 13.1013 +        fprintf(stderr, "writeblock short write (%ld of %d bytes)\n",
 13.1014 +                (long)written, BLOCK_SIZE);
 13.1015 +        goto err;
 13.1016 +    }
 13.1017 +
 13.1018 +    close(block_fp);
 13.1019 +    return 0;
 13.1020 +
 13.1021 +err:
 13.1022 +    close(block_fp);
 13.1023 +    return -1;
 13.1024 +}
 13.1012 +
 13.1013 +/**
 13.1014 + * allocblock: write a new block to disk
 13.1015 + *   @block: pointer to block
 13.1016 + *
 13.1017 + *   The block is appended to "blockstore.dat"; the file size must be a
 13.1018 + *   multiple of BLOCK_SIZE.
 13.1019 + *
 13.1020 + *   @return: new id of block on disk (file offset / BLOCK_SIZE + 1),
 13.1021 + *            0 on error
 13.1022 + */
 13.1023 +
 13.1024 +u64 allocblock(void *block) {
 13.1025 +    u64 newid = 0;      /* stays 0 unless every step succeeds */
 13.1026 +    off64_t end;
 13.1027 +    int fd;
 13.1028 +
 13.1029 +    fd = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
 13.1030 +    if (fd < 0) {
 13.1031 +        perror("open");
 13.1032 +        return 0;
 13.1033 +    }
 13.1034 +
 13.1035 +    end = lseek64(fd, 0, SEEK_END);
 13.1036 +    if (end == (off64_t)-1) {
 13.1037 +        perror("allocblock lseek");
 13.1038 +    } else if (end % BLOCK_SIZE != 0) {
 13.1039 +        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
 13.1040 +    } else if (write(fd, block, BLOCK_SIZE) != BLOCK_SIZE) {
 13.1041 +        perror("allocblock write");
 13.1042 +    } else {
 13.1043 +        newid = end / BLOCK_SIZE + 1;
 13.1044 +    }
 13.1045 +
 13.1046 +    close(fd);
 13.1047 +    return newid;
 13.1048 +}
 13.1055 +
 13.1056 +/**
 13.1057 + * allocblock_hint: write a new block to disk
 13.1058 + *   @block: pointer to block
 13.1059 + *   @hint: allocation hint; not used by this implementation
 13.1060 + *
 13.1061 + *   @return: new id of block on disk, as returned by allocblock
 13.1062 + */
 13.1063 +u64 allocblock_hint(void *block, u64 hint) {
 13.1064 +    u64 newid;
 13.1065 +
 13.1066 +    (void)hint;
 13.1067 +    newid = allocblock(block);
 13.1068 +    return newid;
 13.1069 +}
 13.1066 +
 13.1067 +#endif /* BLOCKSTORE_REMOTE */
 13.1068 +
 13.1069 +/*****************************************************************************
 13.1070 + * Memory management                                                         *
 13.1071 + *****************************************************************************/
 13.1072 +
 13.1073 +/**
 13.1074 + * newblock: get a new in-memory block set to zeros
 13.1075 + *
 13.1076 + *   @return: pointer to a zero-filled block of BLOCK_SIZE bytes,
 13.1077 + *            NULL on error
 13.1078 + */
 13.1079 +void *newblock() {
 13.1080 +    /* calloc hands back zeroed memory: allocation and clearing in one call */
 13.1081 +    void *zeroed = calloc(1, BLOCK_SIZE);
 13.1082 +
 13.1083 +    if (!zeroed)
 13.1084 +        perror("newblock");
 13.1085 +    return zeroed;
 13.1086 +}
 13.1087 +
 13.1088 +
 13.1089 +/**
 13.1090 + * freeblock: unallocate an in-memory block
 13.1091 + *   @block: block to be freed; NULL is accepted and ignored
 13.1092 + */
 13.1093 +void freeblock(void *block) {
 13.1094 +    /* free(NULL) is defined to do nothing, so no explicit test is needed */
 13.1095 +    free(block);
 13.1096 +}
 13.1098 +
 13.1099 +/*
 13.1100 + * new_freeblock: get a new in-memory freelist block with its magic set and
 13.1101 + * an empty id list.
 13.1102 + *
 13.1103 + *   @return: pointer to the new freelist block, NULL if newblock() fails
 13.1104 + */
 13.1105 +static freeblock_t *new_freeblock(void)
 13.1106 +{
 13.1107 +    freeblock_t *fresh = newblock();
 13.1108 +
 13.1109 +    if (fresh != NULL) {
 13.1110 +        fresh->magic = FREEBLOCK_MAGIC;
 13.1111 +        fresh->next  = 0ULL;
 13.1112 +        fresh->count = 0ULL;
 13.1113 +        memset(fresh->list, 0, sizeof fresh->list);
 13.1114 +    }
 13.1115 +
 13.1116 +    return fresh;
 13.1117 +}
 13.1114 +
 13.1115 +/*
 13.1116 + * releaseblock: record a block id on the on-disk freelist.
 13.1117 + *   @id: block id to append to the current freelist block
 13.1118 + *
 13.1119 + * The superblock names the current freelist block and the head of a chain
 13.1120 + * of full ones.  A current block is created if there is none; a full one is
 13.1121 + * chained onto freelist_full and replaced by a new one.  Failures are
 13.1122 + * reported on stderr and the id is then not recorded.
 13.1123 + */
 13.1124 +void releaseblock(u64 id)
 13.1125 +{
 13.1126 +    blockstore_super_t *bs_super;
 13.1127 +    freeblock_t *fl_current = NULL;
 13.1128 +
 13.1129 +    /* get superblock */
 13.1130 +    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
 13.1131 +    if (bs_super == NULL) {
 13.1132 +        fprintf(stderr, "releaseblock: cannot read superblock\n");
 13.1133 +        return;
 13.1134 +    }
 13.1135 +
 13.1136 +    /* get freeblock_current */
 13.1137 +    if (bs_super->freelist_current == 0ULL)
 13.1138 +    {
 13.1139 +        fl_current = new_freeblock();
 13.1140 +        if (fl_current == NULL)
 13.1141 +            goto out;
 13.1142 +        bs_super->freelist_current = allocblock(fl_current);
 13.1143 +        /* allocblock returns 0 on failure */
 13.1144 +        if (bs_super->freelist_current == 0ULL) {
 13.1145 +            fprintf(stderr, "releaseblock: cannot allocate freelist block\n");
 13.1146 +            goto out;
 13.1147 +        }
 13.1148 +        writeblock(BLOCKSTORE_SUPER, bs_super);
 13.1149 +    } else {
 13.1150 +        fl_current = readblock(bs_super->freelist_current);
 13.1151 +        if (fl_current == NULL) {
 13.1152 +            fprintf(stderr, "releaseblock: cannot read freelist block\n");
 13.1153 +            goto out;
 13.1154 +        }
 13.1155 +    }
 13.1156 +
 13.1157 +    /* if full, chain to superblock and allocate new current */
 13.1158 +
 13.1159 +    if (fl_current->count == FREEBLOCK_SIZE) {
 13.1160 +        fl_current->next = bs_super->freelist_full;
 13.1161 +        writeblock(bs_super->freelist_current, fl_current);
 13.1162 +        bs_super->freelist_full = bs_super->freelist_current;
 13.1163 +        freeblock(fl_current);
 13.1164 +        fl_current = new_freeblock();
 13.1165 +        if (fl_current == NULL)
 13.1166 +            goto out;
 13.1167 +        bs_super->freelist_current = allocblock(fl_current);
 13.1168 +        if (bs_super->freelist_current == 0ULL) {
 13.1169 +            fprintf(stderr, "releaseblock: cannot allocate freelist block\n");
 13.1170 +            goto out;
 13.1171 +        }
 13.1172 +        writeblock(BLOCKSTORE_SUPER, bs_super);
 13.1173 +    }
 13.1174 +
 13.1175 +    /* append id to current */
 13.1176 +    fl_current->list[fl_current->count++] = id;
 13.1177 +    if (writeblock(bs_super->freelist_current, fl_current) < 0)
 13.1178 +        fprintf(stderr, "releaseblock: cannot write freelist block\n");
 13.1179 +
 13.1180 +out:
 13.1181 +    /* freeblock accepts NULL, so this is safe on every path */
 13.1182 +    freeblock(fl_current);
 13.1183 +    freeblock(bs_super);
 13.1184 +}
 13.1154 +
 13.1155 +/* freelist debug functions: */
 13.1156 +
 13.1157 +/*
 13.1158 + * freelist_count: print how many ids are on the freelist.
 13.1159 + *   @print_each: when 1, also print every id
 13.1160 + *
 13.1161 + * Counts the current freelist block first, then follows the chain of full
 13.1162 + * blocks starting at freelist_full.  The total is only printed when that
 13.1163 + * chain is non-empty.
 13.1164 + */
 13.1165 +void freelist_count(int print_each)
 13.1166 +{
 13.1167 +    blockstore_super_t *bs_super;
 13.1168 +    freeblock_t *fb;
 13.1169 +    u64 total = 0, next;
 13.1170 +    int i;
 13.1171 +
 13.1172 +    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
 13.1173 +    if (bs_super == NULL) {
 13.1174 +        fprintf(stderr, "freelist_count: cannot read superblock\n");
 13.1175 +        return;
 13.1176 +    }
 13.1177 +
 13.1178 +    if (bs_super->freelist_current == 0ULL) {
 13.1179 +        printf("freelist is empty!\n");
 13.1180 +        goto out;
 13.1181 +    }
 13.1182 +
 13.1183 +    fb = readblock(bs_super->freelist_current);
 13.1184 +    if (fb == NULL) {
 13.1185 +        fprintf(stderr, "freelist_count: cannot read freelist block\n");
 13.1186 +        goto out;
 13.1187 +    }
 13.1188 +    printf("%Ld entires on current.\n", fb->count);
 13.1189 +    total += fb->count;
 13.1190 +    if (print_each == 1)
 13.1191 +    {
 13.1192 +        for (i=0; i< fb->count; i++)
 13.1193 +            printf("  %Ld\n", fb->list[i]);
 13.1194 +    }
 13.1195 +
 13.1196 +    freeblock(fb);
 13.1197 +
 13.1198 +    if (bs_super->freelist_full == 0ULL) {
 13.1199 +        printf("freelist_full is empty!\n");
 13.1200 +        goto out;
 13.1201 +    }
 13.1202 +
 13.1203 +    /* walk the chain of full blocks until a block's next id is 0 */
 13.1204 +    for (next = bs_super->freelist_full; next != 0ULL; ) {
 13.1205 +        fb = readblock(next);
 13.1206 +        if (fb == NULL) {
 13.1207 +            fprintf(stderr, "freelist_count: cannot read freelist block\n");
 13.1208 +            goto out;
 13.1209 +        }
 13.1210 +        total += fb->count;
 13.1211 +        if (print_each == 1)
 13.1212 +        {
 13.1213 +            for (i=0; i< fb->count; i++)
 13.1214 +                printf("  %Ld\n", fb->list[i]);
 13.1215 +        }
 13.1216 +        next = fb->next;
 13.1217 +        freeblock(fb);
 13.1218 +    }
 13.1219 +    printf("Total of %Ld ids on freelist.\n", total);
 13.1220 +
 13.1221 +out:
 13.1222 +    /* the superblock copy was allocated by readblock: release it on every
 13.1223 +     * path */
 13.1224 +    freeblock(bs_super);
 13.1225 +}
 13.1202 +
 13.1203 +/*****************************************************************************
 13.1204 + * Initialisation                                                            *
 13.1205 + *****************************************************************************/
 13.1206 +
 13.1207 +int __init_blockstore(void)
 13.1208 +{
 13.1209 +    int i;
 13.1210 +    blockstore_super_t *bs_super;
 13.1211 +    u64 ret;
 13.1212 +    int block_fp;
 13.1213 +    /* Set up the block store. Returns 0 on success, -1 on failure (a bad superblock exits). */
 13.1214 +#ifdef BLOCKSTORE_REMOTE
 13.1215 +    struct hostent *addr;
 13.1216 +
 13.1217 +    pthread_mutex_init(&ptmutex_queue, NULL);
 13.1218 +    pthread_mutex_init(&ptmutex_luid, NULL);
 13.1219 +    pthread_mutex_init(&ptmutex_recv, NULL);
 13.1220 +    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
 13.1221 +    for (i = 0; i <= READ_POOL_SIZE; i++) { /* inclusive bound: READ_POOL_SIZE + 1 slots */
 13.1222 +        pool_thread[i].newdata = 0;
 13.1223 +        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
 13.1224 +        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
 13.1225 +    }
 13.1226 +    /* Server table: a NULL hostname marks an unused slot (skipped by the loop below). */
 13.1227 +    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
 13.1228 +    bsservers[1].hostname = "planb.cl.cam.ac.uk";
 13.1229 +    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
 13.1230 +    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
 13.1231 +    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
 13.1232 +    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
 13.1233 +    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
 13.1234 +    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
 13.1235 +    bsservers[8].hostname = NULL;
 13.1236 +    bsservers[9].hostname = NULL;
 13.1237 +    bsservers[10].hostname = NULL;
 13.1238 +    bsservers[11].hostname = NULL;
 13.1239 +    bsservers[12].hostname = NULL;
 13.1240 +    bsservers[13].hostname = NULL;
 13.1241 +    bsservers[14].hostname = NULL;
 13.1242 +    bsservers[15].hostname = NULL;
 13.1243 +    /* Resolve each configured server; a failed lookup aborts initialisation. */
 13.1244 +    for (i = 0; i < MAX_SERVERS; i++) {
 13.1245 +        if (!bsservers[i].hostname)
 13.1246 +            continue;
 13.1247 +        addr = gethostbyname(bsservers[i].hostname);
 13.1248 +        if (!addr) {
 13.1249 +            perror("bad hostname");
 13.1250 +            return -1;
 13.1251 +        }
 13.1252 +        bsservers[i].sin.sin_family = addr->h_addrtype;
 13.1253 +        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
 13.1254 +        bsservers[i].sin.sin_addr.s_addr = 
 13.1255 +            ((struct in_addr *)(addr->h_addr))->s_addr;
 13.1256 +    }
 13.1257 +
 13.1258 +    /* Cluster map: three server numbers per cluster (presumably indices
 13.1259 +     * into bsservers[] -- note 3..7 have NULL hostnames above; confirm). */
 13.1260 +    bsclusters[0].servers[0] = 0;
 13.1261 +    bsclusters[0].servers[1] = 1;
 13.1262 +    bsclusters[0].servers[2] = 2;
 13.1263 +    bsclusters[1].servers[0] = 1;
 13.1264 +    bsclusters[1].servers[1] = 2;
 13.1265 +    bsclusters[1].servers[2] = 3;
 13.1266 +    bsclusters[2].servers[0] = 2;
 13.1267 +    bsclusters[2].servers[1] = 3;
 13.1268 +    bsclusters[2].servers[2] = 4;
 13.1269 +    bsclusters[3].servers[0] = 3;
 13.1270 +    bsclusters[3].servers[1] = 4;
 13.1271 +    bsclusters[3].servers[2] = 5;
 13.1272 +    bsclusters[4].servers[0] = 4;
 13.1273 +    bsclusters[4].servers[1] = 5;
 13.1274 +    bsclusters[4].servers[2] = 6;
 13.1275 +    bsclusters[5].servers[0] = 5;
 13.1276 +    bsclusters[5].servers[1] = 6;
 13.1277 +    bsclusters[5].servers[2] = 7;
 13.1278 +    bsclusters[6].servers[0] = 6;
 13.1279 +    bsclusters[6].servers[1] = 7;
 13.1280 +    bsclusters[6].servers[2] = 0;
 13.1281 +    bsclusters[7].servers[0] = 7;
 13.1282 +    bsclusters[7].servers[1] = 0;
 13.1283 +    bsclusters[7].servers[2] = 1;
 13.1284 +
 13.1285 +    /* Local socket set up
 13.1286 +     */
 13.1287 +    bssock = socket(AF_INET, SOCK_DGRAM, 0);
 13.1288 +    if (bssock < 0) {
 13.1289 +        perror("Bad socket");
 13.1290 +        return -1;
 13.1291 +    }
 13.1292 +    memset(&sin_local, 0, sizeof(sin_local));
 13.1293 +    sin_local.sin_family = AF_INET;
 13.1294 +    sin_local.sin_port = htons(BLOCKSTORED_PORT);
 13.1295 +    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
 13.1296 +    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
 13.1297 +        perror("bind");
 13.1298 +        close(bssock);
 13.1299 +        return -1;
 13.1300 +    }
 13.1301 +
 13.1302 +    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
 13.1303 +    pthread_create(&pthread_recv, NULL, queue_runner, NULL); /* NOTE(review): reuses pthread_recv, overwriting the receive_loop handle */
 13.1304 +
 13.1305 +#else /* /BLOCKSTORE_REMOTE */
 13.1306 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
 13.1307 +
 13.1308 +    if (block_fp < 0) {
 13.1309 +        perror("open");
 13.1310 +        return -1;
 13.1311 +        exit(-1); /* unreachable: follows the return above */
 13.1312 +    }
 13.1313 +    /* Empty file: build a superblock and pass it to allocblock(); otherwise check the stored magic. */
 13.1314 +    if (lseek(block_fp, 0, SEEK_END) == 0) {
 13.1315 +        bs_super = newblock();
 13.1316 +        bs_super->magic            = BLOCKSTORE_MAGIC;
 13.1317 +        bs_super->freelist_full    = 0LL;
 13.1318 +        bs_super->freelist_current = 0LL;
 13.1319 +        
 13.1320 +        ret = allocblock(bs_super); /* ret is never examined */
 13.1321 +        
 13.1322 +        freeblock(bs_super);
 13.1323 +    } else {
 13.1324 +        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
 13.1325 +        if (bs_super->magic != BLOCKSTORE_MAGIC)
 13.1326 +        {
 13.1327 +            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
 13.1328 +            exit(-1);
 13.1329 +        }
 13.1330 +        freeblock(bs_super);
 13.1331 +    }
 13.1332 +        
 13.1333 +    close(block_fp);
 13.1334 +        
 13.1335 +#endif /*  BLOCKSTORE_REMOTE */   
 13.1336 +    return 0;
 13.1337 +}
 13.1338 +
 13.1339 +void __exit_blockstore(void)
 13.1340 +{   /* Destroy the pthread objects used under BLOCKSTORE_REMOTE; otherwise does nothing. */
 13.1341 +    int i; /* unused when BLOCKSTORE_REMOTE is not defined */
 13.1342 +#ifdef BLOCKSTORE_REMOTE
 13.1343 +    pthread_mutex_destroy(&ptmutex_recv);
 13.1344 +    pthread_mutex_destroy(&ptmutex_luid);
 13.1345 +    pthread_mutex_destroy(&ptmutex_queue);
 13.1346 +    /*pthread_mutex_destroy(&ptmutex_notify);
 13.1347 +      pthread_cond_destroy(&ptcv_notify);*/
 13.1348 +    for (i = 0; i <= READ_POOL_SIZE; i++) { /* inclusive bound: READ_POOL_SIZE + 1 slots */
 13.1349 +        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
 13.1350 +        pthread_cond_destroy(&(pool_thread[i].ptcv));
 13.1351 +    }
 13.1352 +#endif
 13.1353 +}
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/tools/blktap/parallax/blockstore.h	Sun Jul 03 22:36:48 2005 +0000
    14.3 @@ -0,0 +1,134 @@
    14.4 +/**************************************************************************
    14.5 + * 
    14.6 + * blockstore.h
    14.7 + *
    14.8 + * Simple block store interface
    14.9 + *
   14.10 + */
   14.11 + 
   14.12 +#ifndef __BLOCKSTORE_H__
   14.13 +#define __BLOCKSTORE_H__
   14.14 +
   14.15 +#include <netinet/in.h>
   14.16 +#include <xc.h>
   14.17 +
   14.18 +#define BLOCK_SIZE  4096
   14.19 +#define BLOCK_SHIFT   12
   14.20 +#define BLOCK_MASK  0xfffffffffffff000LL
   14.21 +
   14.22 +/* XXX SMH: where is the below supposed to be defined???? */
   14.23 +#ifndef SECTOR_SHIFT 
   14.24 +#define SECTOR_SHIFT   9 
   14.25 +#endif
   14.26 +
   14.27 +#define FREEBLOCK_SIZE  ((BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))) /* entries in freeblock_t.list[]; parenthesised so it expands safely */
   14.28 +#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
   14.29 +
   14.30 +typedef struct {
   14.31 +    u64 magic;
   14.32 +    u64 next;
   14.33 +    u64 count;
   14.34 +    u64 list[FREEBLOCK_SIZE];
   14.35 +} freeblock_t; 
   14.36 +
   14.37 +#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
   14.38 +#define BLOCKSTORE_SUPER 1ULL
   14.39 +
   14.40 +typedef struct {
   14.41 +    u64 magic;
   14.42 +    u64 freelist_full;
   14.43 +    u64 freelist_current;
   14.44 +} blockstore_super_t;
   14.45 +
   14.46 +extern void *newblock();
   14.47 +extern void *readblock(u64 id);
   14.48 +extern u64 allocblock(void *block);
   14.49 +extern u64 allocblock_hint(void *block, u64 hint);
   14.50 +extern int writeblock(u64 id, void *block);
   14.51 +
   14.52 +/* Add this blockid to a freelist, to be recycled by the allocator. */
   14.53 +extern void releaseblock(u64 id);
   14.54 +
   14.55 +/* this is a memory free() operation for block-sized allocations */
   14.56 +extern void freeblock(void *block);
   14.57 +extern int __init_blockstore(void);
   14.58 +
   14.59 +/* debug for freelist. */
   14.60 +void freelist_count(int print_each);
   14.61 +#define ALLOCFAIL (((u64)(-1)))
   14.62 +
   14.63 +/* Distribution
   14.64 + */
   14.65 +#define BLOCKSTORED_PORT 9346
   14.66 +
   14.67 +struct bshdr_t_struct {
   14.68 +    u32            operation;
   14.69 +    u32            flags;
   14.70 +    u64            id;
   14.71 +    u64            luid;
   14.72 +} __attribute__ ((packed));
   14.73 +typedef struct bshdr_t_struct bshdr_t;
   14.74 +
   14.75 +struct bsmsg_t_struct {
   14.76 +    bshdr_t        hdr;
   14.77 +    unsigned char  block[BLOCK_SIZE];
   14.78 +} __attribute__ ((packed));
   14.79 +
   14.80 +typedef struct bsmsg_t_struct bsmsg_t;
   14.81 +
   14.82 +#define MSGBUFSIZE_OP    sizeof(u32)   /* bytes up to and including hdr.operation */
   14.83 +#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))   /* ... through hdr.flags */
   14.84 +#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64)) /* whole packed bshdr_t, luid included */
   14.85 +#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)   /* header plus one BLOCK_SIZE payload */
   14.86 +
   14.87 +#define BSOP_READBLOCK  0x01
   14.88 +#define BSOP_WRITEBLOCK 0x02
   14.89 +#define BSOP_ALLOCBLOCK 0x03
   14.90 +#define BSOP_FREEBLOCK  0x04
   14.91 +
   14.92 +#define BSOP_FLAG_ERROR 0x01
   14.93 +
   14.94 +#define BS_ALLOC_SKIP 10
   14.95 +#define BS_ALLOC_HACK
   14.96 +
   14.97 +/* Remote hosts and cluster map - XXX need to generalise
   14.98 + */
   14.99 +
  14.100 +/*
  14.101 +
  14.102 +  Interim ID format is
  14.103 +
  14.104 +  63 60 59                40 39                20 19                 0
  14.105 +  +----+--------------------+--------------------+--------------------+
  14.106 +  |map | replica 2          | replica 1          | replica 0          |
  14.107 +  +----+--------------------+--------------------+--------------------+
  14.108 +
  14.109 +  The map is an index into a table detailing which machines form the
  14.110 +  cluster.
  14.111 +
  14.112 + */
  14.113 +
  14.114 +#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
  14.115 +#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
  14.116 +#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
  14.117 +#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
  14.118 +
  14.119 +#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
  14.120 +                                         (((u64)(_rep2))<<40) | \
  14.121 +                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
  14.122 +
  14.123 +typedef struct bsserver_t_struct {
  14.124 +    char              *hostname;
  14.125 +    struct sockaddr_in sin;
  14.126 +} bsserver_t;
  14.127 +
  14.128 +#define MAX_SERVERS 16
  14.129 +
  14.130 +#define CLUSTER_MAX_REPLICAS 3
  14.131 +typedef struct bscluster_t_struct {
  14.132 +    int servers[CLUSTER_MAX_REPLICAS];
  14.133 +} bscluster_t;
  14.134 +
  14.135 +#define MAX_CLUSTERS 16
  14.136 +
  14.137 +#endif /* __BLOCKSTORE_H__ */
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/tools/blktap/parallax/blockstored.c	Sun Jul 03 22:36:48 2005 +0000
    15.3 @@ -0,0 +1,276 @@
    15.4 +/**************************************************************************
    15.5 + * 
    15.6 + * blockstored.c
    15.7 + *
    15.8 + * Block store daemon.
    15.9 + *
   15.10 + */
   15.11 +
   15.12 +#include <fcntl.h>
   15.13 +#include <unistd.h>
   15.14 +#include <stdio.h>
   15.15 +#include <stdlib.h>
   15.16 +#include <string.h>
   15.17 +#include <sys/types.h>
   15.18 +#include <sys/stat.h>
   15.19 +#include <sys/socket.h>
   15.20 +#include <sys/ioctl.h>
   15.21 +#include <netinet/in.h>
   15.22 +#include <errno.h>
   15.23 +#include "blockstore.h"
   15.24 +
   15.25 +//#define BSDEBUG
   15.26 +
   15.27 +int readblock_into(u64 id, void *block);
   15.28 +
   15.29 +int open_socket(u16 port) {
   15.30 +    /* Create a UDP socket bound to `port` on every local address; -1 on failure. */
   15.31 +    struct sockaddr_in local;
   15.32 +    int fd;
   15.33 +
   15.34 +    memset(&local, 0, sizeof(local));
   15.35 +    local.sin_family      = AF_INET;
   15.36 +    local.sin_port        = htons(port);
   15.37 +    local.sin_addr.s_addr = htonl(INADDR_ANY);
   15.38 +
   15.39 +    fd = socket(AF_INET, SOCK_DGRAM, 0);
   15.40 +    if (fd < 0) {
   15.41 +        perror("Bad socket");
   15.42 +        return -1;
   15.43 +    }
   15.44 +    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
   15.45 +        perror("bind");
   15.46 +        close(fd);
   15.47 +        return -1;
   15.48 +    }
   15.49 +    return fd;
   15.50 +}
   15.51 +
   15.52 +static int block_fp = -1;
   15.53 +static int bssock = -1;
   15.54 +
   15.55 +int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
   15.56 +    /* Transmit `len` bytes of `buffer` to `peer` through bssock.
   15.57 +     * Returns 0 on success, 1 when sendto() reports an error. */
   15.58 +
   15.59 +#ifdef BSDEBUG
   15.60 +    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
   15.61 +            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
   15.62 +#endif
   15.63 +
   15.64 +    if (sendto(bssock, buffer, len, 0,
   15.65 +               (struct sockaddr *)peer, sizeof(*peer)) < 0) {
   15.66 +        perror("send_reply");
   15.67 +        return 1;
   15.68 +    }
   15.69 +
   15.70 +    return 0;
   15.71 +}
   15.72 +
   15.73 +static bsmsg_t msgbuf;
   15.74 +
   15.75 +void service_loop(void) {
   15.76 +    /* Serve requests arriving on bssock forever; this function never returns. */
   15.77 +    for (;;) {
   15.78 +        int rc, len;
   15.79 +        struct sockaddr_in from;
   15.80 +        socklen_t slen = sizeof(from);   /* recvfrom() takes socklen_t *, not size_t * */
   15.81 +        u64 bid;
   15.82 +
   15.83 +        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
   15.84 +                       (struct sockaddr *)&from, &slen);
   15.85 +
   15.86 +        if (len < 0) {
   15.87 +            perror("recvfrom");
   15.88 +            continue;
   15.89 +        }
   15.90 +
   15.91 +        if (len < MSGBUFSIZE_OP) {
   15.92 +            fprintf(stderr, "Short packet.\n");
   15.93 +            continue;
   15.94 +        }
   15.95 +
   15.96 +#ifdef BSDEBUG
   15.97 +        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
   15.98 +                len, msgbuf.hdr.operation, msgbuf.hdr.id);
   15.99 +#endif
  15.100 +        /* Each case checks the length it needs; short packets are dropped without a reply. */
  15.101 +        switch (msgbuf.hdr.operation) {
  15.102 +        case BSOP_READBLOCK:
  15.103 +            if (len < MSGBUFSIZE_ID) {
  15.104 +                fprintf(stderr, "Short packet (readblock %u).\n", len);
  15.105 +                continue;
  15.106 +            }
  15.107 +            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
  15.108 +            if (rc < 0) {
  15.109 +                fprintf(stderr, "readblock error\n");
  15.110 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  15.111 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  15.112 +                continue;
  15.113 +            }
  15.114 +            msgbuf.hdr.flags = 0;
  15.115 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
  15.116 +            break;
  15.117 +        case BSOP_WRITEBLOCK:
  15.118 +            if (len < MSGBUFSIZE_BLOCK) {
  15.119 +                fprintf(stderr, "Short packet (writeblock %u).\n", len);
  15.120 +                continue;
  15.121 +            }
  15.122 +            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
  15.123 +            if (rc < 0) {
  15.124 +                fprintf(stderr, "writeblock error\n");
  15.125 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  15.126 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  15.127 +                continue;
  15.128 +            }
  15.129 +            msgbuf.hdr.flags = 0;
  15.130 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  15.131 +            break;
  15.132 +        case BSOP_ALLOCBLOCK:
  15.133 +            if (len < MSGBUFSIZE_BLOCK) {
  15.134 +                fprintf(stderr, "Short packet (allocblock %u).\n", len);
  15.135 +                continue;
  15.136 +            }
  15.137 +            bid = allocblock(msgbuf.block);
  15.138 +            if (bid == ALLOCFAIL) {
  15.139 +                fprintf(stderr, "allocblock error\n");
  15.140 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  15.141 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  15.142 +                continue;
  15.143 +            }
  15.144 +            msgbuf.hdr.id = bid;
  15.145 +            msgbuf.hdr.flags = 0;
  15.146 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  15.147 +            break;
  15.148 +        }
  15.149 +        /* any other operation code falls through here and gets no reply */
  15.150 +    }
  15.151 +}
  15.152 + 
  15.153 +/**
  15.154 + * readblock_into: read a block from disk into a caller-supplied buffer
  15.155 + *   @id: block id to read
  15.156 + *   @block: pointer to buffer to receive block
  15.157 + *
  15.158 + *   @return: 0 if OK, -1 on error
  15.159 + */
  15.160 +
  15.161 +int readblock_into(u64 id, void *block) {
  15.162 +    off64_t offset = ((off64_t) id - 1LL) * BLOCK_SIZE;   /* id N lives at byte (N-1)*BLOCK_SIZE */
  15.163 +    if (lseek64(block_fp, offset, SEEK_SET) < 0) {
  15.164 +        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
  15.165 +        perror("readblock lseek");
  15.166 +        return -1;
  15.167 +    }
  15.168 +    if (read(block_fp, block, BLOCK_SIZE) == BLOCK_SIZE)   /* only a full block counts */
  15.169 +        return 0;
  15.170 +    perror("readblock read");
  15.171 +    return -1;
  15.172 +}
  15.173 +
  15.174 +/**
  15.175 + * writeblock: write an existing block to disk
  15.176 + *   @id: block id
  15.177 + *   @block: pointer to block
  15.178 + *
  15.179 + *   @return: zero on success, -1 on failure
  15.180 + */
  15.181 +int writeblock(u64 id, void *block) {   /* overwrite block `id` in place; 0 on success, -1 on failure */
  15.182 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  15.183 +        perror("writeblock lseek");
  15.184 +        return -1;
  15.185 +    }
  15.186 +    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {   /* a short write is a failure too */
  15.187 +        perror("writeblock write");
  15.188 +        return -1;
  15.189 +    }
  15.190 +    return 0;
  15.191 +}
  15.192 +
  15.193 +/**
  15.194 + * allocblock: write a new block to disk
  15.195 + *   @block: pointer to block
  15.196 + *
  15.197 + *   @return: new id of block on disk, or ALLOCFAIL on error
  15.198 + */
  15.199 +static u64 lastblock = 0;
  15.200 +
  15.201 +u64 allocblock(void *block) {
  15.202 +    u64 lb;
  15.203 +    off64_t pos;
  15.204 +    /* Append one block at end-of-file; returns its 1-based id, or ALLOCFAIL on error. */
  15.205 +    retry:
  15.206 +    pos = lseek64(block_fp, 0, SEEK_END);
  15.207 +    if (pos == (off64_t)-1) {
  15.208 +        perror("allocblock lseek");
  15.209 +        return ALLOCFAIL;
  15.210 +    }
  15.211 +    if (pos % BLOCK_SIZE != 0) {
  15.212 +        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
  15.213 +        return ALLOCFAIL;
  15.214 +    }
  15.215 +    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  15.216 +        perror("allocblock write");
  15.217 +        return ALLOCFAIL;
  15.218 +    }
  15.219 +    lb = pos / BLOCK_SIZE + 1;
  15.220 +    /* With BS_ALLOC_HACK, keep appending copies until the id reaches BS_ALLOC_SKIP. */
  15.221 +#ifdef BS_ALLOC_HACK
  15.222 +    if (lb < BS_ALLOC_SKIP)
  15.223 +        goto retry;
  15.224 +#endif
  15.225 +    /* Warn when the new id is not greater than the previous one handed out. */
  15.226 +    if (lb <= lastblock)
  15.227 +        printf("[*** %Ld alredy allocated! ***]\n", lb);
  15.228 +    
  15.229 +    lastblock = lb;
  15.230 +    return lb;
  15.231 +}
  15.232 +
  15.233 +/**
  15.234 + * newblock: get a new in-memory block set to zeros
  15.235 + *
  15.236 + *   @return: pointer to new block, NULL on error
  15.237 + */
  15.238 +void *newblock() {
  15.239 +    void *fresh = malloc(BLOCK_SIZE);   /* one block-sized buffer, zeroed below */
  15.240 +    if (fresh != NULL) {
  15.241 +        memset(fresh, 0, BLOCK_SIZE);
  15.242 +        return fresh;
  15.243 +    }
  15.244 +    perror("newblock");                 /* allocation failed */
  15.245 +    return NULL;
  15.246 +}
  15.247 +
  15.248 +
  15.249 +/**
  15.250 + * freeblock: unallocate an in-memory block
  15.251 + *   (releases the in-memory buffer only; nothing on disk is touched)
  15.252 + *   @block: block to be freed
  15.253 + */
  15.254 +void freeblock(void *block) {
  15.255 +    if (block == NULL) return;   /* nothing to release */
  15.256 +    free(block);
  15.257 +}
  15.258 +
  15.259 +
  15.260 +int main(int argc, char **argv)
  15.261 +{
  15.262 +    /* Daemon entry: open (creating if needed) blockstore.dat, bind the UDP port, serve. */
  15.263 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  15.264 +    if (block_fp < 0) {
  15.265 +        perror("open");
  15.266 +        return -1;
  15.267 +    }
  15.268 +
  15.269 +    bssock = open_socket(BLOCKSTORED_PORT);
  15.270 +    if (bssock < 0) {
  15.271 +        close(block_fp);   /* don't leak the store descriptor on failure */
  15.272 +        return -1;
  15.273 +    }
  15.274 +
  15.275 +    service_loop();        /* loops forever; the cleanup below is for completeness */
  15.276 +    close(bssock);
  15.277 +    close(block_fp);
  15.278 +    return 0;
  15.279 +}
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/tools/blktap/parallax/bstest.c	Sun Jul 03 22:36:48 2005 +0000
    16.3 @@ -0,0 +1,191 @@
    16.4 +/**************************************************************************
    16.5 + * 
    16.6 + * bstest.c
    16.7 + *
    16.8 + * Block store daemon test program.
    16.9 + *
   16.10 + * usage: bstest <host>|X {r|w|a} ID 
   16.11 + *
   16.12 + */
   16.13 +
   16.14 +#include <fcntl.h>
   16.15 +#include <unistd.h>
   16.16 +#include <stdio.h>
   16.17 +#include <stdlib.h>
   16.18 +#include <string.h>
   16.19 +#include <sys/types.h>
   16.20 +#include <sys/stat.h>
   16.21 +#include <sys/socket.h>
   16.22 +#include <sys/ioctl.h>
   16.23 +#include <netinet/in.h>
   16.24 +#include <netdb.h>
   16.25 +#include <errno.h>
   16.26 +#include "blockstore.h"
   16.27 +
   16.28 +int direct(char *host, u32 op, u64 id, int len) {
   16.29 +    struct sockaddr_in sn, peer;
   16.30 +    int sock, rc;
   16.31 +    bsmsg_t msgbuf;
   16.32 +    socklen_t slen;               /* recvfrom() takes a socklen_t * */
   16.33 +    struct hostent *addr;
   16.34 +    /* Send one zero-filled request (op, id) of len bytes to host and print the reply. */
   16.35 +    addr = gethostbyname(host);
   16.36 +    if (!addr) {
   16.37 +        perror("bad hostname");
   16.38 +        exit(1);
   16.39 +    }
   16.40 +    peer.sin_family = addr->h_addrtype;
   16.41 +    peer.sin_port = htons(BLOCKSTORED_PORT);
   16.42 +    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
   16.43 +    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
   16.44 +            (unsigned int)(unsigned char)addr->h_addr[0],
   16.45 +            (unsigned int)(unsigned char)addr->h_addr[1],
   16.46 +            (unsigned int)(unsigned char)addr->h_addr[2],
   16.47 +            (unsigned int)(unsigned char)addr->h_addr[3]);
   16.48 +
   16.49 +    sock = socket(AF_INET, SOCK_DGRAM, 0);
   16.50 +    if (sock < 0) {
   16.51 +        perror("Bad socket");
   16.52 +        exit(1);
   16.53 +    }
   16.54 +    memset(&sn, 0, sizeof(sn));
   16.55 +    sn.sin_family = AF_INET;
   16.56 +    sn.sin_port = htons(BLOCKSTORED_PORT);
   16.57 +    sn.sin_addr.s_addr = htonl(INADDR_ANY);
   16.58 +    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
   16.59 +        perror("bind");
   16.60 +        close(sock);
   16.61 +        exit(1);
   16.62 +    }
   16.63 +
   16.64 +    memset((void *)&msgbuf, 0, sizeof(msgbuf));
   16.65 +    msgbuf.hdr.operation = op;    /* header fields live in bsmsg_t's hdr member */
   16.66 +    msgbuf.hdr.id = id;
   16.67 +
   16.68 +    rc = sendto(sock, (void *)&msgbuf, len, 0,
   16.69 +                (struct sockaddr *)&peer, sizeof(peer));
   16.70 +    if (rc < 0) {
   16.71 +        perror("sendto");
   16.72 +        exit(1);
   16.73 +    }
   16.74 +
   16.75 +    slen = sizeof(peer);
   16.76 +    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
   16.77 +                   (struct sockaddr *)&peer, &slen);
   16.78 +    if (len < 0) {
   16.79 +        perror("recvfrom");
   16.80 +        exit(1);
   16.81 +    }
   16.82 +    /* Print only the header fields (and first data bytes) the reply is long enough to hold. */
   16.83 +    printf("Reply %u bytes:\n", len);
   16.84 +    if (len >= MSGBUFSIZE_OP)
   16.85 +        printf("  operation: %u\n", msgbuf.hdr.operation);
   16.86 +    if (len >= MSGBUFSIZE_FLAGS)
   16.87 +        printf("  flags: 0x%x\n", msgbuf.hdr.flags);
   16.88 +    if (len >= MSGBUFSIZE_ID)
   16.89 +        printf("  id: %llu\n", msgbuf.hdr.id);
   16.90 +    if (len >= (MSGBUFSIZE_ID + 4))
   16.91 +        printf("  data: %02x %02x %02x %02x...\n",
   16.92 +               (unsigned int)msgbuf.block[0],
   16.93 +               (unsigned int)msgbuf.block[1],
   16.94 +               (unsigned int)msgbuf.block[2],
   16.95 +               (unsigned int)msgbuf.block[3]);
   16.96 +
   16.97 +    if (sock > 0)
   16.98 +        close(sock);
   16.99 +
  16.100 +    return 0;
  16.101 +}
  16.102 +
  16.103 +int main (int argc, char **argv) {
  16.104 +    /* bstest entry point: parse <host>|X, the action letter and an optional ID, then run it. */
  16.105 +    u32 op = 0;
  16.106 +    u64 id = 0;
  16.107 +    int len = 0, rc;
  16.108 +    void *block;
  16.109 +
  16.110 +    if (argc < 3) {
  16.111 +        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
  16.112 +        return 1;
  16.113 +    }
  16.114 +    /* Only the first character of argv[2] selects the operation and request length. */
  16.115 +    switch (argv[2][0]) {
  16.116 +    case 'r':
  16.117 +    case 'R':
  16.118 +        op = BSOP_READBLOCK;
  16.119 +        len = MSGBUFSIZE_ID;
  16.120 +        break;
  16.121 +    case 'w':
  16.122 +    case 'W':
  16.123 +        op = BSOP_WRITEBLOCK;
  16.124 +        len = MSGBUFSIZE_BLOCK;
  16.125 +        break;
  16.126 +    case 'a':
  16.127 +    case 'A':
  16.128 +        op = BSOP_ALLOCBLOCK;
  16.129 +        len = MSGBUFSIZE_BLOCK;
  16.130 +        break;
  16.131 +    default:
  16.132 +        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
  16.133 +        return 1;
  16.134 +    }
  16.135 +
  16.136 +    if (argc >= 4)
  16.137 +        id = atoll(argv[3]);
  16.138 +    /* Host "X" goes through the blockstore library calls; any other host gets a raw UDP request. */
  16.139 +    if (strcmp(argv[1], "X") == 0) {
  16.140 +        rc = __init_blockstore();
  16.141 +        if (rc < 0) {
  16.142 +            fprintf(stderr, "blockstore init failed.\n");
  16.143 +            return 1;
  16.144 +        }
  16.145 +        switch(op) {
  16.146 +        case BSOP_READBLOCK:
  16.147 +            block = readblock(id);
  16.148 +            if (block) { /* a NULL result prints nothing */
  16.149 +                printf("data: %02x %02x %02x %02x...\n",
  16.150 +                       (unsigned int)((unsigned char*)block)[0],
  16.151 +                       (unsigned int)((unsigned char*)block)[1],
  16.152 +                       (unsigned int)((unsigned char*)block)[2],
  16.153 +                       (unsigned int)((unsigned char*)block)[3]);
  16.154 +            }
  16.155 +            break;
  16.156 +        case BSOP_WRITEBLOCK:
  16.157 +            block = malloc(BLOCK_SIZE);
  16.158 +            if (!block) {
  16.159 +                perror("bstest malloc");
  16.160 +                return 1;
  16.161 +            }
  16.162 +            memset(block, 0, BLOCK_SIZE); /* the test always writes an all-zero block */
  16.163 +            rc = writeblock(id, block);
  16.164 +            if (rc != 0) {
  16.165 +                printf("error\n");
  16.166 +            }
  16.167 +            else {
  16.168 +                printf("OK\n");
  16.169 +            }
  16.170 +            break;
  16.171 +        case BSOP_ALLOCBLOCK:
  16.172 +            block = malloc(BLOCK_SIZE);
  16.173 +            if (!block) {
  16.174 +                perror("bstest malloc");
  16.175 +                return 1;
  16.176 +            }
  16.177 +            memset(block, 0, BLOCK_SIZE);
  16.178 +            id = allocblock_hint(block, id); /* the ID argument is passed as the hint */
  16.179 +            if (id == 0) { /* NOTE(review): blockstore.h defines ALLOCFAIL as (u64)-1 -- confirm 0 is the failure value */
  16.180 +                printf("error\n");
  16.181 +            }
  16.182 +            else {
  16.183 +                printf("ID: %llu\n", id);
  16.184 +            }
  16.185 +            break;
  16.186 +        }
  16.187 +    }
  16.188 +    else {
  16.189 +        direct(argv[1], op, id, len);
  16.190 +    }
  16.191 +
  16.192 +
  16.193 +    return 0;
  16.194 +}
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/tools/blktap/parallax/parallax.c	Sun Jul 03 22:36:48 2005 +0000
    17.3 @@ -0,0 +1,611 @@
    17.4 +/**************************************************************************
    17.5 + * 
    17.6 + * parallax.c
    17.7 + *
    17.8 + * The Parallax Storage Server
    17.9 + *
   17.10 + */
   17.11 + 
   17.12 +
   17.13 +#include <stdio.h>
   17.14 +#include <stdlib.h>
   17.15 +#include <string.h>
   17.16 +#include <pthread.h>
   17.17 +#include "blktaplib.h"
   17.18 +#include "blockstore.h"
   17.19 +#include "vdi.h"
   17.20 +#include "block-async.h"
   17.21 +#include "requests-async.h"
   17.22 +
   17.23 +#define PARALLAX_DEV     61440
   17.24 +#define SECTS_PER_NODE   8
   17.25 +
   17.26 +
   17.27 +#if 0
   17.28 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   17.29 +#else
   17.30 +#define DPRINTF(_f, _a...) ((void)0)
   17.31 +#endif
   17.32 +
   17.33 +/* ------[ session records ]----------------------------------------------- */
   17.34 +
   17.35 +#define BLKIF_HASHSZ 1024
   17.36 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
   17.37 +
   17.38 +#define VDI_HASHSZ 16
   17.39 +#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
   17.40 +
   17.41 +typedef struct blkif {
   17.42 +    domid_t       domid;
   17.43 +    unsigned int  handle;
   17.44 +    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
   17.45 +    vdi_t        *vdi_hash[VDI_HASHSZ];
   17.46 +    struct blkif *hash_next;
   17.47 +} blkif_t;
   17.48 +
   17.49 +static blkif_t      *blkif_hash[BLKIF_HASHSZ];
   17.50 +
   17.51 +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
   17.52 +{
   17.53 +    blkif_t *cur;   /* walks the hash chain for (domid, handle) */
   17.54 +    if ( handle != 0 )
   17.55 +        printf("blktap/parallax don't currently support non-0 dev handles!\n");
   17.56 +
   17.57 +    for ( cur = blkif_hash[BLKIF_HASH(domid, handle)]; cur != NULL; cur = cur->hash_next )
   17.58 +        if ( (cur->domid == domid) && (cur->handle == handle) )
   17.59 +            break;
   17.60 +    return cur;     /* NULL when no matching interface exists */
   17.61 +}
   17.62 +
   17.63 +vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
   17.64 +{
   17.65 +    vdi_t *cur;
   17.66 +    /* Scan the bucket chain; NULL is returned if `device` is not attached. */
   17.67 +    for (cur = blkif->vdi_hash[VDI_HASH(device)]; cur != NULL; cur = cur->next)
   17.68 +        if (cur->vdevice == device)
   17.69 +            break;
   17.70 +    return cur;
   17.71 +}
   17.72 +
   17.73 +/* ------[ control message handling ]-------------------------------------- */
   17.74 +
   17.75 +void blkif_create(blkif_be_create_t *create)
   17.76 +{
   17.77 +    domid_t       domid  = create->domid;
   17.78 +    unsigned int  handle = create->blkif_handle;
   17.79 +    blkif_t      *fresh, **slot;
   17.80 +
   17.81 +    DPRINTF("parallax (blkif_create): create is %p\n", create);
   17.82 +    /* Allocate the record up front; the duplicate check happens afterwards. */
   17.83 +    fresh = (blkif_t *)malloc(sizeof(blkif_t));
   17.84 +    if ( fresh == NULL )
   17.85 +    {
   17.86 +        DPRINTF("Could not create blkif: out of memory\n");
   17.87 +        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
   17.88 +        return;
   17.89 +    }
   17.90 +    memset(fresh, 0, sizeof(*fresh));
   17.91 +    fresh->domid  = domid;
   17.92 +    fresh->handle = handle;
   17.93 +    fresh->status = DISCONNECTED;
   17.94 +
   17.95 +    /* Walk to the tail of the bucket, rejecting an existing (domid, handle). */
   17.96 +    for ( slot = &blkif_hash[BLKIF_HASH(domid, handle)];
   17.97 +          *slot != NULL;
   17.98 +          slot = &(*slot)->hash_next )
   17.99 +    {
  17.100 +        if ( ((*slot)->domid != domid) || ((*slot)->handle != handle) )
  17.101 +            continue;
  17.102 +        DPRINTF("Could not create blkif: already exists (%d,%d)\n",
  17.103 +            domid, handle);
  17.104 +        create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
  17.105 +        free(fresh);
  17.106 +        return;
  17.107 +    }
  17.108 +
  17.109 +    /* *slot is the NULL tail link here, so this appends to the chain. */
  17.110 +    fresh->hash_next = *slot;
  17.111 +    *slot = fresh;
  17.112 +    DPRINTF("Successfully created blkif\n");
  17.113 +    create->status = BLKIF_BE_STATUS_OKAY;
  17.114 +}
  17.115 +
  17.116 +void blkif_destroy(blkif_be_destroy_t *destroy)
  17.117 +{
  17.118 +    domid_t       domid  = destroy->domid;
  17.119 +    unsigned int  handle = destroy->blkif_handle;
  17.120 +    blkif_t     **link = &blkif_hash[BLKIF_HASH(domid, handle)];
  17.121 +    blkif_t      *victim;
  17.122 +
  17.123 +    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy);
  17.124 +    /* Find the link that points at the (domid, handle) record, if any. */
  17.125 +    for ( victim = *link; victim != NULL; victim = *link )
  17.126 +    {
  17.127 +        if ( (victim->domid == domid) && (victim->handle == handle) )
  17.128 +            break;
  17.129 +        link = &victim->hash_next;
  17.130 +    }
  17.131 +
  17.132 +    if ( victim == NULL )
  17.133 +    {
  17.134 +        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  17.135 +        return;
  17.136 +    }
  17.137 +    if ( victim->status != DISCONNECTED )
  17.138 +    {
  17.139 +        /* Only a DISCONNECTED interface is torn down. */
  17.140 +        destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
  17.141 +        return;
  17.142 +    }
  17.143 +
  17.144 +    *link = victim->hash_next;   /* unlink from the chain, then release */
  17.145 +    free(victim);
  17.146 +    destroy->status = BLKIF_BE_STATUS_OKAY;
  17.147 +}
  17.148 +
  17.149 +void vbd_create(blkif_be_vbd_create_t *create)
  17.150 +{
  17.151 +    blkif_t            *blkif;
  17.152 +    vdi_t              *vdi, **vdip;
  17.153 +    blkif_vdev_t        vdevice = create->vdevice;
  17.154 +
  17.155 +    DPRINTF("parallax (vbd_create): create=%p\n", create);
  17.156 +    /* Attach the VDI named by create->dev_handle to an existing blkif as `vdevice`. */
  17.157 +    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
  17.158 +    if ( blkif == NULL )
  17.159 +    {
  17.160 +        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
  17.161 +                create->domid, create->blkif_handle); 
  17.162 +        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  17.163 +        return;
  17.164 +    }
  17.165 +
  17.166 +    /* VDI identifier is in create->dev_handle */
  17.167 +    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
  17.168 +            (unsigned long)create->dev_handle);
  17.169 +
  17.170 +    vdi = vdi_get(create->dev_handle);
  17.171 +    if (vdi == NULL)
  17.172 +    {
  17.173 +        printf("parallax (vbd_create): VDI %lx not found.\n",
  17.174 +               (unsigned long)create->dev_handle);
  17.175 +        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
  17.176 +        return;
  17.177 +    }
  17.178 +    /* Append the VDI to the tail of its vdevice hash bucket. */
  17.179 +    vdi->next = NULL;
  17.180 +    vdi->vdevice = vdevice;
  17.181 +    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
  17.182 +    while (*vdip != NULL)
  17.183 +        vdip = &(*vdip)->next;
  17.184 +    *vdip = vdi;
  17.185 +
  17.186 +    DPRINTF("vbd_create succeeded\n");
  17.187 +    create->status = BLKIF_BE_STATUS_OKAY;
  17.188 +}
  17.189 +/* Detach the VDI bound to destroy->vdevice from its blkif; the outcome goes in destroy->status. */
  17.190 +void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
  17.191 +{
  17.192 +    blkif_t            *blkif;
  17.193 +    vdi_t              *vdi, **vdip;
  17.194 +    blkif_vdev_t        vdevice = destroy->vdevice;
  17.195 +
  17.196 +    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
  17.197 +    if ( blkif == NULL )
  17.198 +    {
  17.199 +        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
  17.200 +                destroy->domid, destroy->blkif_handle); 
  17.201 +        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  17.202 +        return;
  17.203 +    }
  17.204 +    /* Find the link that points at the device within its hash chain. */
  17.205 +    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
  17.206 +    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
  17.207 +        vdip = &(*vdip)->next;
  17.208 +    if (*vdip == NULL) {            /* nothing attached under this vdevice */
  17.209 +        destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
  17.210 +        return;
  17.211 +    }
  17.212 +    vdi = *vdip;                    /* unlink first, then hand the VDI back */
  17.213 +    *vdip = vdi->next;
  17.214 +    vdi_put(vdi);
  17.215 +    destroy->status = BLKIF_BE_STATUS_OKAY;   /* every exit now reports a status */
  17.216 +}
  17.217 +/* Control-message hook: dispatch a blkif backend message to its handler.  Always returns 0. */
  17.218 +int parallax_control(control_msg_t *msg)
  17.219 +{
  17.220 +    DPRINTF("parallax_control: msg is %p\n", msg); 
  17.221 +
  17.222 +    /* Only block-interface backend messages are understood here. */
  17.223 +    if (msg->type != CMSG_BLKIF_BE) 
  17.224 +    {
  17.225 +        printf("Unexpected control message (%d)\n", msg->type);
  17.226 +        return 0;
  17.227 +    }
  17.228 +
  17.229 +    /* Each case checks the payload length before casting msg->msg.  */
  17.230 +    /* A case that leaves the switch through "break" is reported as  */
  17.231 +    /* a bad message below; handled cases return directly.           */
  17.232 +    switch(msg->subtype)
  17.233 +    {
  17.234 +    case CMSG_BLKIF_BE_CREATE:
  17.235 +        if ( msg->length != sizeof(blkif_be_create_t) )
  17.236 +            break;
  17.237 +        blkif_create((blkif_be_create_t *)msg->msg);
  17.238 +        return 0;
  17.239 +
  17.240 +    case CMSG_BLKIF_BE_DESTROY:
  17.241 +        if ( msg->length != sizeof(blkif_be_destroy_t) )
  17.242 +            break;
  17.243 +        blkif_destroy((blkif_be_destroy_t *)msg->msg);
  17.244 +        return 0;
  17.245 +
  17.246 +    case CMSG_BLKIF_BE_VBD_CREATE:
  17.247 +        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
  17.248 +            break;
  17.249 +        vbd_create((blkif_be_vbd_create_t *)msg->msg);
  17.250 +        return 0;
  17.251 +
  17.252 +    case CMSG_BLKIF_BE_VBD_DESTROY:
  17.253 +        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
  17.254 +            break;
  17.255 +        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
  17.256 +        return 0;
  17.257 +
  17.258 +    case CMSG_BLKIF_BE_CONNECT:
  17.259 +    case CMSG_BLKIF_BE_DISCONNECT:
  17.260 +        /* we don't manage the device channel, the tap does. */
  17.261 +        return 0;
  17.262 +
  17.263 +    default:
  17.264 +        break;
  17.265 +    }
  17.266 +
  17.267 +    /* Unknown subtype, or a payload of the wrong size. */
  17.268 +    printf("Bad control message!\n");
  17.269 +    return 0;
  17.270 +}
  17.271 +/* Handle a probe request: describe every VDI attached to blkif, then turn req into the response. */
  17.272 +int parallax_probe(blkif_request_t *req, blkif_t *blkif)
  17.273 +{
  17.274 +    blkif_response_t *rsp;
  17.275 +    vdisk_t *img_info;
  17.276 +    vdi_t *vdi;
  17.277 +    int i, nr_vdis = 0; 
  17.278 +
  17.279 +    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
  17.280 +
  17.281 +    /* We expect one buffer only. */
  17.282 +    if ( req->nr_segments != 1 )
  17.283 +      goto err;
  17.284 +
  17.285 +    /* Make sure the buffer is page-sized. */
  17.286 +    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
  17.287 +       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
  17.288 +      goto err;
  17.289 +
  17.290 +    /* fill the list of devices */
  17.291 +    for (i=0; i<VDI_HASHSZ; i++) {
  17.292 +        vdi = blkif->vdi_hash[i];
  17.293 +        while (vdi) {
  17.294 +            img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
  17.295 +            img_info[nr_vdis].device   = vdi->vdevice;
  17.296 +            img_info[nr_vdis].info     = 0;
  17.297 +            /* The -1 here accounts for the LSB in the radix tree */
  17.298 +            img_info[nr_vdis].capacity = 
  17.299 +                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
  17.300 +            nr_vdis++;
  17.301 +            vdi = vdi->next;
  17.302 +        }
  17.303 +    }
  17.304 +    /* NOTE(review): nr_vdis is never checked against the size of the buffer -- confirm it cannot overflow. */
  17.305 +    /* The response is written in place, over the request structure. */
  17.306 +    rsp = (blkif_response_t *)req;
  17.307 +    rsp->id = req->id;
  17.308 +    rsp->operation = BLKIF_OP_PROBE;
  17.309 +    rsp->status = nr_vdis; /* number of disks */
  17.310 +
  17.311 +    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
  17.312 +    return  BLKTAP_RESPOND;
  17.313 +err:   /* malformed probe: same in-place response, with an error status */
  17.314 +    rsp = (blkif_response_t *)req;
  17.315 +    rsp->id = req->id;
  17.316 +    rsp->operation = BLKIF_OP_PROBE;
  17.317 +    rsp->status = BLKIF_RSP_ERROR;
  17.318 +    
  17.319 +    DPRINTF("parallax_probe: send error response\n"); 
  17.320 +    return BLKTAP_RESPOND;  
  17.321 +}
  17.322 +/* Bookkeeping for one read/write request whose segments complete through separate callbacks. */
  17.323 +typedef struct {
  17.324 +    blkif_request_t *req;    /* the request being serviced */
  17.325 +    int              count;  /* segments still outstanding: set to nr_segments, decremented per callback */
  17.326 +    int              error;  /* incremented by a callback whose I/O failed */
  17.327 +    pthread_mutex_t  mutex;  /* taken by the callbacks around the count update and the response */
  17.328 +} pending_t;
  17.329 +/* Table of pending records; parallax_read/parallax_write pick the slot with ID_TO_IDX(req->id). */
  17.330 +#define MAX_REQUESTS 64
  17.331 +pending_t pending_list[MAX_REQUESTS];
  17.332 +/* Per-segment context passed to vdi_read()/vdi_write() and handed back to the completion callback. */
  17.333 +struct cb_param {
  17.334 +    pending_t *pent;      /* record shared by all segments of the request */
  17.335 +    int       segment;    /* index of this segment within the request */
  17.336 +    u64       sector;     /* req->sector_number + 8 * segment */
  17.337 +    u64       vblock; /* for debug printing -- can be removed. */
  17.338 +};
  17.339 +/* Completion callback for one segment of a read: copy the data out, then retire the segment. */
  17.340 +static void read_cb(struct io_ret r, void *in_param)
  17.341 +{
  17.342 +    struct cb_param *param = (struct cb_param *)in_param;
  17.343 +    pending_t *p = param->pent;
  17.344 +    int segment = param->segment;
  17.345 +    blkif_request_t *req = p->req;
  17.346 +    unsigned long size, offset, start;
  17.347 +    char *dpage, *spage;
  17.348 +    int failed = 0;   /* folded into p->error once p->mutex is held */
  17.349 +
  17.350 +    spage  = IO_BLOCK(r);
  17.351 +    if (spage == NULL) { failed = 1; goto finish; }
  17.352 +    dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
  17.353 +
  17.354 +    /* Calculate read size and offset within the read block. */
  17.355 +    offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
  17.356 +    size = ( blkif_last_sect (req->frame_and_sects[segment]) -
  17.357 +             blkif_first_sect(req->frame_and_sects[segment]) + 1
  17.358 +        ) << SECTOR_SHIFT;
  17.359 +    start = blkif_first_sect(req->frame_and_sects[segment]) 
  17.360 +        << SECTOR_SHIFT;
  17.361 +
  17.362 +    DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
  17.363 +            "vblock %llx, "
  17.364 +            "size %lx\n", 
  17.365 +            param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
  17.366 +            blkif_last_sect (p->req->frame_and_sects[segment]),
  17.367 +            param->vblock, size); 
  17.368 +
  17.369 +    memcpy(dpage + start, spage + offset, size);
  17.370 +    freeblock(spage);
  17.371 +
  17.372 +    /* Done the read.  Now update the pending record. */
  17.373 + finish:
  17.374 +    pthread_mutex_lock(&p->mutex);
  17.375 +    if (failed) p->error++;   /* shared with the other segments: only touched under the lock */
  17.376 +    p->count--;
  17.377 +    if (p->count == 0) {      /* last segment to finish sends the single response */
  17.378 +        blkif_response_t *rsp;
  17.379 +
  17.380 +        rsp = (blkif_response_t *)req;
  17.381 +        rsp->id = req->id;
  17.382 +        rsp->operation = BLKIF_OP_READ;
  17.383 +        if (p->error == 0) {
  17.384 +            rsp->status = BLKIF_RSP_OKAY;
  17.385 +        } else {
  17.386 +            rsp->status = BLKIF_RSP_ERROR;
  17.387 +        }
  17.388 +        blktap_inject_response(rsp);       
  17.389 +    }
  17.390 +
  17.391 +    pthread_mutex_unlock(&p->mutex);
  17.392 +
  17.393 +    free(param); /* TODO: replace with cached alloc/dealloc */
  17.394 +}
  17.395 +/* Start an asynchronous read: one vdi_read() per segment, completed by read_cb.  Returns a BLKTAP_* code. */
  17.396 +int parallax_read(blkif_request_t *req, blkif_t *blkif)
  17.397 +{
  17.398 +    blkif_response_t *rsp;
  17.399 +    u64 vblock;
  17.400 +    vdi_t *vdi;
  17.401 +    u64 sector;
  17.402 +    int i;
  17.403 +    pending_t *pent;
  17.404 +
  17.405 +    vdi = blkif_get_vdi(blkif, req->device);
  17.406 +
  17.407 +    if ( vdi == NULL )
  17.408 +        goto err;
  17.409 +
  17.410 +    /* Slots are reused, so clear the error count left by an earlier request. */
  17.411 +    pent = &pending_list[ID_TO_IDX(req->id)];
  17.412 +    pent->count = req->nr_segments;
  17.413 +    pent->error = 0;
  17.414 +    pent->req = req;
  17.415 +    pthread_mutex_init(&pent->mutex, NULL);
  17.416 +
  17.417 +    for (i = 0; i < req->nr_segments; i++) {
  17.418 +        struct cb_param *p;
  17.419 +
  17.420 +        /* Round the requested segment to a block address. */
  17.421 +        sector  = req->sector_number + (8*i);
  17.422 +        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  17.423 +
  17.424 +        /* TODO: Replace this call to malloc with a cached allocation */
  17.425 +        /* NOTE(review): the result of malloc is not checked for NULL. */
  17.426 +        p = (struct cb_param *)malloc(sizeof(struct cb_param));
  17.427 +        p->pent = pent;
  17.428 +        p->sector = sector; 
  17.429 +        p->segment = i;     
  17.430 +        p->vblock = vblock; /* dbg */
  17.431 +
  17.432 +        /* Get that block from the store. */
  17.433 +        vdi_read(vdi, vblock, read_cb, (void *)p);    
  17.434 +    }
  17.435 +
  17.436 +    return BLKTAP_STOLEN;   /* read_cb injects the response later */
  17.437 +
  17.438 +err:
  17.439 +    rsp = (blkif_response_t *)req;
  17.440 +    rsp->id = req->id;
  17.441 +    rsp->operation = BLKIF_OP_READ;
  17.442 +    rsp->status = BLKIF_RSP_ERROR;
  17.443 +
  17.444 +    return BLKTAP_RESPOND;  
  17.445 +}
  17.446 +/* Completion callback for one segment of a write: record the result, then retire the segment. */
  17.447 +static void write_cb(struct io_ret r, void *in_param)
  17.448 +{
  17.449 +    struct cb_param *param = (struct cb_param *)in_param;
  17.450 +    pending_t *p = param->pent;
  17.451 +    blkif_request_t *req = p->req;
  17.452 +
  17.453 +    /* error and count are shared by every segment: update both under the lock. */
  17.454 +    pthread_mutex_lock(&p->mutex);
  17.455 +    if (IO_INT(r) < 0) p->error++;   /* catch errors from the block code. */
  17.456 +    p->count--;
  17.457 +
  17.458 +    /* The last segment to finish sends the single response. */
  17.459 +    if (p->count == 0) {
  17.460 +        blkif_response_t *rsp;
  17.461 +
  17.462 +        rsp = (blkif_response_t *)req;
  17.463 +        rsp->id = req->id;
  17.464 +        rsp->operation = BLKIF_OP_WRITE;
  17.465 +        if (p->error == 0) {
  17.466 +            rsp->status = BLKIF_RSP_OKAY;
  17.467 +        } else {
  17.468 +            rsp->status = BLKIF_RSP_ERROR;
  17.469 +        }
  17.470 +        blktap_inject_response(rsp);       
  17.471 +    }
  17.472 +
  17.473 +    pthread_mutex_unlock(&p->mutex);
  17.474 +
  17.475 +    free(param); /* TODO: replace with cached alloc/dealloc */
  17.476 +}
  17.477 +/* Start an asynchronous write: one vdi_write() per segment, completed by write_cb.  Returns a BLKTAP_* code. */
  17.478 +int parallax_write(blkif_request_t *req, blkif_t *blkif)
  17.479 +{
  17.480 +    blkif_response_t *rsp;
  17.481 +    u64 sector, vblock;
  17.482 +    int i;
  17.483 +    char *spage;
  17.484 +    unsigned long size, offset, start;
  17.485 +    vdi_t *vdi;
  17.486 +    pending_t *pent;
  17.487 +    struct cb_param *p;
  17.488 +
  17.489 +    vdi = blkif_get_vdi(blkif, req->device);
  17.490 +    if ( vdi == NULL )
  17.491 +        goto err;
  17.492 +
  17.493 +    /* Pass 1: vet every segment before queuing any I/O, so that a      */
  17.494 +    /* malformed request is refused without leaving earlier segments'   */
  17.495 +    /* writes (and their callbacks) in flight behind the error reply.   */
  17.496 +    for (i = 0; i < req->nr_segments; i++) {
  17.497 +        sector = req->sector_number + (8*i);
  17.498 +        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  17.499 +        /* Calculate write size and offset within the block. */
  17.500 +        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
  17.501 +        size = ( blkif_last_sect (req->frame_and_sects[i]) -
  17.502 +                 blkif_first_sect(req->frame_and_sects[i]) + 1
  17.503 +            ) << SECTOR_SHIFT;
  17.504 +        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
  17.505 +
  17.506 +        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
  17.507 +                "vblock %llx, "
  17.508 +                "size %lx\n", 
  17.509 +                sector, blkif_first_sect(req->frame_and_sects[i]),
  17.510 +                blkif_last_sect (req->frame_and_sects[i]),
  17.511 +                vblock, size); 
  17.512 +
  17.513 +        /* XXX: For now we just freak out if they try to write a   */
  17.514 +        /* non block-sized, block-aligned page.                    */
  17.515 +        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
  17.516 +            printf("]\n] STRANGE WRITE!\n]\n");
  17.517 +            goto err;
  17.518 +        }
  17.519 +    }
  17.520 +
  17.521 +    /* Slots are reused, so clear the error count left by an earlier request. */
  17.522 +    pent = &pending_list[ID_TO_IDX(req->id)];
  17.523 +    pent->count = req->nr_segments;
  17.524 +    pent->error = 0;
  17.525 +    pent->req = req;
  17.526 +    pthread_mutex_init(&pent->mutex, NULL);
  17.527 +
  17.528 +    /* Pass 2: queue one block write per segment. */
  17.529 +    for (i = 0; i < req->nr_segments; i++) {
  17.530 +        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
  17.531 +        sector = req->sector_number + (8*i);
  17.532 +        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  17.533 +        /* TODO: Replace this call to malloc with a cached allocation */
  17.534 +        p = (struct cb_param *)malloc(sizeof(struct cb_param));
  17.535 +        p->pent = pent;
  17.536 +        p->sector = sector; 
  17.537 +        p->segment = i;     
  17.538 +        p->vblock = vblock; /* dbg */
  17.539 +        /* Issue the write to the store. */
  17.540 +        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
  17.541 +    }
  17.542 +
  17.543 +    return BLKTAP_STOLEN;   /* write_cb injects the response later */
  17.544 +
  17.545 +err:
  17.546 +    rsp = (blkif_response_t *)req;
  17.547 +    rsp->id = req->id;
  17.548 +    rsp->operation = BLKIF_OP_WRITE;
  17.549 +    rsp->status = BLKIF_RSP_ERROR;
  17.550 +
  17.551 +    return BLKTAP_RESPOND;  
  17.552 +}
  17.553 +
  17.554 +int parallax_request(blkif_request_t *req)
  17.555 +{
  17.556 +    blkif_response_t *rsp;
  17.557 +    domid_t  dom   = ID_TO_DOM(req->id);
  17.558 +    blkif_t *blkif = blkif_find_by_handle(dom, 0);
  17.559 +    
  17.560 +    if (blkif == NULL)
  17.561 +        goto err;
  17.562 +    
  17.563 +    if ( req->operation == BLKIF_OP_PROBE ) {
  17.564 +        
  17.565 +        return parallax_probe(req, blkif);
  17.566 +        
  17.567 +    } else if ( req->operation == BLKIF_OP_READ ) {
  17.568 +        
  17.569 +        return parallax_read(req, blkif);
  17.570 +        
  17.571 +    } else if ( req->operation == BLKIF_OP_WRITE ) {
  17.572 +        
  17.573 +        return parallax_write(req, blkif);
  17.574 +        
  17.575 +    } else {
  17.576 +        printf("Unknown request message type!\n");
  17.577 +        /* Unknown operation */
  17.578 +        goto err;
  17.579 +    }
  17.580 +    
  17.581 +err:
  17.582 +    rsp = (blkif_response_t *)req;
  17.583 +    rsp->operation = req->operation;
  17.584 +    rsp->id = req->id;
  17.585 +    rsp->status = BLKIF_RSP_ERROR;
  17.586 +    return BLKTAP_RESPOND;  
  17.587 +}
  17.588 +/* Zero the blkif hash table so that every bucket starts out empty. */
  17.589 +void __init_parallax(void) 
  17.590 +{
  17.591 +    memset(blkif_hash, 0, sizeof(blkif_hash));
  17.592 +}
  17.593 +
  17.594 +
  17.595 +/* Program entry: initialise each subsystem in turn, register the two hooks and hand control to blktap_listen(). */
  17.596 +int main(int argc, char *argv[])
  17.597 +{
  17.598 +    DPRINTF("parallax: starting.\n"); 
  17.599 +    __init_blockstore();
  17.600 +    DPRINTF("parallax: initialized blockstore...\n"); 
  17.601 +    init_block_async();
  17.602 +    DPRINTF("parallax: initialized async blocks...\n"); 
  17.603 +    __init_vdi();
  17.604 +    DPRINTF("parallax: initialized vdi registry etc...\n"); 
  17.605 +    __init_parallax();
  17.606 +    DPRINTF("parallax: initialized local stuff..\n"); 
  17.607 +    /* Control messages go to parallax_control, block requests to parallax_request. */
  17.608 +    blktap_register_ctrl_hook("parallax_control", parallax_control);
  17.609 +    blktap_register_request_hook("parallax_request", parallax_request);
  17.610 +    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); 
  17.611 +    blktap_listen();
  17.612 +    /* NOTE(review): argc/argv are unused; whether blktap_listen() ever returns is not visible here. */
  17.613 +    return 0;
  17.614 +}
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/tools/blktap/parallax/radix.c	Sun Jul 03 22:36:48 2005 +0000
    18.3 @@ -0,0 +1,631 @@
    18.4 +/*
    18.5 + * Radix tree for mapping (up to) 63-bit virtual block IDs to
    18.6 + * 63-bit global block IDs
    18.7 + *
    18.8 + * Pointers within the tree set aside the least significant bit to indicate
    18.9 + * whether or not the target block is writable from this node.
   18.10 + *
   18.11 + * The block with ID 0 is assumed to be an empty block of all zeros
   18.12 + */
   18.13 +
   18.14 +#include <unistd.h>
   18.15 +#include <stdio.h>
   18.16 +#include <stdlib.h>
   18.17 +#include <assert.h>
   18.18 +#include <string.h>
   18.19 +#include <pthread.h>
   18.20 +#include "blockstore.h"
   18.21 +#include "radix.h"
   18.22 +
   18.23 +#define RADIX_TREE_MAP_SHIFT 9      /* key bits consumed by each tree level */
   18.24 +#define RADIX_TREE_MAP_MASK 0x1ff   /* extracts one level's slot index from a shifted key */
   18.25 +#define RADIX_TREE_MAP_ENTRIES 512  /* slots per node; cloneblock() copies this many entries */
   18.26 +
   18.27 +/*
   18.28 +#define DEBUG
   18.29 +*/
   18.30 +
   18.31 +/* Experimental radix cache. */
   18.32 +/* rcache_mutex is taken by rcache_write() and rcache_read() around every access to the state below. */
   18.33 +static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
   18.34 +static  int rcache_count = 0;   /* entries allocated so far; stops growing at RCACHE_MAX */
   18.35 +#define RCACHE_MAX 1024
   18.36 +/* One cached node: a member of a hash chain and of the doubly linked recency list. */
   18.37 +typedef struct rcache_st {
   18.38 +    radix_tree_node  *node;        /* the cache's own copy of the block contents */
   18.39 +    u64               id;          /* block id this entry caches */
   18.40 +    struct rcache_st *hash_next;   /* next entry in the same rcache[] bucket */
   18.41 +    struct rcache_st *cache_next;  /* neighbour towards rcache_tail */
   18.42 +    struct rcache_st *cache_prev;  /* neighbour towards rcache_head */
   18.43 +} rcache_t;
   18.44 +/* Ends of the recency list: entries move to the head when touched and are evicted from the tail. */
   18.45 +static rcache_t *rcache_head = NULL;
   18.46 +static rcache_t *rcache_tail = NULL;
   18.47 +/* Hash table keyed by block id; RCACHE_HASH masks the id, so RCHASH_SIZE has to stay a power of two. */
   18.48 +#define RCHASH_SIZE 512ULL
   18.49 +rcache_t *rcache[RCHASH_SIZE];
   18.50 +#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
   18.51 +/* Empty every bucket of the radix-node cache hash table. */
   18.52 +void __rcache_init(void)
   18.53 +{
   18.54 +    rcache_t **bucket = rcache;
   18.55 +
   18.56 +    while (bucket < &rcache[RCHASH_SIZE])
   18.57 +        *bucket++ = NULL;
   18.58 +}
   18.59 +    
   18.60 +/* Store a copy of the BLOCK_SIZE bytes at node in the cache under id, updating or evicting as needed. */
   18.61 +void rcache_write(u64 id, radix_tree_node *node)
   18.62 +{
   18.63 +    rcache_t *r, *tmp, **curs;
   18.64 +    
   18.65 +    pthread_mutex_lock(&rcache_mutex);
   18.66 +    
   18.67 +    /* Is it already in the cache? */
   18.68 +    r = rcache[RCACHE_HASH(id)];
   18.69 +    
   18.70 +    for (;;) {
   18.71 +        if (r == NULL) 
   18.72 +            break;
   18.73 +        if (r->id == id) 
   18.74 +        {
   18.75 +            memcpy(r->node, node, BLOCK_SIZE);
   18.76 +            
   18.77 +            /* bring to front. */
   18.78 +            if (r != rcache_head) {
   18.79 +                /* r is not the head, so it has a predecessor to become the new tail. */
   18.80 +                if (r == rcache_tail) {
   18.81 +                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
   18.82 +                    rcache_tail->cache_next = NULL;
   18.83 +                }
   18.84 +                /* Unlink r from between its two neighbours. */
   18.85 +                tmp = r->cache_next;
   18.86 +                if (r->cache_next != NULL) r->cache_next->cache_prev 
   18.87 +                                                     = r->cache_prev;
   18.88 +                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
   18.89 +                /* Re-insert r in front of the current head. */
   18.90 +                r->cache_prev = NULL;
   18.91 +                r->cache_next = rcache_head;
   18.92 +                if (rcache_head != NULL) rcache_head->cache_prev = r;
   18.93 +                rcache_head = r;
   18.94 +            }
   18.95 +
   18.96 +//printf("Update (%Ld)\n", r->id);
   18.97 +            goto done;
   18.98 +        }
   18.99 +        r = r->hash_next;
  18.100 +    }
  18.101 +    /* Not cached yet: obtain an entry, recycling the tail once the cache is full. */
  18.102 +    if ( rcache_count == RCACHE_MAX ) 
  18.103 +    {
  18.104 +        /* Remove an entry */
  18.105 +        
  18.106 +        r = rcache_tail;
  18.107 +        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  18.108 +        rcache_tail->cache_next = NULL;
  18.109 +        freeblock(r->node);
  18.110 +        /* Unhook the victim from its hash chain as well. */
  18.111 +        curs = &rcache[RCACHE_HASH(r->id)];
  18.112 +        while ((*curs) != r)
  18.113 +            curs = &(*curs)->hash_next;
  18.114 +        *curs = r->hash_next;
  18.115 +//printf("Evict (%Ld)\n", r->id);
  18.116 +        
  18.117 +    } else {
  18.118 +        /* NOTE(review): the result of malloc is not checked for NULL. */
  18.119 +        r = (rcache_t *)malloc(sizeof(rcache_t));
  18.120 +        rcache_count++;
  18.121 +    }
  18.122 +    /* NOTE(review): the result of newblock() is not checked before the copy. */
  18.123 +    r->node = newblock();
  18.124 +    memcpy(r->node, node, BLOCK_SIZE);
  18.125 +    r->id = id;
  18.126 +    /* Push onto the front of the hash chain for the new id. */
  18.127 +    r->hash_next = rcache[RCACHE_HASH(id)];
  18.128 +    rcache[RCACHE_HASH(id)] = r;
  18.129 +    /* Make the entry the head of the recency list (and the tail too if the list was empty). */
  18.130 +    r->cache_prev = NULL;
  18.131 +    r->cache_next = rcache_head;
  18.132 +    if (rcache_head != NULL) rcache_head->cache_prev = r;
  18.133 +    rcache_head = r;
  18.134 +    if (rcache_tail == NULL) rcache_tail = r;
  18.135 +    
  18.136 +//printf("Added (%Ld, %p)\n", id, r->node);
  18.137 +done:
  18.138 +    pthread_mutex_unlock(&rcache_mutex);
  18.139 +}
  18.140 +/* Look id up in the cache; on a hit return a newblock() copy of the node, on a miss return NULL. */
  18.141 +radix_tree_node *rcache_read(u64 id)
  18.142 +{
  18.143 +    rcache_t *r, *tmp;
  18.144 +    radix_tree_node *node = NULL;
  18.145 +    
  18.146 +    pthread_mutex_lock(&rcache_mutex);
  18.147 +
  18.148 +    r = rcache[RCACHE_HASH(id)];
  18.149 +    /* Scan the hash chain; leaving through "done" with node still NULL is a miss. */
  18.150 +    for (;;) {
  18.151 +        if (r == NULL) {
  18.152 +//printf("Miss (%Ld)\n", id);
  18.153 +            goto done;
  18.154 +        }
  18.155 +        if (r->id == id) break;
  18.156 +        r = r->hash_next;
  18.157 +    }
  18.158 +   
  18.159 +    /* bring to front. */
  18.160 +    if (r != rcache_head) 
  18.161 +    {
  18.162 +        if (r == rcache_tail) {
  18.163 +            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  18.164 +            rcache_tail->cache_next = NULL;
  18.165 +        }
  18.166 +        tmp = r->cache_next;
  18.167 +        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
  18.168 +        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
  18.169 +        /* r is unlinked now; re-insert it in front of the current head. */
  18.170 +        r->cache_prev = NULL;
  18.171 +        r->cache_next = rcache_head;
  18.172 +        if (rcache_head != NULL) rcache_head->cache_prev = r;
  18.173 +        rcache_head = r;
  18.174 +    }
  18.175 +    /* Hand out a copy so the cached block stays private.  NOTE(review): newblock() is not checked for NULL. */
  18.176 +    node = newblock();
  18.177 +    memcpy(node, r->node, BLOCK_SIZE);
  18.178 +    
  18.179 +//printf("Hit (%Ld, %p)\n", id, r->node);
  18.180 +done:
  18.181 +    pthread_mutex_unlock(&rcache_mutex);
  18.182 +    
  18.183 +    return(node);
  18.184 +}
  18.185 +
  18.186 +/* Read block id through the cache; returns NULL only if the block store read fails on a miss. */
  18.187 +void *rc_readblock(u64 id)
  18.188 +{
  18.189 +    void *blk = (void *)rcache_read(id);
  18.190 +
  18.191 +    if (blk == NULL)
  18.192 +    {
  18.193 +        /* Miss: go to the block store and cache whatever came back. */
  18.194 +        blk = readblock(id);
  18.195 +        if (blk != NULL)
  18.196 +            rcache_write(id, blk);
  18.197 +    }
  18.198 +
  18.199 +    /* Hit or miss, the buffer handed back is separate from the cache's copy. */
  18.200 +    return blk;
  18.201 +}
  18.202 +/* Allocate a block in the store for the given contents and cache it under the id returned. */
  18.203 +u64 rc_allocblock(void *block)
  18.204 +{
  18.205 +    u64 id = allocblock(block);
  18.206 +
  18.207 +    /* ZERO means the allocation failed: there is nothing to cache. */
  18.208 +    if (id == ZERO)
  18.209 +        return id;
  18.210 +
  18.211 +    rcache_write(id, block);
  18.212 +    return id;
  18.213 +}
  18.214 +/* Write block id to the store and, if that worked, refresh the cached copy.  Returns writeblock()'s result. */
  18.215 +int rc_writeblock(u64 id, void *block)
  18.216 +{
  18.217 +    int ret = writeblock(id, block);
  18.218 +
  18.219 +    /* Negative means failure (see update()): do not cache contents the store never took. */
  18.220 +    if (ret >= 0)
  18.221 +        rcache_write(id, block);
  18.222 +    return(ret);
  18.223 +}
  18.224 +
  18.225 +
  18.226 +/*
  18.227 + * block device interface and other helper functions
  18.228 + * with these functions, block id is just a 63-bit number, with
  18.229 + * no special consideration for the LSB
  18.230 + */
  18.231 +radix_tree_node cloneblock(radix_tree_node block);
  18.232 +
  18.233 +/*
  18.234 + * main api
  18.235 + * with these functions, the LSB of root always indicates
  18.236 + * whether or not the block is writable, including the return
  18.237 + * values of update and snapshot
  18.238 + */
  18.239 +u64 lookup(int height, u64 root, u64 key);
  18.240 +u64 update(int height, u64 root, u64 key, u64 val);
  18.241 +u64 snapshot(u64 root);
  18.242 +
  18.243 +/**
  18.244 + * cloneblock: clone an existing block in memory
  18.245 + *   @block: the old block; NULL is tolerated (nothing is allocated)
  18.246 + *
  18.247 + *   @return: new block, with LSB cleared for every entry; NULL on failure
  18.248 + */
  18.249 +radix_tree_node cloneblock(radix_tree_node block) {
  18.250 +    radix_tree_node node;
  18.251 +    int i;
  18.252 +    if (block == NULL) return NULL;   /* e.g. the caller's block read failed */
  18.253 +    if ((node = (radix_tree_node) malloc(BLOCK_SIZE)) == NULL) {
  18.254 +        perror("cloneblock malloc");
  18.255 +        return NULL;
  18.256 +    }
  18.257 +    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
  18.258 +        node[i] = block[i] & ONEMASK;
  18.259 +    return node;
  18.260 +}
  18.260 +
  18.261 +/**
  18.262 + * lookup: find a value given a key
  18.263 + *   @height: height in bits of the radix tree
  18.264 + *   @root: root node id, with set LSB indicating writable node
  18.265 + *   @key: key to lookup
  18.266 + *
  18.267 + *   @return: value on success (ONE bit set only if every link had it), zero on error
  18.268 + */
  18.269 +
  18.270 +u64 lookup(int height, u64 root, u64 key) {
  18.271 +    radix_tree_node node;
  18.272 +    u64 mask = ONE;
  18.273 +
  18.274 +    assert(key >> height == 0);
  18.275 +
  18.276 +    /* the root block may be smaller to ensure all leaves are full */
  18.277 +    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  18.278 +
  18.279 +    /* now carve off equal sized chunks at each step */
  18.280 +    for (;;) {
  18.281 +#ifdef DEBUG
  18.282 +        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  18.283 +                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
  18.284 +                (iswritable(root) ? "" : " (readonly)"));
  18.285 +#endif
  18.286 +
  18.287 +        /* id ZERO: nothing is mapped below this point */
  18.288 +        if (getid(root) == ZERO)
  18.289 +            return ZERO;
  18.290 +
  18.291 +        node = (radix_tree_node) rc_readblock(getid(root));
  18.292 +        if (node == NULL)
  18.293 +            return ZERO;
  18.294 +
  18.295 +        /* follow this level's slot; mask keeps ONE only while each link has it */
  18.296 +        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
  18.297 +        mask &= root;
  18.298 +        freeblock(node);
  18.299 +
  18.300 +        if (height == 0)
  18.301 +            return ( root & ONEMASK ) | mask;
  18.302 +
  18.303 +        height -= RADIX_TREE_MAP_SHIFT;
  18.304 +    }
  18.305 +
  18.306 +    /* not reached: the loop above only exits through return */
  18.307 +    return ZERO;
  18.308 +}
  18.309 +
  18.310 +/*
  18.311 + * update: set a radix tree entry, doing copy-on-write as necessary
  18.312 + *   @height: height in bits of the radix tree
  18.313 + *   @root: root node id, with set LSB indicating writable node
  18.314 + *   @key: key to set
  18.315 + *   @val: value to set, s.t. radix(key)=val
  18.316 + *
  18.317 + *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
  18.318 + */
  18.319 +
  18.320 +u64 update(int height, u64 root, u64 key, u64 val) {
  18.321 +    int offset;
  18.322 +    u64 child;
  18.323 +    radix_tree_node node;
  18.324 +    
  18.325 +    /* base case--return val */
  18.326 +    if (height == 0)
  18.327 +        return val;
  18.328 +
  18.329 +    /* the root block may be smaller to ensure all leaves are full */
  18.330 +    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  18.331 +    offset = (key >> height) & RADIX_TREE_MAP_MASK;
  18.332 +
  18.333 +#ifdef DEBUG
  18.334 +    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  18.335 +            offset, (iswritable(root)?"":" (clone)"));
  18.336 +#endif
  18.337 +
  18.338 +    /* load a block, or create a new one */
  18.339 +    if (root == ZERO) {
  18.340 +        node = (radix_tree_node) newblock();
  18.341 +    } else {
  18.342 +        node = (radix_tree_node) rc_readblock(getid(root));
  18.343 +        /* a failed read leaves node NULL: skip the clone, the check below reports it */
  18.344 +        if ((node != NULL) && !iswritable(root)) {
  18.345 +            /* need to clone this node */
  18.346 +            radix_tree_node oldnode = node;
  18.347 +            node = cloneblock(node);
  18.348 +            freeblock(oldnode);
  18.349 +            root = ZERO;
  18.350 +        }
  18.351 +    }
  18.352 +
  18.353 +    if (node == NULL) {
  18.354 +#ifdef DEBUG
  18.355 +        printf("update: node is null!\n");
  18.356 +#endif
  18.357 +        return ZERO;
  18.358 +    }
  18.359 +
  18.360 +    child = update(height, node[offset], key, val);
  18.361 +
  18.362 +    if (child == ZERO) {
  18.363 +        freeblock(node);
  18.364 +        return ZERO;
  18.365 +    } else if (child == node[offset]) {
  18.366 +        /* no change, so we already owned the child */
  18.367 +        assert(iswritable(root));
  18.368 +
  18.369 +        freeblock(node);
  18.370 +        return root;
  18.371 +    }
  18.372 +
  18.373 +    node[offset] = child;
  18.374 +
  18.375 +    /* new/cloned blocks need to be saved */
  18.376 +    if (root == ZERO) {
  18.377 +        /* mark this as an owned block */
  18.378 +        root = rc_allocblock(node);
  18.379 +        if (root)
  18.380 +            root = writable(root);
  18.381 +    } else if (rc_writeblock(getid(root), node) < 0) {
  18.382 +        freeblock(node);
  18.383 +        return ZERO;
  18.384 +    }
  18.385 +
  18.386 +    freeblock(node);
  18.387 +    return root;
  18.388 +}
  18.389 +
  18.390 +/**
  18.391 + * snapshot: fork a new root from an existing tree
  18.392 + *   @root: id of the root node to snapshot
  18.393 + *
  18.394 + *   @return: id of the new root, passed through writable(); ZERO on error
  18.395 + */
  18.396 +u64 snapshot(u64 root) {
  18.397 +    radix_tree_node orig, copy;
  18.398 +    u64 fresh;
  18.399 +
  18.400 +    orig = rc_readblock(getid(root));
  18.401 +    if (orig == NULL)
  18.402 +        return ZERO;
  18.403 +
  18.404 +    copy = cloneblock(orig);
  18.405 +    freeblock(orig);
  18.406 +    if (copy == NULL)
  18.407 +        return ZERO;
  18.408 +
  18.409 +    /* Store the copy under a new id; the buffer itself is done with. */
  18.410 +    fresh = rc_allocblock(copy);
  18.411 +    freeblock(copy);
  18.412 +
  18.413 +    return (fresh == ZERO) ? ZERO : writable(fresh);
  18.414 +}
  18.415 +
  18.416 +/**
  18.417 + * collapse: collapse a parent onto a child.
  18.418 + *   @height: tree height in bits; 0 means do not descend any further
  18.419 + *   @proot, @croot: parent and child links (LSB set when writable)
  18.420 + *
  18.421 + * NOTE: This assumes that parent and child really are, and further that
  18.422 + * there are no other children forked from this parent. (children of the
  18.423 + * child are okay...)
  18.424 + *
  18.425 + *   @return: blocks released, or -1 if a root is ZERO or cannot be read
  18.426 + */
  18.427 +int collapse(int height, u64 proot, u64 croot)
  18.428 +{
  18.429 +    int i, numlinks, total = 0, ret = 0; /* ret stays 0 if nothing recursed */
  18.430 +    radix_tree_node pnode, cnode;
  18.431 +
  18.432 +    if (height == 0) {
  18.433 +        height = -1; /* terminate recursion */
  18.434 +    } else {
  18.435 +        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  18.436 +    }
  18.437 +    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
  18.438 +
  18.439 +    /* Terminal cases: */
  18.440 +    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
  18.441 +        return -1;
  18.442 +
  18.443 +    /* get roots */
  18.444 +    if ((pnode = readblock(getid(proot))) == NULL)
  18.445 +        return -1;
  18.446 +
  18.447 +    if ((cnode = readblock(getid(croot))) == NULL)
  18.448 +    {
  18.449 +        freeblock(pnode);
  18.450 +        return -1;
  18.451 +    }
  18.452 +
  18.453 +    /* For each writable link in proot */
  18.454 +    for (i=0; i<numlinks; i++)
  18.455 +    {
  18.456 +        if ( pnode[i] == cnode[i] ) continue;
  18.457 +
  18.458 +        /* collapse (next level) */
  18.459 +        /* if height != 0 and writable... */
  18.460 +        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
  18.461 +        {
  18.462 +            ret = collapse(height, pnode[i], cnode[i]);
  18.463 +            if (ret == -1)
  18.464 +            {
  18.465 +                total = -1;
  18.466 +            } else {
  18.467 +                total += ret;
  18.468 +            }
  18.469 +        }
  18.470 +    }
  18.471 +
  18.472 +    /* both buffers came from readblock() and are no longer needed */
  18.473 +    freeblock(pnode);
  18.474 +    freeblock(cnode);
  18.475 +
  18.476 +    /* if plink is writable, AND clink is writable -> free plink block */
  18.477 +    if ( ( iswritable(proot) ) && ( iswritable(croot) ) )
  18.478 +    {
  18.479 +        releaseblock(getid(proot));
  18.480 +        if (ret >=0) total++;
  18.481 +    }
  18.482 +    return total;
  18.483 +}
  18.484 +
  18.485 +
  18.486 +/*
  18.487 + * print_root: write the tree below @root as graphviz "dot" text.
  18.488 + *   @root:   link to the block to print (LSB set when writable)
  18.489 + *   @height: tree height in bits; 0 prints this block's links only
  18.490 + *   @dot_f:  stream to append to, or NULL to create "radix.dot"
  18.491 + */
  18.492 +void print_root(u64 root, int height, FILE *dot_f)
  18.493 +{
  18.494 +    FILE *f = dot_f; /* recursive calls reuse the caller's stream */
  18.495 +    int i;
  18.496 +    radix_tree_node node;
  18.497 +    char *style[2] = { "", "style=bold,color=blue," };
  18.498 +
  18.499 +    if (dot_f == NULL) {
  18.500 +        f = fopen("radix.dot", "w");
  18.501 +        if (f == NULL) {
  18.502 +            perror("print_root: open");
  18.503 +            return;
  18.504 +        }
  18.505 +        /* write graph preamble */
  18.506 +        fprintf(f, "digraph G {\n");
  18.507 +        /* add a node for this root. */
  18.508 +        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  18.509 +                getid(root), style[iswritable(root)], getid(root));
  18.510 +    }
  18.511 +
  18.512 +    printf("print_root(%Ld)\n", getid(root));
  18.513 +
  18.514 +    /* base case */
  18.515 +    if (height == 0) {
  18.516 +        /* add a node and edge for each child root */
  18.517 +        node = (radix_tree_node) readblock(getid(root));
  18.518 +        if (node == NULL) goto out;
  18.519 +
  18.520 +        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
  18.521 +            if (node[i] != ZERO) {
  18.522 +                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  18.523 +                        getid(node[i]), style[iswritable(node[i])], 
  18.524 +                        getid(node[i]));
  18.525 +                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
  18.526 +                        getid(node[i]), i);
  18.527 +            }
  18.528 +        }
  18.529 +        freeblock(node);
  18.530 +        goto out;
  18.531 +    }
  18.532 +
  18.533 +    /* the root block may be smaller to ensure all leaves are full */
  18.534 +    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  18.535 +
  18.536 +    if (getid(root) == ZERO) goto out;
  18.537 +
  18.538 +    node = (radix_tree_node) readblock(getid(root));
  18.539 +    if (node == NULL) goto out;
  18.540 +
  18.541 +    /* add a node and edge for each child root */
  18.542 +    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
  18.543 +        if (node[i] != ZERO) {
  18.544 +            fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  18.545 +                    getid(node[i]), style[iswritable(node[i])], 
  18.546 +                    getid(node[i]));
  18.547 +            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
  18.548 +            fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
  18.549 +                    getid(node[i]), i);
  18.550 +        }
  18.551 +
  18.552 +    freeblock(node);
  18.553 + out:
  18.554 +    /* write graph postamble; only the top-level call owns the file */
  18.555 +    if (dot_f == NULL) {
  18.556 +        fprintf(f, "}\n");
  18.557 +        fclose(f);
  18.558 +    }
  18.559 +}
  18.560 +
  18.561 +#ifdef RADIX_STANDALONE
  18.562 +
  18.563 +/*
  18.564 + * Interactive test shell, built only when RADIX_STANDALONE is defined.
  18.565 + * Reads one command per line from stdin and applies it to a tree of
  18.566 + * height 34; the commands are listed in the help text printed below.
  18.567 + */
  18.568 +int main(int argc, char **argv) {
  18.569 +    u64 key = ZERO, val = ZERO;
  18.570 +    u64 root = writable(2ULL);
  18.571 +    u64 p = ZERO, c = ZERO;
  18.572 +    int v;
  18.573 +    char buff[4096];
  18.574 +
  18.575 +    __init_blockstore();
  18.576 +
  18.577 +    /* hand an all-zero buffer to allocblock() before taking any commands */
  18.578 +    memset(buff, 0, 4096);
  18.579 +    allocblock(buff);
  18.580 +
  18.581 +    printf("Recognized commands:\n"
  18.582 +           "Note: the LSB of a node number indicates if it is writable\n"
  18.583 +           "  root <node>               set root to <node>\n"
  18.584 +           "  snapshot                  take a snapshot of the root\n"
  18.585 +           "  set <key> <val>           set key=val\n"
  18.586 +           "  get <key>                 query key\n"
  18.587 +           "  c <proot> <croot>         collapse\n"
  18.588 +           "  pr                        print tree to dot\n"
  18.589 +           "  pf <1=verbose>            print freelist\n"
  18.590 +           "  quit\n"
  18.591 +           "\nroot = %Ld\n", root);
  18.592 +
  18.593 +    for (;;) {
  18.594 +        printf("> ");
  18.595 +        fflush(stdout);
  18.596 +
  18.597 +        /* stop on EOF and on read errors alike; buff is stale after either */
  18.598 +        if (fgets(buff, 1024, stdin) == NULL)
  18.599 +            break;
  18.600 +
  18.601 +        if (sscanf(buff, " root %Ld", &root) == 1) {
  18.602 +            printf("root set to %Ld\n", root);
  18.603 +        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
  18.604 +            root = update(34, root, key, val);
  18.605 +            printf("root = %Ld\n", root);
  18.606 +        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
  18.607 +            v = collapse(34, p, c);
  18.608 +            printf("reclaimed %d blocks.\n", v);
  18.609 +        } else if (sscanf(buff, " get %Ld", &key) == 1) {
  18.610 +            val = lookup(34, root, key);
  18.611 +            printf("value = %Ld\n", val);
  18.612 +        } else if (!strcmp(buff, "quit\n")) {
  18.613 +            break;
  18.614 +        } else if (!strcmp(buff, "snapshot\n")) {
  18.615 +            root = snapshot(root);
  18.616 +            printf("new root = %Ld\n", root);
  18.617 +        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
  18.618 +            /* "pr <node>" also makes <node> the current root */
  18.619 +            print_root(root, 34, NULL);
  18.620 +        } else if (!strcmp(buff, "pr\n")) {
  18.621 +            /* bare "pr", as in the help text: print the current root */
  18.622 +            print_root(root, 34, NULL);
  18.623 +        } else if (sscanf(buff, " pf %d", &v) == 1) {
  18.624 +            freelist_count(v);
  18.625 +        } else if (!strcmp(buff, "pf\n")) {
  18.626 +            freelist_count(0);
  18.627 +        } else {
  18.628 +            printf("command not recognized\n");
  18.629 +        }
  18.630 +    }
  18.631 +    return 0;
  18.632 +}
  18.633 +
  18.634 +#endif
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/tools/blktap/parallax/radix.h	Sun Jul 03 22:36:48 2005 +0000
    19.3 @@ -0,0 +1,45 @@
    19.4 +/*
    19.5 + * Radix tree for mapping (up to) 63-bit virtual block IDs to
    19.6 + * 63-bit global block IDs
    19.7 + *
    19.8 + * Pointers within the tree set aside the least significant bit to indicate
    19.9 + * whether or not the target block is writable from this node.
   19.10 + *
   19.11 + * The block with ID 0 is assumed to be an empty block of all zeros
   19.12 + */
   19.13 +
   19.14 +#ifndef __RADIX_H__
   19.15 +#define __RADIX_H__
   19.16 +
   19.17 +/* I don't really like exposing these, but... */
   19.18 +#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
   19.19 +#define putid(x) ((x)<<1)
   19.20 +#define writable(x) (((x)<<1)|1LL)
   19.21 +#define iswritable(x) ((x)&1LL)
   19.22 +#define ZERO 0LL
   19.23 +#define ONE 1LL
   19.24 +#define ONEMASK 0xfffffffffffffffeULL /* all 64 bits except the writable LSB */
   19.25 +
   19.26 +#define RADIX_TREE_MAP_SHIFT 9
   19.27 +#define RADIX_TREE_MAP_MASK 0x1ff
   19.28 +#define RADIX_TREE_MAP_ENTRIES 512
   19.29 +
   19.30 +typedef u64 *radix_tree_node;
   19.31 +
   19.32 +
   19.33 +/*
   19.34 + * main api
   19.35 + * with these functions, the LSB of root always indicates
   19.36 + * whether or not the block is writable, including the return
   19.37 + * values of update and snapshot
   19.38 + */
   19.39 +u64 lookup(int height, u64 root, u64 key);
   19.40 +u64 update(int height, u64 root, u64 key, u64 val);
   19.41 +u64 snapshot(u64 root);
   19.42 +int collapse(int height, u64 proot, u64 croot);
   19.43 +int isprivate(int height, u64 root, u64 key);
   19.44 +
   19.45 +
   19.46 +void __rcache_init(void);
   19.47 +
   19.48 +#endif /* __RADIX_H__ */
    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/tools/blktap/parallax/requests-async.c	Sun Jul 03 22:36:48 2005 +0000
    20.3 @@ -0,0 +1,762 @@
    20.4 +/* requests-async.c
    20.5 + *
    20.6 + * asynchronous request dispatcher for radix access in parallax.
    20.7 + */
    20.8 +
    20.9 +#include <stdio.h>
   20.10 +#include <stdlib.h>
   20.11 +#include <string.h>
   20.12 +#include <ctype.h>
   20.13 +#include <assert.h>
   20.14 +#include <pthread.h>
   20.15 +#include <err.h>
   20.16 +#include <zlib.h> /* for crc32() */
   20.17 +#include "requests-async.h"
   20.18 +#include "vdi.h"
   20.19 +#include "radix.h"
   20.20 +
   20.21 +#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
   20.22 +#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
   20.23 +#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
   20.24 +
   20.25 +
   20.26 +#if 0
   20.27 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   20.28 +#else
   20.29 +#define DPRINTF(_f, _a...) ((void)0)
   20.30 +#endif
   20.31 +
   20.32 +struct block_info {   /* stored in the L3 slot that follows a data link */
   20.33 +    u32        crc;    /* crc32 over the data block, set by vdi_write() */
   20.34 +    u32        unused; /* marker value only; printed in the bad-CRC warning */
   20.35 +};
   20.36 +
   20.37 +struct io_req {   /* one in-flight vdi_read()/vdi_write() request */
   20.38 +    enum { IO_OP_READ, IO_OP_WRITE } op;
   20.39 +    u64        root;   /* vdi->radix_root at submission time */
   20.40 +    u64        vaddr;  /* virtual address, already shifted left by one */
   20.41 +    int        state;  /* current enum states value */
   20.42 +    io_cb_t    cb;     /* caller's completion callback */
   20.43 +    void      *param;  /* handed back to cb */
   20.44 +    struct radix_lock *lock; /* vdi->radix_lock */
   20.45 +
   20.46 +    /* internal stuff: */
   20.47 +    struct io_ret     retval;/* holds the return while we unlock. */
   20.48 +    char             *block; /* the block to write */
   20.49 +    radix_tree_node   radix[3];      /* node buffers, by enum radix_offsets */
   20.50 +    u64               radix_addr[3]; /* block ids of the nodes in radix[] */
   20.51 +    struct block_info bi;            /* crc record for the data block */
   20.52 +};
   20.53 +
   20.54 +/* Apply ONEMASK to every link of a radix node (drops the writable bit). */
   20.55 +void clear_w_bits(radix_tree_node node) 
   20.56 +{
   20.57 +    radix_tree_node link = node;
   20.58 +    while (link < node + RADIX_TREE_MAP_ENTRIES)
   20.59 +        *link++ &= ONEMASK;
   20.60 +}
   20.61 +
   20.62 +/* L3 variant: only even slots are links, odd slots hold block_info. */
   20.63 +void clear_L3_w_bits(radix_tree_node node) 
   20.64 +{
   20.65 +    int slot;
   20.66 +    for (slot = RADIX_TREE_MAP_ENTRIES - 2; slot >= 0; slot -= 2)
   20.67 +        node[slot] &= ONEMASK;
   20.68 +}
   20.69 +
   20.70 +enum states {   /* values of io_req.state, switched on by read_cb/write_cb */
   20.71 +    /* used by both read_cb and write_cb */
   20.72 +    READ_L1,
   20.73 +    READ_L2,
   20.74 +    READ_L3,
   20.75 +
   20.76 +    /* read */
   20.77 +    READ_LOCKED,
   20.78 +    READ_DATA,
   20.79 +    READ_UNLOCKED,
   20.80 +    RETURN_ZERO,
   20.81 +
   20.82 +    /* write */
   20.83 +    WRITE_LOCKED,
   20.84 +    WRITE_DATA,
   20.85 +    WRITE_L3,
   20.86 +    WRITE_UNLOCKED,
   20.87 +    
   20.88 +    /* L3 Zero Path: the L3 link is ZERO (no data block yet) */
   20.89 +    ALLOC_DATA_L3z,
   20.90 +    WRITE_L3_L3z,
   20.91 +    
   20.92 +    /* L3 Fault Path: the L3 link exists but is not writable */
   20.93 +    ALLOC_DATA_L3f,
   20.94 +    WRITE_L3_L3f,
   20.95 +    
   20.96 +    /* L2 Zero Path: the L2 link is ZERO (empty subtree) */
   20.97 +    ALLOC_DATA_L2z,
   20.98 +    WRITE_L2_L2z,
   20.99 +    ALLOC_L3_L2z,
  20.100 +    WRITE_L2_L3z,
  20.101 +    
  20.102 +    /* L2 Fault Path: the L2 link exists but is not writable */
  20.103 +    READ_L3_L2f,
  20.104 +    ALLOC_DATA_L2f,
  20.105 +    WRITE_L2_L2f,
  20.106 +    ALLOC_L3_L2f,
  20.107 +    WRITE_L2_L3f,
  20.108 +
  20.109 +    /* L1 Zero Path: the L1 link is ZERO (empty subtree) */
  20.110 +    ALLOC_DATA_L1z,
  20.111 +    ALLOC_L3_L1z,
  20.112 +    ALLOC_L2_L1z,
  20.113 +    WRITE_L1_L1z,
  20.114 +
  20.115 +    /* L1 Fault Path: the L1 link exists but is not writable */
  20.116 +    READ_L2_L1f,
  20.117 +    READ_L3_L1f,
  20.118 +    ALLOC_DATA_L1f,
  20.119 +    ALLOC_L3_L1f,
  20.120 +    ALLOC_L2_L1f,
  20.121 +    WRITE_L1_L1f,
  20.122 +    
  20.123 +};
  20.124 +
  20.125 +enum radix_offsets {   /* indices into io_req.radix[] and radix_addr[] */
  20.126 +    L1 = 0, 
  20.127 +    L2 = 1,
  20.128 +    L3 = 2
  20.129 +};
  20.130 +
  20.131 +
  20.132 +static void read_cb(struct io_ret ret, void *param);
  20.133 +static void write_cb(struct io_ret ret, void *param);
  20.134 +
  20.135 +/*
  20.136 + * vdi_read: queue an asynchronous read of virtual block @vaddr of @vdi.
  20.137 + * @cb is called with the result and @param once read_cb() has finished.
  20.138 + * Returns 0 when queued, ERR_BAD_VADDR or ERR_NOMEM otherwise.
  20.139 + */
  20.140 +int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
  20.141 +{
  20.142 +    struct io_req *rq;
  20.143 +    if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR;
  20.144 +    rq = (struct io_req *)malloc(sizeof (struct io_req));
  20.145 +    if (rq == NULL) return ERR_NOMEM;
  20.146 +
  20.147 +    /* Every second slot of a bottom-level node stores crc32 values etc., */
  20.148 +    /* so the address is doubled to land on the data-link slots.          */
  20.149 +    rq->vaddr = vaddr << 1;
  20.150 +    rq->radix[0] = rq->radix[1] = rq->radix[2] = NULL;
  20.151 +    rq->op    = IO_OP_READ;
  20.152 +    rq->root  = vdi->radix_root;
  20.153 +    rq->lock  = vdi->radix_lock;
  20.154 +    rq->cb    = cb;
  20.155 +    rq->param = param;
  20.156 +    rq->state = READ_LOCKED;
  20.157 +    block_rlock(rq->lock, L1_IDX(rq->vaddr), read_cb, rq);
  20.158 +    return 0;
  20.159 +}
  20.160 +
  20.161 +
  20.162 +/*
  20.163 + * vdi_write: queue an asynchronous write of @block to virtual block @vaddr.
  20.164 + * The crc32 of @block is computed here and travels with the request; @cb
  20.165 + * and @param are stored for the write_cb() state machine.
  20.166 + * Returns 0 when queued, ERR_BAD_VADDR or ERR_NOMEM otherwise.
  20.167 + */
  20.168 +int   vdi_write(vdi_t *vdi, u64 vaddr, char *block, 
  20.169 +                io_cb_t cb, void *param)
  20.170 +{
  20.171 +    struct io_req *rq;
  20.172 +    if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR;
  20.173 +    rq = (struct io_req *)malloc(sizeof (struct io_req));
  20.174 +    if (rq == NULL) return ERR_NOMEM;
  20.175 +
  20.176 +    /* Every second slot of a bottom-level node stores crc32 values etc., */
  20.177 +    /* so the address is doubled to land on the data-link slots.          */
  20.178 +    rq->vaddr  = vaddr << 1;
  20.179 +    rq->radix[0] = rq->radix[1] = rq->radix[2] = NULL;
  20.180 +    rq->op     = IO_OP_WRITE;
  20.181 +    rq->root   = vdi->radix_root;
  20.182 +    rq->lock   = vdi->radix_lock;
  20.183 +    rq->block  = block;
  20.184 +    rq->cb     = cb;
  20.185 +    rq->param  = param;
  20.186 +    rq->radix_addr[L1] = getid(rq->root); /* for consistency */
  20.187 +    rq->state  = WRITE_LOCKED;
  20.188 +
  20.189 +    /* Todo: add a pseudoheader to the block to include some location   */
  20.190 +    /* information in the CRC as well.                                  */
  20.191 +    rq->bi.crc = (u32) crc32(0L, Z_NULL, 0);
  20.192 +    rq->bi.crc = (u32) crc32(rq->bi.crc, block, BLOCK_SIZE);
  20.193 +    rq->bi.unused = 0xdeadbeef;
  20.194 +    block_wlock(rq->lock, L1_IDX(rq->vaddr), write_cb, rq);
  20.195 +    return 0;
  20.196 +}
  20.197 +
  20.198 +/*
  20.199 + * read_cb: state machine behind vdi_read(). It runs once when the read lock
  20.200 + * is granted and again after every block operation it issues; req->state
  20.201 + * selects the step (L1 -> L2 -> L3 -> data -> unlock). The request is freed
  20.202 + * before the caller's callback is invoked.
  20.203 + */
  20.204 +static void read_cb(struct io_ret ret, void *param)
  20.205 +{
  20.206 +    struct io_req *req = (struct io_req *)param;
  20.207 +    radix_tree_node node;
  20.208 +    u64 idx;
  20.209 +    char *block;
  20.210 +    void *req_param;
  20.211 +
  20.212 +    DPRINTF("read_cb\n");
  20.213 +    /* get record */
  20.214 +    switch(req->state) {
  20.215 +    case READ_LOCKED: 
  20.216 +        DPRINTF("READ_LOCKED\n");
  20.217 +        req->state = READ_L1;
  20.218 +        block_read(getid(req->root), read_cb, req); 
  20.219 +        break;
  20.220 +
  20.221 +    case READ_L1: /* block is the radix root */
  20.222 +        DPRINTF("READ_L1\n");
  20.223 +        block = IO_BLOCK(ret);
  20.224 +        if (block == NULL) goto fail;
  20.225 +        node = (radix_tree_node) block;
  20.226 +        idx  = getid( node[L1_IDX(req->vaddr)] );
  20.227 +        free(block);
  20.228 +        if ( idx == ZERO ) {
  20.229 +            req->state = RETURN_ZERO;
  20.230 +            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  20.231 +        } else {
  20.232 +            req->state = READ_L2;
  20.233 +            block_read(idx, read_cb, req);
  20.234 +        }
  20.235 +        break;
  20.236 +
  20.237 +    case READ_L2:
  20.238 +        DPRINTF("READ_L2\n");
  20.239 +        block = IO_BLOCK(ret);
  20.240 +        if (block == NULL) goto fail;
  20.241 +        node = (radix_tree_node) block;
  20.242 +        idx  = getid( node[L2_IDX(req->vaddr)] );
  20.243 +        free(block);
  20.244 +        if ( idx == ZERO ) {
  20.245 +            req->state = RETURN_ZERO;
  20.246 +            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  20.247 +        } else {
  20.248 +            req->state = READ_L3;
  20.249 +            block_read(idx, read_cb, req);
  20.250 +        }
  20.251 +        break;
  20.252 +
  20.253 +    case READ_L3:
  20.254 +    {
  20.255 +        struct block_info *bi; /* lives in the slot after the data link */
  20.256 +
  20.257 +        DPRINTF("READ_L3\n");
  20.258 +        block = IO_BLOCK(ret);
  20.259 +        if (block == NULL) goto fail;
  20.260 +        node = (radix_tree_node) block;
  20.261 +        idx  = getid( node[L3_IDX(req->vaddr)] );
  20.262 +        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
  20.263 +        req->bi = *bi;
  20.264 +        free(block);
  20.265 +        if ( idx == ZERO )  {
  20.266 +            req->state = RETURN_ZERO;
  20.267 +            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  20.268 +        } else {
  20.269 +            req->state = READ_DATA;
  20.270 +            block_read(idx, read_cb, req);
  20.271 +        }
  20.272 +        break;
  20.273 +    }
  20.274 +    case READ_DATA:
  20.275 +    {
  20.276 +        u32 crc;
  20.277 +
  20.278 +        DPRINTF("READ_DATA\n");
  20.279 +        block = IO_BLOCK(ret);
  20.280 +        if (block == NULL) goto fail;
  20.281 +
  20.282 +        /* crc check */
  20.283 +        crc = (u32) crc32(0L, Z_NULL, 0); 
  20.284 +        crc = (u32) crc32(crc, block, BLOCK_SIZE); 
  20.285 +        if (crc != req->bi.crc) {
  20.286 +            /* TODO: add a retry loop here.                          */
  20.287 +            /* Do this after the cache is added -- make sure to      */
  20.288 +            /* invalidate the bad page before reissuing the read.    */
  20.289 +            warnx("Bad CRC on vaddr (%llu:%u)", /* warnx: errno is unrelated */
  20.290 +                  (unsigned long long)req->vaddr, (unsigned)req->bi.unused);
  20.291 +#ifdef PRINT_BADCRC_PAGES
  20.292 +            {
  20.293 +                int j;
  20.294 +                for (j=0; j<BLOCK_SIZE; j++) {
  20.295 +                    if (isprint((unsigned char)block[j])) {
  20.296 +                        printf("%c", block[j]);
  20.297 +                    } else {
  20.298 +                        printf(".");
  20.299 +                    }
  20.300 +                    if ((j % 64) == 63) printf("\n"); /* 64 columns per row */
  20.301 +                }
  20.302 +            }
  20.303 +#endif /* PRINT_BADCRC_PAGES */
  20.304 +
  20.305 +            /* fast and loose for the moment. */
  20.306 +            /* goto fail;                     */
  20.307 +        }
  20.308 +
  20.309 +        req->retval = ret;
  20.310 +        req->state = READ_UNLOCKED;
  20.311 +        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  20.312 +        break;
  20.313 +    }
  20.314 +    case READ_UNLOCKED:
  20.315 +    {
  20.316 +        struct io_ret r;
  20.317 +        io_cb_t cb;
  20.318 +        DPRINTF("READ_UNLOCKED\n");
  20.319 +        req_param = req->param;
  20.320 +        r         = req->retval;
  20.321 +        cb        = req->cb;
  20.322 +        free(req);
  20.323 +        cb(r, req_param);
  20.324 +        break;
  20.325 +    }
  20.326 +
  20.327 +    case RETURN_ZERO: /* no mapping: answer with a block from newblock() */
  20.328 +    {
  20.329 +        struct io_ret r;
  20.330 +        io_cb_t cb;
  20.331 +        DPRINTF("RETURN_ZERO\n");
  20.332 +        req_param = req->param;
  20.333 +        cb        = req->cb;
  20.334 +        free(req);
  20.335 +        r.type = IO_BLOCK_T;
  20.336 +        r.u.b = newblock();
  20.337 +        cb(r, req_param);
  20.338 +        break;
  20.339 +    }
  20.340 +
  20.341 +    default:
  20.342 +        DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
  20.343 +        goto fail;
  20.344 +    }
  20.345 +
  20.346 +    return;
  20.347 +
  20.348 + fail:
  20.349 +    {
  20.350 +        struct io_ret r;
  20.351 +        io_cb_t cb;
  20.352 +        DPRINTF("asyn_read had a read error.\n");
  20.353 +        req_param = req->param;
  20.354 +        r         = ret;
  20.355 +        cb        = req->cb;
  20.356 +        free(req);
  20.357 +        cb(r, req_param);
  20.358 +    }
  20.359 +}
  20.360 +
  20.361 +static void write_cb(struct io_ret r, void *param)
  20.362 +{
  20.363 +    struct io_req *req = (struct io_req *)param;
  20.364 +    radix_tree_node node;
  20.365 +    u64 a, addr;
  20.366 +    void *req_param;
  20.367 +    struct block_info *bi;
  20.368 +
  20.369 +    switch(req->state) {
  20.370 +    	
  20.371 +    case WRITE_LOCKED:
  20.372 +        
  20.373 +        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
  20.374 +    	req->state = READ_L1;
  20.375 +    	block_read(getid(req->root), write_cb, req); 
  20.376 +    	break;
  20.377 +    	
  20.378 +    case READ_L1: /* block is the radix root */
  20.379 +
  20.380 +        DPRINTF("READ_L1\n");
  20.381 +        node = (radix_tree_node) IO_BLOCK(r);
  20.382 +        if (node == NULL) goto fail;
  20.383 +        a    = node[L1_IDX(req->vaddr)];
  20.384 +        addr = getid(a);
  20.385 +
  20.386 +        req->radix_addr[L2] = addr;
  20.387 +        req->radix[L1] = node;
  20.388 +
  20.389 +        if ( addr == ZERO ) {
  20.390 +            /* L1 empty subtree: */
  20.391 +            req->state = ALLOC_DATA_L1z;
  20.392 +            block_alloc( req->block, write_cb, req );
  20.393 +        } else if ( !iswritable(a) ) {
  20.394 +            /* L1 fault: */
  20.395 +            req->state = READ_L2_L1f;
  20.396 +            block_read( addr, write_cb, req );
  20.397 +        } else {
  20.398 +            req->state = READ_L2;
  20.399 +            block_read( addr, write_cb, req );
  20.400 +        }
  20.401 +        break;
  20.402 +    
  20.403 +    case READ_L2:
  20.404 +
  20.405 +        DPRINTF("READ_L2\n");
  20.406 +        node = (radix_tree_node) IO_BLOCK(r);
  20.407 +        if (node == NULL) goto fail;
  20.408 +        a    = node[L2_IDX(req->vaddr)];
  20.409 +        addr = getid(a);
  20.410 +
  20.411 +        req->radix_addr[L3] = addr;
  20.412 +        req->radix[L2] = node;
  20.413 +
  20.414 +        if ( addr == ZERO ) {
  20.415 +            /* L2 empty subtree: */
  20.416 +            req->state = ALLOC_DATA_L2z;
  20.417 +            block_alloc( req->block, write_cb, req );
  20.418 +        } else if ( !iswritable(a) ) {
  20.419 +            /* L2 fault: */
  20.420 +            req->state = READ_L3_L2f;
  20.421 +            block_read( addr, write_cb, req );
  20.422 +        } else {
  20.423 +            req->state = READ_L3;
  20.424 +            block_read( addr, write_cb, req );
  20.425 +        }
  20.426 +        break;
  20.427 +    
  20.428 +    case READ_L3:
  20.429 +
  20.430 +        DPRINTF("READ_L3\n");
  20.431 +        node = (radix_tree_node) IO_BLOCK(r);
  20.432 +        if (node == NULL) goto fail;
  20.433 +        a    = node[L3_IDX(req->vaddr)];
  20.434 +        addr = getid(a);
  20.435 +
  20.436 +        req->radix[L3] = node;
  20.437 +
  20.438 +        if ( addr == ZERO ) {
  20.439 +            /* L3 fault: */
  20.440 +            req->state = ALLOC_DATA_L3z;
  20.441 +            block_alloc( req->block, write_cb, req );
  20.442 +        } else if ( !iswritable(a) ) {
  20.443 +            /* L3 fault: */
  20.444 +            req->state = ALLOC_DATA_L3f;
  20.445 +            block_alloc( req->block, write_cb, req );
  20.446 +        } else {
  20.447 +            req->state = WRITE_DATA;
  20.448 +            block_write( addr, req->block, write_cb, req );
  20.449 +        }
  20.450 +        break;
  20.451 +    
  20.452 +    case WRITE_DATA:
  20.453 +
  20.454 +        DPRINTF("WRITE_DATA\n");
  20.455 +        /* The L3 radix points to the correct block, we just need to  */
  20.456 +        /* update the crc.                                            */
  20.457 +        if (IO_INT(r) < 0) goto fail;
  20.458 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.459 +        req->bi.unused = 101;
  20.460 +        *bi = req->bi;
  20.461 +        req->state = WRITE_L3;
  20.462 +        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
  20.463 +        break;
  20.464 +    
  20.465 +    /* L3 Zero Path: */
  20.466 +
  20.467 +    case ALLOC_DATA_L3z:
  20.468 +
  20.469 +        DPRINTF("ALLOC_DATA_L3z\n");
  20.470 +        addr = IO_ADDR(r);
  20.471 +        a = writable(addr);
  20.472 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  20.473 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.474 +        req->bi.unused = 102;
  20.475 +        *bi = req->bi;
  20.476 +        req->state = WRITE_L3_L3z;
  20.477 +        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
  20.478 +        break;
  20.479 +    
  20.480 +    /* L3 Fault Path: */
  20.481 +
  20.482 +    case ALLOC_DATA_L3f:
  20.483 +    
  20.484 +        DPRINTF("ALLOC_DATA_L3f\n");
  20.485 +        addr = IO_ADDR(r);
  20.486 +        a = writable(addr);
  20.487 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  20.488 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.489 +        req->bi.unused = 103;
  20.490 +        *bi = req->bi;
  20.491 +        req->state = WRITE_L3_L3f;
  20.492 +        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
  20.493 +        break;
  20.494 +
  20.495 +    /* L2 Zero Path: */
  20.496 +        
  20.497 +    case ALLOC_DATA_L2z:
  20.498 +
  20.499 +        DPRINTF("ALLOC_DATA_L2z\n");
  20.500 +        addr = IO_ADDR(r);
  20.501 +        a = writable(addr);
  20.502 +        req->radix[L3] = newblock();
  20.503 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  20.504 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.505 +        req->bi.unused = 104;
  20.506 +        *bi = req->bi;
  20.507 +        req->state = ALLOC_L3_L2z;
  20.508 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  20.509 +        break;
  20.510 +
  20.511 +    case ALLOC_L3_L2z:
  20.512 +
  20.513 +        DPRINTF("ALLOC_L3_L2z\n");
  20.514 +        addr = IO_ADDR(r);
  20.515 +        a = writable(addr);
  20.516 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  20.517 +        req->state = WRITE_L2_L2z;
  20.518 +        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
  20.519 +        break;
  20.520 +        
  20.521 +    /* L2 Fault Path: */
  20.522 +        
  20.523 +    case READ_L3_L2f:
  20.524 +    
  20.525 +    	DPRINTF("READ_L3_L2f\n");
  20.526 +        node = (radix_tree_node) IO_BLOCK(r);
  20.527 +        clear_L3_w_bits(node);
  20.528 +        if (node == NULL) goto fail;
  20.529 +        a    = node[L2_IDX(req->vaddr)];
  20.530 +        addr = getid(a);
  20.531 +
  20.532 +        req->radix[L3] = node;
  20.533 +        req->state = ALLOC_DATA_L2f;
  20.534 +        block_alloc( req->block, write_cb, req );
  20.535 +        break;
  20.536 +                
  20.537 +    case ALLOC_DATA_L2f:
  20.538 +
  20.539 +        DPRINTF("ALLOC_DATA_L2f\n");
  20.540 +        addr = IO_ADDR(r);
  20.541 +        a = writable(addr);
  20.542 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  20.543 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.544 +        req->bi.unused = 105;
  20.545 +        *bi = req->bi;
  20.546 +        req->state = ALLOC_L3_L2f;
  20.547 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  20.548 +        break;
  20.549 +
  20.550 +    case ALLOC_L3_L2f:
  20.551 +
  20.552 +        DPRINTF("ALLOC_L3_L2f\n");
  20.553 +        addr = IO_ADDR(r);
  20.554 +        a = writable(addr);
  20.555 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  20.556 +        req->state = WRITE_L2_L2f;
  20.557 +        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
  20.558 +        break;
  20.559 +        
  20.560 +    /* L1 Zero Path: */
  20.561 +    
  20.562 +    case ALLOC_DATA_L1z:
  20.563 +
  20.564 +        DPRINTF("ALLOC_DATA_L1z\n");
  20.565 +        addr = IO_ADDR(r);
  20.566 +        a = writable(addr);
  20.567 +        req->radix[L3] = newblock();
  20.568 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  20.569 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.570 +        req->bi.unused = 106;
  20.571 +        *bi = req->bi;
  20.572 +        req->state = ALLOC_L3_L1z;
  20.573 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  20.574 +        break;
  20.575 +        
  20.576 +    case ALLOC_L3_L1z:
  20.577 +
  20.578 +        DPRINTF("ALLOC_L3_L1z\n");
  20.579 +        addr = IO_ADDR(r);
  20.580 +        a = writable(addr);
  20.581 +        req->radix[L2] = newblock();
  20.582 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  20.583 +        req->state = ALLOC_L2_L1z;
  20.584 +        block_alloc( (char*)req->radix[L2], write_cb, req );
  20.585 +        break;
  20.586 +
  20.587 +    case ALLOC_L2_L1z:
  20.588 +
  20.589 +        DPRINTF("ALLOC_L2_L1z\n");
  20.590 +        addr = IO_ADDR(r);
  20.591 +        a = writable(addr);
  20.592 +        req->radix[L1][L1_IDX(req->vaddr)] = a;
  20.593 +        req->state = WRITE_L1_L1z;
  20.594 +        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
  20.595 +        break;
  20.596 +
  20.597 +    /* L1 Fault Path: */
  20.598 +        
  20.599 +    case READ_L2_L1f:
  20.600 +    
  20.601 +    	DPRINTF("READ_L2_L1f\n");
  20.602 +        node = (radix_tree_node) IO_BLOCK(r);
  20.603 +        clear_w_bits(node);
  20.604 +        if (node == NULL) goto fail;
  20.605 +        a    = node[L2_IDX(req->vaddr)];
  20.606 +        addr = getid(a);
  20.607 +
  20.608 +        req->radix_addr[L3] = addr;
  20.609 +        req->radix[L2] = node;
  20.610 +        
  20.611 +        if (addr == ZERO) {
  20.612 +            /* nothing below L2, create an empty L3 and alloc data. */
  20.613 +            /* (So skip READ_L3_L1f.) */
  20.614 +            req->radix[L3] = newblock();
  20.615 +            req->state = ALLOC_DATA_L1f;
  20.616 +            block_alloc( req->block, write_cb, req );
  20.617 +        } else {
  20.618 +            req->state = READ_L3_L1f;
  20.619 +            block_read( addr, write_cb, req );
  20.620 +        }
  20.621 +        break;
  20.622 +        
  20.623 +    case READ_L3_L1f:
  20.624 +    
  20.625 +    	DPRINTF("READ_L3_L1f\n");
  20.626 +        node = (radix_tree_node) IO_BLOCK(r);
  20.627 +        clear_L3_w_bits(node);
  20.628 +        if (node == NULL) goto fail;
  20.629 +        a    = node[L2_IDX(req->vaddr)];
  20.630 +        addr = getid(a);
  20.631 +
  20.632 +        req->radix[L3] = node;
  20.633 +        req->state = ALLOC_DATA_L1f;
  20.634 +        block_alloc( req->block, write_cb, req );
  20.635 +        break;
  20.636 +                
  20.637 +    case ALLOC_DATA_L1f:
  20.638 +
  20.639 +        DPRINTF("ALLOC_DATA_L1f\n");
  20.640 +        addr = IO_ADDR(r);
  20.641 +        a = writable(addr);
  20.642 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  20.643 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  20.644 +        req->bi.unused = 107;
  20.645 +        *bi = req->bi;
  20.646 +        req->state = ALLOC_L3_L1f;
  20.647 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  20.648 +        break;
  20.649 +
  20.650 +    case ALLOC_L3_L1f:
  20.651 +
  20.652 +        DPRINTF("ALLOC_L3_L1f\n");
  20.653 +        addr = IO_ADDR(r);
  20.654 +        a = writable(addr);
  20.655 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  20.656 +        req->state = ALLOC_L2_L1f;
  20.657 +        block_alloc( (char*)req->radix[L2], write_cb, req );
  20.658 +        break;
  20.659 +
  20.660 +    case ALLOC_L2_L1f:
  20.661 +
  20.662 +        DPRINTF("ALLOC_L2_L1f\n");
  20.663 +        addr = IO_ADDR(r);
  20.664 +        a = writable(addr);
  20.665 +        req->radix[L1][L1_IDX(req->vaddr)] = a;
  20.666 +        req->state = WRITE_L1_L1f;
  20.667 +        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
  20.668 +        break;
  20.669 +
  20.670 +    case WRITE_L3:
  20.671 +    case WRITE_L3_L3z:
  20.672 +    case WRITE_L3_L3f:
  20.673 +    case WRITE_L2_L2z:
  20.674 +    case WRITE_L2_L2f:
  20.675 +    case WRITE_L1_L1z:
  20.676 +    case WRITE_L1_L1f:
  20.677 +    {
  20.678 +    	int i;
  20.679 +        DPRINTF("DONE\n");
  20.680 +        /* free any saved node vals. */
  20.681 +        for (i=0; i<3; i++)
  20.682 +            if (req->radix[i] != 0) free(req->radix[i]);
  20.683 +        req->retval = r;
  20.684 +        req->state = WRITE_UNLOCKED;
  20.685 +        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
  20.686 +        break;
  20.687 +    }
  20.688 +    case WRITE_UNLOCKED:
  20.689 +    {
  20.690 +        struct io_ret r;
  20.691 +        io_cb_t cb;
  20.692 +        DPRINTF("WRITE_UNLOCKED!\n");
  20.693 +        req_param = req->param;
  20.694 +        r         = req->retval;
  20.695 +        cb        = req->cb;
  20.696 +        free(req);
  20.697 +        cb(r, req_param);
  20.698 +        break;
  20.699 +    }
  20.700 +        
  20.701 +    default:
  20.702 +    	DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
  20.703 +    	goto fail;
  20.704 +    }
  20.705 +    
  20.706 +    return;
  20.707 +    
  20.708 + fail:
  20.709 +    {
  20.710 +        struct io_ret r;
  20.711 +        io_cb_t cb;
  20.712 +        int i;
  20.713 +
  20.714 +        DPRINTF("asyn_write had a read error mid-way.\n");
  20.715 +        req_param = req->param;
  20.716 +        cb        = req->cb;
  20.717 +        r.type = IO_INT_T;
  20.718 +        r.u.i  = -1;
  20.719 +        /* free any saved node vals. */
  20.720 +        for (i=0; i<3; i++)
  20.721 +            if (req->radix[i] != 0) free(req->radix[i]);
  20.722 +        free(req);
  20.723 +        cb(r, req_param);
  20.724 +    }
  20.725 +}
  20.726 +
  20.727 +char *vdi_read_s(vdi_t *vdi, u64 vaddr)
  20.728 +{
  20.729 +    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
  20.730 +    char *block = NULL;
  20.731 +    int ret;
  20.732 +    /* Blocking wrapper around vdi_read(); returns the block, or NULL. */
  20.733 +    void reads_cb(struct io_ret r, void *param) 
  20.734 +    {
  20.735 +        block = IO_BLOCK(r);
  20.736 +        pthread_mutex_unlock((pthread_mutex_t *)param);
  20.737 +    }
  20.738 +    /* m is taken here and released by reads_cb when the read completes. */
  20.739 +    pthread_mutex_lock(&m);
  20.740 +    ret = vdi_read(vdi, vaddr, reads_cb, &m);
  20.741 +    /* second lock waits for reads_cb; skipped when vdi_read returns non-zero. */
  20.742 +    if (ret == 0) pthread_mutex_lock(&m);
  20.743 +    /* NOTE(review): relies on unlocking m from the callback's context -- confirm. */
  20.744 +    return block;
  20.745 +}
  20.746 +
  20.747 +
  20.748 +int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
  20.749 +{
  20.750 +    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
  20.751 +    int ret, result = -1; /* reported if the callback never delivers a value */
  20.752 +    /* Blocking wrapper around vdi_write(); returns the write's int result. */
  20.753 +    void writes_cb(struct io_ret r, void *param) 
  20.754 +    {
  20.755 +        result = IO_INT(r);
  20.756 +        pthread_mutex_unlock((pthread_mutex_t *)param);
  20.757 +    }
  20.758 +    /* m is taken here and released by writes_cb when the write completes. */
  20.759 +    pthread_mutex_lock(&m);
  20.760 +    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
  20.761 +    /* wait for completion only when the request was accepted (ret == 0). */
  20.762 +    if (ret == 0) pthread_mutex_lock(&m);
  20.763 +    else result = ret; /* no callback expected: return vdi_write's error */
  20.764 +    return result;
  20.765 +}
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/tools/blktap/parallax/requests-async.h	Sun Jul 03 22:36:48 2005 +0000
    21.3 @@ -0,0 +1,29 @@
    21.4 +#ifndef _REQUESTSASYNC_H_
    21.5 +#define _REQUESTSASYNC_H_
    21.6 +
    21.7 +#include "block-async.h"
    21.8 +#include "blockstore.h" /* for newblock etc. */
    21.9 +
   21.10 +/*
   21.11 +#define BLOCK_SIZE 4096
   21.12 +#define ZERO 0ULL
   21.13 +#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
   21.14 +#define iswritable(x) (((x) & 1LLU) != 0)
   21.15 +#define writable(x) (((x) << 1) | 1LLU)
   21.16 +#define readonly(x) ((u64)((x) << 1))
   21.17 +*/
   21.18 +
   21.19 +#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
   21.20 +#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
   21.21 +
   21.22 +int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
   21.23 +int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
   21.24 +             
   21.25 +/* synchronous versions: */
   21.26 +char *vdi_read_s (vdi_t *vdi, u64 vaddr);
   21.27 +int   vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
   21.28 +
   21.29 +#define ERR_BAD_VADDR  -1
   21.30 +#define ERR_NOMEM      -2
   21.31 +
   21.32 +#endif //_REQUESTSASYNC_H_
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/tools/blktap/parallax/snaplog.c	Sun Jul 03 22:36:48 2005 +0000
    22.3 @@ -0,0 +1,238 @@
    22.4 +/**************************************************************************
    22.5 + * 
    22.6 + * snaplog.c
    22.7 + *
    22.8 + * Snapshot log on-disk data structure.
    22.9 + *
   22.10 + */
   22.11 + 
   22.12 + /* VDI histories are made from chains of snapshot logs.  These logs record 
   22.13 +  * the (radix) root and timestamp of individual snapshots.
   22.14 +  *
   22.15 +  * creation of a new VDI involves 'forking' a snapshot log, by creating a 
   22.16 +  * new, empty log (in a new VDI) and parenting it off of a record in an 
   22.17 +  * existing snapshot log.
   22.18 +  *
   22.19 +  * snapshot log blocks have at most one writer.
   22.20 +  */
   22.21 +
   22.22 +#include <stdio.h>
   22.23 +#include <stdlib.h>
   22.24 +#include <sys/time.h>
   22.25 +#include "blockstore.h"
   22.26 +#include "snaplog.h"
   22.27 +
   22.28 +
   22.29 +
   22.30 +snap_block_t *snap_get_block(u64 block)
   22.31 +{
   22.32 +    snap_block_t *blk = (snap_block_t *)readblock(block);
   22.33 +    /* Read a snapshot-log block; NULL if unreadable or the magic is wrong. */
   22.34 +    if ( blk == NULL)
   22.35 +        return NULL;
   22.36 +    if ( blk->hdr.magic != SNAP_MAGIC ) {
   22.37 +        freeblock(blk);
   22.38 +        return NULL;
   22.39 +    }
   22.40 +    /* on success the caller owns blk and must release it with freeblock(). */
   22.41 +    return blk;
   22.42 +}
   22.43 +    
   22.44 +int snap_get_id(snap_id_t *id, snap_rec_t *target)
   22.45 +{
   22.46 +    snap_block_t *blk;
   22.47 +    /* Copy the record named by id into *target; 0 on success, -1 on error. */
   22.48 +    if ( (id == NULL) || (target == NULL) )
   22.49 +        return -1;
   22.50 +    
   22.51 +    blk = snap_get_block(id->block);
   22.52 +    
   22.53 +    if ( blk == NULL ) 
   22.54 +        return -1;
   22.55 +    /* valid records are snaps[0 .. nr_entries-1]; nr_entries is the free slot. */
   22.56 +    if ( id->index >= blk->hdr.nr_entries ) {
   22.57 +        freeblock(blk);
   22.58 +        return -1;
   22.59 +    }
   22.60 +    
   22.61 +    *target = blk->snaps[id->index];
   22.62 +    freeblock(blk);
   22.63 +    return 0;
   22.64 +}
   22.65 +
   22.66 +int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
   22.67 +                                  snap_id_t *new_id)
   22.68 +{
   22.69 +    /* Create an empty snapshot-log block chained to parent_id (forked at
   22.70 +     * fork_id) and store its id in new_id.  A NULL parent_id creates a
   22.71 +     * root log.  The ids themselves are not validated against the store.
   22.72 +     * Returns 0 on success, -1 if a block cannot be allocated or read. */
   22.73 +    snap_block_t *blk, *pblk;
   22.74 +    
   22.75 +    blk = (snap_block_t *)newblock();
   22.76 +    if (blk == NULL)
   22.77 +        return -1;
   22.78 +    blk->hdr.magic  = SNAP_MAGIC;
   22.79 +    blk->hdr.nr_entries  = 0;
   22.80 +    blk->hdr.log_entries = 0;
   22.81 +    blk->hdr.immutable   = 0;
   22.82 +    
   22.83 +    /* parent lives in a different block than the fork: carry its count. */
   22.84 +    if (   (parent_id  != NULL) 
   22.85 +        && (parent_id->block != fork_id->block) 
   22.86 +        && (parent_id->block != 0)) {
   22.87 +        pblk = snap_get_block(parent_id->block);
   22.88 +        if (pblk == NULL) {
   22.89 +            freeblock(blk);
   22.90 +            return -1;
   22.91 +        }
   22.92 +        blk->hdr.log_entries = pblk->hdr.log_entries;
   22.93 +        freeblock(pblk);
   22.94 +    }
   22.95 +    
   22.96 +    if (parent_id != NULL) {
   22.97 +        blk->hdr.parent_block = *parent_id;
   22.98 +        blk->hdr.fork_block   = *fork_id;
   22.99 +    } else {
  22.100 +        blk->hdr.parent_block = null_snap_id;
  22.101 +        blk->hdr.fork_block   = null_snap_id;
  22.102 +    }
  22.103 +    
  22.104 +    new_id->index = 0;
  22.105 +    new_id->block = allocblock(blk);
  22.106 +    freeblock(blk);
  22.107 +    return (new_id->block == 0) ? -1 : 0; /* block id 0 is treated as failure */
  22.108 +}
  22.109 +
  22.110 +int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
  22.111 +{
  22.112 +    return __snap_block_create(parent_id, parent_id, new_id); /* fork point == parent */
  22.113 +}
  22.114 +
  22.115 +int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
  22.116 +{
  22.117 +    /* Append rec to the log at old_id; new_id receives the next free slot. */
  22.118 +    snap_id_t id = *old_id;
  22.119 +    snap_block_t *blk;
  22.120 +    
  22.121 +    if ( rec->deleted == 1 ) {
  22.122 +        printf("Attempt to append a deleted snapshot!\n");
  22.123 +        return -1;
  22.124 +    }
  22.125 +    
  22.126 +    blk = snap_get_block(id.block);
  22.127 +    if ( blk == NULL )
  22.128 +        return -1;
  22.129 +    if ( blk->hdr.immutable != 0 ) {
  22.130 +        printf("Attempt to snap an immutable snap block!\n");
  22.131 +        freeblock(blk);
  22.132 +        return -1;
  22.133 +    }
  22.134 +    new_id->block = id.block;
  22.135 +    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
  22.136 +        /* block is full: seal it and continue in a newly created block. */
  22.137 +        id.index--; /* make id point to the last full record */
  22.138 +        if ( __snap_block_create(&id, &blk->hdr.fork_block, new_id) != 0 ) {
  22.139 +            freeblock(blk);
  22.140 +            return -1;
  22.141 +        }
  22.142 +        blk->hdr.immutable = 1;
  22.143 +        writeblock(id.block, blk);
  22.144 +        freeblock(blk);
  22.145 +        blk = snap_get_block(new_id->block);
  22.146 +        if ( blk == NULL )
  22.147 +            return -1;
  22.148 +        id = *new_id;
  22.149 +    }
  22.150 +    
  22.151 +    blk->snaps[blk->hdr.nr_entries] = *rec;
  22.152 +    blk->hdr.nr_entries++;
  22.153 +    blk->hdr.log_entries++;
  22.154 +    new_id->index = blk->hdr.nr_entries;
  22.155 +    writeblock(id.block, blk);
  22.156 +    freeblock(blk);
  22.157 +    return 0;
  22.158 +}
  22.159 +
  22.160 +int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
  22.161 +{
  22.162 +    snap_block_t *p_blk, *c_blk;
  22.163 +    snap_rec_t   *p_rec, *c_rec;
  22.164 +    int ret = -1;
  22.165 +    /* Collapse parent p_id into child c_id; returns collapse()'s value or -1. */
  22.166 +    p_blk = snap_get_block(p_id->block);
  22.167 +    
  22.168 +    if (p_blk == NULL) return(-1);
  22.169 +    
  22.170 +    if (c_id->block == p_id->block)
  22.171 +    {
  22.172 +        c_blk = p_blk;
  22.173 +    } else {
  22.174 +         c_blk = snap_get_block(c_id->block);
  22.175 +    }
  22.176 +    /* c_blk can only be NULL here if it was read separately from p_blk. */
  22.177 +    if (c_blk == NULL) {
  22.178 +        freeblock(p_blk);
  22.179 +        return(-1);
  22.180 +    }
  22.181 +     
  22.182 +    /* parent and child must not be deleted. */
  22.183 +    p_rec = &p_blk->snaps[p_id->index];
  22.184 +    c_rec = &c_blk->snaps[c_id->index];
  22.185 +    /*
  22.186 +    if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) {
  22.187 +        printf("One of those snaps is already deleted.\n");
  22.188 +        goto done;
  22.189 +    }
  22.190 +    */
  22.191 +    /* first non-deleted thing in the log before child must be parent. */
  22.192 +    
  22.193 +    /* XXX todo: test the range here for delete (and eventually fork) bits) */
  22.194 +    /* for now, snaps must be consecutive, on the same log page: */
  22.195 +    
  22.196 +    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
  22.197 +    {
  22.198 +        printf("Deleting non-consecutive snaps is not done yet.\n");
  22.199 +        goto done;
  22.200 +    }
  22.201 +    
  22.202 +    /* mark parent as deleted XXX: may need to lock parent block here.*/
  22.203 +    p_rec->deleted = 1;
  22.204 +    writeblock(p_id->block, p_blk);
  22.205 +    
  22.206 +    /* delete the parent */
  22.207 +    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
  22.208 +    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
  22.209 +    
  22.210 +    /* return the number of blocks reclaimed. */
  22.211 +    
  22.212 +done:
  22.213 +    if (c_blk != p_blk) freeblock(c_blk);
  22.214 +    freeblock(p_blk);
  22.215 +    
  22.216 +    return(ret);
  22.217 +}
  22.218 +
  22.219 +void snap_print_history(snap_id_t *snap_id)
  22.220 +{
  22.221 +    snap_id_t id = *snap_id;
  22.222 +    unsigned int idx = id.index;
  22.223 +    snap_block_t *new_blk, *blk = snap_get_block(id.block);
  22.224 +    /* Print records from snap_id back to index 0, then follow parent links. */
  22.225 +    while ( blk ) {
  22.226 +        printf("[Snap block %Ld]:\n", id.block);
  22.227 +        do {
  22.228 +            printf("   %03u: root: %Ld ts: %ld.%ld\n", idx, 
  22.229 +                    blk->snaps[idx].radix_root,
  22.230 +                    blk->snaps[idx].timestamp.tv_sec,
  22.231 +                    blk->snaps[idx].timestamp.tv_usec);
  22.232 +        } while (idx-- != 0);
  22.233 +        /* idx has wrapped; restart from the parent's own record index. */
  22.234 +        id = blk->hdr.parent_block;
  22.235 +        idx = id.index;
  22.236 +        /* parent block 0 ends the chain, so the loop must see NULL. */
  22.237 +        new_blk = (id.block != 0) ? snap_get_block(id.block) : NULL;
  22.238 +        freeblock(blk);
  22.239 +        blk = new_blk;
  22.240 +    }
  22.241 +}
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/tools/blktap/parallax/snaplog.h	Sun Jul 03 22:36:48 2005 +0000
    23.3 @@ -0,0 +1,61 @@
    23.4 +/**************************************************************************
    23.5 + * 
    23.6 + * snaplog.h
    23.7 + *
    23.8 + * Snapshot log on-disk data structure.
    23.9 + *
   23.10 + */
   23.11 + 
   23.12 +#include "radix.h"
   23.13 +#include "blockstore.h"    /* for BLOCK_SIZE */
   23.14 + 
   23.15 +#ifndef __SNAPLOG_H__
   23.16 +#define __SNAPLOG_H__
   23.17 +
   23.18 +typedef struct snap_id {
   23.19 +    u64            block;
   23.20 +    unsigned int   index;
   23.21 +} snap_id_t;
   23.22 +
   23.23 +typedef struct snap_rec {
   23.24 +    u64            radix_root;
   23.25 +    struct timeval timestamp;
   23.26 +    /* flags: */
   23.27 +    unsigned       deleted:1;
   23.28 +} snap_rec_t;
   23.29 +
   23.30 +
   23.31 +int  snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
   23.32 +int  snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
   23.33 +int  snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
   23.34 +void snap_print_history(snap_id_t *snap_id);
   23.35 +int  snap_get_id(snap_id_t *id, snap_rec_t *target);
   23.36 +
   23.37 +
   23.38 +/* exported for vdi debugging */
   23.39 +#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
   23.40 +
   23.41 +static const snap_id_t null_snap_id = { 0, 0 }; 
   23.42 +
   23.43 +typedef struct snap_block_hdr {
   23.44 +    u64            magic;
   23.45 +    snap_id_t      parent_block; /* parent block within this chain */
   23.46 +    snap_id_t      fork_block;   /* where this log was forked */
   23.47 +    unsigned       log_entries;  /* total entries since forking */
   23.48 +    unsigned short nr_entries;   /* entries in snaps[] */
   23.49 +    unsigned short immutable;    /* has this snap page become immutable? */
   23.50 +} snap_block_hdr_t;
   23.51 +
   23.52 +
   23.53 +#define SNAPS_PER_BLOCK \
   23.54 +    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
   23.55 +
   23.56 +typedef struct snap_block {
   23.57 +    snap_block_hdr_t hdr;
   23.58 +    snap_rec_t       snaps[SNAPS_PER_BLOCK];
   23.59 +} snap_block_t;
   23.60 +    
   23.61 +
   23.62 +snap_block_t *snap_get_block(u64 block);
   23.63 +
   23.64 +#endif /* __SNAPLOG_H__ */
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/tools/blktap/parallax/vdi.c	Sun Jul 03 22:36:48 2005 +0000
    24.3 @@ -0,0 +1,367 @@
    24.4 +/**************************************************************************
    24.5 + * 
    24.6 + * vdi.c
    24.7 + *
    24.8 + * Virtual Disk Image (VDI) Interfaces
    24.9 + *
   24.10 + */
   24.11 + 
   24.12 +#include <stdio.h>
   24.13 +#include <stdlib.h>
   24.14 +#include <fcntl.h>
   24.15 +#include <string.h>
   24.16 +#include <sys/time.h>
   24.17 +#include <pthread.h>
   24.18 +#include "blockstore.h"
   24.19 +#include "block-async.h"
   24.20 +#include "requests-async.h"
   24.21 +#include "radix.h"
   24.22 +#include "vdi.h"
   24.23 +                    
   24.24 +#define VDI_REG_BLOCK   2LL
   24.25 +#define VDI_RADIX_ROOT  writable(3)
   24.26 +                                                            
   24.27 +#if 0
   24.28 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   24.29 +#else
   24.30 +#define DPRINTF(_f, _a...) ((void)0)
   24.31 +#endif
   24.32 +
   24.33 +/* I haven't decided about this registry stuff, so this is just a really
   24.34 + * quick lash-up so that there is some way to track VDIs.
   24.35 + *
   24.36 + * (Most vdi access should be with a direct handle to the block, so this
   24.37 + *  registry is just for start-of-day lookup and other control operations.)
   24.38 + */
   24.39 +
   24.40 +vdi_registry_t *create_vdi_registry(void)
   24.41 +{
   24.42 +    vdi_registry_t *reg = (vdi_registry_t *)newblock();
   24.43 +    /* Build a fresh registry, write it to VDI_REG_BLOCK; NULL if no block. */
   24.44 +    if (reg == NULL)
   24.45 +        return NULL;
   24.46 +    
   24.47 +    /* zero-fill the vdi radix root while we have an empty block. */
   24.48 +    writeblock(VDI_RADIX_ROOT, (void *)reg);
   24.49 +    /* NOTE(review): VDI_RADIX_ROOT is writable(3); confirm writeblock wants that form. */
   24.50 +    
   24.51 +    DPRINTF("[vdi.c] Creating VDI registry!\n");
   24.52 +    reg->magic      = VDI_REG_MAGIC;
   24.53 +    reg->nr_vdis    = 0;
   24.54 +    /* store the initialised header in the fixed registry block. */
   24.55 +    writeblock(VDI_REG_BLOCK, (void *)reg);
   24.56 +    
   24.57 +    return reg;
   24.58 +}
   24.59 +    
   24.60 +vdi_registry_t *get_vdi_registry(void)
   24.61 +{
   24.62 +    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
   24.63 +    /* block could not be read: create a fresh registry (may also fail). */
   24.64 +    if ( vdi_reg == NULL )
   24.65 +        vdi_reg = create_vdi_registry();
   24.66 +    if ( vdi_reg == NULL )
   24.67 +        return NULL;
   24.68 +    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
   24.69 +        freeblock(vdi_reg);
   24.70 +        return NULL;
   24.71 +    }
   24.72 +    return vdi_reg; /* caller releases it with freeblock() */
   24.73 +}
   24.74 +
   24.75 +
   24.76 +vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
   24.77 +{
   24.78 +    int ret;
   24.79 +    vdi_t *vdi;
   24.80 +    vdi_registry_t *vdi_reg;
   24.81 +    snap_rec_t snap_rec;
   24.82 +    /* Create and register a VDI, forked from parent_snap when it resolves. */
   24.83 +    /* create a vdi struct */
   24.84 +    vdi = newblock();
   24.85 +    if (vdi == NULL) 
   24.86 +        return NULL;
   24.87 +    
   24.88 +    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
   24.89 +        vdi->radix_root = snapshot(snap_rec.radix_root);
   24.90 +    } else {
   24.91 +        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
   24.92 +        vdi->radix_root = writable(vdi->radix_root); /* grr. */
   24.93 +    }
   24.94 +    
   24.95 +    /* create a snapshot log, and add it to the vdi struct */
   24.96 +    
   24.97 +    ret = snap_block_create(parent_snap, &vdi->snap);
   24.98 +    if ( ret != 0 ) {
   24.99 +        DPRINTF("Error getting snap block in vdi_create.\n");
  24.100 +        freeblock(vdi);
  24.101 +        return NULL;
  24.102 +    }
  24.103 +            
  24.104 +    /* append the vdi to the registry, fill block and id.             */
  24.105 +    /* implicit allocation means we have to write the vdi twice here. */
  24.106 +    vdi_reg    = get_vdi_registry();
  24.107 +    if ( vdi_reg == NULL ) {
  24.108 +        freeblock(vdi);
  24.109 +        return NULL;
  24.110 +    }
  24.111 +    
  24.112 +    vdi->block = allocblock((void *)vdi);
  24.113 +    vdi->id    = vdi_reg->nr_vdis++;
  24.114 +    strncpy(vdi->name, name, VDI_NAME_SZ);
  24.115 +    vdi->name[VDI_NAME_SZ - 1] = '\0'; /* last valid index of name[] */
  24.116 +    vdi->radix_lock = NULL; /* for tidiness */
  24.117 +    writeblock(vdi->block, (void *)vdi);
  24.118 +    
  24.119 +    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
  24.120 +    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
  24.121 +    freeblock(vdi_reg);
  24.122 +    
  24.123 +    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
  24.124 +    if (vdi->radix_lock == NULL) 
  24.125 +    {
  24.126 +    	perror("couldn't malloc radix_lock for new vdi!");
  24.127 +    	freeblock(vdi);
  24.128 +    	return NULL;
  24.129 +    }
  24.130 +    radix_lock_init(vdi->radix_lock);
  24.131 +    
  24.132 +    return vdi;
  24.133 +}
  24.134 +
  24.135 +/* vdi_get and vdi_put currently act more like alloc/free -- they don't 
  24.136 + * do refcount-based allocation.  
  24.137 + */
  24.138 +vdi_t *vdi_get(u64 vdi_id)
  24.139 +{
  24.140 +    u64 vdi_blk;
  24.141 +    vdi_t *vdi;
  24.142 +    
  24.143 +    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
  24.144 +    if ( vdi_blk == 0 )
  24.145 +        return NULL;
  24.146 +    
  24.147 +    vdi = (vdi_t *)readblock(vdi_blk);
  24.148 +    if ( vdi == NULL ) /* registry entry exists but its block is unreadable */
  24.149 +        return NULL;
  24.150 +    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
  24.151 +    if (vdi->radix_lock == NULL) 
  24.152 +    {
  24.153 +    	perror("couldn't malloc radix_lock for new vdi!");
  24.154 +    	freeblock(vdi);
  24.155 +    	return NULL;
  24.156 +    }
  24.157 +    radix_lock_init(vdi->radix_lock);
  24.158 +    
  24.159 +    return vdi;
  24.160 +}
  24.161 +
  24.162 +void vdi_put(vdi_t *vdi)
  24.163 +{
  24.164 +    free(vdi->radix_lock); /* lock was malloc'ed by vdi_get/vdi_create */
  24.165 +    freeblock(vdi);        /* release the in-memory copy of the vdi block */
  24.166 +}
  24.167 +
  24.168 +void vdi_snapshot(vdi_t *vdi)
  24.169 +{
  24.170 +    snap_rec_t rec;
  24.171 +    int ret;
  24.172 +    /* Log the current radix root as a snapshot, then switch to a new root. */
  24.173 +    rec.radix_root = vdi->radix_root;
  24.174 +    gettimeofday(&rec.timestamp, NULL);
  24.175 +    rec.deleted = 0;
  24.176 +    /* vdi->snap is passed as both old and new id, so it is updated in place. */
  24.177 +    vdi->radix_root = snapshot(vdi->radix_root);
  24.178 +    ret = snap_append(&vdi->snap, &rec, &vdi->snap);
  24.179 +    if ( ret != 0 ) {
  24.180 +        printf("snap_append returned failure\n");
  24.181 +        return; /* the new radix_root stays in memory but is not written */
  24.182 +    }
  24.183 +    writeblock(vdi->block, vdi);
  24.184 +}
  24.185 +    
  24.186 +int __init_vdi()
  24.187 +{
  24.188 +    /* sneak this in here for the moment. */
  24.189 +    __rcache_init();
  24.190 +    
  24.191 +    /* force the registry to be created if it doesn't exist. */
  24.192 +    vdi_registry_t *vdi_reg = get_vdi_registry();
  24.193 +    if (vdi_reg == NULL) {
  24.194 +        printf("[vdi.c] Couldn't get/create a VDI registry!\n");
  24.195 +        return -1;
  24.196 +    }
  24.197 +    freeblock(vdi_reg);
  24.198 +    /* the registry copy was only needed to prove it exists. */
  24.199 +    
  24.200 +    return 0;
  24.201 +}
  24.202 +    
  24.203 +#ifdef VDI_STANDALONE
  24.204 +
  24.205 +#define TEST_VDIS      50
  24.206 +#define NR_ITERS    50000
  24.207 +#define FORK_POINTS   200
  24.208 +#define INIT_VDIS       3
  24.209 +#define INIT_SNAPS     40
  24.210 +
  24.211 +/* These must be of decreasing size: */
  24.212 +#define NEW_FORK       (RAND_MAX-(RAND_MAX/1000))
  24.213 +#define NEW_ROOT_VDI   (RAND_MAX-((RAND_MAX/1000)*2))
  24.214 +#define NEW_FORK_VDI   (RAND_MAX-((RAND_MAX/1000)*3))
  24.215 +
  24.216 +#define GRAPH_DOT_FILE "vdi.dot"
  24.217 +#define GRAPH_PS_FILE  "vdi.ps"
  24.218 +
  24.219 +
  24.220 +typedef struct sh_st {
  24.221 +    snap_id_t     id;
  24.222 +    struct sh_st *next;
  24.223 +} sh_t;
  24.224 +
  24.225 +#define SNAP_HASHSZ 1024
  24.226 +sh_t *node_hash[SNAP_HASHSZ];
  24.227 +#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
  24.228 +
  24.229 +#define SNAPID_EQUAL(_a,_b) \
  24.230 +    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
  24.231 +int sh_check_and_add(snap_id_t *id)
  24.232 +{
  24.233 +    sh_t **s = &node_hash[SNAP_HASH(id)];
  24.234 +    /* Returns 1 if id was already recorded, 0 after recording it. */
  24.235 +    while (*s != NULL) {
  24.236 +        if (SNAPID_EQUAL(&((*s)->id), id))
  24.237 +            return 1;
  24.238 +        s = &(*s)->next; /* advance the cursor; never overwrite the chain */
  24.239 +    }
  24.240 +    /* s now addresses the NULL link at the end of the bucket's chain. */
  24.241 +    *s = (sh_t *)malloc(sizeof(sh_t));
  24.242 +    (*s)->id = *id;
  24.243 +    (*s)->next = NULL;
  24.244 +    
  24.245 +    return 0;
  24.246 +}
  24.247 +
  24.248 +int main(int argc, char *argv[])
  24.249 +{
  24.250 +    vdi_t *vdi_list[TEST_VDIS];
  24.251 +    snap_id_t id, fork_points[FORK_POINTS];
  24.252 +    int nr_vdis = 0, nr_forks = 0;
  24.253 +    int i, j, r;
  24.254 +    FILE *f;
  24.255 +    char name[VDI_NAME_SZ];
  24.256 +    /* Stress test: random VDI/snapshot/fork workload, dumped as a dot graph. */
  24.257 +    __init_blockstore();
  24.258 +    __init_vdi();
  24.259 +    
  24.260 +    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
  24.261 +    
  24.262 +    for (i=0; i<INIT_VDIS; i++) {
  24.263 +        r=rand();
  24.264 +        
  24.265 +        sprintf(name, "VDI Number %d", nr_vdis);
  24.266 +        vdi_list[i] = vdi_create(NULL, name);
  24.267 +        for (j=0; j<(r%INIT_SNAPS); j++)
  24.268 +            vdi_snapshot(vdi_list[i]);
  24.269 +        fork_points[i] = vdi_list[i]->snap;
  24.270 +        nr_vdis++;
  24.271 +        nr_forks++;
  24.272 +    }
  24.273 +    
  24.274 +    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
  24.275 +            
  24.276 +    for (i=0; i<NR_ITERS; i++) {
  24.277 +        r = rand();
  24.278 +        
  24.279 +        if ( r > NEW_FORK ) {
  24.280 +            if ( nr_forks >= FORK_POINTS ) /* fork_points[] is full */
  24.281 +                continue;
  24.282 +            id = vdi_list[r%nr_vdis]->snap;
  24.283 +            if ( ( id.block == 0 ) || ( id.index == 0 ) )
  24.284 +                continue;
  24.285 +            id.index--;
  24.286 +            fork_points[nr_forks++] = id;
  24.287 +            
  24.288 +        } else if ( r > NEW_ROOT_VDI ) {
  24.289 +            
  24.290 +            if ( nr_vdis == TEST_VDIS )
  24.291 +                continue;
  24.292 +            
  24.293 +            sprintf(name, "VDI Number %d.", nr_vdis);
  24.294 +            vdi_list[nr_vdis++] = vdi_create(NULL, name);
  24.295 +            
  24.296 +        } else if ( r > NEW_FORK_VDI ) {
  24.297 +            
  24.298 +            if ( nr_vdis == TEST_VDIS )
  24.299 +                continue;
  24.300 +            
  24.301 +            sprintf(name, "VDI Number %d.", nr_vdis);
  24.302 +            vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name);
  24.303 +            
  24.304 +        } else /* SNAPSHOT */ {
  24.305 +            
  24.306 +            vdi_snapshot(vdi_list[r%nr_vdis]);
  24.307 +            
  24.308 +        }
  24.309 +    }
  24.310 +    
  24.311 +    /* now dump it out to a dot file. */
  24.312 +    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
  24.313 +    
  24.314 +    f = fopen(GRAPH_DOT_FILE, "w");
  24.315 +    
  24.316 +    /* write graph preamble */
  24.317 +    fprintf(f, "digraph G {\n");
  24.318 +    fprintf(f, "   rankdir=LR\n");
  24.319 +    
  24.320 +    for (i=0; i<nr_vdis; i++) {
  24.321 +        char oldnode[255];
  24.322 +        snap_block_t *blk;
  24.323 +        snap_id_t id = vdi_list[i]->snap;
  24.324 +        int nr_snaps, done=0;
  24.325 +        
  24.326 +        /* add a node for the id */
  24.327 +printf("vdi: %d\n", i);
  24.328 +        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
  24.329 +                id.block, id.index, vdi_list[i]->name,
  24.330 +                id.block, id.index);
  24.331 +        sprintf(oldnode, "n%Ld%d", id.block, id.index);
  24.332 +        
  24.333 +        while (id.block != 0) {
  24.334 +            blk = snap_get_block(id.block);
  24.335 +            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
  24.336 +            id = blk->hdr.fork_block;
  24.337 +            
  24.338 +            done = sh_check_and_add(&id);
  24.339 +            
  24.340 +            /* add a node for the fork_id */
  24.341 +            if (!done) {
  24.342 +                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
  24.343 +                    id.block, id.index,
  24.344 +                    id.block, id.index);
  24.345 +            }
  24.346 +            
  24.347 +            /* add an edge between them */
  24.348 +            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
  24.349 +                    id.block, id.index, oldnode, nr_snaps);
  24.350 +            sprintf(oldnode, "n%Ld%d", id.block, id.index);
  24.351 +            freeblock(blk);
  24.352 +            
  24.353 +            if (done) break;
  24.354 +        }
  24.355 +    }
  24.356 +    
  24.357 +    /* write graph postamble */
  24.358 +    fprintf(f, "}\n");
  24.359 +    fclose(f);
  24.360 +    
  24.361 +    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
  24.362 +    {
  24.363 +        char cmd[255];
  24.364 +        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
  24.365 +        system(cmd);
  24.366 +    }
  24.367 +    return 0;
  24.368 +}
  24.369 +
  24.370 +#endif
    25.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.2 +++ b/tools/blktap/parallax/vdi.h	Sun Jul 03 22:36:48 2005 +0000
    25.3 @@ -0,0 +1,55 @@
    25.4 +#ifndef _VDI_H_
    25.5 +#define _VDI_H_
    25.6 +/**************************************************************************
    25.7 + * 
    25.8 + * vdi.h
    25.9 + *
   25.10 + * Virtual Disk Image (VDI) Interfaces
   25.11 + *
   25.12 + */
   25.13 +/* NOTE(review): the guard below nests inside _VDI_H_ above; one of the two is redundant. */
   25.14 +#ifndef __VDI_H__
   25.15 +#define __VDI_H__
   25.16 +
   25.17 +#include "blktaplib.h"
   25.18 +#include "snaplog.h"
   25.19 +
   25.20 +#define VDI_HEIGHT     27 /* Note that these are now hard-coded */
   25.21 +#define VDI_REG_HEIGHT 27 /* in the async lookup code           */
   25.22 +/* Capacity in bytes of the name buffer in vdi_t. */
   25.23 +#define VDI_NAME_SZ 256
   25.24 +
   25.25 +/* Descriptor for one virtual disk image. */
   25.26 +typedef struct vdi {
   25.27 +    u64         id;               /* unique vdi id -- used by the registry   */
   25.28 +    u64         block;            /* block where this vdi lives (also unique)*/
   25.29 +    u64         radix_root;       /* radix root node for block mappings      */
   25.30 +    snap_id_t   snap;             /* next snapshot slot for this VDI         */
   25.31 +    struct vdi *next;             /* used to hash-chain in blkif.            */
   25.32 +    blkif_vdev_t vdevice;         /* currently mounted as...                 */
   25.33 +    struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs   */
   25.34 +    char        name[VDI_NAME_SZ];/* human readable vdi name                 */
   25.35 +} vdi_t;
   25.36 +/* Value for vdi_registry_t.magic -- presumably marks a valid registry; TODO confirm. */
   25.37 +#define VDI_REG_MAGIC   0xff00ff0bb0ff00ffLL
   25.38 +
   25.39 +typedef struct vdi_registry {
   25.40 +    u64     magic;
   25.41 +    u64     nr_vdis;              /* the tools probe vdi ids 0 .. nr_vdis-1  */
   25.42 +} vdi_registry_t;
   25.43 +
   25.44 +/* The tools treat a NULL return from vdi_get/get_vdi_registry/vdi_create as failure. */
   25.45 +int __init_vdi(void);
   25.46 +
   25.47 +vdi_t *vdi_get(u64 vdi_id);
   25.48 +void vdi_put(vdi_t *vdi);
   25.49 +vdi_registry_t *get_vdi_registry(void);
   25.50 +vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
   25.51 +u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
   25.52 +void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
   25.53 +void vdi_snapshot(vdi_t *vdi);
   25.54 +
   25.55 +
   25.56 +#endif /* __VDI_H__ */
   25.57 +
   25.58 +#endif //_VDI_H_
    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/tools/blktap/parallax/vdi_create.c	Sun Jul 03 22:36:48 2005 +0000
    26.3 @@ -0,0 +1,52 @@
    26.4 +/**************************************************************************
    26.5 + * 
    26.6 + * vdi_create.c
    26.7 + *
    26.8 + * Create a new vdi.
    26.9 + *
   26.10 + */
   26.11 + 
   26.12 +#include <stdio.h>
   26.13 +#include <stdlib.h>
   26.14 +#include <string.h>
   26.15 +#include <sys/time.h>
   26.16 +#include "blockstore.h"
   26.17 +#include "radix.h"
   26.18 +#include "vdi.h"
   26.19 +
   26.20 +int main(int argc, char *argv[])
   26.21 +{
   26.22 +    /* Create a VDI named argv[1]; if argv[2] and argv[3] are given, fork it from that snapshot. */
   26.23 +    vdi_t       *vdi;
   26.24 +    char         name[VDI_NAME_SZ] = "";
   26.25 +    snap_id_t    id;
   26.26 +    int          from_snap = 0;
   26.27 +    __init_blockstore();
   26.28 +    __init_vdi();
   26.29 +    
   26.30 +    if ( argc == 1 ) {
   26.31 +        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
   26.32 +        exit(-1);
   26.33 +    }
   26.34 +    /* Copy at most VDI_NAME_SZ-1 bytes and terminate inside the array (index VDI_NAME_SZ is past the end). */
   26.35 +    strncpy( name, argv[1], VDI_NAME_SZ - 1);
   26.36 +    name[VDI_NAME_SZ - 1] = '\0';
   26.37 +    
   26.38 +    if ( argc > 3 ) {
   26.39 +        id.block   = (u64)          atoll(argv[2]);
   26.40 +        id.index   = (unsigned int) atol (argv[3]);
   26.41 +        from_snap  = 1;
   26.42 +    }
   26.43 +    
   26.44 +    vdi = vdi_create( from_snap ? &id : NULL, name);
   26.45 +    
   26.46 +    if ( vdi == NULL ) {
   26.47 +        printf("Failed to create VDI!\n");
   26.48 +        /* nothing to release: vdi is NULL on this path */
   26.49 +        exit(-1);
   26.50 +    }
   26.51 +    
   26.52 +    freeblock(vdi);
   26.53 +    
   26.54 +    return (0);
   26.55 +}
    27.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.2 +++ b/tools/blktap/parallax/vdi_fill.c	Sun Jul 03 22:36:48 2005 +0000
    27.3 @@ -0,0 +1,81 @@
    27.4 +/**************************************************************************
    27.5 + * 
    27.6 + * vdi_fill.c
    27.7 + *
    27.8 + * Hoover a file or device into a vdi.
    27.9 + * You must first create the vdi with vdi_create.
   27.10 + *
   27.11 + */
   27.12 + 
   27.13 +#include <stdio.h>
   27.14 +#include <stdlib.h>
   27.15 +#include <string.h>
   27.16 +#include <sys/types.h>
   27.17 +#include <sys/stat.h>
   27.18 +#include <fcntl.h>
   27.19 +#include <unistd.h>
   27.20 +#include "blockstore.h"
   27.21 +#include "radix.h"
   27.22 +#include "requests-async.h"
   27.23 +#include "vdi.h"
   27.24 +
   27.25 +int main(int argc, char *argv[])
   27.26 +{
   27.27 +    vdi_t       *vdi;
   27.28 +    u64          id;
   27.29 +    int          fd;
   27.30 +    struct stat  st;
   27.31 +    u64          tot_size;
   27.32 +    char         spage[BLOCK_SIZE];
   27.33 +    ssize_t      count;           /* signed: read() returns -1 on error */
   27.34 +    u64          vblock = 0;
   27.35 +    /* Copy file argv[2] into VDI argv[1], one BLOCK_SIZE read per VDI block, starting at block 0. */
   27.36 +    __init_blockstore();
   27.37 +    init_block_async();
   27.38 +    __init_vdi();
   27.39 +    
   27.40 +    if ( argc < 3 ) {
   27.41 +        printf("usage: %s <VDI id> <filename>\n", argv[0]);
   27.42 +        exit(-1);
   27.43 +    }
   27.44 +        
   27.45 +    id = (u64) atoll(argv[1]);
   27.46 +    
   27.47 +    vdi = vdi_get( id );
   27.48 +    
   27.49 +    if ( vdi == NULL ) {
   27.50 +        printf("Failed to retreive VDI %Ld!\n", id);
   27.51 +        exit(-1);
   27.52 +    }
   27.53 +    
   27.54 +    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
   27.55 +    
   27.56 +    if (fd < 0) {
   27.57 +        printf("Couldn't open %s!\n", argv[2]);
   27.58 +        exit(-1);
   27.59 +    }
   27.60 +    
   27.61 +    if ( fstat(fd, &st) != 0 ) {
   27.62 +        printf("Couldn't stat %s!\n", argv[2]);
   27.63 +        exit(-1);
   27.64 +    }
   27.65 +    
   27.66 +    tot_size = (u64) st.st_size;
   27.67 +    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
   27.68 +    
   27.69 +    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);    
   27.70 +    printf("           ");
   27.71 +    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
   27.72 +        vdi_write_s(vdi, vblock, spage);
   27.73 +        /* NOTE: after a short read the tail of spage still holds bytes from the previous block. */
   27.74 +        vblock++;
   27.75 +        if ((vblock % 512) == 0)
   27.76 +            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
   27.77 +        fflush(stdout); /* runs on every block, as before; only the printf is conditional */
   27.78 +    }
   27.79 +    printf("\n");
   27.80 +    if ( count < 0 ) { printf("Read error on %s!\n", argv[2]); exit(-1); }
   27.81 +    freeblock(vdi);
   27.82 +    close(fd);
   27.83 +    return (0);
   27.84 +}
    28.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.2 +++ b/tools/blktap/parallax/vdi_list.c	Sun Jul 03 22:36:48 2005 +0000
    28.3 @@ -0,0 +1,47 @@
    28.4 +/**************************************************************************
    28.5 + * 
    28.6 + * vdi_list.c
    28.7 + *
    28.8 + * Print a list of VDIs on the block store.
    28.9 + *
   28.10 + */
   28.11 + 
   28.12 +#include <stdio.h>
   28.13 +#include <stdlib.h>
   28.14 +#include <string.h>
   28.15 +#include <sys/time.h>
   28.16 +#include "blockstore.h"
   28.17 +#include "radix.h"
   28.18 +#include "vdi.h"
   28.19 +
   28.20 +int main(int argc, char *argv[])
   28.21 +{
   28.22 +    vdi_registry_t *reg;
   28.23 +    vdi_t *vdi;
   28.24 +    int i;
   28.25 +    /* Print the id and name of every VDI found via the registry; argc/argv are not used. */
   28.26 +    __init_blockstore();
   28.27 +    __init_vdi();
   28.28 +    
   28.29 +    reg = get_vdi_registry();
   28.30 +    
   28.31 +    if ( reg == NULL ) {
   28.32 +        printf("couldn't get VDI registry.\n");
   28.33 +        exit(-1);
   28.34 +    }
   28.35 +    /* Probe ids 0 .. nr_vdis-1; ids for which vdi_get() returns NULL are skipped silently. */
   28.36 +    for (i=0; i < reg->nr_vdis; i++) {
   28.37 +        vdi = vdi_get(i);
   28.38 +        
   28.39 +        if ( vdi != NULL ) {
   28.40 +            /* one line per VDI: id right-aligned in 10 columns, name in 60 */
   28.41 +            printf("%10Ld %60s\n", vdi->id, vdi->name);
   28.42 +            freeblock(vdi);
   28.43 +            
   28.44 +        }
   28.45 +    }
   28.46 +    
   28.47 +    freeblock(reg);
   28.48 +    
   28.49 +    return 0;
   28.50 +}
    29.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.2 +++ b/tools/blktap/parallax/vdi_snap.c	Sun Jul 03 22:36:48 2005 +0000
    29.3 @@ -0,0 +1,43 @@
    29.4 +/**************************************************************************
    29.5 + * 
    29.6 + * vdi_snap.c
    29.7 + *
    29.8 + * Snapshot a vdi.
    29.9 + *
   29.10 + */
   29.11 + 
   29.12 +#include <stdio.h>
   29.13 +#include <stdlib.h>
   29.14 +#include <string.h>
   29.15 +#include <sys/time.h>
   29.16 +#include "blockstore.h"
   29.17 +#include "radix.h"
   29.18 +#include "vdi.h"
   29.19 +
   29.20 +int main(int argc, char *argv[])
   29.21 +{
   29.22 +    vdi_t  *vdi;
   29.23 +    u64     id;
   29.24 +    /* Look up the VDI whose id is argv[1] and call vdi_snapshot() on it. */
   29.25 +    __init_blockstore();
   29.26 +    __init_vdi();
   29.27 +    
   29.28 +    if ( argc == 1 ) {
   29.29 +        printf("usage: %s <VDI id>\n", argv[0]);
   29.30 +        exit(-1);
   29.31 +    }
   29.32 +    
   29.33 +    id = (u64) atoll(argv[1]);
   29.34 +    
   29.35 +    vdi = vdi_get(id);
   29.36 +    
   29.37 +    if ( vdi == NULL ) {
   29.38 +        printf("couldn't find the requested VDI.\n");
   29.39 +        freeblock(vdi); /* NOTE(review): vdi is NULL here */
   29.40 +        exit(-1);
   29.41 +    }
   29.42 +    
   29.43 +    vdi_snapshot(vdi);
   29.44 +    /* NOTE(review): vdi is not released before returning, unlike in vdi_create.c. */
   29.45 +    return 0;
   29.46 +}
    30.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.2 +++ b/tools/blktap/parallax/vdi_snap_delete.c	Sun Jul 03 22:36:48 2005 +0000
    30.3 @@ -0,0 +1,48 @@
    30.4 +/**************************************************************************
    30.5 + * 
    30.6 + * vdi_snap_delete.c
    30.7 + *
    30.8 + * Delete a snapshot.
    30.9 + *
   30.10 + * This is not finished:  right now it takes a snap n and calls 
   30.11 + * snap_collapse(n,n+1).
   30.12 + *
   30.13 + * TODO: support for non-consecutive, non-same-block snaps
   30.14 + *       Avoid forking probs.
   30.15 + *
   30.16 + */
   30.17 + 
   30.18 +#include <stdio.h>
   30.19 +#include <stdlib.h>
   30.20 +#include <string.h>
   30.21 +#include <sys/time.h>
   30.22 +#include "blockstore.h"
   30.23 +#include "snaplog.h"
   30.24 +#include "radix.h"
   30.25 +#include "vdi.h"
   30.26 +
   30.27 +int main(int argc, char *argv[])
   30.28 +{
   30.29 +    snap_id_t    id, c_id;
   30.30 +    int ret;
   30.31 +    /* Collapse snapshot (argv[1], argv[2]) into the next index of the same block; print snap_collapse()'s result. */
   30.32 +    __init_blockstore();
   30.33 +    __init_vdi();
   30.34 +    
   30.35 +    if ( argc != 3 ) {
   30.36 +        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
   30.37 +        exit(-1);
   30.38 +    }
   30.39 +    
   30.40 +    id.block   = (u64)          atoll(argv[1]);
   30.41 +    id.index   = (unsigned int) atol (argv[2]);
   30.42 +    /* c_id is id with index + 1: same block, following slot */
   30.43 +    c_id = id;
   30.44 +    c_id.index++;
   30.45 +    
   30.46 +    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
   30.47 +    
   30.48 +    printf("Freed %d blocks.\n", ret);
   30.49 +    
   30.50 +    return 0;
   30.51 +}
    31.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.2 +++ b/tools/blktap/parallax/vdi_snap_list.c	Sun Jul 03 22:36:48 2005 +0000
    31.3 @@ -0,0 +1,82 @@
    31.4 +/**************************************************************************
    31.5 + * 
    31.6 + * vdi_snap_list.c
    31.7 + *
    31.8 + * Print a list of snapshots for the specified vdi.
    31.9 + *
   31.10 + */
   31.11 + 
   31.12 +#include <stdio.h>
   31.13 +#include <stdlib.h>
   31.14 +#include <string.h>
   31.15 +#include <time.h>
   31.16 +#include <sys/time.h>
   31.17 +#include "blockstore.h"
   31.18 +#include "radix.h"
   31.19 +#include "vdi.h"
   31.20 +
   31.21 +int main(int argc, char *argv[])
   31.22 +{
   31.23 +    vdi_t        *vdi;
   31.24 +    u64           id;
   31.25 +    int           i, max_snaps = -1;
   31.26 +    snap_block_t *blk;
   31.27 +    snap_id_t     sid;
   31.28 +    char         *t;
   31.29 +    /* Print the snapshot log of VDI argv[1], optionally limited to argv[2] entries. */
   31.30 +    __init_blockstore();
   31.31 +    __init_vdi();
   31.32 +    
   31.33 +    if ( argc == 1 ) {
   31.34 +        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
   31.35 +        exit(-1);
   31.36 +    }
   31.37 +    
   31.38 +    id = (u64) atoll(argv[1]);
   31.39 +    
   31.40 +    if ( argc > 2 ) {
   31.41 +        max_snaps = atoi(argv[2]);
   31.42 +    }
   31.43 +    
   31.44 +    vdi = vdi_get(id);
   31.45 +    
   31.46 +    if ( vdi == NULL ) {
   31.47 +        printf("couldn't find the requested VDI.\n");
   31.48 +        freeblock(vdi); /* NOTE(review): vdi is NULL here */
   31.49 +        exit(-1);
   31.50 +    }
   31.51 +    /* vdi->snap is the next slot to be used, so listing starts one index below it. */
   31.52 +    sid = vdi->snap;
   31.53 +    sid.index--;
   31.54 +    
   31.55 +    //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", 
   31.56 +    //    "radix root", "d");
   31.57 +    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", 
   31.58 +            "radix root", "d");
   31.59 +    /* For each snap block: print entries from sid.index down to 0, then follow hdr.parent_block until block 0. */
   31.60 +    while (sid.block != 0) {
   31.61 +        blk = snap_get_block(sid.block);
   31.62 +        for (i = sid.index; i >= 0; i--) {
   31.63 +            if ( max_snaps == 0  ) {
   31.64 +                freeblock(blk);
   31.65 +                goto done;
   31.66 +            }
   31.67 +            t = ctime(&blk->snaps[i].timestamp.tv_sec);
   31.68 +            t[strlen(t)-1] = '\0'; /* drop the newline ctime() appends */
   31.69 +            //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n",
   31.70 +            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
   31.71 +                    sid.block, i, 
   31.72 +                    //blk->snaps[i].timestamp.tv_sec,
   31.73 +                    t,
   31.74 +                    blk->snaps[i].timestamp.tv_usec,
   31.75 +                    blk->snaps[i].radix_root,
   31.76 +                    blk->snaps[i].deleted ? "*" : " ");
   31.77 +            if ( max_snaps != -1 ) /* -1 (the default) means no limit */
   31.78 +                max_snaps--;
   31.79 +        }
   31.80 +        sid = blk->hdr.parent_block;
   31.81 +        freeblock(blk);
   31.82 +    }
   31.83 +done:            
   31.84 +    return 0;
   31.85 +}
    32.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.2 +++ b/tools/blktap/parallax/vdi_tree.c	Sun Jul 03 22:36:48 2005 +0000
    32.3 @@ -0,0 +1,132 @@
    32.4 +/**************************************************************************
    32.5 + * 
    32.6 + * vdi_tree.c
    32.7 + *
    32.8 + * Output current vdi tree to dot and postscript.
    32.9 + *
   32.10 + */
   32.11 + 
   32.12 +#include <stdio.h>
   32.13 +#include <stdlib.h>
   32.14 +#include <string.h>
   32.15 +#include <sys/time.h>
   32.16 +#include "blockstore.h"
   32.17 +#include "radix.h"
   32.18 +#include "vdi.h"
   32.19 +
   32.20 +#define GRAPH_DOT_FILE "vdi.dot"
   32.21 +#define GRAPH_PS_FILE  "vdi.ps"
   32.22 +
   32.23 +typedef struct sh_st {
   32.24 +    snap_id_t     id;
   32.25 +    struct sh_st *next;
   32.26 +} sh_t;
   32.27 +/* Chained hash set of the snapshot ids that already have a node in the graph. */
   32.28 +#define SNAP_HASHSZ 1024
   32.29 +sh_t *node_hash[SNAP_HASHSZ];
   32.30 +#define SNAP_HASH(_id) ((unsigned int)((_id)->block^(_id)->index)%SNAP_HASHSZ)
   32.31 +/* The hash is reduced as unsigned so the bucket index can never be negative. */
   32.32 +#define SNAPID_EQUAL(_a,_b) \
   32.33 +    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
   32.34 +int sh_check_and_add(snap_id_t *id)
   32.35 +{
   32.36 +    sh_t **s = &node_hash[SNAP_HASH(id)];
   32.37 +    /* Returns 1 if *id is already in node_hash; otherwise appends a copy to its bucket and returns 0. */
   32.38 +    while (*s != NULL) {
   32.39 +        if (SNAPID_EQUAL(&((*s)->id), id))
   32.40 +            return 1;
   32.41 +        s = &(*s)->next; /* move the cursor; assigning to *s here would unlink and leak the chain */
   32.42 +    }
   32.43 +    *s = (sh_t *)malloc(sizeof(sh_t)); /* s addresses the NULL link that ends the chain */
   32.44 +    if (*s == NULL) { perror("malloc"); exit(-1); }
   32.45 +    (*s)->id = *id;
   32.46 +    (*s)->next = NULL;
   32.47 +    
   32.48 +    return 0;
   32.49 +}
   32.50 +
   32.51 +int main(int argc, char *argv[])
   32.52 +{
   32.53 +    /* Write every VDI's snapshot/fork chain to dot_file, then run dot to render ps_file (argv[1] overrides its name). */
   32.54 +    FILE *f;
   32.55 +    char dot_file[255] = GRAPH_DOT_FILE;
   32.56 +    char  ps_file[255] = GRAPH_PS_FILE;
   32.57 +    vdi_registry_t *reg;
   32.58 +    vdi_t *vdi;
   32.59 +    int i;
   32.60 +    
   32.61 +    __init_blockstore();
   32.62 +    __init_vdi();
   32.63 +    
   32.64 +    reg = get_vdi_registry();
   32.65 +    
   32.66 +    if ( reg == NULL ) {
   32.67 +        printf("couldn't get VDI registry.\n");
   32.68 +        exit(-1);
   32.69 +    }
   32.70 +    
   32.71 +    if ( argc > 1 ) {
   32.72 +        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
   32.73 +        ps_file[sizeof(ps_file) - 1] = '\0'; /* index 255 would be one past the end */
   32.74 +    }
   32.75 +    
   32.76 +    /* now dump it out to a dot file. */
   32.77 +    printf("[o] Dumping state to a dot graph. (%Ld VDIs)\n", reg->nr_vdis);
   32.78 +    
   32.79 +    f = fopen(dot_file, "w");
   32.80 +    if ( f == NULL ) { perror(dot_file); exit(-1); }
   32.81 +    /* write graph preamble */
   32.82 +    fprintf(f, "digraph G {\n");
   32.83 +    fprintf(f, "   rankdir=LR\n");
   32.84 +    
   32.85 +    for (i=0; i<reg->nr_vdis; i++) {
   32.86 +        char oldnode[255];
   32.87 +        snap_block_t *blk;
   32.88 +        snap_id_t id;
   32.89 +        int nr_snaps, done=0;
   32.90 +        
   32.91 +        vdi = vdi_get(i);
   32.92 +        if ( vdi == NULL ) continue; /* no VDI under this id: nothing to draw */
   32.93 +        id = vdi->snap; /* add a node for the id */
   32.94 +printf("vdi: %d\n", i);
   32.95 +        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
   32.96 +                id.block, id.index, vdi->name,
   32.97 +                id.block, id.index);
   32.98 +        sprintf(oldnode, "n%Ld%d", id.block, id.index);
   32.99 +        freeblock(vdi); /* snap id and name have been copied out; vdi is not used below */
  32.100 +        while (id.block != 0) {
  32.101 +            blk = snap_get_block(id.block);
  32.102 +            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
  32.103 +            id = blk->hdr.fork_block;
  32.104 +            
  32.105 +            done = sh_check_and_add(&id);
  32.106 +            
  32.107 +            /* add a node for the fork_id */
  32.108 +            if (!done) {
  32.109 +                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
  32.110 +                    id.block, id.index,
  32.111 +                    id.block, id.index);
  32.112 +            }
  32.113 +            
  32.114 +            /* add an edge between them */
  32.115 +            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
  32.116 +                    id.block, id.index, oldnode, nr_snaps);
  32.117 +            sprintf(oldnode, "n%Ld%d", id.block, id.index);
  32.118 +            freeblock(blk);
  32.119 +            
  32.120 +            if (done) break;
  32.121 +        }
  32.122 +    }
  32.123 +    freeblock(reg);
  32.124 +    /* write graph postamble */
  32.125 +    fprintf(f, "}\n");
  32.126 +    fclose(f);
  32.127 +    
  32.128 +    printf("[o] Generating postscript graph. (%s)\n", ps_file);
  32.129 +    {
  32.130 +        char cmd[600]; /* "dot " + two names of up to 254 chars each + options */
  32.131 +        snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
  32.132 +        system(cmd);
  32.133 +    }
  32.134 +    return 0;
  32.135 +}
    33.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.2 +++ b/tools/blktap/parallax/vdi_unittest.c	Sun Jul 03 22:36:48 2005 +0000
    33.3 @@ -0,0 +1,184 @@
    33.4 +/**************************************************************************
    33.5 + * 
    33.6 + * vdi_unittest.c
    33.7 + *
    33.8 + * Run a small test workload to ensure that data access through a vdi
    33.9 + * is (at least superficially) correct.
   33.10 + *
   33.11 + */
   33.12 + 
   33.13 +#include <stdio.h>
   33.14 +#include <stdlib.h>
   33.15 +#include <string.h>
   33.16 +#include <sys/types.h>
   33.17 +#include <sys/stat.h>
   33.18 +#include <fcntl.h>
   33.19 +#include <unistd.h>
   33.20 +#include "requests-async.h"
   33.21 +#include "blockstore.h"
   33.22 +#include "radix.h"
   33.23 +#include "vdi.h"
   33.24 +
   33.25 +#define TEST_PAGES  32
   33.26 +static char *zero_page;
   33.27 +static char pages[TEST_PAGES][BLOCK_SIZE];
   33.28 +static int next_page = 0;
   33.29 +
   33.30 +void fill_test_pages(void)
   33.31 +{
   33.32 +    int i, j;
   33.33 +    long *page;
   33.34 +    /* Fill each of the TEST_PAGES buffers with random() values, then allocate zero_page. */
   33.35 +    for (i=0; i< TEST_PAGES; i++) {
   33.36 +        page = (long *)pages[i];
   33.37 +        for (j=0; j<(int)(BLOCK_SIZE/sizeof(*page)); j++) { /* BLOCK_SIZE/4 longs overran the page where long is 8 bytes */
   33.38 +            page[j] = random();
   33.39 +        }
   33.40 +    }
   33.41 +    /* NOTE(review): zero_page is later compared against unwritten blocks; assumes newblock() returns zeroed memory -- confirm. */
   33.42 +    zero_page = newblock();
   33.43 +}
   33.44 +
   33.45 +inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
   33.46 +{
   33.47 +    /* Pack three radix-level indices into one virtual block address: */
   33.48 +    /* each step shifts the accumulated value left 9 bits and ORs in the next index. */
   33.49 +    u64 upper = (L1 << 9) | L2;
   33.50 +    u64 vaddr = (upper << 9) | L3;
   33.51 +
   33.52 +    return vaddr;
   33.53 +}
   33.54 +
   33.55 +void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
   33.56 +{
   33.57 +    u64 vaddr;
   33.58 +    char *page = pages[next_page++];
   33.59 +    char *rpage = NULL;
   33.60 +    /* Write the next test page at (L1,L2,L3), read it back and report any difference. */
   33.61 +    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
   33.62 +    /* NOTE: next_page is not checked against TEST_PAGES; callers must not exceed it. */
   33.63 +    vaddr = make_vaddr(L1, L2, L3);
   33.64 +    vdi_write_s(vdi, vaddr, page);
   33.65 +    rpage = vdi_read_s(vdi, vaddr);
   33.66 +
   33.67 +    if (rpage == NULL) 
   33.68 +    {
   33.69 +        printf( "read %Lu returned NULL\n", vaddr); 
   33.70 +        return; 
   33.71 +    }
   33.72 +
   33.73 +    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
   33.74 +    {
   33.75 +        printf( "read %Lu returned a different page\n", vaddr);
   33.76 +        /* no early return: rpage must still be released below */
   33.77 +    }
   33.78 +
   33.79 +    freeblock(rpage);
   33.80 +}
   33.81 +
   33.82 +void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
   33.83 +{
   33.84 +    u64 vaddr;
   33.85 +    char *rpage = NULL;
   33.86 +    /* Read the block at (L1,L2,L3) and report if it is missing or differs from page. */
   33.87 +    printf("TEST  (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
   33.88 +
   33.89 +    vaddr = make_vaddr(L1, L2, L3);
   33.90 +    rpage = vdi_read_s(vdi, vaddr);
   33.91 +
   33.92 +    if (rpage == NULL) 
   33.93 +    {
   33.94 +        printf( "read %Lu returned NULL\n", vaddr); 
   33.95 +        return; 
   33.96 +    }
   33.97 +
   33.98 +    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
   33.99 +    {
  33.100 +        printf( "read %Lu returned a different page\n", vaddr);
  33.101 +        /* no early return: rpage must still be released below */
  33.102 +    }
  33.103 +
  33.104 +    freeblock(rpage);
  33.105 +}
  33.106 +
  33.107 +void coverage_test(vdi_t *vdi)
  33.108 +{
  33.109 +    u64 vaddr; /* NOTE(review): never used in this function */
  33.110 +    int i, j, k; /* indices into pages[] of blocks written before the snapshot, re-checked after it */
  33.111 +
  33.112 +    /* Do a series of writes and reads to test all paths through the 
  33.113 +     * async radix code.  The radix request code will dump CRC warnings
  33.114 +     * if there are data problems here as well.
  33.115 +     */
  33.116 +
  33.117 +    /* L1 Zero */
  33.118 +    touch_block(vdi, 0, 0, 0);
  33.119 +
  33.120 +    /* L2 Zero */
  33.121 +    i = next_page;
  33.122 +    touch_block(vdi, 0, 1, 0);
  33.123 +
  33.124 +    /* L3 Zero */
  33.125 +    j = next_page;
  33.126 +    touch_block(vdi, 0, 0, 1);
  33.127 +    k = next_page;
  33.128 +    touch_block(vdi, 0, 1, 1);
  33.129 +
  33.130 +    /* Direct write */
  33.131 +    touch_block(vdi, 0, 0, 0);
  33.132 +
  33.133 +    vdi_snapshot(vdi);
  33.134 +
  33.135 +    /* L1 fault */
  33.136 +    touch_block(vdi, 0, 0, 0);
  33.137 +    /* test the read-only branches that should have been copied over. */
  33.138 +    test_block(vdi, 0, 1, 0, pages[i]);
  33.139 +    test_block(vdi, 0, 0, 1, pages[j]);
  33.140 +
  33.141 +    /* L2 fault */
  33.142 +    touch_block(vdi, 0, 1, 0);
  33.143 +    test_block(vdi, 0, 1, 1, pages[k]);
  33.144 +
  33.145 +    /* L3 fault */
  33.146 +    touch_block(vdi, 0, 0, 1);
  33.147 +    
  33.148 +    /* read - L1 zero */
  33.149 +    test_block(vdi, 1, 0, 0, zero_page);
  33.150 +    
  33.151 +    /* read - L2 zero */
  33.152 +    test_block(vdi, 0, 2, 0, zero_page);
  33.153 +
  33.154 +    /* read - L3 zero */
  33.155 +    test_block(vdi, 0, 0, 2, zero_page);
  33.156 +}
  33.157 +
  33.158 +int main(int argc, char *argv[])
  33.159 +{
  33.160 +    vdi_t       *vdi;
  33.161 +    u64          id;       /* NOTE(review): id, fd, st, tot_size, spage, dpage, vblock and count are never used here */
  33.162 +    int          fd;
  33.163 +    struct stat  st;
  33.164 +    u64          tot_size;
  33.165 +    char         spage[BLOCK_SIZE];
  33.166 +    char        *dpage;
  33.167 +    u64          vblock = 0, count=0;
  33.168 +    /* Create a fresh VDI named "UNIT TEST VDI", fill the test pages and run coverage_test() on it. */
  33.169 +    __init_blockstore();
  33.170 +    init_block_async();
  33.171 +    __init_vdi();
  33.172 +        
  33.173 +    vdi = vdi_create( NULL, "UNIT TEST VDI");
  33.174 +    
  33.175 +    if ( vdi == NULL ) {
  33.176 +        printf("Failed to create VDI!\n");
  33.177 +        freeblock(vdi); /* NOTE(review): vdi is NULL here */
  33.178 +        exit(-1);
  33.179 +    }
  33.180 +
  33.181 +    fill_test_pages();
  33.182 +    coverage_test(vdi);
  33.183 +    
  33.184 +    freeblock(vdi);
  33.185 +    
  33.186 +    return (0);
  33.187 +}
    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/tools/blktap/parallax/vdi_validate.c	Sun Jul 03 22:36:48 2005 +0000
    34.3 @@ -0,0 +1,97 @@
    34.4 +/**************************************************************************
    34.5 + * 
    34.6 + * vdi_validate.c
    34.7 + *
    34.8 + * Intended to sanity-check vdi_fill and the underlying vdi code.
    34.9 + *
   34.10 + * Block-by-block compare of a vdi with a file/device on the disk.
   34.11 + *
   34.12 + */
   34.13 + 
   34.14 +#include <stdio.h>
   34.15 +#include <stdlib.h>
   34.16 +#include <string.h>
   34.17 +#include <sys/types.h>
   34.18 +#include <sys/stat.h>
   34.19 +#include <fcntl.h>
   34.20 +#include <unistd.h>
   34.21 +#include "blockstore.h"
   34.22 +#include "radix.h"
   34.23 +#include "vdi.h"
   34.24 +#include "requests-async.h"
   34.25 +
   34.26 +int main(int argc, char *argv[])
   34.27 +{
   34.28 +    vdi_t       *vdi;
   34.29 +    u64          id;
   34.30 +    int          fd;
   34.31 +    struct stat  st;
   34.32 +    u64          tot_size;
   34.33 +    char         spage[BLOCK_SIZE], *dpage;
   34.34 +    ssize_t      count;           /* signed: read() returns -1 on error */
   34.35 +    u64          vblock = 0;
   34.36 +    /* Compare file argv[2] with VDI argv[1] block by block; exit non-zero on the first problem. */
   34.37 +    __init_blockstore();
   34.38 +    init_block_async();
   34.39 +    __init_vdi();
   34.40 +    
   34.41 +    if ( argc < 3 ) {
   34.42 +        printf("usage: %s <VDI id> <filename>\n", argv[0]);
   34.43 +        exit(-1);
   34.44 +    }
   34.45 +        
   34.46 +    id = (u64) atoll(argv[1]);
   34.47 +    
   34.48 +    vdi = vdi_get( id );
   34.49 +    
   34.50 +    if ( vdi == NULL ) {
   34.51 +        printf("Failed to retreive VDI %Ld!\n", id);
   34.52 +        exit(-1);
   34.53 +    }
   34.54 +    
   34.55 +    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
   34.56 +    
   34.57 +    if (fd < 0) {
   34.58 +        printf("Couldn't open %s!\n", argv[2]);
   34.59 +        exit(-1);
   34.60 +    }
   34.61 +    
   34.62 +    if ( fstat(fd, &st) != 0 ) {
   34.63 +        printf("Couldn't stat %s!\n", argv[2]);
   34.64 +        exit(-1);
   34.65 +    }
   34.66 +    
   34.67 +    tot_size = (u64) st.st_size;
   34.68 +    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
   34.69 +    
   34.70 +    printf("           ");
   34.71 +    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
   34.72 +        /* count is the number of valid bytes in spage for this block */
   34.73 +        dpage = vdi_read_s(vdi, vblock);
   34.74 +
   34.75 +        if (dpage == NULL) {
   34.76 +            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
   34.77 +            exit(-1);
   34.78 +        }
   34.79 +        /* compare only the bytes actually read, so a short final read ignores the stale tail of spage */
   34.80 +        if (memcmp(spage, dpage, (size_t)count) != 0) {
   34.81 +            printf("\n\nblocks don't match! (%Ld)\n", vblock);
   34.82 +            exit(-1);
   34.83 +        }
   34.84 +        
   34.85 +        freeblock(dpage);
   34.86 +        
   34.87 +        vblock++;
   34.88 +        if ((vblock % 1024) == 0) {
   34.89 +            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
   34.90 +            fflush(stdout);
   34.91 +        }
   34.92 +    }
   34.93 +    printf("\n");
   34.94 +    if ( count < 0 ) { printf("Read error on %s!\n", argv[2]); exit(-1); }
   34.95 +    printf("VDI %Ld looks good!\n", id);
   34.96 +    
   34.97 +    freeblock(vdi);
   34.98 +    close(fd);
   34.99 +    return (0);
  34.100 +}
    35.1 --- a/tools/blktap/radix.c	Sun Jul 03 22:32:52 2005 +0000
    35.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.3 @@ -1,631 +0,0 @@
    35.4 -/*
    35.5 - * Radix tree for mapping (up to) 63-bit virtual block IDs to
    35.6 - * 63-bit global block IDs
    35.7 - *
    35.8 - * Pointers within the tree set aside the least significant bit to indicate
    35.9 - * whther or not the target block is writable from this node.
   35.10 - *
   35.11 - * The block with ID 0 is assumed to be an empty block of all zeros
   35.12 - */
   35.13 -
   35.14 -#include <unistd.h>
   35.15 -#include <stdio.h>
   35.16 -#include <stdlib.h>
   35.17 -#include <assert.h>
   35.18 -#include <string.h>
   35.19 -#include <pthread.h>
   35.20 -#include "blockstore.h"
   35.21 -#include "radix.h"
   35.22 -
   35.23 -#define RADIX_TREE_MAP_SHIFT 9
   35.24 -#define RADIX_TREE_MAP_MASK 0x1ff
   35.25 -#define RADIX_TREE_MAP_ENTRIES 512
   35.26 -
   35.27 -/*
   35.28 -#define DEBUG
   35.29 -*/
   35.30 -
   35.31 -/* Experimental radix cache. */
   35.32 -
   35.33 -static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
   35.34 -static  int rcache_count = 0;
   35.35 -#define RCACHE_MAX 1024
   35.36 -
   35.37 -typedef struct rcache_st {
   35.38 -    radix_tree_node  *node;
   35.39 -    u64               id;
   35.40 -    struct rcache_st *hash_next;
   35.41 -    struct rcache_st *cache_next;
   35.42 -    struct rcache_st *cache_prev;
   35.43 -} rcache_t;
   35.44 -
   35.45 -static rcache_t *rcache_head = NULL;
   35.46 -static rcache_t *rcache_tail = NULL;
   35.47 -
   35.48 -#define RCHASH_SIZE 512ULL
   35.49 -rcache_t *rcache[RCHASH_SIZE];
   35.50 -#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
   35.51 -
   35.52 -void __rcache_init(void)
   35.53 -{
   35.54 -    int i;
   35.55 -
   35.56 -    for (i=0; i<RCHASH_SIZE; i++)
   35.57 -        rcache[i] = NULL;
   35.58 -}
   35.59 -    
   35.60 -
   35.61 -void rcache_write(u64 id, radix_tree_node *node)
   35.62 -{
   35.63 -    rcache_t *r, *tmp, **curs;
   35.64 -    
   35.65 -    pthread_mutex_lock(&rcache_mutex);
   35.66 -    
   35.67 -    /* Is it already in the cache? */
   35.68 -    r = rcache[RCACHE_HASH(id)];
   35.69 -    
   35.70 -    for (;;) {
   35.71 -        if (r == NULL) 
   35.72 -            break;
   35.73 -        if (r->id == id) 
   35.74 -        {
   35.75 -            memcpy(r->node, node, BLOCK_SIZE);
   35.76 -            
   35.77 -            /* bring to front. */
   35.78 -            if (r != rcache_head) {
   35.79 -                
   35.80 -                if (r == rcache_tail) {
   35.81 -                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
   35.82 -                    rcache_tail->cache_next = NULL;
   35.83 -                }
   35.84 -
   35.85 -                tmp = r->cache_next;
   35.86 -                if (r->cache_next != NULL) r->cache_next->cache_prev 
   35.87 -                                                     = r->cache_prev;
   35.88 -                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
   35.89 -
   35.90 -                r->cache_prev = NULL;
   35.91 -                r->cache_next = rcache_head;
   35.92 -                if (rcache_head != NULL) rcache_head->cache_prev = r;
   35.93 -                rcache_head = r;
   35.94 -            }
   35.95 -
   35.96 -//printf("Update (%Ld)\n", r->id);
   35.97 -            goto done;
   35.98 -        }
   35.99 -        r = r->hash_next;
  35.100 -    }
  35.101 -    
  35.102 -    if ( rcache_count == RCACHE_MAX ) 
  35.103 -    {
  35.104 -        /* Remove an entry */
  35.105 -        
  35.106 -        r = rcache_tail;
  35.107 -        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  35.108 -        rcache_tail->cache_next = NULL;
  35.109 -        freeblock(r->node);
  35.110 -        
  35.111 -        curs = &rcache[RCACHE_HASH(r->id)];
  35.112 -        while ((*curs) != r)
  35.113 -            curs = &(*curs)->hash_next;
  35.114 -        *curs = r->hash_next;
  35.115 -//printf("Evict (%Ld)\n", r->id);
  35.116 -        
  35.117 -    } else {
  35.118 -        
  35.119 -        r = (rcache_t *)malloc(sizeof(rcache_t));
  35.120 -        rcache_count++;
  35.121 -    }
  35.122 -    
  35.123 -    r->node = newblock();
  35.124 -    memcpy(r->node, node, BLOCK_SIZE);
  35.125 -    r->id = id;
  35.126 -    
  35.127 -    r->hash_next = rcache[RCACHE_HASH(id)];
  35.128 -    rcache[RCACHE_HASH(id)] = r;
  35.129 -    
  35.130 -    r->cache_prev = NULL;
  35.131 -    r->cache_next = rcache_head;
  35.132 -    if (rcache_head != NULL) rcache_head->cache_prev = r;
  35.133 -    rcache_head = r;
  35.134 -    if (rcache_tail == NULL) rcache_tail = r;
  35.135 -    
  35.136 -//printf("Added (%Ld, %p)\n", id, r->node);
  35.137 -done:
  35.138 -    pthread_mutex_unlock(&rcache_mutex);
  35.139 -}
  35.140 -
  35.141 -radix_tree_node *rcache_read(u64 id)
  35.142 -{
  35.143 -    rcache_t *r, *tmp;
  35.144 -    radix_tree_node *node = NULL;
  35.145 -    
  35.146 -    pthread_mutex_lock(&rcache_mutex);
  35.147 -
  35.148 -    r = rcache[RCACHE_HASH(id)];
  35.149 -    
  35.150 -    for (;;) {
  35.151 -        if (r == NULL) {
  35.152 -//printf("Miss (%Ld)\n", id);
  35.153 -            goto done;
  35.154 -        }
  35.155 -        if (r->id == id) break;
  35.156 -        r = r->hash_next;
  35.157 -    }
  35.158 -   
  35.159 -    /* bring to front. */
  35.160 -    if (r != rcache_head) 
  35.161 -    {
  35.162 -        if (r == rcache_tail) {
  35.163 -            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  35.164 -            rcache_tail->cache_next = NULL;
  35.165 -        }
  35.166 -        tmp = r->cache_next;
  35.167 -        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
  35.168 -        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
  35.169 -
  35.170 -        r->cache_prev = NULL;
  35.171 -        r->cache_next = rcache_head;
  35.172 -        if (rcache_head != NULL) rcache_head->cache_prev = r;
  35.173 -        rcache_head = r;
  35.174 -    }
  35.175 -    
  35.176 -    node = newblock();
  35.177 -    memcpy(node, r->node, BLOCK_SIZE);
  35.178 -    
  35.179 -//printf("Hit (%Ld, %p)\n", id, r->node);
  35.180 -done:
  35.181 -    pthread_mutex_unlock(&rcache_mutex);
  35.182 -    
  35.183 -    return(node);
  35.184 -}
  35.185 -
  35.186 -
  35.187 -void *rc_readblock(u64 id)
  35.188 -{
  35.189 -    void *ret;
  35.190 -    
  35.191 -    ret = (void *)rcache_read(id);
  35.192 -    
  35.193 -    if (ret != NULL) return ret;
  35.194 -    
  35.195 -    ret = readblock(id);
  35.196 -    
  35.197 -    if (ret != NULL)
  35.198 -        rcache_write(id, ret);
  35.199 -    
  35.200 -    return(ret);
  35.201 -}
  35.202 -
  35.203 -u64 rc_allocblock(void *block)
  35.204 -{
  35.205 -    u64 ret;
  35.206 -    
  35.207 -    ret = allocblock(block);
  35.208 -    
  35.209 -    if (ret != ZERO)
  35.210 -        rcache_write(ret, block);
  35.211 -    
  35.212 -    return(ret);
  35.213 -}
  35.214 -
  35.215 -int rc_writeblock(u64 id, void *block)
  35.216 -{
  35.217 -    int ret;
  35.218 -    
  35.219 -    ret = writeblock(id, block);
  35.220 -    rcache_write(id, block);
  35.221 -    
  35.222 -    return(ret);
  35.223 -}
  35.224 -
  35.225 -
  35.226 -/*
  35.227 - * block device interface and other helper functions
  35.228 - * with these functions, block id is just a 63-bit number, with
  35.229 - * no special consideration for the LSB
  35.230 - */
  35.231 -radix_tree_node cloneblock(radix_tree_node block);
  35.232 -
  35.233 -/*
  35.234 - * main api
  35.235 - * with these functions, the LSB of root always indicates
  35.236 - * whether or not the block is writable, including the return
  35.237 - * values of update and snapshot
  35.238 - */
  35.239 -u64 lookup(int height, u64 root, u64 key);
  35.240 -u64 update(int height, u64 root, u64 key, u64 val);
  35.241 -u64 snapshot(u64 root);
  35.242 -
  35.243 -/**
  35.244 - * cloneblock: clone an existing block in memory
  35.245 - *   @block: the old block
  35.246 - *
  35.247 - *   @return: new block, with LSB cleared for every entry
  35.248 - */
  35.249 -radix_tree_node cloneblock(radix_tree_node block) {
  35.250 -    radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE);
  35.251 -    int i;
  35.252 -    if (node == NULL) {
  35.253 -        perror("cloneblock malloc");
  35.254 -        return NULL;
  35.255 -    }
  35.256 -    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
  35.257 -        node[i] = block[i] & ONEMASK;
  35.258 -    return node;
  35.259 -}
  35.260 -
  35.261 -/**
  35.262 - * lookup: find a value given a key
  35.263 - *   @height: height in bits of the radix tree
  35.264 - *   @root: root node id, with set LSB indicating writable node
  35.265 - *   @key: key to lookup
  35.266 - *
  35.267 - *   @return: value on success, zero on error
  35.268 - */
  35.269 -
  35.270 -u64 lookup(int height, u64 root, u64 key) {
  35.271 -    radix_tree_node node;
  35.272 -    u64 mask = ONE;
  35.273 -    
  35.274 -    assert(key >> height == 0);
  35.275 -
  35.276 -    /* the root block may be smaller to ensure all leaves are full */
  35.277 -    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  35.278 -
  35.279 -    /* now carve off equal sized chunks at each step */
  35.280 -    for (;;) {
  35.281 -        u64 oldroot;
  35.282 -
  35.283 -#ifdef DEBUG
  35.284 -        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  35.285 -                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
  35.286 -                (iswritable(root) ? "" : " (readonly)"));
  35.287 -#endif
  35.288 -        
  35.289 -        if (getid(root) == ZERO)
  35.290 -            return ZERO;
  35.291 -
  35.292 -        oldroot = root;
  35.293 -        node = (radix_tree_node) rc_readblock(getid(root));
  35.294 -        if (node == NULL)
  35.295 -            return ZERO;
  35.296 -
  35.297 -        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
  35.298 -        mask &= root;
  35.299 -        freeblock(node);
  35.300 -
  35.301 -        if (height == 0)
  35.302 -            return ( root & ONEMASK ) | mask;
  35.303 -
  35.304 -        height -= RADIX_TREE_MAP_SHIFT;
  35.305 -    }
  35.306 -
  35.307 -    return ZERO;
  35.308 -}
  35.309 -
  35.310 -/*
  35.311 - * update: set a radix tree entry, doing copy-on-write as necessary
  35.312 - *   @height: height in bits of the radix tree
  35.313 - *   @root: root node id, with set LSB indicating writable node
  35.314 - *   @key: key to set
  35.315 - *   @val: value to set, s.t. radix(key)=val
  35.316 - *
  35.317 - *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
  35.318 - */
  35.319 -
  35.320 -u64 update(int height, u64 root, u64 key, u64 val) {
  35.321 -    int offset;
  35.322 -    u64 child;
  35.323 -    radix_tree_node node;
  35.324 -    
  35.325 -    /* base case--return val */
  35.326 -    if (height == 0)
  35.327 -        return val;
  35.328 -
  35.329 -    /* the root block may be smaller to ensure all leaves are full */
  35.330 -    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  35.331 -    offset = (key >> height) & RADIX_TREE_MAP_MASK;
  35.332 -
  35.333 -#ifdef DEBUG
  35.334 -    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  35.335 -            offset, (iswritable(root)?"":" (clone)"));
  35.336 -#endif
  35.337 -
  35.338 -    /* load a block, or create a new one */
  35.339 -    if (root == ZERO) {
  35.340 -        node = (radix_tree_node) newblock();
  35.341 -    } else {
  35.342 -        node = (radix_tree_node) rc_readblock(getid(root));
  35.343 -
  35.344 -        if (!iswritable(root)) {
  35.345 -            /* need to clone this node */
  35.346 -            radix_tree_node oldnode = node;
  35.347 -            node = cloneblock(node);
  35.348 -            freeblock(oldnode);
  35.349 -            root = ZERO;
  35.350 -        }
  35.351 -    }
  35.352 -
  35.353 -    if (node == NULL) {
  35.354 -#ifdef DEBUG
  35.355 -        printf("update: node is null!\n");
  35.356 -#endif
  35.357 -        return ZERO;
  35.358 -    }
  35.359 -
  35.360 -    child = update(height, node[offset], key, val);
  35.361 -
  35.362 -    if (child == ZERO) {
  35.363 -        freeblock(node);
  35.364 -        return ZERO;
  35.365 -    } else if (child == node[offset]) {
  35.366 -        /* no change, so we already owned the child */
  35.367 -        assert(iswritable(root));
  35.368 -
  35.369 -        freeblock(node);
  35.370 -        return root;
  35.371 -    }
  35.372 -
  35.373 -    node[offset] = child;
  35.374 -
  35.375 -    /* new/cloned blocks need to be saved */
  35.376 -    if (root == ZERO) {
  35.377 -        /* mark this as an owned block */
  35.378 -        root = rc_allocblock(node);
  35.379 -        if (root)
  35.380 -            root = writable(root);
  35.381 -    } else if (rc_writeblock(getid(root), node) < 0) {
  35.382 -        freeblock(node);
  35.383 -        return ZERO;
  35.384 -    }
  35.385 -
  35.386 -    freeblock(node);
  35.387 -    return root;
  35.388 -}
  35.389 -
  35.390 -/**
  35.391 - * snapshot: create a snapshot
  35.392 - *   @root: old root node
  35.393 - *
  35.394 - *   @return: new root node, 0 on error
  35.395 - */
  35.396 -u64 snapshot(u64 root) {
  35.397 -    radix_tree_node node, newnode;
  35.398 -
  35.399 -    if ((node = rc_readblock(getid(root))) == NULL)
  35.400 -        return ZERO;
  35.401 -
  35.402 -    newnode = cloneblock(node);
  35.403 -    freeblock(node);
  35.404 -    if (newnode == NULL)
  35.405 -        return ZERO;
  35.406 -    
  35.407 -    root = rc_allocblock(newnode);
  35.408 -    freeblock(newnode);
  35.409 -
  35.410 -    if (root == ZERO)
  35.411 -        return ZERO;
  35.412 -    else
  35.413 -        return writable(root);
  35.414 -}
  35.415 -
  35.416 -/**
  35.417 - * collapse: collapse a parent onto a child.
  35.418 - * 
  35.419 - * NOTE: This assumes that parent and child really are, and further that
  35.420 - * there are no other children forked from this parent. (children of the
  35.421 - * child are okay...)
  35.422 - */
  35.423 -
  35.424 -int collapse(int height, u64 proot, u64 croot)
  35.425 -{
  35.426 -    int i, numlinks, ret, total = 0;
  35.427 -    radix_tree_node pnode, cnode;
  35.428 -    
  35.429 -    if (height == 0) {
  35.430 -        height = -1; /* terminate recursion */
  35.431 -    } else {        
  35.432 -        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  35.433 -    }
  35.434 -    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
  35.435 -
  35.436 -    /* Terminal cases: */
  35.437 -
  35.438 -    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
  35.439 -        return -1;
  35.440 -    
  35.441 -    /* get roots */
  35.442 -    if ((pnode = readblock(getid(proot))) == NULL)
  35.443 -        return -1;
  35.444 -    
  35.445 -    if ((cnode = readblock(getid(croot))) == NULL)
  35.446 -    {
  35.447 -        freeblock(pnode);
  35.448 -        return -1;
  35.449 -    }
  35.450 -    
  35.451 -    /* For each writable link in proot */
  35.452 -    for (i=0; i<numlinks; i++)
  35.453 -    {
  35.454 -        if ( pnode[i] == cnode[i] ) continue;
  35.455 -        
  35.456 -        /* collapse (next level) */
  35.457 -        /* if height != 0 and writable... */
  35.458 -        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
  35.459 -        {
  35.460 -            //printf("   %Ld is writable (i=%d).\n", getid(pnode[i]), i);
  35.461 -            ret = collapse(height, pnode[i], cnode[i]);
  35.462 -            if (ret == -1) 
  35.463 -            {
  35.464 -                total = -1;
  35.465 -            } else {
  35.466 -                total += ret;
  35.467 -            }
  35.468 -        }
  35.469 -    
  35.470 -        
  35.471 -    }
  35.472 -    
  35.473 -    /* if plink is writable, AND clink is writable -> free plink block */
  35.474 -    if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) 
  35.475 -    {
  35.476 -        releaseblock(getid(proot));
  35.477 -        if (ret >=0) total++;
  35.478 -        //printf("   Delete %Ld\n", getid(proot));
  35.479 -    }
  35.480 -//printf("done : %Ld\n", getid(proot));
  35.481 -    return total;
  35.482 -
  35.483 -}
  35.484 -
  35.485 -
  35.486 -void print_root(u64 root, int height, FILE *dot_f)
  35.487 -{
  35.488 -    FILE *f;
  35.489 -    int i;
  35.490 -    radix_tree_node node;
  35.491 -    char *style[2] = { "", "style=bold,color=blue," };
  35.492 -    
  35.493 -    if (dot_f == NULL) {
  35.494 -        f = fopen("radix.dot", "w");
  35.495 -        if (f == NULL) {
  35.496 -            perror("print_root: open");
  35.497 -            return;
  35.498 -        }
  35.499 -
  35.500 -        /* write graph preamble */
  35.501 -        fprintf(f, "digraph G {\n");
  35.502 -
  35.503 -        /* add a node for this root. */
  35.504 -        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  35.505 -                getid(root), style[iswritable(root)], getid(root));
  35.506 -    }
  35.507 -    
  35.508 -    printf("print_root(%Ld)\n", getid(root));
  35.509 -    
  35.510 -    /* base case */
  35.511 -    if (height == 0) {
  35.512 -        /* add a node and edge for each child root */
  35.513 -        node = (radix_tree_node) readblock(getid(root));
  35.514 -        if (node == NULL)
  35.515 -            return;
  35.516 -        
  35.517 -        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
  35.518 -            if (node[i] != ZERO) {
  35.519 -                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  35.520 -                        getid(node[i]), style[iswritable(node[i])], 
  35.521 -                        getid(node[i]));
  35.522 -                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
  35.523 -                        getid(node[i]), i);
  35.524 -            }
  35.525 -        }
  35.526 -        freeblock(node);
  35.527 -        return;
  35.528 -    }
  35.529 -
  35.530 -    /* the root block may be smaller to ensure all leaves are full */
  35.531 -    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  35.532 -
  35.533 -    if (geti