ia64/xen-unstable

changeset 5656:f8acd354e129

Manual merge.
author kaf24@firebug.cl.cam.ac.uk
date Sun Jul 03 22:36:48 2005 +0000 (2005-07-03)
parents 80d5dd14711e 09067ce92303
children cb6b221bef55 f6e7c967212e
files linux-2.6.11-xen-sparse/arch/xen/Makefile tools/blktap/Makefile tools/blktap/parallax/README tools/blktap/parallax/block-async.c tools/blktap/parallax/block-async.h tools/blktap/parallax/blockstore.c tools/blktap/parallax/blockstore.h tools/blktap/parallax/blockstored.c tools/blktap/parallax/bstest.c tools/blktap/parallax/parallax.c tools/blktap/parallax/radix.c tools/blktap/parallax/radix.h tools/blktap/parallax/requests-async.c tools/blktap/parallax/requests-async.h tools/blktap/parallax/snaplog.c tools/blktap/parallax/snaplog.h tools/blktap/parallax/vdi.c tools/blktap/parallax/vdi.h tools/blktap/parallax/vdi_create.c tools/blktap/parallax/vdi_fill.c tools/blktap/parallax/vdi_list.c tools/blktap/parallax/vdi_snap.c tools/blktap/parallax/vdi_snap_delete.c tools/blktap/parallax/vdi_snap_list.c tools/blktap/parallax/vdi_tree.c tools/blktap/parallax/vdi_unittest.c tools/blktap/parallax/vdi_validate.c tools/ioemu/target-i386-dm/Makefile xen/common/kernel.c xen/include/public/version.h
line diff
     2.1 --- a/tools/blktap/Makefile	Sun Jul 03 22:32:52 2005 +0000
     2.2 +++ b/tools/blktap/Makefile	Sun Jul 03 22:36:48 2005 +0000
     2.3 @@ -2,64 +2,46 @@ MAJOR    = 2.0
     2.4  MINOR    = 0
     2.5  SONAME   = libblktap.so.$(MAJOR)
     2.6  
     2.7 -CC       = gcc
     2.8 -
     2.9  XEN_ROOT = ../..
    2.10  include $(XEN_ROOT)/tools/Rules.mk
    2.11  
    2.12 -BLKTAP_INSTALL_DIR	= /usr/sbin
    2.13 +SUBDIRS :=
    2.14 +SUBDIRS += parallax
    2.15 +
    2.16 +BLKTAP_INSTALL_DIR = /usr/sbin
    2.17  
    2.18 -INSTALL         = install
    2.19 -INSTALL_PROG    = $(INSTALL) -m0755
    2.20 -INSTALL_DIR     = $(INSTALL) -d -m0755
    2.21 +INSTALL            = install
    2.22 +INSTALL_PROG       = $(INSTALL) -m0755
    2.23 +INSTALL_DIR        = $(INSTALL) -d -m0755
    2.24  
    2.25 -INCLUDES += 
    2.26 +INCLUDES += -I. -I $(XEN_LIBXC)
    2.27  
    2.28  LIBS     := -lpthread -lz
    2.29  
    2.30  SRCS     :=
    2.31  SRCS     += blktaplib.c
    2.32  
    2.33 -PLX_SRCS := 
    2.34 -PLX_SRCS += vdi.c 
    2.35 -PLX_SRCS += radix.c 
    2.36 -PLX_SRCS += snaplog.c
    2.37 -PLX_SRCS += blockstore.c 
    2.38 -PLX_SRCS += block-async.c
    2.39 -PLX_SRCS += requests-async.c
    2.40 -VDI_SRCS := $(PLX_SRCS)
    2.41 -PLX_SRCS += parallax.c
    2.42 -
    2.43 -VDI_TOOLS :=
    2.44 -VDI_TOOLS += vdi_create
    2.45 -VDI_TOOLS += vdi_list
    2.46 -VDI_TOOLS += vdi_snap
    2.47 -VDI_TOOLS += vdi_snap_list
    2.48 -VDI_TOOLS += vdi_snap_delete
    2.49 -VDI_TOOLS += vdi_fill
    2.50 -VDI_TOOLS += vdi_tree
    2.51 -VDI_TOOLS += vdi_validate
    2.52 -
    2.53  CFLAGS   += -Wall
    2.54  CFLAGS   += -Werror
    2.55  CFLAGS   += -Wno-unused
    2.56  #CFLAGS   += -O3
    2.57  CFLAGS   += -g3
    2.58  CFLAGS   += -fno-strict-aliasing
    2.59 -CFLAGS   += -I $(XEN_LIBXC)
    2.60 -CFLAGS   += $(INCLUDES) -I.
    2.61  CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
    2.62  # Get gcc to generate the dependencies for us.
    2.63  CFLAGS   += -Wp,-MD,.$(@F).d
    2.64 +CFLAGS   += $(INCLUDES) 
    2.65  DEPS     = .*.d
    2.66  
    2.67  OBJS     = $(patsubst %.c,%.o,$(SRCS))
    2.68 -IBINS    = blkdump parallax $(VDI_TOOLS)
    2.69 +IBINS    = blkdump
    2.70  
    2.71  LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
    2.72  
    2.73 -all: mk-symlinks blkdump $(VDI_TOOLS) parallax blockstored
    2.74 -	$(MAKE) $(LIB)
    2.75 +all: mk-symlinks libblktap.so blkdump
    2.76 +	@set -e; for subdir in $(SUBDIRS); do \
    2.77 +		$(MAKE) -C $$subdir $@;       \
    2.78 +	done
    2.79  
    2.80  LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
    2.81  mk-symlinks:
    2.82 @@ -77,10 +59,16 @@ install: all
    2.83  	$(INSTALL_DIR) -p $(DESTDIR)/usr/include
    2.84  	$(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
    2.85  	$(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
    2.86 -	$(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
    2.87 +	$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
    2.88 +	@set -e; for subdir in $(SUBDIRS); do \
    2.89 +		$(MAKE) -C $$subdir $@;       \
    2.90 +	done
    2.91  
    2.92  clean:
    2.93 -	rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump $(VDI_TOOLS) parallax vdi_unittest
    2.94 +	rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump
    2.95 +	@set -e; for subdir in $(SUBDIRS); do \
    2.96 +		$(MAKE) -C $$subdir $@;       \
    2.97 +	done
    2.98  
    2.99  rpm: all
   2.100  	rm -rf staging
   2.101 @@ -91,52 +79,17 @@ rpm: all
   2.102  	mv staging/i386/*.rpm .
   2.103  	rm -rf staging
   2.104  
   2.105 -libblktap.so:
   2.106 +libblktap.so: $(OBJS)
   2.107 +	$(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o      \
   2.108 +	      libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
   2.109 +	ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
   2.110  	ln -sf libblktap.so.$(MAJOR) $@
   2.111 -libblktap.so.$(MAJOR):
   2.112 -	ln -sf libblktap.so.$(MAJOR).$(MINOR) $@
   2.113 -libblktap.so.$(MAJOR).$(MINOR): $(OBJS)
   2.114 -	$(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ $(LIBS)
   2.115  
   2.116 -blkdump: $(LIB)
   2.117 +blkdump: libblktap.so
   2.118  	$(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
   2.119  
   2.120 -parallax: $(LIB) $(PLX_SRCS)
   2.121 -	$(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L. -lblktap $(LIBS) $(PLX_SRCS) 
   2.122 -
   2.123 -vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
   2.124 -	$(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(LIBS) $(VDI_SRCS)
   2.125 -
   2.126 -vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
   2.127 -	$(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(LIBS) $(VDI_SRCS)
   2.128 -
   2.129 -vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
   2.130 -	$(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(LIBS) $(VDI_SRCS)
   2.131 -
   2.132 -vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
   2.133 -	$(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(LIBS) $(VDI_SRCS)
   2.134 -
   2.135 -vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
   2.136 -	$(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(LIBS) $(VDI_SRCS)
   2.137 +.PHONY: TAGS clean install mk-symlinks rpm
   2.138  
   2.139 -vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
   2.140 -	$(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(LIBS) $(VDI_SRCS)
   2.141 -
   2.142 -vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
   2.143 -	$(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(LIBS) $(VDI_SRCS)
   2.144 -
   2.145 -vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
   2.146 -	$(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(LIBS) $(VDI_SRCS)
   2.147 -
   2.148 -vdi_unittest: $(LIB) vdi_unittest.c $(VDI_SRCS)
   2.149 -	$(CC) $(CFLAGS) -g3 -o vdi_unittest vdi_unittest.c $(LIBS) $(VDI_SRCS)
   2.150 -
   2.151 -blockstored: blockstored.c
   2.152 -	$(CC) $(CFLAGS) -g3 -o blockstored $(LIBS) blockstored.c
   2.153 -bstest: bstest.c blockstore.c
   2.154 -	$(CC) $(CFLAGS) -g3 -o bstest bstest.c $(LIBS) blockstore.c
   2.155 -
   2.156 -.PHONY: TAGS clean install mk-symlinks rpm
   2.157  TAGS:
   2.158  	etags -t $(SRCS) *.h
   2.159  
     3.1 --- a/tools/blktap/README-PARALLAX	Sun Jul 03 22:32:52 2005 +0000
     3.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.3 @@ -1,177 +0,0 @@
     3.4 -Parallax Quick Overview
     3.5 -March 3, 2005
     3.6 -
     3.7 -This is intended to provide a quick set of instructions to let you
     3.8 -guys play with the current parallax source.  In its current form, the
     3.9 -code will let you run an arbitrary number of VMs off of a single disk
    3.10 -image, doing copy-on-write as they make updates.  Each domain is
    3.11 -assigned a virtual disk image (VDI), which may be based on a snapshot
    3.12 -of an existing image.  All of the VDI and snapshot management should
    3.13 -currently work.
    3.14 -
    3.15 -The current implementation uses a single file as a blockstore for
    3.16 -_everything_; this will soon be replaced by the fancier backend code
    3.17 -and the local cache.  As it stands, Parallax will create
    3.18 -"blockstore.dat" in the directory that you run it from, and use
    3.19 -largefile support to make this grow to unfathomable girth.  So, you
    3.20 -probably want to run the daemon off of a local disk, with a lot of
    3.21 -free space.
    3.22 -
    3.23 -Here's how to get going:
    3.24 -
    3.25 -0. Setup:
    3.26 ----------
    3.27 -
    3.28 -Pick a local directory on a disk with lots of room.  You should be
    3.29 -running from a privileged domain (e.g. dom0) with the blocktap
    3.30 -configured in and block backend NOT.
    3.31 -
    3.32 -For convenience (for the moment) copy all of the vdi tools (vdi_*) and
    3.33 -the parallax daemon from tools/blktap into this directory.
    3.34 -
    3.35 -1. Populate the blockstore:
    3.36 ----------------------------
    3.37 -
    3.38 -First you need to put at least one image into the blockstore.  You
    3.39 -will need a disk image, either as a file or local partition.  My
    3.40 -general approach has been to
    3.41 -
    3.42 -(a) make a really big sparse file with 
    3.43 -
    3.44 -        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
    3.45 -
    3.46 -(b) put a filesystem into it
    3.47 -
    3.48 -        mkfs.ext3 ./image
    3.49 -
    3.50 -(c) mount it using loopback
    3.51 -
    3.52 -        mkdir ./mnt
    3.53 -        mount -o loop ./image ./mnt
    3.54 -
    3.55 -(d) cd into it and untar one of the image files from srg-roots.
    3.56 -
    3.57 -        cd mnt
    3.58 -        tar ...
    3.59 -
    3.60 -NOTE: Beware if your system is FC3.  mkfs is not compatible with old
    3.61 -versions of fedora, and so you don't have much choice but to install
    3.62 -further fc3 images if you have used the fc3 version of mkfs.
    3.63 -
    3.64 -(e) unmount the image
    3.65 -
    3.66 -        cd ..
    3.67 -        umount mnt
    3.68 -
    3.69 -(f) now, create a new VDI to hold the image 
    3.70 -
    3.71 -        ./vdi_create "My new FC3 VDI"
    3.72 -
    3.73 -(g) get the id of the new VDI.
    3.74 -
    3.75 -        ./vdi_list
    3.76 -
    3.77 -        |      0                     My new FC3 VDI
    3.78 -
    3.79 -(0 is the VDI id... create a few more if you want.)
    3.80 -
    3.81 -(h) hoover your image into the new VDI.
    3.82 -
    3.83 -        ./vdi_fill 0 ./image
    3.84 -
    3.85 -This will pull the entire image into the blockstore and set up a
    3.86 -mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
    3.87 -should also work, but vdi_fill has NO notion of sparseness yet, so you
    3.88 -are going to pump a block into the store for each block you read.
    3.89 -
    3.90 -vdi_fill will count up until it is done, and you should be ready to
    3.91 -go.  If you want to be anal, you can use vdi_validate to test the VDI
    3.92 -against the original image.
    3.93 -
    3.94 -2. Create some extra VDIs
    3.95 --------------------------
    3.96 -
    3.97 -VDIs are actually a list of snapshots, and each snapshot is a full
    3.98 -image of mappings.  So, to preserve an immutable copy of a current
    3.99 -VDI, do this:
   3.100 -
   3.101 -(a) Snapshot your new VDI.
   3.102 -
   3.103 -        ./vdi_snap 0
   3.104 -
   3.105 -Snapshotting writes the current radix root to the VDI's snapshot log,
   3.106 -and assigns it a new writable root.
   3.107 -
   3.108 -(b) look at the VDI's snapshot log.
   3.109 -
   3.110 -        ./vdi_snap_list 0
   3.111 -
   3.112 -        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
   3.113 -
   3.114 -The first two columns constitute a snapshot id and represent the
   3.115 -(block, offset) of the snapshot record.  The Date tells you when the
   3.116 -snapshot was made, and 31 is the radix root node of the snapshot.
   3.117 -
   3.118 -(c) Create a new VDI, based on that snapshot, and look at the list.
   3.119 -
   3.120 -        ./vdi_create "FC3 - Copy 1" 16 0
   3.121 -        ./vdi_list
   3.122 -
   3.123 -        |      0                     My new FC3 VDI
   3.124 -        |      1                       FC3 - Copy 1
   3.125 -
   3.126 -NOTE: If you have Graphviz installed on your system, you can use
   3.127 -vdi_tree to generate a postscript of your current set of VDIs and
   3.128 -snapshots.
   3.129 -
   3.130 -
   3.131 -Create as many VDIs as you need for the VMs that you want to run.
   3.132 -
   3.133 -3. Boot some VMs:
   3.134 ------------------
   3.135 -
   3.136 -Parallax currently uses a hack in xend to pass the VDI id; you need to
   3.137 -modify the disk line of the VM config that is going to mount it.
   3.138 -
   3.139 -(a) set up your vm config, by using the following disk line:
   3.140 -
   3.141 -        disk = ['parallax:1,sda1,w,0' ]
   3.142 -
   3.143 -This example uses VDI 1 (from vdi_list above), presents it as sda1
   3.144 -(writable), and uses dom 0 as the backend.  If you were running the
   3.145 -daemon (and tap driver) in some domain other than 0, you would change
   3.146 -this last parameter.
   3.147 -
   3.148 -NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
   3.149 -
   3.150 -(b) Run parallax in the backend domain.
   3.151 -
   3.152 -        ./parallax
   3.153 -
   3.154 -(c) create your new domain.
   3.155 -
   3.156 -        xm create ...
   3.157 -
   3.158 ----
   3.159 -
   3.160 -That's pretty much all there is to it at the moment.  Hope this is
   3.161 -clear enough to get you going.  Now, a few serious caveats that will
   3.162 -be sorted out in the almost immediate future:
   3.163 -
   3.164 -WARNINGS:
   3.165 ----------
   3.166 -
   3.167 -1. There is NO locking in the VDI tools at the moment, so I'd avoid
   3.168 -running them in parallel, or more importantly, running them while the
   3.169 -daemon is running.
   3.170 -
   3.171 -2. I doubt that xend will be very happy about restarting if you have
   3.172 -parallax-using domains.  So if it dies while there are active parallax
   3.173 -doms, you may need to reboot.
   3.174 -
   3.175 -3. I've turned off write-in-place.  So at the moment, EVERY block
   3.176 -write is a log append on the blockstore.  I've been having some probs
   3.177 -with the radix tree's marking of writable blocks after snapshots and
   3.178 -will sort this out very soon.
   3.179 -
   3.180 -
     4.1 --- a/tools/blktap/block-async.c	Sun Jul 03 22:32:52 2005 +0000
     4.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.3 @@ -1,393 +0,0 @@
     4.4 -/* block-async.c
     4.5 - * 
     4.6 - * Asynchronous block wrappers for parallax.
     4.7 - */
     4.8 - 
     4.9 - 
    4.10 -#include <stdio.h>
    4.11 -#include <stdlib.h>
    4.12 -#include <string.h>
    4.13 -#include <pthread.h>
    4.14 -#include "block-async.h"
    4.15 -#include "blockstore.h"
    4.16 -#include "vdi.h"
    4.17 -
    4.18 -
    4.19 -#if 0
    4.20 -#define DPRINTF(_f, _a...) printf ( _f , ## _a )
    4.21 -#else
    4.22 -#define DPRINTF(_f, _a...) ((void)0)
    4.23 -#endif
    4.24 -
    4.25 -/* We have a queue of outstanding I/O requests implemented as a 
    4.26 - * circular producer-consumer ring with free-running buffers.
    4.27 - * To allow reordering, this ring indirects to indexes in a
    4.28 - * ring of io_structs.
    4.29 - * 
    4.30 - * the block_* calls may either add an entry to this ring and return, 
    4.31 - * or satisfy the request immediately and call the callback directly.
    4.32 - * None of the io calls in parallax should be nested enough to worry 
    4.33 - * about stack problems with this approach.
    4.34 - */
    4.35 -
    4.36 -struct read_args {
    4.37 -    u64 addr;
    4.38 -};
    4.39 -
    4.40 -struct write_args {
    4.41 -    u64   addr;
    4.42 -    char *block;
    4.43 -};
    4.44 -
    4.45 -struct alloc_args {
    4.46 -    char *block;
    4.47 -};
    4.48 - 
    4.49 -struct pending_io_req {
    4.50 -    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
    4.51 -    union {
    4.52 -        struct read_args  r;
    4.53 -        struct write_args w;
    4.54 -        struct alloc_args a;
    4.55 -    } u;
    4.56 -    io_cb_t cb;
    4.57 -    void *param;
    4.58 -};
    4.59 -
    4.60 -void radix_lock_init(struct radix_lock *r)
    4.61 -{
    4.62 -    int i;
    4.63 -    
    4.64 -    pthread_mutex_init(&r->lock, NULL);
    4.65 -    for (i=0; i < 1024; i++) {
    4.66 -        r->lines[i] = 0;
    4.67 -        r->waiters[i] = NULL;
    4.68 -        r->state[i] = ANY;
    4.69 -    }
    4.70 -}
    4.71 -
    4.72 -/* maximum outstanding I/O requests issued asynchronously */
    4.73 -/* must be a power of 2.*/
    4.74 -#define MAX_PENDING_IO 1024
    4.75 -
    4.76 -/* how many threads to concurrently issue I/O to the disk. */
    4.77 -#define IO_POOL_SIZE   10
    4.78 -
    4.79 -static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
    4.80 -static int pending_io_list[MAX_PENDING_IO];
    4.81 -static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
    4.82 -#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
    4.83 -#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
    4.84 -#define PENDING_IO_ENT(_x) \
    4.85 -	(&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
    4.86 -#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
    4.87 -#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
    4.88 -static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
    4.89 -static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
    4.90 -
    4.91 -static void init_pending_io(void)
    4.92 -{
    4.93 -    int i;
    4.94 -	
    4.95 -    for (i=0; i<MAX_PENDING_IO; i++)
    4.96 -        pending_io_list[i] = i;
    4.97 -		
    4.98 -} 
    4.99 -
   4.100 -void block_read(u64 addr, io_cb_t cb, void *param)
   4.101 -{
   4.102 -    struct pending_io_req *req;
   4.103 -    
   4.104 -    pthread_mutex_lock(&pending_io_lock);
   4.105 -    assert(CAN_PRODUCE_PENDING_IO);
   4.106 -    
   4.107 -    req = PENDING_IO_ENT(io_prod++);
   4.108 -    DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req);
   4.109 -    req->op = IO_READ;
   4.110 -    req->u.r.addr = addr;
   4.111 -    req->cb = cb;
   4.112 -    req->param = param;
   4.113 -    
   4.114 -    pthread_cond_signal(&pending_io_cond);
   4.115 -    pthread_mutex_unlock(&pending_io_lock);	
   4.116 -}
   4.117 -
   4.118 -
   4.119 -void block_write(u64 addr, char *block, io_cb_t cb, void *param)
   4.120 -{
   4.121 -    struct pending_io_req *req;
   4.122 -    
   4.123 -    pthread_mutex_lock(&pending_io_lock);
   4.124 -    assert(CAN_PRODUCE_PENDING_IO);
   4.125 -    
   4.126 -    req = PENDING_IO_ENT(io_prod++);
   4.127 -    DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req);
   4.128 -    req->op = IO_WRITE;
   4.129 -    req->u.w.addr  = addr;
   4.130 -    req->u.w.block = block;
   4.131 -    req->cb = cb;
   4.132 -    req->param = param;
   4.133 -    
   4.134 -    pthread_cond_signal(&pending_io_cond);
   4.135 -    pthread_mutex_unlock(&pending_io_lock);	
   4.136 -}
   4.137 -
   4.138 -
   4.139 -void block_alloc(char *block, io_cb_t cb, void *param)
   4.140 -{
   4.141 -    struct pending_io_req *req;
   4.142 -	
   4.143 -    pthread_mutex_lock(&pending_io_lock);
   4.144 -    assert(CAN_PRODUCE_PENDING_IO);
   4.145 -    
   4.146 -    req = PENDING_IO_ENT(io_prod++);
   4.147 -    req->op = IO_ALLOC;
   4.148 -    req->u.a.block = block;
   4.149 -    req->cb = cb;
   4.150 -    req->param = param;
   4.151 -    
   4.152 -    pthread_cond_signal(&pending_io_cond);
   4.153 -    pthread_mutex_unlock(&pending_io_lock);	
   4.154 -}
   4.155 -
   4.156 -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   4.157 -{
   4.158 -    struct io_ret ret;
   4.159 -    pthread_mutex_lock(&r->lock);
   4.160 -    
   4.161 -    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
   4.162 -        r->lines[row]++;
   4.163 -        r->state[row] = READ;
   4.164 -        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
   4.165 -        pthread_mutex_unlock(&r->lock);
   4.166 -        ret.type = IO_INT_T;
   4.167 -        ret.u.i = 0;
   4.168 -        cb(ret, param);
   4.169 -    } else {
   4.170 -        struct radix_wait **rwc;
   4.171 -        struct radix_wait *rw = 
   4.172 -            (struct radix_wait *) malloc (sizeof(struct radix_wait));
   4.173 -        DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
   4.174 -        rw->type  = RLOCK;
   4.175 -        rw->param = param;
   4.176 -        rw->cb    = cb;
   4.177 -        rw->next  = NULL;
   4.178 -        /* append to waiters list. */
   4.179 -        rwc = &r->waiters[row];
   4.180 -        while (*rwc != NULL) rwc = &(*rwc)->next;
   4.181 -        *rwc = rw;
   4.182 -        pthread_mutex_unlock(&r->lock);
   4.183 -        return;
   4.184 -    }
   4.185 -}
   4.186 -
   4.187 -
   4.188 -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   4.189 -{
   4.190 -    struct io_ret ret;
   4.191 -    pthread_mutex_lock(&r->lock);
   4.192 -    
   4.193 -    /* the second check here is redundant -- just here for debugging now. */
   4.194 -    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
   4.195 -        r->state[row] = STOP;
   4.196 -        r->lines[row] = -1;
   4.197 -        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
   4.198 -        pthread_mutex_unlock(&r->lock);
   4.199 -        ret.type = IO_INT_T;
   4.200 -        ret.u.i = 0;
   4.201 -        cb(ret, param);
   4.202 -    } else {
   4.203 -        struct radix_wait **rwc;
   4.204 -        struct radix_wait *rw = 
   4.205 -            (struct radix_wait *) malloc (sizeof(struct radix_wait));
   4.206 -        DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
   4.207 -        rw->type  = WLOCK;
   4.208 -        rw->param = param;
   4.209 -        rw->cb    = cb;
   4.210 -        rw->next  = NULL;
   4.211 -        /* append to waiters list. */
   4.212 -        rwc = &r->waiters[row];
   4.213 -        while (*rwc != NULL) rwc = &(*rwc)->next;
   4.214 -        *rwc = rw;
   4.215 -        pthread_mutex_unlock(&r->lock);
   4.216 -        return;
   4.217 -    }
   4.218 -	
   4.219 -}
   4.220 -
   4.221 -/* called with radix_lock locked and lock count of zero. */
   4.222 -static void wake_waiters(struct radix_lock *r, int row)
   4.223 -{
   4.224 -    struct pending_io_req *req;
   4.225 -    struct radix_wait *rw;
   4.226 -    
   4.227 -    if (r->lines[row] != 0) return;
   4.228 -    if (r->waiters[row] == NULL) return; 
   4.229 -    
   4.230 -    if (r->waiters[row]->type == WLOCK) {
   4.231 -
   4.232 -        rw = r->waiters[row];
   4.233 -        pthread_mutex_lock(&pending_io_lock);
   4.234 -        assert(CAN_PRODUCE_PENDING_IO);
   4.235 -        
   4.236 -        req = PENDING_IO_ENT(io_prod++);
   4.237 -        req->op    = IO_WWAKE;
   4.238 -        req->cb    = rw->cb;
   4.239 -        req->param = rw->param;
   4.240 -        r->lines[row] = -1; /* write lock the row. */
   4.241 -        r->state[row] = STOP;
   4.242 -        r->waiters[row] = rw->next;
   4.243 -        free(rw);
   4.244 -        pthread_mutex_unlock(&pending_io_lock);
   4.245 -    
   4.246 -    } else /* RLOCK */ {
   4.247 -
   4.248 -        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
   4.249 -            rw = r->waiters[row];
   4.250 -            pthread_mutex_lock(&pending_io_lock);
   4.251 -            assert(CAN_PRODUCE_PENDING_IO);
   4.252 -            
   4.253 -            req = PENDING_IO_ENT(io_prod++);
   4.254 -            req->op    = IO_RWAKE;
   4.255 -            req->cb    = rw->cb;
   4.256 -            req->param = rw->param;
   4.257 -            r->lines[row]++; /* read lock the row. */
   4.258 -            r->state[row] = READ; 
   4.259 -            r->waiters[row] = rw->next;
   4.260 -            free(rw);
   4.261 -            pthread_mutex_unlock(&pending_io_lock);
   4.262 -        }
   4.263 -
   4.264 -        if (r->waiters[row] != NULL) /* There is a write queued still */
   4.265 -            r->state[row] = STOP;
   4.266 -    }	
   4.267 -    
   4.268 -    pthread_mutex_lock(&pending_io_lock);
   4.269 -    pthread_cond_signal(&pending_io_cond);
   4.270 -    pthread_mutex_unlock(&pending_io_lock);
   4.271 -}
   4.272 -
   4.273 -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   4.274 -{
   4.275 -    struct io_ret ret;
   4.276 -	
   4.277 -    pthread_mutex_lock(&r->lock);
   4.278 -    assert(r->lines[row] > 0); /* try to catch misuse. */
   4.279 -    r->lines[row]--;
   4.280 -    if (r->lines[row] == 0) {
   4.281 -        r->state[row] = ANY;
   4.282 -        wake_waiters(r, row);
   4.283 -    }
   4.284 -    pthread_mutex_unlock(&r->lock);
   4.285 -    cb(ret, param);
   4.286 -}
   4.287 -
   4.288 -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
   4.289 -{
   4.290 -    struct io_ret ret;
   4.291 -    
   4.292 -    pthread_mutex_lock(&r->lock);
   4.293 -    assert(r->lines[row] == -1); /* try to catch misuse. */
   4.294 -    r->lines[row] = 0;
   4.295 -    r->state[row] = ANY;
   4.296 -    wake_waiters(r, row);
   4.297 -    pthread_mutex_unlock(&r->lock);
   4.298 -    cb(ret, param);
   4.299 -}
   4.300 -
   4.301 -/* consumer calls */
   4.302 -static void do_next_io_req(struct pending_io_req *req)
   4.303 -{
   4.304 -    struct io_ret          ret;
   4.305 -    void  *param;
   4.306 -    
   4.307 -    switch (req->op) {
   4.308 -    case IO_READ:
   4.309 -        ret.type = IO_BLOCK_T;
   4.310 -        ret.u.b  = readblock(req->u.r.addr);
   4.311 -        break;
   4.312 -    case IO_WRITE:
   4.313 -        ret.type = IO_INT_T;
   4.314 -        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
   4.315 -        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
   4.316 -        break;
   4.317 -    case IO_ALLOC:
   4.318 -        ret.type = IO_ADDR_T;
   4.319 -        ret.u.a  = allocblock(req->u.a.block);
   4.320 -        break;
   4.321 -    case IO_RWAKE:
   4.322 -        DPRINTF("WAKE DEFERRED RLOCK!\n");
   4.323 -        ret.type = IO_INT_T;
   4.324 -        ret.u.i  = 0;
   4.325 -        break;
   4.326 -    case IO_WWAKE:
   4.327 -        DPRINTF("WAKE DEFERRED WLOCK!\n");
   4.328 -        ret.type = IO_INT_T;
   4.329 -        ret.u.i  = 0;
   4.330 -        break;
   4.331 -    default:
   4.332 -        DPRINTF("Unknown IO operation on pending list!\n");
   4.333 -        return;
   4.334 -    }
   4.335 -    
   4.336 -    param = req->param;
   4.337 -    pthread_mutex_lock(&pending_io_lock);
   4.338 -    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
   4.339 -    pthread_mutex_unlock(&pending_io_lock);
   4.340 -	
   4.341 -    assert(req->cb != NULL);
   4.342 -    req->cb(ret, param);
   4.343 -    
   4.344 -}
   4.345 -
   4.346 -void *io_thread(void *param) 
   4.347 -{
   4.348 -    int tid;
   4.349 -    struct pending_io_req *req;
   4.350 -    
   4.351 -    /* Set this thread's tid. */
   4.352 -    tid = *(int *)param;
   4.353 -    free(param);
   4.354 -    
   4.355 -start:
   4.356 -    pthread_mutex_lock(&pending_io_lock);
   4.357 -    while (io_prod == io_cons) {
   4.358 -        pthread_cond_wait(&pending_io_cond, &pending_io_lock);
   4.359 -    }
   4.360 -    
   4.361 -    if (io_prod == io_cons) {
   4.362 -        /* unnecessary wakeup. */
   4.363 -        pthread_mutex_unlock(&pending_io_lock);
   4.364 -        goto start;
   4.365 -    }
   4.366 -    
   4.367 -    req = PENDING_IO_ENT(io_cons++);
   4.368 -    pthread_mutex_unlock(&pending_io_lock);
   4.369 -	
   4.370 -    do_next_io_req(req);
   4.371 -    
   4.372 -    goto start;
   4.373 -	
   4.374 -}
   4.375 -
   4.376 -static pthread_t io_pool[IO_POOL_SIZE];
   4.377 -void start_io_threads(void)
   4.378 -
   4.379 -{	
   4.380 -    int i, tid=0;
   4.381 -    
   4.382 -    for (i=0; i < IO_POOL_SIZE; i++) {
   4.383 -        int ret, *t;
   4.384 -        t = (int *)malloc(sizeof(int));
   4.385 -        *t = tid++;
   4.386 -        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
   4.387 -        if (ret != 0) printf("Error starting thread %d\n", i);
   4.388 -    }
   4.389 -	
   4.390 -}
   4.391 -
   4.392 -void init_block_async(void)
   4.393 -{
   4.394 -    init_pending_io();
   4.395 -    start_io_threads();
   4.396 -}
     5.1 --- a/tools/blktap/block-async.h	Sun Jul 03 22:32:52 2005 +0000
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,69 +0,0 @@
     5.4 -/* block-async.h
     5.5 - * 
     5.6 - * Asynchronous block wrappers for parallax.
     5.7 - */
     5.8 - 
     5.9 -#ifndef _BLOCKASYNC_H_
    5.10 -#define _BLOCKASYNC_H_
    5.11 -
    5.12 -#include <assert.h>
    5.13 -#include <xc.h>
    5.14 -#include "vdi.h"
    5.15 -
    5.16 -struct io_ret
    5.17 -{
    5.18 -    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
    5.19 -    union {
    5.20 -        u64   a;
    5.21 -        char *b;
    5.22 -        int   i;
    5.23 -    } u;
    5.24 -};
    5.25 -
    5.26 -typedef void (*io_cb_t)(struct io_ret r, void *param);
    5.27 -
    5.28 -/* per-vdi lock structures to make sure requests run in a safe order. */
    5.29 -struct radix_wait {
    5.30 -    enum {RLOCK, WLOCK} type;
    5.31 -    io_cb_t  cb;
    5.32 -    void    *param;
    5.33 -    struct radix_wait *next;
    5.34 -};
    5.35 -
    5.36 -struct radix_lock {
    5.37 -    pthread_mutex_t lock;
    5.38 -    int                    lines[1024];
    5.39 -    struct radix_wait     *waiters[1024];
    5.40 -    enum {ANY, READ, STOP} state[1024];
    5.41 -};
    5.42 -void radix_lock_init(struct radix_lock *r);
    5.43 -
    5.44 -void block_read(u64 addr, io_cb_t cb, void *param);
    5.45 -void block_write(u64 addr, char *block, io_cb_t cb, void *param);
    5.46 -void block_alloc(char *block, io_cb_t cb, void *param);
    5.47 -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    5.48 -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    5.49 -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    5.50 -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
    5.51 -void init_block_async(void);
    5.52 -
    5.53 -static inline u64 IO_ADDR(struct io_ret r)
    5.54 -{
    5.55 -    assert(r.type == IO_ADDR_T);
    5.56 -    return r.u.a;
    5.57 -}
    5.58 -
    5.59 -static inline char *IO_BLOCK(struct io_ret r)
    5.60 -{
    5.61 -    assert(r.type == IO_BLOCK_T);
    5.62 -    return r.u.b;
    5.63 -}
    5.64 -
    5.65 -static inline int IO_INT(struct io_ret r)
    5.66 -{
    5.67 -    assert(r.type == IO_INT_T);
    5.68 -    return r.u.i;
    5.69 -}
    5.70 -
    5.71 -
    5.72 -#endif //_BLOCKASYNC_H_
     6.1 --- a/tools/blktap/blockstore.c	Sun Jul 03 22:32:52 2005 +0000
     6.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.3 @@ -1,1350 +0,0 @@
     6.4 -/**************************************************************************
     6.5 - * 
     6.6 - * blockstore.c
     6.7 - *
     6.8 - * Simple block store interface
     6.9 - *
    6.10 - */
    6.11 - 
    6.12 -#include <fcntl.h>
    6.13 -#include <unistd.h>
    6.14 -#include <stdio.h>
    6.15 -#include <stdlib.h>
    6.16 -#include <string.h>
    6.17 -#include <sys/types.h>
    6.18 -#include <sys/stat.h>
    6.19 -#include <sys/time.h>
    6.20 -#include <stdarg.h>
    6.21 -#include "blockstore.h"
    6.22 -#include <pthread.h>
    6.23 -
    6.24 -//#define BLOCKSTORE_REMOTE
    6.25 -//#define BSDEBUG
    6.26 -
    6.27 -#define RETRY_TIMEOUT 1000000 /* microseconds */
    6.28 -
    6.29 -/*****************************************************************************
    6.30 - * Debugging
    6.31 - */
    6.32 -#ifdef BSDEBUG
    6.33 -void DB(char *format, ...)
    6.34 -{
    6.35 -    va_list args;
    6.36 -    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
    6.37 -    va_start(args, format);
    6.38 -    vfprintf(stderr, format, args);
    6.39 -    va_end(args);
    6.40 -}
    6.41 -#else
    6.42 -#define DB(format, ...) (void)0
    6.43 -#endif
    6.44 -
    6.45 -#ifdef BLOCKSTORE_REMOTE
    6.46 -
    6.47 -#include <sys/socket.h>
    6.48 -#include <sys/ioctl.h>
    6.49 -#include <netinet/in.h>
    6.50 -#include <netdb.h>
    6.51 -
    6.52 -/*****************************************************************************
    6.53 - * Network state                                                             *
    6.54 - *****************************************************************************/
    6.55 -
    6.56 -/* The individual disk servers we talks to. These will be referenced by
    6.57 - * an integer index into bsservers[].
    6.58 - */
    6.59 -bsserver_t bsservers[MAX_SERVERS];
    6.60 -
    6.61 -/* The cluster map. This is indexed by an integer cluster number.
    6.62 - */
    6.63 -bscluster_t bsclusters[MAX_CLUSTERS];
    6.64 -
    6.65 -/* Local socket.
    6.66 - */
    6.67 -struct sockaddr_in sin_local;
    6.68 -int bssock = 0;
    6.69 -
    6.70 -/*****************************************************************************
    6.71 - * Notification                                                              *
    6.72 - *****************************************************************************/
    6.73 -
    6.74 -typedef struct pool_thread_t_struct {
    6.75 -    pthread_mutex_t ptmutex;
    6.76 -    pthread_cond_t ptcv;
    6.77 -    int newdata;
    6.78 -} pool_thread_t;
    6.79 -
    6.80 -pool_thread_t pool_thread[READ_POOL_SIZE+1];
    6.81 -
    6.82 -#define RECV_NOTIFY(tid) { \
    6.83 -    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
    6.84 -    pool_thread[tid].newdata = 1; \
    6.85 -    DB("CV Waking %u", tid); \
    6.86 -    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
    6.87 -    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
    6.88 -#define RECV_AWAIT(tid) { \
    6.89 -    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
    6.90 -    if (pool_thread[tid].newdata) { \
    6.91 -        pool_thread[tid].newdata = 0; \
    6.92 -        DB("CV Woken %u", tid); \
    6.93 -    } \
    6.94 -    else { \
    6.95 -        DB("CV Waiting %u", tid); \
    6.96 -        pthread_cond_wait(&(pool_thread[tid].ptcv), \
    6.97 -                          &(pool_thread[tid].ptmutex)); \
    6.98 -    } \
    6.99 -    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
   6.100 -
   6.101 -/*****************************************************************************
   6.102 - * Message queue management                                                  *
   6.103 - *****************************************************************************/
   6.104 -
   6.105 -/* Protects the queue manipulation critcal regions.
   6.106 - */
   6.107 -pthread_mutex_t ptmutex_queue;
   6.108 -#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
   6.109 -#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
   6.110 -
   6.111 -pthread_mutex_t ptmutex_recv;
   6.112 -#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
   6.113 -#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
   6.114 -
   6.115 -/* A message queue entry. We allocate one of these for every request we send.
   6.116 - * Asynchronous reply reception also used one of these.
   6.117 - */
   6.118 -typedef struct bsq_t_struct {
   6.119 -    struct bsq_t_struct *prev;
   6.120 -    struct bsq_t_struct *next;
   6.121 -    int status;
   6.122 -    int server;
   6.123 -    int length;
   6.124 -    struct msghdr msghdr;
   6.125 -    struct iovec iov[2];
   6.126 -    int tid;
   6.127 -    struct timeval tv_sent;
   6.128 -    bshdr_t message;
   6.129 -    void *block;
   6.130 -} bsq_t;
   6.131 -
   6.132 -#define BSQ_STATUS_MATCHED 1
   6.133 -
   6.134 -pthread_mutex_t ptmutex_luid;
   6.135 -#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
   6.136 -#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
   6.137 -
   6.138 -static u64 luid_cnt = 0x1000ULL;
   6.139 -u64 new_luid(void) {
   6.140 -    u64 luid;
   6.141 -    ENTER_LUID_CR;
   6.142 -    luid = luid_cnt++;
   6.143 -    LEAVE_LUID_CR;
   6.144 -    return luid;
   6.145 -}
   6.146 -
   6.147 -/* Queue of outstanding requests.
   6.148 - */
   6.149 -bsq_t *bs_head = NULL;
   6.150 -bsq_t *bs_tail = NULL;
   6.151 -int bs_qlen = 0;
   6.152 -
   6.153 -/*
   6.154 - */
   6.155 -void queuedebug(char *msg) {
   6.156 -    bsq_t *q;
   6.157 -    ENTER_QUEUE_CR;
   6.158 -    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
   6.159 -    for (q = bs_head; q; q = q->next) {
   6.160 -        fprintf(stderr, "  luid=%016llx server=%u\n",
   6.161 -                q->message.luid, q->server);
   6.162 -    }
   6.163 -    LEAVE_QUEUE_CR;
   6.164 -}
   6.165 -
   6.166 -int enqueue(bsq_t *qe) {
   6.167 -    ENTER_QUEUE_CR;
   6.168 -    qe->next = NULL;
   6.169 -    qe->prev = bs_tail;
   6.170 -    if (!bs_head)
   6.171 -        bs_head = qe;
   6.172 -    else
   6.173 -        bs_tail->next = qe;
   6.174 -    bs_tail = qe;
   6.175 -    bs_qlen++;
   6.176 -    LEAVE_QUEUE_CR;
   6.177 -#ifdef BSDEBUG
   6.178 -    queuedebug("enqueue");
   6.179 -#endif
   6.180 -    return 0;
   6.181 -}
   6.182 -
   6.183 -int dequeue(bsq_t *qe) {
   6.184 -    bsq_t *q;
   6.185 -    ENTER_QUEUE_CR;
   6.186 -    for (q = bs_head; q; q = q->next) {
   6.187 -        if (q == qe) {
   6.188 -            if (q->prev)
   6.189 -                q->prev->next = q->next;
   6.190 -            else 
   6.191 -                bs_head = q->next;
   6.192 -            if (q->next)
   6.193 -                q->next->prev = q->prev;
   6.194 -            else
   6.195 -                bs_tail = q->prev;
   6.196 -            bs_qlen--;
   6.197 -            goto found;
   6.198 -        }
   6.199 -    }
   6.200 -
   6.201 -    LEAVE_QUEUE_CR;
   6.202 -#ifdef BSDEBUG
   6.203 -    queuedebug("dequeue not found");
   6.204 -#endif
   6.205 -    return 0;
   6.206 -
   6.207 -    found:
   6.208 -    LEAVE_QUEUE_CR;
   6.209 -#ifdef BSDEBUG
   6.210 -    queuedebug("dequeue not found");
   6.211 -#endif
   6.212 -    return 1;
   6.213 -}
   6.214 -
   6.215 -bsq_t *queuesearch(bsq_t *qe) {
   6.216 -    bsq_t *q;
   6.217 -    ENTER_QUEUE_CR;
   6.218 -    for (q = bs_head; q; q = q->next) {
   6.219 -        if ((qe->server == q->server) &&
   6.220 -            (qe->message.operation == q->message.operation) &&
   6.221 -            (qe->message.luid == q->message.luid)) {
   6.222 -
   6.223 -            if ((q->message.operation == BSOP_READBLOCK) &&
   6.224 -                ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
   6.225 -                q->block = qe->block;
   6.226 -                qe->block = NULL;
   6.227 -            }
   6.228 -            q->length = qe->length;
   6.229 -            q->message.flags = qe->message.flags;
   6.230 -            q->message.id = qe->message.id;
   6.231 -            q->status |= BSQ_STATUS_MATCHED;
   6.232 -
   6.233 -            if (q->prev)
   6.234 -                q->prev->next = q->next;
   6.235 -            else 
   6.236 -                bs_head = q->next;
   6.237 -            if (q->next)
   6.238 -                q->next->prev = q->prev;
   6.239 -            else
   6.240 -                bs_tail = q->prev;
   6.241 -            q->next = NULL;
   6.242 -            q->prev = NULL;
   6.243 -            bs_qlen--;
   6.244 -            goto found;
   6.245 -        }
   6.246 -    }
   6.247 -
   6.248 -    LEAVE_QUEUE_CR;
   6.249 -#ifdef BSDEBUG
   6.250 -    queuedebug("queuesearch not found");
   6.251 -#endif
   6.252 -    return NULL;
   6.253 -
   6.254 -    found:
   6.255 -    LEAVE_QUEUE_CR;
   6.256 -#ifdef BSDEBUG
   6.257 -    queuedebug("queuesearch found");
   6.258 -#endif
   6.259 -    return q;
   6.260 -}
   6.261 -
   6.262 -/*****************************************************************************
   6.263 - * Network communication                                                     *
   6.264 - *****************************************************************************/
   6.265 -
   6.266 -int send_message(bsq_t *qe) {
   6.267 -    int rc;
   6.268 -
   6.269 -    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
   6.270 -    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
   6.271 -    qe->msghdr.msg_iov = qe->iov;
   6.272 -    if (qe->block)
   6.273 -        qe->msghdr.msg_iovlen = 2;
   6.274 -    else
   6.275 -        qe->msghdr.msg_iovlen = 1;
   6.276 -    qe->msghdr.msg_control = NULL;
   6.277 -    qe->msghdr.msg_controllen = 0;
   6.278 -    qe->msghdr.msg_flags = 0;
   6.279 -
   6.280 -    qe->iov[0].iov_base = (void *)&(qe->message);
   6.281 -    qe->iov[0].iov_len = MSGBUFSIZE_ID;
   6.282 -
   6.283 -    if (qe->block) {
   6.284 -        qe->iov[1].iov_base = qe->block;
   6.285 -        qe->iov[1].iov_len = BLOCK_SIZE;
   6.286 -    }
   6.287 -
   6.288 -    qe->message.luid = new_luid();
   6.289 -
   6.290 -    qe->status = 0;
   6.291 -    qe->tid = (int)pthread_getspecific(tid_key);
   6.292 -    if (enqueue(qe) < 0) {
   6.293 -        fprintf(stderr, "Error enqueuing request.\n");
   6.294 -        return -1;
   6.295 -    }
   6.296 -
   6.297 -    gettimeofday(&(qe->tv_sent), NULL);
   6.298 -    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
   6.299 -    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
   6.300 -    //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
   6.301 -    //           (struct sockaddr *)&(bsservers[qe->server].sin),
   6.302 -    //           sizeof(struct sockaddr_in));
   6.303 -    if (rc < 0)
   6.304 -        return rc;
   6.305 -
   6.306 -    return rc;
   6.307 -}
   6.308 -
   6.309 -int recv_message(bsq_t *qe) {
   6.310 -    struct sockaddr_in from;
   6.311 -    //int flen = sizeof(from);
   6.312 -    int rc;
   6.313 -
   6.314 -    qe->msghdr.msg_name = &from;
   6.315 -    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
   6.316 -    qe->msghdr.msg_iov = qe->iov;
   6.317 -    if (qe->block)
   6.318 -        qe->msghdr.msg_iovlen = 2;
   6.319 -    else
   6.320 -        qe->msghdr.msg_iovlen = 1;
   6.321 -    qe->msghdr.msg_control = NULL;
   6.322 -    qe->msghdr.msg_controllen = 0;
   6.323 -    qe->msghdr.msg_flags = 0;
   6.324 -
   6.325 -    qe->iov[0].iov_base = (void *)&(qe->message);
   6.326 -    qe->iov[0].iov_len = MSGBUFSIZE_ID;
   6.327 -    if (qe->block) {
   6.328 -        qe->iov[1].iov_base = qe->block;
   6.329 -        qe->iov[1].iov_len = BLOCK_SIZE;
   6.330 -    }
   6.331 -
   6.332 -    rc = recvmsg(bssock, &(qe->msghdr), 0);
   6.333 -
   6.334 -    //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0,
   6.335 -    //               (struct sockaddr *)&from, &flen);
   6.336 -    return rc;
   6.337 -}
   6.338 -
   6.339 -int get_server_number(struct sockaddr_in *sin) {
   6.340 -    int i;
   6.341 -
   6.342 -#ifdef BSDEBUG2
   6.343 -    fprintf(stderr,
   6.344 -            "get_server_number(%u.%u.%u.%u/%u)\n",
   6.345 -            (unsigned int)sin->sin_addr.s_addr & 0xff,
   6.346 -            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
   6.347 -            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
   6.348 -            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
   6.349 -            (unsigned int)sin->sin_port);
   6.350 -#endif
   6.351 -
   6.352 -    for (i = 0; i < MAX_SERVERS; i++) {
   6.353 -        if (bsservers[i].hostname) {
   6.354 -#ifdef BSDEBUG2
   6.355 -            fprintf(stderr,
   6.356 -                    "get_server_number check %u.%u.%u.%u/%u\n",
   6.357 -                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
   6.358 -                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
   6.359 -                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
   6.360 -                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
   6.361 -                    (unsigned int)bsservers[i].sin.sin_port);
   6.362 -#endif
   6.363 -            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
   6.364 -                (sin->sin_port == bsservers[i].sin.sin_port) &&
   6.365 -                (memcmp((void *)&(sin->sin_addr),
   6.366 -                        (void *)&(bsservers[i].sin.sin_addr),
   6.367 -                        sizeof(struct in_addr)) == 0)) {
   6.368 -                return i;
   6.369 -            }
   6.370 -        }        
   6.371 -    }
   6.372 -
   6.373 -    return -1;
   6.374 -}
   6.375 -
   6.376 -void *rx_buffer = NULL;
   6.377 -bsq_t rx_qe;
   6.378 -bsq_t *recv_any(void) {
   6.379 -    struct sockaddr_in from;
   6.380 -    int rc;
   6.381 -    
   6.382 -    DB("ENTER recv_any\n");
   6.383 -
   6.384 -    rx_qe.msghdr.msg_name = &from;
   6.385 -    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
   6.386 -    rx_qe.msghdr.msg_iov = rx_qe.iov;
   6.387 -    if (!rx_buffer) {
   6.388 -        rx_buffer = malloc(BLOCK_SIZE);
   6.389 -        if (!rx_buffer) {
   6.390 -            perror("recv_any malloc");
   6.391 -            return NULL;
   6.392 -        }
   6.393 -    }
   6.394 -    rx_qe.block = rx_buffer;
   6.395 -    rx_buffer = NULL;
   6.396 -    rx_qe.msghdr.msg_iovlen = 2;
   6.397 -    rx_qe.msghdr.msg_control = NULL;
   6.398 -    rx_qe.msghdr.msg_controllen = 0;
   6.399 -    rx_qe.msghdr.msg_flags = 0;
   6.400 -    
   6.401 -    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
   6.402 -    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
   6.403 -    rx_qe.iov[1].iov_base = rx_qe.block;
   6.404 -    rx_qe.iov[1].iov_len = BLOCK_SIZE;
   6.405 -
   6.406 -    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
   6.407 -    if (rc < 0) {
   6.408 -        perror("recv_any");
   6.409 -        return NULL;
   6.410 -    }
   6.411 -
   6.412 -    rx_qe.length = rc;    
   6.413 -    rx_qe.server = get_server_number(&from);
   6.414 -
   6.415 -    DB("recv_any from %d luid=%016llx len=%u\n",
   6.416 -       rx_qe.server, rx_qe.message.luid, rx_qe.length);
   6.417 -
   6.418 -    return &rx_qe;
   6.419 -}
   6.420 -
   6.421 -void recv_recycle_buffer(bsq_t *q) {
   6.422 -    if (q->block) {
   6.423 -        rx_buffer = q->block;
   6.424 -        q->block = NULL;
   6.425 -    }
   6.426 -}
   6.427 -
   6.428 -// cycle through reading any incoming, searching for a match in the
   6.429 -// queue, until we have all we need.
   6.430 -int wait_recv(bsq_t **reqs, int numreqs) {
   6.431 -    bsq_t *q, *m;
   6.432 -    unsigned int x, i;
   6.433 -    int tid = (int)pthread_getspecific(tid_key);
   6.434 -
   6.435 -    DB("ENTER wait_recv %u\n", numreqs);
   6.436 -
   6.437 -    checkmatch:
   6.438 -    x = 0xffffffff;
   6.439 -    for (i = 0; i < numreqs; i++) {
   6.440 -        x &= reqs[i]->status;
   6.441 -    }
   6.442 -    if ((x & BSQ_STATUS_MATCHED)) {
   6.443 -        DB("LEAVE wait_recv\n");
   6.444 -        return numreqs;
   6.445 -    }
   6.446 -
   6.447 -    RECV_AWAIT(tid);
   6.448 -
   6.449 -    /*
   6.450 -    rxagain:
   6.451 -    ENTER_RECV_CR;
   6.452 -    q = recv_any();
   6.453 -    LEAVE_RECV_CR;
   6.454 -    if (!q)
   6.455 -        return -1;
   6.456 -
   6.457 -    m = queuesearch(q);
   6.458 -    recv_recycle_buffer(q);
   6.459 -    if (!m) {
   6.460 -        fprintf(stderr, "Unmatched RX\n");
   6.461 -        goto rxagain;
   6.462 -    }
   6.463 -    */
   6.464 -
   6.465 -    goto checkmatch;
   6.466 -
   6.467 -}
   6.468 -
   6.469 -/* retry
   6.470 - */
   6.471 -static int retry_count = 0;
   6.472 -int retry(bsq_t *qe)
   6.473 -{
   6.474 -    int rc;
   6.475 -    gettimeofday(&(qe->tv_sent), NULL);
   6.476 -    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
   6.477 -    retry_count++;
   6.478 -    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
   6.479 -    if (rc < 0)
   6.480 -        return rc;
   6.481 -    return 0;
   6.482 -}
   6.483 -
   6.484 -/* queue runner
   6.485 - */
   6.486 -void *queue_runner(void *arg)
   6.487 -{
   6.488 -    for (;;) {
   6.489 -        struct timeval now;
   6.490 -        long long nowus, sus;
   6.491 -        bsq_t *q;
   6.492 -        int r;
   6.493 -
   6.494 -        sleep(1);
   6.495 -
   6.496 -        gettimeofday(&now, NULL);
   6.497 -        nowus = now.tv_usec + now.tv_sec * 1000000;
   6.498 -        ENTER_QUEUE_CR;
   6.499 -        r = retry_count;
   6.500 -        for (q = bs_head; q; q = q->next) {
   6.501 -            sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000;
   6.502 -            if ((nowus - sus) > RETRY_TIMEOUT) {
   6.503 -                if (retry(q) < 0) {
   6.504 -                    fprintf(stderr, "Error on sendmsg retry.\n");
   6.505 -                }
   6.506 -            }
   6.507 -        }
   6.508 -        if (r != retry_count) {
   6.509 -            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
   6.510 -        }
   6.511 -        LEAVE_QUEUE_CR;
   6.512 -    }
   6.513 -}
   6.514 -
   6.515 -/* receive loop
   6.516 - */
   6.517 -void *receive_loop(void *arg)
   6.518 -{
   6.519 -    bsq_t *q, *m;
   6.520 -
   6.521 -    for(;;) {
   6.522 -        q = recv_any();
   6.523 -        if (!q) {
   6.524 -            fprintf(stderr, "recv_any error\n");
   6.525 -        }
   6.526 -        else {
   6.527 -            m = queuesearch(q);
   6.528 -            recv_recycle_buffer(q);
   6.529 -            if (!m) {
   6.530 -                fprintf(stderr, "Unmatched RX\n");
   6.531 -            }
   6.532 -            else {
   6.533 -                DB("RX MATCH");
   6.534 -                RECV_NOTIFY(m->tid);
   6.535 -            }
   6.536 -        }
   6.537 -    }
   6.538 -}
   6.539 -pthread_t pthread_recv;
   6.540 -
   6.541 -/*****************************************************************************
   6.542 - * Reading                                                                   *
   6.543 - *****************************************************************************/
   6.544 -
   6.545 -void *readblock_indiv(int server, u64 id) {
   6.546 -    void *block;
   6.547 -    bsq_t *qe;
   6.548 -    int len, rc;
   6.549 -
   6.550 -    qe = (bsq_t *)malloc(sizeof(bsq_t));
   6.551 -    if (!qe) {
   6.552 -        perror("readblock qe malloc");
   6.553 -        return NULL;
   6.554 -    }
   6.555 -    qe->block = NULL;
   6.556 -    
   6.557 -    /*
   6.558 -    qe->block = malloc(BLOCK_SIZE);
   6.559 -    if (!qe->block) {
   6.560 -        perror("readblock qe malloc");
   6.561 -        free((void *)qe);
   6.562 -        return NULL;
   6.563 -    }
   6.564 -    */
   6.565 -
   6.566 -    qe->server = server;
   6.567 -
   6.568 -    qe->message.operation = BSOP_READBLOCK;
   6.569 -    qe->message.flags = 0;
   6.570 -    qe->message.id = id;
   6.571 -    qe->length = MSGBUFSIZE_ID;
   6.572 -
   6.573 -    if (send_message(qe) < 0) {
   6.574 -        perror("readblock sendto");
   6.575 -        goto err;
   6.576 -    }
   6.577 -    
   6.578 -    /*len = recv_message(qe);
   6.579 -    if (len < 0) {
   6.580 -        perror("readblock recv");
   6.581 -        goto err;
   6.582 -    }*/
   6.583 -
   6.584 -    rc = wait_recv(&qe, 1);
   6.585 -    if (rc < 0) {
   6.586 -        perror("readblock recv");
   6.587 -        goto err;
   6.588 -    }
   6.589 -
   6.590 -    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
   6.591 -        fprintf(stderr, "readblock server error\n");
   6.592 -        goto err;
   6.593 -    }
   6.594 -    if (qe->length < MSGBUFSIZE_BLOCK) {
   6.595 -        fprintf(stderr, "readblock recv short (%u)\n", len);
   6.596 -        goto err;
   6.597 -    }
   6.598 -    /* if ((block = malloc(BLOCK_SIZE)) == NULL) {
   6.599 -        perror("readblock malloc");
   6.600 -        goto err;
   6.601 -    }
   6.602 -    memcpy(block, qe->message.block, BLOCK_SIZE);
   6.603 -    */    
   6.604 -    block = qe->block;
   6.605 -
   6.606 -    free((void *)qe);
   6.607 -    return block;
   6.608 -
   6.609 -    err:
   6.610 -    if (qe->block)
   6.611 -        free(qe->block);
   6.612 -    free((void *)qe);
   6.613 -    return NULL;
   6.614 -}
   6.615 -
   6.616 -/**
   6.617 - * readblock: read a block from disk
   6.618 - *   @id: block id to read
   6.619 - *
   6.620 - *   @return: pointer to block, NULL on error
   6.621 - */
   6.622 -void *readblock(u64 id) {
   6.623 -    int map = (int)BSID_MAP(id);
   6.624 -    u64 xid;
   6.625 -    static int i = CLUSTER_MAX_REPLICAS - 1;
   6.626 -    void *block = NULL;
   6.627 -
   6.628 -    /* special case for the "superblock" just use the first block on the
   6.629 -     * first replica. (extend to blocks < 6 for vdi bug)
   6.630 -     */
   6.631 -    if (id < 6) {
   6.632 -        block = readblock_indiv(bsclusters[map].servers[0], id);
   6.633 -        goto out;
   6.634 -    }
   6.635 -
   6.636 -    i++;
   6.637 -    if (i >= CLUSTER_MAX_REPLICAS)
   6.638 -        i = 0;
   6.639 -    switch (i) {
   6.640 -    case 0:
   6.641 -        xid = BSID_REPLICA0(id);
   6.642 -        break;
   6.643 -    case 1:
   6.644 -        xid = BSID_REPLICA1(id);
   6.645 -        break;
   6.646 -    case 2:
   6.647 -        xid = BSID_REPLICA2(id);
   6.648 -        break;
   6.649 -    }
   6.650 -    
   6.651 -    block = readblock_indiv(bsclusters[map].servers[i], xid);
   6.652 -
   6.653 -    out:
   6.654 -#ifdef BSDEBUG
   6.655 -    if (block)
   6.656 -        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   6.657 -                id,
   6.658 -                (unsigned int)((unsigned char *)block)[0],
   6.659 -                (unsigned int)((unsigned char *)block)[1],
   6.660 -                (unsigned int)((unsigned char *)block)[2],
   6.661 -                (unsigned int)((unsigned char *)block)[3],
   6.662 -                (unsigned int)((unsigned char *)block)[4],
   6.663 -                (unsigned int)((unsigned char *)block)[5],
   6.664 -                (unsigned int)((unsigned char *)block)[6],
   6.665 -                (unsigned int)((unsigned char *)block)[7]);
   6.666 -    else
   6.667 -        fprintf(stderr, "READ:  %016llx NULL\n", id);
   6.668 -#endif
   6.669 -    return block;
   6.670 -}
   6.671 -
   6.672 -/*****************************************************************************
   6.673 - * Writing                                                                   *
   6.674 - *****************************************************************************/
   6.675 -
   6.676 -bsq_t *writeblock_indiv(int server, u64 id, void *block) {
   6.677 -
   6.678 -    bsq_t *qe;
   6.679 -    int len;
   6.680 -
   6.681 -    qe = (bsq_t *)malloc(sizeof(bsq_t));
   6.682 -    if (!qe) {
   6.683 -        perror("writeblock qe malloc");
   6.684 -        goto err;
   6.685 -    }
   6.686 -    qe->server = server;
   6.687 -
   6.688 -    qe->message.operation = BSOP_WRITEBLOCK;
   6.689 -    qe->message.flags = 0;
   6.690 -    qe->message.id = id;
   6.691 -    //memcpy(qe->message.block, block, BLOCK_SIZE);
   6.692 -    qe->block = block;
   6.693 -    qe->length = MSGBUFSIZE_BLOCK;
   6.694 -
   6.695 -    if (send_message(qe) < 0) {
   6.696 -        perror("writeblock sendto");
   6.697 -        goto err;
   6.698 -    }
   6.699 -
   6.700 -    return qe;
   6.701 -
   6.702 -    err:
   6.703 -    free((void *)qe);
   6.704 -    return NULL;
   6.705 -}
   6.706 -    
   6.707 -
   6.708 -/**
   6.709 - * writeblock: write an existing block to disk
   6.710 - *   @id: block id
   6.711 - *   @block: pointer to block
   6.712 - *
   6.713 - *   @return: zero on success, -1 on failure
   6.714 - */
   6.715 -int writeblock(u64 id, void *block) {
   6.716 -    
   6.717 -    int map = (int)BSID_MAP(id);
   6.718 -    int rep0 = bsclusters[map].servers[0];
   6.719 -    int rep1 = bsclusters[map].servers[1];
   6.720 -    int rep2 = bsclusters[map].servers[2];
   6.721 -    bsq_t *reqs[3];
   6.722 -    int rc;
   6.723 -
   6.724 -    reqs[0] = reqs[1] = reqs[2] = NULL;
   6.725 -
   6.726 -#ifdef BSDEBUG
   6.727 -    fprintf(stderr,
   6.728 -            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   6.729 -            id,
   6.730 -            (unsigned int)((unsigned char *)block)[0],
   6.731 -            (unsigned int)((unsigned char *)block)[1],
   6.732 -            (unsigned int)((unsigned char *)block)[2],
   6.733 -            (unsigned int)((unsigned char *)block)[3],
   6.734 -            (unsigned int)((unsigned char *)block)[4],
   6.735 -            (unsigned int)((unsigned char *)block)[5],
   6.736 -            (unsigned int)((unsigned char *)block)[6],
   6.737 -            (unsigned int)((unsigned char *)block)[7]);
   6.738 -#endif
   6.739 -
   6.740 -    /* special case for the "superblock" just use the first block on the
   6.741 -     * first replica. (extend to blocks < 6 for vdi bug)
   6.742 -     */
   6.743 -    if (id < 6) {
   6.744 -        reqs[0] = writeblock_indiv(rep0, id, block);
   6.745 -        if (!reqs[0])
   6.746 -            return -1;
   6.747 -        rc = wait_recv(reqs, 1);
   6.748 -        return rc;
   6.749 -    }
   6.750 -
   6.751 -    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
   6.752 -    if (!reqs[0])
   6.753 -        goto err;
   6.754 -    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
   6.755 -    if (!reqs[1])
   6.756 -        goto err;
   6.757 -    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
   6.758 -    if (!reqs[2])
   6.759 -        goto err;
   6.760 -
   6.761 -    rc = wait_recv(reqs, 3);
   6.762 -    if (rc < 0) {
   6.763 -        perror("writeblock recv");
   6.764 -        goto err;
   6.765 -    }
   6.766 -    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
   6.767 -        fprintf(stderr, "writeblock server0 error\n");
   6.768 -        goto err;
   6.769 -    }
   6.770 -    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
   6.771 -        fprintf(stderr, "writeblock server1 error\n");
   6.772 -        goto err;
   6.773 -    }
   6.774 -    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
   6.775 -        fprintf(stderr, "writeblock server2 error\n");
   6.776 -        goto err;
   6.777 -    }
   6.778 -
   6.779 -
   6.780 -    free((void *)reqs[0]);
   6.781 -    free((void *)reqs[1]);
   6.782 -    free((void *)reqs[2]);
   6.783 -    return 0;
   6.784 -
   6.785 -    err:
   6.786 -    if (reqs[0]) {
   6.787 -        dequeue(reqs[0]);
   6.788 -        free((void *)reqs[0]);
   6.789 -    }
   6.790 -    if (reqs[1]) {
   6.791 -        dequeue(reqs[1]);
   6.792 -        free((void *)reqs[1]);
   6.793 -    }
   6.794 -    if (reqs[2]) {
   6.795 -        dequeue(reqs[2]);
   6.796 -        free((void *)reqs[2]);
   6.797 -    }
   6.798 -    return -1;
   6.799 -}
   6.800 -
   6.801 -/*****************************************************************************
   6.802 - * Allocation                                                                *
   6.803 - *****************************************************************************/
   6.804 -
   6.805 -/**
   6.806 - * allocblock: write a new block to disk
   6.807 - *   @block: pointer to block
   6.808 - *
   6.809 - *   @return: new id of block on disk
   6.810 - */
   6.811 -u64 allocblock(void *block) {
   6.812 -    return allocblock_hint(block, 0);
   6.813 -}
   6.814 -
   6.815 -bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
   6.816 -    bsq_t *qe;
   6.817 -    int len;
   6.818 -
   6.819 -    qe = (bsq_t *)malloc(sizeof(bsq_t));
   6.820 -    if (!qe) {
   6.821 -        perror("allocblock_hint qe malloc");
   6.822 -        goto err;
   6.823 -    }
   6.824 -    qe->server = server;
   6.825 -
   6.826 -    qe->message.operation = BSOP_ALLOCBLOCK;
   6.827 -    qe->message.flags = 0;
   6.828 -    qe->message.id = hint;
   6.829 -    //memcpy(qe->message.block, block, BLOCK_SIZE);
   6.830 -    qe->block = block;
   6.831 -    qe->length = MSGBUFSIZE_BLOCK;
   6.832 -
   6.833 -    if (send_message(qe) < 0) {
   6.834 -        perror("allocblock_hint sendto");
   6.835 -        goto err;
   6.836 -    }
   6.837 -    
   6.838 -    return qe;
   6.839 -
   6.840 -    err:
   6.841 -    free((void *)qe);
   6.842 -    return NULL;
   6.843 -}
   6.844 -
   6.845 -/**
   6.846 - * allocblock_hint: write a new block to disk
   6.847 - *   @block: pointer to block
   6.848 - *   @hint: allocation hint
   6.849 - *
   6.850 - *   @return: new id of block on disk
   6.851 - */
   6.852 -u64 allocblock_hint(void *block, u64 hint) {
   6.853 -    int map = (int)hint;
   6.854 -    int rep0 = bsclusters[map].servers[0];
   6.855 -    int rep1 = bsclusters[map].servers[1];
   6.856 -    int rep2 = bsclusters[map].servers[2];
   6.857 -    bsq_t *reqs[3];
   6.858 -    int rc;
   6.859 -    u64 id0, id1, id2;
   6.860 -
   6.861 -    reqs[0] = reqs[1] = reqs[2] = NULL;
   6.862 -
   6.863 -    DB("ENTER allocblock\n");
   6.864 -
   6.865 -    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
   6.866 -    if (!reqs[0])
   6.867 -        goto err;
   6.868 -    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
   6.869 -    if (!reqs[1])
   6.870 -        goto err;
   6.871 -    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
   6.872 -    if (!reqs[2])
   6.873 -        goto err;
   6.874 -
   6.875 -    rc = wait_recv(reqs, 3);
   6.876 -    if (rc < 0) {
   6.877 -        perror("allocblock recv");
   6.878 -        goto err;
   6.879 -    }
   6.880 -    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
   6.881 -        fprintf(stderr, "allocblock server0 error\n");
   6.882 -        goto err;
   6.883 -    }
   6.884 -    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
   6.885 -        fprintf(stderr, "allocblock server1 error\n");
   6.886 -        goto err;
   6.887 -    }
   6.888 -    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
   6.889 -        fprintf(stderr, "allocblock server2 error\n");
   6.890 -        goto err;
   6.891 -    }
   6.892 -
   6.893 -    id0 = reqs[0]->message.id;
   6.894 -    id1 = reqs[1]->message.id;
   6.895 -    id2 = reqs[2]->message.id;
   6.896 -
   6.897 -#ifdef BSDEBUG
   6.898 -    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   6.899 -            BSID(map, id0, id1, id2),
   6.900 -            (unsigned int)((unsigned char *)block)[0],
   6.901 -            (unsigned int)((unsigned char *)block)[1],
   6.902 -            (unsigned int)((unsigned char *)block)[2],
   6.903 -            (unsigned int)((unsigned char *)block)[3],
   6.904 -            (unsigned int)((unsigned char *)block)[4],
   6.905 -            (unsigned int)((unsigned char *)block)[5],
   6.906 -            (unsigned int)((unsigned char *)block)[6],
   6.907 -            (unsigned int)((unsigned char *)block)[7]);
   6.908 -#endif
   6.909 -    
   6.910 -    free((void *)reqs[0]);
   6.911 -    free((void *)reqs[1]);
   6.912 -    free((void *)reqs[2]);
   6.913 -    return BSID(map, id0, id1, id2);
   6.914 -
   6.915 -    err:
   6.916 -    if (reqs[0]) {
   6.917 -        dequeue(reqs[0]);
   6.918 -        free((void *)reqs[0]);
   6.919 -    }
   6.920 -    if (reqs[1]) {
   6.921 -        dequeue(reqs[1]);
   6.922 -        free((void *)reqs[1]);
   6.923 -    }
   6.924 -    if (reqs[2]) {
   6.925 -        dequeue(reqs[2]);
   6.926 -        free((void *)reqs[2]);
   6.927 -    }
   6.928 -    return 0;
   6.929 -}
   6.930 -
   6.931 -#else /* /BLOCKSTORE_REMOTE */
   6.932 -
   6.933 -/*****************************************************************************
   6.934 - * Local storage version                                                     *
   6.935 - *****************************************************************************/
   6.936 - 
   6.937 -/**
   6.938 - * readblock: read a block from disk
   6.939 - *   @id: block id to read
   6.940 - *
   6.941 - *   @return: pointer to block, NULL on error
   6.942 - */
   6.943 -
   6.944 -void *readblock(u64 id) {
   6.945 -    void *block;
   6.946 -    int block_fp;
   6.947 -   
   6.948 -//printf("readblock(%llu)\n", id); 
   6.949 -    block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
   6.950 -
   6.951 -    if (block_fp < 0) {
   6.952 -        perror("open");
   6.953 -        return NULL;
   6.954 -    }
   6.955 -    
   6.956 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   6.957 -        printf ("%Ld ", id);
   6.958 -        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
   6.959 -        perror("readblock lseek");
   6.960 -        goto err;
   6.961 -    }
   6.962 -    if ((block = malloc(BLOCK_SIZE)) == NULL) {
   6.963 -        perror("readblock malloc");
   6.964 -        goto err;
   6.965 -    }
   6.966 -    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   6.967 -        perror("readblock read");
   6.968 -        free(block);
   6.969 -        goto err;
   6.970 -    }
   6.971 -    close(block_fp);
   6.972 -    return block;
   6.973 -    
   6.974 -err:
   6.975 -    close(block_fp);
   6.976 -    return NULL;
   6.977 -}
   6.978 -
   6.979 -/**
   6.980 - * writeblock: write an existing block to disk
   6.981 - *   @id: block id
   6.982 - *   @block: pointer to block
   6.983 - *
   6.984 - *   @return: zero on success, -1 on failure
   6.985 - */
   6.986 -int writeblock(u64 id, void *block) {
   6.987 -    
   6.988 -    int block_fp;
   6.989 -    
   6.990 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
   6.991 -
   6.992 -    if (block_fp < 0) {
   6.993 -        perror("open");
   6.994 -        return -1;
   6.995 -    }
   6.996 -
   6.997 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   6.998 -        perror("writeblock lseek");
   6.999 -        goto err;
  6.1000 -    }
  6.1001 -    if (write(block_fp, block, BLOCK_SIZE) < 0) {
  6.1002 -        perror("writeblock write");
  6.1003 -        goto err;
  6.1004 -    }
  6.1005 -    close(block_fp);
  6.1006 -    return 0;
  6.1007 -
  6.1008 -err:
  6.1009 -    close(block_fp);
  6.1010 -    return -1;
  6.1011 -}
  6.1012 -
  6.1013 -/**
  6.1014 - * allocblock: write a new block to disk
  6.1015 - *   @block: pointer to block
  6.1016 - *
  6.1017 - *   @return: new id of block on disk
  6.1018 - */
  6.1019 -
  6.1020 -u64 allocblock(void *block) {
  6.1021 -    u64 lb;
  6.1022 -    off64_t pos;
  6.1023 -    int block_fp;
  6.1024 -    
  6.1025 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  6.1026 -
  6.1027 -    if (block_fp < 0) {
  6.1028 -        perror("open");
  6.1029 -        return 0;
  6.1030 -    }
  6.1031 -
  6.1032 -    pos = lseek64(block_fp, 0, SEEK_END);
  6.1033 -    if (pos == (off64_t)-1) {
  6.1034 -        perror("allocblock lseek");
  6.1035 -        goto err;
  6.1036 -    }
  6.1037 -    if (pos % BLOCK_SIZE != 0) {
  6.1038 -        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
  6.1039 -        goto err;
  6.1040 -    }
  6.1041 -    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  6.1042 -        perror("allocblock write");
  6.1043 -        goto err;
  6.1044 -    }
  6.1045 -    lb = pos / BLOCK_SIZE + 1;
  6.1046 -//printf("alloc(%Ld)\n", lb);
  6.1047 -    close(block_fp);
  6.1048 -    return lb;
  6.1049 -    
  6.1050 -err:
  6.1051 -    close(block_fp);
  6.1052 -    return 0;
  6.1053 -    
  6.1054 -}
  6.1055 -
  6.1056 -/**
  6.1057 - * allocblock_hint: write a new block to disk
  6.1058 - *   @block: pointer to block
  6.1059 - *   @hint: allocation hint
  6.1060 - *
  6.1061 - *   @return: new id of block on disk
  6.1062 - */
  6.1063 -u64 allocblock_hint(void *block, u64 hint) {
  6.1064 -    return allocblock(block);
  6.1065 -}
  6.1066 -
  6.1067 -#endif /* BLOCKSTORE_REMOTE */
  6.1068 -
  6.1069 -/*****************************************************************************
  6.1070 - * Memory management                                                         *
  6.1071 - *****************************************************************************/
  6.1072 -
  6.1073 -/**
  6.1074 - * newblock: get a new in-memory block set to zeros
  6.1075 - *
  6.1076 - *   @return: pointer to new block, NULL on error
  6.1077 - */
  6.1078 -void *newblock() {
  6.1079 -    void *block = malloc(BLOCK_SIZE);
  6.1080 -    if (block == NULL) {
  6.1081 -        perror("newblock");
  6.1082 -        return NULL;
  6.1083 -    }
  6.1084 -    memset(block, 0, BLOCK_SIZE);
  6.1085 -    return block;
  6.1086 -}
  6.1087 -
  6.1088 -
  6.1089 -/**
  6.1090 - * freeblock: unallocate an in-memory block
  6.1091 - *   @id: block id (zero if this is only in-memory)
  6.1092 - *   @block: block to be freed
  6.1093 - */
  6.1094 -void freeblock(void *block) {
  6.1095 -    if (block != NULL)
  6.1096 -        free(block);
  6.1097 -}
  6.1098 -
  6.1099 -static freeblock_t *new_freeblock(void)
  6.1100 -{
  6.1101 -    freeblock_t *fb;
  6.1102 -    
  6.1103 -    fb = newblock();
  6.1104 -    
  6.1105 -    if (fb == NULL) return NULL;
  6.1106 -    
  6.1107 -    fb->magic = FREEBLOCK_MAGIC;
  6.1108 -    fb->next  = 0ULL;
  6.1109 -    fb->count = 0ULL;
  6.1110 -    memset(fb->list, 0, sizeof fb->list);
  6.1111 -    
  6.1112 -    return fb;
  6.1113 -}
  6.1114 -
  6.1115 -void releaseblock(u64 id)
  6.1116 -{
  6.1117 -    blockstore_super_t *bs_super;
  6.1118 -    freeblock_t *fl_current;
  6.1119 -    
  6.1120 -    /* get superblock */
  6.1121 -    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  6.1122 -    
  6.1123 -    /* get freeblock_current */
  6.1124 -    if (bs_super->freelist_current == 0ULL) 
  6.1125 -    {
  6.1126 -        fl_current = new_freeblock();
  6.1127 -        bs_super->freelist_current = allocblock(fl_current);
  6.1128 -        writeblock(BLOCKSTORE_SUPER, bs_super);
  6.1129 -    } else {
  6.1130 -        fl_current = readblock(bs_super->freelist_current);
  6.1131 -    }
  6.1132 -    
  6.1133 -    /* if full, chain to superblock and allocate new current */
  6.1134 -    
  6.1135 -    if (fl_current->count == FREEBLOCK_SIZE) {
  6.1136 -        fl_current->next = bs_super->freelist_full;
  6.1137 -        writeblock(bs_super->freelist_current, fl_current);
  6.1138 -        bs_super->freelist_full = bs_super->freelist_current;
  6.1139 -        freeblock(fl_current);
  6.1140 -        fl_current = new_freeblock();
  6.1141 -        bs_super->freelist_current = allocblock(fl_current);
  6.1142 -        writeblock(BLOCKSTORE_SUPER, bs_super);
  6.1143 -    }
  6.1144 -    
  6.1145 -    /* append id to current */
  6.1146 -    fl_current->list[fl_current->count++] = id;
  6.1147 -    writeblock(bs_super->freelist_current, fl_current);
  6.1148 -    
  6.1149 -    freeblock(fl_current);
  6.1150 -    freeblock(bs_super);
  6.1151 -    
  6.1152 -    
  6.1153 -}
  6.1154 -
  6.1155 -/* freelist debug functions: */
  6.1156 -void freelist_count(int print_each)
  6.1157 -{
  6.1158 -    blockstore_super_t *bs_super;
  6.1159 -    freeblock_t *fb;
  6.1160 -    u64 total = 0, next;
  6.1161 -    
  6.1162 -    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  6.1163 -    
  6.1164 -    if (bs_super->freelist_current == 0ULL) {
  6.1165 -        printf("freelist is empty!\n");
  6.1166 -        return;
  6.1167 -    }
  6.1168 -    
  6.1169 -    fb = readblock(bs_super->freelist_current);
  6.1170 -    printf("%Ld entires on current.\n", fb->count);
  6.1171 -    total += fb->count;
  6.1172 -    if (print_each == 1)
  6.1173 -    {
  6.1174 -        int i;
  6.1175 -        for (i=0; i< fb->count; i++)
  6.1176 -            printf("  %Ld\n", fb->list[i]);
  6.1177 -    }
  6.1178 -    
  6.1179 -    freeblock(fb);
  6.1180 -    
  6.1181 -    if (bs_super->freelist_full == 0ULL) {
  6.1182 -        printf("freelist_full is empty!\n");
  6.1183 -        return;
  6.1184 -    }
  6.1185 -    
  6.1186 -    next = bs_super->freelist_full;
  6.1187 -    for (;;) {
  6.1188 -        fb = readblock(next);
  6.1189 -        total += fb->count;
  6.1190 -        if (print_each == 1)
  6.1191 -        {
  6.1192 -            int i;
  6.1193 -            for (i=0; i< fb->count; i++)
  6.1194 -                printf("  %Ld\n", fb->list[i]);
  6.1195 -        }
  6.1196 -        next = fb->next;
  6.1197 -        freeblock(fb);
  6.1198 -        if (next == 0ULL) break;
  6.1199 -    }
  6.1200 -    printf("Total of %Ld ids on freelist.\n", total);
  6.1201 -}
  6.1202 -
  6.1203 -/*****************************************************************************
  6.1204 - * Initialisation                                                            *
  6.1205 - *****************************************************************************/
  6.1206 -
  6.1207 -int __init_blockstore(void)
  6.1208 -{
  6.1209 -    int i;
  6.1210 -    blockstore_super_t *bs_super;
  6.1211 -    u64 ret;
  6.1212 -    int block_fp;
  6.1213 -    
  6.1214 -#ifdef BLOCKSTORE_REMOTE
  6.1215 -    struct hostent *addr;
  6.1216 -
  6.1217 -    pthread_mutex_init(&ptmutex_queue, NULL);
  6.1218 -    pthread_mutex_init(&ptmutex_luid, NULL);
  6.1219 -    pthread_mutex_init(&ptmutex_recv, NULL);
  6.1220 -    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
  6.1221 -    for (i = 0; i <= READ_POOL_SIZE; i++) {
  6.1222 -        pool_thread[i].newdata = 0;
  6.1223 -        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
  6.1224 -        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
  6.1225 -    }
  6.1226 -
  6.1227 -    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
  6.1228 -    bsservers[1].hostname = "planb.cl.cam.ac.uk";
  6.1229 -    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
  6.1230 -    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
  6.1231 -    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
  6.1232 -    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
  6.1233 -    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
  6.1234 -    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
  6.1235 -    bsservers[8].hostname = NULL;
  6.1236 -    bsservers[9].hostname = NULL;
  6.1237 -    bsservers[10].hostname = NULL;
  6.1238 -    bsservers[11].hostname = NULL;
  6.1239 -    bsservers[12].hostname = NULL;
  6.1240 -    bsservers[13].hostname = NULL;
  6.1241 -    bsservers[14].hostname = NULL;
  6.1242 -    bsservers[15].hostname = NULL;
  6.1243 -
  6.1244 -    for (i = 0; i < MAX_SERVERS; i++) {
  6.1245 -        if (!bsservers[i].hostname)
  6.1246 -            continue;
  6.1247 -        addr = gethostbyname(bsservers[i].hostname);
  6.1248 -        if (!addr) {
  6.1249 -            perror("bad hostname");
  6.1250 -            return -1;
  6.1251 -        }
  6.1252 -        bsservers[i].sin.sin_family = addr->h_addrtype;
  6.1253 -        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
  6.1254 -        bsservers[i].sin.sin_addr.s_addr = 
  6.1255 -            ((struct in_addr *)(addr->h_addr))->s_addr;
  6.1256 -    }
  6.1257 -
  6.1258 -    /* Cluster map
  6.1259 -     */
  6.1260 -    bsclusters[0].servers[0] = 0;
  6.1261 -    bsclusters[0].servers[1] = 1;
  6.1262 -    bsclusters[0].servers[2] = 2;
  6.1263 -    bsclusters[1].servers[0] = 1;
  6.1264 -    bsclusters[1].servers[1] = 2;
  6.1265 -    bsclusters[1].servers[2] = 3;
  6.1266 -    bsclusters[2].servers[0] = 2;
  6.1267 -    bsclusters[2].servers[1] = 3;
  6.1268 -    bsclusters[2].servers[2] = 4;
  6.1269 -    bsclusters[3].servers[0] = 3;
  6.1270 -    bsclusters[3].servers[1] = 4;
  6.1271 -    bsclusters[3].servers[2] = 5;
  6.1272 -    bsclusters[4].servers[0] = 4;
  6.1273 -    bsclusters[4].servers[1] = 5;
  6.1274 -    bsclusters[4].servers[2] = 6;
  6.1275 -    bsclusters[5].servers[0] = 5;
  6.1276 -    bsclusters[5].servers[1] = 6;
  6.1277 -    bsclusters[5].servers[2] = 7;
  6.1278 -    bsclusters[6].servers[0] = 6;
  6.1279 -    bsclusters[6].servers[1] = 7;
  6.1280 -    bsclusters[6].servers[2] = 0;
  6.1281 -    bsclusters[7].servers[0] = 7;
  6.1282 -    bsclusters[7].servers[1] = 0;
  6.1283 -    bsclusters[7].servers[2] = 1;
  6.1284 -
  6.1285 -    /* Local socket set up
  6.1286 -     */
  6.1287 -    bssock = socket(AF_INET, SOCK_DGRAM, 0);
  6.1288 -    if (bssock < 0) {
  6.1289 -        perror("Bad socket");
  6.1290 -        return -1;
  6.1291 -    }
  6.1292 -    memset(&sin_local, 0, sizeof(sin_local));
  6.1293 -    sin_local.sin_family = AF_INET;
  6.1294 -    sin_local.sin_port = htons(BLOCKSTORED_PORT);
  6.1295 -    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
  6.1296 -    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
  6.1297 -        perror("bind");
  6.1298 -        close(bssock);
  6.1299 -        return -1;
  6.1300 -    }
  6.1301 -
  6.1302 -    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
  6.1303 -    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
  6.1304 -
  6.1305 -#else /* /BLOCKSTORE_REMOTE */
  6.1306 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  6.1307 -
  6.1308 -    if (block_fp < 0) {
  6.1309 -        perror("open");
  6.1310 -        return -1;
  6.1311 -        exit(-1);
  6.1312 -    }
  6.1313 -    
  6.1314 -    if (lseek(block_fp, 0, SEEK_END) == 0) {
  6.1315 -        bs_super = newblock();
  6.1316 -        bs_super->magic            = BLOCKSTORE_MAGIC;
  6.1317 -        bs_super->freelist_full    = 0LL;
  6.1318 -        bs_super->freelist_current = 0LL;
  6.1319 -        
  6.1320 -        ret = allocblock(bs_super);
  6.1321 -        
  6.1322 -        freeblock(bs_super);
  6.1323 -    } else {
  6.1324 -        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  6.1325 -        if (bs_super->magic != BLOCKSTORE_MAGIC)
  6.1326 -        {
  6.1327 -            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
  6.1328 -            exit(-1);
  6.1329 -        }
  6.1330 -        freeblock(bs_super);
  6.1331 -    }
  6.1332 -        
  6.1333 -    close(block_fp);
  6.1334 -        
  6.1335 -#endif /*  BLOCKSTORE_REMOTE */   
  6.1336 -    return 0;
  6.1337 -}
  6.1338 -
  6.1339 -void __exit_blockstore(void)
  6.1340 -{
  6.1341 -    int i;
  6.1342 -#ifdef BLOCKSTORE_REMOTE
  6.1343 -    pthread_mutex_destroy(&ptmutex_recv);
  6.1344 -    pthread_mutex_destroy(&ptmutex_luid);
  6.1345 -    pthread_mutex_destroy(&ptmutex_queue);
  6.1346 -    /*pthread_mutex_destroy(&ptmutex_notify);
  6.1347 -      pthread_cond_destroy(&ptcv_notify);*/
  6.1348 -    for (i = 0; i <= READ_POOL_SIZE; i++) {
  6.1349 -        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
  6.1350 -        pthread_cond_destroy(&(pool_thread[i].ptcv));
  6.1351 -    }
  6.1352 -#endif
  6.1353 -}
     7.1 --- a/tools/blktap/blockstore.h	Sun Jul 03 22:32:52 2005 +0000
     7.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.3 @@ -1,134 +0,0 @@
     7.4 -/**************************************************************************
     7.5 - * 
     7.6 - * blockstore.h
     7.7 - *
     7.8 - * Simple block store interface
     7.9 - *
    7.10 - */
    7.11 - 
    7.12 -#ifndef __BLOCKSTORE_H__
    7.13 -#define __BLOCKSTORE_H__
    7.14 -
    7.15 -#include <netinet/in.h>
    7.16 -#include <xc.h>
    7.17 -
    7.18 -#define BLOCK_SIZE  4096
    7.19 -#define BLOCK_SHIFT   12
    7.20 -#define BLOCK_MASK  0xfffffffffffff000LL
    7.21 -
    7.22 -/* XXX SMH: where is the below supposed to be defined???? */
    7.23 -#ifndef SECTOR_SHIFT 
    7.24 -#define SECTOR_SHIFT   9 
    7.25 -#endif
    7.26 -
    7.27 -#define FREEBLOCK_SIZE  (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))
    7.28 -#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
    7.29 -
    7.30 -typedef struct {
    7.31 -    u64 magic;
    7.32 -    u64 next;
    7.33 -    u64 count;
    7.34 -    u64 list[FREEBLOCK_SIZE];
    7.35 -} freeblock_t; 
    7.36 -
    7.37 -#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
    7.38 -#define BLOCKSTORE_SUPER 1ULL
    7.39 -
    7.40 -typedef struct {
    7.41 -    u64 magic;
    7.42 -    u64 freelist_full;
    7.43 -    u64 freelist_current;
    7.44 -} blockstore_super_t;
    7.45 -
    7.46 -extern void *newblock();
    7.47 -extern void *readblock(u64 id);
    7.48 -extern u64 allocblock(void *block);
    7.49 -extern u64 allocblock_hint(void *block, u64 hint);
    7.50 -extern int writeblock(u64 id, void *block);
    7.51 -
    7.52 -/* Add this blockid to a freelist, to be recycled by the allocator. */
    7.53 -extern void releaseblock(u64 id);
    7.54 -
    7.55 -/* this is a memory free() operation for block-sized allocations */
    7.56 -extern void freeblock(void *block);
    7.57 -extern int __init_blockstore(void);
    7.58 -
    7.59 -/* debug for freelist. */
    7.60 -void freelist_count(int print_each);
    7.61 -#define ALLOCFAIL (((u64)(-1)))
    7.62 -
    7.63 -/* Distribution
    7.64 - */
    7.65 -#define BLOCKSTORED_PORT 9346
    7.66 -
    7.67 -struct bshdr_t_struct {
    7.68 -    u32            operation;
    7.69 -    u32            flags;
    7.70 -    u64            id;
    7.71 -    u64            luid;
    7.72 -} __attribute__ ((packed));
    7.73 -typedef struct bshdr_t_struct bshdr_t;
    7.74 -
    7.75 -struct bsmsg_t_struct {
    7.76 -    bshdr_t        hdr;
    7.77 -    unsigned char  block[BLOCK_SIZE];
    7.78 -} __attribute__ ((packed));
    7.79 -
    7.80 -typedef struct bsmsg_t_struct bsmsg_t;
    7.81 -
    7.82 -#define MSGBUFSIZE_OP    sizeof(u32)
    7.83 -#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
    7.84 -#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
    7.85 -#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
    7.86 -
    7.87 -#define BSOP_READBLOCK  0x01
    7.88 -#define BSOP_WRITEBLOCK 0x02
    7.89 -#define BSOP_ALLOCBLOCK 0x03
    7.90 -#define BSOP_FREEBLOCK  0x04
    7.91 -
    7.92 -#define BSOP_FLAG_ERROR 0x01
    7.93 -
    7.94 -#define BS_ALLOC_SKIP 10
    7.95 -#define BS_ALLOC_HACK
    7.96 -
    7.97 -/* Remote hosts and cluster map - XXX need to generalise
    7.98 - */
    7.99 -
   7.100 -/*
   7.101 -
   7.102 -  Interim ID format is
   7.103 -
   7.104 -  63 60 59                40 39                20 19                 0
   7.105 -  +----+--------------------+--------------------+--------------------+
   7.106 -  |map | replica 2          | replica 1          | replica 0          |
   7.107 -  +----+--------------------+--------------------+--------------------+
   7.108 -
   7.109 -  The map is an index into a table detailing which machines form the
   7.110 -  cluster.
   7.111 -
   7.112 - */
   7.113 -
   7.114 -#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
   7.115 -#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
   7.116 -#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
   7.117 -#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
   7.118 -
   7.119 -#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
   7.120 -                                         (((u64)(_rep2))<<40) | \
   7.121 -                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
   7.122 -
   7.123 -typedef struct bsserver_t_struct {
   7.124 -    char              *hostname;
   7.125 -    struct sockaddr_in sin;
   7.126 -} bsserver_t;
   7.127 -
   7.128 -#define MAX_SERVERS 16
   7.129 -
   7.130 -#define CLUSTER_MAX_REPLICAS 3
   7.131 -typedef struct bscluster_t_struct {
   7.132 -    int servers[CLUSTER_MAX_REPLICAS];
   7.133 -} bscluster_t;
   7.134 -
   7.135 -#define MAX_CLUSTERS 16
   7.136 -
   7.137 -#endif /* __BLOCKSTORE_H__ */
     8.1 --- a/tools/blktap/blockstored.c	Sun Jul 03 22:32:52 2005 +0000
     8.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.3 @@ -1,276 +0,0 @@
     8.4 -/**************************************************************************
     8.5 - * 
     8.6 - * blockstored.c
     8.7 - *
     8.8 - * Block store daemon.
     8.9 - *
    8.10 - */
    8.11 -
    8.12 -#include <fcntl.h>
    8.13 -#include <unistd.h>
    8.14 -#include <stdio.h>
    8.15 -#include <stdlib.h>
    8.16 -#include <string.h>
    8.17 -#include <sys/types.h>
    8.18 -#include <sys/stat.h>
    8.19 -#include <sys/socket.h>
    8.20 -#include <sys/ioctl.h>
    8.21 -#include <netinet/in.h>
    8.22 -#include <errno.h>
    8.23 -#include "blockstore.h"
    8.24 -
    8.25 -//#define BSDEBUG
    8.26 -
    8.27 -int readblock_into(u64 id, void *block);
    8.28 -
    8.29 -int open_socket(u16 port) {
    8.30 -    
    8.31 -    struct sockaddr_in sn;
    8.32 -    int sock;
    8.33 -
    8.34 -    sock = socket(AF_INET, SOCK_DGRAM, 0);
    8.35 -    if (sock < 0) {
    8.36 -        perror("Bad socket");
    8.37 -        return -1;
    8.38 -    }
    8.39 -    memset(&sn, 0, sizeof(sn));
    8.40 -    sn.sin_family = AF_INET;
    8.41 -    sn.sin_port = htons(port);
    8.42 -    sn.sin_addr.s_addr = htonl(INADDR_ANY);
    8.43 -    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
    8.44 -        perror("bind");
    8.45 -        close(sock);
    8.46 -        return -1;
    8.47 -    }
    8.48 -
    8.49 -    return sock;
    8.50 -}
    8.51 -
    8.52 -static int block_fp = -1;
    8.53 -static int bssock = -1;
    8.54 -
    8.55 -int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
    8.56 -
    8.57 -    int rc;
    8.58 -    
    8.59 -#ifdef BSDEBUG
    8.60 -    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
    8.61 -            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
    8.62 -#endif
    8.63 -    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
    8.64 -    if (rc < 0) {
    8.65 -        perror("send_reply");
    8.66 -        return 1;
    8.67 -    }
    8.68 -
    8.69 -
    8.70 -    return 0;
    8.71 -}
    8.72 -
    8.73 -static bsmsg_t msgbuf;
    8.74 -
    8.75 -void service_loop(void) {
    8.76 -
    8.77 -    for (;;) {
    8.78 -        int rc, len;
    8.79 -        struct sockaddr_in from;
    8.80 -        size_t slen = sizeof(from);
    8.81 -        u64 bid;
    8.82 -
    8.83 -        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
    8.84 -                       (struct sockaddr *)&from, &slen);
    8.85 -
    8.86 -        if (len < 0) {
    8.87 -            perror("recvfrom");
    8.88 -            continue;
    8.89 -        }
    8.90 -
    8.91 -        if (len < MSGBUFSIZE_OP) {
    8.92 -            fprintf(stderr, "Short packet.\n");
    8.93 -            continue;
    8.94 -        }
    8.95 -
    8.96 -#ifdef BSDEBUG
    8.97 -        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
    8.98 -                len, msgbuf.hdr.operation, msgbuf.hdr.id);
    8.99 -#endif
   8.100 -
   8.101 -        switch (msgbuf.hdr.operation) {
   8.102 -        case BSOP_READBLOCK:
   8.103 -            if (len < MSGBUFSIZE_ID) {
   8.104 -                fprintf(stderr, "Short packet (readblock %u).\n", len);
   8.105 -                continue;
   8.106 -            }
   8.107 -            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
   8.108 -            if (rc < 0) {
   8.109 -                fprintf(stderr, "readblock error\n");
   8.110 -                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
   8.111 -                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   8.112 -                continue;
   8.113 -            }
   8.114 -            msgbuf.hdr.flags = 0;
   8.115 -            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
   8.116 -            break;
   8.117 -        case BSOP_WRITEBLOCK:
   8.118 -            if (len < MSGBUFSIZE_BLOCK) {
   8.119 -                fprintf(stderr, "Short packet (writeblock %u).\n", len);
   8.120 -                continue;
   8.121 -            }
   8.122 -            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
   8.123 -            if (rc < 0) {
   8.124 -                fprintf(stderr, "writeblock error\n");
   8.125 -                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
   8.126 -                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   8.127 -                continue;
   8.128 -            }
   8.129 -            msgbuf.hdr.flags = 0;
   8.130 -            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   8.131 -            break;
   8.132 -        case BSOP_ALLOCBLOCK:
   8.133 -            if (len < MSGBUFSIZE_BLOCK) {
   8.134 -                fprintf(stderr, "Short packet (allocblock %u).\n", len);
   8.135 -                continue;
   8.136 -            }
   8.137 -            bid = allocblock(msgbuf.block);
   8.138 -            if (bid == ALLOCFAIL) {
   8.139 -                fprintf(stderr, "allocblock error\n");
   8.140 -                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
   8.141 -                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   8.142 -                continue;
   8.143 -            }
   8.144 -            msgbuf.hdr.id = bid;
   8.145 -            msgbuf.hdr.flags = 0;
   8.146 -            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
   8.147 -            break;
   8.148 -        }
   8.149 -
   8.150 -    }
   8.151 -}
   8.152 - 
   8.153 -/**
   8.154 - * readblock: read a block from disk
   8.155 - *   @id: block id to read
   8.156 - *   @block: pointer to buffer to receive block
   8.157 - *
   8.158 - *   @return: 0 if OK, other on error
   8.159 - */
   8.160 -
   8.161 -int readblock_into(u64 id, void *block) {
   8.162 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   8.163 -        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
   8.164 -        perror("readblock lseek");
   8.165 -        return -1;
   8.166 -    }
   8.167 -    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   8.168 -        perror("readblock read");
   8.169 -        return -1;
   8.170 -    }
   8.171 -    return 0;
   8.172 -}
   8.173 -
   8.174 -/**
   8.175 - * writeblock: write an existing block to disk
   8.176 - *   @id: block id
   8.177 - *   @block: pointer to block
   8.178 - *
   8.179 - *   @return: zero on success, -1 on failure
   8.180 - */
   8.181 -int writeblock(u64 id, void *block) {
   8.182 -    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   8.183 -        perror("writeblock lseek");
   8.184 -        return -1;
   8.185 -    }
   8.186 -    if (write(block_fp, block, BLOCK_SIZE) < 0) {
   8.187 -        perror("writeblock write");
   8.188 -        return -1;
   8.189 -    }
   8.190 -    return 0;
   8.191 -}
   8.192 -
   8.193 -/**
   8.194 - * allocblock: write a new block to disk
   8.195 - *   @block: pointer to block
   8.196 - *
   8.197 - *   @return: new id of block on disk
   8.198 - */
   8.199 -static u64 lastblock = 0;
   8.200 -
   8.201 -u64 allocblock(void *block) {
   8.202 -    u64 lb;
   8.203 -    off64_t pos;
   8.204 -
   8.205 -    retry:
   8.206 -    pos = lseek64(block_fp, 0, SEEK_END);
   8.207 -    if (pos == (off64_t)-1) {
   8.208 -        perror("allocblock lseek");
   8.209 -        return ALLOCFAIL;
   8.210 -    }
   8.211 -    if (pos % BLOCK_SIZE != 0) {
   8.212 -        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
   8.213 -        return ALLOCFAIL;
   8.214 -    }
   8.215 -    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   8.216 -        perror("allocblock write");
   8.217 -        return ALLOCFAIL;
   8.218 -    }
   8.219 -    lb = pos / BLOCK_SIZE + 1;
   8.220 -
   8.221 -#ifdef BS_ALLOC_HACK
   8.222 -    if (lb < BS_ALLOC_SKIP)
   8.223 -        goto retry;
   8.224 -#endif
   8.225 -    
   8.226 -    if (lb <= lastblock)
   8.227 -        printf("[*** %Ld alredy allocated! ***]\n", lb);
   8.228 -    
   8.229 -    lastblock = lb;
   8.230 -    return lb;
   8.231 -}
   8.232 -
   8.233 -/**
   8.234 - * newblock: get a new in-memory block set to zeros
   8.235 - *
   8.236 - *   @return: pointer to new block, NULL on error
   8.237 - */
   8.238 -void *newblock() {
   8.239 -    void *block = malloc(BLOCK_SIZE);
   8.240 -    if (block == NULL) {
   8.241 -        perror("newblock");
   8.242 -        return NULL;
   8.243 -    }
   8.244 -    memset(block, 0, BLOCK_SIZE);
   8.245 -    return block;
   8.246 -}
   8.247 -
   8.248 -
   8.249 -/**
   8.250 - * freeblock: unallocate an in-memory block
   8.251 - *   @id: block id (zero if this is only in-memory)
   8.252 - *   @block: block to be freed
   8.253 - */
   8.254 -void freeblock(void *block) {
   8.255 -    if (block != NULL)
   8.256 -        free(block);
   8.257 -}
   8.258 -
   8.259 -
   8.260 -int main(int argc, char **argv)
   8.261 -{
   8.262 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
   8.263 -
   8.264 -    if (block_fp < 0) {
   8.265 -        perror("open");
   8.266 -        return -1;
   8.267 -    }
   8.268 -
   8.269 -    bssock = open_socket(BLOCKSTORED_PORT);
   8.270 -    if (bssock < 0) {
   8.271 -        return -1;
   8.272 -    }
   8.273 -
   8.274 -    service_loop();
   8.275 -    
   8.276 -    close(bssock);
   8.277 -
   8.278 -    return 0;
   8.279 -}
     9.1 --- a/tools/blktap/bstest.c	Sun Jul 03 22:32:52 2005 +0000
     9.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.3 @@ -1,191 +0,0 @@
     9.4 -/**************************************************************************
     9.5 - * 
     9.6 - * bstest.c
     9.7 - *
     9.8 - * Block store daemon test program.
     9.9 - *
    9.10 - * usage: bstest <host>|X {r|w|a} ID 
    9.11 - *
    9.12 - */
    9.13 -
    9.14 -#include <fcntl.h>
    9.15 -#include <unistd.h>
    9.16 -#include <stdio.h>
    9.17 -#include <stdlib.h>
    9.18 -#include <string.h>
    9.19 -#include <sys/types.h>
    9.20 -#include <sys/stat.h>
    9.21 -#include <sys/socket.h>
    9.22 -#include <sys/ioctl.h>
    9.23 -#include <netinet/in.h>
    9.24 -#include <netdb.h>
    9.25 -#include <errno.h>
    9.26 -#include "blockstore.h"
    9.27 -
    9.28 -int direct(char *host, u32 op, u64 id, int len) {
    9.29 -    struct sockaddr_in sn, peer;
    9.30 -    int sock;
    9.31 -    bsmsg_t msgbuf;
    9.32 -    int rc, slen;
    9.33 -    struct hostent *addr;
    9.34 -
    9.35 -    addr = gethostbyname(host);
    9.36 -    if (!addr) {
    9.37 -        perror("bad hostname");
    9.38 -        exit(1);
    9.39 -    }
    9.40 -    peer.sin_family = addr->h_addrtype;
    9.41 -    peer.sin_port = htons(BLOCKSTORED_PORT);
    9.42 -    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
    9.43 -    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
    9.44 -            (unsigned int)(unsigned char)addr->h_addr[0],
    9.45 -            (unsigned int)(unsigned char)addr->h_addr[1],
    9.46 -            (unsigned int)(unsigned char)addr->h_addr[2],
    9.47 -            (unsigned int)(unsigned char)addr->h_addr[3]);
    9.48 -
    9.49 -    sock = socket(AF_INET, SOCK_DGRAM, 0);
    9.50 -    if (sock < 0) {
    9.51 -        perror("Bad socket");
    9.52 -        exit(1);
    9.53 -    }
    9.54 -    memset(&sn, 0, sizeof(sn));
    9.55 -    sn.sin_family = AF_INET;
    9.56 -    sn.sin_port = htons(BLOCKSTORED_PORT);
    9.57 -    sn.sin_addr.s_addr = htonl(INADDR_ANY);
    9.58 -    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
    9.59 -        perror("bind");
    9.60 -        close(sock);
    9.61 -        exit(1);
    9.62 -    }
    9.63 -
    9.64 -    memset((void *)&msgbuf, 0, sizeof(msgbuf));
    9.65 -    msgbuf.operation = op;
    9.66 -    msgbuf.id = id;
    9.67 -
    9.68 -    rc = sendto(sock, (void *)&msgbuf, len, 0,
    9.69 -                (struct sockaddr *)&peer, sizeof(peer));
    9.70 -    if (rc < 0) {
    9.71 -        perror("sendto");
    9.72 -        exit(1);
    9.73 -    }
    9.74 -
    9.75 -    slen = sizeof(peer);
    9.76 -    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
    9.77 -                   (struct sockaddr *)&peer, &slen);
    9.78 -    if (len < 0) {
    9.79 -        perror("recvfrom");
    9.80 -        exit(1);
    9.81 -    }
    9.82 -
    9.83 -    printf("Reply %u bytes:\n", len);
    9.84 -    if (len >= MSGBUFSIZE_OP)
    9.85 -        printf("  operation: %u\n", msgbuf.operation);
    9.86 -    if (len >= MSGBUFSIZE_FLAGS)
    9.87 -        printf("  flags: 0x%x\n", msgbuf.flags);
    9.88 -    if (len >= MSGBUFSIZE_ID)
    9.89 -        printf("  id: %llu\n", msgbuf.id);
    9.90 -    if (len >= (MSGBUFSIZE_ID + 4))
    9.91 -        printf("  data: %02x %02x %02x %02x...\n",
    9.92 -               (unsigned int)msgbuf.block[0],
    9.93 -               (unsigned int)msgbuf.block[1],
    9.94 -               (unsigned int)msgbuf.block[2],
    9.95 -               (unsigned int)msgbuf.block[3]);
    9.96 -    
    9.97 -    if (sock > 0)
    9.98 -        close(sock);
    9.99 -   
   9.100 -    return 0;
   9.101 -}
   9.102 -
   9.103 -int main (int argc, char **argv) {
   9.104 -
   9.105 -    u32 op = 0;
   9.106 -    u64 id = 0;
   9.107 -    int len = 0, rc;
   9.108 -    void *block;
   9.109 -
   9.110 -    if (argc < 3) {
   9.111 -        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
   9.112 -        return 1;
   9.113 -    }
   9.114 -
   9.115 -    switch (argv[2][0]) {
   9.116 -    case 'r':
   9.117 -    case 'R':
   9.118 -        op = BSOP_READBLOCK;
   9.119 -        len = MSGBUFSIZE_ID;
   9.120 -        break;
   9.121 -    case 'w':
   9.122 -    case 'W':
   9.123 -        op = BSOP_WRITEBLOCK;
   9.124 -        len = MSGBUFSIZE_BLOCK;
   9.125 -        break;
   9.126 -    case 'a':
   9.127 -    case 'A':
   9.128 -        op = BSOP_ALLOCBLOCK;
   9.129 -        len = MSGBUFSIZE_BLOCK;
   9.130 -        break;
   9.131 -    default:
   9.132 -        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
   9.133 -        return 1;
   9.134 -    }
   9.135 -
   9.136 -    if (argc >= 4)
   9.137 -        id = atoll(argv[3]);
   9.138 -
   9.139 -    if (strcmp(argv[1], "X") == 0) {
   9.140 -        rc = __init_blockstore();
   9.141 -        if (rc < 0) {
   9.142 -            fprintf(stderr, "blockstore init failed.\n");
   9.143 -            return 1;
   9.144 -        }
   9.145 -        switch(op) {
   9.146 -        case BSOP_READBLOCK:
   9.147 -            block = readblock(id);
   9.148 -            if (block) {
   9.149 -                printf("data: %02x %02x %02x %02x...\n",
   9.150 -                       (unsigned int)((unsigned char*)block)[0],
   9.151 -                       (unsigned int)((unsigned char*)block)[1],
   9.152 -                       (unsigned int)((unsigned char*)block)[2],
   9.153 -                       (unsigned int)((unsigned char*)block)[3]);
   9.154 -            }
   9.155 -            break;
   9.156 -        case BSOP_WRITEBLOCK:
   9.157 -            block = malloc(BLOCK_SIZE);
   9.158 -            if (!block) {
   9.159 -                perror("bstest malloc");
   9.160 -                return 1;
   9.161 -            }
   9.162 -            memset(block, 0, BLOCK_SIZE);
   9.163 -            rc = writeblock(id, block);
   9.164 -            if (rc != 0) {
   9.165 -                printf("error\n");
   9.166 -            }
   9.167 -            else {
   9.168 -                printf("OK\n");
   9.169 -            }
   9.170 -            break;
   9.171 -        case BSOP_ALLOCBLOCK:
   9.172 -            block = malloc(BLOCK_SIZE);
   9.173 -            if (!block) {
   9.174 -                perror("bstest malloc");
   9.175 -                return 1;
   9.176 -            }
   9.177 -            memset(block, 0, BLOCK_SIZE);
   9.178 -            id = allocblock_hint(block, id);
   9.179 -            if (id == 0) {
   9.180 -                printf("error\n");
   9.181 -            }
   9.182 -            else {
   9.183 -                printf("ID: %llu\n", id);
   9.184 -            }
   9.185 -            break;
   9.186 -        }
   9.187 -    }
   9.188 -    else {
   9.189 -        direct(argv[1], op, id, len);
   9.190 -    }
   9.191 -
   9.192 -
   9.193 -    return 0;
   9.194 -}
    10.1 --- a/tools/blktap/parallax.c	Sun Jul 03 22:32:52 2005 +0000
    10.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.3 @@ -1,611 +0,0 @@
    10.4 -/**************************************************************************
    10.5 - * 
    10.6 - * parallax.c
    10.7 - *
    10.8 - * The Parallax Storage Server
    10.9 - *
   10.10 - */
   10.11 - 
   10.12 -
   10.13 -#include <stdio.h>
   10.14 -#include <stdlib.h>
   10.15 -#include <string.h>
   10.16 -#include <pthread.h>
   10.17 -#include "blktaplib.h"
   10.18 -#include "blockstore.h"
   10.19 -#include "vdi.h"
   10.20 -#include "block-async.h"
   10.21 -#include "requests-async.h"
   10.22 -
   10.23 -#define PARALLAX_DEV     61440
   10.24 -#define SECTS_PER_NODE   8
   10.25 -
   10.26 -
   10.27 -#if 0
   10.28 -#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   10.29 -#else
   10.30 -#define DPRINTF(_f, _a...) ((void)0)
   10.31 -#endif
   10.32 -
   10.33 -/* ------[ session records ]----------------------------------------------- */
   10.34 -
   10.35 -#define BLKIF_HASHSZ 1024
   10.36 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
   10.37 -
   10.38 -#define VDI_HASHSZ 16
   10.39 -#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
   10.40 -
   10.41 -typedef struct blkif {
   10.42 -    domid_t       domid;
   10.43 -    unsigned int  handle;
   10.44 -    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
   10.45 -    vdi_t        *vdi_hash[VDI_HASHSZ];
   10.46 -    struct blkif *hash_next;
   10.47 -} blkif_t;
   10.48 -
   10.49 -static blkif_t      *blkif_hash[BLKIF_HASHSZ];
   10.50 -
   10.51 -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
   10.52 -{
   10.53 -    if ( handle != 0 )
   10.54 -        printf("blktap/parallax don't currently support non-0 dev handles!\n");
   10.55 -    
   10.56 -    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
   10.57 -    while ( (blkif != NULL) && 
   10.58 -            ((blkif->domid != domid) || (blkif->handle != handle)) )
   10.59 -        blkif = blkif->hash_next;
   10.60 -    return blkif;
   10.61 -}
   10.62 -
   10.63 -vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
   10.64 -{
   10.65 -    vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)];
   10.66 -    
   10.67 -    while ((vdi != NULL) && (vdi->vdevice != device))
   10.68 -        vdi = vdi->next;
   10.69 -    
   10.70 -    return vdi;
   10.71 -}
   10.72 -
   10.73 -/* ------[ control message handling ]-------------------------------------- */
   10.74 -
   10.75 -void blkif_create(blkif_be_create_t *create)
   10.76 -{
   10.77 -    domid_t       domid  = create->domid;
   10.78 -    unsigned int  handle = create->blkif_handle;
   10.79 -    blkif_t     **pblkif, *blkif;
   10.80 -
   10.81 -    DPRINTF("parallax (blkif_create): create is %p\n", create); 
   10.82 -    
   10.83 -    if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL )
   10.84 -    {
   10.85 -        DPRINTF("Could not create blkif: out of memory\n");
   10.86 -        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
   10.87 -        return;
   10.88 -    }
   10.89 -
   10.90 -    memset(blkif, 0, sizeof(*blkif));
   10.91 -    blkif->domid  = domid;
   10.92 -    blkif->handle = handle;
   10.93 -    blkif->status = DISCONNECTED;
   10.94 -
   10.95 -    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
   10.96 -    while ( *pblkif != NULL )
   10.97 -    {
   10.98 -        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
   10.99 -        {
  10.100 -            DPRINTF("Could not create blkif: already exists (%d,%d)\n",
  10.101 -                domid, handle);
  10.102 -            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
  10.103 -            free(blkif);
  10.104 -            return;
  10.105 -        }
  10.106 -        pblkif = &(*pblkif)->hash_next;
  10.107 -    }
  10.108 -
  10.109 -    blkif->hash_next = *pblkif;
  10.110 -    *pblkif = blkif;
  10.111 -
  10.112 -    DPRINTF("Successfully created blkif\n");
  10.113 -    create->status = BLKIF_BE_STATUS_OKAY;
  10.114 -}
  10.115 -
  10.116 -void blkif_destroy(blkif_be_destroy_t *destroy)
  10.117 -{
  10.118 -    domid_t       domid  = destroy->domid;
  10.119 -    unsigned int  handle = destroy->blkif_handle;
  10.120 -    blkif_t     **pblkif, *blkif;
  10.121 -
  10.122 -    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
  10.123 -    
  10.124 -    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
  10.125 -    while ( (blkif = *pblkif) != NULL )
  10.126 -    {
  10.127 -        if ( (blkif->domid == domid) && (blkif->handle == handle) )
  10.128 -        {
  10.129 -            if ( blkif->status != DISCONNECTED )
  10.130 -                goto still_connected;
  10.131 -            goto destroy;
  10.132 -        }
  10.133 -        pblkif = &blkif->hash_next;
  10.134 -    }
  10.135 -
  10.136 -    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  10.137 -    return;
  10.138 -
  10.139 - still_connected:
  10.140 -    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
  10.141 -    return;
  10.142 -
  10.143 - destroy:
  10.144 -    *pblkif = blkif->hash_next;
  10.145 -    free(blkif);
  10.146 -    destroy->status = BLKIF_BE_STATUS_OKAY;
  10.147 -}
  10.148 -
  10.149 -void vbd_create(blkif_be_vbd_create_t *create)
  10.150 -{
  10.151 -    blkif_t            *blkif;
  10.152 -    vdi_t              *vdi, **vdip;
  10.153 -    blkif_vdev_t        vdevice = create->vdevice;
  10.154 -
  10.155 -    DPRINTF("parallax (vbd_create): create=%p\n", create); 
  10.156 -    
  10.157 -    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
  10.158 -    if ( blkif == NULL )
  10.159 -    {
  10.160 -        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
  10.161 -                create->domid, create->blkif_handle); 
  10.162 -        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  10.163 -        return;
  10.164 -    }
  10.165 -
  10.166 -    /* VDI identifier is in grow->extent.sector_start */
  10.167 -    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
  10.168 -            (unsigned long)create->dev_handle);
  10.169 -
  10.170 -    vdi = vdi_get(create->dev_handle);
  10.171 -    if (vdi == NULL)
  10.172 -    {
  10.173 -        printf("parallax (vbd_create): VDI %lx not found.\n",
  10.174 -               (unsigned long)create->dev_handle);
  10.175 -        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
  10.176 -        return;
  10.177 -    }
  10.178 -    
  10.179 -    vdi->next = NULL;
  10.180 -    vdi->vdevice = vdevice;
  10.181 -    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
  10.182 -    while (*vdip != NULL)
  10.183 -        vdip = &(*vdip)->next;
  10.184 -    *vdip = vdi;
  10.185 -    
  10.186 -    DPRINTF("blkif_create succeeded\n"); 
  10.187 -    create->status = BLKIF_BE_STATUS_OKAY;
  10.188 -}
  10.189 -
  10.190 -void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
  10.191 -{
  10.192 -    blkif_t            *blkif;
  10.193 -    vdi_t              *vdi, **vdip;
  10.194 -    blkif_vdev_t        vdevice = destroy->vdevice;
  10.195 -    
  10.196 -    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
  10.197 -    if ( blkif == NULL )
  10.198 -    {
  10.199 -        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
  10.200 -                destroy->domid, destroy->blkif_handle); 
  10.201 -        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  10.202 -        return;
  10.203 -    }
  10.204 -
  10.205 -    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
  10.206 -    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
  10.207 -        vdip = &(*vdip)->next;
  10.208 -
  10.209 -    if (*vdip != NULL) 
  10.210 -    {
  10.211 -        vdi = *vdip;
  10.212 -        *vdip = vdi->next;
  10.213 -        vdi_put(vdi);
  10.214 -    }
  10.215 -        
  10.216 -}
  10.217 -
  10.218 -int parallax_control(control_msg_t *msg)
  10.219 -{
  10.220 -    domid_t  domid;
  10.221 -    int      ret;
  10.222 -
  10.223 -    DPRINTF("parallax_control: msg is %p\n", msg); 
  10.224 -    
  10.225 -    if (msg->type != CMSG_BLKIF_BE) 
  10.226 -    {
  10.227 -        printf("Unexpected control message (%d)\n", msg->type);
  10.228 -        return 0;
  10.229 -    }
  10.230 -    
  10.231 -    switch(msg->subtype)
  10.232 -    {
  10.233 -    case CMSG_BLKIF_BE_CREATE:
  10.234 -        if ( msg->length != sizeof(blkif_be_create_t) )
  10.235 -            goto parse_error;
  10.236 -        blkif_create((blkif_be_create_t *)msg->msg);
  10.237 -        break;   
  10.238 -        
  10.239 -    case CMSG_BLKIF_BE_DESTROY:
  10.240 -        if ( msg->length != sizeof(blkif_be_destroy_t) )
  10.241 -            goto parse_error;
  10.242 -        blkif_destroy((blkif_be_destroy_t *)msg->msg);
  10.243 -        break;  
  10.244 -        
  10.245 -    case CMSG_BLKIF_BE_VBD_CREATE:
  10.246 -        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
  10.247 -            goto parse_error;
  10.248 -        vbd_create((blkif_be_vbd_create_t *)msg->msg);
  10.249 -        break;
  10.250 -        
  10.251 -    case CMSG_BLKIF_BE_VBD_DESTROY:
  10.252 -        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
  10.253 -            goto parse_error;
  10.254 -        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
  10.255 -        break;
  10.256 -
  10.257 -    case CMSG_BLKIF_BE_CONNECT:
  10.258 -    case CMSG_BLKIF_BE_DISCONNECT:
  10.259 -        /* we don't manage the device channel, the tap does. */
  10.260 -        break;
  10.261 -
  10.262 -    default:
  10.263 -        goto parse_error;
  10.264 -    }
  10.265 -    return 0;
  10.266 -parse_error:
  10.267 -    printf("Bad control message!\n");
  10.268 -    return 0;
  10.269 -    
  10.270 -}    
  10.271 -
  10.272 -int parallax_probe(blkif_request_t *req, blkif_t *blkif)
  10.273 -{
  10.274 -    blkif_response_t *rsp;
  10.275 -    vdisk_t *img_info;
  10.276 -    vdi_t *vdi;
  10.277 -    int i, nr_vdis = 0; 
  10.278 -
  10.279 -    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
  10.280 -
  10.281 -    /* We expect one buffer only. */
  10.282 -    if ( req->nr_segments != 1 )
  10.283 -      goto err;
  10.284 -
  10.285 -    /* Make sure the buffer is page-sized. */
  10.286 -    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
  10.287 -       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
  10.288 -      goto err;
  10.289 -
  10.290 -    /* fill the list of devices */
  10.291 -    for (i=0; i<VDI_HASHSZ; i++) {
  10.292 -        vdi = blkif->vdi_hash[i];
  10.293 -        while (vdi) {
  10.294 -            img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
  10.295 -            img_info[nr_vdis].device   = vdi->vdevice;
  10.296 -            img_info[nr_vdis].info     = 0;
  10.297 -            /* The -1 here accounts for the LSB in the radix tree */
  10.298 -            img_info[nr_vdis].capacity = 
  10.299 -                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
  10.300 -            nr_vdis++;
  10.301 -            vdi = vdi->next;
  10.302 -        }
  10.303 -    }
  10.304 -
  10.305 -    
  10.306 -    rsp = (blkif_response_t *)req;
  10.307 -    rsp->id = req->id;
  10.308 -    rsp->operation = BLKIF_OP_PROBE;
  10.309 -    rsp->status = nr_vdis; /* number of disks */
  10.310 -
  10.311 -    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
  10.312 -    return  BLKTAP_RESPOND;
  10.313 -err:
  10.314 -    rsp = (blkif_response_t *)req;
  10.315 -    rsp->id = req->id;
  10.316 -    rsp->operation = BLKIF_OP_PROBE;
  10.317 -    rsp->status = BLKIF_RSP_ERROR;
  10.318 -    
  10.319 -    DPRINTF("parallax_probe: send error response\n"); 
  10.320 -    return BLKTAP_RESPOND;  
  10.321 -}
  10.322 -
  10.323 -typedef struct {
  10.324 -    blkif_request_t *req;
  10.325 -    int              count;
  10.326 -    int              error;
  10.327 -    pthread_mutex_t  mutex;
  10.328 -} pending_t;
  10.329 -
  10.330 -#define MAX_REQUESTS 64
  10.331 -pending_t pending_list[MAX_REQUESTS];
  10.332 -
  10.333 -struct cb_param {
  10.334 -    pending_t *pent;
  10.335 -    int       segment;
  10.336 -    u64       sector; 
  10.337 -    u64       vblock; /* for debug printing -- can be removed. */
  10.338 -};
  10.339 -
  10.340 -static void read_cb(struct io_ret r, void *in_param)
  10.341 -{
  10.342 -    struct cb_param *param = (struct cb_param *)in_param;
  10.343 -    pending_t *p = param->pent;
  10.344 -    int segment = param->segment;
  10.345 -    blkif_request_t *req = p->req;
  10.346 -    unsigned long size, offset, start;
  10.347 -    char *dpage, *spage;
  10.348 -	
  10.349 -    spage  = IO_BLOCK(r);
  10.350 -    if (spage == NULL) { p->error++; goto finish; }
  10.351 -    dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
  10.352 -    
  10.353 -    /* Calculate read size and offset within the read block. */
  10.354 -
  10.355 -    offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
  10.356 -    size = ( blkif_last_sect (req->frame_and_sects[segment]) -
  10.357 -             blkif_first_sect(req->frame_and_sects[segment]) + 1
  10.358 -        ) << SECTOR_SHIFT;
  10.359 -    start = blkif_first_sect(req->frame_and_sects[segment]) 
  10.360 -        << SECTOR_SHIFT;
  10.361 -
  10.362 -    DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
  10.363 -            "vblock %llx, "
  10.364 -            "size %lx\n", 
  10.365 -            param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
  10.366 -            blkif_last_sect (p->req->frame_and_sects[segment]),
  10.367 -            param->vblock, size); 
  10.368 -
  10.369 -    memcpy(dpage + start, spage + offset, size);
  10.370 -    freeblock(spage);
  10.371 -    
  10.372 -    /* Done the read.  Now update the pending record. */
  10.373 - finish:
  10.374 -    pthread_mutex_lock(&p->mutex);
  10.375 -    p->count--;
  10.376 -    
  10.377 -    if (p->count == 0) {
  10.378 -    	blkif_response_t *rsp;
  10.379 -    	
  10.380 -        rsp = (blkif_response_t *)req;
  10.381 -        rsp->id = req->id;
  10.382 -        rsp->operation = BLKIF_OP_READ;
  10.383 -    	if (p->error == 0) {
  10.384 -            rsp->status = BLKIF_RSP_OKAY;
  10.385 -    	} else {
  10.386 -            rsp->status = BLKIF_RSP_ERROR;
  10.387 -    	}
  10.388 -        blktap_inject_response(rsp);       
  10.389 -    }
  10.390 -    
  10.391 -    pthread_mutex_unlock(&p->mutex);
  10.392 -	
  10.393 -    free(param); /* TODO: replace with cached alloc/dealloc */
  10.394 -}	
  10.395 -
  10.396 -int parallax_read(blkif_request_t *req, blkif_t *blkif)
  10.397 -{
  10.398 -    blkif_response_t *rsp;
  10.399 -    u64 vblock, gblock;
  10.400 -    vdi_t *vdi;
  10.401 -    u64 sector;
  10.402 -    int i;
  10.403 -    char *dpage, *spage;
  10.404 -    pending_t *pent;
  10.405 -
  10.406 -    vdi = blkif_get_vdi(blkif, req->device);
  10.407 -    
  10.408 -    if ( vdi == NULL )
  10.409 -        goto err;
  10.410 -        
  10.411 -    pent = &pending_list[ID_TO_IDX(req->id)];
  10.412 -    pent->count = req->nr_segments;
  10.413 -    pent->req = req;
  10.414 -    pthread_mutex_init(&pent->mutex, NULL);
  10.415 -    
  10.416 -    for (i = 0; i < req->nr_segments; i++) {
  10.417 -        pthread_t tid;
  10.418 -        int ret;
  10.419 -        struct cb_param *p;
  10.420 -        
  10.421 -        /* Round the requested segment to a block address. */
  10.422 -        sector  = req->sector_number + (8*i);
  10.423 -        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  10.424 -        
  10.425 -        /* TODO: Replace this call to malloc with a cached allocation */
  10.426 -        p = (struct cb_param *)malloc(sizeof(struct cb_param));
  10.427 -        p->pent = pent;
  10.428 -        p->sector = sector; 
  10.429 -        p->segment = i;     
  10.430 -        p->vblock = vblock; /* dbg */
  10.431 -        
  10.432 -        /* Get that block from the store. */
  10.433 -        vdi_read(vdi, vblock, read_cb, (void *)p);    
  10.434 -    }
  10.435 -    
  10.436 -    return BLKTAP_STOLEN;
  10.437 -
  10.438 -err:
  10.439 -    rsp = (blkif_response_t *)req;
  10.440 -    rsp->id = req->id;
  10.441 -    rsp->operation = BLKIF_OP_READ;
  10.442 -    rsp->status = BLKIF_RSP_ERROR;
  10.443 -    
  10.444 -    return BLKTAP_RESPOND;  
  10.445 -}
  10.446 -
  10.447 -static void write_cb(struct io_ret r, void *in_param)
  10.448 -{
  10.449 -    struct cb_param *param = (struct cb_param *)in_param;
  10.450 -    pending_t *p = param->pent;
  10.451 -    blkif_request_t *req = p->req;
  10.452 -    
  10.453 -    /* catch errors from the block code. */
  10.454 -    if (IO_INT(r) < 0) p->error++;
  10.455 -    
  10.456 -    pthread_mutex_lock(&p->mutex);
  10.457 -    p->count--;
  10.458 -    
  10.459 -    if (p->count == 0) {
  10.460 -    	blkif_response_t *rsp;
  10.461 -    	
  10.462 -        rsp = (blkif_response_t *)req;
  10.463 -        rsp->id = req->id;
  10.464 -        rsp->operation = BLKIF_OP_WRITE;
  10.465 -    	if (p->error == 0) {
  10.466 -            rsp->status = BLKIF_RSP_OKAY;
  10.467 -    	} else {
  10.468 -            rsp->status = BLKIF_RSP_ERROR;
  10.469 -    	}
  10.470 -        blktap_inject_response(rsp);       
  10.471 -    }
  10.472 -    
  10.473 -    pthread_mutex_unlock(&p->mutex);
  10.474 -	
  10.475 -    free(param); /* TODO: replace with cached alloc/dealloc */
  10.476 -}
  10.477 -
  10.478 -int parallax_write(blkif_request_t *req, blkif_t *blkif)
  10.479 -{
  10.480 -    blkif_response_t *rsp;
  10.481 -    u64 sector;
  10.482 -    int i, writable = 0;
  10.483 -    u64 vblock, gblock;
  10.484 -    char *spage;
  10.485 -    unsigned long size, offset, start;
  10.486 -    vdi_t *vdi;
  10.487 -    pending_t *pent;
  10.488 -
  10.489 -    vdi = blkif_get_vdi(blkif, req->device);
  10.490 -    
  10.491 -    if ( vdi == NULL )
  10.492 -        goto err;
  10.493 -        
  10.494 -    pent = &pending_list[ID_TO_IDX(req->id)];
  10.495 -    pent->count = req->nr_segments;
  10.496 -    pent->req = req;
  10.497 -    pthread_mutex_init(&pent->mutex, NULL);
  10.498 -    
  10.499 -    for (i = 0; i < req->nr_segments; i++) {
  10.500 -        struct cb_param *p;
  10.501 -        
  10.502 -        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
  10.503 -        
  10.504 -        /* Round the requested segment to a block address. */
  10.505 -        
  10.506 -        sector  = req->sector_number + (8*i);
  10.507 -        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  10.508 -        
  10.509 -        /* Calculate read size and offset within the read block. */
  10.510 -        
  10.511 -        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
  10.512 -        size = ( blkif_last_sect (req->frame_and_sects[i]) -
  10.513 -                 blkif_first_sect(req->frame_and_sects[i]) + 1
  10.514 -            ) << SECTOR_SHIFT;
  10.515 -        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
  10.516 -
  10.517 -        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
  10.518 -                "vblock %llx, gblock %llx, "
  10.519 -                "size %lx\n", 
  10.520 -                sector, blkif_first_sect(req->frame_and_sects[i]),
  10.521 -                blkif_last_sect (req->frame_and_sects[i]),
  10.522 -                vblock, gblock, size); 
  10.523 -      
  10.524 -        /* XXX: For now we just freak out if they try to write a   */
  10.525 -        /* non block-sized, block-aligned page.                    */
  10.526 -        
  10.527 -        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
  10.528 -            printf("]\n] STRANGE WRITE!\n]\n");
  10.529 -            goto err;
  10.530 -        }
  10.531 -        
  10.532 -        /* TODO: Replace this call to malloc with a cached allocation */
  10.533 -        p = (struct cb_param *)malloc(sizeof(struct cb_param));
  10.534 -        p->pent = pent;
  10.535 -        p->sector = sector; 
  10.536 -        p->segment = i;     
  10.537 -        p->vblock = vblock; /* dbg */
  10.538 -        
  10.539 -        /* Issue the write to the store. */
  10.540 -        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
  10.541 -    }
  10.542 -
  10.543 -    return BLKTAP_STOLEN;
  10.544 -
  10.545 -err:
  10.546 -    rsp = (blkif_response_t *)req;
  10.547 -    rsp->id = req->id;
  10.548 -    rsp->operation = BLKIF_OP_WRITE;
  10.549 -    rsp->status = BLKIF_RSP_ERROR;
  10.550 -    
  10.551 -    return BLKTAP_RESPOND;  
  10.552 -}
  10.553 -
  10.554 -int parallax_request(blkif_request_t *req)
  10.555 -{
  10.556 -    blkif_response_t *rsp;
  10.557 -    domid_t  dom   = ID_TO_DOM(req->id);
  10.558 -    blkif_t *blkif = blkif_find_by_handle(dom, 0);
  10.559 -    
  10.560 -    if (blkif == NULL)
  10.561 -        goto err;
  10.562 -    
  10.563 -    if ( req->operation == BLKIF_OP_PROBE ) {
  10.564 -        
  10.565 -        return parallax_probe(req, blkif);
  10.566 -        
  10.567 -    } else if ( req->operation == BLKIF_OP_READ ) {
  10.568 -        
  10.569 -        return parallax_read(req, blkif);
  10.570 -        
  10.571 -    } else if ( req->operation == BLKIF_OP_WRITE ) {
  10.572 -        
  10.573 -        return parallax_write(req, blkif);
  10.574 -        
  10.575 -    } else {
  10.576 -        printf("Unknown request message type!\n");
  10.577 -        /* Unknown operation */
  10.578 -        goto err;
  10.579 -    }
  10.580 -    
  10.581 -err:
  10.582 -    rsp = (blkif_response_t *)req;
  10.583 -    rsp->operation = req->operation;
  10.584 -    rsp->id = req->id;
  10.585 -    rsp->status = BLKIF_RSP_ERROR;
  10.586 -    return BLKTAP_RESPOND;  
  10.587 -}
  10.588 -
  10.589 -void __init_parallax(void) 
  10.590 -{
  10.591 -    memset(blkif_hash, 0, sizeof(blkif_hash));
  10.592 -}
  10.593 -
  10.594 -
  10.595 -
  10.596 -int main(int argc, char *argv[])
  10.597 -{
  10.598 -    DPRINTF("parallax: starting.\n"); 
  10.599 -    __init_blockstore();
  10.600 -    DPRINTF("parallax: initialized blockstore...\n"); 
  10.601 -    init_block_async();
  10.602 -    DPRINTF("parallax: initialized async blocks...\n"); 
  10.603 -    __init_vdi();
  10.604 -    DPRINTF("parallax: initialized vdi registry etc...\n"); 
  10.605 -    __init_parallax();
  10.606 -    DPRINTF("parallax: initialized local stuff..\n"); 
  10.607 -
  10.608 -    blktap_register_ctrl_hook("parallax_control", parallax_control);
  10.609 -    blktap_register_request_hook("parallax_request", parallax_request);
  10.610 -    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); 
  10.611 -    blktap_listen();
  10.612 -    
  10.613 -    return 0;
  10.614 -}
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/tools/blktap/parallax/README	Sun Jul 03 22:36:48 2005 +0000
    11.3 @@ -0,0 +1,177 @@
    11.4 +Parallax Quick Overview
    11.5 +March 3, 2005
    11.6 +
    11.7 +This is intended to provide a quick set of instructions to let you
    11.8 +guys play with the current parallax source.  In its current form, the
    11.9 +code will let you run an arbitrary number of VMs off of a single disk
   11.10 +image, doing copy-on-write as they make updates.  Each domain is
   11.11 +assigned a virtual disk image (VDI), which may be based on a snapshot
   11.12 +of an existing image.  All of the VDI and snapshot management should
   11.13 +currently work.
   11.14 +
   11.15 +The current implementation uses a single file as a blockstore for
   11.16 +_everything_; this will soon be replaced by the fancier backend code
   11.17 +and the local cache.  As it stands, Parallax will create
   11.18 +"blockstore.dat" in the directory that you run it from, and use
   11.19 +largefile support to make this grow to unfathomable girth.  So, you
   11.20 +probably want to run the daemon off of a local disk, with a lot of
   11.21 +free space.
   11.22 +
   11.23 +Here's how to get going:
   11.24 +
   11.25 +0. Setup:
   11.26 +---------
   11.27 +
   11.28 +Pick a local directory on a disk with lots of room.  You should be
   11.29 +running from a privileged domain (e.g. dom0) with the blocktap
   11.30 +configured in and the block backend NOT configured in.
   11.31 +
   11.32 +For convenience (for the moment) copy all of the vdi tools (vdi_*) and
   11.33 +the parallax daemon from tools/blktap/parallax into this directory.
   11.34 +
   11.35 +1. Populate the blockstore:
   11.36 +---------------------------
   11.37 +
   11.38 +First you need to put at least one image into the blockstore.  You
   11.39 +will need a disk image, either as a file or local partition.  My
   11.40 +general approach has been to
   11.41 +
   11.42 +(a) make a really big sparse file with 
   11.43 +
   11.44 +        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
   11.45 +
   11.46 +(b) put a filesystem into it
   11.47 +
   11.48 +        mkfs.ext3 ./image
   11.49 +
   11.50 +(c) mount it using loopback
   11.51 +
   11.52 +        mkdir ./mnt
   11.53 +        mount -o loop ./image ./mnt
   11.54 +
   11.55 +(d) cd into it and untar one of the image files from srg-roots.
   11.56 +
   11.57 +        cd mnt
   11.58 +        tar ...
   11.59 +
   11.60 +NOTE: Beware if your system is FC3.  The FC3 mkfs is not compatible with
   11.61 +old versions of Fedora, so if you have used the FC3 version of mkfs you
   11.62 +have little choice but to install FC3 images from then on.
   11.63 +
   11.64 +(e) unmount the image
   11.65 +
   11.66 +        cd ..
   11.67 +        umount mnt
   11.68 +
   11.69 +(f) now, create a new VDI to hold the image 
   11.70 +
   11.71 +        ./vdi_create "My new FC3 VDI"
   11.72 +
   11.73 +(g) get the id of the new VDI.
   11.74 +
   11.75 +        ./vdi_list
   11.76 +
   11.77 +        |      0                     My new FC3 VDI
   11.78 +
   11.79 +(0 is the VDI id... create a few more if you want.)
   11.80 +
   11.81 +(h) hoover your image into the new VDI.
   11.82 +
   11.83 +        ./vdi_fill 0 ./image
   11.84 +
   11.85 +This will pull the entire image into the blockstore and set up a
   11.86 +mapping tree for it for VDI 0.  Passing a device (e.g. /dev/sda3)
   11.87 +should also work, but vdi_fill has NO notion of sparseness yet, so you
   11.88 +are going to pump a block into the store for each block you read.
   11.89 +
   11.90 +vdi_fill will count up until it is done, and you should be ready to
   11.91 +go.  If you want to be anal, you can use vdi_validate to test the VDI
   11.92 +against the original image.
   11.93 +
   11.94 +2. Create some extra VDIs
   11.95 +-------------------------
   11.96 +
   11.97 +VDIs are actually a list of snapshots, and each snapshot is a full
   11.98 +image of mappings.  So, to preserve an immutable copy of a current
   11.99 +VDI, do this:
  11.100 +
  11.101 +(a) Snapshot your new VDI.
  11.102 +
  11.103 +        ./vdi_snap 0
  11.104 +
  11.105 +Snapshotting writes the current radix root to the VDI's snapshot log,
  11.106 +and assigns it a new writable root.
  11.107 +
  11.108 +(b) look at the VDI's snapshot log.
  11.109 +
  11.110 +        ./vdi_snap_list 0
  11.111 +
  11.112 +        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
  11.113 +
  11.114 +The first two columns constitute a snapshot id and represent the
  11.115 +(block, offset) of the snapshot record.  The Date tells you when the
  11.116 +snapshot was made, and 31 is the radix root node of the snapshot.
  11.117 +
  11.118 +(c) Create a new VDI, based on that snapshot, and look at the list.
  11.119 +
  11.120 +        ./vdi_create "FC3 - Copy 1" 16 0
  11.121 +        ./vdi_list
  11.122 +
  11.123 +        |      0                     My new FC3 VDI
  11.124 +        |      1                       FC3 - Copy 1
  11.125 +
  11.126 +NOTE: If you have Graphviz installed on your system, you can use
  11.127 +vdi_tree to generate a postscript of your current set of VDIs and
  11.128 +snapshots.
  11.129 +
  11.130 +
  11.131 +Create as many VDIs as you need for the VMs that you want to run.
  11.132 +
  11.133 +3. Boot some VMs:
  11.134 +-----------------
  11.135 +
  11.136 +Parallax currently uses a hack in xend to pass the VDI id, so you need
  11.137 +to modify the disk line of the VM config that is going to mount it.
  11.138 +
  11.139 +(a) set up your vm config, by using the following disk line:
  11.140 +
  11.141 +        disk = ['parallax:1,sda1,w,0' ]
  11.142 +
  11.143 +This example uses VDI 1 (from vdi_list above), presents it as sda1
  11.144 +(writable), and uses dom 0 as the backend.  If you were running the
  11.145 +daemon (and tap driver) in some domain other than 0, you would change
  11.146 +this last parameter.
  11.147 +
  11.148 +NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
  11.149 +
  11.150 +(b) Run parallax in the backend domain.
  11.151 +
  11.152 +        ./parallax
  11.153 +
  11.154 +(c) create your new domain.
  11.155 +
  11.156 +        xm create ...
  11.157 +
  11.158 +---
  11.159 +
  11.160 +That's pretty much all there is to it at the moment.  Hope this is
  11.161 +clear enough to get you going.  Now, a few serious caveats that will
  11.162 +be sorted out in the almost immediate future:
  11.163 +
  11.164 +WARNINGS:
  11.165 +---------
  11.166 +
  11.167 +1. There is NO locking in the VDI tools at the moment, so I'd avoid
  11.168 +running them in parallel, or more importantly, running them while the
  11.169 +daemon is running.
  11.170 +
  11.171 +2. I doubt that xend will be very happy about restarting if you have
  11.172 +parallax-using domains.  So if it dies while there are active parallax
  11.173 +doms, you may need to reboot.
  11.174 +
  11.175 +3. I've turned off write-in-place.  So at the moment, EVERY block
  11.176 +write is a log append on the blockstore.  I've been having some probs
  11.177 +with the radix tree's marking of writable blocks after snapshots and
  11.178 +will sort this out very soon.
  11.179 +
  11.180 +
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/tools/blktap/parallax/block-async.c	Sun Jul 03 22:36:48 2005 +0000
    12.3 @@ -0,0 +1,393 @@
    12.4 +/* block-async.c
    12.5 + * 
    12.6 + * Asynchronous block wrappers for parallax.
    12.7 + */
    12.8 + 
    12.9 + 
   12.10 +#include <stdio.h>
   12.11 +#include <stdlib.h>
   12.12 +#include <string.h>
   12.13 +#include <pthread.h>
   12.14 +#include "block-async.h"
   12.15 +#include "blockstore.h"
   12.16 +#include "vdi.h"
   12.17 +
   12.18 +
   12.19 +#if 0
   12.20 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   12.21 +#else
   12.22 +#define DPRINTF(_f, _a...) ((void)0)
   12.23 +#endif
   12.24 +
   12.25 +/* We have a queue of outstanding I/O requests implemented as a 
   12.26 + * circular producer-consumer ring with free-running indexes.
   12.27 + * To allow reordering, this ring indirects to indexes in an
   12.28 + * array of pending_io_req structs.
   12.29 + * 
   12.30 + * the block_* calls may either add an entry to this ring and return, 
   12.31 + * or satisfy the request immediately and call the callback directly.
   12.32 + * None of the io calls in parallax should be nested enough to worry 
   12.33 + * about stack problems with this approach.
   12.34 + */
   12.35 +
   12.36 +struct read_args {
   12.37 +    u64 addr;      /* handed to readblock() by the io thread */
   12.38 +};
   12.39 +
   12.40 +struct write_args {
   12.41 +    u64   addr;    /* handed to writeblock() together with block */
   12.42 +    char *block;
   12.43 +};
   12.44 +
   12.45 +struct alloc_args {
   12.46 +    char *block;   /* handed to allocblock() */
   12.47 +};
   12.48 +
   12.49 +struct pending_io_req {
   12.50 +    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
   12.51 +    union {        /* arguments for op; the two *WAKE ops use none */
   12.52 +        struct read_args  r;
   12.53 +        struct write_args w;
   12.54 +        struct alloc_args a;
   12.55 +    } u;
   12.56 +    io_cb_t cb;    /* invoked as cb(result, param) once op has run */
   12.57 +    void *param;
   12.58 +};
   12.59 +
   12.60 +void radix_lock_init(struct radix_lock *r)
   12.61 +{
   12.62 +    int row;
   12.63 +    /* Every row starts unheld: no holders, no waiters, state ANY. */
   12.64 +    pthread_mutex_init(&r->lock, NULL);
   12.65 +    for (row = 1023; row >= 0; row--) {
   12.66 +        r->state[row]   = ANY;
   12.67 +        r->waiters[row] = NULL;
   12.68 +        r->lines[row]   = 0;
   12.69 +    }
   12.70 +}
   12.71 +
   12.72 +/* maximum outstanding I/O requests issued asynchronously */
   12.73 +/* must be a power of 2.*/
   12.74 +#define MAX_PENDING_IO 1024
   12.75 +
   12.76 +/* how many threads to concurrently issue I/O to the disk. */
   12.77 +#define IO_POOL_SIZE   10
   12.78 +
   12.79 +static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
   12.80 +static int pending_io_list[MAX_PENDING_IO];
   12.81 +static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
   12.82 +#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
   12.83 +#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
   12.84 +#define PENDING_IO_ENT(_x) \
   12.85 +	(&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
   12.86 +#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
   12.87 +#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
   12.88 +static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
   12.89 +static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
   12.90 +
   12.91 +static void init_pending_io(void)
   12.92 +{
   12.93 +    int slot = MAX_PENDING_IO;
   12.94 +
   12.95 +    /* identity map: ring position n starts out owning request slot n */
   12.96 +    while (slot-- > 0)
   12.97 +        pending_io_list[slot] = slot;
   12.98 +}
   12.99 +
  12.100 +void block_read(u64 addr, io_cb_t cb, void *param)
  12.101 +{
  12.102 +    struct pending_io_req *slot;
  12.103 +
  12.104 +    /* Queue a read of addr; an io thread later calls cb(result, param). */
  12.105 +    pthread_mutex_lock(&pending_io_lock);
  12.106 +    assert(CAN_PRODUCE_PENDING_IO);
  12.107 +    slot = PENDING_IO_ENT(io_prod);
  12.108 +    io_prod++;
  12.109 +    DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, slot);
  12.110 +    slot->param    = param;
  12.111 +    slot->cb       = cb;
  12.112 +    slot->u.r.addr = addr;
  12.113 +    slot->op       = IO_READ;
  12.114 +    pthread_cond_signal(&pending_io_cond);
  12.115 +    pthread_mutex_unlock(&pending_io_lock);
  12.116 +}
  12.117 +
  12.118 +
  12.119 +void block_write(u64 addr, char *block, io_cb_t cb, void *param)
  12.120 +{
  12.121 +    struct pending_io_req *slot;
  12.122 +
  12.123 +    /* Queue a write of block to addr; cb later receives the int result. */
  12.124 +    pthread_mutex_lock(&pending_io_lock);
  12.125 +    assert(CAN_PRODUCE_PENDING_IO);
  12.126 +    slot = PENDING_IO_ENT(io_prod);
  12.127 +    io_prod++;
  12.128 +    DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, slot);
  12.129 +    slot->param     = param;
  12.130 +    slot->cb        = cb;
  12.131 +    slot->u.w.block = block;
  12.132 +    slot->u.w.addr  = addr;
  12.133 +    slot->op        = IO_WRITE;
  12.134 +    pthread_cond_signal(&pending_io_cond);
  12.135 +    pthread_mutex_unlock(&pending_io_lock);
  12.136 +}
  12.137 +
  12.138 +
  12.139 +void block_alloc(char *block, io_cb_t cb, void *param)
  12.140 +{
  12.141 +    struct pending_io_req *slot;
  12.142 +
  12.143 +    /* Queue an allocblock() of block; cb later receives the new address. */
  12.144 +    pthread_mutex_lock(&pending_io_lock);
  12.145 +    assert(CAN_PRODUCE_PENDING_IO);
  12.146 +    slot = PENDING_IO_ENT(io_prod);
  12.147 +    io_prod++;
  12.148 +    slot->param     = param;
  12.149 +    slot->cb        = cb;
  12.150 +    slot->u.a.block = block;
  12.151 +    slot->op        = IO_ALLOC;
  12.152 +    pthread_cond_signal(&pending_io_cond);
  12.153 +    pthread_mutex_unlock(&pending_io_lock);
  12.154 +}
  12.155 +
  12.156 +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  12.157 +{
  12.158 +    /* Take a shared hold on row now, or queue cb to run once it is granted. */
  12.159 +    struct radix_wait *rw, **tail;
  12.160 +    struct io_ret ret;
  12.161 +
  12.162 +    pthread_mutex_lock(&r->lock);
  12.163 +    if ((r->lines[row] >= 0) && (r->state[row] != STOP)) {
  12.164 +        /* row is not write-held and not marked STOP: grant at once */
  12.165 +        r->lines[row]++;
  12.166 +        r->state[row] = READ;
  12.167 +        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
  12.168 +        pthread_mutex_unlock(&r->lock);
  12.169 +        ret.type = IO_INT_T;
  12.170 +        ret.u.i = 0;
  12.171 +        cb(ret, param);
  12.172 +        return;
  12.173 +    }
  12.174 +    rw = malloc(sizeof(*rw));
  12.175 +    assert(rw != NULL); /* was dereferenced unchecked on allocation failure */
  12.176 +    DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
  12.177 +    rw->type  = RLOCK;
  12.178 +    rw->param = param;
  12.179 +    rw->cb    = cb;
  12.180 +    rw->next  = NULL;
  12.181 +    for (tail = &r->waiters[row]; *tail != NULL; tail = &(*tail)->next)
  12.182 +        ; /* walk to the end so waiters are served in arrival order */
  12.183 +    *tail = rw;
  12.184 +    pthread_mutex_unlock(&r->lock);
  12.185 +}
  12.186 +
  12.187 +
  12.188 +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  12.189 +{
  12.190 +    /* Take the exclusive (write) hold on row, or queue cb until the row
  12.191 +     * drains.  Either way cb later receives an IO_INT result of 0. */
  12.192 +    struct radix_wait *rw, **tail;
  12.193 +    struct io_ret ret;
  12.194 +
  12.195 +    pthread_mutex_lock(&r->lock);
  12.196 +    /* the second check here is redundant -- just here for debugging now. */
  12.197 +    if ((r->state[row] == ANY) && (r->lines[row] == 0)) {
  12.198 +        r->state[row] = STOP;
  12.199 +        r->lines[row] = -1;   /* -1 marks the row as write-held */
  12.200 +        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
  12.201 +        pthread_mutex_unlock(&r->lock);
  12.202 +        ret.type = IO_INT_T;
  12.203 +        ret.u.i = 0;
  12.204 +        cb(ret, param);
  12.205 +        return;
  12.206 +    }
  12.207 +    rw = malloc(sizeof(*rw));
  12.208 +    assert(rw != NULL); /* was dereferenced unchecked on allocation failure */
  12.209 +    DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
  12.210 +    rw->type  = WLOCK;
  12.211 +    rw->param = param;
  12.212 +    rw->cb    = cb;
  12.213 +    rw->next  = NULL;
  12.214 +    for (tail = &r->waiters[row]; *tail != NULL; tail = &(*tail)->next)
  12.215 +        ; /* walk to the end so waiters are served in arrival order */
  12.216 +    *tail = rw;
  12.217 +    pthread_mutex_unlock(&r->lock);
  12.218 +    /* cb runs later, from an io thread, once wake_waiters() grants the row */
  12.219 +}
  12.220 +
  12.221 +/* Hand a free row to its queued waiters.  Called with r->lock held. */
  12.222 +static void wake_waiters(struct radix_lock *r, int row)
  12.223 +{
  12.224 +    struct pending_io_req *req;
  12.225 +    struct radix_wait *rw;
  12.226 +    /* nothing to do while the row is still held, or if nobody is queued */
  12.227 +    if (r->lines[row] != 0) return;
  12.228 +    if (r->waiters[row] == NULL) return; 
  12.229 +    
  12.230 +    if (r->waiters[row]->type == WLOCK) {
  12.231 +        /* head waiter is a writer: it alone gets the row */
  12.232 +        rw = r->waiters[row];
  12.233 +        pthread_mutex_lock(&pending_io_lock);
  12.234 +        assert(CAN_PRODUCE_PENDING_IO);
  12.235 +        /* its callback is run by an io thread via an IO_WWAKE request */
  12.236 +        req = PENDING_IO_ENT(io_prod++);
  12.237 +        req->op    = IO_WWAKE;
  12.238 +        req->cb    = rw->cb;
  12.239 +        req->param = rw->param;
  12.240 +        r->lines[row] = -1; /* write lock the row. */
  12.241 +        r->state[row] = STOP;
  12.242 +        r->waiters[row] = rw->next;
  12.243 +        free(rw);
  12.244 +        pthread_mutex_unlock(&pending_io_lock);
  12.245 +    
  12.246 +    } else /* RLOCK */ {
  12.247 +        /* admit every reader queued ahead of the first writer */
  12.248 +        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
  12.249 +            rw = r->waiters[row];
  12.250 +            pthread_mutex_lock(&pending_io_lock);
  12.251 +            assert(CAN_PRODUCE_PENDING_IO);
  12.252 +            
  12.253 +            req = PENDING_IO_ENT(io_prod++);
  12.254 +            req->op    = IO_RWAKE;
  12.255 +            req->cb    = rw->cb;
  12.256 +            req->param = rw->param;
  12.257 +            r->lines[row]++; /* read lock the row. */
  12.258 +            r->state[row] = READ; 
  12.259 +            r->waiters[row] = rw->next;
  12.260 +            free(rw);
  12.261 +            pthread_mutex_unlock(&pending_io_lock);
  12.262 +        }
  12.263 +        /* a writer is next in line: STOP makes block_rlock() defer new readers */
  12.264 +        if (r->waiters[row] != NULL) /* There is a write queued still */
  12.265 +            r->state[row] = STOP;
  12.266 +    }	
  12.267 +    /* wake an io thread to service what was just queued */
  12.268 +    pthread_mutex_lock(&pending_io_lock);
  12.269 +    pthread_cond_signal(&pending_io_cond);
  12.270 +    pthread_mutex_unlock(&pending_io_lock);
  12.271 +}
  12.272 +
  12.273 +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  12.274 +{
  12.275 +    /* Drop one read hold on row; the last reader out wakes queued waiters. */
  12.276 +    struct io_ret ret = { .type = IO_INT_T, .u = { .i = 0 } };
  12.277 +
  12.278 +    pthread_mutex_lock(&r->lock);
  12.279 +    assert(r->lines[row] > 0); /* try to catch misuse. */
  12.280 +    if (--r->lines[row] == 0) {
  12.281 +        r->state[row] = ANY;
  12.282 +        wake_waiters(r, row);
  12.283 +    }
  12.284 +    pthread_mutex_unlock(&r->lock);
  12.285 +    cb(ret, param); /* ret was previously passed uninitialised */
  12.286 +}
  12.287 +
  12.288 +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
  12.289 +{
  12.290 +    /* Release the write hold on row and pass the row to any queued waiters. */
  12.291 +    struct io_ret ret = { .type = IO_INT_T, .u = { .i = 0 } };
  12.292 +    pthread_mutex_lock(&r->lock);
  12.293 +    assert(r->lines[row] == -1); /* try to catch misuse. */
  12.294 +    r->lines[row] = 0;
  12.295 +    r->state[row] = ANY;
  12.296 +    wake_waiters(r, row);
  12.297 +    pthread_mutex_unlock(&r->lock);
  12.298 +    cb(ret, param); /* ret was previously passed uninitialised */
  12.299 +}
  12.300 +
  12.301 +/* Consumer side: perform one dequeued request, recycle its slot, run its cb. */
  12.302 +static void do_next_io_req(struct pending_io_req *req)
  12.303 +{
  12.304 +    struct io_ret ret;
  12.305 +    io_cb_t cb;
  12.306 +    void *param;
  12.307 +    switch (req->op) {
  12.308 +    case IO_READ:
  12.309 +        ret.type = IO_BLOCK_T;
  12.310 +        ret.u.b  = readblock(req->u.r.addr);
  12.311 +        break;
  12.312 +    case IO_WRITE:
  12.313 +        ret.type = IO_INT_T;
  12.314 +        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
  12.315 +        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
  12.316 +        break;
  12.317 +    case IO_ALLOC:
  12.318 +        ret.type = IO_ADDR_T;
  12.319 +        ret.u.a  = allocblock(req->u.a.block);
  12.320 +        break;
  12.321 +    case IO_RWAKE:
  12.322 +        DPRINTF("WAKE DEFERRED RLOCK!\n");
  12.323 +        ret.type = IO_INT_T;
  12.324 +        ret.u.i  = 0;
  12.325 +        break;
  12.326 +    case IO_WWAKE:
  12.327 +        DPRINTF("WAKE DEFERRED WLOCK!\n");
  12.328 +        ret.type = IO_INT_T;
  12.329 +        ret.u.i  = 0;
  12.330 +        break;
  12.331 +    default:
  12.332 +        DPRINTF("Unknown IO operation on pending list!\n");
  12.333 +        return;
  12.334 +    }
  12.335 +    /* Snapshot cb and param first: once the slot is back on the free list a
  12.336 +     * producer may reuse it and overwrite *req before the callback runs. */
  12.337 +    cb    = req->cb;
  12.338 +    param = req->param;
  12.339 +    assert(cb != NULL);
  12.340 +    pthread_mutex_lock(&pending_io_lock);
  12.341 +    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
  12.342 +    pthread_mutex_unlock(&pending_io_lock);
  12.343 +    cb(ret, param);
  12.344 +}
  12.345 +
  12.346 +/* Worker loop for one I/O pool thread: sleep until the pending ring is
  12.347 + * non-empty, claim the next request, then service it with the ring lock
  12.348 + * released.  Never returns.
  12.349 + *
  12.350 + * param is the heap-allocated int id handed over by start_io_threads().
  12.351 + * Nothing below uses the id, so this thread simply frees it. */
  12.352 +void *io_thread(void *param) 
  12.353 +{
  12.354 +    struct pending_io_req *req;
  12.355 +
  12.356 +    free(param);
  12.357 +
  12.358 +    for (;;) {
  12.359 +        pthread_mutex_lock(&pending_io_lock);
  12.360 +
  12.361 +        /* Re-test in a loop: a wakeup may be spurious, or another worker
  12.362 +         * may already have claimed the entry that triggered the signal.
  12.363 +         * (A second emptiness check that followed this loop could never
  12.364 +         * fire and has been dropped.) */
  12.365 +        while (!CAN_CONSUME_PENDING_IO)
  12.366 +            pthread_cond_wait(&pending_io_cond, &pending_io_lock);
  12.367 +
  12.368 +        req = PENDING_IO_ENT(io_cons++);
  12.369 +        pthread_mutex_unlock(&pending_io_lock);
  12.370 +
  12.371 +        /* runs the operation and its callback without the ring lock held */
  12.372 +        do_next_io_req(req);
  12.373 +    }
  12.374 +}
  12.375 +
  12.376 +static pthread_t io_pool[IO_POOL_SIZE];
  12.377 +/* Start the IO_POOL_SIZE worker threads, giving each a heap-allocated id
  12.378 + * that io_thread() frees.  A failure is reported and that slot skipped. */
  12.379 +void start_io_threads(void)
  12.380 +{
  12.381 +    int i;
  12.382 +    for (i = 0; i < IO_POOL_SIZE; i++) {
  12.383 +        int *id = malloc(sizeof(*id));
  12.384 +        if (id != NULL) *id = i;
  12.385 +        if (id != NULL && pthread_create(&io_pool[i], NULL, io_thread, id) == 0)
  12.386 +            continue;
  12.387 +        printf("Error starting thread %d\n", i);
  12.388 +        free(id); /* the thread never took ownership; free(NULL) is a no-op */
  12.389 +    }
  12.390 +}
  12.391 +
  12.392 +void init_block_async(void)
  12.393 +{
  12.394 +    init_pending_io();   /* fill pending_io_list before any thread runs */
  12.395 +    start_io_threads();  /* then spawn the IO_POOL_SIZE io threads */
  12.396 +}
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/tools/blktap/parallax/block-async.h	Sun Jul 03 22:36:48 2005 +0000
    13.3 @@ -0,0 +1,69 @@
    13.4 +/* block-async.h
    13.5 + * 
    13.6 + * Asynchronous block wrappers for parallax.
    13.7 + */
    13.8 + 
    13.9 +#ifndef _BLOCKASYNC_H_
   13.10 +#define _BLOCKASYNC_H_
   13.11 +
   13.12 +#include <assert.h>
   13.13 +#include <xc.h>
   13.14 +#include "vdi.h"
   13.15 +
   13.16 +struct io_ret
   13.17 +{
   13.18 +    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;  /* selects the live member of u */
   13.19 +    union {
   13.20 +        u64   a;   /* IO_ADDR_T  -- read via IO_ADDR()  */
   13.21 +        char *b;   /* IO_BLOCK_T -- read via IO_BLOCK() */
   13.22 +        int   i;   /* IO_INT_T   -- read via IO_INT()   */
   13.23 +    } u;
   13.24 +};
   13.25 +/* Completion callback: receives the result by value plus the caller's param. */
   13.26 +typedef void (*io_cb_t)(struct io_ret r, void *param);
   13.27 +
   13.28 +/* per-vdi lock structures to make sure requests run in a safe order. */
   13.29 +struct radix_wait {
   13.30 +    enum {RLOCK, WLOCK} type;   /* kind of lock this waiter is queued for */
   13.31 +    io_cb_t  cb;                /* called with param once the lock is granted */
   13.32 +    void    *param;
   13.33 +    struct radix_wait *next;    /* next waiter on the same row's list */
   13.34 +};
   13.35 +
   13.36 +struct radix_lock {
   13.37 +    pthread_mutex_t lock;                   /* taken by the block_*lock() calls around the arrays below */
   13.38 +    int                    lines[1024];     /* per row: reader count, or -1 when write-held */
   13.39 +    struct radix_wait     *waiters[1024];   /* per row: list of deferred lock requests */
   13.40 +    enum {ANY, READ, STOP} state[1024];     /* per row: STOP makes block_rlock() defer */
   13.41 +};
   13.42 +void radix_lock_init(struct radix_lock *r);
   13.43 +
   13.44 +void block_read(u64 addr, io_cb_t cb, void *param);
   13.45 +void block_write(u64 addr, char *block, io_cb_t cb, void *param);
   13.46 +void block_alloc(char *block, io_cb_t cb, void *param);
   13.47 +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
   13.48 +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
   13.49 +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
   13.50 +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
   13.51 +void init_block_async(void);
   13.52 +
   13.53 +static inline u64 IO_ADDR(struct io_ret r)
   13.54 +{
   13.55 +    assert(r.type == IO_ADDR_T);   /* result must be tagged as an address */
   13.56 +    return r.u.a;
   13.57 +}
   13.58 +
   13.59 +static inline char *IO_BLOCK(struct io_ret r)
   13.60 +{
   13.61 +    assert(r.type == IO_BLOCK_T);  /* result must be tagged as a block pointer */
   13.62 +    return r.u.b;
   13.63 +}
   13.64 +
   13.65 +static inline int IO_INT(struct io_ret r)
   13.66 +{
   13.67 +    assert(r.type == IO_INT_T);    /* result must be tagged as an int */
   13.68 +    return r.u.i;
   13.69 +}
   13.70 +
   13.71 +
   13.72 +#endif //_BLOCKASYNC_H_
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/tools/blktap/parallax/blockstore.c	Sun Jul 03 22:36:48 2005 +0000
    14.3 @@ -0,0 +1,1350 @@
    14.4 +/**************************************************************************
    14.5 + * 
    14.6 + * blockstore.c
    14.7 + *
    14.8 + * Simple block store interface
    14.9 + *
   14.10 + */
   14.11 + 
   14.12 +#include <fcntl.h>
   14.13 +#include <unistd.h>
   14.14 +#include <stdio.h>
   14.15 +#include <stdlib.h>
   14.16 +#include <string.h>
   14.17 +#include <sys/types.h>
   14.18 +#include <sys/stat.h>
   14.19 +#include <sys/time.h>
   14.20 +#include <stdarg.h>
   14.21 +#include "blockstore.h"
   14.22 +#include <pthread.h>
   14.23 +
   14.24 +//#define BLOCKSTORE_REMOTE
   14.25 +//#define BSDEBUG
   14.26 +
   14.27 +#define RETRY_TIMEOUT 1000000 /* microseconds */
   14.28 +
   14.29 +/*****************************************************************************
   14.30 + * Debugging
   14.31 + */
   14.32 +#ifdef BSDEBUG
   14.33 +void DB(char *format, ...)   /* printf-style debug trace to stderr */
   14.34 +{
   14.35 +    va_list args;
   14.36 +    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key)); /* prefix: caller's tid_key value */
   14.37 +    va_start(args, format);
   14.38 +    vfprintf(stderr, format, args);
   14.39 +    va_end(args);
   14.40 +}
   14.41 +#else
   14.42 +#define DB(format, ...) (void)0   /* tracing compiled out without BSDEBUG */
   14.43 +#endif
   14.44 +
   14.45 +#ifdef BLOCKSTORE_REMOTE
   14.46 +
   14.47 +#include <sys/socket.h>
   14.48 +#include <sys/ioctl.h>
   14.49 +#include <netinet/in.h>
   14.50 +#include <netdb.h>
   14.51 +
   14.52 +/*****************************************************************************
   14.53 + * Network state                                                             *
   14.54 + *****************************************************************************/
   14.55 +
   14.56 +/* The individual disk servers we talk to. These will be referenced by
   14.57 + * an integer index into bsservers[].
   14.58 + */
   14.59 +bsserver_t bsservers[MAX_SERVERS];
   14.60 +
   14.61 +/* The cluster map. This is indexed by an integer cluster number.
   14.62 + */
   14.63 +bscluster_t bsclusters[MAX_CLUSTERS];
   14.64 +
   14.65 +/* Local socket.
   14.66 + */
   14.67 +struct sockaddr_in sin_local;
   14.68 +int bssock = 0;
   14.69 +
   14.70 +/*****************************************************************************
   14.71 + * Notification                                                              *
   14.72 + *****************************************************************************/
   14.73 +
   14.74 +typedef struct pool_thread_t_struct {
   14.75 +    pthread_mutex_t ptmutex;
   14.76 +    pthread_cond_t ptcv;
   14.77 +    int newdata;
   14.78 +} pool_thread_t;
   14.79 +
   14.80 +pool_thread_t pool_thread[READ_POOL_SIZE+1];
   14.81 +
   14.82 +#define RECV_NOTIFY(tid) { \
   14.83 +    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
   14.84 +    pool_thread[tid].newdata = 1; \
   14.85 +    DB("CV Waking %u", tid); \
   14.86 +    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
   14.87 +    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
   14.88 +#define RECV_AWAIT(tid) { \
   14.89 +    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
   14.90 +    if (pool_thread[tid].newdata) { \
   14.91 +        pool_thread[tid].newdata = 0; \
   14.92 +        DB("CV Woken %u", tid); \
   14.93 +    } \
   14.94 +    else { \
   14.95 +        DB("CV Waiting %u", tid); \
   14.96 +        pthread_cond_wait(&(pool_thread[tid].ptcv), \
   14.97 +                          &(pool_thread[tid].ptmutex)); \
   14.98 +    } \
   14.99 +    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
  14.100 +
  14.101 +/*****************************************************************************
  14.102 + * Message queue management                                                  *
  14.103 + *****************************************************************************/
  14.104 +
  14.105 +/* Protects the queue manipulation critical regions.
  14.106 + */
  14.107 +pthread_mutex_t ptmutex_queue;
  14.108 +#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
  14.109 +#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
  14.110 +
  14.111 +pthread_mutex_t ptmutex_recv;
  14.112 +#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
  14.113 +#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
  14.114 +
  14.115 +/* A message queue entry. We allocate one of these for every request we send.
  14.116 + * Asynchronous reply reception also uses one of these.
  14.117 + */
  14.118 +typedef struct bsq_t_struct {
  14.119 +    struct bsq_t_struct *prev;   /* queue links; see enqueue()/dequeue() */
  14.120 +    struct bsq_t_struct *next;
  14.121 +    int status;                  /* BSQ_STATUS_* bits */
  14.122 +    int server;                  /* index into bsservers[] */
  14.123 +    int length;
  14.124 +    struct msghdr msghdr;        /* filled in by send_message()/recv_message() */
  14.125 +    struct iovec iov[2];         /* [0] = message header, [1] = optional block */
  14.126 +    int tid;                     /* sender's tid_key value, set by send_message() */
  14.127 +    struct timeval tv_sent;      /* gettimeofday() stamp taken by send_message() */
  14.128 +    bshdr_t message;
  14.129 +    void *block;                 /* BLOCK_SIZE payload, or NULL for header-only */
  14.130 +} bsq_t;
  14.131 +
  14.132 +#define BSQ_STATUS_MATCHED 1
  14.133 +
  14.134 +pthread_mutex_t ptmutex_luid;
  14.135 +#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
  14.136 +#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
  14.137 +
  14.138 +static u64 luid_cnt = 0x1000ULL;   /* next id to hand out */
  14.139 +u64 new_luid(void) {
  14.140 +    u64 luid;
  14.141 +    ENTER_LUID_CR;                 /* serialise the read-and-increment of luid_cnt */
  14.142 +    luid = luid_cnt++;
  14.143 +    LEAVE_LUID_CR;
  14.144 +    return luid;                   /* the value luid_cnt held before the increment */
  14.145 +}
  14.146 +
  14.147 +/* Queue of outstanding requests.
  14.148 + */
  14.149 +bsq_t *bs_head = NULL;
  14.150 +bsq_t *bs_tail = NULL;
  14.151 +int bs_qlen = 0;
  14.152 +
  14.153 +/* Dump the outstanding-request queue (length, then one line per entry)
  14.154 + * to stderr, tagged with msg.  Takes the queue lock itself. */
  14.155 +void queuedebug(char *msg) {
  14.156 +    bsq_t *q;
  14.157 +    ENTER_QUEUE_CR;
  14.158 +    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
  14.159 +    for (q = bs_head; q; q = q->next) {
  14.160 +        fprintf(stderr, "  luid=%016llx server=%u\n",
  14.161 +                q->message.luid, q->server);
  14.162 +    }
  14.163 +    LEAVE_QUEUE_CR;
  14.164 +}
  14.165 +
  14.166 +int enqueue(bsq_t *qe) {         /* append qe to the outstanding-request queue */
  14.167 +    ENTER_QUEUE_CR;
  14.168 +    qe->prev = bs_tail;          /* link qe in behind the current tail */
  14.169 +    qe->next = NULL;
  14.170 +    if (bs_head != NULL)
  14.171 +        bs_tail->next = qe;
  14.172 +    else
  14.173 +        bs_head = qe;            /* queue was empty: qe is also the head */
  14.174 +    bs_tail = qe;
  14.175 +    bs_qlen += 1;
  14.176 +    LEAVE_QUEUE_CR;
  14.177 +#ifdef BSDEBUG
  14.178 +    queuedebug("enqueue");
  14.179 +#endif
  14.180 +    return 0;                    /* always succeeds */
  14.181 +}
  14.182 +
  14.183 +int dequeue(bsq_t *qe) {
  14.184 +    /* Unlink qe from the outstanding-request queue if it is still on it.
  14.185 +     * Returns 1 if qe was found and removed, 0 if it was not queued. */
  14.186 +    bsq_t *q;
  14.187 +    int found = 0;
  14.188 +
  14.189 +    ENTER_QUEUE_CR;
  14.190 +    for (q = bs_head; q; q = q->next) {
  14.191 +        if (q != qe)
  14.192 +            continue;
  14.193 +        /* unlink q, fixing up head/tail when it sits at either end */
  14.194 +        if (q->prev)
  14.195 +            q->prev->next = q->next;
  14.196 +        else
  14.197 +            bs_head = q->next;
  14.198 +        if (q->next)
  14.199 +            q->next->prev = q->prev;
  14.200 +        else
  14.201 +            bs_tail = q->prev;
  14.202 +        bs_qlen--;
  14.203 +        found = 1;
  14.204 +        break;
  14.205 +    }
  14.206 +    LEAVE_QUEUE_CR;
  14.207 +
  14.208 +#ifdef BSDEBUG
  14.209 +    /* the success path used to log "dequeue not found" as well */
  14.210 +    queuedebug(found ? "dequeue found" : "dequeue not found");
  14.211 +#endif
  14.212 +    return found;
  14.213 +}
  14.214 +
  14.215 +bsq_t *queuesearch(bsq_t *qe) {
  14.216 +    /* Find the queued entry whose (server, operation, luid) equal qe's.
  14.217 +     * On a match qe's length, flags and id are copied into it -- and, for
  14.218 +     * a BSOP_READBLOCK entry with no error flag, qe's block is handed
  14.219 +     * over -- then the entry is unlinked and returned.  Returns NULL when
  14.220 +     * nothing matches.  (qe is presumably a received reply; confirm.) */
  14.221 +    bsq_t *req;
  14.222 +
  14.223 +    ENTER_QUEUE_CR;
  14.224 +    for (req = bs_head; req != NULL; req = req->next) {
  14.225 +        if (qe->server != req->server)
  14.226 +            continue;
  14.227 +        if (qe->message.operation != req->message.operation)
  14.228 +            continue;
  14.229 +        if (qe->message.luid != req->message.luid)
  14.230 +            continue;
  14.231 +
  14.232 +        if ((req->message.operation == BSOP_READBLOCK) &&
  14.233 +            !(req->message.flags & BSOP_FLAG_ERROR)) {
  14.234 +            req->block = qe->block;   /* matched entry takes over the buffer */
  14.235 +            qe->block = NULL;
  14.236 +        }
  14.237 +        req->length = qe->length;
  14.238 +        req->message.flags = qe->message.flags;
  14.239 +        req->message.id = qe->message.id;
  14.240 +        req->status |= BSQ_STATUS_MATCHED;
  14.241 +
  14.242 +        if (req->prev != NULL)
  14.243 +            req->prev->next = req->next;
  14.244 +        else
  14.245 +            bs_head = req->next;
  14.246 +        if (req->next != NULL)
  14.247 +            req->next->prev = req->prev;
  14.248 +        else
  14.249 +            bs_tail = req->prev;
  14.250 +        req->next = NULL;
  14.251 +        req->prev = NULL;
  14.252 +        bs_qlen--;
  14.253 +        break;
  14.254 +    }
  14.255 +    LEAVE_QUEUE_CR;
  14.256 +#ifdef BSDEBUG
  14.257 +    queuedebug(req ? "queuesearch found" : "queuesearch not found");
  14.258 +#endif
  14.259 +    return req;
  14.260 +}
  14.261 +
  14.262 +/*****************************************************************************
  14.263 + * Network communication                                                     *
  14.264 + *****************************************************************************/
  14.265 +
  14.266 +int send_message(bsq_t *qe) {
  14.267 +    int rc;
  14.268 +    /* Queue qe under a fresh luid and send it; returns sendmsg()'s result. */
  14.269 +    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
  14.270 +    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
  14.271 +    qe->msghdr.msg_iov = qe->iov;
  14.272 +    if (qe->block)
  14.273 +        qe->msghdr.msg_iovlen = 2;
  14.274 +    else
  14.275 +        qe->msghdr.msg_iovlen = 1;
  14.276 +    qe->msghdr.msg_control = NULL;
  14.277 +    qe->msghdr.msg_controllen = 0;
  14.278 +    qe->msghdr.msg_flags = 0;
  14.279 +    /* iov[0] always carries the header; iov[1] the block, when present */
  14.280 +    qe->iov[0].iov_base = (void *)&(qe->message);
  14.281 +    qe->iov[0].iov_len = MSGBUFSIZE_ID;
  14.282 +
  14.283 +    if (qe->block) {
  14.284 +        qe->iov[1].iov_base = qe->block;
  14.285 +        qe->iov[1].iov_len = BLOCK_SIZE;
  14.286 +    }
  14.287 +    /* queuesearch() compares luids, so each send gets a new one */
  14.288 +    qe->message.luid = new_luid();
  14.289 +
  14.290 +    qe->status = 0;
  14.291 +    qe->tid = (int)pthread_getspecific(tid_key);
  14.292 +    if (enqueue(qe) < 0) {
  14.293 +        fprintf(stderr, "Error enqueuing request.\n");
  14.294 +        return -1;
  14.295 +    }
  14.296 +    /* queued before sending -- NOTE(review): qe stays queued if sendmsg fails */
  14.297 +    gettimeofday(&(qe->tv_sent), NULL);
  14.298 +    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
  14.299 +    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
  14.300 +    //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
  14.301 +    //           (struct sockaddr *)&(bsservers[qe->server].sin),
  14.302 +    //           sizeof(struct sockaddr_in));
  14.303 +    if (rc < 0)
  14.304 +        return rc;
  14.305 +
  14.306 +    return rc;
  14.307 +}
  14.308 +
  14.309 +int recv_message(bsq_t *qe) {
  14.310 +    struct sockaddr_in from;
  14.311 +    //int flen = sizeof(from);
  14.312 +    int rc;
  14.313 +    /* Receive one datagram from bssock into qe; returns recvmsg()'s result. */
  14.314 +    qe->msghdr.msg_name = &from;
  14.315 +    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
  14.316 +    qe->msghdr.msg_iov = qe->iov;
  14.317 +    if (qe->block)
  14.318 +        qe->msghdr.msg_iovlen = 2;
  14.319 +    else
  14.320 +        qe->msghdr.msg_iovlen = 1;
  14.321 +    qe->msghdr.msg_control = NULL;
  14.322 +    qe->msghdr.msg_controllen = 0;
  14.323 +    qe->msghdr.msg_flags = 0;
  14.324 +    /* header lands in qe->message; the payload in qe->block if one is set */
  14.325 +    qe->iov[0].iov_base = (void *)&(qe->message);
  14.326 +    qe->iov[0].iov_len = MSGBUFSIZE_ID;
  14.327 +    if (qe->block) {
  14.328 +        qe->iov[1].iov_base = qe->block;
  14.329 +        qe->iov[1].iov_len = BLOCK_SIZE;
  14.330 +    }
  14.331 +
  14.332 +    rc = recvmsg(bssock, &(qe->msghdr), 0);
  14.333 +    /* NOTE(review): msg_name still points at the local 'from' after return */
  14.334 +    //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0,
  14.335 +    //               (struct sockaddr *)&from, &flen);
  14.336 +    return rc;
  14.337 +}
  14.338 +
  14.339 +int get_server_number(struct sockaddr_in *sin) {
  14.340 +    int i;
  14.341 +
  14.342 +#ifdef BSDEBUG2
  14.343 +    fprintf(stderr,
  14.344 +            "get_server_number(%u.%u.%u.%u/%u)\n",
  14.345 +            (unsigned int)sin->sin_addr.s_addr & 0xff,
  14.346 +            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
  14.347 +            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
  14.348 +            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
  14.349 +            (unsigned int)sin->sin_port);
  14.350 +#endif
  14.351 +
  14.352 +    for (i = 0; i < MAX_SERVERS; i++) {
  14.353 +        if (bsservers[i].hostname) {
  14.354 +#ifdef BSDEBUG2
  14.355 +            fprintf(stderr,
  14.356 +                    "get_server_number check %u.%u.%u.%u/%u\n",
  14.357 +                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
  14.358 +                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
  14.359 +                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
  14.360 +                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
  14.361 +                    (unsigned int)bsservers[i].sin.sin_port);
  14.362 +#endif
  14.363 +            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
  14.364 +                (sin->sin_port == bsservers[i].sin.sin_port) &&
  14.365 +                (memcmp((void *)&(sin->sin_addr),
  14.366 +                        (void *)&(bsservers[i].sin.sin_addr),
  14.367 +                        sizeof(struct in_addr)) == 0)) {
  14.368 +                return i;
  14.369 +            }
  14.370 +        }        
  14.371 +    }
  14.372 +
  14.373 +    return -1;
  14.374 +}
  14.375 +
  14.376 +void *rx_buffer = NULL;
  14.377 +bsq_t rx_qe;
  14.378 +bsq_t *recv_any(void) {
  14.379 +    struct sockaddr_in from;
  14.380 +    int rc;
  14.381 +    
  14.382 +    DB("ENTER recv_any\n");
  14.383 +
  14.384 +    rx_qe.msghdr.msg_name = &from;
  14.385 +    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
  14.386 +    rx_qe.msghdr.msg_iov = rx_qe.iov;
  14.387 +    if (!rx_buffer) {
  14.388 +        rx_buffer = malloc(BLOCK_SIZE);
  14.389 +        if (!rx_buffer) {
  14.390 +            perror("recv_any malloc");
  14.391 +            return NULL;
  14.392 +        }
  14.393 +    }
  14.394 +    rx_qe.block = rx_buffer;
  14.395 +    rx_buffer = NULL;
  14.396 +    rx_qe.msghdr.msg_iovlen = 2;
  14.397 +    rx_qe.msghdr.msg_control = NULL;
  14.398 +    rx_qe.msghdr.msg_controllen = 0;
  14.399 +    rx_qe.msghdr.msg_flags = 0;
  14.400 +    
  14.401 +    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
  14.402 +    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
  14.403 +    rx_qe.iov[1].iov_base = rx_qe.block;
  14.404 +    rx_qe.iov[1].iov_len = BLOCK_SIZE;
  14.405 +
  14.406 +    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
  14.407 +    if (rc < 0) {
  14.408 +        perror("recv_any");
  14.409 +        return NULL;
  14.410 +    }
  14.411 +
  14.412 +    rx_qe.length = rc;    
  14.413 +    rx_qe.server = get_server_number(&from);
  14.414 +
  14.415 +    DB("recv_any from %d luid=%016llx len=%u\n",
  14.416 +       rx_qe.server, rx_qe.message.luid, rx_qe.length);
  14.417 +
  14.418 +    return &rx_qe;
  14.419 +}
  14.420 +
  14.421 +void recv_recycle_buffer(bsq_t *q) {
  14.422 +    if (q->block) {
  14.423 +        rx_buffer = q->block;
  14.424 +        q->block = NULL;
  14.425 +    }
  14.426 +}
  14.427 +
  14.428 +// cycle through reading any incoming, searching for a match in the
  14.429 +// queue, until we have all we need.
  14.430 +int wait_recv(bsq_t **reqs, int numreqs) {
  14.431 +    bsq_t *q, *m;
  14.432 +    unsigned int x, i;
  14.433 +    int tid = (int)pthread_getspecific(tid_key);
  14.434 +
  14.435 +    DB("ENTER wait_recv %u\n", numreqs);
  14.436 +
  14.437 +    checkmatch:
  14.438 +    x = 0xffffffff;
  14.439 +    for (i = 0; i < numreqs; i++) {
  14.440 +        x &= reqs[i]->status;
  14.441 +    }
  14.442 +    if ((x & BSQ_STATUS_MATCHED)) {
  14.443 +        DB("LEAVE wait_recv\n");
  14.444 +        return numreqs;
  14.445 +    }
  14.446 +
  14.447 +    RECV_AWAIT(tid);
  14.448 +
  14.449 +    /*
  14.450 +    rxagain:
  14.451 +    ENTER_RECV_CR;
  14.452 +    q = recv_any();
  14.453 +    LEAVE_RECV_CR;
  14.454 +    if (!q)
  14.455 +        return -1;
  14.456 +
  14.457 +    m = queuesearch(q);
  14.458 +    recv_recycle_buffer(q);
  14.459 +    if (!m) {
  14.460 +        fprintf(stderr, "Unmatched RX\n");
  14.461 +        goto rxagain;
  14.462 +    }
  14.463 +    */
  14.464 +
  14.465 +    goto checkmatch;
  14.466 +
  14.467 +}
  14.468 +
  14.469 +/* retry
  14.470 + */
  14.471 +static int retry_count = 0;
  14.472 +int retry(bsq_t *qe)
  14.473 +{
  14.474 +    int rc;
  14.475 +    gettimeofday(&(qe->tv_sent), NULL);
  14.476 +    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
  14.477 +    retry_count++;
  14.478 +    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
  14.479 +    if (rc < 0)
  14.480 +        return rc;
  14.481 +    return 0;
  14.482 +}
  14.483 +
  14.484 +/* queue runner
  14.485 + */
  14.486 +void *queue_runner(void *arg)
  14.487 +{
  14.488 +    for (;;) {
  14.489 +        struct timeval now;
  14.490 +        long long nowus, sus;
  14.491 +        bsq_t *q;
  14.492 +        int r;
  14.493 +
  14.494 +        sleep(1);
  14.495 +
  14.496 +        gettimeofday(&now, NULL);
  14.497 +        nowus = now.tv_usec + now.tv_sec * 1000000;
  14.498 +        ENTER_QUEUE_CR;
  14.499 +        r = retry_count;
  14.500 +        for (q = bs_head; q; q = q->next) {
  14.501 +            sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000;
  14.502 +            if ((nowus - sus) > RETRY_TIMEOUT) {
  14.503 +                if (retry(q) < 0) {
  14.504 +                    fprintf(stderr, "Error on sendmsg retry.\n");
  14.505 +                }
  14.506 +            }
  14.507 +        }
  14.508 +        if (r != retry_count) {
  14.509 +            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
  14.510 +        }
  14.511 +        LEAVE_QUEUE_CR;
  14.512 +    }
  14.513 +}
  14.514 +
  14.515 +/* receive loop
  14.516 + */
  14.517 +void *receive_loop(void *arg)
  14.518 +{
  14.519 +    bsq_t *q, *m;
  14.520 +
  14.521 +    for(;;) {
  14.522 +        q = recv_any();
  14.523 +        if (!q) {
  14.524 +            fprintf(stderr, "recv_any error\n");
  14.525 +        }
  14.526 +        else {
  14.527 +            m = queuesearch(q);
  14.528 +            recv_recycle_buffer(q);
  14.529 +            if (!m) {
  14.530 +                fprintf(stderr, "Unmatched RX\n");
  14.531 +            }
  14.532 +            else {
  14.533 +                DB("RX MATCH");
  14.534 +                RECV_NOTIFY(m->tid);
  14.535 +            }
  14.536 +        }
  14.537 +    }
  14.538 +}
  14.539 +pthread_t pthread_recv;
  14.540 +
  14.541 +/*****************************************************************************
  14.542 + * Reading                                                                   *
  14.543 + *****************************************************************************/
  14.544 +
  14.545 +void *readblock_indiv(int server, u64 id) {
  14.546 +    void *block;
  14.547 +    bsq_t *qe;
  14.548 +    int len, rc;
  14.549 +
  14.550 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  14.551 +    if (!qe) {
  14.552 +        perror("readblock qe malloc");
  14.553 +        return NULL;
  14.554 +    }
  14.555 +    qe->block = NULL;
  14.556 +    
  14.557 +    /*
  14.558 +    qe->block = malloc(BLOCK_SIZE);
  14.559 +    if (!qe->block) {
  14.560 +        perror("readblock qe malloc");
  14.561 +        free((void *)qe);
  14.562 +        return NULL;
  14.563 +    }
  14.564 +    */
  14.565 +
  14.566 +    qe->server = server;
  14.567 +
  14.568 +    qe->message.operation = BSOP_READBLOCK;
  14.569 +    qe->message.flags = 0;
  14.570 +    qe->message.id = id;
  14.571 +    qe->length = MSGBUFSIZE_ID;
  14.572 +
  14.573 +    if (send_message(qe) < 0) {
  14.574 +        perror("readblock sendto");
  14.575 +        goto err;
  14.576 +    }
  14.577 +    
  14.578 +    /*len = recv_message(qe);
  14.579 +    if (len < 0) {
  14.580 +        perror("readblock recv");
  14.581 +        goto err;
  14.582 +    }*/
  14.583 +
  14.584 +    rc = wait_recv(&qe, 1);
  14.585 +    if (rc < 0) {
  14.586 +        perror("readblock recv");
  14.587 +        goto err;
  14.588 +    }
  14.589 +
  14.590 +    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
  14.591 +        fprintf(stderr, "readblock server error\n");
  14.592 +        goto err;
  14.593 +    }
  14.594 +    if (qe->length < MSGBUFSIZE_BLOCK) {
  14.595 +        fprintf(stderr, "readblock recv short (%u)\n", len);
  14.596 +        goto err;
  14.597 +    }
  14.598 +    /* if ((block = malloc(BLOCK_SIZE)) == NULL) {
  14.599 +        perror("readblock malloc");
  14.600 +        goto err;
  14.601 +    }
  14.602 +    memcpy(block, qe->message.block, BLOCK_SIZE);
  14.603 +    */    
  14.604 +    block = qe->block;
  14.605 +
  14.606 +    free((void *)qe);
  14.607 +    return block;
  14.608 +
  14.609 +    err:
  14.610 +    if (qe->block)
  14.611 +        free(qe->block);
  14.612 +    free((void *)qe);
  14.613 +    return NULL;
  14.614 +}
  14.615 +
  14.616 +/**
  14.617 + * readblock: read a block from disk
  14.618 + *   @id: block id to read
  14.619 + *
  14.620 + *   @return: pointer to block, NULL on error
  14.621 + */
  14.622 +void *readblock(u64 id) {
  14.623 +    int map = (int)BSID_MAP(id);
  14.624 +    u64 xid;
  14.625 +    static int i = CLUSTER_MAX_REPLICAS - 1;
  14.626 +    void *block = NULL;
  14.627 +
  14.628 +    /* special case for the "superblock" just use the first block on the
  14.629 +     * first replica. (extend to blocks < 6 for vdi bug)
  14.630 +     */
  14.631 +    if (id < 6) {
  14.632 +        block = readblock_indiv(bsclusters[map].servers[0], id);
  14.633 +        goto out;
  14.634 +    }
  14.635 +
  14.636 +    i++;
  14.637 +    if (i >= CLUSTER_MAX_REPLICAS)
  14.638 +        i = 0;
  14.639 +    switch (i) {
  14.640 +    case 0:
  14.641 +        xid = BSID_REPLICA0(id);
  14.642 +        break;
  14.643 +    case 1:
  14.644 +        xid = BSID_REPLICA1(id);
  14.645 +        break;
  14.646 +    case 2:
  14.647 +        xid = BSID_REPLICA2(id);
  14.648 +        break;
  14.649 +    }
  14.650 +    
  14.651 +    block = readblock_indiv(bsclusters[map].servers[i], xid);
  14.652 +
  14.653 +    out:
  14.654 +#ifdef BSDEBUG
  14.655 +    if (block)
  14.656 +        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  14.657 +                id,
  14.658 +                (unsigned int)((unsigned char *)block)[0],
  14.659 +                (unsigned int)((unsigned char *)block)[1],
  14.660 +                (unsigned int)((unsigned char *)block)[2],
  14.661 +                (unsigned int)((unsigned char *)block)[3],
  14.662 +                (unsigned int)((unsigned char *)block)[4],
  14.663 +                (unsigned int)((unsigned char *)block)[5],
  14.664 +                (unsigned int)((unsigned char *)block)[6],
  14.665 +                (unsigned int)((unsigned char *)block)[7]);
  14.666 +    else
  14.667 +        fprintf(stderr, "READ:  %016llx NULL\n", id);
  14.668 +#endif
  14.669 +    return block;
  14.670 +}
  14.671 +
  14.672 +/*****************************************************************************
  14.673 + * Writing                                                                   *
  14.674 + *****************************************************************************/
  14.675 +
  14.676 +bsq_t *writeblock_indiv(int server, u64 id, void *block) {
  14.677 +
  14.678 +    bsq_t *qe;
  14.679 +    int len;
  14.680 +
  14.681 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  14.682 +    if (!qe) {
  14.683 +        perror("writeblock qe malloc");
  14.684 +        goto err;
  14.685 +    }
  14.686 +    qe->server = server;
  14.687 +
  14.688 +    qe->message.operation = BSOP_WRITEBLOCK;
  14.689 +    qe->message.flags = 0;
  14.690 +    qe->message.id = id;
  14.691 +    //memcpy(qe->message.block, block, BLOCK_SIZE);
  14.692 +    qe->block = block;
  14.693 +    qe->length = MSGBUFSIZE_BLOCK;
  14.694 +
  14.695 +    if (send_message(qe) < 0) {
  14.696 +        perror("writeblock sendto");
  14.697 +        goto err;
  14.698 +    }
  14.699 +
  14.700 +    return qe;
  14.701 +
  14.702 +    err:
  14.703 +    free((void *)qe);
  14.704 +    return NULL;
  14.705 +}
  14.706 +    
  14.707 +
  14.708 +/**
  14.709 + * writeblock: write an existing block to disk
  14.710 + *   @id: block id
  14.711 + *   @block: pointer to block
  14.712 + *
  14.713 + *   @return: zero on success, -1 on failure
  14.714 + */
  14.715 +int writeblock(u64 id, void *block) {
  14.716 +    
  14.717 +    int map = (int)BSID_MAP(id);
  14.718 +    int rep0 = bsclusters[map].servers[0];
  14.719 +    int rep1 = bsclusters[map].servers[1];
  14.720 +    int rep2 = bsclusters[map].servers[2];
  14.721 +    bsq_t *reqs[3];
  14.722 +    int rc;
  14.723 +
  14.724 +    reqs[0] = reqs[1] = reqs[2] = NULL;
  14.725 +
  14.726 +#ifdef BSDEBUG
  14.727 +    fprintf(stderr,
  14.728 +            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  14.729 +            id,
  14.730 +            (unsigned int)((unsigned char *)block)[0],
  14.731 +            (unsigned int)((unsigned char *)block)[1],
  14.732 +            (unsigned int)((unsigned char *)block)[2],
  14.733 +            (unsigned int)((unsigned char *)block)[3],
  14.734 +            (unsigned int)((unsigned char *)block)[4],
  14.735 +            (unsigned int)((unsigned char *)block)[5],
  14.736 +            (unsigned int)((unsigned char *)block)[6],
  14.737 +            (unsigned int)((unsigned char *)block)[7]);
  14.738 +#endif
  14.739 +
  14.740 +    /* special case for the "superblock" just use the first block on the
  14.741 +     * first replica. (extend to blocks < 6 for vdi bug)
  14.742 +     */
  14.743 +    if (id < 6) {
  14.744 +        reqs[0] = writeblock_indiv(rep0, id, block);
  14.745 +        if (!reqs[0])
  14.746 +            return -1;
  14.747 +        rc = wait_recv(reqs, 1);
  14.748 +        return rc;
  14.749 +    }
  14.750 +
  14.751 +    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
  14.752 +    if (!reqs[0])
  14.753 +        goto err;
  14.754 +    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
  14.755 +    if (!reqs[1])
  14.756 +        goto err;
  14.757 +    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
  14.758 +    if (!reqs[2])
  14.759 +        goto err;
  14.760 +
  14.761 +    rc = wait_recv(reqs, 3);
  14.762 +    if (rc < 0) {
  14.763 +        perror("writeblock recv");
  14.764 +        goto err;
  14.765 +    }
  14.766 +    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
  14.767 +        fprintf(stderr, "writeblock server0 error\n");
  14.768 +        goto err;
  14.769 +    }
  14.770 +    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
  14.771 +        fprintf(stderr, "writeblock server1 error\n");
  14.772 +        goto err;
  14.773 +    }
  14.774 +    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
  14.775 +        fprintf(stderr, "writeblock server2 error\n");
  14.776 +        goto err;
  14.777 +    }
  14.778 +
  14.779 +
  14.780 +    free((void *)reqs[0]);
  14.781 +    free((void *)reqs[1]);
  14.782 +    free((void *)reqs[2]);
  14.783 +    return 0;
  14.784 +
  14.785 +    err:
  14.786 +    if (reqs[0]) {
  14.787 +        dequeue(reqs[0]);
  14.788 +        free((void *)reqs[0]);
  14.789 +    }
  14.790 +    if (reqs[1]) {
  14.791 +        dequeue(reqs[1]);
  14.792 +        free((void *)reqs[1]);
  14.793 +    }
  14.794 +    if (reqs[2]) {
  14.795 +        dequeue(reqs[2]);
  14.796 +        free((void *)reqs[2]);
  14.797 +    }
  14.798 +    return -1;
  14.799 +}
  14.800 +
  14.801 +/*****************************************************************************
  14.802 + * Allocation                                                                *
  14.803 + *****************************************************************************/
  14.804 +
  14.805 +/**
  14.806 + * allocblock: write a new block to disk
  14.807 + *   @block: pointer to block
  14.808 + *
  14.809 + *   @return: new id of block on disk
  14.810 + */
  14.811 +u64 allocblock(void *block) {
  14.812 +    return allocblock_hint(block, 0);
  14.813 +}
  14.814 +
  14.815 +bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
  14.816 +    bsq_t *qe;
  14.817 +    int len;
  14.818 +
  14.819 +    qe = (bsq_t *)malloc(sizeof(bsq_t));
  14.820 +    if (!qe) {
  14.821 +        perror("allocblock_hint qe malloc");
  14.822 +        goto err;
  14.823 +    }
  14.824 +    qe->server = server;
  14.825 +
  14.826 +    qe->message.operation = BSOP_ALLOCBLOCK;
  14.827 +    qe->message.flags = 0;
  14.828 +    qe->message.id = hint;
  14.829 +    //memcpy(qe->message.block, block, BLOCK_SIZE);
  14.830 +    qe->block = block;
  14.831 +    qe->length = MSGBUFSIZE_BLOCK;
  14.832 +
  14.833 +    if (send_message(qe) < 0) {
  14.834 +        perror("allocblock_hint sendto");
  14.835 +        goto err;
  14.836 +    }
  14.837 +    
  14.838 +    return qe;
  14.839 +
  14.840 +    err:
  14.841 +    free((void *)qe);
  14.842 +    return NULL;
  14.843 +}
  14.844 +
  14.845 +/**
  14.846 + * allocblock_hint: write a new block to disk
  14.847 + *   @block: pointer to block
  14.848 + *   @hint: allocation hint
  14.849 + *
  14.850 + *   @return: new id of block on disk
  14.851 + */
  14.852 +u64 allocblock_hint(void *block, u64 hint) {
  14.853 +    int map = (int)hint;
  14.854 +    int rep0 = bsclusters[map].servers[0];
  14.855 +    int rep1 = bsclusters[map].servers[1];
  14.856 +    int rep2 = bsclusters[map].servers[2];
  14.857 +    bsq_t *reqs[3];
  14.858 +    int rc;
  14.859 +    u64 id0, id1, id2;
  14.860 +
  14.861 +    reqs[0] = reqs[1] = reqs[2] = NULL;
  14.862 +
  14.863 +    DB("ENTER allocblock\n");
  14.864 +
  14.865 +    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
  14.866 +    if (!reqs[0])
  14.867 +        goto err;
  14.868 +    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
  14.869 +    if (!reqs[1])
  14.870 +        goto err;
  14.871 +    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
  14.872 +    if (!reqs[2])
  14.873 +        goto err;
  14.874 +
  14.875 +    rc = wait_recv(reqs, 3);
  14.876 +    if (rc < 0) {
  14.877 +        perror("allocblock recv");
  14.878 +        goto err;
  14.879 +    }
  14.880 +    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
  14.881 +        fprintf(stderr, "allocblock server0 error\n");
  14.882 +        goto err;
  14.883 +    }
  14.884 +    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
  14.885 +        fprintf(stderr, "allocblock server1 error\n");
  14.886 +        goto err;
  14.887 +    }
  14.888 +    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
  14.889 +        fprintf(stderr, "allocblock server2 error\n");
  14.890 +        goto err;
  14.891 +    }
  14.892 +
  14.893 +    id0 = reqs[0]->message.id;
  14.894 +    id1 = reqs[1]->message.id;
  14.895 +    id2 = reqs[2]->message.id;
  14.896 +
  14.897 +#ifdef BSDEBUG
  14.898 +    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
  14.899 +            BSID(map, id0, id1, id2),
  14.900 +            (unsigned int)((unsigned char *)block)[0],
  14.901 +            (unsigned int)((unsigned char *)block)[1],
  14.902 +            (unsigned int)((unsigned char *)block)[2],
  14.903 +            (unsigned int)((unsigned char *)block)[3],
  14.904 +            (unsigned int)((unsigned char *)block)[4],
  14.905 +            (unsigned int)((unsigned char *)block)[5],
  14.906 +            (unsigned int)((unsigned char *)block)[6],
  14.907 +            (unsigned int)((unsigned char *)block)[7]);
  14.908 +#endif
  14.909 +    
  14.910 +    free((void *)reqs[0]);
  14.911 +    free((void *)reqs[1]);
  14.912 +    free((void *)reqs[2]);
  14.913 +    return BSID(map, id0, id1, id2);
  14.914 +
  14.915 +    err:
  14.916 +    if (reqs[0]) {
  14.917 +        dequeue(reqs[0]);
  14.918 +        free((void *)reqs[0]);
  14.919 +    }
  14.920 +    if (reqs[1]) {
  14.921 +        dequeue(reqs[1]);
  14.922 +        free((void *)reqs[1]);
  14.923 +    }
  14.924 +    if (reqs[2]) {
  14.925 +        dequeue(reqs[2]);
  14.926 +        free((void *)reqs[2]);
  14.927 +    }
  14.928 +    return 0;
  14.929 +}
  14.930 +
  14.931 +#else /* /BLOCKSTORE_REMOTE */
  14.932 +
  14.933 +/*****************************************************************************
  14.934 + * Local storage version                                                     *
  14.935 + *****************************************************************************/
  14.936 + 
  14.937 +/**
  14.938 + * readblock: read a block from disk
  14.939 + *   @id: block id to read
  14.940 + *
  14.941 + *   @return: pointer to block, NULL on error
  14.942 + */
  14.943 +
  14.944 +void *readblock(u64 id) {
  14.945 +    void *block;
  14.946 +    int block_fp;
  14.947 +   
  14.948 +//printf("readblock(%llu)\n", id); 
  14.949 +    block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
  14.950 +
  14.951 +    if (block_fp < 0) {
  14.952 +        perror("open");
  14.953 +        return NULL;
  14.954 +    }
  14.955 +    
  14.956 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  14.957 +        printf ("%Ld ", id);
  14.958 +        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
  14.959 +        perror("readblock lseek");
  14.960 +        goto err;
  14.961 +    }
  14.962 +    if ((block = malloc(BLOCK_SIZE)) == NULL) {
  14.963 +        perror("readblock malloc");
  14.964 +        goto err;
  14.965 +    }
  14.966 +    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  14.967 +        perror("readblock read");
  14.968 +        free(block);
  14.969 +        goto err;
  14.970 +    }
  14.971 +    close(block_fp);
  14.972 +    return block;
  14.973 +    
  14.974 +err:
  14.975 +    close(block_fp);
  14.976 +    return NULL;
  14.977 +}
  14.978 +
  14.979 +/**
  14.980 + * writeblock: write an existing block to disk
  14.981 + *   @id: block id
  14.982 + *   @block: pointer to block
  14.983 + *
  14.984 + *   @return: zero on success, -1 on failure
  14.985 + */
  14.986 +int writeblock(u64 id, void *block) {
  14.987 +    
  14.988 +    int block_fp;
  14.989 +    
  14.990 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  14.991 +
  14.992 +    if (block_fp < 0) {
  14.993 +        perror("open");
  14.994 +        return -1;
  14.995 +    }
  14.996 +
  14.997 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  14.998 +        perror("writeblock lseek");
  14.999 +        goto err;
 14.1000 +    }
 14.1001 +    if (write(block_fp, block, BLOCK_SIZE) < 0) {
 14.1002 +        perror("writeblock write");
 14.1003 +        goto err;
 14.1004 +    }
 14.1005 +    close(block_fp);
 14.1006 +    return 0;
 14.1007 +
 14.1008 +err:
 14.1009 +    close(block_fp);
 14.1010 +    return -1;
 14.1011 +}
 14.1012 +
 14.1013 +/**
 14.1014 + * allocblock: write a new block to disk
 14.1015 + *   @block: pointer to block
 14.1016 + *
 14.1017 + *   @return: new id of block on disk
 14.1018 + */
 14.1019 +
 14.1020 +u64 allocblock(void *block) {
 14.1021 +    u64 lb;
 14.1022 +    off64_t pos;
 14.1023 +    int block_fp;
 14.1024 +    
 14.1025 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
 14.1026 +
 14.1027 +    if (block_fp < 0) {
 14.1028 +        perror("open");
 14.1029 +        return 0;
 14.1030 +    }
 14.1031 +
 14.1032 +    pos = lseek64(block_fp, 0, SEEK_END);
 14.1033 +    if (pos == (off64_t)-1) {
 14.1034 +        perror("allocblock lseek");
 14.1035 +        goto err;
 14.1036 +    }
 14.1037 +    if (pos % BLOCK_SIZE != 0) {
 14.1038 +        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
 14.1039 +        goto err;
 14.1040 +    }
 14.1041 +    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
 14.1042 +        perror("allocblock write");
 14.1043 +        goto err;
 14.1044 +    }
 14.1045 +    lb = pos / BLOCK_SIZE + 1;
 14.1046 +//printf("alloc(%Ld)\n", lb);
 14.1047 +    close(block_fp);
 14.1048 +    return lb;
 14.1049 +    
 14.1050 +err:
 14.1051 +    close(block_fp);
 14.1052 +    return 0;
 14.1053 +    
 14.1054 +}
 14.1055 +
 14.1056 +/**
 14.1057 + * allocblock_hint: write a new block to disk
 14.1058 + *   @block: pointer to block
 14.1059 + *   @hint: allocation hint
 14.1060 + *
 14.1061 + *   @return: new id of block on disk
 14.1062 + */
 14.1063 +u64 allocblock_hint(void *block, u64 hint) {
 14.1064 +    return allocblock(block);
 14.1065 +}
 14.1066 +
 14.1067 +#endif /* BLOCKSTORE_REMOTE */
 14.1068 +
 14.1069 +/*****************************************************************************
 14.1070 + * Memory management                                                         *
 14.1071 + *****************************************************************************/
 14.1072 +
 14.1073 +/**
 14.1074 + * newblock: get a new in-memory block set to zeros
 14.1075 + *
 14.1076 + *   @return: pointer to new block, NULL on error
 14.1077 + */
 14.1078 +void *newblock() {
 14.1079 +    void *block = malloc(BLOCK_SIZE);
 14.1080 +    if (block == NULL) {
 14.1081 +        perror("newblock");
 14.1082 +        return NULL;
 14.1083 +    }
 14.1084 +    memset(block, 0, BLOCK_SIZE);
 14.1085 +    return block;
 14.1086 +}
 14.1087 +
 14.1088 +
 14.1089 +/**
 14.1090 + * freeblock: unallocate an in-memory block
 14.1091 + *   @id: block id (zero if this is only in-memory)
 14.1092 + *   @block: block to be freed
 14.1093 + */
 14.1094 +void freeblock(void *block) {
 14.1095 +    if (block != NULL)
 14.1096 +        free(block);
 14.1097 +}
 14.1098 +
 14.1099 +static freeblock_t *new_freeblock(void)
 14.1100 +{
 14.1101 +    freeblock_t *fb;
 14.1102 +    
 14.1103 +    fb = newblock();
 14.1104 +    
 14.1105 +    if (fb == NULL) return NULL;
 14.1106 +    
 14.1107 +    fb->magic = FREEBLOCK_MAGIC;
 14.1108 +    fb->next  = 0ULL;
 14.1109 +    fb->count = 0ULL;
 14.1110 +    memset(fb->list, 0, sizeof fb->list);
 14.1111 +    
 14.1112 +    return fb;
 14.1113 +}
 14.1114 +
 14.1115 +void releaseblock(u64 id)
 14.1116 +{
 14.1117 +    blockstore_super_t *bs_super;
 14.1118 +    freeblock_t *fl_current;
 14.1119 +    
 14.1120 +    /* get superblock */
 14.1121 +    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
 14.1122 +    
 14.1123 +    /* get freeblock_current */
 14.1124 +    if (bs_super->freelist_current == 0ULL) 
 14.1125 +    {
 14.1126 +        fl_current = new_freeblock();
 14.1127 +        bs_super->freelist_current = allocblock(fl_current);
 14.1128 +        writeblock(BLOCKSTORE_SUPER, bs_super);
 14.1129 +    } else {
 14.1130 +        fl_current = readblock(bs_super->freelist_current);
 14.1131 +    }
 14.1132 +    
 14.1133 +    /* if full, chain to superblock and allocate new current */
 14.1134 +    
 14.1135 +    if (fl_current->count == FREEBLOCK_SIZE) {
 14.1136 +        fl_current->next = bs_super->freelist_full;
 14.1137 +        writeblock(bs_super->freelist_current, fl_current);
 14.1138 +        bs_super->freelist_full = bs_super->freelist_current;
 14.1139 +        freeblock(fl_current);
 14.1140 +        fl_current = new_freeblock();
 14.1141 +        bs_super->freelist_current = allocblock(fl_current);
 14.1142 +        writeblock(BLOCKSTORE_SUPER, bs_super);
 14.1143 +    }
 14.1144 +    
 14.1145 +    /* append id to current */
 14.1146 +    fl_current->list[fl_current->count++] = id;
 14.1147 +    writeblock(bs_super->freelist_current, fl_current);
 14.1148 +    
 14.1149 +    freeblock(fl_current);
 14.1150 +    freeblock(bs_super);
 14.1151 +    
 14.1152 +    
 14.1153 +}
 14.1154 +
 14.1155 +/* freelist debug functions: */
 14.1156 +void freelist_count(int print_each)
 14.1157 +{
 14.1158 +    blockstore_super_t *bs_super;
 14.1159 +    freeblock_t *fb;
 14.1160 +    u64 total = 0, next;
 14.1161 +    
 14.1162 +    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
 14.1163 +    
 14.1164 +    if (bs_super->freelist_current == 0ULL) {
 14.1165 +        printf("freelist is empty!\n");
 14.1166 +        return;
 14.1167 +    }
 14.1168 +    
 14.1169 +    fb = readblock(bs_super->freelist_current);
 14.1170 +    printf("%Ld entires on current.\n", fb->count);
 14.1171 +    total += fb->count;
 14.1172 +    if (print_each == 1)
 14.1173 +    {
 14.1174 +        int i;
 14.1175 +        for (i=0; i< fb->count; i++)
 14.1176 +            printf("  %Ld\n", fb->list[i]);
 14.1177 +    }
 14.1178 +    
 14.1179 +    freeblock(fb);
 14.1180 +    
 14.1181 +    if (bs_super->freelist_full == 0ULL) {
 14.1182 +        printf("freelist_full is empty!\n");
 14.1183 +        return;
 14.1184 +    }
 14.1185 +    
 14.1186 +    next = bs_super->freelist_full;
 14.1187 +    for (;;) {
 14.1188 +        fb = readblock(next);
 14.1189 +        total += fb->count;
 14.1190 +        if (print_each == 1)
 14.1191 +        {
 14.1192 +            int i;
 14.1193 +            for (i=0; i< fb->count; i++)
 14.1194 +                printf("  %Ld\n", fb->list[i]);
 14.1195 +        }
 14.1196 +        next = fb->next;
 14.1197 +        freeblock(fb);
 14.1198 +        if (next == 0ULL) break;
 14.1199 +    }
 14.1200 +    printf("Total of %Ld ids on freelist.\n", total);
 14.1201 +}
 14.1202 +
 14.1203 +/*****************************************************************************
 14.1204 + * Initialisation                                                            *
 14.1205 + *****************************************************************************/
 14.1206 +/* __init_blockstore: one-time setup of the block store; returns 0 on success, -1 on failure. */
 14.1207 +int __init_blockstore(void)
 14.1208 +{
 14.1209 +    int i;
 14.1210 +    blockstore_super_t *bs_super;
 14.1211 +    u64 ret;
 14.1212 +    int block_fp;
 14.1213 +    /* Remote builds set up locks, server addresses, a UDP socket and two threads; local builds create or validate blockstore.dat. */
 14.1214 +#ifdef BLOCKSTORE_REMOTE
 14.1215 +    struct hostent *addr;
 14.1216 +
 14.1217 +    pthread_mutex_init(&ptmutex_queue, NULL);
 14.1218 +    pthread_mutex_init(&ptmutex_luid, NULL);
 14.1219 +    pthread_mutex_init(&ptmutex_recv, NULL);
 14.1220 +    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
 14.1221 +    for (i = 0; i <= READ_POOL_SIZE; i++) { /* NOTE(review): inclusive bound touches READ_POOL_SIZE+1 entries - confirm pool_thread is sized for that */
 14.1222 +        pool_thread[i].newdata = 0;
 14.1223 +        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
 14.1224 +        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
 14.1225 +    }
 14.1226 +    /* Static server table; a NULL hostname marks an unused slot. */
 14.1227 +    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
 14.1228 +    bsservers[1].hostname = "planb.cl.cam.ac.uk";
 14.1229 +    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
 14.1230 +    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
 14.1231 +    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
 14.1232 +    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
 14.1233 +    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
 14.1234 +    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
 14.1235 +    bsservers[8].hostname = NULL;
 14.1236 +    bsservers[9].hostname = NULL;
 14.1237 +    bsservers[10].hostname = NULL;
 14.1238 +    bsservers[11].hostname = NULL;
 14.1239 +    bsservers[12].hostname = NULL;
 14.1240 +    bsservers[13].hostname = NULL;
 14.1241 +    bsservers[14].hostname = NULL;
 14.1242 +    bsservers[15].hostname = NULL;
 14.1243 +    /* Resolve every configured hostname into the UDP address used to reach that server. */
 14.1244 +    for (i = 0; i < MAX_SERVERS; i++) {
 14.1245 +        if (!bsservers[i].hostname)
 14.1246 +            continue;
 14.1247 +        addr = gethostbyname(bsservers[i].hostname);
 14.1248 +        if (!addr) {
 14.1249 +            perror("bad hostname");
 14.1250 +            return -1;
 14.1251 +        }
 14.1252 +        bsservers[i].sin.sin_family = addr->h_addrtype;
 14.1253 +        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
 14.1254 +        bsservers[i].sin.sin_addr.s_addr = 
 14.1255 +            ((struct in_addr *)(addr->h_addr))->s_addr;
 14.1256 +    }
 14.1257 +
 14.1258 +    /* Cluster map: cluster n lists the indices of its three replica servers
 14.1259 +     */
 14.1260 +    bsclusters[0].servers[0] = 0;
 14.1261 +    bsclusters[0].servers[1] = 1;
 14.1262 +    bsclusters[0].servers[2] = 2;
 14.1263 +    bsclusters[1].servers[0] = 1;
 14.1264 +    bsclusters[1].servers[1] = 2;
 14.1265 +    bsclusters[1].servers[2] = 3;
 14.1266 +    bsclusters[2].servers[0] = 2;
 14.1267 +    bsclusters[2].servers[1] = 3;
 14.1268 +    bsclusters[2].servers[2] = 4;
 14.1269 +    bsclusters[3].servers[0] = 3;
 14.1270 +    bsclusters[3].servers[1] = 4;
 14.1271 +    bsclusters[3].servers[2] = 5;
 14.1272 +    bsclusters[4].servers[0] = 4;
 14.1273 +    bsclusters[4].servers[1] = 5;
 14.1274 +    bsclusters[4].servers[2] = 6;
 14.1275 +    bsclusters[5].servers[0] = 5;
 14.1276 +    bsclusters[5].servers[1] = 6;
 14.1277 +    bsclusters[5].servers[2] = 7;
 14.1278 +    bsclusters[6].servers[0] = 6;
 14.1279 +    bsclusters[6].servers[1] = 7;
 14.1280 +    bsclusters[6].servers[2] = 0;
 14.1281 +    bsclusters[7].servers[0] = 7;
 14.1282 +    bsclusters[7].servers[1] = 0;
 14.1283 +    bsclusters[7].servers[2] = 1;
 14.1284 +
 14.1285 +    /* Local socket set up: UDP socket bound to BLOCKSTORED_PORT on all addresses
 14.1286 +     */
 14.1287 +    bssock = socket(AF_INET, SOCK_DGRAM, 0);
 14.1288 +    if (bssock < 0) {
 14.1289 +        perror("Bad socket");
 14.1290 +        return -1;
 14.1291 +    }
 14.1292 +    memset(&sin_local, 0, sizeof(sin_local));
 14.1293 +    sin_local.sin_family = AF_INET;
 14.1294 +    sin_local.sin_port = htons(BLOCKSTORED_PORT);
 14.1295 +    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
 14.1296 +    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
 14.1297 +        perror("bind");
 14.1298 +        close(bssock);
 14.1299 +        return -1;
 14.1300 +    }
 14.1301 +    /* NOTE(review): both threads are created into the same pthread_recv handle and the pthread_create results are not checked. */
 14.1302 +    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
 14.1303 +    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
 14.1304 +
 14.1305 +#else /* /BLOCKSTORE_REMOTE */
 14.1306 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
 14.1307 +
 14.1308 +    if (block_fp < 0) {
 14.1309 +        perror("open");
 14.1310 +        /* Report the failure to the caller; the exit(-1) that used to follow this return was unreachable. */
 14.1311 +        return -1;
 14.1312 +    }
 14.1313 +    /* A zero-length file is a brand new store and gets a fresh superblock; otherwise the existing superblock is checked. */
 14.1314 +    if (lseek(block_fp, 0, SEEK_END) == 0) {
 14.1315 +        bs_super = newblock();
 14.1316 +        bs_super->magic            = BLOCKSTORE_MAGIC;
 14.1317 +        bs_super->freelist_full    = 0LL;
 14.1318 +        bs_super->freelist_current = 0LL;
 14.1319 +        /* presumably stores the superblock as the first block of the store - TODO confirm; the returned id is not used */
 14.1320 +        ret = allocblock(bs_super);
 14.1321 +        
 14.1322 +        freeblock(bs_super);
 14.1323 +    } else {
 14.1324 +        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
 14.1325 +        if (bs_super->magic != BLOCKSTORE_MAGIC)
 14.1326 +        {
 14.1327 +            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
 14.1328 +            exit(-1); /* note: a bad magic terminates the whole process rather than returning -1 */
 14.1329 +        }
 14.1330 +        freeblock(bs_super);
 14.1331 +    }
 14.1332 +        
 14.1333 +    close(block_fp);
 14.1334 +        
 14.1335 +#endif /*  BLOCKSTORE_REMOTE */   
 14.1336 +    return 0;
 14.1337 +}
 14.1338 +/* __exit_blockstore: destroy the locks and condition variables of the remote build; a no-op otherwise. */
 14.1339 +void __exit_blockstore(void)
 14.1340 +{
 14.1341 +    int slot;
 14.1342 +#ifdef BLOCKSTORE_REMOTE
 14.1343 +    pthread_mutex_destroy(&ptmutex_recv);
 14.1344 +    pthread_mutex_destroy(&ptmutex_luid);
 14.1345 +    pthread_mutex_destroy(&ptmutex_queue);
 14.1346 +    /* Teardown of the notify mutex/condvar stays disabled (the ptmutex_notify init is commented out too): */
 14.1347 +    /* pthread_mutex_destroy(&ptmutex_notify); pthread_cond_destroy(&ptcv_notify); */
 14.1348 +    for (slot = 0; slot <= READ_POOL_SIZE; ++slot) {
 14.1349 +        pthread_mutex_destroy(&pool_thread[slot].ptmutex);
 14.1350 +        pthread_cond_destroy(&pool_thread[slot].ptcv);
 14.1351 +    }
 14.1352 +#endif
 14.1353 +}
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/tools/blktap/parallax/blockstore.h	Sun Jul 03 22:36:48 2005 +0000
    15.3 @@ -0,0 +1,134 @@
    15.4 +/**************************************************************************
    15.5 + * 
    15.6 + * blockstore.h
    15.7 + *
    15.8 + * Simple block store interface
    15.9 + *
   15.10 + */
   15.11 + 
   15.12 +#ifndef __BLOCKSTORE_H__
   15.13 +#define __BLOCKSTORE_H__
   15.14 +
   15.15 +#include <netinet/in.h>
   15.16 +#include <xc.h>
   15.17 +
   15.18 +#define BLOCK_SIZE  4096
   15.19 +#define BLOCK_SHIFT   12
   15.20 +#define BLOCK_MASK  0xfffffffffffff000LL
   15.21 +
   15.22 +/* XXX SMH: where is the below supposed to be defined???? */
   15.23 +#ifndef SECTOR_SHIFT 
   15.24 +#define SECTOR_SHIFT   9 
   15.25 +#endif
   15.26 +
   15.27 +#define FREEBLOCK_SIZE  (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))
   15.28 +#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
   15.29 +
    15.30 +typedef struct {                /* one free-list block, as read back with readblock() */
    15.31 +    u64 magic;                  /* presumably FREEBLOCK_MAGIC - TODO confirm against the code that writes it */
    15.32 +    u64 next;                   /* id of the next free-list block; 0 ends the chain */
    15.33 +    u64 count;                  /* number of entries of list[] in use */
    15.34 +    u64 list[FREEBLOCK_SIZE];   /* ids held on the free list */
    15.35 +} freeblock_t; 
   15.36 +
    15.37 +#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL  /* expected value of blockstore_super_t.magic */
    15.38 +#define BLOCKSTORE_SUPER 1ULL                   /* block id under which the superblock is read */
    15.39 +
    15.40 +typedef struct {
    15.41 +    u64 magic;            /* BLOCKSTORE_MAGIC in a valid store */
    15.42 +    u64 freelist_full;    /* id of the head of a chain of free-list blocks; 0 when that chain is empty */
    15.43 +    u64 freelist_current; /* presumably the id of the free-list block currently being filled - TODO confirm */
    15.44 +} blockstore_super_t;
   15.45 +
    15.46 +extern void *newblock();                             /* fresh in-memory block; release it with freeblock() */
    15.47 +extern void *readblock(u64 id);                      /* in-memory copy of block `id`; release it with freeblock() */
    15.48 +extern u64 allocblock(void *block);                  /* store `block` as a new block and return its id */
    15.49 +extern u64 allocblock_hint(void *block, u64 hint);   /* as allocblock, with a caller-supplied hint (its semantics are not visible here) */
    15.50 +extern int writeblock(u64 id, void *block);          /* overwrite existing block `id`; 0 on success */
    15.51 +
    15.52 +/* Add this blockid to a freelist, to be recycled by the allocator. */
    15.53 +extern void releaseblock(u64 id);
    15.54 +
    15.55 +/* this is a memory free() operation for block-sized allocations */
    15.56 +extern void freeblock(void *block);
    15.57 +extern int __init_blockstore(void);                  /* set-up call; a negative return means failure */
    15.58 +
    15.59 +/* debug for freelist. */
    15.60 +void freelist_count(int print_each);
    15.61 +#define ALLOCFAIL (((u64)(-1)))                      /* id value used to signal an allocation failure */
   15.62 +
   15.63 +/* Distribution
   15.64 + */
   15.65 +#define BLOCKSTORED_PORT 9346
   15.66 +
    15.67 +struct bshdr_t_struct {          /* header at the start of every blockstore datagram */
    15.68 +    u32            operation;    /* one of the BSOP_* request codes */
    15.69 +    u32            flags;        /* BSOP_FLAG_* bits; the daemon sets BSOP_FLAG_ERROR on failure */
    15.70 +    u64            id;           /* block id the request refers to, or the id handed back by an alloc */
    15.71 +    u64            luid;         /* not interpreted by blockstored; presumably a client-side request id - TODO confirm */
    15.72 +} __attribute__ ((packed));
    15.73 +typedef struct bshdr_t_struct bshdr_t;
    15.74 +
    15.75 +struct bsmsg_t_struct {          /* full-size message: header followed by one block of payload */
    15.76 +    bshdr_t        hdr;
    15.77 +    unsigned char  block[BLOCK_SIZE];
    15.78 +} __attribute__ ((packed));
    15.79 +
    15.80 +typedef struct bsmsg_t_struct bsmsg_t;
    15.81 +/* Datagram lengths needed to cover successive parts of a message. */
    15.82 +#define MSGBUFSIZE_OP    sizeof(u32)                     /* hdr.operation only */
    15.83 +#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))     /* operation + flags */
    15.84 +#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))  /* the whole header: operation, flags, id and luid */
    15.85 +#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)                 /* header plus block payload */
    15.86 +
    15.87 +#define BSOP_READBLOCK  0x01
    15.88 +#define BSOP_WRITEBLOCK 0x02
    15.89 +#define BSOP_ALLOCBLOCK 0x03
    15.90 +#define BSOP_FREEBLOCK  0x04     /* defined here, but blockstored's service_loop has no case for it */
    15.91 +
    15.92 +#define BSOP_FLAG_ERROR 0x01
    15.93 +/* With BS_ALLOC_HACK defined, blockstored's allocblock keeps appending until the new id reaches BS_ALLOC_SKIP. */
    15.94 +#define BS_ALLOC_SKIP 10
    15.95 +#define BS_ALLOC_HACK
   15.96 +
   15.97 +/* Remote hosts and cluster map - XXX need to generalise
   15.98 + */
   15.99 +
  15.100 +/*
  15.101 +
  15.102 +  Interim ID format is
  15.103 +
  15.104 +  63 60 59                40 39                20 19                 0
  15.105 +  +----+--------------------+--------------------+--------------------+
  15.106 +  |map | replica 2          | replica 1          | replica 0          |
  15.107 +  +----+--------------------+--------------------+--------------------+
  15.108 +
  15.109 +  The map is an index into a table detailing which machines form the
  15.110 +  cluster.
  15.111 +
  15.112 + */
  15.113 +
   15.114 +#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)           /* bits 0-19 */
   15.115 +#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)     /* bits 20-39 */
   15.116 +#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)     /* bits 40-59 */
   15.117 +#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)         /* bits 60-63: cluster map index */
   15.118 +/* Compose an id from its four fields; the arguments are shifted into place but not masked. */
   15.119 +#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
   15.120 +                                         (((u64)(_rep2))<<40) | \
   15.121 +                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
   15.122 +
   15.123 +typedef struct bsserver_t_struct {
   15.124 +    char              *hostname;   /* NULL marks an unused slot */
   15.125 +    struct sockaddr_in sin;        /* address resolved from hostname by __init_blockstore in remote builds */
   15.126 +} bsserver_t;
   15.127 +
   15.128 +#define MAX_SERVERS 16
   15.129 +
   15.130 +#define CLUSTER_MAX_REPLICAS 3
   15.131 +typedef struct bscluster_t_struct {
   15.132 +    int servers[CLUSTER_MAX_REPLICAS];   /* server-table indices, one per replica */
   15.133 +} bscluster_t;
   15.134 +
   15.135 +#define MAX_CLUSTERS 16
  15.136 +
  15.137 +#endif /* __BLOCKSTORE_H__ */
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/tools/blktap/parallax/blockstored.c	Sun Jul 03 22:36:48 2005 +0000
    16.3 @@ -0,0 +1,276 @@
    16.4 +/**************************************************************************
    16.5 + * 
    16.6 + * blockstored.c
    16.7 + *
    16.8 + * Block store daemon.
    16.9 + *
   16.10 + */
   16.11 +
   16.12 +#include <fcntl.h>
   16.13 +#include <unistd.h>
   16.14 +#include <stdio.h>
   16.15 +#include <stdlib.h>
   16.16 +#include <string.h>
   16.17 +#include <sys/types.h>
   16.18 +#include <sys/stat.h>
   16.19 +#include <sys/socket.h>
   16.20 +#include <sys/ioctl.h>
   16.21 +#include <netinet/in.h>
   16.22 +#include <errno.h>
   16.23 +#include "blockstore.h"
   16.24 +
   16.25 +//#define BSDEBUG
   16.26 +
   16.27 +int readblock_into(u64 id, void *block);
   16.28 +
   16.29 +int open_socket(u16 port) {
   16.30 +    
   16.31 +    struct sockaddr_in sn;
   16.32 +    int sock;
   16.33 +
   16.34 +    sock = socket(AF_INET, SOCK_DGRAM, 0);
   16.35 +    if (sock < 0) {
   16.36 +        perror("Bad socket");
   16.37 +        return -1;
   16.38 +    }
   16.39 +    memset(&sn, 0, sizeof(sn));
   16.40 +    sn.sin_family = AF_INET;
   16.41 +    sn.sin_port = htons(port);
   16.42 +    sn.sin_addr.s_addr = htonl(INADDR_ANY);
   16.43 +    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
   16.44 +        perror("bind");
   16.45 +        close(sock);
   16.46 +        return -1;
   16.47 +    }
   16.48 +
   16.49 +    return sock;
   16.50 +}
   16.51 +
   16.52 +static int block_fp = -1;
   16.53 +static int bssock = -1;
   16.54 +
   16.55 +int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
   16.56 +
   16.57 +    int rc;
   16.58 +    
   16.59 +#ifdef BSDEBUG
   16.60 +    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
   16.61 +            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
   16.62 +#endif
   16.63 +    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
   16.64 +    if (rc < 0) {
   16.65 +        perror("send_reply");
   16.66 +        return 1;
   16.67 +    }
   16.68 +
   16.69 +
   16.70 +    return 0;
   16.71 +}
   16.72 +
   16.73 +static bsmsg_t msgbuf;
   16.74 +
   16.75 +void service_loop(void) {
   16.76 +
   16.77 +    for (;;) {
   16.78 +        int rc, len;
   16.79 +        struct sockaddr_in from;
   16.80 +        size_t slen = sizeof(from);
   16.81 +        u64 bid;
   16.82 +
   16.83 +        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
   16.84 +                       (struct sockaddr *)&from, &slen);
   16.85 +
   16.86 +        if (len < 0) {
   16.87 +            perror("recvfrom");
   16.88 +            continue;
   16.89 +        }
   16.90 +
   16.91 +        if (len < MSGBUFSIZE_OP) {
   16.92 +            fprintf(stderr, "Short packet.\n");
   16.93 +            continue;
   16.94 +        }
   16.95 +
   16.96 +#ifdef BSDEBUG
   16.97 +        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
   16.98 +                len, msgbuf.hdr.operation, msgbuf.hdr.id);
   16.99 +#endif
  16.100 +
  16.101 +        switch (msgbuf.hdr.operation) {
  16.102 +        case BSOP_READBLOCK:
  16.103 +            if (len < MSGBUFSIZE_ID) {
  16.104 +                fprintf(stderr, "Short packet (readblock %u).\n", len);
  16.105 +                continue;
  16.106 +            }
  16.107 +            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
  16.108 +            if (rc < 0) {
  16.109 +                fprintf(stderr, "readblock error\n");
  16.110 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  16.111 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  16.112 +                continue;
  16.113 +            }
  16.114 +            msgbuf.hdr.flags = 0;
  16.115 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
  16.116 +            break;
  16.117 +        case BSOP_WRITEBLOCK:
  16.118 +            if (len < MSGBUFSIZE_BLOCK) {
  16.119 +                fprintf(stderr, "Short packet (writeblock %u).\n", len);
  16.120 +                continue;
  16.121 +            }
  16.122 +            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
  16.123 +            if (rc < 0) {
  16.124 +                fprintf(stderr, "writeblock error\n");
  16.125 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  16.126 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  16.127 +                continue;
  16.128 +            }
  16.129 +            msgbuf.hdr.flags = 0;
  16.130 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  16.131 +            break;
  16.132 +        case BSOP_ALLOCBLOCK:
  16.133 +            if (len < MSGBUFSIZE_BLOCK) {
  16.134 +                fprintf(stderr, "Short packet (allocblock %u).\n", len);
  16.135 +                continue;
  16.136 +            }
  16.137 +            bid = allocblock(msgbuf.block);
  16.138 +            if (bid == ALLOCFAIL) {
  16.139 +                fprintf(stderr, "allocblock error\n");
  16.140 +                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
  16.141 +                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  16.142 +                continue;
  16.143 +            }
  16.144 +            msgbuf.hdr.id = bid;
  16.145 +            msgbuf.hdr.flags = 0;
  16.146 +            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
  16.147 +            break;
  16.148 +        }
  16.149 +
  16.150 +    }
  16.151 +}
  16.152 + 
  16.153 +/**
  16.154 + * readblock: read a block from disk
  16.155 + *   @id: block id to read
  16.156 + *   @block: pointer to buffer to receive block
  16.157 + *
  16.158 + *   @return: 0 if OK, other on error
  16.159 + */
  16.160 +
  16.161 +int readblock_into(u64 id, void *block) {
  16.162 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  16.163 +        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
  16.164 +        perror("readblock lseek");
  16.165 +        return -1;
  16.166 +    }
  16.167 +    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
  16.168 +        perror("readblock read");
  16.169 +        return -1;
  16.170 +    }
  16.171 +    return 0;
  16.172 +}
  16.173 +
  16.174 +/**
  16.175 + * writeblock: write an existing block to disk
  16.176 + *   @id: block id
  16.177 + *   @block: pointer to block
  16.178 + *
  16.179 + *   @return: zero on success, -1 on failure
  16.180 + */
  16.181 +int writeblock(u64 id, void *block) {
  16.182 +    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
  16.183 +        perror("writeblock lseek");
  16.184 +        return -1;
  16.185 +    }
  16.186 +    if (write(block_fp, block, BLOCK_SIZE) < 0) {
  16.187 +        perror("writeblock write");
  16.188 +        return -1;
  16.189 +    }
  16.190 +    return 0;
  16.191 +}
  16.192 +
   16.193 +/**
   16.194 + * allocblock: append a new block to the end of the store file
   16.195 + *   @block: pointer to BLOCK_SIZE bytes to write
   16.196 + *
   16.197 + *   @return: id of the new block (file offset / BLOCK_SIZE + 1), or ALLOCFAIL on error
   16.198 + */
   16.199 +static u64 lastblock = 0; /* id returned by the previous successful allocblock call */
   16.200 +
   16.201 +u64 allocblock(void *block) {
   16.202 +    u64 lb;
   16.203 +    off64_t pos;
   16.204 +    /* With BS_ALLOC_HACK the same block is appended again until the resulting id reaches BS_ALLOC_SKIP. */
   16.205 +    retry:
   16.206 +    pos = lseek64(block_fp, 0, SEEK_END);
   16.207 +    if (pos == (off64_t)-1) {
   16.208 +        perror("allocblock lseek");
   16.209 +        return ALLOCFAIL;
   16.210 +    }
   16.211 +    if (pos % BLOCK_SIZE != 0) { /* the file must consist of whole blocks for ids to map to offsets */
   16.212 +        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
   16.213 +        return ALLOCFAIL;
   16.214 +    }
   16.215 +    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   16.216 +        perror("allocblock write");
   16.217 +        return ALLOCFAIL;
   16.218 +    }
   16.219 +    lb = pos / BLOCK_SIZE + 1;
   16.220 +
   16.221 +#ifdef BS_ALLOC_HACK
   16.222 +    if (lb < BS_ALLOC_SKIP)
   16.223 +        goto retry;
   16.224 +#endif
   16.225 +    /* Diagnostic only: warn when the new id is not larger than the previous one. */
   16.226 +    if (lb <= lastblock)
   16.227 +        printf("[*** %Ld alredy allocated! ***]\n", lb);
   16.228 +    
   16.229 +    lastblock = lb;
   16.230 +    return lb;
   16.231 +}
  16.232 +
  16.233 +/**
  16.234 + * newblock: get a new in-memory block set to zeros
  16.235 + *
  16.236 + *   @return: pointer to new block, NULL on error
  16.237 + */
  16.238 +void *newblock() {
  16.239 +    void *block = malloc(BLOCK_SIZE);
  16.240 +    if (block == NULL) {
  16.241 +        perror("newblock");
  16.242 +        return NULL;
  16.243 +    }
  16.244 +    memset(block, 0, BLOCK_SIZE);
  16.245 +    return block;
  16.246 +}
  16.247 +
  16.248 +
  16.249 +/**
  16.250 + * freeblock: unallocate an in-memory block
  16.251 + *   @id: block id (zero if this is only in-memory)
  16.252 + *   @block: block to be freed
  16.253 + */
  16.254 +void freeblock(void *block) {
  16.255 +    if (block != NULL)
  16.256 +        free(block);
  16.257 +}
  16.258 +
  16.259 +
  16.260 +int main(int argc, char **argv)
  16.261 +{
  16.262 +    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  16.263 +
  16.264 +    if (block_fp < 0) {
  16.265 +        perror("open");
  16.266 +        return -1;
  16.267 +    }
  16.268 +
  16.269 +    bssock = open_socket(BLOCKSTORED_PORT);
  16.270 +    if (bssock < 0) {
  16.271 +        return -1;
  16.272 +    }
  16.273 +
  16.274 +    service_loop();
  16.275 +    
  16.276 +    close(bssock);
  16.277 +
  16.278 +    return 0;
  16.279 +}
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/tools/blktap/parallax/bstest.c	Sun Jul 03 22:36:48 2005 +0000
    17.3 @@ -0,0 +1,191 @@
    17.4 +/**************************************************************************
    17.5 + * 
    17.6 + * bstest.c
    17.7 + *
    17.8 + * Block store daemon test program.
    17.9 + *
   17.10 + * usage: bstest <host>|X {r|w|a} ID 
   17.11 + *
   17.12 + */
   17.13 +
   17.14 +#include <fcntl.h>
   17.15 +#include <unistd.h>
   17.16 +#include <stdio.h>
   17.17 +#include <stdlib.h>
   17.18 +#include <string.h>
   17.19 +#include <sys/types.h>
   17.20 +#include <sys/stat.h>
   17.21 +#include <sys/socket.h>
   17.22 +#include <sys/ioctl.h>
   17.23 +#include <netinet/in.h>
   17.24 +#include <netdb.h>
   17.25 +#include <errno.h>
   17.26 +#include "blockstore.h"
   17.27 +
   17.28 +int direct(char *host, u32 op, u64 id, int len) {
   17.29 +    struct sockaddr_in sn, peer;
   17.30 +    int sock;
   17.31 +    bsmsg_t msgbuf;
   17.32 +    int rc, slen;
   17.33 +    struct hostent *addr;
   17.34 +
   17.35 +    addr = gethostbyname(host);
   17.36 +    if (!addr) {
   17.37 +        perror("bad hostname");
   17.38 +        exit(1);
   17.39 +    }
   17.40 +    peer.sin_family = addr->h_addrtype;
   17.41 +    peer.sin_port = htons(BLOCKSTORED_PORT);
   17.42 +    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
   17.43 +    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
   17.44 +            (unsigned int)(unsigned char)addr->h_addr[0],
   17.45 +            (unsigned int)(unsigned char)addr->h_addr[1],
   17.46 +            (unsigned int)(unsigned char)addr->h_addr[2],
   17.47 +            (unsigned int)(unsigned char)addr->h_addr[3]);
   17.48 +
   17.49 +    sock = socket(AF_INET, SOCK_DGRAM, 0);
   17.50 +    if (sock < 0) {
   17.51 +        perror("Bad socket");
   17.52 +        exit(1);
   17.53 +    }
   17.54 +    memset(&sn, 0, sizeof(sn));
   17.55 +    sn.sin_family = AF_INET;
   17.56 +    sn.sin_port = htons(BLOCKSTORED_PORT);
   17.57 +    sn.sin_addr.s_addr = htonl(INADDR_ANY);
   17.58 +    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
   17.59 +        perror("bind");
   17.60 +        close(sock);
   17.61 +        exit(1);
   17.62 +    }
   17.63 +
   17.64 +    memset((void *)&msgbuf, 0, sizeof(msgbuf));
   17.65 +    msgbuf.operation = op;
   17.66 +    msgbuf.id = id;
   17.67 +
   17.68 +    rc = sendto(sock, (void *)&msgbuf, len, 0,
   17.69 +                (struct sockaddr *)&peer, sizeof(peer));
   17.70 +    if (rc < 0) {
   17.71 +        perror("sendto");
   17.72 +        exit(1);
   17.73 +    }
   17.74 +
   17.75 +    slen = sizeof(peer);
   17.76 +    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
   17.77 +                   (struct sockaddr *)&peer, &slen);
   17.78 +    if (len < 0) {
   17.79 +        perror("recvfrom");
   17.80 +        exit(1);
   17.81 +    }
   17.82 +
   17.83 +    printf("Reply %u bytes:\n", len);
   17.84 +    if (len >= MSGBUFSIZE_OP)
   17.85 +        printf("  operation: %u\n", msgbuf.operation);
   17.86 +    if (len >= MSGBUFSIZE_FLAGS)
   17.87 +        printf("  flags: 0x%x\n", msgbuf.flags);
   17.88 +    if (len >= MSGBUFSIZE_ID)
   17.89 +        printf("  id: %llu\n", msgbuf.id);
   17.90 +    if (len >= (MSGBUFSIZE_ID + 4))
   17.91 +        printf("  data: %02x %02x %02x %02x...\n",
   17.92 +               (unsigned int)msgbuf.block[0],
   17.93 +               (unsigned int)msgbuf.block[1],
   17.94 +               (unsigned int)msgbuf.block[2],
   17.95 +               (unsigned int)msgbuf.block[3]);
   17.96 +    
   17.97 +    if (sock > 0)
   17.98 +        close(sock);
   17.99 +   
  17.100 +    return 0;
  17.101 +}
  17.102 +
   17.103 +int main (int argc, char **argv) {
   17.104 +    /* bstest entry point: argv[1] is a host name or "X" (call the blockstore API in-process), argv[2] the action letter, optional argv[3] the block id. */
   17.105 +    u32 op = 0;
   17.106 +    u64 id = 0;
   17.107 +    int len = 0, rc;
   17.108 +    void *block;
   17.109 +
   17.110 +    if (argc < 3) {
   17.111 +        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
   17.112 +        return 1;
   17.113 +    }
   17.114 +    /* Map the action letter to a BSOP_* code and to the request length that direct() will send. */
   17.115 +    switch (argv[2][0]) {
   17.116 +    case 'r':
   17.117 +    case 'R':
   17.118 +        op = BSOP_READBLOCK;
   17.119 +        len = MSGBUFSIZE_ID;
   17.120 +        break;
   17.121 +    case 'w':
   17.122 +    case 'W':
   17.123 +        op = BSOP_WRITEBLOCK;
   17.124 +        len = MSGBUFSIZE_BLOCK;
   17.125 +        break;
   17.126 +    case 'a':
   17.127 +    case 'A':
   17.128 +        op = BSOP_ALLOCBLOCK;
   17.129 +        len = MSGBUFSIZE_BLOCK;
   17.130 +        break;
   17.131 +    default:
   17.132 +        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
   17.133 +        return 1;
   17.134 +    }
   17.135 +    /* The id is optional and stays 0 when it is not given. */
   17.136 +    if (argc >= 4)
   17.137 +        id = atoll(argv[3]);
   17.138 +
   17.139 +    if (strcmp(argv[1], "X") == 0) { /* "X": exercise the blockstore library calls directly */
   17.140 +        rc = __init_blockstore();
   17.141 +        if (rc < 0) {
   17.142 +            fprintf(stderr, "blockstore init failed.\n");
   17.143 +            return 1;
   17.144 +        }
   17.145 +        switch(op) {
   17.146 +        case BSOP_READBLOCK: /* prints the first four bytes; nothing is printed when readblock returns NULL */
   17.147 +            block = readblock(id);
   17.148 +            if (block) {
   17.149 +                printf("data: %02x %02x %02x %02x...\n",
   17.150 +                       (unsigned int)((unsigned char*)block)[0],
   17.151 +                       (unsigned int)((unsigned char*)block)[1],
   17.152 +                       (unsigned int)((unsigned char*)block)[2],
   17.153 +                       (unsigned int)((unsigned char*)block)[3]);
   17.154 +            }
   17.155 +            break;
   17.156 +        case BSOP_WRITEBLOCK: /* writes an all-zero block under the given id */
   17.157 +            block = malloc(BLOCK_SIZE);
   17.158 +            if (!block) {
   17.159 +                perror("bstest malloc");
   17.160 +                return 1;
   17.161 +            }
   17.162 +            memset(block, 0, BLOCK_SIZE);
   17.163 +            rc = writeblock(id, block);
   17.164 +            if (rc != 0) {
   17.165 +                printf("error\n");
   17.166 +            }
   17.167 +            else {
   17.168 +                printf("OK\n");
   17.169 +            }
   17.170 +            break;
   17.171 +        case BSOP_ALLOCBLOCK: /* allocates an all-zero block, passing the given id as the hint */
   17.172 +            block = malloc(BLOCK_SIZE);
   17.173 +            if (!block) {
   17.174 +                perror("bstest malloc");
   17.175 +                return 1;
   17.176 +            }
   17.177 +            memset(block, 0, BLOCK_SIZE);
   17.178 +            id = allocblock_hint(block, id);
   17.179 +            if (id == 0) { /* NOTE(review): blockstore.h defines ALLOCFAIL as (u64)-1 - confirm which value allocblock_hint returns on failure */
   17.180 +                printf("error\n");
   17.181 +            }
   17.182 +            else {
   17.183 +                printf("ID: %llu\n", id);
   17.184 +            }
   17.185 +            break;
   17.186 +        }
   17.187 +    }
   17.188 +    else { /* any other argv[1] is treated as a host name: send one raw request to it */
   17.189 +        direct(argv[1], op, id, len);
   17.190 +    }
   17.191 +
   17.192 +
   17.193 +    return 0;
   17.194 +}
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/tools/blktap/parallax/parallax.c	Sun Jul 03 22:36:48 2005 +0000
    18.3 @@ -0,0 +1,611 @@
    18.4 +/**************************************************************************
    18.5 + * 
    18.6 + * parallax.c
    18.7 + *
    18.8 + * The Parallax Storage Server
    18.9 + *
   18.10 + */
   18.11 + 
   18.12 +
   18.13 +#include <stdio.h>
   18.14 +#include <stdlib.h>
   18.15 +#include <string.h>
   18.16 +#include <pthread.h>
   18.17 +#include "blktaplib.h"
   18.18 +#include "blockstore.h"
   18.19 +#include "vdi.h"
   18.20 +#include "block-async.h"
   18.21 +#include "requests-async.h"
   18.22 +
   18.23 +#define PARALLAX_DEV     61440
   18.24 +#define SECTS_PER_NODE   8
   18.25 +
   18.26 +
   18.27 +#if 0
   18.28 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   18.29 +#else
   18.30 +#define DPRINTF(_f, _a...) ((void)0)
   18.31 +#endif
   18.32 +
   18.33 +/* ------[ session records ]----------------------------------------------- */
   18.34 +
    18.35 +#define BLKIF_HASHSZ 1024                                            /* number of buckets in blkif_hash */
    18.36 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))   /* bucket index for a (domid, handle) pair */
    18.37 +
    18.38 +#define VDI_HASHSZ 16                                                /* number of buckets in each blkif's vdi_hash */
    18.39 +#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))            /* bucket index for a virtual device number */
    18.40 +
    18.41 +typedef struct blkif {             /* one record per (domid, handle) block interface */
    18.42 +    domid_t       domid;
    18.43 +    unsigned int  handle;
    18.44 +    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;   /* blkif_destroy only frees a record that is DISCONNECTED */
    18.45 +    vdi_t        *vdi_hash[VDI_HASHSZ];   /* attached VDIs, chained through vdi->next */
    18.46 +    struct blkif *hash_next;              /* next record in the same blkif_hash bucket */
    18.47 +} blkif_t;
    18.48 +
    18.49 +static blkif_t      *blkif_hash[BLKIF_HASHSZ];   /* all known interfaces, bucketed by BLKIF_HASH(domid, handle) */
   18.50 +
   18.51 +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
   18.52 +{
   18.53 +    if ( handle != 0 )
   18.54 +        printf("blktap/parallax don't currently support non-0 dev handles!\n");
   18.55 +    
   18.56 +    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
   18.57 +    while ( (blkif != NULL) && 
   18.58 +            ((blkif->domid != domid) || (blkif->handle != handle)) )
   18.59 +        blkif = blkif->hash_next;
   18.60 +    return blkif;
   18.61 +}
   18.62 +
   18.63 +vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
   18.64 +{
   18.65 +    vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)];
   18.66 +    
   18.67 +    while ((vdi != NULL) && (vdi->vdevice != device))
   18.68 +        vdi = vdi->next;
   18.69 +    
   18.70 +    return vdi;
   18.71 +}
   18.72 +
   18.73 +/* ------[ control message handling ]-------------------------------------- */
   18.74 +
    18.75 +void blkif_create(blkif_be_create_t *create)
    18.76 +{
    18.77 +    domid_t       domid  = create->domid;
    18.78 +    unsigned int  handle = create->blkif_handle;
    18.79 +    blkif_t     **pblkif, *blkif;
    18.80 +    /* Handle a create request: allocate a record for (domid, handle) and add it to blkif_hash; the outcome is reported in create->status. */
    18.81 +    DPRINTF("parallax (blkif_create): create is %p\n", create); 
    18.82 +    
    18.83 +    if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL )
    18.84 +    {
    18.85 +        DPRINTF("Could not create blkif: out of memory\n");
    18.86 +        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
    18.87 +        return;
    18.88 +    }
    18.89 +
    18.90 +    memset(blkif, 0, sizeof(*blkif));
    18.91 +    blkif->domid  = domid;
    18.92 +    blkif->handle = handle;
    18.93 +    blkif->status = DISCONNECTED;
    18.94 +    /* Walk to the end of the bucket chain, rejecting the request if the pair is already present. */
    18.95 +    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
    18.96 +    while ( *pblkif != NULL )
    18.97 +    {
    18.98 +        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
    18.99 +        {
   18.100 +            DPRINTF("Could not create blkif: already exists (%d,%d)\n",
   18.101 +                domid, handle);
   18.102 +            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
   18.103 +            free(blkif); /* the speculative allocation is not needed */
   18.104 +            return;
   18.105 +        }
   18.106 +        pblkif = &(*pblkif)->hash_next;
   18.107 +    }
   18.108 +    /* pblkif now addresses the terminating NULL link, so the new record is appended at the tail. */
   18.109 +    blkif->hash_next = *pblkif;
   18.110 +    *pblkif = blkif;
   18.111 +
   18.112 +    DPRINTF("Successfully created blkif\n");
   18.113 +    create->status = BLKIF_BE_STATUS_OKAY;
   18.114 +}
  18.115 +
   18.116 +void blkif_destroy(blkif_be_destroy_t *destroy)
   18.117 +{
   18.118 +    domid_t       domid  = destroy->domid;
   18.119 +    unsigned int  handle = destroy->blkif_handle;
   18.120 +    blkif_t     **pblkif, *blkif;
   18.121 +    /* Handle a destroy request: unlink and free the record for (domid, handle) if it is DISCONNECTED; the outcome is reported in destroy->status. */
   18.122 +    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
   18.123 +    /* pblkif tracks the link that points at the current record, so a match can be unlinked in place. */
   18.124 +    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
   18.125 +    while ( (blkif = *pblkif) != NULL )
   18.126 +    {
   18.127 +        if ( (blkif->domid == domid) && (blkif->handle == handle) )
   18.128 +        {
   18.129 +            if ( blkif->status != DISCONNECTED )
   18.130 +                goto still_connected;
   18.131 +            goto destroy;
   18.132 +        }
   18.133 +        pblkif = &blkif->hash_next;
   18.134 +    }
   18.135 +    /* Fell off the end of the chain: no such interface. */
   18.136 +    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
   18.137 +    return;
   18.138 +
   18.139 + still_connected:
   18.140 +    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
   18.141 +    return;
   18.142 +
   18.143 + destroy:
   18.144 +    *pblkif = blkif->hash_next;
   18.145 +    free(blkif);
   18.146 +    destroy->status = BLKIF_BE_STATUS_OKAY;
   18.147 +}
  18.148 +/* Attach the VDI named by create->dev_handle to an existing blkif as virtual device create->vdevice. */
  18.149 +void vbd_create(blkif_be_vbd_create_t *create)
  18.150 +{
  18.151 +    blkif_t            *blkif;
  18.152 +    vdi_t              *vdi, **vdip;
  18.153 +    blkif_vdev_t        vdevice = create->vdevice;
  18.154 +
  18.155 +    DPRINTF("parallax (vbd_create): create=%p\n", create); 
  18.156 +    
  18.157 +    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
  18.158 +    if ( blkif == NULL )
  18.159 +    {
  18.160 +        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
  18.161 +                create->domid, create->blkif_handle); 
  18.162 +        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  18.163 +        return;
  18.164 +    }
  18.165 +
  18.166 +    /* The VDI identifier is carried in create->dev_handle. */
  18.167 +    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
  18.168 +            (unsigned long)create->dev_handle);
  18.169 +
  18.170 +    vdi = vdi_get(create->dev_handle);
  18.171 +    if (vdi == NULL)
  18.172 +    {
  18.173 +        printf("parallax (vbd_create): VDI %lx not found.\n",
  18.174 +               (unsigned long)create->dev_handle);
  18.175 +        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
  18.176 +        return;
  18.177 +    }
  18.178 +    
  18.179 +    vdi->next = NULL;
  18.180 +    vdi->vdevice = vdevice;
  18.181 +    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
  18.182 +    while (*vdip != NULL)           /* walk to the end of the bucket chain */
  18.183 +        vdip = &(*vdip)->next;
  18.184 +    *vdip = vdi;                    /* append the new VDI there */
  18.185 +    
  18.186 +    DPRINTF("vbd_create succeeded\n"); 
  18.187 +    create->status = BLKIF_BE_STATUS_OKAY;
  18.188 +}
  18.189 +/* Detach virtual device destroy->vdevice from its blkif and release the VDI; the outcome is written to destroy->status. */
  18.190 +void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
  18.191 +{
  18.192 +    blkif_t            *blkif;
  18.193 +    vdi_t              *vdi, **vdip;
  18.194 +    blkif_vdev_t        vdevice = destroy->vdevice;
  18.195 +    
  18.196 +    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
  18.197 +    if ( blkif == NULL )
  18.198 +    {
  18.199 +        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
  18.200 +                destroy->domid, destroy->blkif_handle); 
  18.201 +        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
  18.202 +        return;
  18.203 +    }
  18.204 +
  18.205 +    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
  18.206 +    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
  18.207 +        vdip = &(*vdip)->next;
  18.208 +    if (*vdip == NULL) {            /* no such vdevice on this interface */
  18.209 +        destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
  18.210 +        return;
  18.211 +    }
  18.212 +    vdi = *vdip;
  18.213 +    *vdip = vdi->next;              /* unlink from the bucket chain */
  18.214 +    vdi_put(vdi);
  18.215 +    destroy->status = BLKIF_BE_STATUS_OKAY; /* status used to be left unset here */
  18.216 +}
  18.217 +/* Control hook: check that msg is a correctly sized CMSG_BLKIF_BE message and dispatch on its subtype; always returns 0. */
  18.218 +int parallax_control(control_msg_t *msg)
  18.219 +{
  18.220 +    domid_t  domid; /* not used in this function */
  18.221 +    int      ret;   /* not used in this function */
  18.222 +
  18.223 +    DPRINTF("parallax_control: msg is %p\n", msg); 
  18.224 +    
  18.225 +    if (msg->type != CMSG_BLKIF_BE) 
  18.226 +    {
  18.227 +        printf("Unexpected control message (%d)\n", msg->type);
  18.228 +        return 0;
  18.229 +    }
  18.230 +    
  18.231 +    switch(msg->subtype)
  18.232 +    {
  18.233 +    case CMSG_BLKIF_BE_CREATE:
  18.234 +        if ( msg->length != sizeof(blkif_be_create_t) ) /* payload must be exactly one struct */
  18.235 +            goto parse_error;
  18.236 +        blkif_create((blkif_be_create_t *)msg->msg);
  18.237 +        break;   
  18.238 +        
  18.239 +    case CMSG_BLKIF_BE_DESTROY:
  18.240 +        if ( msg->length != sizeof(blkif_be_destroy_t) )
  18.241 +            goto parse_error;
  18.242 +        blkif_destroy((blkif_be_destroy_t *)msg->msg); /* result is written into the payload's status field */
  18.243 +        break;  
  18.244 +        
  18.245 +    case CMSG_BLKIF_BE_VBD_CREATE:
  18.246 +        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
  18.247 +            goto parse_error;
  18.248 +        vbd_create((blkif_be_vbd_create_t *)msg->msg);
  18.249 +        break;
  18.250 +        
  18.251 +    case CMSG_BLKIF_BE_VBD_DESTROY:
  18.252 +        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
  18.253 +            goto parse_error;
  18.254 +        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
  18.255 +        break;
  18.256 +
  18.257 +    case CMSG_BLKIF_BE_CONNECT:
  18.258 +    case CMSG_BLKIF_BE_DISCONNECT:
  18.259 +        /* we don't manage the device channel, the tap does. */
  18.260 +        break;
  18.261 +
  18.262 +    default:
  18.263 +        goto parse_error;
  18.264 +    }
  18.265 +    return 0;
  18.266 +parse_error:
  18.267 +    printf("Bad control message!\n");
  18.268 +    return 0; /* parse errors are only logged; the return value does not distinguish them */
  18.269 +    
  18.270 +}    
  18.271 +/* Answer BLKIF_OP_PROBE: write one vdisk_t per VDI attached to blkif into the request's buffer, then build the response. */
  18.272 +int parallax_probe(blkif_request_t *req, blkif_t *blkif)
  18.273 +{
  18.274 +    blkif_response_t *rsp;
  18.275 +    vdisk_t *img_info;
  18.276 +    vdi_t *vdi;
  18.277 +    int i, nr_vdis = 0; 
  18.278 +
  18.279 +    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
  18.280 +
  18.281 +    /* We expect one buffer only. */
  18.282 +    if ( req->nr_segments != 1 )
  18.283 +      goto err;
  18.284 +
  18.285 +    /* Make sure the buffer is page-sized. */
  18.286 +    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
  18.287 +       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
  18.288 +      goto err;
  18.289 +
  18.290 +    /* fill the list of devices; NOTE(review): nr_vdis is not checked against the buffer's capacity */
  18.291 +    for (i=0; i<VDI_HASHSZ; i++) {
  18.292 +        vdi = blkif->vdi_hash[i];
  18.293 +        while (vdi) {
  18.294 +            img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); /* presumably the mapped page for segment 0 */
  18.295 +            img_info[nr_vdis].device   = vdi->vdevice;
  18.296 +            img_info[nr_vdis].info     = 0;
  18.297 +            /* The -1 here accounts for the LSB in the radix tree */
  18.298 +            img_info[nr_vdis].capacity = 
  18.299 +                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
  18.300 +            nr_vdis++;
  18.301 +            vdi = vdi->next;
  18.302 +        }
  18.303 +    }
  18.304 +
  18.305 +    
  18.306 +    rsp = (blkif_response_t *)req; /* the response is built in place over the request */
  18.307 +    rsp->id = req->id;
  18.308 +    rsp->operation = BLKIF_OP_PROBE;
  18.309 +    rsp->status = nr_vdis; /* number of disks */
  18.310 +
  18.311 +    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
  18.312 +    return  BLKTAP_RESPOND;
  18.313 +err:
  18.314 +    rsp = (blkif_response_t *)req;
  18.315 +    rsp->id = req->id;
  18.316 +    rsp->operation = BLKIF_OP_PROBE;
  18.317 +    rsp->status = BLKIF_RSP_ERROR;
  18.318 +    
  18.319 +    DPRINTF("parallax_probe: send error response\n"); 
  18.320 +    return BLKTAP_RESPOND;  
  18.321 +}
  18.322 +/* Per-request bookkeeping shared by the segment completion callbacks (read_cb / write_cb). */
  18.323 +typedef struct {
  18.324 +    blkif_request_t *req;   /* request being serviced */
  18.325 +    int              count; /* segments still outstanding; decremented by each callback */
  18.326 +    int              error; /* incremented by a callback whose block I/O failed */
  18.327 +    pthread_mutex_t  mutex; /* held while count is updated and the response is injected */
  18.328 +} pending_t;
  18.329 +
  18.330 +#define MAX_REQUESTS 64
  18.331 +pending_t pending_list[MAX_REQUESTS]; /* indexed by ID_TO_IDX(req->id) in parallax_read/parallax_write */
  18.332 +/* Argument handed to vdi_read/vdi_write and passed back to the callback; freed by the callback. */
  18.333 +struct cb_param {
  18.334 +    pending_t *pent;        /* pending record of the owning request */
  18.335 +    int       segment;      /* index into req->frame_and_sects[] */
  18.336 +    u64       sector; 
  18.337 +    u64       vblock; /* for debug printing -- can be removed. */
  18.338 +};
  18.339 +/* Completion callback for one read segment: copy the data into the request's buffer; the last segment sends the response. */
  18.340 +static void read_cb(struct io_ret r, void *in_param)
  18.341 +{
  18.342 +    struct cb_param *param = (struct cb_param *)in_param;
  18.343 +    pending_t *p = param->pent;
  18.344 +    int segment = param->segment;
  18.345 +    blkif_request_t *req = p->req;
  18.346 +    unsigned long size, offset, start;
  18.347 +    char *dpage, *spage;
  18.348 +	
  18.349 +    spage  = IO_BLOCK(r);
  18.350 +    if (spage == NULL) { p->error++; goto finish; } /* NOTE(review): error is bumped before p->mutex is taken */
  18.351 +    dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
  18.352 +    
  18.353 +    /* Calculate read size and offset within the read block. */
  18.354 +
  18.355 +    offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
  18.356 +    size = ( blkif_last_sect (req->frame_and_sects[segment]) -
  18.357 +             blkif_first_sect(req->frame_and_sects[segment]) + 1
  18.358 +        ) << SECTOR_SHIFT;
  18.359 +    start = blkif_first_sect(req->frame_and_sects[segment]) 
  18.360 +        << SECTOR_SHIFT;
  18.361 +
  18.362 +    DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
  18.363 +            "vblock %llx, "
  18.364 +            "size %lx\n", 
  18.365 +            param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
  18.366 +            blkif_last_sect (p->req->frame_and_sects[segment]),
  18.367 +            param->vblock, size); 
  18.368 +
  18.369 +    memcpy(dpage + start, spage + offset, size); /* start: byte offset in the destination page; offset: byte offset in the block read */
  18.370 +    freeblock(spage);
  18.371 +    
  18.372 +    /* Done the read.  Now update the pending record. */
  18.373 + finish:
  18.374 +    pthread_mutex_lock(&p->mutex);
  18.375 +    p->count--;
  18.376 +    
  18.377 +    if (p->count == 0) { /* this was the last outstanding segment */
  18.378 +    	blkif_response_t *rsp;
  18.379 +    	
  18.380 +        rsp = (blkif_response_t *)req; /* the response is built in place over the request */
  18.381 +        rsp->id = req->id;
  18.382 +        rsp->operation = BLKIF_OP_READ;
  18.383 +    	if (p->error == 0) {
  18.384 +            rsp->status = BLKIF_RSP_OKAY;
  18.385 +    	} else {
  18.386 +            rsp->status = BLKIF_RSP_ERROR;
  18.387 +    	}
  18.388 +        blktap_inject_response(rsp);       
  18.389 +    }
  18.390 +    
  18.391 +    pthread_mutex_unlock(&p->mutex);
  18.392 +	
  18.393 +    free(param); /* TODO: replace with cached alloc/dealloc */
  18.394 +}	
  18.395 +
  18.396 +int parallax_read(blkif_request_t *req, blkif_t *blkif)
  18.397 +{
  18.398 +    blkif_response_t *rsp;
  18.399 +    u64 vblock, gblock;
  18.400 +    vdi_t *vdi;
  18.401 +    u64 sector;
  18.402 +    int i;
  18.403 +    char *dpage, *spage;
  18.404 +    pending_t *pent;
  18.405 +
  18.406 +    vdi = blkif_get_vdi(blkif, req->device);
  18.407 +    
  18.408 +    if ( vdi == NULL )
  18.409 +        goto err;
  18.410 +        
  18.411 +    pent = &pending_list[ID_TO_IDX(req->id)];
  18.412 +    pent->count = req->nr_segments;
  18.413 +    pent->req = req;
  18.414 +    pthread_mutex_init(&pent->mutex, NULL);
  18.415 +    
  18.416 +    for (i = 0; i < req->nr_segments; i++) {
  18.417 +        pthread_t tid;
  18.418 +        int ret;
  18.419 +        struct cb_param *p;
  18.420 +        
  18.421 +        /* Round the requested segment to a block address. */
  18.422 +        sector  = req->sector_number + (8*i);
  18.423 +        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  18.424 +        
  18.425 +        /* TODO: Replace this call to malloc with a cached allocation */
  18.426 +        p = (struct cb_param *)malloc(sizeof(struct cb_param));
  18.427 +        p->pent = pent;
  18.428 +        p->sector = sector; 
  18.429 +        p->segment = i;     
  18.430 +        p->vblock = vblock; /* dbg */
  18.431 +        
  18.432 +        /* Get that block from the store. */
  18.433 +        vdi_read(vdi, vblock, read_cb, (void *)p);    
  18.434 +    }
  18.435 +    
  18.436 +    return BLKTAP_STOLEN;
  18.437 +
  18.438 +err:
  18.439 +    rsp = (blkif_response_t *)req;
  18.440 +    rsp->id = req->id;
  18.441 +    rsp->operation = BLKIF_OP_READ;
  18.442 +    rsp->status = BLKIF_RSP_ERROR;
  18.443 +    
  18.444 +    return BLKTAP_RESPOND;  
  18.445 +}
  18.446 +/* Completion callback for one write segment: record any error; the last segment sends the response. */
  18.447 +static void write_cb(struct io_ret r, void *in_param)
  18.448 +{
  18.449 +    struct cb_param *param = (struct cb_param *)in_param;
  18.450 +    pending_t *p = param->pent;
  18.451 +    blkif_request_t *req = p->req;
  18.452 +    
  18.453 +    /* catch errors from the block code. NOTE(review): error is bumped before p->mutex is taken */
  18.454 +    if (IO_INT(r) < 0) p->error++;
  18.455 +    
  18.456 +    pthread_mutex_lock(&p->mutex);
  18.457 +    p->count--;
  18.458 +    
  18.459 +    if (p->count == 0) { /* this was the last outstanding segment */
  18.460 +    	blkif_response_t *rsp;
  18.461 +    	
  18.462 +        rsp = (blkif_response_t *)req; /* the response is built in place over the request */
  18.463 +        rsp->id = req->id;
  18.464 +        rsp->operation = BLKIF_OP_WRITE;
  18.465 +    	if (p->error == 0) {
  18.466 +            rsp->status = BLKIF_RSP_OKAY;
  18.467 +    	} else {
  18.468 +            rsp->status = BLKIF_RSP_ERROR;
  18.469 +    	}
  18.470 +        blktap_inject_response(rsp);       
  18.471 +    }
  18.472 +    
  18.473 +    pthread_mutex_unlock(&p->mutex);
  18.474 +	
  18.475 +    free(param); /* TODO: replace with cached alloc/dealloc */
  18.476 +}
  18.477 +
  18.478 +int parallax_write(blkif_request_t *req, blkif_t *blkif)
  18.479 +{
  18.480 +    blkif_response_t *rsp;
  18.481 +    u64 sector;
  18.482 +    int i, writable = 0;
  18.483 +    u64 vblock, gblock;
  18.484 +    char *spage;
  18.485 +    unsigned long size, offset, start;
  18.486 +    vdi_t *vdi;
  18.487 +    pending_t *pent;
  18.488 +
  18.489 +    vdi = blkif_get_vdi(blkif, req->device);
  18.490 +    
  18.491 +    if ( vdi == NULL )
  18.492 +        goto err;
  18.493 +        
  18.494 +    pent = &pending_list[ID_TO_IDX(req->id)];
  18.495 +    pent->count = req->nr_segments;
  18.496 +    pent->req = req;
  18.497 +    pthread_mutex_init(&pent->mutex, NULL);
  18.498 +    
  18.499 +    for (i = 0; i < req->nr_segments; i++) {
  18.500 +        struct cb_param *p;
  18.501 +        
  18.502 +        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
  18.503 +        
  18.504 +        /* Round the requested segment to a block address. */
  18.505 +        
  18.506 +        sector  = req->sector_number + (8*i);
  18.507 +        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
  18.508 +        
  18.509 +        /* Calculate read size and offset within the read block. */
  18.510 +        
  18.511 +        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
  18.512 +        size = ( blkif_last_sect (req->frame_and_sects[i]) -
  18.513 +                 blkif_first_sect(req->frame_and_sects[i]) + 1
  18.514 +            ) << SECTOR_SHIFT;
  18.515 +        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
  18.516 +
  18.517 +        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
  18.518 +                "vblock %llx, gblock %llx, "
  18.519 +                "size %lx\n", 
  18.520 +                sector, blkif_first_sect(req->frame_and_sects[i]),
  18.521 +                blkif_last_sect (req->frame_and_sects[i]),
  18.522 +                vblock, gblock, size); 
  18.523 +      
  18.524 +        /* XXX: For now we just freak out if they try to write a   */
  18.525 +        /* non block-sized, block-aligned page.                    */
  18.526 +        
  18.527 +        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
  18.528 +            printf("]\n] STRANGE WRITE!\n]\n");
  18.529 +            goto err;
  18.530 +        }
  18.531 +        
  18.532 +        /* TODO: Replace this call to malloc with a cached allocation */
  18.533 +        p = (struct cb_param *)malloc(sizeof(struct cb_param));
  18.534 +        p->pent = pent;
  18.535 +        p->sector = sector; 
  18.536 +        p->segment = i;     
  18.537 +        p->vblock = vblock; /* dbg */
  18.538 +        
  18.539 +        /* Issue the write to the store. */
  18.540 +        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
  18.541 +    }
  18.542 +
  18.543 +    return BLKTAP_STOLEN;
  18.544 +
  18.545 +err:
  18.546 +    rsp = (blkif_response_t *)req;
  18.547 +    rsp->id = req->id;
  18.548 +    rsp->operation = BLKIF_OP_WRITE;
  18.549 +    rsp->status = BLKIF_RSP_ERROR;
  18.550 +    
  18.551 +    return BLKTAP_RESPOND;  
  18.552 +}
  18.553 +/* Request hook: find the blkif for the requesting domain and dispatch PROBE/READ/WRITE; anything else gets an error response. */
  18.554 +int parallax_request(blkif_request_t *req)
  18.555 +{
  18.556 +    blkif_response_t *rsp;
  18.557 +    domid_t  dom   = ID_TO_DOM(req->id);
  18.558 +    blkif_t *blkif = blkif_find_by_handle(dom, 0); /* the interface handle is fixed at 0 here */
  18.559 +    
  18.560 +    if (blkif == NULL)
  18.561 +        goto err;
  18.562 +    
  18.563 +    if ( req->operation == BLKIF_OP_PROBE ) {
  18.564 +        
  18.565 +        return parallax_probe(req, blkif);
  18.566 +        
  18.567 +    } else if ( req->operation == BLKIF_OP_READ ) {
  18.568 +        
  18.569 +        return parallax_read(req, blkif);
  18.570 +        
  18.571 +    } else if ( req->operation == BLKIF_OP_WRITE ) {
  18.572 +        
  18.573 +        return parallax_write(req, blkif);
  18.574 +        
  18.575 +    } else {
  18.576 +        printf("Unknown request message type!\n");
  18.577 +        /* Unknown operation */
  18.578 +        goto err;
  18.579 +    }
  18.580 +    
  18.581 +err:
  18.582 +    rsp = (blkif_response_t *)req; /* the response is built in place over the request */
  18.583 +    rsp->operation = req->operation;
  18.584 +    rsp->id = req->id;
  18.585 +    rsp->status = BLKIF_RSP_ERROR;
  18.586 +    return BLKTAP_RESPOND;  
  18.587 +}
  18.588 +/* Clear every bucket of the blkif hash table. */
  18.589 +void __init_parallax(void) 
  18.590 +{
  18.591 +    memset(blkif_hash, 0, sizeof(blkif_hash));
  18.592 +}
  18.593 +
  18.594 +
  18.595 +/* Entry point: initialise each subsystem, register the control and request hooks, then call blktap_listen(). */
  18.596 +int main(int argc, char *argv[])
  18.597 +{
  18.598 +    DPRINTF("parallax: starting.\n"); 
  18.599 +    __init_blockstore();
  18.600 +    DPRINTF("parallax: initialized blockstore...\n"); 
  18.601 +    init_block_async();
  18.602 +    DPRINTF("parallax: initialized async blocks...\n"); 
  18.603 +    __init_vdi();
  18.604 +    DPRINTF("parallax: initialized vdi registry etc...\n"); 
  18.605 +    __init_parallax();
  18.606 +    DPRINTF("parallax: initialized local stuff..\n"); 
  18.607 +
  18.608 +    blktap_register_ctrl_hook("parallax_control", parallax_control);
  18.609 +    blktap_register_request_hook("parallax_request", parallax_request);
  18.610 +    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); 
  18.611 +    blktap_listen(); /* presumably runs the event loop until shutdown -- confirm in the blktap library */
  18.612 +    
  18.613 +    return 0;
  18.614 +}
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/tools/blktap/parallax/radix.c	Sun Jul 03 22:36:48 2005 +0000
    19.3 @@ -0,0 +1,631 @@
    19.4 +/*
    19.5 + * Radix tree for mapping (up to) 63-bit virtual block IDs to
    19.6 + * 63-bit global block IDs
    19.7 + *
    19.8 + * Pointers within the tree set aside the least significant bit to indicate
    19.9 + * whether or not the target block is writable from this node.
   19.10 + *
   19.11 + * The block with ID 0 is assumed to be an empty block of all zeros
   19.12 + */
   19.13 +
   19.14 +#include <unistd.h>
   19.15 +#include <stdio.h>
   19.16 +#include <stdlib.h>
   19.17 +#include <assert.h>
   19.18 +#include <string.h>
   19.19 +#include <pthread.h>
   19.20 +#include "blockstore.h"
   19.21 +#include "radix.h"
   19.22 +
   19.23 +#define RADIX_TREE_MAP_SHIFT 9      /* key bits consumed per tree level */
   19.24 +#define RADIX_TREE_MAP_MASK 0x1ff   /* selects the slot index within a node from the shifted key */
   19.25 +#define RADIX_TREE_MAP_ENTRIES 512  /* slots per node; the loop bound used by cloneblock() */
   19.26 +
   19.27 +/*
   19.28 +#define DEBUG
   19.29 +*/
   19.30 +
   19.31 +/* Experimental radix cache: a hash table of block copies whose entries are also linked in a most-recently-used list. */
   19.32 +
   19.33 +static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER; /* taken by rcache_write and rcache_read */
   19.34 +static  int rcache_count = 0;     /* entries allocated so far */
   19.35 +#define RCACHE_MAX 1024           /* once this many entries exist, rcache_write recycles the tail entry */
   19.36 +
   19.37 +typedef struct rcache_st {
   19.38 +    radix_tree_node  *node;       /* cache's own copy of the block contents */
   19.39 +    u64               id;         /* block id; selects the hash bucket */
   19.40 +    struct rcache_st *hash_next;  /* next entry in the same hash bucket */
   19.41 +    struct rcache_st *cache_next; /* next entry toward rcache_tail (less recently used) */
   19.42 +    struct rcache_st *cache_prev; /* previous entry toward rcache_head (more recently used) */
   19.43 +} rcache_t;
   19.44 +
   19.45 +static rcache_t *rcache_head = NULL; /* most recently used entry */
   19.46 +static rcache_t *rcache_tail = NULL; /* entry recycled first when the cache is full */
   19.47 +
   19.48 +#define RCHASH_SIZE 512ULL
   19.49 +rcache_t *rcache[RCHASH_SIZE];
   19.50 +#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
   19.51 +/* Set every hash bucket of the radix cache to NULL. */
   19.52 +void __rcache_init(void)
   19.53 +{
   19.54 +    int i;
   19.55 +
   19.56 +    for (i=0; i<RCHASH_SIZE; i++)
   19.57 +        rcache[i] = NULL;
   19.58 +}
   19.59 +    
   19.60 +/* Store a copy of node in the cache under id and make that entry the head of the recency list. */
   19.61 +void rcache_write(u64 id, radix_tree_node *node)
   19.62 +{
   19.63 +    rcache_t *r, *tmp, **curs;
   19.64 +    
   19.65 +    pthread_mutex_lock(&rcache_mutex);
   19.66 +    
   19.67 +    /* Is it already in the cache? */
   19.68 +    r = rcache[RCACHE_HASH(id)];
   19.69 +    
   19.70 +    for (;;) {
   19.71 +        if (r == NULL) 
   19.72 +            break;
   19.73 +        if (r->id == id) 
   19.74 +        {
   19.75 +            memcpy(r->node, node, BLOCK_SIZE); /* refresh the cached copy in place */
   19.76 +            
   19.77 +            /* bring to front. */
   19.78 +            if (r != rcache_head) {
   19.79 +                
   19.80 +                if (r == rcache_tail) {
   19.81 +                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
   19.82 +                    rcache_tail->cache_next = NULL;
   19.83 +                }
   19.84 +
   19.85 +                tmp = r->cache_next;
   19.86 +                if (r->cache_next != NULL) r->cache_next->cache_prev 
   19.87 +                                                     = r->cache_prev;
   19.88 +                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
   19.89 +
   19.90 +                r->cache_prev = NULL;
   19.91 +                r->cache_next = rcache_head;
   19.92 +                if (rcache_head != NULL) rcache_head->cache_prev = r;
   19.93 +                rcache_head = r;
   19.94 +            }
   19.95 +
   19.96 +//printf("Update (%Ld)\n", r->id);
   19.97 +            goto done;
   19.98 +        }
   19.99 +        r = r->hash_next;
  19.100 +    }
  19.101 +    
  19.102 +    if ( rcache_count == RCACHE_MAX ) 
  19.103 +    {
  19.104 +        /* Remove an entry: the tail is detached and its rcache_t is reused for the new block */
  19.105 +        
  19.106 +        r = rcache_tail;
  19.107 +        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  19.108 +        rcache_tail->cache_next = NULL;
  19.109 +        freeblock(r->node);
  19.110 +        
  19.111 +        curs = &rcache[RCACHE_HASH(r->id)]; /* unlink the victim from its hash bucket */
  19.112 +        while ((*curs) != r)
  19.113 +            curs = &(*curs)->hash_next;
  19.114 +        *curs = r->hash_next;
  19.115 +//printf("Evict (%Ld)\n", r->id);
  19.116 +        
  19.117 +    } else {
  19.118 +        
  19.119 +        r = (rcache_t *)malloc(sizeof(rcache_t)); /* NOTE(review): result is not checked for NULL */
  19.120 +        rcache_count++;
  19.121 +    }
  19.122 +    
  19.123 +    r->node = newblock(); /* NOTE(review): result is not checked before the memcpy below */
  19.124 +    memcpy(r->node, node, BLOCK_SIZE);
  19.125 +    r->id = id;
  19.126 +    
  19.127 +    r->hash_next = rcache[RCACHE_HASH(id)]; /* push onto the front of the hash bucket */
  19.128 +    rcache[RCACHE_HASH(id)] = r;
  19.129 +    
  19.130 +    r->cache_prev = NULL;
  19.131 +    r->cache_next = rcache_head;
  19.132 +    if (rcache_head != NULL) rcache_head->cache_prev = r;
  19.133 +    rcache_head = r;
  19.134 +    if (rcache_tail == NULL) rcache_tail = r;
  19.135 +    
  19.136 +//printf("Added (%Ld, %p)\n", id, r->node);
  19.137 +done:
  19.138 +    pthread_mutex_unlock(&rcache_mutex);
  19.139 +}
  19.140 +/* Look up id in the cache; on a hit, move the entry to the head and return a fresh copy of its block, else return NULL. */
  19.141 +radix_tree_node *rcache_read(u64 id)
  19.142 +{
  19.143 +    rcache_t *r, *tmp;
  19.144 +    radix_tree_node *node = NULL;
  19.145 +    
  19.146 +    pthread_mutex_lock(&rcache_mutex);
  19.147 +
  19.148 +    r = rcache[RCACHE_HASH(id)];
  19.149 +    
  19.150 +    for (;;) {
  19.151 +        if (r == NULL) {
  19.152 +//printf("Miss (%Ld)\n", id);
  19.153 +            goto done;
  19.154 +        }
  19.155 +        if (r->id == id) break;
  19.156 +        r = r->hash_next;
  19.157 +    }
  19.158 +   
  19.159 +    /* bring to front. */
  19.160 +    if (r != rcache_head) 
  19.161 +    {
  19.162 +        if (r == rcache_tail) {
  19.163 +            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  19.164 +            rcache_tail->cache_next = NULL;
  19.165 +        }
  19.166 +        tmp = r->cache_next;
  19.167 +        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
  19.168 +        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
  19.169 +
  19.170 +        r->cache_prev = NULL;
  19.171 +        r->cache_next = rcache_head;
  19.172 +        if (rcache_head != NULL) rcache_head->cache_prev = r;
  19.173 +        rcache_head = r;
  19.174 +    }
  19.175 +    
  19.176 +    node = newblock(); /* the caller gets its own copy; NOTE(review): result is not checked before the memcpy */
  19.177 +    memcpy(node, r->node, BLOCK_SIZE);
  19.178 +    
  19.179 +//printf("Hit (%Ld, %p)\n", id, r->node);
  19.180 +done:
  19.181 +    pthread_mutex_unlock(&rcache_mutex);
  19.182 +    
  19.183 +    return(node);
  19.184 +}
  19.185 +
  19.186 +/* Read block id, trying the cache first; a block fetched with readblock() is added to the cache. Returns NULL if both miss. */
  19.187 +void *rc_readblock(u64 id)
  19.188 +{
  19.189 +    void *ret;
  19.190 +    
  19.191 +    ret = (void *)rcache_read(id);
  19.192 +    
  19.193 +    if (ret != NULL) return ret; /* cache hit: ret is a private copy */
  19.194 +    
  19.195 +    ret = readblock(id);
  19.196 +    
  19.197 +    if (ret != NULL)
  19.198 +        rcache_write(id, ret);
  19.199 +    
  19.200 +    return(ret);
  19.201 +}
  19.202 +/* Call allocblock(block) and, unless it returns ZERO, cache the contents under the returned id. Returns allocblock's result. */
  19.203 +u64 rc_allocblock(void *block)
  19.204 +{
  19.205 +    u64 ret;
  19.206 +    
  19.207 +    ret = allocblock(block);
  19.208 +    
  19.209 +    if (ret != ZERO) /* callers in this file treat ZERO as failure */
  19.210 +        rcache_write(ret, block);
  19.211 +    
  19.212 +    return(ret);
  19.213 +}
  19.214 +/* Write block to the store under id; the cache is updated only when the write succeeded. Returns writeblock's result. */
  19.215 +int rc_writeblock(u64 id, void *block)
  19.216 +{
  19.217 +    int ret;
  19.218 +    
  19.219 +    ret = writeblock(id, block);
  19.220 +    if (ret >= 0) rcache_write(id, block); /* update() treats a negative result as failure; do not cache unwritten data */
  19.221 +    
  19.222 +    return(ret);
  19.223 +}
  19.224 +
  19.225 +
  19.226 +/*
  19.227 + * block device interface and other helper functions
  19.228 + * with these functions, block id is just a 63-bit number, with
  19.229 + * no special consideration for the LSB
  19.230 + */
  19.231 +radix_tree_node cloneblock(radix_tree_node block); /* defined below in this file */
  19.232 +
  19.233 +/*
  19.234 + * main api
  19.235 + * with these functions, the LSB of root always indicates
  19.236 + * whether or not the block is writable, including the return
  19.237 + * values of update and snapshot
  19.238 + */
  19.239 +u64 lookup(int height, u64 root, u64 key);          /* read the value stored for key */
  19.240 +u64 update(int height, u64 root, u64 key, u64 val); /* store val for key, cloning read-only nodes */
  19.241 +u64 snapshot(u64 root);                             /* allocate a clone of the root node */
  19.242 +
  19.243 +/**
  19.244 + * cloneblock: clone an existing block in memory
  19.245 + *   @block: the old block
  19.246 + *
  19.247 + *   @return: new block, with LSB cleared for every entry; NULL if malloc fails
  19.248 + */
  19.249 +radix_tree_node cloneblock(radix_tree_node block) {
  19.250 +    radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE); /* NOTE(review): callers release this with freeblock() -- confirm that pairs with malloc */
  19.251 +    int i;
  19.252 +    if (node == NULL) {
  19.253 +        perror("cloneblock malloc");
  19.254 +        return NULL;
  19.255 +    }
  19.256 +    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
  19.257 +        node[i] = block[i] & ONEMASK; /* presumably ONEMASK masks off the writable bit -- defined outside this view */
  19.258 +    return node;
  19.259 +}
  19.260 +
  19.261 +/**
  19.262 + * lookup: find a value given a key
  19.263 + *   @height: height in bits of the radix tree
  19.264 + *   @root: root node id, with set LSB indicating writable node
  19.265 + *   @key: key to lookup
  19.266 + *
  19.267 + *   @return: value on success, zero on error or when the path reaches the ZERO block
  19.268 + */
  19.269 +
  19.270 +u64 lookup(int height, u64 root, u64 key) {
  19.271 +    radix_tree_node node;
  19.272 +    u64 mask = ONE; /* ANDed with every link followed, so it survives only if all of them have the bit set */
  19.273 +    
  19.274 +    assert(key >> height == 0); /* key must fit in height bits */
  19.275 +
  19.276 +    /* the root block may be smaller to ensure all leaves are full */
  19.277 +    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  19.278 +
  19.279 +    /* now carve off equal sized chunks at each step */
  19.280 +    for (;;) {
  19.281 +        u64 oldroot; /* assigned below but never read */
  19.282 +
  19.283 +#ifdef DEBUG
  19.284 +        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  19.285 +                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
  19.286 +                (iswritable(root) ? "" : " (readonly)"));
  19.287 +#endif
  19.288 +        
  19.289 +        if (getid(root) == ZERO)
  19.290 +            return ZERO;
  19.291 +
  19.292 +        oldroot = root;
  19.293 +        node = (radix_tree_node) rc_readblock(getid(root));
  19.294 +        if (node == NULL)
  19.295 +            return ZERO;
  19.296 +
  19.297 +        root = node[(key >> height) & RADIX_TREE_MAP_MASK]; /* follow the slot selected by this level's key bits */
  19.298 +        mask &= root;
  19.299 +        freeblock(node);
  19.300 +
  19.301 +        if (height == 0)
  19.302 +            return ( root & ONEMASK ) | mask; /* leaf value with its low bit replaced by the accumulated mask */
  19.303 +
  19.304 +        height -= RADIX_TREE_MAP_SHIFT;
  19.305 +    }
  19.306 +
  19.307 +    return ZERO; /* not reached: the loop above only exits by return */
  19.308 +}
  19.309 +
  19.310 +/*
  19.311 + * update: set a radix tree entry, doing copy-on-write as necessary
  19.312 + *   @height: height in bits of the radix tree
  19.313 + *   @root: root node id, with set LSB indicating writable node
  19.314 + *   @key: key to set
  19.315 + *   @val: value to set, s.t. radix(key)=val
  19.316 + *
  19.317 + *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
  19.318 + */
  19.319 +
  19.320 +u64 update(int height, u64 root, u64 key, u64 val) {
  19.321 +    int offset;
  19.322 +    u64 child;
  19.323 +    radix_tree_node node;
  19.324 +    
  19.325 +    /* base case--return val */
  19.326 +    if (height == 0)
  19.327 +        return val;
  19.328 +
  19.329 +    /* the root block may be smaller to ensure all leaves are full */
  19.330 +    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  19.331 +    offset = (key >> height) & RADIX_TREE_MAP_MASK;
  19.332 +
  19.333 +#ifdef DEBUG
  19.334 +    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  19.335 +            offset, (iswritable(root)?"":" (clone)"));
  19.336 +#endif
  19.337 +
  19.338 +    /* load a block, or create a new one */
  19.339 +    if (root == ZERO) {
  19.340 +        node = (radix_tree_node) newblock();
  19.341 +    } else {
  19.342 +        node = (radix_tree_node) rc_readblock(getid(root)); /* may be NULL on a read failure */
  19.343 +
  19.344 +        if (node != NULL && !iswritable(root)) { /* cloneblock() dereferences its argument, so never pass NULL */
  19.345 +            /* need to clone this node */
  19.346 +            radix_tree_node oldnode = node;
  19.347 +            node = cloneblock(node);
  19.348 +            freeblock(oldnode);
  19.349 +            root = ZERO; /* the clone has no id yet; it is allocated below */
  19.350 +        }
  19.351 +    }
  19.352 +
  19.353 +    if (node == NULL) {
  19.354 +#ifdef DEBUG
  19.355 +        printf("update: node is null!\n");
  19.356 +#endif
  19.357 +        return ZERO;
  19.358 +    }
  19.359 +
  19.360 +    child = update(height, node[offset], key, val); /* recurse into the selected slot with the reduced height */
  19.361 +
  19.362 +    if (child == ZERO) {
  19.363 +        freeblock(node);
  19.364 +        return ZERO;
  19.365 +    } else if (child == node[offset]) {
  19.366 +        /* no change, so we already owned the child */
  19.367 +        assert(iswritable(root));
  19.368 +
  19.369 +        freeblock(node);
  19.370 +        return root;
  19.371 +    }
  19.372 +
  19.373 +    node[offset] = child;
  19.374 +
  19.375 +    /* new/cloned blocks need to be saved */
  19.376 +    if (root == ZERO) {
  19.377 +        /* mark this as an owned block */
  19.378 +        root = rc_allocblock(node);
  19.379 +        if (root)
  19.380 +            root = writable(root);
  19.381 +    } else if (rc_writeblock(getid(root), node) < 0) {
  19.382 +        freeblock(node);
  19.383 +        return ZERO;
  19.384 +    }
  19.385 +
  19.386 +    freeblock(node);
  19.387 +    return root;
  19.388 +}
  19.389 +
  19.390 +/**
  19.391 + * snapshot: create a snapshot
  19.392 + *   @root: old root node
  19.393 + *
  19.394 + *   @return: new root node (passed through writable()), 0 on error
  19.395 + */
  19.396 +u64 snapshot(u64 root) {
  19.397 +    radix_tree_node node, newnode;
  19.398 +
  19.399 +    if ((node = rc_readblock(getid(root))) == NULL)
  19.400 +        return ZERO;
  19.401 +
  19.402 +    newnode = cloneblock(node); /* copy of the root node with each entry masked by ONEMASK */
  19.403 +    freeblock(node);
  19.404 +    if (newnode == NULL)
  19.405 +        return ZERO;
  19.406 +    
  19.407 +    root = rc_allocblock(newnode); /* store the clone under a newly allocated block id */
  19.408 +    freeblock(newnode);
  19.409 +
  19.410 +    if (root == ZERO)
  19.411 +        return ZERO;
  19.412 +    else
  19.413 +        return writable(root);
  19.414 +}
  19.415 +
  19.416 +/**
  19.417 + * collapse: collapse a parent onto a child.
  19.418 + *   @height: height of the tree being collapsed
  19.419 + *   @proot:  root link of the parent, @croot: root link of the child
  19.420 + *   @return: number of parent blocks released, -1 if any step failed
  19.421 + *
  19.422 + * NOTE: This assumes that parent and child really are, and further that
  19.423 + * there are no other children forked from this parent. (children of the
  19.424 + * child are okay...)
  19.425 + */
  19.426 +int collapse(int height, u64 proot, u64 croot)
  19.427 +{
  19.428 +    int i, numlinks, ret, total = 0, failed = 0;
  19.429 +    radix_tree_node pnode, cnode;
  19.430 +
  19.431 +    if (height == 0) {
  19.432 +        height = -1; /* terminate recursion */
  19.433 +    } else {
  19.434 +        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  19.435 +    }
  19.436 +    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
  19.437 +
  19.438 +    /* Terminal cases: */
  19.439 +    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
  19.440 +        return -1;
  19.441 +
  19.442 +    /* get roots */
  19.443 +    if ((pnode = readblock(getid(proot))) == NULL)
  19.444 +        return -1;
  19.445 +
  19.446 +    if ((cnode = readblock(getid(croot))) == NULL)
  19.447 +    {
  19.448 +        freeblock(pnode);
  19.449 +        return -1;
  19.450 +    }
  19.451 +
  19.452 +    /* For each writable link in proot */
  19.453 +    for (i=0; i<numlinks; i++)
  19.454 +    {
  19.455 +        if ( pnode[i] == cnode[i] ) continue;
  19.456 +
  19.457 +        /* collapse (next level) */
  19.458 +        /* if height != 0 and writable... */
  19.459 +        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
  19.460 +        {
  19.461 +            ret = collapse(height, pnode[i], cnode[i]);
  19.462 +            if (ret == -1)
  19.463 +            {
  19.464 +                failed = 1; /* sticky: later successes must not mask it */
  19.465 +            } else {
  19.466 +                total += ret;
  19.467 +            }
  19.468 +        }
  19.469 +    }
  19.470 +
  19.471 +    /* both node buffers were only needed for the comparison above */
  19.472 +    freeblock(pnode);
  19.473 +    freeblock(cnode);
  19.474 +
  19.475 +    /* if plink is writable, AND clink is writable -> free plink block */
  19.476 +    if ( ( iswritable(proot) ) && ( iswritable(croot) ) )
  19.477 +    {
  19.478 +        releaseblock(getid(proot));
  19.479 +        total++;
  19.480 +    }
  19.481 +
  19.482 +    return failed ? -1 : total;
  19.483 +}
  19.484 +
  19.485 +/* print_root: write the tree under root as Graphviz dot (opens radix.dot if dot_f is NULL) */
  19.486 +void print_root(u64 root, int height, FILE *dot_f)
  19.487 +{
  19.488 +    FILE *f = dot_f; /* recursive calls write to the caller's stream */
  19.489 +    int i;
  19.490 +    radix_tree_node node = NULL;
  19.491 +    char *style[2] = { "", "style=bold,color=blue," };
  19.492 +
  19.493 +    if (dot_f == NULL) {
  19.494 +        f = fopen("radix.dot", "w");
  19.495 +        if (f == NULL) {
  19.496 +            perror("print_root: open");
  19.497 +            return;
  19.498 +        }
  19.499 +
  19.500 +        /* write graph preamble */
  19.501 +        fprintf(f, "digraph G {\n");
  19.502 +
  19.503 +        /* add a node for this root. */
  19.504 +        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
  19.505 +                getid(root), style[iswritable(root)], getid(root));
  19.506 +    }
  19.507 +
  19.508 +    printf("print_root(%Ld)\n", getid(root));
  19.509 +
  19.510 +    /* base case */
  19.511 +    if (height == 0) {
  19.512 +        /* add a node and edge for each child root */
  19.513 +        node = (radix_tree_node) readblock(getid(root));
  19.514 +        if (node == NULL)
  19.515 +            goto out;
  19.516 +
  19.517 +        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
  19.518 +            if (node[i] != ZERO) {
  19.519 +                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
  19.520 +                        getid(node[i]), style[iswritable(node[i])],
  19.521 +                        getid(node[i]));
  19.522 +                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
  19.523 +                        getid(node[i]), i);
  19.524 +            }
  19.525 +        }
  19.526 +        goto out;
  19.527 +    }
  19.528 +
  19.529 +    /* the root block may be smaller to ensure all leaves are full */
  19.530 +    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  19.531 +
  19.532 +    if (getid(root) == ZERO)
  19.533 +        goto out;
  19.534 +
  19.535 +    node = (radix_tree_node) readblock(getid(root));
  19.536 +    if (node == NULL)
  19.537 +        goto out;
  19.538 +
  19.539 +    /* add a node and edge for each child root */
  19.540 +    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
  19.541 +        if (node[i] != ZERO) {
  19.542 +            fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
  19.543 +                    getid(node[i]), style[iswritable(node[i])],
  19.544 +                    getid(node[i]));
  19.545 +
  19.546 +            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
  19.547 +            fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
  19.548 +                    getid(node[i]), i);
  19.549 +        }
  19.550 +
  19.551 + out:
  19.552 +    if (node != NULL)
  19.553 +        freeblock(node);
  19.554 +    /* write graph postamble; only the call that opened the file closes it */
  19.555 +    if (dot_f == NULL) {
  19.556 +        fprintf(f, "}\n");
  19.557 +        fclose(f);
  19.558 +    }
  19.559 +}
  19.560 +
  19.561 +#ifdef RADIX_STANDALONE
  19.562 +/* Interactive test shell for the radix tree; compiled only with RADIX_STANDALONE. */
  19.563 +int main(int argc, char **argv) {
  19.564 +    u64 key = ZERO, val = ZERO;
  19.565 +    u64 root = writable(2ULL);
  19.566 +    u64 p = ZERO, c = ZERO;
  19.567 +    int v;
  19.568 +    char buff[4096];
  19.569 +
  19.570 +    __init_blockstore();
  19.571 +
  19.572 +    memset(buff, 0, 4096);
  19.573 +    /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644);
  19.574 +
  19.575 +    if (fp < 3) {
  19.576 +        perror("open");
  19.577 +        return -1;
  19.578 +    }
  19.579 +    if (lseek(fp, 0, SEEK_END) == 0) {
  19.580 +        write(fp, buff, 4096);
  19.581 +    }*/
  19.582 +
  19.583 +    allocblock(buff);
  19.584 +
  19.585 +    printf("Recognized commands:\n"
  19.586 +           "Note: the LSB of a node number indicates if it is writable\n"
  19.587 +           "  root <node>               set root to <node>\n"
  19.588 +           "  snapshot                  take a snapshot of the root\n"
  19.589 +           "  set <key> <val>           set key=val\n"
  19.590 +           "  get <key>                 query key\n"
  19.591 +           "  c <proot> <croot>         collapse\n"
  19.592 +           "  pr <node>                 set root to <node>, print tree to dot\n"
  19.593 +           "  pf <1=verbose>            print freelist\n"
  19.594 +           "  quit\n"
  19.595 +           "\nroot = %Ld\n", root);
  19.596 +    for (;;) {
  19.597 +        //print_root(root, 34, NULL);
  19.598 +        //system("dot radix.dot -Tps -o radix.ps");
  19.599 +
  19.600 +        printf("> ");
  19.601 +        fflush(stdout);
  19.602 +        /* NULL means EOF or a read error; buff would be stale either way */
  19.603 +        if (fgets(buff, 1024, stdin) == NULL)
  19.604 +            break;
  19.605 +        if (sscanf(buff, " root %Ld", &root) == 1) {
  19.606 +            printf("root set to %Ld\n", root);
  19.607 +        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
  19.608 +            root = update(34, root, key, val);
  19.609 +            printf("root = %Ld\n", root);
  19.610 +        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
  19.611 +            v = collapse(34, p, c);
  19.612 +            printf("reclaimed %d blocks.\n", v);
  19.613 +        } else if (sscanf(buff, " get %Ld", &key) == 1) {
  19.614 +            val = lookup(34, root, key);
  19.615 +            printf("value = %Ld\n", val);
  19.616 +        } else if (!strcmp(buff, "quit\n")) {
  19.617 +            break;
  19.618 +        } else if (!strcmp(buff, "snapshot\n")) {
  19.619 +            root = snapshot(root);
  19.620 +            printf("new root = %Ld\n", root);
  19.621 +        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
  19.622 +            print_root(root, 34, NULL);
  19.623 +        } else if (sscanf(buff, " pf %d", &v) == 1) {
  19.624 +            freelist_count(v);
  19.625 +        } else if (!strcmp(buff, "pf\n")) {
  19.626 +            freelist_count(0);
  19.627 +        } else {
  19.628 +            printf("command not recognized\n");
  19.629 +        }
  19.630 +    }
  19.631 +    return 0;
  19.632 +}
  19.633 +
  19.634 +#endif
    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/tools/blktap/parallax/radix.h	Sun Jul 03 22:36:48 2005 +0000
    20.3 @@ -0,0 +1,45 @@
    20.4 +/*
    20.5 + * Radix tree for mapping (up to) 63-bit virtual block IDs to
    20.6 + * 63-bit global block IDs
    20.7 + *
    20.8 + * Pointers within the tree set aside the least significant bit to indicate
    20.9 + * whether or not the target block is writable from this node.
   20.10 + *
   20.11 + * The block with ID 0 is assumed to be an empty block of all zeros
   20.12 + */
   20.13 +
   20.14 +#ifndef __RADIX_H__
   20.15 +#define __RADIX_H__
   20.16 +
   20.17 +/* I don't really like exposing these, but... */
   20.18 +#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
   20.19 +#define putid(x) ((x)<<1)
   20.20 +#define writable(x) (((x)<<1)|1LL)
   20.21 +#define iswritable(x) ((x)&1LL)
   20.22 +#define ZERO 0LL
   20.23 +#define ONE 1LL
   20.24 +#define ONEMASK 0xffffffffffffffeLL
   20.25 +
   20.26 +#define RADIX_TREE_MAP_SHIFT 9
   20.27 +#define RADIX_TREE_MAP_MASK 0x1ff
   20.28 +#define RADIX_TREE_MAP_ENTRIES 512
   20.29 +
   20.30 +typedef u64 *radix_tree_node;
   20.31 +
   20.32 +
   20.33 +/*
   20.34 + * main api
   20.35 + * with these functions, the LSB of root always indicates
   20.36 + * whether or not the block is writable, including the return
   20.37 + * values of update and snapshot
   20.38 + */
   20.39 +u64 lookup(int height, u64 root, u64 key);
   20.40 +u64 update(int height, u64 root, u64 key, u64 val);
   20.41 +u64 snapshot(u64 root);
   20.42 +int collapse(int height, u64 proot, u64 croot);
   20.43 +int isprivate(int height, u64 root, u64 key);
   20.44 +
   20.45 +
   20.46 +void __rcache_init(void);
   20.47 +
   20.48 +#endif /* __RADIX_H__ */
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/tools/blktap/parallax/requests-async.c	Sun Jul 03 22:36:48 2005 +0000
    21.3 @@ -0,0 +1,762 @@
    21.4 +/* requests-async.c
    21.5 + *
    21.6 + * asynchronous request dispatcher for radix access in parallax.
    21.7 + */
    21.8 +
    21.9 +#include <stdio.h>
   21.10 +#include <stdlib.h>
   21.11 +#include <string.h>
   21.12 +#include <ctype.h>
   21.13 +#include <assert.h>
   21.14 +#include <pthread.h>
   21.15 +#include <err.h>
   21.16 +#include <zlib.h> /* for crc32() */
   21.17 +#include "requests-async.h"
   21.18 +#include "vdi.h"
   21.19 +#include "radix.h"
   21.20 +
   21.21 +#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
   21.22 +#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
   21.23 +#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
   21.24 +
   21.25 +
   21.26 +#if 0
   21.27 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   21.28 +#else
   21.29 +#define DPRINTF(_f, _a...) ((void)0)
   21.30 +#endif
   21.31 +
   21.32 +struct block_info {
   21.33 +    u32        crc;
   21.34 +    u32        unused;
   21.35 +};
   21.36 +
   21.37 +struct io_req {
   21.38 +    enum { IO_OP_READ, IO_OP_WRITE } op;
   21.39 +    u64        root;
   21.40 +    u64        vaddr;
   21.41 +    int        state;
   21.42 +    io_cb_t    cb;
   21.43 +    void      *param;
   21.44 +    struct radix_lock *lock;
   21.45 +
   21.46 +    /* internal stuff: */
   21.47 +    struct io_ret     retval;/* holds the return while we unlock. */
   21.48 +    char             *block; /* the block to write */
   21.49 +    radix_tree_node   radix[3];
   21.50 +    u64               radix_addr[3];
   21.51 +    struct block_info bi;
   21.52 +};
   21.53 +
   21.54 +void clear_w_bits(radix_tree_node node) 
   21.55 +{
   21.56 +    /* apply ONEMASK to every entry, dropping the writable bit */
   21.57 +    radix_tree_node end = node + RADIX_TREE_MAP_ENTRIES;
   21.58 +
   21.59 +    while (node < end)
   21.60 +        *node++ &= ONEMASK;
   21.61 +}
   21.61 +
   21.62 +void clear_L3_w_bits(radix_tree_node node) 
   21.63 +{
   21.64 +    int slot;
   21.65 +    /* only even slots are masked; odd slots carry block_info words */
   21.66 +    for (slot = 0; slot < RADIX_TREE_MAP_ENTRIES; slot += 2)
   21.67 +        node[slot] &= ONEMASK;
   21.68 +}
   21.69 +
   21.70 +enum states {
   21.71 +    /* both */
   21.72 +    READ_L1,
   21.73 +    READ_L2,
   21.74 +    READ_L3,
   21.75 +
   21.76 +    /* read */
   21.77 +    READ_LOCKED,
   21.78 +    READ_DATA,
   21.79 +    READ_UNLOCKED,
   21.80 +    RETURN_ZERO,
   21.81 +
   21.82 +    /* write */
   21.83 +    WRITE_LOCKED,
   21.84 +    WRITE_DATA,
   21.85 +    WRITE_L3,
   21.86 +    WRITE_UNLOCKED,
   21.87 +    
   21.88 +    /* L3 Zero Path */
   21.89 +    ALLOC_DATA_L3z,
   21.90 +    WRITE_L3_L3z,
   21.91 +    
   21.92 +    /* L3 Fault Path */
   21.93 +    ALLOC_DATA_L3f,
   21.94 +    WRITE_L3_L3f,
   21.95 +    
   21.96 +    /* L2 Zero Path */
   21.97 +    ALLOC_DATA_L2z,
   21.98 +    WRITE_L2_L2z,
   21.99 +    ALLOC_L3_L2z,
  21.100 +    WRITE_L2_L3z,
  21.101 +    
  21.102 +    /* L2 Fault Path */
  21.103 +    READ_L3_L2f,
  21.104 +    ALLOC_DATA_L2f,
  21.105 +    WRITE_L2_L2f,
  21.106 +    ALLOC_L3_L2f,
  21.107 +    WRITE_L2_L3f,
  21.108 +
  21.109 +    /* L1 Zero Path */
  21.110 +    ALLOC_DATA_L1z,
  21.111 +    ALLOC_L3_L1z,
  21.112 +    ALLOC_L2_L1z,
  21.113 +    WRITE_L1_L1z,
  21.114 +
  21.115 +    /* L1 Fault Path */
  21.116 +    READ_L2_L1f,
  21.117 +    READ_L3_L1f,
  21.118 +    ALLOC_DATA_L1f,
  21.119 +    ALLOC_L3_L1f,
  21.120 +    ALLOC_L2_L1f,
  21.121 +    WRITE_L1_L1f,
  21.122 +    
  21.123 +};
  21.124 +
  21.125 +enum radix_offsets {
  21.126 +    L1 = 0, 
  21.127 +    L2 = 1,
  21.128 +    L3 = 2
  21.129 +};
  21.130 +
  21.131 +
  21.132 +static void read_cb(struct io_ret ret, void *param);
  21.133 +static void write_cb(struct io_ret ret, void *param);
  21.134 +
  21.135 +int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
  21.136 +{
  21.137 +    struct io_req *req;
  21.138 +
  21.139 +    if (!VALID_VADDR(vaddr))
  21.140 +        return ERR_BAD_VADDR;
  21.141 +
  21.142 +    req = malloc(sizeof(*req));
  21.143 +    if (req == NULL)
  21.144 +        return ERR_NOMEM;
  21.145 +
  21.146 +    /* Every second line in the bottom-level radix tree is used to      */
  21.147 +    /* store crc32 values etc. We shift the vaddr here to achieve this. */
  21.148 +    req->vaddr = vaddr << 1;
  21.149 +    req->op    = IO_OP_READ;
  21.150 +    req->root  = vdi->radix_root;
  21.151 +    req->lock  = vdi->radix_lock;
  21.152 +    req->cb    = cb;
  21.153 +    req->param = param;
  21.154 +    req->state = READ_LOCKED;
  21.155 +    req->radix[L1] = req->radix[L2] = req->radix[L3] = NULL;
  21.156 +
  21.157 +    block_rlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  21.158 +    return 0;
  21.159 +}
  21.160 +
  21.161 +
  21.162 +int   vdi_write(vdi_t *vdi, u64 vaddr, char *block, 
  21.163 +                io_cb_t cb, void *param)
  21.164 +{
  21.165 +    struct io_req *req;
  21.166 +    u32 crc;
  21.167 +
  21.168 +    if (!VALID_VADDR(vaddr))
  21.169 +        return ERR_BAD_VADDR;
  21.170 +
  21.171 +    req = malloc(sizeof(*req));
  21.172 +    if (req == NULL)
  21.173 +        return ERR_NOMEM;
  21.174 +
  21.175 +    /* Every second line in the bottom-level radix tree is used to      */
  21.176 +    /* store crc32 values etc. We shift the vaddr here to achieve this. */
  21.177 +    req->vaddr  = vaddr << 1;
  21.178 +    req->op     = IO_OP_WRITE;
  21.179 +    req->root   = vdi->radix_root;
  21.180 +    req->lock   = vdi->radix_lock;
  21.181 +    req->block  = block;
  21.182 +    req->cb     = cb;
  21.183 +    req->param  = param;
  21.184 +    req->state  = WRITE_LOCKED;
  21.185 +    req->radix[L1] = req->radix[L2] = req->radix[L3] = NULL;
  21.186 +    req->radix_addr[L1] = getid(req->root); /* for consistency */
  21.187 +
  21.188 +    /* Todo: add a pseudoheader to the block to include some location   */
  21.189 +    /* information in the CRC as well.                                  */
  21.190 +    crc = (u32) crc32(0L, Z_NULL, 0);
  21.191 +    crc = (u32) crc32(crc, block, BLOCK_SIZE);
  21.192 +    req->bi.crc    = crc;
  21.193 +    req->bi.unused = 0xdeadbeef;
  21.194 +
  21.195 +    block_wlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
  21.196 +    return 0;
  21.197 +}
  21.197 +/* read_cb: I/O and lock callback that steps a vdi_read() request through req->state */
  21.198 +static void read_cb(struct io_ret ret, void *param)
  21.199 +{
  21.200 +    struct io_req *req = (struct io_req *)param;
  21.201 +    radix_tree_node node;
  21.202 +    u64 idx;
  21.203 +    char *block;
  21.204 +    void *req_param;
  21.205 +
  21.206 +    DPRINTF("read_cb\n");
  21.207 +    /* get record */
  21.208 +    switch(req->state) {
  21.209 +
  21.210 +    case READ_LOCKED:
  21.211 +
  21.212 +        DPRINTF("READ_LOCKED\n");
  21.213 +        req->state = READ_L1;
  21.214 +        block_read(getid(req->root), read_cb, req);
  21.215 +        break;
  21.216 +
  21.217 +    case READ_L1: /* block is the radix root */
  21.218 +
  21.219 +        DPRINTF("READ_L1\n");
  21.220 +        block = IO_BLOCK(ret);
  21.221 +        if (block == NULL) goto fail;
  21.222 +        node = (radix_tree_node) block;
  21.223 +        idx  = getid( node[L1_IDX(req->vaddr)] );
  21.224 +        free(block);
  21.225 +        if ( idx == ZERO ) {
  21.226 +            req->state = RETURN_ZERO;
  21.227 +            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  21.228 +        } else {
  21.229 +            req->state = READ_L2;
  21.230 +            block_read(idx, read_cb, req);
  21.231 +        }
  21.232 +        break;
  21.233 +
  21.234 +    case READ_L2:
  21.235 +
  21.236 +        DPRINTF("READ_L2\n");
  21.237 +        block = IO_BLOCK(ret);
  21.238 +        if (block == NULL) goto fail;
  21.239 +        node = (radix_tree_node) block;
  21.240 +        idx  = getid( node[L2_IDX(req->vaddr)] );
  21.241 +        free(block);
  21.242 +        if ( idx == ZERO ) {
  21.243 +            req->state = RETURN_ZERO;
  21.244 +            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  21.245 +        } else {
  21.246 +            req->state = READ_L3;
  21.247 +            block_read(idx, read_cb, req);
  21.248 +        }
  21.249 +        break;
  21.250 +
  21.251 +    case READ_L3:
  21.252 +    {
  21.253 +        struct block_info *bi;
  21.254 +
  21.255 +        DPRINTF("READ_L3\n");
  21.256 +        block = IO_BLOCK(ret);
  21.257 +        if (block == NULL) goto fail;
  21.258 +        node = (radix_tree_node) block;
  21.259 +        idx  = getid( node[L3_IDX(req->vaddr)] );
  21.260 +        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
  21.261 +        req->bi = *bi; /* keep the stored crc for the READ_DATA check */
  21.262 +        free(block);
  21.263 +        if ( idx == ZERO )  {
  21.264 +            req->state = RETURN_ZERO;
  21.265 +            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  21.266 +        } else {
  21.267 +            req->state = READ_DATA;
  21.268 +            block_read(idx, read_cb, req);
  21.269 +        }
  21.270 +        break;
  21.271 +    }
  21.272 +    case READ_DATA:
  21.273 +    {
  21.274 +        u32 crc;
  21.275 +
  21.276 +        DPRINTF("READ_DATA\n");
  21.277 +        block = IO_BLOCK(ret);
  21.278 +        if (block == NULL) goto fail;
  21.279 +
  21.280 +        /* crc check */
  21.281 +        crc = (u32) crc32(0L, Z_NULL, 0);
  21.282 +        crc = (u32) crc32(crc, block, BLOCK_SIZE);
  21.283 +        if (crc != req->bi.crc) {
  21.284 +            /* TODO: add a retry loop here.                          */
  21.285 +            /* Do this after the cache is added -- make sure to      */
  21.286 +            /* invalidate the bad page before reissuing the read.    */
  21.287 +
  21.288 +            warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused);
  21.289 +#ifdef PRINT_BADCRC_PAGES
  21.290 +            {
  21.291 +                int j;
  21.292 +                for (j=0; j<BLOCK_SIZE; j++) {
  21.293 +                    if (isprint((unsigned char)block[j])) {
  21.294 +                        printf("%c", block[j]);
  21.295 +                    } else {
  21.296 +                        printf(".");
  21.297 +                    }
  21.298 +                    if ((j % 64) == 0) printf("\n");
  21.299 +                }
  21.300 +            }
  21.301 +#endif /* PRINT_BADCRC_PAGES */
  21.302 +
  21.303 +            /* fast and loose for the moment. */
  21.304 +            /* goto fail;                     */
  21.305 +        }
  21.306 +
  21.307 +        req->retval = ret;
  21.308 +        req->state = READ_UNLOCKED;
  21.309 +        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
  21.310 +        break;
  21.311 +    }
  21.312 +    case READ_UNLOCKED:
  21.313 +    {
  21.314 +        struct io_ret r;
  21.315 +        io_cb_t cb;
  21.316 +        DPRINTF("READ_UNLOCKED\n");
  21.317 +        req_param = req->param;
  21.318 +        r         = req->retval;
  21.319 +        cb        = req->cb;
  21.320 +        free(req);
  21.321 +        cb(r, req_param);
  21.322 +        break;
  21.323 +    }
  21.324 +
  21.325 +    case RETURN_ZERO:
  21.326 +    {
  21.327 +        struct io_ret r;
  21.328 +        io_cb_t cb;
  21.329 +        DPRINTF("RETURN_ZERO\n");
  21.330 +        req_param = req->param;
  21.331 +        cb        = req->cb;
  21.332 +        free(req);
  21.333 +        r.type = IO_BLOCK_T;
  21.334 +        r.u.b = newblock();
  21.335 +        cb(r, req_param);
  21.336 +        break;
  21.337 +    }
  21.338 +
  21.339 +    default:
  21.340 +        DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
  21.341 +        goto fail;
  21.342 +    }
  21.343 +
  21.344 +    return;
  21.345 +    /* NOTE(review): the fail path never calls block_runlock() -- confirm the lock is not leaked */
  21.346 + fail:
  21.347 +    {
  21.348 +        struct io_ret r;
  21.349 +        io_cb_t cb;
  21.350 +        DPRINTF("asyn_read had a read error.\n");
  21.351 +        req_param = req->param;
  21.352 +        r         = ret;
  21.353 +        cb        = req->cb;
  21.354 +        free(req);
  21.355 +        cb(r, req_param);
  21.356 +    }
  21.357 +
  21.358 +
  21.359 +}
  21.360 +
  21.361 +static void write_cb(struct io_ret r, void *param)
  21.362 +{
  21.363 +    struct io_req *req = (struct io_req *)param;
  21.364 +    radix_tree_node node;
  21.365 +    u64 a, addr;
  21.366 +    void *req_param;
  21.367 +    struct block_info *bi;
  21.368 +
  21.369 +    switch(req->state) {
  21.370 +    	
  21.371 +    case WRITE_LOCKED:
  21.372 +        
  21.373 +        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
  21.374 +    	req->state = READ_L1;
  21.375 +    	block_read(getid(req->root), write_cb, req); 
  21.376 +    	break;
  21.377 +    	
  21.378 +    case READ_L1: /* block is the radix root */
  21.379 +
  21.380 +        DPRINTF("READ_L1\n");
  21.381 +        node = (radix_tree_node) IO_BLOCK(r);
  21.382 +        if (node == NULL) goto fail;
  21.383 +        a    = node[L1_IDX(req->vaddr)];
  21.384 +        addr = getid(a);
  21.385 +
  21.386 +        req->radix_addr[L2] = addr;
  21.387 +        req->radix[L1] = node;
  21.388 +
  21.389 +        if ( addr == ZERO ) {
  21.390 +            /* L1 empty subtree: */
  21.391 +            req->state = ALLOC_DATA_L1z;
  21.392 +            block_alloc( req->block, write_cb, req );
  21.393 +        } else if ( !iswritable(a) ) {
  21.394 +            /* L1 fault: */
  21.395 +            req->state = READ_L2_L1f;
  21.396 +            block_read( addr, write_cb, req );
  21.397 +        } else {
  21.398 +            req->state = READ_L2;
  21.399 +            block_read( addr, write_cb, req );
  21.400 +        }
  21.401 +        break;
  21.402 +    
  21.403 +    case READ_L2:
  21.404 +
  21.405 +        DPRINTF("READ_L2\n");
  21.406 +        node = (radix_tree_node) IO_BLOCK(r);
  21.407 +        if (node == NULL) goto fail;
  21.408 +        a    = node[L2_IDX(req->vaddr)];
  21.409 +        addr = getid(a);
  21.410 +
  21.411 +        req->radix_addr[L3] = addr;
  21.412 +        req->radix[L2] = node;
  21.413 +
  21.414 +        if ( addr == ZERO ) {
  21.415 +            /* L2 empty subtree: */
  21.416 +            req->state = ALLOC_DATA_L2z;
  21.417 +            block_alloc( req->block, write_cb, req );
  21.418 +        } else if ( !iswritable(a) ) {
  21.419 +            /* L2 fault: */
  21.420 +            req->state = READ_L3_L2f;
  21.421 +            block_read( addr, write_cb, req );
  21.422 +        } else {
  21.423 +            req->state = READ_L3;
  21.424 +            block_read( addr, write_cb, req );
  21.425 +        }
  21.426 +        break;
  21.427 +    
  21.428 +    case READ_L3:
  21.429 +
  21.430 +        DPRINTF("READ_L3\n");
  21.431 +        node = (radix_tree_node) IO_BLOCK(r);
  21.432 +        if (node == NULL) goto fail;
  21.433 +        a    = node[L3_IDX(req->vaddr)];
  21.434 +        addr = getid(a);
  21.435 +
  21.436 +        req->radix[L3] = node;
  21.437 +
  21.438 +        if ( addr == ZERO ) {
  21.439 +            /* L3 fault: */
  21.440 +            req->state = ALLOC_DATA_L3z;
  21.441 +            block_alloc( req->block, write_cb, req );
  21.442 +        } else if ( !iswritable(a) ) {
  21.443 +            /* L3 fault: */
  21.444 +            req->state = ALLOC_DATA_L3f;
  21.445 +            block_alloc( req->block, write_cb, req );
  21.446 +        } else {
  21.447 +            req->state = WRITE_DATA;
  21.448 +            block_write( addr, req->block, write_cb, req );
  21.449 +        }
  21.450 +        break;
  21.451 +    
  21.452 +    case WRITE_DATA:
  21.453 +
  21.454 +        DPRINTF("WRITE_DATA\n");
  21.455 +        /* The L3 radix points to the correct block, we just need to  */
  21.456 +        /* update the crc.                                            */
  21.457 +        if (IO_INT(r) < 0) goto fail;
  21.458 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.459 +        req->bi.unused = 101;
  21.460 +        *bi = req->bi;
  21.461 +        req->state = WRITE_L3;
  21.462 +        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
  21.463 +        break;
  21.464 +    
  21.465 +    /* L3 Zero Path: */
  21.466 +
  21.467 +    case ALLOC_DATA_L3z:
  21.468 +
  21.469 +        DPRINTF("ALLOC_DATA_L3z\n");
  21.470 +        addr = IO_ADDR(r);
  21.471 +        a = writable(addr);
  21.472 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  21.473 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.474 +        req->bi.unused = 102;
  21.475 +        *bi = req->bi;
  21.476 +        req->state = WRITE_L3_L3z;
  21.477 +        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
  21.478 +        break;
  21.479 +    
  21.480 +    /* L3 Fault Path: */
  21.481 +
  21.482 +    case ALLOC_DATA_L3f:
  21.483 +    
  21.484 +        DPRINTF("ALLOC_DATA_L3f\n");
  21.485 +        addr = IO_ADDR(r);
  21.486 +        a = writable(addr);
  21.487 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  21.488 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.489 +        req->bi.unused = 103;
  21.490 +        *bi = req->bi;
  21.491 +        req->state = WRITE_L3_L3f;
  21.492 +        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
  21.493 +        break;
  21.494 +
  21.495 +    /* L2 Zero Path: */
  21.496 +        
  21.497 +    case ALLOC_DATA_L2z:
  21.498 +
  21.499 +        DPRINTF("ALLOC_DATA_L2z\n");
  21.500 +        addr = IO_ADDR(r);
  21.501 +        a = writable(addr);
  21.502 +        req->radix[L3] = newblock();
  21.503 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  21.504 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.505 +        req->bi.unused = 104;
  21.506 +        *bi = req->bi;
  21.507 +        req->state = ALLOC_L3_L2z;
  21.508 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  21.509 +        break;
  21.510 +
  21.511 +    case ALLOC_L3_L2z:
  21.512 +
  21.513 +        DPRINTF("ALLOC_L3_L2z\n");
  21.514 +        addr = IO_ADDR(r);
  21.515 +        a = writable(addr);
  21.516 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  21.517 +        req->state = WRITE_L2_L2z;
  21.518 +        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
  21.519 +        break;
  21.520 +        
  21.521 +    /* L2 Fault Path: */
  21.522 +        
  21.523 +    case READ_L3_L2f:
  21.524 +    
  21.525 +    	DPRINTF("READ_L3_L2f\n");
  21.526 +        node = (radix_tree_node) IO_BLOCK(r);
  21.527 +        clear_L3_w_bits(node);
  21.528 +        if (node == NULL) goto fail;
  21.529 +        a    = node[L2_IDX(req->vaddr)];
  21.530 +        addr = getid(a);
  21.531 +
  21.532 +        req->radix[L3] = node;
  21.533 +        req->state = ALLOC_DATA_L2f;
  21.534 +        block_alloc( req->block, write_cb, req );
  21.535 +        break;
  21.536 +                
  21.537 +    case ALLOC_DATA_L2f:
  21.538 +
  21.539 +        DPRINTF("ALLOC_DATA_L2f\n");
  21.540 +        addr = IO_ADDR(r);
  21.541 +        a = writable(addr);
  21.542 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  21.543 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.544 +        req->bi.unused = 105;
  21.545 +        *bi = req->bi;
  21.546 +        req->state = ALLOC_L3_L2f;
  21.547 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  21.548 +        break;
  21.549 +
  21.550 +    case ALLOC_L3_L2f:
  21.551 +
  21.552 +        DPRINTF("ALLOC_L3_L2f\n");
  21.553 +        addr = IO_ADDR(r);
  21.554 +        a = writable(addr);
  21.555 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  21.556 +        req->state = WRITE_L2_L2f;
  21.557 +        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
  21.558 +        break;
  21.559 +        
  21.560 +    /* L1 Zero Path: */
  21.561 +    
  21.562 +    case ALLOC_DATA_L1z:
  21.563 +
  21.564 +        DPRINTF("ALLOC_DATA_L1z\n");
  21.565 +        addr = IO_ADDR(r);
  21.566 +        a = writable(addr);
  21.567 +        req->radix[L3] = newblock();
  21.568 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  21.569 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.570 +        req->bi.unused = 106;
  21.571 +        *bi = req->bi;
  21.572 +        req->state = ALLOC_L3_L1z;
  21.573 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  21.574 +        break;
  21.575 +        
  21.576 +    case ALLOC_L3_L1z:
  21.577 +
  21.578 +        DPRINTF("ALLOC_L3_L1z\n");
  21.579 +        addr = IO_ADDR(r);
  21.580 +        a = writable(addr);
  21.581 +        req->radix[L2] = newblock();
  21.582 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  21.583 +        req->state = ALLOC_L2_L1z;
  21.584 +        block_alloc( (char*)req->radix[L2], write_cb, req );
  21.585 +        break;
  21.586 +
  21.587 +    case ALLOC_L2_L1z:
  21.588 +
  21.589 +        DPRINTF("ALLOC_L2_L1z\n");
  21.590 +        addr = IO_ADDR(r);
  21.591 +        a = writable(addr);
  21.592 +        req->radix[L1][L1_IDX(req->vaddr)] = a;
  21.593 +        req->state = WRITE_L1_L1z;
  21.594 +        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
  21.595 +        break;
  21.596 +
  21.597 +    /* L1 Fault Path: */
  21.598 +        
  21.599 +    case READ_L2_L1f:
  21.600 +    
  21.601 +    	DPRINTF("READ_L2_L1f\n");
  21.602 +        node = (radix_tree_node) IO_BLOCK(r);
  21.603 +        clear_w_bits(node);
  21.604 +        if (node == NULL) goto fail;
  21.605 +        a    = node[L2_IDX(req->vaddr)];
  21.606 +        addr = getid(a);
  21.607 +
  21.608 +        req->radix_addr[L3] = addr;
  21.609 +        req->radix[L2] = node;
  21.610 +        
  21.611 +        if (addr == ZERO) {
  21.612 +            /* nothing below L2, create an empty L3 and alloc data. */
  21.613 +            /* (So skip READ_L3_L1f.) */
  21.614 +            req->radix[L3] = newblock();
  21.615 +            req->state = ALLOC_DATA_L1f;
  21.616 +            block_alloc( req->block, write_cb, req );
  21.617 +        } else {
  21.618 +            req->state = READ_L3_L1f;
  21.619 +            block_read( addr, write_cb, req );
  21.620 +        }
  21.621 +        break;
  21.622 +        
  21.623 +    case READ_L3_L1f:
  21.624 +    
  21.625 +    	DPRINTF("READ_L3_L1f\n");
  21.626 +        node = (radix_tree_node) IO_BLOCK(r);
  21.627 +        clear_L3_w_bits(node);
  21.628 +        if (node == NULL) goto fail;
  21.629 +        a    = node[L2_IDX(req->vaddr)];
  21.630 +        addr = getid(a);
  21.631 +
  21.632 +        req->radix[L3] = node;
  21.633 +        req->state = ALLOC_DATA_L1f;
  21.634 +        block_alloc( req->block, write_cb, req );
  21.635 +        break;
  21.636 +                
  21.637 +    case ALLOC_DATA_L1f:
  21.638 +
  21.639 +        DPRINTF("ALLOC_DATA_L1f\n");
  21.640 +        addr = IO_ADDR(r);
  21.641 +        a = writable(addr);
  21.642 +        req->radix[L3][L3_IDX(req->vaddr)] = a;
  21.643 +        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
  21.644 +        req->bi.unused = 107;
  21.645 +        *bi = req->bi;
  21.646 +        req->state = ALLOC_L3_L1f;
  21.647 +        block_alloc( (char*)req->radix[L3], write_cb, req );
  21.648 +        break;
  21.649 +
  21.650 +    case ALLOC_L3_L1f:
  21.651 +
  21.652 +        DPRINTF("ALLOC_L3_L1f\n");
  21.653 +        addr = IO_ADDR(r);
  21.654 +        a = writable(addr);
  21.655 +        req->radix[L2][L2_IDX(req->vaddr)] = a;
  21.656 +        req->state = ALLOC_L2_L1f;
  21.657 +        block_alloc( (char*)req->radix[L2], write_cb, req );
  21.658 +        break;
  21.659 +
  21.660 +    case ALLOC_L2_L1f:
  21.661 +
  21.662 +        DPRINTF("ALLOC_L2_L1f\n");
  21.663 +        addr = IO_ADDR(r);
  21.664 +        a = writable(addr);
  21.665 +        req->radix[L1][L1_IDX(req->vaddr)] = a;
  21.666 +        req->state = WRITE_L1_L1f;
  21.667 +        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
  21.668 +        break;
  21.669 +
  21.670 +    case WRITE_L3:
  21.671 +    case WRITE_L3_L3z:
  21.672 +    case WRITE_L3_L3f:
  21.673 +    case WRITE_L2_L2z:
  21.674 +    case WRITE_L2_L2f:
  21.675 +    case WRITE_L1_L1z:
  21.676 +    case WRITE_L1_L1f:
  21.677 +    {
  21.678 +    	int i;
  21.679 +        DPRINTF("DONE\n");
  21.680 +        /* free any saved node vals. */
  21.681 +        for (i=0; i<3; i++)
  21.682 +            if (req->radix[i] != 0) free(req->radix[i]);
  21.683 +        req->retval = r;
  21.684 +        req->state = WRITE_UNLOCKED;
  21.685 +        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
  21.686 +        break;
  21.687 +    }
  21.688 +    case WRITE_UNLOCKED:
  21.689 +    {
  21.690 +        struct io_ret r;
  21.691 +        io_cb_t cb;
  21.692 +        DPRINTF("WRITE_UNLOCKED!\n");
  21.693 +        req_param = req->param;
  21.694 +        r         = req->retval;
  21.695 +        cb        = req->cb;
  21.696 +        free(req);
  21.697 +        cb(r, req_param);
  21.698 +        break;
  21.699 +    }
  21.700 +        
  21.701 +    default:
  21.702 +    	DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
  21.703 +    	goto fail;
  21.704 +    }
  21.705 +    
  21.706 +    return;
  21.707 +    
  21.708 + fail:
  21.709 +    {
  21.710 +        struct io_ret r;
  21.711 +        io_cb_t cb;
  21.712 +        int i;
  21.713 +
  21.714 +        DPRINTF("asyn_write had a read error mid-way.\n");
  21.715 +        req_param = req->param;
  21.716 +        cb        = req->cb;
  21.717 +        r.type = IO_INT_T;
  21.718 +        r.u.i  = -1;
  21.719 +        /* free any saved node vals. */
  21.720 +        for (i=0; i<3; i++)
  21.721 +            if (req->radix[i] != 0) free(req->radix[i]);
  21.722 +        free(req);
  21.723 +        cb(r, req_param);
  21.724 +    }
  21.725 +}
  21.726 +/* Synchronous form of vdi_read(): issue the request, wait for its callback, and return the block pointer it delivered. */
  21.727 +char *vdi_read_s(vdi_t *vdi, u64 vaddr)
  21.728 +{
  21.729 +    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
  21.730 +    char *block = NULL;
  21.731 +    int ret;
  21.732 +    /* Nested function (GCC extension): writes the enclosing 'block' and releases the mutex passed as param. */
  21.733 +    void reads_cb(struct io_ret r, void *param) 
  21.734 +    {
  21.735 +        block = IO_BLOCK(r);
  21.736 +        pthread_mutex_unlock((pthread_mutex_t *)param);
  21.737 +    }
  21.738 +    /* The mutex doubles as a completion signal: it is taken here and released by reads_cb. */
  21.739 +    pthread_mutex_lock(&m);
  21.740 +    ret = vdi_read(vdi, vaddr, reads_cb, &m);
  21.741 +    /* Wait only if the request was accepted (ret == 0); otherwise block stays NULL unless the callback already ran. */
  21.742 +    if (ret == 0) pthread_mutex_lock(&m);
  21.743 +    /* NOTE(review): relies on a second lock of an already-held default mutex blocking until the callback unlocks it -- confirm this is safe on the target pthreads implementation. */
  21.744 +    return block;
  21.745 +}
  21.746 +
  21.747 +/* Synchronous form of vdi_write(): returns the callback's integer result, or
  21.748 + * vdi_write()'s own non-zero return value if the request is rejected up front. */
  21.749 +int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
  21.750 +{
  21.751 +    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
  21.752 +    int ret, result;
  21.753 +    /* Nested function (GCC extension): records the result and wakes the waiter. */
  21.754 +    void writes_cb(struct io_ret r, void *param) 
  21.755 +    {
  21.756 +        result = IO_INT(r);
  21.757 +        pthread_mutex_unlock((pthread_mutex_t *)param);
  21.758 +    }
  21.759 +
  21.760 +    pthread_mutex_lock(&m);   /* released by writes_cb on completion */
  21.761 +    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
  21.762 +    if (ret != 0) return ret; /* request rejected: 'result' may never have been set */
  21.763 +    pthread_mutex_lock(&m);   /* blocks until writes_cb has stored the result */
  21.764 +    return result;
  21.765 +}
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/tools/blktap/parallax/requests-async.h	Sun Jul 03 22:36:48 2005 +0000
    22.3 @@ -0,0 +1,29 @@
    22.4 +#ifndef _REQUESTSASYNC_H_
    22.5 +#define _REQUESTSASYNC_H_
    22.6 +
    22.7 +#include "block-async.h"
    22.8 +#include "blockstore.h" /* for newblock etc. */
    22.9 +
   22.10 +/*
   22.11 +#define BLOCK_SIZE 4096
   22.12 +#define ZERO 0ULL
   22.13 +#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
   22.14 +#define iswritable(x) (((x) & 1LLU) != 0)
   22.15 +#define writable(x) (((x) << 1) | 1LLU)
   22.16 +#define readonly(x) ((u64)((x) << 1))
   22.17 +*/
   22.18 +/* Virtual block addresses are limited to the bits covered by VADDR_MASK. */
   22.19 +#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
   22.20 +#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
   22.21 +/* asynchronous versions: cb is invoked with the result and the caller's param. */
   22.22 +int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
   22.23 +int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
   22.24 +             
   22.25 +/* synchronous versions: */
   22.26 +char *vdi_read_s (vdi_t *vdi, u64 vaddr);
   22.27 +int   vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
   22.28 +/* NOTE(review): presumably the non-zero return codes of vdi_read/vdi_write -- confirm in requests-async.c. */
   22.29 +#define ERR_BAD_VADDR  -1
   22.30 +#define ERR_NOMEM      -2
   22.31 +
   22.32 +#endif //_REQUESTSASYNC_H_
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/tools/blktap/parallax/snaplog.c	Sun Jul 03 22:36:48 2005 +0000
    23.3 @@ -0,0 +1,238 @@
    23.4 +/**************************************************************************
    23.5 + * 
    23.6 + * snaplog.c
    23.7 + *
    23.8 + * Snapshot log on-disk data structure.
    23.9 + *
   23.10 + */
   23.11 + 
   23.12 + /* VDI histories are made from chains of snapshot logs.  These logs record 
   23.13 +  * the (radix) root and timestamp of individual snapshots.
   23.14 +  *
   23.15 +  * creation of a new VDI involves 'forking' a snapshot log, by creating a 
   23.16 +  * new, empty log (in a new VDI) and parenting it off of a record in an 
   23.17 +  * existing snapshot log.
   23.18 +  *
   23.19 +  * snapshot log blocks have at most one writer.
   23.20 +  */
   23.21 +
   23.22 +#include <stdio.h>
   23.23 +#include <stdlib.h>
   23.24 +#include <sys/time.h>
   23.25 +#include "blockstore.h"
   23.26 +#include "snaplog.h"
   23.27 +
   23.28 +
   23.29 +/* Read a snapshot-log block; returns NULL if the read fails or the magic number does not match. */
   23.30 +snap_block_t *snap_get_block(u64 block)
   23.31 +{
   23.32 +    snap_block_t *blk = (snap_block_t *)readblock(block);
   23.33 +    
   23.34 +    if ( blk == NULL)
   23.35 +        return NULL;
   23.36 +    if ( blk->hdr.magic != SNAP_MAGIC ) {
   23.37 +        freeblock(blk);
   23.38 +        return NULL;
   23.39 +    }
   23.40 +    /* On success the caller owns blk (callers in this file release it with freeblock()). */
   23.41 +    return blk;
   23.42 +}
   23.43 +/* Copy the snapshot record named by *id into *target. Returns 0 on success, -1 if id is NULL, the block is unreadable, or the index is out of range. */
   23.44 +int snap_get_id(snap_id_t *id, snap_rec_t *target)
   23.45 +{
   23.46 +    snap_block_t *blk;
   23.47 +    
   23.48 +    if ( id == NULL )
   23.49 +        return -1;
   23.50 +    
   23.51 +    blk = snap_get_block(id->block);
   23.52 +    
   23.53 +    if ( blk == NULL ) 
   23.54 +        return -1;
   23.55 +    /* NOTE(review): '>' accepts index == nr_entries, i.e. the not-yet-written slot (and one past snaps[] when the block is full) -- confirm whether '>=' was intended. */
   23.56 +    if ( id->index > blk->hdr.nr_entries ) {
   23.57 +        freeblock(blk);
   23.58 +        return -1;
   23.59 +    }
   23.60 +    
   23.61 +    *target = blk->snaps[id->index];
   23.62 +    freeblock(blk);
   23.63 +    return 0;
   23.64 +}
   23.65 +/* Create a new, empty snapshot-log block and return its id in *new_id.
   23.66 + * parent_id may be NULL (a log with no history); otherwise the header records
   23.67 + * *parent_id and *fork_id, so fork_id must be non-NULL.
   23.68 + * Returns 0 on success, -1 if a block cannot be allocated or read. */
   23.69 +int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
   23.70 +                                  snap_id_t *new_id)
   23.71 +{
   23.72 +    snap_block_t *blk, *pblk;
   23.73 +    
   23.74 +    blk = (snap_block_t *)newblock();
   23.75 +    if ( blk == NULL )
   23.76 +        return -1;
   23.77 +    blk->hdr.magic  = SNAP_MAGIC;
   23.78 +    blk->hdr.nr_entries  = 0;
   23.79 +    blk->hdr.log_entries = 0;
   23.80 +    blk->hdr.immutable   = 0;
   23.81 +    
   23.82 +    /* Parent in a different block than the fork point: inherit its entry count. */
   23.83 +    if (   (parent_id  != NULL) 
   23.84 +        && (parent_id->block != fork_id->block) 
   23.85 +        && (parent_id->block != 0)) {
   23.86 +        pblk = snap_get_block(parent_id->block);
   23.87 +        if ( pblk == NULL ) {
   23.88 +            freeblock(blk);
   23.89 +            return -1;
   23.90 +        }
   23.91 +        blk->hdr.log_entries = pblk->hdr.log_entries;
   23.92 +        freeblock(pblk);
   23.93 +    }
   23.94 +    
   23.95 +    if (parent_id != NULL) {
   23.96 +        blk->hdr.parent_block = *parent_id;
   23.97 +        blk->hdr.fork_block   = *fork_id;
   23.98 +    } else {
   23.99 +        blk->hdr.parent_block = null_snap_id;
  23.100 +        blk->hdr.fork_block   = null_snap_id;
  23.101 +    }
  23.102 +    
  23.103 +    new_id->index = 0;
  23.104 +    new_id->block = allocblock(blk);
  23.105 +    freeblock(blk);
  23.106 +    /* a block id of 0 means the allocation failed */
  23.107 +    return (new_id->block == 0) ? -1 : 0;
  23.108 +}
  23.109 +/* Fork a new snapshot log off the record *parent_id (parent and fork point are the same id); parent_id may be NULL. */
  23.110 +int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
  23.111 +{
  23.112 +    return __snap_block_create(parent_id, parent_id, new_id);
  23.113 +}
  23.114 +/* Append *rec at the log position *old_id and store the next free slot in
  23.115 + * *new_id (the two may alias).  A full block is sealed as immutable and the
  23.116 + * log continues in a newly chained block.  Returns 0 on success, else -1. */
  23.117 +int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
  23.118 +{
  23.119 +    snap_id_t id = *old_id;
  23.120 +    snap_block_t *blk;
  23.121 +    
  23.122 +    if ( rec->deleted == 1 ) {
  23.123 +        printf("Attempt to append a deleted snapshot!\n");
  23.124 +        return -1;
  23.125 +    }
  23.126 +    
  23.127 +    blk = snap_get_block(id.block);
  23.128 +    if ( blk == NULL ) return -1;   /* unreadable or not a snap block */
  23.129 +    if ( blk->hdr.immutable != 0 ) {
  23.130 +        printf("Attempt to snap an immutable snap block!\n");
  23.131 +        freeblock(blk);             /* do not leak the block on this error path */
  23.132 +        return -1;
  23.133 +    }
  23.134 +    
  23.135 +    new_id->block = id.block;
  23.136 +    
  23.137 +    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
  23.138 +        id.index--; /* make id point to the last full record */
  23.139 +        if ( __snap_block_create(&id, &blk->hdr.fork_block, new_id) != 0 ) {
  23.140 +            freeblock(blk);
  23.141 +            return -1;
  23.142 +        }
  23.143 +        /* seal the full block, then switch over to the newly created one */
  23.144 +        blk->hdr.immutable = 1;
  23.145 +        writeblock(id.block, blk);
  23.146 +        freeblock(blk);
  23.147 +        blk = snap_get_block(new_id->block);
  23.148 +        if ( blk == NULL ) return -1;
  23.149 +        id = *new_id;
  23.150 +    }
  23.151 +    
  23.152 +    blk->snaps[blk->hdr.nr_entries] = *rec;
  23.153 +    blk->hdr.nr_entries++;
  23.154 +    blk->hdr.log_entries++;
  23.155 +    new_id->index = blk->hdr.nr_entries;
  23.156 +    writeblock(id.block, blk);
  23.157 +    freeblock(blk);
  23.158 +    return 0;
  23.159 +}
  23.159 +/* Collapse parent snapshot *p_id into its child *c_id: mark the parent record
  23.160 + * deleted on disk, then hand both radix roots to collapse().  Only consecutive
  23.161 + * records in the same log block are supported.  Returns collapse()'s result
  23.162 + * (per the original note, the number of blocks reclaimed), or -1 on error. */
  23.163 +int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
  23.164 +{
  23.165 +    snap_block_t *p_blk, *c_blk;
  23.166 +    snap_rec_t   *p_rec, *c_rec;
  23.167 +    int ret = -1;
  23.168 +    
  23.169 +    p_blk = snap_get_block(p_id->block);
  23.170 +    if (p_blk == NULL) return(-1);
  23.171 +    
  23.172 +    if (c_id->block == p_id->block)
  23.173 +    {
  23.174 +        c_blk = p_blk;
  23.175 +    } else {
  23.176 +         c_blk = snap_get_block(c_id->block);
  23.177 +    }
  23.178 +    
  23.179 +    if (c_blk == NULL) { /* child block failed to load: release the parent */
  23.180 +        freeblock(p_blk);
  23.181 +        return(-1);
  23.182 +    }
  23.183 +     
  23.184 +    /* parent and child must not be deleted. */
  23.185 +    p_rec = &p_blk->snaps[p_id->index];
  23.186 +    c_rec = &c_blk->snaps[c_id->index];
  23.187 +    /*
  23.188 +    if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) {
  23.189 +        printf("One of those snaps is already deleted.\n");
  23.190 +        goto done;
  23.191 +    }
  23.192 +    */
  23.193 +    /* first non-deleted thing in the log before child must be parent. */
  23.194 +    
  23.195 +    /* XXX todo: test the range here for delete (and eventually fork) bits) */
  23.196 +    /* for now, snaps must be consecutive, on the same log page: */
  23.197 +    
  23.198 +    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
  23.199 +    {
  23.200 +        printf("Deleting non-consecutive snaps is not done yet.\n");
  23.201 +        goto done;
  23.202 +    }
  23.203 +    
  23.204 +    /* mark parent as deleted XXX: may need to lock parent block here.*/
  23.205 +    p_rec->deleted = 1;
  23.206 +    writeblock(p_id->block, p_blk);
  23.207 +    
  23.208 +    /* delete the parent */
  23.209 +    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
  23.210 +    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
  23.211 +    
  23.212 +done:
  23.213 +    if (c_blk != p_blk) freeblock(c_blk);
  23.214 +    freeblock(p_blk);
  23.215 +    
  23.216 +    return(ret);
  23.217 +}
  23.218 +/* Print every record reachable from *snap_id, newest first, following the
  23.219 + * parent_block links from one log block to the next. */
  23.220 +void snap_print_history(snap_id_t *snap_id)
  23.221 +{
  23.222 +    snap_id_t id = *snap_id;
  23.223 +    unsigned int idx = id.index;
  23.224 +    snap_block_t *new_blk, *blk = snap_get_block(id.block);
  23.225 +    
  23.226 +    while ( blk ) {
  23.227 +        printf("[Snap block %Ld]:\n", id.block);
  23.228 +        do {
  23.229 +            printf("   %03u: root: %Ld ts: %ld.%ld\n", idx, 
  23.230 +                    blk->snaps[idx].radix_root,
  23.231 +                    blk->snaps[idx].timestamp.tv_sec,
  23.232 +                    blk->snaps[idx].timestamp.tv_usec);
  23.233 +        } while (idx-- != 0);
  23.234 +        /* continue with the parent block, if any, starting at the parent's index */
  23.235 +        id  = blk->hdr.parent_block;
  23.236 +        idx = id.index;   /* idx wrapped past 0 above and must be reset */
  23.237 +        new_blk = (id.block != 0) ? snap_get_block(id.block) : NULL;
  23.238 +        freeblock(blk);
  23.239 +        blk = new_blk;    /* NULL ends the walk when there is no parent */
  23.240 +    }
  23.241 +}
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/tools/blktap/parallax/snaplog.h	Sun Jul 03 22:36:48 2005 +0000
    24.3 @@ -0,0 +1,61 @@
    24.4 +/**************************************************************************
    24.5 + * 
    24.6 + * snaplog.h
    24.7 + *
    24.8 + * Snapshot log on-disk data structure.
    24.9 + *
   24.10 + */
   24.11 + 
   24.12 +#include "radix.h"
   24.13 +#include "blockstore.h"    /* for BLOCK_SIZE */
   24.14 + 
   24.15 +#ifndef __SNAPLOG_H__
   24.16 +#define __SNAPLOG_H__
   24.17 +/* Identifies one record slot: the block holding the log page and the index into its snaps[] array. */
   24.18 +typedef struct snap_id {
   24.19 +    u64            block;
   24.20 +    unsigned int   index;
   24.21 +} snap_id_t;
   24.22 +/* One snapshot record: the radix root it captured and the time it was taken. */
   24.23 +typedef struct snap_rec {
   24.24 +    u64            radix_root;
   24.25 +    struct timeval timestamp;
   24.26 +    /* flags: */
   24.27 +    unsigned       deleted:1;
   24.28 +} snap_rec_t;
   24.29 +
   24.30 +
   24.31 +int  snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
   24.32 +int  snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
   24.33 +int  snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
   24.34 +void snap_print_history(snap_id_t *snap_id);
   24.35 +int  snap_get_id(snap_id_t *id, snap_rec_t *target);
   24.36 +
   24.37 +
   24.38 +/* exported for vdi debugging */
   24.39 +#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
   24.40 +
   24.41 +static const snap_id_t null_snap_id = { 0, 0 }; 
   24.42 +/* Header at the start of every snapshot-log block; snap_get_block() rejects blocks whose magic is not SNAP_MAGIC. */
   24.43 +typedef struct snap_block_hdr {
   24.44 +    u64            magic;
   24.45 +    snap_id_t      parent_block; /* parent block within this chain */
   24.46 +    snap_id_t      fork_block;   /* where this log was forked */
   24.47 +    unsigned       log_entries;  /* total entries since forking */
   24.48 +    unsigned short nr_entries;   /* entries in snaps[] */
   24.49 +    unsigned short immutable;    /* has this snap page become immutable? */
   24.50 +} snap_block_hdr_t;
   24.51 +
   24.52 +/* Number of snap_rec_t records that fit in one block after the header. */
   24.53 +#define SNAPS_PER_BLOCK \
   24.54 +    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
   24.55 +/* One log page: the header followed by as many records as the block can hold. */
   24.56 +typedef struct snap_block {
   24.57 +    snap_block_hdr_t hdr;
   24.58 +    snap_rec_t       snaps[SNAPS_PER_BLOCK];
   24.59 +} snap_block_t;
   24.60 +    
   24.61 +
   24.62 +snap_block_t *snap_get_block(u64 block);
   24.63 +
   24.64 +#endif /* __SNAPLOG_H__ */
    25.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.2 +++ b/tools/blktap/parallax/vdi.c	Sun Jul 03 22:36:48 2005 +0000
    25.3 @@ -0,0 +1,367 @@
    25.4 +/**************************************************************************
    25.5 + * 
    25.6 + * vdi.c
    25.7 + *
    25.8 + * Virtual Disk Image (VDI) Interfaces
    25.9 + *
   25.10 + */
   25.11 + 
   25.12 +#include <stdio.h>
   25.13 +#include <stdlib.h>
   25.14 +#include <fcntl.h>
   25.15 +#include <string.h>
   25.16 +#include <sys/time.h>
   25.17 +#include <pthread.h>
   25.18 +#include "blockstore.h"
   25.19 +#include "block-async.h"
   25.20 +#include "requests-async.h"
   25.21 +#include "radix.h"
   25.22 +#include "vdi.h"
   25.23 +                    
   25.24 +#define VDI_REG_BLOCK   2LL
   25.25 +#define VDI_RADIX_ROOT  writable(3)
   25.26 +                                                            
   25.27 +#if 0
   25.28 +#define DPRINTF(_f, _a...) printf ( _f , ## _a )
   25.29 +#else
   25.30 +#define DPRINTF(_f, _a...) ((void)0)
   25.31 +#endif
   25.32 +
   25.33 +/* I haven't decided about this registry stuff, so this is just a really
   25.34 + * quick lash-up so that there is some way to track VDIs.
   25.35 + *
   25.36 + * (Most vdi access should be with a direct handle to the block, so this
   25.37 + *  registry is just for start-of-day lookup and other control operations.)
   25.38 + */
   25.39 +/* Build a fresh VDI registry: write a zeroed block for the registry radix root, then an empty registry header to VDI_REG_BLOCK. Returns the in-memory registry block, or NULL if newblock() fails. */
   25.40 +vdi_registry_t *create_vdi_registry(void)
   25.41 +{
   25.42 +    vdi_registry_t *reg = (vdi_registry_t *)newblock();
   25.43 +    
   25.44 +    if (reg == NULL)
   25.45 +        return NULL;
   25.46 +    
   25.47 +    /* zero-fill the vdi radix root while we have an empty block. */
   25.48 +    writeblock(VDI_RADIX_ROOT, (void *)reg);
   25.49 +    /* NOTE(review): VDI_RADIX_ROOT is writable(3), which looks like a tagged radix entry rather than a plain block id -- confirm what writeblock() expects here. */
   25.50 +    
   25.51 +    DPRINTF("[vdi.c] Creating VDI registry!\n");
   25.52 +    reg->magic      = VDI_REG_MAGIC;
   25.53 +    reg->nr_vdis    = 0;
   25.54 +    
   25.55 +    writeblock(VDI_REG_BLOCK, (void *)reg);
   25.56 +    /* The caller owns reg; callers in this file release it with freeblock(). */
   25.57 +    return reg;
   25.58 +}
   25.59 +/* Load the VDI registry, creating it if the block cannot be read.  Returns NULL on failure or bad magic; the caller frees the result with freeblock(). */
   25.60 +vdi_registry_t *get_vdi_registry(void)
   25.61 +{
   25.62 +    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
   25.63 +    
   25.64 +    if ( vdi_reg == NULL )
   25.65 +        vdi_reg = create_vdi_registry();
   25.66 +    if ( vdi_reg == NULL ) return NULL;   /* creation failed as well */
   25.67 +    
   25.68 +    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
   25.69 +        freeblock(vdi_reg);
   25.70 +        return NULL;
   25.71 +    }
   25.72 +    return vdi_reg;
   25.73 +}
   25.74 +
   25.75 +/* Create a new VDI called name, forked from *parent_snap when that names a
   25.76 + * readable snapshot record and otherwise starting from an empty radix root.
   25.77 + * Registers it and returns the in-memory vdi (release with vdi_put()), or NULL. */
   25.78 +vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
   25.79 +{
   25.80 +    int ret;
   25.81 +    vdi_t *vdi;
   25.82 +    vdi_registry_t *vdi_reg;
   25.83 +    snap_rec_t snap_rec;
   25.84 +    
   25.85 +    /* create a vdi struct */
   25.86 +    vdi = newblock();
   25.87 +    if (vdi == NULL) 
   25.88 +        return NULL;
   25.89 +    
   25.90 +    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
   25.91 +        vdi->radix_root = snapshot(snap_rec.radix_root);
   25.92 +    } else {
   25.93 +        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
   25.94 +        vdi->radix_root = writable(vdi->radix_root); /* grr. */
   25.95 +    }
   25.96 +    
   25.97 +    /* create a snapshot log, and add it to the vdi struct */
   25.98 +    ret = snap_block_create(parent_snap, &vdi->snap);
   25.99 +    if ( ret != 0 ) {
  25.100 +        DPRINTF("Error getting snap block in vdi_create.\n");
  25.101 +        freeblock(vdi);
  25.102 +        return NULL;
  25.103 +    }
  25.104 +    /* append the vdi to the registry, fill block and id.             */
  25.105 +    /* implicit allocation means we have to write the vdi twice here. */
  25.106 +    vdi_reg    = get_vdi_registry();
  25.107 +    if ( vdi_reg == NULL ) {
  25.108 +        freeblock(vdi);
  25.109 +        return NULL;
  25.110 +    }
  25.111 +    
  25.112 +    vdi->block = allocblock((void *)vdi);
  25.113 +    vdi->id    = vdi_reg->nr_vdis++;
  25.114 +    strncpy(vdi->name, name, VDI_NAME_SZ - 1);
  25.115 +    vdi->name[VDI_NAME_SZ - 1] = '\0'; /* name[] has VDI_NAME_SZ bytes; the last one holds the terminator */
  25.116 +    vdi->radix_lock = NULL; /* for tidiness */
  25.117 +    writeblock(vdi->block, (void *)vdi);
  25.118 +    
  25.119 +    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
  25.120 +    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
  25.121 +    freeblock(vdi_reg);
  25.122 +    
  25.123 +    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
  25.124 +    if (vdi->radix_lock == NULL) 
  25.125 +    {
  25.126 +    	perror("couldn't malloc radix_lock for new vdi!");
  25.127 +    	freeblock(vdi);
  25.128 +    	return NULL;
  25.129 +    }
  25.130 +    radix_lock_init(vdi->radix_lock);
  25.131 +    
  25.132 +    return vdi;
  25.133 +}
  25.134 +
  25.135 +/* Load the VDI with the given id and attach a fresh in-memory radix lock.
  25.136 + * Returns NULL if the id is not registered or the block cannot be read.
  25.137 + * vdi_get and vdi_put act like alloc/free -- there is no refcounting. */
  25.138 +vdi_t *vdi_get(u64 vdi_id)
  25.139 +{
  25.140 +    u64 vdi_blk;
  25.141 +    vdi_t *vdi;
  25.142 +    
  25.143 +    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
  25.144 +    if ( vdi_blk == 0 )
  25.145 +        return NULL;
  25.146 +    
  25.147 +    vdi = (vdi_t *)readblock(vdi_blk);
  25.148 +    if ( vdi == NULL )   /* read failed: nothing to attach a lock to */
  25.149 +        return NULL;
  25.150 +    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
  25.151 +    if (vdi->radix_lock == NULL) 
  25.152 +    {
  25.153 +    	perror("couldn't malloc radix_lock for new vdi!");
  25.154 +    	freeblock(vdi);
  25.155 +    	return NULL;
  25.156 +    }
  25.157 +    radix_lock_init(vdi->radix_lock);
  25.158 +    
  25.159 +    return vdi;
  25.160 +}
  25.161 +/* Release a VDI obtained from vdi_create()/vdi_get(): frees its radix lock and then the block buffer. */
  25.162 +void vdi_put(vdi_t *vdi)
  25.163 +{
  25.164 +    free(vdi->radix_lock);
  25.165 +    freeblock(vdi);
  25.166 +}
  25.167 +/* Snapshot vdi: record its current radix root and the time in the snapshot log, replace vdi->radix_root with the result of snapshot(), and rewrite the VDI block. */
  25.168 +void vdi_snapshot(vdi_t *vdi)
  25.169 +{
  25.170 +    snap_rec_t rec;
  25.171 +    int ret;
  25.172 +    /* the record captures the root as it is before snapshot() replaces it */
  25.173 +    rec.radix_root = vdi->radix_root;
  25.174 +    gettimeofday(&rec.timestamp, NULL);
  25.175 +    rec.deleted = 0;
  25.176 +    /* vdi->snap is passed as both old and new id, so the log position advances in place */
  25.177 +    vdi->radix_root = snapshot(vdi->radix_root);
  25.178 +    ret = snap_append(&vdi->snap, &rec, &vdi->snap);
  25.179 +    if ( ret != 0 ) {
  25.180 +        printf("snap_append returned failure\n");
  25.181 +        return;
  25.182 +    }
  25.183 +    writeblock(vdi->block, vdi);
  25.184 +}
  25.185 +/* Initialise the VDI layer: calls __rcache_init() and makes sure a registry can be loaded or created. Returns 0 on success, -1 otherwise. */
  25.186 +int __init_vdi()
  25.187 +{
  25.188 +    /* sneak this in here for the moment. */
  25.189 +    __rcache_init();
  25.190 +    
  25.191 +    /* force the registry to be created if it doesn't exist. */
  25.192 +    vdi_registry_t *vdi_reg = get_vdi_registry();
  25.193 +    if (vdi_reg == NULL) {
  25.194 +        printf("[vdi.c] Couldn't get/create a VDI registry!\n");
  25.195 +        return -1;
  25.196 +    }
  25.197 +    freeblock(vdi_reg);
  25.198 +    /* only the registry's existence matters here, so the buffer is released right away */
  25.199 +    
  25.200 +    return 0;
  25.201 +}
  25.202 +    
  25.203 +#ifdef VDI_STANDALONE
  25.204 +
  25.205 +#define TEST_VDIS      50
  25.206 +#define NR_ITERS    50000
  25.207 +#define FORK_POINTS   200
  25.208 +#define INIT_VDIS       3
  25.209 +#define INIT_SNAPS     40
  25.210 +
  25.211 +/* These must be of decreasing size: */
  25.212 +#define NEW_FORK       (RAND_MAX-(RAND_MAX/1000))
  25.213 +#define NEW_ROOT_VDI   (RAND_MAX-((RAND_MAX/1000)*2))
  25.214 +#define NEW_FORK_VDI   (RAND_MAX-((RAND_MAX/1000)*3))
  25.215 +
  25.216 +#define GRAPH_DOT_FILE "vdi.dot"
  25.217 +#define GRAPH_PS_FILE  "vdi.ps"
  25.218 +
  25.219 +
  25.220 +typedef struct sh_st {
  25.221 +    snap_id_t     id;
  25.222 +    struct sh_st *next;
  25.223 +} sh_t;
  25.224 +
  25.225 +#define SNAP_HASHSZ 1024
  25.226 +sh_t *node_hash[SNAP_HASHSZ];
  25.227 +#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
  25.228 +
  25.229 +#define SNAPID_EQUAL(_a,_b) \
  25.230 +    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
  25.231 +int sh_check_and_add(snap_id_t *id)
  25.232 +{
  25.233 +    /* Returns 1 if *id is already in node_hash, 0 after adding it to its bucket. */
  25.234 +    sh_t **s = &node_hash[SNAP_HASH(id)];
  25.235 +    
  25.236 +    while (*s != NULL) {
  25.237 +        if (SNAPID_EQUAL(&((*s)->id), id))
  25.238 +            return 1;
  25.239 +        s = &(*s)->next;   /* advance the cursor; never overwrite the chain links */
  25.240 +    }
  25.241 +    
  25.242 +    /* s now addresses the NULL link at the end of the chain: append there */
  25.243 +    *s = (sh_t *)malloc(sizeof(sh_t));
  25.244 +    (*s)->id = *id;
  25.245 +    (*s)->next = NULL;
  25.246 +    return 0; }
  25.247 +/* Stand-alone stress test (VDI_STANDALONE only): creates VDIs, snapshots and forks at random, then dumps the snapshot graph to a dot file and renders it with dot. */
  25.248 +int main(int argc, char *argv[])
  25.249 +{
  25.250 +    vdi_t *vdi_list[TEST_VDIS];
  25.251 +    snap_id_t id, fork_points[FORK_POINTS];
  25.252 +    int nr_vdis = 0, nr_forks = 0;
  25.253 +    int i, j, r;
  25.254 +    FILE *f;
  25.255 +    char name[VDI_NAME_SZ];
  25.256 +    
  25.257 +    __init_blockstore();
  25.258 +    __init_vdi();
  25.259 +    
  25.260 +    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
  25.261 +    
  25.262 +    for (i=0; i<INIT_VDIS; i++) {
  25.263 +        r=rand();
  25.264 +        
  25.265 +        sprintf(name, "VDI Number %d", nr_vdis);
  25.266 +        vdi_list[i] = vdi_create(NULL, name);
  25.267 +        for (j=0; j<(r%INIT_SNAPS); j++)
  25.268 +            vdi_snapshot(vdi_list[i]);
  25.269 +        fork_points[i] = vdi_list[i]->snap;
  25.270 +        nr_vdis++;
  25.271 +        nr_forks++;
  25.272 +    }
  25.273 +    
  25.274 +    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
  25.275 +            
  25.276 +    for (i=0; i<NR_ITERS; i++) {
  25.277 +        r = rand();
  25.278 +        
  25.279 +        if ( r > NEW_FORK ) {
  25.280 +            if ( nr_forks >= FORK_POINTS ) /* fork_points[] is full */
  25.281 +                continue;
  25.282 +            id = vdi_list[r%nr_vdis]->snap;
  25.283 +            if ( ( id.block == 0 ) || ( id.index == 0 ) )
  25.284 +                continue;
  25.285 +            id.index--;
  25.286 +            fork_points[nr_forks++] = id;
  25.287 +            
  25.288 +        } else if ( r > NEW_ROOT_VDI ) {
  25.289 +            
  25.290 +            if ( nr_vdis == TEST_VDIS )
  25.291 +                continue;
  25.292 +            
  25.293 +            sprintf(name, "VDI Number %d.", nr_vdis);
  25.294 +            vdi_list[nr_vdis++] = vdi_create(NULL, name);
  25.295 +            
  25.296 +        } else if ( r > NEW_FORK_VDI ) {
  25.297 +            
  25.298 +            if ( nr_vdis == TEST_VDIS )
  25.299 +                continue;
  25.300 +            
  25.301 +            sprintf(name, "VDI Number %d.", nr_vdis);
  25.302 +            vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name);
  25.303 +            
  25.304 +        } else /* SNAPSHOT */ {
  25.305 +            
  25.306 +            vdi_snapshot(vdi_list[r%nr_vdis]);
  25.307 +            
  25.308 +        }
  25.309 +    }
  25.310 +    
  25.311 +    /* now dump it out to a dot file. */
  25.312 +    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
  25.313 +    
  25.314 +    f = fopen(GRAPH_DOT_FILE, "w");
  25.315 +    if (f == NULL) { perror(GRAPH_DOT_FILE); return 1; }
  25.316 +    /* write graph preamble */
  25.317 +    fprintf(f, "digraph G {\n");
  25.318 +    fprintf(f, "   rankdir=LR\n");
  25.319 +    
  25.320 +    for (i=0; i<nr_vdis; i++) {
  25.321 +        char oldnode[255];
  25.322 +        snap_block_t *blk;
  25.323 +        snap_id_t id = vdi_list[i]->snap;
  25.324 +        int nr_snaps, done=0;
  25.325 +        
  25.326 +        /* add a node for the id */
  25.327 +printf("vdi: %d\n", i);
  25.328 +        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
  25.329 +                id.block, id.index, vdi_list[i]->name,
  25.330 +                id.block, id.index);
  25.331 +        sprintf(oldnode, "n%Ld%d", id.block, id.index);
  25.332 +        
  25.333 +        while (id.block != 0) {
  25.334 +            blk = snap_get_block(id.block);
  25.335 +            if (blk == NULL) break; /* unreadable log block: stop walking this chain */
  25.336 +            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
  25.337 +            id = blk->hdr.fork_block;
  25.338 +            done = sh_check_and_add(&id);
  25.339 +            
  25.340 +            /* add a node for the fork_id */
  25.341 +            if (!done) {
  25.342 +                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
  25.343 +                    id.block, id.index,
  25.344 +                    id.block, id.index);
  25.345 +            }
  25.346 +            
  25.347 +            /* add an edge between them */
  25.348 +            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
  25.349 +                    id.block, id.index, oldnode, nr_snaps);
  25.350 +            sprintf(oldnode, "n%Ld%d", id.block, id.index);
  25.351 +            freeblock(blk);
  25.352 +            
  25.353 +            if (done) break;
  25.354 +        }
  25.355 +    }
  25.356 +    
  25.357 +    /* write graph postamble */
  25.358 +    fprintf(f, "}\n");
  25.359 +    fclose(f);
  25.360 +    
  25.361 +    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
  25.362 +    {
  25.363 +        char cmd[255];
  25.364 +        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
  25.365 +        system(cmd);
  25.366 +    }
  25.367 +    return 0;
  25.368 +}
  25.369 +
  25.370 +#endif
    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/tools/blktap/parallax/vdi.h	Sun Jul 03 22:36:48 2005 +0000
    26.3 @@ -0,0 +1,55 @@
    26.4 +#ifndef _VDI_H_
    26.5 +#define _VDI_H_
    26.6 +/**************************************************************************
    26.7 + * 
    26.8 + * vdi.h
    26.9 + *
   26.10 + * Virtual Disk Image (VDI) Interfaces
   26.11 + *
   26.12 + */
   26.13 +
   26.14 +#ifndef __VDI_H__
   26.15 +#define __VDI_H__
   26.16 +
   26.17 +#include "blktaplib.h"
   26.18 +#include "snaplog.h"
   26.19 +
   26.20 +#define VDI_HEIGHT     27 /* Note that these are now hard-coded */
   26.21 +#define VDI_REG_HEIGHT 27 /* in the async lookup code           */
   26.22 +
   26.23 +#define VDI_NAME_SZ 256
   26.24 +
   26.25 +
   26.26 +typedef struct vdi {              /* descriptor for one Virtual Disk Image (VDI) */
   26.27 +    u64         id;               /* unique vdi id -- used by the registry   */
   26.28 +    u64         block;            /* block where this vdi lives (also unique)*/
   26.29 +    u64         radix_root;       /* radix root node for block mappings      */
   26.30 +    snap_id_t   snap;             /* next snapshot slot for this VDI         */
   26.31 +    struct vdi *next;             /* used to hash-chain in blkif.            */
   26.32 +    blkif_vdev_t vdevice;         /* currently mounted as...                 */
   26.33 +    struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs   */
   26.34 +    char        name[VDI_NAME_SZ];/* human readable vdi name                 */
   26.35 +} vdi_t;
   26.36 +
   26.37 +#define VDI_REG_MAGIC   0xff00ff0bb0ff00ffLL
   26.38 +
   26.39 +typedef struct vdi_registry {     /* registry record, fetched with get_vdi_registry() */
   26.40 +    u64     magic;                /* presumably VDI_REG_MAGIC when valid -- TODO confirm */
   26.41 +    u64     nr_vdis;              /* VDI count; the tools probe ids 0..nr_vdis-1 */
   26.42 +} vdi_registry_t;
   26.43 +
   26.44 +
   26.45 +int __init_vdi(void);
   26.46 +
   26.47 +vdi_t *vdi_get(u64 vdi_id);
   26.48 +void vdi_put(vdi_t *vdi);
   26.49 +vdi_registry_t *get_vdi_registry(void);
   26.50 +vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
   26.51 +u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
   26.52 +void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
   26.53 +void vdi_snapshot(vdi_t *vdi);
   26.54 +
   26.55 +
   26.56 +#endif /* __VDI_H__ */
   26.57 +
   26.58 +#endif //_VDI_H_
    27.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.2 +++ b/tools/blktap/parallax/vdi_create.c	Sun Jul 03 22:36:48 2005 +0000
    27.3 @@ -0,0 +1,52 @@
    27.4 +/**************************************************************************
    27.5 + * 
    27.6 + * vdi_create.c
    27.7 + *
    27.8 + * Create a new vdi.
    27.9 + *
   27.10 + */
   27.11 + 
   27.12 +#include <stdio.h>
   27.13 +#include <stdlib.h>
   27.14 +#include <string.h>
   27.15 +#include <sys/time.h>
   27.16 +#include "blockstore.h"
   27.17 +#include "radix.h"
   27.18 +#include "vdi.h"
   27.19 +/* Create a VDI named argv[1]; fork it from snapshot (argv[2], argv[3]) if given. */
   27.20 +int main(int argc, char *argv[])
   27.21 +{
   27.22 +    vdi_t       *vdi;
   27.23 +    char         name[VDI_NAME_SZ] = "";
   27.24 +    snap_id_t    id;
   27.25 +    int          from_snap = 0;
   27.26 +    
   27.27 +    __init_blockstore();
   27.28 +    __init_vdi();
   27.29 +    
   27.30 +    if ( argc == 1 ) {
   27.31 +        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
   27.32 +        exit(-1);
   27.33 +    }
   27.34 +    
   27.35 +    strncpy( name, argv[1], VDI_NAME_SZ - 1);
   27.36 +    name[VDI_NAME_SZ - 1] = '\0';   /* last valid slot; longer names are truncated */
   27.37 +    
   27.38 +    if ( argc > 3 ) {
   27.39 +        id.block   = (u64)          atoll(argv[2]);
   27.40 +        id.index   = (unsigned int) atol (argv[3]);
   27.41 +        from_snap  = 1;
   27.42 +    }
   27.43 +    
   27.44 +    vdi = vdi_create( from_snap ? &id : NULL, name);
   27.45 +    
   27.46 +    if ( vdi == NULL ) {
   27.47 +        printf("Failed to create VDI!\n");
   27.48 +        /* vdi is NULL on this path, so there is nothing to release */
   27.49 +        exit(-1);
   27.50 +    }
   27.51 +    
   27.52 +    freeblock(vdi);
   27.53 +    
   27.54 +    return (0);
   27.55 +}
    28.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.2 +++ b/tools/blktap/parallax/vdi_fill.c	Sun Jul 03 22:36:48 2005 +0000
    28.3 @@ -0,0 +1,81 @@
    28.4 +/**************************************************************************
    28.5 + * 
    28.6 + * vdi_fill.c
    28.7 + *
    28.8 + * Hoover a file or device into a vdi.
    28.9 + * You must first create the vdi with vdi_create.
   28.10 + *
   28.11 + */
   28.12 + 
   28.13 +#include <stdio.h>
   28.14 +#include <stdlib.h>
   28.15 +#include <string.h>
   28.16 +#include <sys/types.h>
   28.17 +#include <sys/stat.h>
   28.18 +#include <fcntl.h>
   28.19 +#include <unistd.h>
   28.20 +#include "blockstore.h"
   28.21 +#include "radix.h"
   28.22 +#include "requests-async.h"
   28.23 +#include "vdi.h"
   28.24 +/* Copy the file argv[2], one block at a time, into the existing VDI with id argv[1]. */
   28.25 +int main(int argc, char *argv[])
   28.26 +{
   28.27 +    vdi_t       *vdi;
   28.28 +    u64          id;
   28.29 +    int          fd;
   28.30 +    struct stat  st;
   28.31 +    u64          tot_size;
   28.32 +    char         spage[BLOCK_SIZE];
   28.33 +    ssize_t      count;      /* signed: read() reports an error as -1 */
   28.34 +    u64          vblock = 0; /* index of the next VDI block to write  */
   28.35 +    
   28.36 +    __init_blockstore();
   28.37 +    init_block_async();
   28.38 +    __init_vdi();
   28.39 +    
   28.40 +    if ( argc < 3 ) {
   28.41 +        printf("usage: %s <VDI id> <filename>\n", argv[0]);
   28.42 +        exit(-1);
   28.43 +    }
   28.44 +        
   28.45 +    id = (u64) atoll(argv[1]);
   28.46 +    
   28.47 +    vdi = vdi_get( id );
   28.48 +    
   28.49 +    if ( vdi == NULL ) {
   28.50 +        printf("Failed to retreive VDI %Ld!\n", id);
   28.51 +        exit(-1);
   28.52 +    }
   28.53 +    
   28.54 +    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
   28.55 +    
   28.56 +    if (fd < 0) {
   28.57 +        printf("Couldn't open %s!\n", argv[2]);
   28.58 +        exit(-1);
   28.59 +    }
   28.60 +    
   28.61 +    if ( fstat(fd, &st) != 0 ) {
   28.62 +        printf("Couldn't stat %s!\n", argv[2]);
   28.63 +        exit(-1);
   28.64 +    }
   28.65 +    
   28.66 +    tot_size = (u64) st.st_size;
   28.67 +    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
   28.68 +    
   28.69 +    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);    
   28.70 +    printf("           ");
   28.71 +    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
   28.72 +        vdi_write_s(vdi, vblock, spage); /* NOTE(review): a short final read leaves stale bytes in spage's tail */
   28.73 +        vblock++;
   28.74 +        if ((vblock % 512) == 0) {
   28.75 +            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
   28.76 +            fflush(stdout);
   28.77 +        }
   28.78 +    }
   28.79 +    printf("\n");
   28.80 +    if ( count < 0 ) printf("Couldn't read %s!\n", argv[2]);
   28.81 +    freeblock(vdi);
   28.82 +    
   28.83 +    return (0);
   28.84 +}
    29.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.2 +++ b/tools/blktap/parallax/vdi_list.c	Sun Jul 03 22:36:48 2005 +0000
    29.3 @@ -0,0 +1,47 @@
    29.4 +/**************************************************************************
    29.5 + * 
    29.6 + * vdi_list.c
    29.7 + *
    29.8 + * Print a list of VDIs on the block store.
    29.9 + *
   29.10 + */
   29.11 + 
   29.12 +#include <stdio.h>
   29.13 +#include <stdlib.h>
   29.14 +#include <string.h>
   29.15 +#include <sys/time.h>
   29.16 +#include "blockstore.h"
   29.17 +#include "radix.h"
   29.18 +#include "vdi.h"
   29.19 +/* Print the id and name of each VDI found among the ids the registry counts. */
   29.20 +int main(int argc, char *argv[])
   29.21 +{
   29.22 +    vdi_registry_t *reg;
   29.23 +    vdi_t *vdi;
   29.24 +    int i;
   29.25 +    
   29.26 +    __init_blockstore();
   29.27 +    __init_vdi();
   29.28 +    
   29.29 +    reg = get_vdi_registry();
   29.30 +    
   29.31 +    if ( reg == NULL ) {
   29.32 +        printf("couldn't get VDI registry.\n");
   29.33 +        exit(-1);
   29.34 +    }
   29.35 +    
   29.36 +    for (i=0; i < reg->nr_vdis; i++) {   /* ids are probed as 0..nr_vdis-1 */
   29.37 +        vdi = vdi_get(i);
   29.38 +        
   29.39 +        if ( vdi != NULL ) {             /* ids that yield no VDI are skipped */
   29.40 +            
   29.41 +            printf("%10Ld %60s\n", vdi->id, vdi->name);
   29.42 +            freeblock(vdi);
   29.43 +            
   29.44 +        }
   29.45 +    }
   29.46 +    
   29.47 +    freeblock(reg);
   29.48 +    
   29.49 +    return 0;
   29.50 +}
    30.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.2 +++ b/tools/blktap/parallax/vdi_snap.c	Sun Jul 03 22:36:48 2005 +0000
    30.3 @@ -0,0 +1,43 @@
    30.4 +/**************************************************************************
    30.5 + * 
    30.6 + * vdi_snap.c
    30.7 + *
    30.8 + * Snapshot a vdi.
    30.9 + *
   30.10 + */
   30.11 + 
   30.12 +#include <stdio.h>
   30.13 +#include <stdlib.h>
   30.14 +#include <string.h>
   30.15 +#include <sys/time.h>
   30.16 +#include "blockstore.h"
   30.17 +#include "radix.h"
   30.18 +#include "vdi.h"
   30.19 +/* Look up the VDI whose id is argv[1] and call vdi_snapshot() on it. */
   30.20 +int main(int argc, char *argv[])
   30.21 +{
   30.22 +    vdi_t  *vdi;
   30.23 +    u64     id;
   30.24 +    
   30.25 +    __init_blockstore();
   30.26 +    __init_vdi();
   30.27 +    
   30.28 +    if ( argc == 1 ) {
   30.29 +        printf("usage: %s <VDI id>\n", argv[0]);
   30.30 +        exit(-1);
   30.31 +    }
   30.32 +    
   30.33 +    id = (u64) atoll(argv[1]);
   30.34 +    
   30.35 +    vdi = vdi_get(id);
   30.36 +    
   30.37 +    if ( vdi == NULL ) {
   30.38 +        printf("couldn't find the requested VDI.\n");
   30.39 +        freeblock(vdi);   /* NOTE(review): vdi is NULL on this path */
   30.40 +        exit(-1);
   30.41 +    }
   30.42 +    
   30.43 +    vdi_snapshot(vdi);    /* NOTE(review): vdi is not released afterwards; the process exits */
   30.44 +    
   30.45 +    return 0;
   30.46 +}
    31.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.2 +++ b/tools/blktap/parallax/vdi_snap_delete.c	Sun Jul 03 22:36:48 2005 +0000
    31.3 @@ -0,0 +1,48 @@
    31.4 +/**************************************************************************
    31.5 + * 
    31.6 + * vdi_snap_delete.c
    31.7 + *
    31.8 + * Delete a snapshot.
    31.9 + *
   31.10 + * This is not finished:  right now it takes a snap n and calls 
   31.11 + * snap_collapse(n,n+1).
   31.12 + *
   31.13 + * TODO: support for non-consecutive, non-same-block snaps
   31.14 + *       Avoid forking probs.
   31.15 + *
   31.16 + */
   31.17 + 
   31.18 +#include <stdio.h>
   31.19 +#include <stdlib.h>
   31.20 +#include <string.h>
   31.21 +#include <sys/time.h>
   31.22 +#include "blockstore.h"
   31.23 +#include "snaplog.h"
   31.24 +#include "radix.h"
   31.25 +#include "vdi.h"
   31.26 +/* Call snap_collapse() on snapshot (block argv[1], index argv[2]) and the next index. */
   31.27 +int main(int argc, char *argv[])
   31.28 +{
   31.29 +    snap_id_t    id, c_id;
   31.30 +    int ret;
   31.31 +    
   31.32 +    __init_blockstore();
   31.33 +    __init_vdi();
   31.34 +    
   31.35 +    if ( argc != 3 ) {
   31.36 +        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
   31.37 +        exit(-1);
   31.38 +    }
   31.39 +    
   31.40 +    id.block   = (u64)          atoll(argv[1]);
   31.41 +    id.index   = (unsigned int) atol (argv[2]);
   31.42 +    
   31.43 +    c_id = id;            /* second snapshot id: same block, index + 1 */
   31.44 +    c_id.index++;
   31.45 +    
   31.46 +    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
   31.47 +    
   31.48 +    printf("Freed %d blocks.\n", ret);   /* the return value is reported as a freed-block count */
   31.49 +    
   31.50 +    return 0;
   31.51 +}
    32.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.2 +++ b/tools/blktap/parallax/vdi_snap_list.c	Sun Jul 03 22:36:48 2005 +0000
    32.3 @@ -0,0 +1,82 @@
    32.4 +/**************************************************************************
    32.5 + * 
    32.6 + * vdi_snap_list.c
    32.7 + *
    32.8 + * Print a list of snapshots for the specified vdi.
    32.9 + *
   32.10 + */
   32.11 + 
   32.12 +#include <stdio.h>
   32.13 +#include <stdlib.h>
   32.14 +#include <string.h>
   32.15 +#include <time.h>
   32.16 +#include <sys/time.h>
   32.17 +#include "blockstore.h"
   32.18 +#include "radix.h"
   32.19 +#include "vdi.h"
   32.20 +/* List snapshots of VDI argv[1], walking back from its current slot; argv[2] caps the count. */
   32.21 +int main(int argc, char *argv[])
   32.22 +{
   32.23 +    vdi_t        *vdi;
   32.24 +    u64           id;
   32.25 +    int           i, max_snaps = -1;   /* -1 means no limit on printed rows */
   32.26 +    snap_block_t *blk;
   32.27 +    snap_id_t     sid;
   32.28 +    char         *t;
   32.29 +    
   32.30 +    __init_blockstore();
   32.31 +    __init_vdi();
   32.32 +    
   32.33 +    if ( argc == 1 ) {
   32.34 +        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
   32.35 +        exit(-1);
   32.36 +    }
   32.37 +    
   32.38 +    id = (u64) atoll(argv[1]);
   32.39 +    
   32.40 +    if ( argc > 2 ) {
   32.41 +        max_snaps = atoi(argv[2]);
   32.42 +    }
   32.43 +    
   32.44 +    vdi = vdi_get(id);
   32.45 +    
   32.46 +    if ( vdi == NULL ) {
   32.47 +        printf("couldn't find the requested VDI.\n");
   32.48 +        freeblock(vdi);   /* NOTE(review): vdi is NULL on this path */
   32.49 +        exit(-1);
   32.50 +    }
   32.51 +    
   32.52 +    sid = vdi->snap;
   32.53 +    sid.index--;          /* step back from the VDI's snap slot to the entry before it */
   32.54 +    
   32.55 +    //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", 
   32.56 +    //    "radix root", "d");
   32.57 +    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", 
   32.58 +            "radix root", "d");
   32.59 +     
   32.60 +    while (sid.block != 0) {   /* a block id of 0 ends the walk */
   32.61 +        blk = snap_get_block(sid.block);
   32.62 +        for (i = sid.index; i >= 0; i--) {
   32.63 +            if ( max_snaps == 0  ) {
   32.64 +                freeblock(blk);
   32.65 +                goto done;
   32.66 +            }
   32.67 +            t = ctime(&blk->snaps[i].timestamp.tv_sec);
   32.68 +            t[strlen(t)-1] = '\0';   /* drop the trailing newline of ctime()'s string */
   32.69 +            //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n",
   32.70 +            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
   32.71 +                    sid.block, i, 
   32.72 +                    //blk->snaps[i].timestamp.tv_sec,
   32.73 +                    t,
   32.74 +                    blk->snaps[i].timestamp.tv_usec,
   32.75 +                    blk->snaps[i].radix_root,
   32.76 +                    blk->snaps[i].deleted ? "*" : " ");
   32.77 +            if ( max_snaps != -1 ) 
   32.78 +                max_snaps--;
   32.79 +        }
   32.80 +        sid = blk->hdr.parent_block;   /* continue in the block this one names as parent */
   32.81 +        freeblock(blk);
   32.82 +    }
   32.83 +done:            
   32.84 +    return 0;
   32.85 +}
    33.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.2 +++ b/tools/blktap/parallax/vdi_tree.c	Sun Jul 03 22:36:48 2005 +0000
    33.3 @@ -0,0 +1,132 @@
    33.4 +/**************************************************************************
    33.5 + * 
    33.6 + * vdi_tree.c
    33.7 + *
    33.8 + * Output current vdi tree to dot and postscript.
    33.9 + *
   33.10 + */
   33.11 + 
   33.12 +#include <stdio.h>
   33.13 +#include <stdlib.h>
   33.14 +#include <string.h>
   33.15 +#include <sys/time.h>
   33.16 +#include "blockstore.h"
   33.17 +#include "radix.h"
   33.18 +#include "vdi.h"
   33.19 +
   33.20 +#define GRAPH_DOT_FILE "vdi.dot"
   33.21 +#define GRAPH_PS_FILE  "vdi.ps"
   33.22 +
   33.23 +typedef struct sh_st {      /* one entry in a node_hash bucket chain */
   33.24 +    snap_id_t     id;       /* snapshot id recorded by sh_check_and_add() */
   33.25 +    struct sh_st *next;     /* next entry in the same bucket, or NULL */
   33.26 +} sh_t;
   33.27 +
   33.28 +#define SNAP_HASHSZ 1024
   33.29 +sh_t *node_hash[SNAP_HASHSZ];
   33.30 +#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
   33.31 +
   33.32 +#define SNAPID_EQUAL(_a,_b) \
   33.33 +    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
   33.34 +int sh_check_and_add(snap_id_t *id)
   33.35 +{
   33.36 +    sh_t **s = &node_hash[SNAP_HASH(id)];
   33.37 +    /* Returns 1 if id was already recorded; otherwise records it and returns 0. */
   33.38 +    while (*s != NULL) {
   33.39 +        if (SNAPID_EQUAL(&((*s)->id), id))
   33.40 +            return 1;
   33.41 +        s = &(*s)->next;   /* move the cursor only; the chain itself must stay intact */
   33.42 +    }
   33.43 +    /* s now addresses the NULL link that ends the chain: append the new entry there. */
   33.44 +    *s = (sh_t *)malloc(sizeof(sh_t));
   33.45 +    (*s)->id = *id;
   33.46 +    (*s)->next = NULL;
   33.47 +    
   33.48 +    return 0;
   33.49 +}
   33.50 +/* Emit the VDI/snapshot fork graph as a dot file and render it to postscript (argv[1]). */
   33.51 +int main(int argc, char *argv[])
   33.52 +{
   33.53 +    FILE *f;
   33.54 +    char dot_file[255] = GRAPH_DOT_FILE;
   33.55 +    char  ps_file[255] = GRAPH_PS_FILE;
   33.56 +    int nr_vdis;
   33.57 +    vdi_registry_t *reg;
   33.58 +    vdi_t *vdi;
   33.59 +    int i;
   33.60 +    
   33.61 +    __init_blockstore();
   33.62 +    __init_vdi();
   33.63 +    
   33.64 +    reg = get_vdi_registry();
   33.65 +    
   33.66 +    if ( reg == NULL ) {
   33.67 +        printf("couldn't get VDI registry.\n");
   33.68 +        exit(-1);
   33.69 +    }
   33.70 +    nr_vdis = (int)reg->nr_vdis;   /* so the status line reports the registry's count */
   33.71 +    if ( argc > 1 ) {
   33.72 +        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
   33.73 +        ps_file[sizeof(ps_file) - 1] = '\0';   /* terminate inside the array */
   33.74 +    }
   33.75 +    
   33.76 +    /* now dump it out to a dot file. */
   33.77 +    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
   33.78 +    f = fopen(dot_file, "w");
   33.79 +    if ( f == NULL ) {
   33.80 +        printf("couldn't open %s for writing.\n", dot_file);
   33.81 +        exit(-1);
   33.82 +    }
   33.83 +    fprintf(f, "digraph G {\n");   /* graph preamble */
   33.84 +    fprintf(f, "   rankdir=LR\n");
   33.85 +    for (i=0; i<reg->nr_vdis; i++) {
   33.86 +        char oldnode[255];
   33.87 +        snap_block_t *blk;
   33.88 +        snap_id_t id;
   33.89 +        int nr_snaps, done=0;
   33.90 +        vdi = vdi_get(i);
   33.91 +        if ( vdi == NULL ) continue;   /* no VDI under this id: skip it */
   33.92 +        id = vdi->snap;
   33.93 +        /* add a node for the id */
   33.94 +        printf("vdi: %d\n", i);
   33.95 +        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
   33.96 +                id.block, id.index, vdi->name,
   33.97 +                id.block, id.index);
   33.98 +        sprintf(oldnode, "n%Ld%d", id.block, id.index);
   33.99 +        freeblock(vdi);   /* name and snap id have been copied out already */
  33.100 +        while (id.block != 0) {
  33.101 +            blk = snap_get_block(id.block);
  33.102 +            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
  33.103 +            id = blk->hdr.fork_block;
  33.104 +            
  33.105 +            done = sh_check_and_add(&id);
  33.106 +            
  33.107 +            /* add a node for the fork_id */
  33.108 +            if (!done) {
  33.109 +                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
  33.110 +                    id.block, id.index,
  33.111 +                    id.block, id.index);
  33.112 +            }
  33.113 +            
  33.114 +            /* add an edge between them */
  33.115 +            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
  33.116 +                    id.block, id.index, oldnode, nr_snaps);
  33.117 +            sprintf(oldnode, "n%Ld%d", id.block, id.index);
  33.118 +            freeblock(blk);
  33.119 +            
  33.120 +            if (done) break;   /* fork point already drawn: the rest of the path exists */
  33.121 +        }
  33.122 +    }
  33.123 +    
  33.124 +    /* write graph postamble */
  33.125 +    fprintf(f, "}\n");
  33.126 +    fclose(f);
  33.127 +    
  33.128 +    printf("[o] Generating postscript graph. (%s)\n", ps_file);
  33.129 +    {
  33.130 +        char cmd[sizeof(dot_file) + sizeof(ps_file) + 32];   /* both names plus the fixed text */
  33.131 +        snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
  33.132 +        system(cmd);   /* NOTE(review): ps_file comes from argv and reaches the shell unquoted */
  33.133 +    }
  33.134 +    return 0;
  33.135 +}
    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/tools/blktap/parallax/vdi_unittest.c	Sun Jul 03 22:36:48 2005 +0000
    34.3 @@ -0,0 +1,184 @@
    34.4 +/**************************************************************************
    34.5 + * 
    34.6 + * vdi_unittest.c
    34.7 + *
    34.8 + * Run a small test workload to ensure that data access through a vdi
    34.9 + * is (at least superficially) correct.
   34.10 + *
   34.11 + */
   34.12 + 
   34.13 +#include <stdio.h>
   34.14 +#include <stdlib.h>
   34.15 +#include <string.h>
   34.16 +#include <sys/types.h>
   34.17 +#include <sys/stat.h>
   34.18 +#include <fcntl.h>
   34.19 +#include <unistd.h>
   34.20 +#include "requests-async.h"
   34.21 +#include "blockstore.h"
   34.22 +#include "radix.h"
   34.23 +#include "vdi.h"
   34.24 +
   34.25 +#define TEST_PAGES  32
   34.26 +static char *zero_page;
   34.27 +static char pages[TEST_PAGES][BLOCK_SIZE];
   34.28 +static int next_page = 0;
   34.29 +/* Fill every test page with random() words, then allocate the page used as zero reference. */
   34.30 +void fill_test_pages(void)
   34.31 +{
   34.32 +    int i, j;
   34.33 +    unsigned long *page;
   34.34 +
   34.35 +    for (i=0; i< TEST_PAGES; i++) {
   34.36 +        page = (unsigned long *)pages[i];
   34.37 +        for (j=0; j<(int)(BLOCK_SIZE/sizeof(*page)); j++) { /* word count follows sizeof(long), so writes stay in the page */
   34.38 +            page[j] = random();
   34.39 +        }
   34.40 +    }
   34.41 +
   34.42 +    zero_page = newblock();   /* assumes newblock() hands back zero-filled memory -- TODO confirm */
   34.43 +}
   34.44 +
   34.45 +inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
   34.46 +{
   34.47 +    /* Pack the three radix indices into a single address: L1 ends up
   34.48 +     * above L2, which ends up above L3, with a 9-bit shift between
   34.49 +     * adjacent levels.  The inputs are OR-ed in as given, unmasked.
   34.50 +     */
   34.51 +
   34.52 +    return (((L1 << 9) | L2) << 9) | L3;
   34.53 +}
   34.54 +/* Write the next unused test page at (L1, L2, L3), read it back and compare the two. */
   34.55 +void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
   34.56 +{
   34.57 +    u64 vaddr;
   34.58 +    char *page = pages[next_page++];   /* each call consumes one test page */
   34.59 +    char *rpage = NULL;
   34.60 +
   34.61 +    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
   34.62 +
   34.63 +    vaddr = make_vaddr(L1, L2, L3);
   34.64 +    vdi_write_s(vdi, vaddr, page);
   34.65 +    rpage = vdi_read_s(vdi, vaddr);
   34.66 +
   34.67 +    if (rpage == NULL) 
   34.68 +    {
   34.69 +        printf( "read %Lu returned NULL\n", vaddr); 
   34.70 +        return; 
   34.71 +    }
   34.72 +
   34.73 +    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
   34.74 +    {
   34.75 +        printf( "read %Lu returned a different page\n", vaddr);
   34.76 +        /* no early return: fall through so rpage is released on this path too */
   34.77 +    }
   34.78 +
   34.79 +    freeblock(rpage);
   34.80 +}
   34.81 +/* Read the block at (L1, L2, L3) and report if it differs from the expected page. */
   34.82 +void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
   34.83 +{
   34.84 +    u64 vaddr;
   34.85 +    char *rpage = NULL;
   34.86 +
   34.87 +    printf("TEST  (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
   34.88 +
   34.89 +    vaddr = make_vaddr(L1, L2, L3);
   34.90 +    rpage = vdi_read_s(vdi, vaddr);
   34.91 +
   34.92 +    if (rpage == NULL) 
   34.93 +    {
   34.94 +        printf( "read %Lu returned NULL\n", vaddr); 
   34.95 +        return; 
   34.96 +    }
   34.97 +
   34.98 +    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
   34.99 +    {
  34.100 +        printf( "read %Lu returned a different page\n", vaddr);
  34.101 +        /* no early return: fall through so rpage is released on this path too */
  34.102 +    }
  34.103 +
  34.104 +    freeblock(rpage);
  34.105 +}
  34.106 +/* Run a fixed sequence of touch_block()/test_block() calls around one vdi_snapshot(). */
  34.107 +void coverage_test(vdi_t *vdi)
  34.108 +{
  34.109 +    u64 vaddr;      /* not referenced in this function */
  34.110 +    int i, j, k;    /* page indices saved so the written data can be re-checked later */
  34.111 +
  34.112 +    /* Do a series of writes and reads to test all paths through the 
  34.113 +     * async radix code.  The radix request code will dump CRC warnings
  34.114 +     * if there are data problems here as well.
  34.115 +     */
  34.116 +
  34.117 +    /* L1 Zero */
  34.118 +    touch_block(vdi, 0, 0, 0);
  34.119 +
  34.120 +    /* L2 Zero */
  34.121 +    i = next_page;   /* index of the page the next touch_block() will write */
  34.122 +    touch_block(vdi, 0, 1, 0);
  34.123 +
  34.124 +    /* L3 Zero */
  34.125 +    j = next_page;
  34.126 +    touch_block(vdi, 0, 0, 1);
  34.127 +    k = next_page;
  34.128 +    touch_block(vdi, 0, 1, 1);
  34.129 +
  34.130 +    /* Direct write */
  34.131 +    touch_block(vdi, 0, 0, 0);
  34.132 +
  34.133 +    vdi_snapshot(vdi);
  34.134 +
  34.135 +    /* L1 fault */
  34.136 +    touch_block(vdi, 0, 0, 0);
  34.137 +    /* test the read-only branches that should have been copied over. */
  34.138 +    test_block(vdi, 0, 1, 0, pages[i]);
  34.139 +    test_block(vdi, 0, 0, 1, pages[j]);
  34.140 +
  34.141 +    /* L2 fault */
  34.142 +    touch_block(vdi, 0, 1, 0);
  34.143 +    test_block(vdi, 0, 1, 1, pages[k]);
  34.144 +
  34.145 +    /* L3 fault */
  34.146 +    touch_block(vdi, 0, 0, 1);
  34.147 +    
  34.148 +    /* read - L1 zero */
  34.149 +    test_block(vdi, 1, 0, 0, zero_page);
  34.150 +    
  34.151 +    /* read - L2 zero */
  34.152 +    test_block(vdi, 0, 2, 0, zero_page);
  34.153 +
  34.154 +    /* read - L3 zero */
  34.155 +    test_block(vdi, 0, 0, 2, zero_page);
  34.156 +}
  34.157 +/* Create a scratch VDI named "UNIT TEST VDI" and run coverage_test() against it. */
  34.158 +int main(int argc, char *argv[])
  34.159 +{
  34.160 +    vdi_t       *vdi;
  34.161 +    u64          id;                 /* not referenced below */
  34.162 +    int          fd;                 /* not referenced below */
  34.163 +    struct stat  st;                 /* not referenced below */
  34.164 +    u64          tot_size;           /* not referenced below */
  34.165 +    char         spage[BLOCK_SIZE];  /* not referenced below */
  34.166 +    char        *dpage;              /* not referenced below */
  34.167 +    u64          vblock = 0, count=0;/* not referenced below */
  34.168 +    
  34.169 +    __init_blockstore();
  34.170 +    init_block_async();
  34.171 +    __init_vdi();
  34.172 +        
  34.173 +    vdi = vdi_create( NULL, "UNIT TEST VDI");   /* NULL parent snapshot */
  34.174 +    
  34.175 +    if ( vdi == NULL ) {
  34.176 +        printf("Failed to create VDI!\n");
  34.177 +        freeblock(vdi);   /* NOTE(review): vdi is NULL on this path */
  34.178 +        exit(-1);
  34.179 +    }
  34.180 +
  34.181 +    fill_test_pages();
  34.182 +    coverage_test(vdi);
  34.183 +    
  34.184 +    freeblock(vdi);
  34.185 +    
  34.186 +    return (0);
  34.187 +}
    35.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.2 +++ b/tools/blktap/parallax/vdi_validate.c	Sun Jul 03 22:36:48 2005 +0000
    35.3 @@ -0,0 +1,97 @@
    35.4 +/**************************************************************************
    35.5 + * 
    35.6 + * vdi_validate.c
    35.7 + *
    35.8 + * Intended to sanity-check vdi_fill and the underlying vdi code.
    35.9 + *
   35.10 + * Block-by-block compare of a vdi with a file/device on the disk.
   35.11 + *
   35.12 + */
   35.13 + 
   35.14 +#include <stdio.h>
   35.15 +#include <stdlib.h>
   35.16 +#include <string.h>
   35.17 +#include <sys/types.h>
   35.18 +#include <sys/stat.h>
   35.19 +#include <fcntl.h>
   35.20 +#include <unistd.h>
   35.21 +#include "blockstore.h"
   35.22 +#include "radix.h"
   35.23 +#include "vdi.h"
   35.24 +#include "requests-async.h"
   35.25 +/* Compare the file argv[2] block by block with the contents of the VDI with id argv[1]. */
   35.26 +int main(int argc, char *argv[])
   35.27 +{
   35.28 +    vdi_t       *vdi;
   35.29 +    u64          id;
   35.30 +    int          fd;
   35.31 +    struct stat  st;
   35.32 +    u64          tot_size;
   35.33 +    char         spage[BLOCK_SIZE], *dpage;
   35.34 +    ssize_t      count;      /* signed: read() reports an error as -1 */
   35.35 +    u64          vblock = 0; /* index of the next VDI block to compare */
   35.36 +    
   35.37 +    __init_blockstore();
   35.38 +    init_block_async();
   35.39 +    __init_vdi();
   35.40 +    
   35.41 +    if ( argc < 3 ) {
   35.42 +        printf("usage: %s <VDI id> <filename>\n", argv[0]);
   35.43 +        exit(-1);
   35.44 +    }
   35.45 +        
   35.46 +    id = (u64) atoll(argv[1]);
   35.47 +    
   35.48 +    vdi = vdi_get( id );
   35.49 +    
   35.50 +    if ( vdi == NULL ) {
   35.51 +        printf("Failed to retreive VDI %Ld!\n", id);
   35.52 +        exit(-1);
   35.53 +    }
   35.54 +    
   35.55 +    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
   35.56 +    
   35.57 +    if (fd < 0) {
   35.58 +        printf("Couldn't open %s!\n", argv[2]);
   35.59 +        exit(-1);
   35.60 +    }
   35.61 +    
   35.62 +    if ( fstat(fd, &st) != 0 ) {
   35.63 +        printf("Couldn't stat %s!\n", argv[2]);
   35.64 +        exit(-1);
   35.65 +    }
   35.66 +    
   35.67 +    tot_size = (u64) st.st_size;
   35.68 +    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
   35.69 +    
   35.70 +    printf("           ");
   35.71 +    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
   35.72 +
   35.73 +        dpage = vdi_read_s(vdi, vblock);
   35.74 +
   35.75 +        if (dpage == NULL) {
   35.76 +            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
   35.77 +            exit(0);   /* NOTE(review): a failed validation exits with status 0 */
   35.78 +        }
   35.79 +
   35.80 +        if (memcmp(spage, dpage, BLOCK_SIZE) != 0) {
   35.81 +            printf("\n\nblocks don't match! (%Ld)\n", vblock);
   35.82 +            exit(0);   /* NOTE(review): a failed validation exits with status 0 */
   35.83 +        }
   35.84 +        
   35.85 +        freeblock(dpage);
   35.86 +        
   35.87 +        vblock++;
   35.88 +        if ((vblock % 1024) == 0) {
   35.89 +            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
   35.90 +            fflush(stdout);
   35.91 +        }
   35.92 +    }
   35.93 +    printf("\n");
   35.94 +    if ( count < 0 ) { printf("Couldn't read %s!\n", argv[2]); exit(-1); }
   35.95 +    printf("VDI %Ld looks good!\n", id);
   35.96 +    
   35.97 +    freeblock(vdi);
   35.98 +    
   35.99 +    return (0);
  35.100 +}
    36.1 --- a/tools/blktap/radix.c	Sun Jul 03 22:32:52 2005 +0000
    36.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    36.3 @@ -1,631 +0,0 @@
    36.4 -/*
    36.5 - * Radix tree for mapping (up to) 63-bit virtual block IDs to
    36.6 - * 63-bit global block IDs
    36.7 - *
    36.8 - * Pointers within the tree set aside the least significant bit to indicate
    36.9 - * whther or not the target block is writable from this node.
   36.10 - *
   36.11 - * The block with ID 0 is assumed to be an empty block of all zeros
   36.12 - */
   36.13 -
   36.14 -#include <unistd.h>
   36.15 -#include <stdio.h>
   36.16 -#include <stdlib.h>
   36.17 -#include <assert.h>
   36.18 -#include <string.h>
   36.19 -#include <pthread.h>
   36.20 -#include "blockstore.h"
   36.21 -#include "radix.h"
   36.22 -
   36.23 -#define RADIX_TREE_MAP_SHIFT 9
   36.24 -#define RADIX_TREE_MAP_MASK 0x1ff
   36.25 -#define RADIX_TREE_MAP_ENTRIES 512
   36.26 -
   36.27 -/*
   36.28 -#define DEBUG
   36.29 -*/
   36.30 -
   36.31 -/* Experimental radix cache. */
   36.32 -
   36.33 -static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
   36.34 -static  int rcache_count = 0;
   36.35 -#define RCACHE_MAX 1024
   36.36 -
   36.37 -typedef struct rcache_st {
   36.38 -    radix_tree_node  *node;
   36.39 -    u64               id;
   36.40 -    struct rcache_st *hash_next;
   36.41 -    struct rcache_st *cache_next;
   36.42 -    struct rcache_st *cache_prev;
   36.43 -} rcache_t;
   36.44 -
   36.45 -static rcache_t *rcache_head = NULL;
   36.46 -static rcache_t *rcache_tail = NULL;
   36.47 -
   36.48 -#define RCHASH_SIZE 512ULL
   36.49 -rcache_t *rcache[RCHASH_SIZE];
   36.50 -#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
   36.51 -
   36.52 -void __rcache_init(void)
   36.53 -{
   36.54 -    int i;
   36.55 -
   36.56 -    for (i=0; i<RCHASH_SIZE; i++)
   36.57 -        rcache[i] = NULL;
   36.58 -}
   36.59 -    
   36.60 -
   36.61 -void rcache_write(u64 id, radix_tree_node *node)
   36.62 -{
   36.63 -    rcache_t *r, *tmp, **curs;
   36.64 -    
   36.65 -    pthread_mutex_lock(&rcache_mutex);
   36.66 -    
   36.67 -    /* Is it already in the cache? */
   36.68 -    r = rcache[RCACHE_HASH(id)];
   36.69 -    
   36.70 -    for (;;) {
   36.71 -        if (r == NULL) 
   36.72 -            break;
   36.73 -        if (r->id == id) 
   36.74 -        {
   36.75 -            memcpy(r->node, node, BLOCK_SIZE);
   36.76 -            
   36.77 -            /* bring to front. */
   36.78 -            if (r != rcache_head) {
   36.79 -                
   36.80 -                if (r == rcache_tail) {
   36.81 -                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
   36.82 -                    rcache_tail->cache_next = NULL;
   36.83 -                }
   36.84 -
   36.85 -                tmp = r->cache_next;
   36.86 -                if (r->cache_next != NULL) r->cache_next->cache_prev 
   36.87 -                                                     = r->cache_prev;
   36.88 -                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
   36.89 -
   36.90 -                r->cache_prev = NULL;
   36.91 -                r->cache_next = rcache_head;
   36.92 -                if (rcache_head != NULL) rcache_head->cache_prev = r;
   36.93 -                rcache_head = r;
   36.94 -            }
   36.95 -
   36.96 -//printf("Update (%Ld)\n", r->id);
   36.97 -            goto done;
   36.98 -        }
   36.99 -        r = r->hash_next;
  36.100 -    }
  36.101 -    
  36.102 -    if ( rcache_count == RCACHE_MAX ) 
  36.103 -    {
  36.104 -        /* Remove an entry */
  36.105 -        
  36.106 -        r = rcache_tail;
  36.107 -        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  36.108 -        rcache_tail->cache_next = NULL;
  36.109 -        freeblock(r->node);
  36.110 -        
  36.111 -        curs = &rcache[RCACHE_HASH(r->id)];
  36.112 -        while ((*curs) != r)
  36.113 -            curs = &(*curs)->hash_next;
  36.114 -        *curs = r->hash_next;
  36.115 -//printf("Evict (%Ld)\n", r->id);
  36.116 -        
  36.117 -    } else {
  36.118 -        
  36.119 -        r = (rcache_t *)malloc(sizeof(rcache_t));
  36.120 -        rcache_count++;
  36.121 -    }
  36.122 -    
  36.123 -    r->node = newblock();
  36.124 -    memcpy(r->node, node, BLOCK_SIZE);
  36.125 -    r->id = id;
  36.126 -    
  36.127 -    r->hash_next = rcache[RCACHE_HASH(id)];
  36.128 -    rcache[RCACHE_HASH(id)] = r;
  36.129 -    
  36.130 -    r->cache_prev = NULL;
  36.131 -    r->cache_next = rcache_head;
  36.132 -    if (rcache_head != NULL) rcache_head->cache_prev = r;
  36.133 -    rcache_head = r;
  36.134 -    if (rcache_tail == NULL) rcache_tail = r;
  36.135 -    
  36.136 -//printf("Added (%Ld, %p)\n", id, r->node);
  36.137 -done:
  36.138 -    pthread_mutex_unlock(&rcache_mutex);
  36.139 -}
  36.140 -
  36.141 -radix_tree_node *rcache_read(u64 id)
  36.142 -{
  36.143 -    rcache_t *r, *tmp;
  36.144 -    radix_tree_node *node = NULL;
  36.145 -    
  36.146 -    pthread_mutex_lock(&rcache_mutex);
  36.147 -
  36.148 -    r = rcache[RCACHE_HASH(id)];
  36.149 -    
  36.150 -    for (;;) {
  36.151 -        if (r == NULL) {
  36.152 -//printf("Miss (%Ld)\n", id);
  36.153 -            goto done;
  36.154 -        }
  36.155 -        if (r->id == id) break;
  36.156 -        r = r->hash_next;
  36.157 -    }
  36.158 -   
  36.159 -    /* bring to front. */
  36.160 -    if (r != rcache_head) 
  36.161 -    {
  36.162 -        if (r == rcache_tail) {
  36.163 -            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
  36.164 -            rcache_tail->cache_next = NULL;
  36.165 -        }
  36.166 -        tmp = r->cache_next;
  36.167 -        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
  36.168 -        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
  36.169 -
  36.170 -        r->cache_prev = NULL;
  36.171 -        r->cache_next = rcache_head;
  36.172 -        if (rcache_head != NULL) rcache_head->cache_prev = r;
  36.173 -        rcache_head = r;
  36.174 -    }
  36.175 -    
  36.176 -    node = newblock();
  36.177 -    memcpy(node, r->node, BLOCK_SIZE);
  36.178 -    
  36.179 -//printf("Hit (%Ld, %p)\n", id, r->node);
  36.180 -done:
  36.181 -    pthread_mutex_unlock(&rcache_mutex);
  36.182 -    
  36.183 -    return(node);
  36.184 -}
  36.185 -
  36.186 -
  36.187 -void *rc_readblock(u64 id)
  36.188 -{
  36.189 -    void *ret;
  36.190 -    
  36.191 -    ret = (void *)rcache_read(id);
  36.192 -    
  36.193 -    if (ret != NULL) return ret;
  36.194 -    
  36.195 -    ret = readblock(id);
  36.196 -    
  36.197 -    if (ret != NULL)
  36.198 -        rcache_write(id, ret);
  36.199 -    
  36.200 -    return(ret);
  36.201 -}
  36.202 -
  36.203 -u64 rc_allocblock(void *block)
  36.204 -{
  36.205 -    u64 ret;
  36.206 -    
  36.207 -    ret = allocblock(block);
  36.208 -    
  36.209 -    if (ret != ZERO)
  36.210 -        rcache_write(ret, block);
  36.211 -    
  36.212 -    return(ret);
  36.213 -}
  36.214 -
  36.215 -int rc_writeblock(u64 id, void *block)
  36.216 -{
  36.217 -    int ret;
  36.218 -    
  36.219 -    ret = writeblock(id, block);
  36.220 -    rcache_write(id, block);
  36.221 -    
  36.222 -    return(ret);
  36.223 -}
  36.224 -
  36.225 -
  36.226 -/*
  36.227 - * block device interface and other helper functions
  36.228 - * with these functions, block id is just a 63-bit number, with
  36.229 - * no special consideration for the LSB
  36.230 - */
  36.231 -radix_tree_node cloneblock(radix_tree_node block);
  36.232 -
  36.233 -/*
  36.234 - * main api
  36.235 - * with these functions, the LSB of root always indicates
  36.236 - * whether or not the block is writable, including the return
  36.237 - * values of update and snapshot
  36.238 - */
  36.239 -u64 lookup(int height, u64 root, u64 key);
  36.240 -u64 update(int height, u64 root, u64 key, u64 val);
  36.241 -u64 snapshot(u64 root);
  36.242 -
  36.243 -/**
  36.244 - * cloneblock: clone an existing block in memory
  36.245 - *   @block: the old block
  36.246 - *
  36.247 - *   @return: new block, with LSB cleared for every entry
  36.248 - */
  36.249 -radix_tree_node cloneblock(radix_tree_node block) {
  36.250 -    radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE);
  36.251 -    int i;
  36.252 -    if (node == NULL) {
  36.253 -        perror("cloneblock malloc");
  36.254 -        return NULL;
  36.255 -    }
  36.256 -    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
  36.257 -        node[i] = block[i] & ONEMASK;
  36.258 -    return node;
  36.259 -}
  36.260 -
  36.261 -/**
  36.262 - * lookup: find a value given a key
  36.263 - *   @height: height in bits of the radix tree
  36.264 - *   @root: root node id, with set LSB indicating writable node
  36.265 - *   @key: key to lookup
  36.266 - *
  36.267 - *   @return: value on success, zero on error
  36.268 - */
  36.269 -
  36.270 -u64 lookup(int height, u64 root, u64 key) {
  36.271 -    radix_tree_node node;
  36.272 -    u64 mask = ONE;
  36.273 -    
  36.274 -    assert(key >> height == 0);
  36.275 -
  36.276 -    /* the root block may be smaller to ensure all leaves are full */
  36.277 -    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  36.278 -
  36.279 -    /* now carve off equal sized chunks at each step */
  36.280 -    for (;;) {
  36.281 -        u64 oldroot;
  36.282 -
  36.283 -#ifdef DEBUG
  36.284 -        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  36.285 -                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
  36.286 -                (iswritable(root) ? "" : " (readonly)"));
  36.287 -#endif
  36.288 -        
  36.289 -        if (getid(root) == ZERO)
  36.290 -            return ZERO;
  36.291 -
  36.292 -        oldroot = root;
  36.293 -        node = (radix_tree_node) rc_readblock(getid(root));
  36.294 -        if (node == NULL)
  36.295 -            return ZERO;
  36.296 -
  36.297 -        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
  36.298 -        mask &= root;
  36.299 -        freeblock(node);
  36.300 -
  36.301 -        if (height == 0)
  36.302 -            return ( root & ONEMASK ) | mask;
  36.303 -
  36.304 -        height -= RADIX_TREE_MAP_SHIFT;
  36.305 -    }
  36.306 -
  36.307 -    return ZERO;
  36.308 -}
  36.309 -
  36.310 -/*
  36.311 - * update: set a radix tree entry, doing copy-on-write as necessary
  36.312 - *   @height: height in bits of the radix tree
  36.313 - *   @root: root node id, with set LSB indicating writable node
  36.314 - *   @key: key to set
  36.315 - *   @val: value to set, s.t. radix(key)=val
  36.316 - *
  36.317 - *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
  36.318 - */
  36.319 -
  36.320 -u64 update(int height, u64 root, u64 key, u64 val) {
  36.321 -    int offset;
  36.322 -    u64 child;
  36.323 -    radix_tree_node node;
  36.324 -    
  36.325 -    /* base case--return val */
  36.326 -    if (height == 0)
  36.327 -        return val;
  36.328 -
  36.329 -    /* the root block may be smaller to ensure all leaves are full */
  36.330 -    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  36.331 -    offset = (key >> height) & RADIX_TREE_MAP_MASK;
  36.332 -
  36.333 -#ifdef DEBUG
  36.334 -    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
  36.335 -            offset, (iswritable(root)?"":" (clone)"));
  36.336 -#endif
  36.337 -
  36.338 -    /* load a block, or create a new one */
  36.339 -    if (root == ZERO) {
  36.340 -        node = (radix_tree_node) newblock();
  36.341 -    } else {
  36.342 -        node = (radix_tree_node) rc_readblock(getid(root));
  36.343 -
  36.344 -        if (!iswritable(root)) {
  36.345 -            /* need to clone this node */
  36.346 -            radix_tree_node oldnode = node;
  36.347 -            node = cloneblock(node);
  36.348 -            freeblock(oldnode);
  36.349 -            root = ZERO;
  36.350 -        }
  36.351 -    }
  36.352 -
  36.353 -    if (node == NULL) {
  36.354 -#ifdef DEBUG
  36.355 -        printf("update: node is null!\n");
  36.356 -#endif
  36.357 -        return ZERO;
  36.358 -    }
  36.359 -
  36.360 -    child = update(height, node[offset], key, val);
  36.361 -
  36.362 -    if (child == ZERO) {
  36.363 -        freeblock(node);
  36.364 -        return ZERO;
  36.365 -    } else if (child == node[offset]) {
  36.366 -        /* no change, so we already owned the child */
  36.367 -        assert(iswritable(root));
  36.368 -
  36.369 -        freeblock(node);
  36.370 -        return root;
  36.371 -    }
  36.372 -
  36.373 -    node[offset] = child;
  36.374 -
  36.375 -    /* new/cloned blocks need to be saved */
  36.376 -    if (root == ZERO) {
  36.377 -        /* mark this as an owned block */
  36.378 -        root = rc_allocblock(node);
  36.379 -        if (root)
  36.380 -            root = writable(root);
  36.381 -    } else if (rc_writeblock(getid(root), node) < 0) {
  36.382 -        freeblock(node);
  36.383 -        return ZERO;
  36.384 -    }
  36.385 -
  36.386 -    freeblock(node);
  36.387 -    return root;
  36.388 -}
  36.389 -
  36.390 -/**
  36.391 - * snapshot: create a snapshot
  36.392 - *   @root: old root node
  36.393 - *
  36.394 - *   @return: new root node, 0 on error
  36.395 - */
  36.396 -u64 snapshot(u64 root) {
  36.397 -    radix_tree_node node, newnode;
  36.398 -
  36.399 -    if ((node = rc_readblock(getid(root))) == NULL)
  36.400 -        return ZERO;
  36.401 -
  36.402 -    newnode = cloneblock(node);
  36.403 -    freeblock(node);
  36.404 -    if (newnode == NULL)
  36.405 -        return ZERO;
  36.406 -    
  36.407 -    root = rc_allocblock(newnode);
  36.408 -    freeblock(newnode);
  36.409 -
  36.410 -    if (root == ZERO)
  36.411 -        return ZERO;
  36.412 -    else
  36.413 -        return writable(root);
  36.414 -}
  36.415 -
  36.416 -/**
  36.417 - * collapse: collapse a parent onto a child.
  36.418 - * 
  36.419 - * NOTE: This assumes that parent and child really are, and further that
  36.420 - * there are no other children forked from this parent. (children of the
  36.421 - * child are okay...)
  36.422 - */
  36.423 -
  36.424 -int collapse(int height, u64 proot, u64 croot)
  36.425 -{
  36.426 -    int i, numlinks, ret, total = 0;
  36.427 -    radix_tree_node pnode, cnode;
  36.428 -    
  36.429 -    if (height == 0) {
  36.430 -        height = -1; /* terminate recursion */
  36.431 -    } else {        
  36.432 -        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
  36.433 -    }
  36.434 -    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
  36.435 -
  36.436 -    /* Terminal cases: */
  36.437 -
  36.438 -    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
  36.439 -        return -1;
  36.440 -    
  36.441 -    /* get roots */
  36.442 -    if ((pnode = readblock(getid(proot))) == NULL)
  36.443 -        return -1;
  36.444 -    
  36.445 -    if ((cnode = readblock(getid(croot))) == NULL)
  36.446 -    {
  36.447 -        freeblock(pnode);
  36.448 -        return -1;
  36.449 -    }
  36.450 -    
  36.451 -    /* For each writable link in proot */
  36.452 -    for (i=0; i<numlinks; i++)
  36.453 -    {
  36.454 -        if ( pnode[i] == cnode[i] ) continue;
  36.455 -        
  36.456 -        /* collapse (next level) */
  36.457 -        /* if height != 0 and writable... */
  36.458 -        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
  36.459 -        {
  36.460 -            //printf("   %Ld is writable (i=%d).\n", getid(pnode[i]), i);
  36.461 -            ret = collapse(height, pnode[i], cnode[i]);
  36.462 -            if (ret == -1) 
  36.463 -            {
  36.464 -                total = -1;
  36.465 -            } else {
  36.466 -                total += ret;
  36.467 -            }
  36.468 -        }
  36.469 -    
  36.470 -        
  36.471 -    }
  36.472 -    
  36.473 -    /* if plink is writable, AND clink is writable -> free plink block */
  36.474 -    if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) 
  36.475 -    {
  36.476 -        releaseblock(getid(proot));
  36.477 -        if (ret >=0) total++;
  36.478 -        //printf("   Delete %Ld\n", getid(proot));
  36.479 -    }
  36.480 -//printf("done : %Ld\n", getid(proot));
  36.481 -    return total;
  36.482 -
  36.483 -}
  36.484 -
  36.485 -
  36.486 -void print_root(u64 root, int height, FILE *dot_f)
  36.487 -{
  36.488 -    FILE *f;
  36.489 -    int i;
  36.490 -    radix_tree_node node;
  36.491 -    char *style[2] = { "", "style=bold,color=blue," };
  36.492 -    
  36.493 -    if (dot_f == NULL) {
  36.494 -        f = fopen("radix.dot", "w");
  36.495 -        if (f == NULL) {
  36.496 -            perror("print_root: open");
  36.497 -            return;
  36.498 -        }
  36.499 -
  36.500 -        /* write graph preamble */
  36.501 -        fprintf(f, "digraph G {\n");
  36.502 -
  36.503 -        /* add a node for this root. */
  36.504 -        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  36.505 -                getid(root), style[iswritable(root)], getid(root));
  36.506 -    }
  36.507 -    
  36.508 -    printf("print_root(%Ld)\n", getid(root));
  36.509 -    
  36.510 -    /* base case */
  36.511 -    if (height == 0) {
  36.512 -        /* add a node and edge for each child root */
  36.513 -        node = (radix_tree_node) readblock(getid(root));
  36.514 -        if (node == NULL)
  36.515 -            return;
  36.516 -        
  36.517 -        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
  36.518 -            if (node[i] != ZERO) {
  36.519 -                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
  36.520 -                        getid(node[i]), style[iswritable(node[i])], 
  36.521 -                        getid(node[i]));
  36.522