direct-io.hg

changeset 4261:1c7c83cdf0ed

bitkeeper revision 1.1236.1.111 (424042ca2OF5ZJ7nHGwhQO1NDV_-CQ)

Merge firebug.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xen-unstable.bk
into firebug.cl.cam.ac.uk:/local/scratch/cl349/xen-unstable.bk
author cl349@firebug.cl.cam.ac.uk
date Tue Mar 22 16:07:38 2005 +0000 (2005-03-22)
parents beb68750d6e0 0529fe941ec4
children 04400e772fd7
files BitKeeper/etc/logging_ok tools/blktap/blockstore.c tools/blktap/blockstore.h
line diff
     1.1 --- a/BitKeeper/etc/logging_ok	Tue Mar 22 15:18:52 2005 +0000
     1.2 +++ b/BitKeeper/etc/logging_ok	Tue Mar 22 16:07:38 2005 +0000
     1.3 @@ -34,6 +34,7 @@ iap10@pb007.cl.cam.ac.uk
     1.4  iap10@striker.cl.cam.ac.uk
     1.5  iap10@tetris.cl.cam.ac.uk
     1.6  jrb44@plym.cl.cam.ac.uk
     1.7 +jrb44@swoop.cl.cam.ac.uk
     1.8  jws22@gauntlet.cl.cam.ac.uk
     1.9  jws@cairnwell.research
    1.10  kaf24@camelot.eng.3leafnetworks.com
     2.1 --- a/tools/blktap/blockstore.c	Tue Mar 22 15:18:52 2005 +0000
     2.2 +++ b/tools/blktap/blockstore.c	Tue Mar 22 16:07:38 2005 +0000
     2.3 @@ -13,32 +13,73 @@
     2.4  #include <string.h>
     2.5  #include <sys/types.h>
     2.6  #include <sys/stat.h>
     2.7 +#include <stdarg.h>
     2.8  #include "blockstore.h"
     2.9 -#include "parallax-threaded.h"
    2.10  
    2.11  #define BLOCKSTORE_REMOTE
    2.12 +//#define BSDEBUG
    2.13 +
    2.14 +/*****************************************************************************
    2.15 + * Debugging
    2.16 + */
    2.17 +#ifdef BSDEBUG
    2.18 +void DB(char *format, ...)
    2.19 +{
    2.20 +    va_list args;
    2.21 +    
    2.22 +    va_start(args, format);
    2.23 +    vfprintf(stderr, format, args);
    2.24 +    va_end(args);
    2.25 +}
    2.26 +#else
    2.27 +#define DB(format, ...) (void)0
    2.28 +#endif
    2.29  
    2.30  #ifdef BLOCKSTORE_REMOTE
    2.31  
    2.32 -//#define BSDEBUG
    2.33 -
    2.34  #include <sys/socket.h>
    2.35  #include <sys/ioctl.h>
    2.36  #include <netinet/in.h>
    2.37  #include <netdb.h>
    2.38  
    2.39 +/*****************************************************************************
    2.40 + *                                                                           *
    2.41 + *****************************************************************************/
    2.42 +
    2.43 +/*****************************************************************************
    2.44 + * Network state                                                             *
    2.45 + *****************************************************************************/
    2.46 +
    2.47 +/* The individual disk servers we talks to. These will be referenced by
    2.48 + * an integer index into bsservers[].
    2.49 + */
    2.50 +bsserver_t bsservers[MAX_SERVERS];
    2.51 +
    2.52 +/* The cluster map. This is indexed by an integer cluster number.
    2.53 + */
    2.54 +bscluster_t bsclusters[MAX_CLUSTERS];
    2.55 +
    2.56 +/* Local socket.
    2.57 + */
    2.58 +struct sockaddr_in sin_local;
    2.59 +int bssock = 0;
    2.60 +
    2.61 +/*****************************************************************************
    2.62 + * Message queue management                                                  *
    2.63 + *****************************************************************************/
    2.64 +
    2.65 +/* Protects the queue manipulation critcal regions.
    2.66 + */
    2.67  #define ENTER_QUEUE_CR (void)0
    2.68  #define LEAVE_QUEUE_CR (void)0
    2.69  
    2.70 -bsserver_t bsservers[MAX_SERVERS];
    2.71 -bscluster_t bsclusters[MAX_CLUSTERS];
    2.72 -
    2.73 -struct sockaddr_in sin_local;
    2.74 -int bssock = 0;
    2.75 -
    2.76 +/* A message queue entry. We allocate one of these for every request we send.
    2.77 + * Asynchronous reply reception also used one of these.
    2.78 + */
    2.79  typedef struct bsq_t_struct {
    2.80      struct bsq_t_struct *prev;
    2.81      struct bsq_t_struct *next;
    2.82 +    int status;
    2.83      int server;
    2.84      int length;
    2.85      struct msghdr msghdr;
    2.86 @@ -47,8 +88,134 @@ typedef struct bsq_t_struct {
    2.87      void *block;
    2.88  } bsq_t;
    2.89  
    2.90 +#define BSQ_STATUS_MATCHED 1
    2.91 +
    2.92 +#define ENTER_LUID_CR (void)0
    2.93 +#define LEAVE_LUID_CR (void)0
    2.94 +
    2.95 +static u64 luid_cnt = 0x1000ULL;
    2.96 +u64 new_luid(void) {
    2.97 +    u64 luid;
    2.98 +    ENTER_LUID_CR;
    2.99 +    luid = luid_cnt++;
   2.100 +    LEAVE_LUID_CR;
   2.101 +    return luid;
   2.102 +}
   2.103 +
   2.104 +/* Queue of outstanding requests.
   2.105 + */
   2.106  bsq_t *bs_head = NULL;
   2.107  bsq_t *bs_tail = NULL;
   2.108 +int bs_qlen = 0;
   2.109 +
   2.110 +/*
   2.111 + */
   2.112 +void queuedebug(char *msg) {
   2.113 +    bsq_t *q;
   2.114 +    ENTER_QUEUE_CR;
   2.115 +    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
   2.116 +    for (q = bs_head; q; q = q->next) {
   2.117 +        fprintf(stderr, "  luid=%016llx server=%u\n",
   2.118 +                q->message.luid, q->server);
   2.119 +    }
   2.120 +    LEAVE_QUEUE_CR;
   2.121 +}
   2.122 +
   2.123 +int enqueue(bsq_t *qe) {
   2.124 +    ENTER_QUEUE_CR;
   2.125 +    qe->next = NULL;
   2.126 +    qe->prev = bs_tail;
   2.127 +    if (!bs_head)
   2.128 +        bs_head = qe;
   2.129 +    else
   2.130 +        bs_tail->next = qe;
   2.131 +    bs_tail = qe;
   2.132 +    bs_qlen++;
   2.133 +    LEAVE_QUEUE_CR;
   2.134 +#ifdef BSDEBUG
   2.135 +    queuedebug("enqueue");
   2.136 +#endif
   2.137 +    return 0;
   2.138 +}
   2.139 +
   2.140 +int dequeue(bsq_t *qe) {
   2.141 +    bsq_t *q;
   2.142 +    ENTER_QUEUE_CR;
   2.143 +    for (q = bs_head; q; q = q->next) {
   2.144 +        if (q == qe) {
   2.145 +            if (q->prev)
   2.146 +                q->prev->next = q->next;
   2.147 +            else 
   2.148 +                bs_head = q->next;
   2.149 +            if (q->next)
   2.150 +                q->next->prev = q->prev;
   2.151 +            else
   2.152 +                bs_tail = q->prev;
   2.153 +            bs_qlen--;
   2.154 +            goto found;
   2.155 +        }
   2.156 +    }
   2.157 +
   2.158 +    LEAVE_QUEUE_CR;
   2.159 +#ifdef BSDEBUG
   2.160 +    queuedebug("dequeue not found");
   2.161 +#endif
   2.162 +    return 0;
   2.163 +
   2.164 +    found:
   2.165 +    LEAVE_QUEUE_CR;
   2.166 +#ifdef BSDEBUG
   2.167 +    queuedebug("dequeue not found");
   2.168 +#endif
   2.169 +    return 1;
   2.170 +}
   2.171 +
   2.172 +bsq_t *queuesearch(bsq_t *qe) {
   2.173 +    bsq_t *q;
   2.174 +    ENTER_QUEUE_CR;
   2.175 +    for (q = bs_head; q; q = q->next) {
   2.176 +        if ((qe->server == q->server) &&
   2.177 +            (qe->message.operation == q->message.operation) &&
   2.178 +            (qe->message.luid == q->message.luid)) {
   2.179 +
   2.180 +            if ((q->message.operation == BSOP_READBLOCK) &&
   2.181 +                ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
   2.182 +                q->block = qe->block;
   2.183 +                qe->block = NULL;
   2.184 +            }
   2.185 +            q->length = qe->length;
   2.186 +            q->message.flags = qe->message.flags;
   2.187 +            q->message.id = qe->message.id;
   2.188 +            q->status |= BSQ_STATUS_MATCHED;
   2.189 +
   2.190 +            if (q->prev)
   2.191 +                q->prev->next = q->next;
   2.192 +            else 
   2.193 +                bs_head = q->next;
   2.194 +            if (q->next)
   2.195 +                q->next->prev = q->prev;
   2.196 +            else
   2.197 +                bs_tail = q->prev;
   2.198 +            q->next = NULL;
   2.199 +            q->prev = NULL;
   2.200 +            bs_qlen--;
   2.201 +            goto found;
   2.202 +        }
   2.203 +    }
   2.204 +
   2.205 +    LEAVE_QUEUE_CR;
   2.206 +#ifdef BSDEBUG
   2.207 +    queuedebug("queuesearch not found");
   2.208 +#endif
   2.209 +    return NULL;
   2.210 +
   2.211 +    found:
   2.212 +    LEAVE_QUEUE_CR;
   2.213 +#ifdef BSDEBUG
   2.214 +    queuedebug("queuesearch found");
   2.215 +#endif
   2.216 +    return q;
   2.217 +}
   2.218  
   2.219  int send_message(bsq_t *qe) {
   2.220      int rc;
   2.221 @@ -72,16 +239,21 @@ int send_message(bsq_t *qe) {
   2.222          qe->iov[1].iov_len = BLOCK_SIZE;
   2.223      }
   2.224  
   2.225 -    rc = sendmsg(bssock, &(qe->msghdr), 0);
   2.226 +    qe->message.luid = new_luid();
   2.227 +
   2.228 +    qe->status = 0;
   2.229 +    if (enqueue(qe) < 0) {
   2.230 +        fprintf(stderr, "Error enqueuing request.\n");
   2.231 +        return -1;
   2.232 +    }
   2.233 +
   2.234 +    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
   2.235 +    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
   2.236      //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
   2.237      //           (struct sockaddr *)&(bsservers[qe->server].sin),
   2.238      //           sizeof(struct sockaddr_in));
   2.239      if (rc < 0)
   2.240          return rc;
   2.241 -    
   2.242 -    ENTER_QUEUE_CR;
   2.243 -    
   2.244 -    LEAVE_QUEUE_CR;
   2.245  
   2.246      return rc;
   2.247  }
   2.248 @@ -116,22 +288,148 @@ int recv_message(bsq_t *qe) {
   2.249      return rc;
   2.250  }
   2.251  
   2.252 +int get_server_number(struct sockaddr_in *sin) {
   2.253 +    int i;
   2.254 +
   2.255 +#ifdef BSDEBUG2
   2.256 +    fprintf(stderr,
   2.257 +            "get_server_number(%u.%u.%u.%u/%u)\n",
   2.258 +            (unsigned int)sin->sin_addr.s_addr & 0xff,
   2.259 +            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
   2.260 +            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
   2.261 +            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
   2.262 +            (unsigned int)sin->sin_port);
   2.263 +#endif
   2.264 +
   2.265 +    for (i = 0; i < MAX_SERVERS; i++) {
   2.266 +        if (bsservers[i].hostname) {
   2.267 +#ifdef BSDEBUG2
   2.268 +            fprintf(stderr,
   2.269 +                    "get_server_number check %u.%u.%u.%u/%u\n",
   2.270 +                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
   2.271 +                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
   2.272 +                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
   2.273 +                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
   2.274 +                    (unsigned int)bsservers[i].sin.sin_port);
   2.275 +#endif
   2.276 +            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
   2.277 +                (sin->sin_port == bsservers[i].sin.sin_port) &&
   2.278 +                (memcmp((void *)&(sin->sin_addr),
   2.279 +                        (void *)&(bsservers[i].sin.sin_addr),
   2.280 +                        sizeof(struct in_addr)) == 0)) {
   2.281 +                return i;
   2.282 +            }
   2.283 +        }        
   2.284 +    }
   2.285 +
   2.286 +    return -1;
   2.287 +}
   2.288 +
   2.289 +void *rx_buffer = NULL;
   2.290 +bsq_t rx_qe;
   2.291 +bsq_t *recv_any(void) {
   2.292 +    struct sockaddr_in from;
   2.293 +    int rc;
   2.294 +
   2.295 +    DB("ENTER recv_any\n");
   2.296 +
   2.297 +    rx_qe.msghdr.msg_name = &from;
   2.298 +    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
   2.299 +    rx_qe.msghdr.msg_iov = rx_qe.iov;
   2.300 +    if (!rx_buffer) {
   2.301 +        rx_buffer = malloc(BLOCK_SIZE);
   2.302 +        if (!rx_buffer) {
   2.303 +            perror("recv_any malloc");
   2.304 +            return NULL;
   2.305 +        }
   2.306 +    }
   2.307 +    rx_qe.block = rx_buffer;
   2.308 +    rx_buffer = NULL;
   2.309 +    rx_qe.msghdr.msg_iovlen = 2;
   2.310 +    rx_qe.msghdr.msg_control = NULL;
   2.311 +    rx_qe.msghdr.msg_controllen = 0;
   2.312 +    rx_qe.msghdr.msg_flags = 0;
   2.313 +    
   2.314 +    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
   2.315 +    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
   2.316 +    rx_qe.iov[1].iov_base = rx_qe.block;
   2.317 +    rx_qe.iov[1].iov_len = BLOCK_SIZE;
   2.318 +
   2.319 +    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
   2.320 +    if (rc < 0) {
   2.321 +        perror("recv_any");
   2.322 +        return NULL;
   2.323 +    }
   2.324 +    rx_qe.length = rc;    
   2.325 +    rx_qe.server = get_server_number(&from);
   2.326 +
   2.327 +    DB("recv_any from %d luid=%016llx len=%u\n",
   2.328 +       rx_qe.server, rx_qe.message.luid, rx_qe.length);
   2.329 +
   2.330 +    return &rx_qe;
   2.331 +}
   2.332 +
   2.333 +void recv_recycle_buffer(bsq_t *q) {
   2.334 +    if (q->block) {
   2.335 +        rx_buffer = q->block;
   2.336 +        q->block = NULL;
   2.337 +    }
   2.338 +}
   2.339 +
   2.340 +// cycle through reading any incoming, searching for a match in the
   2.341 +// queue, until we have all we need.
   2.342 +int wait_recv(bsq_t **reqs, int numreqs) {
   2.343 +    bsq_t *q, *m;
   2.344 +    unsigned int x, i;
   2.345 +
   2.346 +    DB("ENTER wait_recv %u\n", numreqs);
   2.347 +
   2.348 +    checkmatch:
   2.349 +    x = 0xffffffff;
   2.350 +    for (i = 0; i < numreqs; i++) {
   2.351 +        x &= reqs[i]->status;
   2.352 +    }
   2.353 +    if ((x & BSQ_STATUS_MATCHED)) {
   2.354 +        DB("LEAVE wait_recv\n");
   2.355 +        return numreqs;
   2.356 +    }
   2.357 +
   2.358 +    rxagain:
   2.359 +    q = recv_any();
   2.360 +    if (!q)
   2.361 +        return -1;
   2.362 +
   2.363 +    m = queuesearch(q);
   2.364 +    recv_recycle_buffer(q);
   2.365 +    if (!m) {
   2.366 +        fprintf(stderr, "Unmatched RX\n");
   2.367 +        goto rxagain;
   2.368 +    }
   2.369 +
   2.370 +    goto checkmatch;
   2.371 +
   2.372 +}
   2.373 +
   2.374  void *readblock_indiv(int server, u64 id) {
   2.375      void *block;
   2.376      bsq_t *qe;
   2.377 -    int len;
   2.378 +    int len, rc;
   2.379  
   2.380      qe = (bsq_t *)malloc(sizeof(bsq_t));
   2.381      if (!qe) {
   2.382          perror("readblock qe malloc");
   2.383          return NULL;
   2.384      }
   2.385 +    qe->block = NULL;
   2.386 +    
   2.387 +    /*
   2.388      qe->block = malloc(BLOCK_SIZE);
   2.389      if (!qe->block) {
   2.390          perror("readblock qe malloc");
   2.391          free((void *)qe);
   2.392          return NULL;
   2.393      }
   2.394 +    */
   2.395  
   2.396      qe->server = server;
   2.397  
   2.398 @@ -145,34 +443,40 @@ void *readblock_indiv(int server, u64 id
   2.399          goto err;
   2.400      }
   2.401      
   2.402 -    len = recv_message(qe);
   2.403 +    /*len = recv_message(qe);
   2.404      if (len < 0) {
   2.405          perror("readblock recv");
   2.406          goto err;
   2.407 +    }*/
   2.408 +
   2.409 +    rc = wait_recv(&qe, 1);
   2.410 +    if (rc < 0) {
   2.411 +        perror("readblock recv");
   2.412 +        goto err;
   2.413      }
   2.414 +
   2.415      if ((qe->message.flags & BSOP_FLAG_ERROR)) {
   2.416          fprintf(stderr, "readblock server error\n");
   2.417          goto err;
   2.418      }
   2.419 -    if (len < MSGBUFSIZE_BLOCK) {
   2.420 +    if (qe->length < MSGBUFSIZE_BLOCK) {
   2.421          fprintf(stderr, "readblock recv short (%u)\n", len);
   2.422          goto err;
   2.423      }
   2.424 -    /* akw: memory leak here? */
   2.425 -    /*
   2.426 -    if ((block = malloc(BLOCK_SIZE)) == NULL) {
   2.427 +    /* if ((block = malloc(BLOCK_SIZE)) == NULL) {
   2.428          perror("readblock malloc");
   2.429          goto err;
   2.430      }
   2.431 -    */
   2.432 -    //memcpy(block, qe->message.block, BLOCK_SIZE);
   2.433 +    memcpy(block, qe->message.block, BLOCK_SIZE);
   2.434 +    */    
   2.435      block = qe->block;
   2.436  
   2.437      free((void *)qe);
   2.438      return block;
   2.439  
   2.440      err:
   2.441 -    free(qe->block);
   2.442 +    if (qe->block)
   2.443 +        free(qe->block);
   2.444      free((void *)qe);
   2.445      return NULL;
   2.446  }
   2.447 @@ -233,7 +537,8 @@ void *readblock(u64 id) {
   2.448      return block;
   2.449  }
   2.450  
   2.451 -int writeblock_indiv(int server, u64 id, void *block) {
   2.452 +bsq_t *writeblock_indiv(int server, u64 id, void *block) {
   2.453 +
   2.454      bsq_t *qe;
   2.455      int len;
   2.456  
   2.457 @@ -255,28 +560,14 @@ int writeblock_indiv(int server, u64 id,
   2.458          perror("writeblock sendto");
   2.459          goto err;
   2.460      }
   2.461 -    
   2.462 -    len = recv_message(qe);
   2.463 -    if (len < 0) {
   2.464 -        perror("writeblock recv");
   2.465 -        goto err;
   2.466 -    }
   2.467 -    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
   2.468 -        fprintf(stderr, "writeblock server error\n");
   2.469 -        goto err;
   2.470 -    }
   2.471 -    if (len < MSGBUFSIZE_ID) {
   2.472 -        fprintf(stderr, "writeblock recv short (%u)\n", len);
   2.473 -        goto err;
   2.474 -    }
   2.475  
   2.476 -    free((void *)qe);
   2.477 -    return 0;
   2.478 +    return qe;
   2.479  
   2.480      err:
   2.481      free((void *)qe);
   2.482 -    return -1;
   2.483 +    return NULL;
   2.484  }
   2.485 +    
   2.486  
   2.487  /**
   2.488   * writeblock: write an existing block to disk
   2.489 @@ -286,11 +577,15 @@ int writeblock_indiv(int server, u64 id,
   2.490   *   @return: zero on success, -1 on failure
   2.491   */
   2.492  int writeblock(u64 id, void *block) {
   2.493 +    
   2.494      int map = (int)BSID_MAP(id);
   2.495 -    
   2.496      int rep0 = bsclusters[map].servers[0];
   2.497      int rep1 = bsclusters[map].servers[1];
   2.498      int rep2 = bsclusters[map].servers[2];
   2.499 +    bsq_t *reqs[3];
   2.500 +    int rc;
   2.501 +
   2.502 +    reqs[0] = reqs[1] = reqs[2] = NULL;
   2.503  
   2.504  #ifdef BSDEBUG
   2.505      fprintf(stderr,
   2.506 @@ -306,20 +601,65 @@ int writeblock(u64 id, void *block) {
   2.507              (unsigned int)((unsigned char *)block)[7]);
   2.508  #endif
   2.509  
   2.510 -/* special case for the "superblock" just use the first block on the
   2.511 +    /* special case for the "superblock" just use the first block on the
   2.512       * first replica. (extend to blocks < 6 for vdi bug)
   2.513       */
   2.514      if (id < 6) {
   2.515 -        return writeblock_indiv(rep0, id, block);
   2.516 +        reqs[0] = writeblock_indiv(rep0, id, block);
   2.517 +        if (!reqs[0])
   2.518 +            return -1;
   2.519 +        rc = wait_recv(reqs, 1);
   2.520 +        return rc;
   2.521      }
   2.522  
   2.523 -    if (writeblock_indiv(rep0, BSID_REPLICA0(id), block) < 0)
   2.524 -        return -1;
   2.525 -    if (writeblock_indiv(rep1, BSID_REPLICA1(id), block) < 0)
   2.526 -        return -1;
   2.527 -    if (writeblock_indiv(rep2, BSID_REPLICA2(id), block) < 0)
   2.528 -        return -1;
   2.529 +    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
   2.530 +    if (!reqs[0])
   2.531 +        goto err;
   2.532 +    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
   2.533 +    if (!reqs[1])
   2.534 +        goto err;
   2.535 +    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
   2.536 +    if (!reqs[2])
   2.537 +        goto err;
   2.538 +
   2.539 +    rc = wait_recv(reqs, 3);
   2.540 +    if (rc < 0) {
   2.541 +        perror("writeblock recv");
   2.542 +        goto err;
   2.543 +    }
   2.544 +    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
   2.545 +        fprintf(stderr, "writeblock server0 error\n");
   2.546 +        goto err;
   2.547 +    }
   2.548 +    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
   2.549 +        fprintf(stderr, "writeblock server1 error\n");
   2.550 +        goto err;
   2.551 +    }
   2.552 +    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
   2.553 +        fprintf(stderr, "writeblock server2 error\n");
   2.554 +        goto err;
   2.555 +    }
   2.556 +
   2.557 +
   2.558 +    free((void *)reqs[0]);
   2.559 +    free((void *)reqs[1]);
   2.560 +    free((void *)reqs[2]);
   2.561      return 0;
   2.562 +
   2.563 +    err:
   2.564 +    if (reqs[0]) {
   2.565 +        dequeue(reqs[0]);
   2.566 +        free((void *)reqs[0]);
   2.567 +    }
   2.568 +    if (reqs[1]) {
   2.569 +        dequeue(reqs[1]);
   2.570 +        free((void *)reqs[1]);
   2.571 +    }
   2.572 +    if (reqs[2]) {
   2.573 +        dequeue(reqs[2]);
   2.574 +        free((void *)reqs[2]);
   2.575 +    }
   2.576 +    return -1;
   2.577  }
   2.578  
   2.579  /**
   2.580 @@ -332,7 +672,7 @@ u64 allocblock(void *block) {
   2.581      return allocblock_hint(block, 0);
   2.582  }
   2.583  
   2.584 -u64 allocblock_hint_indiv(int server, void *block, u64 hint) {
   2.585 +bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
   2.586      bsq_t *qe;
   2.587      int len;
   2.588  
   2.589 @@ -355,26 +695,11 @@ u64 allocblock_hint_indiv(int server, vo
   2.590          goto err;
   2.591      }
   2.592      
   2.593 -    len = recv_message(qe);
   2.594 -    if (len < 0) {
   2.595 -        perror("allocblock_hint recv");
   2.596 -        goto err;
   2.597 -    }
   2.598 -    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
   2.599 -        fprintf(stderr, "allocblock_hint server error\n");
   2.600 -        goto err;
   2.601 -    }
   2.602 -    if (len < MSGBUFSIZE_ID) {
   2.603 -        fprintf(stderr, "allocblock_hint recv short (%u)\n", len);
   2.604 -        goto err;
   2.605 -    }
   2.606 -
   2.607 -    free((void *)qe);
   2.608 -    return qe->message.id;
   2.609 +    return qe;
   2.610  
   2.611      err:
   2.612      free((void *)qe);
   2.613 -    return 0;
   2.614 +    return NULL;
   2.615  }
   2.616  
   2.617  /**
   2.618 @@ -386,22 +711,48 @@ u64 allocblock_hint_indiv(int server, vo
   2.619   */
   2.620  u64 allocblock_hint(void *block, u64 hint) {
   2.621      int map = (int)hint;
   2.622 -    
   2.623      int rep0 = bsclusters[map].servers[0];
   2.624      int rep1 = bsclusters[map].servers[1];
   2.625      int rep2 = bsclusters[map].servers[2];
   2.626 -
   2.627 +    bsq_t *reqs[3];
   2.628 +    int rc;
   2.629      u64 id0, id1, id2;
   2.630  
   2.631 -    id0 = allocblock_hint_indiv(rep0, block, 0);
   2.632 -    if (id0 == 0)
   2.633 -        return 0;
   2.634 -    id1 = allocblock_hint_indiv(rep1, block, 0);
   2.635 -    if (id1 == 0)
   2.636 -        return 0;
   2.637 -    id2 = allocblock_hint_indiv(rep2, block, 0);
   2.638 -    if (id2 == 0)
   2.639 -        return 0;
   2.640 +    reqs[0] = reqs[1] = reqs[2] = NULL;
   2.641 +
   2.642 +    DB("ENTER allocblock\n");
   2.643 +
   2.644 +    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
   2.645 +    if (!reqs[0])
   2.646 +        goto err;
   2.647 +    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
   2.648 +    if (!reqs[1])
   2.649 +        goto err;
   2.650 +    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
   2.651 +    if (!reqs[2])
   2.652 +        goto err;
   2.653 +
   2.654 +    rc = wait_recv(reqs, 3);
   2.655 +    if (rc < 0) {
   2.656 +        perror("allocblock recv");
   2.657 +        goto err;
   2.658 +    }
   2.659 +    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
   2.660 +        fprintf(stderr, "allocblock server0 error\n");
   2.661 +        goto err;
   2.662 +    }
   2.663 +    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
   2.664 +        fprintf(stderr, "allocblock server1 error\n");
   2.665 +        goto err;
   2.666 +    }
   2.667 +    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
   2.668 +        fprintf(stderr, "allocblock server2 error\n");
   2.669 +        goto err;
   2.670 +    }
   2.671 +
   2.672 +    id0 = reqs[0]->message.id;
   2.673 +    id1 = reqs[1]->message.id;
   2.674 +    id2 = reqs[2]->message.id;
   2.675  
   2.676  #ifdef BSDEBUG
   2.677      fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
   2.678 @@ -415,12 +766,31 @@ u64 allocblock_hint(void *block, u64 hin
   2.679              (unsigned int)((unsigned char *)block)[6],
   2.680              (unsigned int)((unsigned char *)block)[7]);
   2.681  #endif
   2.682 +    
   2.683 +    free((void *)reqs[0]);
   2.684 +    free((void *)reqs[1]);
   2.685 +    free((void *)reqs[2]);
   2.686 +    return BSID(map, id0, id1, id2);
   2.687  
   2.688 -    return BSID(map, id0, id1, id2);
   2.689 +    err:
   2.690 +    if (reqs[0]) {
   2.691 +        dequeue(reqs[0]);
   2.692 +        free((void *)reqs[0]);
   2.693 +    }
   2.694 +    if (reqs[1]) {
   2.695 +        dequeue(reqs[1]);
   2.696 +        free((void *)reqs[1]);
   2.697 +    }
   2.698 +    if (reqs[2]) {
   2.699 +        dequeue(reqs[2]);
   2.700 +        free((void *)reqs[2]);
   2.701 +    }
   2.702 +    return 0;
   2.703  }
   2.704  
   2.705  #else /* /BLOCKSTORE_REMOTE */
   2.706  
   2.707 +static int block_fp = -1;
   2.708   
   2.709  /**
   2.710   * readblock: read a block from disk
   2.711 @@ -431,36 +801,21 @@ u64 allocblock_hint(void *block, u64 hin
   2.712  
   2.713  void *readblock(u64 id) {
   2.714      void *block;
   2.715 -    int block_fp;
   2.716 -    
   2.717 -    block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
   2.718 -
   2.719 -    if (block_fp < 0) {
   2.720 -        perror("open");
   2.721 -        return NULL;
   2.722 -    }
   2.723 -    
   2.724      if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   2.725 -        printf ("%Ld ", id);
   2.726          printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
   2.727          perror("readblock lseek");
   2.728 -        goto err;
   2.729 +        return NULL;
   2.730      }
   2.731      if ((block = malloc(BLOCK_SIZE)) == NULL) {
   2.732          perror("readblock malloc");
   2.733 -        goto err;
   2.734 +        return NULL;
   2.735      }
   2.736      if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   2.737          perror("readblock read");
   2.738          free(block);
   2.739 -        goto err;
   2.740 +        return NULL;
   2.741      }
   2.742 -    close(block_fp);
   2.743      return block;
   2.744 -    
   2.745 -err:
   2.746 -    close(block_fp);
   2.747 -    return NULL;
   2.748  }
   2.749  
   2.750  /**
   2.751 @@ -471,30 +826,15 @@ err:
   2.752   *   @return: zero on success, -1 on failure
   2.753   */
   2.754  int writeblock(u64 id, void *block) {
   2.755 -    
   2.756 -    int block_fp;
   2.757 -    
   2.758 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
   2.759 -
   2.760 -    if (block_fp < 0) {
   2.761 -        perror("open");
   2.762 -        return -1;
   2.763 -    }
   2.764 -
   2.765      if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
   2.766          perror("writeblock lseek");
   2.767 -        goto err;
   2.768 +        return -1;
   2.769      }
   2.770      if (write(block_fp, block, BLOCK_SIZE) < 0) {
   2.771          perror("writeblock write");
   2.772 -        goto err;
   2.773 +        return -1;
   2.774      }
   2.775 -    close(block_fp);
   2.776      return 0;
   2.777 -
   2.778 -err:
   2.779 -    close(block_fp);
   2.780 -    return -1;
   2.781  }
   2.782  
   2.783  /**
   2.784 @@ -503,41 +843,30 @@ err:
   2.785   *
   2.786   *   @return: new id of block on disk
   2.787   */
   2.788 +static u64 lastblock = 0;
   2.789  
   2.790  u64 allocblock(void *block) {
   2.791      u64 lb;
   2.792 -    off64_t pos;
   2.793 -    int block_fp;
   2.794 -    
   2.795 -    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
   2.796 -
   2.797 -    if (block_fp < 0) {
   2.798 -        perror("open");
   2.799 -        return 0;
   2.800 -    }
   2.801 -
   2.802 -    pos = lseek64(block_fp, 0, SEEK_END);
   2.803 +    off64_t pos = lseek64(block_fp, 0, SEEK_END);
   2.804      if (pos == (off64_t)-1) {
   2.805          perror("allocblock lseek");
   2.806 -        goto err;
   2.807 +        return 0;
   2.808      }
   2.809      if (pos % BLOCK_SIZE != 0) {
   2.810          fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
   2.811 -        goto err;
   2.812 +        return 0;
   2.813      }
   2.814      if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
   2.815          perror("allocblock write");
   2.816 -        goto err;
   2.817 +        return 0;
   2.818      }
   2.819      lb = pos / BLOCK_SIZE + 1;
   2.820 -//printf("alloc(%Ld)\n", lb);
   2.821 -    close(block_fp);
   2.822 -    return lb;
   2.823      
   2.824 -err:
   2.825 -    close(block_fp);
   2.826 -    return 0;
   2.827 +    if (lb <= lastblock)
   2.828 +        printf("[*** %Ld alredy allocated! ***]\n", lb);
   2.829      
   2.830 +    lastblock = lb;
   2.831 +    return lb;
   2.832  }
   2.833  
   2.834  /**
   2.835 @@ -579,137 +908,21 @@ void freeblock(void *block) {
   2.836          free(block);
   2.837  }
   2.838  
   2.839 -static freeblock_t *new_freeblock(void)
   2.840 -{
   2.841 -    freeblock_t *fb;
   2.842 -    
   2.843 -    fb = newblock();
   2.844 -    
   2.845 -    if (fb == NULL) return NULL;
   2.846 -    
   2.847 -    fb->magic = FREEBLOCK_MAGIC;
   2.848 -    fb->next  = 0ULL;
   2.849 -    fb->count = 0ULL;
   2.850 -    memset(fb->list, 0, sizeof fb->list);
   2.851 -    
   2.852 -    return fb;
   2.853 -}
   2.854 -
   2.855 -void releaseblock(u64 id)
   2.856 -{
   2.857 -    blockstore_super_t *bs_super;
   2.858 -    freeblock_t *fl_current;
   2.859 -    
   2.860 -    /* get superblock */
   2.861 -    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
   2.862 -    
   2.863 -    /* get freeblock_current */
   2.864 -    if (bs_super->freelist_current == 0ULL) 
   2.865 -    {
   2.866 -        fl_current = new_freeblock();
   2.867 -        bs_super->freelist_current = allocblock(fl_current);
   2.868 -        writeblock(BLOCKSTORE_SUPER, bs_super);
   2.869 -    } else {
   2.870 -        fl_current = readblock(bs_super->freelist_current);
   2.871 -    }
   2.872 -    
   2.873 -    /* if full, chain to superblock and allocate new current */
   2.874 -    
   2.875 -    if (fl_current->count == FREEBLOCK_SIZE) {
   2.876 -        fl_current->next = bs_super->freelist_full;
   2.877 -        writeblock(bs_super->freelist_current, fl_current);
   2.878 -        bs_super->freelist_full = bs_super->freelist_current;
   2.879 -        freeblock(fl_current);
   2.880 -        fl_current = new_freeblock();
   2.881 -        bs_super->freelist_current = allocblock(fl_current);
   2.882 -        writeblock(BLOCKSTORE_SUPER, bs_super);
   2.883 -    }
   2.884 -    
   2.885 -    /* append id to current */
   2.886 -    fl_current->list[fl_current->count++] = id;
   2.887 -    writeblock(bs_super->freelist_current, fl_current);
   2.888 -    
   2.889 -    freeblock(fl_current);
   2.890 -    freeblock(bs_super);
   2.891 -    
   2.892 -    
   2.893 -}
   2.894 -
   2.895 -/* freelist debug functions: */
   2.896 -void freelist_count(int print_each)
   2.897 -{
   2.898 -    blockstore_super_t *bs_super;
   2.899 -    freeblock_t *fb;
   2.900 -    u64 total = 0, next;
   2.901 -    
   2.902 -    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
   2.903 -    
   2.904 -    if (bs_super->freelist_current == 0ULL) {
   2.905 -        printf("freelist is empty!\n");
   2.906 -        return;
   2.907 -    }
   2.908 -    
   2.909 -    fb = readblock(bs_super->freelist_current);
   2.910 -    printf("%Ld entires on current.\n", fb->count);
   2.911 -    total += fb->count;
   2.912 -    if (print_each == 1)
   2.913 -    {
   2.914 -        int i;
   2.915 -        for (i=0; i< fb->count; i++)
   2.916 -            printf("  %Ld\n", fb->list[i]);
   2.917 -    }
   2.918 -    
   2.919 -    freeblock(fb);
   2.920 -    
   2.921 -    if (bs_super->freelist_full == 0ULL) {
   2.922 -        printf("freelist_full is empty!\n");
   2.923 -        return;
   2.924 -    }
   2.925 -    
   2.926 -    next = bs_super->freelist_full;
   2.927 -    for (;;) {
   2.928 -        fb = readblock(next);
   2.929 -        total += fb->count;
   2.930 -        if (print_each == 1)
   2.931 -        {
   2.932 -            int i;
   2.933 -            for (i=0; i< fb->count; i++)
   2.934 -                printf("  %Ld\n", fb->list[i]);
   2.935 -        }
   2.936 -        next = fb->next;
   2.937 -        freeblock(fb);
   2.938 -        if (next == 0ULL) break;
   2.939 -    }
   2.940 -    printf("Total of %Ld ids on freelist.\n", total);
   2.941 -}
   2.942  
   2.943  int __init_blockstore(void)
   2.944  {
   2.945 -    int i;
   2.946 -    blockstore_super_t *bs_super;
   2.947 -    u64 ret;
   2.948 -    int block_fp;
   2.949 -    
   2.950  #ifdef BLOCKSTORE_REMOTE
   2.951      struct hostent *addr;
   2.952 -/* james's list
   2.953 +    int i;
   2.954 +
   2.955      bsservers[0].hostname = "firebug.cl.cam.ac.uk";
   2.956 -    bsservers[1].hostname = "tetris.cl.cam.ac.uk";
   2.957 -    bsservers[2].hostname = "donkeykong.cl.cam.ac.uk";
   2.958 -    bsservers[3].hostname = "gunfighter.cl.cam.ac.uk";
   2.959 -    bsservers[4].hostname = "galaxian.cl.cam.ac.uk";
   2.960 -    bsservers[5].hostname = "firetrack.cl.cam.ac.uk";
   2.961 -    bsservers[6].hostname = "funfair.cl.cam.ac.uk";
   2.962 -    bsservers[7].hostname = "felix.cl.cam.ac.uk";
   2.963 -*/
   2.964 -    bsservers[0].hostname = "arcadians.cl.cam.ac.uk";
   2.965 -    bsservers[1].hostname = "uridium.cl.cam.ac.uk";
   2.966 -    bsservers[2].hostname = "shep.cl.cam.ac.uk";
   2.967 -    bsservers[3].hostname = "centipede.cl.cam.ac.uk";
   2.968 -    bsservers[4].hostname = "ghouls.cl.cam.ac.uk";
   2.969 -    bsservers[5].hostname = "phoenix.cl.cam.ac.uk";
   2.970 -    bsservers[6].hostname = "swarm.cl.cam.ac.uk";
   2.971 -    bsservers[7].hostname = "freefall.cl.cam.ac.uk";
   2.972 +    bsservers[1].hostname = "planb.cl.cam.ac.uk";
   2.973 +    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
   2.974 +    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
   2.975 +    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
   2.976 +    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
   2.977 +    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
   2.978 +    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
   2.979      bsservers[8].hostname = NULL;
   2.980      bsservers[9].hostname = NULL;
   2.981      bsservers[10].hostname = NULL;
   2.982 @@ -783,30 +996,7 @@ int __init_blockstore(void)
   2.983      if (block_fp < 0) {
   2.984          perror("open");
   2.985          return -1;
   2.986 -        exit(-1);
   2.987      }
   2.988 -    
   2.989 -    if (lseek(block_fp, 0, SEEK_END) == 0) {
   2.990 -        bs_super = newblock();
   2.991 -        bs_super->magic            = BLOCKSTORE_MAGIC;
   2.992 -        bs_super->freelist_full    = 0LL;
   2.993 -        bs_super->freelist_current = 0LL;
   2.994 -        
   2.995 -        ret = allocblock(bs_super);
   2.996 -        
   2.997 -        freeblock(bs_super);
   2.998 -    } else {
   2.999 -        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
  2.1000 -        if (bs_super->magic != BLOCKSTORE_MAGIC)
  2.1001 -        {
  2.1002 -            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
  2.1003 -            exit(-1);
  2.1004 -        }
  2.1005 -        freeblock(bs_super);
  2.1006 -    }
  2.1007 -        
  2.1008 -    close(block_fp);
  2.1009 -        
  2.1010  #endif /*  BLOCKSTORE_REMOTE */   
  2.1011      return 0;
  2.1012  }
     3.1 --- a/tools/blktap/blockstore.h	Tue Mar 22 15:18:52 2005 +0000
     3.2 +++ b/tools/blktap/blockstore.h	Tue Mar 22 16:07:38 2005 +0000
     3.3 @@ -65,6 +65,7 @@ struct bshdr_t_struct {
     3.4      u32            operation;
     3.5      u32            flags;
     3.6      u64            id;
     3.7 +    u64            luid;
     3.8  } __attribute__ ((packed));
     3.9  typedef struct bshdr_t_struct bshdr_t;
    3.10  
    3.11 @@ -77,12 +78,13 @@ typedef struct bsmsg_t_struct bsmsg_t;
    3.12  
    3.13  #define MSGBUFSIZE_OP    sizeof(u32)
    3.14  #define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
    3.15 -#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64))
    3.16 +#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
    3.17  #define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
    3.18  
    3.19  #define BSOP_READBLOCK  0x01
    3.20  #define BSOP_WRITEBLOCK 0x02
    3.21  #define BSOP_ALLOCBLOCK 0x03
    3.22 +#define BSOP_FREEBLOCK  0x04
    3.23  
    3.24  #define BSOP_FLAG_ERROR 0x01
    3.25