]> xenbits.xensource.com Git - people/dstodden/blktap.git/commitdiff
PR-1053: llcache - local leaf caching drivers.
authorDaniel Stodden <daniel.stodden@citrix.com>
Tue, 15 Feb 2011 09:37:45 +0000 (01:37 -0800)
committerDaniel Stodden <daniel.stodden@citrix.com>
Tue, 15 Feb 2011 09:37:45 +0000 (01:37 -0800)
Add PR-1053 compliant local leaf caching support, in toplevel filter
drivers. Since data paths are very different (local or shared storage
writes, in the non-persistent vs. mirrored or shared storage write in
the persistent case), this adds two new driver types:

 - llp: Local Leaf, Persistent
 - lle: Local Leaf, Non-persistent ('ephemeral')

Both work by driving an aggregated vhd image, internally.

Signed-off-by: Daniel Stodden <daniel.stodden@citrix.com>
drivers/Makefile
drivers/block-llcache.c [new file with mode: 0644]
drivers/tapdisk-disktype.c
drivers/tapdisk-disktype.h

index 91a6ff2941285697f422408c81988a5f25d30d5e..8eae6d1f4254238ee19fa7f13a875f3f8a0388f6 100644 (file)
@@ -58,6 +58,7 @@ BLK-OBJS  += block-vhd.o
 BLK-OBJS  += block-valve.o
 BLK-OBJS  += block-vindex.o
 BLK-OBJS  += block-lcache.o
+BLK-OBJS  += block-llcache.o
 
 all: $(IBIN) lock-util
 
diff --git a/drivers/block-llcache.c b/drivers/block-llcache.c
new file mode 100644 (file)
index 0000000..9eed739
--- /dev/null
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+
+#include "tapdisk.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+
+/* Logging helpers; each one expands to a tlog_syslog() call. */
+#define DBG(_f, _a...)  tlog_syslog(TLOG_DBG, _f, ##_a)
+#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a)
+/* WARN appends the calling function and line number to the message. */
+#define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f " in %s:%d", \
+                                   ##_a, __func__, __LINE__)
+
+/* Fatal assertions: td_panic() when hit. */
+#define BUG()           td_panic()
+#define BUG_ON(_cond)   if (unlikely(_cond)) { td_panic(); }
+/*
+ * Non-fatal assertion: logs the text of the violated condition. The
+ * text is passed as a "%s" argument, never as the format itself, so a
+ * '%' inside the condition cannot be misread as a conversion.
+ */
+#define WARN_ON(_cond)  if (unlikely(_cond)) { WARN("%s", #_cond); }
+
+/*
+ * Decide whether a newly reported write status replaces the status
+ * recorded so far.
+ *
+ * @curr:  status recorded so far
+ * @error: status just reported
+ *
+ * Returns @error when it is non-zero and @curr is either 0 or
+ * -ENOSPC. In every other case the result is 0 -- not @curr. A
+ * caller that stores the result straight back into its accumulator
+ * therefore clears whatever it had recorded before.
+ */
+int ll_write_error(int curr, int error)
+{
+       int replaceable = (curr == 0 || curr == -ENOSPC);
+
+       return (error != 0 && replaceable) ? error : 0;
+}
+
+/*
+ * Log the switch-over from the local to the shared leaf image after
+ * a write failure.
+ *
+ * @type:   disk type of the calling driver (currently not logged)
+ * @error:  negated errno value that triggered the switch
+ * @local:  image being abandoned
+ * @shared: image taking over
+ *
+ * WARN() already prepends "WARNING: ", so the format does not repeat it.
+ */
+void ll_log_switch(int type, int error,
+                  td_image_t *local, td_image_t *shared)
+{
+       (void)type;
+
+       WARN("%s, on %s:%s. Switching to %s:%s.",
+            strerror(-error),
+            tapdisk_disk_types[local->type]->name, local->name,
+            tapdisk_disk_types[shared->type]->name, shared->name);
+}
+
+/*
+ * LLP: Local leaf persistent cache
+ *      -- Persistent write caching in local storage.
+ *
+ *    VBD
+ *      \
+ *       +--r/w--> llp+vhd:/local/leaf
+ *        \
+ *         +--r/w--> vhd:/shared/leaf
+ *          \
+ *           +--r/o--> vhd:/shared/parent
+ *
+ * We drive two 'leaf' (r/w) images: One LOCAL (i.e. on local storage,
+ * unreliable and prone to out-of-space failures), and one SHARED
+ * (i.e. in shared storage with plenty of physical backing).
+ *
+ * All images are on a linear read chain: LOCAL inherits from SHARED,
+ * which inherits from a shared master image. This filter driver
+ * aggregates LOCAL. SHARED is our immediate parent, forced into R/W
+ * mode.
+ *
+ * Unless LOCAL failed, reads are issued to LOCAL, to save shared
+ * storage bandwidth. In case of failure, SHARED provides continued
+ * VDI consistency.
+ *
+ */
+/* Operating modes of the llp driver, stored in td_llpcache_t.mode. */
+enum {
+       LLP_MIRROR = 1,
+       /*
+        * LLP_MIRROR:
+        *
+        * Writes are mirrored to both LOCAL and SHARED. Reads are
+        * issued to LOCAL.
+        *
+        * Failures to write LOCAL are recoverable. The driver will
+        * transition to LLP_SHARED.
+        *
+        * Failure to write SHARED is irrecoverable, and signaled to
+        * the original issuer.
+        */
+
+       LLP_SHARED = 2,
+       /*
+        * LLP_SHARED:
+        *
+        * Writes are issued to SHARED only. As are reads.
+        *
+        * Failure to write SHARED is irrecoverable.
+        */
+};
+
+typedef struct llpcache                 td_llpcache_t;
+typedef struct llpcache_request         td_llpcache_req_t;
+/* Number of entries in the per-driver request pool. */
+#define TD_LLPCACHE_MAX_REQ             (MAX_REQUESTS*2)
+
+/* One clone vreq of a mirrored write, tagged with its destination. */
+struct llpcache_vreq {
+       enum { LOCAL = 0, SHARED = 1 }  target;
+       td_vbd_request_t                vreq;
+};
+
+/*
+ * Bookkeeping for one original write while its clones are in flight.
+ *
+ *  treq:    the original request; completed once no clone is pending.
+ *  iov:     single iovec describing treq's buffer, shared by both clones.
+ *  error:   status accumulated from the clones.
+ *  lvr:     the clone vreqs, indexed by target (LOCAL, SHARED).
+ *  pending: bitmask, bit (1 << target) set while that clone is queued.
+ *  mode:    not referenced by the code shown in this file.
+ */
+struct llpcache_request {
+       td_request_t            treq;
+
+       struct td_iovec         iov;
+       int                     error;
+
+       struct llpcache_vreq    lvr[2];
+
+       unsigned int            pending;
+       int                     mode;
+};
+
+/*
+ * Per-driver state of the llp driver.
+ *
+ *  local:  the aggregated LOCAL image, opened in llpcache_open().
+ *  mode:   LLP_MIRROR or LLP_SHARED.
+ *  reqv:   backing storage of the request pool.
+ *  free:   stack of pointers to currently unused reqv entries.
+ *  n_free: number of valid entries in free[].
+ */
+struct llpcache {
+       td_image_t             *local;
+       int                     mode;
+
+       td_llpcache_req_t       reqv[TD_LLPCACHE_MAX_REQ];
+       td_llpcache_req_t      *free[TD_LLPCACHE_MAX_REQ];
+       int                     n_free;
+};
+
+/*
+ * Placeholder: currently does nothing. The local image is released
+ * by llpcache_close().
+ */
+static void
+llpcache_close_image(td_llpcache_t *s)
+{
+}
+
+/*
+ * Pop one request off the free stack.
+ *
+ * Returns the request, or NULL when the pool is exhausted. The
+ * request contents are not reset here.
+ */
+static td_llpcache_req_t *
+llpcache_alloc_request(td_llpcache_t *s)
+{
+       if (unlikely(!s->n_free))
+               return NULL;
+
+       s->n_free--;
+       return s->free[s->n_free];
+}
+
+/*
+ * Push a request back onto the free stack. Panics if the stack is
+ * already full.
+ */
+static void
+llpcache_free_request(td_llpcache_t *s, td_llpcache_req_t *req)
+{
+       int slot = s->n_free;
+
+       BUG_ON(slot >= TD_LLPCACHE_MAX_REQ);
+
+       s->free[slot] = req;
+       s->n_free     = slot + 1;
+}
+
+/*
+ * Completion callback for one clone vreq of a mirrored write.
+ *
+ * @vreq:  the clone, embedded in a struct llpcache_vreq
+ * @error: completion status of this clone
+ * @token: the owning td_llpcache_t
+ * @final: unused
+ *
+ * Clears the clone's bit in req->pending and folds its status into
+ * req->error. Once no clone is outstanding, the original treq is
+ * completed and the request returns to the pool.
+ */
+static void
+__llpcache_write_cb(td_vbd_request_t *vreq, int error,
+                  void *token, int final)
+{
+       td_llpcache_t *s = token;
+       struct llpcache_vreq *lvr;
+       td_llpcache_req_t *req;
+       unsigned int mask;
+       int err;
+
+       lvr = containerof(vreq, struct llpcache_vreq, vreq);
+       req = containerof(lvr, td_llpcache_req_t, lvr[lvr->target]);
+
+       mask = 1U << lvr->target;
+       BUG_ON(!(req->pending & mask));
+
+       if (lvr->target == LOCAL && error == -ENOSPC) {
+               /*
+                * Running out of local space is recoverable, since
+                * SHARED receives the same data. Log the transition
+                * once, not for every request still in flight.
+                */
+               if (s->mode != LLP_SHARED) {
+                       td_image_t *shared =
+                               containerof(req->treq.image->next.next,
+                                           td_image_t, next);
+                       ll_log_switch(DISK_TYPE_LLPCACHE, error,
+                                     s->local, shared);
+                       s->mode = LLP_SHARED;
+               }
+               error = 0;
+       }
+
+       req->pending &= ~mask;
+
+       /*
+        * ll_write_error() yields 0 when nothing should replace the
+        * recorded status; a 0 must not wipe out an earlier failure.
+        */
+       err = ll_write_error(req->error, error);
+       if (err)
+               req->error = err;
+
+       if (!req->pending) {
+               /* FIXME: Make sure this won't retry. */
+               td_complete_request(req->treq, req->error);
+               llpcache_free_request(s, req);
+       }
+}
+
+/*
+ * NB. Write mirroring. Lacking per-image queues, it's still a
+ * hack. But shall do for now:
+ *
+ *   1. Store the treq, thereby blocking the original vreq.
+ *   2. Reissue, as two clone vreqs. One local, one shared.
+ *   3. Clones seen again then get forwarded.
+ *   4. Treq completes after both vreqs.
+ *
+ * We can recognize clones by matching the vreq->token field.
+ */
+
+/*
+ * Set up the clone vreq for @target and queue it on the VBD of the
+ * original request.
+ *
+ * Returns 0 on success, with the target's bit set in req->pending.
+ * On failure returns the queueing error and records it in req->error
+ * unless an error is already recorded there.
+ */
+static int
+llpcache_requeue_treq(td_llpcache_t *s, td_llpcache_req_t *req, int target)
+{
+       struct llpcache_vreq *lvr;
+       td_vbd_request_t *vreq;
+       int err;
+
+       lvr           = &req->lvr[target];
+       lvr->target   = target;
+
+       vreq          = &lvr->vreq;
+       vreq->op      = TD_OP_WRITE;
+       vreq->sec     = req->treq.sec;
+       vreq->iov     = &req->iov;
+       vreq->iovcnt  = 1;
+       vreq->cb      = __llpcache_write_cb;
+       vreq->token   = s;
+
+       err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq);
+       if (err)
+               goto fail;
+
+       req->pending |= 1U << target;
+       return 0;
+
+fail:
+       req->error   = req->error ? : err;
+       return err;
+}
+
+/*
+ * Start a write for an original (non-clone) request: park the treq
+ * in a pool request and reissue it as clone vreqs.
+ *
+ * In LLP_MIRROR mode both LOCAL and SHARED get a clone. After the
+ * switch to LLP_SHARED only SHARED is written.
+ *
+ * Completes @treq with -EBUSY when the pool is exhausted. If queueing
+ * fails before any clone is outstanding, @treq is completed with the
+ * recorded error right away; otherwise completion is left to
+ * __llpcache_write_cb().
+ */
+static void
+llpcache_fork_write(td_llpcache_t *s, td_request_t treq)
+{
+       td_llpcache_req_t *req;
+       struct td_iovec *iov;
+       int err;
+
+       req = llpcache_alloc_request(s);
+       if (!req) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* Clear the whole request, not just pointer-size bytes of it. */
+       memset(req, 0, sizeof(*req));
+
+       req->treq     = treq;
+
+       iov           = &req->iov;
+       iov->base     = treq.buf;
+       iov->secs     = treq.secs;
+
+       if (s->mode == LLP_MIRROR) {
+               err = llpcache_requeue_treq(s, req, LOCAL);
+               if (err)
+                       goto fail;
+       }
+
+       err = llpcache_requeue_treq(s, req, SHARED);
+       if (err)
+               goto fail;
+
+       return;
+
+fail:
+       if (!req->pending) {
+               td_complete_request(treq, req->error);
+               llpcache_free_request(s, req);
+       }
+}
+
+/*
+ * Route a clone write to its destination: SHARED clones continue
+ * down the image chain, LOCAL clones go to the aggregated local
+ * image. Any other target is a bug.
+ */
+static void
+llpcache_forward_write(td_llpcache_t *s, td_request_t treq)
+{
+       struct llpcache_vreq *clone;
+
+       clone = containerof(treq.vreq, struct llpcache_vreq, vreq);
+
+       if (clone->target == SHARED)
+               td_forward_request(treq);
+       else if (clone->target == LOCAL)
+               td_queue_write(s->local, treq);
+       else
+               BUG();
+}
+
+/*
+ * Write entry point of the llp driver. Requests whose vreq token is
+ * this driver's state are clones and get forwarded; everything else
+ * is an original write and gets forked.
+ */
+static void
+llpcache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       td_llpcache_t *s = driver->data;
+       int is_clone = (treq.vreq->token == s);
+
+       if (!is_clone) {
+               llpcache_fork_write(s, treq);
+               return;
+       }
+
+       llpcache_forward_write(s, treq);
+}
+
+/*
+ * Read entry point of the llp driver.
+ *
+ * LLP_MIRROR: read from the aggregated local image.
+ * LLP_SHARED: pass the request on down the image chain.
+ * Any other mode is a bug.
+ */
+static void
+llpcache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       td_llpcache_t *s = driver->data;
+
+       switch (s->mode) {
+       case LLP_MIRROR:
+               td_queue_read(s->local, treq);
+               break;
+       case LLP_SHARED:
+               td_forward_request(treq);
+               /* must not fall through into BUG() */
+               break;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Close the aggregated local image, if one is open. Safe to call
+ * more than once. Always returns 0.
+ */
+static int
+llpcache_close(td_driver_t *driver)
+{
+       td_llpcache_t *s = driver->data;
+
+       if (!s->local)
+               return 0;
+
+       tapdisk_image_close(s->local);
+       s->local = NULL;
+
+       return 0;
+}
+
+/*
+ * Open the llp driver: start in LLP_MIRROR mode, fill the request
+ * pool and open @name as the local VHD image. The driver reports the
+ * local image's disk info as its own.
+ *
+ * Returns 0 on success, or the error from tapdisk_image_open().
+ *
+ * NOTE(review): s->n_free is not set here before the pool is filled,
+ * so this presumably relies on driver->data arriving zeroed -- confirm.
+ */
+static int
+llpcache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       td_llpcache_t *s = driver->data;
+       int slot, err;
+
+       s->mode = LLP_MIRROR;
+
+       slot = 0;
+       while (slot < TD_LLPCACHE_MAX_REQ) {
+               llpcache_free_request(s, &s->reqv[slot]);
+               slot++;
+       }
+
+       err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->local);
+       if (err) {
+               llpcache_close(driver);
+               return err;
+       }
+
+       driver->info = s->local->driver->info;
+       return 0;
+}
+
+/*
+ * Report the parent of the aggregated image, with TD_OPEN_RDONLY
+ * cleared from its flags so the parent is opened read/write.
+ *
+ * Used by both the llp and the lle ops tables. For lle, driver->data
+ * is a td_llecache_t; the access through s->local works there because
+ * both state structs start with their td_image_t pointer.
+ *
+ * Returns 0 on success, or the error from td_get_parent_id().
+ */
+static int
+llcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       td_llpcache_t *s = driver->data;
+       int err = td_get_parent_id(s->local, id);
+
+       if (err)
+               return err;
+
+       id->flags &= ~TD_OPEN_RDONLY;
+       return 0;
+}
+
+/*
+ * Parent validation hook shared by the llp and lle ops tables.
+ * Not implemented: always returns -ENOSYS and ignores its arguments.
+ */
+static int
+llcache_validate_parent(td_driver_t *driver,
+                       td_driver_t *pdriver, td_flag_t flags)
+{
+       return -ENOSYS;
+}
+
+
+/*
+ * Operations table of the llp (local leaf, persistent) driver.
+ * Parent handling uses the llcache_* hooks, which the lle table
+ * uses as well.
+ */
+struct tap_disk tapdisk_llpcache = {
+       .disk_type                  = "tapdisk_llpcache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(td_llpcache_t),
+       .td_open                    = llpcache_open,
+       .td_close                   = llpcache_close,
+       .td_queue_read              = llpcache_queue_read,
+       .td_queue_write             = llpcache_queue_write,
+       .td_get_parent_id           = llcache_get_parent_id,
+       .td_validate_parent         = llcache_validate_parent,
+};
+
+/*
+ * LLE: Local Leaf Ephemeral Cache
+ *      -- Non-persistent write caching in local storage.
+ *
+ *    VBD
+ *      \
+ *       +--r/w--> lle+vhd:/shared/leaf
+ *        \
+ *         +--r/w--> vhd:/local/leaf
+ *          \
+ *           +--r/o--> vhd:/shared/parent
+ *
+ * Note that LOCAL and SHARED chain order differs from LLP. Shared
+ * storage data masks local data.
+ *
+ * This means the VDI state held in shared storage is inconsistent
+ * on its own. Wherever LOCAL is unavailable, SHARED must be
+ * discarded too.
+ */
+/* Operating modes of the lle driver, stored in td_llecache_t.mode. */
+enum {
+       LLE_LOCAL = 1,
+       /*
+        * LLE_LOCAL:
+        *
+        * Writes are forwarded to LOCAL only. As are reads. This
+        * reduces network overhead.
+        *
+        * Failure to write LOCAL is recoverable. The driver will
+        * transition to LLE_SHARED.
+        *
+        * Failures to write SHARED are irrecoverable and signaled
+        * to the original issuer.
+        */
+
+       LLE_SHARED = 2,
+       /*
+        * LLE_SHARED:
+        *
+        * Writes are issued to SHARED. As are reads.
+        *
+        * Failure to write to SHARED is irrecoverable.
+        */
+};
+
+typedef struct llecache                 td_llecache_t;
+typedef struct llecache_request         td_llecache_req_t;
+/* Number of entries in the per-driver request pool. */
+#define TD_LLECACHE_MAX_REQ             (MAX_REQUESTS*2)
+
+/*
+ * Bookkeeping for one write forwarded to LOCAL.
+ *
+ *  s:       owning driver state.
+ *  treq:    the original request, kept so it can be completed or
+ *           reissued to SHARED.
+ *  pending: sectors of treq not yet reported by the write callback.
+ *  error:   status accumulated across callback invocations.
+ */
+struct llecache_request {
+       td_llecache_t          *s;
+       td_request_t            treq;
+       int                     pending;
+       int                     error;
+};
+
+/*
+ * Per-driver state of the lle driver.
+ *
+ *  shared: the aggregated SHARED image, opened in llecache_open().
+ *  mode:   LLE_LOCAL or LLE_SHARED.
+ *  reqv:   backing storage of the request pool.
+ *  free:   stack of pointers to currently unused reqv entries.
+ *  n_free: number of valid entries in free[].
+ */
+struct llecache {
+       td_image_t             *shared;
+       int                     mode;
+
+       td_llecache_req_t       reqv[TD_LLECACHE_MAX_REQ];
+       td_llecache_req_t      *free[TD_LLECACHE_MAX_REQ];
+       int                     n_free;
+};
+
+/*
+ * Pop one request off the free stack.
+ *
+ * Returns the request, or NULL when the pool is exhausted. The
+ * request contents are not reset here.
+ */
+static td_llecache_req_t *
+llecache_alloc_request(td_llecache_t *s)
+{
+       if (unlikely(!s->n_free))
+               return NULL;
+
+       s->n_free--;
+       return s->free[s->n_free];
+}
+
+/*
+ * Push a request back onto the free stack. Panics if the stack is
+ * already full.
+ */
+static void
+llecache_free_request(td_llecache_t *s, td_llecache_req_t *req)
+{
+       int slot = s->n_free;
+
+       BUG_ON(slot >= TD_LLECACHE_MAX_REQ);
+
+       s->free[slot] = req;
+       s->n_free     = slot + 1;
+}
+
+/*
+ * Close the aggregated shared image, if one is open. Safe to call
+ * more than once. Always returns 0.
+ */
+static int
+llecache_close(td_driver_t *driver)
+{
+       td_llecache_t *s = driver->data;
+
+       if (!s->shared)
+               return 0;
+
+       tapdisk_image_close(s->shared);
+       s->shared = NULL;
+
+       return 0;
+}
+
+/*
+ * Open the lle driver: start in LLE_LOCAL mode, fill the request
+ * pool and open @name as the shared VHD image. The driver reports
+ * the shared image's disk info as its own.
+ *
+ * Returns 0 on success, or the error from tapdisk_image_open().
+ *
+ * NOTE(review): s->n_free is not set here before the pool is filled,
+ * so this presumably relies on driver->data arriving zeroed -- confirm.
+ */
+static int
+llecache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       td_llecache_t *s = driver->data;
+       int slot, err;
+
+       s->mode = LLE_LOCAL;
+
+       slot = 0;
+       while (slot < TD_LLECACHE_MAX_REQ) {
+               llecache_free_request(s, &s->reqv[slot]);
+               slot++;
+       }
+
+       err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->shared);
+       if (err) {
+               llecache_close(driver);
+               return err;
+       }
+
+       driver->info = s->shared->driver->info;
+       return 0;
+}
+
+/*
+ * Completion callback for a write forwarded to LOCAL. It may be
+ * invoked several times per request; req->pending counts the sectors
+ * still outstanding.
+ *
+ * @treq:  the completed (part of the) clone request
+ * @error: its completion status
+ *
+ * Once every sector has been reported:
+ *  - if the accumulated status is -ENOSPC, the driver switches to
+ *    LLE_SHARED and the original write is reissued to SHARED;
+ *  - otherwise the original request completes with the accumulated
+ *    status.
+ */
+static void
+__llecache_write_cb(td_request_t treq, int error)
+{
+       td_llecache_req_t *req = treq.cb_data;
+       td_llecache_t *s = req->s;
+       int err;
+
+       BUG_ON(req->pending < treq.secs);
+
+       req->pending -= treq.secs;
+
+       /*
+        * ll_write_error() yields 0 when nothing should replace the
+        * recorded status; a 0 must not wipe out an earlier failure.
+        */
+       err = ll_write_error(req->error, error);
+       if (err)
+               req->error = err;
+
+       if (req->pending)
+               return;
+
+       if (req->error == -ENOSPC) {
+               /* Log the transition once, not per in-flight request. */
+               if (s->mode != LLE_SHARED) {
+                       ll_log_switch(DISK_TYPE_LLECACHE, req->error,
+                                     treq.image, s->shared);
+                       s->mode = LLE_SHARED;
+               }
+
+               td_queue_write(s->shared, req->treq);
+
+       } else
+               /* Report the accumulated status, not the last part's. */
+               td_complete_request(req->treq, req->error);
+
+       llecache_free_request(s, req);
+}
+
+/*
+ * Forward a write down the image chain (to LOCAL), hooking
+ * __llecache_write_cb() in as its completion callback so that an
+ * out-of-space failure can be redirected to SHARED.
+ *
+ * Completes @treq with -EBUSY when the request pool is exhausted.
+ */
+static void
+llecache_forward_write(td_llecache_t *s, td_request_t treq)
+{
+       td_llecache_req_t *req;
+       td_request_t clone;
+
+       req = llecache_alloc_request(s);
+       if (!req) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /* Clear the whole request, not just pointer-size bytes of it. */
+       memset(req, 0, sizeof(*req));
+
+       req->treq       = treq;
+       req->pending    = treq.secs;
+       req->s          = s;
+
+       clone           = treq;
+       clone.cb        = __llecache_write_cb;
+       clone.cb_data   = req;
+
+       td_forward_request(clone);
+}
+
+/*
+ * Write entry point of the lle driver.
+ *
+ * LLE_LOCAL:  forward down the chain, with ENOSPC fallback armed.
+ * LLE_SHARED: write to the aggregated shared image.
+ * Any other mode is a bug; without the default case the request
+ * would be dropped without ever completing.
+ */
+static void
+llecache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       td_llecache_t *s = driver->data;
+
+       switch (s->mode) {
+       case LLE_LOCAL:
+               llecache_forward_write(s, treq);
+               break;
+       case LLE_SHARED:
+               td_queue_write(s->shared, treq);
+               break;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Read entry point of the lle driver.
+ *
+ * LLE_LOCAL:  pass the request on down the image chain.
+ * LLE_SHARED: read from the aggregated shared image.
+ * Any other mode is a bug.
+ */
+static void
+llecache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       td_llecache_t *s = driver->data;
+       int mode = s->mode;
+
+       if (mode == LLE_LOCAL)
+               td_forward_request(treq);
+       else if (mode == LLE_SHARED)
+               td_queue_read(s->shared, treq);
+       else
+               BUG();
+}
+
+/*
+ * Operations table of the lle (local leaf, ephemeral) driver.
+ * Parent handling uses the llcache_* hooks, which the llp table
+ * uses as well.
+ */
+struct tap_disk tapdisk_llecache = {
+       .disk_type                  = "tapdisk_llecache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(td_llecache_t),
+       .td_open                    = llecache_open,
+       .td_close                   = llecache_close,
+       .td_queue_read              = llecache_queue_read,
+       .td_queue_write             = llecache_queue_write,
+       .td_get_parent_id           = llcache_get_parent_id,
+       .td_validate_parent         = llcache_validate_parent,
+};
index f6aafc86baf21b402207ce72dbed440b2004b996..b890a465ba6c63f4e00bccb8d77a20f13d05344b 100644 (file)
@@ -105,6 +105,18 @@ static const disk_info_t lcache_disk = {
        DISK_TYPE_FILTER,
 };
 
+/* Disk type entry for the persistent local leaf cache driver. */
+static const disk_info_t llpcache_disk = {
+       "llp",
+       "local leaf cache, persistent (llp)",
+       0,
+};
+
+/* Disk type entry for the ephemeral local leaf cache driver. */
+static const disk_info_t llecache_disk = {
+       "lle",
+       "local leaf cache, ephemeral (lle)",
+       0,
+};
+
+
 static const disk_info_t valve_disk = {
        "valve",
        "group rate limiting (valve)",
@@ -125,6 +137,8 @@ const disk_info_t *tapdisk_disk_types[] = {
        [DISK_TYPE_REMUS]       = &remus_disk,
        [DISK_TYPE_LCACHE]      = &lcache_disk,
        [DISK_TYPE_VALVE]       = &valve_disk,
+       [DISK_TYPE_LLPCACHE]    = &llpcache_disk,
+       [DISK_TYPE_LLECACHE]    = &llecache_disk,
        0,
 };
 
@@ -145,6 +159,8 @@ extern struct tap_disk tapdisk_vhd_index;
 extern struct tap_disk tapdisk_log;
 #endif
 extern struct tap_disk tapdisk_lcache;
+extern struct tap_disk tapdisk_llpcache;
+extern struct tap_disk tapdisk_llecache;
 extern struct tap_disk tapdisk_valve;
 
 const struct tap_disk *tapdisk_disk_drivers[] = {
@@ -165,6 +181,8 @@ const struct tap_disk *tapdisk_disk_drivers[] = {
        [DISK_TYPE_LOG]         = &tapdisk_log,
 #endif
        [DISK_TYPE_LCACHE]      = &tapdisk_lcache,
+       [DISK_TYPE_LLPCACHE]    = &tapdisk_llpcache,
+       [DISK_TYPE_LLECACHE]    = &tapdisk_llecache,
        [DISK_TYPE_VALVE]       = &tapdisk_valve,
        0,
 };
index 562d4a723939f960bf8f26af73727c163b69654f..f4be6ac62890c96d3a96cc9301238e26a6f550c6 100644 (file)
@@ -41,7 +41,9 @@
 #define DISK_TYPE_LOG         9
 #define DISK_TYPE_REMUS       10
 #define DISK_TYPE_LCACHE      11
-#define DISK_TYPE_VALVE       12
+#define DISK_TYPE_LLECACHE    12
+#define DISK_TYPE_LLPCACHE    13
+#define DISK_TYPE_VALVE       14
 
 #define DISK_TYPE_NAME_MAX    32