xenbits.xensource.com Git - qemu-xen-4.4-testing.git/commitdiff
Use ioemu block drivers through blktap; import other Xen-specific files.
author Ian Jackson <iwj@mariner.uk.xensource.com>
Fri, 28 Mar 2008 16:10:30 +0000 (16:10 +0000)
committer Ian Jackson <Ian.Jackson@eu.citrix.com>
Mon, 12 May 2008 11:20:10 +0000 (12:20 +0100)
Add support for a tap:ioemu pseudo driver. Devices using this driver
no longer go through tapdisk (which duplicates the block driver code),
but instead connect to the qemu-dm of the domain (an illustrative
sketch of the disk specification follows the file list below). No
working configuration should break, since the tapdisk drivers can
still be chosen.

Signed-off-by: Kevin Wolf <kwolf@suse.de>
Also, import and update various Xen-specific files from xen-unstable
tip 17307:b667e220e556.

block-vbd.c [new file with mode: 0644]
hw/xen_blktap.c [new file with mode: 0644]
hw/xen_blktap.h [new file with mode: 0644]
hw/xen_machine_fv.c
hw/xen_machine_pv.c
i386-dm/hooks.mak
xenstore.c

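For orientation only (not part of the patch): a minimal, stand-alone sketch of how a disk specification such as "tap:ioemu:<file>" decomposes into the driver type, the sub-type that selects this new qemu-dm path, and the image file. The image path used here is a made-up example.

/* Hypothetical sketch; the spec value is illustrative only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *spec   = "tap:ioemu:/srv/xen/disk.img"; /* e.g. from the domain config */
    const char *params = strchr(spec, ':') + 1;         /* "ioemu:/srv/xen/disk.img"   */
    const char *file   = strchr(params, ':') + 1;       /* "/srv/xen/disk.img"         */

    /* "ioemu" routes the device to qemu-dm; sub-types such as "aio" keep using tapdisk. */
    printf("sub-type: %.*s\n", (int)(file - params - 1), params);
    printf("image:    %s\n", file);
    return 0;
}
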
diff --git a/block-vbd.c b/block-vbd.c
new file mode 100644 (file)
index 0000000..53c6248
--- /dev/null
+++ b/block-vbd.c
@@ -0,0 +1,347 @@
+/*
+ * Block driver for Mini-os PV devices
+ * Based on block-raw.c
+ * 
+ * Copyright (c) 2006 Fabrice Bellard, 2007 Samuel Thibault
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "vl.h"
+#include "block_int.h"
+#include <assert.h>
+#include <xenbus.h>
+#include <blkfront.h>
+#include <malloc.h>
+
+#define SECTOR_SIZE 512
+
+#ifndef QEMU_TOOL
+#include "exec-all.h"
+#endif
+
+#define DEBUG_BLOCK
+#ifdef  DEBUG_BLOCK
+#define DEBUG_BLOCK_PRINT( formatCstr, args... ) do { fprintf( logfile, formatCstr, ##args ); fflush( logfile ); } while (0)
+#else
+#define DEBUG_BLOCK_PRINT( formatCstr, args... )
+#endif
+
+#define FTYPE_FILE   0
+#define FTYPE_CD     1
+#define FTYPE_FD     2
+
+typedef struct BDRVVbdState {
+    struct blkfront_dev *dev;
+    int fd;
+    int type;
+    int mode;
+    int info;
+    uint64_t sectors;
+    unsigned sector_size;
+    QEMU_LIST_ENTRY(BDRVVbdState) list;
+} BDRVVbdState;
+
+QEMU_LIST_HEAD(, BDRVVbdState) vbds;
+
+static int vbd_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    char *value;
+    if (xenbus_read(XBT_NIL, filename, &value))
+       return 0;
+    free(value);
+    return 100;
+}
+
+static void vbd_io_completed(void *opaque)
+{
+    BDRVVbdState *s = opaque;
+    blkfront_aio_poll(s->dev);
+}
+
+static int vbd_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVbdState *s = bs->opaque;
+
+    //handy to test posix access
+    //return -EIO;
+
+    s->dev = init_blkfront((char *) filename, &s->sectors, &s->sector_size, &s->mode, &s->info);
+
+    if (!s->dev)
+       return -EIO;
+
+    if (SECTOR_SIZE % s->sector_size) {
+       printf("sector size is %d, we only support sector sizes that divide %d\n", s->sector_size, SECTOR_SIZE);
+       return -EIO;
+    }
+
+    s->fd = blkfront_open(s->dev);
+    qemu_set_fd_handler(s->fd, vbd_io_completed, NULL, s);
+
+    QEMU_LIST_INSERT_HEAD(&vbds, s, list);
+
+    return 0;
+}
+
+typedef struct VbdAIOCB {
+    BlockDriverAIOCB common;
+    struct blkfront_aiocb aiocb;
+} VbdAIOCB;
+
+void qemu_aio_init(void)
+{
+}
+
+void qemu_aio_poll(void)
+{
+}
+
+/* Wait for all IO requests to complete.  */
+void qemu_aio_flush(void)
+{
+    BDRVVbdState *s;
+    for (s = vbds.lh_first; s; s = s->list.le_next)
+       blkfront_sync(s->dev);
+}
+
+void qemu_aio_wait_start(void)
+{
+}
+
+void qemu_aio_wait(void)
+{
+    int some = 0;
+    DEFINE_WAIT(w);
+    while (1) {
+       BDRVVbdState *s;
+       add_waiter(w, blkfront_queue);
+        for (s = vbds.lh_first; s; s = s->list.le_next)
+           if (blkfront_aio_poll(s->dev))
+               some = 1;
+       if (some)
+           break;
+       schedule();
+    }
+    remove_waiter(w);
+}
+
+void qemu_aio_wait_end(void)
+{
+}
+
+static void vbd_aio_callback(struct blkfront_aiocb *aiocbp, int ret) {
+    VbdAIOCB *acb = aiocbp->data;
+
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+static VbdAIOCB *vbd_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, uint8_t *buf, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVVbdState *s = bs->opaque;
+    VbdAIOCB *acb;
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (!acb)
+       return NULL;
+    acb->aiocb.aio_dev = s->dev;
+    acb->aiocb.aio_buf = buf;
+    acb->aiocb.aio_nbytes = nb_sectors * SECTOR_SIZE;
+    acb->aiocb.aio_offset = sector_num * SECTOR_SIZE;
+    acb->aiocb.aio_cb = vbd_aio_callback;
+    acb->aiocb.data = acb;
+
+    return acb;
+}
+
+static BlockDriverAIOCB *vbd_aio_read(BlockDriverState *bs,
+        int64_t sector_num, uint8_t *buf, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    VbdAIOCB *acb;
+
+    acb = vbd_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
+    if (!acb)
+       return NULL;
+    blkfront_aio(&acb->aiocb, 0);
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *vbd_aio_write(BlockDriverState *bs,
+        int64_t sector_num, const uint8_t *buf, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    VbdAIOCB *acb;
+
+    acb = vbd_aio_setup(bs, sector_num, (uint8_t*) buf, nb_sectors, cb, opaque);
+    if (!acb)
+       return NULL;
+    blkfront_aio(&acb->aiocb, 1);
+    return &acb->common;
+}
+
+static void vbd_cb(void *data, int ret) {
+    int *result = data;
+    result[0] = 1;
+    result[1] = ret;
+}
+
+static int vbd_aligned_io(BlockDriverState *bs,
+       int64_t sector_num, uint8_t *buf, int nb_sectors, int write)
+{
+    VbdAIOCB *acb;
+    int result[2];
+    result[0] = 0;
+    qemu_aio_wait_start();
+    acb = vbd_aio_setup(bs, sector_num, (uint8_t*) buf, nb_sectors, vbd_cb, &result);
+    blkfront_aio(&acb->aiocb, write);
+    while (!result[0])
+       qemu_aio_wait();
+    qemu_aio_wait_end();
+    return result[1];
+}
+
+static int vbd_read(BlockDriverState *bs,
+       int64_t sector_num, uint8_t *buf, int nb_sectors)
+{
+    uint8_t *iobuf;
+    int ret;
+    /* page alignment would be a bit better, but that's still fine compared to
+     * copying */
+    if (!((uintptr_t)buf & (SECTOR_SIZE-1)))
+       return vbd_aligned_io(bs, sector_num, buf, nb_sectors, 0);
+    iobuf = qemu_memalign(PAGE_SIZE, nb_sectors * SECTOR_SIZE);
+    ret = vbd_aligned_io(bs, sector_num, iobuf, nb_sectors, 0);
+    memcpy(buf, iobuf, nb_sectors * SECTOR_SIZE);
+    free(iobuf);
+    if (ret < 0)
+       return ret;
+    else if (ret != nb_sectors * SECTOR_SIZE)
+       return -EINVAL;
+    else
+       return 0;
+}
+
+static int vbd_write(BlockDriverState *bs,
+       int64_t sector_num, const uint8_t *buf, int nb_sectors)
+{
+    uint8_t *iobuf;
+    int ret;
+    if (!((uintptr_t)buf & (SECTOR_SIZE-1)))
+       return vbd_aligned_io(bs, sector_num, (uint8_t*) buf, nb_sectors, 1);
+    iobuf = qemu_memalign(PAGE_SIZE, nb_sectors * SECTOR_SIZE);
+    memcpy(iobuf, buf, nb_sectors * SECTOR_SIZE);
+    ret = vbd_aligned_io(bs, sector_num, iobuf, nb_sectors, 1);
+    free(iobuf);
+    if (ret < 0)
+       return ret;
+    else if (ret != nb_sectors * SECTOR_SIZE)
+       return -EINVAL;
+    else
+       return 0;
+}
+
+static void vbd_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    /* TODO */
+    //VbdAIOCB *acb = (VbdAIOCB *)blockacb;
+
+    // Try to cancel. If we can't, wait for the request to complete, drop the callback and call qemu_aio_release(acb)
+}
+
+static void vbd_close(BlockDriverState *bs)
+{
+    BDRVVbdState *s = bs->opaque;
+    bs->total_sectors = 0;
+    if (s->fd >= 0) {
+        qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+        close(s->fd);
+        s->fd = -1;
+    }
+    QEMU_LIST_REMOVE(s, list);
+}
+
+static int64_t  vbd_getlength(BlockDriverState *bs)
+{
+    BDRVVbdState *s = bs->opaque;
+    return s->sectors * s->sector_size;
+}
+
+static void vbd_flush(BlockDriverState *bs)
+{
+    BDRVVbdState *s = bs->opaque;
+    blkfront_sync(s->dev);
+}
+
+/***********************************************/
+/* host device */
+
+static int vbd_is_inserted(BlockDriverState *bs)
+{
+    /* TODO: monitor the backend */
+    return 1;
+}
+
+/* currently only used by fdc.c, but a CD version would be good too */
+static int vbd_media_changed(BlockDriverState *bs)
+{
+    /* TODO: monitor the backend */
+    return -ENOTSUP;
+}
+
+static int vbd_eject(BlockDriverState *bs, int eject_flag)
+{
+    /* TODO: Xen support needed */
+    return -ENOTSUP;
+}
+
+static int vbd_set_locked(BlockDriverState *bs, int locked)
+{
+    /* TODO: Xen support needed */
+    return -ENOTSUP;
+}
+
+BlockDriver bdrv_vbd = {
+    "vbd",
+    sizeof(BDRVVbdState),
+    vbd_probe,
+    vbd_open,
+    NULL,
+    NULL,
+    vbd_close,
+    NULL,
+    vbd_flush,
+    
+    .bdrv_aio_read = vbd_aio_read,
+    .bdrv_aio_write = vbd_aio_write,
+    .bdrv_aio_cancel = vbd_aio_cancel,
+    .aiocb_size = sizeof(VbdAIOCB),
+    .bdrv_read = vbd_read,
+    .bdrv_write = vbd_write,
+    .bdrv_getlength = vbd_getlength,
+
+    /* removable device support */
+    .bdrv_is_inserted = vbd_is_inserted,
+    .bdrv_media_changed = vbd_media_changed,
+    .bdrv_eject = vbd_eject,
+    .bdrv_set_locked = vbd_set_locked,
+};
+
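As a usage illustration (not part of the patch): a minimal sketch of how a caller inside the stubdom qemu might open a PV disk through the vbd driver above, using the same bdrv_open() call that open_disk() in hw/xen_blktap.c uses. The device name "xvda" and the xenstore node "device/vbd/768" are assumptions for illustration, and the sketch presumes bdrv_vbd has been registered with the block layer at startup.

/* Hypothetical sketch -- not part of the patch. */
#include <stdio.h>
#include <inttypes.h>
#include "vl.h"

static BlockDriverState *open_pv_disk(void)
{
    /* "xvda" and the xenstore node name below are illustrative assumptions. */
    BlockDriverState *bs = bdrv_new("xvda");

    /* vbd_probe() accepts any readable xenstore node, so format
     * auto-detection should settle on bdrv_vbd here. */
    if (bdrv_open(bs, "device/vbd/768", 0) < 0) {
        fprintf(stderr, "could not open PV disk\n");
        return NULL;
    }

    /* Reported length comes from vbd_getlength(): sectors * sector_size. */
    fprintf(stderr, "disk size: %" PRId64 " bytes\n", bdrv_getlength(bs));
    return bs;
}
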
diff --git a/hw/xen_blktap.c b/hw/xen_blktap.c
new file mode 100644 (file)
index 0000000..5420da2
--- /dev/null
+++ b/hw/xen_blktap.c
@@ -0,0 +1,686 @@
+/* xen_blktap.c
+ *
+ * Interface to blktapctrl to allow use of qemu block drivers with blktap.
+ * This file is based on tools/blktap/drivers/tapdisk.c
+ * 
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ * Copyright (c) 2008 Kevin Wolf
+ */
+
+/*
+ * There are several communication channels which are used by this interface:
+ *
+ *   - A pair of pipes for receiving and sending general control messages
+ *     (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID).
+ *     These control messages are handled by handle_blktap_ctrlmsg().
+ *
+ *   - One file descriptor per attached disk (/dev/xen/blktapN) for disk
+ *     specific control messages. A callback is triggered on this fd if there
+ *     is a new IO request. The callback function is handle_blktap_iomsg().
+ *
+ *   - A shared ring for each attached disk containing the actual IO requests 
+ *     and responses. Whenever handle_blktap_iomsg() is triggered it processes
+ *     the requests on this ring.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "vl.h"
+#include "blktaplib.h"
+#include "xen_blktap.h"
+#include "block_int.h"
+
+#define MSG_SIZE 4096
+
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+/* If enabled, print debug messages to stderr */
+#if 1
+#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#if 1
+#define ASSERT(_p) \
+    do { if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s\n", #_p , \
+        __LINE__, __FILE__); *(int*)0 = 0; } } while (0)
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+
+extern int domid;
+
+int read_fd;
+int write_fd;
+
+static pid_t process;
+fd_list_entry_t *fd_start = NULL;
+
+static void handle_blktap_iomsg(void* private);
+
+struct aiocb_info {
+       struct td_state *s;
+       uint64_t sector;
+       int nr_secs;
+       int idx;
+       long i;
+};
+
+static void unmap_disk(struct td_state *s)
+{
+       tapdev_info_t *info = s->ring_info;
+       fd_list_entry_t *entry;
+       
+       bdrv_close(s->bs);
+
+       if (info != NULL && info->mem > 0)
+               munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
+
+       entry = s->fd_entry;
+       *entry->pprev = entry->next;
+       if (entry->next)
+               entry->next->pprev = entry->pprev;
+
+       qemu_set_fd_handler2(info->fd, NULL, NULL, NULL, NULL);
+       close(info->fd);
+
+       free(s->fd_entry);
+       free(s->blkif);
+       free(s->ring_info);
+       free(s);
+
+       return;
+}
+
+static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
+{
+       fd_list_entry_t **pprev, *entry;
+
+       DPRINTF("Adding fd_list_entry\n");
+
+       /*Add to linked list*/
+       s->fd_entry   = entry = malloc(sizeof(fd_list_entry_t));
+       entry->tap_fd = tap_fd;
+       entry->s      = s;
+       entry->next   = NULL;
+
+       pprev = &fd_start;
+       while (*pprev != NULL)
+               pprev = &(*pprev)->next;
+
+       *pprev = entry;
+       entry->pprev = pprev;
+
+       return entry;
+}
+
+static inline struct td_state *get_state(int cookie)
+{
+       fd_list_entry_t *ptr;
+
+       ptr = fd_start;
+       while (ptr != NULL) {
+               if (ptr->cookie == cookie) return ptr->s;
+               ptr = ptr->next;
+       }
+       return NULL;
+}
+
+static struct td_state *state_init(void)
+{
+       int i;
+       struct td_state *s;
+       blkif_t *blkif;
+
+       s = malloc(sizeof(struct td_state));
+       blkif = s->blkif = malloc(sizeof(blkif_t));
+       s->ring_info = calloc(1, sizeof(tapdev_info_t));
+
+       for (i = 0; i < MAX_REQUESTS; i++) {
+               blkif->pending_list[i].secs_pending = 0;
+               blkif->pending_list[i].submitting = 0;
+       }
+
+       return s;
+}
+
+static int map_new_dev(struct td_state *s, int minor)
+{
+       int tap_fd;
+       tapdev_info_t *info = s->ring_info;
+       char *devname;
+       fd_list_entry_t *ptr;
+       int page_size;
+
+       if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1)
+               return -1;
+       tap_fd = open(devname, O_RDWR);
+       if (tap_fd == -1) 
+       {
+               DPRINTF("open failed on dev %s!\n",devname);
+               goto fail;
+       } 
+       info->fd = tap_fd;
+
+       /*Map the shared memory*/
+       page_size = getpagesize();
+       info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, 
+                         PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
+       if ((long int)info->mem == -1) 
+       {
+               DPRINTF("mmap failed on dev %s!\n",devname);
+               goto fail;
+       }
+
+       /* assign the rings to the mapped memory */ 
+       info->sring = (blkif_sring_t *)((unsigned long)info->mem);
+       BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
+       
+       info->vstart = 
+               (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
+
+       ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
+       ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+       free(devname);
+
+       /*Update the fd entry*/
+       ptr = fd_start;
+       while (ptr != NULL) {
+               if (s == ptr->s) {
+                       ptr->tap_fd = tap_fd;
+
+                       /* Setup fd_handler for qemu main loop */
+                       DPRINTF("set tap_fd = %d\n", tap_fd);
+                       qemu_set_fd_handler2(tap_fd, NULL, &handle_blktap_iomsg, NULL, s);
+
+                       break;
+               }
+               ptr = ptr->next;
+       }       
+
+
+       DPRINTF("map_new_dev = %d\n", minor);
+       return minor;
+
+ fail:
+       free(devname);
+       return -1;
+}
+
+static int open_disk(struct td_state *s, char *path, int readonly)
+{
+       struct disk_id id;
+       BlockDriverState* bs;
+
+       DPRINTF("Opening %s\n", path);
+       bs = calloc(1, sizeof(*bs));
+
+       memset(&id, 0, sizeof(struct disk_id));
+
+       if (bdrv_open(bs, path, 0) != 0) {
+               fprintf(stderr, "Could not open image file %s\n", path);
+               return -ENOMEM;
+       }
+
+       s->bs = bs;
+       s->flags = readonly ? TD_RDONLY : 0;
+       s->size = bs->total_sectors;
+       s->sector_size = 512;
+
+       s->info = ((s->flags & TD_RDONLY) ? VDISK_READONLY : 0);
+
+       return 0;
+}
+
+static inline void write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
+{
+       tapdev_info_t *info = s->ring_info;
+       blkif_response_t *rsp_d;
+       
+       rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
+       memcpy(rsp_d, rsp, sizeof(blkif_response_t));
+       info->fe_ring.rsp_prod_pvt++;
+}
+
+static inline void kick_responses(struct td_state *s)
+{
+       tapdev_info_t *info = s->ring_info;
+
+       if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) 
+       {
+               RING_PUSH_RESPONSES(&info->fe_ring);
+               ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
+       }
+}
+
+static int send_responses(struct td_state *s, int res, 
+                  uint64_t sector, int nr_secs, int idx, void *private)
+{
+       pending_req_t   *preq;
+       blkif_request_t *req;
+       int responses_queued = 0;
+       blkif_t *blkif = s->blkif;
+       int secs_done = nr_secs;
+
+       if ( (idx > MAX_REQUESTS-1) )
+       {
+               DPRINTF("invalid index returned(%u)!\n", idx);
+               return 0;
+       }
+       preq = &blkif->pending_list[idx];
+       req  = &preq->req;
+
+       preq->secs_pending -= secs_done;
+
+       if (res == -EBUSY && preq->submitting) 
+               return -EBUSY;  /* propagate -EBUSY back to higher layers */
+       if (res) 
+               preq->status = BLKIF_RSP_ERROR;
+       
+       if (!preq->submitting && preq->secs_pending == 0) 
+       {
+               blkif_request_t tmp;
+               blkif_response_t *rsp;
+
+               tmp = preq->req;
+               rsp = (blkif_response_t *)req;
+               
+               rsp->id = tmp.id;
+               rsp->operation = tmp.operation;
+               rsp->status = preq->status;
+               
+               write_rsp_to_ring(s, rsp);
+               responses_queued++;
+
+               kick_responses(s);
+       }
+       
+       return responses_queued;
+}
+
+static void qemu_send_responses(void* opaque, int ret)
+{
+       struct aiocb_info* info = opaque;
+
+       if (ret != 0) {
+               DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret));
+       }
+
+       send_responses(info->s, ret, info->sector, info->nr_secs, 
+               info->idx, (void*) info->i);
+       free(info);
+}
+
+/**
+ * Callback function for the IO message pipe. Reads requests from the ring
+ * and processes them (call qemu read/write functions).
+ *
+ * The private parameter points to the struct td_state representing the
+ * disk the request is targeted at.
+ */
+static void handle_blktap_iomsg(void* private)
+{
+       struct td_state* s = private;
+
+       RING_IDX          rp, j, i;
+       blkif_request_t  *req;
+       int idx, nsects, ret;
+       uint64_t sector_nr;
+       uint8_t *page;
+       blkif_t *blkif = s->blkif;
+       tapdev_info_t *info = s->ring_info;
+       int page_size = getpagesize();
+
+       struct aiocb_info *aiocb_info;
+
+       if (info->fe_ring.sring == NULL) {
+               DPRINTF("  sring == NULL, ignoring IO request\n");
+               return;
+       }
+
+       rp = info->fe_ring.sring->req_prod; 
+       xen_rmb();
+
+       for (j = info->fe_ring.req_cons; j != rp; j++)
+       {
+               int start_seg = 0; 
+
+               req = NULL;
+               req = RING_GET_REQUEST(&info->fe_ring, j);
+               ++info->fe_ring.req_cons;
+               
+               if (req == NULL)
+                       continue;
+
+               idx = req->id;
+
+               ASSERT(blkif->pending_list[idx].secs_pending == 0);
+               memcpy(&blkif->pending_list[idx].req, req, sizeof(*req));
+               blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
+               blkif->pending_list[idx].submitting = 1;
+               sector_nr = req->sector_number;
+
+               /* Don't allow writes on readonly devices */
+               if ((s->flags & TD_RDONLY) && 
+                   (req->operation == BLKIF_OP_WRITE)) {
+                       blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+                       goto send_response;
+               }
+
+               for (i = start_seg; i < req->nr_segments; i++) {
+                       nsects = req->seg[i].last_sect - 
+                                req->seg[i].first_sect + 1;
+       
+                       if ((req->seg[i].last_sect >= page_size >> 9) ||
+                                       (nsects <= 0))
+                               continue;
+
+                       page  = (uint8_t*) MMAP_VADDR(info->vstart, 
+                                                  (unsigned long)req->id, i);
+                       page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+                       if (sector_nr >= s->size) {
+                               DPRINTF("Sector request failed:\n");
+                               DPRINTF("%s request, idx [%d,%d] size [%llu], "
+                                       "sector [%llu,%llu]\n",
+                                       (req->operation == BLKIF_OP_WRITE ? 
+                                        "WRITE" : "READ"),
+                                       idx,i,
+                                       (long long unsigned) 
+                                               nsects<<SECTOR_SHIFT,
+                                       (long long unsigned) 
+                                               sector_nr<<SECTOR_SHIFT,
+                                       (long long unsigned) sector_nr);
+                               continue;
+                       }
+
+                       blkif->pending_list[idx].secs_pending += nsects;
+
+                       switch (req->operation) 
+                       {
+                       case BLKIF_OP_WRITE:
+                               aiocb_info = malloc(sizeof(*aiocb_info));
+
+                               aiocb_info->s = s;
+                               aiocb_info->sector = sector_nr;
+                               aiocb_info->nr_secs = nsects;
+                               aiocb_info->idx = idx;
+                               aiocb_info->i = i;
+
+                               ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
+                                                         page, nsects,
+                                                         qemu_send_responses,
+                                                         aiocb_info));
+
+                               if (ret) {
+                                       blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+                                       DPRINTF("ERROR: bdrv_write() == NULL\n");
+                                       goto send_response;
+                               }
+                               break;
+
+                       case BLKIF_OP_READ:
+                               aiocb_info = malloc(sizeof(*aiocb_info));
+
+                               aiocb_info->s = s;
+                               aiocb_info->sector = sector_nr;
+                               aiocb_info->nr_secs = nsects;
+                               aiocb_info->idx = idx;
+                               aiocb_info->i = i;
+
+                               ret = (NULL == bdrv_aio_read(s->bs, sector_nr,
+                                                        page, nsects,
+                                                        qemu_send_responses,
+                                                        aiocb_info));
+
+                               if (ret) {
+                                       blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+                                       DPRINTF("ERROR: bdrv_read() == NULL\n");
+                                       goto send_response;
+                               }
+                               break;
+
+                       default:
+                               DPRINTF("Unknown block operation\n");
+                               break;
+                       }
+                       sector_nr += nsects;
+               }
+       send_response:
+               blkif->pending_list[idx].submitting = 0;
+
+               /* force write_rsp_to_ring for synchronous case */
+               if (blkif->pending_list[idx].secs_pending == 0)
+                       send_responses(s, 0, 0, 0, idx, (void *)(long)0);
+       }
+}
+
+/**
+ * Callback function for the qemu-read pipe. Reads and processes control
+ * messages from the pipe.
+ *
+ * The parameter private is unused.
+ */
+static void handle_blktap_ctrlmsg(void* private)
+{
+       int length, len, msglen;
+       char *ptr, *path;
+       image_t *img;
+       msg_hdr_t *msg;
+       msg_newdev_t *msg_dev;
+       msg_pid_t *msg_pid;
+       int ret = -1;
+       struct td_state *s = NULL;
+       fd_list_entry_t *entry;
+
+       char buf[MSG_SIZE];
+
+       length = read(read_fd, buf, MSG_SIZE);
+
+       if (length > 0 && length >= sizeof(msg_hdr_t)) 
+       {
+               msg = (msg_hdr_t *)buf;
+               DPRINTF("blktap: Received msg, len %d, type %d, UID %d\n",
+                       length,msg->type,msg->cookie);
+
+               switch (msg->type) {
+               case CTLMSG_PARAMS:                     
+                       ptr = buf + sizeof(msg_hdr_t);
+                       len = (length - sizeof(msg_hdr_t));
+                       path = calloc(1, len + 1);
+                       
+                       memcpy(path, ptr, len); 
+                       DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
+
+                       /* Allocate the disk structs */
+                       s = state_init();
+
+                       /*Open file*/
+                       if (s == NULL || open_disk(s, path, msg->readonly)) {
+                               msglen = sizeof(msg_hdr_t);
+                               msg->type = CTLMSG_IMG_FAIL;
+                               msg->len = msglen;
+                       } else {
+                               entry = add_fd_entry(0, s);
+                               entry->cookie = msg->cookie;
+                               DPRINTF("Entered cookie %d\n", entry->cookie);
+                               
+                               memset(buf, 0x00, MSG_SIZE); 
+                       
+                               msglen = sizeof(msg_hdr_t) + sizeof(image_t);
+                               msg->type = CTLMSG_IMG;
+                               img = (image_t *)(buf + sizeof(msg_hdr_t));
+                               img->size = s->size;
+                               img->secsize = s->sector_size;
+                               img->info = s->info;
+                               DPRINTF("Writing (size, secsize, info) = "
+                                       "(%#" PRIx64 ", %#" PRIx64 ", %d)\n",
+                                       s->size, s->sector_size, s->info);
+                       }
+                       len = write(write_fd, buf, msglen);
+                       free(path);
+                       break;
+                       
+               case CTLMSG_NEWDEV:
+                       msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+
+                       s = get_state(msg->cookie);
+                       DPRINTF("Retrieving state, cookie %d.....[%s]\n",
+                               msg->cookie, (s == NULL ? "FAIL":"OK"));
+                       if (s != NULL) {
+                               ret = ((map_new_dev(s, msg_dev->devnum) 
+                                       == msg_dev->devnum ? 0: -1));
+                       }       
+
+                       memset(buf, 0x00, MSG_SIZE); 
+                       msglen = sizeof(msg_hdr_t);
+                       msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
+                                             : CTLMSG_NEWDEV_FAIL);
+                       msg->len = msglen;
+
+                       len = write(write_fd, buf, msglen);
+                       break;
+
+               case CTLMSG_CLOSE:
+                       s = get_state(msg->cookie);
+                       if (s) unmap_disk(s);
+                       break;                  
+
+               case CTLMSG_PID:
+                       memset(buf, 0x00, MSG_SIZE);
+                       msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
+                       msg->type = CTLMSG_PID_RSP;
+                       msg->len = msglen;
+
+                       msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
+                       process = getpid();
+                       msg_pid->pid = process;
+
+                       len = write(write_fd, buf, msglen);
+                       break;
+
+               default:
+                       break;
+               }
+       }
+}
+
+/**
+ * Opens a control socket, i.e. a pipe to communicate with blktapctrl.
+ *
+ * Returns the file descriptor for the pipe, or -1 on error.
+ */
+static int open_ctrl_socket(char *devname)
+{
+       int ret;
+       int ipc_fd;
+
+       if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
+               DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
+
+       ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
+       if ( (ret != 0) && (errno != EEXIST) ) {
+               DPRINTF("ERROR: pipe failed (%d)\n", errno);
+               return -1;
+       }
+
+       ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
+
+       if (ipc_fd < 0) {
+               DPRINTF("FD open failed\n");
+               return -1;
+       }
+
+       return ipc_fd;
+}
+
+/**
+ * Unmaps all disks and closes their pipes
+ */
+void shutdown_blktap(void)
+{
+       fd_list_entry_t *ptr;
+       struct td_state *s;
+       char *devname;
+
+       DPRINTF("Shutdown blktap\n");
+
+       /* Unmap all disks */
+       ptr = fd_start;
+       while (ptr != NULL) {
+               s = ptr->s;
+               unmap_disk(s);
+               close(ptr->tap_fd);
+               ptr = ptr->next;
+       }
+
+       /* Delete control pipes */
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {
+               DPRINTF("Delete %s\n", devname);
+               if (unlink(devname))
+                       DPRINTF("Could not delete: %s\n", strerror(errno));
+               free(devname);
+       }
+       
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) { 
+               DPRINTF("Delete %s\n", devname);
+               if (unlink(devname))
+                       DPRINTF("Could not delete: %s\n", strerror(errno));
+               free(devname);
+       }
+}
+
+/**
+ * Initialize the blktap interface, i.e. open a pair of pipes in /var/run/tap
+ * and register an fd handler.
+ *
+ * Returns 0 on success.
+ */
+int init_blktap(void)
+{
+       char* devname;  
+
+       DPRINTF("Init blktap pipes\n");
+
+       /* Open the read pipe */
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {  
+               read_fd = open_ctrl_socket(devname);            
+               free(devname);
+               
+               if (read_fd == -1) {
+                       fprintf(stderr, "Could not open %s/qemu-read-%d\n",
+                               BLKTAP_CTRL_DIR, domid);
+                       return -1;
+               }
+       }
+       
+       /* Open the write pipe */
+       if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {
+               write_fd = open_ctrl_socket(devname);
+               free(devname);
+               
+               if (write_fd == -1) {
+                       fprintf(stderr, "Could not open %s/qemu-write-%d\n",
+                               BLKTAP_CTRL_DIR, domid);
+                       close(read_fd);
+                       return -1;
+               }
+       }
+
+       /* Attach a handler to the read pipe (called from qemu main loop) */
+       qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL);
+
+       /* Register handler to clean up when the domain is destroyed */
+       atexit(&shutdown_blktap);
+
+       return 0;
+}
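For orientation (not part of the patch): a sketch of the control-pipe convention described at the top of this file, seen from a peer process such as blktapctrl. The peer writes control messages into qemu-read-N (which qemu reads) and reads replies from qemu-write-N. Paths, open flags and error handling here are simplified assumptions.

/* Hypothetical peer-side sketch -- not part of the patch. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static int open_qemu_ctrl_pipes(int domid, int *to_qemu, int *from_qemu)
{
    char path[64];

    /* qemu reads this FIFO, so the peer writes requests into it. */
    snprintf(path, sizeof(path), "/var/run/tap/qemu-read-%d", domid);
    *to_qemu = open(path, O_RDWR | O_NONBLOCK);

    /* qemu writes replies here, so the peer reads them from it. */
    snprintf(path, sizeof(path), "/var/run/tap/qemu-write-%d", domid);
    *from_qemu = open(path, O_RDWR | O_NONBLOCK);

    if (*to_qemu < 0 || *from_qemu < 0) {
        if (*to_qemu >= 0)
            close(*to_qemu);
        if (*from_qemu >= 0)
            close(*from_qemu);
        return -1;
    }
    return 0;
}
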
diff --git a/hw/xen_blktap.h b/hw/xen_blktap.h
new file mode 100644 (file)
index 0000000..0fb4bb3
--- /dev/null
+++ b/hw/xen_blktap.h
@@ -0,0 +1,57 @@
+/* xen_blktap.h
+ *
+ * Generic disk interface for blktap-based image adapters.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ */
+
+#ifndef XEN_BLKTAP_H_ 
+#define XEN_BLKTAP_H_
+
+#include <stdint.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "block_int.h"
+
+/* Things disks need to know about; these should probably be in a
+ * higher-level header. */
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define DEFAULT_SECTOR_SIZE    512
+
+#define MAX_IOFD                 2
+
+#define BLK_NOT_ALLOCATED       99
+#define TD_NO_PARENT             1
+
+typedef uint32_t td_flag_t;
+
+#define TD_RDONLY                1
+
+struct disk_id {
+       char *name;
+       int drivertype;
+};
+
+/* This structure represents the state of an active virtual disk.           */
+struct td_state {
+       BlockDriverState* bs;
+       td_flag_t flags;
+       void *blkif;
+       void *image;
+       void *ring_info;
+       void *fd_entry;
+       uint64_t sector_size;
+       uint64_t size;
+       unsigned int       info;
+};
+
+typedef struct fd_list_entry {
+       int cookie;
+       int  tap_fd;
+       struct td_state *s;
+       struct fd_list_entry **pprev, *next;
+} fd_list_entry_t;
+
+#endif /*XEN_BLKTAP_H_*/
diff --git a/hw/xen_machine_fv.c b/hw/xen_machine_fv.c
index 2d02b5e13600c5229d0d6b8876b61878cfff1d01..5fc9dda42f1c7f1de74b67f4745e5b67046e7b04 100644 (file)
@@ -24,9 +24,6 @@
  */
 
 #include "vl.h"
-#ifdef CONFIG_STUBDOM
-#include <xenbus.h>
-#endif
 #include <xen/hvm/params.h>
 #include <sys/mman.h>
 
diff --git a/hw/xen_machine_pv.c b/hw/xen_machine_pv.c
index 41f051dbf81e68dc5dfd0fae750c3986f49e1ca2..5cf5a695d48bd27031c6488c302b1c43b3cbc4f4 100644 (file)
@@ -26,6 +26,9 @@
 #include "xen_console.h"
 #include "xenfb.h"
 
+extern void init_blktap(void);
+
+
 /* The Xen PV machine currently provides
  *   - a virtual framebuffer
  *   - ....
@@ -41,6 +44,12 @@ static void xen_init_pv(uint64_t ram_size, int vga_ram_size, char *boot_device,
     struct xenfb *xenfb;
     extern int domid;
 
+
+#ifndef CONFIG_STUBDOM
+    /* Initialize tapdisk client */
+    init_blktap();
+#endif
+
     /* Connect to text console */
     if (serial_hds[0]) {
         if (xencons_init(domid, serial_hds[0]) < 0) {
diff --git a/i386-dm/hooks.mak b/i386-dm/hooks.mak
index 64571b48147d6dd2ad888462d9295ab7b6a775bf..3108309690b00c9cdcc471ce64dc78bba407a1cb 100644 (file)
@@ -1,3 +1,6 @@
 CPPFLAGS += -DHAS_AUDIO
 QEMU_PROG=qemu-dm
+
+OBJS += xen_blktap.o
+
 include ../xen-hooks.mak
diff --git a/xenstore.c b/xenstore.c
index e1c253221b227e4cd7777cdd872d88798c6ff463..01ace34ada2d6923520e33efc0e63b46b4569528 100644 (file)
@@ -81,7 +81,7 @@ static void waitForDevice(char *fn)
 
 #define DIRECT_PCI_STR_LEN 160
 char direct_pci_str[DIRECT_PCI_STR_LEN];
-void xenstore_parse_domain_config(int domid)
+void xenstore_parse_domain_config(int hvm_domid)
 {
     char **e = NULL;
     char *buf = NULL, *path;
@@ -100,7 +100,7 @@ void xenstore_parse_domain_config(int domid)
         return;
     }
 
-    path = xs_get_domain_path(xsh, domid);
+    path = xs_get_domain_path(xsh, hvm_domid);
     if (path == NULL) {
         fprintf(logfile, "xs_get_domain_path() error\n");
         goto out;
@@ -189,6 +189,13 @@ void xenstore_parse_domain_config(int domid)
             memmove(params, offset+1, strlen(offset+1)+1 );
             fprintf(logfile, "Strip off blktap sub-type prefix to %s\n", params); 
         }
+        /* Prefix with /dev/ if needed */
+        if (!strcmp(drv, "phy") && params[0] != '/') {
+            char *newparams = malloc(5 + strlen(params) + 1);
+            sprintf(newparams, "/dev/%s", params);
+            free(params);
+            params = newparams;
+        }
 
         /* 
          * check if device has a phantom vbd; the phantom is hooked