From: Ian Jackson Date: Fri, 28 Mar 2008 16:10:30 +0000 (+0000) Subject: Use ioemu block drivers through blktap; import other Xen-specific files. X-Git-Tag: xen-3.3.0-rc1~223 X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=38e689af156584c6b4fbd42d516298c6ef485d8a;p=qemu-xen-4.0-testing.git Use ioemu block drivers through blktap; import other Xen-specific files. Add support for a tap:ioemu pseudo driver. Devices using this driver won't use tapdisk (containing the code duplication) any more, but will connect to the qemu-dm of the domain. In this way no working configuration should be broken right now as you can still choose to use the tapdisk drivers. Signed-off-by: Kevin Wolf Also, import and update various Xen-specific files from xen-unstable tip 17307:b667e220e556. --- diff --git a/block-vbd.c b/block-vbd.c new file mode 100644 index 00000000..53c62484 --- /dev/null +++ b/block-vbd.c @@ -0,0 +1,347 @@ +/* + * Block driver for Mini-os PV devices + * Based on block-raw.c + * + * Copyright (c) 2006 Fabrice Bellard, 2007 Samuel Thibault + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "vl.h" +#include "block_int.h" +#include +#include +#include +#include + +#define SECTOR_SIZE 512 + +#ifndef QEMU_TOOL +#include "exec-all.h" +#endif + +#define DEBUG_BLOCK +#ifdef DEBUG_BLOCK +#define DEBUG_BLOCK_PRINT( formatCstr, args... ) fprintf( logfile, formatCstr, ##args ); fflush( logfile ) +#else +#define DEBUG_BLOCK_PRINT( formatCstr, args... ) +#endif + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_FD 2 + +typedef struct BDRVVbdState { + struct blkfront_dev *dev; + int fd; + int type; + int mode; + int info; + uint64_t sectors; + unsigned sector_size; + QEMU_LIST_ENTRY(BDRVVbdState) list; +} BDRVVbdState; + +QEMU_LIST_HEAD(, BDRVVbdState) vbds; + +static int vbd_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + char *value; + if (xenbus_read(XBT_NIL, filename, &value)) + return 0; + free(value); + return 100; +} + +static void vbd_io_completed(void *opaque) +{ + BDRVVbdState *s = opaque; + blkfront_aio_poll(s->dev); +} + +static int vbd_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVVbdState *s = bs->opaque; + + //handy to test posix access + //return -EIO; + + s->dev = init_blkfront((char *) filename, &s->sectors, &s->sector_size, &s->mode, &s->info); + + if (!s->dev) + return -EIO; + + if (SECTOR_SIZE % s->sector_size) { + printf("sector size is %d, we only support sector sizes that divide %d\n", s->sector_size, SECTOR_SIZE); + return -EIO; + } + + s->fd = blkfront_open(s->dev); + qemu_set_fd_handler(s->fd, vbd_io_completed, NULL, s); + + QEMU_LIST_INSERT_HEAD(&vbds, s, list); + + return 0; +} + +typedef struct VbdAIOCB { + BlockDriverAIOCB common; + struct blkfront_aiocb aiocb; +} VbdAIOCB; + +void 
qemu_aio_init(void) +{ +} + +void qemu_aio_poll(void) +{ +} + +/* Wait for all IO requests to complete. */ +void qemu_aio_flush(void) +{ + BDRVVbdState *s; + for (s = vbds.lh_first; s; s = s->list.le_next) + blkfront_sync(s->dev); +} + +void qemu_aio_wait_start(void) +{ +} + +void qemu_aio_wait(void) +{ + int some = 0; + DEFINE_WAIT(w); + while (1) { + BDRVVbdState *s; + add_waiter(w, blkfront_queue); + for (s = vbds.lh_first; s; s = s->list.le_next) + if (blkfront_aio_poll(s->dev)) + some = 1; + if (some) + break; + schedule(); + } + remove_waiter(w); +} + +void qemu_aio_wait_end(void) +{ +} + +static void vbd_aio_callback(struct blkfront_aiocb *aiocbp, int ret) { + VbdAIOCB *acb = aiocbp->data; + + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); +} + +static VbdAIOCB *vbd_aio_setup(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVVbdState *s = bs->opaque; + VbdAIOCB *acb; + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->aiocb.aio_dev = s->dev; + acb->aiocb.aio_buf = buf; + acb->aiocb.aio_nbytes = nb_sectors * SECTOR_SIZE; + acb->aiocb.aio_offset = sector_num * SECTOR_SIZE; + acb->aiocb.aio_cb = vbd_aio_callback; + acb->aiocb.data = acb; + + return acb; +} + +static BlockDriverAIOCB *vbd_aio_read(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + VbdAIOCB *acb; + + acb = vbd_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + if (!acb) + return NULL; + blkfront_aio(&acb->aiocb, 0); + return &acb->common; +} + +static BlockDriverAIOCB *vbd_aio_write(BlockDriverState *bs, + int64_t sector_num, const uint8_t *buf, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + VbdAIOCB *acb; + + acb = vbd_aio_setup(bs, sector_num, (uint8_t*) buf, nb_sectors, cb, opaque); + if (!acb) + return NULL; + blkfront_aio(&acb->aiocb, 1); + return &acb->common; +} + 
+static void vbd_cb(void *data, int ret) { + int *result = data; + result[0] = 1; + result[1] = ret; +} + +static int vbd_aligned_io(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors, int write) +{ + VbdAIOCB *acb; + int result[2]; + result[0] = 0; + qemu_aio_wait_start(); + acb = vbd_aio_setup(bs, sector_num, (uint8_t*) buf, nb_sectors, vbd_cb, &result); + blkfront_aio(&acb->aiocb, write); + while (!result[0]) + qemu_aio_wait(); + qemu_aio_wait_end(); + return result[1]; +} + +static int vbd_read(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors) +{ + uint8_t *iobuf; + int ret; + /* page alignment would be a bit better, but that's still fine compared to + * copying */ + if (!((uintptr_t)buf & (SECTOR_SIZE-1))) + return vbd_aligned_io(bs, sector_num, buf, nb_sectors, 0); + iobuf = qemu_memalign(PAGE_SIZE, nb_sectors * SECTOR_SIZE); + ret = vbd_aligned_io(bs, sector_num, iobuf, nb_sectors, 0); + memcpy(buf, iobuf, nb_sectors * SECTOR_SIZE); + free(iobuf); + if (ret < 0) + return ret; + else if (ret != nb_sectors * SECTOR_SIZE) + return -EINVAL; + else + return 0; +} + +static int vbd_write(BlockDriverState *bs, + int64_t sector_num, const uint8_t *buf, int nb_sectors) +{ + uint8_t *iobuf; + int ret; + if (!((uintptr_t)buf & (SECTOR_SIZE-1))) + return vbd_aligned_io(bs, sector_num, (uint8_t*) buf, nb_sectors, 1); + iobuf = qemu_memalign(PAGE_SIZE, nb_sectors * SECTOR_SIZE); + memcpy(iobuf, buf, nb_sectors * SECTOR_SIZE); + ret = vbd_aligned_io(bs, sector_num, iobuf, nb_sectors, 1); + free(iobuf); + if (ret < 0) + return ret; + else if (ret != nb_sectors * SECTOR_SIZE) + return -EINVAL; + else + return 0; +} + +static void vbd_aio_cancel(BlockDriverAIOCB *blockacb) +{ + /* TODO */ + //VbdAIOCB *acb = (VbdAIOCB *)blockacb; + + // Try to cancel. 
If can't, wait for it, drop the callback and call qemu_aio_release(acb) +} + +static void vbd_close(BlockDriverState *bs) +{ + BDRVVbdState *s = bs->opaque; + bs->total_sectors = 0; + if (s->fd >= 0) { + qemu_set_fd_handler(s->fd, NULL, NULL, NULL); + close(s->fd); + s->fd = -1; + } + QEMU_LIST_REMOVE(s, list); +} + +static int64_t vbd_getlength(BlockDriverState *bs) +{ + BDRVVbdState *s = bs->opaque; + return s->sectors * s->sector_size; +} + +static void vbd_flush(BlockDriverState *bs) +{ + BDRVVbdState *s = bs->opaque; + blkfront_sync(s->dev); +} + +/***********************************************/ +/* host device */ + +static int vbd_is_inserted(BlockDriverState *bs) +{ + /* TODO: monitor the backend */ + return 1; +} + +/* currently only used by fdc.c, but a CD version would be good too */ +static int vbd_media_changed(BlockDriverState *bs) +{ + /* TODO: monitor the backend */ + return -ENOTSUP; +} + +static int vbd_eject(BlockDriverState *bs, int eject_flag) +{ + /* TODO: Xen support needed */ + return -ENOTSUP; +} + +static int vbd_set_locked(BlockDriverState *bs, int locked) +{ + /* TODO: Xen support needed */ + return -ENOTSUP; +} + +BlockDriver bdrv_vbd = { + "vbd", + sizeof(BDRVVbdState), + vbd_probe, + vbd_open, + NULL, + NULL, + vbd_close, + NULL, + vbd_flush, + + .bdrv_aio_read = vbd_aio_read, + .bdrv_aio_write = vbd_aio_write, + .bdrv_aio_cancel = vbd_aio_cancel, + .aiocb_size = sizeof(VbdAIOCB), + .bdrv_read = vbd_read, + .bdrv_write = vbd_write, + .bdrv_getlength = vbd_getlength, + + /* removable device support */ + .bdrv_is_inserted = vbd_is_inserted, + .bdrv_media_changed = vbd_media_changed, + .bdrv_eject = vbd_eject, + .bdrv_set_locked = vbd_set_locked, +}; + diff --git a/hw/xen_blktap.c b/hw/xen_blktap.c new file mode 100644 index 00000000..5420da24 --- /dev/null +++ b/hw/xen_blktap.c @@ -0,0 +1,686 @@ +/* xen_blktap.c + * + * Interface to blktapctrl to allow use of qemu block drivers with blktap. 
+ * This file is based on tools/blktap/drivers/tapdisk.c + * + * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield. + * Copyright (c) 2008 Kevin Wolf + */ + +/* + * There are several communication channels which are used by this interface: + * + * - A pair of pipes for receiving and sending general control messages + * (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID). + * These control messages are handled by handle_blktap_ctrlmsg(). + * + * - One file descriptor per attached disk (/dev/xen/blktapN) for disk + * specific control messages. A callback is triggered on this fd if there + * is a new IO request. The callback function is handle_blktap_iomsg(). + * + * - A shared ring for each attached disk containing the actual IO requests + * and responses. Whenever handle_blktap_iomsg() is triggered it processes + * the requests on this ring. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vl.h" +#include "blktaplib.h" +#include "xen_blktap.h" +#include "block_int.h" + +#define MSG_SIZE 4096 + +#define BLKTAP_CTRL_DIR "/var/run/tap" + +/* If enabled, print debug messages to stderr */ +#if 1 +#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a) +#else +#define DPRINTF(_f, _a...) 
((void)0) +#endif + +#if 1 +#define ASSERT(_p) \ + if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s\n", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#else +#define ASSERT(_p) ((void)0) +#endif + + +extern int domid; + +int read_fd; +int write_fd; + +static pid_t process; +fd_list_entry_t *fd_start = NULL; + +static void handle_blktap_iomsg(void* private); + +struct aiocb_info { + struct td_state *s; + uint64_t sector; + int nr_secs; + int idx; + long i; +}; + +static void unmap_disk(struct td_state *s) +{ + tapdev_info_t *info = s->ring_info; + fd_list_entry_t *entry; + + bdrv_close(s->bs); + + if (info != NULL && info->mem > 0) + munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE); + + entry = s->fd_entry; + *entry->pprev = entry->next; + if (entry->next) + entry->next->pprev = entry->pprev; + + qemu_set_fd_handler2(info->fd, NULL, NULL, NULL, NULL); + close(info->fd); + + free(s->fd_entry); + free(s->blkif); + free(s->ring_info); + free(s); + + return; +} + +static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s) +{ + fd_list_entry_t **pprev, *entry; + + DPRINTF("Adding fd_list_entry\n"); + + /*Add to linked list*/ + s->fd_entry = entry = malloc(sizeof(fd_list_entry_t)); + entry->tap_fd = tap_fd; + entry->s = s; + entry->next = NULL; + + pprev = &fd_start; + while (*pprev != NULL) + pprev = &(*pprev)->next; + + *pprev = entry; + entry->pprev = pprev; + + return entry; +} + +static inline struct td_state *get_state(int cookie) +{ + fd_list_entry_t *ptr; + + ptr = fd_start; + while (ptr != NULL) { + if (ptr->cookie == cookie) return ptr->s; + ptr = ptr->next; + } + return NULL; +} + +static struct td_state *state_init(void) +{ + int i; + struct td_state *s; + blkif_t *blkif; + + s = malloc(sizeof(struct td_state)); + blkif = s->blkif = malloc(sizeof(blkif_t)); + s->ring_info = calloc(1, sizeof(tapdev_info_t)); + + for (i = 0; i < MAX_REQUESTS; i++) { + blkif->pending_list[i].secs_pending = 0; + 
blkif->pending_list[i].submitting = 0; + } + + return s; +} + +static int map_new_dev(struct td_state *s, int minor) +{ + int tap_fd; + tapdev_info_t *info = s->ring_info; + char *devname; + fd_list_entry_t *ptr; + int page_size; + + if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1) + return -1; + tap_fd = open(devname, O_RDWR); + if (tap_fd == -1) + { + DPRINTF("open failed on dev %s!\n",devname); + goto fail; + } + info->fd = tap_fd; + + /*Map the shared memory*/ + page_size = getpagesize(); + info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0); + if ((long int)info->mem == -1) + { + DPRINTF("mmap failed on dev %s!\n",devname); + goto fail; + } + + /* assign the rings to the mapped memory */ + info->sring = (blkif_sring_t *)((unsigned long)info->mem); + BACK_RING_INIT(&info->fe_ring, info->sring, page_size); + + info->vstart = + (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size); + + ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process ); + ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE ); + free(devname); + + /*Update the fd entry*/ + ptr = fd_start; + while (ptr != NULL) { + if (s == ptr->s) { + ptr->tap_fd = tap_fd; + + /* Setup fd_handler for qemu main loop */ + DPRINTF("set tap_fd = %d\n", tap_fd); + qemu_set_fd_handler2(tap_fd, NULL, &handle_blktap_iomsg, NULL, s); + + break; + } + ptr = ptr->next; + } + + + DPRINTF("map_new_dev = %d\n", minor); + return minor; + + fail: + free(devname); + return -1; +} + +static int open_disk(struct td_state *s, char *path, int readonly) +{ + struct disk_id id; + BlockDriverState* bs; + + DPRINTF("Opening %s\n", path); + bs = calloc(1, sizeof(*bs)); + + memset(&id, 0, sizeof(struct disk_id)); + + if (bdrv_open(bs, path, 0) != 0) { + fprintf(stderr, "Could not open image file %s\n", path); + return -ENOMEM; + } + + s->bs = bs; + s->flags = readonly ? 
TD_RDONLY : 0; + s->size = bs->total_sectors; + s->sector_size = 512; + + s->info = ((s->flags & TD_RDONLY) ? VDISK_READONLY : 0); + + return 0; +} + +static inline void write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp) +{ + tapdev_info_t *info = s->ring_info; + blkif_response_t *rsp_d; + + rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt); + memcpy(rsp_d, rsp, sizeof(blkif_response_t)); + info->fe_ring.rsp_prod_pvt++; +} + +static inline void kick_responses(struct td_state *s) +{ + tapdev_info_t *info = s->ring_info; + + if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) + { + RING_PUSH_RESPONSES(&info->fe_ring); + ioctl(info->fd, BLKTAP_IOCTL_KICK_FE); + } +} + +static int send_responses(struct td_state *s, int res, + uint64_t sector, int nr_secs, int idx, void *private) +{ + pending_req_t *preq; + blkif_request_t *req; + int responses_queued = 0; + blkif_t *blkif = s->blkif; + int secs_done = nr_secs; + + if ( (idx > MAX_REQUESTS-1) ) + { + DPRINTF("invalid index returned(%u)!\n", idx); + return 0; + } + preq = &blkif->pending_list[idx]; + req = &preq->req; + + preq->secs_pending -= secs_done; + + if (res == -EBUSY && preq->submitting) + return -EBUSY; /* propagate -EBUSY back to higher layers */ + if (res) + preq->status = BLKIF_RSP_ERROR; + + if (!preq->submitting && preq->secs_pending == 0) + { + blkif_request_t tmp; + blkif_response_t *rsp; + + tmp = preq->req; + rsp = (blkif_response_t *)req; + + rsp->id = tmp.id; + rsp->operation = tmp.operation; + rsp->status = preq->status; + + write_rsp_to_ring(s, rsp); + responses_queued++; + + kick_responses(s); + } + + return responses_queued; +} + +static void qemu_send_responses(void* opaque, int ret) +{ + struct aiocb_info* info = opaque; + + if (ret != 0) { + DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret)); + } + + send_responses(info->s, ret, info->sector, info->nr_secs, + info->idx, (void*) info->i); + free(info); +} + +/** + * Callback function for the IO 
message pipe. Reads requests from the ring
 * and processes them (call qemu read/write functions).
 *
 * The private parameter points to the struct td_state representing the
 * disk the request is targeted at.
 *
 * Each request is copied out of the shared ring before it is validated, so
 * later changes to the ring contents cannot affect a request that has
 * already been checked.
 */
static void handle_blktap_iomsg(void* private)
{
    struct td_state* s = private;

    RING_IDX rp, j, i;
    blkif_request_t *req;
    int idx, nsects;
    uint64_t sector_nr;
    uint8_t *page;
    blkif_t *blkif = s->blkif;
    tapdev_info_t *info = s->ring_info;
    int page_size = getpagesize();

    struct aiocb_info *aiocb_info;
    BlockDriverAIOCB *acb;

    if (info->fe_ring.sring == NULL) {
        DPRINTF("  sring == NULL, ignoring IO request\n");
        return;
    }

    rp = info->fe_ring.sring->req_prod;
    xen_rmb();

    for (j = info->fe_ring.req_cons; j != rp; j++)
    {
        blkif_request_t local;

        /* Snapshot the request; the ring is shared memory. */
        memcpy(&local, RING_GET_REQUEST(&info->fe_ring, j), sizeof(local));
        ++info->fe_ring.req_cons;

        /* The id selects a pending_list slot, so it must be in range.
         * Without a valid slot no response can be built either. */
        if (local.id >= MAX_REQUESTS) {
            DPRINTF("invalid request id, ignoring IO request\n");
            continue;
        }
        idx = (int)local.id;

        ASSERT(blkif->pending_list[idx].secs_pending == 0);
        memcpy(&blkif->pending_list[idx].req, &local, sizeof(local));
        req = &blkif->pending_list[idx].req;
        blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
        blkif->pending_list[idx].submitting = 1;
        sector_nr = req->sector_number;

        /* Don't allow writes on readonly devices */
        if ((s->flags & TD_RDONLY) &&
            (req->operation == BLKIF_OP_WRITE)) {
            blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
            goto send_response;
        }

        /* Anything but READ/WRITE fails as a whole; counting its sectors
         * as pending would leave the slot without a response forever. */
        if (req->operation != BLKIF_OP_WRITE &&
            req->operation != BLKIF_OP_READ) {
            DPRINTF("Unknown block operation\n");
            blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
            goto send_response;
        }

        /* NOTE(review): assumes req->seg[] holds MAX_SEGMENTS_PER_REQ
         * entries (the limit declared in xen_blktap.h) -- confirm. */
        if (req->nr_segments > MAX_SEGMENTS_PER_REQ) {
            DPRINTF("too many segments, failing IO request\n");
            blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
            goto send_response;
        }

        for (i = 0; i < req->nr_segments; i++) {
            nsects = req->seg[i].last_sect -
                     req->seg[i].first_sect + 1;

            if ((req->seg[i].last_sect >= page_size >> 9) ||
                (nsects <= 0))
                continue;

            page  = (uint8_t*) MMAP_VADDR(info->vstart,
                                          (unsigned long)idx, i);
            page += (req->seg[i].first_sect << SECTOR_SHIFT);

            if (sector_nr >= s->size) {
                /* NOTE(review): the tail of this argument list was lost in
                 * extraction and is reconstructed from the format string. */
                DPRINTF("Sector request failed:\n");
                DPRINTF("%s request, idx [%d,%d] size [%llu], "
                        "sector [%llu,%llu]\n",
                        (req->operation == BLKIF_OP_WRITE ?
                         "WRITE" : "READ"),
                        idx,i,
                        (long long unsigned)
                            nsects<<SECTOR_SHIFT,
                        (long long unsigned)
                            sector_nr<<SECTOR_SHIFT,
                        (long long unsigned) sector_nr);
                /* Report the failure instead of answering OKAY. */
                blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
                continue;
            }

            aiocb_info = malloc(sizeof(*aiocb_info));
            if (aiocb_info == NULL) {
                blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
                goto send_response;
            }

            aiocb_info->s = s;
            aiocb_info->sector = sector_nr;
            aiocb_info->nr_secs = nsects;
            aiocb_info->idx = idx;
            aiocb_info->i = i;

            /* Must be counted before submitting: the completion callback
             * subtracts it again and may run before the call returns. */
            blkif->pending_list[idx].secs_pending += nsects;

            if (req->operation == BLKIF_OP_WRITE)
                acb = bdrv_aio_write(s->bs, sector_nr, page, nsects,
                                     qemu_send_responses, aiocb_info);
            else
                acb = bdrv_aio_read(s->bs, sector_nr, page, nsects,
                                    qemu_send_responses, aiocb_info);

            if (acb == NULL) {
                /* Presumably the callback is never invoked for a request
                 * that was not accepted, so undo the accounting here;
                 * otherwise the response would never be sent. */
                blkif->pending_list[idx].secs_pending -= nsects;
                free(aiocb_info);
                blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
                if (req->operation == BLKIF_OP_WRITE)
                    DPRINTF("ERROR: bdrv_write() == NULL\n");
                else
                    DPRINTF("ERROR: bdrv_read() == NULL\n");
                goto send_response;
            }

            sector_nr += nsects;
        }
    send_response:
        blkif->pending_list[idx].submitting = 0;

        /* force write_rsp_to_ring for synchronous case */
        if (blkif->pending_list[idx].secs_pending == 0)
            send_responses(s, 0, 0, 0, idx, (void *)(long)0);
    }
}

/**
 * Callback function for the qemu-read pipe. Reads and processes control
 * message from the pipe.
 *
 * The parameter private is unused.
+ */ +static void handle_blktap_ctrlmsg(void* private) +{ + int length, len, msglen; + char *ptr, *path; + image_t *img; + msg_hdr_t *msg; + msg_newdev_t *msg_dev; + msg_pid_t *msg_pid; + int ret = -1; + struct td_state *s = NULL; + fd_list_entry_t *entry; + + char buf[MSG_SIZE]; + + length = read(read_fd, buf, MSG_SIZE); + + if (length > 0 && length >= sizeof(msg_hdr_t)) + { + msg = (msg_hdr_t *)buf; + DPRINTF("blktap: Received msg, len %d, type %d, UID %d\n", + length,msg->type,msg->cookie); + + switch (msg->type) { + case CTLMSG_PARAMS: + ptr = buf + sizeof(msg_hdr_t); + len = (length - sizeof(msg_hdr_t)); + path = calloc(1, len + 1); + + memcpy(path, ptr, len); + DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path); + + /* Allocate the disk structs */ + s = state_init(); + + /*Open file*/ + if (s == NULL || open_disk(s, path, msg->readonly)) { + msglen = sizeof(msg_hdr_t); + msg->type = CTLMSG_IMG_FAIL; + msg->len = msglen; + } else { + entry = add_fd_entry(0, s); + entry->cookie = msg->cookie; + DPRINTF("Entered cookie %d\n", entry->cookie); + + memset(buf, 0x00, MSG_SIZE); + + msglen = sizeof(msg_hdr_t) + sizeof(image_t); + msg->type = CTLMSG_IMG; + img = (image_t *)(buf + sizeof(msg_hdr_t)); + img->size = s->size; + img->secsize = s->sector_size; + img->info = s->info; + DPRINTF("Writing (size, secsize, info) = " + "(%#" PRIx64 ", %#" PRIx64 ", %d)\n", + s->size, s->sector_size, s->info); + } + len = write(write_fd, buf, msglen); + free(path); + break; + + case CTLMSG_NEWDEV: + msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t)); + + s = get_state(msg->cookie); + DPRINTF("Retrieving state, cookie %d.....[%s]\n", + msg->cookie, (s == NULL ? "FAIL":"OK")); + if (s != NULL) { + ret = ((map_new_dev(s, msg_dev->devnum) + == msg_dev->devnum ? 0: -1)); + } + + memset(buf, 0x00, MSG_SIZE); + msglen = sizeof(msg_hdr_t); + msg->type = (ret == 0 ? 
CTLMSG_NEWDEV_RSP + : CTLMSG_NEWDEV_FAIL); + msg->len = msglen; + + len = write(write_fd, buf, msglen); + break; + + case CTLMSG_CLOSE: + s = get_state(msg->cookie); + if (s) unmap_disk(s); + break; + + case CTLMSG_PID: + memset(buf, 0x00, MSG_SIZE); + msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t); + msg->type = CTLMSG_PID_RSP; + msg->len = msglen; + + msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t)); + process = getpid(); + msg_pid->pid = process; + + len = write(write_fd, buf, msglen); + break; + + default: + break; + } + } +} + +/** + * Opens a control socket, i.e. a pipe to communicate with blktapctrl. + * + * Returns the file descriptor number for the pipe; -1 in error case + */ +static int open_ctrl_socket(char *devname) +{ + int ret; + int ipc_fd; + + if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0) + DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR); + + ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO); + if ( (ret != 0) && (errno != EEXIST) ) { + DPRINTF("ERROR: pipe failed (%d)\n", errno); + return -1; + } + + ipc_fd = open(devname,O_RDWR|O_NONBLOCK); + + if (ipc_fd < 0) { + DPRINTF("FD open failed\n"); + return -1; + } + + return ipc_fd; +} + +/** + * Unmaps all disks and closes their pipes + */ +void shutdown_blktap(void) +{ + fd_list_entry_t *ptr; + struct td_state *s; + char *devname; + + DPRINTF("Shutdown blktap\n"); + + /* Unmap all disks */ + ptr = fd_start; + while (ptr != NULL) { + s = ptr->s; + unmap_disk(s); + close(ptr->tap_fd); + ptr = ptr->next; + } + + /* Delete control pipes */ + if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) { + DPRINTF("Delete %s\n", devname); + if (unlink(devname)) + DPRINTF("Could not delete: %s\n", strerror(errno)); + free(devname); + } + + if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) { + DPRINTF("Delete %s\n", devname); + if (unlink(devname)) + DPRINTF("Could not delete: %s\n", strerror(errno)); + free(devname); + } +} + +/** + * Initialize the blktap interface, i.e. 
open a pair of pipes in /var/run/tap + * and register a fd handler. + * + * Returns 0 on success. + */ +int init_blktap(void) +{ + char* devname; + + DPRINTF("Init blktap pipes\n"); + + /* Open the read pipe */ + if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) { + read_fd = open_ctrl_socket(devname); + free(devname); + + if (read_fd == -1) { + fprintf(stderr, "Could not open %s/qemu-read-%d\n", + BLKTAP_CTRL_DIR, domid); + return -1; + } + } + + /* Open the write pipe */ + if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) { + write_fd = open_ctrl_socket(devname); + free(devname); + + if (write_fd == -1) { + fprintf(stderr, "Could not open %s/qemu-write-%d\n", + BLKTAP_CTRL_DIR, domid); + close(read_fd); + return -1; + } + } + + /* Attach a handler to the read pipe (called from qemu main loop) */ + qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL); + + /* Register handler to clean up when the domain is destroyed */ + atexit(&shutdown_blktap); + + return 0; +} diff --git a/hw/xen_blktap.h b/hw/xen_blktap.h new file mode 100644 index 00000000..0fb4bb3f --- /dev/null +++ b/hw/xen_blktap.h @@ -0,0 +1,57 @@ +/* xen_blktap.h + * + * Generic disk interface for blktap-based image adapters. + * + * (c) 2006 Andrew Warfield and Julian Chesterfield + */ + +#ifndef XEN_BLKTAP_H_ +#define XEN_BLKTAP_H_ + +#include +#include +#include + +#include "block_int.h" + +/* Things disks need to know about, these should probably be in a higher-level + * header. */ +#define MAX_SEGMENTS_PER_REQ 11 +#define SECTOR_SHIFT 9 +#define DEFAULT_SECTOR_SIZE 512 + +#define MAX_IOFD 2 + +#define BLK_NOT_ALLOCATED 99 +#define TD_NO_PARENT 1 + +typedef uint32_t td_flag_t; + +#define TD_RDONLY 1 + +struct disk_id { + char *name; + int drivertype; +}; + +/* This structure represents the state of an active virtual disk. 
*/ +struct td_state { + BlockDriverState* bs; + td_flag_t flags; + void *blkif; + void *image; + void *ring_info; + void *fd_entry; + uint64_t sector_size; + uint64_t size; + unsigned int info; +}; + +typedef struct fd_list_entry { + int cookie; + int tap_fd; + struct td_state *s; + struct fd_list_entry **pprev, *next; +} fd_list_entry_t; + +#endif /*XEN_BLKTAP_H_*/ diff --git a/hw/xen_machine_fv.c b/hw/xen_machine_fv.c index 2d02b5e1..5fc9dda4 100644 --- a/hw/xen_machine_fv.c +++ b/hw/xen_machine_fv.c @@ -24,9 +24,6 @@ */ #include "vl.h" -#ifdef CONFIG_STUBDOM -#include -#endif #include #include diff --git a/hw/xen_machine_pv.c b/hw/xen_machine_pv.c index 41f051db..5cf5a695 100644 --- a/hw/xen_machine_pv.c +++ b/hw/xen_machine_pv.c @@ -26,6 +26,9 @@ #include "xen_console.h" #include "xenfb.h" +extern void init_blktap(void); + + /* The Xen PV machine currently provides * - a virtual framebuffer * - .... @@ -41,6 +44,12 @@ static void xen_init_pv(uint64_t ram_size, int vga_ram_size, char *boot_device, struct xenfb *xenfb; extern int domid; + +#ifndef CONFIG_STUBDOM + /* Initialize tapdisk client */ + init_blktap(); +#endif + /* Connect to text console */ if (serial_hds[0]) { if (xencons_init(domid, serial_hds[0]) < 0) { diff --git a/i386-dm/hooks.mak b/i386-dm/hooks.mak index 64571b48..31083096 100644 --- a/i386-dm/hooks.mak +++ b/i386-dm/hooks.mak @@ -1,3 +1,6 @@ CPPFLAGS += -DHAS_AUDIO QEMU_PROG=qemu-dm + +OBJS += xen_blktap.o + include ../xen-hooks.mak diff --git a/xenstore.c b/xenstore.c index e1c25322..01ace34a 100644 --- a/xenstore.c +++ b/xenstore.c @@ -81,7 +81,7 @@ static void waitForDevice(char *fn) #define DIRECT_PCI_STR_LEN 160 char direct_pci_str[DIRECT_PCI_STR_LEN]; -void xenstore_parse_domain_config(int domid) +void xenstore_parse_domain_config(int hvm_domid) { char **e = NULL; char *buf = NULL, *path; @@ -100,7 +100,7 @@ void xenstore_parse_domain_config(int domid) return; } - path = xs_get_domain_path(xsh, domid); + path = xs_get_domain_path(xsh, 
hvm_domid); if (path == NULL) { fprintf(logfile, "xs_get_domain_path() error\n"); goto out; @@ -189,6 +189,13 @@ void xenstore_parse_domain_config(int domid) memmove(params, offset+1, strlen(offset+1)+1 ); fprintf(logfile, "Strip off blktap sub-type prefix to %s\n", params); } + /* Prefix with /dev/ if needed */ + if (!strcmp(drv, "phy") && params[0] != '/') { + char *newparams = malloc(5 + strlen(params) + 1); + sprintf(newparams, "/dev/%s", params); + free(params); + params = newparams; + } /* * check if device has a phantom vbd; the phantom is hooked