xenbits.xensource.com Git - people/ssmith/nc2-2.6.27.git/commitdiff

Add user-space facing part of blkfront to blktap to provide a
device in the backend domain giving access to the blktap device.

author    Steven Smith <ssmith@weybridge.uk.xensource.com>
          Tue, 30 Jun 2009 11:55:47 +0000 (12:55 +0100)
committer Steven Smith <ssmith@weybridge.uk.xensource.com>
          Tue, 30 Jun 2009 11:55:47 +0000 (12:55 +0100)
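
The patch exposes each tap device as a regular block device in the backend
domain and adds a BLKTAP_IOCTL_BACKDEV_SETUP ioctl to trigger its creation.
As a rough illustration (a sketch, not part of the commit), a userspace
helper might request the backdev like this; the /dev/xen/blktap0 path and
the use of the tap device index as the ioctl argument are assumptions
inferred from the handler in blktap.c below:

    /* Hypothetical helper: ask blktap to create a backdev for tap device 0.
     * The ioctl number matches the BLKTAP_IOCTL_BACKDEV_SETUP definition
     * added by this patch; the device path is an assumption. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define BLKTAP_IOCTL_BACKDEV_SETUP 200

    int main(void)
    {
            int fd = open("/dev/xen/blktap0", O_RDWR);
            if (fd < 0) {
                    perror("open blktap");
                    return 1;
            }
            /* arg indexes tapfds[]; the handler calls create_backdev(). */
            if (ioctl(fd, BLKTAP_IOCTL_BACKDEV_SETUP, 0) < 0)
                    perror("BLKTAP_IOCTL_BACKDEV_SETUP");
            close(fd);
            return 0;
    }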

drivers/xen/blktap/Makefile
drivers/xen/blktap/backdev.c [new file with mode: 0644]
drivers/xen/blktap/backdev.h [new file with mode: 0644]
drivers/xen/blktap/blktap.c
drivers/xen/blktap/blktap.h [new file with mode: 0644]
drivers/xen/blktap/common.h
drivers/xen/blktap/xenbus.c

diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
index f10cc4fe127595215f89ddcf5cb78fc76643845e..07ea7d6de5f014d2aef3164a2c9ead543d87b652 100644 (file)
@@ -2,4 +2,4 @@ LINUXINCLUDE += -I../xen/include/public/io
 
 obj-$(CONFIG_XEN_BLKDEV_TAP) := xenblktap.o
 
-xenblktap-y := xenbus.o interface.o blktap.o 
+xenblktap-y := xenbus.o interface.o blktap.o backdev.o
diff --git a/drivers/xen/blktap/backdev.c b/drivers/xen/blktap/backdev.c
new file mode 100644 (file)
index 0000000..b98cbed
--- /dev/null
@@ -0,0 +1,579 @@
+
+#include <linux/cdrom.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/io/blkif.h>
+
+#include "common.h"
+#include "backdev.h"
+#include "blktap.h"
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
+DEFINE_SPINLOCK(backdev_io_lock);
+
+static int backdev_major;
+
+int
+register_backdev(void)
+{
+       int major;
+
+       /* Dynamically allocate a major for this device */
+       major = register_blkdev(0, "backdev");
+       if (major < 0) {
+               WPRINTK("Couldn't register /dev/xen/backdev\n");
+               return -ENOMEM;
+       }       
+       backdev_major = major;
+       DPRINTK("backdev major %d\n", major);
+       return 0;
+}
+
+int
+destroy_backdev(struct tap_blkif *uinfo)
+{
+       struct backdev_info *info = uinfo->backdev;
+
+       DPRINTK("destroy backdev %d users %d\n", uinfo->minor, info->users);
+       if (info->users)
+               return -EBUSY;
+
+       spin_lock_irq(&backdev_io_lock);
+       /* No more blkif_request(). */
+       blk_stop_queue(info->gd->queue);
+       spin_unlock_irq(&backdev_io_lock);
+
+       del_gendisk(info->gd);
+       put_disk(info->gd);
+
+       blk_cleanup_queue(info->gd->queue);
+
+       uinfo->backdev = NULL;
+       kfree(info);
+
+       return 0;
+}
+
+static int
+backdev_open(struct inode *inode, struct file *filep)
+{
+       struct backdev_info *info = inode->i_bdev->bd_disk->private_data;
+       info->users++;
+       return 0;
+}
+
+static int
+backdev_release(struct inode *inode, struct file *filep)
+{
+       struct backdev_info *info = inode->i_bdev->bd_disk->private_data;
+       info->users--;
+       if (info->uinfo->dev_inuse == 0)
+               destroy_backdev(info->uinfo);
+       return 0;
+}
+
+static int
+backdev_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+       /* We don't have real geometry info, but let's at least return
+          values consistent with the size of the device */
+       sector_t nsect = get_capacity(bd->bd_disk);
+       sector_t cylinders = nsect;
+
+       hg->heads = 0xff;
+       hg->sectors = 0x3f;
+       sector_div(cylinders, hg->heads * hg->sectors);
+       hg->cylinders = cylinders;
+       if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+               hg->cylinders = 0xffff;
+       return 0;
+}
+
+static int
+backdev_ioctl(struct inode *inode, struct file *filep,
+             unsigned command, unsigned long argument)
+{
+       int i;
+
+       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+                     command, (long)argument, inode->i_rdev);
+
+       switch (command) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+       case HDIO_GETGEO: {
+               struct block_device *bd = inode->i_bdev;
+               struct hd_geometry geo;
+               int ret;
+
+               if (!argument)
+                       return -EINVAL;
+
+               geo.start = get_start_sect(bd);
+               ret = backdev_getgeo(bd, &geo);
+               if (ret)
+                       return ret;
+
+               if (copy_to_user((struct hd_geometry __user *)argument, &geo,
+                                sizeof(geo)))
+                       return -EFAULT;
+
+               return 0;
+       }
+#endif
+       case CDROMMULTISESSION:
+               DPRINTK("FIXME: support multisession CDs later\n");
+               for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+                       if (put_user(0, (char __user *)(argument + i)))
+                               return -EFAULT;
+               return 0;
+
+       case SCSI_IOCTL_GET_IDLUN:
+               if (!access_ok(VERIFY_WRITE, argument, 
+                       sizeof(struct scsi_idlun)))
+                       return -EFAULT;
+
+               /* return 0 for now. */
+               __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
+               __put_user(0, 
+                       &((struct scsi_idlun __user *)argument)->host_unique_id);
+               return 0;
+
+       default:
+               /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+                 command);*/
+               return -EINVAL; /* same return as native Linux */
+       }
+
+       return 0;
+}
+
+static struct block_device_operations backdev_fops = {
+       .owner = THIS_MODULE,
+       .open = backdev_open,
+       .release = backdev_release,
+       .ioctl  = backdev_ioctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+       .getgeo = backdev_getgeo
+#endif
+};
+
+static int map_uaddr_fn(
+       pte_t *ptep, struct page *pmd_page, unsigned long addr, void *data)
+{
+       pte_t *pte = (pte_t *)data;
+
+       DPRINTK("map_uaddr ptep %p -> %012llx/%012llx\n", ptep, pte_val(*pte),
+               pte_val_ma(*pte));
+       set_pte(ptep, *pte);
+       xen_invlpg(addr);
+       return 0;
+}
+
+static int map_uaddr(struct mm_struct *mm, unsigned long address,
+                    pte_t pte)
+{
+       return apply_to_page_range(mm, address, PAGE_SIZE, map_uaddr_fn, &pte);
+}
+
+static int umap_uaddr_fn(
+       pte_t *ptep, struct page *pmd_page, unsigned long addr, void *data)
+{
+       struct mm_struct *mm = (struct mm_struct *)data;
+
+       pte_clear(mm, addr, ptep);
+       xen_invlpg(addr);
+       return 0;
+}
+
+static int umap_uaddr(struct mm_struct *mm, unsigned long address)
+{
+       return apply_to_page_range(mm, address, PAGE_SIZE, umap_uaddr_fn, mm);
+}
+
+static void
+process_backdev_request(struct backdev_info *info)
+{
+    request_queue_t *rq;
+    struct tap_blkif *uinfo;
+    struct request *req;
+    blkif_request_t blkif_req;
+    blkif_request_t *target;
+    pending_req_t *pending_req;
+    blkif_t *blkif = NULL;
+    int queued;
+    unsigned long uvaddr, kvaddr;
+    pte_t pte;
+    unsigned int fsect, lsect;
+    struct bio *bio;
+    struct bio_vec *bvec;
+    int idx;
+    int usr_idx;
+    int pending_idx;
+    uint16_t mmap_idx;
+    unsigned long offset;
+    struct page *pg;
+    int nr_sects = 0;
+
+    uinfo = info->uinfo;
+    rq = info->gd->queue;
+    blkif = uinfo->blkif;
+
+    DPRINTK("Entered do_backdev_request %d\n", uinfo->minor);
+
+    queued = 0;
+
+    while ((req = elv_next_request(rq)) != NULL) {
+       if (!blk_fs_request(req)) {
+           end_request(req, 0);
+           continue;
+       }
+       if (blk_barrier_rq(req)) {
+           end_request(req, 0);
+           continue;
+       }
+       if (info != req->rq_disk->private_data) {
+           end_request(req, 0);
+           continue;
+       }
+       if (uinfo->dev_inuse == 0) {
+           DPRINTK("device no longer in use %d\n", info->uinfo->minor);
+           end_request(req, 0);
+           continue;
+       }
+
+       if (RING_FULL(&uinfo->ufe_ring)) {
+         wait:
+           /* Avoid pointless unplugs. */
+           blk_stop_queue(rq);
+           break;
+       }
+
+       /* Check we have space on user ring - should never fail. */
+       usr_idx = GET_NEXT_REQ(uinfo->idx_map);
+       if (usr_idx == INVALID_REQ)
+           goto wait;
+
+       pending_req = alloc_req();
+       if (pending_req == NULL) {
+           blkif->st_oo_req++;
+           blkif->waiting_reqs = 1;
+           goto wait;
+       }
+       pending_req->inuse = 2;
+
+       pending_idx = RTN_PEND_IDX(pending_req, pending_req->mem_idx);
+       mmap_idx = pending_req->mem_idx;
+       DPRINTK("request pending_req %p blkif %p pending_idx %d mmap_idx %d\n",
+               pending_req, blkif, pending_idx, mmap_idx);
+
+       DPRINTK("do_blk_req %p: dev %d cmd %p, sec %llx, "
+               "(%u/%li) buffer:%p [%s]\n",
+               req, uinfo->minor, req->cmd, (long long)req->sector,
+               req->current_nr_sectors,
+               req->nr_sectors, req->buffer,
+               rq_data_dir(req) ? "write" : "read");
+
+       blkdev_dequeue_request(req);
+
+       blkif_req.id = usr_idx;
+       blkif_req.sector_number = (blkif_sector_t)req->sector;
+       blkif_req.handle = uinfo->trans.busid;
+       blkif_req.operation = rq_data_dir(req) ?
+           BLKIF_OP_WRITE : BLKIF_OP_READ;
+
+       blkif_req.nr_segments = 0;
+       rq_for_each_bio (bio, req) {
+           bio_for_each_segment (bvec, bio, idx) {
+               BUG_ON(blkif_req.nr_segments ==
+                      BLKIF_MAX_SEGMENTS_PER_REQUEST);
+               fsect = bvec->bv_offset >> 9;
+               lsect = fsect + (bvec->bv_len >> 9) - 1;
+               nr_sects += bvec->bv_len >> 9;
+
+               blkif_req.seg[blkif_req.nr_segments] =
+                   (struct blkif_request_segment) {
+                       .gref       = 0,
+                       .first_sect = fsect,
+                       .last_sect  = lsect };
+
+               uvaddr = MMAP_VADDR(uinfo->user_vstart, usr_idx,
+                                   blkif_req.nr_segments);
+               kvaddr = idx_to_kaddr(mmap_idx, pending_idx,
+                                     blkif_req.nr_segments);
+               
+               pte = mk_pte(bvec->bv_page, uinfo->vma->vm_page_prot);
+               map_uaddr(uinfo->vma->vm_mm, uvaddr, pte_mkwrite(pte));
+               map_uaddr(&init_mm, kvaddr,
+                         mk_pte(bvec->bv_page, PAGE_KERNEL));
+               set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
+               offset = (uvaddr - uinfo->vma->vm_start) >> PAGE_SHIFT;
+               pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+               DPRINTK("mapped uaddr %08lx to page %p\n", uvaddr, pg);
+               ((struct page **)uinfo->vma->vm_private_data)[offset] = pg;
+               SetPageReserved(pg);
+               if (0) if (vm_insert_page(uinfo->vma, uvaddr, pg))
+                   DPRINTK("boohoo\n");
+               pending_handle(mmap_idx, pending_idx,
+                              blkif_req.nr_segments).kernel =
+                   INVALID_GRANT_HANDLE;
+               pending_handle(mmap_idx, pending_idx,
+                              blkif_req.nr_segments).user =
+                   INVALID_GRANT_HANDLE;
+
+               blkif_req.nr_segments++;
+           }
+       }
+
+       pending_req->blkif     = blkif;
+       pending_req->id        = (unsigned long)req;
+       pending_req->operation = blkif_req.operation;
+       pending_req->status    = BLKIF_RSP_OKAY;
+       pending_req->nr_pages  = blkif_req.nr_segments;
+
+       /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
+       uinfo->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
+
+       blkif_get(blkif);
+       /* Finally, write the request message to the user ring. */
+       target = RING_GET_REQUEST(&uinfo->ufe_ring,
+                                 uinfo->ufe_ring.req_prod_pvt);
+       memcpy(target, &blkif_req, sizeof(blkif_req));
+       target->id = usr_idx;
+       wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
+       uinfo->ufe_ring.req_prod_pvt++;
+
+       if (rq_data_dir(req)) {
+           blkif->st_wr_sect += nr_sects;
+           blkif->st_wr_req++;
+       } else {
+           blkif->st_rd_sect += nr_sects;
+           blkif->st_rd_req++;
+       }
+
+       queued++;
+    }
+
+    if (queued != 0)
+       blktap_kick_user(blkif->dev_num);
+    return;
+}
+
+static void
+do_backdev_request(request_queue_t *rq)
+{
+       struct backdev_info *info;
+
+       info = rq->queuedata;
+       if (info->uinfo->blkif) {
+               info->uinfo->blkif->waiting_reqs = 1;
+               wake_up(&info->uinfo->blkif->wq);
+       }
+       DPRINTK("got requests for dev %d wake %p/%p\n", info->uinfo->minor,
+               info->uinfo->blkif, &info->uinfo->blkif->wq);
+}
+
+void
+backdev_finish_req(struct tap_blkif *info, int usr_idx, blkif_response_t *res,
+                  struct pending_req *pending_req)
+{
+       struct request *req;
+       int uptodate, ret;
+       int pending_idx, mmap_idx;
+       int i;
+
+       pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
+       mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
+
+       down_write(&info->vma->vm_mm->mmap_sem);
+       zap_page_range(info->vma, 
+                      MMAP_VADDR(info->user_vstart, usr_idx, 0), 
+                      pending_req->nr_pages << PAGE_SHIFT, NULL);
+       for (i = 0; i < pending_req->nr_pages; i++)
+           umap_uaddr(&init_mm, idx_to_kaddr(mmap_idx, pending_idx, i));
+       up_write(&info->vma->vm_mm->mmap_sem);
+
+       info->idx_map[usr_idx] = INVALID_REQ;
+
+       spin_lock_irq(&backdev_io_lock);
+
+       req = (struct request *)(unsigned long)pending_req->id;
+       DPRINTK("req %p res status %d operation %d/%d id %lld\n", req,
+               res->status, res->operation, pending_req->operation, res->id);
+       uptodate = (res->status == BLKIF_RSP_OKAY);
+       switch (pending_req->operation) {
+#if 0
+       case BLKIF_OP_WRITE_BARRIER:
+               if (unlikely(res->status == BLKIF_RSP_EOPNOTSUPP)) {
+                       printk("backdev: %s: write barrier op failed\n",
+                              info->gd->disk_name);
+                       uptodate = -EOPNOTSUPP;
+                       info->feature_barrier = 0;
+                       xlvbd_barrier(info);
+               }
+               /* fall through */
+#endif
+       case BLKIF_OP_READ:
+       case BLKIF_OP_WRITE:
+               if (unlikely(res->status != BLKIF_RSP_OKAY))
+                       DPRINTK("Bad return from blkdev data "
+                               "request: %x\n", res->status);
+
+               ret = end_that_request_first(req, uptodate,
+                                            req->hard_nr_sectors);
+               BUG_ON(ret);
+               end_that_request_last(req, uptodate);
+               break;
+       default:
+               BUG();
+       }
+
+       if (info->blkif)
+               info->blkif->waiting_reqs = 1;
+
+       spin_unlock_irq(&backdev_io_lock);
+}
+
+void
+backdev_restart_queue(struct tap_blkif *uinfo)
+{
+       struct backdev_info *info;
+
+       if (uinfo == NULL)
+               return;
+       info = uinfo->backdev;
+       if (info == NULL || info->gd == NULL || info->gd->queue == NULL)
+               return;
+
+       if (!RING_FULL(&uinfo->ufe_ring)) {
+               spin_lock_irq(&backdev_io_lock);
+               /* Re-enable calldowns. */
+               if (blk_queue_stopped(info->gd->queue))
+                       blk_start_queue(info->gd->queue);
+               /* Kick things off immediately. */
+               process_backdev_request(info);
+               spin_unlock_irq(&backdev_io_lock);
+       }
+       return;
+}
+
+int
+create_backdev(struct tap_blkif *uinfo)
+{
+       struct gendisk *gd = NULL;
+       struct request_queue *rq = NULL;
+       struct backdev_info *info = NULL;
+       unsigned long long capacity;
+       unsigned long sector_size;
+       char *path = NULL, *s = NULL;
+       int minor = uinfo->minor;
+       unsigned short domid = uinfo->trans.domid;
+       unsigned short busid = uinfo->trans.busid;
+       int err;
+
+       if (uinfo->backdev)
+               return -EEXIST;
+
+       DPRINTK("create_backdev minor %d domid %d busid %d\n",
+               minor, domid, busid);
+       err = -ENOMEM;
+       path = kasprintf(GFP_KERNEL, "/local/domain/0/backend/tap/%d/%d",
+                        domid, busid);
+       if (path == NULL)
+               goto error;
+       err = xenbus_gather(XBT_NIL, path, "sectors", "%Lu", &capacity,
+                           "sector-size", "%lu", &sector_size, NULL);
+       if (err)
+               goto error;
+       DPRINTK("create_backdev sectors %Lu sector-size %lu\n",
+               capacity, sector_size);
+
+       err = -ENODEV;
+
+       gd = alloc_disk(1);
+       if (gd == NULL)
+               goto error;
+
+       if (minor < 26)
+               sprintf(gd->disk_name, "backdev%c", 'a' + minor);
+       else
+               sprintf(gd->disk_name, "backdev%c%c", 'a' + ((minor/26)-1),
+                       'a' + (minor%26));
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (info == NULL)
+               goto error;
+
+       info->uinfo = uinfo;
+
+       gd->major = backdev_major;
+       gd->first_minor = minor;
+       gd->fops = &backdev_fops;
+       gd->private_data = info;
+       set_capacity(gd, capacity);
+
+       rq = blk_init_queue(do_backdev_request, &backdev_io_lock);
+       if (rq == NULL)
+               goto error;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+       elevator_init(rq, "noop");
+#else
+       elevator_init(rq, &elevator_noop);
+#endif
+
+       /* Hard sector size and max sectors impersonate the equiv. hardware. */
+       blk_queue_hardsect_size(rq, sector_size);
+       blk_queue_max_sectors(rq, 512);
+
+       /* Each segment in a request is up to an aligned page in size. */
+       blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+       blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+       /* Ensure a merged request will fit in a single I/O ring slot. */
+       blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+       blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+       /* Make sure buffer addresses are sector-aligned. */
+       blk_queue_dma_alignment(rq, 511);
+
+       rq->queuedata = info;
+
+       gd->queue = rq;
+
+       info->gd = gd;
+
+       add_disk(gd);
+
+       s = kasprintf(GFP_KERNEL, "%d:%d", backdev_major, minor);
+       if (s == NULL)
+               goto error;
+       err = xenbus_write(XBT_NIL, path, "backdev-node", s);
+       if (err)
+               goto error;
+
+       uinfo->backdev = info;
+
+       goto out;
+
+ error:
+       kfree(info);
+       if (gd)
+               del_gendisk(gd);
+       if (rq)
+               blk_cleanup_queue(rq);
+ out:
+       kfree(s);
+       kfree(path);
+       return err;
+}
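
The backdev_getgeo() above fabricates a CHS geometry that is merely
consistent with the device's capacity, the same trick blkfront uses: 255
heads, 63 sectors per track, and cylinders = capacity / (255 * 63),
saturating when the 16-bit cylinder field cannot reach the full capacity.
A standalone sketch of that arithmetic (the 10 GiB example capacity is an
assumption, not from the commit):

    #include <stdint.h>
    #include <stdio.h>

    /* Userspace mirror of backdev_getgeo()'s fake geometry (sketch only).
     * hd_geometry.cylinders is 16 bits, so saturate when CHS can't reach. */
    int main(void)
    {
            uint64_t nsect = 20971520;          /* 10 GiB in 512-byte sectors */
            unsigned heads = 0xff, sectors = 0x3f;
            uint16_t cylinders = nsect / (heads * sectors);

            if ((uint64_t)(cylinders + 1) * heads * sectors < nsect)
                    cylinders = 0xffff;         /* capacity out of CHS reach */
            printf("C/H/S = %u/%u/%u\n", cylinders, heads, sectors);
            return 0;
    }

This prints C/H/S = 1305/255/63 for the 10 GiB case; the clamp only fires
for disks larger than 0xffff * 255 * 63 sectors (roughly 502 GiB).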
diff --git a/drivers/xen/blktap/backdev.h b/drivers/xen/blktap/backdev.h
new file mode 100644 (file)
index 0000000..a885b9e
--- /dev/null
@@ -0,0 +1,16 @@
+
+struct tap_blkif;
+struct pending_req;
+
+struct backdev_info {
+       int users;
+       struct gendisk *gd;
+       struct tap_blkif *uinfo;
+};
+
+extern int register_backdev(void);
+extern int create_backdev(struct tap_blkif *);
+extern int destroy_backdev(struct tap_blkif *);
+extern void backdev_finish_req(struct tap_blkif *, int, blkif_response_t *,
+                              struct pending_req *);
+extern void backdev_restart_queue(struct tap_blkif *);
diff --git a/drivers/xen/blktap/blktap.c b/drivers/xen/blktap/blktap.c
index cd555a8e778ecbed939ea6ece5ee1d07bd0381e5..2ab35eadf6d6a0b5fda1237f64870811c55612f9 100644 (file)
 #include <linux/nsproxy.h>
 #include <asm/tlbflush.h>
 
+#include "blktap.h"
+#include "backdev.h"
+
 #define MAX_TAP_DEV 256     /*the maximum number of tapdisk ring devices    */
 #define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
 
-/*
- * The maximum number of requests that can be outstanding at any time
- * is determined by 
- *
- *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
- *
- * where mmap_alloc < MAX_DYNAMIC_MEM.
- *
- * TODO:
- * mmap_alloc is initialised to 2 and should be adjustable on the fly via
- * sysfs.
- */
-#define BLK_RING_SIZE          __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
-#define MAX_DYNAMIC_MEM                BLK_RING_SIZE
-#define MAX_PENDING_REQS       BLK_RING_SIZE
-#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#define MMAP_VADDR(_start, _req,_seg)                                   \
-        (_start +                                                       \
-         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
-         ((_seg) * PAGE_SIZE))
 static int blkif_reqs = MAX_PENDING_REQS;
 static int mmap_pages = MMAP_PAGES;
 
@@ -88,34 +71,6 @@ static int mmap_pages = MMAP_PAGES;
                      * memory rings.
                      */
 
-/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
-typedef struct domid_translate {
-       unsigned short domid;
-       unsigned short busid;
-} domid_translate_t ;
-
-/*Data struct associated with each of the tapdisk devices*/
-typedef struct tap_blkif {
-       struct vm_area_struct *vma;   /*Shared memory area                   */
-       unsigned long rings_vstart;   /*Kernel memory mapping                */
-       unsigned long user_vstart;    /*User memory mapping                  */
-       unsigned long dev_inuse;      /*One process opens device at a time.  */
-       unsigned long dev_pending;    /*In process of being opened           */
-       unsigned long ring_ok;        /*make this ring->state                */
-       blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
-       wait_queue_head_t wait;       /*for poll                             */
-       unsigned long mode;           /*current switching mode               */
-       int minor;                    /*Minor number for tapdisk device      */
-       pid_t pid;                    /*tapdisk process id                   */
-       struct pid_namespace *pid_ns; /*... and its corresponding namespace  */
-       enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
-                                                 shutdown                   */
-       unsigned long *idx_map;       /*Record the user ring id to kern 
-                                       [req id, idx] tuple                  */
-       blkif_t *blkif;               /*Associate blkif with tapdev          */
-       struct domid_translate trans; /*Translation from domid to bus.       */
-} tap_blkif_t;
-
 static struct tap_blkif *tapfds[MAX_TAP_DEV];
 static int blktap_next_minor;
 
@@ -126,52 +81,15 @@ static unsigned int debug_lvl = 0;
 module_param(log_stats, int, 0644);
 module_param(debug_lvl, int, 0644);
 
-/*
- * Each outstanding request that we've passed to the lower device layers has a 
- * 'pending_req' allocated to it. Each buffer_head that completes decrements 
- * the pendcnt towards zero. When it hits zero, the specified domain has a 
- * response queued for it, with the saved 'id' passed back.
- */
-typedef struct {
-       blkif_t       *blkif;
-       u64            id;
-       unsigned short mem_idx;
-       int            nr_pages;
-       atomic_t       pendcnt;
-       unsigned short operation;
-       int            status;
-       struct list_head free_list;
-       int            inuse;
-} pending_req_t;
-
-static pending_req_t *pending_reqs[MAX_PENDING_REQS];
+pending_req_t *pending_reqs[MAX_PENDING_REQS];
 static struct list_head pending_free;
 static DEFINE_SPINLOCK(pending_free_lock);
 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
 static int alloc_pending_reqs;
 
-typedef unsigned int PEND_RING_IDX;
-
-static inline int MASK_PEND_IDX(int i) { 
-       return (i & (MAX_PENDING_REQS-1));
-}
-
-static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
-       return (req - pending_reqs[idx]);
-}
-
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
 #define BLKBACK_INVALID_HANDLE (~0)
 
-static struct page **foreign_pages[MAX_DYNAMIC_MEM];
-static inline unsigned long idx_to_kaddr(
-       unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
-{
-       unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
-       unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
-       return (unsigned long)pfn_to_kaddr(pfn);
-}
+struct page **foreign_pages[MAX_DYNAMIC_MEM];
 
 static unsigned short mmap_alloc = 0;
 static unsigned short mmap_lock = 0;
@@ -217,6 +135,7 @@ static int blktap_major;
 #define BLKTAP_QUERY_ALLOC_REQS      8
 #define BLKTAP_IOCTL_FREEINTF        9
 #define BLKTAP_IOCTL_PRINT_IDXS      100  
+#define BLKTAP_IOCTL_BACKDEV_SETUP   200
 
 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
 #define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
@@ -234,39 +153,6 @@ static inline int BLKTAP_MODE_VALID(unsigned long arg)
                 (arg == BLKTAP_MODE_INTERPOSE   ));
 }
 
-/* Requests passing through the tap to userspace are re-assigned an ID.
- * We must record a mapping between the BE [IDX,ID] tuple and the userspace
- * ring ID. 
- */
-
-static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
-{
-        return ((fe_dom << 16) | MASK_PEND_IDX(idx));
-}
-
-extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
-{
-        return (PEND_RING_IDX)(id & 0x0000ffff);
-}
-
-extern inline int ID_TO_MIDX(unsigned long id)
-{
-        return (int)(id >> 16);
-}
-
-#define INVALID_REQ 0xdead0000
-
-/*TODO: Convert to a free list*/
-static inline int GET_NEXT_REQ(unsigned long *idx_map)
-{
-       int i;
-       for (i = 0; i < MAX_PENDING_REQS; i++)
-               if (idx_map[i] == INVALID_REQ)
-                       return i;
-
-       return INVALID_REQ;
-}
-
 static inline int OFFSET_TO_USR_IDX(int offset)
 {
        return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
@@ -432,6 +318,7 @@ static tap_blkif_t *get_next_free_dev(void)
                /* we could have failed a previous attempt. */
                if (!info ||
                    ((info->dev_inuse == 0) &&
+                    (info->backdev == NULL) &&
                     (info->dev_pending == 0)) ) {
                        info->dev_pending = 1;
                        goto found;
@@ -584,6 +471,7 @@ static int blktap_open(struct inode *inode, struct file *filp)
 static int blktap_release(struct inode *inode, struct file *filp)
 {
        tap_blkif_t *info = filp->private_data;
+       int ret;
        
        /* check for control device */
        if (!info)
@@ -620,6 +508,12 @@ static int blktap_release(struct inode *inode, struct file *filp)
                info->status = CLEANSHUTDOWN;
        }
 
+       ret = destroy_backdev(info);
+       if (ret && ret != -EBUSY)
+               WPRINTK("destroy_backdev failed %d\n", ret);
+
+       filp->private_data = NULL;      
+
        return 0;
 }
 
@@ -783,6 +677,7 @@ static int blktap_ioctl(struct inode *inode, struct file *filp,
                unsigned long dev = arg;
                unsigned long flags;
 
+               DPRINTK("FREEINTF Req for dev %ld\n", dev);
                info = tapfds[dev];
 
                if ((dev > MAX_TAP_DEV) || !info)
@@ -815,6 +710,16 @@ static int blktap_ioctl(struct inode *inode, struct file *filp,
                       alloc_pending_reqs, blkif_reqs);
                return (alloc_pending_reqs/blkif_reqs) * 100;
        }
+
+       case BLKTAP_IOCTL_BACKDEV_SETUP:
+       {
+               unsigned long dev = arg;
+
+               DPRINTK("BLKTAP_IOCTL_BACKDEV_SETUP ioctl: %ld\n", dev);
+               info = tapfds[dev];
+
+               return create_backdev(info);
+       }
        }
        return -ENOIOCTLCMD;
 }
@@ -914,7 +819,7 @@ static void mmap_req_del(int mmap)
        mmap_alloc--;
 }
 
-static pending_req_t* alloc_req(void)
+pending_req_t* alloc_req(void)
 {
        pending_req_t *req = NULL;
        unsigned long flags;
@@ -935,7 +840,7 @@ static pending_req_t* alloc_req(void)
        return req;
 }
 
-static void free_req(pending_req_t *req)
+void free_req(pending_req_t *req)
 {
        unsigned long flags;
        int was_empty;
@@ -1077,6 +982,8 @@ int tap_blkif_schedule(void *arg)
 
                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;
+               else
+                       backdev_restart_queue(tapfds[blkif->dev_num]);
 
                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
@@ -1105,6 +1012,7 @@ static int blktap_read_ufe_ring(tap_blkif_t *info)
        blkif_t *blkif=NULL;
        int pending_idx, usr_idx, mmap_idx;
        pending_req_t *pending_req;
+       struct page **map;
        
        if (!info)
                return 0;
@@ -1140,31 +1048,35 @@ static int blktap_read_ufe_ring(tap_blkif_t *info)
 
                pending_req = &pending_reqs[mmap_idx][pending_idx];
                blkif = pending_req->blkif;
+               map = info->vma->vm_private_data;
 
                for (j = 0; j < pending_req->nr_pages; j++) {
-
-                       unsigned long kvaddr, uvaddr;
-                       struct page **map = info->vma->vm_private_data;
-                       struct page *pg;
+                       unsigned long uvaddr;
                        int offset;
 
                        uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
-                       kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
 
-                       pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
-                       ClearPageReserved(pg);
-                       offset = (uvaddr - info->vma->vm_start) 
-                               >> PAGE_SHIFT;
+                       offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
+                       ClearPageReserved(map[offset]);
                        map[offset] = NULL;
                }
-               fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
-               info->idx_map[usr_idx] = INVALID_REQ;
-               make_response(blkif, pending_req->id, res.operation,
-                             res.status);
+               if (pending_req->inuse == 2)
+                       backdev_finish_req(info, usr_idx, &res, pending_req);
+               else {
+                       fast_flush_area(pending_req, pending_idx, usr_idx,
+                                       info->minor);
+
+                       info->idx_map[usr_idx] = INVALID_REQ;
+                       make_response(blkif, pending_req->id, res.operation,
+                                     res.status);
+               }
                blkif_put(pending_req->blkif);
                free_req(pending_req);
        }
                
+       if (info->blkif && info->blkif->waiting_reqs)
+               wake_up(&info->blkif->wq);
+
        return 0;
 }
 
@@ -1200,6 +1112,9 @@ static int do_block_io_op(blkif_t *blkif)
        int more_to_do = 0;
        tap_blkif_t *info;
 
+       if (!tap_blkif_connected(blkif))
+               return 0;
+
        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
 
@@ -1625,6 +1540,12 @@ static int __init blkif_init(void)
 
        tap_blkif_xenbus_init();
 
+       ret = register_backdev();
+       if (ret < 0) {
+               WPRINTK("Couldn't register /dev/xen/backdev\n");
+               return ret;
+       }
+
        /* Dynamically allocate a major for this device */
        ret = register_chrdev(0, "blktap", &blktap_fops);
 
diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
new file mode 100644 (file)
index 0000000..31ff6af
--- /dev/null
@@ -0,0 +1,128 @@
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by 
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE          __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM                BLK_RING_SIZE
+#define MAX_PENDING_REQS       BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg)                                   \
+        (_start +                                                       \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;
+} domid_translate_t ;
+
+/*Data struct associated with each of the tapdisk devices*/
+typedef struct tap_blkif {
+       struct vm_area_struct *vma;   /*Shared memory area                   */
+       unsigned long rings_vstart;   /*Kernel memory mapping                */
+       unsigned long user_vstart;    /*User memory mapping                  */
+       unsigned long dev_inuse;      /*One process opens device at a time.  */
+       unsigned long dev_pending;    /*In process of being opened           */
+       unsigned long ring_ok;        /*make this ring->state                */
+       blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
+       wait_queue_head_t wait;       /*for poll                             */
+       unsigned long mode;           /*current switching mode               */
+       int minor;                    /*Minor number for tapdisk device      */
+       pid_t pid;                    /*tapdisk process id                   */
+       struct pid_namespace *pid_ns; /*... and its corresponding namespace  */
+       enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
+                                                 shutdown                   */
+       unsigned long *idx_map;       /*Record the user ring id to kern 
+                                       [req id, idx] tuple                  */
+       blkif_t *blkif;               /*Associate blkif with tapdev          */
+       struct domid_translate trans; /*Translation from domid to bus.       */
+       struct backdev_info *backdev; /*Backend domain device info           */
+} tap_blkif_t;
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct pending_req {
+       blkif_t       *blkif;
+       u64            id;
+       unsigned short mem_idx;
+       int            nr_pages;
+       atomic_t       pendcnt;
+       unsigned short operation;
+       int            status;
+       struct list_head free_list;
+       int            inuse;
+} pending_req_t;
+
+extern pending_req_t *pending_reqs[MAX_PENDING_REQS];
+
+typedef unsigned int PEND_RING_IDX;
+
+static inline int MASK_PEND_IDX(int i) { 
+       return (i & (MAX_PENDING_REQS-1));
+}
+
+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
+       return (req - pending_reqs[idx]);
+}
+
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+extern struct page **foreign_pages[MAX_DYNAMIC_MEM];
+static inline unsigned long idx_to_kaddr(
+       unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
+{
+       unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
+       unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
+       return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+/* Requests passing through the tap to userspace are re-assigned an ID.
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
+ * ring ID. 
+ */
+
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+       return ((fe_dom << 16) | MASK_PEND_IDX(idx));
+}
+
+extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
+{
+       return (PEND_RING_IDX)(id & 0x0000ffff);
+}
+
+extern inline int ID_TO_MIDX(unsigned long id)
+{
+       return (int)(id >> 16);
+}
+
+#define INVALID_REQ 0xdead0000
+
+/*TODO: Convert to a free list*/
+static inline int GET_NEXT_REQ(unsigned long *idx_map)
+{
+       int i;
+       for (i = 0; i < MAX_PENDING_REQS; i++)
+               if (idx_map[i] == INVALID_REQ)
+                       return i;
+
+       return INVALID_REQ;
+}
+
+pending_req_t* alloc_req(void);
+void free_req(pending_req_t *req);
+
+void blktap_kick_user(int idx);
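
The MAKE_ID()/ID_TO_IDX()/ID_TO_MIDX() helpers moved into this header pack
a [mmap_idx, pending_idx] pair into the single id field carried on the user
ring: high 16 bits for the mmap index, low 16 bits for the masked pending
index. A round-trip sketch of the packing (MAX_PENDING_REQS is assumed to
be 32 here, the usual BLK_RING_SIZE for 4 KiB pages):

    #include <stdio.h>

    /* Userspace mirror of the id packing in blktap.h (sketch only). */
    #define MAX_PENDING_REQS 32                /* assumed BLK_RING_SIZE */
    #define MASK_PEND_IDX(i)   ((i) & (MAX_PENDING_REQS - 1))
    #define MAKE_ID(midx, idx) (((unsigned long)(midx) << 16) | MASK_PEND_IDX(idx))
    #define ID_TO_IDX(id)      ((unsigned long)(id) & 0x0000ffff)
    #define ID_TO_MIDX(id)     ((unsigned long)(id) >> 16)

    int main(void)
    {
            unsigned long id = MAKE_ID(1, 17);
            /* Prints: id=0x10011 mmap_idx=1 pending_idx=17 */
            printf("id=%#lx mmap_idx=%lu pending_idx=%lu\n",
                   id, ID_TO_MIDX(id), ID_TO_IDX(id));
            return 0;
    }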
diff --git a/drivers/xen/blktap/common.h b/drivers/xen/blktap/common.h
index 520f6640ee98f2f17c911ca28dfaf41da582334d..34540a7bc2a1c41328859fd86b174a56edfc655a 100644 (file)
@@ -93,6 +93,7 @@ void tap_blkif_kmem_cache_free(blkif_t *blkif);
 int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, 
                  unsigned int evtchn);
 void tap_blkif_unmap(blkif_t *blkif);
+int tap_blkif_connected(blkif_t *blkif);
 
 #define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
 #define blkif_put(_b)                                  \
diff --git a/drivers/xen/blktap/xenbus.c b/drivers/xen/blktap/xenbus.c
index 29f5513a0b31fa144527054592383469eb17d34e..4a8c841d99237797563dc81cad4ca24dc050a797 100644 (file)
@@ -193,7 +193,6 @@ static int blktap_remove(struct xenbus_device *dev)
 static void tap_update_blkif_status(blkif_t *blkif)
 { 
        int err;
-       char name[TASK_COMM_LEN];
 
        /* Not ready to connect? */
        if(!blkif->irq || !blkif->sectors) {
@@ -209,12 +208,6 @@ static void tap_update_blkif_status(blkif_t *blkif)
        if (blkif->be->dev->state != XenbusStateConnected)
                return;
 
-       err = blktap_name(blkif, name);
-       if (err) {
-               xenbus_dev_error(blkif->be->dev, err, "get blktap dev name");
-               return;
-       }
-
        if (!blkif->be->group_added) {
                err = xentap_sysfs_addif(blkif->be->dev);
                if (err) {
@@ -223,14 +216,11 @@ static void tap_update_blkif_status(blkif_t *blkif)
                        return;
                }
        }
+}
 
-       blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name);
-       if (IS_ERR(blkif->xenblkd)) {
-               err = PTR_ERR(blkif->xenblkd);
-               blkif->xenblkd = NULL;
-               xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd");
-               WPRINTK("Error starting thread\n");
-       }
+int tap_blkif_connected(blkif_t *blkif)
+{
+       return (blkif->be->dev->state == XenbusStateConnected);
 }
 
 /**
@@ -243,6 +233,7 @@ static int blktap_probe(struct xenbus_device *dev,
                         const struct xenbus_device_id *id)
 {
        int err;
+       char name[TASK_COMM_LEN];
        struct backend_info *be = kzalloc(sizeof(struct backend_info),
                                          GFP_KERNEL);
        if (!be) {
@@ -267,6 +258,22 @@ static int blktap_probe(struct xenbus_device *dev,
        be->blkif->be = be;
        be->blkif->sectors = 0;
 
+       err = blktap_name(be->blkif, name);
+       if (err) {
+               xenbus_dev_error(be->dev, err, "get blktap dev name");
+               goto fail;
+       }
+       DPRINTK("blktap_probe %d dev %s\n", dev->otherend_id, name);
+
+       be->blkif->xenblkd = kthread_run(tap_blkif_schedule, be->blkif, name);
+       if (IS_ERR(be->blkif->xenblkd)) {
+               err = PTR_ERR(be->blkif->xenblkd);
+               be->blkif->xenblkd = NULL;
+               xenbus_dev_fatal(be->dev, err, "start xenblkd");
+               WPRINTK("Error starting thread\n");
+               goto fail;
+       }
+
        /* set a watch on disk info, waiting for userspace to update details*/
        err = xenbus_watch_path2(dev, dev->nodename, "info",
                                 &be->backend_watch, tap_backend_changed);