--- /dev/null
+
+#include <linux/cdrom.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/io/blkif.h>
+
+#include "common.h"
+#include "backdev.h"
+#include "blktap.h"
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
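+/*
+ * Protects backdev request handling; this is also the lock handed to
+ * blk_init_queue() as the request queue lock for every backdev gendisk.
+ */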
+DEFINE_SPINLOCK(backdev_io_lock);
+
+static int backdev_major;
+
+int
+register_backdev(void)
+{
+ int major;
+
+ /* Dynamically allocate a major for this device */
+ major = register_blkdev(0, "backdev");
+ if (major < 0) {
+ WPRINTK("Couldn't register /dev/xen/backdev\n");
+		return major;
+ }
+ backdev_major = major;
+ DPRINTK("backdev major %d\n", major);
+ return 0;
+}
+
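+/*
+ * Tear down a tapdev's backdev: stop the queue, remove the gendisk and
+ * free the per-device state.  Fails with -EBUSY while the block device
+ * is still held open.
+ */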
+int
+destroy_backdev(struct tap_blkif *uinfo)
+{
+ struct backdev_info *info = uinfo->backdev;
+
+	if (info == NULL)
+		return 0;
+
+	DPRINTK("destroy backdev %d users %d\n", uinfo->minor, info->users);
+ if (info->users)
+ return -EBUSY;
+
+ spin_lock_irq(&backdev_io_lock);
+ /* No more blkif_request(). */
+ blk_stop_queue(info->gd->queue);
+ spin_unlock_irq(&backdev_io_lock);
+
+ del_gendisk(info->gd);
+ put_disk(info->gd);
+
+ blk_cleanup_queue(info->gd->queue);
+
+ uinfo->backdev = NULL;
+ kfree(info);
+
+ return 0;
+}
+
+static int
+backdev_open(struct inode *inode, struct file *filep)
+{
+ struct backdev_info *info = inode->i_bdev->bd_disk->private_data;
+ info->users++;
+ return 0;
+}
+
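+/*
+ * Drop an opener; if the tapdisk ring has already been shut down, try to
+ * destroy the backdev as well (destroy_backdev() refuses while users > 0).
+ */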
+static int
+backdev_release(struct inode *inode, struct file *filep)
+{
+ struct backdev_info *info = inode->i_bdev->bd_disk->private_data;
+ info->users--;
+ if (info->uinfo->dev_inuse == 0)
+ destroy_backdev(info->uinfo);
+ return 0;
+}
+
+static int
+backdev_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+ /* We don't have real geometry info, but let's at least return
+ values consistent with the size of the device */
+ sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t cylinders = nsect;
+
+ hg->heads = 0xff;
+ hg->sectors = 0x3f;
+ sector_div(cylinders, hg->heads * hg->sectors);
+ hg->cylinders = cylinders;
+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+ hg->cylinders = 0xffff;
+ return 0;
+}
+
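+/*
+ * Minimal ioctl support: CHS geometry for pre-2.6.16 kernels plus stubbed
+ * CDROM multisession and SCSI identity queries.
+ */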
+static int
+backdev_ioctl(struct inode *inode, struct file *filep,
+ unsigned command, unsigned long argument)
+{
+ int i;
+
+ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+ command, (long)argument, inode->i_rdev);
+
+ switch (command) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+ case HDIO_GETGEO: {
+ struct block_device *bd = inode->i_bdev;
+ struct hd_geometry geo;
+ int ret;
+
+ if (!argument)
+ return -EINVAL;
+
+ geo.start = get_start_sect(bd);
+ ret = backdev_getgeo(bd, &geo);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((struct hd_geometry __user *)argument, &geo,
+ sizeof(geo)))
+ return -EFAULT;
+
+ return 0;
+ }
+#endif
+ case CDROMMULTISESSION:
+ DPRINTK("FIXME: support multisession CDs later\n");
+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+ if (put_user(0, (char __user *)(argument + i)))
+ return -EFAULT;
+ return 0;
+
+ case SCSI_IOCTL_GET_IDLUN:
+ if (!access_ok(VERIFY_WRITE, argument,
+ sizeof(struct scsi_idlun)))
+ return -EFAULT;
+
+ /* return 0 for now. */
+ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
+ __put_user(0,
+ &((struct scsi_idlun __user *)argument)->host_unique_id);
+ return 0;
+
+ default:
+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+ command);*/
+ return -EINVAL; /* same return as native Linux */
+ }
+
+ return 0;
+}
+
+static struct block_device_operations backdev_fops = {
+ .owner = THIS_MODULE,
+ .open = backdev_open,
+ .release = backdev_release,
+ .ioctl = backdev_ioctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+ .getgeo = backdev_getgeo
+#endif
+};
+
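+/*
+ * Helpers that install/clear a single PTE via apply_to_page_range(); used
+ * to alias a request's data pages into both the tapdisk process's VMA and
+ * the kernel's per-request mmap area.
+ */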
+static int map_uaddr_fn(
+ pte_t *ptep, struct page *pmd_page, unsigned long addr, void *data)
+{
+ pte_t *pte = (pte_t *)data;
+
+ DPRINTK("map_uaddr ptep %p -> %012llx/%012llx\n", ptep, pte_val(*pte),
+ pte_val_ma(*pte));
+ set_pte(ptep, *pte);
+ xen_invlpg(addr);
+ return 0;
+}
+
+static int map_uaddr(struct mm_struct *mm, unsigned long address,
+ pte_t pte)
+{
+ return apply_to_page_range(mm, address, PAGE_SIZE, map_uaddr_fn, &pte);
+}
+
+static int umap_uaddr_fn(
+ pte_t *ptep, struct page *pmd_page, unsigned long addr, void *data)
+{
+ struct mm_struct *mm = (struct mm_struct *)data;
+
+ pte_clear(mm, addr, ptep);
+ xen_invlpg(addr);
+ return 0;
+}
+
+static int umap_uaddr(struct mm_struct *mm, unsigned long address)
+{
+ return apply_to_page_range(mm, address, PAGE_SIZE, umap_uaddr_fn, mm);
+}
+
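+/*
+ * Pull requests off the backdev block queue, map each bio segment's page
+ * into the tapdisk user ring area (and its matching kernel address), and
+ * forward the request to userspace on the ufe_ring.  Called with
+ * backdev_io_lock held.
+ */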
+static void
+process_backdev_request(struct backdev_info *info)
+{
+ request_queue_t *rq;
+ struct tap_blkif *uinfo;
+ struct request *req;
+ blkif_request_t blkif_req;
+ blkif_request_t *target;
+ pending_req_t *pending_req;
+ blkif_t *blkif = NULL;
+ int queued;
+ unsigned long uvaddr, kvaddr;
+ pte_t pte;
+ unsigned int fsect, lsect;
+ struct bio *bio;
+ struct bio_vec *bvec;
+ int idx;
+ int usr_idx;
+ int pending_idx;
+ uint16_t mmap_idx;
+ unsigned long offset;
+ struct page *pg;
+ int nr_sects = 0;
+
+ uinfo = info->uinfo;
+ rq = info->gd->queue;
+ blkif = uinfo->blkif;
+
+ DPRINTK("Entered do_backdev_request %d\n", uinfo->minor);
+
+ queued = 0;
+
+ while ((req = elv_next_request(rq)) != NULL) {
+ if (!blk_fs_request(req)) {
+ end_request(req, 0);
+ continue;
+ }
+ if (blk_barrier_rq(req)) {
+ end_request(req, 0);
+ continue;
+ }
+ if (info != req->rq_disk->private_data) {
+ end_request(req, 0);
+ continue;
+ }
+ if (uinfo->dev_inuse == 0) {
+ DPRINTK("device no longer in use %d\n", info->uinfo->minor);
+ end_request(req, 0);
+ continue;
+ }
+
+ if (RING_FULL(&uinfo->ufe_ring)) {
+ wait:
+ /* Avoid pointless unplugs. */
+ blk_stop_queue(rq);
+ break;
+ }
+
+ /* Check we have space on user ring - should never fail. */
+ usr_idx = GET_NEXT_REQ(uinfo->idx_map);
+ if (usr_idx == INVALID_REQ)
+ goto wait;
+
+ pending_req = alloc_req();
+ if (pending_req == NULL) {
+ blkif->st_oo_req++;
+ blkif->waiting_reqs = 1;
+ goto wait;
+ }
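+		/*
+		 * inuse == 2 marks a request that originated here rather
+		 * than on the frontend ring; the blktap response path uses
+		 * it to hand completion to backdev_finish_req().
+		 */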
+ pending_req->inuse = 2;
+
+ pending_idx = RTN_PEND_IDX(pending_req, pending_req->mem_idx);
+ mmap_idx = pending_req->mem_idx;
+ DPRINTK("request pending_req %p blkif %p pending_idx %d mmap_idx %d\n",
+ pending_req, blkif, pending_idx, mmap_idx);
+
+ DPRINTK("do_blk_req %p: dev %d cmd %p, sec %llx, "
+ "(%u/%li) buffer:%p [%s]\n",
+ req, uinfo->minor, req->cmd, (long long)req->sector,
+ req->current_nr_sectors,
+ req->nr_sectors, req->buffer,
+ rq_data_dir(req) ? "write" : "read");
+
+ blkdev_dequeue_request(req);
+
+ blkif_req.id = usr_idx;
+ blkif_req.sector_number = (blkif_sector_t)req->sector;
+ blkif_req.handle = uinfo->trans.busid;
+ blkif_req.operation = rq_data_dir(req) ?
+ BLKIF_OP_WRITE : BLKIF_OP_READ;
+
+ blkif_req.nr_segments = 0;
+ rq_for_each_bio (bio, req) {
+ bio_for_each_segment (bvec, bio, idx) {
+ BUG_ON(blkif_req.nr_segments ==
+ BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ fsect = bvec->bv_offset >> 9;
+ lsect = fsect + (bvec->bv_len >> 9) - 1;
+ nr_sects += bvec->bv_len >> 9;
+
+ blkif_req.seg[blkif_req.nr_segments] =
+ (struct blkif_request_segment) {
+ .gref = 0,
+ .first_sect = fsect,
+ .last_sect = lsect };
+
+ uvaddr = MMAP_VADDR(uinfo->user_vstart, usr_idx,
+ blkif_req.nr_segments);
+ kvaddr = idx_to_kaddr(mmap_idx, pending_idx,
+ blkif_req.nr_segments);
+
+ pte = mk_pte(bvec->bv_page, uinfo->vma->vm_page_prot);
+ map_uaddr(uinfo->vma->vm_mm, uvaddr, pte_mkwrite(pte));
+ map_uaddr(&init_mm, kvaddr,
+ mk_pte(bvec->bv_page, PAGE_KERNEL));
+ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
+ offset = (uvaddr - uinfo->vma->vm_start) >> PAGE_SHIFT;
+ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+ DPRINTK("mapped uaddr %08lx to page %p\n", uvaddr, pg);
+ ((struct page **)uinfo->vma->vm_private_data)[offset] = pg;
+ SetPageReserved(pg);
+ if (0) if (vm_insert_page(uinfo->vma, uvaddr, pg))
+ DPRINTK("boohoo\n");
+ pending_handle(mmap_idx, pending_idx,
+ blkif_req.nr_segments).kernel =
+ INVALID_GRANT_HANDLE;
+ pending_handle(mmap_idx, pending_idx,
+ blkif_req.nr_segments).user =
+ INVALID_GRANT_HANDLE;
+
+ blkif_req.nr_segments++;
+ }
+ }
+
+ pending_req->blkif = blkif;
+ pending_req->id = (unsigned long)req;
+ pending_req->operation = blkif_req.operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = blkif_req.nr_segments;
+
+ /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
+ uinfo->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
+
+ blkif_get(blkif);
+ /* Finally, write the request message to the user ring. */
+ target = RING_GET_REQUEST(&uinfo->ufe_ring,
+ uinfo->ufe_ring.req_prod_pvt);
+ memcpy(target, &blkif_req, sizeof(blkif_req));
+ target->id = usr_idx;
+ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
+ uinfo->ufe_ring.req_prod_pvt++;
+
+ if (rq_data_dir(req)) {
+ blkif->st_wr_sect += nr_sects;
+ blkif->st_wr_req++;
+ } else {
+ blkif->st_rd_sect += nr_sects;
+ blkif->st_rd_req++;
+ }
+
+ queued++;
+ }
+
+ if (queued != 0)
+ blktap_kick_user(blkif->dev_num);
+ return;
+}
+
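+/*
+ * Request function for the backdev queue: just poke the blkif worker
+ * thread, which calls back into process_backdev_request() via
+ * backdev_restart_queue().
+ */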
+static void
+do_backdev_request(request_queue_t *rq)
+{
+ struct backdev_info *info;
+
+ info = rq->queuedata;
+ if (info->uinfo->blkif) {
+ info->uinfo->blkif->waiting_reqs = 1;
+ wake_up(&info->uinfo->blkif->wq);
+ }
+ DPRINTK("got requests for dev %d wake %p/%p\n", info->uinfo->minor,
+ info->uinfo->blkif, &info->uinfo->blkif->wq);
+}
+
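+/*
+ * Completion path for a backdev-originated request: unmap the user and
+ * kernel aliases set up in process_backdev_request(), then complete the
+ * original struct request with the status returned by userspace.
+ */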
+void
+backdev_finish_req(struct tap_blkif *info, int usr_idx, blkif_response_t *res,
+ struct pending_req *pending_req)
+{
+ struct request *req;
+ int uptodate, ret;
+ int pending_idx, mmap_idx;
+ int i;
+
+ pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
+ mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
+
+ down_write(&info->vma->vm_mm->mmap_sem);
+ zap_page_range(info->vma,
+ MMAP_VADDR(info->user_vstart, usr_idx, 0),
+ pending_req->nr_pages << PAGE_SHIFT, NULL);
+ for (i = 0; i < pending_req->nr_pages; i++)
+ umap_uaddr(&init_mm, idx_to_kaddr(mmap_idx, pending_idx, i));
+ up_write(&info->vma->vm_mm->mmap_sem);
+
+ info->idx_map[usr_idx] = INVALID_REQ;
+
+ spin_lock_irq(&backdev_io_lock);
+
+ req = (struct request *)(unsigned long)pending_req->id;
+ DPRINTK("req %p res status %d operation %d/%d id %lld\n", req,
+ res->status, res->operation, pending_req->operation, res->id);
+ uptodate = (res->status == BLKIF_RSP_OKAY);
+ switch (pending_req->operation) {
+#if 0
+ case BLKIF_OP_WRITE_BARRIER:
+ if (unlikely(res->status == BLKIF_RSP_EOPNOTSUPP)) {
+ printk("backdev: %s: write barrier op failed\n",
+ info->gd->disk_name);
+ uptodate = -EOPNOTSUPP;
+ info->feature_barrier = 0;
+ xlvbd_barrier(info);
+ }
+ /* fall through */
+#endif
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ if (unlikely(res->status != BLKIF_RSP_OKAY))
+ DPRINTK("Bad return from blkdev data "
+ "request: %x\n", res->status);
+
+ ret = end_that_request_first(req, uptodate,
+ req->hard_nr_sectors);
+ BUG_ON(ret);
+ end_that_request_last(req, uptodate);
+ break;
+ default:
+ BUG();
+ }
+
+ if (info->blkif)
+ info->blkif->waiting_reqs = 1;
+
+ spin_unlock_irq(&backdev_io_lock);
+}
+
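+/*
+ * Restart a stopped backdev queue once there is room on the user ring
+ * again, and kick request processing immediately.
+ */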
+void
+backdev_restart_queue(struct tap_blkif *uinfo)
+{
+ struct backdev_info *info;
+
+ if (uinfo == NULL)
+ return;
+ info = uinfo->backdev;
+ if (info == NULL || info->gd == NULL || info->gd->queue == NULL)
+ return;
+
+ if (!RING_FULL(&uinfo->ufe_ring)) {
+ spin_lock_irq(&backdev_io_lock);
+ /* Re-enable calldowns. */
+ if (blk_queue_stopped(info->gd->queue))
+ blk_start_queue(info->gd->queue);
+ /* Kick things off immediately. */
+ process_backdev_request(info);
+ spin_unlock_irq(&backdev_io_lock);
+ }
+ return;
+}
+
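+/*
+ * Create the kernel-visible block device for a tapdev: read the disk size
+ * and sector size from the tap backend's xenstore directory, set up the
+ * gendisk and request queue, and publish "backdev-node" (major:minor)
+ * back to xenstore.
+ */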
+int
+create_backdev(struct tap_blkif *uinfo)
+{
+ struct gendisk *gd = NULL;
+ struct request_queue *rq = NULL;
+ struct backdev_info *info = NULL;
+ unsigned long long capacity;
+ unsigned long sector_size;
+ char *path = NULL, *s = NULL;
+ int minor = uinfo->minor;
+ unsigned short domid = uinfo->trans.domid;
+ unsigned short busid = uinfo->trans.busid;
+ int err;
+
+ if (uinfo->backdev)
+ return -EEXIST;
+
+ DPRINTK("create_backdev minor %d domid %d busid %d\n",
+ minor, domid, busid);
+ err = -ENOMEM;
+ path = kasprintf(GFP_KERNEL, "/local/domain/0/backend/tap/%d/%d",
+ domid, busid);
+ if (path == NULL)
+ goto error;
+ err = xenbus_gather(XBT_NIL, path, "sectors", "%Lu", &capacity,
+			    "sector-size", "%lu", &sector_size, NULL);
+ if (err)
+ goto error;
+ DPRINTK("create_backdev sectors %Lu sector-size %lu\n",
+ capacity, sector_size);
+
+ err = -ENODEV;
+
+ gd = alloc_disk(1);
+ if (gd == NULL)
+ goto error;
+
+ if (minor < 26)
+ sprintf(gd->disk_name, "backdev%c", 'a' + minor);
+ else
+ sprintf(gd->disk_name, "backdev%c%c", 'a' + ((minor/26)-1),
+ 'a' + (minor%26));
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (info == NULL)
+ goto error;
+
+ info->uinfo = uinfo;
+
+ gd->major = backdev_major;
+ gd->first_minor = minor;
+ gd->fops = &backdev_fops;
+ gd->private_data = info;
+ set_capacity(gd, capacity);
+
+ rq = blk_init_queue(do_backdev_request, &backdev_io_lock);
+ if (rq == NULL)
+ goto error;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+ elevator_init(rq, "noop");
+#else
+ elevator_init(rq, &elevator_noop);
+#endif
+
+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
+ blk_queue_hardsect_size(rq, sector_size);
+ blk_queue_max_sectors(rq, 512);
+
+ /* Each segment in a request is up to an aligned page in size. */
+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+ blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+ /* Ensure a merged request will fit in a single I/O ring slot. */
+ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+ /* Make sure buffer addresses are sector-aligned. */
+ blk_queue_dma_alignment(rq, 511);
+
+ rq->queuedata = info;
+
+ gd->queue = rq;
+
+ info->gd = gd;
+
+ add_disk(gd);
+
+ s = kasprintf(GFP_KERNEL, "%d:%d", backdev_major, minor);
+ if (s == NULL)
+ goto error;
+ err = xenbus_write(XBT_NIL, path, "backdev-node", s);
+ if (err)
+ goto error;
+
+ uinfo->backdev = info;
+
+ goto out;
+
+ error:
+ kfree(info);
+ if (gd)
+ del_gendisk(gd);
+ if (rq)
+ blk_cleanup_queue(rq);
+ out:
+ kfree(s);
+ kfree(path);
+ return err;
+}
#include <linux/nsproxy.h>
#include <asm/tlbflush.h>
+#include "blktap.h"
+#include "backdev.h"
+
#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */
-/*
- * The maximum number of requests that can be outstanding at any time
- * is determined by
- *
- * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
- *
- * where mmap_alloc < MAX_DYNAMIC_MEM.
- *
- * TODO:
- * mmap_alloc is initialised to 2 and should be adjustable on the fly via
- * sysfs.
- */
-#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
-#define MAX_DYNAMIC_MEM BLK_RING_SIZE
-#define MAX_PENDING_REQS BLK_RING_SIZE
-#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#define MMAP_VADDR(_start, _req,_seg) \
- (_start + \
- ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
- ((_seg) * PAGE_SIZE))
static int blkif_reqs = MAX_PENDING_REQS;
static int mmap_pages = MMAP_PAGES;
* memory rings.
*/
-/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
-typedef struct domid_translate {
- unsigned short domid;
- unsigned short busid;
-} domid_translate_t ;
-
-/*Data struct associated with each of the tapdisk devices*/
-typedef struct tap_blkif {
- struct vm_area_struct *vma; /*Shared memory area */
- unsigned long rings_vstart; /*Kernel memory mapping */
- unsigned long user_vstart; /*User memory mapping */
- unsigned long dev_inuse; /*One process opens device at a time. */
- unsigned long dev_pending; /*In process of being opened */
- unsigned long ring_ok; /*make this ring->state */
- blkif_front_ring_t ufe_ring; /*Rings up to user space. */
- wait_queue_head_t wait; /*for poll */
- unsigned long mode; /*current switching mode */
- int minor; /*Minor number for tapdisk device */
- pid_t pid; /*tapdisk process id */
- struct pid_namespace *pid_ns; /*... and its corresponding namespace */
- enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
- shutdown */
- unsigned long *idx_map; /*Record the user ring id to kern
- [req id, idx] tuple */
- blkif_t *blkif; /*Associate blkif with tapdev */
- struct domid_translate trans; /*Translation from domid to bus. */
-} tap_blkif_t;
-
static struct tap_blkif *tapfds[MAX_TAP_DEV];
static int blktap_next_minor;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);
-/*
- * Each outstanding request that we've passed to the lower device layers has a
- * 'pending_req' allocated to it. Each buffer_head that completes decrements
- * the pendcnt towards zero. When it hits zero, the specified domain has a
- * response queued for it, with the saved 'id' passed back.
- */
-typedef struct {
- blkif_t *blkif;
- u64 id;
- unsigned short mem_idx;
- int nr_pages;
- atomic_t pendcnt;
- unsigned short operation;
- int status;
- struct list_head free_list;
- int inuse;
-} pending_req_t;
-
-static pending_req_t *pending_reqs[MAX_PENDING_REQS];
+pending_req_t *pending_reqs[MAX_PENDING_REQS];
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
static int alloc_pending_reqs;
-typedef unsigned int PEND_RING_IDX;
-
-static inline int MASK_PEND_IDX(int i) {
- return (i & (MAX_PENDING_REQS-1));
-}
-
-static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
- return (req - pending_reqs[idx]);
-}
-
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
#define BLKBACK_INVALID_HANDLE (~0)
-static struct page **foreign_pages[MAX_DYNAMIC_MEM];
-static inline unsigned long idx_to_kaddr(
- unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
-{
- unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
- unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
- return (unsigned long)pfn_to_kaddr(pfn);
-}
+struct page **foreign_pages[MAX_DYNAMIC_MEM];
static unsigned short mmap_alloc = 0;
static unsigned short mmap_lock = 0;
#define BLKTAP_QUERY_ALLOC_REQS 8
#define BLKTAP_IOCTL_FREEINTF 9
#define BLKTAP_IOCTL_PRINT_IDXS 100
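+/* Create a kernel block device (backdev) on top of an existing tapdev ring. */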
+#define BLKTAP_IOCTL_BACKDEV_SETUP 200
/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
(arg == BLKTAP_MODE_INTERPOSE ));
}
-/* Requests passing through the tap to userspace are re-assigned an ID.
- * We must record a mapping between the BE [IDX,ID] tuple and the userspace
- * ring ID.
- */
-
-static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
-{
- return ((fe_dom << 16) | MASK_PEND_IDX(idx));
-}
-
-extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
-{
- return (PEND_RING_IDX)(id & 0x0000ffff);
-}
-
-extern inline int ID_TO_MIDX(unsigned long id)
-{
- return (int)(id >> 16);
-}
-
-#define INVALID_REQ 0xdead0000
-
-/*TODO: Convert to a free list*/
-static inline int GET_NEXT_REQ(unsigned long *idx_map)
-{
- int i;
- for (i = 0; i < MAX_PENDING_REQS; i++)
- if (idx_map[i] == INVALID_REQ)
- return i;
-
- return INVALID_REQ;
-}
-
static inline int OFFSET_TO_USR_IDX(int offset)
{
return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
/* we could have failed a previous attempt. */
if (!info ||
((info->dev_inuse == 0) &&
+ (info->backdev == NULL) &&
(info->dev_pending == 0)) ) {
info->dev_pending = 1;
goto found;
static int blktap_release(struct inode *inode, struct file *filp)
{
tap_blkif_t *info = filp->private_data;
+ int ret;
/* check for control device */
if (!info)
info->status = CLEANSHUTDOWN;
}
+ ret = destroy_backdev(info);
+ if (ret && ret != -EBUSY)
+ WPRINTK("destroy_backdev failed %d\n", ret);
+
+ filp->private_data = NULL;
+
return 0;
}
unsigned long dev = arg;
unsigned long flags;
+ DPRINTK("FREEINTF Req for dev %ld\n", dev);
info = tapfds[dev];
if ((dev > MAX_TAP_DEV) || !info)
alloc_pending_reqs, blkif_reqs);
return (alloc_pending_reqs/blkif_reqs) * 100;
}
+
+ case BLKTAP_IOCTL_BACKDEV_SETUP:
+ {
+ unsigned long dev = arg;
+
+ DPRINTK("BLKTAP_IOCTL_BACKDEV_SETUP ioctl: %ld\n", dev);
+		if (dev >= MAX_TAP_DEV)
+			return -EINVAL;
+		info = tapfds[dev];
+		if (!info)
+			return -EINVAL;
+
+ return create_backdev(info);
+ }
}
return -ENOIOCTLCMD;
}
mmap_alloc--;
}
-static pending_req_t* alloc_req(void)
+pending_req_t* alloc_req(void)
{
pending_req_t *req = NULL;
unsigned long flags;
return req;
}
-static void free_req(pending_req_t *req)
+void free_req(pending_req_t *req)
{
unsigned long flags;
int was_empty;
if (do_block_io_op(blkif))
blkif->waiting_reqs = 1;
+ else
+ backdev_restart_queue(tapfds[blkif->dev_num]);
if (log_stats && time_after(jiffies, blkif->st_print))
print_stats(blkif);
blkif_t *blkif=NULL;
int pending_idx, usr_idx, mmap_idx;
pending_req_t *pending_req;
+ struct page **map;
if (!info)
return 0;
pending_req = &pending_reqs[mmap_idx][pending_idx];
blkif = pending_req->blkif;
+ map = info->vma->vm_private_data;
for (j = 0; j < pending_req->nr_pages; j++) {
-
- unsigned long kvaddr, uvaddr;
- struct page **map = info->vma->vm_private_data;
- struct page *pg;
+ unsigned long uvaddr;
int offset;
uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
- kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
- pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
- ClearPageReserved(pg);
- offset = (uvaddr - info->vma->vm_start)
- >> PAGE_SHIFT;
+ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
+ ClearPageReserved(map[offset]);
map[offset] = NULL;
}
- fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
- info->idx_map[usr_idx] = INVALID_REQ;
- make_response(blkif, pending_req->id, res.operation,
- res.status);
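+	/* inuse == 2: the request was injected by the local backdev queue. */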
+ if (pending_req->inuse == 2)
+ backdev_finish_req(info, usr_idx, &res, pending_req);
+ else {
+ fast_flush_area(pending_req, pending_idx, usr_idx,
+ info->minor);
+
+ info->idx_map[usr_idx] = INVALID_REQ;
+ make_response(blkif, pending_req->id, res.operation,
+ res.status);
+ }
blkif_put(pending_req->blkif);
free_req(pending_req);
}
+ if (info->blkif && info->blkif->waiting_reqs)
+ wake_up(&info->blkif->wq);
+
return 0;
}
int more_to_do = 0;
tap_blkif_t *info;
+ if (!tap_blkif_connected(blkif))
+ return 0;
+
rc = blk_rings->common.req_cons;
rp = blk_rings->common.sring->req_prod;
tap_blkif_xenbus_init();
+ ret = register_backdev();
+ if (ret < 0) {
+ WPRINTK("Couldn't register /dev/xen/backdev\n");
+ return ret;
+ }
+
/* Dynamically allocate a major for this device */
ret = register_chrdev(0, "blktap", &blktap_fops);
--- /dev/null
+#ifndef __BLKTAP_H__
+#define __BLKTAP_H__
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM BLK_RING_SIZE
+#define MAX_PENDING_REQS BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg) \
+ (_start + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
+
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+ unsigned short domid;
+ unsigned short busid;
+} domid_translate_t;
+
+/*Data struct associated with each of the tapdisk devices*/
+typedef struct tap_blkif {
+ struct vm_area_struct *vma; /*Shared memory area */
+ unsigned long rings_vstart; /*Kernel memory mapping */
+ unsigned long user_vstart; /*User memory mapping */
+ unsigned long dev_inuse; /*One process opens device at a time. */
+ unsigned long dev_pending; /*In process of being opened */
+ unsigned long ring_ok; /*make this ring->state */
+ blkif_front_ring_t ufe_ring; /*Rings up to user space. */
+ wait_queue_head_t wait; /*for poll */
+ unsigned long mode; /*current switching mode */
+ int minor; /*Minor number for tapdisk device */
+ pid_t pid; /*tapdisk process id */
+ struct pid_namespace *pid_ns; /*... and its corresponding namespace */
+ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
+ shutdown */
+ unsigned long *idx_map; /*Record the user ring id to kern
+ [req id, idx] tuple */
+ blkif_t *blkif; /*Associate blkif with tapdev */
+ struct domid_translate trans; /*Translation from domid to bus. */
+ struct backdev_info *backdev; /*Backend domain device info */
+} tap_blkif_t;
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct pending_req {
+ blkif_t *blkif;
+ u64 id;
+ unsigned short mem_idx;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
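+	/* set to 2 for requests submitted from the backdev queue */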
+ int inuse;
+} pending_req_t;
+
+extern pending_req_t *pending_reqs[MAX_PENDING_REQS];
+
+typedef unsigned int PEND_RING_IDX;
+
+static inline int MASK_PEND_IDX(int i) {
+ return (i & (MAX_PENDING_REQS-1));
+}
+
+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
+ return (req - pending_reqs[idx]);
+}
+
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+extern struct page **foreign_pages[MAX_DYNAMIC_MEM];
+static inline unsigned long idx_to_kaddr(
+ unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
+{
+ unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
+ unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
+ return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+/* Requests passing through the tap to userspace are re-assigned an ID.
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
+ * ring ID.
+ */
+
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+ return ((fe_dom << 16) | MASK_PEND_IDX(idx));
+}
+
+static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
+{
+ return (PEND_RING_IDX)(id & 0x0000ffff);
+}
+
+static inline int ID_TO_MIDX(unsigned long id)
+{
+ return (int)(id >> 16);
+}
+
+#define INVALID_REQ 0xdead0000
+
+/*TODO: Convert to a free list*/
+static inline int GET_NEXT_REQ(unsigned long *idx_map)
+{
+ int i;
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ if (idx_map[i] == INVALID_REQ)
+ return i;
+
+ return INVALID_REQ;
+}
+
+pending_req_t* alloc_req(void);
+void free_req(pending_req_t *req);
+
+void blktap_kick_user(int idx);
+
+#endif /* __BLKTAP_H__ */