ia64/linux-2.6.18-xen.hg

changeset 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits of blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the Linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific
(see the control-node sketch after this list).

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, and IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap: blkback
is now always used to talk to Xen guests, and blktap just presents
a Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.
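
For illustration, a minimal user-space sketch of driving the new
control node (a hypothetical example, not part of this patch; it
assumes udev exposes the "blktap-control" misc device as
/dev/blktap-control, and mirrors BLKTAP2_IOCTL_ALLOC_TAP/FREE_TAP and
struct blktap_handle from drivers/xen/blktap2/blktap.h below):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define BLKTAP2_IOCTL_ALLOC_TAP 200
    #define BLKTAP2_IOCTL_FREE_TAP  201

    struct blktap_handle {
            unsigned int ring;    /* major of the per-tap ring node */
            unsigned int device;  /* major of the exported gendisk  */
            unsigned int minor;   /* minor shared by both nodes     */
    };

    int main(void)
    {
            struct blktap_handle h;
            int fd = open("/dev/blktap-control", O_RDWR);

            if (fd < 0 || ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &h) < 0)
                    return 1;
            printf("ring %u:%u, device %u:%u\n",
                   h.ring, h.minor, h.device, h.minor);
            /* a tapdisk process would now mmap the ring and serve IO */
            ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, (unsigned long)h.minor);
            close(fd);
            return 0;
    }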

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
parents f3a935eb30e0
children 2ab54cc40761
files drivers/xen/Makefile drivers/xen/blkback/Makefile drivers/xen/blkback/blkback-pagemap.c drivers/xen/blkback/blkback-pagemap.h drivers/xen/blkback/blkback.c drivers/xen/blkback/common.h drivers/xen/blktap/blktap.c drivers/xen/blktap2/Makefile drivers/xen/blktap2/blktap.h drivers/xen/blktap2/control.c drivers/xen/blktap2/device.c drivers/xen/blktap2/request.c drivers/xen/blktap2/ring.c drivers/xen/blktap2/sysfs.c drivers/xen/blktap2/wait_queue.c include/linux/mm.h include/linux/page-flags.h mm/memory.c mm/mmap.c
line diff
     1.1 --- a/drivers/xen/Makefile	Tue May 26 09:53:55 2009 +0100
     1.2 +++ b/drivers/xen/Makefile	Tue May 26 11:23:16 2009 +0100
     1.3 @@ -8,6 +8,7 @@ obj-y	+= util.o
     1.4  obj-$(CONFIG_XEN_BALLOON)		+= balloon/
     1.5  obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= blkback/
     1.6  obj-$(CONFIG_XEN_BLKDEV_TAP)		+= blktap/
     1.7 +obj-$(CONFIG_XEN_BLKDEV_TAP)            += blktap2/
     1.8  obj-$(CONFIG_XEN_NETDEV_BACKEND)	+= netback/
     1.9  obj-$(CONFIG_XEN_TPMDEV_BACKEND)	+= tpmback/
    1.10  obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= blkfront/
     2.1 --- a/drivers/xen/blkback/Makefile	Tue May 26 09:53:55 2009 +0100
     2.2 +++ b/drivers/xen/blkback/Makefile	Tue May 26 11:23:16 2009 +0100
     2.3 @@ -1,3 +1,3 @@
     2.4  obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
     2.5  
     2.6 -blkbk-y	:= blkback.o xenbus.o interface.o vbd.o
     2.7 +blkbk-y	:= blkback.o xenbus.o interface.o vbd.o blkback-pagemap.o
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/drivers/xen/blkback/blkback-pagemap.c	Tue May 26 11:23:16 2009 +0100
     3.3 @@ -0,0 +1,91 @@
     3.4 +#include "common.h"
     3.5 +#include "blkback-pagemap.h"
     3.6 +
     3.7 +static int blkback_pagemap_size;
     3.8 +static struct blkback_pagemap *blkback_pagemap;
     3.9 +
    3.10 +static inline int
    3.11 +blkback_pagemap_entry_clear(struct blkback_pagemap *map)
    3.12 +{
    3.13 +	static struct blkback_pagemap zero;
    3.14 +	return !memcmp(map, &zero, sizeof(zero));
    3.15 +}
    3.16 +
    3.17 +int
    3.18 +blkback_pagemap_init(int pages)
    3.19 +{
    3.20 +	blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
    3.21 +				  GFP_KERNEL);
    3.22 +	if (!blkback_pagemap)
    3.23 +		return -ENOMEM;
    3.24 +
    3.25 +	blkback_pagemap_size = pages;
    3.26 +	return 0;
    3.27 +}
    3.28 +
    3.29 +void
    3.30 +blkback_pagemap_set(int idx, struct page *page,
    3.31 +		    domid_t domid, busid_t busid, grant_ref_t gref)
    3.32 +{
    3.33 +	struct blkback_pagemap *entry;
    3.34 +
    3.35 +	BUG_ON(!blkback_pagemap);
    3.36 +	BUG_ON(idx >= blkback_pagemap_size);
    3.37 +
    3.38 +	SetPageBlkback(page);
    3.39 +	set_page_private(page, idx);
    3.40 +
    3.41 +	entry = blkback_pagemap + idx;
    3.42 +	if (!blkback_pagemap_entry_clear(entry)) {
    3.43 +		printk("overwriting pagemap %d: d %u b %u g %u\n",
    3.44 +		       idx, entry->domid, entry->busid, entry->gref);
    3.45 +		BUG();
    3.46 +	}
    3.47 +
    3.48 +	entry->domid = domid;
    3.49 +	entry->busid = busid;
    3.50 +	entry->gref  = gref;
    3.51 +}
    3.52 +
    3.53 +void
    3.54 +blkback_pagemap_clear(struct page *page)
    3.55 +{
    3.56 +	int idx;
    3.57 +	struct blkback_pagemap *entry;
    3.58 +
    3.59 +	idx = (int)page_private(page);
    3.60 +
    3.61 +	BUG_ON(!blkback_pagemap);
    3.62 +	BUG_ON(!PageBlkback(page));
    3.63 +	BUG_ON(idx >= blkback_pagemap_size);
    3.64 +
    3.65 +	entry = blkback_pagemap + idx;
    3.66 +	if (blkback_pagemap_entry_clear(entry)) {
    3.67 +		printk("clearing empty pagemap %d\n", idx);
    3.68 +		BUG();
    3.69 +	}
    3.70 +
    3.71 +	memset(entry, 0, sizeof(*entry));
    3.72 +}
    3.73 +
    3.74 +struct blkback_pagemap
    3.75 +blkback_pagemap_read(struct page *page)
    3.76 +{
    3.77 +	int idx;
    3.78 +	struct blkback_pagemap *entry;
    3.79 +
    3.80 +	idx = (int)page_private(page);
    3.81 +
    3.82 +	BUG_ON(!blkback_pagemap);
    3.83 +	BUG_ON(!PageBlkback(page));
    3.84 +	BUG_ON(idx >= blkback_pagemap_size);
    3.85 +
    3.86 +	entry = blkback_pagemap + idx;
    3.87 +	if (blkback_pagemap_entry_clear(entry)) {
    3.88 +		printk("reading empty pagemap %d\n", idx);
    3.89 +		BUG();
    3.90 +	}
    3.91 +
    3.92 +	return *entry;
    3.93 +}
    3.94 +EXPORT_SYMBOL(blkback_pagemap_read);
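
The pagemap gives stacked drivers a way to recover the originating
grant for any page blkback has mapped. A minimal sketch of a consumer
(a hypothetical helper; blktap_prep_foreign() in
drivers/xen/blktap2/device.c below does the real equivalent):

    /* Translate a PageBlkback page back to its grant origin. */
    static int lookup_foreign_grant(struct page *page,
                                    domid_t *domid, grant_ref_t *gref)
    {
            struct blkback_pagemap map;

            if (!PageBlkback(page))
                    return -EINVAL;  /* not a blkback-mapped page */

            map = blkback_pagemap_read(page); /* BUGs on empty entries */
            *domid = map.domid;
            *gref  = map.gref;
            return 0;
    }
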
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/drivers/xen/blkback/blkback-pagemap.h	Tue May 26 11:23:16 2009 +0100
     4.3 @@ -0,0 +1,17 @@
     4.4 +#ifndef _BLKBACK_PAGEMAP_H_
     4.5 +#define _BLKBACK_PAGEMAP_H_
     4.6 +
     4.7 +#include <xen/interface/xen.h>
     4.8 +#include <xen/interface/grant_table.h>
     4.9 +
    4.10 +typedef unsigned int busid_t;
    4.11 +
    4.12 +struct blkback_pagemap {
    4.13 +	domid_t          domid;
    4.14 +	busid_t          busid;
    4.15 +	grant_ref_t      gref;
    4.16 +};
    4.17 +
    4.18 +struct blkback_pagemap blkback_pagemap_read(struct page *);
    4.19 +
    4.20 +#endif
     5.1 --- a/drivers/xen/blkback/blkback.c	Tue May 26 09:53:55 2009 +0100
     5.2 +++ b/drivers/xen/blkback/blkback.c	Tue May 26 11:23:16 2009 +0100
     5.3 @@ -173,6 +173,7 @@ static void fast_flush_area(pending_req_
     5.4  		handle = pending_handle(req, i);
     5.5  		if (handle == BLKBACK_INVALID_HANDLE)
     5.6  			continue;
     5.7 +		blkback_pagemap_clear(virt_to_page(vaddr(req, i)));
     5.8  		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
     5.9  				    GNTMAP_host_map, handle);
    5.10  		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
    5.11 @@ -464,6 +465,10 @@ static void dispatch_rw_block_io(blkif_t
    5.12  			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
    5.13  		seg[i].buf  = map[i].dev_bus_addr | 
    5.14  			(req->seg[i].first_sect << 9);
    5.15 +		blkback_pagemap_set(vaddr_pagenr(pending_req, i),
    5.16 +				    virt_to_page(vaddr(pending_req, i)),
    5.17 +				    blkif->domid, req->handle,
    5.18 +				    req->seg[i].gref);
    5.19  	}
    5.20  
    5.21  	if (ret)
    5.22 @@ -625,6 +630,9 @@ static int __init blkif_init(void)
    5.23  					mmap_pages, GFP_KERNEL);
    5.24  	pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
    5.25  
    5.26 +	if (blkback_pagemap_init(mmap_pages))
    5.27 +		goto out_of_memory;
    5.28 +
    5.29  	if (!pending_reqs || !pending_grant_handles || !pending_pages)
    5.30  		goto out_of_memory;
    5.31  
     6.1 --- a/drivers/xen/blkback/common.h	Tue May 26 09:53:55 2009 +0100
     6.2 +++ b/drivers/xen/blkback/common.h	Tue May 26 11:23:16 2009 +0100
     6.3 @@ -43,6 +43,8 @@
     6.4  #include <xen/gnttab.h>
     6.5  #include <xen/driver_util.h>
     6.6  #include <xen/xenbus.h>
     6.7 +#include "blkback-pagemap.h"
     6.8 +
     6.9  
    6.10  #define DPRINTK(_f, _a...)			\
    6.11  	pr_debug("(file=%s, line=%d) " _f,	\
    6.12 @@ -136,4 +138,8 @@ int blkif_schedule(void *arg);
    6.13  int blkback_barrier(struct xenbus_transaction xbt,
    6.14  		    struct backend_info *be, int state);
    6.15  
    6.16 +int blkback_pagemap_init(int);
    6.17 +void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
    6.18 +void blkback_pagemap_clear(struct page *);
    6.19 +
    6.20  #endif /* __BLKIF__BACKEND__COMMON_H__ */
     7.1 --- a/drivers/xen/blktap/blktap.c	Tue May 26 09:53:55 2009 +0100
     7.2 +++ b/drivers/xen/blktap/blktap.c	Tue May 26 11:23:16 2009 +0100
     7.3 @@ -116,7 +116,7 @@ typedef struct tap_blkif {
     7.4  					[req id, idx] tuple                  */
     7.5  	blkif_t *blkif;               /*Associate blkif with tapdev          */
     7.6  	struct domid_translate_ext trans; /*Translation from domid to bus.   */
     7.7 -	struct page **map;	      /*Mapping page */
     7.8 +	struct vm_foreign_map foreign_map;    /*Mapping page */
     7.9  } tap_blkif_t;
    7.10  
    7.11  static struct tap_blkif *tapfds[MAX_TAP_DEV];
    7.12 @@ -347,7 +347,7 @@ static pte_t blktap_clear_pte(struct vm_
    7.13  	kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
    7.14  	pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
    7.15  	ClearPageReserved(pg);
    7.16 -	info->map[offset + RING_PAGES] = NULL;
    7.17 +	info->foreign_map.map[offset + RING_PAGES] = NULL;
    7.18  
    7.19  	khandle = &pending_handle(mmap_idx, pending_idx, seg);
    7.20  
    7.21 @@ -396,7 +396,7 @@ static void blktap_vma_open(struct vm_ar
    7.22  
    7.23  	info = vma->vm_file->private_data;
    7.24  	vma->vm_private_data =
    7.25 -		&info->map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];
    7.26 +		&info->foreign_map.map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];
    7.27  }
    7.28  
    7.29  /* tricky part
    7.30 @@ -418,7 +418,7 @@ static void blktap_vma_close(struct vm_a
    7.31  
    7.32  	info = vma->vm_file->private_data;
    7.33  	next->vm_private_data =
    7.34 -		&info->map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
    7.35 +		&info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
    7.36  }
    7.37  
    7.38  static struct vm_operations_struct blktap_vm_ops = {
    7.39 @@ -642,8 +642,8 @@ static int blktap_release(struct inode *
    7.40  
    7.41  	mmput(info->mm);
    7.42  	info->mm = NULL;
    7.43 -	kfree(info->map);
    7.44 -	info->map = NULL;
    7.45 +	kfree(info->foreign_map.map);
    7.46 +	info->foreign_map.map = NULL;
    7.47  
    7.48  	/* Free the ring page. */
    7.49  	ClearPageReserved(virt_to_page(info->ufe_ring.sring));
    7.50 @@ -726,14 +726,14 @@ static int blktap_mmap(struct file *filp
    7.51  	}
    7.52  
    7.53  	/* Mark this VM as containing foreign pages, and set up mappings. */
    7.54 -	info->map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
    7.55 -			    sizeof(*info->map), GFP_KERNEL);
    7.56 -	if (info->map == NULL) {
    7.57 +	info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
    7.58 +			    sizeof(*info->foreign_map.map), GFP_KERNEL);
    7.59 +	if (info->foreign_map.map == NULL) {
    7.60  		WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
    7.61  		goto fail;
    7.62  	}
    7.63  
    7.64 -	vma->vm_private_data = info->map;
    7.65 +	vma->vm_private_data = info->foreign_map.map;
    7.66  	vma->vm_flags |= VM_FOREIGN;
    7.67  	vma->vm_flags |= VM_DONTCOPY;
    7.68  
    7.69 @@ -1238,7 +1238,7 @@ static int blktap_read_ufe_ring(tap_blki
    7.70  			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
    7.71  			ClearPageReserved(pg);
    7.72  			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
    7.73 -			info->map[offset] = NULL;
    7.74 +			info->foreign_map.map[offset] = NULL;
    7.75  		}
    7.76  		fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
    7.77  		info->idx_map[usr_idx] = INVALID_REQ;
    7.78 @@ -1530,7 +1530,7 @@ static void dispatch_rw_block_io(blkif_t
    7.79  							  >> PAGE_SHIFT));
    7.80  			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
    7.81  			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
    7.82 -			info->map[offset] = pg;
    7.83 +			info->foreign_map.map[offset] = pg;
    7.84  		}
    7.85  	} else {
    7.86  		for (i = 0; i < nseg; i++) {
    7.87 @@ -1557,7 +1557,7 @@ static void dispatch_rw_block_io(blkif_t
    7.88  
    7.89  			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
    7.90  			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
    7.91 -			info->map[offset] = pg;
    7.92 +			info->foreign_map.map[offset] = pg;
    7.93  		}
    7.94  	}
    7.95  
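
The net effect of these hunks is mechanical: the bare struct page **
array becomes the .map member of a struct vm_foreign_map, while the
address-to-page arithmetic is unchanged. A sketch of the lookup
pattern (a hypothetical helper distilled from the offset calculations
in the hunks above):

    static struct page *
    tap_page_at(tap_blkif_t *info, unsigned long uvaddr)
    {
            unsigned long offset =
                    (uvaddr - info->rings_vstart) >> PAGE_SHIFT;

            return info->foreign_map.map[offset]; /* NULL if unmapped */
    }
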
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/drivers/xen/blktap2/Makefile	Tue May 26 11:23:16 2009 +0100
     8.3 @@ -0,0 +1,3 @@
     8.4 +obj-y := blktap.o
     8.5 +
     8.6 +blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/drivers/xen/blktap2/blktap.h	Tue May 26 11:23:16 2009 +0100
     9.3 @@ -0,0 +1,244 @@
     9.4 +#ifndef _BLKTAP_H_
     9.5 +#define _BLKTAP_H_
     9.6 +
     9.7 +#include <linux/fs.h>
     9.8 +#include <linux/poll.h>
     9.9 +#include <linux/cdev.h>
    9.10 +#include <xen/blkif.h>
    9.11 +#include <xen/gnttab.h>
    9.12 +
    9.13 +//#define ENABLE_PASSTHROUGH
    9.14 +
    9.15 +extern int blktap_debug_level;
    9.16 +
    9.17 +#define BTPRINTK(level, tag, force, _f, _a...)				\
    9.18 +	do {								\
    9.19 +		if (blktap_debug_level > level &&			\
    9.20 +		    (force || printk_ratelimit()))			\
    9.21 +			printk(tag "%s: " _f, __func__, ##_a);		\
    9.22 +	} while (0)
    9.23 +
    9.24 +#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
    9.25 +#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
    9.26 +#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
    9.27 +#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
    9.28 +
    9.29 +#define MAX_BLKTAP_DEVICE            256
    9.30 +
    9.31 +#define BLKTAP_CONTROL               1
    9.32 +#define BLKTAP_RING_FD               2
    9.33 +#define BLKTAP_RING_VMA              3
    9.34 +#define BLKTAP_DEVICE                4
    9.35 +#define BLKTAP_SYSFS                 5
    9.36 +#define BLKTAP_PAUSE_REQUESTED       6
    9.37 +#define BLKTAP_PAUSED                7
    9.38 +#define BLKTAP_SHUTDOWN_REQUESTED    8
    9.39 +#define BLKTAP_PASSTHROUGH           9
    9.40 +#define BLKTAP_DEFERRED              10
    9.41 +
    9.42 +/* blktap IOCTLs: */
    9.43 +#define BLKTAP2_IOCTL_KICK_FE        1
    9.44 +#define BLKTAP2_IOCTL_ALLOC_TAP	     200
    9.45 +#define BLKTAP2_IOCTL_FREE_TAP       201
    9.46 +#define BLKTAP2_IOCTL_CREATE_DEVICE  202
    9.47 +#define BLKTAP2_IOCTL_SET_PARAMS     203
    9.48 +#define BLKTAP2_IOCTL_PAUSE          204
    9.49 +#define BLKTAP2_IOCTL_REOPEN         205
    9.50 +#define BLKTAP2_IOCTL_RESUME         206
    9.51 +
    9.52 +#define BLKTAP2_MAX_MESSAGE_LEN      256
    9.53 +
    9.54 +#define BLKTAP2_RING_MESSAGE_PAUSE   1
    9.55 +#define BLKTAP2_RING_MESSAGE_RESUME  2
    9.56 +#define BLKTAP2_RING_MESSAGE_CLOSE   3
    9.57 +
    9.58 +#define BLKTAP_REQUEST_FREE          0
    9.59 +#define BLKTAP_REQUEST_PENDING       1
    9.60 +
    9.61 +/*
    9.62 + * The maximum number of requests that can be outstanding at any time
    9.63 + * is determined by
    9.64 + *
    9.65 + *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
    9.66 + *
    9.67 + * where mmap_alloc < MAX_DYNAMIC_MEM.
    9.68 + *
    9.69 + * TODO:
    9.70 + * mmap_alloc is initialised to 2 and should be adjustable on the fly via
    9.71 + * sysfs.
    9.72 + */
    9.73 +#define BLK_RING_SIZE		__RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
    9.74 +#define MAX_DYNAMIC_MEM		BLK_RING_SIZE
    9.75 +#define MAX_PENDING_REQS	BLK_RING_SIZE
    9.76 +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
    9.77 +#define MMAP_VADDR(_start, _req, _seg)					\
    9.78 +        (_start +                                                       \
    9.79 +         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
    9.80 +         ((_seg) * PAGE_SIZE))
    9.81 +
    9.82 +#define blktap_get(_b) (atomic_inc(&(_b)->refcnt))
    9.83 +#define blktap_put(_b)					\
    9.84 +	do {						\
    9.85 +		if (atomic_dec_and_test(&(_b)->refcnt))	\
    9.86 +			wake_up(&(_b)->wq);		\
    9.87 +	} while (0)
    9.88 +
    9.89 +struct blktap;
    9.90 +
    9.91 +struct grant_handle_pair {
    9.92 +	grant_handle_t                 kernel;
    9.93 +	grant_handle_t                 user;
    9.94 +};
    9.95 +#define INVALID_GRANT_HANDLE           0xFFFF
    9.96 +
    9.97 +struct blktap_handle {
    9.98 +	unsigned int                   ring;
    9.99 +	unsigned int                   device;
   9.100 +	unsigned int                   minor;
   9.101 +};
   9.102 +
   9.103 +struct blktap_params {
   9.104 +	char                           name[BLKTAP2_MAX_MESSAGE_LEN];
   9.105 +	unsigned long long             capacity;
   9.106 +	unsigned long                  sector_size;
   9.107 +};
   9.108 +
   9.109 +struct blktap_device {
   9.110 +	int                            users;
   9.111 +	spinlock_t                     lock;
   9.112 +	struct gendisk                *gd;
   9.113 +
   9.114 +#ifdef ENABLE_PASSTHROUGH
   9.115 +	struct block_device           *bdev;
   9.116 +#endif
   9.117 +};
   9.118 +
   9.119 +struct blktap_ring {
   9.120 +	struct vm_area_struct         *vma;
   9.121 +	blkif_front_ring_t             ring;
   9.122 +	struct vm_foreign_map          foreign_map;
   9.123 +	unsigned long                  ring_vstart;
   9.124 +	unsigned long                  user_vstart;
   9.125 +
   9.126 +	int                            response;
   9.127 +
   9.128 +	wait_queue_head_t              poll_wait;
   9.129 +
   9.130 +	dev_t                          devno;
   9.131 +	struct class_device           *dev;
   9.132 +	atomic_t                       sysfs_refcnt;
   9.133 +	struct mutex                   sysfs_mutex;
   9.134 +};
   9.135 +
   9.136 +struct blktap_statistics {
   9.137 +	unsigned long                  st_print;
   9.138 +	int                            st_rd_req;
   9.139 +	int                            st_wr_req;
   9.140 +	int                            st_oo_req;
   9.141 +	int                            st_rd_sect;
   9.142 +	int                            st_wr_sect;
   9.143 +	s64                            st_rd_cnt;
   9.144 +	s64                            st_rd_sum_usecs;
   9.145 +	s64                            st_rd_max_usecs;
   9.146 +	s64                            st_wr_cnt;
   9.147 +	s64                            st_wr_sum_usecs;
   9.148 +	s64                            st_wr_max_usecs;	
   9.149 +};
   9.150 +
   9.151 +struct blktap_request {
   9.152 +	uint64_t                       id;
   9.153 +	uint16_t                       usr_idx;
   9.154 +
   9.155 +	uint8_t                        status;
   9.156 +	atomic_t                       pendcnt;
   9.157 +	uint8_t                        nr_pages;
   9.158 +	unsigned short                 operation;
   9.159 +
   9.160 +	struct timeval                 time;
   9.161 +	struct grant_handle_pair       handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   9.162 +	struct list_head               free_list;
   9.163 +};
   9.164 +
   9.165 +struct blktap {
   9.166 +	int                            minor;
   9.167 +	pid_t                          pid;
   9.168 +	atomic_t                       refcnt;
   9.169 +	unsigned long                  dev_inuse;
   9.170 +
   9.171 +	struct blktap_params           params;
   9.172 +
   9.173 +	struct rw_semaphore            tap_sem;
   9.174 +
   9.175 +	struct blktap_ring             ring;
   9.176 +	struct blktap_device           device;
   9.177 +
   9.178 +	int                            pending_cnt;
   9.179 +	struct blktap_request         *pending_requests[MAX_PENDING_REQS];
   9.180 +
   9.181 +	wait_queue_head_t              wq;
   9.182 +	struct list_head               deferred_queue;
   9.183 +
   9.184 +	struct blktap_statistics       stats;
   9.185 +};
   9.186 +
   9.187 +extern struct blktap *blktaps[MAX_BLKTAP_DEVICE];
   9.188 +
   9.189 +static inline int
   9.190 +blktap_active(struct blktap *tap)
   9.191 +{
   9.192 +	return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
   9.193 +}
   9.194 +
   9.195 +static inline int
   9.196 +blktap_validate_params(struct blktap *tap, struct blktap_params *params)
   9.197 +{
   9.198 +	/* TODO: sanity check */
   9.199 +	params->name[sizeof(params->name) - 1] = '\0';
   9.200 +	BTINFO("%s: capacity: %llu, sector-size: %lu\n",
   9.201 +	       params->name, params->capacity, params->sector_size);
   9.202 +	return 0;
   9.203 +}
   9.204 +
   9.205 +int blktap_control_destroy_device(struct blktap *);
   9.206 +
   9.207 +int blktap_ring_init(int *);
   9.208 +int blktap_ring_free(void);
   9.209 +int blktap_ring_create(struct blktap *);
   9.210 +int blktap_ring_destroy(struct blktap *);
   9.211 +int blktap_ring_pause(struct blktap *);
   9.212 +int blktap_ring_resume(struct blktap *);
   9.213 +void blktap_ring_kick_user(struct blktap *);
   9.214 +
   9.215 +int blktap_sysfs_init(void);
   9.216 +void blktap_sysfs_free(void);
   9.217 +int blktap_sysfs_create(struct blktap *);
   9.218 +int blktap_sysfs_destroy(struct blktap *);
   9.219 +
   9.220 +int blktap_device_init(int *);
   9.221 +void blktap_device_free(void);
   9.222 +int blktap_device_create(struct blktap *);
   9.223 +int blktap_device_destroy(struct blktap *);
   9.224 +int blktap_device_pause(struct blktap *);
   9.225 +int blktap_device_resume(struct blktap *);
   9.226 +void blktap_device_restart(struct blktap *);
   9.227 +void blktap_device_finish_request(struct blktap *,
   9.228 +				  blkif_response_t *,
   9.229 +				  struct blktap_request *);
   9.230 +void blktap_device_fail_pending_requests(struct blktap *);
   9.231 +#ifdef ENABLE_PASSTHROUGH
   9.232 +int blktap_device_enable_passthrough(struct blktap *,
   9.233 +				     unsigned, unsigned);
   9.234 +#endif
   9.235 +
   9.236 +void blktap_defer(struct blktap *);
   9.237 +void blktap_run_deferred(void);
   9.238 +
   9.239 +int blktap_request_pool_init(void);
   9.240 +void blktap_request_pool_free(void);
   9.241 +int blktap_request_pool_grow(void);
   9.242 +int blktap_request_pool_shrink(void);
   9.243 +struct blktap_request *blktap_request_allocate(struct blktap *);
   9.244 +void blktap_request_free(struct blktap *, struct blktap_request *);
   9.245 +unsigned long request_to_kaddr(struct blktap_request *, int);
   9.246 +
   9.247 +#endif
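
Each request slot in the header above reserves
BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive pages, so MMAP_VADDR() is
plain strided arithmetic. A worked example (a standalone sketch,
assuming 4 KiB pages and the usual blkif segment limit of 11):

    #define EX_PAGE_SIZE    4096UL
    #define EX_SEGS_PER_REQ 11UL  /* BLKIF_MAX_SEGMENTS_PER_REQUEST */
    #define EX_MMAP_VADDR(start, req, seg) \
            ((start) + ((req) * EX_SEGS_PER_REQ + (seg)) * EX_PAGE_SIZE)

    /* request slot 3, segment 2 sits 3*11 + 2 = 35 pages past start:
     * EX_MMAP_VADDR(0x80000000UL, 3, 2) == 0x80000000 + 35*4096
     *                                   == 0x80023000             */
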
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/drivers/xen/blktap2/control.c	Tue May 26 11:23:16 2009 +0100
    10.3 @@ -0,0 +1,277 @@
    10.4 +#include <linux/module.h>
    10.5 +#include <linux/miscdevice.h>
    10.6 +
    10.7 +#include "blktap.h"
    10.8 +
    10.9 +static DEFINE_SPINLOCK(blktap_control_lock);
   10.10 +struct blktap *blktaps[MAX_BLKTAP_DEVICE];
   10.11 +
   10.12 +static int ring_major;
   10.13 +static int device_major;
   10.14 +static int blktap_control_registered;
   10.15 +
   10.16 +static void
   10.17 +blktap_control_initialize_tap(struct blktap *tap)
   10.18 +{
   10.19 +	int minor = tap->minor;
   10.20 +
   10.21 +	memset(tap, 0, sizeof(*tap));
   10.22 +	set_bit(BLKTAP_CONTROL, &tap->dev_inuse);
   10.23 +	init_rwsem(&tap->tap_sem);
   10.24 +	init_waitqueue_head(&tap->wq);
   10.25 +	atomic_set(&tap->refcnt, 0);
   10.26 +
   10.27 +	tap->minor = minor;
   10.28 +}
   10.29 +
   10.30 +static struct blktap *
   10.31 +blktap_control_create_tap(void)
   10.32 +{
   10.33 +	int minor;
   10.34 +	struct blktap *tap;
   10.35 +
   10.36 +	tap = kmalloc(sizeof(*tap), GFP_KERNEL);
   10.37 +	if (unlikely(!tap))
   10.38 +		return NULL;
   10.39 +
   10.40 +	blktap_control_initialize_tap(tap);
   10.41 +
   10.42 +	spin_lock_irq(&blktap_control_lock);
   10.43 +	for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++)
   10.44 +		if (!blktaps[minor])
   10.45 +			break;
   10.46 +
   10.47 +	if (minor == MAX_BLKTAP_DEVICE) {
   10.48 +		kfree(tap);
   10.49 +		tap = NULL;
   10.50 +		goto out;
   10.51 +	}
   10.52 +
   10.53 +	tap->minor = minor;
   10.54 +	blktaps[minor] = tap;
   10.55 +
   10.56 +out:
   10.57 +	spin_unlock_irq(&blktap_control_lock);
   10.58 +	return tap;
   10.59 +}
   10.60 +
   10.61 +static struct blktap *
   10.62 +blktap_control_allocate_tap(void)
   10.63 +{
   10.64 +	int err, minor;
   10.65 +	struct blktap *tap;
   10.66 +
   10.67 +	/*
   10.68 +	 * This is called only from the ioctl, which
   10.69 +	 * means we should always have interrupts enabled.
   10.70 +	 */
   10.71 +	BUG_ON(irqs_disabled());
   10.72 +
   10.73 +	spin_lock_irq(&blktap_control_lock);
   10.74 +
   10.75 +	for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) {
   10.76 +		tap = blktaps[minor];
   10.77 +		if (!tap)
   10.78 +			goto found;
   10.79 +
   10.80 +		if (!tap->dev_inuse) {
   10.81 +			blktap_control_initialize_tap(tap);
   10.82 +			goto found;
   10.83 +		}
   10.84 +	}
   10.85 +
   10.86 +	tap = NULL;
   10.87 +
   10.88 +found:
   10.89 +	spin_unlock_irq(&blktap_control_lock);
   10.90 +
   10.91 +	if (!tap) {
   10.92 +		tap = blktap_control_create_tap();
   10.93 +		if (!tap)
   10.94 +			return NULL;
   10.95 +	}
   10.96 +
   10.97 +	err = blktap_ring_create(tap);
   10.98 +	if (err) {
   10.99 +		BTERR("ring creation failed: %d\n", err);
  10.100 +		clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
  10.101 +		return NULL;
  10.102 +	}
  10.103 +
  10.104 +	BTINFO("allocated tap %p\n", tap);
  10.105 +	return tap;
  10.106 +}
  10.107 +
  10.108 +static int
  10.109 +blktap_control_ioctl(struct inode *inode, struct file *filp,
  10.110 +		     unsigned int cmd, unsigned long arg)
  10.111 +{
  10.112 +	unsigned long dev;
  10.113 +	struct blktap *tap;
  10.114 +
  10.115 +	switch (cmd) {
  10.116 +	case BLKTAP2_IOCTL_ALLOC_TAP: {
  10.117 +		struct blktap_handle h;
  10.118 +
  10.119 +		tap = blktap_control_allocate_tap();
  10.120 +		if (!tap) {
  10.121 +			BTERR("error allocating device\n");
  10.122 +			return -ENOMEM;
  10.123 +		}
  10.124 +
  10.125 +		h.ring   = ring_major;
  10.126 +		h.device = device_major;
  10.127 +		h.minor  = tap->minor;
  10.128 +
  10.129 +		if (copy_to_user((struct blktap_handle __user *)arg,
  10.130 +				 &h, sizeof(h))) {
  10.131 +			blktap_control_destroy_device(tap);
  10.132 +			return -EFAULT;
  10.133 +		}
  10.134 +
  10.135 +		return 0;
  10.136 +	}
  10.137 +
  10.138 +	case BLKTAP2_IOCTL_FREE_TAP:
  10.139 +		dev = arg;
  10.140 +
  10.141 +		if (dev > MAX_BLKTAP_DEVICE || !blktaps[dev])
  10.142 +			return -EINVAL;
  10.143 +
  10.144 +		blktap_control_destroy_device(blktaps[dev]);
  10.145 +		return 0;
  10.146 +	}
  10.147 +
  10.148 +	return -ENOIOCTLCMD;
  10.149 +}
  10.150 +
  10.151 +static struct file_operations blktap_control_file_operations = {
  10.152 +	.owner    = THIS_MODULE,
  10.153 +	.ioctl    = blktap_control_ioctl,
  10.154 +};
  10.155 +
  10.156 +static struct miscdevice blktap_misc = {
  10.157 +	.minor    = MISC_DYNAMIC_MINOR,
  10.158 +	.name     = "blktap-control",
  10.159 +	.fops     = &blktap_control_file_operations,
  10.160 +};
  10.161 +
  10.162 +int
  10.163 +blktap_control_destroy_device(struct blktap *tap)
  10.164 +{
  10.165 +	int err;
  10.166 +	unsigned long inuse;
  10.167 +
  10.168 +	if (!tap)
  10.169 +		return 0;
  10.170 +
  10.171 +	set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
  10.172 +
  10.173 +	for (;;) {
  10.174 +		inuse = tap->dev_inuse;
  10.175 +		err   = blktap_device_destroy(tap);
  10.176 +		if (err)
  10.177 +			goto wait;
  10.178 +
  10.179 +		inuse = tap->dev_inuse;
  10.180 +		err   = blktap_ring_destroy(tap);
  10.181 +		if (err)
  10.182 +			goto wait;
  10.183 +
  10.184 +		inuse = tap->dev_inuse;
  10.185 +		err   = blktap_sysfs_destroy(tap);
  10.186 +		if (err)
  10.187 +			goto wait;
  10.188 +
  10.189 +		break;
  10.190 +
  10.191 +	wait:
  10.192 +		BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
  10.193 +		      inuse, tap->dev_inuse);
  10.194 +		if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse))
  10.195 +			break;
  10.196 +	}
  10.197 +
  10.198 +	clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
  10.199 +
  10.200 +	if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) {
  10.201 +		err = 0;
  10.202 +		clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
  10.203 +	}
  10.204 +
  10.205 +	return err;
  10.206 +}
  10.207 +
  10.208 +static int
  10.209 +blktap_control_init(void)
  10.210 +{
  10.211 +	int err;
  10.212 +
  10.213 +	err = misc_register(&blktap_misc);
  10.214 +	if (err) {
  10.215 +		BTERR("misc_register failed for control device");
  10.216 +		return err;
  10.217 +	}
  10.218 +
  10.219 +	blktap_control_registered = 1;
  10.220 +	return 0;
  10.221 +}
  10.222 +
  10.223 +static void
  10.224 +blktap_control_free(void)
  10.225 +{
  10.226 +	int i;
  10.227 +
  10.228 +	for (i = 0; i < MAX_BLKTAP_DEVICE; i++)
  10.229 +		blktap_control_destroy_device(blktaps[i]);
  10.230 +
  10.231 +	if (blktap_control_registered)
  10.232 +		if (misc_deregister(&blktap_misc) < 0)
  10.233 +			BTERR("misc_deregister failed for control device");
  10.234 +}
  10.235 +
  10.236 +static void
  10.237 +blktap_exit(void)
  10.238 +{
  10.239 +	blktap_control_free();
  10.240 +	blktap_ring_free();
  10.241 +	blktap_sysfs_free();
  10.242 +	blktap_device_free();
  10.243 +	blktap_request_pool_free();
  10.244 +}
  10.245 +
  10.246 +static int __init
  10.247 +blktap_init(void)
  10.248 +{
  10.249 +	int err;
  10.250 +
  10.251 +	err = blktap_request_pool_init();
  10.252 +	if (err)
  10.253 +		return err;
  10.254 +
  10.255 +	err = blktap_device_init(&device_major);
  10.256 +	if (err)
  10.257 +		goto fail;
  10.258 +
  10.259 +	err = blktap_ring_init(&ring_major);
  10.260 +	if (err)
  10.261 +		goto fail;
  10.262 +
  10.263 +	err = blktap_sysfs_init();
  10.264 +	if (err)
  10.265 +		goto fail;
  10.266 +
  10.267 +	err = blktap_control_init();
  10.268 +	if (err)
  10.269 +		goto fail;
  10.270 +
  10.271 +	return 0;
  10.272 +
  10.273 +fail:
  10.274 +	blktap_exit();
  10.275 +	return err;
  10.276 +}
  10.277 +
  10.278 +module_init(blktap_init);
  10.279 +module_exit(blktap_exit);
  10.280 +MODULE_LICENSE("Dual BSD/GPL");
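
blktap_control_destroy_device() above relies on a simple handshake:
each component owns one bit in tap->dev_inuse, and teardown progress
is signalled by clearing that bit and waking tap->wq, which lets the
destroy loop observe the state change and retry. A sketch of the
component side (a hypothetical helper; the actual ring/device/sysfs
destructors signal completion along these lines):

    static void component_release(struct blktap *tap, int state_bit)
    {
            clear_bit(state_bit, &tap->dev_inuse); /* e.g. BLKTAP_DEVICE */
            wake_up(&tap->wq);  /* let the destroy loop re-check state */
    }
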
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/drivers/xen/blktap2/device.c	Tue May 26 11:23:16 2009 +0100
    11.3 @@ -0,0 +1,1132 @@
    11.4 +#include <linux/fs.h>
    11.5 +#include <linux/blkdev.h>
    11.6 +#include <linux/cdrom.h>
    11.7 +#include <linux/hdreg.h>
    11.8 +#include <linux/module.h>
    11.9 +
   11.10 +#include <scsi/scsi.h>
   11.11 +#include <scsi/scsi_ioctl.h>
   11.12 +
   11.13 +#include <xen/xenbus.h>
   11.14 +#include <xen/interface/io/blkif.h>
   11.15 +
   11.16 +#include "blktap.h"
   11.17 +
   11.18 +#ifdef CONFIG_XEN_BLKDEV_BACKEND
   11.19 +#include "../blkback/blkback-pagemap.h"
   11.20 +#else
   11.21 +struct blkback_pagemap { };
   11.22 +#define blkback_pagemap_read(page) BUG();
   11.23 +#endif
   11.24 +
   11.25 +#if 0
   11.26 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
   11.27 +#else
   11.28 +#define DPRINTK_IOCTL(_f, _a...) ((void)0)
   11.29 +#endif
   11.30 +
   11.31 +struct blktap_grant_table {
   11.32 +	int cnt;
   11.33 +	struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
   11.34 +};
   11.35 +
   11.36 +static int blktap_device_major;
   11.37 +
   11.38 +static inline struct blktap *
   11.39 +dev_to_blktap(struct blktap_device *dev)
   11.40 +{
   11.41 +	return container_of(dev, struct blktap, device);
   11.42 +}
   11.43 +
   11.44 +static int
   11.45 +blktap_device_open(struct inode *inode, struct file *filep)
   11.46 +{
   11.47 +	struct blktap *tap;
   11.48 +	struct blktap_device *dev = inode->i_bdev->bd_disk->private_data;
   11.49 +
   11.50 +	if (!dev)
   11.51 +		return -ENOENT;
   11.52 +
   11.53 +	tap = dev_to_blktap(dev);
   11.54 +	if (!blktap_active(tap) ||
   11.55 +	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
   11.56 +		return -ENOENT;
   11.57 +
   11.58 +	dev->users++;
   11.59 +
   11.60 +	return 0;
   11.61 +}
   11.62 +
   11.63 +static int
   11.64 +blktap_device_release(struct inode *inode, struct file *filep)
   11.65 +{
   11.66 +	struct blktap_device *dev = inode->i_bdev->bd_disk->private_data;
   11.67 +	struct blktap *tap = dev_to_blktap(dev);
   11.68 +
   11.69 +	dev->users--;
   11.70 +	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
   11.71 +		blktap_device_destroy(tap);
   11.72 +
   11.73 +	return 0;
   11.74 +}
   11.75 +
   11.76 +static int
   11.77 +blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
   11.78 +{
   11.79 +	/* We don't have real geometry info, but let's at least return
   11.80 +	   values consistent with the size of the device */
   11.81 +	sector_t nsect = get_capacity(bd->bd_disk);
   11.82 +	sector_t cylinders = nsect;
   11.83 +
   11.84 +	hg->heads = 0xff;
   11.85 +	hg->sectors = 0x3f;
   11.86 +	sector_div(cylinders, hg->heads * hg->sectors);
   11.87 +	hg->cylinders = cylinders;
   11.88 +	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
   11.89 +		hg->cylinders = 0xffff;
   11.90 +	return 0;
   11.91 +}
   11.92 +
   11.93 +static int
   11.94 +blktap_device_ioctl(struct inode *inode, struct file *filep,
   11.95 +		    unsigned command, unsigned long argument)
   11.96 +{
   11.97 +	int i;
   11.98 +
   11.99 +	DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
  11.100 +		      command, (long)argument, inode->i_rdev);
  11.101 +
  11.102 +	switch (command) {
  11.103 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
  11.104 +	case HDIO_GETGEO: {
  11.105 +		struct block_device *bd = inode->i_bdev;
  11.106 +		struct hd_geometry geo;
  11.107 +		int ret;
  11.108 +
  11.109 +                if (!argument)
  11.110 +                        return -EINVAL;
  11.111 +
  11.112 +		geo.start = get_start_sect(bd);
  11.113 +		ret = blktap_device_getgeo(bd, &geo);
  11.114 +		if (ret)
  11.115 +			return ret;
  11.116 +
  11.117 +		if (copy_to_user((struct hd_geometry __user *)argument, &geo,
  11.118 +				 sizeof(geo)))
  11.119 +                        return -EFAULT;
  11.120 +
  11.121 +                return 0;
  11.122 +	}
  11.123 +#endif
  11.124 +	case CDROMMULTISESSION:
  11.125 +		BTDBG("FIXME: support multisession CDs later\n");
  11.126 +		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
  11.127 +			if (put_user(0, (char __user *)(argument + i)))
  11.128 +				return -EFAULT;
  11.129 +		return 0;
  11.130 +
  11.131 +	case SCSI_IOCTL_GET_IDLUN:
  11.132 +		if (!access_ok(VERIFY_WRITE, argument, 
  11.133 +			sizeof(struct scsi_idlun)))
  11.134 +			return -EFAULT;
  11.135 +
  11.136 +		/* return 0 for now. */
  11.137 +		__put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
  11.138 +		__put_user(0, 
  11.139 +			&((struct scsi_idlun __user *)argument)->host_unique_id);
  11.140 +		return 0;
  11.141 +
  11.142 +	default:
  11.143 +		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
  11.144 +		  command);*/
  11.145 +		return -EINVAL; /* same return as native Linux */
  11.146 +	}
  11.147 +
  11.148 +	return 0;
  11.149 +}
  11.150 +
  11.151 +static struct block_device_operations blktap_device_file_operations = {
  11.152 +	.owner     = THIS_MODULE,
  11.153 +	.open      = blktap_device_open,
  11.154 +	.release   = blktap_device_release,
  11.155 +	.ioctl     = blktap_device_ioctl,
  11.156 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
  11.157 +	.getgeo    = blktap_device_getgeo
  11.158 +#endif
  11.159 +};
  11.160 +
  11.161 +static int
  11.162 +blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
  11.163 +		    unsigned long addr, void *data)
  11.164 +{
  11.165 +	pte_t *pte = (pte_t *)data;
  11.166 +
  11.167 +	BTDBG("ptep %p -> %012llx\n", ptep, pte_val(*pte));
  11.168 +	set_pte(ptep, *pte);
  11.169 +	xen_invlpg(addr);
  11.170 +	return 0;
  11.171 +}
  11.172 +
  11.173 +static int
  11.174 +blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
  11.175 +{
  11.176 +	return apply_to_page_range(mm, address,
  11.177 +				   PAGE_SIZE, blktap_map_uaddr_fn, &pte);
  11.178 +}
  11.179 +
  11.180 +static int
  11.181 +blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
  11.182 +		     unsigned long addr, void *data)
  11.183 +{
  11.184 +	struct mm_struct *mm = (struct mm_struct *)data;
  11.185 +
  11.186 +	BTDBG("ptep %p\n", ptep);
  11.187 +	pte_clear(mm, addr, ptep);
  11.188 +	xen_invlpg(addr);
  11.189 +	return 0;
  11.190 +}
  11.191 +
  11.192 +static int
  11.193 +blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
  11.194 +{
  11.195 +	return apply_to_page_range(mm, address,
  11.196 +				   PAGE_SIZE, blktap_umap_uaddr_fn, mm);
  11.197 +}
  11.198 +
  11.199 +static void
  11.200 +blktap_device_end_dequeued_request(struct blktap_device *dev,
  11.201 +				   struct request *req, int uptodate)
  11.202 +{
  11.203 +	int ret;
  11.204 +
  11.205 +	ret = end_that_request_first(req, uptodate, req->hard_nr_sectors);
  11.206 +	BUG_ON(ret);
  11.207 +
  11.208 +	spin_lock_irq(&dev->lock);
  11.209 +	end_that_request_last(req, uptodate);
  11.210 +	spin_unlock_irq(&dev->lock);
  11.211 +}
  11.212 +
  11.213 +/*
  11.214 + * tap->tap_sem held on entry
  11.215 + */
  11.216 +static void
  11.217 +blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
  11.218 +{
  11.219 +	uint64_t ptep;
  11.220 +	int ret, usr_idx;
  11.221 +	unsigned int i, cnt;
  11.222 +	struct page **map, *page;
  11.223 +	struct blktap_ring *ring;
  11.224 +	struct grant_handle_pair *khandle;
  11.225 +	unsigned long kvaddr, uvaddr, offset;
  11.226 +	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
  11.227 +
  11.228 +	cnt     = 0;
  11.229 +	ring    = &tap->ring;
  11.230 +	usr_idx = request->usr_idx;
  11.231 +	map     = ring->foreign_map.map;
  11.232 +
  11.233 +	if (!ring->vma)
  11.234 +		return;
  11.235 +
  11.236 +	if (xen_feature(XENFEAT_auto_translated_physmap))
  11.237 +		zap_page_range(ring->vma, 
  11.238 +			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
  11.239 +			       request->nr_pages << PAGE_SHIFT, NULL);
  11.240 +
  11.241 +	for (i = 0; i < request->nr_pages; i++) {
  11.242 +		kvaddr = request_to_kaddr(request, i);
  11.243 +		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
  11.244 +
  11.245 +		khandle = request->handles + i;
  11.246 +
  11.247 +		if (khandle->kernel != INVALID_GRANT_HANDLE) {
  11.248 +			gnttab_set_unmap_op(&unmap[cnt], kvaddr,
  11.249 +					    GNTMAP_host_map, khandle->kernel);
  11.250 +			cnt++;
  11.251 +			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
  11.252 +					    INVALID_P2M_ENTRY);
  11.253 +		}
  11.254 +
  11.255 +		if (khandle->user != INVALID_GRANT_HANDLE) {
  11.256 +			BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
  11.257 +			if (create_lookup_pte_addr(ring->vma->vm_mm,
  11.258 +						   uvaddr, &ptep) != 0) {
  11.259 +				BTERR("Couldn't get a pte addr!\n");
  11.260 +				return;
  11.261 +			}
  11.262 +
  11.263 +			gnttab_set_unmap_op(&unmap[cnt], ptep,
  11.264 +					    GNTMAP_host_map
  11.265 +					    | GNTMAP_application_map
  11.266 +					    | GNTMAP_contains_pte,
  11.267 +					    khandle->user);
  11.268 +			cnt++;
  11.269 +		}
  11.270 +
  11.271 +		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
  11.272 +
  11.273 +		BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
  11.274 +		      "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
  11.275 +		      "0x%08lx, handle: %u\n", offset, map[offset], request,
  11.276 +		      usr_idx, i, kvaddr, khandle->kernel, uvaddr,
  11.277 +		      khandle->user);
  11.278 +
  11.279 +		page = map[offset];
  11.280 +		if (page) {
  11.281 +			ClearPageReserved(map[offset]);
  11.282 +			if (PageBlkback(page)) {
  11.283 +				ClearPageBlkback(page);
  11.284 +				set_page_private(page, 0);
  11.285 +			}
  11.286 +		}
  11.287 +		map[offset] = NULL;
  11.288 +
  11.289 +		khandle->kernel = INVALID_GRANT_HANDLE;
  11.290 +		khandle->user   = INVALID_GRANT_HANDLE;
  11.291 +	}
  11.292 +
  11.293 +	if (cnt) {
  11.294 +		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
  11.295 +						unmap, cnt);
  11.296 +		BUG_ON(ret);
  11.297 +	}
  11.298 +
  11.299 +	if (!xen_feature(XENFEAT_auto_translated_physmap))
  11.300 +		zap_page_range(ring->vma, 
  11.301 +			       MMAP_VADDR(ring->user_vstart, usr_idx, 0), 
  11.302 +			       request->nr_pages << PAGE_SHIFT, NULL);
  11.303 +}
  11.304 +
  11.305 +/*
  11.306 + * tap->tap_sem held on entry
  11.307 + */
  11.308 +static void
  11.309 +blktap_unmap(struct blktap *tap, struct blktap_request *request)
  11.310 +{
  11.311 +	int i, usr_idx;
  11.312 +	unsigned long kvaddr;
  11.313 +
  11.314 +	usr_idx = request->usr_idx;
  11.315 +	down_write(&tap->ring.vma->vm_mm->mmap_sem);
  11.316 +
  11.317 +	for (i = 0; i < request->nr_pages; i++) {
  11.318 +		BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
  11.319 +		      "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
  11.320 +		      request_to_kaddr(request, i),
  11.321 +		      request->handles[i].kernel,
  11.322 +		      MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
  11.323 +		      request->handles[i].user);
  11.324 +
  11.325 +		if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
  11.326 +			kvaddr = request_to_kaddr(request, i);
  11.327 +			blktap_umap_uaddr(&init_mm, kvaddr);
  11.328 +			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
  11.329 +					    INVALID_P2M_ENTRY);
  11.330 +		}
  11.331 +	}
  11.332 +
  11.333 +	blktap_device_fast_flush(tap, request);
  11.334 +	up_write(&tap->ring.vma->vm_mm->mmap_sem);
  11.335 +}
  11.336 +
  11.337 +/*
  11.338 + * called if the tapdisk process dies unexpectedly.
  11.339 + * fail and release any pending requests and disable queue.
  11.340 + */
  11.341 +void
  11.342 +blktap_device_fail_pending_requests(struct blktap *tap)
  11.343 +{
  11.344 +	int usr_idx;
  11.345 +	struct request *req;
  11.346 +	struct blktap_device *dev;
  11.347 +	struct blktap_request *request;
  11.348 +
  11.349 +	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
  11.350 +		return;
  11.351 +
  11.352 +	down_write(&tap->tap_sem);
  11.353 +
  11.354 +	dev = &tap->device;
  11.355 +	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
  11.356 +		request = tap->pending_requests[usr_idx];
  11.357 +		if (!request || request->status != BLKTAP_REQUEST_PENDING)
  11.358 +			continue;
  11.359 +
  11.360 +		BTERR("%u:%u: failing pending %s of %d pages\n",
  11.361 +		      blktap_device_major, tap->minor,
  11.362 +		      (request->operation == BLKIF_OP_READ ?
  11.363 +		       "read" : "write"), request->nr_pages);
  11.364 +
  11.365 +		blktap_unmap(tap, request);
  11.366 +		req = (struct request *)(unsigned long)request->id;
  11.367 +		blktap_device_end_dequeued_request(dev, req, 0);
  11.368 +		blktap_request_free(tap, request);
  11.369 +	}
  11.370 +
  11.371 +	up_write(&tap->tap_sem);
  11.372 +
  11.373 +	spin_lock_irq(&dev->lock);
  11.374 +
  11.375 +	/* fail any future requests */
  11.376 +	dev->gd->queue->queuedata = NULL;
  11.377 +	blk_start_queue(dev->gd->queue);
  11.378 +
  11.379 +	spin_unlock_irq(&dev->lock);
  11.380 +}
  11.381 +
  11.382 +/*
  11.383 + * tap->tap_sem held on entry
  11.384 + */
  11.385 +void
  11.386 +blktap_device_finish_request(struct blktap *tap,
  11.387 +			     blkif_response_t *res,
  11.388 +			     struct blktap_request *request)
  11.389 +{
  11.390 +	int uptodate;
  11.391 +	struct request *req;
  11.392 +	struct blktap_device *dev;
  11.393 +
  11.394 +	dev = &tap->device;
  11.395 +
  11.396 +	blktap_unmap(tap, request);
  11.397 +
  11.398 +	req = (struct request *)(unsigned long)request->id;
  11.399 +	uptodate = (res->status == BLKIF_RSP_OKAY);
  11.400 +
  11.401 +	BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
  11.402 +		res->status, res->operation, request->operation, res->id);
  11.403 +
  11.404 +	switch (request->operation) {
  11.405 +	case BLKIF_OP_READ:
  11.406 +	case BLKIF_OP_WRITE:
  11.407 +		if (unlikely(res->status != BLKIF_RSP_OKAY))
  11.408 +			BTERR("Bad return from device data "
  11.409 +				"request: %x\n", res->status);
  11.410 +		blktap_device_end_dequeued_request(dev, req, uptodate);
  11.411 +		break;
  11.412 +	default:
  11.413 +		BUG();
  11.414 +	}
  11.415 +
  11.416 +	blktap_request_free(tap, request);
  11.417 +}
  11.418 +
  11.419 +static int
  11.420 +blktap_prep_foreign(struct blktap *tap,
  11.421 +		    struct blktap_request *request,
  11.422 +		    blkif_request_t *blkif_req,
  11.423 +		    unsigned int seg, struct page *page,
  11.424 +		    struct blktap_grant_table *table)
  11.425 +{
  11.426 +	uint64_t ptep;
  11.427 +	uint32_t flags;
  11.428 +	struct page *tap_page;
  11.429 +	struct blktap_ring *ring;
  11.430 +	struct blkback_pagemap map;
  11.431 +	unsigned long uvaddr, kvaddr;
  11.432 +
  11.433 +	ring = &tap->ring;
  11.434 +	map  = blkback_pagemap_read(page);
  11.435 +	blkif_req->seg[seg].gref = map.gref;
  11.436 +
  11.437 +	uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
  11.438 +	kvaddr = request_to_kaddr(request, seg);
  11.439 +	flags  = GNTMAP_host_map |
  11.440 +		(request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
  11.441 +
  11.442 +	gnttab_set_map_op(&table->grants[table->cnt],
  11.443 +			  kvaddr, flags, map.gref, map.domid);
  11.444 +	table->cnt++;
  11.445 +
  11.446 +	/* enable chained tap devices */
  11.447 +	tap_page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
  11.448 +	set_page_private(tap_page, page_private(page));
  11.449 +	SetPageBlkback(tap_page);
  11.450 +
  11.451 +	if (xen_feature(XENFEAT_auto_translated_physmap))
  11.452 +		return 0;
  11.453 +
  11.454 +	if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
  11.455 +		BTERR("couldn't get a pte addr!\n");
  11.456 +		return -1;
  11.457 +	}
  11.458 +
  11.459 +	flags |= GNTMAP_application_map | GNTMAP_contains_pte;
  11.460 +	gnttab_set_map_op(&table->grants[table->cnt],
  11.461 +			  ptep, flags, map.gref, map.domid);
  11.462 +	table->cnt++;
  11.463 +
  11.464 +	return 0;
  11.465 +}
  11.466 +
  11.467 +static int
  11.468 +blktap_map_foreign(struct blktap *tap,
  11.469 +		   struct blktap_request *request,
  11.470 +		   blkif_request_t *blkif_req,
  11.471 +		   struct blktap_grant_table *table)
  11.472 +{
  11.473 +	struct page *page;
  11.474 +	int i, grant, err, usr_idx;
  11.475 +	struct blktap_ring *ring;
  11.476 +	unsigned long uvaddr, kvaddr, foreign_mfn;
  11.477 +
  11.478 +	if (!table->cnt)
  11.479 +		return 0;
  11.480 +
  11.481 +	err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
  11.482 +					table->grants, table->cnt);
  11.483 +	BUG_ON(err);
  11.484 +
  11.485 +	grant   = 0;
  11.486 +	usr_idx = request->usr_idx;
  11.487 +	ring    = &tap->ring;
  11.488 +
  11.489 +	for (i = 0; i < request->nr_pages; i++) {
  11.490 +		if (!blkif_req->seg[i].gref)
  11.491 +			continue;
  11.492 +
  11.493 +		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
  11.494 +		kvaddr = request_to_kaddr(request, i);
  11.495 +
  11.496 +		if (unlikely(table->grants[grant].status)) {
  11.497 +			BTERR("invalid kernel buffer: could not remap it\n");
  11.498 +			err |= 1;
  11.499 +			table->grants[grant].handle = INVALID_GRANT_HANDLE;
  11.500 +		}
  11.501 +
  11.502 +		request->handles[i].kernel = table->grants[grant].handle;
  11.503 +		foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
  11.504 +		grant++;
  11.505 +
  11.506 +		if (xen_feature(XENFEAT_auto_translated_physmap))
  11.507 +			goto done;
  11.508 +
  11.509 +		if (unlikely(table->grants[grant].status)) {
  11.510 +			BTERR("invalid user buffer: could not remap it\n");
  11.511 +			err |= 1;
  11.512 +			table->grants[grant].handle = INVALID_GRANT_HANDLE;
  11.513 +		}
  11.514 +
  11.515 +		request->handles[i].user = table->grants[grant].handle;
  11.516 +		grant++;
  11.517 +
  11.518 +	done:
  11.519 +		if (err)
  11.520 +			continue;
  11.521 +
  11.522 +		page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
  11.523 +
  11.524 +		if (!xen_feature(XENFEAT_auto_translated_physmap))
  11.525 +			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
  11.526 +					    FOREIGN_FRAME(foreign_mfn));
  11.527 +		else if (vm_insert_page(ring->vma, uvaddr, page))
  11.528 +			err |= 1;
  11.529 +
  11.530 +		BTDBG("pending_req: %p, seg: %d, page: %p, "
  11.531 +		      "kvaddr: 0x%08lx, khandle: %u, uvaddr: 0x%08lx, "
  11.532 +		      "uhandle: %u\n", request, i, page,
  11.533 +		      kvaddr, request->handles[i].kernel,		       
  11.534 +		      uvaddr, request->handles[i].user);
  11.535 +	}
  11.536 +
  11.537 +	return err;
  11.538 +}
  11.539 +
  11.540 +static void
  11.541 +blktap_map(struct blktap *tap,
  11.542 +	   struct blktap_request *request,
  11.543 +	   unsigned int seg, struct page *page)
  11.544 +{
  11.545 +	pte_t pte;
  11.546 +	int usr_idx;
  11.547 +	struct blktap_ring *ring;
  11.548 +	unsigned long uvaddr, kvaddr;
  11.549 +
  11.550 +	ring    = &tap->ring;
  11.551 +	usr_idx = request->usr_idx;
  11.552 +	uvaddr  = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
  11.553 +	kvaddr  = request_to_kaddr(request, seg);
  11.554 +
  11.555 +	pte = mk_pte(page, ring->vma->vm_page_prot);
  11.556 +	blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
  11.557 +	blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
  11.558 +
  11.559 +	set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
  11.560 +	request->handles[seg].kernel = INVALID_GRANT_HANDLE;
  11.561 +	request->handles[seg].user   = INVALID_GRANT_HANDLE;
  11.562 +
  11.563 +	BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
  11.564 +	      "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
  11.565 +	      uvaddr);
  11.566 +}
  11.567 +
  11.568 +static int
  11.569 +blktap_device_process_request(struct blktap *tap,
  11.570 +			      struct blktap_request *request,
  11.571 +			      struct request *req)
  11.572 +{
  11.573 +	struct bio *bio;
  11.574 +	struct page *page;
  11.575 +	struct bio_vec *bvec;
  11.576 +	int idx, usr_idx, err;
  11.577 +	struct blktap_ring *ring;
  11.578 +	struct blktap_grant_table table;
  11.579 +	unsigned int fsect, lsect, nr_sects;
  11.580 +	unsigned long offset, uvaddr, kvaddr;
  11.581 +	struct blkif_request blkif_req, *target;
  11.582 +
  11.583 +	err = -1;
  11.584 +	memset(&table, 0, sizeof(table));
  11.585 +
  11.586 +	if (!blktap_active(tap))
  11.587 +		goto out;
  11.588 +
  11.589 +	ring    = &tap->ring;
  11.590 +	usr_idx = request->usr_idx;
  11.591 +	blkif_req.id = usr_idx;
  11.592 +	blkif_req.sector_number = (blkif_sector_t)req->sector;
  11.593 +	blkif_req.handle = 0;
  11.594 +	blkif_req.operation = rq_data_dir(req) ?
  11.595 +		BLKIF_OP_WRITE : BLKIF_OP_READ;
  11.596 +
  11.597 +	request->id        = (unsigned long)req;
  11.598 +	request->operation = blkif_req.operation;
  11.599 +	request->status    = BLKTAP_REQUEST_PENDING;
  11.600 +	do_gettimeofday(&request->time);
  11.601 +
  11.602 +	nr_sects = 0;
  11.603 +	request->nr_pages = 0;
  11.604 +	blkif_req.nr_segments = 0;
  11.605 +	rq_for_each_bio(bio, req) {
  11.606 +		bio_for_each_segment(bvec, bio, idx) {
  11.607 +			BUG_ON(blkif_req.nr_segments ==
  11.608 +			       BLKIF_MAX_SEGMENTS_PER_REQUEST);
  11.609 +
  11.610 +			fsect     = bvec->bv_offset >> 9;
  11.611 +			lsect     = fsect + (bvec->bv_len >> 9) - 1;
  11.612 +			nr_sects += bvec->bv_len >> 9;
  11.613 +
  11.614 +			blkif_req.seg[blkif_req.nr_segments] =
  11.615 +				(struct blkif_request_segment) {
  11.616 +				.gref       = 0,
  11.617 +				.first_sect = fsect,
  11.618 +				.last_sect  = lsect };
  11.619 +
  11.620 +			if (PageBlkback(bvec->bv_page)) {
  11.621 +				/* foreign page -- use xen */
  11.622 +				if (blktap_prep_foreign(tap,
  11.623 +							request,
  11.624 +							&blkif_req,
  11.625 +							blkif_req.nr_segments,
  11.626 +							bvec->bv_page,
  11.627 +							&table))
  11.628 +					goto out;
  11.629 +			} else {
  11.630 +				/* do it the old fashioned way */
  11.631 +				blktap_map(tap,
  11.632 +					   request,
  11.633 +					   blkif_req.nr_segments,
  11.634 +					   bvec->bv_page);
  11.635 +			}
  11.636 +
  11.637 +			uvaddr = MMAP_VADDR(ring->user_vstart,
  11.638 +					    usr_idx, blkif_req.nr_segments);
  11.639 +			kvaddr = request_to_kaddr(request,
  11.640 +						  blkif_req.nr_segments);
  11.641 +			offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
  11.642 +			page   = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
  11.643 +			ring->foreign_map.map[offset] = page;
  11.644 +			SetPageReserved(page);
  11.645 +
  11.646 +			BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
  11.647 +			      uvaddr, page, __pa(kvaddr) >> PAGE_SHIFT);
  11.648 +			BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
  11.649 +			      "page: %p, kvaddr: 0x%08lx, uvaddr: 0x%08lx\n",
  11.650 +			      offset, request, blkif_req.nr_segments,
  11.651 +			      page, kvaddr, uvaddr);
  11.652 +
  11.653 +			blkif_req.nr_segments++;
  11.654 +			request->nr_pages++;
  11.655 +		}
  11.656 +	}
  11.657 +
  11.658 +	if (blktap_map_foreign(tap, request, &blkif_req, &table))
  11.659 +		goto out;
  11.660 +
  11.661 +	/* Finally, write the request message to the user ring. */
  11.662 +	target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
  11.663 +	memcpy(target, &blkif_req, sizeof(blkif_req));
  11.664 +	target->id = request->usr_idx;
  11.665 +	wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
  11.666 +	ring->ring.req_prod_pvt++;
  11.667 +
  11.668 +	if (rq_data_dir(req)) {
  11.669 +		tap->stats.st_wr_sect += nr_sects;
  11.670 +		tap->stats.st_wr_req++;
  11.671 +	} else {
  11.672 +		tap->stats.st_rd_sect += nr_sects;
  11.673 +		tap->stats.st_rd_req++;
  11.674 +	}
  11.675 +
  11.676 +	err = 0;
  11.677 +
  11.678 +out:
  11.679 +	if (err)
  11.680 +		blktap_device_fast_flush(tap, request);
  11.681 +	return err;
  11.682 +}
  11.683 +
  11.684 +#ifdef ENABLE_PASSTHROUGH
  11.685 +#define rq_for_each_bio_safe(_bio, _tmp, _req)				\
  11.686 +	if ((_req)->bio)						\
  11.687 +		for (_bio = (_req)->bio;				\
  11.688 +		     _bio && ((_tmp = _bio->bi_next) || 1);		\
  11.689 +		     _bio = _tmp)
  11.690 +
  11.691 +static void
  11.692 +blktap_device_forward_request(struct blktap *tap, struct request *req)
  11.693 +{
  11.694 +	struct bio *bio, *tmp;
  11.695 +	struct blktap_device *dev;
  11.696 +
  11.697 +	dev = &tap->device;
  11.698 +
  11.699 +	rq_for_each_bio_safe(bio, tmp, req) {
  11.700 +		bio->bi_bdev = dev->bdev;
  11.701 +		submit_bio(bio->bi_rw, bio);
  11.702 +	}
  11.703 +}
  11.704 +
  11.705 +static void
  11.706 +blktap_device_close_bdev(struct blktap *tap)
  11.707 +{
  11.708 +	struct blktap_device *dev;
  11.709 +
  11.710 +	dev = &tap->device;
  11.711 +
  11.712 +	if (dev->bdev)
  11.713 +		blkdev_put(dev->bdev);
  11.714 +
  11.715 +	dev->bdev = NULL;
  11.716 +	clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
  11.717 +}
  11.718 +
  11.719 +static int
  11.720 +blktap_device_open_bdev(struct blktap *tap, u32 pdev)
  11.721 +{
  11.722 +	struct block_device *bdev;
  11.723 +	struct blktap_device *dev;
  11.724 +
  11.725 +	dev = &tap->device;
  11.726 +
  11.727 +	bdev = open_by_devnum(pdev, FMODE_WRITE);
  11.728 +	if (IS_ERR(bdev)) {
  11.729 +		BTERR("opening device %x:%x failed: %ld\n",
  11.730 +		      MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
  11.731 +		return PTR_ERR(bdev);
  11.732 +	}
  11.733 +
  11.734 +	if (!bdev->bd_disk) {
  11.735 +		BTERR("device %x:%x doesn't exist\n",
  11.736 +		      MAJOR(pdev), MINOR(pdev));
  11.737 +		blkdev_put(bdev);
  11.738 +		return -ENOENT;
  11.739 +	}
  11.740 +
  11.741 +	dev->bdev = bdev;
  11.742 +	set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
  11.743 +
  11.744 +	/* TODO: readjust queue parameters */
  11.745 +
  11.746 +	BTINFO("set device %d to passthrough on %x:%x\n",
  11.747 +	       tap->minor, MAJOR(pdev), MINOR(pdev));
  11.748 +
  11.749 +	return 0;
  11.750 +}
  11.751 +
  11.752 +int
  11.753 +blktap_device_enable_passthrough(struct blktap *tap,
  11.754 +				 unsigned major, unsigned minor)
  11.755 +{
  11.756 +	u32 pdev;
  11.757 +	struct blktap_device *dev;
  11.758 +
  11.759 +	dev  = &tap->device;
  11.760 +	pdev = MKDEV(major, minor);
  11.761 +
  11.762 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  11.763 +		return -EINVAL;
  11.764 +
  11.765 +	if (dev->bdev) {
  11.766 +		if (pdev)
  11.767 +			return -EINVAL;
  11.768 +		blktap_device_close_bdev(tap);
  11.769 +		return 0;
  11.770 +	}
  11.771 +
  11.772 +	return blktap_device_open_bdev(tap, pdev);
  11.773 +}
  11.774 +#endif
  11.775 +
  11.776 +/*
  11.777 + * dev->lock held on entry
  11.778 + */
  11.779 +static void
  11.780 +blktap_device_run_queue(struct blktap *tap)
  11.781 +{
  11.782 +	int queued, err;
  11.783 +	request_queue_t *rq;
  11.784 +	struct request *req;
  11.785 +	struct blktap_ring *ring;
  11.786 +	struct blktap_device *dev;
  11.787 +	struct blktap_request *request;
  11.788 +
  11.789 +	queued = 0;
  11.790 +	ring   = &tap->ring;
  11.791 +	dev    = &tap->device;
  11.792 +	rq     = dev->gd->queue;
  11.793 +
  11.794 +	BTDBG("running queue for %d\n", tap->minor);
  11.795 +
  11.796 +	while ((req = elv_next_request(rq)) != NULL) {
  11.797 +		if (!blk_fs_request(req)) {
  11.798 +			end_request(req, 0);
  11.799 +			continue;
  11.800 +		}
  11.801 +
  11.802 +		if (blk_barrier_rq(req)) {
  11.803 +			end_request(req, 0);
  11.804 +			continue;
  11.805 +		}
  11.806 +
  11.807 +#ifdef ENABLE_PASSTHROUGH
  11.808 +		if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
  11.809 +			blkdev_dequeue_request(req);
  11.810 +			blktap_device_forward_request(tap, req);
  11.811 +			continue;
  11.812 +		}
  11.813 +#endif
  11.814 +
  11.815 +		if (RING_FULL(&ring->ring)) {
  11.816 +		wait:
  11.817 +			/* Avoid pointless unplugs. */
  11.818 +			blk_stop_queue(rq);
  11.819 +			blktap_defer(tap);
  11.820 +			break;
  11.821 +		}
  11.822 +
  11.823 +		request = blktap_request_allocate(tap);
  11.824 +		if (!request) {
  11.825 +			tap->stats.st_oo_req++;
  11.826 +			goto wait;
  11.827 +		}
  11.828 +
  11.829 +		BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) "
  11.830 +		      "buffer:%p [%s], pending: %p\n", req, tap->minor,
  11.831 +		      req->cmd, req->sector, req->current_nr_sectors,
  11.832 +		      req->nr_sectors, req->buffer,
  11.833 +		      rq_data_dir(req) ? "write" : "read", request);
  11.834 +
  11.835 +		blkdev_dequeue_request(req);
  11.836 +
  11.837 +		spin_unlock_irq(&dev->lock);
  11.838 +		down_read(&tap->tap_sem);
  11.839 +
  11.840 +		err = blktap_device_process_request(tap, request, req);
  11.841 +		if (!err)
  11.842 +			queued++;
  11.843 +		else {
  11.844 +			blktap_device_end_dequeued_request(dev, req, 0);
  11.845 +			blktap_request_free(tap, request);
  11.846 +		}
  11.847 +
  11.848 +		up_read(&tap->tap_sem);
  11.849 +		spin_lock_irq(&dev->lock);
  11.850 +	}
  11.851 +
  11.852 +	if (queued)
  11.853 +		blktap_ring_kick_user(tap);
  11.854 +}
  11.855 +
  11.856 +/*
  11.857 + * dev->lock held on entry
  11.858 + */
  11.859 +static void
  11.860 +blktap_device_do_request(request_queue_t *rq)
  11.861 +{
  11.862 +	struct request *req;
  11.863 +	struct blktap *tap;
  11.864 +	struct blktap_device *dev;
  11.865 +
  11.866 +	dev = rq->queuedata;
  11.867 +	if (!dev)
  11.868 +		goto fail;
  11.869 +
  11.870 +	tap = dev_to_blktap(dev);
  11.871 +	if (!blktap_active(tap))
  11.872 +		goto fail;
  11.873 +
  11.874 +	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
  11.875 +	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
  11.876 +		blktap_defer(tap);
  11.877 +		return;
  11.878 +	}
  11.879 +
  11.880 +	blktap_device_run_queue(tap);
  11.881 +	return;
  11.882 +
  11.883 +fail:
  11.884 +	while ((req = elv_next_request(rq))) {
  11.885 +		BTERR("device closed: failing secs %llu - %llu\n",
  11.886 +		      req->sector, req->sector + req->nr_sectors);
  11.887 +		end_request(req, 0);
  11.888 +	}
  11.889 +}
  11.890 +
  11.891 +void
  11.892 +blktap_device_restart(struct blktap *tap)
  11.893 +{
  11.894 +	struct blktap_device *dev;
  11.895 +
  11.896 +	dev = &tap->device;
  11.897 +	if (!dev->gd || !dev->gd->queue)
  11.898 +		return;
  11.899 +
  11.900 +	if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
  11.901 +		blktap_defer(tap);
  11.902 +		return;
  11.903 +	}
  11.904 +
  11.905 +	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
  11.906 +	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
  11.907 +		blktap_defer(tap);
  11.908 +		return;
  11.909 +	}
  11.910 +
  11.911 +	spin_lock_irq(&dev->lock);
  11.912 +
  11.913 +	/* Re-enable calldowns. */
  11.914 +	if (blk_queue_stopped(dev->gd->queue))
  11.915 +		blk_start_queue(dev->gd->queue);
  11.916 +
  11.917 +	/* Kick things off immediately. */
  11.918 +	blktap_device_do_request(dev->gd->queue);
  11.919 +
  11.920 +	spin_unlock_irq(&dev->lock);
  11.921 +}
  11.922 +
  11.923 +static void
  11.924 +blktap_device_configure(struct blktap *tap)
  11.925 +{
  11.926 +	struct request_queue *rq;
  11.927 +	struct blktap_device *dev = &tap->device;
  11.928 +
  11.929 +	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
  11.930 +		return;
  11.931 +
  11.933 +	rq  = dev->gd->queue;
  11.934 +
  11.935 +	spin_lock_irq(&dev->lock);
  11.936 +
  11.937 +	set_capacity(dev->gd, tap->params.capacity);
  11.938 +
  11.939 +	/* Hard sector size and max sectors impersonate the equiv. hardware. */
  11.940 +	blk_queue_hardsect_size(rq, tap->params.sector_size);
  11.941 +	blk_queue_max_sectors(rq, 512);
  11.942 +
  11.943 +	/* Each segment in a request is up to an aligned page in size. */
  11.944 +	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
  11.945 +	blk_queue_max_segment_size(rq, PAGE_SIZE);
  11.946 +
  11.947 +	/* Ensure a merged request will fit in a single I/O ring slot. */
  11.948 +	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
  11.949 +	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
  11.950 +
  11.951 +	/* Make sure buffer addresses are sector-aligned. */
  11.952 +	blk_queue_dma_alignment(rq, 511);
  11.953 +
  11.954 +	spin_unlock_irq(&dev->lock);
  11.955 +}
  11.956 +
  11.957 +int
  11.958 +blktap_device_resume(struct blktap *tap)
  11.959 +{
  11.960 +	int err;
  11.961 +
  11.962 +	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
  11.963 +		return -ENODEV;
  11.964 +
  11.965 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  11.966 +		return 0;
  11.967 +
  11.968 +	err = blktap_ring_resume(tap);
  11.969 +	if (err)
  11.970 +		return err;
  11.971 +
  11.972 +	/* device size may have changed */
  11.973 +	blktap_device_configure(tap);
  11.974 +
  11.975 +	BTDBG("restarting device\n");
  11.976 +	blktap_device_restart(tap);
  11.977 +
  11.978 +	return 0;
  11.979 +}
  11.980 +
  11.981 +int
  11.982 +blktap_device_pause(struct blktap *tap)
  11.983 +{
  11.984 +	unsigned long flags;
  11.985 +	struct blktap_device *dev = &tap->device;
  11.986 +
  11.987 +	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
  11.988 +		return -ENODEV;
  11.989 +
  11.990 +	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  11.991 +		return 0;
  11.992 +
  11.993 +	spin_lock_irqsave(&dev->lock, flags);
  11.994 +
  11.995 +	blk_stop_queue(dev->gd->queue);
  11.996 +	set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
  11.997 +
  11.998 +	spin_unlock_irqrestore(&dev->lock, flags);
  11.999 +
 11.1000 +	return blktap_ring_pause(tap);
 11.1001 +}
 11.1002 +
 11.1003 +int
 11.1004 +blktap_device_destroy(struct blktap *tap)
 11.1005 +{
 11.1006 +	struct blktap_device *dev = &tap->device;
 11.1007 +
 11.1008 +	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
 11.1009 +		return 0;
 11.1010 +
 11.1011 +	BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
 11.1012 +
 11.1013 +	if (dev->users)
 11.1014 +		return -EBUSY;
 11.1015 +
 11.1016 +	spin_lock_irq(&dev->lock);
 11.1017 +	/* No more blktap_device_do_request(). */
 11.1018 +	blk_stop_queue(dev->gd->queue);
 11.1019 +	clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
 11.1020 +	spin_unlock_irq(&dev->lock);
 11.1021 +
 11.1022 +#ifdef ENABLE_PASSTHROUGH
 11.1023 +	if (dev->bdev)
 11.1024 +		blktap_device_close_bdev(tap);
 11.1025 +#endif
 11.1026 +
 11.1027 +	del_gendisk(dev->gd);
 11.1028 +	blk_cleanup_queue(dev->gd->queue);
 11.1029 +	put_disk(dev->gd);
 11.1030 +
 11.1031 +	dev->gd = NULL;
 11.1032 +
 11.1033 +	wake_up(&tap->wq);
 11.1034 +
 11.1035 +	return 0;
 11.1036 +}
 11.1037 +
 11.1038 +int
 11.1039 +blktap_device_create(struct blktap *tap)
 11.1040 +{
 11.1041 +	int minor, err;
 11.1042 +	struct gendisk *gd;
 11.1043 +	struct request_queue *rq;
 11.1044 +	struct blktap_device *dev;
 11.1045 +
 11.1046 +	gd    = NULL;
 11.1047 +	rq    = NULL;
 11.1048 +	dev   = &tap->device;
 11.1049 +	minor = tap->minor;
 11.1050 +
 11.1051 +	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
 11.1052 +		return -EEXIST;
 11.1053 +
 11.1054 +	if (blktap_validate_params(tap, &tap->params))
 11.1055 +		return -EINVAL;
 11.1056 +
 11.1057 +	BTINFO("minor %d sectors %Lu sector-size %lu\n",
 11.1058 +	       minor, tap->params.capacity, tap->params.sector_size);
 11.1059 +
 11.1060 +	err = -ENODEV;
 11.1061 +
 11.1062 +	gd = alloc_disk(1);
 11.1063 +	if (!gd)
 11.1064 +		goto error;
 11.1065 +
 11.1066 +	if (minor < 26)
 11.1067 +		sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
 11.1068 +	else
 11.1069 +		sprintf(gd->disk_name, "tapdev%c%c",
 11.1070 +			'a' + ((minor / 26) - 1), 'a' + (minor % 26));
 11.1071 +
 11.1072 +	gd->major = blktap_device_major;
 11.1073 +	gd->first_minor = minor;
 11.1074 +	gd->fops = &blktap_device_file_operations;
 11.1075 +	gd->private_data = dev;
 11.1076 +
 11.1077 +	spin_lock_init(&dev->lock);
 11.1078 +	rq = blk_init_queue(blktap_device_do_request, &dev->lock);
 11.1079 +	if (!rq)
 11.1080 +		goto error;
 11.1081 +
 11.1082 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
 11.1083 +	elevator_init(rq, "noop");
 11.1084 +#else
 11.1085 +	elevator_init(rq, &elevator_noop);
 11.1086 +#endif
 11.1087 +
 11.1088 +	gd->queue     = rq;
 11.1089 +	rq->queuedata = dev;
 11.1090 +	dev->gd       = gd;
 11.1091 +
 11.1092 +	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
 11.1093 +	blktap_device_configure(tap);
 11.1094 +
 11.1095 +	add_disk(gd);
 11.1096 +
 11.1097 +	err = 0;
 11.1098 +	goto out;
 11.1099 +
 11.1100 + error:
 11.1101 +	if (gd)
 11.1102 +		put_disk(gd);
 11.1103 +	if (rq)
 11.1104 +		blk_cleanup_queue(rq);
 11.1105 +
 11.1106 + out:
 11.1107 +	BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
 11.1108 +	return err;
 11.1109 +}
 11.1110 +
 11.1111 +int
 11.1112 +blktap_device_init(int *maj)
 11.1113 +{
 11.1114 +	int major;
 11.1115 +
 11.1116 +	/* Dynamically allocate a major for this device */
 11.1117 +	major = register_blkdev(0, "tapdev");
 11.1118 +	if (major < 0) {
 11.1119 +		BTERR("Couldn't register blktap device\n");
 11.1120 +		return major;
 11.1121 +	}
 11.1122 +
 11.1123 +	blktap_device_major = *maj = major;
 11.1124 +	BTINFO("blktap device major %d\n", major);
 11.1125 +
 11.1126 +	return 0;
 11.1127 +}
 11.1128 +
 11.1129 +void
 11.1130 +blktap_device_free(void)
 11.1131 +{
 11.1132 +	if (blktap_device_major)
 11.1133 +		if (unregister_blkdev(blktap_device_major, "tapdev"))
 11.1134 +			BTERR("blktap device unregister failed\n");
 11.1135 +}
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/drivers/xen/blktap2/request.c	Tue May 26 11:23:16 2009 +0100
    12.3 @@ -0,0 +1,297 @@
    12.4 +#include <linux/spinlock.h>
    12.5 +#include <xen/balloon.h>
    12.6 +
    12.7 +#include "blktap.h"
    12.8 +
    12.9 +#define MAX_BUCKETS                      8
   12.10 +#define BUCKET_SIZE                      MAX_PENDING_REQS
   12.11 +
   12.12 +#define BLKTAP_POOL_CLOSING              1
   12.13 +
   12.14 +struct blktap_request_bucket;
   12.15 +
   12.16 +struct blktap_request_handle {
   12.17 +	int                              slot;
   12.18 +	uint8_t                          inuse;
   12.19 +	struct blktap_request            request;
   12.20 +	struct blktap_request_bucket    *bucket;
   12.21 +};
   12.22 +
   12.23 +struct blktap_request_bucket {
   12.24 +	atomic_t                         reqs_in_use;
   12.25 +	struct blktap_request_handle     handles[BUCKET_SIZE];
   12.26 +	struct page                    **foreign_pages;
   12.27 +};
   12.28 +
   12.29 +struct blktap_request_pool {
   12.30 +	spinlock_t                       lock;
   12.31 +	uint8_t                          status;
   12.32 +	struct list_head                 free_list;
   12.33 +	atomic_t                         reqs_in_use;
   12.34 +	wait_queue_head_t                wait_queue;
   12.35 +	struct blktap_request_bucket    *buckets[MAX_BUCKETS];
   12.36 +};
   12.37 +
   12.38 +static struct blktap_request_pool pool;
   12.39 +
   12.40 +static inline struct blktap_request_handle *
   12.41 +blktap_request_to_handle(struct blktap_request *req)
   12.42 +{
   12.43 +	return container_of(req, struct blktap_request_handle, request);
   12.44 +}
   12.45 +
   12.46 +static void
   12.47 +blktap_request_pool_init_request(struct blktap_request *request)
   12.48 +{
   12.49 +	int i;
   12.50 +
   12.51 +	request->usr_idx  = -1;
   12.52 +	request->nr_pages = 0;
   12.53 +	request->status   = BLKTAP_REQUEST_FREE;
   12.54 +	INIT_LIST_HEAD(&request->free_list);
   12.55 +	for (i = 0; i < ARRAY_SIZE(request->handles); i++) {
   12.56 +		request->handles[i].user   = INVALID_GRANT_HANDLE;
   12.57 +		request->handles[i].kernel = INVALID_GRANT_HANDLE;
   12.58 +	}
   12.59 +}
   12.60 +
   12.61 +static int
   12.62 +blktap_request_pool_allocate_bucket(void)
   12.63 +{
   12.64 +	int i, idx;
   12.65 +	unsigned long flags;
   12.66 +	struct blktap_request *request;
   12.67 +	struct blktap_request_handle *handle;
   12.68 +	struct blktap_request_bucket *bucket;
   12.69 +
   12.70 +	bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
   12.71 +	if (!bucket)
   12.72 +		goto fail;
   12.73 +
   12.74 +	bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
   12.75 +	if (!bucket->foreign_pages)
   12.76 +		goto fail;
   12.77 +
   12.78 +	spin_lock_irqsave(&pool.lock, flags);
   12.79 +
   12.80 +	idx = -1;
   12.81 +	for (i = 0; i < MAX_BUCKETS; i++) {
   12.82 +		if (!pool.buckets[i]) {
   12.83 +			idx = i;
   12.84 +			pool.buckets[idx] = bucket;
   12.85 +			break;
   12.86 +		}
   12.87 +	}
   12.88 +
   12.89 +	if (idx == -1) {
   12.90 +		spin_unlock_irqrestore(&pool.lock, flags);
   12.91 +		goto fail;
   12.92 +	}
   12.93 +
   12.94 +	for (i = 0; i < BUCKET_SIZE; i++) {
   12.95 +		handle  = bucket->handles + i;
   12.96 +		request = &handle->request;
   12.97 +
   12.98 +		handle->slot   = i;
   12.99 +		handle->inuse  = 0;
  12.100 +		handle->bucket = bucket;
  12.101 +
  12.102 +		blktap_request_pool_init_request(request);
  12.103 +		list_add_tail(&request->free_list, &pool.free_list);
  12.104 +	}
  12.105 +
  12.106 +	spin_unlock_irqrestore(&pool.lock, flags);
  12.107 +
  12.108 +	return 0;
  12.109 +
  12.110 +fail:
  12.111 +	if (bucket && bucket->foreign_pages)
  12.112 +		free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
  12.113 +	kfree(bucket);
  12.114 +	return -ENOMEM;
  12.115 +}
  12.116 +
  12.117 +static void
  12.118 +blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
  12.119 +{
  12.120 +	if (!bucket)
  12.121 +		return;
  12.122 +
  12.123 +	BTDBG("freeing bucket %p\n", bucket);
  12.124 +
  12.125 +	free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
  12.126 +	kfree(bucket);
  12.127 +}
  12.128 +
  12.129 +unsigned long
  12.130 +request_to_kaddr(struct blktap_request *req, int seg)
  12.131 +{
  12.132 +	struct blktap_request_handle *handle = blktap_request_to_handle(req);
  12.133 +	int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
  12.134 +	unsigned long pfn = page_to_pfn(handle->bucket->foreign_pages[idx]);
  12.135 +	return (unsigned long)pfn_to_kaddr(pfn);
  12.136 +}
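
The index arithmetic in request_to_kaddr() flattens a (slot, segment) pair into the bucket's foreign_pages array: each request slot owns a contiguous run of BLKIF_MAX_SEGMENTS_PER_REQUEST pages, so with the usual blkif value of 11, slot 3 / segment 2 selects foreign_pages[35]. A minimal stand-alone check (illustrative only):

	#include <stdio.h>

	#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11	/* value from the blkif interface */

	/* Same flattening as request_to_kaddr() above. */
	static int foreign_page_index(int slot, int seg)
	{
		return slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
	}

	int main(void)
	{
		printf("slot 3, seg 2 -> foreign_pages[%d]\n",
		       foreign_page_index(3, 2));	/* prints 35 */
		return 0;
	}
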
  12.137 +
  12.138 +int
  12.139 +blktap_request_pool_shrink(void)
  12.140 +{
  12.141 +	int i, err;
  12.142 +	unsigned long flags;
  12.143 +	struct blktap_request_bucket *bucket;
  12.144 +
  12.145 +	err = -EAGAIN;
  12.146 +
  12.147 +	spin_lock_irqsave(&pool.lock, flags);
  12.148 +
  12.149 +	/* always keep at least one bucket */
  12.150 +	for (i = 1; i < MAX_BUCKETS; i++) {
  12.151 +		bucket = pool.buckets[i];
  12.152 +		if (!bucket)
  12.153 +			continue;
  12.154 +
  12.155 +		if (atomic_read(&bucket->reqs_in_use))
  12.156 +			continue;
  12.157 +
  12.158 +		blktap_request_pool_free_bucket(bucket);
  12.159 +		pool.buckets[i] = NULL;
  12.160 +		err = 0;
  12.161 +		break;
  12.162 +	}
  12.163 +
  12.164 +	spin_unlock_irqrestore(&pool.lock, flags);
  12.165 +
  12.166 +	return err;
  12.167 +}
  12.168 +
  12.169 +int
  12.170 +blktap_request_pool_grow(void)
  12.171 +{
  12.172 +	return blktap_request_pool_allocate_bucket();
  12.173 +}
  12.174 +
  12.175 +struct blktap_request *
  12.176 +blktap_request_allocate(struct blktap *tap)
  12.177 +{
  12.178 +	int i;
  12.179 +	uint16_t usr_idx;
  12.180 +	unsigned long flags;
  12.181 +	struct blktap_request *request;
  12.182 +
  12.183 +	usr_idx = -1;
  12.184 +	request = NULL;
  12.185 +
  12.186 +	spin_lock_irqsave(&pool.lock, flags);
  12.187 +
  12.188 +	if (pool.status == BLKTAP_POOL_CLOSING)
  12.189 +		goto out;
  12.190 +
  12.191 +	for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++)
  12.192 +		if (!tap->pending_requests[i]) {
  12.193 +			usr_idx = i;
  12.194 +			break;
  12.195 +		}
  12.196 +
  12.197 +	if (usr_idx == (uint16_t)-1)
  12.198 +		goto out;
  12.199 +
  12.200 +	if (!list_empty(&pool.free_list)) {
  12.201 +		request = list_entry(pool.free_list.next,
  12.202 +				     struct blktap_request, free_list);
  12.203 +		list_del(&request->free_list);
  12.204 +	}
  12.205 +
  12.206 +	if (request) {
  12.207 +		struct blktap_request_handle *handle;
  12.208 +
  12.209 +		atomic_inc(&pool.reqs_in_use);
  12.210 +
  12.211 +		handle = blktap_request_to_handle(request);
  12.212 +		atomic_inc(&handle->bucket->reqs_in_use);
  12.213 +		handle->inuse = 1;
  12.214 +
  12.215 +		request->usr_idx = usr_idx;
  12.216 +
  12.217 +		tap->pending_requests[usr_idx] = request;
  12.218 +		tap->pending_cnt++;
  12.219 +	}
  12.220 +
  12.221 +out:
  12.222 +	spin_unlock_irqrestore(&pool.lock, flags);
  12.223 +	return request;
  12.224 +}
  12.225 +
  12.226 +void
  12.227 +blktap_request_free(struct blktap *tap, struct blktap_request *request)
  12.228 +{
  12.229 +	int free;
  12.230 +	unsigned long flags;
  12.231 +	struct blktap_request_handle *handle;
  12.232 +
  12.233 +	BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
  12.234 +	handle = blktap_request_to_handle(request);
  12.235 +
  12.236 +	spin_lock_irqsave(&pool.lock, flags);
  12.237 +
  12.238 +	handle->inuse = 0;
  12.239 +	tap->pending_requests[request->usr_idx] = NULL;
  12.240 +	blktap_request_pool_init_request(request);
  12.241 +	list_add(&request->free_list, &pool.free_list);
  12.242 +	atomic_dec(&handle->bucket->reqs_in_use);
  12.243 +	free = atomic_dec_and_test(&pool.reqs_in_use);
  12.244 +
  12.245 +	spin_unlock_irqrestore(&pool.lock, flags);
  12.246 +
  12.247 +	if (--tap->pending_cnt == 0)
  12.248 +		wake_up_interruptible(&tap->wq);
  12.249 +
  12.250 +	if (free)
  12.251 +		wake_up(&pool.wait_queue);
  12.252 +}
  12.253 +
  12.254 +void
  12.255 +blktap_request_pool_free(void)
  12.256 +{
  12.257 +	int i;
  12.258 +	unsigned long flags;
  12.259 +
  12.260 +	spin_lock_irqsave(&pool.lock, flags);
  12.261 +
  12.262 +	pool.status = BLKTAP_POOL_CLOSING;
  12.263 +	while (atomic_read(&pool.reqs_in_use)) {
  12.264 +		spin_unlock_irqrestore(&pool.lock, flags);
  12.265 +		wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
  12.266 +		spin_lock_irqsave(&pool.lock, flags);
  12.267 +	}
  12.268 +
  12.269 +	for (i = 0; i < MAX_BUCKETS; i++) {
  12.270 +		blktap_request_pool_free_bucket(pool.buckets[i]);
  12.271 +		pool.buckets[i] = NULL;
  12.272 +	}
  12.273 +
  12.274 +	spin_unlock_irqrestore(&pool.lock, flags);
  12.275 +}
  12.276 +
  12.277 +int
  12.278 +blktap_request_pool_init(void)
  12.279 +{
  12.280 +	int i, err;
  12.281 +
  12.282 +	memset(&pool, 0, sizeof(pool));
  12.283 +
  12.284 +	spin_lock_init(&pool.lock);
  12.285 +	INIT_LIST_HEAD(&pool.free_list);
  12.286 +	atomic_set(&pool.reqs_in_use, 0);
  12.287 +	init_waitqueue_head(&pool.wait_queue);
  12.288 +
  12.289 +	for (i = 0; i < 2; i++) {
  12.290 +		err = blktap_request_pool_allocate_bucket();
  12.291 +		if (err)
  12.292 +			goto fail;
  12.293 +	}
  12.294 +
  12.295 +	return 0;
  12.296 +
  12.297 +fail:
  12.298 +	blktap_request_pool_free();
  12.299 +	return err;
  12.300 +}
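
The pool above is sized at init time (two buckets) and grown or shrunk only explicitly; blktap_request_allocate() neither sleeps nor allocates on the data path, returning NULL both when all buckets are busy and when the tap already has MAX_PENDING_REQS requests outstanding. A sketch of the calling discipline, mirroring what blktap_device_run_queue() does — the function below is a hypothetical caller, not part of the patch:

	#include <linux/errno.h>

	#include "blktap.h"

	/* Hypothetical caller: on allocation failure the tap is deferred
	 * and retried from blktap_run_deferred() rather than blocking. */
	static int example_submit(struct blktap *tap)
	{
		struct blktap_request *request;

		request = blktap_request_allocate(tap);
		if (!request) {
			blktap_defer(tap);
			return -EBUSY;
		}

		/* ... map the bio pages and write the message to the ring ... */

		/* on error, return the slot to the pool and the usr_idx table: */
		blktap_request_free(tap, request);
		return 0;
	}
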
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/drivers/xen/blktap2/ring.c	Tue May 26 11:23:16 2009 +0100
    13.3 @@ -0,0 +1,613 @@
    13.4 +#include <linux/module.h>
    13.5 +#include <linux/signal.h>
    13.6 +
    13.7 +#include "blktap.h"
    13.8 +
    13.9 +static int blktap_ring_major;
   13.10 +
   13.11 +static inline struct blktap *
   13.12 +vma_to_blktap(struct vm_area_struct *vma)
   13.13 +{
   13.14 +	struct vm_foreign_map *m = vma->vm_private_data;
   13.15 +	struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
   13.16 +	return container_of(r, struct blktap, ring);
   13.17 +}
   13.18 +
   13.19 +/*
   13.20 + * BLKTAP - the first RING_PAGES pages of the mmap area are reserved
   13.21 + * for the shared memory ring; the data pages follow them.
   13.22 + */
   13.23 +#define RING_PAGES 1
   13.24 +
   13.25 +static int
   13.26 +blktap_read_ring(struct blktap *tap)
   13.27 +{
   13.28 +	/* This is called to read responses from the ring. */
   13.29 +	int usr_idx;
   13.30 +	RING_IDX rc, rp;
   13.31 +	blkif_response_t res;
   13.32 +	struct blktap_ring *ring;
   13.33 +	struct blktap_request *request;
   13.34 +
   13.35 +	down_read(&tap->tap_sem);
   13.36 +
   13.37 +	ring = &tap->ring;
   13.38 +	if (!ring->vma) {
   13.39 +		up_read(&tap->tap_sem);
   13.40 +		return 0;
   13.41 +	}
   13.42 +
   13.43 +	/* for each outstanding message on the ring  */
   13.44 +	rp = ring->ring.sring->rsp_prod;
   13.45 +	rmb();
   13.46 +
   13.47 +	for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
   13.48 +		memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
   13.49 +		mb(); /* rsp_cons read by RING_FULL() in blktap_device_run_queue(). */
   13.50 +		++ring->ring.rsp_cons;
   13.51 +
   13.52 +		usr_idx = (int)res.id;
   13.53 +		if (usr_idx >= MAX_PENDING_REQS ||
   13.54 +		    !tap->pending_requests[usr_idx]) {
   13.55 +			BTWARN("Request %d/%d invalid [%x], tapdisk %d, vma %p\n",
   13.56 +			       rc, rp, usr_idx, tap->pid, ring->vma);
   13.57 +			continue;
   13.58 +		}
   13.59 +
   13.60 +		request = tap->pending_requests[usr_idx];
   13.61 +		BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
   13.62 +		blktap_device_finish_request(tap, &res, request);
   13.63 +	}
   13.64 +
   13.65 +	up_read(&tap->tap_sem);
   13.66 +
   13.67 +	blktap_run_deferred();
   13.68 +
   13.69 +	return 0;
   13.70 +}
   13.71 +
   13.72 +static struct page *
   13.73 +blktap_ring_nopage(struct vm_area_struct *vma,
   13.74 +		   unsigned long address, int *type)
   13.75 +{
   13.76 +	/*
   13.77 +	 * If the page has not been mapped in by the driver, return
   13.78 +	 * NOPAGE_SIGBUS to the faulting process.
   13.79 +	 */
   13.80 +
   13.81 +	return NOPAGE_SIGBUS;
   13.82 +}
   13.83 +
   13.84 +static pte_t
   13.85 +blktap_ring_clear_pte(struct vm_area_struct *vma,
   13.86 +		      unsigned long uvaddr,
   13.87 +		      pte_t *ptep, int is_fullmm)
   13.88 +{
   13.89 +	pte_t copy;
   13.90 +	struct blktap *tap;
   13.91 +	unsigned long kvaddr;
   13.92 +	struct page **map, *page;
   13.93 +	struct blktap_ring *ring;
   13.94 +	struct blktap_request *request;
   13.95 +	struct grant_handle_pair *khandle;
   13.96 +	struct gnttab_unmap_grant_ref unmap[2];
   13.97 +	int offset, seg, usr_idx, count = 0;
   13.98 +
   13.99 +	tap  = vma_to_blktap(vma);
  13.100 +	ring = &tap->ring;
  13.101 +	map  = ring->foreign_map.map;
  13.102 +	BUG_ON(!map);	/* TODO: should this be an if statement? */
  13.103 +
  13.104 +	/*
  13.105 +	 * Zap entry if the address is before the start of the grant
  13.106 +	 * mapped region.
  13.107 +	 */
  13.108 +	if (uvaddr < ring->user_vstart)
  13.109 +		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
  13.110 +					       ptep, is_fullmm);
  13.111 +
  13.112 +	offset  = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
  13.113 +	usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
  13.114 +	seg     = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
  13.115 +
  13.116 +	offset  = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
  13.117 +	page    = map[offset];
  13.118 +	if (page) {
  13.119 +		ClearPageReserved(page);
  13.120 +		if (PageBlkback(page)) {
  13.121 +			ClearPageBlkback(page);
  13.122 +			set_page_private(page, 0);
  13.123 +		}
  13.124 +	}
  13.125 +	map[offset] = NULL;
  13.126 +
  13.127 +	request = tap->pending_requests[usr_idx];
  13.128 +	kvaddr  = request_to_kaddr(request, seg);
  13.129 +	khandle = request->handles + seg;
  13.130 +
  13.131 +	if (khandle->kernel != INVALID_GRANT_HANDLE) {
  13.132 +		gnttab_set_unmap_op(&unmap[count], kvaddr, 
  13.133 +				    GNTMAP_host_map, khandle->kernel);
  13.134 +		count++;
  13.135 +
  13.136 +		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, 
  13.137 +				    INVALID_P2M_ENTRY);
  13.138 +	}
  13.139 +
  13.140 +
  13.141 +	if (khandle->user != INVALID_GRANT_HANDLE) {
  13.142 +		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
  13.143 +
  13.144 +		copy = *ptep;
  13.145 +		gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), 
  13.146 +				    GNTMAP_host_map 
  13.147 +				    | GNTMAP_application_map 
  13.148 +				    | GNTMAP_contains_pte,
  13.149 +				    khandle->user);
  13.150 +		count++;
  13.151 +	} else
  13.152 +		copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
  13.153 +					       is_fullmm);
  13.154 +
  13.155 +	if (count)
  13.156 +		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
  13.157 +					      unmap, count))
  13.158 +			BUG();
  13.159 +
  13.160 +	khandle->kernel = INVALID_GRANT_HANDLE;
  13.161 +	khandle->user   = INVALID_GRANT_HANDLE;
  13.162 +
  13.163 +	return copy;
  13.164 +}
  13.165 +
  13.166 +static void
  13.167 +blktap_ring_vm_unmap(struct vm_area_struct *vma)
  13.168 +{
  13.169 +	struct blktap *tap = vma_to_blktap(vma);
  13.170 +
  13.171 +	down_write(&tap->tap_sem);
  13.172 +	clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
  13.173 +	clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
  13.174 +	clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
  13.175 +	up_write(&tap->tap_sem);
  13.176 +}
  13.177 +
  13.178 +static void
  13.179 +blktap_ring_vm_close(struct vm_area_struct *vma)
  13.180 +{
  13.181 +	struct blktap *tap = vma_to_blktap(vma);
  13.182 +	struct blktap_ring *ring = &tap->ring;
  13.183 +
  13.184 +	blktap_ring_vm_unmap(vma);                 /* fail future requests */
  13.185 +	blktap_device_fail_pending_requests(tap);  /* fail pending requests */
  13.186 +	blktap_device_restart(tap);                /* fail deferred requests */
  13.187 +
  13.188 +	down_write(&tap->tap_sem);
  13.189 +
  13.190 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
  13.191 +
  13.192 +	kfree(ring->foreign_map.map);
  13.193 +	ring->foreign_map.map = NULL;
  13.194 +
  13.195 +	/* Free the ring page. */
  13.196 +	ClearPageReserved(virt_to_page(ring->ring.sring));
  13.197 +	free_page((unsigned long)ring->ring.sring);
  13.198 +
  13.199 +	BTINFO("unmapping ring %d\n", tap->minor);
  13.200 +	ring->ring.sring = NULL;
  13.201 +	ring->vma = NULL;
  13.202 +
  13.203 +	up_write(&tap->tap_sem);
  13.204 +
  13.205 +	wake_up(&tap->wq);
  13.206 +}
  13.207 +
  13.208 +static struct vm_operations_struct blktap_ring_vm_operations = {
  13.209 +	.close    = blktap_ring_vm_close,
  13.210 +	.unmap    = blktap_ring_vm_unmap,
  13.211 +	.nopage   = blktap_ring_nopage,
  13.212 +	.zap_pte  = blktap_ring_clear_pte,
  13.213 +};
  13.214 +
  13.215 +static int
  13.216 +blktap_ring_open(struct inode *inode, struct file *filp)
  13.217 +{
  13.218 +	int idx;
  13.219 +	struct blktap *tap;
  13.220 +
  13.221 +	idx = iminor(inode);
  13.222 +	if (idx < 0 || idx >= MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
  13.223 +		BTERR("unable to open device blktap%d\n", idx);
  13.224 +		return -ENODEV;
  13.225 +	}
  13.226 +
  13.227 +	tap = blktaps[idx];
  13.228 +
  13.229 +	BTINFO("opening device blktap%d\n", idx);
  13.230 +
  13.231 +	if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
  13.232 +		return -ENODEV;
  13.233 +
  13.234 +	/* Only one process can access the ring at a time */
  13.235 +	if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
  13.236 +		return -EBUSY;
  13.237 +
  13.238 +	filp->private_data = tap;
  13.239 +	BTINFO("opened device %d\n", tap->minor);
  13.240 +
  13.241 +	return 0;
  13.242 +}
  13.243 +
  13.244 +static int
  13.245 +blktap_ring_release(struct inode *inode, struct file *filp)
  13.246 +{
  13.247 +	struct blktap *tap = filp->private_data;
  13.248 +
  13.249 +	BTINFO("freeing device %d\n", tap->minor);
  13.250 +	clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
  13.251 +	filp->private_data = NULL;
  13.252 +	wake_up(&tap->wq);	
  13.253 +	return 0;
  13.254 +}
  13.255 +
  13.256 +/* Note on mmap:
  13.257 + * We need to map pages to user space in a way that will allow the block
  13.258 + * subsystem to set up direct IO to them.  This couldn't be done before,
  13.259 + * because there isn't really a sane way to translate a user virtual address
  13.260 + * down to a physical address when the page belongs to another domain.
  13.261 + *
  13.262 + * My first approach was to map the page into kernel memory, add an entry
  13.263 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
  13.264 + * and then attempt to map that page up to user space.  This is disallowed
  13.265 + * by Xen, though, which realizes that we don't really own the machine frame
  13.266 + * underlying the physical page.
  13.267 + *
  13.268 + * The new approach is to provide explicit support for this in Xen Linux.
  13.269 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
  13.270 + * mapped in from other VMs.  vma->vm_private_data is set up as a mapping
  13.271 + * from pages to actual page structs.  There is a new clause in get_user_pages
  13.272 + * that does the right thing for this sort of mapping.
  13.273 + */
  13.274 +static int
  13.275 +blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
  13.276 +{
  13.277 +	int size, err;
  13.278 +	struct page **map;
  13.279 +	struct blktap *tap;
  13.280 +	blkif_sring_t *sring;
  13.281 +	struct blktap_ring *ring;
  13.282 +
  13.283 +	tap   = filp->private_data;
  13.284 +	ring  = &tap->ring;
  13.285 +	map   = NULL;
  13.286 +	sring = NULL;
  13.287 +
  13.288 +	if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
  13.289 +		return -ENOMEM;
  13.290 +
  13.291 +	size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
  13.292 +	if (size != (MMAP_PAGES + RING_PAGES)) {
  13.293 +		BTERR("you _must_ map exactly %lu pages!\n",
  13.294 +		      MMAP_PAGES + RING_PAGES);
         +		clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
  13.295 +		return -EAGAIN;
  13.296 +	}
  13.297 +
  13.298 +	/* Allocate the fe ring. */
  13.299 +	sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
  13.300 +	if (!sring) {
  13.301 +		BTERR("Couldn't alloc sring.\n");
  13.302 +		goto fail_mem;
  13.303 +	}
  13.304 +
  13.305 +	map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
  13.306 +	if (!map) {
  13.307 +		BTERR("Couldn't alloc VM_FOREIGN map.\n");
  13.308 +		goto fail_mem;
  13.309 +	}
  13.310 +
  13.311 +	SetPageReserved(virt_to_page(sring));
  13.312 +    
  13.313 +	SHARED_RING_INIT(sring);
  13.314 +	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
  13.315 +
  13.316 +	ring->ring_vstart = vma->vm_start;
  13.317 +	ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
  13.318 +
  13.319 +	/* Map the ring pages to the start of the region and reserve it. */
  13.320 +	if (xen_feature(XENFEAT_auto_translated_physmap))
  13.321 +		err = vm_insert_page(vma, vma->vm_start,
  13.322 +				     virt_to_page(ring->ring.sring));
  13.323 +	else
  13.324 +		err = remap_pfn_range(vma, vma->vm_start,
  13.325 +				      __pa(ring->ring.sring) >> PAGE_SHIFT,
  13.326 +				      PAGE_SIZE, vma->vm_page_prot);
  13.327 +	if (err) {
  13.328 +		BTERR("Mapping user ring failed: %d\n", err);
  13.329 +		goto fail;
  13.330 +	}
  13.331 +
  13.332 +	/* Mark this VM as containing foreign pages, and set up mappings. */
  13.333 +	ring->foreign_map.map = map;
  13.334 +	vma->vm_private_data = &ring->foreign_map;
  13.335 +	vma->vm_flags |= VM_FOREIGN;
  13.336 +	vma->vm_flags |= VM_DONTCOPY;
  13.337 +	vma->vm_flags |= VM_RESERVED;
  13.338 +	vma->vm_ops = &blktap_ring_vm_operations;
  13.339 +
  13.340 +#ifdef CONFIG_X86
  13.341 +	vma->vm_mm->context.has_foreign_mappings = 1;
  13.342 +#endif
  13.343 +
  13.344 +	tap->pid = current->pid;
  13.345 +	BTINFO("blktap: mapping pid is %d\n", tap->pid);
  13.346 +
  13.347 +	ring->vma = vma;
  13.348 +	return 0;
  13.349 +
  13.350 + fail:
  13.351 +	/* Clear any active mappings. */
  13.352 +	zap_page_range(vma, vma->vm_start, 
  13.353 +		       vma->vm_end - vma->vm_start, NULL);
  13.354 +	ClearPageReserved(virt_to_page(sring));
  13.355 + fail_mem:
  13.356 +	free_page((unsigned long)sring);
  13.357 +	kfree(map);
         +	clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
  13.358 +
  13.359 +	return -ENOMEM;
  13.360 +}
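
Seen from user space, the mapping that blktap_ring_mmap() enforces is one shared ring page followed by the grant-mapped data area. A hedged sketch of how a tapdisk-like process would establish it — the device path and the MMAP_PAGES value below are illustrative; the real value comes from blktap.h:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define RING_PAGES 1
	#define MMAP_PAGES 352	/* illustrative; must match the kernel's value */

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		void *addr;
		int fd;

		fd = open("/dev/xen/blktap-2/blktap0", O_RDWR);	/* path assumed */
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* blktap_ring_mmap() insists on exactly RING_PAGES + MMAP_PAGES
		 * pages: the sring occupies the first page, the grant-mapped
		 * data area (ring->user_vstart) follows it. */
		addr = mmap(NULL, (RING_PAGES + MMAP_PAGES) * psz,
			    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		printf("sring at %p, data area at %p\n",
		       addr, (char *)addr + RING_PAGES * psz);
		return 0;
	}
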
  13.361 +
  13.362 +static inline void
  13.363 +blktap_ring_set_message(struct blktap *tap, int msg)
  13.364 +{
  13.365 +	struct blktap_ring *ring = &tap->ring;
  13.366 +
  13.367 +	down_read(&tap->tap_sem);
  13.368 +	if (ring->ring.sring)
  13.369 +		ring->ring.sring->pad[0] = msg;
  13.370 +	up_read(&tap->tap_sem);
  13.371 +}
  13.372 +
  13.373 +static int
  13.374 +blktap_ring_ioctl(struct inode *inode, struct file *filp,
  13.375 +		  unsigned int cmd, unsigned long arg)
  13.376 +{
  13.377 +	struct blktap_params params;
  13.378 +	struct blktap *tap = filp->private_data;
  13.379 +
  13.380 +	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
  13.381 +
  13.382 +	switch(cmd) {
  13.383 +	case BLKTAP2_IOCTL_KICK_FE:
  13.384 +		/* There are fe messages to process. */
  13.385 +		return blktap_read_ring(tap);
  13.386 +
  13.387 +	case BLKTAP2_IOCTL_CREATE_DEVICE:
  13.388 +		if (!arg)
  13.389 +			return -EINVAL;
  13.390 +
  13.391 +		if (copy_from_user(&params, (struct blktap_params __user *)arg,
  13.392 +				   sizeof(params))) {
  13.393 +			BTERR("failed to get params\n");
  13.394 +			return -EFAULT;
  13.395 +		}
  13.396 +
  13.397 +		if (blktap_validate_params(tap, &params)) {
  13.398 +			BTERR("invalid params\n");
  13.399 +			return -EINVAL;
  13.400 +		}
  13.401 +
  13.402 +		tap->params = params;
  13.403 +		return blktap_device_create(tap);
  13.404 +
  13.405 +	case BLKTAP2_IOCTL_SET_PARAMS:
  13.406 +		if (!arg)
  13.407 +			return -EINVAL;
  13.408 +
  13.409 +		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  13.410 +			return -EINVAL;
  13.411 +
  13.412 +		if (copy_from_user(&params, (struct blktap_params __user *)arg,
  13.413 +				   sizeof(params))) {
  13.414 +			BTERR("failed to get params\n");
  13.415 +			return -EFAULT;
  13.416 +		}
  13.417 +
  13.418 +		if (blktap_validate_params(tap, &params)) {
  13.419 +			BTERR("invalid params\n");
  13.420 +			return -EINVAL;
  13.421 +		}
  13.422 +
  13.423 +		tap->params = params;
  13.424 +		return 0;
  13.425 +
  13.426 +	case BLKTAP2_IOCTL_PAUSE:
  13.427 +		if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
  13.428 +			return -EINVAL;
  13.429 +
  13.430 +		set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
  13.431 +		clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
  13.432 +
  13.433 +		blktap_ring_set_message(tap, 0);
  13.434 +		wake_up_interruptible(&tap->wq);
  13.435 +
  13.436 +		return 0;
  13.437 +
  13.438 +
  13.439 +	case BLKTAP2_IOCTL_REOPEN:
  13.440 +		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  13.441 +			return -EINVAL;
  13.442 +
  13.443 +		if (!arg)
  13.444 +			return -EINVAL;
  13.445 +
  13.446 +		if (copy_to_user((char __user *)arg,
  13.447 +				 tap->params.name,
  13.448 +				 strlen(tap->params.name) + 1))
  13.449 +			return -EFAULT;
  13.450 +
  13.451 +		blktap_ring_set_message(tap, 0);
  13.452 +		wake_up_interruptible(&tap->wq);
  13.453 +
  13.454 +		return 0;
  13.455 +
  13.456 +	case BLKTAP2_IOCTL_RESUME:
  13.457 +		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  13.458 +			return -EINVAL;
  13.459 +
  13.460 +		tap->ring.response = (int)arg;
  13.461 +		if (!tap->ring.response)
  13.462 +			clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
  13.463 +
  13.464 +		blktap_ring_set_message(tap, 0);
  13.465 +		wake_up_interruptible(&tap->wq);
  13.466 +
  13.467 +		return 0;
  13.468 +	}
  13.469 +
  13.470 +	return -ENOIOCTLCMD;
  13.471 +}
  13.472 +
  13.473 +static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
  13.474 +{
  13.475 +	struct blktap *tap = filp->private_data;
  13.476 +	struct blktap_ring *ring = &tap->ring;
  13.477 +
  13.478 +	poll_wait(filp, &ring->poll_wait, wait);
  13.479 +	if (ring->ring.sring &&
         +	    (ring->ring.sring->pad[0] != 0 ||
  13.480 +	     ring->ring.req_prod_pvt != ring->ring.sring->req_prod)) {
  13.481 +		RING_PUSH_REQUESTS(&ring->ring);
  13.482 +		return POLLIN | POLLRDNORM;
  13.483 +	}
  13.484 +
  13.485 +	return 0;
  13.486 +}
  13.487 +
  13.488 +static struct file_operations blktap_ring_file_operations = {
  13.489 +	.owner    = THIS_MODULE,
  13.490 +	.open     = blktap_ring_open,
  13.491 +	.release  = blktap_ring_release,
  13.492 +	.ioctl    = blktap_ring_ioctl,
  13.493 +	.mmap     = blktap_ring_mmap,
  13.494 +	.poll     = blktap_ring_poll,
  13.495 +};
  13.496 +
  13.497 +void
  13.498 +blktap_ring_kick_user(struct blktap *tap)
  13.499 +{
  13.500 +	wake_up_interruptible(&tap->ring.poll_wait);
  13.501 +}
  13.502 +
  13.503 +int
  13.504 +blktap_ring_resume(struct blktap *tap)
  13.505 +{
  13.506 +	int err;
  13.507 +	struct blktap_ring *ring = &tap->ring;
  13.508 +
  13.509 +	if (!blktap_active(tap))
  13.510 +		return -ENODEV;
  13.511 +
  13.512 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  13.513 +		return -EINVAL;
  13.514 +
  13.515 +	/* set shared flag for resume */
  13.516 +	ring->response = 0;
  13.517 +
  13.518 +	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
  13.519 +	blktap_ring_kick_user(tap);
  13.520 +
  13.521 +	wait_event_interruptible(tap->wq, ring->response ||
  13.522 +				 !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
  13.523 +
  13.524 +	err = ring->response;
  13.525 +	ring->response = 0;
  13.526 +
  13.527 +	BTDBG("err: %d\n", err);
  13.528 +
  13.529 +	if (err)
  13.530 +		return err;
  13.531 +
  13.532 +	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  13.533 +		return -EAGAIN;
  13.534 +
  13.535 +	return 0;
  13.536 +}
  13.537 +
  13.538 +int
  13.539 +blktap_ring_pause(struct blktap *tap)
  13.540 +{
  13.541 +	if (!blktap_active(tap))
  13.542 +		return -ENODEV;
  13.543 +
  13.544 +	if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
  13.545 +		return -EINVAL;
  13.546 +
  13.547 +	BTDBG("draining queue\n");
  13.548 +	wait_event_interruptible(tap->wq, !tap->pending_cnt);
  13.549 +	if (tap->pending_cnt)
  13.550 +		return -EAGAIN;
  13.551 +
  13.552 +	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
  13.553 +	blktap_ring_kick_user(tap);
  13.554 +
  13.555 +	BTDBG("waiting for tapdisk response\n");
  13.556 +	wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
  13.557 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
  13.558 +		return -EAGAIN;
  13.559 +
  13.560 +	return 0;
  13.561 +}
  13.562 +
  13.563 +int
  13.564 +blktap_ring_destroy(struct blktap *tap)
  13.565 +{
  13.566 +	if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
  13.567 +	    !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
  13.568 +		return 0;
  13.569 +
  13.570 +	BTDBG("sending tapdisk close message\n");
  13.571 +	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
  13.572 +	blktap_ring_kick_user(tap);
  13.573 +
  13.574 +	return -EAGAIN;
  13.575 +}
  13.576 +
  13.577 +static void
  13.578 +blktap_ring_initialize(struct blktap_ring *ring, int minor)
  13.579 +{
  13.580 +	memset(ring, 0, sizeof(*ring));
  13.581 +	init_waitqueue_head(&ring->poll_wait);
  13.582 +	ring->devno = MKDEV(blktap_ring_major, minor);
  13.583 +}
  13.584 +
  13.585 +int
  13.586 +blktap_ring_create(struct blktap *tap)
  13.587 +{
  13.588 +	struct blktap_ring *ring = &tap->ring;
  13.589 +	blktap_ring_initialize(ring, tap->minor);
  13.590 +	return blktap_sysfs_create(tap);
  13.591 +}
  13.592 +
  13.593 +int
  13.594 +blktap_ring_init(int *major)
  13.595 +{
  13.596 +	int err;
  13.597 +
  13.598 +	err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
  13.599 +	if (err < 0) {
  13.600 +		BTERR("error registering blktap ring device: %d\n", err);
  13.601 +		return err;
  13.602 +	}
  13.603 +
  13.604 +	blktap_ring_major = *major = err;
  13.605 +	BTINFO("blktap ring major: %d\n", blktap_ring_major);
  13.606 +	return 0;
  13.607 +}
  13.608 +
  13.609 +int
  13.610 +blktap_ring_free(void)
  13.611 +{
  13.612 +	if (blktap_ring_major)
  13.613 +		unregister_chrdev(blktap_ring_major, "blktap2");
  13.614 +
  13.615 +	return 0;
  13.616 +}
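
Putting the ring pieces together, the expected user-space event loop is: poll the ring fd (woken by blktap_ring_kick_user()), consume requests, post responses on the shared ring, then issue BLKTAP2_IOCTL_KICK_FE so blktap_read_ring() completes the requests. A minimal sketch, assuming the BLKTAP2_IOCTL_* numbers from blktap.h are visible to user space:

	#include <poll.h>
	#include <sys/ioctl.h>

	#include "blktap.h"	/* for BLKTAP2_IOCTL_KICK_FE (assumed exported) */

	/* Hypothetical tapdisk-side loop. */
	static void example_event_loop(int ring_fd)
	{
		struct pollfd pfd = { .fd = ring_fd, .events = POLLIN };

		for (;;) {
			if (poll(&pfd, 1, -1) <= 0)
				continue;

			/* ... consume requests from the ring, service the IO,
			 * and post blkif responses back onto the ring ... */

			ioctl(ring_fd, BLKTAP2_IOCTL_KICK_FE, 0);
		}
	}
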
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/drivers/xen/blktap2/sysfs.c	Tue May 26 11:23:16 2009 +0100
    14.3 @@ -0,0 +1,425 @@
    14.4 +#include <linux/types.h>
    14.5 +#include <linux/device.h>
    14.6 +#include <linux/module.h>
    14.7 +
    14.8 +#include "blktap.h"
    14.9 +
   14.10 +int blktap_debug_level = 1;
   14.11 +
   14.12 +static struct class *class;
   14.13 +static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
   14.14 +
   14.15 +static inline void
   14.16 +blktap_sysfs_get(struct blktap *tap)
   14.17 +{
   14.18 +	atomic_inc(&tap->ring.sysfs_refcnt);
   14.19 +}
   14.20 +
   14.21 +static inline void
   14.22 +blktap_sysfs_put(struct blktap *tap)
   14.23 +{
   14.24 +	if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
   14.25 +		wake_up(&sysfs_wq);
   14.26 +}
   14.27 +
   14.28 +static inline void
   14.29 +blktap_sysfs_enter(struct blktap *tap)
   14.30 +{
   14.31 +	blktap_sysfs_get(tap);               /* pin sysfs device */
   14.32 +	mutex_lock(&tap->ring.sysfs_mutex);  /* serialize sysfs operations */
   14.33 +}
   14.34 +
   14.35 +static inline void
   14.36 +blktap_sysfs_exit(struct blktap *tap)
   14.37 +{
   14.38 +	mutex_unlock(&tap->ring.sysfs_mutex);
   14.39 +	blktap_sysfs_put(tap);
   14.40 +}
   14.41 +
   14.42 +static ssize_t blktap_sysfs_pause_device(struct class_device *, const char *, size_t);
   14.43 +CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
   14.44 +static ssize_t blktap_sysfs_resume_device(struct class_device *, const char *, size_t);
   14.45 +CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
   14.46 +
   14.47 +static ssize_t
   14.48 +blktap_sysfs_set_name(struct class_device *dev, const char *buf, size_t size)
   14.49 +{
   14.50 +	int err;
   14.51 +	struct blktap *tap = (struct blktap *)dev->class_data;
   14.52 +
   14.53 +	blktap_sysfs_enter(tap);
   14.54 +
   14.55 +	if (!tap->ring.dev ||
   14.56 +	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
   14.57 +		err = -ENODEV;
   14.58 +		goto out;
   14.59 +	}
   14.60 +
   14.61 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
   14.62 +		err = -EPERM;
   14.63 +		goto out;
   14.64 +	}
   14.65 +
   14.66 +	if (size > BLKTAP2_MAX_MESSAGE_LEN) {
   14.67 +		err = -ENAMETOOLONG;
   14.68 +		goto out;
   14.69 +	}
   14.70 +
   14.71 +	if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) {
   14.72 +		err = -EINVAL;
   14.73 +		goto out;
   14.74 +	}
   14.75 +
   14.76 +	snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf);
   14.77 +	err = size;
   14.78 +
   14.79 +out:
   14.80 +	blktap_sysfs_exit(tap);	
   14.81 +	return err;
   14.82 +}
   14.83 +
   14.84 +static ssize_t
   14.85 +blktap_sysfs_get_name(struct class_device *dev, char *buf)
   14.86 +{
   14.87 +	ssize_t size;
   14.88 +	struct blktap *tap = (struct blktap *)dev->class_data;
   14.89 +
   14.90 +	blktap_sysfs_enter(tap);
   14.91 +
   14.92 +	if (!tap->ring.dev)
   14.93 +		size = -ENODEV;
   14.94 +	else if (tap->params.name[0])
   14.95 +		size = sprintf(buf, "%s\n", tap->params.name);
   14.96 +	else
   14.97 +		size = sprintf(buf, "%d\n", tap->minor);
   14.98 +
   14.99 +	blktap_sysfs_exit(tap);
  14.100 +
  14.101 +	return size;
  14.102 +}
  14.103 +CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
  14.104 +		  blktap_sysfs_get_name, blktap_sysfs_set_name);
  14.105 +
  14.106 +static ssize_t
  14.107 +blktap_sysfs_remove_device(struct class_device *dev,
  14.108 +			   const char *buf, size_t size)
  14.109 +{
  14.110 +	int err;
  14.111 +	struct blktap *tap = (struct blktap *)dev->class_data;
  14.112 +
  14.113 +	if (!tap->ring.dev)
  14.114 +		return size;
  14.115 +
  14.116 +	if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
  14.117 +		return -EBUSY;
  14.118 +
  14.119 +	err = blktap_control_destroy_device(tap);
  14.120 +
  14.121 +	return (err ? : size);
  14.122 +}
  14.123 +CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
  14.124 +
  14.125 +static ssize_t
  14.126 +blktap_sysfs_pause_device(struct class_device *dev,
  14.127 +			  const char *buf, size_t size)
  14.128 +{
  14.129 +	int err;
  14.130 +	struct blktap *tap = (struct blktap *)dev->class_data;
  14.131 +
  14.132 +	blktap_sysfs_enter(tap);
  14.133 +
  14.134 +	BTDBG("pausing %u:%u: dev_inuse: %lu\n",
  14.135 +	      MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
  14.136 +
  14.137 +	if (!tap->ring.dev ||
  14.138 +	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
  14.139 +		err = -ENODEV;
  14.140 +		goto out;
  14.141 +	}
  14.142 +
  14.143 +	if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
  14.144 +		err = -EBUSY;
  14.145 +		goto out;
  14.146 +	}
  14.147 +
  14.148 +	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
  14.149 +		err = 0;
  14.150 +		goto out;
  14.151 +	}
  14.152 +
  14.153 +	err = blktap_device_pause(tap);
  14.154 +	if (!err) {
  14.155 +		class_device_remove_file(dev, &class_device_attr_pause);
  14.156 +		class_device_create_file(dev, &class_device_attr_resume);
  14.157 +	}
  14.158 +
  14.159 +out:
  14.160 +	blktap_sysfs_exit(tap);
  14.161 +
  14.162 +	return (err ? err : size);
  14.163 +}
  14.164 +
  14.165 +static ssize_t
  14.166 +blktap_sysfs_resume_device(struct class_device *dev,
  14.167 +			   const char *buf, size_t size)
  14.168 +{
  14.169 +	int err;
  14.170 +	struct blktap *tap = (struct blktap *)dev->class_data;
  14.171 +
  14.172 +	blktap_sysfs_enter(tap);
  14.173 +
  14.174 +	if (!tap->ring.dev ||
  14.175 +	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
  14.176 +		err = -ENODEV;
  14.177 +		goto out;
  14.178 +	}
  14.179 +
  14.180 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
  14.181 +		err = -EINVAL;
  14.182 +		goto out;
  14.183 +	}
  14.184 +
  14.185 +	err = blktap_device_resume(tap);
  14.186 +	if (!err) {
  14.187 +		class_device_remove_file(dev, &class_device_attr_resume);
  14.188 +		class_device_create_file(dev, &class_device_attr_pause);
  14.189 +	}
  14.190 +
  14.191 +out:
  14.192 +	blktap_sysfs_exit(tap);
  14.193 +
  14.194 +	BTDBG("returning %d\n", (err ? err : size));
  14.195 +	return (err ? err : size);
  14.196 +}
  14.197 +
  14.198 +#ifdef ENABLE_PASSTHROUGH
  14.199 +static ssize_t
  14.200 +blktap_sysfs_enable_passthrough(struct class_device *dev,
  14.201 +				const char *buf, size_t size)
  14.202 +{
  14.203 +	int err;
  14.204 +	unsigned major, minor;
  14.205 +	struct blktap *tap = (struct blktap *)dev->class_data;
  14.206 +
  14.207 +	BTINFO("passthrough request enabled\n");
  14.208 +
  14.209 +	blktap_sysfs_enter(tap);
  14.210 +
  14.211 +	if (!tap->ring.dev ||
  14.212 +	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
  14.213 +		err = -ENODEV;
  14.214 +		goto out;
  14.215 +	}
  14.216 +
  14.217 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
  14.218 +		err = -EINVAL;
  14.219 +		goto out;
  14.220 +	}
  14.221 +
  14.222 +	if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
  14.223 +		err = -EINVAL;
  14.224 +		goto out;
  14.225 +	}
  14.226 +
  14.227 +	err = sscanf(buf, "%x:%x", &major, &minor);
  14.228 +	if (err != 2) {
  14.229 +		err = -EINVAL;
  14.230 +		goto out;
  14.231 +	}
  14.232 +
  14.233 +	err = blktap_device_enable_passthrough(tap, major, minor);
  14.234 +
  14.235 +out:
  14.236 +	blktap_sysfs_exit(tap);
  14.237 +	BTDBG("returning %d\n", (err ? err : size));
  14.238 +	return (err ? err : size);
  14.239 +}
  14.240 +#endif
  14.241 +
  14.242 +static ssize_t
  14.243 +blktap_sysfs_debug_device(struct class_device *dev, char *buf)
  14.244 +{
  14.245 +	char *tmp;
  14.246 +	int i, ret;
  14.247 +	struct blktap *tap = (struct blktap *)dev->class_data;
  14.248 +
  14.249 +	tmp = buf;
  14.250 +	blktap_sysfs_get(tap);
  14.251 +
  14.252 +	if (!tap->ring.dev) {
  14.253 +		ret = sprintf(tmp, "no device\n");
  14.254 +		goto out;
  14.255 +	}
  14.256 +
  14.257 +	tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
  14.258 +		       tap->params.name, MAJOR(tap->ring.devno),
  14.259 +		       MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
  14.260 +		       tap->dev_inuse);
  14.261 +	tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, "
  14.262 +		       "device users: %d\n", tap->params.capacity,
  14.263 +		       tap->params.sector_size, tap->device.users);
  14.264 +
  14.265 +	down_read(&tap->tap_sem);
  14.266 +
  14.267 +	tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt);
  14.268 +	for (i = 0; i < MAX_PENDING_REQS; i++) {
  14.269 +		struct blktap_request *req = tap->pending_requests[i];
  14.270 +		if (!req)
  14.271 +			continue;
  14.272 +
  14.273 +		tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, "
  14.274 +			       "status: 0x%02x, pendcnt: %d, "
  14.275 +			       "nr_pages: %u, op: %d, time: %lu:%lu\n",
  14.276 +			       i, req->id, req->usr_idx,
  14.277 +			       req->status, atomic_read(&req->pendcnt),
  14.278 +			       req->nr_pages, req->operation, req->time.tv_sec,
  14.279 +			       req->time.tv_usec);
  14.280 +	}
  14.281 +
  14.282 +	up_read(&tap->tap_sem);
  14.283 +	ret = tmp - buf;
  14.284 +
  14.285 +out:
  14.286 +	blktap_sysfs_put(tap);
  14.287 +	BTDBG("%s\n", buf);
  14.288 +
  14.289 +	return ret;
  14.290 +}
  14.291 +CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
  14.292 +
  14.293 +int
  14.294 +blktap_sysfs_create(struct blktap *tap)
  14.295 +{
  14.296 +	struct blktap_ring *ring;
  14.297 +	struct class_device *dev;
  14.298 +
  14.299 +	if (!class)
  14.300 +		return -ENODEV;
  14.301 +
  14.302 +	ring = &tap->ring;
  14.303 +
  14.304 +	dev = class_device_create(class, NULL, ring->devno,
  14.305 +				  NULL, "blktap%d", tap->minor);
  14.306 +	if (IS_ERR(dev))
  14.307 +		return PTR_ERR(dev);
  14.308 +
  14.309 +	ring->dev       = dev;
  14.310 +	dev->class_data = tap;
  14.311 +
  14.312 +	mutex_init(&ring->sysfs_mutex);
  14.313 +	atomic_set(&ring->sysfs_refcnt, 0);
  14.314 +	set_bit(BLKTAP_SYSFS, &tap->dev_inuse);
  14.315 +
  14.316 +	class_device_create_file(dev, &class_device_attr_name);
  14.317 +	class_device_create_file(dev, &class_device_attr_remove);
  14.318 +	class_device_create_file(dev, &class_device_attr_pause);
  14.319 +	class_device_create_file(dev, &class_device_attr_debug);
  14.320 +
  14.321 +	return 0;
  14.322 +}
  14.323 +
  14.324 +int
  14.325 +blktap_sysfs_destroy(struct blktap *tap)
  14.326 +{
  14.327 +	struct blktap_ring *ring;
  14.328 +	struct class_device *dev;
  14.329 +
  14.330 +	ring = &tap->ring;
  14.331 +	dev  = ring->dev;
  14.332 +	if (!class || !dev)
  14.333 +		return 0;
  14.334 +
  14.335 +	ring->dev = NULL;
  14.336 +	if (wait_event_interruptible(sysfs_wq,
  14.337 +				     !atomic_read(&tap->ring.sysfs_refcnt)))
  14.338 +		return -EAGAIN;
  14.339 +
  14.340 +	/* XXX: is it safe to remove the class from a sysfs attribute? */
  14.341 +	class_device_remove_file(dev, &class_device_attr_name);
  14.342 +	class_device_remove_file(dev, &class_device_attr_remove);
  14.343 +	class_device_remove_file(dev, &class_device_attr_pause);
  14.344 +	class_device_remove_file(dev, &class_device_attr_resume);
  14.345 +	class_device_remove_file(dev, &class_device_attr_debug);
  14.346 +	class_device_destroy(class, ring->devno);
  14.347 +
  14.348 +	clear_bit(BLKTAP_SYSFS, &tap->dev_inuse);
  14.349 +
  14.350 +	return 0;
  14.351 +}
  14.352 +
  14.353 +static ssize_t
  14.354 +blktap_sysfs_show_verbosity(struct class *class, char *buf)
  14.355 +{
  14.356 +	return sprintf(buf, "%d\n", blktap_debug_level);
  14.357 +}
  14.358 +
  14.359 +static ssize_t
  14.360 +blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
  14.361 +{
  14.362 +	int level;
  14.363 +
  14.364 +	if (sscanf(buf, "%d", &level) == 1) {
  14.365 +		blktap_debug_level = level;
  14.366 +		return size;
  14.367 +	}
  14.368 +
  14.369 +	return -EINVAL;
  14.370 +}
  14.371 +CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
  14.372 +	   blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
  14.373 +
  14.374 +static ssize_t
  14.375 +blktap_sysfs_show_devices(struct class *class, char *buf)
  14.376 +{
  14.377 +	int i, ret;
  14.378 +	struct blktap *tap;
  14.379 +
  14.380 +	ret = 0;
  14.381 +	for (i = 0; i < MAX_BLKTAP_DEVICE; i++) {
  14.382 +		tap = blktaps[i];
  14.383 +		if (!tap)
  14.384 +			continue;
  14.385 +
  14.386 +		if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
  14.387 +			continue;
  14.388 +
  14.389 +		ret += sprintf(buf + ret, "%d ", tap->minor);
  14.390 +		ret += snprintf(buf + ret, sizeof(tap->params.name) - 1,
  14.391 +				"%s", tap->params.name);
  14.392 +		ret += sprintf(buf + ret, "\n");
  14.393 +	}
  14.394 +
  14.395 +	return ret;
  14.396 +}
  14.397 +CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
  14.398 +
  14.399 +void
  14.400 +blktap_sysfs_free(void)
  14.401 +{
  14.402 +	if (!class)
  14.403 +		return;
  14.404 +
  14.405 +	class_remove_file(class, &class_attr_verbosity);
  14.406 +	class_remove_file(class, &class_attr_devices);
  14.407 +
  14.408 +	class_destroy(class);
  14.409 +}
  14.410 +
  14.411 +int
  14.412 +blktap_sysfs_init(void)
  14.413 +{
  14.414 +	struct class *cls;
  14.415 +
  14.416 +	if (class)
  14.417 +		return -EEXIST;
  14.418 +
  14.419 +	cls = class_create(THIS_MODULE, "blktap2");
  14.420 +	if (IS_ERR(cls))
  14.421 +		return PTR_ERR(cls);
  14.422 +
  14.423 +	class_create_file(cls, &class_attr_verbosity);
  14.424 +	class_create_file(cls, &class_attr_devices);
  14.425 +
  14.426 +	class = cls;
  14.427 +	return 0;
  14.428 +}
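
The attributes registered above surface as /sys/class/blktap2/blktapN/{name,remove,pause,resume,debug} on a typical sysfs mount (exact path depends on where sysfs is mounted). A small sketch that pauses tap 0; the written value is ignored, any write triggers blktap_sysfs_pause_device():

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Path assumed: "blktap2" class, "blktap%d" device name. */
		int fd = open("/sys/class/blktap2/blktap0/pause", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}
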
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/drivers/xen/blktap2/wait_queue.c	Tue May 26 11:23:16 2009 +0100
    15.3 @@ -0,0 +1,40 @@
    15.4 +#include <linux/list.h>
    15.5 +#include <linux/spinlock.h>
    15.6 +
    15.7 +#include "blktap.h"
    15.8 +
    15.9 +static LIST_HEAD(deferred_work_queue);
   15.10 +static DEFINE_SPINLOCK(deferred_work_lock);
   15.11 +
   15.12 +void
   15.13 +blktap_run_deferred(void)
   15.14 +{
   15.15 +	LIST_HEAD(queue);
   15.16 +	struct blktap *tap;
   15.17 +	unsigned long flags;
   15.18 +
   15.19 +	spin_lock_irqsave(&deferred_work_lock, flags);
   15.20 +	list_splice_init(&deferred_work_queue, &queue);
   15.21 +	list_for_each_entry(tap, &queue, deferred_queue)
   15.22 +		clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
   15.23 +	spin_unlock_irqrestore(&deferred_work_lock, flags);
   15.24 +
   15.25 +	while (!list_empty(&queue)) {
   15.26 +		tap = list_entry(queue.next, struct blktap, deferred_queue);
   15.27 +		list_del_init(&tap->deferred_queue);
   15.28 +		blktap_device_restart(tap);
   15.29 +	}
   15.30 +}
   15.31 +
   15.32 +void
   15.33 +blktap_defer(struct blktap *tap)
   15.34 +{
   15.35 +	unsigned long flags;
   15.36 +
   15.37 +	spin_lock_irqsave(&deferred_work_lock, flags);
   15.38 +	if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) {
   15.39 +		set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
   15.40 +		list_add_tail(&tap->deferred_queue, &deferred_work_queue);
   15.41 +	}
   15.42 +	spin_unlock_irqrestore(&deferred_work_lock, flags);
   15.43 +}
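
Note: wait_queue.c is small but worth spelling out. blktap_run_deferred() splices the whole pending list onto a private head and clears each tap's BLKTAP_DEFERRED bit while holding deferred_work_lock, then calls blktap_device_restart() with the lock dropped, so a restart that stalls again can safely re-queue itself via blktap_defer(). A userspace analogue of the same splice-and-drain pattern (illustrative only; a pthread mutex and a hand-rolled singly linked list stand in for the kernel primitives):

	/* Sketch: splice-and-drain deferral, a userspace analogue of
	 * blktap_defer()/blktap_run_deferred(). All names are illustrative. */
	#include <pthread.h>
	#include <stdio.h>

	struct item {
		struct item *next;
		int id;
		int deferred;		/* stands in for BLKTAP_DEFERRED */
	};

	static struct item *pending;	/* deferred_work_queue */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	static void defer(struct item *it)
	{
		pthread_mutex_lock(&lock);
		if (!it->deferred) {	/* avoid double-queueing */
			it->deferred = 1;
			it->next = pending;
			pending = it;
		}
		pthread_mutex_unlock(&lock);
	}

	static void run_deferred(void)
	{
		struct item *queue, *it;

		/* Take the whole list and clear the flags under the lock... */
		pthread_mutex_lock(&lock);
		queue = pending;
		pending = NULL;
		for (it = queue; it; it = it->next)
			it->deferred = 0;	/* permits re-deferral mid-drain */
		pthread_mutex_unlock(&lock);

		/* ...then do the slow per-item work with the lock dropped. */
		while (queue) {
			it = queue;
			queue = it->next;
			it->next = NULL;	/* unlink before working on it */
			printf("restart %d\n", it->id);	/* blktap_device_restart() */
		}
	}

	int main(void)
	{
		struct item a = { .next = NULL, .id = 0, .deferred = 0 };
		struct item b = { .next = NULL, .id = 1, .deferred = 0 };

		defer(&a);
		defer(&b);
		run_deferred();
		return 0;
	}
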
    16.1 --- a/include/linux/mm.h	Tue May 26 09:53:55 2009 +0100
    16.2 +++ b/include/linux/mm.h	Tue May 26 11:23:16 2009 +0100
    16.3 @@ -166,6 +166,9 @@ extern unsigned int kobjsize(const void 
    16.4  #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
    16.5  #ifdef CONFIG_XEN
    16.6  #define VM_FOREIGN	0x04000000	/* Has pages belonging to another VM */
    16.7 +struct vm_foreign_map {
     16.8 +	struct page **map;
    16.9 +};
   16.10  #endif
   16.11  #define VM_ALWAYSDUMP	0x08000000	/* Always include in core dumps */
   16.12  
   16.13 @@ -210,6 +213,10 @@ struct vm_operations_struct {
   16.14  	 * original value of @ptep. */
   16.15  	pte_t (*zap_pte)(struct vm_area_struct *vma, 
   16.16  			 unsigned long addr, pte_t *ptep, int is_fullmm);
   16.17 +
    16.18 +	/* called before close() to indicate no more pages should be mapped */
    16.19 +	void (*unmap)(struct vm_area_struct *area);
   16.20 +
   16.21  #ifdef CONFIG_NUMA
   16.22  	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
   16.23  	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
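
Note: two mm.h changes land here. VM_FOREIGN vmas now keep a struct vm_foreign_map (rather than a bare struct page **) in vm_private_data, and vm_operations_struct grows an ->unmap() hook that runs before ->close(), while the mapping is still intact. A hedged sketch of how a driver's mmap handler might publish its page map (only VM_FOREIGN, struct vm_foreign_map, and ->unmap come from this patch; the "foo_" names are hypothetical):

	/* Sketch: publishing a foreign page map so get_user_pages() can
	 * resolve a VM_FOREIGN vma (see the mm/memory.c hunk below). */
	#include <linux/fs.h>
	#include <linux/mm.h>

	struct foo_ring {			/* hypothetical */
		struct page **pages;		/* one slot per page, filled as
						 * foreign frames are mapped */
		struct vm_foreign_map foreign_map;
	};

	extern struct vm_operations_struct foo_vm_ops;	/* hypothetical */

	static int foo_mmap(struct file *filp, struct vm_area_struct *vma)
	{
		struct foo_ring *ring = filp->private_data;

		ring->foreign_map.map = ring->pages;

		vma->vm_ops = &foo_vm_ops;
		vma->vm_flags |= VM_FOREIGN | VM_DONTCOPY | VM_RESERVED;
		vma->vm_private_data = &ring->foreign_map;
		return 0;
	}
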
    17.1 --- a/include/linux/page-flags.h	Tue May 26 09:53:55 2009 +0100
    17.2 +++ b/include/linux/page-flags.h	Tue May 26 11:23:16 2009 +0100
    17.3 @@ -100,6 +100,16 @@
    17.4  
    17.5  #define PG_foreign		20	/* Page is owned by foreign allocator. */
    17.6  
    17.7 +#define PG_netback              21      /* Page is owned by netback */
    17.8 +#define PageNetback(page)       test_bit(PG_netback, &(page)->flags)
    17.9 +#define SetPageNetback(page)    set_bit(PG_netback, &(page)->flags)
   17.10 +#define ClearPageNetback(page)  clear_bit(PG_netback, &(page)->flags)
   17.11 +
   17.12 +#define PG_blkback              22      /* Page is owned by blkback */
   17.13 +#define PageBlkback(page)       test_bit(PG_blkback, &(page)->flags)
   17.14 +#define SetPageBlkback(page)    set_bit(PG_blkback, &(page)->flags)
   17.15 +#define ClearPageBlkback(page)  clear_bit(PG_blkback, &(page)->flags)
   17.16 +
   17.17  /*
   17.18   * Manipulation of page state flags
   17.19   */
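
Note: PG_netback and PG_blkback give the two backends a cheap ownership tag on struct page itself, so code that holds only a struct page * can tell whose bookkeeping to consult (blkback keys its pagemap lookups off exactly this). A small dispatch sketch (the helper and enum are hypothetical; only the Page*/SetPage*/ClearPage* macros are added here):

	/* Sketch: dispatch on page ownership using the new flags. */
	#include <linux/mm.h>
	#include <linux/page-flags.h>

	enum foo_owner { FOO_OTHER, FOO_BLKBACK, FOO_NETBACK };	/* hypothetical */

	static enum foo_owner foo_page_owner(struct page *page)
	{
		if (PageBlkback(page))
			return FOO_BLKBACK;	/* consult blkback's pagemap */
		if (PageNetback(page))
			return FOO_NETBACK;
		return FOO_OTHER;
	}
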
    18.1 --- a/mm/memory.c	Tue May 26 09:53:55 2009 +0100
    18.2 +++ b/mm/memory.c	Tue May 26 11:23:16 2009 +0100
    18.3 @@ -1045,7 +1045,9 @@ int get_user_pages(struct task_struct *t
    18.4  
    18.5  #ifdef CONFIG_XEN
    18.6  		if (vma && (vma->vm_flags & VM_FOREIGN)) {
    18.7 -			struct page **map = vma->vm_private_data;
    18.8 +			struct vm_foreign_map *foreign_map =
    18.9 +				vma->vm_private_data;
   18.10 +			struct page **map = foreign_map->map;
   18.11  			int offset = (start - vma->vm_start) >> PAGE_SHIFT;
   18.12  			if (map[offset] != NULL) {
   18.13  			        if (pages) {
    19.1 --- a/mm/mmap.c	Tue May 26 09:53:55 2009 +0100
    19.2 +++ b/mm/mmap.c	Tue May 26 11:23:16 2009 +0100
    19.3 @@ -1687,6 +1687,12 @@ static void unmap_region(struct mm_struc
    19.4  	tlb_finish_mmu(tlb, start, end);
    19.5  }
    19.6  
    19.7 +static inline void unmap_vma(struct vm_area_struct *vma)
    19.8 +{
    19.9 +	if (unlikely(vma->vm_ops && vma->vm_ops->unmap))
   19.10 +		vma->vm_ops->unmap(vma);
   19.11 +}
   19.12 +
   19.13  /*
   19.14   * Create a list of vma's touched by the unmap, removing them from the mm's
   19.15   * vma list as we go..
   19.16 @@ -1702,6 +1708,7 @@ detach_vmas_to_be_unmapped(struct mm_str
   19.17  	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
   19.18  	do {
   19.19  		rb_erase(&vma->vm_rb, &mm->mm_rb);
   19.20 +		unmap_vma(vma);
   19.21  		mm->map_count--;
   19.22  		tail_vma = vma;
   19.23  		vma = vma->vm_next;
   19.24 @@ -1959,7 +1966,7 @@ EXPORT_SYMBOL(do_brk);
   19.25  void exit_mmap(struct mm_struct *mm)
   19.26  {
   19.27  	struct mmu_gather *tlb;
   19.28 -	struct vm_area_struct *vma = mm->mmap;
   19.29 +	struct vm_area_struct *vma_tmp, *vma = mm->mmap;
   19.30  	unsigned long nr_accounted = 0;
   19.31  	unsigned long end;
   19.32  
   19.33 @@ -1967,6 +1974,9 @@ void exit_mmap(struct mm_struct *mm)
   19.34  	arch_exit_mmap(mm);
   19.35  #endif
   19.36  
   19.37 +	for (vma_tmp = mm->mmap; vma_tmp; vma_tmp = vma_tmp->vm_next)
   19.38 +		unmap_vma(vma_tmp);
   19.39 +
   19.40  	lru_add_drain();
   19.41  	flush_cache_mm(mm);
   19.42  	tlb = tlb_gather_mmu(mm, 1);
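
Note: the ordering of the mm/mmap.c hooks matters. unmap_vma() fires in detach_vmas_to_be_unmapped() and at the top of exit_mmap(), i.e. before unmap_vmas()/unmap_region() walks the page tables. That gives a driver holding foreign (e.g. grant-mapped) frames one last look at the intact vma to hand its pages back before the generic zap runs. A sketch of the receiving end (everything except the ->unmap member and struct vm_foreign_map is hypothetical):

	/* Sketch: a driver-side ->unmap() implementation. Runs with the
	 * vma still mapped; the generic teardown zaps the ptes afterwards. */
	#include <linux/mm.h>

	static void foo_put_foreign_page(struct page *page);	/* hypothetical */

	static void foo_vm_unmap(struct vm_area_struct *vma)
	{
		struct vm_foreign_map *foreign_map = vma->vm_private_data;
		unsigned long i, nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

		/* Return every still-mapped foreign page to its owner and
		 * drop it from the map so get_user_pages() stops seeing it. */
		for (i = 0; i < nr; i++) {
			if (!foreign_map->map[i])
				continue;
			foo_put_foreign_page(foreign_map->map[i]);
			foreign_map->map[i] = NULL;
		}
	}

	static struct vm_operations_struct foo_vm_ops = {
		.unmap	= foo_vm_unmap,
	};
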