ia64/linux-2.6.18-xen.hg

diff drivers/xen/blktap2/ring.c @ 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work on weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image
format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/drivers/xen/blktap2/ring.c	Tue May 26 11:23:16 2009 +0100
     1.3 @@ -0,0 +1,613 @@
     1.4 +#include <linux/module.h>
     1.5 +#include <linux/signal.h>
     1.6 +
     1.7 +#include "blktap.h"
     1.8 +
     1.9 +static int blktap_ring_major;
    1.10 +
    1.11 +static inline struct blktap *
    1.12 +vma_to_blktap(struct vm_area_struct *vma)
    1.13 +{
    1.14 +	struct vm_foreign_map *m = vma->vm_private_data;
    1.15 +	struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
    1.16 +	return container_of(r, struct blktap, ring);
    1.17 +}
    1.18 +
    1.19 + /* 
    1.20 +  * BLKTAP - immediately before the mmap area,
    1.21 +  * we have a bunch of pages reserved for shared memory rings.
    1.22 +  */
    1.23 +#define RING_PAGES 1
    1.24 +
    1.25 +static int
    1.26 +blktap_read_ring(struct blktap *tap)
    1.27 +{
    1.28 +	/* This is called to read responses from the ring. */
    1.29 +	int usr_idx;
    1.30 +	RING_IDX rc, rp;
    1.31 +	blkif_response_t res;
    1.32 +	struct blktap_ring *ring;
    1.33 +	struct blktap_request *request;
    1.34 +
    1.35 +	down_read(&tap->tap_sem);
    1.36 +
    1.37 +	ring = &tap->ring;
    1.38 +	if (!ring->vma) {
    1.39 +		up_read(&tap->tap_sem);
    1.40 +		return 0;
    1.41 +	}
    1.42 +
    1.43 +	/* for each outstanding message on the ring  */
    1.44 +	rp = ring->ring.sring->rsp_prod;
    1.45 +	rmb();
    1.46 +
    1.47 +	for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
    1.48 +		memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
    1.49 +		mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
    1.50 +		++ring->ring.rsp_cons;
    1.51 +
    1.52 +		usr_idx = (int)res.id;
    1.53 +		if (usr_idx >= MAX_PENDING_REQS ||
    1.54 +		    !tap->pending_requests[usr_idx]) {
    1.55 +			BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
    1.56 +			       rc, rp, usr_idx, tap->pid, ring->vma);
    1.57 +			continue;
    1.58 +		}
    1.59 +
    1.60 +		request = tap->pending_requests[usr_idx];
    1.61 +		BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
    1.62 +		blktap_device_finish_request(tap, &res, request);
    1.63 +	}
    1.64 +
    1.65 +	up_read(&tap->tap_sem);
    1.66 +
    1.67 +	blktap_run_deferred();
    1.68 +
    1.69 +	return 0;
    1.70 +}
    1.71 +
    1.72 +static struct page *
    1.73 +blktap_ring_nopage(struct vm_area_struct *vma,
    1.74 +		   unsigned long address, int *type)
    1.75 +{
    1.76 +	/*
    1.77 +	 * if the page has not been mapped in by the driver then return
    1.78 +	 * NOPAGE_SIGBUS to the domain.
    1.79 +	 */
    1.80 +
    1.81 +	return NOPAGE_SIGBUS;
    1.82 +}
    1.83 +
    1.84 +static pte_t
    1.85 +blktap_ring_clear_pte(struct vm_area_struct *vma,
    1.86 +		      unsigned long uvaddr,
    1.87 +		      pte_t *ptep, int is_fullmm)
    1.88 +{
    1.89 +	pte_t copy;
    1.90 +	struct blktap *tap;
    1.91 +	unsigned long kvaddr;
    1.92 +	struct page **map, *page;
    1.93 +	struct blktap_ring *ring;
    1.94 +	struct blktap_request *request;
    1.95 +	struct grant_handle_pair *khandle;
    1.96 +	struct gnttab_unmap_grant_ref unmap[2];
    1.97 +	int offset, seg, usr_idx, count = 0;
    1.98 +
    1.99 +	tap  = vma_to_blktap(vma);
   1.100 +	ring = &tap->ring;
   1.101 +	map  = ring->foreign_map.map;
   1.102 +	BUG_ON(!map);	/* TODO Should this be changed to if statement? */
   1.103 +
   1.104 +	/*
   1.105 +	 * Zap entry if the address is before the start of the grant
   1.106 +	 * mapped region.
   1.107 +	 */
   1.108 +	if (uvaddr < ring->user_vstart)
   1.109 +		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
   1.110 +					       ptep, is_fullmm);
   1.111 +
   1.112 +	offset  = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
   1.113 +	usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
   1.114 +	seg     = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
   1.115 +
   1.116 +	offset  = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
   1.117 +	page    = map[offset];
   1.118 +	if (page) {
   1.119 +		ClearPageReserved(page);
   1.120 +		if (PageBlkback(page)) {
   1.121 +			ClearPageBlkback(page);
   1.122 +			set_page_private(page, 0);
   1.123 +		}
   1.124 +	}
   1.125 +	map[offset] = NULL;
   1.126 +
   1.127 +	request = tap->pending_requests[usr_idx];
   1.128 +	kvaddr  = request_to_kaddr(request, seg);
   1.129 +	khandle = request->handles + seg;
   1.130 +
   1.131 +	if (khandle->kernel != INVALID_GRANT_HANDLE) {
   1.132 +		gnttab_set_unmap_op(&unmap[count], kvaddr, 
   1.133 +				    GNTMAP_host_map, khandle->kernel);
   1.134 +		count++;
   1.135 +
   1.136 +		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, 
   1.137 +				    INVALID_P2M_ENTRY);
   1.138 +	}
   1.139 +
   1.140 +
   1.141 +	if (khandle->user != INVALID_GRANT_HANDLE) {
   1.142 +		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
   1.143 +
   1.144 +		copy = *ptep;
   1.145 +		gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), 
   1.146 +				    GNTMAP_host_map 
   1.147 +				    | GNTMAP_application_map 
   1.148 +				    | GNTMAP_contains_pte,
   1.149 +				    khandle->user);
   1.150 +		count++;
   1.151 +	} else
   1.152 +		copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
   1.153 +					       is_fullmm);
   1.154 +
   1.155 +	if (count)
   1.156 +		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
   1.157 +					      unmap, count))
   1.158 +			BUG();
   1.159 +
   1.160 +	khandle->kernel = INVALID_GRANT_HANDLE;
   1.161 +	khandle->user   = INVALID_GRANT_HANDLE;
   1.162 +
   1.163 +	return copy;
   1.164 +}
   1.165 +
   1.166 +static void
   1.167 +blktap_ring_vm_unmap(struct vm_area_struct *vma)
   1.168 +{
   1.169 +	struct blktap *tap = vma_to_blktap(vma);
   1.170 +
   1.171 +	down_write(&tap->tap_sem);
   1.172 +	clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
   1.173 +	clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
   1.174 +	clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
   1.175 +	up_write(&tap->tap_sem);
   1.176 +}
   1.177 +
   1.178 +static void
   1.179 +blktap_ring_vm_close(struct vm_area_struct *vma)
   1.180 +{
   1.181 +	struct blktap *tap = vma_to_blktap(vma);
   1.182 +	struct blktap_ring *ring = &tap->ring;
   1.183 +
   1.184 +	blktap_ring_vm_unmap(vma);                 /* fail future requests */
   1.185 +	blktap_device_fail_pending_requests(tap);  /* fail pending requests */
   1.186 +	blktap_device_restart(tap);                /* fail deferred requests */
   1.187 +
   1.188 +	down_write(&tap->tap_sem);
   1.189 +
   1.190 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
   1.191 +
   1.192 +	kfree(ring->foreign_map.map);
   1.193 +	ring->foreign_map.map = NULL;
   1.194 +
   1.195 +	/* Free the ring page. */
   1.196 +	ClearPageReserved(virt_to_page(ring->ring.sring));
   1.197 +	free_page((unsigned long)ring->ring.sring);
   1.198 +
   1.199 +	BTINFO("unmapping ring %d\n", tap->minor);
   1.200 +	ring->ring.sring = NULL;
   1.201 +	ring->vma = NULL;
   1.202 +
   1.203 +	up_write(&tap->tap_sem);
   1.204 +
   1.205 +	wake_up(&tap->wq);
   1.206 +}
   1.207 +
   1.208 +static struct vm_operations_struct blktap_ring_vm_operations = {
   1.209 +	.close    = blktap_ring_vm_close,
   1.210 +	.unmap    = blktap_ring_vm_unmap,
   1.211 +	.nopage   = blktap_ring_nopage,
   1.212 +	.zap_pte  = blktap_ring_clear_pte,
   1.213 +};
   1.214 +
   1.215 +static int
   1.216 +blktap_ring_open(struct inode *inode, struct file *filp)
   1.217 +{
   1.218 +	int idx;
   1.219 +	struct blktap *tap;
   1.220 +
   1.221 +	idx = iminor(inode);
   1.222 +	if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
   1.223 +		BTERR("unable to open device blktap%d\n", idx);
   1.224 +		return -ENODEV;
   1.225 +	}
   1.226 +
   1.227 +	tap = blktaps[idx];
   1.228 +
   1.229 +	BTINFO("opening device blktap%d\n", idx);
   1.230 +
   1.231 +	if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
   1.232 +		return -ENODEV;
   1.233 +
   1.234 +	/* Only one process can access ring at a time */
   1.235 +	if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
   1.236 +		return -EBUSY;
   1.237 +
   1.238 +	filp->private_data = tap;
   1.239 +	BTINFO("opened device %d\n", tap->minor);
   1.240 +
   1.241 +	return 0;
   1.242 +}
   1.243 +
   1.244 +static int
   1.245 +blktap_ring_release(struct inode *inode, struct file *filp)
   1.246 +{
   1.247 +	struct blktap *tap = filp->private_data;
   1.248 +
   1.249 +	BTINFO("freeing device %d\n", tap->minor);
   1.250 +	clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
   1.251 +	filp->private_data = NULL;
   1.252 +	wake_up(&tap->wq);	
   1.253 +	return 0;
   1.254 +}
   1.255 +
   1.256 +/* Note on mmap:
   1.257 + * We need to map pages to user space in a way that will allow the block
   1.258 + * subsystem set up direct IO to them.  This couldn't be done before, because
   1.259 + * there isn't really a sane way to translate a user virtual address down to a 
   1.260 + * physical address when the page belongs to another domain.
   1.261 + *
   1.262 + * My first approach was to map the page in to kernel memory, add an entry
   1.263 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
   1.264 + * and then attempt to map that page up to user space.  This is disallowed
   1.265 + * by xen though, which realizes that we don't really own the machine frame
   1.266 + * underlying the physical page.
   1.267 + *
   1.268 + * The new approach is to provide explicit support for this in xen linux.
   1.269 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
   1.270 + * mapped from other vms.  vma->vm_private_data is set up as a mapping 
   1.271 + * from pages to actual page structs.  There is a new clause in get_user_pages
   1.272 + * that does the right thing for this sort of mapping.
   1.273 + */
   1.274 +static int
   1.275 +blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
   1.276 +{
   1.277 +	int size, err;
   1.278 +	struct page **map;
   1.279 +	struct blktap *tap;
   1.280 +	blkif_sring_t *sring;
   1.281 +	struct blktap_ring *ring;
   1.282 +
   1.283 +	tap   = filp->private_data;
   1.284 +	ring  = &tap->ring;
   1.285 +	map   = NULL;
   1.286 +	sring = NULL;
   1.287 +
   1.288 +	if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
   1.289 +		return -ENOMEM;
   1.290 +
   1.291 +	size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
   1.292 +	if (size != (MMAP_PAGES + RING_PAGES)) {
   1.293 +		BTERR("you _must_ map exactly %lu pages!\n",
   1.294 +		      MMAP_PAGES + RING_PAGES);
   1.295 +		return -EAGAIN;
   1.296 +	}
   1.297 +
   1.298 +	/* Allocate the fe ring. */
   1.299 +	sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
   1.300 +	if (!sring) {
   1.301 +		BTERR("Couldn't alloc sring.\n");
   1.302 +		goto fail_mem;
   1.303 +	}
   1.304 +
   1.305 +	map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
   1.306 +	if (!map) {
   1.307 +		BTERR("Couldn't alloc VM_FOREIGN map.\n");
   1.308 +		goto fail_mem;
   1.309 +	}
   1.310 +
   1.311 +	SetPageReserved(virt_to_page(sring));
   1.312 +    
   1.313 +	SHARED_RING_INIT(sring);
   1.314 +	FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
   1.315 +
   1.316 +	ring->ring_vstart = vma->vm_start;
   1.317 +	ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
   1.318 +
   1.319 +	/* Map the ring pages to the start of the region and reserve it. */
   1.320 +	if (xen_feature(XENFEAT_auto_translated_physmap))
   1.321 +		err = vm_insert_page(vma, vma->vm_start,
   1.322 +				     virt_to_page(ring->ring.sring));
   1.323 +	else
   1.324 +		err = remap_pfn_range(vma, vma->vm_start,
   1.325 +				      __pa(ring->ring.sring) >> PAGE_SHIFT,
   1.326 +				      PAGE_SIZE, vma->vm_page_prot);
   1.327 +	if (err) {
   1.328 +		BTERR("Mapping user ring failed: %d\n", err);
   1.329 +		goto fail;
   1.330 +	}
   1.331 +
   1.332 +	/* Mark this VM as containing foreign pages, and set up mappings. */
   1.333 +	ring->foreign_map.map = map;
   1.334 +	vma->vm_private_data = &ring->foreign_map;
   1.335 +	vma->vm_flags |= VM_FOREIGN;
   1.336 +	vma->vm_flags |= VM_DONTCOPY;
   1.337 +	vma->vm_flags |= VM_RESERVED;
   1.338 +	vma->vm_ops = &blktap_ring_vm_operations;
   1.339 +
   1.340 +#ifdef CONFIG_X86
   1.341 +	vma->vm_mm->context.has_foreign_mappings = 1;
   1.342 +#endif
   1.343 +
   1.344 +	tap->pid = current->pid;
   1.345 +	BTINFO("blktap: mapping pid is %d\n", tap->pid);
   1.346 +
   1.347 +	ring->vma = vma;
   1.348 +	return 0;
   1.349 +
   1.350 + fail:
   1.351 +	/* Clear any active mappings. */
   1.352 +	zap_page_range(vma, vma->vm_start, 
   1.353 +		       vma->vm_end - vma->vm_start, NULL);
   1.354 +	ClearPageReserved(virt_to_page(sring));
   1.355 + fail_mem:
   1.356 +	free_page((unsigned long)sring);
   1.357 +	kfree(map);
   1.358 +
   1.359 +	return -ENOMEM;
   1.360 +}
   1.361 +
   1.362 +static inline void
   1.363 +blktap_ring_set_message(struct blktap *tap, int msg)
   1.364 +{
   1.365 +	struct blktap_ring *ring = &tap->ring;
   1.366 +
   1.367 +	down_read(&tap->tap_sem);
   1.368 +	if (ring->ring.sring)
   1.369 +		ring->ring.sring->pad[0] = msg;
   1.370 +	up_read(&tap->tap_sem);
   1.371 +}
   1.372 +
   1.373 +static int
   1.374 +blktap_ring_ioctl(struct inode *inode, struct file *filp,
   1.375 +		  unsigned int cmd, unsigned long arg)
   1.376 +{
   1.377 +	struct blktap_params params;
   1.378 +	struct blktap *tap = filp->private_data;
   1.379 +
   1.380 +	BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
   1.381 +
   1.382 +	switch(cmd) {
   1.383 +	case BLKTAP2_IOCTL_KICK_FE:
   1.384 +		/* There are fe messages to process. */
   1.385 +		return blktap_read_ring(tap);
   1.386 +
   1.387 +	case BLKTAP2_IOCTL_CREATE_DEVICE:
   1.388 +		if (!arg)
   1.389 +			return -EINVAL;
   1.390 +
   1.391 +		if (copy_from_user(&params, (struct blktap_params __user *)arg,
   1.392 +				   sizeof(params))) {
   1.393 +			BTERR("failed to get params\n");
   1.394 +			return -EFAULT;
   1.395 +		}
   1.396 +
   1.397 +		if (blktap_validate_params(tap, &params)) {
   1.398 +			BTERR("invalid params\n");
   1.399 +			return -EINVAL;
   1.400 +		}
   1.401 +
   1.402 +		tap->params = params;
   1.403 +		return blktap_device_create(tap);
   1.404 +
   1.405 +	case BLKTAP2_IOCTL_SET_PARAMS:
   1.406 +		if (!arg)
   1.407 +			return -EINVAL;
   1.408 +
   1.409 +		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
   1.410 +			return -EINVAL;
   1.411 +
   1.412 +		if (copy_from_user(&params, (struct blktap_params __user *)arg,
   1.413 +				   sizeof(params))) {
   1.414 +			BTERR("failed to get params\n");
   1.415 +			return -EFAULT;
   1.416 +		}
   1.417 +
   1.418 +		if (blktap_validate_params(tap, &params)) {
   1.419 +			BTERR("invalid params\n");
   1.420 +			return -EINVAL;
   1.421 +		}
   1.422 +
   1.423 +		tap->params = params;
   1.424 +		return 0;
   1.425 +
   1.426 +	case BLKTAP2_IOCTL_PAUSE:
   1.427 +		if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
   1.428 +			return -EINVAL;
   1.429 +
   1.430 +		set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
   1.431 +		clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
   1.432 +
   1.433 +		blktap_ring_set_message(tap, 0);
   1.434 +		wake_up_interruptible(&tap->wq);
   1.435 +
   1.436 +		return 0;
   1.437 +
   1.438 +
   1.439 +	case BLKTAP2_IOCTL_REOPEN:
   1.440 +		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
   1.441 +			return -EINVAL;
   1.442 +
   1.443 +		if (!arg)
   1.444 +			return -EINVAL;
   1.445 +
   1.446 +		if (copy_to_user((char __user *)arg,
   1.447 +				 tap->params.name,
   1.448 +				 strlen(tap->params.name) + 1))
   1.449 +			return -EFAULT;
   1.450 +
   1.451 +		blktap_ring_set_message(tap, 0);
   1.452 +		wake_up_interruptible(&tap->wq);
   1.453 +
   1.454 +		return 0;
   1.455 +
   1.456 +	case BLKTAP2_IOCTL_RESUME:
   1.457 +		if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
   1.458 +			return -EINVAL;
   1.459 +
   1.460 +		tap->ring.response = (int)arg;
   1.461 +		if (!tap->ring.response)
   1.462 +			clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
   1.463 +
   1.464 +		blktap_ring_set_message(tap, 0);
   1.465 +		wake_up_interruptible(&tap->wq);
   1.466 +
   1.467 +		return 0;
   1.468 +	}
   1.469 +
   1.470 +	return -ENOIOCTLCMD;
   1.471 +}
   1.472 +
   1.473 +static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
   1.474 +{
   1.475 +	struct blktap *tap = filp->private_data;
   1.476 +	struct blktap_ring *ring = &tap->ring;
   1.477 +
   1.478 +	poll_wait(filp, &ring->poll_wait, wait);
   1.479 +	if (ring->ring.sring->pad[0] != 0 ||
   1.480 +	    ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
   1.481 +		RING_PUSH_REQUESTS(&ring->ring);
   1.482 +		return POLLIN | POLLRDNORM;
   1.483 +	}
   1.484 +
   1.485 +	return 0;
   1.486 +}
   1.487 +
   1.488 +static struct file_operations blktap_ring_file_operations = {
   1.489 +	.owner    = THIS_MODULE,
   1.490 +	.open     = blktap_ring_open,
   1.491 +	.release  = blktap_ring_release,
   1.492 +	.ioctl    = blktap_ring_ioctl,
   1.493 +	.mmap     = blktap_ring_mmap,
   1.494 +	.poll     = blktap_ring_poll,
   1.495 +};
   1.496 +
   1.497 +void
   1.498 +blktap_ring_kick_user(struct blktap *tap)
   1.499 +{
   1.500 +	wake_up_interruptible(&tap->ring.poll_wait);
   1.501 +}
   1.502 +
   1.503 +int
   1.504 +blktap_ring_resume(struct blktap *tap)
   1.505 +{
   1.506 +	int err;
   1.507 +	struct blktap_ring *ring = &tap->ring;
   1.508 +
   1.509 +	if (!blktap_active(tap))
   1.510 +		return -ENODEV;
   1.511 +
   1.512 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
   1.513 +		return -EINVAL;
   1.514 +
   1.515 +	/* set shared flag for resume */
   1.516 +	ring->response = 0;
   1.517 +
   1.518 +	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
   1.519 +	blktap_ring_kick_user(tap);
   1.520 +
   1.521 +	wait_event_interruptible(tap->wq, ring->response ||
   1.522 +				 !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
   1.523 +
   1.524 +	err = ring->response;
   1.525 +	ring->response = 0;
   1.526 +
   1.527 +	BTDBG("err: %d\n", err);
   1.528 +
   1.529 +	if (err)
   1.530 +		return err;
   1.531 +
   1.532 +	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
   1.533 +		return -EAGAIN;
   1.534 +
   1.535 +	return 0;
   1.536 +}
   1.537 +
   1.538 +int
   1.539 +blktap_ring_pause(struct blktap *tap)
   1.540 +{
   1.541 +	if (!blktap_active(tap))
   1.542 +		return -ENODEV;
   1.543 +
   1.544 +	if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
   1.545 +		return -EINVAL;
   1.546 +
   1.547 +	BTDBG("draining queue\n");
   1.548 +	wait_event_interruptible(tap->wq, !tap->pending_cnt);
   1.549 +	if (tap->pending_cnt)
   1.550 +		return -EAGAIN;
   1.551 +
   1.552 +	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
   1.553 +	blktap_ring_kick_user(tap);
   1.554 +
   1.555 +	BTDBG("waiting for tapdisk response\n");
   1.556 +	wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
   1.557 +	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
   1.558 +		return -EAGAIN;
   1.559 +
   1.560 +	return 0;
   1.561 +}
   1.562 +
   1.563 +int
   1.564 +blktap_ring_destroy(struct blktap *tap)
   1.565 +{
   1.566 +	if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
   1.567 +	    !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
   1.568 +		return 0;
   1.569 +
   1.570 +	BTDBG("sending tapdisk close message\n");
   1.571 +	blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
   1.572 +	blktap_ring_kick_user(tap);
   1.573 +
   1.574 +	return -EAGAIN;
   1.575 +}
   1.576 +
   1.577 +static void
   1.578 +blktap_ring_initialize(struct blktap_ring *ring, int minor)
   1.579 +{
   1.580 +	memset(ring, 0, sizeof(*ring));
   1.581 +	init_waitqueue_head(&ring->poll_wait);
   1.582 +	ring->devno = MKDEV(blktap_ring_major, minor);
   1.583 +}
   1.584 +
   1.585 +int
   1.586 +blktap_ring_create(struct blktap *tap)
   1.587 +{
   1.588 +	struct blktap_ring *ring = &tap->ring;
   1.589 +	blktap_ring_initialize(ring, tap->minor);
   1.590 +	return blktap_sysfs_create(tap);
   1.591 +}
   1.592 +
   1.593 +int
   1.594 +blktap_ring_init(int *major)
   1.595 +{
   1.596 +	int err;
   1.597 +
   1.598 +	err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
   1.599 +	if (err < 0) {
   1.600 +		BTERR("error registering blktap ring device: %d\n", err);
   1.601 +		return err;
   1.602 +	}
   1.603 +
   1.604 +	blktap_ring_major = *major = err;
   1.605 +	BTINFO("blktap ring major: %d\n", blktap_ring_major);
   1.606 +	return 0;
   1.607 +}
   1.608 +
   1.609 +int
   1.610 +blktap_ring_free(void)
   1.611 +{
   1.612 +	if (blktap_ring_major)
   1.613 +		unregister_chrdev(blktap_ring_major, "blktap2");
   1.614 +
   1.615 +	return 0;
   1.616 +}