ia64/xen-unstable

changeset 8291:6f62ad959f6b

Support CFQ scheduling of guest block requests by creating
a kernel thread per blkif connection. General cleanup work
in the blkback driver.
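
The thread lifecycle is spread across three files (blkif_schedule() in
blkback.c, waitqueue init in interface.c, kthread_run()/kthread_stop()
in xenbus.c), so here is the pattern reduced to a minimal, self-contained
sketch. It is for illustration only; the conn/conn_* names below are
placeholders, not identifiers from this patch:

	#include <linux/kthread.h>
	#include <linux/wait.h>
	#include <linux/sched.h>
	#include <linux/err.h>
	#include <asm/atomic.h>

	struct conn {
		wait_queue_head_t   wq;          /* woken when work arrives           */
		atomic_t            io_pending;  /* set by the ring interrupt handler */
		struct task_struct *thread;      /* one kernel thread per connection  */
	};

	/* Interrupt-handler side: note the work and kick the thread. */
	static void conn_notify(struct conn *c)
	{
		atomic_inc(&c->io_pending);
		wake_up(&c->wq);
	}

	/* Thread body: sleep until there is work or we are asked to stop. */
	static int conn_thread(void *arg)
	{
		struct conn *c = arg;

		while (!kthread_should_stop()) {
			wait_event_interruptible(c->wq,
				atomic_read(&c->io_pending) ||
				kthread_should_stop());
			atomic_set(&c->io_pending, 0);
			/* ...consume the ring and submit I/O here... */
		}
		return 0;
	}

	/* Connect: start the per-connection thread. */
	static int conn_start(struct conn *c, int id)
	{
		init_waitqueue_head(&c->wq);
		atomic_set(&c->io_pending, 0);
		c->thread = kthread_run(conn_thread, c, "exampled.%d", id);
		if (IS_ERR(c->thread)) {
			int err = PTR_ERR(c->thread);
			c->thread = NULL;
			return err;
		}
		return 0;
	}

	/* Teardown: kthread_stop() wakes the thread and waits for it to exit. */
	static void conn_stop(struct conn *c)
	{
		if (c->thread)
			kthread_stop(c->thread);
	}

Because kthread_stop() sets the stop flag and wakes the task, the wait
condition must include kthread_should_stop(); blkif_schedule() does the
same, and additionally delays its exit while I/O is still pending.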

Signed-off-by: Gerd Knorr <kraxel@suse.de>
author kaf24@firebug.cl.cam.ac.uk
date Thu Dec 08 16:53:53 2005 +0100 (2005-12-08)
parents c9772105fead
children 2cc09c21cdba
files linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c linux-2.6-xen-sparse/drivers/xen/blkback/common.h linux-2.6-xen-sparse/drivers/xen/blkback/interface.c linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
line diff
     1.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Thu Dec 08 15:04:41 2005 +0000
     1.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Thu Dec 08 16:53:53 2005 +0100
     1.3 @@ -12,6 +12,8 @@
     1.4   */
     1.5  
     1.6  #include <linux/spinlock.h>
     1.7 +#include <linux/kthread.h>
     1.8 +#include <linux/list.h>
     1.9  #include <asm-xen/balloon.h>
    1.10  #include <asm/hypervisor.h>
    1.11  #include "common.h"
    1.12 @@ -21,26 +23,26 @@
    1.13   * pulled from a communication ring are quite likely to end up being part of
    1.14   * the same scatter/gather request at the disc.
    1.15   * 
    1.16 - * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
    1.17 + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
    1.18 + * 
    1.19   * This will increase the chances of being able to write whole tracks.
    1.20   * 64 should be enough to keep us competitive with Linux.
    1.21   */
    1.22 -#define MAX_PENDING_REQS 64
    1.23 -#define BATCH_PER_DOMAIN 16
    1.24 +static int blkif_reqs = 64;
    1.25 +static int mmap_pages;
    1.26  
    1.27 -static unsigned long mmap_vstart;
    1.28 -#define MMAP_PAGES						\
    1.29 -	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
    1.30 -#ifdef __ia64__
    1.31 -static void *pending_vaddrs[MMAP_PAGES];
    1.32 -#define MMAP_VADDR(_idx, _i) \
    1.33 -	(unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
    1.34 -#else
    1.35 -#define MMAP_VADDR(_req,_seg)						\
    1.36 -	(mmap_vstart +							\
    1.37 -	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
    1.38 -	 ((_seg) * PAGE_SIZE))
    1.39 -#endif
    1.40 +static int __init set_blkif_reqs(char *str)
    1.41 +{
    1.42 +	get_option(&str, &blkif_reqs);
    1.43 +	return 1;
    1.44 +}
    1.45 +__setup("blkif_reqs=", set_blkif_reqs);
    1.46 +
    1.47 +/* Run-time switchable: /sys/module/blkback/parameters/ */
    1.48 +static unsigned int log_stats = 0;
    1.49 +static unsigned int debug_lvl = 0;
    1.50 +module_param(log_stats, int, 0644);
    1.51 +module_param(debug_lvl, int, 0644);
    1.52  
    1.53  /*
    1.54   * Each outstanding request that we've passed to the lower device layers has a 
    1.55 @@ -55,43 +57,33 @@ typedef struct {
    1.56  	atomic_t       pendcnt;
    1.57  	unsigned short operation;
    1.58  	int            status;
    1.59 +	struct list_head free_list;
    1.60  } pending_req_t;
    1.61  
    1.62 -/*
    1.63 - * We can't allocate pending_req's in order, since they may complete out of 
    1.64 - * order. We therefore maintain an allocation ring. This ring also indicates 
    1.65 - * when enough work has been passed down -- at that point the allocation ring 
    1.66 - * will be empty.
    1.67 - */
    1.68 -static pending_req_t pending_reqs[MAX_PENDING_REQS];
    1.69 -static unsigned char pending_ring[MAX_PENDING_REQS];
    1.70 -static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
    1.71 -/* NB. We use a different index type to differentiate from shared blk rings. */
    1.72 -typedef unsigned int PEND_RING_IDX;
    1.73 -#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
    1.74 -static PEND_RING_IDX pending_prod, pending_cons;
    1.75 -#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
    1.76 +static pending_req_t *pending_reqs;
    1.77 +static struct list_head pending_free;
    1.78 +static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
    1.79 +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
    1.80 +
    1.81 +#define BLKBACK_INVALID_HANDLE (~0)
    1.82  
    1.83 -static request_queue_t *plugged_queue;
    1.84 -static inline void flush_plugged_queue(void)
    1.85 +static unsigned long mmap_vstart;
    1.86 +static unsigned long *pending_vaddrs;
    1.87 +static grant_handle_t *pending_grant_handles;
    1.88 +
    1.89 +static inline int vaddr_pagenr(pending_req_t *req, int seg)
    1.90  {
    1.91 -	request_queue_t *q = plugged_queue;
    1.92 -	if (q != NULL) {
    1.93 -		if ( q->unplug_fn != NULL )
    1.94 -			q->unplug_fn(q);
    1.95 -		blk_put_queue(q);
    1.96 -		plugged_queue = NULL;
    1.97 -	}
    1.98 +	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
    1.99  }
   1.100  
   1.101 -/* When using grant tables to map a frame for device access then the
   1.102 - * handle returned must be used to unmap the frame. This is needed to
   1.103 - * drop the ref count on the frame.
   1.104 - */
   1.105 -static grant_handle_t pending_grant_handles[MMAP_PAGES];
   1.106 -#define pending_handle(_idx, _i) \
   1.107 -    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
   1.108 -#define BLKBACK_INVALID_HANDLE (~0)
   1.109 +static inline unsigned long vaddr(pending_req_t *req, int seg)
   1.110 +{
   1.111 +	return pending_vaddrs[vaddr_pagenr(req, seg)];
   1.112 +}
   1.113 +
   1.114 +#define pending_handle(_req, _seg) \
   1.115 +	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
   1.116 +
   1.117  
   1.118  #ifdef CONFIG_XEN_BLKDEV_TAP_BE
   1.119  /*
   1.120 @@ -105,26 +97,79 @@ static grant_handle_t pending_grant_hand
   1.121  static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
   1.122  #endif
   1.123  
   1.124 -static int do_block_io_op(blkif_t *blkif, int max_to_do);
   1.125 -static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
   1.126 +static int do_block_io_op(blkif_t *blkif);
   1.127 +static void dispatch_rw_block_io(blkif_t *blkif,
   1.128 +				 blkif_request_t *req,
   1.129 +				 pending_req_t *pending_req);
   1.130  static void make_response(blkif_t *blkif, unsigned long id, 
   1.131                            unsigned short op, int st);
   1.132  
   1.133 -static void fast_flush_area(int idx, int nr_pages)
   1.134 +/******************************************************************
   1.135 + * misc small helpers
   1.136 + */
   1.137 +static pending_req_t* alloc_req(void)
   1.138 +{
   1.139 +	pending_req_t *req = NULL;
   1.140 +	unsigned long flags;
   1.141 +
   1.142 +	spin_lock_irqsave(&pending_free_lock, flags);
   1.143 +	if (!list_empty(&pending_free)) {
   1.144 +		req = list_entry(pending_free.next, pending_req_t, free_list);
   1.145 +		list_del(&req->free_list);
   1.146 +	}
   1.147 +	spin_unlock_irqrestore(&pending_free_lock, flags);
   1.148 +	return req;
   1.149 +}
   1.150 +
   1.151 +static void free_req(pending_req_t *req)
   1.152 +{
   1.153 +	unsigned long flags;
   1.154 +	int was_empty;
   1.155 +
   1.156 +	spin_lock_irqsave(&pending_free_lock, flags);
   1.157 +	was_empty = list_empty(&pending_free);
   1.158 +	list_add(&req->free_list, &pending_free);
   1.159 +	spin_unlock_irqrestore(&pending_free_lock, flags);
   1.160 +	if (was_empty)
   1.161 +		wake_up(&pending_free_wq);
   1.162 +}
   1.163 +
   1.164 +static void unplug_queue(blkif_t *blkif)
   1.165 +{
   1.166 +	if (blkif->plug == NULL)
   1.167 +		return;
   1.168 +	if (blkif->plug->unplug_fn)
   1.169 +		blkif->plug->unplug_fn(blkif->plug);
   1.170 +	blk_put_queue(blkif->plug);
   1.171 +	blkif->plug = NULL;
   1.172 +}
   1.173 +
   1.174 +static void plug_queue(blkif_t *blkif, struct bio *bio)
   1.175 +{
   1.176 +	request_queue_t *q = bdev_get_queue(bio->bi_bdev);
   1.177 +
   1.178 +	if (q == blkif->plug)
   1.179 +		return;
   1.180 +	unplug_queue(blkif);
   1.181 +	blk_get_queue(q);
   1.182 +	blkif->plug = q;
   1.183 +}
   1.184 +
   1.185 +static void fast_flush_area(pending_req_t *req)
   1.186  {
   1.187  	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   1.188  	unsigned int i, invcount = 0;
   1.189  	grant_handle_t handle;
   1.190  	int ret;
   1.191  
   1.192 -	for (i = 0; i < nr_pages; i++) {
   1.193 -		handle = pending_handle(idx, i);
   1.194 +	for (i = 0; i < req->nr_pages; i++) {
   1.195 +		handle = pending_handle(req, i);
   1.196  		if (handle == BLKBACK_INVALID_HANDLE)
   1.197  			continue;
   1.198 -		unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
   1.199 +		unmap[invcount].host_addr    = vaddr(req, i);
   1.200  		unmap[invcount].dev_bus_addr = 0;
   1.201  		unmap[invcount].handle       = handle;
   1.202 -		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
   1.203 +		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
   1.204  		invcount++;
   1.205  	}
   1.206  
   1.207 @@ -133,118 +178,90 @@ static void fast_flush_area(int idx, int
   1.208  	BUG_ON(ret);
   1.209  }
   1.210  
   1.211 -
   1.212 -/******************************************************************
   1.213 - * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
   1.214 - */
   1.215 -
   1.216 -static struct list_head blkio_schedule_list;
   1.217 -static spinlock_t blkio_schedule_list_lock;
   1.218 -
   1.219 -static int __on_blkdev_list(blkif_t *blkif)
   1.220 -{
   1.221 -	return blkif->blkdev_list.next != NULL;
   1.222 -}
   1.223 -
   1.224 -static void remove_from_blkdev_list(blkif_t *blkif)
   1.225 -{
   1.226 -	unsigned long flags;
   1.227 -
   1.228 -	if (!__on_blkdev_list(blkif))
   1.229 -		return;
   1.230 -
   1.231 -	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
   1.232 -	if (__on_blkdev_list(blkif)) {
   1.233 -		list_del(&blkif->blkdev_list);
   1.234 -		blkif->blkdev_list.next = NULL;
   1.235 -		blkif_put(blkif);
   1.236 -	}
   1.237 -	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
   1.238 -}
   1.239 -
   1.240 -static void add_to_blkdev_list_tail(blkif_t *blkif)
   1.241 -{
   1.242 -	unsigned long flags;
   1.243 -
   1.244 -	if (__on_blkdev_list(blkif))
   1.245 -		return;
   1.246 -
   1.247 -	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
   1.248 -	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
   1.249 -		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
   1.250 -		blkif_get(blkif);
   1.251 -	}
   1.252 -	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
   1.253 -}
   1.254 -
   1.255 -
   1.256  /******************************************************************
   1.257   * SCHEDULER FUNCTIONS
   1.258   */
   1.259  
   1.260 -static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
   1.261 -
   1.262 -static int blkio_schedule(void *arg)
   1.263 +static void print_stats(blkif_t *blkif)
   1.264  {
   1.265 -	DECLARE_WAITQUEUE(wq, current);
   1.266 +	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
   1.267 +	       current->comm, blkif->st_oo_req,
   1.268 +	       blkif->st_rd_req, blkif->st_wr_req);
   1.269 +	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
   1.270 +	blkif->st_rd_req = 0;
   1.271 +	blkif->st_wr_req = 0;
   1.272 +	blkif->st_oo_req = 0;
   1.273 +}
   1.274  
   1.275 -	blkif_t          *blkif;
   1.276 -	struct list_head *ent;
   1.277 +int blkif_schedule(void *arg)
   1.278 +{
   1.279 +	blkif_t          *blkif = arg;
   1.280  
   1.281 -	daemonize("xenblkd");
   1.282 -
   1.283 +	blkif_get(blkif);
   1.284 +	if (debug_lvl)
   1.285 +		printk(KERN_DEBUG "%s: started\n", current->comm);
   1.286  	for (;;) {
   1.287 -		/* Wait for work to do. */
   1.288 -		add_wait_queue(&blkio_schedule_wait, &wq);
   1.289 -		set_current_state(TASK_INTERRUPTIBLE);
   1.290 -		if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
   1.291 -		     list_empty(&blkio_schedule_list) )
   1.292 -			schedule();
   1.293 -		__set_current_state(TASK_RUNNING);
   1.294 -		remove_wait_queue(&blkio_schedule_wait, &wq);
   1.295 -
   1.296 -		/* Queue up a batch of requests. */
   1.297 -		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
   1.298 -		       !list_empty(&blkio_schedule_list)) {
   1.299 -			ent = blkio_schedule_list.next;
   1.300 -			blkif = list_entry(ent, blkif_t, blkdev_list);
   1.301 -			blkif_get(blkif);
   1.302 -			remove_from_blkdev_list(blkif);
   1.303 -			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
   1.304 -				add_to_blkdev_list_tail(blkif);
   1.305 -			blkif_put(blkif);
   1.306 +		if (kthread_should_stop()) {
   1.307 +			/* asked to quit? */
   1.308 +			if (!atomic_read(&blkif->io_pending))
   1.309 +				break;
   1.310 +			if (debug_lvl)
   1.311 +				printk(KERN_DEBUG "%s: I/O pending, "
   1.312 +				       "delaying exit\n", current->comm);
   1.313  		}
   1.314  
   1.315 -		/* Push the batch through to disc. */
   1.316 -		flush_plugged_queue();
   1.317 -	}
   1.318 -}
   1.319 +		if (!atomic_read(&blkif->io_pending)) {
   1.320 +			/* Wait for work to do. */
   1.321 +			wait_event_interruptible(
   1.322 +				blkif->wq,
   1.323 +				(atomic_read(&blkif->io_pending) ||
   1.324 +				 kthread_should_stop()));
   1.325 +		} else if (list_empty(&pending_free)) {
   1.326 +			/* Wait for pending_req becoming available. */
   1.327 +			wait_event_interruptible(
   1.328 +				pending_free_wq,
   1.329 +				!list_empty(&pending_free));
   1.330 +		}
   1.331  
   1.332 -static void maybe_trigger_blkio_schedule(void)
   1.333 -{
   1.334 -	/*
   1.335 -	 * Needed so that two processes, which together make the following
   1.336 -	 * predicate true, don't both read stale values and evaluate the
   1.337 -	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
   1.338 -	 * on x86, but...
   1.339 -	 */
   1.340 -	smp_mb();
   1.341 +		if (blkif->status != CONNECTED) {
   1.342 +			/* make sure we are connected */
   1.343 +			if (debug_lvl)
   1.344 +				printk(KERN_DEBUG "%s: not connected "
   1.345 +				       "(%d pending)\n",
   1.346 +				       current->comm,
   1.347 +				       atomic_read(&blkif->io_pending));
   1.348 +			wait_event_interruptible(
   1.349 +				blkif->wq,
   1.350 +				(blkif->status == CONNECTED ||
   1.351 +				 kthread_should_stop()));
   1.352 +			continue;
   1.353 +		}
   1.354  
   1.355 -	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
   1.356 -	    !list_empty(&blkio_schedule_list))
   1.357 -		wake_up(&blkio_schedule_wait);
   1.358 +		/* Schedule I/O */
   1.359 +		atomic_set(&blkif->io_pending, 0);
   1.360 +		if (do_block_io_op(blkif))
   1.361 +			atomic_inc(&blkif->io_pending);
   1.362 +		unplug_queue(blkif);
   1.363 +
   1.364 +		if (log_stats && time_after(jiffies, blkif->st_print))
   1.365 +			print_stats(blkif);
   1.366 +	}
   1.367 +
   1.368 +	if (log_stats)
   1.369 +		print_stats(blkif);
   1.370 +	if (debug_lvl)
   1.371 +		printk(KERN_DEBUG "%s: exiting\n", current->comm);
   1.372 +	blkif->xenblkd = NULL;
   1.373 +	blkif_put(blkif);
   1.374 +	return 0;
   1.375  }
   1.376  
   1.377 -
   1.378 -
   1.379  /******************************************************************
   1.380   * COMPLETION CALLBACK -- Called as bh->b_end_io()
   1.381   */
   1.382  
   1.383  static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
   1.384  {
   1.385 -	unsigned long flags;
   1.386 -
   1.387  	/* An error fails the entire request. */
   1.388  	if (!uptodate) {
   1.389  		DPRINTK("Buffer not up-to-date at end of operation\n");
   1.390 @@ -252,15 +269,11 @@ static void __end_block_io_op(pending_re
   1.391  	}
   1.392  
   1.393  	if (atomic_dec_and_test(&pending_req->pendcnt)) {
   1.394 -		int pending_idx = pending_req - pending_reqs;
   1.395 -		fast_flush_area(pending_idx, pending_req->nr_pages);
   1.396 +		fast_flush_area(pending_req);
   1.397  		make_response(pending_req->blkif, pending_req->id,
   1.398  			      pending_req->operation, pending_req->status);
   1.399  		blkif_put(pending_req->blkif);
   1.400 -		spin_lock_irqsave(&pend_prod_lock, flags);
   1.401 -		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
   1.402 -		spin_unlock_irqrestore(&pend_prod_lock, flags);
   1.403 -		maybe_trigger_blkio_schedule();
   1.404 +		free_req(pending_req);
   1.405  	}
   1.406  }
   1.407  
   1.408 @@ -281,8 +294,9 @@ static int end_block_io_op(struct bio *b
   1.409  irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
   1.410  {
   1.411  	blkif_t *blkif = dev_id;
   1.412 -	add_to_blkdev_list_tail(blkif);
   1.413 -	maybe_trigger_blkio_schedule();
   1.414 +
   1.415 +	atomic_inc(&blkif->io_pending);
   1.416 +	wake_up(&blkif->wq);
   1.417  	return IRQ_HANDLED;
   1.418  }
   1.419  
   1.420 @@ -292,10 +306,11 @@ irqreturn_t blkif_be_int(int irq, void *
   1.421   * DOWNWARD CALLS -- These interface with the block-device layer proper.
   1.422   */
   1.423  
   1.424 -static int do_block_io_op(blkif_t *blkif, int max_to_do)
   1.425 +static int do_block_io_op(blkif_t *blkif)
   1.426  {
   1.427  	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
   1.428  	blkif_request_t *req;
   1.429 +	pending_req_t *pending_req;
   1.430  	RING_IDX rc, rp;
   1.431  	int more_to_do = 0;
   1.432  
   1.433 @@ -304,8 +319,10 @@ static int do_block_io_op(blkif_t *blkif
   1.434  	rmb(); /* Ensure we see queued requests up to 'rp'. */
   1.435  
   1.436  	while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
   1.437 -		if ((max_to_do-- == 0) ||
   1.438 -		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
   1.439 +
   1.440 +		pending_req = alloc_req();
   1.441 +		if (NULL == pending_req) {
   1.442 +			blkif->st_oo_req++;
   1.443  			more_to_do = 1;
   1.444  			break;
   1.445  		}
   1.446 @@ -315,28 +332,31 @@ static int do_block_io_op(blkif_t *blkif
   1.447  
   1.448  		switch (req->operation) {
   1.449  		case BLKIF_OP_READ:
   1.450 +			blkif->st_rd_req++;
   1.451 +			dispatch_rw_block_io(blkif, req, pending_req);
   1.452 +			break;
   1.453  		case BLKIF_OP_WRITE:
   1.454 -			dispatch_rw_block_io(blkif, req);
   1.455 +			blkif->st_wr_req++;
   1.456 +			dispatch_rw_block_io(blkif, req, pending_req);
   1.457  			break;
   1.458 -
   1.459  		default:
   1.460  			DPRINTK("error: unknown block io operation [%d]\n",
   1.461  				req->operation);
   1.462  			make_response(blkif, req->id, req->operation,
   1.463  				      BLKIF_RSP_ERROR);
   1.464 +			free_req(pending_req);
   1.465  			break;
   1.466  		}
   1.467  	}
   1.468 -
   1.469  	return more_to_do;
   1.470  }
   1.471  
   1.472 -static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
   1.473 +static void dispatch_rw_block_io(blkif_t *blkif,
   1.474 +				 blkif_request_t *req,
   1.475 +				 pending_req_t *pending_req)
   1.476  {
   1.477  	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
   1.478  	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
   1.479 -	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
   1.480 -	pending_req_t *pending_req;
   1.481  	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   1.482  	struct phys_req preq;
   1.483  	struct { 
   1.484 @@ -344,32 +364,36 @@ static void dispatch_rw_block_io(blkif_t
   1.485  	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   1.486  	unsigned int nseg;
   1.487  	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
   1.488 -	int nbio = 0;
   1.489 -	request_queue_t *q;
   1.490 -	int ret, errors = 0;
   1.491 +	int ret, i, nbio = 0;
   1.492  
   1.493  	/* Check that number of segments is sane. */
   1.494  	nseg = req->nr_segments;
   1.495  	if (unlikely(nseg == 0) || 
   1.496  	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
   1.497  		DPRINTK("Bad number of segments in request (%d)\n", nseg);
   1.498 -		goto bad_descriptor;
   1.499 +		goto fail_response;
   1.500  	}
   1.501  
   1.502  	preq.dev           = req->handle;
   1.503  	preq.sector_number = req->sector_number;
   1.504  	preq.nr_sects      = 0;
   1.505  
   1.506 +	pending_req->blkif     = blkif;
   1.507 +	pending_req->id        = req->id;
   1.508 +	pending_req->operation = operation;
   1.509 +	pending_req->status    = BLKIF_RSP_OKAY;
   1.510 +	pending_req->nr_pages  = nseg;
   1.511 +
   1.512  	for (i = 0; i < nseg; i++) {
   1.513  		seg[i].nsec = req->seg[i].last_sect -
   1.514  			req->seg[i].first_sect + 1;
   1.515  
   1.516  		if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
   1.517  		    (seg[i].nsec <= 0))
   1.518 -			goto bad_descriptor;
   1.519 +			goto fail_response;
   1.520  		preq.nr_sects += seg[i].nsec;
   1.521  
   1.522 -		map[i].host_addr = MMAP_VADDR(pending_idx, i);
   1.523 +		map[i].host_addr = vaddr(pending_req, i);
   1.524  		map[i].dom = blkif->domid;
   1.525  		map[i].ref = req->seg[i].gref;
   1.526  		map[i].flags = GNTMAP_host_map;
   1.527 @@ -381,26 +405,22 @@ static void dispatch_rw_block_io(blkif_t
   1.528  	BUG_ON(ret);
   1.529  
   1.530  	for (i = 0; i < nseg; i++) {
   1.531 -		if (likely(map[i].status == 0)) {
   1.532 -			pending_handle(pending_idx, i) = map[i].handle;
   1.533 +		if (unlikely(map[i].status != 0)) {
   1.534 +			DPRINTK("invalid buffer -- could not remap it\n");
   1.535 +			goto fail_flush;
   1.536 +		}
   1.537 +
   1.538 +		pending_handle(pending_req, i) = map[i].handle;
   1.539  #ifdef __ia64__
   1.540 -			MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
    1.541 +		pending_vaddrs[vaddr_pagenr(pending_req, i)] =
    1.542 +			gnttab_map_vaddr(map[i]);
   1.543  #else
   1.544 -			set_phys_to_machine(__pa(MMAP_VADDR(
   1.545 -				pending_idx, i)) >> PAGE_SHIFT,
   1.546 -				FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
   1.547 +		set_phys_to_machine(__pa(vaddr(
   1.548 +			pending_req, i)) >> PAGE_SHIFT,
   1.549 +			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
   1.550  #endif
   1.551 -			seg[i].buf = map[i].dev_bus_addr |
   1.552 -				(req->seg[i].first_sect << 9);
   1.553 -		} else {
   1.554 -			errors++;
   1.555 -		}
   1.556 -	}
   1.557 -
   1.558 -	if (errors) {
   1.559 -		DPRINTK("invalid buffer -- could not remap it\n");
   1.560 -		fast_flush_area(pending_idx, nseg);
   1.561 -		goto bad_descriptor;
   1.562 +		seg[i].buf  = map[i].dev_bus_addr | 
   1.563 +			(req->seg[i].first_sect << 9);
   1.564  	}
   1.565  
   1.566  	if (vbd_translate(&preq, blkif, operation) != 0) {
   1.567 @@ -408,37 +428,25 @@ static void dispatch_rw_block_io(blkif_t
   1.568  			operation == READ ? "read" : "write",
   1.569  			preq.sector_number,
   1.570  			preq.sector_number + preq.nr_sects, preq.dev); 
   1.571 -		goto bad_descriptor;
   1.572 +		goto fail_flush;
   1.573  	}
   1.574  
   1.575 -	pending_req = &pending_reqs[pending_idx];
   1.576 -	pending_req->blkif     = blkif;
   1.577 -	pending_req->id        = req->id;
   1.578 -	pending_req->operation = operation;
   1.579 -	pending_req->status    = BLKIF_RSP_OKAY;
   1.580 -	pending_req->nr_pages  = nseg;
   1.581 -
   1.582  	for (i = 0; i < nseg; i++) {
   1.583  		if (((int)preq.sector_number|(int)seg[i].nsec) &
   1.584  		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
   1.585  			DPRINTK("Misaligned I/O request from domain %d",
   1.586  				blkif->domid);
   1.587 -			goto cleanup_and_fail;
   1.588 +			goto fail_put_bio;
   1.589  		}
   1.590  
   1.591  		while ((bio == NULL) ||
   1.592  		       (bio_add_page(bio,
   1.593 -				     virt_to_page(MMAP_VADDR(pending_idx, i)),
   1.594 +				     virt_to_page(vaddr(pending_req, i)),
   1.595  				     seg[i].nsec << 9,
   1.596  				     seg[i].buf & ~PAGE_MASK) == 0)) {
   1.597  			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
   1.598 -			if (unlikely(bio == NULL)) {
   1.599 -			cleanup_and_fail:
   1.600 -				for (i = 0; i < (nbio-1); i++)
   1.601 -					bio_put(biolist[i]);
   1.602 -				fast_flush_area(pending_idx, nseg);
   1.603 -				goto bad_descriptor;
   1.604 -			}
   1.605 +			if (unlikely(bio == NULL))
   1.606 +				goto fail_put_bio;
   1.607                  
   1.608  			bio->bi_bdev    = preq.bdev;
   1.609  			bio->bi_private = pending_req;
   1.610 @@ -449,14 +457,8 @@ static void dispatch_rw_block_io(blkif_t
   1.611  		preq.sector_number += seg[i].nsec;
   1.612  	}
   1.613  
   1.614 -	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
   1.615 -		flush_plugged_queue();
   1.616 -		blk_get_queue(q);
   1.617 -		plugged_queue = q;
   1.618 -	}
   1.619 -
   1.620 +	plug_queue(blkif, bio);
   1.621  	atomic_set(&pending_req->pendcnt, nbio);
   1.622 -	pending_cons++;
   1.623  	blkif_get(blkif);
   1.624  
   1.625  	for (i = 0; i < nbio; i++)
   1.626 @@ -464,8 +466,14 @@ static void dispatch_rw_block_io(blkif_t
   1.627  
   1.628  	return;
   1.629  
   1.630 - bad_descriptor:
   1.631 + fail_put_bio:
   1.632 +	for (i = 0; i < (nbio-1); i++)
   1.633 +		bio_put(biolist[i]);
   1.634 + fail_flush:
   1.635 +	fast_flush_area(pending_req);
   1.636 + fail_response:
   1.637  	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
   1.638 +	free_req(pending_req);
   1.639  } 
   1.640  
   1.641  
   1.642 @@ -481,6 +489,7 @@ static void make_response(blkif_t *blkif
   1.643  	blkif_response_t *resp;
   1.644  	unsigned long     flags;
   1.645  	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
   1.646 +	int more_to_do = 0;
   1.647  	int notify;
   1.648  
   1.649  	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
   1.650 @@ -499,76 +508,69 @@ static void make_response(blkif_t *blkif
   1.651  		 * notifications if requests are already in flight (lower
   1.652  		 * overheads and promotes batching).
   1.653  		 */
   1.654 -		int more_to_do;
   1.655  		RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
   1.656 -		if (more_to_do) {
   1.657 -			add_to_blkdev_list_tail(blkif);
   1.658 -			maybe_trigger_blkio_schedule();
   1.659 -		}
   1.660 +
   1.661 +	} else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
   1.662 +		more_to_do = 1;
   1.663 +
   1.664  	}
   1.665 -	else if (!__on_blkdev_list(blkif)
   1.666 -		 && RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
   1.667 -		/* Keep pulling requests as they become available... */
   1.668 -		add_to_blkdev_list_tail(blkif);
   1.669 -		maybe_trigger_blkio_schedule();
   1.670 -	}
   1.671 -
   1.672  	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
   1.673  
   1.674 +	if (more_to_do) {
   1.675 +		atomic_inc(&blkif->io_pending);
   1.676 +		wake_up(&blkif->wq);
   1.677 +	}
   1.678  	if (notify)
   1.679  		notify_remote_via_irq(blkif->irq);
   1.680  }
   1.681  
   1.682 -void blkif_deschedule(blkif_t *blkif)
   1.683 -{
   1.684 -	remove_from_blkdev_list(blkif);
   1.685 -}
   1.686 -
   1.687  static int __init blkif_init(void)
   1.688  {
   1.689 +	struct page *page;
   1.690  	int i;
   1.691 -	struct page *page;
   1.692 -	int ret;
   1.693 -
   1.694 -	for (i = 0; i < MMAP_PAGES; i++)
   1.695 -		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
   1.696  
   1.697  	if (xen_init() < 0)
   1.698  		return -ENODEV;
   1.699  
   1.700 -	blkif_interface_init();
   1.701 -
   1.702 -#ifdef __ia64__
   1.703 -    {
   1.704 -	extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
   1.705 -	int i;
   1.706 +	mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
   1.707 +	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
   1.708 +					blkif_reqs, GFP_KERNEL);
   1.709 +	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
   1.710 +					mmap_pages, GFP_KERNEL);
   1.711 +	pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
   1.712 +					mmap_pages, GFP_KERNEL);
   1.713 +	if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
   1.714 +		printk("%s: out of memory\n", __FUNCTION__);
   1.715 +		return -1;
   1.716 +	}
   1.717  
   1.718 -	mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
   1.719 -	printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
   1.720 -	for(i = 0; i < MMAP_PAGES; i++)
   1.721 -	    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
   1.722 -	BUG_ON(mmap_vstart == NULL);
   1.723 -    }
   1.724 -#else
   1.725 -	page = balloon_alloc_empty_page_range(MMAP_PAGES);
   1.726 +	blkif_interface_init();
   1.727 +	
   1.728 +#ifdef __ia64__
   1.729 +	extern unsigned long alloc_empty_foreign_map_page_range(
   1.730 +		unsigned long pages);
   1.731 +	mmap_vstart = (unsigned long)
   1.732 +		alloc_empty_foreign_map_page_range(mmap_pages);
   1.733 +#else /* ! ia64 */
   1.734 +	page = balloon_alloc_empty_page_range(mmap_pages);
   1.735  	BUG_ON(page == NULL);
   1.736  	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
   1.737  #endif
   1.738 +	printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
   1.739 +	       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
   1.740 +	BUG_ON(mmap_vstart == 0);
   1.741 +	for (i = 0; i < mmap_pages; i++) {
   1.742 +		pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
   1.743 +		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
   1.744 +	}
   1.745  
   1.746 -	pending_cons = 0;
   1.747 -	pending_prod = MAX_PENDING_REQS;
   1.748  	memset(pending_reqs, 0, sizeof(pending_reqs));
   1.749 -	for (i = 0; i < MAX_PENDING_REQS; i++)
   1.750 -		pending_ring[i] = i;
   1.751 +	INIT_LIST_HEAD(&pending_free);
   1.752 +
   1.753 +	for (i = 0; i < blkif_reqs; i++)
   1.754 +		list_add_tail(&pending_reqs[i].free_list, &pending_free);
   1.755      
   1.756 -	spin_lock_init(&blkio_schedule_list_lock);
   1.757 -	INIT_LIST_HEAD(&blkio_schedule_list);
   1.758 -
   1.759 -	ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
   1.760 -	BUG_ON(ret < 0);
   1.761 -
   1.762  	blkif_xenbus_init();
   1.763 -
   1.764  	return 0;
   1.765  }
   1.766  
     2.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Thu Dec 08 15:04:41 2005 +0000
     2.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Thu Dec 08 16:53:53 2005 +0100
     2.3 @@ -60,10 +60,20 @@ typedef struct blkif_st {
     2.4  	/* Is this a blktap frontend */
     2.5  	unsigned int     is_blktap;
     2.6  #endif
     2.7 -	struct list_head blkdev_list;
     2.8  	spinlock_t       blk_ring_lock;
     2.9  	atomic_t         refcnt;
    2.10  
    2.11 +	wait_queue_head_t   wq;
    2.12 +	struct task_struct  *xenblkd;
    2.13 +	atomic_t            io_pending;
    2.14 +	request_queue_t     *plug;
    2.15 +
    2.16 +	/* statistics */
    2.17 +	unsigned long       st_print;
    2.18 +	int                 st_rd_req;
    2.19 +	int                 st_wr_req;
    2.20 +	int                 st_oo_req;
    2.21 +
    2.22  	struct work_struct free_work;
    2.23  
    2.24  	grant_handle_t shmem_handle;
    2.25 @@ -101,11 +111,10 @@ int vbd_translate(struct phys_req *req, 
    2.26  
    2.27  void blkif_interface_init(void);
    2.28  
    2.29 -void blkif_deschedule(blkif_t *blkif);
    2.30 -
    2.31  void blkif_xenbus_init(void);
    2.32  
    2.33  irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
    2.34 +int blkif_schedule(void *arg);
    2.35  
    2.36  void update_blkif_status(blkif_t *blkif); 
    2.37  
     3.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Thu Dec 08 15:04:41 2005 +0000
     3.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Thu Dec 08 16:53:53 2005 +0100
     3.3 @@ -24,6 +24,8 @@ blkif_t *alloc_blkif(domid_t domid)
     3.4  	blkif->status = DISCONNECTED;
     3.5  	spin_lock_init(&blkif->blk_ring_lock);
     3.6  	atomic_set(&blkif->refcnt, 1);
     3.7 +	init_waitqueue_head(&blkif->wq);
     3.8 +	blkif->st_print = jiffies;
     3.9  
    3.10  	return blkif;
    3.11  }
     4.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Thu Dec 08 15:04:41 2005 +0000
     4.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Thu Dec 08 16:53:53 2005 +0100
     4.3 @@ -20,6 +20,7 @@
     4.4  
     4.5  #include <stdarg.h>
     4.6  #include <linux/module.h>
     4.7 +#include <linux/kthread.h>
     4.8  #include <asm-xen/xenbus.h>
     4.9  #include "common.h"
    4.10  
    4.11 @@ -92,6 +93,8 @@ static int blkback_remove(struct xenbus_
    4.12  	}
    4.13  	if (be->blkif) {
    4.14  		be->blkif->status = DISCONNECTED; 
    4.15 +		if (be->blkif->xenblkd)
    4.16 +			kthread_stop(be->blkif->xenblkd);
    4.17  		blkif_put(be->blkif);
    4.18  		be->blkif = NULL;
    4.19  	}
    4.20 @@ -220,6 +223,17 @@ static void backend_changed(struct xenbu
    4.21  			return;
    4.22  		}
    4.23  
    4.24 +		be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
    4.25 +						 "xvd %d %02x:%02x",
    4.26 +						 be->blkif->domid,
    4.27 +						 be->major, be->minor);
    4.28 +		if (IS_ERR(be->blkif->xenblkd)) {
    4.29 +			err = PTR_ERR(be->blkif->xenblkd);
    4.30 +			be->blkif->xenblkd = NULL;
    4.31 +			xenbus_dev_error(dev, err, "start xenblkd");
    4.32 +			return;
    4.33 +		}
    4.34 +
    4.35  		device_create_file(&dev->dev, &dev_attr_physical_device);
    4.36  		device_create_file(&dev->dev, &dev_attr_mode);
    4.37