ia64/xen-unstable

changeset 1910:780dd1691d62

bitkeeper revision 1.1108.17.1 (41051ec1NERNxLF017rAWe7ljBk92w)

A better fix for blkdev request merging. It should work for both IDE
and SCSI, performs as much merging as possible, and removes the need
for a PIO fallback mode.
author kaf24@scramble.cl.cam.ac.uk
date Mon Jul 26 15:09:53 2004 +0000 (2004-07-26)
parents 9fd9d87a5a61
children cb113908a384
files .rootkeys linux-2.4.26-xen-sparse/include/asm-xen/pci.h linux-2.4.26-xen-sparse/include/linux/blkdev.h
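
The heart of the fix is in the new blkdev.h below: under CONFIG_XEN,
bh_phys() is defined via page_to_bus(), so segment-merge checks compare
machine addresses rather than pseudo-physical ones. Below is a minimal
sketch of a merge predicate built from the BH_CONTIG and BH_PHYS_4G
macros the header adds; blk_seg_merge_ok() is only declared there, so
this body is an assumption for illustration, not the actual ll_rw_blk.c
implementation:

    #include <linux/blkdev.h>

    /* Sketch, assuming the macros from the new blkdev.h below.  Under
     * CONFIG_XEN, bh_phys() yields machine (bus) addresses, so two
     * buffers merge only when machine-contiguous -- the property the
     * old pci_map_sg() loop had to re-verify after the fact. */
    static inline int seg_merge_ok(struct buffer_head *bh1,
                                   struct buffer_head *bh2)
    {
        /* Must be adjacent in machine-address space... */
        if (!BH_CONTIG(bh1, bh2))
            return 0;
        /* ...and must not cross a 4GB bus-address boundary. */
        if (!BH_PHYS_4G(bh1, bh2))
            return 0;
        return 1;
    }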
line diff
     1.1 --- a/.rootkeys	Fri Jul 23 22:39:55 2004 +0000
     1.2 +++ b/.rootkeys	Mon Jul 26 15:09:53 2004 +0000
     1.3 @@ -135,6 +135,7 @@ 3f1056a9L_kqHcFheV00KbKBzv9j5w linux-2.4
     1.4  3f689063nhrIRsMMZjZxMFk7iEINqQ linux-2.4.26-xen-sparse/include/asm-xen/xen_proc.h
     1.5  40659defgWA92arexpMGn8X3QMDj3w linux-2.4.26-xen-sparse/include/asm-xen/xor.h
     1.6  3f056927gMHl7mWB89rb73JahbhQIA linux-2.4.26-xen-sparse/include/linux/blk.h
     1.7 +41051ec1m6bJVjZocTG0C0V0O6RsVg linux-2.4.26-xen-sparse/include/linux/blkdev.h
     1.8  401c0590D_kwJDU59X8NyvqSv_Cl2A linux-2.4.26-xen-sparse/include/linux/sched.h
     1.9  40a248afgI0_JKthdYAe8beVfXSTpQ linux-2.4.26-xen-sparse/include/linux/skbuff.h
    1.10  3e5a4e686V0nioX2ZpFf056sgvdiQw linux-2.4.26-xen-sparse/include/linux/sunrpc/debug.h
     2.1 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pci.h	Fri Jul 23 22:39:55 2004 +0000
     2.2 +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pci.h	Mon Jul 26 15:09:53 2004 +0000
     2.3 @@ -145,8 +145,7 @@ static inline void pci_unmap_page(struct
     2.4  static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
     2.5  			     int nents, int direction)
     2.6  {
     2.7 -	int i, j, nr_pfns;
     2.8 -	unsigned long first_pfn;
     2.9 +	int i;
    2.10  
    2.11  	if (direction == PCI_DMA_NONE)
    2.12  		out_of_line_bug();
    2.13 @@ -160,28 +159,10 @@ static inline int pci_map_sg(struct pci_
    2.14   		else if (!sg[i].address && !sg[i].page)
    2.15   			out_of_line_bug();
    2.16   
    2.17 - 		if (sg[i].address) {
    2.18 + 		if (sg[i].address)
    2.19   			sg[i].dma_address = virt_to_bus(sg[i].address);
    2.20 - 			first_pfn = virt_to_phys(sg[i].address) >> PAGE_SHIFT;
    2.21 - 			nr_pfns = (((unsigned long)sg[i].address & 
    2.22 - 			    (PAGE_SIZE-1)) + sg[i].length + PAGE_SIZE - 1) >>
    2.23 - 			    PAGE_SHIFT;
    2.24 - 		} else {
    2.25 + 		else
    2.26   			sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
    2.27 - 			first_pfn = page_to_phys(sg[i].page) >> PAGE_SHIFT;
    2.28 - 			nr_pfns = (sg[i].offset + sg[i].length + PAGE_SIZE - 
    2.29 - 			    1) >> PAGE_SHIFT;
    2.30 - 		}
    2.31 -
    2.32 -                /*
     2.33 -                 * Check that the merged physical buffers are also contiguous
     2.34 -                 * in machine-address space. If they are not, we fail by returning 0.
    2.35 -                 */
    2.36 -                for (j = 1; j < nr_pfns; j++) {
    2.37 -                    if ( unlikely(pfn_to_mfn(first_pfn+j) != 
    2.38 -                                  (pfn_to_mfn(first_pfn)+j)) )
    2.39 -                        return 0;
    2.40 -                }
    2.41   	}
    2.42   
    2.43  	flush_write_buffers();
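
For reference, the loop deleted above enforced, per scatterlist entry,
that consecutive pseudo-physical frames map to consecutive machine
frames. Condensed here as a standalone predicate (pfn_to_mfn() is
Xen's pseudo-physical-to-machine translation):

    /* The invariant the deleted check enforced: every pfn backing the
     * buffer maps to the machine frame one past its predecessor's.
     * Buffers merged on machine addresses by the new blkdev.h satisfy
     * this by construction, so pci_map_sg() no longer re-checks it. */
    static int pfns_machine_contiguous(unsigned long first_pfn, int nr_pfns)
    {
        int j;
        for (j = 1; j < nr_pfns; j++)
            if (pfn_to_mfn(first_pfn + j) != pfn_to_mfn(first_pfn) + j)
                return 0;   /* previously made pci_map_sg() return 0 */
        return 1;
    }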
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/linux-2.4.26-xen-sparse/include/linux/blkdev.h	Mon Jul 26 15:09:53 2004 +0000
     3.3 @@ -0,0 +1,372 @@
     3.4 +#ifndef _LINUX_BLKDEV_H
     3.5 +#define _LINUX_BLKDEV_H
     3.6 +
     3.7 +#include <linux/major.h>
     3.8 +#include <linux/sched.h>
     3.9 +#include <linux/genhd.h>
    3.10 +#include <linux/tqueue.h>
    3.11 +#include <linux/list.h>
    3.12 +#include <linux/mm.h>
    3.13 +
    3.14 +#include <asm/io.h>
    3.15 +
    3.16 +struct request_queue;
    3.17 +typedef struct request_queue request_queue_t;
    3.18 +struct elevator_s;
    3.19 +typedef struct elevator_s elevator_t;
    3.20 +
    3.21 +/*
    3.22 + * Ok, this is an expanded form so that we can use the same
    3.23 + * request for paging requests.
    3.24 + */
    3.25 +struct request {
    3.26 +	struct list_head queue;
    3.27 +	int elevator_sequence;
    3.28 +
    3.29 +	volatile int rq_status;	/* should split this into a few status bits */
    3.30 +#define RQ_INACTIVE		(-1)
    3.31 +#define RQ_ACTIVE		1
    3.32 +#define RQ_SCSI_BUSY		0xffff
    3.33 +#define RQ_SCSI_DONE		0xfffe
    3.34 +#define RQ_SCSI_DISCONNECTING	0xffe0
    3.35 +
    3.36 +	kdev_t rq_dev;
    3.37 +	int cmd;		/* READ or WRITE */
    3.38 +	int errors;
    3.39 +	unsigned long start_time;
    3.40 +	unsigned long sector;
    3.41 +	unsigned long nr_sectors;
    3.42 +	unsigned long hard_sector, hard_nr_sectors;
    3.43 +	unsigned int nr_segments;
    3.44 +	unsigned int nr_hw_segments;
    3.45 +	unsigned long current_nr_sectors, hard_cur_sectors;
    3.46 +	void * special;
    3.47 +	char * buffer;
    3.48 +	struct completion * waiting;
    3.49 +	struct buffer_head * bh;
    3.50 +	struct buffer_head * bhtail;
    3.51 +	request_queue_t *q;
    3.52 +};
    3.53 +
    3.54 +#include <linux/elevator.h>
    3.55 +
    3.56 +typedef int (merge_request_fn) (request_queue_t *q, 
    3.57 +				struct request  *req,
    3.58 +				struct buffer_head *bh,
    3.59 +				int);
    3.60 +typedef int (merge_requests_fn) (request_queue_t *q, 
    3.61 +				 struct request  *req,
    3.62 +				 struct request  *req2,
    3.63 +				 int);
    3.64 +typedef void (request_fn_proc) (request_queue_t *q);
    3.65 +typedef request_queue_t * (queue_proc) (kdev_t dev);
    3.66 +typedef int (make_request_fn) (request_queue_t *q, int rw, struct buffer_head *bh);
    3.67 +typedef void (plug_device_fn) (request_queue_t *q, kdev_t device);
    3.68 +typedef void (unplug_device_fn) (void *q);
    3.69 +
    3.70 +struct request_list {
    3.71 +	unsigned int count;
    3.72 +	unsigned int pending[2];
    3.73 +	struct list_head free;
    3.74 +};
    3.75 +
    3.76 +struct request_queue
    3.77 +{
    3.78 +	/*
    3.79 +	 * the queue request freelist, one for reads and one for writes
    3.80 +	 */
    3.81 +	struct request_list	rq;
    3.82 +
    3.83 +	/*
    3.84 +	 * The total number of requests on each queue
    3.85 +	 */
    3.86 +	int nr_requests;
    3.87 +
    3.88 +	/*
    3.89 +	 * Batching threshold for sleep/wakeup decisions
    3.90 +	 */
    3.91 +	int batch_requests;
    3.92 +
    3.93 +	/*
    3.94 +	 * The total number of 512byte blocks on each queue
    3.95 +	 */
    3.96 +	atomic_t nr_sectors;
    3.97 +
    3.98 +	/*
    3.99 +	 * Batching threshold for sleep/wakeup decisions
   3.100 +	 */
   3.101 +	int batch_sectors;
   3.102 +
   3.103 +	/*
   3.104 +	 * The max number of 512byte blocks on each queue
   3.105 +	 */
   3.106 +	int max_queue_sectors;
   3.107 +
   3.108 +	/*
   3.109 +	 * Together with queue_head for cacheline sharing
   3.110 +	 */
   3.111 +	struct list_head	queue_head;
   3.112 +	elevator_t		elevator;
   3.113 +
   3.114 +	request_fn_proc		* request_fn;
   3.115 +	merge_request_fn	* back_merge_fn;
   3.116 +	merge_request_fn	* front_merge_fn;
   3.117 +	merge_requests_fn	* merge_requests_fn;
   3.118 +	make_request_fn		* make_request_fn;
   3.119 +	plug_device_fn		* plug_device_fn;
   3.120 +	/*
   3.121 +	 * The queue owner gets to use this for whatever they like.
   3.122 +	 * ll_rw_blk doesn't touch it.
   3.123 +	 */
   3.124 +	void			* queuedata;
   3.125 +
   3.126 +	/*
   3.127 +	 * This is used to remove the plug when tq_disk runs.
   3.128 +	 */
   3.129 +	struct tq_struct	plug_tq;
   3.130 +
   3.131 +	/*
   3.132 +	 * Boolean that indicates whether this queue is plugged or not.
   3.133 +	 */
   3.134 +	int			plugged:1;
   3.135 +
   3.136 +	/*
   3.137 +	 * Boolean that indicates whether current_request is active or
   3.138 +	 * not.
   3.139 +	 */
   3.140 +	int			head_active:1;
   3.141 +
   3.142 +	/*
   3.143 +	 * Boolean that indicates you will use blk_started_sectors
   3.144 +	 * and blk_finished_sectors in addition to blk_started_io
   3.145 +	 * and blk_finished_io.  It enables the throttling code to 
   3.146 +	 * help keep the sectors in flight to a reasonable value
   3.147 +	 */
   3.148 +	int			can_throttle:1;
   3.149 +
   3.150 +	unsigned long		bounce_pfn;
   3.151 +
   3.152 +	/*
   3.153 +	 * Is meant to protect the queue in the future instead of
   3.154 +	 * io_request_lock
   3.155 +	 */
   3.156 +	spinlock_t		queue_lock;
   3.157 +
   3.158 +	/*
   3.159 +	 * Tasks wait here for free read and write requests
   3.160 +	 */
   3.161 +	wait_queue_head_t	wait_for_requests;
   3.162 +};
   3.163 +
   3.164 +#define blk_queue_plugged(q)	(q)->plugged
   3.165 +#define blk_fs_request(rq)	((rq)->cmd == READ || (rq)->cmd == WRITE)
   3.166 +#define blk_queue_empty(q)	list_empty(&(q)->queue_head)
   3.167 +
   3.168 +extern inline int rq_data_dir(struct request *rq)
   3.169 +{
   3.170 +	if (rq->cmd == READ)
   3.171 +		return READ;
   3.172 +	else if (rq->cmd == WRITE)
   3.173 +		return WRITE;
   3.174 +	else {
   3.175 +		BUG();
   3.176 +		return -1; /* ahem */
   3.177 +	}
   3.178 +}
   3.179 +
   3.180 +extern unsigned long blk_max_low_pfn, blk_max_pfn;
   3.181 +
   3.182 +#define BLK_BOUNCE_HIGH		((u64)blk_max_low_pfn << PAGE_SHIFT)
   3.183 +#define BLK_BOUNCE_ANY		((u64)blk_max_pfn << PAGE_SHIFT)
   3.184 +
   3.185 +extern void blk_queue_bounce_limit(request_queue_t *, u64);
   3.186 +
   3.187 +#ifdef CONFIG_HIGHMEM
   3.188 +extern struct buffer_head *create_bounce(int, struct buffer_head *);
   3.189 +extern inline struct buffer_head *blk_queue_bounce(request_queue_t *q, int rw,
   3.190 +						   struct buffer_head *bh)
   3.191 +{
   3.192 +	struct page *page = bh->b_page;
   3.193 +
   3.194 +#ifndef CONFIG_DISCONTIGMEM
   3.195 +	if (page - mem_map <= q->bounce_pfn)
   3.196 +#else
   3.197 +	if ((page - page_zone(page)->zone_mem_map) + (page_zone(page)->zone_start_paddr >> PAGE_SHIFT) <= q->bounce_pfn)
   3.198 +#endif
   3.199 +		return bh;
   3.200 +
   3.201 +	return create_bounce(rw, bh);
   3.202 +}
   3.203 +#else
   3.204 +#define blk_queue_bounce(q, rw, bh)	(bh)
   3.205 +#endif
   3.206 +
   3.207 +#ifdef CONFIG_XEN
   3.208 +/* Used for buffer merging, where it is imperative we use machine addresses! */
   3.209 +#define bh_phys(bh)		(page_to_bus((bh)->b_page) + bh_offset((bh)))
   3.210 +#else
   3.211 +#define bh_phys(bh)		(page_to_phys((bh)->b_page) + bh_offset((bh)))
   3.212 +#endif
   3.213 +
   3.214 +#define BH_CONTIG(b1, b2)	(bh_phys((b1)) + (b1)->b_size == bh_phys((b2)))
   3.215 +#define BH_PHYS_4G(b1, b2)	((bh_phys((b1)) | 0xffffffff) == ((bh_phys((b2)) + (b2)->b_size - 1) | 0xffffffff))
   3.216 +
   3.217 +struct blk_dev_struct {
   3.218 +	/*
   3.219 +	 * queue_proc has to be atomic
   3.220 +	 */
   3.221 +	request_queue_t		request_queue;
   3.222 +	queue_proc		*queue;
   3.223 +	void			*data;
   3.224 +};
   3.225 +
   3.226 +struct sec_size {
   3.227 +	unsigned block_size;
   3.228 +	unsigned block_size_bits;
   3.229 +};
   3.230 +
   3.231 +/*
   3.232 + * Used to indicate the default queue for drivers that don't bother
   3.233 + * to implement multiple queues.  We have this access macro here
   3.234 + * so as to eliminate the need for each and every block device
   3.235 + * driver to know about the internal structure of blk_dev[].
   3.236 + */
   3.237 +#define BLK_DEFAULT_QUEUE(_MAJOR)  &blk_dev[_MAJOR].request_queue
   3.238 +
   3.239 +extern struct sec_size * blk_sec[MAX_BLKDEV];
   3.240 +extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
   3.241 +extern void grok_partitions(struct gendisk *dev, int drive, unsigned minors, long size);
   3.242 +extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
   3.243 +extern void generic_make_request(int rw, struct buffer_head * bh);
   3.244 +extern inline request_queue_t *blk_get_queue(kdev_t dev);
   3.245 +extern void blkdev_release_request(struct request *);
   3.246 +
   3.247 +/*
   3.248 + * Access functions for manipulating queue properties
   3.249 + */
   3.250 +extern int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors);
   3.251 +extern void blk_init_queue(request_queue_t *, request_fn_proc *);
   3.252 +extern void blk_cleanup_queue(request_queue_t *);
   3.253 +extern void blk_queue_headactive(request_queue_t *, int);
   3.254 +extern void blk_queue_throttle_sectors(request_queue_t *, int);
   3.255 +extern void blk_queue_make_request(request_queue_t *, make_request_fn *);
   3.256 +extern void generic_unplug_device(void *);
   3.257 +extern inline int blk_seg_merge_ok(struct buffer_head *, struct buffer_head *);
   3.258 +
   3.259 +extern int * blk_size[MAX_BLKDEV];
   3.260 +
   3.261 +extern int * blksize_size[MAX_BLKDEV];
   3.262 +
   3.263 +extern int * hardsect_size[MAX_BLKDEV];
   3.264 +
   3.265 +extern int * max_readahead[MAX_BLKDEV];
   3.266 +
   3.267 +extern int * max_sectors[MAX_BLKDEV];
   3.268 +
   3.269 +extern int * max_segments[MAX_BLKDEV];
   3.270 +
   3.271 +#define MAX_SEGMENTS 128
   3.272 +#define MAX_SECTORS 255
   3.273 +#define MAX_QUEUE_SECTORS (4 << (20 - 9)) /* 4 mbytes when full sized */
   3.274 +#define MAX_NR_REQUESTS 1024 /* 1024k when in 512 units, normally min is 1M in 1k units */
   3.275 +
   3.276 +#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)
   3.277 +
   3.278 +#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queue)
   3.279 +#define blkdev_entry_next_request(entry) blkdev_entry_to_request((entry)->next)
   3.280 +#define blkdev_entry_prev_request(entry) blkdev_entry_to_request((entry)->prev)
   3.281 +#define blkdev_next_request(req) blkdev_entry_to_request((req)->queue.next)
   3.282 +#define blkdev_prev_request(req) blkdev_entry_to_request((req)->queue.prev)
   3.283 +
   3.284 +extern void drive_stat_acct (kdev_t dev, int rw,
   3.285 +					unsigned long nr_sectors, int new_io);
   3.286 +
   3.287 +static inline int get_hardsect_size(kdev_t dev)
   3.288 +{
   3.289 +	int retval = 512;
   3.290 +	int major = MAJOR(dev);
   3.291 +
   3.292 +	if (hardsect_size[major]) {
   3.293 +		int minor = MINOR(dev);
   3.294 +		if (hardsect_size[major][minor])
   3.295 +			retval = hardsect_size[major][minor];
   3.296 +	}
   3.297 +	return retval;
   3.298 +}
   3.299 +
   3.300 +static inline int blk_oversized_queue(request_queue_t * q)
   3.301 +{
   3.302 +	if (q->can_throttle)
   3.303 +		return atomic_read(&q->nr_sectors) > q->max_queue_sectors;
   3.304 +	return q->rq.count == 0;
   3.305 +}
   3.306 +
   3.307 +static inline int blk_oversized_queue_reads(request_queue_t * q)
   3.308 +{
   3.309 +	if (q->can_throttle)
   3.310 +		return atomic_read(&q->nr_sectors) > q->max_queue_sectors + q->batch_sectors;
   3.311 +	return q->rq.count == 0;
   3.312 +}
   3.313 +
   3.314 +static inline int blk_oversized_queue_batch(request_queue_t * q)
   3.315 +{
   3.316 +	return atomic_read(&q->nr_sectors) > q->max_queue_sectors - q->batch_sectors;
   3.317 +}
   3.318 +
   3.319 +#define blk_finished_io(nsects)	do { } while (0)
   3.320 +#define blk_started_io(nsects)	do { } while (0)
   3.321 +
   3.322 +static inline void blk_started_sectors(struct request *rq, int count)
   3.323 +{
   3.324 +	request_queue_t *q = rq->q;
   3.325 +	if (q && q->can_throttle) {
   3.326 +		atomic_add(count, &q->nr_sectors);
   3.327 +		if (atomic_read(&q->nr_sectors) < 0) {
   3.328 +			printk("nr_sectors is %d\n", atomic_read(&q->nr_sectors));
   3.329 +			BUG();
   3.330 +		}
   3.331 +	}
   3.332 +}
   3.333 +
   3.334 +static inline void blk_finished_sectors(struct request *rq, int count)
   3.335 +{
   3.336 +	request_queue_t *q = rq->q;
   3.337 +	if (q && q->can_throttle) {
   3.338 +		atomic_sub(count, &q->nr_sectors);
   3.339 +		
   3.340 +		smp_mb();
   3.341 +		if (q->rq.count >= q->batch_requests && !blk_oversized_queue_batch(q)) {
   3.342 +			if (waitqueue_active(&q->wait_for_requests))
   3.343 +				wake_up(&q->wait_for_requests);
   3.344 +		}
   3.345 +		if (atomic_read(&q->nr_sectors) < 0) {
   3.346 +			printk("nr_sectors is %d\n", atomic_read(&q->nr_sectors));
   3.347 +			BUG();
   3.348 +		}
   3.349 +	}
   3.350 +}
   3.351 +
   3.352 +static inline unsigned int blksize_bits(unsigned int size)
   3.353 +{
   3.354 +	unsigned int bits = 8;
   3.355 +	do {
   3.356 +		bits++;
   3.357 +		size >>= 1;
   3.358 +	} while (size > 256);
   3.359 +	return bits;
   3.360 +}
   3.361 +
   3.362 +static inline unsigned int block_size(kdev_t dev)
   3.363 +{
   3.364 +	int retval = BLOCK_SIZE;
   3.365 +	int major = MAJOR(dev);
   3.366 +
   3.367 +	if (blksize_size[major]) {
   3.368 +		int minor = MINOR(dev);
   3.369 +		if (blksize_size[major][minor])
   3.370 +			retval = blksize_size[major][minor];
   3.371 +	}
   3.372 +	return retval;
   3.373 +}
   3.374 +
   3.375 +#endif
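
A short usage sketch of the sector-throttling hooks the header adds;
the example_* driver names are hypothetical, but the call pattern
follows the comment on can_throttle above:

    /* Hypothetical driver fragment (example_* names are illustrative). */
    static request_fn_proc example_request_fn;

    static void example_setup(request_queue_t *q)
    {
        blk_init_queue(q, example_request_fn);
        blk_queue_throttle_sectors(q, 1);       /* sets q->can_throttle */
    }

    static void example_start(struct request *rq)
    {
        blk_started_io(rq->nr_sectors);             /* no-op in this tree */
        blk_started_sectors(rq, rq->nr_sectors);    /* adds to q->nr_sectors */
    }

    static void example_complete(struct request *rq, int count)
    {
        blk_finished_io(count);
        blk_finished_sectors(rq, count);    /* may wake wait_for_requests */
    }

With can_throttle set, blk_oversized_queue() compares q->nr_sectors
against max_queue_sectors instead of only checking for free request
slots, keeping the number of sectors in flight bounded.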