direct-io.hg

xen/drivers/block/ll_rw_blk.c @ 875:ad4db8b417c1

bitkeeper revision 1.547 (3fa3dd2aH8eamu3ONvYovJgq8wBNbQ)

Many files:
Fixes to the DOM0 interface and domain building code. Ready for new save/restore dom0_ops.
author kaf24@scramble.cl.cam.ac.uk
date Sat Nov 01 16:19:54 2003 +0000 (2003-11-01)
parents 88dba60f48b3
children 14aefa321a10 06425d6a590e dce3446ac01e
line source
1 /*
2 * linux/drivers/block/ll_rw_blk.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8 */
10 /*
11 * This handles all read/write requests to block devices
12 */
13 #include <xeno/config.h>
14 #include <xeno/types.h>
15 #include <xeno/lib.h>
16 #include <xeno/sched.h>
17 /*#include <xeno/kernel_stat.h>*/
18 #include <xeno/errno.h>
19 /*#include <xeno/locks.h>*/
20 #include <xeno/mm.h>
21 /*#include <xeno/swap.h>*/
22 #include <xeno/init.h>
23 /*#include <xeno/smp_lock.h>*/
24 /*#include <xeno/completion.h>*/
26 #include <asm/system.h>
27 #include <asm/io.h>
28 #include <xeno/blk.h>
29 /*#include <xeno/highmem.h>*/
30 #include <xeno/slab.h>
31 #include <xeno/module.h>
33 /*
34 * KAF: We can turn off noise relating to barking guest-OS requests.
35 */
36 #if 0
37 #define DPRINTK(_f, _a...) printk(_f , ## _a)
38 #else
39 #define DPRINTK(_f, _a...) ((void)0)
40 #endif
42 /* This will die as all synchronous stuff is coming to an end */
43 #if 0
44 #define complete(_r) panic("completion.h stuff may be needed...")
45 #else
46 // XXX SMH: we spin when waiting for completion so just toggle flag
47 #define complete(_r) (*(int *)(_r) = 0)
48 #endif
52 /*
53 * MAC Floppy IWM hooks
54 */
56 #ifdef CONFIG_MAC_FLOPPY_IWM
57 extern int mac_floppy_init(void);
58 #endif
60 /*
61 * For the allocated request tables
62 */
63 static kmem_cache_t *request_cachep;
65 /*
66 * The "disk" task queue is used to start the actual requests
67 * after a plug
68 */
69 DECLARE_TASK_QUEUE(tq_disk);
71 /*
72 * Protect the request list against multiple users..
73 *
74 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
75 * from the IRQ event side, and almost 100% SMP threaded from the syscall
76 * side (we still have to protect against block device array operations, and
77 * the do_request() side is casually still unsafe. The kernel lock protects
78 * this part currently.).
79 *
80 * there is a fair chance that things will work just OK if these functions
81 * are called with no global kernel lock held ...
82 */
83 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
85 /* This specifies how many sectors to read ahead on the disk. */
87 int read_ahead[MAX_BLKDEV];
89 /* blk_dev_struct is:
90 * *request_fn
91 * *current_request
92 */
93 struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
95 /*
96 * blk_size contains the size of all block-devices in units of 1024 byte
97 * sectors:
98 *
99 * blk_size[MAJOR][MINOR]
100 *
101 * if (!blk_size[MAJOR]) then no minor size checking is done.
102 */
103 int * blk_size[MAX_BLKDEV];
105 /*
106 * blksize_size contains the size of all block-devices:
107 *
108 * blksize_size[MAJOR][MINOR]
109 *
110 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
111 */
112 int * blksize_size[MAX_BLKDEV];
114 /*
115 * hardsect_size contains the size of the hardware sector of a device.
116 *
117 * hardsect_size[MAJOR][MINOR]
118 *
119 * if (!hardsect_size[MAJOR])
120 * then 512 bytes is assumed.
121 * else
122 * sector_size is hardsect_size[MAJOR][MINOR]
123 * This is currently set by some scsi devices and read by the msdos fs driver.
124 * Other uses may appear later.
125 */
126 int * hardsect_size[MAX_BLKDEV];
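#if 0
/*
 * A minimal sketch of how the per-major tables above are typically
 * consulted for one device; example_device_geometry is a hypothetical
 * helper, not part of this file.
 */
static void example_device_geometry(kdev_t dev)
{
	int major = MAJOR(dev), minor = MINOR(dev);
	int size_kb  = blk_size[major]      ? blk_size[major][minor]      : 0;
	int blksize  = blksize_size[major]  ? blksize_size[major][minor]  : 1024;
	int hardsect = hardsect_size[major] ? hardsect_size[major][minor] : 512;

	printk("dev %s: %d KB, soft block size %d, hardware sector %d\n",
	       kdevname(dev), size_kb, blksize, hardsect);
}
#endif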
128 /*
129 * The following tunes the read-ahead algorithm in mm/filemap.c
130 */
131 int * max_readahead[MAX_BLKDEV];
133 /*
134 * Max number of sectors per request
135 */
136 int * max_sectors[MAX_BLKDEV];
138 static inline int get_max_sectors(kdev_t dev)
139 {
140 if (!max_sectors[MAJOR(dev)])
141 return MAX_SECTORS;
142 return max_sectors[MAJOR(dev)][MINOR(dev)];
143 }
145 inline request_queue_t *blk_get_queue(kdev_t dev)
146 {
147 struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
149 if (bdev->queue)
150 return bdev->queue(dev);
151 else
152 return &blk_dev[MAJOR(dev)].request_queue;
153 }
155 static int __blk_cleanup_queue(struct request_list *list)
156 {
157 struct list_head *head = &list->free;
158 struct request *rq;
159 int i = 0;
161 while (!list_empty(head)) {
162 rq = list_entry(head->next, struct request, queue);
163 list_del(&rq->queue);
164 kmem_cache_free(request_cachep, rq);
165 i++;
166 };
168 if (i != list->count)
169 printk("request list leak!\n");
171 list->count = 0;
172 return i;
173 }
175 /**
176 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
177 * @q: the request queue to be released
178 *
179 * Description:
180 * blk_cleanup_queue is the pair to blk_init_queue(). It should
181 * be called when a request queue is being released; typically
182 * when a block device is being de-registered. Currently, its
183 * primary task is to free all the &struct request structures that
184 * were allocated to the queue.
185 * Caveat:
186 * Hopefully the low level driver will have finished any
187 * outstanding requests first...
188 **/
189 void blk_cleanup_queue(request_queue_t * q)
190 {
191 int count = q->nr_requests;
193 count -= __blk_cleanup_queue(&q->rq[READ]);
194 count -= __blk_cleanup_queue(&q->rq[WRITE]);
196 if (count)
197 printk("blk_cleanup_queue: leaked requests (%d)\n", count);
199 memset(q, 0, sizeof(*q));
200 }
202 /**
203 * blk_queue_headactive - indicate whether head of request queue may be active
204 * @q: The queue which this applies to.
205 * @active: A flag indicating whether the head of the queue is active.
206 *
207 * Description:
208 * The driver for a block device may choose to leave the currently active
209 * request on the request queue, removing it only when it has completed.
210 * The queue handling routines assume this by default for safety reasons
211 * and will not involve the head of the request queue in any merging or
212 * reordering of requests when the queue is unplugged (and thus may be
213 * working on this particular request).
214 *
215 * If a driver removes requests from the queue before processing them, then
216 * it may indicate that it does so, thereby allowing the head of the queue
217 * to be involved in merging and reordering. This is done by calling
218 * blk_queue_headactive() with an @active flag of %0.
219 *
220 * If a driver processes several requests at once, it must remove them (or
221 * at least all but one of them) from the request queue.
222 *
223 * When a queue is plugged the head will be assumed to be inactive.
224 **/
226 void blk_queue_headactive(request_queue_t * q, int active)
227 {
228 q->head_active = active;
229 }
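#if 0
/*
 * Illustrative sketch of the case described above: a driver whose
 * (hypothetical) strategy routine dequeues each request before working
 * on it, and which therefore tells the block layer that the queue head
 * is not active.
 */
static void exdev_request_fn(request_queue_t *q)
{
	while (!list_empty(&q->queue_head)) {
		struct request *req = blkdev_entry_to_request(q->queue_head.next);
		list_del(&req->queue);	/* removed before processing ...        */
		/* ... hand req to the hardware here ...                        */
	}
}

static void exdev_queue_setup(request_queue_t *q)
{
	/* q is assumed to have been set up with blk_init_queue() already */
	blk_queue_headactive(q, 0);	/* ... so the head may be merged/reordered */
}
#endif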
231 /**
232 * blk_queue_make_request - define an alternate make_request function for a device
233 * @q: the request queue for the device to be affected
234 * @mfn: the alternate make_request function
235 *
236 * Description:
237 * The normal way for &struct buffer_heads to be passed to a device
238 * driver is for them to be collected into requests on a request
239 * queue, and then to allow the device driver to select requests
240 * off that queue when it is ready. This works well for many block
241 * devices. However some block devices (typically virtual devices
242 * such as md or lvm) do not benefit from the processing on the
243 * request queue, and are served best by having the requests passed
244 * directly to them. This can be achieved by providing a function
245 * to blk_queue_make_request().
246 *
247 * Caveat:
248 * The driver that does this *must* be able to deal appropriately
249 * with buffers in "highmemory", either by calling bh_kmap() to get
250 * a kernel mapping, or by calling create_bounce() to create a
251 * buffer in normal memory.
252 **/
254 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
255 {
256 q->make_request_fn = mfn;
257 }
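#if 0
/*
 * Illustrative sketch of a stacking (md/lvm style) driver that bypasses
 * the request queue, as described above. exremap_dev, exremap_offset and
 * exremap_make_request are hypothetical names.
 */
static kdev_t exremap_dev;		/* underlying device             */
static unsigned long exremap_offset;	/* sector offset on that device  */

static int exremap_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	bh->b_rdev     = exremap_dev;
	bh->b_rsector += exremap_offset;
	return 1;	/* non-zero: generic_make_request() resolves the new b_rdev */
}

static void exremap_queue_setup(request_queue_t *q)
{
	blk_queue_make_request(q, exremap_make_request);
}
#endif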
259 static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
260 {
261 if (req->nr_segments < max_segments) {
262 req->nr_segments++;
263 return 1;
264 }
265 return 0;
266 }
268 static int ll_back_merge_fn(request_queue_t *q, struct request *req,
269 struct buffer_head *bh, int max_segments)
270 {
271 if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data)
272 return 1;
273 return ll_new_segment(q, req, max_segments);
274 }
276 static int ll_front_merge_fn(request_queue_t *q, struct request *req,
277 struct buffer_head *bh, int max_segments)
278 {
279 if (bh->b_data + bh->b_size == req->bh->b_data)
280 return 1;
281 return ll_new_segment(q, req, max_segments);
282 }
284 static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
285 struct request *next, int max_segments)
286 {
287 int total_segments = req->nr_segments + next->nr_segments;
289 if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data)
290 total_segments--;
292 if (total_segments > max_segments)
293 return 0;
295 req->nr_segments = total_segments;
296 return 1;
297 }
299 /*
300 * "plug" the device if there are no outstanding requests: this will
301 * force the transfer to start only after we have put all the requests
302 * on the list.
303 *
304 * This is called with interrupts off and no requests on the queue.
305 * (and with the request spinlock acquired)
306 */
307 static void generic_plug_device(request_queue_t *q, kdev_t dev)
308 {
309 /*
310 * no need to replug device
311 */
312 if (!list_empty(&q->queue_head) || q->plugged)
313 return;
315 q->plugged = 1;
316 queue_task(&q->plug_tq, &tq_disk);
317 }
319 /*
320 * remove the plug and let it rip..
321 */
322 static inline void __generic_unplug_device(request_queue_t *q)
323 {
324 if (q->plugged) {
325 q->plugged = 0;
326 if (!list_empty(&q->queue_head))
327 {
328 q->request_fn(q);
329 }
330 }
331 }
333 void generic_unplug_device(void *data)
334 {
335 request_queue_t *q = (request_queue_t *) data;
336 unsigned long flags;
338 spin_lock_irqsave(&io_request_lock, flags);
339 __generic_unplug_device(q);
340 spin_unlock_irqrestore(&io_request_lock, flags);
341 }
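#if 0
/*
 * Sketch of the plug/unplug life-cycle described above. A first request
 * arriving on an empty queue plugs it via q->plug_device_fn; the queued
 * q->plug_tq task later unplugs it when tq_disk is run (stock Linux 2.4
 * does this with run_task_queue(&tq_disk)), and only then does
 * q->request_fn see the accumulated requests.
 */
static void example_plug_cycle(request_queue_t *q, kdev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	q->plug_device_fn(q, dev);	/* generic_plug_device(): sets q->plugged
					   and queues q->plug_tq on tq_disk      */
	spin_unlock_irqrestore(&io_request_lock, flags);

	run_task_queue(&tq_disk);	/* runs generic_unplug_device(q), which
					   clears q->plugged and calls request_fn */
}
#endif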
343 /** blk_grow_request_list
344 * @q: The &request_queue_t
345 * @nr_requests: how many requests are desired
346 *
347 * More free requests are added to the queue's free lists, bringing
348 * the total number of requests to @nr_requests.
349 *
350 * The requests are added equally to the request queue's read
351 * and write freelists.
352 *
353 * This function can sleep.
354 *
355 * Returns the (new) number of requests which the queue has available.
356 */
357 int blk_grow_request_list(request_queue_t *q, int nr_requests)
358 {
359 unsigned long flags;
360 /* Several broken drivers assume that this function doesn't sleep;
361 * this causes system hangs during boot.
362 * As a temporary fix, make the function non-blocking.
363 */
364 spin_lock_irqsave(&io_request_lock, flags);
365 while (q->nr_requests < nr_requests) {
366 struct request *rq;
367 int rw;
369 rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC);
370 if (rq == NULL)
371 break;
372 memset(rq, 0, sizeof(*rq));
373 rq->rq_status = RQ_INACTIVE;
374 rw = q->nr_requests & 1;
375 list_add(&rq->queue, &q->rq[rw].free);
376 q->rq[rw].count++;
377 q->nr_requests++;
378 }
379 q->batch_requests = q->nr_requests / 4;
380 if (q->batch_requests > 32)
381 q->batch_requests = 32;
382 spin_unlock_irqrestore(&io_request_lock, flags);
383 return q->nr_requests;
384 }
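#if 0
/*
 * Sketch of a caller growing the request pool beyond the default set up
 * in blk_init_free_list() below; the figure 256 is purely illustrative.
 */
static void example_grow_pool(request_queue_t *q)
{
	int now_have = blk_grow_request_list(q, 256);

	if (now_have < 256)
		printk("request pool only grew to %d entries\n", now_have);
}
#endif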
386 static void blk_init_free_list(request_queue_t *q)
387 {
388 /*struct sysinfo si;*/
389 /*int megs;*/ /* Total memory, in megabytes */
390 int nr_requests;
392 INIT_LIST_HEAD(&q->rq[READ].free);
393 INIT_LIST_HEAD(&q->rq[WRITE].free);
394 q->rq[READ].count = 0;
395 q->rq[WRITE].count = 0;
396 q->nr_requests = 0;
398 #if 0
399 si_meminfo(&si);
400 megs = si.totalram >> (20 - PAGE_SHIFT);
401 nr_requests = 128;
402 if (megs < 32)
403 nr_requests /= 2;
404 #else
405 nr_requests = 128;
406 #endif
407 blk_grow_request_list(q, nr_requests);
409 #if 0
410 init_waitqueue_head(&q->wait_for_requests[0]);
411 init_waitqueue_head(&q->wait_for_requests[1]);
412 #endif
413 spin_lock_init(&q->queue_lock);
414 }
416 static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
418 /**
419 * blk_init_queue - prepare a request queue for use with a block device
420 * @q: The &request_queue_t to be initialised
421 * @rfn: The function to be called to process requests that have been
422 * placed on the queue.
423 *
424 * Description:
425 * If a block device wishes to use the standard request handling procedures,
426 * which sorts requests and coalesces adjacent requests, then it must
427 * call blk_init_queue(). The function @rfn will be called when there
428 * are requests on the queue that need to be processed. If the device
429 * supports plugging, then @rfn may not be called immediately when requests
430 * are available on the queue, but may be called at some time later instead.
431 * Plugged queues are generally unplugged when a buffer belonging to one
432 * of the requests on the queue is needed, or due to memory pressure.
433 *
434 * @rfn is not required, or even expected, to remove all requests off the
435 * queue, but only as many as it can handle at a time. If it does leave
436 * requests on the queue, it is responsible for arranging that the requests
437 * get dealt with eventually.
438 *
439 * A global spin lock $io_request_lock must be held while manipulating the
440 * requests on the request queue.
441 *
442 * The request on the head of the queue is by default assumed to be
443 * potentially active, and it is not considered for re-ordering or merging
444 * whenever the given queue is unplugged. This behaviour can be changed with
445 * blk_queue_headactive().
446 *
447 * Note:
448 * blk_init_queue() must be paired with a blk_cleanup_queue() call
449 * when the block device is deactivated (such as at module unload).
450 **/
451 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
452 {
453 INIT_LIST_HEAD(&q->queue_head);
454 elevator_init(&q->elevator, ELEVATOR_LINUS);
455 blk_init_free_list(q);
456 q->request_fn = rfn;
457 q->back_merge_fn = ll_back_merge_fn;
458 q->front_merge_fn = ll_front_merge_fn;
459 q->merge_requests_fn = ll_merge_requests_fn;
460 q->make_request_fn = __make_request;
461 q->plug_tq.sync = 0;
462 q->plug_tq.routine = &generic_unplug_device;
463 q->plug_tq.data = q;
464 q->plugged = 0;
465 /*
466 * These booleans describe the queue properties. We set the
467 * default (and most common) values here. Other drivers can
468 * use the appropriate functions to alter the queue properties
469 * as appropriate.
470 */
471 q->plug_device_fn = generic_plug_device;
472 q->head_active = 1;
473 }
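#if 0
/*
 * Sketch of the pairing rule from the note above: a driver that calls
 * blk_init_queue() at set-up must call blk_cleanup_queue() at tear-down.
 * exdisk_q and exdisk_request_fn are hypothetical.
 */
static request_queue_t exdisk_q;
static void exdisk_request_fn(request_queue_t *q);	/* hypothetical strategy routine */

static int exdisk_setup(void)
{
	blk_init_queue(&exdisk_q, exdisk_request_fn);
	return 0;
}

static void exdisk_teardown(void)
{
	blk_cleanup_queue(&exdisk_q);
}
#endif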
475 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queue);
476 /*
477 * Get a free request. io_request_lock must be held and interrupts
478 * disabled on the way in. Returns NULL if there are no free requests.
479 */
480 static struct request *get_request(request_queue_t *q, int rw)
481 {
482 struct request *rq = NULL;
483 struct request_list *rl = q->rq + rw;
485 if (!list_empty(&rl->free)) {
486 rq = blkdev_free_rq(&rl->free);
487 list_del(&rq->queue);
488 rl->count--;
489 rq->rq_status = RQ_ACTIVE;
490 rq->cmd = rw;
491 rq->special = NULL;
492 rq->q = q;
493 }
495 return rq;
496 }
498 /*
499 * Here's the request allocation design:
500 *
501 * 1: Blocking on request exhaustion is a key part of I/O throttling.
502 *
503 * 2: We want to be `fair' to all requesters. We must avoid starvation, and
504 * attempt to ensure that all requesters sleep for a similar duration. Hence
505 * no stealing requests when there are other processes waiting.
506 *
507 * 3: We also wish to support `batching' of requests. So when a process is
508 * woken, we want to allow it to allocate a decent number of requests
509 * before it blocks again, so they can be nicely merged (this only really
510 * matters if the process happens to be adding requests near the head of
511 * the queue).
512 *
513 * 4: We want to avoid scheduling storms. This isn't really important, because
514 * the system will be I/O bound anyway. But it's easy.
515 *
516 * There is tension between requirements 2 and 3. Once a task has woken,
517 * we don't want to allow it to sleep as soon as it takes its second request.
518 * But we don't want currently-running tasks to steal all the requests
519 * from the sleepers. We handle this with wakeup hysteresis around
520 * 0 .. batch_requests and with the assumption that request taking is much,
521 * much faster than request freeing.
522 *
523 * So here's what we do:
524 *
525 * a) A READA requester fails if free_requests < batch_requests
526 *
527 * We don't want READA requests to prevent sleepers from ever
528 * waking. Note that READA is used extremely rarely - a few
529 * filesystems use it for directory readahead.
530 *
531 * When a process wants a new request:
532 *
533 * b) If free_requests == 0, the requester sleeps in FIFO manner.
534 *
535 * c) If 0 < free_requests < batch_requests and there are waiters,
536 * we still take a request non-blockingly. This provides batching.
537 *
538 * d) If free_requests >= batch_requests, the caller is immediately
539 * granted a new request.
540 *
541 * When a request is released:
542 *
543 * e) If free_requests < batch_requests, do nothing.
544 *
545 * f) If free_requests >= batch_requests, wake up a single waiter.
546 *
547 * The net effect is that when a process is woken at the batch_requests level,
548 * it will be able to take approximately (batch_requests) requests before
549 * blocking again (at the tail of the queue).
550 *
551 * This all assumes that the rate of taking requests is much, much higher
552 * than the rate of releasing them. Which is very true.
553 *
554 * -akpm, Feb 2002.
555 */
557 static struct request *__get_request_wait(request_queue_t *q, int rw)
558 {
559 #if 0
560 register struct request *rq;
561 /*DECLARE_WAITQUEUE(wait, current);*/
563 generic_unplug_device(q);
564 add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
565 do {
566 set_current_state(TASK_UNINTERRUPTIBLE);
567 if (q->rq[rw].count == 0)
568 schedule();
569 spin_lock_irq(&io_request_lock);
570 rq = get_request(q,rw);
571 spin_unlock_irq(&io_request_lock);
572 } while (rq == NULL);
573 remove_wait_queue(&q->wait_for_requests[rw], &wait);
574 current->state = TASK_RUNNING;
575 return rq;
576 #else
577 panic("__get_request_wait shouldn't be depended on");
578 return 0;
579 #endif
580 }
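#if 0
/*
 * The allocation rules above, condensed into one illustrative decision
 * function. Locking is elided: the real callers take io_request_lock
 * around get_request(), and in this tree the sleeping path is stubbed
 * out (see __get_request_wait() above).
 */
static struct request *example_request_policy(request_queue_t *q, int rw, int is_reada)
{
	if (is_reada && q->rq[rw].count < q->batch_requests)
		return NULL;			/* (a) READA never digs into the batch  */

	if (q->rq[rw].count == 0)
		return __get_request_wait(q, rw); /* (b) block, FIFO, until one is freed */

	return get_request(q, rw);		/* (c)/(d) take one without blocking    */
}
#endif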
582 /* RO fail safe mechanism */
584 static long ro_bits[MAX_BLKDEV][8];
586 int is_read_only(kdev_t dev)
587 {
588 int minor,major;
590 major = MAJOR(dev);
591 minor = MINOR(dev);
592 if (major < 0 || major >= MAX_BLKDEV) return 0;
593 return ro_bits[major][minor >> 5] & (1 << (minor & 31));
594 }
596 void set_device_ro(kdev_t dev,int flag)
597 {
598 int minor,major;
600 major = MAJOR(dev);
601 minor = MINOR(dev);
602 if (major < 0 || major >= MAX_BLKDEV) return;
603 if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
604 else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
605 }
607 inline void drive_stat_acct (kdev_t dev, int rw,
608 unsigned long nr_sectors, int new_io)
609 {
610 /*unsigned int major = MAJOR(dev);*/
611 unsigned int index;
613 index = disk_index(dev);
614 #if 0
615 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
616 return;
617 #endif
619 #if 0
620 kstat.dk_drive[major][index] += new_io;
621 if (rw == READ) {
622 kstat.dk_drive_rio[major][index] += new_io;
623 kstat.dk_drive_rblk[major][index] += nr_sectors;
624 } else if (rw == WRITE) {
625 kstat.dk_drive_wio[major][index] += new_io;
626 kstat.dk_drive_wblk[major][index] += nr_sectors;
627 } else
628 printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
629 #endif
630 }
632 /* Return up to two hd_structs on which to do IO accounting for a given
633 * request. On a partitioned device, we want to account both against
634 * the partition and against the whole disk. */
635 static void locate_hd_struct(struct request *req,
636 struct hd_struct **hd1,
637 struct hd_struct **hd2)
638 {
639 struct gendisk *gd;
641 *hd1 = NULL;
642 *hd2 = NULL;
644 gd = get_gendisk(req->rq_dev);
645 if (gd && gd->part) {
646 /* Mask out the partition bits: account for the entire disk */
647 int devnr = MINOR(req->rq_dev) >> gd->minor_shift;
648 int whole_minor = devnr << gd->minor_shift;
649 *hd1 = &gd->part[whole_minor];
650 if (whole_minor != MINOR(req->rq_dev))
651 *hd2= &gd->part[MINOR(req->rq_dev)];
652 }
653 }
655 /* Round off the performance stats on an hd_struct. The average IO
656 * queue length and utilisation statistics are maintained by observing
657 * the current state of the queue length and the amount of time it has
658 * been in this state for. Normally, that accounting is done on IO
659 * completion, but that can result in more than a second's worth of IO
660 * being accounted for within any one second, leading to >100%
661 * utilisation. To deal with that, we do a round-off before returning
662 * the results when reading /proc/partitions, accounting immediately for
663 * all queue usage up to the current jiffies and restarting the counters
664 * again. */
665 void disk_round_stats(struct hd_struct *hd)
666 {
667 unsigned long now = jiffies;
669 hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change));
670 hd->last_queue_change = now;
672 if (hd->ios_in_flight)
673 hd->io_ticks += (now - hd->last_idle_time);
674 hd->last_idle_time = now;
675 }
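#if 0
/*
 * Sketch of the /proc-style read-out described above: the counters are
 * rounded off first so that no more than one second of queue time is
 * ever reported per elapsed second. example_report_hd is hypothetical.
 */
static void example_report_hd(struct hd_struct *hd)
{
	disk_round_stats(hd);
	printk("in flight %lu, io_ticks %lu, aveq %lu\n",
	       (unsigned long)hd->ios_in_flight,
	       (unsigned long)hd->io_ticks,
	       (unsigned long)hd->aveq);
}
#endif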
678 static inline void down_ios(struct hd_struct *hd)
679 {
680 disk_round_stats(hd);
681 --hd->ios_in_flight;
682 }
684 static inline void up_ios(struct hd_struct *hd)
685 {
686 disk_round_stats(hd);
687 ++hd->ios_in_flight;
688 }
690 static void account_io_start(struct hd_struct *hd, struct request *req,
691 int merge, int sectors)
692 {
693 switch (req->cmd) {
694 case READ:
695 if (merge)
696 hd->rd_merges++;
697 hd->rd_sectors += sectors;
698 break;
699 case WRITE:
700 if (merge)
701 hd->wr_merges++;
702 hd->wr_sectors += sectors;
703 break;
704 }
705 if (!merge)
706 up_ios(hd);
707 }
709 static void account_io_end(struct hd_struct *hd, struct request *req)
710 {
711 unsigned long duration = jiffies - req->start_time;
712 switch (req->cmd) {
713 case READ:
714 hd->rd_ticks += duration;
715 hd->rd_ios++;
716 break;
717 case WRITE:
718 hd->wr_ticks += duration;
719 hd->wr_ios++;
720 break;
721 }
722 down_ios(hd);
723 }
725 void req_new_io(struct request *req, int merge, int sectors)
726 {
727 struct hd_struct *hd1, *hd2;
728 locate_hd_struct(req, &hd1, &hd2);
729 if (hd1)
730 account_io_start(hd1, req, merge, sectors);
731 if (hd2)
732 account_io_start(hd2, req, merge, sectors);
733 }
735 void req_finished_io(struct request *req)
736 {
737 struct hd_struct *hd1, *hd2;
738 locate_hd_struct(req, &hd1, &hd2);
739 if (hd1)
740 account_io_end(hd1, req);
741 if (hd2)
742 account_io_end(hd2, req);
743 }
745 /*
746 * add-request adds a request to the linked list.
747 * io_request_lock is held and interrupts disabled, as we muck with the
748 * request queue list.
749 *
750 * By this point, req->cmd is always either READ/WRITE, never READA,
751 * which is important for drive_stat_acct() above.
752 */
753 static inline void add_request(request_queue_t * q, struct request * req,
754 struct list_head *insert_here)
755 {
756 drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
758 if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
759 spin_unlock_irq(&io_request_lock);
760 BUG();
761 }
763 /*
764 * elevator indicated where it wants this request to be
765 * inserted at elevator_merge time
766 */
767 list_add(&req->queue, insert_here);
768 }
770 /*
771 * Must be called with io_request_lock held and interrupts disabled
772 */
773 void blkdev_release_request(struct request *req)
774 {
775 request_queue_t *q = req->q;
776 int rw = req->cmd;
778 req->rq_status = RQ_INACTIVE;
779 req->q = NULL;
781 /*
782 * Request may not have originated from ll_rw_blk. if not,
783 * assume it has free buffers and check waiters
784 */
785 if (q) {
786 list_add(&req->queue, &q->rq[rw].free);
787 #if 0
788 if (++q->rq[rw].count >= q->batch_requests &&
789 waitqueue_active(&q->wait_for_requests[rw]))
790 wake_up(&q->wait_for_requests[rw]);
791 #endif
792 }
793 }
795 /*
796 * Has to be called with the request spinlock acquired
797 */
798 static void attempt_merge(request_queue_t * q,
799 struct request *req,
800 int max_sectors,
801 int max_segments)
802 {
803 struct request *next;
804 struct hd_struct *hd1, *hd2;
806 next = blkdev_next_request(req);
807 if (req->sector + req->nr_sectors != next->sector)
808 return;
809 if (req->cmd != next->cmd
810 || req->rq_dev != next->rq_dev
811 || req->nr_sectors + next->nr_sectors > max_sectors
812 || next->waiting)
813 return;
814 /*
815 * If we are not allowed to merge these requests, then
816 * return. If we are allowed to merge, then the count
817 * will have been updated to the appropriate number,
818 * and we shouldn't do it here too.
819 */
820 if (!q->merge_requests_fn(q, req, next, max_segments))
821 return;
823 q->elevator.elevator_merge_req_fn(req, next);
824 req->bhtail->b_reqnext = next->bh;
825 req->bhtail = next->bhtail;
826 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
827 list_del(&next->queue);
829 /* One last thing: we have removed a request, so we now have one
830 less expected IO to complete for accounting purposes. */
832 locate_hd_struct(req, &hd1, &hd2);
833 if (hd1)
834 down_ios(hd1);
835 if (hd2)
836 down_ios(hd2);
837 blkdev_release_request(next);
838 }
840 static inline void attempt_back_merge(request_queue_t * q,
841 struct request *req,
842 int max_sectors,
843 int max_segments)
844 {
845 if (&req->queue == q->queue_head.prev)
846 return;
847 attempt_merge(q, req, max_sectors, max_segments);
848 }
850 static inline void attempt_front_merge(request_queue_t * q,
851 struct list_head * head,
852 struct request *req,
853 int max_sectors,
854 int max_segments)
855 {
856 struct list_head * prev;
858 prev = req->queue.prev;
859 if (head == prev)
860 return;
861 attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
862 }
864 static int __make_request(request_queue_t * q, int rw,
865 struct buffer_head * bh)
866 {
867 unsigned int sector, count;
868 int max_segments = MAX_SEGMENTS;
869 struct request * req, *freereq = NULL;
870 int rw_ahead, max_sectors, el_ret;
871 struct list_head *head, *insert_here;
872 int latency;
873 elevator_t *elevator = &q->elevator;
875 count = bh->b_size >> 9;
876 sector = bh->b_rsector;
878 rw_ahead = 0; /* normal case; gets changed below for READA */
879 switch (rw) {
880 case READA:
881 #if 0 /* bread() misinterprets failed READA attempts as IO errors on SMP */
882 rw_ahead = 1;
883 #endif
884 rw = READ; /* drop into READ */
885 case READ:
886 case WRITE:
887 latency = elevator_request_latency(elevator, rw);
888 break;
889 default:
890 BUG();
891 goto end_io;
892 }
894 /* We'd better have a real physical mapping!
895 Check this bit only if the buffer was dirty and just locked
896 down by us so at this point flushpage will block and
897 won't clear the mapped bit under us. */
898 if (!buffer_mapped(bh))
899 BUG();
901 /*
902 * Temporary solution - in 2.5 this will be done by the lowlevel
903 * driver. Create a bounce buffer if the buffer data points into
904 * high memory - keep the original buffer otherwise.
905 */
906 #if CONFIG_HIGHMEM
907 bh = create_bounce(rw, bh);
908 #endif
910 /* look for a free request. */
911 /*
912 * Try to coalesce the new request with old requests
913 */
914 max_sectors = get_max_sectors(bh->b_rdev);
916 again:
917 req = NULL;
918 head = &q->queue_head;
919 /*
920 * Now we acquire the request spinlock, we have to be mega careful
921 * not to schedule or do something nonatomic
922 */
923 spin_lock_irq(&io_request_lock);
925 insert_here = head->prev;
926 if (list_empty(head)) {
927 q->plug_device_fn(q, bh->b_rdev); /* is atomic */
928 goto get_rq;
929 } else if (q->head_active && !q->plugged)
930 head = head->next;
932 el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors);
933 switch (el_ret) {
935 case ELEVATOR_BACK_MERGE:
936 if (!q->back_merge_fn(q, req, bh, max_segments)) {
937 insert_here = &req->queue;
938 break;
939 }
940 elevator->elevator_merge_cleanup_fn(q, req, count);
941 req->bhtail->b_reqnext = bh;
942 req->bhtail = bh;
943 req->nr_sectors = req->hard_nr_sectors += count;
944 blk_started_io(count);
945 drive_stat_acct(req->rq_dev, req->cmd, count, 0);
946 req_new_io(req, 1, count);
947 attempt_back_merge(q, req, max_sectors, max_segments);
948 goto out;
950 case ELEVATOR_FRONT_MERGE:
951 if (!q->front_merge_fn(q, req, bh, max_segments)) {
952 insert_here = req->queue.prev;
953 break;
954 }
955 elevator->elevator_merge_cleanup_fn(q, req, count);
956 bh->b_reqnext = req->bh;
957 req->bh = bh;
958 req->buffer = bh->b_data;
959 req->current_nr_sectors = count;
960 req->sector = req->hard_sector = sector;
961 req->nr_sectors = req->hard_nr_sectors += count;
962 blk_started_io(count);
963 drive_stat_acct(req->rq_dev, req->cmd, count, 0);
964 req_new_io(req, 1, count);
965 attempt_front_merge(q, head, req, max_sectors, max_segments);
966 goto out;
968 /*
969 * elevator says don't/can't merge. get new request
970 */
971 case ELEVATOR_NO_MERGE:
972 /*
973 * use elevator hints as to where to insert the
974 * request. if no hints, just add it to the back
975 * of the queue
976 */
977 if (req)
978 insert_here = &req->queue;
979 break;
981 default:
982 printk("elevator returned crap (%d)\n", el_ret);
983 BUG();
984 }
986 get_rq:
987 if (freereq) {
988 req = freereq;
989 freereq = NULL;
990 } else {
991 /*
992 * See description above __get_request_wait()
993 */
994 if (rw_ahead) {
995 if (q->rq[rw].count < q->batch_requests) {
996 spin_unlock_irq(&io_request_lock);
997 goto end_io;
998 }
999 req = get_request(q, rw);
1000 if (req == NULL)
1001 BUG();
1002 } else {
1003 req = get_request(q, rw);
1004 if (req == NULL) {
1005 spin_unlock_irq(&io_request_lock);
1006 freereq = __get_request_wait(q, rw);
1007 goto again;
1008 }
1009 }
1010 }
1012 /* fill up the request-info, and add it to the queue */
1013 req->elevator_sequence = latency;
1014 req->cmd = rw;
1015 req->errors = 0;
1016 req->hard_sector = req->sector = sector;
1017 req->hard_nr_sectors = req->nr_sectors = count;
1018 req->current_nr_sectors = count;
1019 req->nr_segments = 1; /* Always 1 for a new request. */
1020 req->nr_hw_segments = 1; /* Always 1 for a new request. */
1021 req->buffer = bh->b_data;
1022 req->waiting = NULL;
1023 req->bh = bh;
1024 req->bhtail = bh;
1025 req->rq_dev = bh->b_rdev;
1026 req->start_time = jiffies;
1027 req_new_io(req, 0, count);
1028 blk_started_io(count);
1029 add_request(q, req, insert_here);
1030 out:
1031 if (freereq)
1032 blkdev_release_request(freereq);
1033 spin_unlock_irq(&io_request_lock);
1034 return 0;
1035 end_io:
1036 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1037 return 0;
1038 }
1040 /**
1041 * generic_make_request: hand a buffer head to its device driver for I/O
1042 * @rw: READ, WRITE, or READA - what sort of I/O is desired.
1043 * @bh: The buffer head describing the location in memory and on the device.
1045 * generic_make_request() is used to make I/O requests of block
1046 * devices. It is passed a &struct buffer_head and a &rw value. The
1047 * %READ and %WRITE options are (hopefully) obvious in meaning. The
1048 * %READA value means that a read is required, but that the driver is
1049 * free to fail the request if, for example, it cannot get needed
1050 * resources immediately.
1052 * generic_make_request() does not return any status. The
1053 * success/failure status of the request, along with notification of
1054 * completion, is delivered asynchronously through the bh->b_end_io
1055 * function described (one day) elsewhere.
1057 * The caller of generic_make_request must make sure that b_page,
1058 * b_addr, b_size are set to describe the memory buffer, that b_rdev
1059 * and b_rsector are set to describe the device address, and the
1060 * b_end_io and optionally b_private are set to describe how
1061 * completion notification should be signaled. BH_Mapped should also
1062 * be set (to confirm that b_dev and b_blocknr are valid).
1064 * generic_make_request and the drivers it calls may use b_reqnext,
1065 * and may change b_rdev and b_rsector. So the values of these fields
1066 * should NOT be depended on after the call to generic_make_request.
1067 * Because of this, the caller should record the device address
1068 * information in b_dev and b_blocknr.
1070 * Apart from those fields mentioned above, no other fields, and in
1071 * particular, no other flags, are changed by generic_make_request or
1072 * any lower level drivers.
1073 * */
1074 void generic_make_request (int rw, struct buffer_head * bh)
1075 {
1076 int major = MAJOR(bh->b_rdev);
1077 int minorsize = 0;
1078 request_queue_t *q;
1080 if (!bh->b_end_io)
1081 BUG();
1083 /* Test device size, when known. */
1084 if (blk_size[major])
1085 minorsize = blk_size[major][MINOR(bh->b_rdev)];
1086 if (minorsize) {
1087 unsigned long maxsector = (minorsize << 1) + 1;
1088 unsigned long sector = bh->b_rsector;
1089 unsigned int count = bh->b_size >> 9;
1091 if (maxsector < count || maxsector - count < sector) {
1092 /* Yecch */
1093 bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
1095 /* This may well happen - the kernel calls bread()
1096 without checking the size of the device, e.g.,
1097 when mounting a device. */
1098 DPRINTK(KERN_INFO
1099 "attempt to access beyond end of device\n");
1100 DPRINTK(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
1101 kdevname(bh->b_rdev), rw,
1102 (sector + count)>>1, minorsize);
1104 /* Yecch again */
1105 bh->b_end_io(bh, 0);
1106 return;
1107 }
1108 }
1110 /*
1111 * Resolve the mapping until finished. (drivers are
1112 * still free to implement/resolve their own stacking
1113 * by explicitly returning 0)
1114 */
1115 /* NOTE: we don't repeat the blk_size check for each new device.
1116 * Stacking drivers are expected to know what they are doing.
1117 */
1118 do {
1119 q = blk_get_queue(bh->b_rdev);
1120 if (!q || !q->make_request_fn) {
1121 DPRINTK(KERN_ERR
1122 "generic_make_request: Trying to access "
1123 "nonexistent block-device %s (%ld)\n",
1124 kdevname(bh->b_rdev), bh->b_rsector);
1125 buffer_IO_error(bh);
1126 break;
1127 }
1128 } while (q->make_request_fn(q, rw, bh));
1129 }
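#if 0
/*
 * Sketch of a raw caller following the rules spelled out above: fill in
 * the memory and device address, set BH_Mapped and BH_Lock, supply
 * b_end_io, and submit. The names and the single static buffer_head are
 * illustrative only.
 */
static void example_end_io(struct buffer_head *bh, int uptodate)
{
	if (uptodate)
		set_bit(BH_Uptodate, &bh->b_state);
	clear_bit(BH_Lock, &bh->b_state);
	/* notify whoever is waiting on this buffer */
}

static void example_read_sector(kdev_t dev, unsigned long sector, char *buf)
{
	static struct buffer_head bh;	/* not re-entrant; illustration only */

	memset(&bh, 0, sizeof(bh));
	bh.b_dev     = dev;
	bh.b_rdev    = dev;		/* may be remapped by stacking drivers */
	bh.b_rsector = sector;
	bh.b_data    = buf;
	bh.b_size    = 512;
	bh.b_end_io  = example_end_io;
	bh.b_state   = (1 << BH_Mapped) | (1 << BH_Lock);

	generic_make_request(READ, &bh);
}
#endif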
1132 /**
1133 * submit_bh: submit a buffer_head to the block device later for I/O
1134 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1135 * @bh: The &struct buffer_head which describes the I/O
1137 * submit_bh() is very similar in purpose to generic_make_request(), and
1138 * uses that function to do most of the work.
1140 * The extra functionality provided by submit_bh is to determine
1141 * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
1142 * This is appropriate for IO requests that come from the buffer
1143 * cache and page cache which (currently) always use aligned blocks.
1144 */
1145 void submit_bh(int rw, struct buffer_head * bh)
1146 {
1147 if (!test_bit(BH_Lock, &bh->b_state))
1148 BUG();
1150 set_bit(BH_Req, &bh->b_state);
1151 set_bit(BH_Launder, &bh->b_state);
1153 /*
1154 * First step, 'identity mapping' - RAID or LVM might
1155 * further remap this.
1156 */
1157 bh->b_rdev = bh->b_dev;
1158 /* bh->b_rsector = bh->b_blocknr * count; */
1160 generic_make_request(rw, bh);
1161 }
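#if 0
/*
 * Sketch of the submit_bh() contract relative to generic_make_request():
 * the caller owns the buffer lock and supplies the buffer-cache style
 * fields; note that in this tree the b_rsector remap above is commented
 * out, so the caller is assumed to have set b_rsector itself.
 */
static void example_submit(struct buffer_head *bh)
{
	if (test_and_set_bit(BH_Lock, &bh->b_state))
		return;			/* somebody else holds the buffer */

	/* b_dev, b_rsector, b_size, b_data and b_end_io are assumed set */
	submit_bh(WRITE, bh);
}
#endif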
1163 /**
1164 * ll_rw_block: low-level access to block devices
1165 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
1166 * @nr: number of &struct buffer_heads in the array
1167 * @bhs: array of pointers to &struct buffer_head
1169 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
1170 * and requests an I/O operation on them, either a %READ or a %WRITE.
1171 * The third %READA option is described in the documentation for
1172 * generic_make_request() which ll_rw_block() calls.
1174 * This function provides extra functionality that is not in
1175 * generic_make_request() that is relevant to buffers in the buffer
1176 * cache or page cache. In particular it drops any buffer that it
1177 * cannot get a lock on (with the BH_Lock state bit), any buffer that
1178 * appears to be clean when doing a write request, and any buffer that
1179 * appears to be up-to-date when doing a read request. Further it marks
1180 * as clean buffers that are processed for writing (the buffer cache
1181 * won't assume that they are actually clean until the buffer gets
1182 * unlocked).
1184 * ll_rw_block sets b_end_io to a simple completion handler that marks
1185 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
1186 * any waiters. Any client that needs a more interesting completion
1187 * routine should call submit_bh() (or generic_make_request())
1188 * directly.
1190 * Caveat:
1191 * All of the buffers must be for the same device, and must also be
1192 * of the current approved size for the device. */
1194 void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
1195 {
1196 unsigned int major;
1197 int correct_size;
1198 int i;
1200 if (!nr)
1201 return;
1203 major = MAJOR(bhs[0]->b_dev);
1205 /* Determine correct block size for this device. */
1206 correct_size = get_hardsect_size(bhs[0]->b_dev);
1208 /* Verify requested block sizes. */
1209 for (i = 0; i < nr; i++) {
1210 struct buffer_head *bh = bhs[i];
1211 if (bh->b_size % correct_size) {
1212 DPRINTK(KERN_NOTICE "ll_rw_block: device %s: "
1213 "only %d-char blocks implemented (%u)\n",
1214 kdevname(bhs[0]->b_dev),
1215 correct_size, bh->b_size);
1216 goto sorry;
1217 }
1218 }
1220 if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
1221 DPRINTK(KERN_NOTICE "Can't write to read-only device %s\n",
1222 kdevname(bhs[0]->b_dev));
1223 goto sorry;
1224 }
1226 for (i = 0; i < nr; i++) {
1227 struct buffer_head *bh = bhs[i];
1229 /* Only one thread can actually submit the I/O. */
1230 if (test_and_set_bit(BH_Lock, &bh->b_state))
1231 continue;
1233 /* We have the buffer lock */
1234 /*atomic_inc(&bh->b_count);*/
1236 switch(rw) {
1237 case WRITE:
1238 if (!atomic_set_buffer_clean(bh))
1239 /* Hmmph! Nothing to write */
1240 goto end_io;
1241 /* __mark_buffer_clean(bh); */
1242 break;
1244 case READA:
1245 case READ:
1246 if (buffer_uptodate(bh))
1247 /* Hmmph! Already have it */
1248 goto end_io;
1249 break;
1250 default:
1251 BUG();
1252 end_io:
1253 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1254 continue;
1255 }
1257 submit_bh(rw, bh);
1258 }
1259 return;
1261 sorry:
1262 /* Make sure we don't get infinite dirty retries.. */
1263 for (i = 0; i < nr; i++)
1264 mark_buffer_clean(bhs[i]);
1265 }
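#if 0
/*
 * Sketch of a buffer-cache style caller batching several same-device,
 * same-size buffers through ll_rw_block(); locked, clean-on-write or
 * already-up-to-date buffers are skipped as described above, and
 * completion still arrives per buffer through each bh->b_end_io.
 */
static void example_batched_read(struct buffer_head *bhs[], int nr)
{
	ll_rw_block(READ, nr, bhs);
}
#endif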
1267 #ifdef CONFIG_STRAM_SWAP
1268 extern int stram_device_init (void);
1269 #endif
1272 /**
1273 * end_that_request_first - end I/O on one buffer.
1274 * @req: the request being processed
1275 * @uptodate: 0 for I/O error
1276 * @name: the name printed for an I/O error
1278 * Description:
1279 * Ends I/O on the first buffer attached to @req, and sets it up
1280 * for the next buffer_head (if any) in the cluster.
1282 * Return:
1283 * 0 - we are done with this request, call end_that_request_last()
1284 * 1 - still buffers pending for this request
1286 * Caveat:
1287 * Drivers implementing their own end_request handling must call
1288 * blk_finished_io() appropriately.
1289 **/
1291 int end_that_request_first (struct request *req, int uptodate, char *name)
1292 {
1293 struct buffer_head * bh;
1294 int nsect;
1296 req->errors = 0;
1297 if (!uptodate)
1298 printk("end_request: I/O error, dev %s (%s), sector %lu\n",
1299 kdevname(req->rq_dev), name, req->sector);
1301 if ((bh = req->bh) != NULL) {
1302 nsect = bh->b_size >> 9;
1303 blk_finished_io(nsect);
1304 req->bh = bh->b_reqnext;
1305 bh->b_reqnext = NULL;
1306 bh->b_end_io(bh, uptodate);
1307 if ((bh = req->bh) != NULL) {
1308 req->hard_sector += nsect;
1309 req->hard_nr_sectors -= nsect;
1310 req->sector = req->hard_sector;
1311 req->nr_sectors = req->hard_nr_sectors;
1313 req->current_nr_sectors = bh->b_size >> 9;
1314 if (req->nr_sectors < req->current_nr_sectors) {
1315 req->nr_sectors = req->current_nr_sectors;
1316 printk("end_request: buffer-list destroyed\n");
1318 req->buffer = bh->b_data;
1319 return 1;
1322 return 0;
1325 void end_that_request_last(struct request *req)
1327 if (req->waiting != NULL)
1328 complete(req->waiting);
1329 req_finished_io(req);
1331 blkdev_release_request(req);
1332 }
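#if 0
/*
 * Sketch of the usual completion path built on the two helpers above, as
 * used from a driver's interrupt handler with io_request_lock held;
 * "exdisk" is an illustrative name for the error messages.
 */
static void example_complete_request(struct request *req, int uptodate)
{
	while (end_that_request_first(req, uptodate, "exdisk"))
		;			/* more buffer_heads still attached */

	list_del(&req->queue);		/* dequeue, as blkdev_dequeue_request() would */
	end_that_request_last(req);	/* wake any waiter, drop the accounting,
					   return the request to the free list */
}
#endif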
1334 int __init blk_dev_init(void)
1335 {
1336 struct blk_dev_struct *dev;
1338 request_cachep = kmem_cache_create("blkdev_requests",
1339 sizeof(struct request),
1340 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1342 if (!request_cachep)
1343 panic("Can't create request pool slab cache\n");
1345 for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
1346 dev->queue = NULL;
1348 memset(ro_bits,0,sizeof(ro_bits));
1349 memset(max_readahead, 0, sizeof(max_readahead));
1350 memset(max_sectors, 0, sizeof(max_sectors));
1352 #ifdef CONFIG_AMIGA_Z2RAM
1353 z2_init();
1354 #endif
1355 #ifdef CONFIG_STRAM_SWAP
1356 stram_device_init();
1357 #endif
1358 #ifdef CONFIG_ISP16_CDI
1359 isp16_init();
1360 #endif
1361 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
1362 ide_init(); /* this MUST precede hd_init */
1363 #endif
1364 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
1365 hd_init();
1366 #endif
1367 #ifdef CONFIG_BLK_DEV_PS2
1368 ps2esdi_init();
1369 #endif
1370 #ifdef CONFIG_BLK_DEV_XD
1371 xd_init();
1372 #endif
1373 #ifdef CONFIG_BLK_DEV_MFM
1374 mfm_init();
1375 #endif
1376 #ifdef CONFIG_PARIDE
1377 { extern void paride_init(void); paride_init(); };
1378 #endif
1379 #ifdef CONFIG_MAC_FLOPPY
1380 swim3_init();
1381 #endif
1382 #ifdef CONFIG_BLK_DEV_SWIM_IOP
1383 swimiop_init();
1384 #endif
1385 #ifdef CONFIG_AMIGA_FLOPPY
1386 amiga_floppy_init();
1387 #endif
1388 #ifdef CONFIG_ATARI_FLOPPY
1389 atari_floppy_init();
1390 #endif
1391 #ifdef CONFIG_BLK_DEV_FD
1392 floppy_init();
1393 #else
1394 #if defined(__i386__) /* Do we even need this? */
1395 outb_p(0xc, 0x3f2);
1396 #endif
1397 #endif
1398 #ifdef CONFIG_CDU31A
1399 cdu31a_init();
1400 #endif
1401 #ifdef CONFIG_ATARI_ACSI
1402 acsi_init();
1403 #endif
1404 #ifdef CONFIG_MCD
1405 mcd_init();
1406 #endif
1407 #ifdef CONFIG_MCDX
1408 mcdx_init();
1409 #endif
1410 #ifdef CONFIG_SBPCD
1411 sbpcd_init();
1412 #endif
1413 #ifdef CONFIG_AZTCD
1414 aztcd_init();
1415 #endif
1416 #ifdef CONFIG_CDU535
1417 sony535_init();
1418 #endif
1419 #ifdef CONFIG_GSCD
1420 gscd_init();
1421 #endif
1422 #ifdef CONFIG_CM206
1423 cm206_init();
1424 #endif
1425 #ifdef CONFIG_OPTCD
1426 optcd_init();
1427 #endif
1428 #ifdef CONFIG_SJCD
1429 sjcd_init();
1430 #endif
1431 #ifdef CONFIG_APBLOCK
1432 ap_init();
1433 #endif
1434 #ifdef CONFIG_DDV
1435 ddv_init();
1436 #endif
1437 #ifdef CONFIG_MDISK
1438 mdisk_init();
1439 #endif
1440 #ifdef CONFIG_DASD
1441 dasd_init();
1442 #endif
1443 #if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_BLOCK)
1444 tapeblock_init();
1445 #endif
1446 #ifdef CONFIG_BLK_DEV_XPRAM
1447 xpram_init();
1448 #endif
1450 #ifdef CONFIG_SUN_JSFLASH
1451 jsfd_init();
1452 #endif
1453 return 0;
1454 };
1456 EXPORT_SYMBOL(io_request_lock);
1457 EXPORT_SYMBOL(end_that_request_first);
1458 EXPORT_SYMBOL(end_that_request_last);
1459 EXPORT_SYMBOL(blk_grow_request_list);
1460 EXPORT_SYMBOL(blk_init_queue);
1461 EXPORT_SYMBOL(blk_get_queue);
1462 EXPORT_SYMBOL(blk_cleanup_queue);
1463 EXPORT_SYMBOL(blk_queue_headactive);
1464 EXPORT_SYMBOL(blk_queue_make_request);
1465 EXPORT_SYMBOL(generic_make_request);
1466 EXPORT_SYMBOL(blkdev_release_request);
1467 EXPORT_SYMBOL(req_finished_io);
1468 EXPORT_SYMBOL(generic_unplug_device);