ia64/xen-unstable

linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c @ 7514:5a4893a537ca

Minor driver code changes/rearrangement to reduce ia64-specific patches
Signed-off-by: Dan Magenheimer <dan.magenheimer@hp.com>
author djm@kirby.fc.hp.com
date Fri Nov 04 10:40:29 2005 -0600 (2005-11-04)
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 */

#include <linux/spinlock.h>
#include <asm-xen/balloon.h>
#include <asm/hypervisor.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16
static unsigned long mmap_vstart;
#define MMAP_PAGES \
	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_req,_seg) \
	(mmap_vstart + \
	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
	 ((_seg) * PAGE_SIZE))
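
/*
 * Layout sketch (editorial illustration, assuming BLKIF_MAX_SEGMENTS_PER_REQUEST
 * is 11 as in the blkif interface headers of this era): one page of virtual
 * address space is reserved per possible segment, so
 *   MMAP_PAGES       = 64 * 11 = 704 pages, and
 *   MMAP_VADDR(2, 3) = mmap_vstart + (2*11 + 3) * PAGE_SIZE,
 * i.e. request slot 2, segment 3 always maps at the same fixed address.
 */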

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
	blkif_t *blkif;
	unsigned long id;
	int nr_pages;
	atomic_t pendcnt;
	unsigned short operation;
	int status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
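
/*
 * Accounting sketch (editorial): blkif_init() fills pending_ring[] with the
 * slot indices 0..MAX_PENDING_REQS-1 and sets pending_prod = MAX_PENDING_REQS,
 * pending_cons = 0, so NR_PENDING_REQS (requests in flight) starts at zero.
 * dispatch_rw_block_io() takes a free slot by reading
 * pending_ring[MASK_PEND_IDX(pending_cons)] and advancing pending_cons;
 * __end_block_io_op() returns the slot by writing its index at pending_prod
 * and advancing that. Because MAX_PENDING_REQS is a power of two,
 * MASK_PEND_IDX() reduces the free-running indices with a simple bitwise AND.
 */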

static request_queue_t *plugged_queue;
static inline void flush_plugged_queue(void)
{
	request_queue_t *q = plugged_queue;
	if (q != NULL) {
		if (q->unplug_fn != NULL)
			q->unplug_fn(q);
		blk_put_queue(q);
		plugged_queue = NULL;
	}
}

/*
 * When grant tables are used to map a frame for device access, the handle
 * returned must be used to unmap the frame again. This is needed to drop
 * the reference count on the frame.
 */
static u16 pending_grant_handles[MMAP_PAGES];
#define pending_handle(_idx, _i) \
	(pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#define BLKBACK_INVALID_HANDLE (0xFFFF)
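
/*
 * Editorial note: a slot holding BLKBACK_INVALID_HANDLE is simply "not
 * currently mapped"; fast_flush_area() skips such slots and resets every
 * handle it does unmap back to this value.
 */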

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
/*
 * If the tap driver is used, we may get pages belonging to either the tap
 * or (more likely) the real frontend. The backend must specify which domain
 * a given page belongs to in update_va_mapping though. For the moment,
 * the tap rewrites the ID field of the request to contain the request index
 * and the id of the real front end domain.
 */
#define BLKTAP_COOKIE 0xbeadfeed
static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
#endif

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
			  unsigned short op, int st);

static void fast_flush_area(int idx, int nr_pages)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int i, invcount = 0;
	u16 handle;

	for (i = 0; i < nr_pages; i++) {
		handle = pending_handle(idx, i);
		if (handle == BLKBACK_INVALID_HANDLE)
			continue;
		unmap[i].host_addr = MMAP_VADDR(idx, i);
		unmap[i].dev_bus_addr = 0;
		unmap[i].handle = handle;
		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
		invcount++;
	}

	BUG_ON(HYPERVISOR_grant_table_op(
		GNTTABOP_unmap_grant_ref, unmap, invcount));
}


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
	return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
	unsigned long flags;

	if (!__on_blkdev_list(blkif))
		return;

	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
	if (__on_blkdev_list(blkif)) {
		list_del(&blkif->blkdev_list);
		blkif->blkdev_list.next = NULL;
		blkif_put(blkif);
	}
	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
	unsigned long flags;

	if (__on_blkdev_list(blkif))
		return;

	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
		blkif_get(blkif);
	}
	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */
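
/*
 * Editorial summary: a single kernel thread ("xenblkd") round-robins over the
 * interfaces queued on blkio_schedule_list, pulling up to BATCH_PER_DOMAIN
 * requests from each before moving on. An interface is re-queued at the tail
 * if it still has work outstanding, and the thread sleeps on
 * blkio_schedule_wait whenever the list is empty or all MAX_PENDING_REQS
 * slots are in flight.
 */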

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);

static int blkio_schedule(void *arg)
{
	DECLARE_WAITQUEUE(wq, current);

	blkif_t *blkif;
	struct list_head *ent;

	daemonize("xenblkd");

	for (;;) {
		/* Wait for work to do. */
		add_wait_queue(&blkio_schedule_wait, &wq);
		set_current_state(TASK_INTERRUPTIBLE);
		if ((NR_PENDING_REQS == MAX_PENDING_REQS) ||
		    list_empty(&blkio_schedule_list))
			schedule();
		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&blkio_schedule_wait, &wq);

		/* Queue up a batch of requests. */
		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
		       !list_empty(&blkio_schedule_list)) {
			ent = blkio_schedule_list.next;
			blkif = list_entry(ent, blkif_t, blkdev_list);
			blkif_get(blkif);
			remove_from_blkdev_list(blkif);
			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
				add_to_blkdev_list_tail(blkif);
			blkif_put(blkif);
		}

		/* Push the batch through to disc. */
		flush_plugged_queue();
	}
}

static void maybe_trigger_blkio_schedule(void)
{
	/*
	 * Needed so that two processes, which together make the following
	 * predicate true, don't both read stale values and evaluate the
	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
	 * on x86, but...
	 */
	smp_mb();

	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
	    !list_empty(&blkio_schedule_list))
		wake_up(&blkio_schedule_wait);
}


/******************************************************************
 * COMPLETION CALLBACK -- Called as bio->bi_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
	unsigned long flags;

	/* An error fails the entire request. */
	if (!uptodate) {
		DPRINTK("Buffer not up-to-date at end of operation\n");
		pending_req->status = BLKIF_RSP_ERROR;
	}

	if (atomic_dec_and_test(&pending_req->pendcnt)) {
		int pending_idx = pending_req - pending_reqs;
		fast_flush_area(pending_idx, pending_req->nr_pages);
		make_response(pending_req->blkif, pending_req->id,
			      pending_req->operation, pending_req->status);
		blkif_put(pending_req->blkif);
		spin_lock_irqsave(&pend_prod_lock, flags);
		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
		spin_unlock_irqrestore(&pend_prod_lock, flags);
		maybe_trigger_blkio_schedule();
	}
}
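
/*
 * Editorial note on the 2.6-era bi_end_io convention assumed here: the
 * callback may be invoked for partial completions, with bio->bi_size holding
 * the bytes still outstanding. The early "return 1" is the customary way of
 * ignoring such partial completions; the bio is only counted against pendcnt
 * and released once bi_size has reached zero.
 */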
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
	if (bio->bi_size != 0)
		return 1;
	__end_block_io_op(bio->bi_private, !error);
	bio_put(bio);
	return error;
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
	blkif_t *blkif = dev_id;
	add_to_blkdev_list_tail(blkif);
	maybe_trigger_blkio_schedule();
	return IRQ_HANDLED;
}


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
	blkif_request_t *req;
	RING_IDX i, rp;
	int more_to_do = 0;

	rp = blk_ring->sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	for (i = blk_ring->req_cons;
	     (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
	     i++) {
		if ((max_to_do-- == 0) ||
		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
			more_to_do = 1;
			break;
		}

		req = RING_GET_REQUEST(blk_ring, i);
		switch (req->operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			dispatch_rw_block_io(blkif, req);
			break;

		default:
			DPRINTK("error: unknown block io operation [%d]\n",
				req->operation);
			make_response(blkif, req->id, req->operation,
				      BLKIF_RSP_ERROR);
			break;
		}
	}

	blk_ring->req_cons = i;
	return more_to_do;
}
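
/*
 * Editorial overview of dispatch_rw_block_io(): (1) validate the segment
 * count, (2) batch-map every granted frame into the fixed MMAP_VADDR()
 * window for this request slot, (3) let vbd_translate() check access and
 * turn the virtual device/sector range into a real block device and offset,
 * then (4) build one or more bios over the mapped pages and submit them,
 * leaving the queue plugged so the whole batch is pushed out later by
 * flush_plugged_queue().
 */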
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
	unsigned long fas = 0;
	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
	pending_req_t *pending_req;
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct phys_req preq;
	struct {
		unsigned long buf; unsigned int nsec;
	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int nseg;
	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int nbio = 0;
	request_queue_t *q;

	/* Check that number of segments is sane. */
	nseg = req->nr_segments;
	if (unlikely(nseg == 0) ||
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		DPRINTK("Bad number of segments in request (%d)\n", nseg);
		goto bad_descriptor;
	}

	preq.dev           = req->handle;
	preq.sector_number = req->sector_number;
	preq.nr_sects      = 0;

	for (i = 0; i < nseg; i++) {
		fas = req->frame_and_sects[i];
		seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

		if (seg[i].nsec <= 0)
			goto bad_descriptor;
		preq.nr_sects += seg[i].nsec;

		map[i].host_addr = MMAP_VADDR(pending_idx, i);
		map[i].dom = blkif->domid;
		map[i].ref = blkif_gref_from_fas(fas);
		map[i].flags = GNTMAP_host_map;
		if (operation == WRITE)
			map[i].flags |= GNTMAP_readonly;
	}

	BUG_ON(HYPERVISOR_grant_table_op(
		GNTTABOP_map_grant_ref, map, nseg));

	for (i = 0; i < nseg; i++) {
		if (unlikely(map[i].handle < 0)) {
			DPRINTK("invalid buffer -- could not remap it\n");
			fast_flush_area(pending_idx, nseg);
			goto bad_descriptor;
		}

		phys_to_machine_mapping[__pa(MMAP_VADDR(
			pending_idx, i)) >> PAGE_SHIFT] =
			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);

		pending_handle(pending_idx, i) = map[i].handle;
	}

	for (i = 0; i < nseg; i++) {
		fas = req->frame_and_sects[i];
		seg[i].buf = map[i].dev_bus_addr |
			(blkif_first_sect(fas) << 9);
	}

	if (vbd_translate(&preq, blkif, operation) != 0) {
		DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
			operation == READ ? "read" : "write",
			preq.sector_number,
			preq.sector_number + preq.nr_sects, preq.dev);
		goto bad_descriptor;
	}

	pending_req = &pending_reqs[pending_idx];
	pending_req->blkif     = blkif;
	pending_req->id        = req->id;
	pending_req->operation = operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;

	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number|(int)seg[i].nsec) &
		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
			DPRINTK("Misaligned I/O request from domain %d",
				blkif->domid);
			goto cleanup_and_fail;
		}

		while ((bio == NULL) ||
		       (bio_add_page(bio,
				     virt_to_page(MMAP_VADDR(pending_idx, i)),
				     seg[i].nsec << 9,
				     seg[i].buf & ~PAGE_MASK) == 0)) {
			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
			if (unlikely(bio == NULL)) {
			cleanup_and_fail:
				for (i = 0; i < (nbio-1); i++)
					bio_put(biolist[i]);
				fast_flush_area(pending_idx, nseg);
				goto bad_descriptor;
			}

			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
			bio->bi_sector  = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}
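
	/*
	 * Editorial note: instead of unplugging after every request, the last
	 * queue touched is remembered in plugged_queue (with a reference held
	 * via blk_get_queue()); it is only unplugged below when the target
	 * queue changes, or when blkio_schedule() calls flush_plugged_queue()
	 * at the end of a batch, giving adjacent requests a chance to merge.
	 */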
	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
		flush_plugged_queue();
		blk_get_queue(q);
		plugged_queue = q;
	}

	atomic_set(&pending_req->pendcnt, nbio);
	pending_cons++;
	blkif_get(blkif);

	for (i = 0; i < nbio; i++)
		submit_bio(operation, biolist[i]);

	return;

 bad_descriptor:
	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}


/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

static void make_response(blkif_t *blkif, unsigned long id,
			  unsigned short op, int st)
{
	blkif_response_t *resp;
	unsigned long flags;
	blkif_back_ring_t *blk_ring = &blkif->blk_ring;

	/* Place on the response ring for the relevant domain. */
	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
	resp->id        = id;
	resp->operation = op;
	resp->status    = st;
	wmb(); /* Ensure other side can see the response fields. */
	blk_ring->rsp_prod_pvt++;
	RING_PUSH_RESPONSES(blk_ring);
	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

	/* Kick the relevant domain. */
	notify_remote_via_irq(blkif->irq);
}

void blkif_deschedule(blkif_t *blkif)
{
	remove_from_blkdev_list(blkif);
}

static int __init blkif_init(void)
{
	int i;
	struct page *page;

	if (xen_init() < 0)
		return -ENODEV;

	blkif_interface_init();

	page = balloon_alloc_empty_page_range(MMAP_PAGES);
	BUG_ON(page == NULL);
	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));

	pending_cons = 0;
	pending_prod = MAX_PENDING_REQS;
	memset(pending_reqs, 0, sizeof(pending_reqs));
	for (i = 0; i < MAX_PENDING_REQS; i++)
		pending_ring[i] = i;

	spin_lock_init(&blkio_schedule_list_lock);
	INIT_LIST_HEAD(&blkio_schedule_list);

	BUG_ON(kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0);

	blkif_xenbus_init();

	/* Mark every slot invalid explicitly; a memset() of MMAP_PAGES bytes
	 * would only cover half of this u16 array. */
	for (i = 0; i < MMAP_PAGES; i++)
		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

	return 0;
}

__initcall(blkif_init);

/*
 * Local variables:
 *  c-file-style: "linux"
 *  indent-tabs-mode: t
 *  c-indent-level: 8
 *  c-basic-offset: 8
 *  tab-width: 8
 * End:
 */