ia64/xen-unstable

linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c @ 4942:85e3c42fd78f

bitkeeper revision 1.1159.258.132 (428900ceGeeOt2WYcJ01WZMZCdJCHA)

Fix multi-page I/O accesses in the blkback driver in cases where we
receive partial-completion callbacks.

Signed-off-by: Vincent Hanquez <vincent@xensource.com>
Signed-off-by: Keir Fraser <keir@xensource.com>

author   kaf24@firebug.cl.cam.ac.uk
date     Mon May 16 20:21:34 2005 +0000 (2005-05-16)
parents  d851c864a172
children 1fa301443996 574892590001
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 */

#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

static unsigned long mmap_vstart;
#define MMAP_PAGES_PER_REQUEST \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
#define MMAP_PAGES             \
    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                        \
    (mmap_vstart +                                   \
     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
     ((_seg) * PAGE_SIZE))
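
/*
 * Each in-flight request therefore owns a contiguous window of
 * MMAP_PAGES_PER_REQUEST pages of backend virtual address space, and segment
 * _seg of request _req always maps at the same virtual address. For example,
 * with 4kB pages request 1 starts MMAP_PAGES_PER_REQUEST pages above
 * mmap_vstart, and its segment 2 sits two pages above that.
 */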

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
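
/*
 * pending_ring[] holds the indices of free pending_reqs[] slots: dispatch
 * takes an index at pending_cons, and completion returns it at pending_prod.
 * The indices are free-running counters, so MASK_PEND_IDX relies on
 * MAX_PENDING_REQS being a power of two, and NR_PENDING_REQS is the number
 * of requests currently in flight.
 */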

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static kmem_cache_t *buffer_head_cachep;
#else
static request_queue_t *plugged_queue;
static inline void flush_plugged_queue(void)
{
    request_queue_t *q = plugged_queue;
    if ( q != NULL )
    {
        if ( q->unplug_fn != NULL )
            q->unplug_fn(q);
        blk_put_queue(q);
        plugged_queue = NULL;
    }
}
#endif

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);
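
/*
 * Tear down the mappings in one request's VA window: a single multicall
 * zaps nr_pages PTEs, requesting a TLB flush on the final entry only.
 */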
static void fast_flush_area(int idx, int nr_pages)
{
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
    int               i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op      = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
        mcl[i].args[1] = 0;
        mcl[i].args[2] = 0;
    }

    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;
    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
        BUG();
}


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}
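
/*
 * List membership is tested lock-free first and re-checked under
 * blkio_schedule_list_lock before the list is modified, so the common
 * already-queued / already-removed cases avoid taking the lock at all.
 */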
static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
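
/*
 * blkio_schedule() runs as a kernel thread ("xenblkd" on 2.6). It sleeps
 * until there is work queued and a free pending_req slot, then round-robins
 * over the scheduled interfaces, pulling at most BATCH_PER_DOMAIN requests
 * from each before re-queueing it, and finally unplugs the block layer so
 * the whole batch is pushed to the disc.
 */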
static int blkio_schedule(void *arg)
{
    DECLARE_WAITQUEUE(wq, current);

    blkif_t          *blkif;
    struct list_head *ent;

    daemonize(
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
        "xenblkd"
#endif
        );

    for ( ; ; )
    {
        /* Wait for work to do. */
        add_wait_queue(&blkio_schedule_wait, &wq);
        set_current_state(TASK_INTERRUPTIBLE);
        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) ||
             list_empty(&blkio_schedule_list) )
            schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&blkio_schedule_wait, &wq);

        /* Queue up a batch of requests. */
        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
                !list_empty(&blkio_schedule_list) )
        {
            ent = blkio_schedule_list.next;
            blkif = list_entry(ent, blkif_t, blkdev_list);
            blkif_get(blkif);
            remove_from_blkdev_list(blkif);
            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
                add_to_blkdev_list_tail(blkif);
            blkif_put(blkif);
        }

        /* Push the batch through to disc. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
        run_task_queue(&tq_disk);
#else
        flush_plugged_queue();
#endif
    }
}

static void maybe_trigger_blkio_schedule(void)
{
    /*
     * Needed so that two processes, who together make the following predicate
     * true, don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&blkio_schedule_list) )
        wake_up(&blkio_schedule_wait);
}



/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
    unsigned long flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        fast_flush_area(pending_idx, pending_req->nr_pages);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_blkio_schedule();
    }
}
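
/*
 * On 2.6 the bio end_io callback may be invoked for partial completions:
 * while bio->bi_size is still non-zero further callbacks are expected, so we
 * return early and defer the real completion work (and the bio_put()) until
 * the bio has fully completed. The 2.4 buffer_head path has no partial case.
 */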
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    __end_block_io_op(bh->b_private, uptodate);
    kmem_cache_free(buffer_head_cachep, bh);
}
#else
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
    if ( bio->bi_size != 0 )
        return 1;
    __end_block_io_op(bio->bi_private, !error);
    bio_put(bio);
    return error;
}
#endif


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_blkio_schedule();
    return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */
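
/*
 * Consume up to max_to_do requests from the shared ring, stopping early if
 * the pending-request pool is exhausted. Returns non-zero if requests were
 * left on the ring, so the interface gets re-queued for another pass.
 */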
static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_ring_t *blk_ring = blkif->blk_ring_base;
    blkif_request_t *req;
    BLKIF_RING_IDX i, rp;
    int more_to_do = 0;

    rp = blk_ring->req_prod;
    rmb(); /* Ensure we see queued requests up to 'rp'. */

    /* Take items off the comms ring, taking care not to overflow. */
    for ( i = blkif->blk_req_cons;
          (i != rp) && ((i-blkif->blk_resp_prod) != BLKIF_RING_SIZE);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        req = &blk_ring->ring[MASK_BLKIF_IDX(i)].req;
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    blk_ring->ring[i].req.operation);
            make_response(blkif, blk_ring->ring[i].req.id,
                          blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blkif->blk_req_cons = i;
    return more_to_do;
}

static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int rsp = BLKIF_RSP_ERROR;
    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];

    /* We expect one buffer only. */
    if ( unlikely(req->nr_segments != 1) )
        goto out;

    /* Make sure the buffer is page-sized. */
    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
        goto out;

    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, blkif->domid) )
        goto out;

    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0),
                    PAGE_SIZE / sizeof(vdisk_t));

 out:
    fast_flush_area(pending_idx, 1);
    make_response(blkif, req->id, req->operation, rsp);
}

static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    short nr_sects;
    unsigned long buffer, fas;
    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
    unsigned long  remap_prot;
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];

    /* We map virtual scatter/gather segments to physical segments. */
    int new_segs, nr_psegs = 0;
    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];

    /* Check that number of segments is sane. */
    if ( unlikely(req->nr_segments == 0) ||
         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    /*
     * Check each address/size pair is sane, and convert into a
     * physical device and block offset. Note that if the offset and size
     * crosses a virtual extent boundary, we may end up with more
     * physical scatter/gather segments than virtual segments.
     */
    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
    {
        fas      = req->frame_and_sects[i];
        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( nr_sects <= 0 )
            goto bad_descriptor;

        phys_seg[nr_psegs].dev           = req->device;
        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
        phys_seg[nr_psegs].buffer        = buffer;
        phys_seg[nr_psegs].nr_sects      = nr_sects;

        /* Translate the request into the relevant 'physical device' */
        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
        if ( new_segs < 0 )
        {
            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                    operation == READ ? "read" : "write",
                    req->sector_number + tot_sects,
                    req->sector_number + tot_sects + nr_sects,
                    req->device);
            goto bad_descriptor;
        }

        nr_psegs += new_segs;
        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
    }

    /* Nonsensical zero-sized request? */
    if ( unlikely(nr_psegs == 0) )
        goto bad_descriptor;
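
    /*
     * Map the guest's buffer pages into this request's VA window. Note the
     * mapping is writable for READ requests, since the data read from disc
     * is written into the foreign pages.
     */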
    if ( operation == READ )
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
    else
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;

    for ( i = 0; i < nr_psegs; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
        mcl[i].args[2] = 0;
        mcl[i].args[3] = blkif->domid;

        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT);
    }

    if ( unlikely(HYPERVISOR_multicall(mcl, nr_psegs) != 0) )
        BUG();

    for ( i = 0; i < nr_psegs; i++ )
    {
        if ( unlikely(mcl[i].args[5] != 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nr_psegs);
            goto bad_descriptor;
        }
    }

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nr_psegs;
    atomic_set(&pending_req->pendcnt, nr_psegs);
    pending_cons++;

    blkif_get(blkif);

    /* Now we pass each segment down to the real blkdev layer. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    for ( i = 0; i < nr_psegs; i++ )
    {
        struct buffer_head *bh;

        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
        if ( unlikely(bh == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }

        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size    = phys_seg[i].nr_sects << 9;
        bh->b_dev     = phys_seg[i].dev;
        bh->b_rdev    = phys_seg[i].dev;
        bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
        bh->b_data    = (char *)MMAP_VADDR(pending_idx, i) +
            (phys_seg[i].buffer & ~PAGE_MASK);
        bh->b_page    = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io  = end_block_io_op;
        bh->b_private = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);
    }
#else
    for ( i = 0; i < nr_psegs; i++ )
    {
        struct bio *bio;
        request_queue_t *q;

        bio = bio_alloc(GFP_ATOMIC, 1);
        if ( unlikely(bio == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }

        bio->bi_bdev    = phys_seg[i].bdev;
        bio->bi_private = pending_req;
        bio->bi_end_io  = end_block_io_op;
        bio->bi_sector  = phys_seg[i].sector_number;

        bio_add_page(
            bio,
            virt_to_page(MMAP_VADDR(pending_idx, i)),
            phys_seg[i].nr_sects << 9,
            phys_seg[i].buffer & ~PAGE_MASK);

        if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue )
        {
            flush_plugged_queue();
            blk_get_queue(q);
            plugged_queue = q;
        }

        submit_bio(operation, bio);
    }
#endif

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}



/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
    blkif_response_t *resp;
    unsigned long     flags;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = &blkif->blk_ring_base->
        ring[MASK_BLKIF_IDX(blkif->blk_resp_prod)].resp;
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb(); /* Ensure other side can see the response fields. */
    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

static int __init blkif_init(void)
{
    int i;

    if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
         !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();
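
    /*
     * Start with every pending_req slot free: the ring holds the indices
     * 0..MAX_PENDING_REQS-1 and pending_prod is a full ring ahead of
     * pending_cons, so NR_PENDING_REQS evaluates to zero.
     */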
    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;

    spin_lock_init(&blkio_schedule_list_lock);
    INIT_LIST_HEAD(&blkio_schedule_list);

    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
        BUG();

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
#endif

    blkif_ctrlif_init();

    return 0;
}

__initcall(blkif_init);