
linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c @ 4399:404af33b77b7

bitkeeper revision 1.1236.1.171 (424bbaa4hQe-DqWXkaydpYrAVLtpFA)

noop merge.
author kaf24@firebug.cl.cam.ac.uk
date Thu Mar 31 08:53:56 2005 +0000 (2005-03-31)
parents 65e8e0c70597 deebfbea73d9
children fa851e5b369e
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 */

#include "common.h"
#include <asm-xen/evtchn.h>

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

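/*
 * Foreign pages for in-flight requests are mapped into a contiguous window
 * of backend virtual address space starting at mmap_vstart. Each pending
 * request owns MMAP_PAGES_PER_REQUEST consecutive page slots;
 * MMAP_VADDR(req, seg) yields the virtual address of segment 'seg' within
 * request 'req'.
 */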
static unsigned long mmap_vstart;
#define MMAP_PAGES_PER_REQUEST \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
#define MMAP_PAGES             \
    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                        \
    (mmap_vstart +                                   \
     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
     ((_seg) * PAGE_SIZE))

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)

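/*
 * pending_ring[] holds the indices of free pending_reqs[] slots: allocation
 * takes a slot index at pending_cons, completion returns one at pending_prod.
 * Both counters increase monotonically, so NR_PENDING_REQS is the number of
 * requests currently in flight; it starts at zero because blkif_init() sets
 * pending_prod = MAX_PENDING_REQS while pending_cons = 0.
 */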
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static kmem_cache_t *buffer_head_cachep;
#endif

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
/*
 * If the tap driver is used, we may get pages belonging to either the tap
 * or (more likely) the real frontend. The backend must specify which domain
 * a given page belongs to in update_va_mapping though. For the moment,
 * the tap rewrites the ID field of the request to contain the request index
 * and the id of the real front end domain.
 */
#define BLKTAP_COOKIE 0xbeadfeed
static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
#endif

static int  do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);

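/*
 * Tear down the foreign mappings for request slot 'idx': clear the first
 * 'nr_pages' PTEs in its MMAP_VADDR window with a single multicall, with
 * only the final entry requesting a TLB flush so the whole area is
 * invalidated in one go.
 */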
static void fast_flush_area(int idx, int nr_pages)
{
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
    int               i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op      = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i);
        mcl[i].args[1] = 0;
        mcl[i].args[2] = 0;
    }

    mcl[nr_pages-1].args[2] = UVMF_TLB_FLUSH_ALL;
    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
        BUG();
}

/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

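/*
 * The two helpers below use an unlocked fast-path test followed by a
 * re-check under blkio_schedule_list_lock, so the common case avoids the
 * spinlock while add/remove remain race-free. The schedule list holds a
 * reference on every queued blkif (blkif_get on add, blkif_put on remove).
 */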
static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);

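/*
 * Main loop of the service thread ("xenblkd" on 2.6 kernels): sleep until
 * there is both queued work and a free pending_req slot, then drain a batch
 * from each scheduled interface. Each blkif gets at most BATCH_PER_DOMAIN
 * requests per pass and is re-queued at the tail of the list only if it
 * still has work outstanding.
 */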
static int blkio_schedule(void *arg)
{
    DECLARE_WAITQUEUE(wq, current);

    blkif_t          *blkif;
    struct list_head *ent;

    daemonize(
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
        "xenblkd"
#endif
        );

    for ( ; ; )
    {
        /* Wait for work to do. */
        add_wait_queue(&blkio_schedule_wait, &wq);
        set_current_state(TASK_INTERRUPTIBLE);
        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) ||
             list_empty(&blkio_schedule_list) )
            schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&blkio_schedule_wait, &wq);

        /* Queue up a batch of requests. */
        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
                !list_empty(&blkio_schedule_list) )
        {
            ent = blkio_schedule_list.next;
            blkif = list_entry(ent, blkif_t, blkdev_list);
            blkif_get(blkif);
            remove_from_blkdev_list(blkif);
            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
                add_to_blkdev_list_tail(blkif);
            blkif_put(blkif);
        }

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
        /* Push the batch through to disc. */
        run_task_queue(&tq_disk);
#endif
    }
}

static void maybe_trigger_blkio_schedule(void)
{
    /*
     * Needed so that two processes that together make the following predicate
     * true don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&blkio_schedule_list) )
        wake_up(&blkio_schedule_wait);
}

/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

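/*
 * Shared completion path for both the 2.4 buffer_head and 2.6 bio callbacks
 * below. Only the final completion of a request (pendcnt reaching zero)
 * unmaps the foreign pages, queues the response, drops the blkif reference,
 * returns the slot index to the free ring, and pokes the scheduler in case
 * it was throttled at MAX_PENDING_REQS.
 */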
static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
    unsigned long flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        fast_flush_area(pending_idx, pending_req->nr_pages);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_blkio_schedule();
    }
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    __end_block_io_op(bh->b_private, uptodate);
    kmem_cache_free(buffer_head_cachep, bh);
}
#else
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
    if ( done || error )
        __end_block_io_op(bio->bi_private, (done && !error));
    bio_put(bio);
    return error;
}
#endif

/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

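/*
 * Event-channel upcall from a frontend: queue the interface for service and
 * wake xenblkd. The requests themselves are pulled off the shared ring
 * later, in do_block_io_op().
 */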
irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_blkio_schedule();
    return IRQ_HANDLED;
}

/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

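/*
 * Pull up to max_to_do requests off blkif's shared ring and dispatch them.
 * Returns nonzero if the batch limit or the global MAX_PENDING_REQS limit
 * stopped processing early, in which case the caller re-queues the
 * interface for another pass.
 */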
static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;
    blkif_request_t *req;
    RING_IDX i, rp;
    int more_to_do = 0;

    rp = blk_ring->sring->req_prod;
    rmb(); /* Ensure we see queued requests up to 'rp'. */

    for ( i = blk_ring->req_cons;
          (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        req = RING_GET_REQUEST(blk_ring, i);
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blk_ring->req_cons = i;
    return more_to_do;
}

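/*
 * BLKIF_OP_PROBE: the frontend supplies a single whole page into which
 * vbd_probe() writes the array of vdisk_t descriptors for this interface.
 */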
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int rsp = BLKIF_RSP_ERROR;
    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];

    /* We expect one buffer only. */
    if ( unlikely(req->nr_segments != 1) )
        goto out;

    /* Make sure the buffer is page-sized. */
    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
        goto out;

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    /* Grab the real frontend out of the probe message. */
    if (req->frame_and_sects[1] == BLKTAP_COOKIE)
        blkif->is_blktap = 1;
#endif

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0),
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
        goto out;
#else
    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0),
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, blkif->domid) )
        goto out;
#endif

    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0),
                    PAGE_SIZE / sizeof(vdisk_t));

 out:
    fast_flush_area(pending_idx, 1);
    make_response(blkif, req->id, req->operation, rsp);
}

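/*
 * BLKIF_OP_READ/WRITE: validate the descriptor, translate each virtual
 * segment into physical segments via vbd_translate(), map the frontend's
 * pages into this request's MMAP_VADDR window, then hand each segment to
 * the block layer as a buffer_head (2.4) or a single-page bio (2.6).
 */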
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    short nr_sects;
    unsigned long buffer, fas;
    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
    unsigned long remap_prot;
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];

    /* We map virtual scatter/gather segments to physical segments. */
    int new_segs, nr_psegs = 0;
    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];

    /* Check that number of segments is sane. */
    if ( unlikely(req->nr_segments == 0) ||
         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    /*
     * Check each address/size pair is sane, and convert into a
     * physical device and block offset. Note that if the offset and size
     * crosses a virtual extent boundary, we may end up with more
     * physical scatter/gather segments than virtual segments.
     */
    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
    {
        fas      = req->frame_and_sects[i];
        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( nr_sects <= 0 )
            goto bad_descriptor;

        phys_seg[nr_psegs].dev           = req->device;
        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
        phys_seg[nr_psegs].buffer        = buffer;
        phys_seg[nr_psegs].nr_sects      = nr_sects;

        /* Translate the request into the relevant 'physical device' */
        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
        if ( new_segs < 0 )
        {
            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                    operation == READ ? "read" : "write",
                    req->sector_number + tot_sects,
                    req->sector_number + tot_sects + nr_sects,
                    req->device);
            goto bad_descriptor;
        }

        nr_psegs += new_segs;
        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
    }

    /* Nonsensical zero-sized request? */
    if ( unlikely(nr_psegs == 0) )
        goto bad_descriptor;

    if ( operation == READ )
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
    else
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;

    for ( i = 0; i < nr_psegs; i++ )
    {
        mcl[i].op      = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
        mcl[i].args[2] = 0;
#ifdef CONFIG_XEN_BLKDEV_TAP_BE
        mcl[i].args[3] = (blkif->is_blktap) ? ID_TO_DOM(req->id) : blkif->domid;
#else
        mcl[i].args[3] = blkif->domid;
#endif
        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT);
    }

    if ( unlikely(HYPERVISOR_multicall(mcl, nr_psegs) != 0) )
        BUG();

    for ( i = 0; i < nr_psegs; i++ )
    {
        if ( unlikely(mcl[i].args[5] != 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nr_psegs);
            goto bad_descriptor;
        }
    }

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nr_psegs;
    atomic_set(&pending_req->pendcnt, nr_psegs);
    pending_cons++;

    blkif_get(blkif);

    /* Now we pass each segment down to the real blkdev layer. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    for ( i = 0; i < nr_psegs; i++ )
    {
        struct buffer_head *bh;

        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
        if ( unlikely(bh == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }

        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size    = phys_seg[i].nr_sects << 9;
        bh->b_dev     = phys_seg[i].dev;
        bh->b_rdev    = phys_seg[i].dev;
        bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
        bh->b_data    = (char *)MMAP_VADDR(pending_idx, i) +
            (phys_seg[i].buffer & ~PAGE_MASK);
        bh->b_page    = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io  = end_block_io_op;
        bh->b_private = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);
    }
#else
    for ( i = 0; i < nr_psegs; i++ )
    {
        struct bio *bio;

        bio = bio_alloc(GFP_ATOMIC, 1);
        if ( unlikely(bio == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }

        bio->bi_bdev    = phys_seg[i].bdev;
        bio->bi_private = pending_req;
        bio->bi_end_io  = end_block_io_op;
        bio->bi_sector  = phys_seg[i].sector_number;

        bio_add_page(
            bio,
            virt_to_page(MMAP_VADDR(pending_idx, i)),
            phys_seg[i].nr_sects << 9,
            phys_seg[i].buffer & ~PAGE_MASK);

        submit_bio(operation | (1 << BIO_RW_SYNC), bio);
    }
#endif

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}

/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

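/*
 * Queue a response on the shared ring under blk_ring_lock and notify the
 * frontend over its event channel.
 */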
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
    blkif_response_t  *resp;
    unsigned long      flags;
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb(); /* Ensure other side can see the response fields. */
    blk_ring->rsp_prod_pvt++;
    RING_PUSH_RESPONSES(blk_ring);
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

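/*
 * Module initialisation: only runs in a domain privileged to act as a block
 * backend (SIF_INITDOMAIN or SIF_BLK_BE_DOMAIN). Reserves the MMAP_PAGES
 * virtual window, primes the free-slot ring, and starts the xenblkd
 * service thread.
 */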
static int __init blkif_init(void)
{
    int i;

    if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
         !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;

    spin_lock_init(&blkio_schedule_list_lock);
    INIT_LIST_HEAD(&blkio_schedule_list);

    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
        BUG();

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
#endif

    blkif_ctrlif_init();

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n");
#endif

    return 0;
}

__initcall(blkif_init);