direct-io.hg

linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c @ 8121:58d46463413e

GNTTABOP_map_grant_ref now returns the error status and the grant handle as
separate fields. Update callers for the new interface. Also use int16_t as the
standard error-code type on all public interfaces.

Signed-off-by: Keir Fraser <keir@xensource.com>
author:   kaf24@firebug.cl.cam.ac.uk
date:     Wed Nov 30 17:24:27 2005 +0100 (2005-11-30)
parents:  ff95b53bd39a
children: ca236a81729d
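
For reference, a minimal sketch of the updated calling convention as it is
exercised by dispatch_rw_block_io below (nseg, pending_idx and pending_handle
are the names used in that function):

	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int ret, i;

	/* host_addr, dom, ref and flags filled in for each of 'nseg' entries */
	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
	BUG_ON(ret);

	for (i = 0; i < nseg; i++) {
		if (map[i].status != 0)	/* int16_t error code; 0 on success */
			continue;	/* this segment failed to map */
		/* handle is now a separate field; save it for the later unmap */
		pending_handle(pending_idx, i) = map[i].handle;
	}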
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 */

#include <linux/spinlock.h>
#include <asm-xen/balloon.h>
#include <asm/hypervisor.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

static unsigned long mmap_vstart;
#define MMAP_PAGES						\
	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#ifdef __ia64__
static void *pending_vaddrs[MMAP_PAGES];
#define MMAP_VADDR(_idx, _i) \
	(unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#else
#define MMAP_VADDR(_req,_seg)						\
	(mmap_vstart +							\
	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
	 ((_seg) * PAGE_SIZE))
#endif

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
	blkif_t       *blkif;
	unsigned long  id;
	int            nr_pages;
	atomic_t       pendcnt;
	unsigned short operation;
	int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
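/*
 * NR_PENDING_REQS is the number of requests currently in flight: slots handed
 * out (pending_cons advanced) but not yet returned to the free ring
 * (pending_prod advanced).
 */

/*
 * At most one block-layer queue is kept plugged while a batch of requests is
 * being built. flush_plugged_queue() unplugs it, pushing the batched bios to
 * the device, and drops the queue reference taken when the queue was recorded.
 */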
static request_queue_t *plugged_queue;
static inline void flush_plugged_queue(void)
{
	request_queue_t *q = plugged_queue;
	if (q != NULL) {
		if (q->unplug_fn != NULL)
			q->unplug_fn(q);
		blk_put_queue(q);
		plugged_queue = NULL;
	}
}

/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
static grant_handle_t pending_grant_handles[MMAP_PAGES];
#define pending_handle(_idx, _i) \
	(pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#define BLKBACK_INVALID_HANDLE (~0)

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
/*
 * If the tap driver is used, we may get pages belonging to either the tap
 * or (more likely) the real frontend. The backend must specify which domain
 * a given page belongs to in update_va_mapping though. For the moment,
 * the tap rewrites the ID field of the request to contain the request index
 * and the id of the real front end domain.
 */
#define BLKTAP_COOKIE 0xbeadfeed
static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
#endif

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
			  unsigned short op, int st);
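
/*
 * Tear down the grant mappings for pending-request slot 'idx': unmap every
 * page that was successfully mapped and reset its saved handle to
 * BLKBACK_INVALID_HANDLE. Called once all I/O on the slot has completed, or
 * when request setup fails part-way through.
 */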
static void fast_flush_area(int idx, int nr_pages)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int i, invcount = 0;
	grant_handle_t handle;
	int ret;

	for (i = 0; i < nr_pages; i++) {
		handle = pending_handle(idx, i);
		if (handle == BLKBACK_INVALID_HANDLE)
			continue;
		unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
		unmap[invcount].dev_bus_addr = 0;
		unmap[invcount].handle       = handle;
		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
		invcount++;
	}

	ret = HYPERVISOR_grant_table_op(
		GNTTABOP_unmap_grant_ref, unmap, invcount);
	BUG_ON(ret);
}


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;
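/*
 * blkio_schedule_list holds the interfaces that have, or may have, requests
 * outstanding on their shared rings. The helpers below use an unlocked check
 * followed by a re-check under blkio_schedule_list_lock, so the common
 * "already in the right state" path avoids taking the lock.
 */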
static int __on_blkdev_list(blkif_t *blkif)
{
	return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
	unsigned long flags;

	if (!__on_blkdev_list(blkif))
		return;

	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
	if (__on_blkdev_list(blkif)) {
		list_del(&blkif->blkdev_list);
		blkif->blkdev_list.next = NULL;
		blkif_put(blkif);
	}
	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
	unsigned long flags;

	if (__on_blkdev_list(blkif))
		return;

	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
		blkif_get(blkif);
	}
	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
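
/*
 * xenblkd: kernel thread that services the scheduling list. It sleeps until
 * there is work and a free pending-request slot, then round-robins over the
 * queued interfaces, pulling up to BATCH_PER_DOMAIN requests from each before
 * unplugging the block-layer queue to push the whole batch to disc.
 */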
static int blkio_schedule(void *arg)
{
	DECLARE_WAITQUEUE(wq, current);

	blkif_t          *blkif;
	struct list_head *ent;

	daemonize("xenblkd");

	for (;;) {
		/* Wait for work to do. */
		add_wait_queue(&blkio_schedule_wait, &wq);
		set_current_state(TASK_INTERRUPTIBLE);
		if ((NR_PENDING_REQS == MAX_PENDING_REQS) ||
		    list_empty(&blkio_schedule_list))
			schedule();
		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&blkio_schedule_wait, &wq);

		/* Queue up a batch of requests. */
		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
		       !list_empty(&blkio_schedule_list)) {
			ent = blkio_schedule_list.next;
			blkif = list_entry(ent, blkif_t, blkdev_list);
			blkif_get(blkif);
			remove_from_blkdev_list(blkif);
			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
				add_to_blkdev_list_tail(blkif);
			blkif_put(blkif);
		}

		/* Push the batch through to disc. */
		flush_plugged_queue();
	}
}

static void maybe_trigger_blkio_schedule(void)
{
	/*
	 * Needed so that two processes, which together make the following
	 * predicate true, don't both read stale values and evaluate the
	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
	 * on x86, but...
	 */
	smp_mb();

	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
	    !list_empty(&blkio_schedule_list))
		wake_up(&blkio_schedule_wait);
}


/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
	unsigned long flags;

	/* An error fails the entire request. */
	if (!uptodate) {
		DPRINTK("Buffer not up-to-date at end of operation\n");
		pending_req->status = BLKIF_RSP_ERROR;
	}

	if (atomic_dec_and_test(&pending_req->pendcnt)) {
		int pending_idx = pending_req - pending_reqs;
		fast_flush_area(pending_idx, pending_req->nr_pages);
		make_response(pending_req->blkif, pending_req->id,
			      pending_req->operation, pending_req->status);
		blkif_put(pending_req->blkif);
		spin_lock_irqsave(&pend_prod_lock, flags);
		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
		spin_unlock_irqrestore(&pend_prod_lock, flags);
		maybe_trigger_blkio_schedule();
	}
}
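
/*
 * bio completion callback. The block layer may report partial progress; a
 * non-zero bi_size means the bio is not yet fully complete, so only the final
 * call retires the request.
 */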
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
	if (bio->bi_size != 0)
		return 1;
	__end_block_io_op(bio->bi_private, !error);
	bio_put(bio);
	return error;
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
	blkif_t *blkif = dev_id;
	add_to_blkdev_list_tail(blkif);
	maybe_trigger_blkio_schedule();
	return IRQ_HANDLED;
}


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */
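
/*
 * Pull up to 'max_to_do' requests off this interface's shared ring and
 * dispatch them. Returns non-zero if requests remain on the ring, so the
 * caller can re-queue the interface at the tail of the scheduling list.
 */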
static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
	blkif_request_t *req;
	RING_IDX i, rp;
	int more_to_do = 0;

	rp = blk_ring->sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	for (i = blk_ring->req_cons;
	     (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
	     i++) {
		if ((max_to_do-- == 0) ||
		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
			more_to_do = 1;
			break;
		}

		req = RING_GET_REQUEST(blk_ring, i);
		switch (req->operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			dispatch_rw_block_io(blkif, req);
			break;

		default:
			DPRINTK("error: unknown block io operation [%d]\n",
				req->operation);
			make_response(blkif, req->id, req->operation,
				      BLKIF_RSP_ERROR);
			break;
		}
	}

	blk_ring->req_cons = i;
	return more_to_do;
}
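
/*
 * Handle a single read/write request: map the granted frames into the
 * backend's address space, translate the virtual-device extent via
 * vbd_translate(), assemble bios for the mapped segments and submit them.
 * Any failure results in an immediate BLKIF_RSP_ERROR response.
 */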
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
	pending_req_t *pending_req;
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct phys_req preq;
	struct {
		unsigned long buf; unsigned int nsec;
	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int nseg;
	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int nbio = 0;
	request_queue_t *q;
	int ret, errors = 0;

	/* Check that number of segments is sane. */
	nseg = req->nr_segments;
	if (unlikely(nseg == 0) ||
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		DPRINTK("Bad number of segments in request (%d)\n", nseg);
		goto bad_descriptor;
	}

	preq.dev           = req->handle;
	preq.sector_number = req->sector_number;
	preq.nr_sects      = 0;

	for (i = 0; i < nseg; i++) {
		seg[i].nsec = req->seg[i].last_sect -
			req->seg[i].first_sect + 1;

		if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (seg[i].nsec <= 0))
			goto bad_descriptor;
		preq.nr_sects += seg[i].nsec;

		map[i].host_addr = MMAP_VADDR(pending_idx, i);
		map[i].dom       = blkif->domid;
		map[i].ref       = req->seg[i].gref;
		map[i].flags     = GNTMAP_host_map;
		if (operation == WRITE)
			map[i].flags |= GNTMAP_readonly;
	}
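
	/*
	 * With the new GNTTABOP_map_grant_ref interface each entry returns an
	 * int16_t error status and the grant handle in separate fields:
	 * status == 0 means the frame was mapped and map[i].handle must be
	 * kept for the later GNTTABOP_unmap_grant_ref.
	 */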
	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
	BUG_ON(ret);

	for (i = 0; i < nseg; i++) {
		if (likely(map[i].status == 0)) {
			pending_handle(pending_idx, i) = map[i].handle;
#ifdef __ia64__
			MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
#else
			set_phys_to_machine(__pa(MMAP_VADDR(
				pending_idx, i)) >> PAGE_SHIFT,
				FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
#endif
			seg[i].buf = map[i].dev_bus_addr |
				(req->seg[i].first_sect << 9);
		} else {
			errors++;
		}
	}

	if (errors) {
		DPRINTK("invalid buffer -- could not remap it\n");
		fast_flush_area(pending_idx, nseg);
		goto bad_descriptor;
	}

	if (vbd_translate(&preq, blkif, operation) != 0) {
		DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
			operation == READ ? "read" : "write",
			preq.sector_number,
			preq.sector_number + preq.nr_sects, preq.dev);
		goto bad_descriptor;
	}
	pending_req = &pending_reqs[pending_idx];
	pending_req->blkif     = blkif;
	pending_req->id        = req->id;
	pending_req->operation = operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;

	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number|(int)seg[i].nsec) &
		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
			DPRINTK("Misaligned I/O request from domain %d\n",
				blkif->domid);
			goto cleanup_and_fail;
		}

		while ((bio == NULL) ||
		       (bio_add_page(bio,
				     virt_to_page(MMAP_VADDR(pending_idx, i)),
				     seg[i].nsec << 9,
				     seg[i].buf & ~PAGE_MASK) == 0)) {
			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
			if (unlikely(bio == NULL)) {
			cleanup_and_fail:
				for (i = 0; i < (nbio-1); i++)
					bio_put(biolist[i]);
				fast_flush_area(pending_idx, nseg);
				goto bad_descriptor;
			}

			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
			bio->bi_sector  = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}
	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
		flush_plugged_queue();
		blk_get_queue(q);
		plugged_queue = q;
	}

	atomic_set(&pending_req->pendcnt, nbio);
	pending_cons++;
	blkif_get(blkif);

	for (i = 0; i < nbio; i++)
		submit_bio(operation, biolist[i]);

	return;

 bad_descriptor:
	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}


/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */
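
/*
 * Queue a response on the shared ring for request 'id' and notify the
 * frontend via its event channel. Uses the irqsave form of the ring lock
 * because responses can be generated from the bio completion path.
 */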
static void make_response(blkif_t *blkif, unsigned long id,
			  unsigned short op, int st)
{
	blkif_response_t *resp;
	unsigned long     flags;
	blkif_back_ring_t *blk_ring = &blkif->blk_ring;

	/* Place on the response ring for the relevant domain. */
	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
	resp->id        = id;
	resp->operation = op;
	resp->status    = st;
	wmb(); /* Ensure other side can see the response fields. */
	blk_ring->rsp_prod_pvt++;
	RING_PUSH_RESPONSES(blk_ring);
	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

	/* Kick the relevant domain. */
	notify_remote_via_irq(blkif->irq);
}

void blkif_deschedule(blkif_t *blkif)
{
	remove_from_blkdev_list(blkif);
}
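
/*
 * Module initialisation: reserve an empty page range for mapping foreign
 * frames, set up the pending-request free ring and the scheduling list,
 * start the xenblkd thread and register the xenbus backend.
 */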
static int __init blkif_init(void)
{
	int i;
	struct page *page;
	int ret;

	for (i = 0; i < MMAP_PAGES; i++)
		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

	if (xen_init() < 0)
		return -ENODEV;

	blkif_interface_init();

#ifdef __ia64__
	{
		extern unsigned long alloc_empty_foreign_map_page_range(
			unsigned long pages);
		int i;

		mmap_vstart = alloc_empty_foreign_map_page_range(MMAP_PAGES);
		printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
		for (i = 0; i < MMAP_PAGES; i++)
			pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
		BUG_ON(mmap_vstart == 0);
	}
#else
	page = balloon_alloc_empty_page_range(MMAP_PAGES);
	BUG_ON(page == NULL);
	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
#endif

	pending_cons = 0;
	pending_prod = MAX_PENDING_REQS;
	memset(pending_reqs, 0, sizeof(pending_reqs));
	for (i = 0; i < MAX_PENDING_REQS; i++)
		pending_ring[i] = i;

	spin_lock_init(&blkio_schedule_list_lock);
	INIT_LIST_HEAD(&blkio_schedule_list);

	ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
	BUG_ON(ret < 0);

	blkif_xenbus_init();

	return 0;
}

__initcall(blkif_init);

/*
 * Local variables:
 *  c-file-style: "linux"
 *  indent-tabs-mode: t
 *  c-indent-level: 8
 *  c-basic-offset: 8
 *  tab-width: 8
 * End:
 */