direct-io.hg

view xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c @ 1397:23b7fa4bf219

bitkeeper revision 1.891.1.30 (40a6a4f4Ppc2ZqsR4fMzaXrsTKi3SQ)

Fix buffer-head memory leak in blkdev backend driver.
author kaf24@scramble.cl.cam.ac.uk
date Sat May 15 23:17:08 2004 +0000 (2004-05-15)
parents 00059c1948cf
children 9d1374c809c5
line source
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 */

#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

static unsigned long mmap_vstart;
#define MMAP_PAGES_PER_REQUEST \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
#define MMAP_PAGES             \
    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                        \
    (mmap_vstart +                                   \
     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
     ((_seg) * PAGE_SIZE))
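
/*
 * Worked example (illustrative, assuming BLKIF_MAX_SEGMENTS_PER_REQUEST is
 * 11 and 4kB pages): each pending request then owns MMAP_PAGES_PER_REQUEST
 * = 12 consecutive pages, so segment 2 of pending request 3 maps at
 *   MMAP_VADDR(3, 2) == mmap_vstart + (3*12 + 2)*4096 == mmap_vstart + 0x26000.
 */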

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 *
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
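
/*
 * NB. Illustrative sketch only -- these helpers are not used by the driver;
 * they merely restate how the allocation ring above is manipulated.
 * pending_ring[] holds the indices of currently free pending_reqs[] slots:
 * the dispatch paths take one at pending_cons, and the completion path
 * returns it at pending_prod once the whole request has finished.
 */
static inline int take_free_pending_idx_sketch(void)
{
    /* Caller has already checked NR_PENDING_REQS < MAX_PENDING_REQS. */
    return pending_ring[MASK_PEND_IDX(pending_cons++)];
}
static inline void return_pending_idx_sketch(int pending_idx)
{
    unsigned long flags;
    spin_lock_irqsave(&pend_prod_lock, flags);
    pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
    spin_unlock_irqrestore(&pend_prod_lock, flags);
}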

static kmem_cache_t *buffer_head_cachep;

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head io_schedule_list;
static spinlock_t io_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &io_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void io_schedule(unsigned long unused)
{
    blkif_t          *blkif;
    struct list_head *ent;

    /* Queue up a batch of requests. */
    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
            !list_empty(&io_schedule_list) )
    {
        ent = io_schedule_list.next;
        blkif = list_entry(ent, blkif_t, blkdev_list);
        blkif_get(blkif);
        remove_from_blkdev_list(blkif);
        if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
            add_to_blkdev_list_tail(blkif);
        blkif_put(blkif);
    }

    /* Push the batch through to disc. */
    run_task_queue(&tq_disk);
}

static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);

static void maybe_trigger_io_schedule(void)
{
    /*
     * Needed so that two processes, which together make the following
     * predicate true, don't both read stale values and evaluate the
     * predicate incorrectly. Incredibly unlikely to stall the scheduler on
     * x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&io_schedule_list) )
        tasklet_schedule(&io_schedule_tasklet);
}
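
/*
 * NB. The two callers in question are request completion (end_block_io_op(),
 * which frees a pending slot) and frontend notification (blkif_be_int(),
 * which makes io_schedule_list non-empty). The barrier ensures that at least
 * one of them sees the other's update and schedules the tasklet.
 */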


/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    pending_req_t *pending_req = bh->b_private;
    unsigned long  flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        vmfree_area_pages(MMAP_VADDR(pending_idx, 0),
                          MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_io_schedule();
    }
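
    /*
     * NB. Every completed buffer_head is freed back to buffer_head_cachep
     * here, whether or not it was the last outstanding one for its
     * pending_req; without this unconditional free the backend would leak
     * one buffer_head per dispatched segment.
     */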
    kmem_cache_free(buffer_head_cachep, bh);
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_io_schedule();
}


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_ring_t *blk_ring = blkif->blk_ring_base;
    blkif_request_t *req;
    BLK_RING_IDX i;
    int more_to_do = 0;

    /* Take items off the comms ring, taking care not to overflow. */
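    /*
     * NB. The loop stops either when the consumer index catches up with the
     * frontend's req_prod, or when (i - blk_resp_prod) reaches BLK_RING_SIZE,
     * i.e. a full ring's worth of requests has been consumed but not yet
     * answered. The indices are free-running unsigned counters, so the
     * subtraction remains correct across wraparound.
     */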
    for ( i = blkif->blk_req_cons;
          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) !=
                                        BLK_RING_SIZE);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blkif->blk_req_cons = i;
    return more_to_do;
}

static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int i, rc, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pgprot_t prot;

    /* Check that number of segments is sane. */
    if ( unlikely(req->nr_segments == 0) ||
         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
    for ( i = 0; i < req->nr_segments; i++ )
    {
        /* Make sure the buffer is page-sized. */
        if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) ||
             (blkif_last_sect(req->frame_and_sects[i]) != 7) )
            goto bad_descriptor;
        rc = direct_remap_area_pages(&init_mm,
                                     MMAP_VADDR(pending_idx, i),
                                     req->frame_and_sects[i] & PAGE_MASK,
                                     PAGE_SIZE, prot, blkif->domid);
        if ( rc != 0 )
            goto bad_descriptor;
    }

    rc = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0),
                   (req->nr_segments * PAGE_SIZE) / sizeof(vdisk_t));

    vmfree_area_pages(MMAP_VADDR(pending_idx, 0),
                      MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
    make_response(blkif, req->id, req->operation, rc);
    return;

 bad_descriptor:
    vmfree_area_pages(MMAP_VADDR(pending_idx, 0),
                      MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}

static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
    struct buffer_head *bh;
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    short nr_sects;
    unsigned long buffer, fas;
    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
    pgprot_t prot;

    /* We map virtual scatter/gather segments to physical segments. */
    int new_segs, nr_psegs = 0;
    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];

    /* Check that number of segments is sane. */
    if ( unlikely(req->nr_segments == 0) ||
         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    /*
     * Check each address/size pair is sane, and convert into a
     * physical device and block offset. Note that if the offset and size
     * crosses a virtual extent boundary, we may end up with more
     * physical scatter/gather segments than virtual segments.
     */
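    /*
     * Worked example (illustrative, assuming 512-byte sectors): an entry
     * with first_sect == 2 and last_sect == 5 describes sectors 2-5 of that
     * frame (bytes 1024-3071), so buffer = (fas & PAGE_MASK) | (2 << 9) and
     * nr_sects = 5 - 2 + 1 = 4.
     */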
    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
    {
        fas      = req->frame_and_sects[i];
        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( nr_sects <= 0 )
            goto bad_descriptor;

        phys_seg[nr_psegs].dev           = req->device;
        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
        phys_seg[nr_psegs].buffer        = buffer;
        phys_seg[nr_psegs].nr_sects      = nr_sects;

        /* Translate the request into the relevant 'physical device' */
        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
        if ( new_segs < 0 )
        {
            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                    operation == READ ? "read" : "write",
                    req->sector_number + tot_sects,
                    req->sector_number + tot_sects + nr_sects,
                    req->device);
            goto bad_descriptor;
        }

        nr_psegs += new_segs;
        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
    }

    /* Nonsensical zero-sized request? */
    if ( unlikely(nr_psegs == 0) )
        goto bad_descriptor;

    if ( operation == READ )
        prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
    else
        prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED);

    for ( i = 0; i < nr_psegs; i++ )
    {
        int rc = direct_remap_area_pages(&init_mm,
                                         MMAP_VADDR(pending_idx, i),
                                         phys_seg[i].buffer & PAGE_MASK,
                                         PAGE_SIZE, prot, blkif->domid);
        if ( rc != 0 )
        {
            DPRINTK("invalid buffer\n");
            vmfree_area_pages(MMAP_VADDR(pending_idx, 0),
                              MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
            goto bad_descriptor;
        }
        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            phys_seg[i].buffer >> PAGE_SHIFT;
    }
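
    /*
     * NB. The frames just mapped belong to the guest, so the
     * phys_to_machine_mapping[] entries covering this mmap region are
     * updated to point at them; later virtual-to-machine translations of
     * bh->b_data (e.g. when a driver sets up DMA) then resolve to the
     * guest's machine frames.
     */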

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    atomic_set(&pending_req->pendcnt, nr_psegs);
    pending_cons++;

    blkif_get(blkif);

    /* Now we pass each segment down to the real blkdev layer. */
    for ( i = 0; i < nr_psegs; i++ )
    {
        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
        if ( unlikely(bh == NULL) )
            panic("bh is null\n");
        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size    = phys_seg[i].nr_sects << 9;
        bh->b_dev     = phys_seg[i].dev;
        bh->b_rdev    = phys_seg[i].dev;
        bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
        bh->b_data    = (char *)MMAP_VADDR(pending_idx, i) +
            (phys_seg[i].buffer & ~PAGE_MASK);
        // bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io  = end_block_io_op;
        bh->b_private = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);
    }

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}


/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
    blkif_response_t *resp;
    unsigned long     flags;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = &blkif->blk_ring_base->
        ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
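    /*
     * NB. The write barrier below ensures the response fields are visible
     * to the frontend before it can observe the updated resp_prod.
     */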
    wmb();
    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

static int __init init_module(void)
{
    int i;

    if ( !(start_info.flags & SIF_INITDOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;
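    /*
     * NB. After the loop above, pending_prod == MAX_PENDING_REQS,
     * pending_cons == 0 and ring slot i holds index i, so NR_PENDING_REQS
     * starts at zero and every pending_reqs[] entry is initially free.
     */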

    spin_lock_init(&io_schedule_list_lock);
    INIT_LIST_HEAD(&io_schedule_list);

    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);

    blkif_ctrlif_init();

    return 0;
}

static void cleanup_module(void)
{
    BUG();
}

module_init(init_module);
module_exit(cleanup_module);