
linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c @ 1776:c2f673cea5e4

bitkeeper revision 1.1072.1.1 (40f4e51fLMgcKX4Sn6FNYePX6EqkGA)

Merge http://xen.bkbits.net:8080/xeno-unstable.bk
into gandalf.hpl.hp.com:/var/bk/xeno-unstable.bk
author xenbk@gandalf.hpl.hp.com
date Wed Jul 14 07:47:43 2004 +0000 (2004-07-14)
parents 5bfc0d01717c c82c495264af
children e91945007886
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 */

#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

/*
 * NB. We place a page of padding between each buffer page to avoid incorrect
 * merging of requests by the IDE and SCSI merging routines. Otherwise, two
 * adjacent buffers in a scatter-gather request would have adjacent page
 * numbers: since the merge routines don't realise that this is in *pseudophys*
 * space, not real space, they may collapse the s-g elements!
 */
static unsigned long mmap_vstart;
#define MMAP_PAGES_PER_REQUEST \
    (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1))
#define MMAP_PAGES             \
    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                        \
    (mmap_vstart +                                   \
     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
     ((_seg) * 2 * PAGE_SIZE))
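/*
 * Worked example (illustrative; assumes BLKIF_MAX_SEGMENTS_PER_REQUEST == 11):
 * each request then owns 2*(11+1) == 24 page slots, so MMAP_VADDR(1,2) is
 * mmap_vstart + 24*PAGE_SIZE + 4*PAGE_SIZE. Data pages occupy the even slots;
 * the odd slot after each one is the unmapped padding page described above.
 */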
/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
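/*
 * NB. MASK_PEND_IDX relies on MAX_PENDING_REQS being a power of two. Free
 * slot indices are produced into pending_ring on completion (pending_prod)
 * and consumed on dispatch (pending_cons), so NR_PENDING_REQS counts the
 * requests currently in flight and hits MAX_PENDING_REQS when the free ring
 * is empty.
 */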
static kmem_cache_t *buffer_head_cachep;

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);
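/*
 * fast_flush_area() zaps the temporary mappings covering one pending request
 * with a single batched multicall; only the final update_va_mapping entry
 * requests a TLB flush, so the whole area is invalidated with one flush
 * rather than one per page.
 */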
static void fast_flush_area(int idx, int nr_pages)
{
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
    int               i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op      = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
        mcl[i].args[1] = 0;
        mcl[i].args[2] = 0;
    }

    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;
    (void)HYPERVISOR_multicall(mcl, nr_pages);
}
/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */
static struct list_head io_schedule_list;
static spinlock_t io_schedule_list_lock;
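/*
 * A blkif is "on the list" iff blkdev_list.next is non-NULL. The helpers
 * below test that cheaply outside the lock, then retest it under
 * io_schedule_list_lock before touching the list, so a racing add/remove
 * cannot corrupt it. List membership holds a reference via blkif_get().
 */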
static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &io_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}
/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(io_schedule_wait);
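/*
 * io_schedule() runs as a kernel thread. It sleeps on io_schedule_wait until
 * there is both queued work and a free pending_req slot, then round-robins
 * over the interfaces on io_schedule_list, pulling up to BATCH_PER_DOMAIN
 * requests from each before kicking the block layer with run_task_queue().
 * Registering on the wait queue before testing the condition avoids a lost
 * wakeup.
 */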
static int io_schedule(void *arg)
{
    DECLARE_WAITQUEUE(wq, current);

    blkif_t          *blkif;
    struct list_head *ent;

    for ( ; ; )
    {
        /* Wait for work to do. */
        add_wait_queue(&io_schedule_wait, &wq);
        set_current_state(TASK_INTERRUPTIBLE);
        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) ||
             list_empty(&io_schedule_list) )
            schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&io_schedule_wait, &wq);

        /* Queue up a batch of requests. */
        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
                !list_empty(&io_schedule_list) )
        {
            ent = io_schedule_list.next;
            blkif = list_entry(ent, blkif_t, blkdev_list);
            blkif_get(blkif);
            remove_from_blkdev_list(blkif);
            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
                add_to_blkdev_list_tail(blkif);
            blkif_put(blkif);
        }

        /* Push the batch through to disc. */
        run_task_queue(&tq_disk);
    }
}
static void maybe_trigger_io_schedule(void)
{
    /*
     * Needed so that two processes that together make the following predicate
     * true don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&io_schedule_list) )
        wake_up(&io_schedule_wait);
}
/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
    unsigned long flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        fast_flush_area(pending_idx, pending_req->nr_pages);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_io_schedule();
    }
}

static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    __end_block_io_op(bh->b_private, uptodate);
    kmem_cache_free(buffer_head_cachep, bh);
}
/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_io_schedule();
}


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */
static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_ring_t *blk_ring = blkif->blk_ring_base;
    blkif_request_t *req;
    BLKIF_RING_IDX i;
    int more_to_do = 0;

    /* Take items off the comms ring, taking care not to overflow. */
    for ( i = blkif->blk_req_cons;
          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) !=
                                        BLKIF_RING_SIZE);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        req = &blk_ring->ring[MASK_BLKIF_IDX(i)].req;
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blkif->blk_req_cons = i;
    return more_to_do;
}
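/*
 * Each frame_and_sects entry packs a page frame with first/last 512-byte
 * sector numbers within that page; insisting on first == 0 and last == 7
 * means the probe buffer must be exactly one whole 4kB page.
 */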
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int rsp = BLKIF_RSP_ERROR;
    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];

    /* We expect one buffer only. */
    if ( unlikely(req->nr_segments != 1) )
        goto out;

    /* Make sure the buffer is page-sized. */
    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
        goto out;

    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, blkif->domid) )
        goto out;

    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0),
                    PAGE_SIZE / sizeof(vdisk_t));

 out:
    fast_flush_area(pending_idx, 1);
    make_response(blkif, req->id, req->operation, rsp);
}
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
    struct buffer_head *bh;
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    short nr_sects;
    unsigned long buffer, fas;
    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
    unsigned long  remap_prot;
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];

    /* We map virtual scatter/gather segments to physical segments. */
    int new_segs, nr_psegs = 0;
    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];

    /* Check that number of segments is sane. */
    if ( unlikely(req->nr_segments == 0) ||
         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    /*
     * Check that each address/size pair is sane, and convert it into a
     * physical device and block offset. Note that if the offset and size
     * cross a virtual extent boundary, we may end up with more physical
     * scatter/gather segments than virtual segments.
     */
    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
    {
        fas      = req->frame_and_sects[i];
        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( nr_sects <= 0 )
            goto bad_descriptor;

        phys_seg[nr_psegs].dev           = req->device;
        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
        phys_seg[nr_psegs].buffer        = buffer;
        phys_seg[nr_psegs].nr_sects      = nr_sects;

        /* Translate the request into the relevant 'physical device' */
        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
        if ( new_segs < 0 )
        {
            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                    operation == READ ? "read" : "write",
                    req->sector_number + tot_sects,
                    req->sector_number + tot_sects + nr_sects,
                    req->device);
            goto bad_descriptor;
        }

        nr_psegs += new_segs;
        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
    }

    /* Nonsensical zero-sized request? */
    if ( unlikely(nr_psegs == 0) )
        goto bad_descriptor;

    if ( operation == READ )
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
    else
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;

    for ( i = 0; i < nr_psegs; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
        mcl[i].args[2] = 0;
        mcl[i].args[3] = blkif->domid;

        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            phys_seg[i].buffer >> PAGE_SHIFT;
    }

    (void)HYPERVISOR_multicall(mcl, nr_psegs);
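    /*
     * A per-call status appears to be written back into each multicall entry
     * (read from args[5] below); any non-zero value means that segment's
     * foreign page could not be remapped, so the area is flushed and the
     * request is failed.
     */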
    for ( i = 0; i < nr_psegs; i++ )
    {
        if ( unlikely(mcl[i].args[5] != 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nr_psegs);
            goto bad_descriptor;
        }
    }

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nr_psegs;
    atomic_set(&pending_req->pendcnt, nr_psegs);
    pending_cons++;

    blkif_get(blkif);

    /* Now we pass each segment down to the real blkdev layer. */
    for ( i = 0; i < nr_psegs; i++ )
    {
        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
        if ( unlikely(bh == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }
        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size    = phys_seg[i].nr_sects << 9;
        bh->b_dev     = phys_seg[i].dev;
        bh->b_rdev    = phys_seg[i].dev;
        bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
        bh->b_data    = (char *)MMAP_VADDR(pending_idx, i) +
            (phys_seg[i].buffer & ~PAGE_MASK);
        bh->b_page    = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io  = end_block_io_op;
        bh->b_private = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);
    }

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}
/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */
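/*
 * make_response() publishes a response on the shared ring: the entry is
 * filled in first, the write barrier makes it visible to the front end, and
 * only then is resp_prod advanced and the domain notified via its event
 * channel.
 */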
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
    blkif_response_t *resp;
    unsigned long     flags;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = &blkif->blk_ring_base->
        ring[MASK_BLKIF_IDX(blkif->blk_resp_prod)].resp;
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb();
    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}
void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}
static int __init blkif_init(void)
{
    int i;

    if ( !(start_info.flags & SIF_INITDOMAIN)
         && !(start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;
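    /*
     * Every slot index 0..MAX_PENDING_REQS-1 now sits in the free ring and
     * pending_prod - pending_cons == MAX_PENDING_REQS, i.e. no requests are
     * in flight.
     */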
    spin_lock_init(&io_schedule_list_lock);
    INIT_LIST_HEAD(&io_schedule_list);

    if ( kernel_thread(io_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
        BUG();

    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);

    blkif_ctrlif_init();

    return 0;
}

__initcall(blkif_init);