ia64/xen-unstable: xen/drivers/block/xen_block.c @ 925:4aba3a48d64f

bitkeeper revision 1.580.1.1 (3fafd2c85ofrHsrmRaYOxYp49iwWOA)

new vbd probe world
author   smh22@labyrinth.cl.cam.ac.uk
date     Mon Nov 10 18:02:48 2003 +0000 (2003-11-10)
parents  f8e22c28741a
children 0a901de56d7c
/*
 * xen_block.c
 *
 * Process incoming block I/O requests from guest OSes.
 */
#include <xeno/config.h>
#include <xeno/types.h>
#include <xeno/lib.h>
#include <xeno/sched.h>
#include <xeno/blkdev.h>
#include <xeno/event.h>
#include <hypervisor-ifs/block.h>
#include <hypervisor-ifs/hypervisor-if.h>
#include <asm-i386/io.h>
#include <asm/domain_page.h>
#include <xeno/spinlock.h>
#include <xeno/keyhandler.h>
#include <xeno/interrupt.h>
#include <xeno/vbd.h>
#include <xeno/slab.h>

#if 1
#define DPRINTK(_f, _a...) printk( _f , ## _a )
#else
#define DPRINTK(_f, _a...) ((void)0)
#endif
/*
 * These are rather arbitrary. They are fairly large because adjacent
 * requests pulled from a communication ring are quite likely to end
 * up being part of the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * '64' should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16
/*
 * Each outstanding request which we've passed to the lower device layers
 * has a 'pending_req' allocated to it. Each buffer_head that completes
 * decrements the pendcnt towards zero. When it hits zero, the specified
 * domain has a response queued for it, with the saved 'id' passed back.
 *
 * We can't allocate pending_req's in order, since they may complete out
 * of order. We therefore maintain an allocation ring. This ring also
 * indicates when enough work has been passed down -- at that point the
 * allocation ring will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static unsigned int pending_prod, pending_cons;
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
#define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1))

static kmem_cache_t *buffer_head_cachep;
static atomic_t nr_pending;
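
/*
 * Illustrative sketch (not part of the original file): the free-slot ring
 * above is driven by the dispatch and completion paths further down in
 * this file, roughly as follows.
 *
 *   // Allocate a slot when a request is dispatched:
 *   atomic_inc(&nr_pending);
 *   pending_req_t *preq = &pending_reqs[pending_ring[pending_cons]];
 *   PENDREQ_IDX_INC(pending_cons);
 *
 *   // Release it when the last buffer_head completes (under pend_prod_lock):
 *   pending_ring[pending_prod] = preq - pending_reqs;
 *   PENDREQ_IDX_INC(pending_prod);
 *   atomic_dec(&nr_pending);
 */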
static int __buffer_is_valid(struct task_struct *p,
                             unsigned long buffer,
                             unsigned short size,
                             int writeable_buffer);
static void __lock_buffer(unsigned long buffer,
                          unsigned short size,
                          int writeable_buffer);
static void unlock_buffer(struct task_struct *p,
                          unsigned long buffer,
                          unsigned short size,
                          int writeable_buffer);

static void io_schedule(unsigned long unused);
static int do_block_io_op_domain(struct task_struct *p, int max_to_do);
static void dispatch_rw_block_io(struct task_struct *p, int index);
static void dispatch_debug_block_io(struct task_struct *p, int index);
static void make_response(struct task_struct *p, unsigned long id,
                          unsigned short op, unsigned long st);


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */
static struct list_head io_schedule_list;
static spinlock_t io_schedule_list_lock;

static int __on_blkdev_list(struct task_struct *p)
{
    return p->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(struct task_struct *p)
{
    unsigned long flags;
    if ( !__on_blkdev_list(p) ) return;
    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( __on_blkdev_list(p) )
    {
        list_del(&p->blkdev_list);
        p->blkdev_list.next = NULL;
        put_task_struct(p);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(struct task_struct *p)
{
    unsigned long flags;
    if ( __on_blkdev_list(p) ) return;
    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( !__on_blkdev_list(p) )
    {
        list_add_tail(&p->blkdev_list, &io_schedule_list);
        get_task_struct(p);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);

static void io_schedule(unsigned long unused)
{
    struct task_struct *p;
    struct list_head *ent;

    /* Queue up a batch of requests. */
    while ( (atomic_read(&nr_pending) < MAX_PENDING_REQS) &&
            !list_empty(&io_schedule_list) )
    {
        ent = io_schedule_list.next;
        p = list_entry(ent, struct task_struct, blkdev_list);
        get_task_struct(p);
        remove_from_blkdev_list(p);
        if ( do_block_io_op_domain(p, BATCH_PER_DOMAIN) )
            add_to_blkdev_list_tail(p);
        put_task_struct(p);
    }

    /* Push the batch through to disc. */
    run_task_queue(&tq_disk);
}
static void maybe_trigger_io_schedule(void)
{
    /*
     * Needed so that two processes which together make the following
     * predicate true don't both read stale values and evaluate the
     * predicate incorrectly. Incredibly unlikely to stall the scheduler
     * on x86, but...
     */
    smp_mb();

    if ( (atomic_read(&nr_pending) < (MAX_PENDING_REQS/2)) &&
         !list_empty(&io_schedule_list) )
    {
        tasklet_schedule(&io_schedule_tasklet);
    }
}


/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    unsigned long flags;
    pending_req_t *pending_req = bh->pending_req;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = 2;
    }

    unlock_buffer(pending_req->domain,
                  virt_to_phys(bh->b_data),
                  bh->b_size,
                  (pending_req->operation==READ));

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        make_response(pending_req->domain, pending_req->id,
                      pending_req->operation, pending_req->status);
        put_task_struct(pending_req->domain);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[pending_prod] = pending_req - pending_reqs;
        PENDREQ_IDX_INC(pending_prod);
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        atomic_dec(&nr_pending);
        maybe_trigger_io_schedule();
    }

    kmem_cache_free(buffer_head_cachep, bh);
}
/* ----[ Syscall Interface ]------------------------------------------------*/

long do_block_io_op(block_io_op_t *u_block_io_op)
{
    long ret = 0;
    block_io_op_t op;
    struct task_struct *p = current;

    if ( copy_from_user(&op, u_block_io_op, sizeof(op)) )
        return -EFAULT;

    switch ( op.cmd )
    {
    case BLOCK_IO_OP_SIGNAL:
        /* Simply indicates there are requests outstanding: add current to list. */
        add_to_blkdev_list_tail(p);
        maybe_trigger_io_schedule();
        break;

    case BLOCK_IO_OP_RESET:
        /* Avoid a race with the tasklet. */
        remove_from_blkdev_list(p);
        if ( p->blk_req_cons != p->blk_resp_prod )
        {
            /* Interface isn't quiescent. */
            ret = -EINVAL;
        }
        else
        {
            p->blk_req_cons = p->blk_resp_prod = 0;
            ret = 0;
        }
        break;

    case BLOCK_IO_OP_RING_ADDRESS:
        op.u.ring_mfn = virt_to_phys(p->blk_ring_base) >> PAGE_SHIFT;
        ret = copy_to_user(u_block_io_op, &op, sizeof(op)) ? -EFAULT : 0;
        break;

    case BLOCK_IO_OP_VBD_CREATE:
        /* Create a new VBD. */
        ret = vbd_create(&op.u.create_params);
        break;

    case BLOCK_IO_OP_VBD_ADD:
        /* Add an extent to a VBD. */
        ret = vbd_add(&op.u.add_params);
        break;

    case BLOCK_IO_OP_VBD_REMOVE:
        /* Remove an extent from a VBD. */
        ret = vbd_remove(&op.u.remove_params);
        break;

    case BLOCK_IO_OP_VBD_DELETE:
        /* Delete a VBD. */
        ret = vbd_delete(&op.u.delete_params);
        break;

    case BLOCK_IO_OP_VBD_PROBE:
        /* Query VBD information for self or others (or all). */
        ret = vbd_probe(&op.u.probe_params);
        if ( (ret == 0) &&
             copy_to_user(u_block_io_op, &op, sizeof(op)) )
            ret = -EFAULT;
        break;

    case BLOCK_IO_OP_VBD_INFO:
        /* Query information about a particular VBD. */
        ret = vbd_info(&op.u.info_params);
        break;

    default:
        ret = -ENOSYS;
    }

    return ret;
}
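
/*
 * Illustrative sketch (not part of the original file; the guest-side stub
 * name HYPERVISOR_block_io_op is an assumption): a guest front-end would
 * typically drive the interface above roughly as follows.
 *
 *   block_io_op_t op;
 *
 *   op.cmd = BLOCK_IO_OP_RING_ADDRESS;   // discover the shared ring
 *   HYPERVISOR_block_io_op(&op);         // op.u.ring_mfn now holds its MFN
 *
 *   // ... fill request slots in the shared blk_ring_t, advance req_prod ...
 *
 *   op.cmd = BLOCK_IO_OP_SIGNAL;         // tell Xen there is work queued
 *   HYPERVISOR_block_io_op(&op);
 */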


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int __buffer_is_valid(struct task_struct *p,
                             unsigned long buffer,
                             unsigned short size,
                             int writeable_buffer)
{
    unsigned long pfn;
    struct pfn_info *page;
    int rc = 0;

    /* A request may span multiple page frames. Each must be checked. */
    for ( pfn = buffer >> PAGE_SHIFT;
          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
          pfn++ )
    {
        /* Each frame must be within bounds of machine memory. */
        if ( pfn >= max_page )
        {
            DPRINTK("pfn out of range: %08lx\n", pfn);
            goto out;
        }

        page = frame_table + pfn;

        /* Each frame must belong to the requesting domain. */
        if ( (page->flags & PG_domain_mask) != p->domain )
        {
            DPRINTK("bad domain: expected %d, got %ld\n",
                    p->domain, page->flags & PG_domain_mask);
            goto out;
        }

        /* If reading into the frame, the frame must be writeable. */
        if ( writeable_buffer &&
             ((page->flags & PG_type_mask) != PGT_writeable_page) &&
             (page->type_count != 0) )
        {
            DPRINTK("non-writeable page passed for block read\n");
            goto out;
        }
    }

    rc = 1;
 out:
    return rc;
}
static void __lock_buffer(unsigned long buffer,
                          unsigned short size,
                          int writeable_buffer)
{
    unsigned long pfn;
    struct pfn_info *page;

    for ( pfn = buffer >> PAGE_SHIFT;
          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
          pfn++ )
    {
        page = frame_table + pfn;
        if ( writeable_buffer )
        {
            if ( page->type_count == 0 )
            {
                page->flags &= ~PG_type_mask;
                /* No need for PG_need_flush here. */
                page->flags |= PGT_writeable_page;
            }
            get_page_type(page);
        }
        get_page_tot(page);
    }
}
static void unlock_buffer(struct task_struct *p,
                          unsigned long buffer,
                          unsigned short size,
                          int writeable_buffer)
{
    unsigned long pfn, flags;
    struct pfn_info *page;

    spin_lock_irqsave(&p->page_lock, flags);
    for ( pfn = buffer >> PAGE_SHIFT;
          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
          pfn++ )
    {
        page = frame_table + pfn;
        if ( writeable_buffer )
            put_page_type(page);
        put_page_tot(page);
    }
    spin_unlock_irqrestore(&p->page_lock, flags);
}
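
/*
 * Illustrative sketch (not part of the original): the three helpers above
 * are used as a pattern by the dispatch and completion paths below.
 *
 *   spin_lock_irqsave(&p->page_lock, flags);
 *   if ( __buffer_is_valid(p, buffer, size, writeable) )
 *       __lock_buffer(buffer, size, writeable);   // take page references
 *   spin_unlock_irqrestore(&p->page_lock, flags);
 *
 *   // ... I/O in flight ...
 *
 *   unlock_buffer(p, buffer, size, writeable);    // drop them on completion
 */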
static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
{
    blk_ring_t *blk_ring = p->blk_ring_base;
    int i, more_to_do = 0;

    /*
     * Take items off the comms ring, taking care not to catch up
     * with the response-producer index.
     */
    for ( i = p->blk_req_cons;
          (i != blk_ring->req_prod) &&
              (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1);
          i = BLK_RING_INC(i) )
    {
        if ( (max_to_do-- == 0) ||
             (atomic_read(&nr_pending) == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        switch ( blk_ring->ring[i].req.operation )
        {
        case XEN_BLOCK_READ:
        case XEN_BLOCK_WRITE:
            dispatch_rw_block_io(p, i);
            break;

        case XEN_BLOCK_DEBUG:
            dispatch_debug_block_io(p, i);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    blk_ring->ring[i].req.operation);
            make_response(p, blk_ring->ring[i].req.id,
                          blk_ring->ring[i].req.operation, 1);
            break;
        }
    }

    p->blk_req_cons = i;
    return more_to_do;
}
static void dispatch_debug_block_io(struct task_struct *p, int index)
{
    DPRINTK("dispatch_debug_block_io: unimplemented\n");
}
static void dispatch_rw_block_io(struct task_struct *p, int index)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
    blk_ring_t *blk_ring = p->blk_ring_base;
    blk_ring_req_entry_t *req = &blk_ring->ring[index].req;
    struct buffer_head *bh;
    int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
    unsigned short nr_sects;
    unsigned long buffer, flags;
    int i, rc, tot_sects;
    pending_req_t *pending_req;

    /* We map virtual scatter/gather segments to physical segments. */
    int new_segs, nr_psegs = 0;
    phys_seg_t phys_seg[MAX_BLK_SEGS * 2];

    spin_lock_irqsave(&p->page_lock, flags);

    /* Check that number of segments is sane. */
    if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    /*
     * Check each address/size pair is sane, and convert into a
     * physical device and block offset. Note that if the offset and size
     * crosses a virtual extent boundary, we may end up with more
     * physical scatter/gather segments than virtual segments.
     */
    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
    {
        buffer   = req->buffer_and_sects[i] & ~0x1FF;
        nr_sects = req->buffer_and_sects[i] &  0x1FF;

        if ( nr_sects == 0 )
        {
            DPRINTK("zero-sized data request\n");
            goto bad_descriptor;
        }

        if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
        {
            DPRINTK("invalid buffer\n");
            goto bad_descriptor;
        }

        phys_seg[nr_psegs].dev           = req->device;
        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
        phys_seg[nr_psegs].buffer        = buffer;
        phys_seg[nr_psegs].nr_sects      = nr_sects;

        /* Translate the request into the relevant 'physical device'. */
        new_segs = 1;
        rc = vbd_translate(&phys_seg[nr_psegs], &new_segs, p, operation);
        /* If translation fails we bail (unless the caller is privileged and so has raw access). */
        if ( rc != 0 )
        {
            if ( !IS_PRIV(p) )
            {
                printk("access denied: %s of [%ld,%ld] on dev=%04x\n",
                       operation == READ ? "read" : "write",
                       req->sector_number + tot_sects,
                       req->sector_number + tot_sects + nr_sects,
                       req->device);
                goto bad_descriptor;
            }

            /* SMH: skanky hack; clear any 'partition' info in device. */
            phys_seg[nr_psegs].dev = req->device & 0xFFF0;
        }

        nr_psegs += new_segs;
        if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
    }
    /* Lock pages associated with each buffer head. */
    for ( i = 0; i < nr_psegs; i++ )
        __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9,
                      (operation==READ));
    spin_unlock_irqrestore(&p->page_lock, flags);

    atomic_inc(&nr_pending);
    pending_req = pending_reqs + pending_ring[pending_cons];
    PENDREQ_IDX_INC(pending_cons);
    pending_req->domain    = p;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = 0;
    atomic_set(&pending_req->pendcnt, nr_psegs);

    get_task_struct(p);

    /* Now we pass each segment down to the real blkdev layer. */
    for ( i = 0; i < nr_psegs; i++ )
    {
        bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
        if ( bh == NULL ) panic("bh is null\n");
        memset(bh, 0, sizeof(struct buffer_head));

        bh->b_size      = phys_seg[i].nr_sects << 9;
        bh->b_dev       = phys_seg[i].dev;
        bh->b_rsector   = phys_seg[i].sector_number;
        bh->b_data      = phys_to_virt(phys_seg[i].buffer);
        bh->b_end_io    = end_block_io_op;
        bh->pending_req = pending_req;

        if ( operation == WRITE )
        {
            bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
                (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
        }
        else
        {
            bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
        }

        /* Dispatch a single request. We'll flush it to disc later. */
        ll_rw_block(operation, 1, &bh);
    }

    return;

 bad_descriptor:
    spin_unlock_irqrestore(&p->page_lock, flags);
    make_response(p, req->id, req->operation, 1);
}
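
/*
 * Illustrative sketch (not part of the original): each buffer_and_sects[]
 * entry packs a 512-byte-aligned buffer address with a sector count in the
 * low nine bits, so a guest building a request would fill it in roughly as
 *
 *   req->buffer_and_sects[i] = (buffer & ~0x1FFUL) | (nr_sects & 0x1FF);
 *
 * which dispatch_rw_block_io() above unpacks with '& ~0x1FF' / '& 0x1FF'.
 */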


/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

static void make_response(struct task_struct *p, unsigned long id,
                          unsigned short op, unsigned long st)
{
    unsigned long cpu_mask, flags;
    int position;
    blk_ring_t *blk_ring;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&p->blk_ring_lock, flags);
    blk_ring = p->blk_ring_base;
    position = p->blk_resp_prod;
    blk_ring->ring[position].resp.id        = id;
    blk_ring->ring[position].resp.operation = op;
    blk_ring->ring[position].resp.status    = st;
    p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
    spin_unlock_irqrestore(&p->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    cpu_mask = mark_guest_event(p, _EVENT_BLKDEV);
    guest_event_notify(cpu_mask);
}
static void dump_blockq(u_char key, void *dev_id, struct pt_regs *regs)
{
    struct task_struct *p;
    blk_ring_t *blk_ring;

    printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n",
           atomic_read(&nr_pending), pending_prod, pending_cons);

    p = current->next_task;
    do
    {
        if ( !is_idle_task(p) )
        {
            printk("Domain: %d\n", p->domain);
            blk_ring = p->blk_ring_base;

            printk("  req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n",
                   blk_ring->req_prod, p->blk_req_cons,
                   blk_ring->resp_prod, p->blk_resp_prod,
                   __on_blkdev_list(p));
        }
        p = p->next_task;
    } while ( p != current );
}
/* Start-of-day initialisation for a new domain. */
void init_blkdev_info(struct task_struct *p)
{
    if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
    p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
    clear_page(p->blk_ring_base);
    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
    p->blkdev_list.next = NULL;
}

/* End-of-day teardown for a domain. */
void destroy_blkdev_info(struct task_struct *p)
{
    ASSERT(!__on_blkdev_list(p));
    UNSHARE_PFN(virt_to_page(p->blk_ring_base));
    free_page((unsigned long)p->blk_ring_base);
}
void unlink_blkdev_info(struct task_struct *p)
{
    unsigned long flags;

    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( __on_blkdev_list(p) )
    {
        list_del(&p->blkdev_list);
        p->blkdev_list.next = (void *)0xdeadbeef; /* prevent reinsertion */
        put_task_struct(p);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}
void initialize_block_io(void)
{
    int i;

    atomic_set(&nr_pending, 0);
    pending_prod = pending_cons = 0;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;

    spin_lock_init(&io_schedule_list_lock);
    INIT_LIST_HEAD(&io_schedule_list);

    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);

    add_key_handler('b', dump_blockq, "dump xen ide blkdev statistics");
}