direct-io.hg

view xen/drivers/block/xen_block.c @ 875:ad4db8b417c1

bitkeeper revision 1.547 (3fa3dd2aH8eamu3ONvYovJgq8wBNbQ)

Many files:
Fixes to the DOM0 interface and domain building code. Ready for new save/restore dom0_ops.
author    kaf24@scramble.cl.cam.ac.uk
date      Sat Nov 01 16:19:54 2003 +0000 (2003-11-01)
parents   138060ee0e68
children  6cde5e25c56f
line source
1 /*
2 * xen_block.c
3 *
4 * process incoming block io requests from guest OSes.
5 */
7 #include <xeno/config.h>
8 #include <xeno/types.h>
9 #include <xeno/lib.h>
10 #include <xeno/sched.h>
11 #include <xeno/blkdev.h>
12 #include <xeno/event.h>
13 #include <hypervisor-ifs/block.h>
14 #include <hypervisor-ifs/hypervisor-if.h>
15 #include <asm-i386/io.h>
16 #include <asm/domain_page.h>
17 #include <xeno/spinlock.h>
18 #include <xeno/keyhandler.h>
19 #include <xeno/interrupt.h>
20 #include <xeno/vbd.h>
21 #include <xeno/slab.h>
23 #if 0
24 #define DPRINTK(_f, _a...) printk( _f , ## _a )
25 #else
26 #define DPRINTK(_f, _a...) ((void)0)
27 #endif
29 /*
30 * These are rather arbitrary. They are fairly large because adjacent
31 * requests pulled from a communication ring are quite likely to end
32 * up being part of the same scatter/gather request at the disc.
33 *
34 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
35 * This will increase the chances of being able to write whole tracks.
36 * '64' should be enough to keep us competitive with Linux.
37 */
38 #define MAX_PENDING_REQS 64
39 #define BATCH_PER_DOMAIN 16
41 /*
42 * Each outstanding request which we've passed to the lower device layers
43 * has a 'pending_req' allocated to it. Each buffer_head that completes
44 * decrements the pendcnt towards zero. When it hits zero, the specified
45 * domain has a response queued for it, with the saved 'id' passed back.
46 *
47 * We can't allocate pending_req's in order, since they may complete out
48 * of order. We therefore maintain an allocation ring. This ring also
49 * indicates when enough work has been passed down -- at that point the
50 * allocation ring will be empty.
51 */
52 static pending_req_t pending_reqs[MAX_PENDING_REQS];
53 static unsigned char pending_ring[MAX_PENDING_REQS];
54 static unsigned int pending_prod, pending_cons;
55 static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
56 #define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1))
58 static kmem_cache_t *buffer_head_cachep;
59 static atomic_t nr_pending;
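
The allocation ring described in the comment above is used in two places further down (dispatch_rw_block_io() takes a slot, end_block_io_op() returns it); a minimal sketch of that pattern, gathered here purely for illustration, looks like the following. It also shows why MAX_PENDING_REQS must be a power of two: PENDREQ_IDX_INC wraps indices with a mask rather than a modulo.

    /* Illustrative sketch only -- the real code paths are in
     * dispatch_rw_block_io() and end_block_io_op() below. */
    static pending_req_t *example_take_pending_slot(void)
    {
        pending_req_t *req;
        atomic_inc(&nr_pending);
        req = pending_reqs + pending_ring[pending_cons]; /* next free slot index */
        PENDREQ_IDX_INC(pending_cons);
        return req;
    }

    static void example_return_pending_slot(pending_req_t *req)
    {
        unsigned long flags;
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[pending_prod] = req - pending_reqs;  /* may be out of order */
        PENDREQ_IDX_INC(pending_prod);
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        atomic_dec(&nr_pending);
    }
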
61 #define NR_IDE_DEVS 20
62 #define NR_SCSI_DEVS 16
64 static kdev_t ide_devs[NR_IDE_DEVS] = {
65 MKDEV(IDE0_MAJOR, 0), MKDEV(IDE0_MAJOR, 64), /* hda, hdb */
66 MKDEV(IDE1_MAJOR, 0), MKDEV(IDE1_MAJOR, 64), /* hdc, hdd */
67 MKDEV(IDE2_MAJOR, 0), MKDEV(IDE2_MAJOR, 64), /* hde, hdf */
68 MKDEV(IDE3_MAJOR, 0), MKDEV(IDE3_MAJOR, 64), /* hdg, hdh */
69 MKDEV(IDE4_MAJOR, 0), MKDEV(IDE4_MAJOR, 64), /* hdi, hdj */
70 MKDEV(IDE5_MAJOR, 0), MKDEV(IDE5_MAJOR, 64), /* hdk, hdl */
71 MKDEV(IDE6_MAJOR, 0), MKDEV(IDE6_MAJOR, 64), /* hdm, hdn */
72 MKDEV(IDE7_MAJOR, 0), MKDEV(IDE7_MAJOR, 64), /* hdo, hdp */
73 MKDEV(IDE8_MAJOR, 0), MKDEV(IDE8_MAJOR, 64), /* hdq, hdr */
74 MKDEV(IDE9_MAJOR, 0), MKDEV(IDE9_MAJOR, 64) /* hds, hdt */
75 };
77 static kdev_t scsi_devs[NR_SCSI_DEVS] = {
78 MKDEV(SCSI_DISK0_MAJOR, 0), MKDEV(SCSI_DISK0_MAJOR, 16), /* sda, sdb */
79 MKDEV(SCSI_DISK0_MAJOR, 32), MKDEV(SCSI_DISK0_MAJOR, 48), /* sdc, sdd */
80 MKDEV(SCSI_DISK0_MAJOR, 64), MKDEV(SCSI_DISK0_MAJOR, 80), /* sde, sdf */
81 MKDEV(SCSI_DISK0_MAJOR, 96), MKDEV(SCSI_DISK0_MAJOR, 112), /* sdg, sdh */
82 MKDEV(SCSI_DISK0_MAJOR, 128), MKDEV(SCSI_DISK0_MAJOR, 144), /* sdi, sdj */
83 MKDEV(SCSI_DISK0_MAJOR, 160), MKDEV(SCSI_DISK0_MAJOR, 176), /* sdk, sdl */
84 MKDEV(SCSI_DISK0_MAJOR, 192), MKDEV(SCSI_DISK0_MAJOR, 208), /* sdm, sdn */
85 MKDEV(SCSI_DISK0_MAJOR, 224), MKDEV(SCSI_DISK0_MAJOR, 240), /* sdo, sdp */
86 };
88 static int __buffer_is_valid(struct task_struct *p,
89 unsigned long buffer,
90 unsigned short size,
91 int writeable_buffer);
92 static void __lock_buffer(unsigned long buffer,
93 unsigned short size,
94 int writeable_buffer);
95 static void unlock_buffer(struct task_struct *p,
96 unsigned long buffer,
97 unsigned short size,
98 int writeable_buffer);
100 static void io_schedule(unsigned long unused);
101 static int do_block_io_op_domain(struct task_struct *p, int max_to_do);
102 static void dispatch_rw_block_io(struct task_struct *p, int index);
103 static void dispatch_probe(struct task_struct *p, int index);
104 static void dispatch_debug_block_io(struct task_struct *p, int index);
105 static void dispatch_create_vbd(struct task_struct *p, int index);
106 static void dispatch_delete_vbd(struct task_struct *p, int index);
107 static void dispatch_grant_physdev(struct task_struct *p, int index);
108 static void dispatch_probe_physdev(struct task_struct *p, int index);
109 static void make_response(struct task_struct *p, unsigned long id,
110 unsigned short op, unsigned long st);
113 /******************************************************************
114 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
115 */
117 static struct list_head io_schedule_list;
118 static spinlock_t io_schedule_list_lock;
120 static int __on_blkdev_list(struct task_struct *p)
121 {
122 return p->blkdev_list.next != NULL;
123 }
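/*
 * Note on the two helpers below: the unlocked __on_blkdev_list() test is a
 * fast-path shortcut; the test is repeated after taking io_schedule_list_lock
 * because another CPU may have added or removed the domain in the meantime.
 */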
125 static void remove_from_blkdev_list(struct task_struct *p)
126 {
127 unsigned long flags;
128 if ( !__on_blkdev_list(p) ) return;
129 spin_lock_irqsave(&io_schedule_list_lock, flags);
130 if ( __on_blkdev_list(p) )
131 {
132 list_del(&p->blkdev_list);
133 p->blkdev_list.next = NULL;
134 put_task_struct(p);
135 }
136 spin_unlock_irqrestore(&io_schedule_list_lock, flags);
137 }
139 static void add_to_blkdev_list_tail(struct task_struct *p)
140 {
141 unsigned long flags;
142 if ( __on_blkdev_list(p) ) return;
143 spin_lock_irqsave(&io_schedule_list_lock, flags);
144 if ( !__on_blkdev_list(p) )
145 {
146 list_add_tail(&p->blkdev_list, &io_schedule_list);
147 get_task_struct(p);
148 }
149 spin_unlock_irqrestore(&io_schedule_list_lock, flags);
150 }
153 /******************************************************************
154 * SCHEDULER FUNCTIONS
155 */
157 static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
159 static void io_schedule(unsigned long unused)
160 {
161 struct task_struct *p;
162 struct list_head *ent;
164 /* Queue up a batch of requests. */
165 while ( (atomic_read(&nr_pending) < MAX_PENDING_REQS) &&
166 !list_empty(&io_schedule_list) )
167 {
168 ent = io_schedule_list.next;
169 p = list_entry(ent, struct task_struct, blkdev_list);
170 get_task_struct(p);
171 remove_from_blkdev_list(p);
172 if ( do_block_io_op_domain(p, BATCH_PER_DOMAIN) )
173 add_to_blkdev_list_tail(p);
174 put_task_struct(p);
175 }
177 /* Push the batch through to disc. */
178 run_task_queue(&tq_disk);
179 }
181 static void maybe_trigger_io_schedule(void)
182 {
183 /*
184 * Needed so that two processes, which together make the following predicate
185 * true, don't both read stale values and evaluate the predicate
186 * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
187 */
188 smp_mb();
190 if ( (atomic_read(&nr_pending) < (MAX_PENDING_REQS/2)) &&
191 !list_empty(&io_schedule_list) )
192 {
193 tasklet_schedule(&io_schedule_tasklet);
194 }
195 }
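
The barrier comment in maybe_trigger_io_schedule() can be made concrete with one possible interleaving (an illustrative sketch, not an exhaustive argument):

    /*
     * Without the smp_mb(), both callers could skip the tasklet:
     *
     *   CPU0: end_block_io_op()           CPU1: do_block_io_op()
     *   -----------------------           ----------------------
     *   atomic_dec(&nr_pending)           add_to_blkdev_list_tail(current)
     *   sees io_schedule_list empty       sees nr_pending >= MAX_PENDING_REQS/2
     *     (CPU1's add not yet visible)      (CPU0's decrement not yet visible)
     *   -> no tasklet_schedule()          -> no tasklet_schedule()
     *
     * With a full barrier between each CPU's update and its read of the other
     * half of the predicate, at least one CPU observes both halves true and
     * schedules io_schedule_tasklet.
     */
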
199 /******************************************************************
200 * COMPLETION CALLBACK -- Called as bh->b_end_io()
201 */
203 static void end_block_io_op(struct buffer_head *bh, int uptodate)
204 {
205 unsigned long flags;
206 pending_req_t *pending_req = bh->pending_req;
208 /* An error fails the entire request. */
209 if ( !uptodate )
210 {
211 DPRINTK("Buffer not up-to-date at end of operation\n");
212 pending_req->status = 2;
213 }
215 unlock_buffer(pending_req->domain,
216 virt_to_phys(bh->b_data),
217 bh->b_size,
218 (pending_req->operation==READ));
220 if ( atomic_dec_and_test(&pending_req->pendcnt) )
221 {
222 make_response(pending_req->domain, pending_req->id,
223 pending_req->operation, pending_req->status);
224 put_task_struct(pending_req->domain);
225 spin_lock_irqsave(&pend_prod_lock, flags);
226 pending_ring[pending_prod] = pending_req - pending_reqs;
227 PENDREQ_IDX_INC(pending_prod);
228 spin_unlock_irqrestore(&pend_prod_lock, flags);
229 atomic_dec(&nr_pending);
230 maybe_trigger_io_schedule();
231 }
233 kmem_cache_free(buffer_head_cachep, bh);
234 }
237 long vbd_attach(vbd_attach_t *info)
238 {
239 printk("vbd_attach called!!!\n");
240 return -ENOSYS;
241 }
243 /* ----[ Syscall Interface ]------------------------------------------------*/
245 long do_block_io_op(block_io_op_t *u_block_io_op)
246 {
247 long ret = 0;
248 block_io_op_t op;
250 if (copy_from_user(&op, u_block_io_op, sizeof(op)))
251 return -EFAULT;
253 switch (op.cmd) {
255 case BLOCK_IO_OP_SIGNAL:
256 /* simply indicates there are requests outstanding => add current domain to the list */
257 add_to_blkdev_list_tail(current);
258 maybe_trigger_io_schedule();
259 break;
261 case BLOCK_IO_OP_ATTACH_VBD:
262 /* attach a VBD to a given domain; caller must be privileged */
263 if(!IS_PRIV(current))
264 return -EPERM;
265 ret = vbd_attach(&op.u.attach_info);
266 break;
268 default:
269 ret = -ENOSYS;
270 }
273 return ret;
274 }
278 /******************************************************************
279 * DOWNWARD CALLS -- These interface with the block-device layer proper.
280 */
282 static int __buffer_is_valid(struct task_struct *p,
283 unsigned long buffer,
284 unsigned short size,
285 int writeable_buffer)
286 {
287 unsigned long pfn;
288 struct pfn_info *page;
289 int rc = 0;
291 /* A request may span multiple page frames. Each must be checked. */
292 for ( pfn = buffer >> PAGE_SHIFT;
293 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
294 pfn++ )
295 {
296 /* Each frame must be within bounds of machine memory. */
297 if ( pfn >= max_page )
298 {
299 DPRINTK("pfn out of range: %08lx\n", pfn);
300 goto out;
301 }
303 page = frame_table + pfn;
305 /* Each frame must belong to the requesting domain. */
306 if ( (page->flags & PG_domain_mask) != p->domain )
307 {
308 DPRINTK("bad domain: expected %d, got %ld\n",
309 p->domain, page->flags & PG_domain_mask);
310 goto out;
311 }
313 /* If reading into the frame, the frame must be writeable. */
314 if ( writeable_buffer &&
315 ((page->flags & PG_type_mask) != PGT_writeable_page) &&
316 (page->type_count != 0) )
317 {
318 DPRINTK("non-writeable page passed for block read\n");
319 goto out;
320 }
321 }
323 rc = 1;
324 out:
325 return rc;
326 }
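
The pfn loop bounds in __buffer_is_valid() (and in __lock_buffer()/unlock_buffer() below) round the end address up to the next page boundary; a worked example with made-up values, assuming 4KB pages (PAGE_SHIFT == 12):

    /*
     *   buffer = 0x12FF0, size = 0x40:
     *     first pfn  =  0x12FF0 >> 12                  = 0x12
     *     loop bound = (0x12FF0 + 0x40 + 0xFFF) >> 12  = 0x14
     *   so pfns 0x12 and 0x13 are both checked -- a 64-byte request that
     *   straddles a page boundary touches two frames.
     */
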
328 static void __lock_buffer(unsigned long buffer,
329 unsigned short size,
330 int writeable_buffer)
331 {
332 unsigned long pfn;
333 struct pfn_info *page;
335 for ( pfn = buffer >> PAGE_SHIFT;
336 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
337 pfn++ )
338 {
339 page = frame_table + pfn;
340 if ( writeable_buffer )
341 {
342 if ( page->type_count == 0 )
343 {
344 page->flags &= ~PG_type_mask;
345 /* No need for PG_need_flush here. */
346 page->flags |= PGT_writeable_page;
347 }
348 get_page_type(page);
349 }
350 get_page_tot(page);
351 }
352 }
354 static void unlock_buffer(struct task_struct *p,
355 unsigned long buffer,
356 unsigned short size,
357 int writeable_buffer)
358 {
359 unsigned long pfn, flags;
360 struct pfn_info *page;
362 spin_lock_irqsave(&p->page_lock, flags);
363 for ( pfn = buffer >> PAGE_SHIFT;
364 pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
365 pfn++ )
366 {
367 page = frame_table + pfn;
368 if ( writeable_buffer )
369 put_page_type(page);
370 put_page_tot(page);
371 }
372 spin_unlock_irqrestore(&p->page_lock, flags);
373 }
375 static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
376 {
377 blk_ring_t *blk_ring = p->blk_ring_base;
378 int i, more_to_do = 0;
380 /*
381 * Take items off the comms ring, taking care not to catch up
382 * with the response-producer index.
383 */
384 for ( i = p->blk_req_cons;
385 (i != blk_ring->req_prod) &&
386 (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1);
387 i = BLK_RING_INC(i) )
388 {
389 if ( (max_to_do-- == 0) ||
390 (atomic_read(&nr_pending) == MAX_PENDING_REQS) )
391 {
392 more_to_do = 1;
393 break;
394 }
396 switch ( blk_ring->ring[i].req.operation )
397 {
398 case XEN_BLOCK_READ:
399 case XEN_BLOCK_WRITE:
400 dispatch_rw_block_io(p, i);
401 break;
403 case XEN_BLOCK_PROBE:
404 dispatch_probe(p, i);
405 break;
407 case XEN_BLOCK_DEBUG:
408 dispatch_debug_block_io(p, i);
409 break;
411 case XEN_BLOCK_VBD_CREATE:
412 dispatch_create_vbd(p, i);
413 break;
415 case XEN_BLOCK_VBD_DELETE:
416 dispatch_delete_vbd(p, i);
417 break;
419 case XEN_BLOCK_PHYSDEV_GRANT:
420 dispatch_grant_physdev(p, i);
421 break;
423 case XEN_BLOCK_PHYSDEV_PROBE:
424 dispatch_probe_physdev(p, i);
425 break;
427 default:
428 DPRINTK("error: unknown block io operation [%d]\n",
429 blk_ring->ring[i].req.operation);
430 make_response(p, blk_ring->ring[i].req.id,
431 blk_ring->ring[i].req.operation, 1);
432 break;
433 }
434 }
436 p->blk_req_cons = i;
437 return more_to_do;
438 }
440 static void dispatch_debug_block_io(struct task_struct *p, int index)
441 {
442 DPRINTK("dispatch_debug_block_io: unimplemented\n");
443 }
445 static void dispatch_probe_physdev(struct task_struct *p, int index)
446 {
447 blk_ring_t *blk_ring = p->blk_ring_base;
448 unsigned long flags, buffer;
449 physdisk_probebuf_t *buf;
450 int result;
452 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
454 spin_lock_irqsave(&p->page_lock, flags);
455 if ( !__buffer_is_valid(p, buffer, sizeof(*buf), 1) )
456 {
457 spin_unlock_irqrestore(&p->page_lock, flags);
458 result = 1;
459 goto out;
460 }
461 __lock_buffer(buffer, sizeof(*buf), 1);
462 spin_unlock_irqrestore(&p->page_lock, flags);
464 buf = phys_to_virt(buffer);
465 result = xen_physdisk_probe(p, buf);
467 unlock_buffer(p, buffer, sizeof(*buf), 1);
469 out:
470 make_response(p, blk_ring->ring[index].req.id,
471 XEN_BLOCK_PHYSDEV_PROBE, result);
472 }
474 static void dispatch_grant_physdev(struct task_struct *p, int index)
475 {
476 blk_ring_t *blk_ring = p->blk_ring_base;
477 unsigned long flags, buffer;
478 xp_disk_t *xpd;
479 int result;
481 if ( p->domain != 0 )
482 {
483 DPRINTK("dispatch_grant_physdev called by dom%d\n", p->domain);
484 result = 1;
485 goto out;
486 }
488 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
490 spin_lock_irqsave(&p->page_lock, flags);
491 if ( !__buffer_is_valid(p, buffer, sizeof(xp_disk_t), 1) )
492 {
493 DPRINTK("Bad buffer in dispatch_grant_physdev\n");
494 spin_unlock_irqrestore(&p->page_lock, flags);
495 result = 1;
496 goto out;
497 }
498 __lock_buffer(buffer, sizeof(xp_disk_t), 1);
499 spin_unlock_irqrestore(&p->page_lock, flags);
501 xpd = phys_to_virt(buffer);
502 result = xen_physdisk_grant(xpd);
504 unlock_buffer(p, buffer, sizeof(xp_disk_t), 1);
506 out:
507 make_response(p, blk_ring->ring[index].req.id,
508 XEN_BLOCK_PHYSDEV_GRANT, result);
509 }
511 static void dispatch_create_vbd(struct task_struct *p, int index)
512 {
513 blk_ring_t *blk_ring = p->blk_ring_base;
514 unsigned long flags, buffer;
515 xv_disk_t *xvd;
516 int result;
518 if ( p->domain != 0 )
519 {
520 DPRINTK("dispatch_create_vbd called by dom%d\n", p->domain);
521 result = 1;
522 goto out;
523 }
525 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
527 spin_lock_irqsave(&p->page_lock, flags);
528 if ( !__buffer_is_valid(p, buffer, sizeof(xv_disk_t), 1) )
529 {
530 DPRINTK("Bad buffer in dispatch_create_vbd\n");
531 spin_unlock_irqrestore(&p->page_lock, flags);
532 result = 1;
533 goto out;
534 }
535 __lock_buffer(buffer, sizeof(xv_disk_t), 1);
536 spin_unlock_irqrestore(&p->page_lock, flags);
538 xvd = phys_to_virt(buffer);
539 result = xen_vbd_create(xvd);
541 unlock_buffer(p, buffer, sizeof(xv_disk_t), 1);
543 out:
544 make_response(p, blk_ring->ring[index].req.id,
545 XEN_BLOCK_VBD_CREATE, result);
546 }
548 static void dispatch_delete_vbd(struct task_struct *p, int index)
549 {
550 DPRINTK("dispatch_delete_vbd: unimplemented\n");
551 }
553 static void dispatch_probe(struct task_struct *p, int index)
554 {
555 extern void ide_probe_devices(xen_disk_info_t *xdi);
556 extern void scsi_probe_devices(xen_disk_info_t *xdi);
557 extern void vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p);
559 blk_ring_t *blk_ring = p->blk_ring_base;
560 xen_disk_info_t *xdi;
561 unsigned long flags, buffer;
562 int rc = 0;
564 buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
566 spin_lock_irqsave(&p->page_lock, flags);
567 if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
568 {
569 DPRINTK("Bad buffer in dispatch_probe_blk\n");
570 spin_unlock_irqrestore(&p->page_lock, flags);
571 rc = 1;
572 goto out;
573 }
575 __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
576 spin_unlock_irqrestore(&p->page_lock, flags);
578 /*
579 ** XXX SMH: all three of the below probe functions /append/ their
580 ** info to the xdi array; i.e. they assume that all earlier slots
581 ** are correctly filled, and that xdi->count points to the first
582 ** free entry in the array. All kinda gross but it'll do for now.
583 */
584 xdi = map_domain_mem(buffer);
585 xdi->count = 0;
586 #if 0 // XXX SMH: fix below once the proper vbd/physdev rewrite is done
587 if(IS_PRIV(p)) {
588 #endif
589 /* privileged domains always get access to the 'real' devices */
590 ide_probe_devices(xdi);
591 scsi_probe_devices(xdi);
592 #if 0
593 }
594 #endif
595 vbd_probe_devices(xdi, p);
596 unmap_domain_mem(xdi);
598 unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
600 out:
601 make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE, rc);
602 }
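
The append convention described in the comment inside dispatch_probe() amounts to the following shape (a pseudocode sketch; apart from xdi->count the statements are placeholders, and the real probe routines live in the ide/scsi/vbd code):

    /*
     *   void some_probe_devices(xen_disk_info_t *xdi)
     *   {
     *       for ( each device this routine knows about )
     *       {
     *           ... fill in array entry number xdi->count ...
     *           xdi->count++;
     *       }
     *   }
     */
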
604 static void dispatch_rw_block_io(struct task_struct *p, int index)
605 {
606 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
607 blk_ring_t *blk_ring = p->blk_ring_base;
608 blk_ring_req_entry_t *req = &blk_ring->ring[index].req;
609 struct buffer_head *bh;
610 int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
611 unsigned short nr_sects;
612 unsigned long buffer, flags;
613 int i, tot_sects;
614 pending_req_t *pending_req;
616 /* We map virtual scatter/gather segments to physical segments. */
617 int new_segs, nr_psegs = 0;
618 phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
620 spin_lock_irqsave(&p->page_lock, flags);
622 /* Check that number of segments is sane. */
623 if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
624 {
625 DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
626 goto bad_descriptor;
627 }
629 /*
630 * Check each address/size pair is sane, and convert into a
631 * physical device and block offset. Note that if the offset and size
632 * crosses a virtual extent boundary, we may end up with more
633 * physical scatter/gather segments than virtual segments.
634 */
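/*
 * Each buffer_and_sects[] entry packs a 512-byte-aligned machine (physical)
 * address in its upper bits and a sector count in its low 9 bits: for example
 * the value 0x00104A08 describes an 8-sector (4KB) buffer at machine address
 * 0x00104A00. A count of zero is rejected below; counts above 511 cannot be
 * encoded.
 */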
635 for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
636 {
637 buffer = req->buffer_and_sects[i] & ~0x1FF;
638 nr_sects = req->buffer_and_sects[i] & 0x1FF;
640 if ( nr_sects == 0 )
641 {
642 DPRINTK("zero-sized data request\n");
643 goto bad_descriptor;
644 }
646 if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
647 {
648 DPRINTK("invalid buffer\n");
649 goto bad_descriptor;
650 }
652 /* Get the physical device and block index. */
653 if ( (req->device & XENDEV_TYPE_MASK) == XENDEV_VIRTUAL )
654 {
655 new_segs = xen_vbd_map_request(
656 &phys_seg[nr_psegs], p, operation,
657 req->device,
658 req->sector_number + tot_sects,
659 buffer, nr_sects);
660 if ( new_segs <= 0 )
661 {
662 DPRINTK("bogus xen_vbd_map_request\n");
663 goto bad_descriptor;
664 }
665 }
666 else
667 {
668 phys_seg[nr_psegs].dev = req->device;
669 phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
670 phys_seg[nr_psegs].buffer = buffer;
671 phys_seg[nr_psegs].nr_sects = nr_sects;
672 if (p->domain != 0 &&
673 !xen_physdisk_access_okay(&phys_seg[nr_psegs], p, operation)) {
674 DPRINTK("access denied: dev=%04x off=%ld nr=%ld\n",
675 req->device, req->sector_number + tot_sects, nr_sects);
676 goto bad_descriptor;
677 }
678 phys_seg[nr_psegs].dev = xendev_to_physdev(req->device);
679 if ( phys_seg[nr_psegs].dev == 0 )
680 {
681 DPRINTK("bad device: %04x\n", req->device);
682 goto bad_descriptor;
683 }
684 new_segs = 1;
685 }
687 nr_psegs += new_segs;
688 if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
689 }
691 /* Lock pages associated with each buffer head. */
692 for ( i = 0; i < nr_psegs; i++ )
693 __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9,
694 (operation==READ));
695 spin_unlock_irqrestore(&p->page_lock, flags);
697 atomic_inc(&nr_pending);
698 pending_req = pending_reqs + pending_ring[pending_cons];
699 PENDREQ_IDX_INC(pending_cons);
700 pending_req->domain = p;
701 pending_req->id = req->id;
702 pending_req->operation = operation;
703 pending_req->status = 0;
704 atomic_set(&pending_req->pendcnt, nr_psegs);
706 get_task_struct(p);
708 /* Now we pass each segment down to the real blkdev layer. */
709 for ( i = 0; i < nr_psegs; i++ )
710 {
711 bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
712 if ( bh == NULL ) panic("bh is null\n");
713 memset (bh, 0, sizeof (struct buffer_head));
715 bh->b_size = phys_seg[i].nr_sects << 9;
716 bh->b_dev = phys_seg[i].dev;
717 bh->b_rsector = phys_seg[i].sector_number;
718 bh->b_data = phys_to_virt(phys_seg[i].buffer);
719 bh->b_end_io = end_block_io_op;
720 bh->pending_req = pending_req;
722 if ( operation == WRITE )
723 {
724 bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
725 (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
726 }
727 else
728 {
729 bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
730 }
732 /* Dispatch a single request. We'll flush it to disc later. */
733 ll_rw_block(operation, 1, &bh);
734 }
736 return;
738 bad_descriptor:
739 spin_unlock_irqrestore(&p->page_lock, flags);
740 make_response(p, req->id, req->operation, 1);
741 }
745 /******************************************************************
746 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
747 */
749 kdev_t xendev_to_physdev(unsigned short xendev)
750 {
751 switch ( (xendev & XENDEV_TYPE_MASK) )
752 {
753 case XENDEV_IDE:
754 xendev &= XENDEV_IDX_MASK;
755 if ( xendev >= NR_IDE_DEVS )
756 {
757 DPRINTK("IDE device number out of range %d\n", xendev);
758 goto fail;
759 }
760 return ide_devs[xendev];
762 case XENDEV_SCSI:
763 xendev &= XENDEV_IDX_MASK;
764 if ( xendev >= NR_SCSI_DEVS )
765 {
766 DPRINTK("SCSI device number out of range %d\n", xendev);
767 goto fail;
768 }
769 return scsi_devs[xendev];
771 case XENDEV_VIRTUAL:
772 default:
773 DPRINTK("xendev_to_physdev: unknown device %d\n", xendev);
774 }
776 fail:
777 return (kdev_t)0;
778 }
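
As an illustration (assuming the device index occupies the bits selected by XENDEV_IDX_MASK and can simply be OR'd with the type bits, which this file does not define):

    /*
     *   xendev_to_physdev(XENDEV_IDE  | 1) == ide_devs[1]  == MKDEV(IDE0_MAJOR, 64)       (hdb)
     *   xendev_to_physdev(XENDEV_SCSI | 2) == scsi_devs[2] == MKDEV(SCSI_DISK0_MAJOR, 32) (sdc)
     *   xendev_to_physdev(<anything virtual or out of range>) == (kdev_t)0
     */
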
780 static void make_response(struct task_struct *p, unsigned long id,
781 unsigned short op, unsigned long st)
782 {
783 unsigned long cpu_mask, flags;
784 int position;
785 blk_ring_t *blk_ring;
787 /* Place on the response ring for the relevant domain. */
788 spin_lock_irqsave(&p->blk_ring_lock, flags);
789 blk_ring = p->blk_ring_base;
790 position = p->blk_resp_prod;
791 blk_ring->ring[position].resp.id = id;
792 blk_ring->ring[position].resp.operation = op;
793 blk_ring->ring[position].resp.status = st;
794 p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
795 spin_unlock_irqrestore(&p->blk_ring_lock, flags);
797 /* Kick the relevant domain. */
798 cpu_mask = mark_guest_event(p, _EVENT_BLKDEV);
799 guest_event_notify(cpu_mask);
800 }
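
For context, a hedged sketch of the consumer side of this ring, as a guest front-end might drain it on receiving _EVENT_BLKDEV (illustrative only; the response-consumer index is guest-private state and is not defined in this file):

    /* Hypothetical guest-side consumer of the response ring. */
    static void example_consume_block_responses(blk_ring_t *blk_ring, int *resp_cons)
    {
        int i;
        for ( i = *resp_cons; i != blk_ring->resp_prod; i = BLK_RING_INC(i) )
        {
            /* blk_ring->ring[i].resp.id identifies the completed request;
             * .operation and .status carry back the values set above.   */
        }
        *resp_cons = i;
    }
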
802 static void dump_blockq(u_char key, void *dev_id, struct pt_regs *regs)
803 {
804 struct task_struct *p;
805 blk_ring_t *blk_ring ;
807 printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n",
808 atomic_read(&nr_pending), pending_prod, pending_cons);
810 p = current->next_task;
811 do
812 {
813 if ( !is_idle_task(p) )
814 {
815 printk("Domain: %d\n", p->domain);
816 blk_ring = p->blk_ring_base;
818 printk(" req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n",
819 blk_ring->req_prod, p->blk_req_cons,
820 blk_ring->resp_prod, p->blk_resp_prod,
821 __on_blkdev_list(p));
822 }
823 p = p->next_task;
824 } while (p != current);
825 }
827 /* Start-of-day initialisation for a new domain. */
828 void init_blkdev_info(struct task_struct *p)
829 {
830 if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
831 p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
832 clear_page(p->blk_ring_base);
833 SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
834 p->blkdev_list.next = NULL;
836 memset(p->vbd_list, 0, sizeof(p->vbd_list));
838 /* Get any previously created segments. */
839 xen_refresh_vbd_list(p);
840 }
842 /* End-of-day teardown for a domain. */
843 void destroy_blkdev_info(struct task_struct *p)
844 {
845 ASSERT(!__on_blkdev_list(p));
846 UNSHARE_PFN(virt_to_page(p->blk_ring_base));
847 free_page((unsigned long)p->blk_ring_base);
848 }
850 void unlink_blkdev_info(struct task_struct *p)
851 {
852 unsigned long flags;
854 spin_lock_irqsave(&io_schedule_list_lock, flags);
855 if ( __on_blkdev_list(p) )
856 {
857 list_del(&p->blkdev_list);
858 p->blkdev_list.next = (void *)0xdeadbeef; /* prevent reinsertion */
859 put_task_struct(p);
860 }
861 spin_unlock_irqrestore(&io_schedule_list_lock, flags);
862 }
864 void initialize_block_io ()
865 {
866 int i;
868 atomic_set(&nr_pending, 0);
869 pending_prod = pending_cons = 0;
870 memset(pending_reqs, 0, sizeof(pending_reqs));
871 for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
873 spin_lock_init(&io_schedule_list_lock);
874 INIT_LIST_HEAD(&io_schedule_list);
876 buffer_head_cachep = kmem_cache_create(
877 "buffer_head_cache", sizeof(struct buffer_head),
878 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
880 xen_vbd_initialize();
882 add_key_handler('b', dump_blockq, "dump xen ide blkdev statistics");
883 }