ia64/xen-unstable

view linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c @ 2220:9166d5ce37f3

bitkeeper revision 1.1159.1.48 (411c848c0EDRT7eJdbiXsiM284ZIbA)

2.6 blkfront fixes
author iap10@labyrinth.cl.cam.ac.uk
date Fri Aug 13 09:06:20 2004 +0000 (2004-08-13)
parents afbab8dc06bd
children 10b75f2911b6
line source
1 /******************************************************************************
2 * block.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 */
11 #include <linux/version.h>
13 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
14 #include "block.h"
15 #else
16 #include "common.h"
17 #include <linux/blk.h>
18 #include <linux/tqueue.h>
19 #endif
21 #include <linux/cdrom.h>
22 #include <linux/sched.h>
23 #include <linux/interrupt.h>
24 #include <scsi/scsi.h>
25 #include <asm-xen/ctrl_if.h>
27 typedef unsigned char byte; /* from linux/ide.h */
29 #define BLKIF_STATE_CLOSED 0
30 #define BLKIF_STATE_DISCONNECTED 1
31 #define BLKIF_STATE_CONNECTED 2
32 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
33 static unsigned int blkif_evtchn, blkif_irq;
35 static int blkif_control_rsp_valid;
36 static blkif_response_t blkif_control_rsp;
38 static blkif_ring_t *blk_ring;
39 static BLKIF_RING_IDX resp_cons; /* Response consumer for comms ring. */
40 static BLKIF_RING_IDX req_prod; /* Private request producer. */
42 static blkif_ring_t *blk_ring_rec; /* Private copy of requests, used for
43 * recovery. Responses not stored here. */
44 static BLKIF_RING_IDX resp_cons_rec; /* Copy of response consumer, used for
45 * recovery */
46 static int recovery = 0; /* "Recovery in progress" flag. Protected
47 * by the blkif_io_lock */
49 /* We plug the I/O ring if the driver is suspended or if the ring is full. */
50 #define BLKIF_RING_FULL (((req_prod - resp_cons) == BLKIF_RING_SIZE) || \
51 (blkif_state != BLKIF_STATE_CONNECTED))
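/*
 * Note: req_prod and resp_cons are free-running counters; they are only
 * reduced to a ring slot via MASK_BLKIF_IDX() at the point of use, so
 * (req_prod - resp_cons) is the number of outstanding requests.  The ring
 * therefore counts as "full" when that difference reaches BLKIF_RING_SIZE,
 * or whenever the interface is not connected (suspend/recovery).
 */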
53 /*
54 * Request queues with outstanding work, but ring is currently full.
55 * We need no special lock here, as we always access this with the
56 * blkif_io_lock held. We only need a small maximum list.
57 */
58 #define MAX_PENDING 8
59 static request_queue_t *pending_queues[MAX_PENDING];
60 static int nr_pending;
62 static inline void translate_req_to_mfn(blkif_request_t *xreq,
63 blkif_request_t *req);
65 static inline void translate_req_to_pfn(blkif_request_t *xreq,
66 blkif_request_t *req);
68 static inline void flush_requests(void);
70 static void kick_pending_request_queues(void);
72 int __init xlblk_init(void);
74 /************************** KERNEL VERSION 2.6 **************************/
76 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
78 #define DISABLE_SCATTERGATHER()
80 __initcall(xlblk_init);
82 int blkif_open(struct inode *inode, struct file *filep)
83 {
84 struct gendisk *gd = inode->i_bdev->bd_disk;
85 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
87 /* Update of usage count is protected by per-device semaphore. */
88 di->mi->usage++;
90 return 0;
91 }
94 int blkif_release(struct inode *inode, struct file *filep)
95 {
96 struct gendisk *gd = inode->i_bdev->bd_disk;
97 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
99 /*
100 * When usage drops to zero it may allow more VBD updates to occur.
101 * Update of usage count is protected by a per-device semaphore.
102 */
103 if (--di->mi->usage == 0) {
104 #if 0
105 update_tq.routine = update_vbds_task;
106 schedule_task(&update_tq);
107 #endif
108 }
110 return 0;
111 }
114 int blkif_ioctl(struct inode *inode, struct file *filep,
115 unsigned command, unsigned long argument)
116 {
117 /* struct gendisk *gd = inode->i_bdev->bd_disk; */
119 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
120 command, (long)argument, inode->i_rdev);
122 switch (command) {
124 case HDIO_GETGEO:
125 /* return ENOSYS to use defaults */
126 return -ENOSYS;
128 default:
129 printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
130 command);
131 return -ENOSYS;
132 }
134 return 0;
135 }
137 #if 0
138 /* check media change: should probably do something here in some cases :-) */
139 int blkif_check(kdev_t dev)
140 {
141 DPRINTK("blkif_check\n");
142 return 0;
143 }
145 int blkif_revalidate(kdev_t dev)
146 {
147 struct block_device *bd;
148 struct gendisk *gd;
149 xen_block_t *disk;
150 unsigned long capacity;
151 int i, rc = 0;
153 if ( (bd = bdget(dev)) == NULL )
154 return -EINVAL;
156 /*
157 * Update of partition info, and check of usage count, is protected
158 * by the per-block-device semaphore.
159 */
160 down(&bd->bd_sem);
162 if ( ((gd = get_gendisk(dev)) == NULL) ||
163 ((disk = xldev_to_xldisk(dev)) == NULL) ||
164 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
165 {
166 rc = -EINVAL;
167 goto out;
168 }
170 if ( disk->usage > 1 )
171 {
172 rc = -EBUSY;
173 goto out;
174 }
176 /* Only reread partition table if VBDs aren't mapped to partitions. */
177 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
178 {
179 for ( i = gd->max_p - 1; i >= 0; i-- )
180 {
181 invalidate_device(dev+i, 1);
182 gd->part[MINOR(dev+i)].start_sect = 0;
183 gd->part[MINOR(dev+i)].nr_sects = 0;
184 gd->sizes[MINOR(dev+i)] = 0;
185 }
187 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
188 }
190 out:
191 up(&bd->bd_sem);
192 bdput(bd);
193 return rc;
194 }
195 #endif
197 /*
198 * blkif_queue_request
199 *
200 * Request block I/O.
201 *
202 * id: for guest use only.
203 * operation: BLKIF_OP_{READ,WRITE,PROBE}
204 * buffer: buffer to read/write into. This should be a
205 * virtual address in the guest OS.
206 */
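/*
 * Each frame_and_sects[] entry below packs a page frame together with the
 * first and last 512-byte sector touched within that frame:
 *
 *     frame_and_sects[i] = frame | (first_sect << 3) | last_sect
 *
 * A segment may therefore never cross a page boundary (last_sect <= 7 for
 * 4KB pages), which is what the BUG() checks in the segment loop enforce.
 */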
207 static int blkif_queue_request(struct request *req)
208 {
209 struct xlbd_disk_info *di =
210 (struct xlbd_disk_info *)req->rq_disk->private_data;
211 unsigned long buffer_ma;
212 blkif_request_t *ring_req;
213 struct bio *bio;
214 struct bio_vec *bvec;
215 int idx, s;
216 unsigned int fsect, lsect;
218 if (unlikely(blkif_state != BLKIF_STATE_CONNECTED))
219 return 1;
221 /* Fill out a communications ring structure. */
222 ring_req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
223 ring_req->id = (unsigned long)req;
224 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
225 BLKIF_OP_READ;
226 ring_req->sector_number = (blkif_sector_t)req->sector;
227 ring_req->device = di->xd_device;
229 s = 0;
230 ring_req->nr_segments = 0;
231 rq_for_each_bio(bio, req) {
232 bio_for_each_segment(bvec, bio, idx) {
233 buffer_ma = page_to_phys(bvec->bv_page);
234 if (unlikely((buffer_ma & ((1<<9)-1)) != 0))
235 BUG();
237 fsect = bvec->bv_offset >> 9;
238 lsect = fsect + (bvec->bv_len >> 9) - 1;
239 if (unlikely(lsect > 7))
240 BUG();
242 ring_req->frame_and_sects[ring_req->nr_segments++] =
243 buffer_ma | (fsect << 3) | lsect;
244 s += bvec->bv_len >> 9;
245 }
246 }
248 req_prod++;
250 /* Keep a private copy so we can reissue requests when recovering. */
251 translate_req_to_pfn(
252 &blk_ring_rec->ring[MASK_BLKIF_IDX(blk_ring_rec->req_prod)].req,
253 ring_req);
255 blk_ring_rec->req_prod++;
257 return 0;
258 }
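/*
 * The translate_req_to_pfn() call above keeps a pseudo-physical copy of
 * every request in blk_ring_rec.  If the backend disconnects, the recovery
 * path in blkif_status_change() replays these saved requests onto the new
 * shared ring, converting them back to machine frames with
 * translate_req_to_mfn().
 */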
261 /*
262 * do_blkif_request
263 * read a block; request is in a request queue
264 */
265 void do_blkif_request(request_queue_t *rq)
266 {
267 struct request *req;
268 int queued;
270 DPRINTK("Entered do_blkif_request\n");
272 queued = 0;
274 while ((req = elv_next_request(rq)) != NULL) {
275 if (!blk_fs_request(req)) {
276 end_request(req, 0);
277 continue;
278 }
280 if (BLKIF_RING_FULL) {
281 blk_stop_queue(rq);
282 break;
283 }
284 DPRINTK("do_blkif_request %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
285 req, req->cmd, req->sector, req->current_nr_sectors,
286 req->nr_sectors, req->buffer,
287 rq_data_dir(req) ? "write" : "read");
288 blkdev_dequeue_request(req);
289 if (blkif_queue_request(req)) {
290 blk_stop_queue(rq);
291 break;
292 }
293 queued++;
294 }
296 if (queued != 0)
297 flush_requests();
298 }
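/*
 * When the ring is full we stop the block-layer queue rather than spin;
 * blkif_int() calls blk_start_queue() again once responses have freed ring
 * slots, so submission resumes from the interrupt handler.
 */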
301 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
302 {
303 struct request *req;
304 blkif_response_t *bret;
305 BLKIF_RING_IDX i, rp;
306 unsigned long flags;
308 spin_lock_irqsave(&blkif_io_lock, flags);
310 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
311 unlikely(recovery) )
312 {
313 spin_unlock_irqrestore(&blkif_io_lock, flags);
314 return IRQ_HANDLED;
315 }
317 rp = blk_ring->resp_prod;
318 rmb(); /* Ensure we see queued responses up to 'rp'. */
320 for ( i = resp_cons; i != rp; i++ )
321 {
322 bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
323 switch ( bret->operation )
324 {
325 case BLKIF_OP_READ:
326 case BLKIF_OP_WRITE:
327 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
328 DPRINTK("Bad return from blkdev data request: %x\n",
329 bret->status);
330 req = (struct request *)bret->id;
331 if ( unlikely(end_that_request_first
332 (req,
333 (bret->status == BLKIF_RSP_OKAY),
334 req->hard_nr_sectors)) )
335 BUG();
336 end_that_request_last(req);
337 break;
338 case BLKIF_OP_PROBE:
339 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
340 blkif_control_rsp_valid = 1;
341 break;
342 default:
343 BUG();
344 }
345 }
347 resp_cons = i;
348 resp_cons_rec = i;
350 if ( (xlbd_blk_queue != NULL) &&
351 test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
352 {
353 blk_start_queue(xlbd_blk_queue);
354 /* XXXcl call to request_fn should not be needed but
355 * we get stuck without... needs investigating
356 */
357 xlbd_blk_queue->request_fn(xlbd_blk_queue);
358 }
360 spin_unlock_irqrestore(&blkif_io_lock, flags);
362 return IRQ_HANDLED;
363 }
365 #else
366 /************************** KERNEL VERSION 2.4 **************************/
368 static kdev_t sg_dev;
369 static int sg_operation = -1;
370 static unsigned long sg_next_sect;
372 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
374 #define blkif_io_lock io_request_lock
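/*
 * Primitive scatter-gather for the 2.4 path: sg_operation, sg_dev and
 * sg_next_sect remember the tail of the most recently queued request.  If
 * the next buffer_head continues that request contiguously (same device,
 * same operation, next sector), blkif_queue_request() merges it as an
 * extra segment instead of consuming a new ring slot.
 * DISABLE_SCATTERGATHER() breaks the run when the ring is flushed, or when
 * a request reaches BLKIF_MAX_SEGMENTS_PER_REQUEST segments.
 */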
376 /*
377 * blkif_update_int/update_vbds_task - handle VBD update events.
378 * Schedule a task for keventd to run, which will update the VBDs and perform
379 * the corresponding updates to our view of VBD state.
380 */
382 #if 0
383 static struct tq_struct update_tq;
384 static void update_vbds_task(void *unused)
385 {
386 xlvbd_update_vbds();
387 }
388 #endif
390 int blkif_open(struct inode *inode, struct file *filep)
391 {
392 short xldev = inode->i_rdev;
393 struct gendisk *gd = get_gendisk(xldev);
394 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
395 short minor = MINOR(xldev);
397 if ( gd->part[minor].nr_sects == 0 )
398 {
399 /*
400 * Device either doesn't exist, or has zero capacity; we use a few
401 * cheesy heuristics to return the relevant error code
402 */
403 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
404 ((minor & (gd->max_p - 1)) != 0) )
405 {
406 /*
407 * We have a real device but no such partition, or we were given
408 * only a partition number, so guess that this is the problem.
409 */
410 return -ENXIO; /* no such device or address */
411 }
412 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
413 {
414 /* This is a removable device => assume that media is missing. */
415 return -ENOMEDIUM; /* media not present (this is a guess) */
416 }
417 else
418 {
419 /* Just go for the general 'no such device' error. */
420 return -ENODEV; /* no such device */
421 }
422 }
424 /* Update of usage count is protected by per-device semaphore. */
425 disk->usage++;
427 return 0;
428 }
431 int blkif_release(struct inode *inode, struct file *filep)
432 {
433 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
435 /*
436 * When usage drops to zero it may allow more VBD updates to occur.
437 * Update of usage count is protected by a per-device semaphore.
438 */
439 if ( --disk->usage == 0 )
440 {
441 #if 0
442 update_tq.routine = update_vbds_task;
443 schedule_task(&update_tq);
444 #endif
445 }
447 return 0;
448 }
451 int blkif_ioctl(struct inode *inode, struct file *filep,
452 unsigned command, unsigned long argument)
453 {
454 kdev_t dev = inode->i_rdev;
455 struct hd_geometry *geo = (struct hd_geometry *)argument;
456 struct gendisk *gd;
457 struct hd_struct *part;
458 int i;
460 /* NB. No need to check permissions. That is done for us. */
462 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
463 command, (long) argument, dev);
465 gd = get_gendisk(dev);
466 part = &gd->part[MINOR(dev)];
468 switch ( command )
469 {
470 case BLKGETSIZE:
471 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
472 return put_user(part->nr_sects, (unsigned long *) argument);
474 case BLKGETSIZE64:
475 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
476 (u64)part->nr_sects * 512);
477 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
479 case BLKRRPART: /* re-read partition table */
480 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
481 return blkif_revalidate(dev);
483 case BLKSSZGET:
484 return hardsect_size[MAJOR(dev)][MINOR(dev)];
486 case BLKBSZGET: /* get block size */
487 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
488 break;
490 case BLKBSZSET: /* set block size */
491 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
492 break;
494 case BLKRASET: /* set read-ahead */
495 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
496 break;
498 case BLKRAGET: /* get read-ahead */
499 DPRINTK_IOCTL(" BLKRAFET: %x\n", BLKRAGET);
500 break;
502 case HDIO_GETGEO:
503 /* note: these values are complete garbage */
504 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
505 if (!argument) return -EINVAL;
506 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
507 if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT;
508 if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT;
509 if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT;
510 return 0;
512 case HDIO_GETGEO_BIG:
513 /* note: these values are complete garbage */
514 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
515 if (!argument) return -EINVAL;
516 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
517 if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT;
518 if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT;
519 if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT;
520 return 0;
522 case CDROMMULTISESSION:
523 DPRINTK("FIXME: support multisession CDs later\n");
524 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
525 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
526 return 0;
528 case SCSI_IOCTL_GET_BUS_NUMBER:
529 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
530 return -ENOSYS;
532 default:
533 printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
534 return -ENOSYS;
535 }
537 return 0;
538 }
542 /* check media change: should probably do something here in some cases :-) */
543 int blkif_check(kdev_t dev)
544 {
545 DPRINTK("blkif_check\n");
546 return 0;
547 }
549 int blkif_revalidate(kdev_t dev)
550 {
551 struct block_device *bd;
552 struct gendisk *gd;
553 xl_disk_t *disk;
554 unsigned long capacity;
555 int i, rc = 0;
557 if ( (bd = bdget(dev)) == NULL )
558 return -EINVAL;
560 /*
561 * Update of partition info, and check of usage count, is protected
562 * by the per-block-device semaphore.
563 */
564 down(&bd->bd_sem);
566 if ( ((gd = get_gendisk(dev)) == NULL) ||
567 ((disk = xldev_to_xldisk(dev)) == NULL) ||
568 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
569 {
570 rc = -EINVAL;
571 goto out;
572 }
574 if ( disk->usage > 1 )
575 {
576 rc = -EBUSY;
577 goto out;
578 }
580 /* Only reread partition table if VBDs aren't mapped to partitions. */
581 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
582 {
583 for ( i = gd->max_p - 1; i >= 0; i-- )
584 {
585 invalidate_device(dev+i, 1);
586 gd->part[MINOR(dev+i)].start_sect = 0;
587 gd->part[MINOR(dev+i)].nr_sects = 0;
588 gd->sizes[MINOR(dev+i)] = 0;
589 }
591 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
592 }
594 out:
595 up(&bd->bd_sem);
596 bdput(bd);
597 return rc;
598 }
603 /*
604 * blkif_queue_request
605 *
606 * Request block I/O.
607 *
608 * id: for guest use only.
609 * operation: BLKIF_OP_{READ,WRITE,PROBE}
610 * buffer: buffer to read/write into. This should be a
611 * virtual address in the guest OS.
612 */
613 static int blkif_queue_request(unsigned long id,
614 int operation,
615 char * buffer,
616 unsigned long sector_number,
617 unsigned short nr_sectors,
618 kdev_t device)
619 {
620 unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer));
621 struct gendisk *gd;
622 blkif_request_t *req;
623 struct buffer_head *bh;
624 unsigned int fsect, lsect;
626 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
627 lsect = fsect + nr_sectors - 1;
629 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
630 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
631 BUG();
632 if ( lsect > 7 )
633 BUG();
635 buffer_ma &= PAGE_MASK;
637 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
638 return 1;
640 switch ( operation )
641 {
643 case BLKIF_OP_READ:
644 case BLKIF_OP_WRITE:
645 gd = get_gendisk(device);
647 /*
648 * Update the sector_number we'll pass down as appropriate; note that
649 * we could sanity check that resulting sector will be in this
650 * partition, but this will happen in driver backend anyhow.
651 */
652 sector_number += gd->part[MINOR(device)].start_sect;
654 /*
655 * If this unit doesn't consist of virtual partitions then we clear
656 * the partn bits from the device number.
657 */
658 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
659 GENHD_FL_VIRT_PARTNS) )
660 device &= ~(gd->max_p - 1);
662 if ( (sg_operation == operation) &&
663 (sg_dev == device) &&
664 (sg_next_sect == sector_number) )
665 {
666 req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod-1)].req;
667 bh = (struct buffer_head *)id;
668 bh->b_reqnext = (struct buffer_head *)req->id;
669 req->id = id;
670 req->frame_and_sects[req->nr_segments] =
671 buffer_ma | (fsect<<3) | lsect;
672 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
673 sg_next_sect += nr_sectors;
674 else
675 DISABLE_SCATTERGATHER();
677 /* Update the copy of the request in the recovery ring. */
678 translate_req_to_pfn(&blk_ring_rec->ring[
679 MASK_BLKIF_IDX(blk_ring_rec->req_prod - 1)].req, req);
681 return 0;
682 }
683 else if ( BLKIF_RING_FULL )
684 {
685 return 1;
686 }
687 else
688 {
689 sg_operation = operation;
690 sg_dev = device;
691 sg_next_sect = sector_number + nr_sectors;
692 }
693 break;
695 default:
696 panic("unknown op %d\n", operation);
697 }
699 /* Fill out a communications ring structure. */
700 req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
701 req->id = id;
702 req->operation = operation;
703 req->sector_number = (blkif_sector_t)sector_number;
704 req->device = device;
705 req->nr_segments = 1;
706 req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
707 req_prod++;
709 /* Keep a private copy so we can reissue requests when recovering. */
710 translate_req_to_pfn(&blk_ring_rec->ring[
711 MASK_BLKIF_IDX(blk_ring_rec->req_prod)].req, req);
712 blk_ring_rec->req_prod++;
714 return 0;
715 }
718 /*
719 * do_blkif_request
720 * read a block; request is in a request queue
721 */
722 void do_blkif_request(request_queue_t *rq)
723 {
724 struct request *req;
725 struct buffer_head *bh, *next_bh;
726 int rw, nsect, full, queued = 0;
728 DPRINTK("Entered do_blkif_request\n");
730 while ( !rq->plugged && !list_empty(&rq->queue_head))
731 {
732 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
733 goto out;
735 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
736 req, req->cmd, req->sector,
737 req->current_nr_sectors, req->nr_sectors, req->bh);
739 rw = req->cmd;
740 if ( rw == READA )
741 rw = READ;
742 if ( unlikely((rw != READ) && (rw != WRITE)) )
743 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
745 req->errors = 0;
747 bh = req->bh;
748 while ( bh != NULL )
749 {
750 next_bh = bh->b_reqnext;
751 bh->b_reqnext = NULL;
753 full = blkif_queue_request(
754 (unsigned long)bh,
755 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
756 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
758 if ( full )
759 {
760 bh->b_reqnext = next_bh;
761 pending_queues[nr_pending++] = rq;
762 if ( unlikely(nr_pending >= MAX_PENDING) )
763 BUG();
764 goto out;
765 }
767 queued++;
769 /* Dequeue the buffer head from the request. */
770 nsect = bh->b_size >> 9;
771 bh = req->bh = next_bh;
773 if ( bh != NULL )
774 {
775 /* There's another buffer head to do. Update the request. */
776 req->hard_sector += nsect;
777 req->hard_nr_sectors -= nsect;
778 req->sector = req->hard_sector;
779 req->nr_sectors = req->hard_nr_sectors;
780 req->current_nr_sectors = bh->b_size >> 9;
781 req->buffer = bh->b_data;
782 }
783 else
784 {
785 /* That was the last buffer head. Finalise the request. */
786 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
787 BUG();
788 blkdev_dequeue_request(req);
789 end_that_request_last(req);
790 }
791 }
792 }
794 out:
795 if ( queued != 0 )
796 flush_requests();
797 }
800 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
801 {
802 BLKIF_RING_IDX i, rp;
803 unsigned long flags;
804 struct buffer_head *bh, *next_bh;
806 spin_lock_irqsave(&io_request_lock, flags);
808 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
809 {
810 spin_unlock_irqrestore(&io_request_lock, flags);
811 return;
812 }
814 rp = blk_ring->resp_prod;
815 rmb(); /* Ensure we see queued responses up to 'rp'. */
817 for ( i = resp_cons; i != rp; i++ )
818 {
819 blkif_response_t *bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
820 switch ( bret->operation )
821 {
822 case BLKIF_OP_READ:
823 case BLKIF_OP_WRITE:
824 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
825 DPRINTK("Bad return from blkdev data request: %lx\n",
826 bret->status);
827 for ( bh = (struct buffer_head *)bret->id;
828 bh != NULL;
829 bh = next_bh )
830 {
831 next_bh = bh->b_reqnext;
832 bh->b_reqnext = NULL;
833 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
834 }
835 break;
836 case BLKIF_OP_PROBE:
837 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
838 blkif_control_rsp_valid = 1;
839 break;
840 default:
841 BUG();
842 }
843 }
845 resp_cons = i;
846 resp_cons_rec = i;
848 kick_pending_request_queues();
850 spin_unlock_irqrestore(&io_request_lock, flags);
851 }
853 #endif
855 /***************************** COMMON CODE *******************************/
858 static inline void translate_req_to_pfn(blkif_request_t *xreq,
859 blkif_request_t *req)
860 {
861 int i;
863 *xreq = *req;
865 for ( i = 0; i < req->nr_segments; i++ )
866 {
867 xreq->frame_and_sects[i] = (req->frame_and_sects[i] & ~PAGE_MASK) |
868 (machine_to_phys_mapping[req->frame_and_sects[i] >> PAGE_SHIFT] <<
869 PAGE_SHIFT);
870 }
871 }
873 static inline void translate_req_to_mfn(blkif_request_t *xreq,
874 blkif_request_t *req)
875 {
876 int i;
878 *xreq = *req;
880 for ( i = 0; i < req->nr_segments; i++ )
881 {
882 xreq->frame_and_sects[i] = (req->frame_and_sects[i] & ~PAGE_MASK) |
883 (phys_to_machine_mapping[req->frame_and_sects[i] >> PAGE_SHIFT] <<
884 PAGE_SHIFT);
885 }
886 }
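/*
 * The two helpers above convert the frame portion of each segment between
 * machine frame numbers (what the shared ring carries) and pseudo-physical
 * frame numbers (what the recovery copy stores), leaving the sector bits
 * in the low part of the word untouched.  Storing pseudo-physical frames
 * keeps the saved copy meaningful even if the guest's machine frame
 * assignments change, e.g. across a save/restore cycle.
 */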
888 static inline void flush_requests(void)
889 {
890 DISABLE_SCATTERGATHER();
891 wmb(); /* Ensure that the backend can see the requests. */
892 blk_ring->req_prod = req_prod;
893 notify_via_evtchn(blkif_evtchn);
894 }
897 static void kick_pending_request_queues(void)
898 {
899 /* We kick pending request queues if the ring is reasonably empty. */
900 if ( (nr_pending != 0) &&
901 ((req_prod - resp_cons) < (BLKIF_RING_SIZE >> 1)) )
902 {
903 /* Attempt to drain the queue, but bail if the ring becomes full. */
904 while ( (nr_pending != 0) && !BLKIF_RING_FULL )
905 do_blkif_request(pending_queues[--nr_pending]);
906 }
907 }
909 void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
910 {
911 unsigned long flags;
913 retry:
914 while ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
915 {
916 set_current_state(TASK_INTERRUPTIBLE);
917 schedule_timeout(1);
918 }
920 spin_lock_irqsave(&blkif_io_lock, flags);
921 if ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
922 {
923 spin_unlock_irqrestore(&blkif_io_lock, flags);
924 goto retry;
925 }
927 DISABLE_SCATTERGATHER();
928 blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req = *req;
930 translate_req_to_pfn(&blk_ring_rec->ring[
931 MASK_BLKIF_IDX(blk_ring_rec->req_prod++)].req,req);
933 req_prod++;
934 flush_requests();
936 spin_unlock_irqrestore(&blkif_io_lock, flags);
938 while ( !blkif_control_rsp_valid )
939 {
940 set_current_state(TASK_INTERRUPTIBLE);
941 schedule_timeout(1);
942 }
944 memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
945 blkif_control_rsp_valid = 0;
946 }
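/*
 * blkif_control_send() pushes a control request (e.g. a PROBE) through the
 * same shared ring as ordinary block I/O.  The interrupt handler spots the
 * BLKIF_OP_PROBE response, copies it into blkif_control_rsp and sets
 * blkif_control_rsp_valid, which the polling loop above waits on.
 */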
949 static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
950 {
951 ctrl_msg_t cmsg;
951 ctrl_msg_t cmsg;
952 blkif_fe_interface_connect_t up;
954 if ( status->handle != 0 )
955 {
956 printk(KERN_WARNING "Status change on unsupported blkif %d\n",
957 status->handle);
958 return;
959 }
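/*
 * Interface state machine, driven by status messages from the domain
 * controller:
 *
 *   CLOSED       --DISCONNECTED msg--> allocate shared ring, send an
 *                                      interface-CONNECT message, enter
 *                                      DISCONNECTED
 *   DISCONNECTED --CONNECTED msg-----> bind event channel and IRQ, then
 *                                      either probe for discs (first
 *                                      connect) or replay blk_ring_rec
 *                                      (recovery), enter CONNECTED
 *
 * A DISCONNECTED message while not CLOSED additionally triggers the
 * recovery path, so outstanding requests are reissued when the backend
 * reconnects.
 */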
961 switch ( status->status )
962 {
963 case BLKIF_INTERFACE_STATUS_DESTROYED:
964 printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n",
965 blkif_state);
966 break;
968 case BLKIF_INTERFACE_STATUS_DISCONNECTED:
969 if ( blkif_state != BLKIF_STATE_CLOSED )
970 {
971 printk(KERN_WARNING "Unexpected blkif-DISCONNECTED message"
972 " in state %d\n", blkif_state);
974 printk(KERN_INFO "VBD driver recovery in progress\n");
976 /* Prevent new requests being issued until we fix things up. */
977 spin_lock_irq(&blkif_io_lock);
978 recovery = 1;
979 blkif_state = BLKIF_STATE_DISCONNECTED;
980 spin_unlock_irq(&blkif_io_lock);
982 /* Free resources associated with old device channel. */
983 free_page((unsigned long)blk_ring);
984 free_irq(blkif_irq, NULL);
985 unbind_evtchn_from_irq(blkif_evtchn);
986 }
988 /* Move from CLOSED to DISCONNECTED state. */
989 blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
990 blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
991 blkif_state = BLKIF_STATE_DISCONNECTED;
993 /* Construct an interface-CONNECT message for the domain controller. */
994 cmsg.type = CMSG_BLKIF_FE;
995 cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT;
996 cmsg.length = sizeof(blkif_fe_interface_connect_t);
997 up.handle = 0;
998 up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
999 memcpy(cmsg.msg, &up, sizeof(up));
1001 /* Tell the controller to bring up the interface. */
1002 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1003 break;
1005 case BLKIF_INTERFACE_STATUS_CONNECTED:
1006 if ( blkif_state == BLKIF_STATE_CLOSED )
1007 {
1008 printk(KERN_WARNING "Unexpected blkif-CONNECTED message"
1009 " in state %d\n", blkif_state);
1010 break;
1011 }
1013 blkif_evtchn = status->evtchn;
1014 blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
1015 (void)request_irq(blkif_irq, blkif_int,
1016 SA_SAMPLE_RANDOM, "blkif", NULL);
1018 if ( recovery )
1019 {
1020 int i,j;
1022 /*
1023 * Shouldn't need the io_request_lock here - the device is plugged
1024 * and the recovery flag prevents the interrupt handler changing
1025 * anything.
1026 */
1028 /* Reissue requests from the private block ring. */
1029 for ( i = 0;
1030 resp_cons_rec < blk_ring_rec->req_prod;
1031 resp_cons_rec++, i++ )
1032 {
1033 translate_req_to_mfn(
1034 &blk_ring->ring[i].req,
1035 &blk_ring_rec->ring[MASK_BLKIF_IDX(resp_cons_rec)].req);
1038 /* Reset the private block ring to match the new ring. */
1039 for( j = 0; j < i; j++ )
1041 translate_req_to_pfn(
1042 &blk_ring_rec->ring[j].req,
1043 &blk_ring->ring[j].req);
1046 resp_cons_rec = 0;
1048 /* blk_ring->req_prod will be set when we flush_requests().*/
1049 blk_ring_rec->req_prod = req_prod = i;
1050 wmb();
1052 /* Switch off recovery mode, using a memory barrier to ensure that
1053 * it's seen before we flush requests - we don't want to miss any
1054 * interrupts. */
1055 recovery = 0;
1056 wmb();
1058 /* Kicks things back into life. */
1059 flush_requests();
1060 }
1061 else
1062 {
1063 /* Probe for discs that are attached to the interface. */
1064 xlvbd_init();
1065 }
1067 blkif_state = BLKIF_STATE_CONNECTED;
1069 /* Kick pending requests. */
1070 spin_lock_irq(&blkif_io_lock);
1071 kick_pending_request_queues();
1072 spin_unlock_irq(&blkif_io_lock);
1074 break;
1076 default:
1077 printk(KERN_WARNING "Status change to unknown value %d\n",
1078 status->status);
1079 break;
1080 }
1081 }
1084 static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
1085 {
1086 switch ( msg->subtype )
1087 {
1088 case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED:
1089 if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) )
1090 goto parse_error;
1091 blkif_status_change((blkif_fe_interface_status_changed_t *)
1092 &msg->msg[0]);
1093 break;
1094 #if 0
1095 case CMSG_BLKIF_FE_VBD_STATUS_CHANGED:
1096 update_tq.routine = update_vbds_task;
1097 schedule_task(&update_tq);
1098 break;
1099 #endif
1100 default:
1101 goto parse_error;
1102 }
1104 ctrl_if_send_response(msg);
1105 return;
1107 parse_error:
1108 msg->length = 0;
1109 ctrl_if_send_response(msg);
1110 }
1113 int __init xlblk_init(void)
1114 {
1115 ctrl_msg_t cmsg;
1116 blkif_fe_driver_status_changed_t st;
1118 if ( (start_info.flags & SIF_INITDOMAIN)
1119 || (start_info.flags & SIF_BLK_BE_DOMAIN) )
1120 return 0;
1122 printk(KERN_INFO "Initialising Xen virtual block device\n");
1124 blk_ring_rec = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
1125 memset(blk_ring_rec, 0, sizeof(*blk_ring_rec));
1127 (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
1128 CALLBACK_IN_BLOCKING_CONTEXT);
1130 /* Send a driver-UP notification to the domain controller. */
1131 cmsg.type = CMSG_BLKIF_FE;
1132 cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED;
1133 cmsg.length = sizeof(blkif_fe_driver_status_changed_t);
1134 st.status = BLKIF_DRIVER_STATUS_UP;
1135 memcpy(cmsg.msg, &st, sizeof(st));
1136 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1138 /*
1139 * We should read 'nr_interfaces' from response message and wait
1140 * for notifications before proceeding. For now we assume that we
1141 * will be notified of exactly one interface.
1142 */
1143 while ( blkif_state != BLKIF_STATE_CONNECTED )
1144 {
1145 set_current_state(TASK_INTERRUPTIBLE);
1146 schedule_timeout(1);
1147 }
1149 return 0;
1150 }
1152 void blkdev_suspend(void)
1153 {
1154 }
1156 void blkdev_resume(void)
1157 {
1158 ctrl_msg_t cmsg;
1159 blkif_fe_driver_status_changed_t st;
1161 /* Send a driver-UP notification to the domain controller. */
1162 cmsg.type = CMSG_BLKIF_FE;
1163 cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED;
1164 cmsg.length = sizeof(blkif_fe_driver_status_changed_t);
1165 st.status = BLKIF_DRIVER_STATUS_UP;
1166 memcpy(cmsg.msg, &st, sizeof(st));
1167 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
1168 }