ia64/xen-unstable

view linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c @ 6422:e24fd7012ffb

merge?
author cl349@firebug.cl.cam.ac.uk
date Thu Aug 25 10:09:39 2005 +0000 (2005-08-25)
parents 2f20c2fce2c5 be5c24f2709c
children 4abd299ef2f6
line source
1 /******************************************************************************
2 * blkfront.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This file may be distributed separately from the Linux kernel, or
13 * incorporated into other software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
34 #if 1
35 #define ASSERT(_p) \
36 if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
37 __LINE__, __FILE__); *(int*)0=0; }
38 #else
39 #define ASSERT(_p)
40 #endif
42 #include <linux/version.h>
44 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
45 #include "block.h"
46 #else
47 #include "common.h"
48 #include <linux/blk.h>
49 #include <linux/tqueue.h>
50 #endif
52 #include <linux/cdrom.h>
53 #include <linux/sched.h>
54 #include <linux/interrupt.h>
55 #include <scsi/scsi.h>
56 #include <asm-xen/evtchn.h>
57 #include <asm-xen/xenbus.h>
58 #include <asm-xen/xen-public/grant_table.h>
59 #include <asm-xen/gnttab.h>
61 typedef unsigned char byte; /* from linux/ide.h */
63 /* Control whether runtime update of vbds is enabled. */
64 #define ENABLE_VBD_UPDATE 1
66 #define BLKIF_STATE_DISCONNECTED 0
67 #define BLKIF_STATE_CONNECTED 1
69 static unsigned int blkif_state = BLKIF_STATE_DISCONNECTED;
71 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
73 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
74 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE)
75 #define GRANTREF_INVALID (1<<15)
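/*
 * GRANTREF_INVALID marks a frame_and_sects entry whose grant reference was
 * invalidated while the domain was suspended; blkif_recover() below
 * re-establishes foreign access for such segments and clears the bit before
 * reissuing the request.
 */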
77 static struct blk_shadow {
78 blkif_request_t req;
79 unsigned long request;
80 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
81 } blk_shadow[BLK_RING_SIZE];
82 unsigned long blk_shadow_free;
84 static int recovery = 0; /* Recovery in progress: protected by blkif_io_lock */
86 static void kick_pending_request_queues(struct blkfront_info *info);
88 static int __init xlblk_init(void);
90 static void blkif_completion(struct blk_shadow *s);
92 static inline int GET_ID_FROM_FREELIST(void)
93 {
94 unsigned long free = blk_shadow_free;
95 BUG_ON(free > BLK_RING_SIZE);
96 blk_shadow_free = blk_shadow[free].req.id;
97 blk_shadow[free].req.id = 0x0fffffee; /* debug */
98 return free;
99 }
101 static inline void ADD_ID_TO_FREELIST(unsigned long id)
102 {
103 blk_shadow[id].req.id = blk_shadow_free;
104 blk_shadow[id].request = 0;
105 blk_shadow_free = id;
106 }
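/*
 * The blk_shadow[] array mirrors every request placed on the shared ring so
 * that it can be reissued after suspend/resume (see blkif_recover).  The two
 * helpers above chain unused entries into a free list through their req.id
 * field, with blk_shadow_free holding the index of the first free entry.
 */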
109 /************************ COMMON CODE (inlined) ************************/
111 /* Kernel-specific definitions used in the common code */
112 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
113 #define DISABLE_SCATTERGATHER()
114 #else
115 static int sg_operation = -1;
116 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
117 #endif
119 static inline void pickle_request(struct blk_shadow *s, blkif_request_t *r)
120 {
122 s->req = *r;
123 }
125 static inline void unpickle_request(blkif_request_t *r, struct blk_shadow *s)
126 {
128 *r = s->req;
129 }
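/*
 * flush_requests() makes queued requests visible to the backend: it closes
 * any scatter-gather run (2.4 only), pushes the private producer index to
 * the shared ring, and notifies the backend over the event channel.
 */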
132 static inline void flush_requests(struct blkfront_info *info)
133 {
134 DISABLE_SCATTERGATHER();
135 RING_PUSH_REQUESTS(&info->ring);
136 notify_via_evtchn(info->evtchn);
137 }
140 /************************** KERNEL VERSION 2.6 **************************/
142 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
144 module_init(xlblk_init);
146 static void kick_pending_request_queues(struct blkfront_info *info)
147 {
148 if (!RING_FULL(&info->ring)) {
149 /* Re-enable calldowns. */
150 blk_start_queue(info->rq);
151 /* Kick things off immediately. */
152 do_blkif_request(info->rq);
153 }
154 }
156 static void blkif_restart_queue(void *arg)
157 {
158 struct blkfront_info *info = (struct blkfront_info *)arg;
159 spin_lock_irq(&blkif_io_lock);
160 kick_pending_request_queues(info);
161 spin_unlock_irq(&blkif_io_lock);
162 }
164 static void blkif_restart_queue_callback(void *arg)
165 {
166 struct blkfront_info *info = (struct blkfront_info *)arg;
167 schedule_work(&info->work);
168 }
170 int blkif_open(struct inode *inode, struct file *filep)
171 {
172 // struct gendisk *gd = inode->i_bdev->bd_disk;
173 // struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
175 /* Update of usage count is protected by per-device semaphore. */
176 // di->mi->usage++;
178 return 0;
179 }
182 int blkif_release(struct inode *inode, struct file *filep)
183 {
184 /* FIXME: This is where we can actually free up majors, etc. --RR */
185 return 0;
186 }
189 int blkif_ioctl(struct inode *inode, struct file *filep,
190 unsigned command, unsigned long argument)
191 {
192 int i;
194 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
195 command, (long)argument, inode->i_rdev);
197 switch ( command )
198 {
199 case HDIO_GETGEO:
200 /* return ENOSYS to use defaults */
201 return -ENOSYS;
203 case CDROMMULTISESSION:
204 DPRINTK("FIXME: support multisession CDs later\n");
205 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
206 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
207 return 0;
209 default:
210 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
211 command);*/
212 return -EINVAL; /* same return as native Linux */
213 }
215 return 0;
216 }
219 /*
220 * blkif_queue_request
221 *
222 * request block io
223 *
224 * id: for guest use only.
225 * operation: BLKIF_OP_{READ,WRITE,PROBE}
226 * buffer: buffer to read/write into. this should be a
227 * virtual address in the guest os.
228 */
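/*
 * A batch of BLKIF_MAX_SEGMENTS_PER_REQUEST grant references is reserved
 * before the ring slot is filled in; if the grant table is exhausted,
 * blkif_restart_queue_callback() is registered so the queue is kicked once
 * references become free.  Each bio segment's machine frame is granted to
 * info->backend_id and also recorded in blk_shadow[] so access can be
 * re-established after suspend/resume.
 */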
229 static int blkif_queue_request(struct request *req)
230 {
231 struct blkfront_info *info = req->rq_disk->private_data;
232 unsigned long buffer_ma;
233 blkif_request_t *ring_req;
234 struct bio *bio;
235 struct bio_vec *bvec;
236 int idx;
237 unsigned long id;
238 unsigned int fsect, lsect;
239 int ref;
240 grant_ref_t gref_head;
242 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
243 return 1;
245 if (gnttab_alloc_grant_references(BLKIF_MAX_SEGMENTS_PER_REQUEST,
246 &gref_head) < 0) {
247 gnttab_request_free_callback(&info->callback,
248 blkif_restart_queue_callback, info,
249 BLKIF_MAX_SEGMENTS_PER_REQUEST);
250 return 1;
251 }
253 /* Fill out a communications ring structure. */
254 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
255 id = GET_ID_FROM_FREELIST();
256 blk_shadow[id].request = (unsigned long)req;
258 ring_req->id = id;
259 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ;
260 ring_req->sector_number = (blkif_sector_t)req->sector;
261 ring_req->handle = info->handle;
263 ring_req->nr_segments = 0;
264 rq_for_each_bio(bio, req)
265 {
266 bio_for_each_segment(bvec, bio, idx)
267 {
268 if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
269 BUG();
270 buffer_ma = page_to_phys(bvec->bv_page);
271 fsect = bvec->bv_offset >> 9;
272 lsect = fsect + (bvec->bv_len >> 9) - 1;
273 /* install a grant reference. */
274 ref = gnttab_claim_grant_reference(&gref_head);
275 ASSERT( ref != -ENOSPC );
277 gnttab_grant_foreign_access_ref(
278 ref,
279 info->backend_id,
280 buffer_ma >> PAGE_SHIFT,
281 rq_data_dir(req) );
283 blk_shadow[id].frame[ring_req->nr_segments] =
284 buffer_ma >> PAGE_SHIFT;
286 ring_req->frame_and_sects[ring_req->nr_segments] =
287 blkif_fas_from_gref(ref, fsect, lsect);
289 ring_req->nr_segments++;
290 }
291 }
293 info->ring.req_prod_pvt++;
295 /* Keep a private copy so we can reissue requests when recovering. */
296 pickle_request(&blk_shadow[id], ring_req);
298 gnttab_free_grant_references(gref_head);
300 return 0;
301 }
303 /*
304 * do_blkif_request
305 * read a block; request is in a request queue
306 */
307 void do_blkif_request(request_queue_t *rq)
308 {
309 struct blkfront_info *info = NULL;
310 struct request *req;
311 int queued;
313 DPRINTK("Entered do_blkif_request\n");
315 queued = 0;
317 while ( (req = elv_next_request(rq)) != NULL )
318 {
319 info = req->rq_disk->private_data;
321 if ( !blk_fs_request(req) )
322 {
323 end_request(req, 0);
324 continue;
325 }
327 if (RING_FULL(&info->ring))
328 goto wait;
330 DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
331 req, req->cmd, req->sector, req->current_nr_sectors,
332 req->nr_sectors, req->buffer,
333 rq_data_dir(req) ? "write" : "read");
335 blkdev_dequeue_request(req);
336 if (blkif_queue_request(req)) {
337 blk_requeue_request(rq, req);
338 wait:
339 /* Avoid pointless unplugs. */
340 blk_stop_queue(rq);
341 break;
342 }
344 queued++;
345 }
347 if ( queued != 0 )
348 flush_requests(info);
349 }
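/*
 * blkif_int() is the response path: it walks the shared ring from rsp_cons
 * up to the backend's rsp_prod, completes the corresponding Linux requests,
 * returns each shadow entry (and its grant references, via blkif_completion)
 * to the free list, and restarts the request queue.
 */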
352 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
353 {
354 struct request *req;
355 blkif_response_t *bret;
356 RING_IDX i, rp;
357 unsigned long flags;
358 struct blkfront_info *info = (struct blkfront_info *)dev_id;
360 spin_lock_irqsave(&blkif_io_lock, flags);
362 if (unlikely(info->connected != BLKIF_STATE_CONNECTED || recovery)) {
363 spin_unlock_irqrestore(&blkif_io_lock, flags);
364 return IRQ_HANDLED;
365 }
367 rp = info->ring.sring->rsp_prod;
368 rmb(); /* Ensure we see queued responses up to 'rp'. */
370 for ( i = info->ring.rsp_cons; i != rp; i++ )
371 {
372 unsigned long id;
374 bret = RING_GET_RESPONSE(&info->ring, i);
375 id = bret->id;
376 req = (struct request *)blk_shadow[id].request;
378 blkif_completion(&blk_shadow[id]);
380 ADD_ID_TO_FREELIST(id);
382 switch ( bret->operation )
383 {
384 case BLKIF_OP_READ:
385 case BLKIF_OP_WRITE:
386 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
387 DPRINTK("Bad return from blkdev data request: %x\n",
388 bret->status);
390 if ( unlikely(end_that_request_first
391 (req,
392 (bret->status == BLKIF_RSP_OKAY),
393 req->hard_nr_sectors)) )
394 BUG();
395 end_that_request_last(req);
397 break;
398 default:
399 BUG();
400 }
401 }
403 info->ring.rsp_cons = i;
405 kick_pending_request_queues(info);
407 spin_unlock_irqrestore(&blkif_io_lock, flags);
409 return IRQ_HANDLED;
410 }
412 #else
413 /************************** KERNEL VERSION 2.4 **************************/
415 static kdev_t sg_dev;
416 static unsigned long sg_next_sect;
418 /*
419 * Request queues with outstanding work, but ring is currently full.
420 * We need no special lock here, as we always access this with the
421 * blkif_io_lock held. We only need a small maximum list.
422 */
423 #define MAX_PENDING 8
424 static request_queue_t *pending_queues[MAX_PENDING];
425 static int nr_pending;
428 #define blkif_io_lock io_request_lock
430 /*============================================================================*/
431 static void kick_pending_request_queues(void)
432 {
433 /* We kick pending request queues if the ring is reasonably empty. */
434 if ( (nr_pending != 0) &&
435 (RING_PENDING_REQUESTS(&info->ring) < (BLK_RING_SIZE >> 1)) )
436 {
437 /* Attempt to drain the queue, but bail if the ring becomes full. */
438 while ( (nr_pending != 0) && !RING_FULL(&info->ring) )
439 do_blkif_request(pending_queues[--nr_pending]);
440 }
441 }
443 int blkif_open(struct inode *inode, struct file *filep)
444 {
445 short xldev = inode->i_rdev;
446 struct gendisk *gd = get_gendisk(xldev);
447 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
448 short minor = MINOR(xldev);
450 if ( gd->part[minor].nr_sects == 0 )
451 {
452 /*
453 * Device either doesn't exist, or has zero capacity; we use a few
454 * cheesy heuristics to return the relevant error code
455 */
456 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
457 ((minor & (gd->max_p - 1)) != 0) )
458 {
459 /*
460 * We have a real device, but no such partition, or we just have a
461 * partition number so guess this is the problem.
462 */
463 return -ENXIO; /* no such device or address */
464 }
465 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
466 {
467 /* This is a removable device => assume that media is missing. */
468 return -ENOMEDIUM; /* media not present (this is a guess) */
469 }
470 else
471 {
472 /* Just go for the general 'no such device' error. */
473 return -ENODEV; /* no such device */
474 }
475 }
477 /* Update of usage count is protected by per-device semaphore. */
478 disk->usage++;
480 return 0;
481 }
484 int blkif_release(struct inode *inode, struct file *filep)
485 {
486 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
488 /*
489 * When usage drops to zero it may allow more VBD updates to occur.
490 * Update of usage count is protected by a per-device semaphore.
491 */
492 if ( --disk->usage == 0 ) {
493 vbd_update();
494 }
496 return 0;
497 }
500 int blkif_ioctl(struct inode *inode, struct file *filep,
501 unsigned command, unsigned long argument)
502 {
503 kdev_t dev = inode->i_rdev;
504 struct hd_geometry *geo = (struct hd_geometry *)argument;
505 struct gendisk *gd;
506 struct hd_struct *part;
507 int i;
508 unsigned short cylinders;
509 byte heads, sectors;
511 /* NB. No need to check permissions. That is done for us. */
513 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
514 command, (long) argument, dev);
516 gd = get_gendisk(dev);
517 part = &gd->part[MINOR(dev)];
519 switch ( command )
520 {
521 case BLKGETSIZE:
522 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
523 return put_user(part->nr_sects, (unsigned long *) argument);
525 case BLKGETSIZE64:
526 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
527 (u64)part->nr_sects * 512);
528 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
530 case BLKRRPART: /* re-read partition table */
531 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
532 return blkif_revalidate(dev);
534 case BLKSSZGET:
535 return hardsect_size[MAJOR(dev)][MINOR(dev)];
537 case BLKBSZGET: /* get block size */
538 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
539 break;
541 case BLKBSZSET: /* set block size */
542 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
543 break;
545 case BLKRASET: /* set read-ahead */
546 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
547 break;
549 case BLKRAGET: /* get read-ahead */
550 DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET);
551 break;
553 case HDIO_GETGEO:
554 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
555 if (!argument) return -EINVAL;
557 /* We don't have real geometry info, but let's at least return
558 values consistent with the size of the device */
560 heads = 0xff;
561 sectors = 0x3f;
562 cylinders = part->nr_sects / (heads * sectors);
564 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
565 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
566 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
567 if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
569 return 0;
571 case HDIO_GETGEO_BIG:
572 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
573 if (!argument) return -EINVAL;
575 /* We don't have real geometry info, but let's at least return
576 values consistent with the size of the device */
578 heads = 0xff;
579 sectors = 0x3f;
580 cylinders = part->nr_sects / (heads * sectors);
582 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
583 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
584 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
585 if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
587 return 0;
589 case CDROMMULTISESSION:
590 DPRINTK("FIXME: support multisession CDs later\n");
591 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
592 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
593 return 0;
595 case SCSI_IOCTL_GET_BUS_NUMBER:
596 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
597 return -ENOSYS;
599 default:
600 WPRINTK("ioctl %08x not supported by XL blkif\n", command);
601 return -ENOSYS;
602 }
604 return 0;
605 }
609 /* check media change: should probably do something here in some cases :-) */
610 int blkif_check(kdev_t dev)
611 {
612 DPRINTK("blkif_check\n");
613 return 0;
614 }
616 int blkif_revalidate(kdev_t dev)
617 {
618 struct block_device *bd;
619 struct gendisk *gd;
620 xl_disk_t *disk;
621 unsigned long capacity;
622 int i, rc = 0;
624 if ( (bd = bdget(dev)) == NULL )
625 return -EINVAL;
627 /*
628 * Update of partition info, and check of usage count, is protected
629 * by the per-block-device semaphore.
630 */
631 down(&bd->bd_sem);
633 if ( ((gd = get_gendisk(dev)) == NULL) ||
634 ((disk = xldev_to_xldisk(dev)) == NULL) ||
635 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
636 {
637 rc = -EINVAL;
638 goto out;
639 }
641 if ( disk->usage > 1 )
642 {
643 rc = -EBUSY;
644 goto out;
645 }
647 /* Only reread partition table if VBDs aren't mapped to partitions. */
648 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
649 {
650 for ( i = gd->max_p - 1; i >= 0; i-- )
651 {
652 invalidate_device(dev+i, 1);
653 gd->part[MINOR(dev+i)].start_sect = 0;
654 gd->part[MINOR(dev+i)].nr_sects = 0;
655 gd->sizes[MINOR(dev+i)] = 0;
656 }
658 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
659 }
661 out:
662 up(&bd->bd_sem);
663 bdput(bd);
664 return rc;
665 }
668 /*
669 * blkif_queue_request
670 *
671 * request block io
672 *
673 * id: for guest use only.
674 * operation: BLKIF_OP_{READ,WRITE,PROBE}
675 * buffer: buffer to read/write into. this should be a
676 * virtual address in the guest os.
677 */
678 static int blkif_queue_request(unsigned long id,
679 int operation,
680 char * buffer,
681 unsigned long sector_number,
682 unsigned short nr_sectors,
683 kdev_t device,
684 blkif_vdev_t handle)
685 {
686 unsigned long buffer_ma = virt_to_bus(buffer);
687 unsigned long xid;
688 struct gendisk *gd;
689 blkif_request_t *req;
690 struct buffer_head *bh;
691 unsigned int fsect, lsect;
692 int ref;
694 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
695 lsect = fsect + nr_sectors - 1;
697 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
698 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
699 BUG();
700 if ( lsect > ((PAGE_SIZE/512)-1) )
701 BUG();
703 buffer_ma &= PAGE_MASK;
705 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
706 return 1;
708 switch ( operation )
709 {
711 case BLKIF_OP_READ:
712 case BLKIF_OP_WRITE:
713 gd = get_gendisk(device);
715 /*
716 * Update the sector_number we'll pass down as appropriate; note that
717 * we could sanity check that resulting sector will be in this
718 * partition, but this will happen in driver backend anyhow.
719 */
720 sector_number += gd->part[MINOR(device)].start_sect;
722 /*
723 * If this unit doesn't consist of virtual partitions then we clear
724 * the partn bits from the device number.
725 */
726 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
727 GENHD_FL_VIRT_PARTNS) )
728 device &= ~(gd->max_p - 1);
730 if ( (sg_operation == operation) &&
731 (sg_dev == device) &&
732 (sg_next_sect == sector_number) )
733 {
734 req = RING_GET_REQUEST(&info->ring,
735 info->ring.req_prod_pvt - 1);
736 bh = (struct buffer_head *)id;
738 bh->b_reqnext = (struct buffer_head *)blk_shadow[req->id].request;
739 blk_shadow[req->id].request = (unsigned long)id;
741 /* install a grant reference. */
742 ref = gnttab_claim_grant_reference(&gref_head);
743 ASSERT( ref != -ENOSPC );
745 gnttab_grant_foreign_access_ref(
746 ref,
747 info->backend_id,
748 buffer_ma >> PAGE_SHIFT,
749 ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );
751 blk_shadow[req->id].frame[req->nr_segments] =
752 buffer_ma >> PAGE_SHIFT;
754 req->frame_and_sects[req->nr_segments] =
755 blkif_fas_from_gref(ref, fsect, lsect);
756 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
757 sg_next_sect += nr_sectors;
758 else
759 DISABLE_SCATTERGATHER();
761 /* Update the copy of the request in the recovery ring. */
762 pickle_request(&blk_shadow[req->id], req );
764 return 0;
765 }
766 else if ( RING_FULL(&info->ring) )
767 {
768 return 1;
769 }
770 else
771 {
772 sg_operation = operation;
773 sg_dev = device;
774 sg_next_sect = sector_number + nr_sectors;
775 }
776 break;
778 default:
779 panic("unknown op %d\n", operation);
780 }
782 /* Fill out a communications ring structure. */
783 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
785 xid = GET_ID_FROM_FREELIST();
786 blk_shadow[xid].request = (unsigned long)id;
788 req->id = xid;
789 req->operation = operation;
790 req->sector_number = (blkif_sector_t)sector_number;
791 req->handle = handle;
792 req->nr_segments = 1;
793 /* install a grant reference. */
794 ref = gnttab_claim_grant_reference(&gref_head);
795 ASSERT( ref != -ENOSPC );
797 gnttab_grant_foreign_access_ref(
798 ref,
799 info->backend_id,
800 buffer_ma >> PAGE_SHIFT,
801 ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );
803 blk_shadow[xid].frame[0] = buffer_ma >> PAGE_SHIFT;
805 req->frame_and_sects[0] = blkif_fas_from_gref(ref, fsect, lsect);
807 /* Keep a private copy so we can reissue requests when recovering. */
808 pickle_request(&blk_shadow[xid], req);
810 info->ring.req_prod_pvt++;
812 return 0;
813 }
816 /*
817 * do_blkif_request
818 * read a block; request is in a request queue
819 */
820 void do_blkif_request(request_queue_t *rq)
821 {
822 struct request *req;
823 struct buffer_head *bh, *next_bh;
824 int rw, nsect, full, queued = 0;
826 DPRINTK("Entered do_blkif_request\n");
828 while ( !rq->plugged && !list_empty(&rq->queue_head))
829 {
830 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
831 goto out;
833 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
834 req, req->cmd, req->sector,
835 req->current_nr_sectors, req->nr_sectors, req->bh);
837 rw = req->cmd;
838 if ( rw == READA )
839 rw = READ;
840 if ( unlikely((rw != READ) && (rw != WRITE)) )
841 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
843 req->errors = 0;
845 bh = req->bh;
846 while ( bh != NULL )
847 {
848 next_bh = bh->b_reqnext;
849 bh->b_reqnext = NULL;
851 full = blkif_queue_request(
852 (unsigned long)bh,
853 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
854 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
856 if ( full )
857 {
858 bh->b_reqnext = next_bh;
859 pending_queues[nr_pending++] = rq;
860 if ( unlikely(nr_pending >= MAX_PENDING) )
861 BUG();
862 goto out;
863 }
865 queued++;
867 /* Dequeue the buffer head from the request. */
868 nsect = bh->b_size >> 9;
869 bh = req->bh = next_bh;
871 if ( bh != NULL )
872 {
873 /* There's another buffer head to do. Update the request. */
874 req->hard_sector += nsect;
875 req->hard_nr_sectors -= nsect;
876 req->sector = req->hard_sector;
877 req->nr_sectors = req->hard_nr_sectors;
878 req->current_nr_sectors = bh->b_size >> 9;
879 req->buffer = bh->b_data;
880 }
881 else
882 {
883 /* That was the last buffer head. Finalise the request. */
884 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
885 BUG();
886 blkdev_dequeue_request(req);
887 end_that_request_last(req);
888 }
889 }
890 }
892 out:
893 if ( queued != 0 )
894 flush_requests();
895 }
898 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
899 {
900 RING_IDX i, rp;
901 unsigned long flags;
902 struct buffer_head *bh, *next_bh;
904 spin_lock_irqsave(&io_request_lock, flags);
906 if ( unlikely(info->connected != BLKIF_STATE_CONNECTED || recovery) )
907 {
908 spin_unlock_irqrestore(&io_request_lock, flags);
909 return;
910 }
912 rp = info->ring.sring->rsp_prod;
913 rmb(); /* Ensure we see queued responses up to 'rp'. */
915 for ( i = info->ring.rsp_cons; i != rp; i++ )
916 {
917 unsigned long id;
918 blkif_response_t *bret;
920 bret = RING_GET_RESPONSE(&info->ring, i);
921 id = bret->id;
922 bh = (struct buffer_head *)blk_shadow[id].request;
924 blkif_completion(&blk_shadow[id]);
926 ADD_ID_TO_FREELIST(id);
928 switch ( bret->operation )
929 {
930 case BLKIF_OP_READ:
931 case BLKIF_OP_WRITE:
932 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
933 DPRINTK("Bad return from blkdev data request: %lx\n",
934 bret->status);
935 for ( ; bh != NULL; bh = next_bh )
936 {
937 next_bh = bh->b_reqnext;
938 bh->b_reqnext = NULL;
939 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
940 }
942 break;
943 case BLKIF_OP_PROBE:
944 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
945 blkif_control_rsp_valid = 1;
946 break;
947 default:
948 BUG();
949 }
951 }
952 info->ring.rsp_cons = i;
954 kick_pending_request_queues();
956 spin_unlock_irqrestore(&io_request_lock, flags);
957 }
959 #endif
961 /***************************** COMMON CODE *******************************/
963 static void blkif_free(struct blkfront_info *info)
964 {
964 {
965 /* Prevent new requests being issued until we fix things up. */
966 spin_lock_irq(&blkif_io_lock);
967 info->connected = BLKIF_STATE_DISCONNECTED;
968 spin_unlock_irq(&blkif_io_lock);
970 /* Free resources associated with old device channel. */
971 if ( info->ring.sring != NULL )
972 {
973 free_page((unsigned long)info->ring.sring);
974 info->ring.sring = NULL;
975 }
976 unbind_evtchn_from_irqhandler(info->evtchn, NULL);
977 info->evtchn = 0;
978 }
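/*
 * blkif_recover() is called after resume: it takes a copy of the shadow
 * state, rebuilds the free list, requeues every request that was still in
 * flight, and re-grants any segment whose grant reference was invalidated
 * while the domain was suspended.
 */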
980 static void blkif_recover(struct blkfront_info *info)
981 {
981 {
982 int i;
983 blkif_request_t *req;
984 struct blk_shadow *copy;
985 int j;
987 /* Stage 1: Make a safe copy of the shadow state. */
988 copy = (struct blk_shadow *)kmalloc(sizeof(blk_shadow), GFP_KERNEL);
989 BUG_ON(copy == NULL);
990 memcpy(copy, blk_shadow, sizeof(blk_shadow));
992 /* Stage 2: Set up free list. */
993 memset(&blk_shadow, 0, sizeof(blk_shadow));
994 for ( i = 0; i < BLK_RING_SIZE; i++ )
995 blk_shadow[i].req.id = i+1;
996 blk_shadow_free = info->ring.req_prod_pvt;
997 blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
999 /* Stage 3: Find pending requests and requeue them. */
1000 for ( i = 0; i < BLK_RING_SIZE; i++ )
1001 {
1002 /* Not in use? */
1003 if ( copy[i].request == 0 )
1004 continue;
1006 /* Grab a request slot and unpickle shadow state into it. */
1007 req = RING_GET_REQUEST(
1008 &info->ring, info->ring.req_prod_pvt);
1009 unpickle_request(req, &copy[i]);
1011 /* We get a new request id, and must reset the shadow state. */
1012 req->id = GET_ID_FROM_FREELIST();
1013 memcpy(&blk_shadow[req->id], &copy[i], sizeof(copy[i]));
1015 /* Rewrite any grant references invalidated by suspend/resume. */
1016 for ( j = 0; j < req->nr_segments; j++ )
1017 {
1018 if ( req->frame_and_sects[j] & GRANTREF_INVALID )
1019 gnttab_grant_foreign_access_ref(
1020 blkif_gref_from_fas(req->frame_and_sects[j]),
1021 info->backend_id,
1022 blk_shadow[req->id].frame[j],
1023 rq_data_dir((struct request *)
1024 blk_shadow[req->id].request));
1025 req->frame_and_sects[j] &= ~GRANTREF_INVALID;
1026 }
1027 blk_shadow[req->id].req = *req;
1029 info->ring.req_prod_pvt++;
1030 }
1032 kfree(copy);
1034 recovery = 0;
1036 /* info->ring->req_prod will be set when we flush_requests().*/
1037 wmb();
1039 /* Kicks things back into life. */
1040 flush_requests(info);
1042 /* Now safe to let other people use the interface. */
1043 info->connected = BLKIF_STATE_CONNECTED;
1044 }
1046 static void blkif_connect(struct blkfront_info *info, u16 evtchn)
1047 {
1048 int err = 0;
1050 info->evtchn = evtchn;
1052 err = bind_evtchn_to_irqhandler(
1053 info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
1054 if ( err != 0 )
1055 {
1056 WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
1057 return;
1058 }
1059 }
1062 static struct xenbus_device_id blkfront_ids[] = {
1063 { "vbd" },
1064 { "" }
1065 };
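/*
 * watch_for_status() fires when the backend's xenbus directory changes.
 * Once the backend has published "sectors", "info" and "sector-size", the
 * virtual disk is registered via xlvbd_add() and any requests that queued
 * up while disconnected are kicked.
 */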
1067 static void watch_for_status(struct xenbus_watch *watch, const char *node)
1068 {
1069 struct blkfront_info *info;
1070 unsigned int binfo;
1071 unsigned long sectors, sector_size;
1072 int err;
1074 info = container_of(watch, struct blkfront_info, watch);
1075 node += strlen(watch->node);
1077 /* FIXME: clean up when error on the other end. */
1078 if (info->connected == BLKIF_STATE_CONNECTED)
1079 return;
1081 err = xenbus_gather(watch->node,
1082 "sectors", "%lu", &sectors,
1083 "info", "%u", &binfo,
1084 "sector-size", "%lu", &sector_size,
1085 NULL);
1086 if (err) {
1087 xenbus_dev_error(info->xbdev, err,
1088 "reading backend fields at %s", watch->node);
1089 return;
1090 }
1092 xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
1093 info->connected = BLKIF_STATE_CONNECTED;
1095 blkif_state = BLKIF_STATE_CONNECTED;
1097 xenbus_dev_ok(info->xbdev);
1099 /* Kick pending requests. */
1100 spin_lock_irq(&blkif_io_lock);
1101 kick_pending_request_queues(info);
1102 spin_unlock_irq(&blkif_io_lock);
1103 }
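/*
 * setup_blkring(): allocate one page for the shared ring, grant the backend
 * access to it, and allocate an unbound event channel for the backend
 * domain; blkif_connect() then binds the local end to blkif_int().
 */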
1105 static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info)
1106 {
1107 blkif_sring_t *sring;
1108 evtchn_op_t op = { .cmd = EVTCHNOP_alloc_unbound };
1109 int err;
1111 sring = (void *)__get_free_page(GFP_KERNEL);
1112 if (!sring) {
1113 xenbus_dev_error(dev, -ENOMEM, "allocating shared ring");
1114 return -ENOMEM;
1115 }
1116 SHARED_RING_INIT(sring);
1117 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1119 err = gnttab_grant_foreign_access(info->backend_id,
1120 virt_to_mfn(info->ring.sring), 0);
1121 if (err == -ENOSPC) {
1122 free_page((unsigned long)info->ring.sring);
1123 info->ring.sring = 0;
1124 xenbus_dev_error(dev, err, "granting access to ring page");
1125 return err;
1126 }
1127 info->ring_ref = err;
1129 op.u.alloc_unbound.dom = info->backend_id;
1130 err = HYPERVISOR_event_channel_op(&op);
1131 if (err) {
1132 gnttab_end_foreign_access(info->ring_ref, 0);
1133 free_page((unsigned long)info->ring.sring);
1134 info->ring.sring = 0;
1135 xenbus_dev_error(dev, err, "allocating event channel");
1136 return err;
1137 }
1138 blkif_connect(info, op.u.alloc_unbound.port);
1139 return 0;
1140 }
1142 /* Common code used when first setting up, and when resuming. */
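/*
 * The handshake with the backend: read "backend" and "backend-id" from our
 * own xenbus directory, build the ring and event channel, then publish
 * "ring-ref" and "event-channel" inside a xenbus transaction and register a
 * watch on the backend directory so watch_for_status() sees the reply.
 */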
1143 static int talk_to_backend(struct xenbus_device *dev,
1144 struct blkfront_info *info)
1145 {
1146 char *backend;
1147 const char *message;
1148 int err;
1150 backend = NULL;
1151 err = xenbus_gather(dev->nodename,
1152 "backend-id", "%i", &info->backend_id,
1153 "backend", NULL, &backend,
1154 NULL);
1155 if (XENBUS_EXIST_ERR(err))
1156 goto out;
1157 if (backend && strlen(backend) == 0) {
1158 err = -ENOENT;
1159 goto out;
1160 }
1161 if (err < 0) {
1162 xenbus_dev_error(dev, err, "reading %s/backend or backend-id",
1163 dev->nodename);
1164 goto out;
1165 }
1167 /* Create shared ring, alloc event channel. */
1168 err = setup_blkring(dev, info);
1169 if (err) {
1170 xenbus_dev_error(dev, err, "setting up block ring");
1171 goto out;
1172 }
1174 err = xenbus_transaction_start(dev->nodename);
1175 if (err) {
1176 xenbus_dev_error(dev, err, "starting transaction");
1177 goto destroy_blkring;
1178 }
1180 err = xenbus_printf(dev->nodename, "ring-ref","%u", info->ring_ref);
1181 if (err) {
1182 message = "writing ring-ref";
1183 goto abort_transaction;
1184 }
1185 err = xenbus_printf(dev->nodename,
1186 "event-channel", "%u", info->evtchn);
1187 if (err) {
1188 message = "writing event-channel";
1189 goto abort_transaction;
1190 }
1192 info->backend = backend;
1193 backend = NULL;
1195 info->watch.node = info->backend;
1196 info->watch.callback = watch_for_status;
1197 err = register_xenbus_watch(&info->watch);
1198 if (err) {
1199 message = "registering watch on backend";
1200 goto abort_transaction;
1201 }
1203 err = xenbus_transaction_end(0);
1204 if (err) {
1205 xenbus_dev_error(dev, err, "completing transaction");
1206 goto destroy_blkring;
1207 }
1209 out:
1210 if (backend)
1211 kfree(backend);
1212 return err;
1214 abort_transaction:
1215 xenbus_transaction_end(1);
1216 /* Have to do this *outside* transaction. */
1217 xenbus_dev_error(dev, err, "%s", message);
1218 destroy_blkring:
1219 blkif_free(info);
1220 goto out;
1221 }
1223 /* Setup supplies the backend dir, virtual device.
1225 We place an event channel and shared frame entries.
1226 We watch the backend until it reports it is ready. */
1227 static int blkfront_probe(struct xenbus_device *dev,
1228 const struct xenbus_device_id *id)
1229 {
1230 int err;
1231 struct blkfront_info *info;
1232 int vdevice;
1234 /* FIXME: Use dynamic device id if this is not set. */
1235 err = xenbus_scanf(dev->nodename, "virtual-device", "%i", &vdevice);
1236 if (XENBUS_EXIST_ERR(err))
1237 return err;
1238 if (err < 0) {
1239 xenbus_dev_error(dev, err, "reading virtual-device");
1240 return err;
1241 }
1243 info = kmalloc(sizeof(*info), GFP_KERNEL);
1244 if (!info) {
1245 xenbus_dev_error(dev, -ENOMEM, "allocating info structure");
1246 return -ENOMEM;
1247 }
1248 info->xbdev = dev;
1249 info->vdevice = vdevice;
1250 info->connected = BLKIF_STATE_DISCONNECTED;
1251 info->mi = NULL;
1252 INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
1254 /* Front end dir is a number, which is used as the id. */
1255 info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
1256 dev->data = info;
1258 err = talk_to_backend(dev, info);
1259 if (err) {
1260 kfree(info);
1261 return err;
1262 }
1264 /* Call once in case entries already there. */
1265 watch_for_status(&info->watch, info->watch.node);
1266 return 0;
1267 }
1269 static int blkfront_remove(struct xenbus_device *dev)
1270 {
1271 struct blkfront_info *info = dev->data;
1273 if (info->backend)
1274 unregister_xenbus_watch(&info->watch);
1276 if (info->mi)
1277 xlvbd_del(info);
1279 blkif_free(info);
1281 kfree(info->backend);
1282 kfree(info);
1284 return 0;
1285 }
1287 static int blkfront_suspend(struct xenbus_device *dev)
1288 {
1289 struct blkfront_info *info = dev->data;
1291 unregister_xenbus_watch(&info->watch);
1292 kfree(info->backend);
1293 info->backend = NULL;
1295 recovery = 1;
1296 blkif_free(info);
1298 return 0;
1299 }
1301 static int blkfront_resume(struct xenbus_device *dev)
1302 {
1303 struct blkfront_info *info = dev->data;
1304 int err;
1306 /* FIXME: Check geometry hasn't changed here... */
1307 err = talk_to_backend(dev, info);
1308 if (!err) {
1309 blkif_recover(info);
1310 }
1311 return err;
1312 }
1314 static struct xenbus_driver blkfront = {
1315 .name = "vbd",
1316 .owner = THIS_MODULE,
1317 .ids = blkfront_ids,
1318 .probe = blkfront_probe,
1319 .remove = blkfront_remove,
1320 .resume = blkfront_resume,
1321 .suspend = blkfront_suspend,
1322 };
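/*
 * Module initialisation: xlblk_init() does nothing in domains that host the
 * block backend themselves; otherwise it initialises the shadow free list,
 * registers the xenbus driver above, and then waits in wait_for_blkif()
 * (up to roughly 10 seconds) for the first device to reach
 * BLKIF_STATE_CONNECTED.
 */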
1324 static void __init init_blk_xenbus(void)
1325 {
1326 xenbus_register_device(&blkfront);
1327 }
1329 static int wait_for_blkif(void)
1330 {
1331 int err = 0;
1332 int i;
1334 /*
1335 * We should figure out how many and which devices we need to
1336 * proceed and only wait for those. For now, continue once the
1337 * first device is around.
1338 */
1339 for ( i=0; blkif_state != BLKIF_STATE_CONNECTED && (i < 10*HZ); i++ )
1340 {
1341 set_current_state(TASK_INTERRUPTIBLE);
1342 schedule_timeout(1);
1343 }
1345 if ( blkif_state != BLKIF_STATE_CONNECTED )
1346 {
1347 WPRINTK("Timeout connecting to device!\n");
1348 err = -ENOSYS;
1349 }
1350 return err;
1351 }
1353 static int __init xlblk_init(void)
1354 {
1355 int i;
1357 if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
1358 (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
1359 return 0;
1361 IPRINTK("Initialising virtual block device driver\n");
1363 blk_shadow_free = 0;
1364 memset(blk_shadow, 0, sizeof(blk_shadow));
1365 for ( i = 0; i < BLK_RING_SIZE; i++ )
1366 blk_shadow[i].req.id = i+1;
1367 blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1369 init_blk_xenbus();
1371 wait_for_blkif();
1373 return 0;
1374 }
1376 static void blkif_completion(struct blk_shadow *s)
1377 {
1378 int i;
1379 for ( i = 0; i < s->req.nr_segments; i++ )
1380 gnttab_free_grant_reference(
1381 blkif_gref_from_fas(s->req.frame_and_sects[i]));
1382 }