ia64/xen-unstable
linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c @ 6298:e8c2c3123ec6

Improve error paths and cleanup code.
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>

author    cl349@firebug.cl.cam.ac.uk
date      Fri Aug 19 17:38:07 2005 +0000 (2005-08-19)
parents   1a0723cd37f1
children  631cc5dc3e8a
1 /******************************************************************************
2 * blkfront.c
3 *
4 * XenLinux virtual block-device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This file may be distributed separately from the Linux kernel, or
13 * incorporated into other software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
34 #if 1
35 #define ASSERT(_p) \
36 if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
37 __LINE__, __FILE__); *(int*)0=0; }
38 #else
39 #define ASSERT(_p)
40 #endif
42 #include <linux/version.h>
44 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
45 #include "block.h"
46 #else
47 #include "common.h"
48 #include <linux/blk.h>
49 #include <linux/tqueue.h>
50 #endif
52 #include <linux/cdrom.h>
53 #include <linux/sched.h>
54 #include <linux/interrupt.h>
55 #include <scsi/scsi.h>
56 #include <asm-xen/evtchn.h>
57 #include <asm-xen/xenbus.h>
58 #ifdef CONFIG_XEN_BLKDEV_GRANT
59 #include <asm-xen/xen-public/grant_table.h>
60 #include <asm-xen/gnttab.h>
61 #endif
63 typedef unsigned char byte; /* from linux/ide.h */
65 /* Control whether runtime update of vbds is enabled. */
66 #define ENABLE_VBD_UPDATE 1
68 #define BLKIF_STATE_CLOSED 0
69 #define BLKIF_STATE_DISCONNECTED 1
70 #define BLKIF_STATE_CONNECTED 2
72 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
73 static unsigned int blkif_evtchn = 0;
74 static unsigned int blkif_vbds = 0;
75 static unsigned int blkif_vbds_connected = 0;
77 static blkif_front_ring_t blk_ring;
79 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
81 #ifdef CONFIG_XEN_BLKDEV_GRANT
82 static domid_t rdomid = 0;
83 static grant_ref_t gref_head, gref_terminal;
84 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
85 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE)
86 #define GRANTREF_INVALID (1<<15)
87 static int shmem_ref;
88 #endif
90 static struct blk_shadow {
91 blkif_request_t req;
92 unsigned long request;
93 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
94 } blk_shadow[BLK_RING_SIZE];
95 unsigned long blk_shadow_free;
97 static int recovery = 0; /* Recovery in progress: protected by blkif_io_lock */
99 static void kick_pending_request_queues(void);
101 static int __init xlblk_init(void);
103 static void blkif_completion(struct blk_shadow *s);
105 static inline int GET_ID_FROM_FREELIST(void)
106 {
107 unsigned long free = blk_shadow_free;
108 BUG_ON(free > BLK_RING_SIZE);
109 blk_shadow_free = blk_shadow[free].req.id;
110 blk_shadow[free].req.id = 0x0fffffee; /* debug */
111 return free;
112 }
114 static inline void ADD_ID_TO_FREELIST(unsigned long id)
115 {
116 blk_shadow[id].req.id = blk_shadow_free;
117 blk_shadow[id].request = 0;
118 blk_shadow_free = id;
119 }
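/*
 * Usage sketch (illustrative, not lifted verbatim from the code below): free
 * shadow slots are threaded through req.id, so claiming and releasing a slot
 * is O(1):
 *
 *     id = GET_ID_FROM_FREELIST();                  claim a free shadow slot
 *     blk_shadow[id].request = (unsigned long)req;  remember the Linux request
 *     ring_req->id = id;                            tag the ring entry
 *         ... response for 'id' arrives ...
 *     blkif_completion(&blk_shadow[id]);
 *     ADD_ID_TO_FREELIST(id);                       slot can be reused
 */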
122 /************************ COMMON CODE (inlined) ************************/
124 /* Kernel-specific definitions used in the common code */
125 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
126 #define DISABLE_SCATTERGATHER()
127 #else
128 static int sg_operation = -1;
129 #define DISABLE_SCATTERGATHER() (sg_operation = -1)
130 #endif
132 static inline void pickle_request(struct blk_shadow *s, blkif_request_t *r)
133 {
134 #ifndef CONFIG_XEN_BLKDEV_GRANT
135 int i;
136 #endif
138 s->req = *r;
140 #ifndef CONFIG_XEN_BLKDEV_GRANT
141 for ( i = 0; i < r->nr_segments; i++ )
142 s->req.frame_and_sects[i] = machine_to_phys(r->frame_and_sects[i]);
143 #endif
144 }
146 static inline void unpickle_request(blkif_request_t *r, struct blk_shadow *s)
147 {
148 #ifndef CONFIG_XEN_BLKDEV_GRANT
149 int i;
150 #endif
152 *r = s->req;
154 #ifndef CONFIG_XEN_BLKDEV_GRANT
155 for ( i = 0; i < s->req.nr_segments; i++ )
156 r->frame_and_sects[i] = phys_to_machine(s->req.frame_and_sects[i]);
157 #endif
158 }
161 static inline void flush_requests(void)
162 {
163 DISABLE_SCATTERGATHER();
164 RING_PUSH_REQUESTS(&blk_ring);
165 notify_via_evtchn(blkif_evtchn);
166 }
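/*
 * Request batching sketch (illustrative): callers stage one or more requests
 * at blk_ring.req_prod_pvt and publish them in a single batch:
 *
 *     ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
 *         ... fill in ring_req ...
 *     blk_ring.req_prod_pvt++;
 *     flush_requests();    pushes the private producer index to the shared
 *                          ring and kicks the backend via the event channel
 */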
169 /************************** KERNEL VERSION 2.6 **************************/
171 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
173 module_init(xlblk_init);
175 static struct xlbd_disk_info *head_waiting = NULL;
176 static void kick_pending_request_queues(void)
177 {
178 struct xlbd_disk_info *di;
179 while ( ((di = head_waiting) != NULL) && !RING_FULL(&blk_ring) )
180 {
181 head_waiting = di->next_waiting;
182 di->next_waiting = NULL;
183 /* Re-enable calldowns. */
184 blk_start_queue(di->rq);
185 /* Kick things off immediately. */
186 do_blkif_request(di->rq);
187 }
188 }
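/*
 * Stall/restart cycle (descriptive note): when blkif_queue_request() finds
 * the shared ring full, do_blkif_request() links the disk onto head_waiting
 * and stops its request queue.  Once responses free up ring slots,
 * blkif_int() calls kick_pending_request_queues(), which restarts each
 * stalled queue and immediately re-runs do_blkif_request() on it.
 */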
190 int blkif_open(struct inode *inode, struct file *filep)
191 {
192 struct gendisk *gd = inode->i_bdev->bd_disk;
193 struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
195 /* Update of usage count is protected by per-device semaphore. */
196 di->mi->usage++;
198 return 0;
199 }
202 int blkif_release(struct inode *inode, struct file *filep)
203 {
204 /* FIXME: This is where we can actually free up majors, etc. --RR */
205 return 0;
206 }
209 int blkif_ioctl(struct inode *inode, struct file *filep,
210 unsigned command, unsigned long argument)
211 {
212 int i;
214 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
215 command, (long)argument, inode->i_rdev);
217 switch ( command )
218 {
219 case HDIO_GETGEO:
220 /* return ENOSYS to use defaults */
221 return -ENOSYS;
223 case CDROMMULTISESSION:
224 DPRINTK("FIXME: support multisession CDs later\n");
225 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
226 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
227 return 0;
229 default:
230 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
231 command);*/
232 return -EINVAL; /* same return as native Linux */
233 }
235 return 0;
236 }
239 /*
240 * blkif_queue_request
241 *
242 * request block io
243 *
244 * id: for guest use only.
245 * operation: BLKIF_OP_{READ,WRITE,PROBE}
246 * buffer: buffer to read/write into. this should be a
247 * virtual address in the guest os.
248 */
249 static int blkif_queue_request(struct request *req)
250 {
251 struct xlbd_disk_info *di = req->rq_disk->private_data;
252 unsigned long buffer_ma;
253 blkif_request_t *ring_req;
254 struct bio *bio;
255 struct bio_vec *bvec;
256 int idx;
257 unsigned long id;
258 unsigned int fsect, lsect;
259 #ifdef CONFIG_XEN_BLKDEV_GRANT
260 int ref;
261 #endif
263 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
264 return 1;
266 /* Fill out a communications ring structure. */
267 ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
268 id = GET_ID_FROM_FREELIST();
269 blk_shadow[id].request = (unsigned long)req;
271 ring_req->id = id;
272 ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
273 BLKIF_OP_READ;
274 ring_req->sector_number = (blkif_sector_t)req->sector;
275 ring_req->handle = di->handle;
277 ring_req->nr_segments = 0;
278 rq_for_each_bio(bio, req)
279 {
280 bio_for_each_segment(bvec, bio, idx)
281 {
282 if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
283 BUG();
284 buffer_ma = page_to_phys(bvec->bv_page);
285 fsect = bvec->bv_offset >> 9;
286 lsect = fsect + (bvec->bv_len >> 9) - 1;
287 #ifdef CONFIG_XEN_BLKDEV_GRANT
288 /* install a grant reference. */
289 ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
290 ASSERT( ref != -ENOSPC );
292 gnttab_grant_foreign_access_ref(
293 ref,
294 rdomid,
295 buffer_ma >> PAGE_SHIFT,
296 rq_data_dir(req) );
298 blk_shadow[id].frame[ring_req->nr_segments] =
299 buffer_ma >> PAGE_SHIFT;
301 ring_req->frame_and_sects[ring_req->nr_segments++] =
302 blkif_fas_from_gref(ref, fsect, lsect);
304 #else
305 ring_req->frame_and_sects[ring_req->nr_segments++] =
306 blkif_fas(buffer_ma, fsect, lsect);
307 #endif
308 }
309 }
311 blk_ring.req_prod_pvt++;
313 /* Keep a private copy so we can reissue requests when recovering. */
314 pickle_request(&blk_shadow[id], ring_req);
316 return 0;
317 }
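/*
 * Worked example (illustrative): a bio_vec with bv_offset = 512 and
 * bv_len = 2048 covers sectors 1..4 of its page, so fsect = 512 >> 9 = 1 and
 * lsect = 1 + (2048 >> 9) - 1 = 4.  Together with the page frame (or grant
 * reference) these are packed into a single frame_and_sects entry by
 * blkif_fas() / blkif_fas_from_gref().
 */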
320 /*
321 * do_blkif_request
322 * read a block; request is in a request queue
323 */
324 void do_blkif_request(request_queue_t *rq)
325 {
326 struct xlbd_disk_info *di;
327 struct request *req;
328 int queued;
330 DPRINTK("Entered do_blkif_request\n");
332 queued = 0;
334 while ( (req = elv_next_request(rq)) != NULL )
335 {
336 if ( !blk_fs_request(req) )
337 {
338 end_request(req, 0);
339 continue;
340 }
342 if ( RING_FULL(&blk_ring) )
343 goto wait;
345 DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
346 req, req->cmd, req->sector, req->current_nr_sectors,
347 req->nr_sectors, req->buffer,
348 rq_data_dir(req) ? "write" : "read");
350 blkdev_dequeue_request(req);
351 if ( blkif_queue_request(req) )
352 {
353 wait:
354 di = req->rq_disk->private_data;
355 if ( di->next_waiting == NULL )
356 {
357 di->next_waiting = head_waiting;
358 head_waiting = di;
359 /* Avoid pointless unplugs. */
360 blk_stop_queue(rq);
361 }
362 break;
363 }
365 queued++;
366 }
368 if ( queued != 0 )
369 flush_requests();
370 }
373 static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
374 {
375 struct request *req;
376 blkif_response_t *bret;
377 RING_IDX i, rp;
378 unsigned long flags;
380 spin_lock_irqsave(&blkif_io_lock, flags);
382 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ||
383 unlikely(recovery) )
384 {
385 spin_unlock_irqrestore(&blkif_io_lock, flags);
386 return IRQ_HANDLED;
387 }
389 rp = blk_ring.sring->rsp_prod;
390 rmb(); /* Ensure we see queued responses up to 'rp'. */
392 for ( i = blk_ring.rsp_cons; i != rp; i++ )
393 {
394 unsigned long id;
396 bret = RING_GET_RESPONSE(&blk_ring, i);
397 id = bret->id;
398 req = (struct request *)blk_shadow[id].request;
400 blkif_completion(&blk_shadow[id]);
402 ADD_ID_TO_FREELIST(id);
404 switch ( bret->operation )
405 {
406 case BLKIF_OP_READ:
407 case BLKIF_OP_WRITE:
408 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
409 DPRINTK("Bad return from blkdev data request: %x\n",
410 bret->status);
412 if ( unlikely(end_that_request_first
413 (req,
414 (bret->status == BLKIF_RSP_OKAY),
415 req->hard_nr_sectors)) )
416 BUG();
417 end_that_request_last(req);
419 break;
420 default:
421 BUG();
422 }
423 }
425 blk_ring.rsp_cons = i;
427 kick_pending_request_queues();
429 spin_unlock_irqrestore(&blkif_io_lock, flags);
431 return IRQ_HANDLED;
432 }
434 #else
435 /************************** KERNEL VERSION 2.4 **************************/
437 static kdev_t sg_dev;
438 static unsigned long sg_next_sect;
440 /*
441 * Request queues with outstanding work, but ring is currently full.
442 * We need no special lock here, as we always access this with the
443 * blkif_io_lock held. We only need a small maximum list.
444 */
445 #define MAX_PENDING 8
446 static request_queue_t *pending_queues[MAX_PENDING];
447 static int nr_pending;
450 #define blkif_io_lock io_request_lock
452 /*============================================================================*/
453 static void kick_pending_request_queues(void)
454 {
455 /* We kick pending request queues if the ring is reasonably empty. */
456 if ( (nr_pending != 0) &&
457 (RING_PENDING_REQUESTS(&blk_ring) < (BLK_RING_SIZE >> 1)) )
458 {
459 /* Attempt to drain the queue, but bail if the ring becomes full. */
460 while ( (nr_pending != 0) && !RING_FULL(&blk_ring) )
461 do_blkif_request(pending_queues[--nr_pending]);
462 }
463 }
465 int blkif_open(struct inode *inode, struct file *filep)
466 {
467 short xldev = inode->i_rdev;
468 struct gendisk *gd = get_gendisk(xldev);
469 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
470 short minor = MINOR(xldev);
472 if ( gd->part[minor].nr_sects == 0 )
473 {
474 /*
475 * Device either doesn't exist, or has zero capacity; we use a few
476 * cheesy heuristics to return the relevant error code
477 */
478 if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
479 ((minor & (gd->max_p - 1)) != 0) )
480 {
481 /*
482 * We have a real device, but no such partition, or we just have a
483 * partition number so guess this is the problem.
484 */
485 return -ENXIO; /* no such device or address */
486 }
487 else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
488 {
489 /* This is a removable device => assume that media is missing. */
490 return -ENOMEDIUM; /* media not present (this is a guess) */
491 }
492 else
493 {
494 /* Just go for the general 'no such device' error. */
495 return -ENODEV; /* no such device */
496 }
497 }
499 /* Update of usage count is protected by per-device semaphore. */
500 disk->usage++;
502 return 0;
503 }
506 int blkif_release(struct inode *inode, struct file *filep)
507 {
508 xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
510 /*
511 * When usage drops to zero it may allow more VBD updates to occur.
512 * Update of usage count is protected by a per-device semaphore.
513 */
514 if ( --disk->usage == 0 ) {
515 vbd_update();
516 }
518 return 0;
519 }
522 int blkif_ioctl(struct inode *inode, struct file *filep,
523 unsigned command, unsigned long argument)
524 {
525 kdev_t dev = inode->i_rdev;
526 struct hd_geometry *geo = (struct hd_geometry *)argument;
527 struct gendisk *gd;
528 struct hd_struct *part;
529 int i;
530 unsigned short cylinders;
531 byte heads, sectors;
533 /* NB. No need to check permissions. That is done for us. */
535 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
536 command, (long) argument, dev);
538 gd = get_gendisk(dev);
539 part = &gd->part[MINOR(dev)];
541 switch ( command )
542 {
543 case BLKGETSIZE:
544 DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects);
545 return put_user(part->nr_sects, (unsigned long *) argument);
547 case BLKGETSIZE64:
548 DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
549 (u64)part->nr_sects * 512);
550 return put_user((u64)part->nr_sects * 512, (u64 *) argument);
552 case BLKRRPART: /* re-read partition table */
553 DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART);
554 return blkif_revalidate(dev);
556 case BLKSSZGET:
557 return hardsect_size[MAJOR(dev)][MINOR(dev)];
559 case BLKBSZGET: /* get block size */
560 DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET);
561 break;
563 case BLKBSZSET: /* set block size */
564 DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET);
565 break;
567 case BLKRASET: /* set read-ahead */
568 DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET);
569 break;
571 case BLKRAGET: /* get read-ahead */
572 DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET);
573 break;
575 case HDIO_GETGEO:
576 DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO);
577 if (!argument) return -EINVAL;
579 /* We don't have real geometry info, but let's at least return
580 values consistent with the size of the device */
582 heads = 0xff;
583 sectors = 0x3f;
584 cylinders = part->nr_sects / (heads * sectors);
586 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
587 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
588 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
589 if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;
591 return 0;
593 case HDIO_GETGEO_BIG:
594 DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
595 if (!argument) return -EINVAL;
597 /* We don't have real geometry info, but let's at least return
598 values consistent with the size of the device */
600 heads = 0xff;
601 sectors = 0x3f;
602 cylinders = part->nr_sects / (heads * sectors);
604 if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT;
605 if (put_user(heads, (byte *)&geo->heads)) return -EFAULT;
606 if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT;
607 if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;
609 return 0;
611 case CDROMMULTISESSION:
612 DPRINTK("FIXME: support multisession CDs later\n");
613 for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
614 if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
615 return 0;
617 case SCSI_IOCTL_GET_BUS_NUMBER:
618 DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
619 return -ENOSYS;
621 default:
622 WPRINTK("ioctl %08x not supported by XL blkif\n", command);
623 return -ENOSYS;
624 }
626 return 0;
627 }
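/*
 * Worked example (illustrative): with the synthetic geometry used above, a
 * 2 GB disk of 4194304 sectors reports heads = 255, sectors = 63 and
 * cylinders = 4194304 / (255 * 63) = 261, which is consistent with the
 * device size even though no real CHS geometry exists.
 */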
631 /* check media change: should probably do something here in some cases :-) */
632 int blkif_check(kdev_t dev)
633 {
634 DPRINTK("blkif_check\n");
635 return 0;
636 }
638 int blkif_revalidate(kdev_t dev)
639 {
640 struct block_device *bd;
641 struct gendisk *gd;
642 xl_disk_t *disk;
643 unsigned long capacity;
644 int i, rc = 0;
646 if ( (bd = bdget(dev)) == NULL )
647 return -EINVAL;
649 /*
650 * Update of partition info, and check of usage count, is protected
651 * by the per-block-device semaphore.
652 */
653 down(&bd->bd_sem);
655 if ( ((gd = get_gendisk(dev)) == NULL) ||
656 ((disk = xldev_to_xldisk(dev)) == NULL) ||
657 ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
658 {
659 rc = -EINVAL;
660 goto out;
661 }
663 if ( disk->usage > 1 )
664 {
665 rc = -EBUSY;
666 goto out;
667 }
669 /* Only reread partition table if VBDs aren't mapped to partitions. */
670 if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
671 {
672 for ( i = gd->max_p - 1; i >= 0; i-- )
673 {
674 invalidate_device(dev+i, 1);
675 gd->part[MINOR(dev+i)].start_sect = 0;
676 gd->part[MINOR(dev+i)].nr_sects = 0;
677 gd->sizes[MINOR(dev+i)] = 0;
678 }
680 grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
681 }
683 out:
684 up(&bd->bd_sem);
685 bdput(bd);
686 return rc;
687 }
690 /*
691 * blkif_queue_request
692 *
693 * request block io
694 *
695 * id: for guest use only.
696 * operation: BLKIF_OP_{READ,WRITE,PROBE}
697 * buffer: buffer to read/write into. this should be a
698 * virtual address in the guest os.
699 */
700 static int blkif_queue_request(unsigned long id,
701 int operation,
702 char * buffer,
703 unsigned long sector_number,
704 unsigned short nr_sectors,
705 kdev_t device,
706 blkif_vdev_t handle)
707 {
708 unsigned long buffer_ma = virt_to_bus(buffer);
709 unsigned long xid;
710 struct gendisk *gd;
711 blkif_request_t *req;
712 struct buffer_head *bh;
713 unsigned int fsect, lsect;
714 #ifdef CONFIG_XEN_BLKDEV_GRANT
715 int ref;
716 #endif
718 fsect = (buffer_ma & ~PAGE_MASK) >> 9;
719 lsect = fsect + nr_sectors - 1;
721 /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
722 if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
723 BUG();
724 if ( lsect > ((PAGE_SIZE/512)-1) )
725 BUG();
727 buffer_ma &= PAGE_MASK;
729 if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
730 return 1;
732 switch ( operation )
733 {
735 case BLKIF_OP_READ:
736 case BLKIF_OP_WRITE:
737 gd = get_gendisk(device);
739 /*
740 * Update the sector_number we'll pass down as appropriate; note that
741 * we could sanity check that resulting sector will be in this
742 * partition, but this will happen in driver backend anyhow.
743 */
744 sector_number += gd->part[MINOR(device)].start_sect;
746 /*
747 * If this unit doesn't consist of virtual partitions then we clear
748 * the partn bits from the device number.
749 */
750 if ( !(gd->flags[MINOR(device)>>gd->minor_shift] &
751 GENHD_FL_VIRT_PARTNS) )
752 device &= ~(gd->max_p - 1);
754 if ( (sg_operation == operation) &&
755 (sg_dev == device) &&
756 (sg_next_sect == sector_number) )
757 {
758 req = RING_GET_REQUEST(&blk_ring,
759 blk_ring.req_prod_pvt - 1);
760 bh = (struct buffer_head *)id;
762 bh->b_reqnext = (struct buffer_head *)blk_shadow[req->id].request;
763 blk_shadow[req->id].request = (unsigned long)id;
765 #ifdef CONFIG_XEN_BLKDEV_GRANT
766 /* install a grant reference. */
767 ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
768 ASSERT( ref != -ENOSPC );
770 gnttab_grant_foreign_access_ref(
771 ref,
772 rdomid,
773 buffer_ma >> PAGE_SHIFT,
774 ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );
776 blk_shadow[req->id].frame[req->nr_segments] =
777 buffer_ma >> PAGE_SHIFT;
779 req->frame_and_sects[req->nr_segments] =
780 blkif_fas_from_gref(ref, fsect, lsect);
781 #else
782 req->frame_and_sects[req->nr_segments] =
783 blkif_fas(buffer_ma, fsect, lsect);
784 #endif
785 if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
786 sg_next_sect += nr_sectors;
787 else
788 DISABLE_SCATTERGATHER();
790 /* Update the copy of the request in the recovery ring. */
791 pickle_request(&blk_shadow[req->id], req );
793 return 0;
794 }
795 else if ( RING_FULL(&blk_ring) )
796 {
797 return 1;
798 }
799 else
800 {
801 sg_operation = operation;
802 sg_dev = device;
803 sg_next_sect = sector_number + nr_sectors;
804 }
805 break;
807 default:
808 panic("unknown op %d\n", operation);
809 }
811 /* Fill out a communications ring structure. */
812 req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
814 xid = GET_ID_FROM_FREELIST();
815 blk_shadow[xid].request = (unsigned long)id;
817 req->id = xid;
818 req->operation = operation;
819 req->sector_number = (blkif_sector_t)sector_number;
820 req->handle = handle;
821 req->nr_segments = 1;
822 #ifdef CONFIG_XEN_BLKDEV_GRANT
823 /* install a grant reference. */
824 ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
825 ASSERT( ref != -ENOSPC );
827 gnttab_grant_foreign_access_ref(
828 ref,
829 rdomid,
830 buffer_ma >> PAGE_SHIFT,
831 ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );
833 blk_shadow[xid].frame[0] = buffer_ma >> PAGE_SHIFT;
835 req->frame_and_sects[0] = blkif_fas_from_gref(ref, fsect, lsect);
836 #else
837 req->frame_and_sects[0] = blkif_fas(buffer_ma, fsect, lsect);
838 #endif
840 /* Keep a private copy so we can reissue requests when recovering. */
841 pickle_request(&blk_shadow[xid], req);
843 blk_ring.req_prod_pvt++;
845 return 0;
846 }
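/*
 * Scatter-gather merge sketch (illustrative): consecutive buffer heads are
 * folded into one ring request while the operation, device and next sector
 * match the running (sg_operation, sg_dev, sg_next_sect) state and a segment
 * slot is still free.  For example, two 512-byte buffer heads at sectors 100
 * and 101 of the same device become two segments of a single request; a seek
 * or a change of direction falls through to the single-segment path at the
 * end of the function and starts a fresh request.
 */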
849 /*
850 * do_blkif_request
851 * read a block; request is in a request queue
852 */
853 void do_blkif_request(request_queue_t *rq)
854 {
855 struct request *req;
856 struct buffer_head *bh, *next_bh;
857 int rw, nsect, full, queued = 0;
859 DPRINTK("Entered do_blkif_request\n");
861 while ( !rq->plugged && !list_empty(&rq->queue_head))
862 {
863 if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL )
864 goto out;
866 DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
867 req, req->cmd, req->sector,
868 req->current_nr_sectors, req->nr_sectors, req->bh);
870 rw = req->cmd;
871 if ( rw == READA )
872 rw = READ;
873 if ( unlikely((rw != READ) && (rw != WRITE)) )
874 panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
876 req->errors = 0;
878 bh = req->bh;
879 while ( bh != NULL )
880 {
881 next_bh = bh->b_reqnext;
882 bh->b_reqnext = NULL;
884 full = blkif_queue_request(
885 (unsigned long)bh,
886 (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE,
887 bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
889 if ( full )
890 {
891 bh->b_reqnext = next_bh;
892 pending_queues[nr_pending++] = rq;
893 if ( unlikely(nr_pending >= MAX_PENDING) )
894 BUG();
895 goto out;
896 }
898 queued++;
900 /* Dequeue the buffer head from the request. */
901 nsect = bh->b_size >> 9;
902 bh = req->bh = next_bh;
904 if ( bh != NULL )
905 {
906 /* There's another buffer head to do. Update the request. */
907 req->hard_sector += nsect;
908 req->hard_nr_sectors -= nsect;
909 req->sector = req->hard_sector;
910 req->nr_sectors = req->hard_nr_sectors;
911 req->current_nr_sectors = bh->b_size >> 9;
912 req->buffer = bh->b_data;
913 }
914 else
915 {
916 /* That was the last buffer head. Finalise the request. */
917 if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
918 BUG();
919 blkdev_dequeue_request(req);
920 end_that_request_last(req);
921 }
922 }
923 }
925 out:
926 if ( queued != 0 )
927 flush_requests();
928 }
931 static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
932 {
933 RING_IDX i, rp;
934 unsigned long flags;
935 struct buffer_head *bh, *next_bh;
937 spin_lock_irqsave(&io_request_lock, flags);
939 if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
940 {
941 spin_unlock_irqrestore(&io_request_lock, flags);
942 return;
943 }
945 rp = blk_ring.sring->rsp_prod;
946 rmb(); /* Ensure we see queued responses up to 'rp'. */
948 for ( i = blk_ring.rsp_cons; i != rp; i++ )
949 {
950 unsigned long id;
951 blkif_response_t *bret;
953 bret = RING_GET_RESPONSE(&blk_ring, i);
954 id = bret->id;
955 bh = (struct buffer_head *)blk_shadow[id].request;
957 blkif_completion(&blk_shadow[id]);
959 ADD_ID_TO_FREELIST(id);
961 switch ( bret->operation )
962 {
963 case BLKIF_OP_READ:
964 case BLKIF_OP_WRITE:
965 if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
966 DPRINTK("Bad return from blkdev data request: %lx\n",
967 bret->status);
968 for ( ; bh != NULL; bh = next_bh )
969 {
970 next_bh = bh->b_reqnext;
971 bh->b_reqnext = NULL;
972 bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
973 }
975 break;
976 case BLKIF_OP_PROBE:
977 memcpy(&blkif_control_rsp, bret, sizeof(*bret));
978 blkif_control_rsp_valid = 1;
979 break;
980 default:
981 BUG();
982 }
984 }
985 blk_ring.rsp_cons = i;
987 kick_pending_request_queues();
989 spin_unlock_irqrestore(&io_request_lock, flags);
990 }
992 #endif
994 /***************************** COMMON CODE *******************************/
996 static void blkif_free(void)
997 {
998 /* Prevent new requests being issued until we fix things up. */
999 spin_lock_irq(&blkif_io_lock);
1000 blkif_state = BLKIF_STATE_DISCONNECTED;
1001 spin_unlock_irq(&blkif_io_lock);
1003 /* Free resources associated with old device channel. */
1004 if ( blk_ring.sring != NULL )
1005 {
1006 free_page((unsigned long)blk_ring.sring);
1007 blk_ring.sring = NULL;
1008 }
1009 unbind_evtchn_from_irqhandler(blkif_evtchn, NULL);
1010 blkif_evtchn = 0;
1011 }
1013 static void blkif_recover(void)
1014 {
1015 int i;
1016 blkif_request_t *req;
1017 struct blk_shadow *copy;
1018 #ifdef CONFIG_XEN_BLKDEV_GRANT
1019 int j;
1020 #endif
1022 /* Stage 1: Make a safe copy of the shadow state. */
1023 copy = (struct blk_shadow *)kmalloc(sizeof(blk_shadow), GFP_KERNEL);
1024 BUG_ON(copy == NULL);
1025 memcpy(copy, blk_shadow, sizeof(blk_shadow));
1027 /* Stage 2: Set up free list. */
1028 memset(&blk_shadow, 0, sizeof(blk_shadow));
1029 for ( i = 0; i < BLK_RING_SIZE; i++ )
1030 blk_shadow[i].req.id = i+1;
1031 blk_shadow_free = blk_ring.req_prod_pvt;
1032 blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1034 /* Stage 3: Find pending requests and requeue them. */
1035 for ( i = 0; i < BLK_RING_SIZE; i++ )
1036 {
1037 /* Not in use? */
1038 if ( copy[i].request == 0 )
1039 continue;
1041 /* Grab a request slot and unpickle shadow state into it. */
1042 req = RING_GET_REQUEST(
1043 &blk_ring, blk_ring.req_prod_pvt);
1044 unpickle_request(req, &copy[i]);
1046 /* We get a new request id, and must reset the shadow state. */
1047 req->id = GET_ID_FROM_FREELIST();
1048 memcpy(&blk_shadow[req->id], &copy[i], sizeof(copy[i]));
1050 #ifdef CONFIG_XEN_BLKDEV_GRANT
1051 /* Rewrite any grant references invalidated by suspend/resume. */
1052 for ( j = 0; j < req->nr_segments; j++ )
1053 {
1054 if ( req->frame_and_sects[j] & GRANTREF_INVALID )
1055 gnttab_grant_foreign_access_ref(
1056 blkif_gref_from_fas(req->frame_and_sects[j]),
1057 rdomid,
1058 blk_shadow[req->id].frame[j],
1059 rq_data_dir((struct request *)
1060 blk_shadow[req->id].request));
1061 req->frame_and_sects[j] &= ~GRANTREF_INVALID;
1062 }
1063 blk_shadow[req->id].req = *req;
1064 #endif
1066 blk_ring.req_prod_pvt++;
1067 }
1069 kfree(copy);
1071 recovery = 0;
1073 /* blk_ring->req_prod will be set when we flush_requests().*/
1074 wmb();
1076 /* Kicks things back into life. */
1077 flush_requests();
1079 /* Now safe to let other people use the interface. */
1080 blkif_state = BLKIF_STATE_CONNECTED;
1081 }
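/*
 * Recovery walk-through (illustrative): after suspend/resume the backend has
 * lost the ring, so the three stages above replay everything that was still
 * outstanding.  For each shadow slot i with copy[i].request != 0 the sequence
 * is roughly:
 *
 *     req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
 *     unpickle_request(req, &copy[i]);     restore machine frame numbers
 *     req->id = GET_ID_FROM_FREELIST();    fresh id in the rebuilt free list
 *     blk_ring.req_prod_pvt++;
 *
 * followed by a single flush_requests() once every slot has been requeued.
 */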
1083 static void blkif_connect(u16 evtchn, domid_t domid)
1084 {
1085 int err = 0;
1087 blkif_evtchn = evtchn;
1088 #ifdef CONFIG_XEN_BLKDEV_GRANT
1089 rdomid = domid;
1090 #endif
1092 err = bind_evtchn_to_irqhandler(
1093 blkif_evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
1094 if ( err != 0 )
1095 {
1096 WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
1097 return;
1098 }
1099 }
1102 static struct xenbus_device_id blkfront_ids[] = {
1103 { "vbd" },
1104 { "" }
1105 };
1107 struct blkfront_info
1108 {
1109 /* We watch the backend */
1110 struct xenbus_watch watch;
1111 int vdevice;
1112 u16 handle;
1113 int connected;
1114 struct xenbus_device *dev;
1115 char *backend;
1116 };
1118 static void watch_for_status(struct xenbus_watch *watch, const char *node)
1119 {
1120 struct blkfront_info *info;
1121 unsigned int binfo;
1122 unsigned long sectors, sector_size;
1123 int err;
1125 info = container_of(watch, struct blkfront_info, watch);
1126 node += strlen(watch->node);
1128 /* FIXME: clean up when error on the other end. */
1129 if (info->connected)
1130 return;
1132 err = xenbus_gather(watch->node,
1133 "sectors", "%lu", &sectors,
1134 "info", "%u", &binfo,
1135 "sector-size", "%lu", &sector_size,
1136 NULL);
1137 if (err) {
1138 xenbus_dev_error(info->dev, err, "reading backend fields");
1139 return;
1140 }
1142 xlvbd_add(sectors, info->vdevice, info->handle, binfo, sector_size);
1143 info->connected = 1;
1145 /* First to connect? blkif is now connected. */
1146 if (blkif_vbds_connected++ == 0)
1147 blkif_state = BLKIF_STATE_CONNECTED;
1149 xenbus_dev_ok(info->dev);
1151 /* Kick pending requests. */
1152 spin_lock_irq(&blkif_io_lock);
1153 kick_pending_request_queues();
1154 spin_unlock_irq(&blkif_io_lock);
1155 }
1157 static int setup_blkring(struct xenbus_device *dev, unsigned int backend_id)
1158 {
1159 blkif_sring_t *sring;
1160 evtchn_op_t op = { .cmd = EVTCHNOP_alloc_unbound };
1161 int err;
1163 sring = (void *)__get_free_page(GFP_KERNEL);
1164 if (!sring) {
1165 xenbus_dev_error(dev, -ENOMEM, "allocating shared ring");
1166 return -ENOMEM;
1167 }
1168 SHARED_RING_INIT(sring);
1169 FRONT_RING_INIT(&blk_ring, sring, PAGE_SIZE);
1171 #ifdef CONFIG_XEN_BLKDEV_GRANT
1172 shmem_ref = gnttab_claim_grant_reference(&gref_head,
1173 gref_terminal);
1174 ASSERT(shmem_ref != -ENOSPC);
1175 gnttab_grant_foreign_access_ref(shmem_ref,
1176 backend_id,
1177 virt_to_mfn(blk_ring.sring),
1178 0);
1179 #endif
1181 op.u.alloc_unbound.dom = backend_id;
1182 err = HYPERVISOR_event_channel_op(&op);
1183 if (err) {
1184 free_page((unsigned long)blk_ring.sring);
1185 blk_ring.sring = 0;
1186 xenbus_dev_error(dev, err, "allocating event channel");
1187 return err;
1188 }
1189 blkif_connect(op.u.alloc_unbound.port, backend_id);
1190 return 0;
1191 }
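/*
 * Setup summary (descriptive note): the frontend allocates one page for the
 * shared ring, initialises both views with SHARED_RING_INIT() and
 * FRONT_RING_INIT(), optionally grants the page to the backend domain, and
 * asks Xen for an unbound event channel which blkif_connect() then binds to
 * an irq handler.
 */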
1193 /* Common code used when first setting up, and when resuming. */
1194 static int talk_to_backend(struct xenbus_device *dev,
1195 struct blkfront_info *info)
1196 {
1197 char *backend;
1198 const char *message;
1199 int err, backend_id;
1201 backend = NULL;
1202 err = xenbus_gather(dev->nodename,
1203 "backend-id", "%i", &backend_id,
1204 "backend", NULL, &backend,
1205 NULL);
1206 if (XENBUS_EXIST_ERR(err))
1207 goto out;
1208 if (backend && strlen(backend) == 0) {
1209 err = -ENOENT;
1210 goto out;
1211 }
1212 if (err < 0) {
1213 xenbus_dev_error(dev, err, "reading %s/backend or backend-id",
1214 dev->nodename);
1215 goto out;
1216 }
1218 /* First device? We create shared ring, alloc event channel. */
1219 if (blkif_vbds == 0) {
1220 err = setup_blkring(dev, backend_id);
1221 if (err)
1222 goto out;
1223 }
1225 err = xenbus_transaction_start(dev->nodename);
1226 if (err) {
1227 xenbus_dev_error(dev, err, "starting transaction");
1228 goto destroy_blkring;
1229 }
1231 #ifdef CONFIG_XEN_BLKDEV_GRANT
1232 err = xenbus_printf(dev->nodename, "grant-id","%u", shmem_ref);
1233 if (err) {
1234 message = "writing grant-id";
1235 goto abort_transaction;
1236 }
1237 #else
1238 err = xenbus_printf(dev->nodename, "shared-frame", "%lu",
1239 virt_to_mfn(blk_ring.sring));
1240 if (err) {
1241 message = "writing shared-frame";
1242 goto abort_transaction;
1243 }
1244 #endif
1245 err = xenbus_printf(dev->nodename,
1246 "event-channel", "%u", blkif_evtchn);
1247 if (err) {
1248 message = "writing event-channel";
1249 goto abort_transaction;
1250 }
1252 info->backend = backend;
1253 backend = NULL;
1255 info->watch.node = info->backend;
1256 info->watch.callback = watch_for_status;
1257 err = register_xenbus_watch(&info->watch);
1258 if (err) {
1259 message = "registering watch on backend";
1260 goto abort_transaction;
1261 }
1263 err = xenbus_transaction_end(0);
1264 if (err) {
1265 xenbus_dev_error(dev, err, "completing transaction");
1266 goto destroy_blkring;
1267 }
1269 out:
1270 if (backend)
1271 kfree(backend);
1272 return err;
1274 abort_transaction:
1275 xenbus_transaction_end(1);
1276 /* Have to do this *outside* transaction. */
1277 xenbus_dev_error(dev, err, "%s", message);
1278 destroy_blkring:
1279 if (blkif_vbds == 0)
1280 blkif_free();
1281 goto out;
1282 }
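/*
 * Handshake sketch (illustrative; the exact store layout depends on the
 * toolstack): within one xenbus transaction the frontend writes either
 * "grant-id" or "shared-frame", plus "event-channel", under its own
 * dev->nodename directory, then registers a watch on the directory named by
 * the "backend" key.  watch_for_status() later reads "sectors", "info" and
 * "sector-size" from that backend directory once the backend has populated
 * them.
 */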
1284 /* Setup supplies the backend dir, virtual device.
1286 We place an event channel and shared frame entries.
1287 We watch backend to wait if it's ok. */
1288 static int blkfront_probe(struct xenbus_device *dev,
1289 const struct xenbus_device_id *id)
1290 {
1291 int err;
1292 struct blkfront_info *info;
1293 int vdevice;
1295 /* FIXME: Use dynamic device id if this is not set. */
1296 err = xenbus_scanf(dev->nodename, "virtual-device", "%i", &vdevice);
1297 if (XENBUS_EXIST_ERR(err))
1298 return err;
1299 if (err < 0) {
1300 xenbus_dev_error(dev, err, "reading virtual-device");
1301 return err;
1302 }
1304 info = kmalloc(sizeof(*info), GFP_KERNEL);
1305 if (!info) {
1306 xenbus_dev_error(dev, err, "allocating info structure");
1307 return err;
1308 }
1309 info->dev = dev;
1310 info->vdevice = vdevice;
1311 info->connected = 0;
1313 /* Front end dir is a number, which is used as the id. */
1314 info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
1315 dev->data = info;
1317 err = talk_to_backend(dev, info);
1318 if (err) {
1319 kfree(info);
1320 return err;
1321 }
1323 /* Call once in case entries already there. */
1324 watch_for_status(&info->watch, info->watch.node);
1325 blkif_vbds++;
1326 return 0;
1327 }
1329 static int blkfront_remove(struct xenbus_device *dev)
1330 {
1331 struct blkfront_info *info = dev->data;
1333 if (info->backend)
1334 unregister_xenbus_watch(&info->watch);
1336 if (info->connected) {
1337 xlvbd_del(info->handle);
1338 blkif_vbds_connected--;
1339 }
1340 kfree(info->backend);
1341 kfree(info);
1343 if (--blkif_vbds == 0)
1344 blkif_free();
1346 return 0;
1347 }
1349 static int blkfront_suspend(struct xenbus_device *dev)
1350 {
1351 struct blkfront_info *info = dev->data;
1353 unregister_xenbus_watch(&info->watch);
1354 kfree(info->backend);
1355 info->backend = NULL;
1357 if (--blkif_vbds == 0) {
1358 recovery = 1;
1359 blkif_free();
1360 }
1362 return 0;
1363 }
1365 static int blkfront_resume(struct xenbus_device *dev)
1366 {
1367 struct blkfront_info *info = dev->data;
1368 int err;
1370 /* FIXME: Check geometry hasn't changed here... */
1371 err = talk_to_backend(dev, info);
1372 if (!err) {
1373 if (blkif_vbds++ == 0)
1374 blkif_recover();
1375 }
1376 return err;
1377 }
1379 static struct xenbus_driver blkfront = {
1380 .name = "vbd",
1381 .owner = THIS_MODULE,
1382 .ids = blkfront_ids,
1383 .probe = blkfront_probe,
1384 .remove = blkfront_remove,
1385 .resume = blkfront_resume,
1386 .suspend = blkfront_suspend,
1387 };
1389 static void __init init_blk_xenbus(void)
1390 {
1391 xenbus_register_device(&blkfront);
1392 }
1394 static int wait_for_blkif(void)
1395 {
1396 int err = 0;
1397 int i;
1399 /*
1400 * We should read 'nr_interfaces' from response message and wait
1401 * for notifications before proceeding. For now we assume that we
1402 * will be notified of exactly one interface.
1403 */
1404 for ( i=0; blkif_state != BLKIF_STATE_CONNECTED && (i < 10*HZ); i++ )
1405 {
1406 set_current_state(TASK_INTERRUPTIBLE);
1407 schedule_timeout(1);
1408 }
1410 if ( blkif_state != BLKIF_STATE_CONNECTED )
1411 {
1412 WPRINTK("Timeout connecting to device!\n");
1413 err = -ENOSYS;
1414 }
1415 return err;
1416 }
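/*
 * Timing note (descriptive): each schedule_timeout(1) sleeps for one jiffy,
 * so the loop above polls for up to 10*HZ jiffies (roughly ten seconds)
 * before giving up with -ENOSYS.
 */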
1418 static int __init xlblk_init(void)
1419 {
1420 int i;
1422 #ifdef CONFIG_XEN_BLKDEV_GRANT
1423 /* A grant for every ring slot, plus one for the ring itself. */
1424 if (gnttab_alloc_grant_references(MAXIMUM_OUTSTANDING_BLOCK_REQS + 1,
1425 &gref_head, &gref_terminal) < 0)
1426 return 1;
1427 printk(KERN_ALERT "Blkif frontend is using grant tables.\n");
1428 #endif
1430 if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
1431 (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
1432 return 0;
1434 IPRINTK("Initialising virtual block device driver\n");
1436 blk_shadow_free = 0;
1437 memset(blk_shadow, 0, sizeof(blk_shadow));
1438 for ( i = 0; i < BLK_RING_SIZE; i++ )
1439 blk_shadow[i].req.id = i+1;
1440 blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
1442 init_blk_xenbus();
1444 wait_for_blkif();
1446 return 0;
1447 }
1449 static void blkif_completion(struct blk_shadow *s)
1450 {
1451 int i;
1452 #ifdef CONFIG_XEN_BLKDEV_GRANT
1453 for ( i = 0; i < s->req.nr_segments; i++ )
1454 gnttab_release_grant_reference(
1455 &gref_head, blkif_gref_from_fas(s->req.frame_and_sects[i]));
1456 #else
1457 /* This is a hack to get the dirty logging bits set */
1458 if ( s->req.operation == BLKIF_OP_READ )
1459 {
1460 for ( i = 0; i < s->req.nr_segments; i++ )
1461 {
1462 unsigned long pfn = s->req.frame_and_sects[i] >> PAGE_SHIFT;
1463 unsigned long mfn = phys_to_machine_mapping[pfn];
1464 xen_machphys_update(mfn, pfn);
1465 }
1466 }
1467 #endif
1468 }