ia64/linux-2.6.18-xen.hg

drivers/xen/blktap2/device.c @ 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, and IO retry logic to protect
guests from transient block device failures. This has been tested
and is known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD
image format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
parents
children a4b49dff3387
source
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
#include <linux/hdreg.h>
#include <linux/module.h>

#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>

#include <xen/xenbus.h>
#include <xen/interface/io/blkif.h>

#include "blktap.h"

#ifdef CONFIG_XEN_BLKDEV_BACKEND
#include "../blkback/blkback-pagemap.h"
#else
struct blkback_pagemap { };
#define blkback_pagemap_read(page) BUG();
#endif

#if 0
#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
#else
#define DPRINTK_IOCTL(_f, _a...) ((void)0)
#endif
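/*
 * A request may need two grant-map operations per segment (one
 * mapping into the kernel, one into the tapdisk process), so the
 * table holds 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST entries.
 */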
struct blktap_grant_table {
	int cnt;
	struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
};

static int blktap_device_major;
static inline struct blktap *
dev_to_blktap(struct blktap_device *dev)
{
	return container_of(dev, struct blktap, device);
}
static int
blktap_device_open(struct inode *inode, struct file *filep)
{
	struct blktap *tap;
	struct blktap_device *dev = inode->i_bdev->bd_disk->private_data;

	if (!dev)
		return -ENOENT;

	tap = dev_to_blktap(dev);
	if (!blktap_active(tap) ||
	    test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		return -ENOENT;

	dev->users++;

	return 0;
}
static int
blktap_device_release(struct inode *inode, struct file *filep)
{
	struct blktap_device *dev = inode->i_bdev->bd_disk->private_data;
	struct blktap *tap = dev_to_blktap(dev);

	dev->users--;
	if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
		blktap_device_destroy(tap);

	return 0;
}
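/*
 * Fabricate a CHS geometry (255 heads, 63 sectors per track, with
 * the cylinder count derived from the capacity) so that HDIO_GETGEO
 * returns something plausible for tools that still expect geometry.
 */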
static int
blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
	/* We don't have real geometry info, but let's at least return
	   values consistent with the size of the device */
	sector_t nsect = get_capacity(bd->bd_disk);
	sector_t cylinders = nsect;

	hg->heads = 0xff;
	hg->sectors = 0x3f;
	sector_div(cylinders, hg->heads * hg->sectors);
	hg->cylinders = cylinders;
	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
		hg->cylinders = 0xffff;
	return 0;
}
static int
blktap_device_ioctl(struct inode *inode, struct file *filep,
		    unsigned command, unsigned long argument)
{
	int i;

	DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
		      command, (long)argument, inode->i_rdev);

	switch (command) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
	case HDIO_GETGEO: {
		struct block_device *bd = inode->i_bdev;
		struct hd_geometry geo;
		int ret;

		if (!argument)
			return -EINVAL;

		geo.start = get_start_sect(bd);
		ret = blktap_device_getgeo(bd, &geo);
		if (ret)
			return ret;

		if (copy_to_user((struct hd_geometry __user *)argument, &geo,
				 sizeof(geo)))
			return -EFAULT;

		return 0;
	}
#endif
	case CDROMMULTISESSION:
		BTDBG("FIXME: support multisession CDs later\n");
		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
			if (put_user(0, (char __user *)(argument + i)))
				return -EFAULT;
		return 0;

	case SCSI_IOCTL_GET_IDLUN:
		if (!access_ok(VERIFY_WRITE, argument,
			       sizeof(struct scsi_idlun)))
			return -EFAULT;

		/* return 0 for now. */
		__put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
		__put_user(0,
			   &((struct scsi_idlun __user *)argument)->host_unique_id);
		return 0;

	default:
		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
		  command);*/
		return -EINVAL; /* same return as native Linux */
	}

	return 0;
}
static struct block_device_operations blktap_device_file_operations = {
	.owner   = THIS_MODULE,
	.open    = blktap_device_open,
	.release = blktap_device_release,
	.ioctl   = blktap_device_ioctl,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
	.getgeo  = blktap_device_getgeo
#endif
};
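/*
 * Single-page PTE helpers: apply_to_page_range() is invoked over
 * exactly PAGE_SIZE, so each callback installs or clears one PTE
 * and flushes its TLB entry with xen_invlpg().
 */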
static int
blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
		    unsigned long addr, void *data)
{
	pte_t *pte = (pte_t *)data;

	BTDBG("ptep %p -> %012llx\n", ptep, pte_val(*pte));
	set_pte(ptep, *pte);
	xen_invlpg(addr);
	return 0;
}

static int
blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
{
	return apply_to_page_range(mm, address,
				   PAGE_SIZE, blktap_map_uaddr_fn, &pte);
}

static int
blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
		     unsigned long addr, void *data)
{
	struct mm_struct *mm = (struct mm_struct *)data;

	BTDBG("ptep %p\n", ptep);
	pte_clear(mm, addr, ptep);
	xen_invlpg(addr);
	return 0;
}

static int
blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
{
	return apply_to_page_range(mm, address,
				   PAGE_SIZE, blktap_umap_uaddr_fn, mm);
}
static void
blktap_device_end_dequeued_request(struct blktap_device *dev,
				   struct request *req, int uptodate)
{
	int ret;

	ret = end_that_request_first(req, uptodate, req->hard_nr_sectors);
	BUG_ON(ret);

	spin_lock_irq(&dev->lock);
	end_that_request_last(req, uptodate);
	spin_unlock_irq(&dev->lock);
}
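/*
 * Tear down every mapping associated with @request: unmap kernel
 * and user grant handles in one batched hypercall, drop the pages
 * from the ring's foreign_map, and zap the tapdisk VA range.
 */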
/*
 * tap->tap_sem held on entry
 */
static void
blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
{
	uint64_t ptep;
	int ret, usr_idx;
	unsigned int i, cnt;
	struct page **map, *page;
	struct blktap_ring *ring;
	struct grant_handle_pair *khandle;
	unsigned long kvaddr, uvaddr, offset;
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];

	cnt = 0;
	ring = &tap->ring;
	usr_idx = request->usr_idx;
	map = ring->foreign_map.map;

	if (!ring->vma)
		return;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		zap_page_range(ring->vma,
			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
			       request->nr_pages << PAGE_SHIFT, NULL);

	for (i = 0; i < request->nr_pages; i++) {
		kvaddr = request_to_kaddr(request, i);
		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);

		khandle = request->handles + i;

		if (khandle->kernel != INVALID_GRANT_HANDLE) {
			gnttab_set_unmap_op(&unmap[cnt], kvaddr,
					    GNTMAP_host_map, khandle->kernel);
			cnt++;
			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
					    INVALID_P2M_ENTRY);
		}

		if (khandle->user != INVALID_GRANT_HANDLE) {
			BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
			if (create_lookup_pte_addr(ring->vma->vm_mm,
						   uvaddr, &ptep) != 0) {
				BTERR("Couldn't get a pte addr!\n");
				return;
			}

			gnttab_set_unmap_op(&unmap[cnt], ptep,
					    GNTMAP_host_map
					    | GNTMAP_application_map
					    | GNTMAP_contains_pte,
					    khandle->user);
			cnt++;
		}

		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;

		BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
		      "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
		      "0x%08lx, handle: %u\n", offset, map[offset], request,
		      usr_idx, i, kvaddr, khandle->kernel, uvaddr,
		      khandle->user);

		page = map[offset];
		if (page) {
			ClearPageReserved(map[offset]);
			if (PageBlkback(page)) {
				ClearPageBlkback(page);
				set_page_private(page, 0);
			}
		}
		map[offset] = NULL;

		khandle->kernel = INVALID_GRANT_HANDLE;
		khandle->user = INVALID_GRANT_HANDLE;
	}

	if (cnt) {
		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
						unmap, cnt);
		BUG_ON(ret);
	}

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		zap_page_range(ring->vma,
			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
			       request->nr_pages << PAGE_SHIFT, NULL);
}
/*
 * tap->tap_sem held on entry
 */
static void
blktap_unmap(struct blktap *tap, struct blktap_request *request)
{
	int i, usr_idx;
	unsigned long kvaddr;

	usr_idx = request->usr_idx;
	down_write(&tap->ring.vma->vm_mm->mmap_sem);

	for (i = 0; i < request->nr_pages; i++) {
		BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
		      "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
		      request_to_kaddr(request, i),
		      request->handles[i].kernel,
		      MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
		      request->handles[i].user);

		if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
			kvaddr = request_to_kaddr(request, i);
			blktap_umap_uaddr(&init_mm, kvaddr);
			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
					    INVALID_P2M_ENTRY);
		}
	}

	blktap_device_fast_flush(tap, request);
	up_write(&tap->ring.vma->vm_mm->mmap_sem);
}
/*
 * called if the tapdisk process dies unexpectedly.
 * fail and release any pending requests and disable queue.
 */
void
blktap_device_fail_pending_requests(struct blktap *tap)
{
	int usr_idx;
	struct request *req;
	struct blktap_device *dev;
	struct blktap_request *request;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
		return;

	down_write(&tap->tap_sem);

	dev = &tap->device;
	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
		request = tap->pending_requests[usr_idx];
		if (!request || request->status != BLKTAP_REQUEST_PENDING)
			continue;

		BTERR("%u:%u: failing pending %s of %d pages\n",
		      blktap_device_major, tap->minor,
		      (request->operation == BLKIF_OP_READ ?
		       "read" : "write"), request->nr_pages);

		blktap_unmap(tap, request);
		req = (struct request *)(unsigned long)request->id;
		blktap_device_end_dequeued_request(dev, req, 0);
		blktap_request_free(tap, request);
	}

	up_write(&tap->tap_sem);

	spin_lock_irq(&dev->lock);

	/* fail any future requests */
	dev->gd->queue->queuedata = NULL;
	blk_start_queue(dev->gd->queue);

	spin_unlock_irq(&dev->lock);
}
/*
 * tap->tap_sem held on entry
 */
void
blktap_device_finish_request(struct blktap *tap,
			     blkif_response_t *res,
			     struct blktap_request *request)
{
	int uptodate;
	struct request *req;
	struct blktap_device *dev;

	dev = &tap->device;

	blktap_unmap(tap, request);

	req = (struct request *)(unsigned long)request->id;
	uptodate = (res->status == BLKIF_RSP_OKAY);

	BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
	      res->status, res->operation, request->operation, res->id);

	switch (request->operation) {
	case BLKIF_OP_READ:
	case BLKIF_OP_WRITE:
		if (unlikely(res->status != BLKIF_RSP_OKAY))
			BTERR("Bad return from device data "
			      "request: %x\n", res->status);
		blktap_device_end_dequeued_request(dev, req, uptodate);
		break;
	default:
		BUG();
	}

	blktap_request_free(tap, request);
}
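/*
 * The bio page is already foreign (it carries blkback's pagemap
 * entry): look up the original (domid, gref) pair and remap that
 * grant into the tap's kernel and user address spaces instead of
 * copying the data.
 */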
static int
blktap_prep_foreign(struct blktap *tap,
		    struct blktap_request *request,
		    blkif_request_t *blkif_req,
		    unsigned int seg, struct page *page,
		    struct blktap_grant_table *table)
{
	uint64_t ptep;
	uint32_t flags;
	struct page *tap_page;
	struct blktap_ring *ring;
	struct blkback_pagemap map;
	unsigned long uvaddr, kvaddr;

	ring = &tap->ring;
	map = blkback_pagemap_read(page);
	blkif_req->seg[seg].gref = map.gref;

	uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
	kvaddr = request_to_kaddr(request, seg);
	flags = GNTMAP_host_map |
		(request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);

	gnttab_set_map_op(&table->grants[table->cnt],
			  kvaddr, flags, map.gref, map.domid);
	table->cnt++;

	/* enable chained tap devices */
	tap_page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
	set_page_private(tap_page, page_private(page));
	SetPageBlkback(tap_page);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
		BTERR("couldn't get a pte addr!\n");
		return -1;
	}

	flags |= GNTMAP_application_map | GNTMAP_contains_pte;
	gnttab_set_map_op(&table->grants[table->cnt],
			  ptep, flags, map.gref, map.domid);
	table->cnt++;

	return 0;
}
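/*
 * Issue the batched grant-map hypercall prepared by
 * blktap_prep_foreign() and record the resulting handles (plus the
 * p2m or user-VA mappings) for each foreign segment.
 */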
static int
blktap_map_foreign(struct blktap *tap,
		   struct blktap_request *request,
		   blkif_request_t *blkif_req,
		   struct blktap_grant_table *table)
{
	struct page *page;
	int i, grant, err, usr_idx;
	struct blktap_ring *ring;
	unsigned long uvaddr, kvaddr, foreign_mfn;

	if (!table->cnt)
		return 0;

	err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
					table->grants, table->cnt);
	BUG_ON(err);

	grant = 0;
	usr_idx = request->usr_idx;
	ring = &tap->ring;

	for (i = 0; i < request->nr_pages; i++) {
		if (!blkif_req->seg[i].gref)
			continue;

		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
		kvaddr = request_to_kaddr(request, i);

		if (unlikely(table->grants[grant].status)) {
			BTERR("invalid kernel buffer: could not remap it\n");
			err |= 1;
			table->grants[grant].handle = INVALID_GRANT_HANDLE;
		}

		request->handles[i].kernel = table->grants[grant].handle;
		foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
		grant++;

		if (xen_feature(XENFEAT_auto_translated_physmap))
			goto done;

		if (unlikely(table->grants[grant].status)) {
			BTERR("invalid user buffer: could not remap it\n");
			err |= 1;
			table->grants[grant].handle = INVALID_GRANT_HANDLE;
		}

		request->handles[i].user = table->grants[grant].handle;
		grant++;

	done:
		if (err)
			continue;

		page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
					    FOREIGN_FRAME(foreign_mfn));
		else if (vm_insert_page(ring->vma, uvaddr, page))
			err |= 1;

		BTDBG("pending_req: %p, seg: %d, page: %p, "
		      "kvaddr: 0x%08lx, khandle: %u, uvaddr: 0x%08lx, "
		      "uhandle: %u\n", request, i, page,
		      kvaddr, request->handles[i].kernel,
		      uvaddr, request->handles[i].user);
	}

	return err;
}
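/*
 * Ordinary local page: point both the tapdisk user VA and the
 * kernel VA at the bio page directly. No grant handles are needed,
 * so both handle slots are marked invalid.
 */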
static void
blktap_map(struct blktap *tap,
	   struct blktap_request *request,
	   unsigned int seg, struct page *page)
{
	pte_t pte;
	int usr_idx;
	struct blktap_ring *ring;
	unsigned long uvaddr, kvaddr;

	ring = &tap->ring;
	usr_idx = request->usr_idx;
	uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
	kvaddr = request_to_kaddr(request, seg);

	pte = mk_pte(page, ring->vma->vm_page_prot);
	blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
	blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL));

	set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
	request->handles[seg].kernel = INVALID_GRANT_HANDLE;
	request->handles[seg].user = INVALID_GRANT_HANDLE;

	BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
	      "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
	      uvaddr);
}
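/*
 * Translate one struct request into a blkif_request: map each bio
 * segment into the tap's address space, then publish the message
 * on the shared user ring.
 */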
static int
blktap_device_process_request(struct blktap *tap,
			      struct blktap_request *request,
			      struct request *req)
{
	struct bio *bio;
	struct page *page;
	struct bio_vec *bvec;
	int idx, usr_idx, err;
	struct blktap_ring *ring;
	struct blktap_grant_table table;
	unsigned int fsect, lsect, nr_sects;
	unsigned long offset, uvaddr, kvaddr;
	struct blkif_request blkif_req, *target;

	err = -1;
	memset(&table, 0, sizeof(table));

	if (!blktap_active(tap))
		goto out;

	ring = &tap->ring;
	usr_idx = request->usr_idx;
	blkif_req.id = usr_idx;
	blkif_req.sector_number = (blkif_sector_t)req->sector;
	blkif_req.handle = 0;
	blkif_req.operation = rq_data_dir(req) ?
		BLKIF_OP_WRITE : BLKIF_OP_READ;

	request->id = (unsigned long)req;
	request->operation = blkif_req.operation;
	request->status = BLKTAP_REQUEST_PENDING;
	do_gettimeofday(&request->time);

	nr_sects = 0;
	request->nr_pages = 0;
	blkif_req.nr_segments = 0;
	rq_for_each_bio(bio, req) {
		bio_for_each_segment(bvec, bio, idx) {
			BUG_ON(blkif_req.nr_segments ==
			       BLKIF_MAX_SEGMENTS_PER_REQUEST);

			fsect = bvec->bv_offset >> 9;
			lsect = fsect + (bvec->bv_len >> 9) - 1;
			nr_sects += bvec->bv_len >> 9;

			blkif_req.seg[blkif_req.nr_segments] =
				(struct blkif_request_segment) {
				.gref       = 0,
				.first_sect = fsect,
				.last_sect  = lsect };

			if (PageBlkback(bvec->bv_page)) {
				/* foreign page -- use xen */
				if (blktap_prep_foreign(tap,
							request,
							&blkif_req,
							blkif_req.nr_segments,
							bvec->bv_page,
							&table))
					goto out;
			} else {
				/* do it the old fashioned way */
				blktap_map(tap,
					   request,
					   blkif_req.nr_segments,
					   bvec->bv_page);
			}

			uvaddr = MMAP_VADDR(ring->user_vstart,
					    usr_idx, blkif_req.nr_segments);
			kvaddr = request_to_kaddr(request,
						  blkif_req.nr_segments);
			offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
			page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
			ring->foreign_map.map[offset] = page;
			SetPageReserved(page);

			BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
			      uvaddr, page, __pa(kvaddr) >> PAGE_SHIFT);
			BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
			      "page: %p, kvaddr: 0x%08lx, uvaddr: 0x%08lx\n",
			      offset, request, blkif_req.nr_segments,
			      page, kvaddr, uvaddr);

			blkif_req.nr_segments++;
			request->nr_pages++;
		}
	}

	if (blktap_map_foreign(tap, request, &blkif_req, &table))
		goto out;

	/* Finally, write the request message to the user ring. */
	target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
	memcpy(target, &blkif_req, sizeof(blkif_req));
	target->id = request->usr_idx;
	wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
	ring->ring.req_prod_pvt++;

	if (rq_data_dir(req)) {
		tap->stats.st_wr_sect += nr_sects;
		tap->stats.st_wr_req++;
	} else {
		tap->stats.st_rd_sect += nr_sects;
		tap->stats.st_rd_req++;
	}

	err = 0;

out:
	if (err)
		blktap_device_fast_flush(tap, request);
	return err;
}
#ifdef ENABLE_PASSTHROUGH
#define rq_for_each_bio_safe(_bio, _tmp, _req)				\
	if ((_req)->bio)						\
		for (_bio = (_req)->bio;				\
		     _bio && ((_tmp = _bio->bi_next) || 1);		\
		     _bio = _tmp)

static void
blktap_device_forward_request(struct blktap *tap, struct request *req)
{
	struct bio *bio, *tmp;
	struct blktap_device *dev;

	dev = &tap->device;

	rq_for_each_bio_safe(bio, tmp, req) {
		bio->bi_bdev = dev->bdev;
		submit_bio(bio->bi_rw, bio);
	}
}

static void
blktap_device_close_bdev(struct blktap *tap)
{
	struct blktap_device *dev;

	dev = &tap->device;

	if (dev->bdev)
		blkdev_put(dev->bdev);

	dev->bdev = NULL;
	clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
}

static int
blktap_device_open_bdev(struct blktap *tap, u32 pdev)
{
	struct block_device *bdev;
	struct blktap_device *dev;

	dev = &tap->device;

	bdev = open_by_devnum(pdev, FMODE_WRITE);
	if (IS_ERR(bdev)) {
		BTERR("opening device %x:%x failed: %ld\n",
		      MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
		return PTR_ERR(bdev);
	}

	if (!bdev->bd_disk) {
		BTERR("device %x:%x doesn't exist\n",
		      MAJOR(pdev), MINOR(pdev));
		blkdev_put(dev->bdev);
		return -ENOENT;
	}

	dev->bdev = bdev;
	set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);

	/* TODO: readjust queue parameters */

	BTINFO("set device %d to passthrough on %x:%x\n",
	       tap->minor, MAJOR(pdev), MINOR(pdev));

	return 0;
}

int
blktap_device_enable_passthrough(struct blktap *tap,
				 unsigned major, unsigned minor)
{
	u32 pdev;
	struct blktap_device *dev;

	dev = &tap->device;
	pdev = MKDEV(major, minor);

	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return -EINVAL;

	if (dev->bdev) {
		if (pdev)
			return -EINVAL;
		blktap_device_close_bdev(tap);
		return 0;
	}

	return blktap_device_open_bdev(tap, pdev);
}
#endif
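/*
 * Pull requests off the gendisk queue and feed them to tapdisk,
 * stopping the queue when the shared ring fills up or request
 * structs run out.
 */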
/*
 * dev->lock held on entry
 */
static void
blktap_device_run_queue(struct blktap *tap)
{
	int queued, err;
	request_queue_t *rq;
	struct request *req;
	struct blktap_ring *ring;
	struct blktap_device *dev;
	struct blktap_request *request;

	queued = 0;
	ring = &tap->ring;
	dev = &tap->device;
	rq = dev->gd->queue;

	BTDBG("running queue for %d\n", tap->minor);

	while ((req = elv_next_request(rq)) != NULL) {
		if (!blk_fs_request(req)) {
			end_request(req, 0);
			continue;
		}

		if (blk_barrier_rq(req)) {
			end_request(req, 0);
			continue;
		}

#ifdef ENABLE_PASSTHROUGH
		if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
			blkdev_dequeue_request(req);
			blktap_device_forward_request(tap, req);
			continue;
		}
#endif

		if (RING_FULL(&ring->ring)) {
		wait:
			/* Avoid pointless unplugs. */
			blk_stop_queue(rq);
			blktap_defer(tap);
			break;
		}

		request = blktap_request_allocate(tap);
		if (!request) {
			tap->stats.st_oo_req++;
			goto wait;
		}

		BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) "
		      "buffer:%p [%s], pending: %p\n", req, tap->minor,
		      req->cmd, req->sector, req->current_nr_sectors,
		      req->nr_sectors, req->buffer,
		      rq_data_dir(req) ? "write" : "read", request);

		blkdev_dequeue_request(req);

		spin_unlock_irq(&dev->lock);
		down_read(&tap->tap_sem);

		err = blktap_device_process_request(tap, request, req);
		if (!err)
			queued++;
		else {
			blktap_device_end_dequeued_request(dev, req, 0);
			blktap_request_free(tap, request);
		}

		up_read(&tap->tap_sem);
		spin_lock_irq(&dev->lock);
	}

	if (queued)
		blktap_ring_kick_user(tap);
}
/*
 * dev->lock held on entry
 */
static void
blktap_device_do_request(request_queue_t *rq)
{
	struct request *req;
	struct blktap *tap;
	struct blktap_device *dev;

	dev = rq->queuedata;
	if (!dev)
		goto fail;

	tap = dev_to_blktap(dev);
	if (!blktap_active(tap))
		goto fail;

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
		blktap_defer(tap);
		return;
	}

	blktap_device_run_queue(tap);
	return;

fail:
	while ((req = elv_next_request(rq))) {
		BTERR("device closed: failing secs %llu - %llu\n",
		      req->sector, req->sector + req->nr_sectors);
		end_request(req, 0);
	}
}
void
blktap_device_restart(struct blktap *tap)
{
	struct blktap_device *dev;

	dev = &tap->device;
	if (!dev->gd || !dev->gd->queue)
		return;

	if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
		blktap_defer(tap);
		return;
	}

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
	    test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
		blktap_defer(tap);
		return;
	}

	spin_lock_irq(&dev->lock);

	/* Re-enable calldowns. */
	if (blk_queue_stopped(dev->gd->queue))
		blk_start_queue(dev->gd->queue);

	/* Kick things off immediately. */
	blktap_device_do_request(dev->gd->queue);

	spin_unlock_irq(&dev->lock);
}
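/*
 * Propagate the current tap->params (capacity, sector size) and
 * the blkif segment limits to the gendisk and its request queue.
 */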
static void
blktap_device_configure(struct blktap *tap)
{
	struct request_queue *rq;
	struct blktap_device *dev = &tap->device;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
		return;

	dev = &tap->device;
	rq = dev->gd->queue;

	spin_lock_irq(&dev->lock);

	set_capacity(dev->gd, tap->params.capacity);

	/* Hard sector size and max sectors impersonate the equiv. hardware. */
	blk_queue_hardsect_size(rq, tap->params.sector_size);
	blk_queue_max_sectors(rq, 512);

	/* Each segment in a request is up to an aligned page in size. */
	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
	blk_queue_max_segment_size(rq, PAGE_SIZE);

	/* Ensure a merged request will fit in a single I/O ring slot. */
	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);

	/* Make sure buffer addresses are sector-aligned. */
	blk_queue_dma_alignment(rq, 511);

	spin_unlock_irq(&dev->lock);
}
int
blktap_device_resume(struct blktap *tap)
{
	int err;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
		return -ENODEV;

	if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return 0;

	err = blktap_ring_resume(tap);
	if (err)
		return err;

	/* device size may have changed */
	blktap_device_configure(tap);

	BTDBG("restarting device\n");
	blktap_device_restart(tap);

	return 0;
}
int
blktap_device_pause(struct blktap *tap)
{
	unsigned long flags;
	struct blktap_device *dev = &tap->device;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
		return -ENODEV;

	if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
		return 0;

	spin_lock_irqsave(&dev->lock, flags);

	blk_stop_queue(dev->gd->queue);
	set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);

	spin_unlock_irqrestore(&dev->lock, flags);

	return blktap_ring_pause(tap);
}
int
blktap_device_destroy(struct blktap *tap)
{
	struct blktap_device *dev = &tap->device;

	if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
		return 0;

	BTINFO("destroy device %d users %d\n", tap->minor, dev->users);

	if (dev->users)
		return -EBUSY;

	spin_lock_irq(&dev->lock);
	/* No more blktap_device_do_request(). */
	blk_stop_queue(dev->gd->queue);
	clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
	spin_unlock_irq(&dev->lock);

#ifdef ENABLE_PASSTHROUGH
	if (dev->bdev)
		blktap_device_close_bdev(tap);
#endif

	del_gendisk(dev->gd);
	put_disk(dev->gd);
	blk_cleanup_queue(dev->gd->queue);

	dev->gd = NULL;

	wake_up(&tap->wq);

	return 0;
}
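/*
 * Allocate and register the gendisk for this tap. Minors 0-25 are
 * named tapdeva..tapdevz; larger minors get a two-letter suffix
 * (tapdevaa, tapdevab, ...).
 */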
int
blktap_device_create(struct blktap *tap)
{
	int minor, err;
	struct gendisk *gd;
	struct request_queue *rq;
	struct blktap_device *dev;

	gd = NULL;
	rq = NULL;
	dev = &tap->device;
	minor = tap->minor;

	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
		return -EEXIST;

	if (blktap_validate_params(tap, &tap->params))
		return -EINVAL;

	BTINFO("minor %d sectors %Lu sector-size %lu\n",
	       minor, tap->params.capacity, tap->params.sector_size);

	err = -ENODEV;

	gd = alloc_disk(1);
	if (!gd)
		goto error;

	if (minor < 26)
		sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
	else
		sprintf(gd->disk_name, "tapdev%c%c",
			'a' + ((minor / 26) - 1), 'a' + (minor % 26));

	gd->major = blktap_device_major;
	gd->first_minor = minor;
	gd->fops = &blktap_device_file_operations;
	gd->private_data = dev;

	spin_lock_init(&dev->lock);
	rq = blk_init_queue(blktap_device_do_request, &dev->lock);
	if (!rq)
		goto error;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
	elevator_init(rq, "noop");
#else
	elevator_init(rq, &elevator_noop);
#endif

	gd->queue = rq;
	rq->queuedata = dev;
	dev->gd = gd;

	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
	blktap_device_configure(tap);

	add_disk(gd);

	err = 0;
	goto out;

error:
	if (gd)
		del_gendisk(gd);
	if (rq)
		blk_cleanup_queue(rq);

out:
	BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
	return err;
}
int
blktap_device_init(int *maj)
{
	int major;

	/* Dynamically allocate a major for this device */
	major = register_blkdev(0, "tapdev");
	if (major < 0) {
		BTERR("Couldn't register blktap device\n");
		return -ENOMEM;
	}

	blktap_device_major = *maj = major;
	BTINFO("blktap device major %d\n", major);

	return 0;
}
void
blktap_device_free(void)
{
	if (blktap_device_major)
		if (unregister_blkdev(blktap_device_major, "tapdev"))
			BTERR("blktap device unregister failed\n");
}