ia64/linux-2.6.18-xen.hg

view drivers/xen/blktap2/device.c @ 893:f994bfe9b93b

linux/blktap2: reduce TLB flush scope

c/s 885 added very coarse TLB flushing. Since these flushes always
follow single-page updates, single-page flushes (when available) are
sufficient.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 04 10:32:57 2009 +0100 (2009-06-04)
parents 485fe5efa4ff
children 4e5cd2fc45a7
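
For reference, the single-page flush in question is the flush_tlb_kernel_page() helper defined later in this file; it is reproduced here with explanatory comments added (the x86 branch relies on the Xen kernel's xen_invlpg_all() helper; this is for illustration, not the actual c/s 885 diff):

#include <asm/tlbflush.h>	/* TLB flush primitives (also included by device.c itself) */

/* After a single kernel PTE update at kvaddr, flush only that page's mapping. */
static inline void
flush_tlb_kernel_page(unsigned long kvaddr)
{
#ifdef CONFIG_X86
	xen_invlpg_all(kvaddr);		/* one INVLPG, propagated to all vCPUs */
#else
	flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);	/* exactly one page */
#endif
}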
line source
1 #include <linux/fs.h>
2 #include <linux/blkdev.h>
3 #include <linux/cdrom.h>
4 #include <linux/hdreg.h>
5 #include <linux/module.h>
6 #include <asm/tlbflush.h>
8 #include <scsi/scsi.h>
9 #include <scsi/scsi_ioctl.h>
11 #include <xen/xenbus.h>
12 #include <xen/interface/io/blkif.h>
14 #include "blktap.h"
16 #if defined(CONFIG_XEN_BLKDEV_BACKEND) || \
17 (defined(CONFIG_XEN_BLKDEV_BACKEND_MODULE) && defined(MODULE))
18 #include "../blkback/blkback-pagemap.h"
19 #else
20 struct blkback_pagemap { };
21 #define blkback_pagemap_read(page) BUG();
22 #endif
24 #if 0
25 #define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
26 #else
27 #define DPRINTK_IOCTL(_f, _a...) ((void)0)
28 #endif
30 struct blktap_grant_table {
31 int cnt;
32 struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
33 };
35 static int blktap_device_major;
37 static inline struct blktap *
38 dev_to_blktap(struct blktap_device *dev)
39 {
40 return container_of(dev, struct blktap, device);
41 }
43 static int
44 blktap_device_open(struct inode *inode, struct file *filep)
45 {
46 struct blktap *tap;
47 struct blktap_device *dev = inode->i_bdev->bd_disk->private_data;
49 if (!dev)
50 return -ENOENT;
52 tap = dev_to_blktap(dev);
53 if (!blktap_active(tap) ||
54 test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
55 return -ENOENT;
57 dev->users++;
59 return 0;
60 }
62 static int
63 blktap_device_release(struct inode *inode, struct file *filep)
64 {
65 struct blktap_device *dev = inode->i_bdev->bd_disk->private_data;
66 struct blktap *tap = dev_to_blktap(dev);
68 dev->users--;
69 if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
70 blktap_device_destroy(tap);
72 return 0;
73 }
75 static int
76 blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
77 {
78 /* We don't have real geometry info, but let's at least return
79 values consistent with the size of the device */
80 sector_t nsect = get_capacity(bd->bd_disk);
81 sector_t cylinders = nsect;
83 hg->heads = 0xff;
84 hg->sectors = 0x3f;
85 sector_div(cylinders, hg->heads * hg->sectors);
86 hg->cylinders = cylinders;
87 if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
88 hg->cylinders = 0xffff;
89 return 0;
90 }
92 static int
93 blktap_device_ioctl(struct inode *inode, struct file *filep,
94 unsigned command, unsigned long argument)
95 {
96 int i;
98 DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
99 command, (long)argument, inode->i_rdev);
101 switch (command) {
102 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
103 case HDIO_GETGEO: {
104 struct block_device *bd = inode->i_bdev;
105 struct hd_geometry geo;
106 int ret;
108 if (!argument)
109 return -EINVAL;
111 geo.start = get_start_sect(bd);
112 ret = blktap_device_getgeo(bd, &geo);
113 if (ret)
114 return ret;
116 if (copy_to_user((struct hd_geometry __user *)argument, &geo,
117 sizeof(geo)))
118 return -EFAULT;
120 return 0;
121 }
122 #endif
123 case CDROMMULTISESSION:
124 BTDBG("FIXME: support multisession CDs later\n");
125 for (i = 0; i < sizeof(struct cdrom_multisession); i++)
126 if (put_user(0, (char __user *)(argument + i)))
127 return -EFAULT;
128 return 0;
130 case SCSI_IOCTL_GET_IDLUN:
131 if (!access_ok(VERIFY_WRITE, argument,
132 sizeof(struct scsi_idlun)))
133 return -EFAULT;
135 /* return 0 for now. */
136 __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
137 __put_user(0,
138 &((struct scsi_idlun __user *)argument)->host_unique_id);
139 return 0;
141 default:
142 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
143 command);*/
144 return -EINVAL; /* same return as native Linux */
145 }
147 return 0;
148 }
150 static struct block_device_operations blktap_device_file_operations = {
151 .owner = THIS_MODULE,
152 .open = blktap_device_open,
153 .release = blktap_device_release,
154 .ioctl = blktap_device_ioctl,
155 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
156 .getgeo = blktap_device_getgeo
157 #endif
158 };
160 static int
161 blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
162 unsigned long addr, void *data)
163 {
164 pte_t *pte = (pte_t *)data;
166 BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte));
167 set_pte(ptep, *pte);
168 return 0;
169 }
171 static int
172 blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
173 {
174 return apply_to_page_range(mm, address,
175 PAGE_SIZE, blktap_map_uaddr_fn, &pte);
176 }
178 static int
179 blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
180 unsigned long addr, void *data)
181 {
182 struct mm_struct *mm = (struct mm_struct *)data;
184 BTDBG("ptep %p\n", ptep);
185 pte_clear(mm, addr, ptep);
186 return 0;
187 }
189 static int
190 blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
191 {
192 return apply_to_page_range(mm, address,
193 PAGE_SIZE, blktap_umap_uaddr_fn, mm);
194 }
196 static inline void
197 flush_tlb_kernel_page(unsigned long kvaddr)
198 {
199 #ifdef CONFIG_X86
200 xen_invlpg_all(kvaddr);
201 #else
202 flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
203 #endif
204 }
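/* Note: the helper above is the single-page flush this changeset switches to --
 * on x86 a single INVLPG propagated to all vCPUs via xen_invlpg_all(), elsewhere
 * a flush_tlb_kernel_range() covering exactly one page. */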
206 static void
207 blktap_device_end_dequeued_request(struct blktap_device *dev,
208 struct request *req, int uptodate)
209 {
210 int ret;
212 ret = end_that_request_first(req, uptodate, req->hard_nr_sectors);
213 BUG_ON(ret);
215 spin_lock_irq(&dev->lock);
216 end_that_request_last(req, uptodate);
217 spin_unlock_irq(&dev->lock);
218 }
220 /*
221 * tap->tap_sem held on entry
222 */
223 static void
224 blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
225 {
225 {
226 uint64_t ptep;
227 int ret, usr_idx;
228 unsigned int i, cnt;
229 struct page **map, *page;
230 struct blktap_ring *ring;
231 struct grant_handle_pair *khandle;
232 unsigned long kvaddr, uvaddr, offset;
233 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
235 cnt = 0;
236 ring = &tap->ring;
237 usr_idx = request->usr_idx;
238 map = ring->foreign_map.map;
240 if (!ring->vma)
241 return;
243 if (xen_feature(XENFEAT_auto_translated_physmap))
244 zap_page_range(ring->vma,
245 MMAP_VADDR(ring->user_vstart, usr_idx, 0),
246 request->nr_pages << PAGE_SHIFT, NULL);
248 for (i = 0; i < request->nr_pages; i++) {
249 kvaddr = request_to_kaddr(request, i);
250 uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
252 khandle = request->handles + i;
254 if (khandle->kernel != INVALID_GRANT_HANDLE) {
255 gnttab_set_unmap_op(&unmap[cnt], kvaddr,
256 GNTMAP_host_map, khandle->kernel);
257 cnt++;
258 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
259 INVALID_P2M_ENTRY);
260 }
262 if (khandle->user != INVALID_GRANT_HANDLE) {
263 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
264 if (create_lookup_pte_addr(ring->vma->vm_mm,
265 uvaddr, &ptep) != 0) {
266 BTERR("Couldn't get a pte addr!\n");
267 return;
268 }
270 gnttab_set_unmap_op(&unmap[cnt], ptep,
271 GNTMAP_host_map
272 | GNTMAP_application_map
273 | GNTMAP_contains_pte,
274 khandle->user);
275 cnt++;
276 }
278 offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
280 BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
281 "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
282 "0x%08lx, handle: %u\n", offset, map[offset], request,
283 usr_idx, i, kvaddr, khandle->kernel, uvaddr,
284 khandle->user);
286 page = map[offset];
287 if (page) {
288 ClearPageReserved(map[offset]);
289 if (PageBlkback(page)) {
290 ClearPageBlkback(page);
291 set_page_private(page, 0);
292 }
293 }
294 map[offset] = NULL;
296 khandle->kernel = INVALID_GRANT_HANDLE;
297 khandle->user = INVALID_GRANT_HANDLE;
298 }
300 if (cnt) {
301 ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
302 unmap, cnt);
303 BUG_ON(ret);
304 }
306 if (!xen_feature(XENFEAT_auto_translated_physmap))
307 zap_page_range(ring->vma,
308 MMAP_VADDR(ring->user_vstart, usr_idx, 0),
309 request->nr_pages << PAGE_SHIFT, NULL);
310 }
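/* Note: blktap_device_fast_flush() above tears down a request's mappings: it
 * batches all pending kernel and user grant unmaps into one
 * GNTTABOP_unmap_grant_ref call and zaps the request's range of the ring VMA
 * (before the loop or after the hypercall, depending on
 * XENFEAT_auto_translated_physmap). */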
312 /*
313 * tap->tap_sem held on entry
314 */
315 static void
316 blktap_unmap(struct blktap *tap, struct blktap_request *request)
317 {
318 int i, usr_idx;
319 unsigned long kvaddr;
321 usr_idx = request->usr_idx;
322 down_write(&tap->ring.vma->vm_mm->mmap_sem);
324 for (i = 0; i < request->nr_pages; i++) {
325 BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
326 "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
327 request_to_kaddr(request, i),
328 request->handles[i].kernel,
329 MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
330 request->handles[i].user);
332 if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
333 kvaddr = request_to_kaddr(request, i);
334 blktap_umap_uaddr(&init_mm, kvaddr);
335 flush_tlb_kernel_page(kvaddr);
336 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
337 INVALID_P2M_ENTRY);
338 }
339 }
341 blktap_device_fast_flush(tap, request);
342 up_write(&tap->ring.vma->vm_mm->mmap_sem);
343 }
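/* Note: for pages that blktap_map() installed directly (kernel handle ==
 * INVALID_GRANT_HANDLE), blktap_unmap() clears the kernel PTE and then flushes
 * just that one page via flush_tlb_kernel_page() -- the narrowed flush scope
 * described in the changeset message -- before handing the grant-mapped pages
 * to blktap_device_fast_flush(). */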
345 /*
346 * called if the tapdisk process dies unexpectedly.
347 * fail and release any pending requests and disable queue.
348 */
349 void
350 blktap_device_fail_pending_requests(struct blktap *tap)
351 {
352 int usr_idx;
353 struct request *req;
354 struct blktap_device *dev;
355 struct blktap_request *request;
357 if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
358 return;
360 down_write(&tap->tap_sem);
362 dev = &tap->device;
363 for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
364 request = tap->pending_requests[usr_idx];
365 if (!request || request->status != BLKTAP_REQUEST_PENDING)
366 continue;
368 BTERR("%u:%u: failing pending %s of %d pages\n",
369 blktap_device_major, tap->minor,
370 (request->operation == BLKIF_OP_READ ?
371 "read" : "write"), request->nr_pages);
373 blktap_unmap(tap, request);
374 req = (struct request *)(unsigned long)request->id;
375 blktap_device_end_dequeued_request(dev, req, 0);
376 blktap_request_free(tap, request);
377 }
379 up_write(&tap->tap_sem);
381 spin_lock_irq(&dev->lock);
383 /* fail any future requests */
384 dev->gd->queue->queuedata = NULL;
385 blk_start_queue(dev->gd->queue);
387 spin_unlock_irq(&dev->lock);
388 }
390 /*
391 * tap->tap_sem held on entry
392 */
393 void
394 blktap_device_finish_request(struct blktap *tap,
395 blkif_response_t *res,
396 struct blktap_request *request)
397 {
398 int uptodate;
399 struct request *req;
400 struct blktap_device *dev;
402 dev = &tap->device;
404 blktap_unmap(tap, request);
406 req = (struct request *)(unsigned long)request->id;
407 uptodate = (res->status == BLKIF_RSP_OKAY);
409 BTDBG("req %p res status %d operation %d/%d id %lld\n", req,
410 res->status, res->operation, request->operation, res->id);
412 switch (request->operation) {
413 case BLKIF_OP_READ:
414 case BLKIF_OP_WRITE:
415 if (unlikely(res->status != BLKIF_RSP_OKAY))
416 BTERR("Bad return from device data "
417 "request: %x\n", res->status);
418 blktap_device_end_dequeued_request(dev, req, uptodate);
419 break;
420 default:
421 BUG();
422 }
424 blktap_request_free(tap, request);
425 }
427 static int
428 blktap_prep_foreign(struct blktap *tap,
429 struct blktap_request *request,
430 blkif_request_t *blkif_req,
431 unsigned int seg, struct page *page,
432 struct blktap_grant_table *table)
433 {
434 uint64_t ptep;
435 uint32_t flags;
436 struct page *tap_page;
437 struct blktap_ring *ring;
438 struct blkback_pagemap map;
439 unsigned long uvaddr, kvaddr;
441 ring = &tap->ring;
442 map = blkback_pagemap_read(page);
443 blkif_req->seg[seg].gref = map.gref;
445 uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
446 kvaddr = request_to_kaddr(request, seg);
447 flags = GNTMAP_host_map |
448 (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
450 gnttab_set_map_op(&table->grants[table->cnt],
451 kvaddr, flags, map.gref, map.domid);
452 table->cnt++;
454 /* enable chained tap devices */
455 tap_page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
456 set_page_private(tap_page, page_private(page));
457 SetPageBlkback(tap_page);
459 if (xen_feature(XENFEAT_auto_translated_physmap))
460 return 0;
462 if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
463 BTERR("couldn't get a pte addr!\n");
464 return -1;
465 }
467 flags |= GNTMAP_application_map | GNTMAP_contains_pte;
468 gnttab_set_map_op(&table->grants[table->cnt],
469 ptep, flags, map.gref, map.domid);
470 table->cnt++;
472 return 0;
473 }
475 static int
476 blktap_map_foreign(struct blktap *tap,
477 struct blktap_request *request,
478 blkif_request_t *blkif_req,
479 struct blktap_grant_table *table)
480 {
481 struct page *page;
482 int i, grant, err, usr_idx;
483 struct blktap_ring *ring;
484 unsigned long uvaddr, kvaddr, foreign_mfn;
486 if (!table->cnt)
487 return 0;
489 err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
490 table->grants, table->cnt);
491 BUG_ON(err);
493 grant = 0;
494 usr_idx = request->usr_idx;
495 ring = &tap->ring;
497 for (i = 0; i < request->nr_pages; i++) {
498 if (!blkif_req->seg[i].gref)
499 continue;
501 uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
502 kvaddr = request_to_kaddr(request, i);
504 if (unlikely(table->grants[grant].status)) {
505 BTERR("invalid kernel buffer: could not remap it\n");
506 err |= 1;
507 table->grants[grant].handle = INVALID_GRANT_HANDLE;
508 }
510 request->handles[i].kernel = table->grants[grant].handle;
511 foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
512 grant++;
514 if (xen_feature(XENFEAT_auto_translated_physmap))
515 goto done;
517 if (unlikely(table->grants[grant].status)) {
518 BTERR("invalid user buffer: could not remap it\n");
519 err |= 1;
520 table->grants[grant].handle = INVALID_GRANT_HANDLE;
521 }
523 request->handles[i].user = table->grants[grant].handle;
524 grant++;
526 done:
527 if (err)
528 continue;
530 page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
532 if (!xen_feature(XENFEAT_auto_translated_physmap))
533 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
534 FOREIGN_FRAME(foreign_mfn));
535 else if (vm_insert_page(ring->vma, uvaddr, page))
536 err |= 1;
538 BTDBG("pending_req: %p, seg: %d, page: %p, "
539 "kvaddr: 0x%08lx, khandle: %u, uvaddr: 0x%08lx, "
540 "uhandle: %u\n", request, i, page,
541 kvaddr, request->handles[i].kernel,
542 uvaddr, request->handles[i].user);
543 }
545 return err;
546 }
548 static void
549 blktap_map(struct blktap *tap,
550 struct blktap_request *request,
551 unsigned int seg, struct page *page)
552 {
553 pte_t pte;
554 int usr_idx;
555 struct blktap_ring *ring;
556 unsigned long uvaddr, kvaddr;
558 ring = &tap->ring;
559 usr_idx = request->usr_idx;
560 uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
561 kvaddr = request_to_kaddr(request, seg);
563 pte = mk_pte(page, ring->vma->vm_page_prot);
564 blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
565 flush_tlb_page(ring->vma, uvaddr);
566 blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
567 flush_tlb_kernel_page(kvaddr);
569 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
570 request->handles[seg].kernel = INVALID_GRANT_HANDLE;
571 request->handles[seg].user = INVALID_GRANT_HANDLE;
573 BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
574 "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
575 uvaddr);
576 }
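/* Note: blktap_map() is the non-foreign path: it installs a writable user PTE
 * and a kernel PTE for the request page, then flushes only the two affected
 * entries (flush_tlb_page() for the user address, flush_tlb_kernel_page() for
 * the kernel one) and records both grant handles as INVALID_GRANT_HANDLE. */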
578 static int
579 blktap_device_process_request(struct blktap *tap,
580 struct blktap_request *request,
581 struct request *req)
582 {
583 struct bio *bio;
584 struct page *page;
585 struct bio_vec *bvec;
586 int idx, usr_idx, err;
587 struct blktap_ring *ring;
588 struct blktap_grant_table table;
589 unsigned int fsect, lsect, nr_sects;
590 unsigned long offset, uvaddr, kvaddr;
591 struct blkif_request blkif_req, *target;
593 err = -1;
594 memset(&table, 0, sizeof(table));
596 if (!blktap_active(tap))
597 goto out;
599 ring = &tap->ring;
600 usr_idx = request->usr_idx;
601 blkif_req.id = usr_idx;
602 blkif_req.sector_number = (blkif_sector_t)req->sector;
603 blkif_req.handle = 0;
604 blkif_req.operation = rq_data_dir(req) ?
605 BLKIF_OP_WRITE : BLKIF_OP_READ;
607 request->id = (unsigned long)req;
608 request->operation = blkif_req.operation;
609 request->status = BLKTAP_REQUEST_PENDING;
610 do_gettimeofday(&request->time);
612 nr_sects = 0;
613 request->nr_pages = 0;
614 blkif_req.nr_segments = 0;
615 rq_for_each_bio(bio, req) {
616 bio_for_each_segment(bvec, bio, idx) {
617 BUG_ON(blkif_req.nr_segments ==
618 BLKIF_MAX_SEGMENTS_PER_REQUEST);
620 fsect = bvec->bv_offset >> 9;
621 lsect = fsect + (bvec->bv_len >> 9) - 1;
622 nr_sects += bvec->bv_len >> 9;
624 blkif_req.seg[blkif_req.nr_segments] =
625 (struct blkif_request_segment) {
626 .gref = 0,
627 .first_sect = fsect,
628 .last_sect = lsect };
630 if (PageBlkback(bvec->bv_page)) {
631 /* foreign page -- use xen */
632 if (blktap_prep_foreign(tap,
633 request,
634 &blkif_req,
635 blkif_req.nr_segments,
636 bvec->bv_page,
637 &table))
638 goto out;
639 } else {
640 /* do it the old fashioned way */
641 blktap_map(tap,
642 request,
643 blkif_req.nr_segments,
644 bvec->bv_page);
645 }
647 uvaddr = MMAP_VADDR(ring->user_vstart,
648 usr_idx, blkif_req.nr_segments);
649 kvaddr = request_to_kaddr(request,
650 blkif_req.nr_segments);
651 offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
652 page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
653 ring->foreign_map.map[offset] = page;
654 SetPageReserved(page);
656 BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
657 uvaddr, page, __pa(kvaddr) >> PAGE_SHIFT);
658 BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
659 "page: %p, kvaddr: 0x%08lx, uvaddr: 0x%08lx\n",
660 offset, request, blkif_req.nr_segments,
661 page, kvaddr, uvaddr);
663 blkif_req.nr_segments++;
664 request->nr_pages++;
665 }
666 }
668 if (blktap_map_foreign(tap, request, &blkif_req, &table))
669 goto out;
671 /* Finally, write the request message to the user ring. */
672 target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
673 memcpy(target, &blkif_req, sizeof(blkif_req));
674 target->id = request->usr_idx;
675 wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
676 ring->ring.req_prod_pvt++;
678 if (rq_data_dir(req)) {
679 tap->stats.st_wr_sect += nr_sects;
680 tap->stats.st_wr_req++;
681 } else {
682 tap->stats.st_rd_sect += nr_sects;
683 tap->stats.st_rd_req++;
684 }
686 err = 0;
688 out:
689 if (err)
690 blktap_device_fast_flush(tap, request);
691 return err;
692 }
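/* Note: blktap_device_process_request() translates one block-layer request into
 * a blkif_request: each bio segment is either grant-mapped (foreign pages, via
 * blktap_prep_foreign()/blktap_map_foreign()) or mapped directly with
 * blktap_map(), and the finished descriptor is then written to the user ring. */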
694 #ifdef ENABLE_PASSTHROUGH
695 #define rq_for_each_bio_safe(_bio, _tmp, _req) \
696 if ((_req)->bio) \
697 for (_bio = (_req)->bio; \
698 _bio && ((_tmp = _bio->bi_next) || 1); \
699 _bio = _tmp)
701 static void
702 blktap_device_forward_request(struct blktap *tap, struct request *req)
703 {
703 {
704 struct bio *bio, *tmp;
705 struct blktap_device *dev;
707 dev = &tap->device;
709 rq_for_each_bio_safe(bio, tmp, req) {
710 bio->bi_bdev = dev->bdev;
711 submit_bio(bio->bi_rw, bio);
712 }
713 }
715 static void
716 blktap_device_close_bdev(struct blktap *tap)
717 {
717 {
718 struct blktap_device *dev;
720 dev = &tap->device;
722 if (dev->bdev)
723 blkdev_put(dev->bdev);
725 dev->bdev = NULL;
726 clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
727 }
729 static int
730 blktap_device_open_bdev(struct blktap *tap, u32 pdev)
731 {
731 {
732 struct block_device *bdev;
733 struct blktap_device *dev;
735 dev = &tap->device;
737 bdev = open_by_devnum(pdev, FMODE_WRITE);
738 if (IS_ERR(bdev)) {
739 BTERR("opening device %x:%x failed: %ld\n",
740 MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev));
741 return PTR_ERR(bdev);
742 }
744 if (!bdev->bd_disk) {
745 BTERR("device %x:%x doesn't exist\n",
746 MAJOR(pdev), MINOR(pdev));
747 blkdev_put(dev->bdev);
748 return -ENOENT;
749 }
751 dev->bdev = bdev;
752 set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
754 /* TODO: readjust queue parameters */
756 BTINFO("set device %d to passthrough on %x:%x\n",
757 tap->minor, MAJOR(pdev), MINOR(pdev));
759 return 0;
760 }
762 int
763 blktap_device_enable_passthrough(struct blktap *tap,
764 unsigned major, unsigned minor)
765 {
766 u32 pdev;
767 struct blktap_device *dev;
769 dev = &tap->device;
770 pdev = MKDEV(major, minor);
772 if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
773 return -EINVAL;
775 if (dev->bdev) {
776 if (pdev)
777 return -EINVAL;
778 blktap_device_close_bdev(tap);
779 return 0;
780 }
782 return blktap_device_open_bdev(tap, pdev);
783 }
784 #endif
786 /*
787 * dev->lock held on entry
788 */
789 static void
790 blktap_device_run_queue(struct blktap *tap)
791 {
791 {
792 int queued, err;
793 request_queue_t *rq;
794 struct request *req;
795 struct blktap_ring *ring;
796 struct blktap_device *dev;
797 struct blktap_request *request;
799 queued = 0;
800 ring = &tap->ring;
801 dev = &tap->device;
802 rq = dev->gd->queue;
804 BTDBG("running queue for %d\n", tap->minor);
806 while ((req = elv_next_request(rq)) != NULL) {
807 if (!blk_fs_request(req)) {
808 end_request(req, 0);
809 continue;
810 }
812 if (blk_barrier_rq(req)) {
813 end_request(req, 0);
814 continue;
815 }
817 #ifdef ENABLE_PASSTHROUGH
818 if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
819 blkdev_dequeue_request(req);
820 blktap_device_forward_request(tap, req);
821 continue;
822 }
823 #endif
825 if (RING_FULL(&ring->ring)) {
826 wait:
827 /* Avoid pointless unplugs. */
828 blk_stop_queue(rq);
829 blktap_defer(tap);
830 break;
831 }
833 request = blktap_request_allocate(tap);
834 if (!request) {
835 tap->stats.st_oo_req++;
836 goto wait;
837 }
839 BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) "
840 "buffer:%p [%s], pending: %p\n", req, tap->minor,
841 req->cmd, req->sector, req->current_nr_sectors,
842 req->nr_sectors, req->buffer,
843 rq_data_dir(req) ? "write" : "read", request);
845 blkdev_dequeue_request(req);
847 spin_unlock_irq(&dev->lock);
848 down_read(&tap->tap_sem);
850 err = blktap_device_process_request(tap, request, req);
851 if (!err)
852 queued++;
853 else {
854 blktap_device_end_dequeued_request(dev, req, 0);
855 blktap_request_free(tap, request);
856 }
858 up_read(&tap->tap_sem);
859 spin_lock_irq(&dev->lock);
860 }
862 if (queued)
863 blktap_ring_kick_user(tap);
864 }
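/* Note: blktap_device_run_queue() drains the block-layer queue with dev->lock
 * held, but drops that lock (taking tap->tap_sem for reading instead) around
 * blktap_device_process_request(); when the ring is full or no request struct
 * is available, the queue is stopped and the tap is deferred via blktap_defer(). */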
866 /*
867 * dev->lock held on entry
868 */
869 static void
870 blktap_device_do_request(request_queue_t *rq)
871 {
871 {
872 struct request *req;
873 struct blktap *tap;
874 struct blktap_device *dev;
876 dev = rq->queuedata;
877 if (!dev)
878 goto fail;
880 tap = dev_to_blktap(dev);
881 if (!blktap_active(tap))
882 goto fail;
884 if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
885 test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
886 blktap_defer(tap);
887 return;
888 }
890 blktap_device_run_queue(tap);
891 return;
893 fail:
894 while ((req = elv_next_request(rq))) {
895 BTERR("device closed: failing secs %llu - %llu\n",
896 req->sector, req->sector + req->nr_sectors);
897 end_request(req, 0);
898 }
899 }
901 void
902 blktap_device_restart(struct blktap *tap)
903 {
903 {
904 struct blktap_device *dev;
906 dev = &tap->device;
907 if (!dev->gd || !dev->gd->queue)
908 return;
910 if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
911 blktap_defer(tap);
912 return;
913 }
915 if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
916 test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
917 blktap_defer(tap);
918 return;
919 }
921 spin_lock_irq(&dev->lock);
923 /* Re-enable calldowns. */
924 if (blk_queue_stopped(dev->gd->queue))
925 blk_start_queue(dev->gd->queue);
927 /* Kick things off immediately. */
928 blktap_device_do_request(dev->gd->queue);
930 spin_unlock_irq(&dev->lock);
931 }
933 static void
934 blktap_device_configure(struct blktap *tap)
935 {
935 {
936 struct request_queue *rq;
937 struct blktap_device *dev = &tap->device;
939 if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd)
940 return;
942 dev = &tap->device;
943 rq = dev->gd->queue;
945 spin_lock_irq(&dev->lock);
947 set_capacity(dev->gd, tap->params.capacity);
949 /* Hard sector size and max sectors impersonate the equiv. hardware. */
950 blk_queue_hardsect_size(rq, tap->params.sector_size);
951 blk_queue_max_sectors(rq, 512);
953 /* Each segment in a request is up to an aligned page in size. */
954 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
955 blk_queue_max_segment_size(rq, PAGE_SIZE);
957 /* Ensure a merged request will fit in a single I/O ring slot. */
958 blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
959 blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
961 /* Make sure buffer addresses are sector-aligned. */
962 blk_queue_dma_alignment(rq, 511);
964 spin_unlock_irq(&dev->lock);
965 }
967 int
968 blktap_device_resume(struct blktap *tap)
969 {
969 {
970 int err;
972 if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
973 return -ENODEV;
975 if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
976 return 0;
978 err = blktap_ring_resume(tap);
979 if (err)
980 return err;
982 /* device size may have changed */
983 blktap_device_configure(tap);
985 BTDBG("restarting device\n");
986 blktap_device_restart(tap);
988 return 0;
989 }
991 int
992 blktap_device_pause(struct blktap *tap)
993 {
993 {
994 unsigned long flags;
995 struct blktap_device *dev = &tap->device;
997 if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap))
998 return -ENODEV;
1000 if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
1001 return 0;
1003 spin_lock_irqsave(&dev->lock, flags);
1005 blk_stop_queue(dev->gd->queue);
1006 set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
1008 spin_unlock_irqrestore(&dev->lock, flags);
1010 return blktap_ring_pause(tap);
1011 }
1013 int
1014 blktap_device_destroy(struct blktap *tap)
1015 {
1016 struct blktap_device *dev = &tap->device;
1018 if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
1019 return 0;
1021 BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
1023 if (dev->users)
1024 return -EBUSY;
1026 spin_lock_irq(&dev->lock);
1027 /* No more blktap_device_do_request(). */
1028 blk_stop_queue(dev->gd->queue);
1029 clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
1030 spin_unlock_irq(&dev->lock);
1032 #ifdef ENABLE_PASSTHROUGH
1033 if (dev->bdev)
1034 blktap_device_close_bdev(tap);
1035 #endif
1037 del_gendisk(dev->gd);
1038 put_disk(dev->gd);
1039 blk_cleanup_queue(dev->gd->queue);
1041 dev->gd = NULL;
1043 wake_up(&tap->wq);
1045 return 0;
1046 }
1048 int
1049 blktap_device_create(struct blktap *tap)
1050 {
1051 int minor, err;
1052 struct gendisk *gd;
1053 struct request_queue *rq;
1054 struct blktap_device *dev;
1056 gd = NULL;
1057 rq = NULL;
1058 dev = &tap->device;
1059 minor = tap->minor;
1061 if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
1062 return -EEXIST;
1064 if (blktap_validate_params(tap, &tap->params))
1065 return -EINVAL;
1067 BTINFO("minor %d sectors %Lu sector-size %lu\n",
1068 minor, tap->params.capacity, tap->params.sector_size);
1070 err = -ENODEV;
1072 gd = alloc_disk(1);
1073 if (!gd)
1074 goto error;
1076 if (minor < 26)
1077 sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
1078 else
1079 sprintf(gd->disk_name, "tapdev%c%c",
1080 'a' + ((minor / 26) - 1), 'a' + (minor % 26));
1082 gd->major = blktap_device_major;
1083 gd->first_minor = minor;
1084 gd->fops = &blktap_device_file_operations;
1085 gd->private_data = dev;
1087 spin_lock_init(&dev->lock);
1088 rq = blk_init_queue(blktap_device_do_request, &dev->lock);
1089 if (!rq)
1090 goto error;
1092 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
1093 elevator_init(rq, "noop");
1094 #else
1095 elevator_init(rq, &elevator_noop);
1096 #endif
1098 gd->queue = rq;
1099 rq->queuedata = dev;
1100 dev->gd = gd;
1102 set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
1103 blktap_device_configure(tap);
1105 add_disk(gd);
1107 err = 0;
1108 goto out;
1110 error:
1111 if (gd)
1112 del_gendisk(gd);
1113 if (rq)
1114 blk_cleanup_queue(rq);
1116 out:
1117 BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
1118 return err;
1119 }
1121 int
1122 blktap_device_init(int *maj)
1123 {
1124 int major;
1126 /* Dynamically allocate a major for this device */
1127 major = register_blkdev(0, "tapdev");
1128 if (major < 0) {
1129 BTERR("Couldn't register blktap device\n");
1130 return -ENOMEM;
1131 }
1133 blktap_device_major = *maj = major;
1134 BTINFO("blktap device major %d\n", major);
1136 return 0;
1137 }
1139 void
1140 blktap_device_free(void)
1141 {
1142 if (blktap_device_major)
1143 if (unregister_blkdev(blktap_device_major, "tapdev"))
1144 BTERR("blktap device unregister failed\n");
1145 }