direct-io.hg

view linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c @ 12271:f56b7ade7068

[BLKTAP] ia64 support
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author kfraser@localhost.localdomain
date Tue Nov 07 11:17:39 2006 +0000 (2006-11-07)
parents 3cc7e419b949
children f0ba459065d3
line source
1 /******************************************************************************
2 * drivers/xen/blktap/blktap.c
3 *
4 * Back-end driver for user level virtual block devices. This portion of the
5 * driver exports a 'unified' block-device interface that can be accessed
6 * by any operating system that implements a compatible front end. Requests
7 * are remapped to a user-space memory region.
8 *
9 * Based on the blkback driver code.
10 *
11 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12 *
13 * Clean ups and fix ups:
14 * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License version 2
18 * as published by the Free Software Foundation; or, when distributed
19 * separately from the Linux kernel or incorporated into other
20 * software packages, subject to the following license:
21 *
22 * Permission is hereby granted, free of charge, to any person obtaining a copy
23 * of this source file (the "Software"), to deal in the Software without
24 * restriction, including without limitation the rights to use, copy, modify,
25 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
26 * and to permit persons to whom the Software is furnished to do so, subject to
27 * the following conditions:
28 *
29 * The above copyright notice and this permission notice shall be included in
30 * all copies or substantial portions of the Software.
31 *
32 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
37 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38 * IN THE SOFTWARE.
39 */
41 #include <linux/spinlock.h>
42 #include <linux/kthread.h>
43 #include <linux/list.h>
44 #include <asm/hypervisor.h>
45 #include "common.h"
46 #include <xen/balloon.h>
47 #include <linux/kernel.h>
48 #include <linux/fs.h>
49 #include <linux/mm.h>
50 #include <linux/errno.h>
51 #include <linux/major.h>
52 #include <linux/gfp.h>
53 #include <linux/poll.h>
54 #include <asm/tlbflush.h>
55 #include <linux/devfs_fs_kernel.h>
57 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
58 #define MAX_DEV_NAME 100 /*max length of a tapdisk ring device name, e.g. blktap0 */
61 struct class *xen_class;
62 EXPORT_SYMBOL_GPL(xen_class);
64 /*
65 * Setup the xen class. This should probably go in another file, but
66 * since blktap is the only user of it so far, it gets to keep it.
67 */
68 int setup_xen_class(void)
69 {
70 int ret;
72 if (xen_class)
73 return 0;
75 xen_class = class_create(THIS_MODULE, "xen");
76 if ((ret = IS_ERR(xen_class))) {
77 xen_class = NULL;
78 return ret;
79 }
81 return 0;
82 }
84 /*
85 * The maximum number of requests that can be outstanding at any time
86 * is determined by
87 *
88 * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
89 *
90 * where mmap_alloc < MAX_DYNAMIC_MEM.
91 *
92 * TODO:
93 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
94 * sysfs.
95 */
96 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
97 #define MAX_DYNAMIC_MEM BLK_RING_SIZE
98 #define MAX_PENDING_REQS BLK_RING_SIZE
99 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
100 #define MMAP_VADDR(_start, _req,_seg) \
101 (_start + \
102 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
103 ((_seg) * PAGE_SIZE))
104 static int blkif_reqs = MAX_PENDING_REQS;
105 static int mmap_pages = MMAP_PAGES;
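For a rough sense of the sizes involved, here is a back-of-the-envelope sketch. It assumes the common configuration (4 KiB pages, the 32-slot ring that __RING_SIZE yields for blkif_sring_t, 11 segments per request, and the two req_increase() calls made by blkif_init() below); the constants are restated locally for illustration rather than taken from the Xen headers.

#include <stdio.h>

/* Assumed values, restated for illustration only. */
#define EX_PAGE_SIZE    4096
#define EX_RING_SLOTS   32            /* BLK_RING_SIZE / MAX_PENDING_REQS */
#define EX_SEGS_PER_REQ 11            /* BLKIF_MAX_SEGMENTS_PER_REQUEST   */
#define EX_MMAP_ALLOC   2             /* initial number of req_increase() batches */

int main(void)
{
	int pages_per_batch = EX_RING_SLOTS * EX_SEGS_PER_REQ;   /* MMAP_PAGES */
	int total_requests  = EX_MMAP_ALLOC * EX_RING_SLOTS;
	int total_pages     = EX_MMAP_ALLOC * pages_per_batch;

	printf("request slots per batch: %d\n", EX_RING_SLOTS);
	printf("mapped pages per batch : %d (%d KiB)\n",
	       pages_per_batch, pages_per_batch * EX_PAGE_SIZE / 1024);
	printf("with mmap_alloc = %d   : %d requests, %d mapped pages\n",
	       EX_MMAP_ALLOC, total_requests, total_pages);
	return 0;
}

With those assumptions each batch covers 352 pages (1408 KiB) and the initial two batches allow 64 requests in flight.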
107 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
108 * have a bunch of pages reserved for shared
109 * memory rings.
110 */
112 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
113 typedef struct domid_translate {
114 unsigned short domid;
115 unsigned short busid;
116 } domid_translate_t ;
118 /*Data struct associated with each of the tapdisk devices*/
119 typedef struct tap_blkif {
120 struct vm_area_struct *vma; /*Shared memory area */
121 unsigned long rings_vstart; /*Kernel memory mapping */
122 unsigned long user_vstart; /*User memory mapping */
123 unsigned long dev_inuse; /*One process opens device at a time. */
124 unsigned long dev_pending; /*In process of being opened */
125 unsigned long ring_ok; /*make this ring->state */
126 blkif_front_ring_t ufe_ring; /*Rings up to user space. */
127 wait_queue_head_t wait; /*for poll */
128 unsigned long mode; /*current switching mode */
129 int minor; /*Minor number for tapdisk device */
130 pid_t pid; /*tapdisk process id */
131 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
132 shutdown */
133 unsigned long *idx_map; /*Record the user ring id to kern
134 [req id, idx] tuple */
135 blkif_t *blkif; /*Associate blkif with tapdev */
136 struct domid_translate trans; /*Translation from domid to bus. */
137 } tap_blkif_t;
139 static struct tap_blkif *tapfds[MAX_TAP_DEV];
140 static int blktap_next_minor;
142 static int __init set_blkif_reqs(char *str)
143 {
144 get_option(&str, &blkif_reqs);
145 return 1;
146 }
147 __setup("blkif_reqs=", set_blkif_reqs);
149 /* Run-time switchable: /sys/module/blktap/parameters/ */
150 static unsigned int log_stats = 0;
151 static unsigned int debug_lvl = 0;
152 module_param(log_stats, int, 0644);
153 module_param(debug_lvl, int, 0644);
155 /*
156 * Each outstanding request that we've passed to the lower device layers has a
157 * 'pending_req' allocated to it. Each buffer_head that completes decrements
158 * the pendcnt towards zero. When it hits zero, the specified domain has a
159 * response queued for it, with the saved 'id' passed back.
160 */
161 typedef struct {
162 blkif_t *blkif;
163 unsigned long id;
164 unsigned short mem_idx;
165 int nr_pages;
166 atomic_t pendcnt;
167 unsigned short operation;
168 int status;
169 struct list_head free_list;
170 int inuse;
171 } pending_req_t;
173 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
174 static struct list_head pending_free;
175 static DEFINE_SPINLOCK(pending_free_lock);
176 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
177 static int alloc_pending_reqs;
179 typedef unsigned int PEND_RING_IDX;
181 static inline int MASK_PEND_IDX(int i) {
182 return (i & (MAX_PENDING_REQS-1));
183 }
185 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
186 return (req - pending_reqs[idx]);
187 }
189 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
191 #define BLKBACK_INVALID_HANDLE (~0)
193 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
194 static inline unsigned long idx_to_kaddr(
195 unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
196 {
197 unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
198 unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
199 return (unsigned long)pfn_to_kaddr(pfn);
200 }
202 static unsigned short mmap_alloc = 0;
203 static unsigned short mmap_lock = 0;
204 static unsigned short mmap_inuse = 0;
206 /******************************************************************
207 * GRANT HANDLES
208 */
210 /* When using grant tables to map a frame for device access then the
211 * handle returned must be used to unmap the frame. This is needed to
212 * drop the ref count on the frame.
213 */
214 struct grant_handle_pair
215 {
216 grant_handle_t kernel;
217 grant_handle_t user;
218 };
219 #define INVALID_GRANT_HANDLE 0xFFFF
221 static struct grant_handle_pair
222 pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
223 #define pending_handle(_id, _idx, _i) \
224 (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
225 + (_i)])
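To make the comment above concrete, here is a minimal sketch of the map/unmap pairing, modelled on how dispatch_rw_block_io() and fast_flush_area() below use these calls. The grant reference, domain id and kernel virtual address are placeholders, error handling is reduced to an early return, and the helper relies on the same headers this file already includes.

static void __attribute__((unused))
grant_handle_example(grant_ref_t gref, domid_t domid, unsigned long kvaddr)
{
	struct gnttab_map_grant_ref map;
	struct gnttab_unmap_grant_ref unmap;

	/* Map the foreign frame into our address space at kvaddr. */
	gnttab_set_map_op(&map, kvaddr, GNTMAP_host_map, gref, domid);
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1) ||
	    map.status != 0)
		return;

	/* ... access the page through kvaddr ... */

	/* Only the handle returned by the map op can drop the ref. */
	gnttab_set_unmap_op(&unmap, kvaddr, GNTMAP_host_map, map.handle);
	(void)HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1);
}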
228 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
230 #define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */
231 #define BLKTAP_DEV_DIR "/dev/xen"
233 static int blktap_major;
235 /* blktap IOCTLs: */
236 #define BLKTAP_IOCTL_KICK_FE 1
237 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
238 #define BLKTAP_IOCTL_SETMODE 3
239 #define BLKTAP_IOCTL_SENDPID 4
240 #define BLKTAP_IOCTL_NEWINTF 5
241 #define BLKTAP_IOCTL_MINOR 6
242 #define BLKTAP_IOCTL_MAJOR 7
243 #define BLKTAP_QUERY_ALLOC_REQS 8
244 #define BLKTAP_IOCTL_FREEINTF 9
245 #define BLKTAP_IOCTL_PRINT_IDXS 100
247 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
248 #define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
249 #define BLKTAP_MODE_INTERCEPT_FE 0x00000001
250 #define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
252 #define BLKTAP_MODE_INTERPOSE \
253 (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
256 static inline int BLKTAP_MODE_VALID(unsigned long arg)
257 {
258 return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
259 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
260 (arg == BLKTAP_MODE_INTERPOSE ));
261 }
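Taken together, these ioctls form a small control protocol, implemented by blktap_ioctl() further down. The following user-space sketch shows roughly how a tapdisk-like client might drive it; the device paths and the domid/busid packing mirror the BLKTAP_IOCTL_NEWINTF handler below, a little-endian layout is assumed, the domid/busid values are made up, and error handling is omitted.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define BLKTAP_IOCTL_SETMODE     3
#define BLKTAP_IOCTL_SENDPID     4
#define BLKTAP_IOCTL_NEWINTF     5
#define BLKTAP_MODE_INTERCEPT_FE 0x00000001

int main(void)
{
	unsigned short domid = 1, busid = 51712;        /* example values only */
	unsigned long arg = ((unsigned long)busid << 16) | domid;
	char path[64];
	int ctrl, minor, fd;

	ctrl = open("/dev/xen/blktap0", O_RDWR);        /* control device      */
	minor = ioctl(ctrl, BLKTAP_IOCTL_NEWINTF, arg); /* allocates a tapdev  */

	snprintf(path, sizeof(path), "/dev/xen/blktap%d", minor);
	fd = open(path, O_RDWR);                        /* per-VBD ring device */

	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
	/* ... mmap the ring area and start servicing requests ... */
	return 0;
}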
263 /* Requests passing through the tap to userspace are re-assigned an ID.
264 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
265 * ring ID.
266 */
268 static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
269 {
270 return ((fe_dom << 16) | MASK_PEND_IDX(idx));
271 }
273 extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
274 {
275 return (PEND_RING_IDX)(id & 0x0000ffff);
276 }
278 extern inline int ID_TO_MIDX(unsigned long id)
279 {
280 return (int)(id >> 16);
281 }
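A worked example of this packing; note that, despite the fe_dom parameter name, the callers below (dispatch_rw_block_io() and blktap_read_ufe_ring()) store the mmap batch index in the upper half. The helper is purely illustrative.

static void __attribute__((unused)) id_packing_example(void)
{
	/* batch (mmap_idx) 1, pending slot 6  ->  0x00010006 */
	unsigned long id = MAKE_ID(1, 6);

	BUG_ON(ID_TO_MIDX(id) != 1);
	BUG_ON(ID_TO_IDX(id) != 6);

	/* Ring indices beyond the slot count fold back into range first: */
	BUG_ON(MASK_PEND_IDX(MAX_PENDING_REQS + 6) != 6);
}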
283 #define INVALID_REQ 0xdead0000
285 /*TODO: Convert to a free list*/
286 static inline int GET_NEXT_REQ(unsigned long *idx_map)
287 {
288 int i;
289 for (i = 0; i < MAX_PENDING_REQS; i++)
290 if (idx_map[i] == INVALID_REQ)
291 return i;
293 return INVALID_REQ;
294 }
297 #define BLKTAP_INVALID_HANDLE(_g) \
298 (((_g->kernel) == INVALID_GRANT_HANDLE) && \
299 ((_g->user) == INVALID_GRANT_HANDLE))
301 #define BLKTAP_INVALIDATE_HANDLE(_g) do { \
302 (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
303 } while(0)
306 /******************************************************************
307 * BLKTAP VM OPS
308 */
310 static struct page *blktap_nopage(struct vm_area_struct *vma,
311 unsigned long address,
312 int *type)
313 {
314 /*
315 * if the page has not been mapped in by the driver then return
316 * NOPAGE_SIGBUS to the domain.
317 */
319 return NOPAGE_SIGBUS;
320 }
322 struct vm_operations_struct blktap_vm_ops = {
323 nopage: blktap_nopage,
324 };
326 /******************************************************************
327 * BLKTAP FILE OPS
328 */
330 /*Function Declarations*/
331 static tap_blkif_t *get_next_free_dev(void);
332 static int blktap_open(struct inode *inode, struct file *filp);
333 static int blktap_release(struct inode *inode, struct file *filp);
334 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
335 static int blktap_ioctl(struct inode *inode, struct file *filp,
336 unsigned int cmd, unsigned long arg);
337 static unsigned int blktap_poll(struct file *file, poll_table *wait);
339 static struct file_operations blktap_fops = {
340 .owner = THIS_MODULE,
341 .poll = blktap_poll,
342 .ioctl = blktap_ioctl,
343 .open = blktap_open,
344 .release = blktap_release,
345 .mmap = blktap_mmap,
346 };
349 static tap_blkif_t *get_next_free_dev(void)
350 {
351 tap_blkif_t *info;
352 int minor;
354 /*
355 * This is called only from the ioctl, which
356 * means we should always have interrupts enabled.
357 */
358 BUG_ON(irqs_disabled());
360 spin_lock_irq(&pending_free_lock);
362 /* tapfds[0] is always NULL */
364 for (minor = 1; minor < blktap_next_minor; minor++) {
365 info = tapfds[minor];
366 /* we could have failed a previous attempt. */
367 if (!info ||
368 ((info->dev_inuse == 0) &&
369 (info->dev_pending == 0)) ) {
370 info->dev_pending = 1;
371 goto found;
372 }
373 }
374 info = NULL;
375 minor = -1;
377 /*
378 * We didn't find a free device. If we can still allocate
379 * more, then we grab the next device minor that is
380 * available. This is done while we are still under
381 * the protection of the pending_free_lock.
382 */
383 if (blktap_next_minor < MAX_TAP_DEV)
384 minor = blktap_next_minor++;
385 found:
386 spin_unlock_irq(&pending_free_lock);
388 if (!info && minor > 0) {
389 info = kzalloc(sizeof(*info), GFP_KERNEL);
390 if (unlikely(!info)) {
391 /*
392 * If we failed here, try to put back
393 * the next minor number. But if one
394 * was just taken, then we just lose this
395 * minor. We can try to allocate this
396 * minor again later.
397 */
398 spin_lock_irq(&pending_free_lock);
399 if (blktap_next_minor == minor+1)
400 blktap_next_minor--;
401 spin_unlock_irq(&pending_free_lock);
402 goto out;
403 }
405 info->minor = minor;
406 /*
407 * Make sure that we have a minor before others can
408 * see us.
409 */
410 wmb();
411 tapfds[minor] = info;
413 class_device_create(xen_class, NULL,
414 MKDEV(blktap_major, minor), NULL,
415 "blktap%d", minor);
416 devfs_mk_cdev(MKDEV(blktap_major, minor),
417 S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor);
418 }
420 out:
421 return info;
422 }
424 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
425 {
426 tap_blkif_t *info;
427 int i;
429 for (i = 1; i < blktap_next_minor; i++) {
430 info = tapfds[i];
431 if ( info &&
432 (info->trans.domid == domid) &&
433 (info->trans.busid == xenbus_id) ) {
434 info->blkif = blkif;
435 info->status = RUNNING;
436 return i;
437 }
438 }
439 return -1;
440 }
442 void signal_tapdisk(int idx)
443 {
444 tap_blkif_t *info;
445 struct task_struct *ptask;
447 info = tapfds[idx];
448 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
449 return;
451 if (info->pid > 0) {
452 ptask = find_task_by_pid(info->pid);
453 if (ptask)
454 info->status = CLEANSHUTDOWN;
455 }
456 info->blkif = NULL;
458 return;
459 }
461 static int blktap_open(struct inode *inode, struct file *filp)
462 {
463 blkif_sring_t *sring;
464 int idx = iminor(inode) - BLKTAP_MINOR;
465 tap_blkif_t *info;
466 int i;
468 /* ctrl device, treat differently */
469 if (!idx)
470 return 0;
472 info = tapfds[idx];
474 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) {
475 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
476 idx);
477 return -ENODEV;
478 }
480 DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
482 /*Only one process can access device at a time*/
483 if (test_and_set_bit(0, &info->dev_inuse))
484 return -EBUSY;
486 info->dev_pending = 0;
488 /* Allocate the fe ring. */
489 sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
490 if (sring == NULL)
491 goto fail_nomem;
493 SetPageReserved(virt_to_page(sring));
495 SHARED_RING_INIT(sring);
496 FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
498 filp->private_data = info;
499 info->vma = NULL;
501 info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
502 GFP_KERNEL);
504 if (idx > 0) {
505 init_waitqueue_head(&info->wait);
506 for (i = 0; i < MAX_PENDING_REQS; i++)
507 info->idx_map[i] = INVALID_REQ;
508 }
510 DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
511 return 0;
513 fail_nomem:
514 return -ENOMEM;
515 }
517 static int blktap_release(struct inode *inode, struct file *filp)
518 {
519 tap_blkif_t *info = filp->private_data;
521 /* check for control device */
522 if (!info)
523 return 0;
525 info->dev_inuse = 0;
526 DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
528 /* Free the ring page. */
529 ClearPageReserved(virt_to_page(info->ufe_ring.sring));
530 free_page((unsigned long) info->ufe_ring.sring);
532 /* Clear any active mappings and free foreign map table */
533 if (info->vma) {
534 zap_page_range(
535 info->vma, info->vma->vm_start,
536 info->vma->vm_end - info->vma->vm_start, NULL);
537 info->vma = NULL;
538 }
540 if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
541 if (info->blkif->xenblkd != NULL) {
542 kthread_stop(info->blkif->xenblkd);
543 info->blkif->xenblkd = NULL;
544 }
545 info->status = CLEANSHUTDOWN;
546 }
547 return 0;
548 }
551 /* Note on mmap:
552 * We need to map pages to user space in a way that will allow the block
553 * subsystem set up direct IO to them. This couldn't be done before, because
554 * there isn't really a sane way to translate a user virtual address down to a
555 * physical address when the page belongs to another domain.
556 *
557 * My first approach was to map the page in to kernel memory, add an entry
558 * for it in the physical frame list (using alloc_lomem_region as in blkback)
559 * and then attempt to map that page up to user space. This is disallowed
560 * by xen though, which realizes that we don't really own the machine frame
561 * underlying the physical page.
562 *
563 * The new approach is to provide explicit support for this in xen linux.
564 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
565 * mapped from other vms. vma->vm_private_data is set up as a mapping
566 * from pages to actual page structs. There is a new clause in get_user_pages
567 * that does the right thing for this sort of mapping.
568 */
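From user space, the contract enforced by blktap_mmap() below is: map exactly RING_PAGES + MMAP_PAGES pages in a single mmap(), treat the first page as the shared blkif_sring_t, and address the data pages the same way MMAP_VADDR() does here. A minimal sketch follows; the constants are restated for illustration and a real client would take them from a shared header.

#include <stddef.h>
#include <sys/mman.h>

#define EX_PAGE_SIZE  4096
#define EX_RING_PAGES 1
#define EX_MMAP_PAGES (32 * 11)        /* ring slots * segments per request */

static void *map_tap_region(int fd, void **ring, char **data_area)
{
	size_t len = (EX_RING_PAGES + EX_MMAP_PAGES) * EX_PAGE_SIZE;
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (base == MAP_FAILED)
		return NULL;

	*ring = base;                    /* the blkif_sring_t page */
	*data_area = (char *)base + EX_RING_PAGES * EX_PAGE_SIZE;
	/*
	 * Segment j of request slot i then lives at
	 *   *data_area + (i * 11 + j) * EX_PAGE_SIZE
	 * mirroring MMAP_VADDR(user_vstart, i, j) on the kernel side.
	 */
	return base;
}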
569 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
570 {
571 int size;
572 struct page **map;
573 int i;
574 tap_blkif_t *info = filp->private_data;
576 if (info == NULL) {
577 WPRINTK("blktap: mmap, retrieving idx failed\n");
578 return -ENOMEM;
579 }
581 vma->vm_flags |= VM_RESERVED;
582 vma->vm_ops = &blktap_vm_ops;
584 size = vma->vm_end - vma->vm_start;
585 if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
586 WPRINTK("you _must_ map exactly %d pages!\n",
587 mmap_pages + RING_PAGES);
588 return -EAGAIN;
589 }
591 size >>= PAGE_SHIFT;
592 info->rings_vstart = vma->vm_start;
593 info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
595 /* Map the ring pages to the start of the region and reserve it. */
596 if (remap_pfn_range(vma, vma->vm_start,
597 __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
598 PAGE_SIZE, vma->vm_page_prot)) {
599 WPRINTK("Mapping user ring failed!\n");
600 goto fail;
601 }
603 /* Mark this VM as containing foreign pages, and set up mappings. */
604 map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
605 * sizeof(struct page_struct*),
606 GFP_KERNEL);
607 if (map == NULL) {
608 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
609 goto fail;
610 }
612 for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
613 map[i] = NULL;
615 vma->vm_private_data = map;
616 vma->vm_flags |= VM_FOREIGN;
618 info->vma = vma;
619 info->ring_ok = 1;
620 return 0;
621 fail:
622 /* Clear any active mappings. */
623 zap_page_range(vma, vma->vm_start,
624 vma->vm_end - vma->vm_start, NULL);
626 return -ENOMEM;
627 }
630 static int blktap_ioctl(struct inode *inode, struct file *filp,
631 unsigned int cmd, unsigned long arg)
632 {
633 tap_blkif_t *info = filp->private_data;
635 switch(cmd) {
636 case BLKTAP_IOCTL_KICK_FE:
637 {
638 /* There are fe messages to process. */
639 return blktap_read_ufe_ring(info);
640 }
641 case BLKTAP_IOCTL_SETMODE:
642 {
643 if (info) {
644 if (BLKTAP_MODE_VALID(arg)) {
645 info->mode = arg;
646 /* XXX: may need to flush rings here. */
647 DPRINTK("blktap: set mode to %lx\n",
648 arg);
649 return 0;
650 }
651 }
652 return 0;
653 }
654 case BLKTAP_IOCTL_PRINT_IDXS:
655 {
656 if (info) {
657 printk("User Rings: \n-----------\n");
658 printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
659 "| req_prod: %2d, rsp_prod: %2d\n",
660 info->ufe_ring.rsp_cons,
661 info->ufe_ring.req_prod_pvt,
662 info->ufe_ring.sring->req_prod,
663 info->ufe_ring.sring->rsp_prod);
664 }
665 return 0;
666 }
667 case BLKTAP_IOCTL_SENDPID:
668 {
669 if (info) {
670 info->pid = (pid_t)arg;
671 DPRINTK("blktap: pid received %d\n",
672 info->pid);
673 }
674 return 0;
675 }
676 case BLKTAP_IOCTL_NEWINTF:
677 {
678 uint64_t val = (uint64_t)arg;
679 domid_translate_t *tr = (domid_translate_t *)&val;
681 DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
682 tr->domid, tr->busid);
683 info = get_next_free_dev();
684 if (!info) {
685 WPRINTK("Error initialising /dev/xen/blktap - "
686 "No more devices\n");
687 return -1;
688 }
689 info->trans.domid = tr->domid;
690 info->trans.busid = tr->busid;
691 return info->minor;
692 }
693 case BLKTAP_IOCTL_FREEINTF:
694 {
695 unsigned long dev = arg;
696 unsigned long flags;
698 info = tapfds[dev];
700 if ((dev > MAX_TAP_DEV) || !info)
701 return 0; /* should this be an error? */
703 spin_lock_irqsave(&pending_free_lock, flags);
704 if (info->dev_pending)
705 info->dev_pending = 0;
706 spin_unlock_irqrestore(&pending_free_lock, flags);
708 return 0;
709 }
710 case BLKTAP_IOCTL_MINOR:
711 {
712 unsigned long dev = arg;
714 info = tapfds[dev];
716 if ((dev > MAX_TAP_DEV) || !info)
717 return -EINVAL;
719 return info->minor;
720 }
721 case BLKTAP_IOCTL_MAJOR:
722 return blktap_major;
724 case BLKTAP_QUERY_ALLOC_REQS:
725 {
726 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
727 alloc_pending_reqs, blkif_reqs);
728 return (alloc_pending_reqs/blkif_reqs) * 100;
729 }
730 }
731 return -ENOIOCTLCMD;
732 }
734 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
735 {
736 tap_blkif_t *info = filp->private_data;
738 /* do not work on the control device */
739 if (!info)
740 return 0;
742 poll_wait(filp, &info->wait, wait);
743 if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
744 RING_PUSH_REQUESTS(&info->ufe_ring);
745 return POLLIN | POLLRDNORM;
746 }
747 return 0;
748 }
750 void blktap_kick_user(int idx)
751 {
752 tap_blkif_t *info;
754 info = tapfds[idx];
756 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
757 return;
759 wake_up_interruptible(&info->wait);
761 return;
762 }
764 static int do_block_io_op(blkif_t *blkif);
765 static void dispatch_rw_block_io(blkif_t *blkif,
766 blkif_request_t *req,
767 pending_req_t *pending_req);
768 static void make_response(blkif_t *blkif, unsigned long id,
769 unsigned short op, int st);
771 /******************************************************************
772 * misc small helpers
773 */
774 static int req_increase(void)
775 {
776 int i, j;
778 if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
779 return -EINVAL;
781 pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
782 * blkif_reqs, GFP_KERNEL);
783 foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
785 if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
786 goto out_of_memory;
788 DPRINTK("%s: reqs=%d, pages=%d\n",
789 __FUNCTION__, blkif_reqs, mmap_pages);
791 for (i = 0; i < MAX_PENDING_REQS; i++) {
792 list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
793 &pending_free);
794 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
795 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
796 BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
797 i, j));
798 }
800 mmap_alloc++;
801 DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
802 return 0;
804 out_of_memory:
805 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
806 kfree(pending_reqs[mmap_alloc]);
807 WPRINTK("%s: out of memory\n", __FUNCTION__);
808 return -ENOMEM;
809 }
811 static void mmap_req_del(int mmap)
812 {
813 BUG_ON(!spin_is_locked(&pending_free_lock));
815 kfree(pending_reqs[mmap]);
816 pending_reqs[mmap] = NULL;
818 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
819 foreign_pages[mmap] = NULL;
821 mmap_lock = 0;
822 DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
823 mmap_alloc--;
824 }
826 static pending_req_t* alloc_req(void)
827 {
828 pending_req_t *req = NULL;
829 unsigned long flags;
831 spin_lock_irqsave(&pending_free_lock, flags);
833 if (!list_empty(&pending_free)) {
834 req = list_entry(pending_free.next, pending_req_t, free_list);
835 list_del(&req->free_list);
836 }
838 if (req) {
839 req->inuse = 1;
840 alloc_pending_reqs++;
841 }
842 spin_unlock_irqrestore(&pending_free_lock, flags);
844 return req;
845 }
847 static void free_req(pending_req_t *req)
848 {
849 unsigned long flags;
850 int was_empty;
852 spin_lock_irqsave(&pending_free_lock, flags);
854 alloc_pending_reqs--;
855 req->inuse = 0;
856 if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
857 mmap_inuse--;
858 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
859 spin_unlock_irqrestore(&pending_free_lock, flags);
860 return;
861 }
862 was_empty = list_empty(&pending_free);
863 list_add(&req->free_list, &pending_free);
865 spin_unlock_irqrestore(&pending_free_lock, flags);
867 if (was_empty)
868 wake_up(&pending_free_wq);
869 }
871 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
872 int tapidx)
873 {
874 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
875 unsigned int i, invcount = 0;
876 struct grant_handle_pair *khandle;
877 uint64_t ptep;
878 int ret, mmap_idx;
879 unsigned long kvaddr, uvaddr;
880 tap_blkif_t *info;
883 info = tapfds[tapidx];
885 if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
886 WPRINTK("fast_flush: Couldn't get info!\n");
887 return;
888 }
890 if (info->vma != NULL &&
891 xen_feature(XENFEAT_auto_translated_physmap)) {
892 down_write(&info->vma->vm_mm->mmap_sem);
893 zap_page_range(info->vma,
894 MMAP_VADDR(info->user_vstart, u_idx, 0),
895 req->nr_pages << PAGE_SHIFT, NULL);
896 up_write(&info->vma->vm_mm->mmap_sem);
897 }
899 mmap_idx = req->mem_idx;
901 for (i = 0; i < req->nr_pages; i++) {
902 kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
903 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
905 khandle = &pending_handle(mmap_idx, k_idx, i);
907 if (khandle->kernel != INVALID_GRANT_HANDLE) {
908 gnttab_set_unmap_op(&unmap[invcount],
909 idx_to_kaddr(mmap_idx, k_idx, i),
910 GNTMAP_host_map, khandle->kernel);
911 invcount++;
912 }
914 if (khandle->user != INVALID_GRANT_HANDLE) {
915 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
916 if (create_lookup_pte_addr(
917 info->vma->vm_mm,
918 MMAP_VADDR(info->user_vstart, u_idx, i),
919 &ptep) !=0) {
920 WPRINTK("Couldn't get a pte addr!\n");
921 return;
922 }
924 gnttab_set_unmap_op(&unmap[invcount], ptep,
925 GNTMAP_host_map
926 | GNTMAP_application_map
927 | GNTMAP_contains_pte,
928 khandle->user);
929 invcount++;
930 }
932 BLKTAP_INVALIDATE_HANDLE(khandle);
933 }
934 ret = HYPERVISOR_grant_table_op(
935 GNTTABOP_unmap_grant_ref, unmap, invcount);
936 BUG_ON(ret);
938 if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap))
939 zap_page_range(info->vma,
940 MMAP_VADDR(info->user_vstart, u_idx, 0),
941 req->nr_pages << PAGE_SHIFT, NULL);
942 }
944 /******************************************************************
945 * SCHEDULER FUNCTIONS
946 */
948 static void print_stats(blkif_t *blkif)
949 {
950 printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
951 current->comm, blkif->st_oo_req,
952 blkif->st_rd_req, blkif->st_wr_req);
953 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
954 blkif->st_rd_req = 0;
955 blkif->st_wr_req = 0;
956 blkif->st_oo_req = 0;
957 }
959 int tap_blkif_schedule(void *arg)
960 {
961 blkif_t *blkif = arg;
963 blkif_get(blkif);
965 if (debug_lvl)
966 printk(KERN_DEBUG "%s: started\n", current->comm);
968 while (!kthread_should_stop()) {
969 wait_event_interruptible(
970 blkif->wq,
971 blkif->waiting_reqs || kthread_should_stop());
972 wait_event_interruptible(
973 pending_free_wq,
974 !list_empty(&pending_free) || kthread_should_stop());
976 blkif->waiting_reqs = 0;
977 smp_mb(); /* clear flag *before* checking for work */
979 if (do_block_io_op(blkif))
980 blkif->waiting_reqs = 1;
982 if (log_stats && time_after(jiffies, blkif->st_print))
983 print_stats(blkif);
984 }
986 if (log_stats)
987 print_stats(blkif);
988 if (debug_lvl)
989 printk(KERN_DEBUG "%s: exiting\n", current->comm);
991 blkif->xenblkd = NULL;
992 blkif_put(blkif);
994 return 0;
995 }
997 /******************************************************************
998 * COMPLETION CALLBACK -- Called by user level ioctl()
999 */
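For context, the user-space half of this path looks roughly like the sketch below: tapdisk, acting as the backend on the ring it mapped via blktap_mmap(), queues a blkif_response_t, pushes it, and then issues BLKTAP_IOCTL_KICK_FE so that blktap_read_ufe_ring() consumes it. This assumes the standard <xen/interface/io/ring.h> and blkif definitions plus <sys/ioctl.h> are available to the client, hard-codes a read completion purely for illustration, and elides error handling.

static void example_complete_request(blkif_back_ring_t *fe_ring, int tap_fd,
				     unsigned long usr_id, int status)
{
	blkif_response_t *rsp;

	/* Slot the response into the ring this process mmap()ed. */
	rsp = RING_GET_RESPONSE(fe_ring, fe_ring->rsp_prod_pvt);
	rsp->id        = usr_id;         /* id assigned by dispatch_rw_block_io() */
	rsp->operation = BLKIF_OP_READ;  /* echo the original operation           */
	rsp->status    = status;         /* e.g. BLKIF_RSP_OKAY                   */
	fe_ring->rsp_prod_pvt++;
	RING_PUSH_RESPONSES(fe_ring);

	/* Have the kernel run blktap_read_ufe_ring() to collect it. */
	ioctl(tap_fd, BLKTAP_IOCTL_KICK_FE, 0);
}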
1001 static int blktap_read_ufe_ring(tap_blkif_t *info)
1002 {
1003 /* This is called to read responses from the UFE ring. */
1004 RING_IDX i, j, rp;
1005 blkif_response_t *resp;
1006 blkif_t *blkif=NULL;
1007 int pending_idx, usr_idx, mmap_idx;
1008 pending_req_t *pending_req;
1010 if (!info)
1011 return 0;
1013 /* We currently only forward packets in INTERCEPT_FE mode. */
1014 if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1015 return 0;
1017 /* for each outstanding message on the UFEring */
1018 rp = info->ufe_ring.sring->rsp_prod;
1019 rmb();
1021 for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1022 blkif_response_t res;
1023 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1024 memcpy(&res, resp, sizeof(res));
1025 mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
1026 ++info->ufe_ring.rsp_cons;
1028 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1029 usr_idx = (int)res.id;
1030 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
1031 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
1033 if ( (mmap_idx >= mmap_alloc) ||
1034 (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
1035 WPRINTK("Incorrect req map"
1036 "[%d], internal map [%d,%d (%d)]\n",
1037 usr_idx, mmap_idx,
1038 ID_TO_IDX(info->idx_map[usr_idx]),
1039 MASK_PEND_IDX(
1040 ID_TO_IDX(info->idx_map[usr_idx])));
1042 pending_req = &pending_reqs[mmap_idx][pending_idx];
1043 blkif = pending_req->blkif;
1045 for (j = 0; j < pending_req->nr_pages; j++) {
1047 unsigned long kvaddr, uvaddr;
1048 struct page **map = info->vma->vm_private_data;
1049 struct page *pg;
1050 int offset;
1052 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
1053 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
1055 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1056 ClearPageReserved(pg);
1057 offset = (uvaddr - info->vma->vm_start)
1058 >> PAGE_SHIFT;
1059 map[offset] = NULL;
1060 }
1061 fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
1062 make_response(blkif, pending_req->id, res.operation,
1063 res.status);
1064 info->idx_map[usr_idx] = INVALID_REQ;
1065 blkif_put(pending_req->blkif);
1066 free_req(pending_req);
1067 }
1069 return 0;
1070 }
1073 /******************************************************************************
1074 * NOTIFICATION FROM GUEST OS.
1075 */
1077 static void blkif_notify_work(blkif_t *blkif)
1078 {
1079 blkif->waiting_reqs = 1;
1080 wake_up(&blkif->wq);
1081 }
1083 irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
1084 {
1085 blkif_notify_work(dev_id);
1086 return IRQ_HANDLED;
1087 }
1091 /******************************************************************
1092 * DOWNWARD CALLS -- These interface with the block-device layer proper.
1093 */
1094 static int print_dbug = 1;
1095 static int do_block_io_op(blkif_t *blkif)
1096 {
1097 blkif_back_ring_t *blk_ring = &blkif->blk_ring;
1098 blkif_request_t req;
1099 pending_req_t *pending_req;
1100 RING_IDX rc, rp;
1101 int more_to_do = 0;
1102 tap_blkif_t *info;
1104 rc = blk_ring->req_cons;
1105 rp = blk_ring->sring->req_prod;
1106 rmb(); /* Ensure we see queued requests up to 'rp'. */
1108 /*Check blkif has corresponding UE ring*/
1109 if (blkif->dev_num < 0) {
1110 /*oops*/
1111 if (print_dbug) {
1112 WPRINTK("Corresponding UE "
1113 "ring does not exist!\n");
1114 print_dbug = 0; /*We only print this message once*/
1115 }
1116 return 0;
1117 }
1119 info = tapfds[blkif->dev_num];
1121 if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
1122 if (print_dbug) {
1123 WPRINTK("Can't get UE info!\n");
1124 print_dbug = 0;
1125 }
1126 return 0;
1127 }
1129 while (rc != rp) {
1131 if (RING_FULL(&info->ufe_ring)) {
1132 WPRINTK("RING_FULL! More to do\n");
1133 more_to_do = 1;
1134 break;
1135 }
1137 if (RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
1138 WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1139 " More to do\n");
1140 more_to_do = 1;
1141 break;
1142 }
1144 pending_req = alloc_req();
1145 if (NULL == pending_req) {
1146 blkif->st_oo_req++;
1147 more_to_do = 1;
1148 break;
1149 }
1151 memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req));
1152 blk_ring->req_cons = ++rc; /* before make_response() */
1154 switch (req.operation) {
1155 case BLKIF_OP_READ:
1156 blkif->st_rd_req++;
1157 dispatch_rw_block_io(blkif, &req, pending_req);
1158 break;
1160 case BLKIF_OP_WRITE:
1161 blkif->st_wr_req++;
1162 dispatch_rw_block_io(blkif, &req, pending_req);
1163 break;
1165 default:
1166 WPRINTK("unknown operation [%d]\n",
1167 req.operation);
1168 make_response(blkif, req.id, req.operation,
1169 BLKIF_RSP_ERROR);
1170 free_req(pending_req);
1171 break;
1172 }
1173 }
1175 blktap_kick_user(blkif->dev_num);
1177 return more_to_do;
1178 }
1180 static void dispatch_rw_block_io(blkif_t *blkif,
1181 blkif_request_t *req,
1182 pending_req_t *pending_req)
1183 {
1184 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
1185 int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
1186 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1187 unsigned int nseg;
1188 int ret, i;
1189 tap_blkif_t *info;
1190 uint64_t sector;
1191 blkif_request_t *target;
1192 int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
1193 int usr_idx;
1194 uint16_t mmap_idx = pending_req->mem_idx;
1196 if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
1197 goto fail_response;
1199 info = tapfds[blkif->dev_num];
1200 if (info == NULL)
1201 goto fail_response;
1203 /* Check we have space on user ring - should never fail. */
1204 usr_idx = GET_NEXT_REQ(info->idx_map);
1205 if (usr_idx == INVALID_REQ) {
1206 BUG();
1207 goto fail_response;
1208 }
1210 /* Check that number of segments is sane. */
1211 nseg = req->nr_segments;
1212 if ( unlikely(nseg == 0) ||
1213 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1214 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1215 goto fail_response;
1216 }
1218 /* Make sure userspace is ready. */
1219 if (!info->ring_ok) {
1220 WPRINTK("blktap: ring not ready for requests!\n");
1221 goto fail_response;
1222 }
1224 if (RING_FULL(&info->ufe_ring)) {
1225 WPRINTK("blktap: fe_ring is full, can't add "
1226 "IO Request will be dropped. %d %d\n",
1227 RING_SIZE(&info->ufe_ring),
1228 RING_SIZE(&blkif->blk_ring));
1229 goto fail_response;
1230 }
1232 pending_req->blkif = blkif;
1233 pending_req->id = req->id;
1234 pending_req->operation = operation;
1235 pending_req->status = BLKIF_RSP_OKAY;
1236 pending_req->nr_pages = nseg;
1237 op = 0;
1238 for (i = 0; i < nseg; i++) {
1239 unsigned long uvaddr;
1240 unsigned long kvaddr;
1241 uint64_t ptep;
1242 uint32_t flags;
1244 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1245 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1247 sector = req->sector_number + ((PAGE_SIZE / 512) * i);
1248 if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) {
1249 WPRINTK("BLKTAP: Sector request greater"
1250 "than size\n");
1251 WPRINTK("BLKTAP: %s request sector"
1252 "[%llu,%llu], Total [%llu]\n",
1253 (req->operation ==
1254 BLKIF_OP_WRITE ? "WRITE" : "READ"),
1255 (long long unsigned) sector,
1256 (long long unsigned) sector>>9,
1257 (long long unsigned) blkif->sectors);
1258 }
1260 flags = GNTMAP_host_map;
1261 if (operation == WRITE)
1262 flags |= GNTMAP_readonly;
1263 gnttab_set_map_op(&map[op], kvaddr, flags,
1264 req->seg[i].gref, blkif->domid);
1265 op++;
1267 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1268 /* Now map it to user. */
1269 ret = create_lookup_pte_addr(info->vma->vm_mm,
1270 uvaddr, &ptep);
1271 if (ret) {
1272 WPRINTK("Couldn't get a pte addr!\n");
1273 goto fail_flush;
1274 }
1276 flags = GNTMAP_host_map | GNTMAP_application_map
1277 | GNTMAP_contains_pte;
1278 if (operation == WRITE)
1279 flags |= GNTMAP_readonly;
1280 gnttab_set_map_op(&map[op], ptep, flags,
1281 req->seg[i].gref, blkif->domid);
1282 op++;
1283 }
1284 }
1286 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1287 BUG_ON(ret);
1289 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1290 for (i = 0; i < (nseg*2); i+=2) {
1291 unsigned long uvaddr;
1292 unsigned long kvaddr;
1293 unsigned long offset;
1294 struct page *pg;
1296 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1297 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
1299 if (unlikely(map[i].status != 0)) {
1300 WPRINTK("invalid kernel buffer -- "
1301 "could not remap it\n");
1302 ret |= 1;
1303 map[i].handle = INVALID_GRANT_HANDLE;
1304 }
1306 if (unlikely(map[i+1].status != 0)) {
1307 WPRINTK("invalid user buffer -- "
1308 "could not remap it\n");
1309 ret |= 1;
1310 map[i+1].handle = INVALID_GRANT_HANDLE;
1311 }
1313 pending_handle(mmap_idx, pending_idx, i/2).kernel
1314 = map[i].handle;
1315 pending_handle(mmap_idx, pending_idx, i/2).user
1316 = map[i+1].handle;
1318 if (ret)
1319 continue;
1321 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
1322 FOREIGN_FRAME(map[i].dev_bus_addr
1323 >> PAGE_SHIFT));
1324 offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1325 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1326 ((struct page **)info->vma->vm_private_data)[offset] =
1327 pg;
1328 }
1329 } else {
1330 for (i = 0; i < nseg; i++) {
1331 unsigned long uvaddr;
1332 unsigned long kvaddr;
1333 unsigned long offset;
1334 struct page *pg;
1336 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1337 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1339 if (unlikely(map[i].status != 0)) {
1340 WPRINTK("invalid kernel buffer -- "
1341 "could not remap it\n");
1342 ret |= 1;
1343 map[i].handle = INVALID_GRANT_HANDLE;
1344 }
1346 pending_handle(mmap_idx, pending_idx, i).kernel
1347 = map[i].handle;
1349 if (ret)
1350 continue;
1352 offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1353 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1354 ((struct page **)info->vma->vm_private_data)[offset] =
1355 pg;
1356 }
1357 }
1359 if (ret)
1360 goto fail_flush;
1362 if (xen_feature(XENFEAT_auto_translated_physmap))
1363 down_write(&info->vma->vm_mm->mmap_sem);
1364 /* Mark mapped pages as reserved: */
1365 for (i = 0; i < req->nr_segments; i++) {
1366 unsigned long kvaddr;
1367 struct page *pg;
1369 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1370 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1371 SetPageReserved(pg);
1372 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1373 ret = vm_insert_page(info->vma,
1374 MMAP_VADDR(info->user_vstart,
1375 usr_idx, i), pg);
1376 if (ret) {
1377 up_write(&info->vma->vm_mm->mmap_sem);
1378 goto fail_flush;
1379 }
1380 }
1381 }
1382 if (xen_feature(XENFEAT_auto_translated_physmap))
1383 up_write(&info->vma->vm_mm->mmap_sem);
1385 /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
1386 info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
1388 blkif_get(blkif);
1389 /* Finally, write the request message to the user ring. */
1390 target = RING_GET_REQUEST(&info->ufe_ring,
1391 info->ufe_ring.req_prod_pvt);
1392 memcpy(target, req, sizeof(*req));
1393 target->id = usr_idx;
1394 wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
1395 info->ufe_ring.req_prod_pvt++;
1396 return;
1398 fail_flush:
1399 WPRINTK("Reached Fail_flush\n");
1400 fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
1401 fail_response:
1402 make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1403 free_req(pending_req);
1404 }
1408 /******************************************************************
1409 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1410 */
1413 static void make_response(blkif_t *blkif, unsigned long id,
1414 unsigned short op, int st)
1415 {
1416 blkif_response_t *resp;
1417 unsigned long flags;
1418 blkif_back_ring_t *blk_ring = &blkif->blk_ring;
1419 int more_to_do = 0;
1420 int notify;
1422 spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1423 /* Place on the response ring for the relevant domain. */
1424 resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
1425 resp->id = id;
1426 resp->operation = op;
1427 resp->status = st;
1428 blk_ring->rsp_prod_pvt++;
1429 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
1431 if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
1432 /*
1433 * Tail check for pending requests. Allows frontend to avoid
1434 * notifications if requests are already in flight (lower
1435 * overheads and promotes batching).
1436 */
1437 RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
1438 } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
1439 more_to_do = 1;
1440 }
1442 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1443 if (more_to_do)
1444 blkif_notify_work(blkif);
1445 if (notify)
1446 notify_remote_via_irq(blkif->irq);
1447 }
1449 static int __init blkif_init(void)
1450 {
1451 int i,ret,blktap_dir;
1453 if (!is_running_on_xen())
1454 return -ENODEV;
1456 INIT_LIST_HEAD(&pending_free);
1457 for(i = 0; i < 2; i++) {
1458 ret = req_increase();
1459 if (ret)
1460 break;
1461 }
1462 if (i == 0)
1463 return ret;
1465 tap_blkif_interface_init();
1467 alloc_pending_reqs = 0;
1469 tap_blkif_xenbus_init();
1471 /* Dynamically allocate a major for this device */
1472 ret = register_chrdev(0, "blktap", &blktap_fops);
1473 blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL);
1475 if ( (ret < 0)||(blktap_dir < 0) ) {
1476 WPRINTK("Couldn't register /dev/xen/blktap\n");
1477 return -ENOMEM;
1478 }
1480 blktap_major = ret;
1482 /* tapfds[0] is always NULL */
1483 blktap_next_minor++;
1485 ret = devfs_mk_cdev(MKDEV(blktap_major, i),
1486 S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i);
1488 if(ret != 0)
1489 return -ENOMEM;
1491 DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
1493 /* Make sure the xen class exists */
1494 if (!setup_xen_class()) {
1495 /*
1496 * This will allow udev to create the blktap ctrl device.
1497 * We only want to create blktap0 first. We don't want
1498 * to flood the sysfs system with needless blktap devices.
1499 * We only create the device when a request of a new device is
1500 * made.
1501 */
1502 class_device_create(xen_class, NULL,
1503 MKDEV(blktap_major, 0), NULL,
1504 "blktap0");
1505 } else {
1506 /* this is bad, but not fatal */
1507 WPRINTK("blktap: sysfs xen_class not created\n");
1510 DPRINTK("Blktap device successfully created\n");
1512 return 0;
1513 }
1515 module_init(blkif_init);
1517 MODULE_LICENSE("Dual BSD/GPL");