ia64/linux-2.6.18-xen.hg

view drivers/xen/blktap/blktap.c @ 869:271d9b9bee40

xen: miscellaneous cleanup

- add two missing unwind annotations
- mark remaining struct file_operations instances const
- use get_capacity() instead of raw access to the capacity field
- use assert_spin_locked() instead of BUG_ON(!spin_is_locked())
- use clear_tsk_thread_flag() instead of clear_ti_thread_flag()
- remove dead variable cpu_state

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu May 14 10:08:10 2009 +0100 (2009-05-14)
parents 42dfb4e2bce0
children eba6fe6d8d53
line source
1 /******************************************************************************
2 * drivers/xen/blktap/blktap.c
3 *
4 * Back-end driver for user level virtual block devices. This portion of the
5 * driver exports a 'unified' block-device interface that can be accessed
6 * by any operating system that implements a compatible front end. Requests
7 * are remapped to a user-space memory region.
8 *
9 * Based on the blkback driver code.
10 *
11 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12 *
13 * Clean ups and fix ups:
14 * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License version 2
18 * as published by the Free Software Foundation; or, when distributed
19 * separately from the Linux kernel or incorporated into other
20 * software packages, subject to the following license:
21 *
22 * Permission is hereby granted, free of charge, to any person obtaining a copy
23 * of this source file (the "Software"), to deal in the Software without
24 * restriction, including without limitation the rights to use, copy, modify,
25 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
26 * and to permit persons to whom the Software is furnished to do so, subject to
27 * the following conditions:
28 *
29 * The above copyright notice and this permission notice shall be included in
30 * all copies or substantial portions of the Software.
31 *
32 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
37 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38 * IN THE SOFTWARE.
39 */
41 #include <linux/spinlock.h>
42 #include <linux/kthread.h>
43 #include <linux/list.h>
44 #include <asm/hypervisor.h>
45 #include "common.h"
46 #include <xen/balloon.h>
47 #include <xen/driver_util.h>
48 #include <linux/kernel.h>
49 #include <linux/fs.h>
50 #include <linux/mm.h>
51 #include <linux/errno.h>
52 #include <linux/major.h>
53 #include <linux/gfp.h>
54 #include <linux/poll.h>
55 #include <linux/delay.h>
56 #include <asm/tlbflush.h>
58 #define MAX_TAP_DEV 256 /* the maximum number of tapdisk ring devices */
59 #define MAX_DEV_NAME 100 /* the max length of a tapdisk ring device name, e.g. blktap0 */
61 /*
62 * The maximum number of requests that can be outstanding at any time
63 * is determined by
64 *
65 * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
66 *
67 * where mmap_alloc < MAX_DYNAMIC_MEM.
68 *
69 * TODO:
70 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
71 * sysfs.
72 */
73 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
74 #define MAX_DYNAMIC_MEM BLK_RING_SIZE
75 #define MAX_PENDING_REQS BLK_RING_SIZE
76 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
77 #define MMAP_VADDR(_start, _req,_seg) \
78 (_start + \
79 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
80 ((_seg) * PAGE_SIZE))
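/*
 * Editorial note (illustrative arithmetic, assuming 4 KiB pages): __RING_SIZE()
 * yields 32 blkif entries per ring page, so each pool slot covers
 * 32 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) == 352 data pages, and e.g.
 * MMAP_VADDR(user_vstart, 3, 2) == user_vstart + (3 * 11 + 2) * PAGE_SIZE.
 */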
81 static int blkif_reqs = MAX_PENDING_REQS;
82 static int mmap_pages = MMAP_PAGES;
84 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
85 * have a bunch of pages reserved for shared
86 * memory rings.
87 */
89 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
90 typedef struct domid_translate {
91 unsigned short domid;
92 unsigned short busid;
93 } domid_translate_t ;
95 typedef struct domid_translate_ext {
96 unsigned short domid;
97 u32 busid;
98 } domid_translate_ext_t ;
100 /*Data struct associated with each of the tapdisk devices*/
101 typedef struct tap_blkif {
102 struct mm_struct *mm; /*User address space */
103 unsigned long rings_vstart; /*Kernel memory mapping */
104 unsigned long user_vstart; /*User memory mapping */
105 unsigned long dev_inuse; /*One process opens device at a time. */
106 unsigned long dev_pending; /*In process of being opened */
107 unsigned long ring_ok; /*make this ring->state */
108 blkif_front_ring_t ufe_ring; /*Rings up to user space. */
109 wait_queue_head_t wait; /*for poll */
110 unsigned long mode; /*current switching mode */
111 int minor; /*Minor number for tapdisk device */
112 pid_t pid; /*tapdisk process id */
113 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
114 shutdown */
115 unsigned long *idx_map; /*Record the user ring id to kern
116 [req id, idx] tuple */
117 blkif_t *blkif; /*Associate blkif with tapdev */
118 struct domid_translate_ext trans; /*Translation from domid to bus. */
119 struct page **map; /*Mapping page */
120 } tap_blkif_t;
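/*
 * Editorial note: one tap_blkif_t exists per /dev/xen/blktapN minor.  It is
 * allocated by get_next_free_dev() from the NEWINTF ioctls, bound to a
 * backend blkif by dom_to_devid(), and released again when the tapdisk
 * process closes the device (blktap_release()).
 */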
122 static struct tap_blkif *tapfds[MAX_TAP_DEV];
123 static int blktap_next_minor;
125 module_param(blkif_reqs, int, 0);
126 /* Run-time switchable: /sys/module/blktap/parameters/ */
127 static unsigned int log_stats = 0;
128 static unsigned int debug_lvl = 0;
129 module_param(log_stats, int, 0644);
130 module_param(debug_lvl, int, 0644);
132 /*
133 * Each outstanding request that we've passed to the lower device layers has a
134 * 'pending_req' allocated to it. Each buffer_head that completes decrements
135 * the pendcnt towards zero. When it hits zero, the specified domain has a
136 * response queued for it, with the saved 'id' passed back.
137 */
138 typedef struct {
139 blkif_t *blkif;
140 u64 id;
141 unsigned short mem_idx;
142 int nr_pages;
143 atomic_t pendcnt;
144 unsigned short operation;
145 int status;
146 struct list_head free_list;
147 int inuse;
148 } pending_req_t;
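/*
 * Editorial note: pending_req_t structures are kept on the pending_free list;
 * alloc_req()/free_req() take and return them under pending_free_lock, and
 * free_req() wakes pending_free_wq so that tap_blkif_schedule() can continue
 * pulling requests off the backend ring.
 */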
150 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
151 static struct list_head pending_free;
152 static DEFINE_SPINLOCK(pending_free_lock);
153 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
154 static int alloc_pending_reqs;
156 typedef unsigned int PEND_RING_IDX;
158 static inline int MASK_PEND_IDX(int i) {
159 return (i & (MAX_PENDING_REQS-1));
160 }
162 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
163 return (req - pending_reqs[idx]);
164 }
166 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
168 #define BLKBACK_INVALID_HANDLE (~0)
170 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
171 static inline unsigned long idx_to_kaddr(
172 unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
173 {
174 unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
175 unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
176 return (unsigned long)pfn_to_kaddr(pfn);
177 }
179 static unsigned short mmap_alloc = 0;
180 static unsigned short mmap_lock = 0;
181 static unsigned short mmap_inuse = 0;
183 /******************************************************************
184 * GRANT HANDLES
185 */
187 /* When grant tables are used to map a frame for device access, the
188 * handle returned must be used to unmap the frame again. This is needed
189 * to drop the reference count on the frame.
190 */
191 struct grant_handle_pair
192 {
193 grant_handle_t kernel;
194 grant_handle_t user;
195 };
196 #define INVALID_GRANT_HANDLE 0xFFFF
198 static struct grant_handle_pair
199 pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
200 #define pending_handle(_id, _idx, _i) \
201 (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
202 + (_i)])
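/*
 * Editorial note: pending_handle(mem_idx, pending_idx, seg) names the handle
 * pair recorded when segment 'seg' of pending request 'pending_idx' in pool
 * 'mem_idx' is grant-mapped; the kernel handle undoes the idx_to_kaddr()
 * mapping and the user handle the MMAP_VADDR() one (see fast_flush_area()
 * and blktap_clear_pte()).
 */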
205 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
207 #define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */
208 #define BLKTAP_DEV_DIR "/dev/xen"
210 static int blktap_major;
212 /* blktap IOCTLs: */
213 #define BLKTAP_IOCTL_KICK_FE 1
214 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
215 #define BLKTAP_IOCTL_SETMODE 3
216 #define BLKTAP_IOCTL_SENDPID 4
217 #define BLKTAP_IOCTL_NEWINTF 5
218 #define BLKTAP_IOCTL_MINOR 6
219 #define BLKTAP_IOCTL_MAJOR 7
220 #define BLKTAP_QUERY_ALLOC_REQS 8
221 #define BLKTAP_IOCTL_FREEINTF 9
222 #define BLKTAP_IOCTL_NEWINTF_EXT 50
223 #define BLKTAP_IOCTL_PRINT_IDXS 100
225 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
226 #define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
227 #define BLKTAP_MODE_INTERCEPT_FE 0x00000001
228 #define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
230 #define BLKTAP_MODE_INTERPOSE \
231 (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
234 static inline int BLKTAP_MODE_VALID(unsigned long arg)
235 {
236 return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
237 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
238 (arg == BLKTAP_MODE_INTERPOSE ));
239 }
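/*
 * Editorial sketch (not part of the original file): one plausible shape of a
 * tapdisk-style userspace client driving the ioctls above, the mmap size
 * check in blktap_mmap() and the poll() semantics in blktap_poll().  The
 * device paths, the 4096-byte page size, the 352-page data area and the
 * function name are assumptions for illustration only; error handling, the
 * usual userspace headers (fcntl.h, sys/ioctl.h, sys/mman.h, poll.h,
 * unistd.h, string.h, stdio.h) and the actual ring processing are omitted.
 */
#if 0
static int blktap_client_example(unsigned short domid, unsigned short busid)
{
	struct domid_translate tr = { .domid = domid, .busid = busid };
	unsigned long arg = 0;
	char path[32];
	int ctrl, fd, minor;
	void *rings;

	ctrl = open("/dev/xen/blktap0", O_RDWR);	/* control device */
	memcpy(&arg, &tr, sizeof(tr));		/* struct packed into the arg */
	minor = ioctl(ctrl, BLKTAP_IOCTL_NEWINTF, arg);

	snprintf(path, sizeof(path), "/dev/xen/blktap%d", minor);
	fd = open(path, O_RDWR);

	/* blktap_mmap() insists on exactly mmap_pages + RING_PAGES pages. */
	rings = mmap(NULL, (352 + 1) * 4096, PROT_READ | PROT_WRITE,
		     MAP_SHARED, fd, 0);

	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		poll(&pfd, 1, -1);
		/* consume requests from the shared ring mapped at 'rings',
		 * queue responses, then ask the kernel to reap them: */
		ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);
	}
	return 0;
}
#endif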
241 /* Requests passing through the tap to userspace are re-assigned an ID.
242 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
243 * ring ID.
244 */
246 static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
247 {
248 return ((fe_dom << 16) | MASK_PEND_IDX(idx));
249 }
251 extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
252 {
253 return (PEND_RING_IDX)(id & 0x0000ffff);
254 }
256 extern inline int ID_TO_MIDX(unsigned long id)
257 {
258 return (int)(id >> 16);
259 }
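/*
 * Editorial example: as used in dispatch_rw_block_io(),
 * MAKE_ID(1, 5) == 0x00010005 encodes pool index 1 and pending index 5;
 * ID_TO_MIDX() and ID_TO_IDX() recover them from idx_map[] later on.
 */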
261 #define INVALID_REQ 0xdead0000
263 /*TODO: Convert to a free list*/
264 static inline int GET_NEXT_REQ(unsigned long *idx_map)
265 {
266 int i;
267 for (i = 0; i < MAX_PENDING_REQS; i++)
268 if (idx_map[i] == INVALID_REQ)
269 return i;
271 return INVALID_REQ;
272 }
274 static inline int OFFSET_TO_USR_IDX(int offset)
275 {
276 return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
277 }
279 static inline int OFFSET_TO_SEG(int offset)
280 {
281 return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
282 }
285 #define BLKTAP_INVALID_HANDLE(_g) \
286 (((_g->kernel) == INVALID_GRANT_HANDLE) && \
287 ((_g->user) == INVALID_GRANT_HANDLE))
289 #define BLKTAP_INVALIDATE_HANDLE(_g) do { \
290 (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
291 } while(0)
294 /******************************************************************
295 * BLKTAP VM OPS
296 */
298 static struct page *blktap_nopage(struct vm_area_struct *vma,
299 unsigned long address,
300 int *type)
301 {
302 /*
303 * if the page has not been mapped in by the driver then return
304 * NOPAGE_SIGBUS to the domain.
305 */
307 return NOPAGE_SIGBUS;
308 }
310 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
311 unsigned long uvaddr,
312 pte_t *ptep, int is_fullmm)
313 {
314 pte_t copy;
315 tap_blkif_t *info = NULL;
316 int offset, seg, usr_idx, pending_idx, mmap_idx;
317 unsigned long uvstart = 0;
318 unsigned long kvaddr;
319 struct page *pg;
320 struct grant_handle_pair *khandle;
321 struct gnttab_unmap_grant_ref unmap[2];
322 int count = 0;
324 /*
325 * If the address is before the start of the grant-mapped region, or if
326 * vm_file is NULL (mmap failed, nothing to do), just clear the PTE below.
327 */
328 if (vma->vm_file != NULL) {
329 info = vma->vm_file->private_data;
330 uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
331 }
332 if (vma->vm_file == NULL || uvaddr < uvstart)
333 return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
334 ptep, is_fullmm);
336 /* TODO Should these be changed to if statements? */
337 BUG_ON(!info);
338 BUG_ON(!info->idx_map);
340 offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
341 usr_idx = OFFSET_TO_USR_IDX(offset);
342 seg = OFFSET_TO_SEG(offset);
344 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
345 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
347 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
348 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
349 ClearPageReserved(pg);
350 info->map[offset + RING_PAGES] = NULL;
352 khandle = &pending_handle(mmap_idx, pending_idx, seg);
354 if (khandle->kernel != INVALID_GRANT_HANDLE) {
355 gnttab_set_unmap_op(&unmap[count], kvaddr,
356 GNTMAP_host_map, khandle->kernel);
357 count++;
359 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
360 INVALID_P2M_ENTRY);
361 }
363 if (khandle->user != INVALID_GRANT_HANDLE) {
364 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
366 copy = *ptep;
367 gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep),
368 GNTMAP_host_map
369 | GNTMAP_application_map
370 | GNTMAP_contains_pte,
371 khandle->user);
372 count++;
373 } else {
374 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
376 /* USING SHADOW PAGE TABLES. */
377 copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
378 is_fullmm);
379 }
381 if (count) {
382 BLKTAP_INVALIDATE_HANDLE(khandle);
383 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
384 unmap, count))
385 BUG();
386 }
388 return copy;
389 }
391 static void blktap_vma_open(struct vm_area_struct *vma)
392 {
393 tap_blkif_t *info;
394 if (vma->vm_file == NULL)
395 return;
397 info = vma->vm_file->private_data;
398 vma->vm_private_data =
399 &info->map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];
400 }
402 /* Tricky part:
403 * On a partial munmap, ->open() is called only on the split-off vma that
404 * will be released soon (see split_vma() and do_munmap() in mm/mmap.c),
405 * so there is no chance to fix up vm_private_data of the vma that remains.
406 */
407 static void blktap_vma_close(struct vm_area_struct *vma)
408 {
409 tap_blkif_t *info;
410 struct vm_area_struct *next = vma->vm_next;
412 if (next == NULL ||
413 vma->vm_ops != next->vm_ops ||
414 vma->vm_end != next->vm_start ||
415 vma->vm_file == NULL ||
416 vma->vm_file != next->vm_file)
417 return;
419 info = vma->vm_file->private_data;
420 next->vm_private_data =
421 &info->map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
422 }
424 static struct vm_operations_struct blktap_vm_ops = {
425 nopage: blktap_nopage,
426 zap_pte: blktap_clear_pte,
427 open: blktap_vma_open,
428 close: blktap_vma_close,
429 };
431 /******************************************************************
432 * BLKTAP FILE OPS
433 */
435 /*Function Declarations*/
436 static tap_blkif_t *get_next_free_dev(void);
437 static int blktap_open(struct inode *inode, struct file *filp);
438 static int blktap_release(struct inode *inode, struct file *filp);
439 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
440 static int blktap_ioctl(struct inode *inode, struct file *filp,
441 unsigned int cmd, unsigned long arg);
442 static unsigned int blktap_poll(struct file *file, poll_table *wait);
444 static const struct file_operations blktap_fops = {
445 .owner = THIS_MODULE,
446 .poll = blktap_poll,
447 .ioctl = blktap_ioctl,
448 .open = blktap_open,
449 .release = blktap_release,
450 .mmap = blktap_mmap,
451 };
454 static tap_blkif_t *get_next_free_dev(void)
455 {
456 struct class *class;
457 tap_blkif_t *info;
458 int minor;
460 /*
461 * This is called only from the ioctl, which
462 * means we should always have interrupts enabled.
463 */
464 BUG_ON(irqs_disabled());
466 spin_lock_irq(&pending_free_lock);
468 /* tapfds[0] is always NULL */
470 for (minor = 1; minor < blktap_next_minor; minor++) {
471 info = tapfds[minor];
472 /* we could have failed a previous attempt. */
473 if (!info ||
474 ((!test_bit(0, &info->dev_inuse)) &&
475 (info->dev_pending == 0)) ) {
476 info->dev_pending = 1;
477 goto found;
478 }
479 }
480 info = NULL;
481 minor = -1;
483 /*
484 * We didn't find free device. If we can still allocate
485 * more, then we grab the next device minor that is
486 * available. This is done while we are still under
487 * the protection of the pending_free_lock.
488 */
489 if (blktap_next_minor < MAX_TAP_DEV)
490 minor = blktap_next_minor++;
491 found:
492 spin_unlock_irq(&pending_free_lock);
494 if (!info && minor > 0) {
495 info = kzalloc(sizeof(*info), GFP_KERNEL);
496 if (unlikely(!info)) {
497 /*
498 * If we failed here, try to put back
499 * the next minor number. But if one
500 * was just taken, then we just lose this
501 * minor. We can try to allocate this
502 * minor again later.
503 */
504 spin_lock_irq(&pending_free_lock);
505 if (blktap_next_minor == minor+1)
506 blktap_next_minor--;
507 spin_unlock_irq(&pending_free_lock);
508 goto out;
509 }
511 info->minor = minor;
512 /*
513 * Make sure that we have a minor before others can
514 * see us.
515 */
516 wmb();
517 tapfds[minor] = info;
519 if ((class = get_xen_class()) != NULL)
520 class_device_create(class, NULL,
521 MKDEV(blktap_major, minor), NULL,
522 "blktap%d", minor);
523 }
525 out:
526 return info;
527 }
529 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
530 {
531 tap_blkif_t *info;
532 int i;
534 for (i = 1; i < blktap_next_minor; i++) {
535 info = tapfds[i];
536 if ( info &&
537 (info->trans.domid == domid) &&
538 (info->trans.busid == xenbus_id) ) {
539 info->blkif = blkif;
540 info->status = RUNNING;
541 return i;
542 }
543 }
544 return -1;
545 }
547 void signal_tapdisk(int idx)
548 {
549 tap_blkif_t *info;
550 struct task_struct *ptask;
552 /*
553 * if the userland tools set things up wrong, this could be negative;
554 * just don't try to signal in this case
555 */
556 if (idx < 0)
557 return;
559 info = tapfds[idx];
560 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
561 return;
563 if (info->pid > 0) {
564 ptask = find_task_by_pid(info->pid);
565 if (ptask)
566 info->status = CLEANSHUTDOWN;
567 }
568 info->blkif = NULL;
570 return;
571 }
573 static int blktap_open(struct inode *inode, struct file *filp)
574 {
575 blkif_sring_t *sring;
576 int idx = iminor(inode) - BLKTAP_MINOR;
577 tap_blkif_t *info;
578 int i;
580 /* ctrl device, treat differently */
581 if (!idx)
582 return 0;
584 info = tapfds[idx];
586 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) {
587 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
588 idx);
589 return -ENODEV;
590 }
592 DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
594 /*Only one process can access device at a time*/
595 if (test_and_set_bit(0, &info->dev_inuse))
596 return -EBUSY;
598 info->dev_pending = 0;
600 /* Allocate the fe ring. */
601 sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
602 if (sring == NULL)
603 goto fail_nomem;
605 SetPageReserved(virt_to_page(sring));
607 SHARED_RING_INIT(sring);
608 FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
610 filp->private_data = info;
611 info->mm = NULL;
613 info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
614 GFP_KERNEL);
616 if (info->idx_map == NULL)
617 goto fail_nomem;
619 if (idx > 0) {
620 init_waitqueue_head(&info->wait);
621 for (i = 0; i < MAX_PENDING_REQS; i++)
622 info->idx_map[i] = INVALID_REQ;
623 }
625 DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
626 return 0;
628 fail_nomem:
629 return -ENOMEM;
630 }
632 static int blktap_release(struct inode *inode, struct file *filp)
633 {
634 tap_blkif_t *info = filp->private_data;
636 /* check for control device */
637 if (!info)
638 return 0;
640 info->ring_ok = 0;
641 smp_wmb();
643 mmput(info->mm);
644 info->mm = NULL;
645 kfree(info->map);
646 info->map = NULL;
648 /* Free the ring page. */
649 ClearPageReserved(virt_to_page(info->ufe_ring.sring));
650 free_page((unsigned long) info->ufe_ring.sring);
652 if (info->idx_map) {
653 kfree(info->idx_map);
654 info->idx_map = NULL;
655 }
657 if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
658 if (info->blkif->xenblkd != NULL) {
659 kthread_stop(info->blkif->xenblkd);
660 info->blkif->xenblkd = NULL;
661 }
662 info->status = CLEANSHUTDOWN;
663 }
665 clear_bit(0, &info->dev_inuse);
666 DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
668 return 0;
669 }
672 /* Note on mmap:
673 * We need to map pages to user space in a way that will allow the block
674 * subsystem set up direct IO to them. This couldn't be done before, because
675 * there isn't really a sane way to translate a user virtual address down to a
676 * physical address when the page belongs to another domain.
677 *
678 * My first approach was to map the page into kernel memory, add an entry
679 * for it in the physical frame list (using alloc_lomem_region as in blkback)
680 * and then attempt to map that page up to user space. This is disallowed
681 * by xen though, which realizes that we don't really own the machine frame
682 * underlying the physical page.
683 *
684 * The new approach is to provide explicit support for this in xen linux.
685 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
686 * mapped from other vms. vma->vm_private_data is set up as a mapping
687 * from pages to actual page structs. There is a new clause in get_user_pages
688 * that does the right thing for this sort of mapping.
689 */
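/*
 * Editorial sketch (not part of the original file): roughly what the
 * VM_FOREIGN clause added to get_user_pages() has to do for a vma set up by
 * blktap_mmap() below, where vm_private_data points at info->map and is
 * indexed by the page offset from vm_start.  The helper name is made up for
 * illustration; the real clause lives in the Xen mm patches, not here.
 */
#if 0
static struct page *blktap_foreign_lookup(struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct page **map = vma->vm_private_data;
	unsigned long idx = (addr - vma->vm_start) >> PAGE_SHIFT;

	/* NULL means no grant is currently mapped at this slot. */
	return map[idx];
}
#endif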
690 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
691 {
692 int size;
693 tap_blkif_t *info = filp->private_data;
694 int ret;
696 if (info == NULL) {
697 WPRINTK("blktap: mmap, retrieving idx failed\n");
698 return -ENOMEM;
699 }
701 vma->vm_flags |= VM_RESERVED;
702 vma->vm_ops = &blktap_vm_ops;
704 size = vma->vm_end - vma->vm_start;
705 if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
706 WPRINTK("you _must_ map exactly %d pages!\n",
707 mmap_pages + RING_PAGES);
708 return -EAGAIN;
709 }
711 size >>= PAGE_SHIFT;
712 info->rings_vstart = vma->vm_start;
713 info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
715 /* Map the ring pages to the start of the region and reserve it. */
716 if (xen_feature(XENFEAT_auto_translated_physmap))
717 ret = vm_insert_page(vma, vma->vm_start,
718 virt_to_page(info->ufe_ring.sring));
719 else
720 ret = remap_pfn_range(vma, vma->vm_start,
721 __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
722 PAGE_SIZE, vma->vm_page_prot);
723 if (ret) {
724 WPRINTK("Mapping user ring failed!\n");
725 goto fail;
726 }
728 /* Mark this VM as containing foreign pages, and set up mappings. */
729 info->map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
730 sizeof(*info->map), GFP_KERNEL);
731 if (info->map == NULL) {
732 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
733 goto fail;
734 }
736 vma->vm_private_data = info->map;
737 vma->vm_flags |= VM_FOREIGN;
738 vma->vm_flags |= VM_DONTCOPY;
740 #ifdef CONFIG_X86
741 vma->vm_mm->context.has_foreign_mappings = 1;
742 #endif
744 info->mm = get_task_mm(current);
745 smp_wmb();
746 info->ring_ok = 1;
747 return 0;
748 fail:
749 /* Clear any active mappings. */
750 zap_page_range(vma, vma->vm_start,
751 vma->vm_end - vma->vm_start, NULL);
753 return -ENOMEM;
754 }
757 static int blktap_ioctl(struct inode *inode, struct file *filp,
758 unsigned int cmd, unsigned long arg)
759 {
760 tap_blkif_t *info = filp->private_data;
762 switch(cmd) {
763 case BLKTAP_IOCTL_KICK_FE:
764 {
765 /* There are fe messages to process. */
766 return blktap_read_ufe_ring(info);
767 }
768 case BLKTAP_IOCTL_SETMODE:
769 {
770 if (info) {
771 if (BLKTAP_MODE_VALID(arg)) {
772 info->mode = arg;
773 /* XXX: may need to flush rings here. */
774 DPRINTK("blktap: set mode to %lx\n",
775 arg);
776 return 0;
777 }
778 }
779 return 0;
780 }
781 case BLKTAP_IOCTL_PRINT_IDXS:
782 {
783 if (info) {
784 printk("User Rings: \n-----------\n");
785 printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
786 "| req_prod: %2d, rsp_prod: %2d\n",
787 info->ufe_ring.rsp_cons,
788 info->ufe_ring.req_prod_pvt,
789 info->ufe_ring.sring->req_prod,
790 info->ufe_ring.sring->rsp_prod);
791 }
792 return 0;
793 }
794 case BLKTAP_IOCTL_SENDPID:
795 {
796 if (info) {
797 info->pid = (pid_t)arg;
798 DPRINTK("blktap: pid received %d\n",
799 info->pid);
800 }
801 return 0;
802 }
803 case BLKTAP_IOCTL_NEWINTF:
804 {
805 uint64_t val = (uint64_t)arg;
806 domid_translate_t *tr = (domid_translate_t *)&val;
808 DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
809 tr->domid, tr->busid);
810 info = get_next_free_dev();
811 if (!info) {
812 WPRINTK("Error initialising /dev/xen/blktap - "
813 "No more devices\n");
814 return -1;
815 }
816 info->trans.domid = tr->domid;
817 info->trans.busid = tr->busid;
818 return info->minor;
819 }
820 case BLKTAP_IOCTL_NEWINTF_EXT:
821 {
822 void __user *udata = (void __user *) arg;
823 domid_translate_ext_t tr;
825 if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t)))
826 return -EFAULT;
828 DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n",
829 tr.domid, tr.busid);
830 info = get_next_free_dev();
831 if (!info) {
832 WPRINTK("Error initialising /dev/xen/blktap - "
833 "No more devices\n");
834 return -1;
835 }
836 info->trans.domid = tr.domid;
837 info->trans.busid = tr.busid;
838 return info->minor;
839 }
840 case BLKTAP_IOCTL_FREEINTF:
841 {
842 unsigned long dev = arg;
843 unsigned long flags;
845 info = tapfds[dev];
847 if ((dev > MAX_TAP_DEV) || !info)
848 return 0; /* should this be an error? */
850 spin_lock_irqsave(&pending_free_lock, flags);
851 if (info->dev_pending)
852 info->dev_pending = 0;
853 spin_unlock_irqrestore(&pending_free_lock, flags);
855 return 0;
856 }
857 case BLKTAP_IOCTL_MINOR:
858 {
859 unsigned long dev = arg;
861 info = tapfds[dev];
863 if ((dev > MAX_TAP_DEV) || !info)
864 return -EINVAL;
866 return info->minor;
867 }
868 case BLKTAP_IOCTL_MAJOR:
869 return blktap_major;
871 case BLKTAP_QUERY_ALLOC_REQS:
872 {
873 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
874 alloc_pending_reqs, blkif_reqs);
875 return (alloc_pending_reqs/blkif_reqs) * 100;
876 }
877 }
878 return -ENOIOCTLCMD;
879 }
881 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
882 {
883 tap_blkif_t *info = filp->private_data;
885 /* do not work on the control device */
886 if (!info)
887 return 0;
889 poll_wait(filp, &info->wait, wait);
890 if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
891 RING_PUSH_REQUESTS(&info->ufe_ring);
892 return POLLIN | POLLRDNORM;
893 }
894 return 0;
895 }
897 static void blktap_kick_user(int idx)
898 {
899 tap_blkif_t *info;
901 info = tapfds[idx];
903 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
904 return;
906 wake_up_interruptible(&info->wait);
908 return;
909 }
911 static int do_block_io_op(blkif_t *blkif);
912 static void dispatch_rw_block_io(blkif_t *blkif,
913 blkif_request_t *req,
914 pending_req_t *pending_req);
915 static void make_response(blkif_t *blkif, u64 id,
916 unsigned short op, int st);
918 /******************************************************************
919 * misc small helpers
920 */
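/*
 * Editorial note: req_increase() grows the request pool by one slot (a
 * further array of pending_req structs plus mmap_pages pre-allocated empty
 * pages); mmap_req_del() frees the newest slot again once free_req() has
 * retired its last in-flight request, and expects pending_free_lock to be
 * held (see assert_spin_locked()).
 */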
921 static int req_increase(void)
922 {
923 int i, j;
925 if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
926 return -EINVAL;
928 pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
929 * blkif_reqs, GFP_KERNEL);
930 foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
932 if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
933 goto out_of_memory;
935 DPRINTK("%s: reqs=%d, pages=%d\n",
936 __FUNCTION__, blkif_reqs, mmap_pages);
938 for (i = 0; i < MAX_PENDING_REQS; i++) {
939 list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
940 &pending_free);
941 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
942 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
943 BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
944 i, j));
945 }
947 mmap_alloc++;
948 DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
949 return 0;
951 out_of_memory:
952 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
953 kfree(pending_reqs[mmap_alloc]);
954 WPRINTK("%s: out of memory\n", __FUNCTION__);
955 return -ENOMEM;
956 }
958 static void mmap_req_del(int mmap)
959 {
960 assert_spin_locked(&pending_free_lock);
962 kfree(pending_reqs[mmap]);
963 pending_reqs[mmap] = NULL;
965 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
966 foreign_pages[mmap] = NULL;
968 mmap_lock = 0;
969 DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
970 mmap_alloc--;
971 }
973 static pending_req_t* alloc_req(void)
974 {
975 pending_req_t *req = NULL;
976 unsigned long flags;
978 spin_lock_irqsave(&pending_free_lock, flags);
980 if (!list_empty(&pending_free)) {
981 req = list_entry(pending_free.next, pending_req_t, free_list);
982 list_del(&req->free_list);
983 }
985 if (req) {
986 req->inuse = 1;
987 alloc_pending_reqs++;
988 }
989 spin_unlock_irqrestore(&pending_free_lock, flags);
991 return req;
992 }
994 static void free_req(pending_req_t *req)
995 {
996 unsigned long flags;
997 int was_empty;
999 spin_lock_irqsave(&pending_free_lock, flags);
1001 alloc_pending_reqs--;
1002 req->inuse = 0;
1003 if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
1004 mmap_inuse--;
1005 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
1006 spin_unlock_irqrestore(&pending_free_lock, flags);
1007 return;
1009 was_empty = list_empty(&pending_free);
1010 list_add(&req->free_list, &pending_free);
1012 spin_unlock_irqrestore(&pending_free_lock, flags);
1014 if (was_empty)
1015 wake_up(&pending_free_wq);
1018 static void blktap_zap_page_range(struct mm_struct *mm,
1019 unsigned long uvaddr, int nr_pages)
1021 unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
1022 struct vm_area_struct *vma;
1024 vma = find_vma(mm, uvaddr);
1025 while (vma && uvaddr < end) {
1026 unsigned long s = max(uvaddr, vma->vm_start);
1027 unsigned long e = min(end, vma->vm_end);
1029 zap_page_range(vma, s, e - s, NULL);
1031 uvaddr = e;
1032 vma = vma->vm_next;
1036 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
1037 int tapidx)
1039 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1040 unsigned int i, invcount = 0, locked = 0;
1041 struct grant_handle_pair *khandle;
1042 uint64_t ptep;
1043 int ret, mmap_idx;
1044 unsigned long kvaddr, uvaddr;
1045 tap_blkif_t *info;
1046 struct mm_struct *mm;
1049 info = tapfds[tapidx];
1051 if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
1052 WPRINTK("fast_flush: Couldn't get info!\n");
1053 return;
1056 mm = info->mm;
1058 if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
1059 down_write(&mm->mmap_sem);
1060 blktap_zap_page_range(mm,
1061 MMAP_VADDR(info->user_vstart, u_idx, 0),
1062 req->nr_pages);
1063 up_write(&mm->mmap_sem);
1064 return;
1067 mmap_idx = req->mem_idx;
1069 for (i = 0; i < req->nr_pages; i++) {
1070 kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
1071 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
1073 khandle = &pending_handle(mmap_idx, k_idx, i);
1075 if (khandle->kernel != INVALID_GRANT_HANDLE) {
1076 gnttab_set_unmap_op(&unmap[invcount],
1077 idx_to_kaddr(mmap_idx, k_idx, i),
1078 GNTMAP_host_map, khandle->kernel);
1079 invcount++;
1081 set_phys_to_machine(
1082 __pa(idx_to_kaddr(mmap_idx, k_idx, i))
1083 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
1086 if (khandle->user != INVALID_GRANT_HANDLE) {
1087 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
1088 if (!locked++)
1089 down_write(&mm->mmap_sem);
1090 if (create_lookup_pte_addr(
1091 mm,
1092 MMAP_VADDR(info->user_vstart, u_idx, i),
1093 &ptep) !=0) {
1094 up_write(&mm->mmap_sem);
1095 WPRINTK("Couldn't get a pte addr!\n");
1096 return;
1099 gnttab_set_unmap_op(&unmap[invcount], ptep,
1100 GNTMAP_host_map
1101 | GNTMAP_application_map
1102 | GNTMAP_contains_pte,
1103 khandle->user);
1104 invcount++;
1107 BLKTAP_INVALIDATE_HANDLE(khandle);
1109 ret = HYPERVISOR_grant_table_op(
1110 GNTTABOP_unmap_grant_ref, unmap, invcount);
1111 BUG_ON(ret);
1113 if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) {
1114 if (!locked++)
1115 down_write(&mm->mmap_sem);
1116 blktap_zap_page_range(mm,
1117 MMAP_VADDR(info->user_vstart, u_idx, 0),
1118 req->nr_pages);
1121 if (locked)
1122 up_write(&mm->mmap_sem);
1125 /******************************************************************
1126 * SCHEDULER FUNCTIONS
1127 */
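/*
 * Editorial note: tap_blkif_schedule() is the per-interface kernel thread.
 * It sleeps until the frontend posts work and a pending_req is available,
 * forwards requests to the user ring via do_block_io_op(), and exits when
 * blktap_release() calls kthread_stop().
 */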
1129 static void print_stats(blkif_t *blkif)
1131 printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
1132 current->comm, blkif->st_oo_req,
1133 blkif->st_rd_req, blkif->st_wr_req);
1134 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
1135 blkif->st_rd_req = 0;
1136 blkif->st_wr_req = 0;
1137 blkif->st_oo_req = 0;
1140 int tap_blkif_schedule(void *arg)
1142 blkif_t *blkif = arg;
1144 blkif_get(blkif);
1146 if (debug_lvl)
1147 printk(KERN_DEBUG "%s: started\n", current->comm);
1149 while (!kthread_should_stop()) {
1150 if (try_to_freeze())
1151 continue;
1153 wait_event_interruptible(
1154 blkif->wq,
1155 blkif->waiting_reqs || kthread_should_stop());
1156 wait_event_interruptible(
1157 pending_free_wq,
1158 !list_empty(&pending_free) || kthread_should_stop());
1160 blkif->waiting_reqs = 0;
1161 smp_mb(); /* clear flag *before* checking for work */
1163 if (do_block_io_op(blkif))
1164 blkif->waiting_reqs = 1;
1166 if (log_stats && time_after(jiffies, blkif->st_print))
1167 print_stats(blkif);
1170 if (log_stats)
1171 print_stats(blkif);
1172 if (debug_lvl)
1173 printk(KERN_DEBUG "%s: exiting\n", current->comm);
1175 blkif->xenblkd = NULL;
1176 blkif_put(blkif);
1178 return 0;
1181 /******************************************************************
1182 * COMPLETION CALLBACK -- Called by user level ioctl()
1183 */
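/*
 * Editorial note: blktap_read_ufe_ring() runs from the BLKTAP_IOCTL_KICK_FE
 * ioctl.  It consumes the responses queued by userspace, unmaps the grants
 * with fast_flush_area(), and relays each response to the guest through
 * make_response().
 */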
1185 static int blktap_read_ufe_ring(tap_blkif_t *info)
1187 /* This is called to read responses from the UFE ring. */
1188 RING_IDX i, j, rp;
1189 blkif_response_t *resp;
1190 blkif_t *blkif=NULL;
1191 int pending_idx, usr_idx, mmap_idx;
1192 pending_req_t *pending_req;
1194 if (!info)
1195 return 0;
1197 /* We currently only forward packets in INTERCEPT_FE mode. */
1198 if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1199 return 0;
1201 /* for each outstanding message on the UFEring */
1202 rp = info->ufe_ring.sring->rsp_prod;
1203 rmb();
1205 for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1206 blkif_response_t res;
1207 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1208 memcpy(&res, resp, sizeof(res));
1209 mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
1210 ++info->ufe_ring.rsp_cons;
1212 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1213 usr_idx = (int)res.id;
1214 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
1215 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
1217 if ( (mmap_idx >= mmap_alloc) ||
1218 (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
1219 WPRINTK("Incorrect req map"
1220 "[%d], internal map [%d,%d (%d)]\n",
1221 usr_idx, mmap_idx,
1222 ID_TO_IDX(info->idx_map[usr_idx]),
1223 MASK_PEND_IDX(
1224 ID_TO_IDX(info->idx_map[usr_idx])));
1226 pending_req = &pending_reqs[mmap_idx][pending_idx];
1227 blkif = pending_req->blkif;
1229 for (j = 0; j < pending_req->nr_pages; j++) {
1231 unsigned long kvaddr, uvaddr;
1232 struct page *pg;
1233 int offset;
1235 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
1236 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
1238 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1239 ClearPageReserved(pg);
1240 offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1241 info->map[offset] = NULL;
1243 fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
1244 info->idx_map[usr_idx] = INVALID_REQ;
1245 make_response(blkif, pending_req->id, res.operation,
1246 res.status);
1247 blkif_put(pending_req->blkif);
1248 free_req(pending_req);
1251 return 0;
1255 /******************************************************************************
1256 * NOTIFICATION FROM GUEST OS.
1257 */
1259 static void blkif_notify_work(blkif_t *blkif)
1261 blkif->waiting_reqs = 1;
1262 wake_up(&blkif->wq);
1265 irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
1267 blkif_notify_work(dev_id);
1268 return IRQ_HANDLED;
1273 /******************************************************************
1274 * DOWNWARD CALLS -- These interface with the block-device layer proper.
1275 */
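/*
 * Editorial note: do_block_io_op() drains the guest's shared ring; each
 * request is copied and handed to dispatch_rw_block_io(), which picks a free
 * user-ring slot with GET_NEXT_REQ() and grant-maps every segment twice
 * (into the kernel at idx_to_kaddr() and into the tapdisk process at
 * MMAP_VADDR()) before forwarding it on the ufe ring and waking userspace
 * via blktap_kick_user().
 */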
1276 static int print_dbug = 1;
1277 static int do_block_io_op(blkif_t *blkif)
1279 blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1280 blkif_request_t req;
1281 pending_req_t *pending_req;
1282 RING_IDX rc, rp;
1283 int more_to_do = 0;
1284 tap_blkif_t *info;
1286 rc = blk_rings->common.req_cons;
1287 rp = blk_rings->common.sring->req_prod;
1288 rmb(); /* Ensure we see queued requests up to 'rp'. */
1290 /*Check blkif has corresponding UE ring*/
1291 if (blkif->dev_num < 0) {
1292 /*oops*/
1293 if (print_dbug) {
1294 WPRINTK("Corresponding UE "
1295 "ring does not exist!\n");
1296 print_dbug = 0; /*We only print this message once*/
1298 return 0;
1301 info = tapfds[blkif->dev_num];
1303 if (blkif->dev_num > MAX_TAP_DEV || !info ||
1304 !test_bit(0, &info->dev_inuse)) {
1305 if (print_dbug) {
1306 WPRINTK("Can't get UE info!\n");
1307 print_dbug = 0;
1309 return 0;
1312 while (rc != rp) {
1314 if (RING_FULL(&info->ufe_ring)) {
1315 WPRINTK("RING_FULL! More to do\n");
1316 more_to_do = 1;
1317 break;
1320 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
1321 WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1322 " More to do\n");
1323 more_to_do = 1;
1324 break;
1327 if (kthread_should_stop()) {
1328 more_to_do = 1;
1329 break;
1332 pending_req = alloc_req();
1333 if (NULL == pending_req) {
1334 blkif->st_oo_req++;
1335 more_to_do = 1;
1336 break;
1339 switch (blkif->blk_protocol) {
1340 case BLKIF_PROTOCOL_NATIVE:
1341 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
1342 sizeof(req));
1343 break;
1344 case BLKIF_PROTOCOL_X86_32:
1345 blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1346 break;
1347 case BLKIF_PROTOCOL_X86_64:
1348 blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1349 break;
1350 default:
1351 BUG();
1353 blk_rings->common.req_cons = ++rc; /* before make_response() */
1355 /* Apply all sanity checks to /private copy/ of request. */
1356 barrier();
1358 switch (req.operation) {
1359 case BLKIF_OP_READ:
1360 blkif->st_rd_req++;
1361 dispatch_rw_block_io(blkif, &req, pending_req);
1362 break;
1364 case BLKIF_OP_WRITE:
1365 blkif->st_wr_req++;
1366 dispatch_rw_block_io(blkif, &req, pending_req);
1367 break;
1369 default:
1370 /* A good sign something is wrong: sleep for a while to
1371 * avoid excessive CPU consumption by a bad guest. */
1372 msleep(1);
1373 WPRINTK("unknown operation [%d]\n",
1374 req.operation);
1375 make_response(blkif, req.id, req.operation,
1376 BLKIF_RSP_ERROR);
1377 free_req(pending_req);
1378 break;
1381 /* Yield point for this unbounded loop. */
1382 cond_resched();
1385 blktap_kick_user(blkif->dev_num);
1387 return more_to_do;
1390 static void dispatch_rw_block_io(blkif_t *blkif,
1391 blkif_request_t *req,
1392 pending_req_t *pending_req)
1394 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
1395 int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
1396 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1397 unsigned int nseg;
1398 int ret, i, nr_sects = 0;
1399 tap_blkif_t *info;
1400 blkif_request_t *target;
1401 int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
1402 int usr_idx;
1403 uint16_t mmap_idx = pending_req->mem_idx;
1404 struct mm_struct *mm;
1405 struct vm_area_struct *vma = NULL;
1407 if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
1408 goto fail_response;
1410 info = tapfds[blkif->dev_num];
1411 if (info == NULL)
1412 goto fail_response;
1414 /* Check we have space on user ring - should never fail. */
1415 usr_idx = GET_NEXT_REQ(info->idx_map);
1416 if (usr_idx == INVALID_REQ) {
1417 BUG();
1418 goto fail_response;
1421 /* Check that number of segments is sane. */
1422 nseg = req->nr_segments;
1423 if ( unlikely(nseg == 0) ||
1424 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1425 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1426 goto fail_response;
1429 /* Make sure userspace is ready. */
1430 if (!info->ring_ok) {
1431 WPRINTK("blktap: ring not ready for requests!\n");
1432 goto fail_response;
1434 smp_rmb();
1436 if (RING_FULL(&info->ufe_ring)) {
1437 WPRINTK("blktap: fe_ring is full, can't add "
1438 "IO Request will be dropped. %d %d\n",
1439 RING_SIZE(&info->ufe_ring),
1440 RING_SIZE(&blkif->blk_rings.common));
1441 goto fail_response;
1444 pending_req->blkif = blkif;
1445 pending_req->id = req->id;
1446 pending_req->operation = operation;
1447 pending_req->status = BLKIF_RSP_OKAY;
1448 pending_req->nr_pages = nseg;
1449 op = 0;
1450 mm = info->mm;
1451 if (!xen_feature(XENFEAT_auto_translated_physmap))
1452 down_write(&mm->mmap_sem);
1453 for (i = 0; i < nseg; i++) {
1454 unsigned long uvaddr;
1455 unsigned long kvaddr;
1456 uint64_t ptep;
1457 uint32_t flags;
1459 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1460 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1462 flags = GNTMAP_host_map;
1463 if (operation == WRITE)
1464 flags |= GNTMAP_readonly;
1465 gnttab_set_map_op(&map[op], kvaddr, flags,
1466 req->seg[i].gref, blkif->domid);
1467 op++;
1469 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1470 /* Now map it to user. */
1471 ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
1472 if (ret) {
1473 up_write(&mm->mmap_sem);
1474 WPRINTK("Couldn't get a pte addr!\n");
1475 goto fail_flush;
1478 flags = GNTMAP_host_map | GNTMAP_application_map
1479 | GNTMAP_contains_pte;
1480 if (operation == WRITE)
1481 flags |= GNTMAP_readonly;
1482 gnttab_set_map_op(&map[op], ptep, flags,
1483 req->seg[i].gref, blkif->domid);
1484 op++;
1487 nr_sects += (req->seg[i].last_sect -
1488 req->seg[i].first_sect + 1);
1491 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1492 BUG_ON(ret);
1494 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1495 up_write(&mm->mmap_sem);
1497 for (i = 0; i < (nseg*2); i+=2) {
1498 unsigned long uvaddr;
1499 unsigned long kvaddr;
1500 unsigned long offset;
1501 struct page *pg;
1503 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1504 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
1506 if (unlikely(map[i].status != 0)) {
1507 WPRINTK("invalid kernel buffer -- "
1508 "could not remap it\n");
1509 ret |= 1;
1510 map[i].handle = INVALID_GRANT_HANDLE;
1513 if (unlikely(map[i+1].status != 0)) {
1514 WPRINTK("invalid user buffer -- "
1515 "could not remap it\n");
1516 ret |= 1;
1517 map[i+1].handle = INVALID_GRANT_HANDLE;
1520 pending_handle(mmap_idx, pending_idx, i/2).kernel
1521 = map[i].handle;
1522 pending_handle(mmap_idx, pending_idx, i/2).user
1523 = map[i+1].handle;
1525 if (ret)
1526 continue;
1528 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
1529 FOREIGN_FRAME(map[i].dev_bus_addr
1530 >> PAGE_SHIFT));
1531 offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1532 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1533 info->map[offset] = pg;
1535 } else {
1536 for (i = 0; i < nseg; i++) {
1537 unsigned long uvaddr;
1538 unsigned long kvaddr;
1539 unsigned long offset;
1540 struct page *pg;
1542 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1543 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1545 if (unlikely(map[i].status != 0)) {
1546 WPRINTK("invalid kernel buffer -- "
1547 "could not remap it\n");
1548 ret |= 1;
1549 map[i].handle = INVALID_GRANT_HANDLE;
1552 pending_handle(mmap_idx, pending_idx, i).kernel
1553 = map[i].handle;
1555 if (ret)
1556 continue;
1558 offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
1559 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1560 info->map[offset] = pg;
1564 if (ret)
1565 goto fail_flush;
1567 if (xen_feature(XENFEAT_auto_translated_physmap))
1568 down_write(&mm->mmap_sem);
1569 /* Mark mapped pages as reserved: */
1570 for (i = 0; i < req->nr_segments; i++) {
1571 unsigned long kvaddr;
1572 struct page *pg;
1574 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1575 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1576 SetPageReserved(pg);
1577 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1578 unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
1579 usr_idx, i);
1580 if (vma && uvaddr >= vma->vm_end) {
1581 vma = vma->vm_next;
1582 if (vma &&
1583 (uvaddr < vma->vm_start ||
1584 uvaddr >= vma->vm_end))
1585 vma = NULL;
1587 if (vma == NULL) {
1588 vma = find_vma(mm, uvaddr);
1589 /* this virtual area was already munmapped,
1590 so skip to the next page */
1591 if (!vma)
1592 continue;
1594 ret = vm_insert_page(vma, uvaddr, pg);
1595 if (ret) {
1596 up_write(&mm->mmap_sem);
1597 goto fail_flush;
1601 if (xen_feature(XENFEAT_auto_translated_physmap))
1602 up_write(&mm->mmap_sem);
1604 /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
1605 info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
1607 blkif_get(blkif);
1608 /* Finally, write the request message to the user ring. */
1609 target = RING_GET_REQUEST(&info->ufe_ring,
1610 info->ufe_ring.req_prod_pvt);
1611 memcpy(target, req, sizeof(*req));
1612 target->id = usr_idx;
1613 wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
1614 info->ufe_ring.req_prod_pvt++;
1616 if (operation == READ)
1617 blkif->st_rd_sect += nr_sects;
1618 else if (operation == WRITE)
1619 blkif->st_wr_sect += nr_sects;
1621 return;
1623 fail_flush:
1624 WPRINTK("Reached Fail_flush\n");
1625 fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
1626 fail_response:
1627 make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1628 free_req(pending_req);
1629 msleep(1); /* back off a bit */
1634 /******************************************************************
1635 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1636 */
1639 static void make_response(blkif_t *blkif, u64 id,
1640 unsigned short op, int st)
1642 blkif_response_t resp;
1643 unsigned long flags;
1644 blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1645 int more_to_do = 0;
1646 int notify;
1648 resp.id = id;
1649 resp.operation = op;
1650 resp.status = st;
1652 spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1653 /* Place on the response ring for the relevant domain. */
1654 switch (blkif->blk_protocol) {
1655 case BLKIF_PROTOCOL_NATIVE:
1656 memcpy(RING_GET_RESPONSE(&blk_rings->native,
1657 blk_rings->native.rsp_prod_pvt),
1658 &resp, sizeof(resp));
1659 break;
1660 case BLKIF_PROTOCOL_X86_32:
1661 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
1662 blk_rings->x86_32.rsp_prod_pvt),
1663 &resp, sizeof(resp));
1664 break;
1665 case BLKIF_PROTOCOL_X86_64:
1666 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
1667 blk_rings->x86_64.rsp_prod_pvt),
1668 &resp, sizeof(resp));
1669 break;
1670 default:
1671 BUG();
1673 blk_rings->common.rsp_prod_pvt++;
1674 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1676 if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
1677 /*
1678 * Tail check for pending requests. Allows frontend to avoid
1679 * notifications if requests are already in flight (lower
1680 * overheads and promotes batching).
1681 */
1682 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1683 } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
1684 more_to_do = 1;
1687 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1688 if (more_to_do)
1689 blkif_notify_work(blkif);
1690 if (notify)
1691 notify_remote_via_irq(blkif->irq);
1694 static int __init blkif_init(void)
1696 int i, ret;
1697 struct class *class;
1699 if (!is_running_on_xen())
1700 return -ENODEV;
1702 INIT_LIST_HEAD(&pending_free);
1703 for(i = 0; i < 2; i++) {
1704 ret = req_increase();
1705 if (ret)
1706 break;
1708 if (i == 0)
1709 return ret;
1711 tap_blkif_interface_init();
1713 alloc_pending_reqs = 0;
1715 tap_blkif_xenbus_init();
1717 /* Dynamically allocate a major for this device */
1718 ret = register_chrdev(0, "blktap", &blktap_fops);
1720 if (ret < 0) {
1721 WPRINTK("Couldn't register /dev/xen/blktap\n");
1722 return -ENOMEM;
1725 blktap_major = ret;
1727 /* tapfds[0] is always NULL */
1728 blktap_next_minor++;
1730 DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
1732 /* Make sure the xen class exists */
1733 if ((class = get_xen_class()) != NULL) {
1734 /*
1735 * This will allow udev to create the blktap ctrl device.
1736 * We only want to create blktap0 first. We don't want
1737 * to flood the sysfs system with needless blktap devices.
1738 * We only create the device when a request of a new device is
1739 * made.
1740 */
1741 class_device_create(class, NULL,
1742 MKDEV(blktap_major, 0), NULL,
1743 "blktap0");
1744 } else {
1745 /* this is bad, but not fatal */
1746 WPRINTK("blktap: sysfs xen_class not created\n");
1749 DPRINTK("Blktap device successfully created\n");
1751 return 0;
1754 module_init(blkif_init);
1756 MODULE_LICENSE("Dual BSD/GPL");