ia64/linux-2.6.18-xen.hg

view drivers/xen/blktap/blktap.c @ 863:464a925d73f1

blktap: don't access deallocated data

Dereferencing filp->private_data->vma in the file_operations.release
actor isn't permitted, as the vma generally has been destroyed by that
time. The kfree()ing of vma->vm_private_data must be done in the
vm_operations.close actor, and the call to zap_page_range() seems
redundant with the caller of that actor altogether.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Apr 17 13:03:22 2009 +0100 (2009-04-17)
parents 5e1269aa5c29
children 613216635ff0
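The change described above boils down to moving per-VMA cleanup from the file's release hook into the VMA's own close hook. Distilled from the code below into a minimal sketch (same names as in this file, everything else omitted), the pattern is:

	static void blktap_vma_close(struct vm_area_struct *vma)
	{
		struct tap_vma_priv *priv = vma->vm_private_data;

		if (priv) {
			priv->info->vma = NULL;	/* detach the tap device from the dying VMA */
			kfree(priv);		/* safe here: the vma is still valid in .close */
		}
	}

blktap_release(), by contrast, must not follow filp->private_data->vma at all, since the VMA may already be gone by the time the file is released.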
line source
1 /******************************************************************************
2 * drivers/xen/blktap/blktap.c
3 *
4 * Back-end driver for user level virtual block devices. This portion of the
5 * driver exports a 'unified' block-device interface that can be accessed
6 * by any operating system that implements a compatible front end. Requests
7 * are remapped to a user-space memory region.
8 *
9 * Based on the blkback driver code.
10 *
11 * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
12 *
13 * Clean ups and fix ups:
14 * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License version 2
18 * as published by the Free Software Foundation; or, when distributed
19 * separately from the Linux kernel or incorporated into other
20 * software packages, subject to the following license:
21 *
22 * Permission is hereby granted, free of charge, to any person obtaining a copy
23 * of this source file (the "Software"), to deal in the Software without
24 * restriction, including without limitation the rights to use, copy, modify,
25 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
26 * and to permit persons to whom the Software is furnished to do so, subject to
27 * the following conditions:
28 *
29 * The above copyright notice and this permission notice shall be included in
30 * all copies or substantial portions of the Software.
31 *
32 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
37 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
38 * IN THE SOFTWARE.
39 */
41 #include <linux/spinlock.h>
42 #include <linux/kthread.h>
43 #include <linux/list.h>
44 #include <asm/hypervisor.h>
45 #include "common.h"
46 #include <xen/balloon.h>
47 #include <xen/driver_util.h>
48 #include <linux/kernel.h>
49 #include <linux/fs.h>
50 #include <linux/mm.h>
51 #include <linux/errno.h>
52 #include <linux/major.h>
53 #include <linux/gfp.h>
54 #include <linux/poll.h>
55 #include <linux/delay.h>
56 #include <asm/tlbflush.h>
58 #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */
59 #define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */
61 /*
62 * The maximum number of requests that can be outstanding at any time
63 * is determined by
64 *
65 * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
66 *
67 * where mmap_alloc < MAX_DYNAMIC_MEM.
68 *
69 * TODO:
70 * mmap_alloc is initialised to 2 and should be adjustable on the fly via
71 * sysfs.
72 */
73 #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
74 #define MAX_DYNAMIC_MEM BLK_RING_SIZE
75 #define MAX_PENDING_REQS BLK_RING_SIZE
76 #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
77 #define MMAP_VADDR(_start, _req,_seg) \
78 (_start + \
79 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
80 ((_seg) * PAGE_SIZE))
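For concreteness, a worked instance of the sizing above, assuming 4 KiB pages (for which __RING_SIZE() of a blkif_sring_t comes to 32) and the usual BLKIF_MAX_SEGMENTS_PER_REQUEST of 11:

	MAX_PENDING_REQS  = BLK_RING_SIZE             = 32
	MMAP_PAGES        = 32 * 11                   = 352 pages per mmap_alloc batch
	total foreign map = mmap_alloc * MMAP_PAGES   = 704 pages (~2.75 MiB) at mmap_alloc = 2

MMAP_VADDR(start, req, seg) then selects page (req * 11) + seg within one such 352-page window.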
81 static int blkif_reqs = MAX_PENDING_REQS;
82 static int mmap_pages = MMAP_PAGES;
84 #define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
85 * have a bunch of pages reserved for shared
86 * memory rings.
87 */
89 /*Data struct handed back to userspace for tapdisk device to VBD mapping*/
90 typedef struct domid_translate {
91 unsigned short domid;
92 unsigned short busid;
93 } domid_translate_t ;
95 typedef struct domid_translate_ext {
96 unsigned short domid;
97 u32 busid;
98 } domid_translate_ext_t ;
100 /*Data struct associated with each of the tapdisk devices*/
101 typedef struct tap_blkif {
102 struct vm_area_struct *vma; /*Shared memory area */
103 unsigned long rings_vstart; /*Kernel memory mapping */
104 unsigned long user_vstart; /*User memory mapping */
105 unsigned long dev_inuse; /*One process opens device at a time. */
106 unsigned long dev_pending; /*In process of being opened */
107 unsigned long ring_ok; /*make this ring->state */
108 blkif_front_ring_t ufe_ring; /*Rings up to user space. */
109 wait_queue_head_t wait; /*for poll */
110 unsigned long mode; /*current switching mode */
111 int minor; /*Minor number for tapdisk device */
112 pid_t pid; /*tapdisk process id */
113 enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
114 shutdown */
115 unsigned long *idx_map; /*Record the user ring id to kern
116 [req id, idx] tuple */
117 blkif_t *blkif; /*Associate blkif with tapdev */
118 struct domid_translate_ext trans; /*Translation from domid to bus. */
119 } tap_blkif_t;
121 static struct tap_blkif *tapfds[MAX_TAP_DEV];
122 static int blktap_next_minor;
124 module_param(blkif_reqs, int, 0);
125 /* Run-time switchable: /sys/module/blktap/parameters/ */
126 static unsigned int log_stats = 0;
127 static unsigned int debug_lvl = 0;
128 module_param(log_stats, int, 0644);
129 module_param(debug_lvl, int, 0644);
131 /*
132 * Each outstanding request that we've passed to the lower device layers has a
133 * 'pending_req' allocated to it. Each buffer_head that completes decrements
134 * the pendcnt towards zero. When it hits zero, the specified domain has a
135 * response queued for it, with the saved 'id' passed back.
136 */
137 typedef struct {
138 blkif_t *blkif;
139 u64 id;
140 unsigned short mem_idx;
141 int nr_pages;
142 atomic_t pendcnt;
143 unsigned short operation;
144 int status;
145 struct list_head free_list;
146 int inuse;
147 } pending_req_t;
149 static pending_req_t *pending_reqs[MAX_PENDING_REQS];
150 static struct list_head pending_free;
151 static DEFINE_SPINLOCK(pending_free_lock);
152 static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
153 static int alloc_pending_reqs;
155 typedef unsigned int PEND_RING_IDX;
157 static inline int MASK_PEND_IDX(int i) {
158 return (i & (MAX_PENDING_REQS-1));
159 }
161 static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
162 return (req - pending_reqs[idx]);
163 }
165 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
167 #define BLKBACK_INVALID_HANDLE (~0)
169 static struct page **foreign_pages[MAX_DYNAMIC_MEM];
170 static inline unsigned long idx_to_kaddr(
171 unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
172 {
173 unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
174 unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]);
175 return (unsigned long)pfn_to_kaddr(pfn);
176 }
178 static unsigned short mmap_alloc = 0;
179 static unsigned short mmap_lock = 0;
180 static unsigned short mmap_inuse = 0;
182 /******************************************************************
183 * GRANT HANDLES
184 */
186 /* When using grant tables to map a frame for device access then the
187 * handle returned must be used to unmap the frame. This is needed to
188 * drop the ref count on the frame.
189 */
190 struct grant_handle_pair
191 {
192 grant_handle_t kernel;
193 grant_handle_t user;
194 };
195 #define INVALID_GRANT_HANDLE 0xFFFF
197 static struct grant_handle_pair
198 pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
199 #define pending_handle(_id, _idx, _i) \
200 (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
201 + (_i)])
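The map/unmap pairing those handles support looks roughly like the sketch below (illustrative only: kvaddr, gref, domid and the (_id, _idx, _i) indices are placeholders; the real paths later in this file also set up the user-space PTE mapping and batch the operations):

	struct gnttab_map_grant_ref map;
	struct gnttab_unmap_grant_ref unmap;

	gnttab_set_map_op(&map, kvaddr, GNTMAP_host_map, gref, domid);
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1) || map.status)
		return -EFAULT;				/* grant could not be mapped */
	pending_handle(_id, _idx, _i).kernel = map.handle;	/* keep the handle for teardown */

	/* ... once the request completes ... */
	gnttab_set_unmap_op(&unmap, kvaddr, GNTMAP_host_map,
			    pending_handle(_id, _idx, _i).kernel);
	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap, 1))
		BUG();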
204 static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
206 #define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */
207 #define BLKTAP_DEV_DIR "/dev/xen"
209 static int blktap_major;
211 /* blktap IOCTLs: */
212 #define BLKTAP_IOCTL_KICK_FE 1
213 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
214 #define BLKTAP_IOCTL_SETMODE 3
215 #define BLKTAP_IOCTL_SENDPID 4
216 #define BLKTAP_IOCTL_NEWINTF 5
217 #define BLKTAP_IOCTL_MINOR 6
218 #define BLKTAP_IOCTL_MAJOR 7
219 #define BLKTAP_QUERY_ALLOC_REQS 8
220 #define BLKTAP_IOCTL_FREEINTF 9
221 #define BLKTAP_IOCTL_NEWINTF_EXT 50
222 #define BLKTAP_IOCTL_PRINT_IDXS 100
224 /* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
225 #define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
226 #define BLKTAP_MODE_INTERCEPT_FE 0x00000001
227 #define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
229 #define BLKTAP_MODE_INTERPOSE \
230 (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
233 static inline int BLKTAP_MODE_VALID(unsigned long arg)
234 {
235 return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
236 (arg == BLKTAP_MODE_INTERCEPT_FE) ||
237 (arg == BLKTAP_MODE_INTERPOSE ));
238 }
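Taken together with the mmap and poll handlers further down, the ioctls above imply a user-space sequence roughly like the following. This is a hypothetical sketch of how a tapdisk-style daemon would drive the interface, not the actual tool; page_size, domid and busid are placeholders:

	int ctrl = open("/dev/xen/blktap0", O_RDWR);	/* control node, minor 0 */

	domid_translate_t tr = { .domid = domid, .busid = busid };
	unsigned long arg = 0;
	memcpy(&arg, &tr, sizeof(tr));		/* the driver reinterprets the arg as this struct */
	int minor = ioctl(ctrl, BLKTAP_IOCTL_NEWINTF, arg);

	char dev[32];
	snprintf(dev, sizeof(dev), "/dev/xen/blktap%d", minor);
	int fd = open(dev, O_RDWR);
	ioctl(fd, BLKTAP_IOCTL_SENDPID, getpid());
	ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);

	/* one shared ring page followed by the data area; the size must match the
	 * driver's (mmap_pages + RING_PAGES) check in blktap_mmap() */
	void *ring = mmap(NULL, (RING_PAGES + MMAP_PAGES) * page_size,
			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* main loop: poll(fd) for POLLIN, consume requests from the ring, queue
	 * responses, then ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0) so the driver reads
	 * them back via blktap_read_ufe_ring(). */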
240 /* Requests passing through the tap to userspace are re-assigned an ID.
241 * We must record a mapping between the BE [IDX,ID] tuple and the userspace
242 * ring ID.
243 */
245 static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
246 {
247 return ((fe_dom << 16) | MASK_PEND_IDX(idx));
248 }
250 extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
251 {
252 return (PEND_RING_IDX)(id & 0x0000ffff);
253 }
255 extern inline int ID_TO_MIDX(unsigned long id)
256 {
257 return (int)(id >> 16);
258 }
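A worked example of the packing (assuming MAX_PENDING_REQS = 32, and noting that the first argument is used below as the mmap_alloc batch index rather than a real domid):

	MAKE_ID(2, 37)         = (2 << 16) | (37 & 31) = 0x00020005
	ID_TO_MIDX(0x00020005) = 2	/* which mmap_alloc batch */
	ID_TO_IDX(0x00020005)  = 5	/* pending_req slot within that batch */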
260 #define INVALID_REQ 0xdead0000
262 /*TODO: Convert to a free list*/
263 static inline int GET_NEXT_REQ(unsigned long *idx_map)
264 {
265 int i;
266 for (i = 0; i < MAX_PENDING_REQS; i++)
267 if (idx_map[i] == INVALID_REQ)
268 return i;
270 return INVALID_REQ;
271 }
273 static inline int OFFSET_TO_USR_IDX(int offset)
274 {
275 return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
276 }
278 static inline int OFFSET_TO_SEG(int offset)
279 {
280 return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
281 }
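So, with 11 segments per request, a page at offset 25 into the data area resolves to user request 2, segment 3:

	OFFSET_TO_USR_IDX(25) = 25 / 11 = 2
	OFFSET_TO_SEG(25)     = 25 % 11 = 3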
284 #define BLKTAP_INVALID_HANDLE(_g) \
285 (((_g->kernel) == INVALID_GRANT_HANDLE) && \
286 ((_g->user) == INVALID_GRANT_HANDLE))
288 #define BLKTAP_INVALIDATE_HANDLE(_g) do { \
289 (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
290 } while(0)
293 /******************************************************************
294 * BLKTAP VM OPS
295 */
296 struct tap_vma_priv {
297 tap_blkif_t *info;
298 struct page *map[];
299 };
301 static struct page *blktap_nopage(struct vm_area_struct *vma,
302 unsigned long address,
303 int *type)
304 {
305 /*
306 * if the page has not been mapped in by the driver then return
307 * NOPAGE_SIGBUS to the domain.
308 */
310 return NOPAGE_SIGBUS;
311 }
313 static pte_t blktap_clear_pte(struct vm_area_struct *vma,
314 unsigned long uvaddr,
315 pte_t *ptep, int is_fullmm)
316 {
317 pte_t copy;
318 tap_blkif_t *info;
319 int offset, seg, usr_idx, pending_idx, mmap_idx;
320 unsigned long uvstart = vma->vm_start + (RING_PAGES << PAGE_SHIFT);
321 unsigned long kvaddr;
322 struct tap_vma_priv *priv;
323 struct page *pg;
324 struct grant_handle_pair *khandle;
325 struct gnttab_unmap_grant_ref unmap[2];
326 int count = 0;
328 /*
329 * If the address is before the start of the grant mapped region or
330 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
331 */
332 if (uvaddr < uvstart || vma->vm_file == NULL)
333 return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
334 ptep, is_fullmm);
336 info = vma->vm_file->private_data;
337 priv = vma->vm_private_data;
339 /* TODO Should these be changed to if statements? */
340 BUG_ON(!info);
341 BUG_ON(!info->idx_map);
342 BUG_ON(!priv);
344 offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
345 usr_idx = OFFSET_TO_USR_IDX(offset);
346 seg = OFFSET_TO_SEG(offset);
348 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
349 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
351 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
352 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
353 ClearPageReserved(pg);
354 priv->map[offset + RING_PAGES] = NULL;
356 khandle = &pending_handle(mmap_idx, pending_idx, seg);
358 if (khandle->kernel != INVALID_GRANT_HANDLE) {
359 gnttab_set_unmap_op(&unmap[count], kvaddr,
360 GNTMAP_host_map, khandle->kernel);
361 count++;
363 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
364 INVALID_P2M_ENTRY);
365 }
367 if (khandle->user != INVALID_GRANT_HANDLE) {
368 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
370 copy = *ptep;
371 gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep),
372 GNTMAP_host_map
373 | GNTMAP_application_map
374 | GNTMAP_contains_pte,
375 khandle->user);
376 count++;
377 } else {
378 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
380 /* USING SHADOW PAGE TABLES. */
381 copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
382 is_fullmm);
383 }
385 if (count) {
386 BLKTAP_INVALIDATE_HANDLE(khandle);
387 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
388 unmap, count))
389 BUG();
390 }
392 return copy;
393 }
395 static void blktap_vma_close(struct vm_area_struct *vma)
396 {
397 struct tap_vma_priv *priv = vma->vm_private_data;
399 if (priv) {
400 priv->info->vma = NULL;
401 kfree(priv);
402 }
403 }
405 struct vm_operations_struct blktap_vm_ops = {
406 nopage: blktap_nopage,
407 zap_pte: blktap_clear_pte,
408 close: blktap_vma_close,
409 };
411 /******************************************************************
412 * BLKTAP FILE OPS
413 */
415 /*Function Declarations*/
416 static tap_blkif_t *get_next_free_dev(void);
417 static int blktap_open(struct inode *inode, struct file *filp);
418 static int blktap_release(struct inode *inode, struct file *filp);
419 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
420 static int blktap_ioctl(struct inode *inode, struct file *filp,
421 unsigned int cmd, unsigned long arg);
422 static unsigned int blktap_poll(struct file *file, poll_table *wait);
424 static const struct file_operations blktap_fops = {
425 .owner = THIS_MODULE,
426 .poll = blktap_poll,
427 .ioctl = blktap_ioctl,
428 .open = blktap_open,
429 .release = blktap_release,
430 .mmap = blktap_mmap,
431 };
434 static tap_blkif_t *get_next_free_dev(void)
435 {
436 struct class *class;
437 tap_blkif_t *info;
438 int minor;
440 /*
441 * This is called only from the ioctl, which
442 * means we should always have interrupts enabled.
443 */
444 BUG_ON(irqs_disabled());
446 spin_lock_irq(&pending_free_lock);
448 /* tapfds[0] is always NULL */
450 for (minor = 1; minor < blktap_next_minor; minor++) {
451 info = tapfds[minor];
452 /* we could have failed a previous attempt. */
453 if (!info ||
454 ((info->dev_inuse == 0) &&
455 (info->dev_pending == 0)) ) {
456 info->dev_pending = 1;
457 goto found;
458 }
459 }
460 info = NULL;
461 minor = -1;
463 /*
464 * We didn't find free device. If we can still allocate
465 * more, then we grab the next device minor that is
466 * available. This is done while we are still under
467 * the protection of the pending_free_lock.
468 */
469 if (blktap_next_minor < MAX_TAP_DEV)
470 minor = blktap_next_minor++;
471 found:
472 spin_unlock_irq(&pending_free_lock);
474 if (!info && minor > 0) {
475 info = kzalloc(sizeof(*info), GFP_KERNEL);
476 if (unlikely(!info)) {
477 /*
478 * If we failed here, try to put back
479 * the next minor number. But if one
480 * was just taken, then we just lose this
481 * minor. We can try to allocate this
482 * minor again later.
483 */
484 spin_lock_irq(&pending_free_lock);
485 if (blktap_next_minor == minor+1)
486 blktap_next_minor--;
487 spin_unlock_irq(&pending_free_lock);
488 goto out;
489 }
491 info->minor = minor;
492 /*
493 * Make sure that we have a minor before others can
494 * see us.
495 */
496 wmb();
497 tapfds[minor] = info;
499 if ((class = get_xen_class()) != NULL)
500 class_device_create(class, NULL,
501 MKDEV(blktap_major, minor), NULL,
502 "blktap%d", minor);
503 }
505 out:
506 return info;
507 }
509 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
510 {
511 tap_blkif_t *info;
512 int i;
514 for (i = 1; i < blktap_next_minor; i++) {
515 info = tapfds[i];
516 if ( info &&
517 (info->trans.domid == domid) &&
518 (info->trans.busid == xenbus_id) ) {
519 info->blkif = blkif;
520 info->status = RUNNING;
521 return i;
522 }
523 }
524 return -1;
525 }
527 void signal_tapdisk(int idx)
528 {
529 tap_blkif_t *info;
530 struct task_struct *ptask;
532 /*
533 * if the userland tools set things up wrong, this could be negative;
534 * just don't try to signal in this case
535 */
536 if (idx < 0)
537 return;
539 info = tapfds[idx];
540 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
541 return;
543 if (info->pid > 0) {
544 ptask = find_task_by_pid(info->pid);
545 if (ptask)
546 info->status = CLEANSHUTDOWN;
547 }
548 info->blkif = NULL;
550 return;
551 }
553 static int blktap_open(struct inode *inode, struct file *filp)
554 {
555 blkif_sring_t *sring;
556 int idx = iminor(inode) - BLKTAP_MINOR;
557 tap_blkif_t *info;
558 int i;
560 /* ctrl device, treat differently */
561 if (!idx)
562 return 0;
564 info = tapfds[idx];
566 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) {
567 WPRINTK("Unable to open device /dev/xen/blktap%d\n",
568 idx);
569 return -ENODEV;
570 }
572 DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
574 /*Only one process can access device at a time*/
575 if (test_and_set_bit(0, &info->dev_inuse))
576 return -EBUSY;
578 info->dev_pending = 0;
580 /* Allocate the fe ring. */
581 sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
582 if (sring == NULL)
583 goto fail_nomem;
585 SetPageReserved(virt_to_page(sring));
587 SHARED_RING_INIT(sring);
588 FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
590 filp->private_data = info;
591 info->vma = NULL;
593 info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS,
594 GFP_KERNEL);
596 if (info->idx_map == NULL)
597 goto fail_nomem;
599 if (idx > 0) {
600 init_waitqueue_head(&info->wait);
601 for (i = 0; i < MAX_PENDING_REQS; i++)
602 info->idx_map[i] = INVALID_REQ;
603 }
605 DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
606 return 0;
608 fail_nomem:
609 return -ENOMEM;
610 }
612 static int blktap_release(struct inode *inode, struct file *filp)
613 {
614 tap_blkif_t *info = filp->private_data;
616 /* check for control device */
617 if (!info)
618 return 0;
620 info->dev_inuse = 0;
621 DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
623 /* Free the ring page. */
624 ClearPageReserved(virt_to_page(info->ufe_ring.sring));
625 free_page((unsigned long) info->ufe_ring.sring);
627 if (info->idx_map) {
628 kfree(info->idx_map);
629 info->idx_map = NULL;
630 }
632 if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) {
633 if (info->blkif->xenblkd != NULL) {
634 kthread_stop(info->blkif->xenblkd);
635 info->blkif->xenblkd = NULL;
636 }
637 info->status = CLEANSHUTDOWN;
638 }
640 return 0;
641 }
644 /* Note on mmap:
645 * We need to map pages to user space in a way that will allow the block
646 * subsystem set up direct IO to them. This couldn't be done before, because
647 * there isn't really a sane way to translate a user virtual address down to a
648 * physical address when the page belongs to another domain.
649 *
650 * My first approach was to map the page in to kernel memory, add an entry
651 * for it in the physical frame list (using alloc_lomem_region as in blkback)
652 * and then attempt to map that page up to user space. This is disallowed
653 * by xen though, which realizes that we don't really own the machine frame
654 * underlying the physical page.
655 *
656 * The new approach is to provide explicit support for this in xen linux.
657 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
658 * mapped from other vms. vma->vm_private_data is set up as a mapping
659 * from pages to actual page structs. There is a new clause in get_user_pages
660 * that does the right thing for this sort of mapping.
661 */
662 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
663 {
664 int size;
665 struct tap_vma_priv *priv;
666 tap_blkif_t *info = filp->private_data;
667 int ret;
669 if (info == NULL) {
670 WPRINTK("blktap: mmap, retrieving idx failed\n");
671 return -ENOMEM;
672 }
674 vma->vm_flags |= VM_RESERVED;
675 vma->vm_ops = &blktap_vm_ops;
677 size = vma->vm_end - vma->vm_start;
678 if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
679 WPRINTK("you _must_ map exactly %d pages!\n",
680 mmap_pages + RING_PAGES);
681 return -EAGAIN;
682 }
684 size >>= PAGE_SHIFT;
685 info->rings_vstart = vma->vm_start;
686 info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
688 /* Map the ring pages to the start of the region and reserve it. */
689 if (xen_feature(XENFEAT_auto_translated_physmap))
690 ret = vm_insert_page(vma, vma->vm_start,
691 virt_to_page(info->ufe_ring.sring));
692 else
693 ret = remap_pfn_range(vma, vma->vm_start,
694 __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
695 PAGE_SIZE, vma->vm_page_prot);
696 if (ret) {
697 WPRINTK("Mapping user ring failed!\n");
698 goto fail;
699 }
701 /* Mark this VM as containing foreign pages, and set up mappings. */
702 priv = kzalloc(sizeof(*priv) + ((vma->vm_end - vma->vm_start)
703 >> PAGE_SHIFT) * sizeof(*priv->map),
704 GFP_KERNEL);
705 if (priv == NULL) {
706 WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
707 goto fail;
708 }
709 priv->info = info;
711 vma->vm_private_data = priv;
712 vma->vm_flags |= VM_FOREIGN;
713 vma->vm_flags |= VM_DONTCOPY;
715 #ifdef CONFIG_X86
716 vma->vm_mm->context.has_foreign_mappings = 1;
717 #endif
719 info->vma = vma;
720 info->ring_ok = 1;
721 return 0;
722 fail:
723 /* Clear any active mappings. */
724 zap_page_range(vma, vma->vm_start,
725 vma->vm_end - vma->vm_start, NULL);
727 return -ENOMEM;
728 }
731 static int blktap_ioctl(struct inode *inode, struct file *filp,
732 unsigned int cmd, unsigned long arg)
733 {
734 tap_blkif_t *info = filp->private_data;
736 switch(cmd) {
737 case BLKTAP_IOCTL_KICK_FE:
738 {
739 /* There are fe messages to process. */
740 return blktap_read_ufe_ring(info);
741 }
742 case BLKTAP_IOCTL_SETMODE:
743 {
744 if (info) {
745 if (BLKTAP_MODE_VALID(arg)) {
746 info->mode = arg;
747 /* XXX: may need to flush rings here. */
748 DPRINTK("blktap: set mode to %lx\n",
749 arg);
750 return 0;
751 }
752 }
753 return 0;
754 }
755 case BLKTAP_IOCTL_PRINT_IDXS:
756 {
757 if (info) {
758 printk("User Rings: \n-----------\n");
759 printk("UF: rsp_cons: %2d, req_prod_prv: %2d "
760 "| req_prod: %2d, rsp_prod: %2d\n",
761 info->ufe_ring.rsp_cons,
762 info->ufe_ring.req_prod_pvt,
763 info->ufe_ring.sring->req_prod,
764 info->ufe_ring.sring->rsp_prod);
765 }
766 return 0;
767 }
768 case BLKTAP_IOCTL_SENDPID:
769 {
770 if (info) {
771 info->pid = (pid_t)arg;
772 DPRINTK("blktap: pid received %d\n",
773 info->pid);
774 }
775 return 0;
776 }
777 case BLKTAP_IOCTL_NEWINTF:
778 {
779 uint64_t val = (uint64_t)arg;
780 domid_translate_t *tr = (domid_translate_t *)&val;
782 DPRINTK("NEWINTF Req for domid %d and bus id %d\n",
783 tr->domid, tr->busid);
784 info = get_next_free_dev();
785 if (!info) {
786 WPRINTK("Error initialising /dev/xen/blktap - "
787 "No more devices\n");
788 return -1;
789 }
790 info->trans.domid = tr->domid;
791 info->trans.busid = tr->busid;
792 return info->minor;
793 }
794 case BLKTAP_IOCTL_NEWINTF_EXT:
795 {
796 void __user *udata = (void __user *) arg;
797 domid_translate_ext_t tr;
799 if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t)))
800 return -EFAULT;
802 DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n",
803 tr.domid, tr.busid);
804 info = get_next_free_dev();
805 if (!info) {
806 WPRINTK("Error initialising /dev/xen/blktap - "
807 "No more devices\n");
808 return -1;
809 }
810 info->trans.domid = tr.domid;
811 info->trans.busid = tr.busid;
812 return info->minor;
813 }
814 case BLKTAP_IOCTL_FREEINTF:
815 {
816 unsigned long dev = arg;
817 unsigned long flags;
819 info = tapfds[dev];
821 if ((dev > MAX_TAP_DEV) || !info)
822 return 0; /* should this be an error? */
824 spin_lock_irqsave(&pending_free_lock, flags);
825 if (info->dev_pending)
826 info->dev_pending = 0;
827 spin_unlock_irqrestore(&pending_free_lock, flags);
829 return 0;
830 }
831 case BLKTAP_IOCTL_MINOR:
832 {
833 unsigned long dev = arg;
835 info = tapfds[dev];
837 if ((dev > MAX_TAP_DEV) || !info)
838 return -EINVAL;
840 return info->minor;
841 }
842 case BLKTAP_IOCTL_MAJOR:
843 return blktap_major;
845 case BLKTAP_QUERY_ALLOC_REQS:
846 {
847 WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n",
848 alloc_pending_reqs, blkif_reqs);
849 return (alloc_pending_reqs/blkif_reqs) * 100;
850 }
851 }
852 return -ENOIOCTLCMD;
853 }
855 static unsigned int blktap_poll(struct file *filp, poll_table *wait)
856 {
857 tap_blkif_t *info = filp->private_data;
859 /* do not work on the control device */
860 if (!info)
861 return 0;
863 poll_wait(filp, &info->wait, wait);
864 if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) {
865 RING_PUSH_REQUESTS(&info->ufe_ring);
866 return POLLIN | POLLRDNORM;
867 }
868 return 0;
869 }
871 void blktap_kick_user(int idx)
872 {
873 tap_blkif_t *info;
875 info = tapfds[idx];
877 if ((idx < 0) || (idx > MAX_TAP_DEV) || !info)
878 return;
880 wake_up_interruptible(&info->wait);
882 return;
883 }
885 static int do_block_io_op(blkif_t *blkif);
886 static void dispatch_rw_block_io(blkif_t *blkif,
887 blkif_request_t *req,
888 pending_req_t *pending_req);
889 static void make_response(blkif_t *blkif, u64 id,
890 unsigned short op, int st);
892 /******************************************************************
893 * misc small helpers
894 */
895 static int req_increase(void)
896 {
897 int i, j;
899 if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock)
900 return -EINVAL;
902 pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t)
903 * blkif_reqs, GFP_KERNEL);
904 foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
906 if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
907 goto out_of_memory;
909 DPRINTK("%s: reqs=%d, pages=%d\n",
910 __FUNCTION__, blkif_reqs, mmap_pages);
912 for (i = 0; i < MAX_PENDING_REQS; i++) {
913 list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
914 &pending_free);
915 pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
916 for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
917 BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
918 i, j));
919 }
921 mmap_alloc++;
922 DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
923 return 0;
925 out_of_memory:
926 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
927 kfree(pending_reqs[mmap_alloc]);
928 WPRINTK("%s: out of memory\n", __FUNCTION__);
929 return -ENOMEM;
930 }
932 static void mmap_req_del(int mmap)
933 {
934 BUG_ON(!spin_is_locked(&pending_free_lock));
936 kfree(pending_reqs[mmap]);
937 pending_reqs[mmap] = NULL;
939 free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
940 foreign_pages[mmap] = NULL;
942 mmap_lock = 0;
943 DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
944 mmap_alloc--;
945 }
947 static pending_req_t* alloc_req(void)
948 {
949 pending_req_t *req = NULL;
950 unsigned long flags;
952 spin_lock_irqsave(&pending_free_lock, flags);
954 if (!list_empty(&pending_free)) {
955 req = list_entry(pending_free.next, pending_req_t, free_list);
956 list_del(&req->free_list);
957 }
959 if (req) {
960 req->inuse = 1;
961 alloc_pending_reqs++;
962 }
963 spin_unlock_irqrestore(&pending_free_lock, flags);
965 return req;
966 }
968 static void free_req(pending_req_t *req)
969 {
970 unsigned long flags;
971 int was_empty;
973 spin_lock_irqsave(&pending_free_lock, flags);
975 alloc_pending_reqs--;
976 req->inuse = 0;
977 if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
978 mmap_inuse--;
979 if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1);
980 spin_unlock_irqrestore(&pending_free_lock, flags);
981 return;
982 }
983 was_empty = list_empty(&pending_free);
984 list_add(&req->free_list, &pending_free);
986 spin_unlock_irqrestore(&pending_free_lock, flags);
988 if (was_empty)
989 wake_up(&pending_free_wq);
990 }
992 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
993 int tapidx)
994 {
995 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
996 unsigned int i, invcount = 0, locked = 0;
997 struct grant_handle_pair *khandle;
998 uint64_t ptep;
999 int ret, mmap_idx;
1000 unsigned long kvaddr, uvaddr;
1001 tap_blkif_t *info;
1002 struct mm_struct *mm;
1005 info = tapfds[tapidx];
1007 if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) {
1008 WPRINTK("fast_flush: Couldn't get info!\n");
1009 return;
1010 }
1012 mm = info->vma ? info->vma->vm_mm : NULL;
1014 if (info->vma != NULL &&
1015 xen_feature(XENFEAT_auto_translated_physmap)) {
1016 down_write(&mm->mmap_sem);
1017 zap_page_range(info->vma,
1018 MMAP_VADDR(info->user_vstart, u_idx, 0),
1019 req->nr_pages << PAGE_SHIFT, NULL);
1020 up_write(&mm->mmap_sem);
1021 return;
1022 }
1024 mmap_idx = req->mem_idx;
1026 for (i = 0; i < req->nr_pages; i++) {
1027 kvaddr = idx_to_kaddr(mmap_idx, k_idx, i);
1028 uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
1030 khandle = &pending_handle(mmap_idx, k_idx, i);
1032 if (khandle->kernel != INVALID_GRANT_HANDLE) {
1033 gnttab_set_unmap_op(&unmap[invcount],
1034 idx_to_kaddr(mmap_idx, k_idx, i),
1035 GNTMAP_host_map, khandle->kernel);
1036 invcount++;
1038 set_phys_to_machine(
1039 __pa(idx_to_kaddr(mmap_idx, k_idx, i))
1040 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
1041 }
1043 if (khandle->user != INVALID_GRANT_HANDLE) {
1044 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
1045 if (!locked++)
1046 down_write(&mm->mmap_sem);
1047 if (create_lookup_pte_addr(
1048 mm,
1049 MMAP_VADDR(info->user_vstart, u_idx, i),
1050 &ptep) !=0) {
1051 up_write(&mm->mmap_sem);
1052 WPRINTK("Couldn't get a pte addr!\n");
1053 return;
1054 }
1056 gnttab_set_unmap_op(&unmap[invcount], ptep,
1057 GNTMAP_host_map
1058 | GNTMAP_application_map
1059 | GNTMAP_contains_pte,
1060 khandle->user);
1061 invcount++;
1062 }
1064 BLKTAP_INVALIDATE_HANDLE(khandle);
1065 }
1066 ret = HYPERVISOR_grant_table_op(
1067 GNTTABOP_unmap_grant_ref, unmap, invcount);
1068 BUG_ON(ret);
1070 if (info->vma != NULL &&
1071 !xen_feature(XENFEAT_auto_translated_physmap)) {
1072 if (!locked++)
1073 down_write(&mm->mmap_sem);
1074 zap_page_range(info->vma,
1075 MMAP_VADDR(info->user_vstart, u_idx, 0),
1076 req->nr_pages << PAGE_SHIFT, NULL);
1077 }
1079 if (locked)
1080 up_write(&mm->mmap_sem);
1081 }
1083 /******************************************************************
1084 * SCHEDULER FUNCTIONS
1085 */
1087 static void print_stats(blkif_t *blkif)
1088 {
1089 printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
1090 current->comm, blkif->st_oo_req,
1091 blkif->st_rd_req, blkif->st_wr_req);
1092 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
1093 blkif->st_rd_req = 0;
1094 blkif->st_wr_req = 0;
1095 blkif->st_oo_req = 0;
1096 }
1098 int tap_blkif_schedule(void *arg)
1099 {
1100 blkif_t *blkif = arg;
1102 blkif_get(blkif);
1104 if (debug_lvl)
1105 printk(KERN_DEBUG "%s: started\n", current->comm);
1107 while (!kthread_should_stop()) {
1108 if (try_to_freeze())
1109 continue;
1111 wait_event_interruptible(
1112 blkif->wq,
1113 blkif->waiting_reqs || kthread_should_stop());
1114 wait_event_interruptible(
1115 pending_free_wq,
1116 !list_empty(&pending_free) || kthread_should_stop());
1118 blkif->waiting_reqs = 0;
1119 smp_mb(); /* clear flag *before* checking for work */
1121 if (do_block_io_op(blkif))
1122 blkif->waiting_reqs = 1;
1124 if (log_stats && time_after(jiffies, blkif->st_print))
1125 print_stats(blkif);
1126 }
1128 if (log_stats)
1129 print_stats(blkif);
1130 if (debug_lvl)
1131 printk(KERN_DEBUG "%s: exiting\n", current->comm);
1133 blkif->xenblkd = NULL;
1134 blkif_put(blkif);
1136 return 0;
1137 }
1139 /******************************************************************
1140 * COMPLETION CALLBACK -- Called by user level ioctl()
1141 */
1143 static int blktap_read_ufe_ring(tap_blkif_t *info)
1144 {
1145 /* This is called to read responses from the UFE ring. */
1146 RING_IDX i, j, rp;
1147 blkif_response_t *resp;
1148 blkif_t *blkif=NULL;
1149 int pending_idx, usr_idx, mmap_idx;
1150 pending_req_t *pending_req;
1152 if (!info)
1153 return 0;
1155 /* We currently only forward packets in INTERCEPT_FE mode. */
1156 if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
1157 return 0;
1159 /* for each outstanding message on the UFEring */
1160 rp = info->ufe_ring.sring->rsp_prod;
1161 rmb();
1163 for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
1164 blkif_response_t res;
1165 resp = RING_GET_RESPONSE(&info->ufe_ring, i);
1166 memcpy(&res, resp, sizeof(res));
1167 mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
1168 ++info->ufe_ring.rsp_cons;
1170 /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
1171 usr_idx = (int)res.id;
1172 pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx]));
1173 mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]);
1175 if ( (mmap_idx >= mmap_alloc) ||
1176 (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) )
1177 WPRINTK("Incorrect req map"
1178 "[%d], internal map [%d,%d (%d)]\n",
1179 usr_idx, mmap_idx,
1180 ID_TO_IDX(info->idx_map[usr_idx]),
1181 MASK_PEND_IDX(
1182 ID_TO_IDX(info->idx_map[usr_idx])));
1184 pending_req = &pending_reqs[mmap_idx][pending_idx];
1185 blkif = pending_req->blkif;
1187 for (j = 0; j < pending_req->nr_pages; j++) {
1189 unsigned long kvaddr, uvaddr;
1190 struct tap_vma_priv *priv = info->vma->vm_private_data;
1191 struct page *pg;
1192 int offset;
1194 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
1195 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j);
1197 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1198 ClearPageReserved(pg);
1199 offset = (uvaddr - info->vma->vm_start)
1200 >> PAGE_SHIFT;
1201 priv->map[offset] = NULL;
1202 }
1203 fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
1204 info->idx_map[usr_idx] = INVALID_REQ;
1205 make_response(blkif, pending_req->id, res.operation,
1206 res.status);
1207 blkif_put(pending_req->blkif);
1208 free_req(pending_req);
1209 }
1211 return 0;
1212 }
1215 /******************************************************************************
1216 * NOTIFICATION FROM GUEST OS.
1217 */
1219 static void blkif_notify_work(blkif_t *blkif)
1220 {
1221 blkif->waiting_reqs = 1;
1222 wake_up(&blkif->wq);
1223 }
1225 irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
1226 {
1227 blkif_notify_work(dev_id);
1228 return IRQ_HANDLED;
1229 }
1233 /******************************************************************
1234 * DOWNWARD CALLS -- These interface with the block-device layer proper.
1235 */
1236 static int print_dbug = 1;
1237 static int do_block_io_op(blkif_t *blkif)
1238 {
1239 blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1240 blkif_request_t req;
1241 pending_req_t *pending_req;
1242 RING_IDX rc, rp;
1243 int more_to_do = 0;
1244 tap_blkif_t *info;
1246 rc = blk_rings->common.req_cons;
1247 rp = blk_rings->common.sring->req_prod;
1248 rmb(); /* Ensure we see queued requests up to 'rp'. */
1250 /*Check blkif has corresponding UE ring*/
1251 if (blkif->dev_num < 0) {
1252 /*oops*/
1253 if (print_dbug) {
1254 WPRINTK("Corresponding UE "
1255 "ring does not exist!\n");
1256 print_dbug = 0; /*We only print this message once*/
1257 }
1258 return 0;
1259 }
1261 info = tapfds[blkif->dev_num];
1263 if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
1264 if (print_dbug) {
1265 WPRINTK("Can't get UE info!\n");
1266 print_dbug = 0;
1267 }
1268 return 0;
1269 }
1271 while (rc != rp) {
1273 if (RING_FULL(&info->ufe_ring)) {
1274 WPRINTK("RING_FULL! More to do\n");
1275 more_to_do = 1;
1276 break;
1277 }
1279 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
1280 WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
1281 " More to do\n");
1282 more_to_do = 1;
1283 break;
1284 }
1286 if (kthread_should_stop()) {
1287 more_to_do = 1;
1288 break;
1289 }
1291 pending_req = alloc_req();
1292 if (NULL == pending_req) {
1293 blkif->st_oo_req++;
1294 more_to_do = 1;
1295 break;
1296 }
1298 switch (blkif->blk_protocol) {
1299 case BLKIF_PROTOCOL_NATIVE:
1300 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
1301 sizeof(req));
1302 break;
1303 case BLKIF_PROTOCOL_X86_32:
1304 blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1305 break;
1306 case BLKIF_PROTOCOL_X86_64:
1307 blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1308 break;
1309 default:
1310 BUG();
1311 }
1312 blk_rings->common.req_cons = ++rc; /* before make_response() */
1314 /* Apply all sanity checks to /private copy/ of request. */
1315 barrier();
1317 switch (req.operation) {
1318 case BLKIF_OP_READ:
1319 blkif->st_rd_req++;
1320 dispatch_rw_block_io(blkif, &req, pending_req);
1321 break;
1323 case BLKIF_OP_WRITE:
1324 blkif->st_wr_req++;
1325 dispatch_rw_block_io(blkif, &req, pending_req);
1326 break;
1328 default:
1329 /* A good sign something is wrong: sleep for a while to
1330 * avoid excessive CPU consumption by a bad guest. */
1331 msleep(1);
1332 WPRINTK("unknown operation [%d]\n",
1333 req.operation);
1334 make_response(blkif, req.id, req.operation,
1335 BLKIF_RSP_ERROR);
1336 free_req(pending_req);
1337 break;
1338 }
1340 /* Yield point for this unbounded loop. */
1341 cond_resched();
1342 }
1344 blktap_kick_user(blkif->dev_num);
1346 return more_to_do;
1347 }
1349 static void dispatch_rw_block_io(blkif_t *blkif,
1350 blkif_request_t *req,
1351 pending_req_t *pending_req)
1352 {
1353 extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
1354 int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
1355 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
1356 unsigned int nseg;
1357 int ret, i, nr_sects = 0;
1358 tap_blkif_t *info;
1359 struct tap_vma_priv *priv;
1360 blkif_request_t *target;
1361 int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
1362 int usr_idx;
1363 uint16_t mmap_idx = pending_req->mem_idx;
1364 struct mm_struct *mm;
1366 if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
1367 goto fail_response;
1369 info = tapfds[blkif->dev_num];
1370 if (info == NULL)
1371 goto fail_response;
1373 /* Check we have space on user ring - should never fail. */
1374 usr_idx = GET_NEXT_REQ(info->idx_map);
1375 if (usr_idx == INVALID_REQ) {
1376 BUG();
1377 goto fail_response;
1378 }
1380 /* Check that number of segments is sane. */
1381 nseg = req->nr_segments;
1382 if ( unlikely(nseg == 0) ||
1383 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
1384 WPRINTK("Bad number of segments in request (%d)\n", nseg);
1385 goto fail_response;
1386 }
1388 /* Make sure userspace is ready. */
1389 if (!info->ring_ok) {
1390 WPRINTK("blktap: ring not ready for requests!\n");
1391 goto fail_response;
1392 }
1394 if (RING_FULL(&info->ufe_ring)) {
1395 WPRINTK("blktap: fe_ring is full, can't add "
1396 "IO Request will be dropped. %d %d\n",
1397 RING_SIZE(&info->ufe_ring),
1398 RING_SIZE(&blkif->blk_rings.common));
1399 goto fail_response;
1400 }
1402 pending_req->blkif = blkif;
1403 pending_req->id = req->id;
1404 pending_req->operation = operation;
1405 pending_req->status = BLKIF_RSP_OKAY;
1406 pending_req->nr_pages = nseg;
1407 op = 0;
1408 priv = info->vma->vm_private_data;
1409 mm = info->vma->vm_mm;
1410 if (!xen_feature(XENFEAT_auto_translated_physmap))
1411 down_write(&mm->mmap_sem);
1412 for (i = 0; i < nseg; i++) {
1413 unsigned long uvaddr;
1414 unsigned long kvaddr;
1415 uint64_t ptep;
1416 uint32_t flags;
1418 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1419 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1421 flags = GNTMAP_host_map;
1422 if (operation == WRITE)
1423 flags |= GNTMAP_readonly;
1424 gnttab_set_map_op(&map[op], kvaddr, flags,
1425 req->seg[i].gref, blkif->domid);
1426 op++;
1428 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1429 /* Now map it to user. */
1430 ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
1431 if (ret) {
1432 up_write(&mm->mmap_sem);
1433 WPRINTK("Couldn't get a pte addr!\n");
1434 goto fail_flush;
1435 }
1437 flags = GNTMAP_host_map | GNTMAP_application_map
1438 | GNTMAP_contains_pte;
1439 if (operation == WRITE)
1440 flags |= GNTMAP_readonly;
1441 gnttab_set_map_op(&map[op], ptep, flags,
1442 req->seg[i].gref, blkif->domid);
1443 op++;
1444 }
1446 nr_sects += (req->seg[i].last_sect -
1447 req->seg[i].first_sect + 1);
1448 }
1450 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
1451 BUG_ON(ret);
1453 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1454 up_write(&mm->mmap_sem);
1456 for (i = 0; i < (nseg*2); i+=2) {
1457 unsigned long uvaddr;
1458 unsigned long kvaddr;
1459 unsigned long offset;
1460 struct page *pg;
1462 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
1463 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2);
1465 if (unlikely(map[i].status != 0)) {
1466 WPRINTK("invalid kernel buffer -- "
1467 "could not remap it\n");
1468 ret |= 1;
1469 map[i].handle = INVALID_GRANT_HANDLE;
1470 }
1472 if (unlikely(map[i+1].status != 0)) {
1473 WPRINTK("invalid user buffer -- "
1474 "could not remap it\n");
1475 ret |= 1;
1476 map[i+1].handle = INVALID_GRANT_HANDLE;
1477 }
1479 pending_handle(mmap_idx, pending_idx, i/2).kernel
1480 = map[i].handle;
1481 pending_handle(mmap_idx, pending_idx, i/2).user
1482 = map[i+1].handle;
1484 if (ret)
1485 continue;
1487 set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
1488 FOREIGN_FRAME(map[i].dev_bus_addr
1489 >> PAGE_SHIFT));
1490 offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1491 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1492 priv->map[offset] = pg;
1493 }
1494 } else {
1495 for (i = 0; i < nseg; i++) {
1496 unsigned long uvaddr;
1497 unsigned long kvaddr;
1498 unsigned long offset;
1499 struct page *pg;
1501 uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
1502 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1504 if (unlikely(map[i].status != 0)) {
1505 WPRINTK("invalid kernel buffer -- "
1506 "could not remap it\n");
1507 ret |= 1;
1508 map[i].handle = INVALID_GRANT_HANDLE;
1509 }
1511 pending_handle(mmap_idx, pending_idx, i).kernel
1512 = map[i].handle;
1514 if (ret)
1515 continue;
1517 offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT;
1518 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1519 priv->map[offset] = pg;
1520 }
1521 }
1523 if (ret)
1524 goto fail_flush;
1526 if (xen_feature(XENFEAT_auto_translated_physmap))
1527 down_write(&mm->mmap_sem);
1528 /* Mark mapped pages as reserved: */
1529 for (i = 0; i < req->nr_segments; i++) {
1530 unsigned long kvaddr;
1531 struct page *pg;
1533 kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
1534 pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
1535 SetPageReserved(pg);
1536 if (xen_feature(XENFEAT_auto_translated_physmap)) {
1537 ret = vm_insert_page(info->vma,
1538 MMAP_VADDR(info->user_vstart,
1539 usr_idx, i), pg);
1540 if (ret) {
1541 up_write(&mm->mmap_sem);
1542 goto fail_flush;
1543 }
1544 }
1545 }
1546 if (xen_feature(XENFEAT_auto_translated_physmap))
1547 up_write(&mm->mmap_sem);
1549 /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
1550 info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx);
1552 blkif_get(blkif);
1553 /* Finally, write the request message to the user ring. */
1554 target = RING_GET_REQUEST(&info->ufe_ring,
1555 info->ufe_ring.req_prod_pvt);
1556 memcpy(target, req, sizeof(*req));
1557 target->id = usr_idx;
1558 wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
1559 info->ufe_ring.req_prod_pvt++;
1561 if (operation == READ)
1562 blkif->st_rd_sect += nr_sects;
1563 else if (operation == WRITE)
1564 blkif->st_wr_sect += nr_sects;
1566 return;
1568 fail_flush:
1569 WPRINTK("Reached Fail_flush\n");
1570 fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
1571 fail_response:
1572 make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
1573 free_req(pending_req);
1574 msleep(1); /* back off a bit */
1575 }
1579 /******************************************************************
1580 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
1581 */
1584 static void make_response(blkif_t *blkif, u64 id,
1585 unsigned short op, int st)
1586 {
1587 blkif_response_t resp;
1588 unsigned long flags;
1589 blkif_back_rings_t *blk_rings = &blkif->blk_rings;
1590 int more_to_do = 0;
1591 int notify;
1593 resp.id = id;
1594 resp.operation = op;
1595 resp.status = st;
1597 spin_lock_irqsave(&blkif->blk_ring_lock, flags);
1598 /* Place on the response ring for the relevant domain. */
1599 switch (blkif->blk_protocol) {
1600 case BLKIF_PROTOCOL_NATIVE:
1601 memcpy(RING_GET_RESPONSE(&blk_rings->native,
1602 blk_rings->native.rsp_prod_pvt),
1603 &resp, sizeof(resp));
1604 break;
1605 case BLKIF_PROTOCOL_X86_32:
1606 memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
1607 blk_rings->x86_32.rsp_prod_pvt),
1608 &resp, sizeof(resp));
1609 break;
1610 case BLKIF_PROTOCOL_X86_64:
1611 memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
1612 blk_rings->x86_64.rsp_prod_pvt),
1613 &resp, sizeof(resp));
1614 break;
1615 default:
1616 BUG();
1617 }
1618 blk_rings->common.rsp_prod_pvt++;
1619 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1621 if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
1622 /*
1623 * Tail check for pending requests. Allows frontend to avoid
1624 * notifications if requests are already in flight (lower
1625 * overheads and promotes batching).
1626 */
1627 RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1628 } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
1629 more_to_do = 1;
1630 }
1632 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
1633 if (more_to_do)
1634 blkif_notify_work(blkif);
1635 if (notify)
1636 notify_remote_via_irq(blkif->irq);
1637 }
1639 static int __init blkif_init(void)
1640 {
1641 int i, ret;
1642 struct class *class;
1644 if (!is_running_on_xen())
1645 return -ENODEV;
1647 INIT_LIST_HEAD(&pending_free);
1648 for(i = 0; i < 2; i++) {
1649 ret = req_increase();
1650 if (ret)
1651 break;
1652 }
1653 if (i == 0)
1654 return ret;
1656 tap_blkif_interface_init();
1658 alloc_pending_reqs = 0;
1660 tap_blkif_xenbus_init();
1662 /* Dynamically allocate a major for this device */
1663 ret = register_chrdev(0, "blktap", &blktap_fops);
1665 if (ret < 0) {
1666 WPRINTK("Couldn't register /dev/xen/blktap\n");
1667 return -ENOMEM;
1668 }
1670 blktap_major = ret;
1672 /* tapfds[0] is always NULL */
1673 blktap_next_minor++;
1675 DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i);
1677 /* Make sure the xen class exists */
1678 if ((class = get_xen_class()) != NULL) {
1679 /*
1680 * This will allow udev to create the blktap ctrl device.
1681 * We only want to create blktap0 first. We don't want
1682 * to flood the sysfs system with needless blktap devices.
1683 * We only create the device when a request of a new device is
1684 * made.
1685 */
1686 class_device_create(class, NULL,
1687 MKDEV(blktap_major, 0), NULL,
1688 "blktap0");
1689 } else {
1690 /* this is bad, but not fatal */
1691 WPRINTK("blktap: sysfs xen_class not created\n");
1692 }
1694 DPRINTK("Blktap device successfully created\n");
1696 return 0;
1697 }
1699 module_init(blkif_init);
1701 MODULE_LICENSE("Dual BSD/GPL");