linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c @ 8478:9e03e60f2d46 (direct-io.hg)

convert latest old initializer to C99 initializer

Signed-off-by: Vincent Hanquez <vincent@xensource.com>
Author:   vhanquez@kneesa.uk.xensource.com
Date:     Tue Jan 03 16:57:41 2006 +0000 (2006-01-03)
Parents:  9270bb6b0421
Children: fd9b2c1bb577
/******************************************************************************
 * arch/xen/drivers/blkif/blktap/blktap.c
 *
 * This is a modified version of the block backend driver that remaps requests
 * to a user-space memory region.  It is intended to be used to write
 * application-level servers that provide block interfaces to client VMs.
 */

#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <asm-xen/balloon.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/gfp.h>
#include <linux/poll.h>
#include <asm/tlbflush.h>
#include "common.h"

/* Only one process may open /dev/xen/blktap at any time. */
static unsigned long blktap_dev_inuse;
unsigned long blktap_ring_ok; /* make this ring->state */

/* Rings up to user space. */
static blkif_front_ring_t blktap_ufe_ring;

/* for poll: */
static wait_queue_head_t blktap_wait;

/* current switching mode */
static unsigned long blktap_mode;

/* local prototypes */
static int blktap_read_ufe_ring(void);


/* /dev/xen/blktap resides at device number major=10, minor=202 */
#define BLKTAP_MINOR 202

/* blktap IOCTLs: */
#define BLKTAP_IOCTL_KICK_FE         1
#define BLKTAP_IOCTL_KICK_BE         2  /* currently unused */
#define BLKTAP_IOCTL_SETMODE         3
#define BLKTAP_IOCTL_PRINT_IDXS      100

/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)     */
#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default */
#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp. */
#define BLKTAP_MODE_COPY_FE          0x00000004  /* unimp. */
#define BLKTAP_MODE_COPY_BE          0x00000008  /* unimp. */
#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010  /* unimp. */
#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020  /* unimp. */

#define BLKTAP_MODE_INTERPOSE \
        (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)

#define BLKTAP_MODE_COPY_BOTH \
        (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)

#define BLKTAP_MODE_COPY_BOTH_PAGES \
        (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)

static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
        return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
                (arg == BLKTAP_MODE_INTERCEPT_FE) ||
                (arg == BLKTAP_MODE_INTERPOSE   ));
/*
        return (
                ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
                ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
                ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
                ( arg == BLKTAP_MODE_INTERPOSE    ) ||
                ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
                ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
                ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
                );
*/
}


/******************************************************************
 * MMAP REGION
 */

/*
 * We use a big chunk of address space to map in-flight requests into,
 * and export this region up to user-space.  See the comments in blkback
 * about this -- the two must be kept in sync if the tap is used as a
 * passthrough.
 */

#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

/* immediately before the mmap area, we have a bunch of pages reserved
 * for shared memory rings.
 */
#define RING_PAGES 1 /* Front */

/* Where things are inside the device mapping. */
struct vm_area_struct *blktap_vma = NULL;
unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
unsigned long rings_vstart; /* start of mmaped vma               */
unsigned long user_vstart;  /* start of user mappings            */

#define MMAP_PAGES \
        (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_start, _req, _seg)                                   \
        (_start +                                                        \
         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +         \
         ((_seg) * PAGE_SIZE))
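
/*
 * Worked example (illustrative only, assuming BLKIF_MAX_SEGMENTS_PER_REQUEST
 * is 11, as in the blkif headers this file builds against): the data area is
 * MAX_PENDING_REQS (64) slots of 11 pages each, so MMAP_PAGES = 704 and
 *
 *     MMAP_VADDR(user_vstart, 3, 2)
 *         = user_vstart + (3 * 11 * PAGE_SIZE) + (2 * PAGE_SIZE)
 *         = user_vstart + 35 * PAGE_SIZE
 *
 * i.e. request slot 3, segment 2 lives 35 pages into the user mapping.
 */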

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
        blkif_t       *blkif;
        unsigned long  id;
        int            nr_pages;
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
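
/*
 * Bookkeeping sketch (illustrative only): pending_ring[] holds the indices of
 * free pending_req slots.  A slot is claimed by reading
 * pending_ring[MASK_PEND_IDX(pending_cons)] and advancing pending_cons, and
 * returned by storing its index at pending_ring[MASK_PEND_IDX(pending_prod++)].
 * With pending_prod initialised to MAX_PENDING_REQS and pending_cons to 0
 * (see blkif_init() below), NR_PENDING_REQS starts at 0 and reaches
 * MAX_PENDING_REQS once every slot is in flight.
 */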

/* Requests passing through the tap to the backend hijack the id field
 * in the request message.  In it we put the AR index _AND_ the fe domid.
 * The domid is used by the backend to map the pages properly.
 */

static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
{
        return ((fe_dom << 16) | MASK_PEND_IDX(idx));
}

static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
{
        return (PEND_RING_IDX)(id & 0x0000ffff);
}

static inline domid_t ID_TO_DOM(unsigned long id)
{
        return (domid_t)(id >> 16);
}
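
/*
 * Example (illustrative only): MAKE_ID(5, 71) with MAX_PENDING_REQS == 64
 * masks the ring index to 71 & 63 == 7 and packs the pair as
 * (5 << 16) | 7 == 0x00050007; ID_TO_IDX() then recovers index 7 and
 * ID_TO_DOM() recovers frontend domain 5.
 */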


/******************************************************************
 * GRANT HANDLES
 */

/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
struct grant_handle_pair
{
        grant_handle_t kernel;
        grant_handle_t user;
};
static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
#define pending_handle(_idx, _i) \
        (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#define BLKTAP_INVALID_HANDLE(_g) \
        ((((_g)->kernel) == 0xFFFF) && (((_g)->user) == 0xFFFF))
#define BLKTAP_INVALIDATE_HANDLE(_g) do {           \
        (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
} while (0)


/******************************************************************
 * BLKTAP VM OPS
 */

static struct page *blktap_nopage(struct vm_area_struct *vma,
                                  unsigned long address,
                                  int *type)
{
        /*
         * if the page has not been mapped in by the driver then generate
         * a SIGBUS to the domain.
         */
        force_sig(SIGBUS, current);

        return NULL;
}

struct vm_operations_struct blktap_vm_ops = {
        .nopage = blktap_nopage,
};

/******************************************************************
 * BLKTAP FILE OPS
 */

static int blktap_open(struct inode *inode, struct file *filp)
{
        blkif_sring_t *sring;

        if (test_and_set_bit(0, &blktap_dev_inuse))
                return -EBUSY;

        /* Allocate the fe ring. */
        sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
        if (sring == NULL)
                return -ENOMEM;

        SetPageReserved(virt_to_page(sring));

        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);

        return 0;
}

static int blktap_release(struct inode *inode, struct file *filp)
{
        blktap_dev_inuse = 0;
        blktap_ring_ok = 0;

        /* Free the ring page. */
        ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
        free_page((unsigned long)blktap_ufe_ring.sring);

        /* Clear any active mappings and free foreign map table */
        if (blktap_vma != NULL) {
                zap_page_range(
                        blktap_vma, blktap_vma->vm_start,
                        blktap_vma->vm_end - blktap_vma->vm_start, NULL);
                blktap_vma = NULL;
        }

        return 0;
}


/* Note on mmap:
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them.  This couldn't be done before,
 * because there isn't really a sane way to translate a user virtual address
 * down to a physical address when the page belongs to another domain.
 *
 * My first approach was to map the page in to kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space.  This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other vms.  vma->vm_private_data is set up as a mapping
 * from pages to actual page structs.  There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
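
/*
 * Sketch of the lookup that VM_FOREIGN enables (illustrative only): with
 * vm_private_data pointing at an array of struct page pointers, one entry per
 * page of the VMA, turning a user virtual address inside this VMA into its
 * struct page reduces to
 *
 *     offset = (uvaddr - vma->vm_start) >> PAGE_SHIFT;
 *     page   = ((struct page **)vma->vm_private_data)[offset];
 *
 * which is what the get_user_pages() clause mentioned above and the
 * map-management code later in this file rely on.
 */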
static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
{
        int size;
        struct page **map;
        int i;

        DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
                vma->vm_start, vma->vm_end);

        vma->vm_flags |= VM_RESERVED;
        vma->vm_ops = &blktap_vm_ops;

        size = vma->vm_end - vma->vm_start;
        if (size != ((MMAP_PAGES + RING_PAGES) << PAGE_SHIFT)) {
                printk(KERN_INFO
                       "blktap: you _must_ map exactly %d pages!\n",
                       MMAP_PAGES + RING_PAGES);
                return -EAGAIN;
        }

        size >>= PAGE_SHIFT;
        DPRINTK(KERN_INFO "blktap: %d ring page(s) + %d data pages.\n",
                RING_PAGES, size - RING_PAGES);

        rings_vstart = vma->vm_start;
        user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT);

        /* Map the ring pages to the start of the region and reserve it. */

        /* not sure if I really need to do this... */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

        if (remap_pfn_range(vma, vma->vm_start,
                            __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT,
                            PAGE_SIZE, vma->vm_page_prot)) {
                WPRINTK("Mapping user ring failed!\n");
                goto fail;
        }

        /* Mark this VM as containing foreign pages, and set up mappings. */
        map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
                      * sizeof(struct page *),
                      GFP_KERNEL);
        if (map == NULL) {
                WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
                goto fail;
        }

        for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
                map[i] = NULL;

        vma->vm_private_data = map;
        vma->vm_flags |= VM_FOREIGN;

        blktap_vma = vma;
        blktap_ring_ok = 1;

        return 0;

fail:
        /* Clear any active mappings. */
        zap_page_range(vma, vma->vm_start,
                       vma->vm_end - vma->vm_start, NULL);

        return -ENOMEM;
}

static int blktap_ioctl(struct inode *inode, struct file *filp,
                        unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
                return blktap_read_ufe_ring();

        case BLKTAP_IOCTL_SETMODE:
                if (BLKTAP_MODE_VALID(arg)) {
                        blktap_mode = arg;
                        /* XXX: may need to flush rings here. */
                        printk(KERN_INFO "blktap: set mode to %lx\n", arg);
                        return 0;
                }
                return -EINVAL; /* invalid mode; don't fall through */

        case BLKTAP_IOCTL_PRINT_IDXS:
        {
                //print_fe_ring_idxs();
                WPRINTK("User Rings: \n-----------\n");
                WPRINTK("UF: rsp_cons: %2d, req_prod_pvt: %2d "
                        "| req_prod: %2d, rsp_prod: %2d\n",
                        blktap_ufe_ring.rsp_cons,
                        blktap_ufe_ring.req_prod_pvt,
                        blktap_ufe_ring.sring->req_prod,
                        blktap_ufe_ring.sring->rsp_prod);

        }
        }
        return -ENOIOCTLCMD;
}

static unsigned int blktap_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &blktap_wait, wait);
        if (blktap_ufe_ring.req_prod_pvt != blktap_ufe_ring.sring->req_prod) {
                flush_tlb_all();
                RING_PUSH_REQUESTS(&blktap_ufe_ring);
                return POLLIN | POLLRDNORM;
        }

        return 0;
}

void blktap_kick_user(void)
{
        /* blktap_ring->req_prod = blktap_req_prod; */
        wake_up_interruptible(&blktap_wait);
}

static struct file_operations blktap_fops = {
        .owner   = THIS_MODULE,
        .poll    = blktap_poll,
        .ioctl   = blktap_ioctl,
        .open    = blktap_open,
        .release = blktap_release,
        .mmap    = blktap_mmap,
};
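
/*
 * Sketch of the user-space side of this interface (illustrative only; the
 * real consumer is the blktap user-space library in the Xen tools tree).
 * Error handling is omitted and the constants are the ones defined above:
 *
 *     int fd = open("/dev/xen/blktap", O_RDWR);
 *     size_t len = (MMAP_PAGES + RING_PAGES) * PAGE_SIZE;
 *     char *region = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                         MAP_SHARED, fd, 0);
 *     blkif_sring_t *sring = (blkif_sring_t *)region;   (page 0 is the ring)
 *     ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
 *
 *     struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *     while (poll(&pfd, 1, -1) > 0) {
 *             consume requests from 'sring', service them, queue responses
 *             on the same ring, then have the driver process them with
 *             ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);
 *     }
 */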


static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);


static void fast_flush_area(int idx, int nr_pages)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
        unsigned int i, op = 0;
        struct grant_handle_pair *handle;
        uint64_t ptep;
        int ret;

        for (i = 0; i < nr_pages; i++) {
                handle = &pending_handle(idx, i);
                if (BLKTAP_INVALID_HANDLE(handle))
                        continue;

                unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
                unmap[op].dev_bus_addr = 0;
                unmap[op].handle = handle->kernel;
                op++;

                if (create_lookup_pte_addr(
                        blktap_vma->vm_mm,
                        MMAP_VADDR(user_vstart, idx, i),
                        &ptep) != 0) {
                        DPRINTK("Couldn't get a pte addr!\n");
                        return;
                }
                unmap[op].host_addr = ptep;
                unmap[op].dev_bus_addr = 0;
                unmap[op].handle = handle->user;
                op++;

                BLKTAP_INVALIDATE_HANDLE(handle);
        }

        ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, op);
        BUG_ON(ret);

        if (blktap_vma != NULL)
                zap_page_range(blktap_vma,
                               MMAP_VADDR(user_vstart, idx, 0),
                               nr_pages << PAGE_SHIFT, NULL);
}

/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
        return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
        unsigned long flags;

        if (!__on_blkdev_list(blkif))
                return;

        spin_lock_irqsave(&blkio_schedule_list_lock, flags);
        if (__on_blkdev_list(blkif)) {
                list_del(&blkif->blkdev_list);
                blkif->blkdev_list.next = NULL;
                blkif_put(blkif);
        }
        spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
        unsigned long flags;

        if (__on_blkdev_list(blkif))
                return;

        spin_lock_irqsave(&blkio_schedule_list_lock, flags);
        if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
                list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
                blkif_get(blkif);
        }
        spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);

static int blkio_schedule(void *arg)
{
        DECLARE_WAITQUEUE(wq, current);

        blkif_t *blkif;
        struct list_head *ent;

        daemonize("xenblkd");

        for (;;) {
                /* Wait for work to do. */
                add_wait_queue(&blkio_schedule_wait, &wq);
                set_current_state(TASK_INTERRUPTIBLE);
                if ((NR_PENDING_REQS == MAX_PENDING_REQS) ||
                    list_empty(&blkio_schedule_list))
                        schedule();
                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&blkio_schedule_wait, &wq);

                /* Queue up a batch of requests. */
                while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
                       !list_empty(&blkio_schedule_list)) {
                        ent = blkio_schedule_list.next;
                        blkif = list_entry(ent, blkif_t, blkdev_list);
                        blkif_get(blkif);
                        remove_from_blkdev_list(blkif);
                        if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
                                add_to_blkdev_list_tail(blkif);
                        blkif_put(blkif);
                }
        }
}

static void maybe_trigger_blkio_schedule(void)
{
        /*
         * Needed so that two processes, who together make the following
         * predicate true, don't both read stale values and evaluate the
         * predicate incorrectly. Incredibly unlikely to stall the scheduler
         * on the x86, but...
         */
        smp_mb();

        if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
            !list_empty(&blkio_schedule_list))
                wake_up(&blkio_schedule_wait);
}


/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */


static int blktap_read_ufe_ring(void)
{
        /* This is called to read responses from the UFE ring. */

        RING_IDX i, j, rp;
        blkif_response_t *resp;
        blkif_t *blkif;
        int pending_idx;
        pending_req_t *pending_req;
        unsigned long flags;

        /* if we are forwarding from the UFE ring to the FE ring */
        if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {

                /* for each outstanding message on the UFE ring */
                rp = blktap_ufe_ring.sring->rsp_prod;
                rmb();

                for (i = blktap_ufe_ring.rsp_cons; i != rp; i++) {
                        resp = RING_GET_RESPONSE(&blktap_ufe_ring, i);
                        pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id));
                        pending_req = &pending_reqs[pending_idx];

                        blkif = pending_req->blkif;
                        for (j = 0; j < pending_req->nr_pages; j++) {
                                unsigned long vaddr;
                                struct page **map = blktap_vma->vm_private_data;
                                int offset;

                                vaddr = MMAP_VADDR(user_vstart, pending_idx, j);
                                offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;

                                //ClearPageReserved(virt_to_page(vaddr));
                                ClearPageReserved(map[offset]);
                                map[offset] = NULL;
                        }

                        fast_flush_area(pending_idx, pending_req->nr_pages);
                        make_response(blkif, pending_req->id, resp->operation,
                                      resp->status);
                        blkif_put(pending_req->blkif);
                        spin_lock_irqsave(&pend_prod_lock, flags);
                        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
                        spin_unlock_irqrestore(&pend_prod_lock, flags);
                }
                blktap_ufe_ring.rsp_cons = i;
                maybe_trigger_blkio_schedule();
        }
        return 0;
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
        blkif_t *blkif = dev_id;
        add_to_blkdev_list_tail(blkif);
        maybe_trigger_blkio_schedule();
        return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;
        blkif_request_t *req;
        RING_IDX i, rp;
        int more_to_do = 0;

        rp = blk_ring->sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        for (i = blk_ring->req_cons;
             (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
             i++) {
                if ((max_to_do-- == 0) ||
                    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
                        more_to_do = 1;
                        break;
                }

                req = RING_GET_REQUEST(blk_ring, i);
                switch (req->operation) {
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                        dispatch_rw_block_io(blkif, req);
                        break;

                default:
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req->operation);
                        make_response(blkif, req->id, req->operation,
                                      BLKIF_RSP_ERROR);
                        break;
                }
        }

        blk_ring->req_cons = i;
        blktap_kick_user();

        return more_to_do;
}

static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
        blkif_request_t *target;
        int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
        pending_req_t *pending_req;
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
        int op, ret;
        unsigned int nseg;
        int retval;

        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
                goto bad_descriptor;
        }

        /* Make sure userspace is ready. */
        if (!blktap_ring_ok) {
                DPRINTK("blktap: ring not ready for requests!\n");
                goto bad_descriptor;
        }


        if (RING_FULL(&blktap_ufe_ring)) {
                WPRINTK("blktap: fe_ring is full, can't add "
                        "(very broken!).\n");
                goto bad_descriptor;
        }

        flush_cache_all(); /* a noop on intel... */

        /* Map the foreign pages directly in to the application */
        op = 0;
        for (i = 0; i < req->nr_segments; i++) {

                unsigned long uvaddr;
                unsigned long kvaddr;
                uint64_t ptep;

                uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
                kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);

                /* Map the remote page to kernel. */
                map[op].host_addr = kvaddr;
                map[op].dom       = blkif->domid;
                map[op].ref       = req->seg[i].gref;
                map[op].flags     = GNTMAP_host_map;
                /* This needs a bit more thought in terms of interposition:
                 * If we want to be able to modify pages during write using
                 * grant table mappings, the guest will either need to allow
                 * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */
                if (req->operation == BLKIF_OP_WRITE)
                        map[op].flags |= GNTMAP_readonly;
                op++;

                /* Now map it to user. */
                ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
                if (ret) {
                        DPRINTK("Couldn't get a pte addr!\n");
                        fast_flush_area(pending_idx, req->nr_segments);
                        goto bad_descriptor;
                }

                map[op].host_addr = ptep;
                map[op].dom       = blkif->domid;
                map[op].ref       = req->seg[i].gref;
                map[op].flags     = GNTMAP_host_map | GNTMAP_application_map
                                    | GNTMAP_contains_pte;
                /* Above interposition comment applies here as well. */
                if (req->operation == BLKIF_OP_WRITE)
                        map[op].flags |= GNTMAP_readonly;
                op++;
        }

        retval = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
        BUG_ON(retval);

        op = 0;
        for (i = 0; i < (req->nr_segments*2); i += 2) {
                unsigned long uvaddr;
                unsigned long kvaddr;
                unsigned long offset;
                int cancel = 0;

                uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2);
                kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2);

                if (unlikely(map[i].status)) {
                        DPRINTK("Error on kernel grant mapping (%d)\n",
                                map[i].status);
                        ret = map[i].status;
                        cancel = 1;
                }

                if (unlikely(map[i+1].status)) {
                        DPRINTK("Error on user grant mapping (%d)\n",
                                map[i+1].status);
                        ret = map[i+1].status;
                        cancel = 1;
                }

                if (cancel) {
                        fast_flush_area(pending_idx, req->nr_segments);
                        goto bad_descriptor;
                }

                /* Set the necessary mappings in p2m and in the VM_FOREIGN
                 * vm_area_struct to allow user vaddr -> struct page lookups
                 * to work. This is needed for direct IO to foreign pages. */
                set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
                                    FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));

                offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
                ((struct page **)blktap_vma->vm_private_data)[offset] =
                        pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);

                /* Save handles for unmapping later. */
                pending_handle(pending_idx, i/2).kernel = map[i].handle;
                pending_handle(pending_idx, i/2).user   = map[i+1].handle;
        }

        /* Mark mapped pages as reserved: */
        for (i = 0; i < req->nr_segments; i++) {
                unsigned long kvaddr;
                kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
                SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
        }

        pending_req = &pending_reqs[pending_idx];
        pending_req->blkif     = blkif;
        pending_req->id        = req->id;
        pending_req->operation = req->operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;
        req->id = MAKE_ID(blkif->domid, pending_idx);
        //atomic_set(&pending_req->pendcnt, nbio);
        pending_cons++;
        blkif_get(blkif);

        /* Finally, write the request message to the user ring. */
        target = RING_GET_REQUEST(&blktap_ufe_ring,
                                  blktap_ufe_ring.req_prod_pvt);
        memcpy(target, req, sizeof(*req));
        blktap_ufe_ring.req_prod_pvt++;
        return;

bad_descriptor:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}


/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
        blkif_response_t *resp;
        unsigned long flags;
        blkif_back_ring_t *blk_ring = &blkif->blk_ring;

        /* Place on the response ring for the relevant domain. */
        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
        resp->id        = id;
        resp->operation = op;
        resp->status    = st;
        wmb(); /* Ensure other side can see the response fields. */
        blk_ring->rsp_prod_pvt++;
        RING_PUSH_RESPONSES(blk_ring);
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

        /* Kick the relevant domain. */
        notify_remote_via_irq(blkif->irq);
}

static struct miscdevice blktap_miscdev = {
        .minor      = BLKTAP_MINOR,
        .name       = "blktap",
        .fops       = &blktap_fops,
        .devfs_name = "misc/blktap",
};

void blkif_deschedule(blkif_t *blkif)
{
        remove_from_blkdev_list(blkif);
}

static int __init blkif_init(void)
{
        int i, j, err;
        struct page *page;

        blkif_interface_init();

        page = balloon_alloc_empty_page_range(MMAP_PAGES);
        BUG_ON(page == NULL);
        mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));

        pending_cons = 0;
        pending_prod = MAX_PENDING_REQS;
        memset(pending_reqs, 0, sizeof(pending_reqs));
        for (i = 0; i < MAX_PENDING_REQS; i++)
                pending_ring[i] = i;

        spin_lock_init(&blkio_schedule_list_lock);
        INIT_LIST_HEAD(&blkio_schedule_list);

        i = kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES);
        BUG_ON(i < 0);

        blkif_xenbus_init();

        for (i = 0; i < MAX_PENDING_REQS; i++)
                for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
                        BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));

        err = misc_register(&blktap_miscdev);
        if (err != 0) {
                printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n",
                       err);
                return err;
        }

        init_waitqueue_head(&blktap_wait);

        return 0;
}

__initcall(blkif_init);

/*
 * Local variables:
 * c-file-style: "linux"
 * indent-tabs-mode: t
 * c-indent-level: 8
 * c-basic-offset: 8
 * tab-width: 8
 * End:
 */