ia64/linux-2.6.18-xen.hg

view drivers/xen/blktap2/ring.c @ 878:eba6fe6d8d53

blktap2: a completely rewritten blktap implementation

Benefits to blktap2 over the old version of blktap:

* Isolation from xenstore - Blktap devices are now created directly on
the linux dom0 command line, rather than being spawned in response
to XenStore events. This is handy for debugging, makes blktap
generally easier to work with, and is a step toward a generic
user-level block device implementation that is not Xen-specific.

* Improved tapdisk infrastructure: simpler request forwarding, new
request scheduler, request merging, more efficient use of AIO.

* Improved tapdisk error handling and memory management. No
allocations on the block data path, IO retry logic to protect guests
from transient block device failures. This has been tested and is
known to work in weird environments such as NFS soft mounts.

* Pause and snapshot of live virtual disks (see xmsnap script).

* VHD support. The VHD code in this release has been rigorously
tested, and represents a very mature implementation of the VHD image
format.

* No more duplication of mechanism with blkback. The blktap kernel
module has changed dramatically from the original blktap. Blkback
is now always used to talk to Xen guests, blktap just presents a
Linux gendisk that blkback can export. This is done while
preserving the zero-copy data path from domU to physical device.

These patches deprecate the old blktap code, which can hopefully be
removed from the tree completely at some point in the future.

Signed-off-by: Jake Wires <jake.wires@citrix.com>
Signed-off-by: Dutch Meyer <dmeyer@cs.ubc.ca>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:23:16 2009 +0100 (2009-05-26)
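
For orientation, the sketch below shows roughly how a userspace tapdisk
process is expected to drive the ring device implemented in this file:
open the per-tap character node, mmap the shared ring plus the data area,
poll for kernel messages and new requests, and acknowledge completed
responses with BLKTAP2_IOCTL_KICK_FE. This is an illustrative sketch only;
the device path, the "blktap2-user.h" header name, and the RING_PAGES /
MMAP_PAGES values are assumptions -- the real definitions live in blktap.h
and the tapdisk sources.

    /* Hypothetical userspace sketch; not part of this changeset. */
    #include <fcntl.h>
    #include <poll.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include "blktap2-user.h"  /* assumed: BLKTAP2_IOCTL_*, RING_PAGES, MMAP_PAGES */

    int drive_ring(const char *devname)  /* e.g. "/dev/xen/blktap-2/blktap0" (assumed) */
    {
            int fd = open(devname, O_RDWR);
            if (fd < 0)
                    return -1;

            /* blktap_ring_mmap() insists on exactly RING_PAGES + MMAP_PAGES
             * pages: page 0 is the shared blkif ring, the rest receive the
             * grant-mapped request data. */
            size_t len = (RING_PAGES + MMAP_PAGES) * sysconf(_SC_PAGESIZE);
            void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (ring == MAP_FAILED)
                    return -1;

            for (;;) {
                    struct pollfd pfd = { .fd = fd, .events = POLLIN };
                    if (poll(&pfd, 1, -1) < 0)
                            break;
                    /* Consume requests from the shared ring, perform the I/O,
                     * post responses, then ask the kernel to reap them: */
                    ioctl(fd, BLKTAP2_IOCTL_KICK_FE, 0);
            }
            return 0;
    }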
line source
#include <linux/module.h>
#include <linux/signal.h>

#include "blktap.h"

static int blktap_ring_major;

static inline struct blktap *
vma_to_blktap(struct vm_area_struct *vma)
{
        struct vm_foreign_map *m = vma->vm_private_data;
        struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
        return container_of(r, struct blktap, ring);
}

/*
 * BLKTAP - immediately before the mmap area,
 * we have a bunch of pages reserved for shared memory rings.
 */
#define RING_PAGES 1
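
/*
 * Layout of the tapdisk mapping established by blktap_ring_mmap():
 *
 *   ring_vstart                       user_vstart
 *   |<- RING_PAGES: shared sring ->|<- MMAP_PAGES: request data ->|
 *
 * Page 0 holds the blkif_sring_t shared with tapdisk; the remaining
 * pages are grant-mapped per request, BLKIF_MAX_SEGMENTS_PER_REQUEST
 * pages for each pending request slot.
 */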
static int
blktap_read_ring(struct blktap *tap)
{
        /* This is called to read responses from the ring. */
        int usr_idx;
        RING_IDX rc, rp;
        blkif_response_t res;
        struct blktap_ring *ring;
        struct blktap_request *request;

        down_read(&tap->tap_sem);

        ring = &tap->ring;
        if (!ring->vma) {
                up_read(&tap->tap_sem);
                return 0;
        }

        /* for each outstanding message on the ring */
        rp = ring->ring.sring->rsp_prod;
        rmb();

        for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
                memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
                mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
                ++ring->ring.rsp_cons;

                usr_idx = (int)res.id;
                if (usr_idx >= MAX_PENDING_REQS ||
                    !tap->pending_requests[usr_idx]) {
                        BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
                               rc, rp, usr_idx, tap->pid, ring->vma);
                        continue;
                }

                request = tap->pending_requests[usr_idx];
                BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
                blktap_device_finish_request(tap, &res, request);
        }

        up_read(&tap->tap_sem);

        blktap_run_deferred();

        return 0;
}
static struct page *
blktap_ring_nopage(struct vm_area_struct *vma,
                   unsigned long address, int *type)
{
        /*
         * if the page has not been mapped in by the driver then return
         * NOPAGE_SIGBUS to the domain.
         */

        return NOPAGE_SIGBUS;
}
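
/*
 * Invoked for each PTE when the tapdisk VMA is torn down (via the
 * zap_pte VM operation).  PTEs below user_vstart only cover the shared
 * ring page and are simply cleared; PTEs in the data area map a
 * request segment, so the page is dropped from the foreign map and the
 * kernel and user grant mappings for that segment are unmapped.
 */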
static pte_t
blktap_ring_clear_pte(struct vm_area_struct *vma,
                      unsigned long uvaddr,
                      pte_t *ptep, int is_fullmm)
{
        pte_t copy;
        struct blktap *tap;
        unsigned long kvaddr;
        struct page **map, *page;
        struct blktap_ring *ring;
        struct blktap_request *request;
        struct grant_handle_pair *khandle;
        struct gnttab_unmap_grant_ref unmap[2];
        int offset, seg, usr_idx, count = 0;

        tap  = vma_to_blktap(vma);
        ring = &tap->ring;
        map  = ring->foreign_map.map;
        BUG_ON(!map);   /* TODO Should this be changed to if statement? */

        /*
         * Zap entry if the address is before the start of the grant
         * mapped region.
         */
        if (uvaddr < ring->user_vstart)
                return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
                                               ptep, is_fullmm);

        offset  = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
        usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
        seg     = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;

        offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
        page   = map[offset];
        if (page) {
                ClearPageReserved(page);
                if (PageBlkback(page)) {
                        ClearPageBlkback(page);
                        set_page_private(page, 0);
                }
        }
        map[offset] = NULL;

        request = tap->pending_requests[usr_idx];
        kvaddr  = request_to_kaddr(request, seg);
        khandle = request->handles + seg;

        if (khandle->kernel != INVALID_GRANT_HANDLE) {
                gnttab_set_unmap_op(&unmap[count], kvaddr,
                                    GNTMAP_host_map, khandle->kernel);
                count++;

                set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
                                    INVALID_P2M_ENTRY);
        }

        if (khandle->user != INVALID_GRANT_HANDLE) {
                BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));

                copy = *ptep;
                gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep),
                                    GNTMAP_host_map
                                    | GNTMAP_application_map
                                    | GNTMAP_contains_pte,
                                    khandle->user);
                count++;
        } else
                copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
                                               is_fullmm);

        if (count)
                if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                                              unmap, count))
                        BUG();

        khandle->kernel = INVALID_GRANT_HANDLE;
        khandle->user   = INVALID_GRANT_HANDLE;

        return copy;
}
static void
blktap_ring_vm_unmap(struct vm_area_struct *vma)
{
        struct blktap *tap = vma_to_blktap(vma);

        down_write(&tap->tap_sem);
        clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
        clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
        clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
        up_write(&tap->tap_sem);
}

static void
blktap_ring_vm_close(struct vm_area_struct *vma)
{
        struct blktap *tap = vma_to_blktap(vma);
        struct blktap_ring *ring = &tap->ring;

        blktap_ring_vm_unmap(vma);                 /* fail future requests */
        blktap_device_fail_pending_requests(tap);  /* fail pending requests */
        blktap_device_restart(tap);                /* fail deferred requests */

        down_write(&tap->tap_sem);

        zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);

        kfree(ring->foreign_map.map);
        ring->foreign_map.map = NULL;

        /* Free the ring page. */
        ClearPageReserved(virt_to_page(ring->ring.sring));
        free_page((unsigned long)ring->ring.sring);

        BTINFO("unmapping ring %d\n", tap->minor);
        ring->ring.sring = NULL;
        ring->vma = NULL;

        up_write(&tap->tap_sem);

        wake_up(&tap->wq);
}
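
/*
 * The .unmap and .zap_pte operations below are extensions to
 * vm_operations_struct carried in this Xen-patched kernel tree; they
 * are not part of the stock 2.6.18 VM interface.
 */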
static struct vm_operations_struct blktap_ring_vm_operations = {
        .close    = blktap_ring_vm_close,
        .unmap    = blktap_ring_vm_unmap,
        .nopage   = blktap_ring_nopage,
        .zap_pte  = blktap_ring_clear_pte,
};
static int
blktap_ring_open(struct inode *inode, struct file *filp)
{
        int idx;
        struct blktap *tap;

        idx = iminor(inode);
        if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) {
                BTERR("unable to open device blktap%d\n", idx);
                return -ENODEV;
        }

        tap = blktaps[idx];

        BTINFO("opening device blktap%d\n", idx);

        if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
                return -ENODEV;

        /* Only one process can access ring at a time */
        if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
                return -EBUSY;

        filp->private_data = tap;
        BTINFO("opened device %d\n", tap->minor);

        return 0;
}

static int
blktap_ring_release(struct inode *inode, struct file *filp)
{
        struct blktap *tap = filp->private_data;

        BTINFO("freeing device %d\n", tap->minor);
        clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
        filp->private_data = NULL;
        wake_up(&tap->wq);
        return 0;
}
/* Note on mmap:
 * We need to map pages to user space in a way that will allow the block
 * subsystem to set up direct IO to them.  This couldn't be done before,
 * because there isn't really a sane way to translate a user virtual address
 * down to a physical address when the page belongs to another domain.
 *
 * My first approach was to map the page in to kernel memory, add an entry
 * for it in the physical frame list (using alloc_lomem_region as in blkback)
 * and then attempt to map that page up to user space.  This is disallowed
 * by xen though, which realizes that we don't really own the machine frame
 * underlying the physical page.
 *
 * The new approach is to provide explicit support for this in xen linux.
 * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
 * mapped from other VMs.  vma->vm_private_data is set up as a mapping
 * from pages to actual page structs.  There is a new clause in get_user_pages
 * that does the right thing for this sort of mapping.
 */
static int
blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
{
        int size, err;
        struct page **map;
        struct blktap *tap;
        blkif_sring_t *sring;
        struct blktap_ring *ring;

        tap   = filp->private_data;
        ring  = &tap->ring;
        map   = NULL;
        sring = NULL;

        if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
                return -ENOMEM;

        size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
        if (size != (MMAP_PAGES + RING_PAGES)) {
                BTERR("you _must_ map exactly %lu pages!\n",
                      MMAP_PAGES + RING_PAGES);
                return -EAGAIN;
        }

        /* Allocate the fe ring. */
        sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
        if (!sring) {
                BTERR("Couldn't alloc sring.\n");
                goto fail_mem;
        }

        map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
        if (!map) {
                BTERR("Couldn't alloc VM_FOREIGN map.\n");
                goto fail_mem;
        }

        SetPageReserved(virt_to_page(sring));

        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);

        ring->ring_vstart = vma->vm_start;
        ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);

        /* Map the ring pages to the start of the region and reserve it. */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                err = vm_insert_page(vma, vma->vm_start,
                                     virt_to_page(ring->ring.sring));
        else
                err = remap_pfn_range(vma, vma->vm_start,
                                      __pa(ring->ring.sring) >> PAGE_SHIFT,
                                      PAGE_SIZE, vma->vm_page_prot);
        if (err) {
                BTERR("Mapping user ring failed: %d\n", err);
                goto fail;
        }

        /* Mark this VM as containing foreign pages, and set up mappings. */
        ring->foreign_map.map = map;
        vma->vm_private_data = &ring->foreign_map;
        vma->vm_flags |= VM_FOREIGN;
        vma->vm_flags |= VM_DONTCOPY;
        vma->vm_flags |= VM_RESERVED;
        vma->vm_ops = &blktap_ring_vm_operations;

#ifdef CONFIG_X86
        vma->vm_mm->context.has_foreign_mappings = 1;
#endif

        tap->pid = current->pid;
        BTINFO("blktap: mapping pid is %d\n", tap->pid);

        ring->vma = vma;
        return 0;

 fail:
        /* Clear any active mappings. */
        zap_page_range(vma, vma->vm_start,
                       vma->vm_end - vma->vm_start, NULL);
        ClearPageReserved(virt_to_page(sring));
 fail_mem:
        free_page((unsigned long)sring);
        kfree(map);

        return -ENOMEM;
}
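
/*
 * Kernel -> tapdisk messaging: control messages (pause/resume/close)
 * are written into pad[0] of the shared ring.  blktap_ring_poll()
 * reports POLLIN whenever pad[0] is non-zero or new requests have been
 * queued, so tapdisk wakes up, handles the message, and acknowledges
 * with BLKTAP2_IOCTL_PAUSE or BLKTAP2_IOCTL_RESUME, which clear the
 * message and wake anyone sleeping in blktap_ring_pause()/_resume().
 */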
static inline void
blktap_ring_set_message(struct blktap *tap, int msg)
{
        struct blktap_ring *ring = &tap->ring;

        down_read(&tap->tap_sem);
        if (ring->ring.sring)
                ring->ring.sring->pad[0] = msg;
        up_read(&tap->tap_sem);
}
static int
blktap_ring_ioctl(struct inode *inode, struct file *filp,
                  unsigned int cmd, unsigned long arg)
{
        struct blktap_params params;
        struct blktap *tap = filp->private_data;

        BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);

        switch(cmd) {
        case BLKTAP2_IOCTL_KICK_FE:
                /* There are fe messages to process. */
                return blktap_read_ring(tap);

        case BLKTAP2_IOCTL_CREATE_DEVICE:
                if (!arg)
                        return -EINVAL;

                if (copy_from_user(&params, (struct blktap_params __user *)arg,
                                   sizeof(params))) {
                        BTERR("failed to get params\n");
                        return -EFAULT;
                }

                if (blktap_validate_params(tap, &params)) {
                        BTERR("invalid params\n");
                        return -EINVAL;
                }

                tap->params = params;
                return blktap_device_create(tap);

        case BLKTAP2_IOCTL_SET_PARAMS:
                if (!arg)
                        return -EINVAL;

                if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
                        return -EINVAL;

                if (copy_from_user(&params, (struct blktap_params __user *)arg,
                                   sizeof(params))) {
                        BTERR("failed to get params\n");
                        return -EFAULT;
                }

                if (blktap_validate_params(tap, &params)) {
                        BTERR("invalid params\n");
                        return -EINVAL;
                }

                tap->params = params;
                return 0;

        case BLKTAP2_IOCTL_PAUSE:
                if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
                        return -EINVAL;

                set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
                clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);

                blktap_ring_set_message(tap, 0);
                wake_up_interruptible(&tap->wq);

                return 0;

        case BLKTAP2_IOCTL_REOPEN:
                if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
                        return -EINVAL;

                if (!arg)
                        return -EINVAL;

                if (copy_to_user((char __user *)arg,
                                 tap->params.name,
                                 strlen(tap->params.name) + 1))
                        return -EFAULT;

                blktap_ring_set_message(tap, 0);
                wake_up_interruptible(&tap->wq);

                return 0;

        case BLKTAP2_IOCTL_RESUME:
                if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
                        return -EINVAL;

                tap->ring.response = (int)arg;
                if (!tap->ring.response)
                        clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);

                blktap_ring_set_message(tap, 0);
                wake_up_interruptible(&tap->wq);

                return 0;
        }

        return -ENOIOCTLCMD;
}
static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
{
        struct blktap *tap = filp->private_data;
        struct blktap_ring *ring = &tap->ring;

        poll_wait(filp, &ring->poll_wait, wait);
        if (ring->ring.sring->pad[0] != 0 ||
            ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
                RING_PUSH_REQUESTS(&ring->ring);
                return POLLIN | POLLRDNORM;
        }

        return 0;
}
static struct file_operations blktap_ring_file_operations = {
        .owner    = THIS_MODULE,
        .open     = blktap_ring_open,
        .release  = blktap_ring_release,
        .ioctl    = blktap_ring_ioctl,
        .mmap     = blktap_ring_mmap,
        .poll     = blktap_ring_poll,
};

void
blktap_ring_kick_user(struct blktap *tap)
{
        wake_up_interruptible(&tap->ring.poll_wait);
}
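
/*
 * Resume handshake: post BLKTAP2_RING_MESSAGE_RESUME in the shared
 * ring, kick tapdisk via the poll wait queue, then sleep until tapdisk
 * replies with BLKTAP2_IOCTL_RESUME.  The ioctl stores tapdisk's status
 * code in ring->response and clears BLKTAP_PAUSED on success.
 */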
int
blktap_ring_resume(struct blktap *tap)
{
        int err;
        struct blktap_ring *ring = &tap->ring;

        if (!blktap_active(tap))
                return -ENODEV;

        if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
                return -EINVAL;

        /* set shared flag for resume */
        ring->response = 0;

        blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
        blktap_ring_kick_user(tap);

        wait_event_interruptible(tap->wq, ring->response ||
                                 !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));

        err = ring->response;
        ring->response = 0;

        BTDBG("err: %d\n", err);

        if (err)
                return err;

        if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
                return -EAGAIN;

        return 0;
}
int
blktap_ring_pause(struct blktap *tap)
{
        if (!blktap_active(tap))
                return -ENODEV;

        if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
                return -EINVAL;

        BTDBG("draining queue\n");
        wait_event_interruptible(tap->wq, !tap->pending_cnt);
        if (tap->pending_cnt)
                return -EAGAIN;

        blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
        blktap_ring_kick_user(tap);

        BTDBG("waiting for tapdisk response\n");
        wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
        if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
                return -EAGAIN;

        return 0;
}
int
blktap_ring_destroy(struct blktap *tap)
{
        if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) &&
            !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
                return 0;

        BTDBG("sending tapdisk close message\n");
        blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
        blktap_ring_kick_user(tap);

        return -EAGAIN;
}

static void
blktap_ring_initialize(struct blktap_ring *ring, int minor)
{
        memset(ring, 0, sizeof(*ring));
        init_waitqueue_head(&ring->poll_wait);
        ring->devno = MKDEV(blktap_ring_major, minor);
}

int
blktap_ring_create(struct blktap *tap)
{
        struct blktap_ring *ring = &tap->ring;
        blktap_ring_initialize(ring, tap->minor);
        return blktap_sysfs_create(tap);
}

int
blktap_ring_init(int *major)
{
        int err;

        err = register_chrdev(0, "blktap2", &blktap_ring_file_operations);
        if (err < 0) {
                BTERR("error registering blktap ring device: %d\n", err);
                return err;
        }

        blktap_ring_major = *major = err;
        BTINFO("blktap ring major: %d\n", blktap_ring_major);
        return 0;
}

int
blktap_ring_free(void)
{
        if (blktap_ring_major)
                unregister_chrdev(blktap_ring_major, "blktap2");

        return 0;
}