ia64/linux-2.6.18-xen.hg

view drivers/md/dm.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the
limit it reached as the "hard limit". The driver will not try again
until the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, in which case it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, causing temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to
ask a domain to balloon to more than its allocation, nor would you
expect it to deliberately over-commit memory by setting balloon
targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
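
As a concrete illustration of the behaviour described above, here is a
minimal, self-contained C sketch of the retry logic. It is illustrative
only: try_increase() and the page counts are hypothetical stand-ins, not
the balloon driver's real identifiers or hypercall interface. The host
may hand back fewer pages than requested while it is under memory
pressure; the loop keeps whatever it received and retries after a delay
instead of recording a hard limit and giving up.

/*
 * Sketch only: models "retry on a timer and keep partial successes".
 * try_increase() stands in for a reservation-increase request that can
 * return fewer pages than asked for under host memory pressure.
 */
#include <stdio.h>
#include <unistd.h>

static long try_increase(long want)
{
        static long host_free = 150;    /* pretend free host pages */
        long got = want < host_free ? want : host_free;

        host_free -= got;
        host_free += 40;                /* other guests balloon down over time */
        return got;                     /* may be less than 'want' */
}

int main(void)
{
        long reserved = 0, target = 300;

        while (reserved < target) {
                long got = try_increase(target - reserved);

                reserved += got;        /* keep partial allocations */
                printf("got %ld page(s), now at %ld/%ld\n",
                       got, reserved, target);
                if (reserved < target)
                        sleep(1);       /* retry on a timer, as when ballooning down */
        }
        return 0;
}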
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
line source
1 /*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
8 #include "dm.h"
9 #include "dm-bio-list.h"
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/mutex.h>
14 #include <linux/moduleparam.h>
15 #include <linux/blkpg.h>
16 #include <linux/bio.h>
17 #include <linux/buffer_head.h>
18 #include <linux/mempool.h>
19 #include <linux/slab.h>
20 #include <linux/idr.h>
21 #include <linux/hdreg.h>
22 #include <linux/blktrace_api.h>
24 #define DM_MSG_PREFIX "core"
26 static const char *_name = DM_NAME;
28 static unsigned int major = 0;
29 static unsigned int _major = 0;
31 static DEFINE_SPINLOCK(_minor_lock);
32 /*
33 * One of these is allocated per bio.
34 */
35 struct dm_io {
36 struct mapped_device *md;
37 int error;
38 struct bio *bio;
39 atomic_t io_count;
40 unsigned long start_time;
41 };
43 /*
44 * One of these is allocated per target within a bio. Hopefully
45 * this will be simplified out one day.
46 */
47 struct target_io {
48 struct dm_io *io;
49 struct dm_target *ti;
50 union map_info info;
51 };
53 union map_info *dm_get_mapinfo(struct bio *bio)
54 {
55 if (bio && bio->bi_private)
56 return &((struct target_io *)bio->bi_private)->info;
57 return NULL;
58 }
60 #define MINOR_ALLOCED ((void *)-1)
62 /*
63 * Bits for the md->flags field.
64 */
65 #define DMF_BLOCK_IO 0
66 #define DMF_SUSPENDED 1
67 #define DMF_FROZEN 2
68 #define DMF_FREEING 3
69 #define DMF_DELETING 4
71 struct mapped_device {
72 struct rw_semaphore io_lock;
73 struct semaphore suspend_lock;
74 rwlock_t map_lock;
75 atomic_t holders;
76 atomic_t open_count;
78 unsigned long flags;
80 request_queue_t *queue;
81 struct gendisk *disk;
82 char name[16];
84 void *interface_ptr;
86 /*
87 * A list of ios that arrived while we were suspended.
88 */
89 atomic_t pending;
90 wait_queue_head_t wait;
91 struct bio_list deferred;
93 /*
94 * The current mapping.
95 */
96 struct dm_table *map;
98 /*
99 * io objects are allocated from here.
100 */
101 mempool_t *io_pool;
102 mempool_t *tio_pool;
104 /*
105 * Event handling.
106 */
107 atomic_t event_nr;
108 wait_queue_head_t eventq;
110 /*
111 * freeze/thaw support require holding onto a super block
112 */
113 struct super_block *frozen_sb;
114 struct block_device *suspended_bdev;
116 /* forced geometry settings */
117 struct hd_geometry geometry;
118 };
120 #define MIN_IOS 256
121 static kmem_cache_t *_io_cache;
122 static kmem_cache_t *_tio_cache;
124 static struct bio_set *dm_set;
126 static int __init local_init(void)
127 {
128 int r;
130 dm_set = bioset_create(16, 16, 4);
131 if (!dm_set)
132 return -ENOMEM;
134 /* allocate a slab for the dm_ios */
135 _io_cache = kmem_cache_create("dm_io",
136 sizeof(struct dm_io), 0, 0, NULL, NULL);
137 if (!_io_cache)
138 return -ENOMEM;
140 /* allocate a slab for the target ios */
141 _tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
142 0, 0, NULL, NULL);
143 if (!_tio_cache) {
144 kmem_cache_destroy(_io_cache);
145 return -ENOMEM;
146 }
148 _major = major;
149 r = register_blkdev(_major, _name);
150 if (r < 0) {
151 kmem_cache_destroy(_tio_cache);
152 kmem_cache_destroy(_io_cache);
153 return r;
154 }
156 if (!_major)
157 _major = r;
159 return 0;
160 }
162 static void local_exit(void)
163 {
164 kmem_cache_destroy(_tio_cache);
165 kmem_cache_destroy(_io_cache);
167 bioset_free(dm_set);
169 if (unregister_blkdev(_major, _name) < 0)
170 DMERR("unregister_blkdev failed");
172 _major = 0;
174 DMINFO("cleaned up");
175 }
177 int (*_inits[])(void) __initdata = {
178 local_init,
179 dm_target_init,
180 dm_linear_init,
181 dm_stripe_init,
182 dm_interface_init,
183 };
185 void (*_exits[])(void) = {
186 local_exit,
187 dm_target_exit,
188 dm_linear_exit,
189 dm_stripe_exit,
190 dm_interface_exit,
191 };
193 static int __init dm_init(void)
194 {
195 const int count = ARRAY_SIZE(_inits);
197 int r, i;
199 for (i = 0; i < count; i++) {
200 r = _inits[i]();
201 if (r)
202 goto bad;
203 }
205 return 0;
207 bad:
208 while (i--)
209 _exits[i]();
211 return r;
212 }
214 static void __exit dm_exit(void)
215 {
216 int i = ARRAY_SIZE(_exits);
218 while (i--)
219 _exits[i]();
220 }
222 /*
223 * Block device functions
224 */
225 static int dm_blk_open(struct inode *inode, struct file *file)
226 {
227 struct mapped_device *md;
229 spin_lock(&_minor_lock);
231 md = inode->i_bdev->bd_disk->private_data;
232 if (!md)
233 goto out;
235 if (test_bit(DMF_FREEING, &md->flags) ||
236 test_bit(DMF_DELETING, &md->flags)) {
237 md = NULL;
238 goto out;
239 }
241 dm_get(md);
242 atomic_inc(&md->open_count);
244 out:
245 spin_unlock(&_minor_lock);
247 return md ? 0 : -ENXIO;
248 }
250 static int dm_blk_close(struct inode *inode, struct file *file)
251 {
252 struct mapped_device *md;
254 md = inode->i_bdev->bd_disk->private_data;
255 atomic_dec(&md->open_count);
256 dm_put(md);
257 return 0;
258 }
260 int dm_open_count(struct mapped_device *md)
261 {
262 return atomic_read(&md->open_count);
263 }
265 /*
266 * Guarantees nothing is using the device before it's deleted.
267 */
268 int dm_lock_for_deletion(struct mapped_device *md)
269 {
270 int r = 0;
272 spin_lock(&_minor_lock);
274 if (dm_open_count(md))
275 r = -EBUSY;
276 else
277 set_bit(DMF_DELETING, &md->flags);
279 spin_unlock(&_minor_lock);
281 return r;
282 }
284 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
285 {
286 struct mapped_device *md = bdev->bd_disk->private_data;
288 return dm_get_geometry(md, geo);
289 }
291 static inline struct dm_io *alloc_io(struct mapped_device *md)
292 {
293 return mempool_alloc(md->io_pool, GFP_NOIO);
294 }
296 static inline void free_io(struct mapped_device *md, struct dm_io *io)
297 {
298 mempool_free(io, md->io_pool);
299 }
301 static inline struct target_io *alloc_tio(struct mapped_device *md)
302 {
303 return mempool_alloc(md->tio_pool, GFP_NOIO);
304 }
306 static inline void free_tio(struct mapped_device *md, struct target_io *tio)
307 {
308 mempool_free(tio, md->tio_pool);
309 }
311 static void start_io_acct(struct dm_io *io)
312 {
313 struct mapped_device *md = io->md;
315 io->start_time = jiffies;
317 preempt_disable();
318 disk_round_stats(dm_disk(md));
319 preempt_enable();
320 dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
321 }
323 static int end_io_acct(struct dm_io *io)
324 {
325 struct mapped_device *md = io->md;
326 struct bio *bio = io->bio;
327 unsigned long duration = jiffies - io->start_time;
328 int pending;
329 int rw = bio_data_dir(bio);
331 preempt_disable();
332 disk_round_stats(dm_disk(md));
333 preempt_enable();
334 dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
336 disk_stat_add(dm_disk(md), ticks[rw], duration);
338 return !pending;
339 }
341 /*
342 * Add the bio to the list of deferred io.
343 */
344 static int queue_io(struct mapped_device *md, struct bio *bio)
345 {
346 down_write(&md->io_lock);
348 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
349 up_write(&md->io_lock);
350 return 1;
351 }
353 bio_list_add(&md->deferred, bio);
355 up_write(&md->io_lock);
356 return 0; /* deferred successfully */
357 }
359 /*
360 * Everyone (including functions in this file) should use this
361 * function to access the md->map field, and make sure they call
362 * dm_table_put() when finished.
363 */
364 struct dm_table *dm_get_table(struct mapped_device *md)
365 {
366 struct dm_table *t;
368 read_lock(&md->map_lock);
369 t = md->map;
370 if (t)
371 dm_table_get(t);
372 read_unlock(&md->map_lock);
374 return t;
375 }
377 /*
378 * Get the geometry associated with a dm device
379 */
380 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
381 {
382 *geo = md->geometry;
384 return 0;
385 }
387 /*
388 * Set the geometry of a device.
389 */
390 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
391 {
392 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
394 if (geo->start > sz) {
395 DMWARN("Start sector is beyond the geometry limits.");
396 return -EINVAL;
397 }
399 md->geometry = *geo;
401 return 0;
402 }
404 /*-----------------------------------------------------------------
405 * CRUD START:
406 * A more elegant soln is in the works that uses the queue
407 * merge fn, unfortunately there are a couple of changes to
408 * the block layer that I want to make for this. So in the
409 * interests of getting something for people to use I give
410 * you this clearly demarcated crap.
411 *---------------------------------------------------------------*/
413 /*
414 * Decrements the number of outstanding ios that a bio has been
415 * cloned into, completing the original io if necc.
416 */
417 static void dec_pending(struct dm_io *io, int error)
418 {
419 if (error)
420 io->error = error;
422 if (atomic_dec_and_test(&io->io_count)) {
423 if (end_io_acct(io))
424 /* nudge anyone waiting on suspend queue */
425 wake_up(&io->md->wait);
427 blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
429 bio_endio(io->bio, io->bio->bi_size, io->error);
430 free_io(io->md, io);
431 }
432 }
434 static int clone_endio(struct bio *bio, unsigned int done, int error)
435 {
436 int r = 0;
437 struct target_io *tio = bio->bi_private;
438 struct dm_io *io = tio->io;
439 dm_endio_fn endio = tio->ti->type->end_io;
441 if (bio->bi_size)
442 return 1;
444 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
445 error = -EIO;
447 if (endio) {
448 r = endio(tio->ti, bio, error, &tio->info);
449 if (r < 0)
450 error = r;
452 else if (r > 0)
453 /* the target wants another shot at the io */
454 return 1;
455 }
457 free_tio(io->md, tio);
458 dec_pending(io, error);
459 bio_put(bio);
460 return r;
461 }
463 static sector_t max_io_len(struct mapped_device *md,
464 sector_t sector, struct dm_target *ti)
465 {
466 sector_t offset = sector - ti->begin;
467 sector_t len = ti->len - offset;
469 /*
470 * Does the target need to split even further ?
471 */
472 if (ti->split_io) {
473 sector_t boundary;
474 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
475 - offset;
476 if (len > boundary)
477 len = boundary;
478 }
480 return len;
481 }
483 static void __map_bio(struct dm_target *ti, struct bio *clone,
484 struct target_io *tio)
485 {
486 int r;
487 sector_t sector;
489 /*
490 * Sanity checks.
491 */
492 BUG_ON(!clone->bi_size);
494 clone->bi_end_io = clone_endio;
495 clone->bi_private = tio;
497 /*
498 * Map the clone. If r == 0 we don't need to do
499 * anything, the target has assumed ownership of
500 * this io.
501 */
502 atomic_inc(&tio->io->io_count);
503 sector = clone->bi_sector;
504 r = ti->type->map(ti, clone, &tio->info);
505 if (r > 0) {
506 /* the bio has been remapped so dispatch it */
508 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
509 tio->io->bio->bi_bdev->bd_dev, sector,
510 clone->bi_sector);
512 generic_make_request(clone);
513 }
515 else if (r < 0) {
516 /* error the io and bail out */
517 struct dm_io *io = tio->io;
518 free_tio(tio->io->md, tio);
519 dec_pending(io, r);
520 bio_put(clone);
521 }
522 }
524 struct clone_info {
525 struct mapped_device *md;
526 struct dm_table *map;
527 struct bio *bio;
528 struct dm_io *io;
529 sector_t sector;
530 sector_t sector_count;
531 unsigned short idx;
532 };
534 static void dm_bio_destructor(struct bio *bio)
535 {
536 bio_free(bio, dm_set);
537 }
539 /*
540 * Creates a little bio that just does part of a bvec.
541 */
542 static struct bio *split_bvec(struct bio *bio, sector_t sector,
543 unsigned short idx, unsigned int offset,
544 unsigned int len)
545 {
546 struct bio *clone;
547 struct bio_vec *bv = bio->bi_io_vec + idx;
549 clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
550 clone->bi_destructor = dm_bio_destructor;
551 *clone->bi_io_vec = *bv;
553 clone->bi_sector = sector;
554 clone->bi_bdev = bio->bi_bdev;
555 clone->bi_rw = bio->bi_rw;
556 clone->bi_vcnt = 1;
557 clone->bi_size = to_bytes(len);
558 clone->bi_io_vec->bv_offset = offset;
559 clone->bi_io_vec->bv_len = clone->bi_size;
561 return clone;
562 }
564 /*
565 * Creates a bio that consists of a range of complete bvecs.
566 */
567 static struct bio *clone_bio(struct bio *bio, sector_t sector,
568 unsigned short idx, unsigned short bv_count,
569 unsigned int len)
570 {
571 struct bio *clone;
573 clone = bio_clone(bio, GFP_NOIO);
574 clone->bi_sector = sector;
575 clone->bi_idx = idx;
576 clone->bi_vcnt = idx + bv_count;
577 clone->bi_size = to_bytes(len);
578 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
580 return clone;
581 }
583 static void __clone_and_map(struct clone_info *ci)
584 {
585 struct bio *clone, *bio = ci->bio;
586 struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
587 sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
588 struct target_io *tio;
590 /*
591 * Allocate a target io object.
592 */
593 tio = alloc_tio(ci->md);
594 tio->io = ci->io;
595 tio->ti = ti;
596 memset(&tio->info, 0, sizeof(tio->info));
598 if (ci->sector_count <= max) {
599 /*
600 * Optimise for the simple case where we can do all of
601 * the remaining io with a single clone.
602 */
603 clone = clone_bio(bio, ci->sector, ci->idx,
604 bio->bi_vcnt - ci->idx, ci->sector_count);
605 __map_bio(ti, clone, tio);
606 ci->sector_count = 0;
608 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
609 /*
610 * There are some bvecs that don't span targets.
611 * Do as many of these as possible.
612 */
613 int i;
614 sector_t remaining = max;
615 sector_t bv_len;
617 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
618 bv_len = to_sector(bio->bi_io_vec[i].bv_len);
620 if (bv_len > remaining)
621 break;
623 remaining -= bv_len;
624 len += bv_len;
625 }
627 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
628 __map_bio(ti, clone, tio);
630 ci->sector += len;
631 ci->sector_count -= len;
632 ci->idx = i;
634 } else {
635 /*
636 * Handle a bvec that must be split between two or more targets.
637 */
638 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
639 sector_t remaining = to_sector(bv->bv_len);
640 unsigned int offset = 0;
642 do {
643 if (offset) {
644 ti = dm_table_find_target(ci->map, ci->sector);
645 max = max_io_len(ci->md, ci->sector, ti);
647 tio = alloc_tio(ci->md);
648 tio->io = ci->io;
649 tio->ti = ti;
650 memset(&tio->info, 0, sizeof(tio->info));
651 }
653 len = min(remaining, max);
655 clone = split_bvec(bio, ci->sector, ci->idx,
656 bv->bv_offset + offset, len);
658 __map_bio(ti, clone, tio);
660 ci->sector += len;
661 ci->sector_count -= len;
662 offset += to_bytes(len);
663 } while (remaining -= len);
665 ci->idx++;
666 }
667 }
669 /*
670 * Split the bio into several clones.
671 */
672 static void __split_bio(struct mapped_device *md, struct bio *bio)
673 {
674 struct clone_info ci;
676 ci.map = dm_get_table(md);
677 if (!ci.map) {
678 bio_io_error(bio, bio->bi_size);
679 return;
680 }
682 ci.md = md;
683 ci.bio = bio;
684 ci.io = alloc_io(md);
685 ci.io->error = 0;
686 atomic_set(&ci.io->io_count, 1);
687 ci.io->bio = bio;
688 ci.io->md = md;
689 ci.sector = bio->bi_sector;
690 ci.sector_count = bio_sectors(bio);
691 ci.idx = bio->bi_idx;
693 start_io_acct(ci.io);
694 while (ci.sector_count)
695 __clone_and_map(&ci);
697 /* drop the extra reference count */
698 dec_pending(ci.io, 0);
699 dm_table_put(ci.map);
700 }
701 /*-----------------------------------------------------------------
702 * CRUD END
703 *---------------------------------------------------------------*/
705 /*
706 * The request function that just remaps the bio built up by
707 * dm_merge_bvec.
708 */
709 static int dm_request(request_queue_t *q, struct bio *bio)
710 {
711 int r;
712 int rw = bio_data_dir(bio);
713 struct mapped_device *md = q->queuedata;
715 down_read(&md->io_lock);
717 disk_stat_inc(dm_disk(md), ios[rw]);
718 disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));
720 /*
721 * If we're suspended we have to queue
722 * this io for later.
723 */
724 while (test_bit(DMF_BLOCK_IO, &md->flags)) {
725 up_read(&md->io_lock);
727 if (bio_rw(bio) == READA) {
728 bio_io_error(bio, bio->bi_size);
729 return 0;
730 }
732 r = queue_io(md, bio);
733 if (r < 0) {
734 bio_io_error(bio, bio->bi_size);
735 return 0;
737 } else if (r == 0)
738 return 0; /* deferred successfully */
740 /*
741 * We're in a while loop, because someone could suspend
742 * before we get to the following read lock.
743 */
744 down_read(&md->io_lock);
745 }
747 __split_bio(md, bio);
748 up_read(&md->io_lock);
749 return 0;
750 }
752 static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
753 sector_t *error_sector)
754 {
755 struct mapped_device *md = q->queuedata;
756 struct dm_table *map = dm_get_table(md);
757 int ret = -ENXIO;
759 if (map) {
760 ret = dm_table_flush_all(map);
761 dm_table_put(map);
762 }
764 return ret;
765 }
767 static void dm_unplug_all(request_queue_t *q)
768 {
769 struct mapped_device *md = q->queuedata;
770 struct dm_table *map = dm_get_table(md);
772 if (map) {
773 dm_table_unplug_all(map);
774 dm_table_put(map);
775 }
776 }
778 static int dm_any_congested(void *congested_data, int bdi_bits)
779 {
780 int r;
781 struct mapped_device *md = (struct mapped_device *) congested_data;
782 struct dm_table *map = dm_get_table(md);
784 if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
785 r = bdi_bits;
786 else
787 r = dm_table_any_congested(map, bdi_bits);
789 dm_table_put(map);
790 return r;
791 }
793 /*-----------------------------------------------------------------
794 * An IDR is used to keep track of allocated minor numbers.
795 *---------------------------------------------------------------*/
796 static DEFINE_IDR(_minor_idr);
798 static void free_minor(int minor)
799 {
800 spin_lock(&_minor_lock);
801 idr_remove(&_minor_idr, minor);
802 spin_unlock(&_minor_lock);
803 }
805 /*
806 * See if the device with a specific minor # is free.
807 */
808 static int specific_minor(struct mapped_device *md, int minor)
809 {
810 int r, m;
812 if (minor >= (1 << MINORBITS))
813 return -EINVAL;
815 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
816 if (!r)
817 return -ENOMEM;
819 spin_lock(&_minor_lock);
821 if (idr_find(&_minor_idr, minor)) {
822 r = -EBUSY;
823 goto out;
824 }
826 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
827 if (r)
828 goto out;
830 if (m != minor) {
831 idr_remove(&_minor_idr, m);
832 r = -EBUSY;
833 goto out;
834 }
836 out:
837 spin_unlock(&_minor_lock);
838 return r;
839 }
841 static int next_free_minor(struct mapped_device *md, int *minor)
842 {
843 int r, m;
845 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
846 if (!r)
847 return -ENOMEM;
849 spin_lock(&_minor_lock);
851 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
852 if (r) {
853 goto out;
854 }
856 if (m >= (1 << MINORBITS)) {
857 idr_remove(&_minor_idr, m);
858 r = -ENOSPC;
859 goto out;
860 }
862 *minor = m;
864 out:
865 spin_unlock(&_minor_lock);
866 return r;
867 }
869 static struct block_device_operations dm_blk_dops;
871 /*
872 * Allocate and initialise a blank device with a given minor.
873 */
874 static struct mapped_device *alloc_dev(int minor)
875 {
876 int r;
877 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
878 void *old_md;
880 if (!md) {
881 DMWARN("unable to allocate device, out of memory.");
882 return NULL;
883 }
885 if (!try_module_get(THIS_MODULE))
886 goto bad0;
888 /* get a minor number for the dev */
889 if (minor == DM_ANY_MINOR)
890 r = next_free_minor(md, &minor);
891 else
892 r = specific_minor(md, minor);
893 if (r < 0)
894 goto bad1;
896 memset(md, 0, sizeof(*md));
897 init_rwsem(&md->io_lock);
898 init_MUTEX(&md->suspend_lock);
899 rwlock_init(&md->map_lock);
900 atomic_set(&md->holders, 1);
901 atomic_set(&md->open_count, 0);
902 atomic_set(&md->event_nr, 0);
904 md->queue = blk_alloc_queue(GFP_KERNEL);
905 if (!md->queue)
906 goto bad1;
908 md->queue->queuedata = md;
909 md->queue->backing_dev_info.congested_fn = dm_any_congested;
910 md->queue->backing_dev_info.congested_data = md;
911 blk_queue_make_request(md->queue, dm_request);
912 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
913 md->queue->unplug_fn = dm_unplug_all;
914 md->queue->issue_flush_fn = dm_flush_all;
916 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
917 if (!md->io_pool)
918 goto bad2;
920 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
921 if (!md->tio_pool)
922 goto bad3;
924 md->disk = alloc_disk(1);
925 if (!md->disk)
926 goto bad4;
928 atomic_set(&md->pending, 0);
929 init_waitqueue_head(&md->wait);
930 init_waitqueue_head(&md->eventq);
932 md->disk->major = _major;
933 md->disk->first_minor = minor;
934 md->disk->fops = &dm_blk_dops;
935 md->disk->queue = md->queue;
936 md->disk->private_data = md;
937 sprintf(md->disk->disk_name, "dm-%d", minor);
938 add_disk(md->disk);
939 format_dev_t(md->name, MKDEV(_major, minor));
941 /* Populate the mapping, nobody knows we exist yet */
942 spin_lock(&_minor_lock);
943 old_md = idr_replace(&_minor_idr, md, minor);
944 spin_unlock(&_minor_lock);
946 BUG_ON(old_md != MINOR_ALLOCED);
948 return md;
950 bad4:
951 mempool_destroy(md->tio_pool);
952 bad3:
953 mempool_destroy(md->io_pool);
954 bad2:
955 blk_cleanup_queue(md->queue);
956 free_minor(minor);
957 bad1:
958 module_put(THIS_MODULE);
959 bad0:
960 kfree(md);
961 return NULL;
962 }
964 static void free_dev(struct mapped_device *md)
965 {
966 int minor = md->disk->first_minor;
968 if (md->suspended_bdev) {
969 thaw_bdev(md->suspended_bdev, NULL);
970 bdput(md->suspended_bdev);
971 }
972 mempool_destroy(md->tio_pool);
973 mempool_destroy(md->io_pool);
974 del_gendisk(md->disk);
975 free_minor(minor);
977 spin_lock(&_minor_lock);
978 md->disk->private_data = NULL;
979 spin_unlock(&_minor_lock);
981 put_disk(md->disk);
982 blk_cleanup_queue(md->queue);
983 module_put(THIS_MODULE);
984 kfree(md);
985 }
987 /*
988 * Bind a table to the device.
989 */
990 static void event_callback(void *context)
991 {
992 struct mapped_device *md = (struct mapped_device *) context;
994 atomic_inc(&md->event_nr);
995 wake_up(&md->eventq);
996 }
998 static void __set_size(struct mapped_device *md, sector_t size)
999 {
1000 set_capacity(md->disk, size);
1002 mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
1003 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1004 mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
1005 }
1007 static int __bind(struct mapped_device *md, struct dm_table *t)
1008 {
1009 request_queue_t *q = md->queue;
1010 sector_t size;
1012 size = dm_table_get_size(t);
1014 /*
1015 * Wipe any geometry if the size of the table changed.
1016 */
1017 if (size != get_capacity(md->disk))
1018 memset(&md->geometry, 0, sizeof(md->geometry));
1020 __set_size(md, size);
1021 if (size == 0)
1022 return 0;
1024 dm_table_get(t);
1025 dm_table_event_callback(t, event_callback, md);
1027 write_lock(&md->map_lock);
1028 md->map = t;
1029 dm_table_set_restrictions(t, q);
1030 write_unlock(&md->map_lock);
1032 return 0;
1033 }
1035 static void __unbind(struct mapped_device *md)
1036 {
1037 struct dm_table *map = md->map;
1039 if (!map)
1040 return;
1042 dm_table_event_callback(map, NULL, NULL);
1043 write_lock(&md->map_lock);
1044 md->map = NULL;
1045 write_unlock(&md->map_lock);
1046 dm_table_put(map);
1047 }
1049 /*
1050 * Constructor for a new device.
1051 */
1052 int dm_create(int minor, struct mapped_device **result)
1053 {
1054 struct mapped_device *md;
1056 md = alloc_dev(minor);
1057 if (!md)
1058 return -ENXIO;
1060 *result = md;
1061 return 0;
1062 }
1064 static struct mapped_device *dm_find_md(dev_t dev)
1065 {
1066 struct mapped_device *md;
1067 unsigned minor = MINOR(dev);
1069 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1070 return NULL;
1072 spin_lock(&_minor_lock);
1074 md = idr_find(&_minor_idr, minor);
1075 if (md && (md == MINOR_ALLOCED ||
1076 (dm_disk(md)->first_minor != minor) ||
1077 test_bit(DMF_FREEING, &md->flags))) {
1078 md = NULL;
1079 goto out;
1080 }
1082 out:
1083 spin_unlock(&_minor_lock);
1085 return md;
1086 }
1088 struct mapped_device *dm_get_md(dev_t dev)
1089 {
1090 struct mapped_device *md = dm_find_md(dev);
1092 if (md)
1093 dm_get(md);
1095 return md;
1096 }
1098 void *dm_get_mdptr(struct mapped_device *md)
1099 {
1100 return md->interface_ptr;
1101 }
1103 void dm_set_mdptr(struct mapped_device *md, void *ptr)
1104 {
1105 md->interface_ptr = ptr;
1106 }
1108 void dm_get(struct mapped_device *md)
1109 {
1110 atomic_inc(&md->holders);
1111 }
1113 const char *dm_device_name(struct mapped_device *md)
1114 {
1115 return md->name;
1116 }
1117 EXPORT_SYMBOL_GPL(dm_device_name);
1119 void dm_put(struct mapped_device *md)
1120 {
1121 struct dm_table *map;
1123 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1125 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
1126 map = dm_get_table(md);
1127 idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
1128 set_bit(DMF_FREEING, &md->flags);
1129 spin_unlock(&_minor_lock);
1130 if (!dm_suspended(md)) {
1131 dm_table_presuspend_targets(map);
1132 dm_table_postsuspend_targets(map);
1133 }
1134 __unbind(md);
1135 dm_table_put(map);
1136 free_dev(md);
1137 }
1138 }
1140 /*
1141 * Process the deferred bios
1142 */
1143 static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
1144 {
1145 struct bio *n;
1147 while (c) {
1148 n = c->bi_next;
1149 c->bi_next = NULL;
1150 __split_bio(md, c);
1151 c = n;
1152 }
1153 }
1155 /*
1156 * Swap in a new table (destroying old one).
1157 */
1158 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1159 {
1160 int r = -EINVAL;
1162 down(&md->suspend_lock);
1164 /* device must be suspended */
1165 if (!dm_suspended(md))
1166 goto out;
1168 __unbind(md);
1169 r = __bind(md, table);
1171 out:
1172 up(&md->suspend_lock);
1173 return r;
1174 }
1176 /*
1177 * Functions to lock and unlock any filesystem running on the
1178 * device.
1179 */
1180 static int lock_fs(struct mapped_device *md)
1181 {
1182 int r;
1184 WARN_ON(md->frozen_sb);
1186 md->frozen_sb = freeze_bdev(md->suspended_bdev);
1187 if (IS_ERR(md->frozen_sb)) {
1188 r = PTR_ERR(md->frozen_sb);
1189 md->frozen_sb = NULL;
1190 return r;
1191 }
1193 set_bit(DMF_FROZEN, &md->flags);
1195 /* don't bdput right now, we don't want the bdev
1196 * to go away while it is locked.
1197 */
1198 return 0;
1199 }
1201 static void unlock_fs(struct mapped_device *md)
1202 {
1203 if (!test_bit(DMF_FROZEN, &md->flags))
1204 return;
1206 thaw_bdev(md->suspended_bdev, md->frozen_sb);
1207 md->frozen_sb = NULL;
1208 clear_bit(DMF_FROZEN, &md->flags);
1209 }
1211 /*
1212 * We need to be able to change a mapping table under a mounted
1213 * filesystem. For example we might want to move some data in
1214 * the background. Before the table can be swapped with
1215 * dm_bind_table, dm_suspend must be called to flush any in
1216 * flight bios and ensure that any further io gets deferred.
1217 */
1218 int dm_suspend(struct mapped_device *md, int do_lockfs)
1219 {
1220 struct dm_table *map = NULL;
1221 DECLARE_WAITQUEUE(wait, current);
1222 struct bio *def;
1223 int r = -EINVAL;
1225 down(&md->suspend_lock);
1227 if (dm_suspended(md))
1228 goto out;
1230 map = dm_get_table(md);
1232 /* This does not get reverted if there's an error later. */
1233 dm_table_presuspend_targets(map);
1235 md->suspended_bdev = bdget_disk(md->disk, 0);
1236 if (!md->suspended_bdev) {
1237 DMWARN("bdget failed in dm_suspend");
1238 r = -ENOMEM;
1239 goto out;
1240 }
1242 /* Flush I/O to the device. */
1243 if (do_lockfs) {
1244 r = lock_fs(md);
1245 if (r)
1246 goto out;
1247 }
1249 /*
1250 * First we set the BLOCK_IO flag so no more ios will be mapped.
1251 */
1252 down_write(&md->io_lock);
1253 set_bit(DMF_BLOCK_IO, &md->flags);
1255 add_wait_queue(&md->wait, &wait);
1256 up_write(&md->io_lock);
1258 /* unplug */
1259 if (map)
1260 dm_table_unplug_all(map);
1262 /*
1263 * Then we wait for the already mapped ios to
1264 * complete.
1265 */
1266 while (1) {
1267 set_current_state(TASK_INTERRUPTIBLE);
1269 if (!atomic_read(&md->pending) || signal_pending(current))
1270 break;
1272 io_schedule();
1273 }
1274 set_current_state(TASK_RUNNING);
1276 down_write(&md->io_lock);
1277 remove_wait_queue(&md->wait, &wait);
1279 /* were we interrupted ? */
1280 r = -EINTR;
1281 if (atomic_read(&md->pending)) {
1282 clear_bit(DMF_BLOCK_IO, &md->flags);
1283 def = bio_list_get(&md->deferred);
1284 __flush_deferred_io(md, def);
1285 up_write(&md->io_lock);
1286 unlock_fs(md);
1287 goto out;
1288 }
1289 up_write(&md->io_lock);
1291 dm_table_postsuspend_targets(map);
1293 set_bit(DMF_SUSPENDED, &md->flags);
1295 r = 0;
1297 out:
1298 if (r && md->suspended_bdev) {
1299 bdput(md->suspended_bdev);
1300 md->suspended_bdev = NULL;
1301 }
1303 dm_table_put(map);
1304 up(&md->suspend_lock);
1305 return r;
1306 }
1308 int dm_resume(struct mapped_device *md)
1309 {
1310 int r = -EINVAL;
1311 struct bio *def;
1312 struct dm_table *map = NULL;
1314 down(&md->suspend_lock);
1315 if (!dm_suspended(md))
1316 goto out;
1318 map = dm_get_table(md);
1319 if (!map || !dm_table_get_size(map))
1320 goto out;
1322 dm_table_resume_targets(map);
1324 down_write(&md->io_lock);
1325 clear_bit(DMF_BLOCK_IO, &md->flags);
1327 def = bio_list_get(&md->deferred);
1328 __flush_deferred_io(md, def);
1329 up_write(&md->io_lock);
1331 unlock_fs(md);
1333 bdput(md->suspended_bdev);
1334 md->suspended_bdev = NULL;
1336 clear_bit(DMF_SUSPENDED, &md->flags);
1338 dm_table_unplug_all(map);
1340 r = 0;
1342 out:
1343 dm_table_put(map);
1344 up(&md->suspend_lock);
1346 return r;
1347 }
1349 /*-----------------------------------------------------------------
1350 * Event notification.
1351 *---------------------------------------------------------------*/
1352 uint32_t dm_get_event_nr(struct mapped_device *md)
1353 {
1354 return atomic_read(&md->event_nr);
1355 }
1357 int dm_wait_event(struct mapped_device *md, int event_nr)
1358 {
1359 return wait_event_interruptible(md->eventq,
1360 (event_nr != atomic_read(&md->event_nr)));
1361 }
1363 /*
1364 * The gendisk is only valid as long as you have a reference
1365 * count on 'md'.
1366 */
1367 struct gendisk *dm_disk(struct mapped_device *md)
1368 {
1369 return md->disk;
1370 }
1372 int dm_suspended(struct mapped_device *md)
1373 {
1374 return test_bit(DMF_SUSPENDED, &md->flags);
1375 }
1377 static struct block_device_operations dm_blk_dops = {
1378 .open = dm_blk_open,
1379 .release = dm_blk_close,
1380 .getgeo = dm_blk_getgeo,
1381 .owner = THIS_MODULE
1382 };
1384 EXPORT_SYMBOL(dm_get_mapinfo);
1386 /*
1387 * module hooks
1388 */
1389 module_init(dm_init);
1390 module_exit(dm_exit);
1392 module_param(major, uint, 0);
1393 MODULE_PARM_DESC(major, "The major number of the device mapper");
1394 MODULE_DESCRIPTION(DM_NAME " driver");
1395 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1396 MODULE_LICENSE("GPL");