ia64/linux-2.6.18-xen.hg

drivers/md/dm-snap.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, causing temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to
ask a domain to balloon to more than its allocation, nor would you
expect it to deliberately over-commit memory by setting balloon
targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
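
The retry behaviour described above treats a shortfall in either direction
the same way: keep whatever Xen actually granted and try again from a timer
until the target is reached. The sketch below only illustrates that pattern
and is not the driver's actual code; the helpers it assumes (current_pages,
current_target(), increase_reservation(), decrease_reservation(),
balloon_timer) are placeholders standing in for the real driver's symbols.

#include <linux/timer.h>
#include <linux/jiffies.h>

/* Assumed to be provided elsewhere by the driver (placeholder names). */
extern unsigned long current_pages;		/* pages currently held  */
extern unsigned long current_target(void);	/* pages we want to hold */
extern int increase_reservation(unsigned long nr_pages);
extern int decrease_reservation(unsigned long nr_pages);
extern struct timer_list balloon_timer;

static void balloon_process(void *unused)
{
	long credit = (long)current_target() - (long)current_pages;

	if (credit > 0)
		/* Ask for the full shortfall; keep any pages Xen does grant. */
		increase_reservation(credit);
	else if (credit < 0)
		decrease_reservation(-credit);

	/*
	 * Still short of the target, e.g. because of transient memory
	 * pressure in the host while other guests balloon down?  Retry
	 * on a timer instead of recording a "hard limit" and giving up.
	 */
	if (current_target() != current_pages)
		mod_timer(&balloon_timer, jiffies + HZ);
}
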
/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/device-mapper.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "dm-snap.h"
#include "dm-bio-list.h"
#include "kcopyd.h"

#define DM_MSG_PREFIX "snapshots"

/*
 * The percentage increment we will wake up users at
 */
#define WAKE_UP_PERCENT 5

/*
 * kcopyd priority of snapshot operations
 */
#define SNAPSHOT_COPY_PRIORITY 2

/*
 * Each snapshot reserves this many pages for io
 */
#define SNAPSHOT_PAGES 256

struct pending_exception {
	struct exception e;

	/*
	 * Origin buffers waiting for this to complete are held
	 * in a bio list
	 */
	struct bio_list origin_bios;
	struct bio_list snapshot_bios;

	/*
	 * Short-term queue of pending exceptions prior to submission.
	 */
	struct list_head list;

	/*
	 * The primary pending_exception is the one that holds
	 * the sibling_count and the list of origin_bios for a
	 * group of pending_exceptions.  It is always last to get freed.
	 * These fields get set up when writing to the origin.
	 */
	struct pending_exception *primary_pe;

	/*
	 * Number of pending_exceptions processing this chunk.
	 * When this drops to zero we must complete the origin bios.
	 * If incrementing or decrementing this, hold pe->snap->lock for
	 * the sibling concerned and not pe->primary_pe->snap->lock unless
	 * they are the same.
	 */
	atomic_t sibling_count;

	/* Pointer back to snapshot context */
	struct dm_snapshot *snap;

	/*
	 * 1 indicates the exception has already been sent to
	 * kcopyd.
	 */
	int started;
};

/*
 * Hash table mapping origin volumes to lists of snapshots and
 * a lock to protect it
 */
static kmem_cache_t *exception_cache;
static kmem_cache_t *pending_cache;
static mempool_t *pending_pool;

/*
 * One of these per registered origin, held in the snapshot_origins hash
 */
struct origin {
	/* The origin device */
	struct block_device *bdev;

	struct list_head hash_list;

	/* List of snapshots for this origin */
	struct list_head snapshots;
};

/*
 * Size of the hash table for origin volumes. If we make this
 * the size of the minors list then it should be nearly perfect
 */
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK      0xFF
static struct list_head *_origins;
static struct rw_semaphore _origins_lock;

static int init_origin_hash(void)
{
	int i;

	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
			   GFP_KERNEL);
	if (!_origins) {
		DMERR("unable to allocate memory");
		return -ENOMEM;
	}

	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
		INIT_LIST_HEAD(_origins + i);
	init_rwsem(&_origins_lock);

	return 0;
}

static void exit_origin_hash(void)
{
	kfree(_origins);
}

static inline unsigned int origin_hash(struct block_device *bdev)
{
	return bdev->bd_dev & ORIGIN_MASK;
}

static struct origin *__lookup_origin(struct block_device *origin)
{
	struct list_head *ol;
	struct origin *o;

	ol = &_origins[origin_hash(origin)];
	list_for_each_entry (o, ol, hash_list)
		if (bdev_equal(o->bdev, origin))
			return o;

	return NULL;
}

static void __insert_origin(struct origin *o)
{
	struct list_head *sl = &_origins[origin_hash(o->bdev)];
	list_add_tail(&o->hash_list, sl);
}

/*
 * Make a note of the snapshot and its origin so we can look it
 * up when the origin has a write on it.
 */
static int register_snapshot(struct dm_snapshot *snap)
{
	struct origin *o;
	struct block_device *bdev = snap->origin->bdev;

	down_write(&_origins_lock);
	o = __lookup_origin(bdev);

	if (!o) {
		/* New origin */
		o = kmalloc(sizeof(*o), GFP_KERNEL);
		if (!o) {
			up_write(&_origins_lock);
			return -ENOMEM;
		}

		/* Initialise the struct */
		INIT_LIST_HEAD(&o->snapshots);
		o->bdev = bdev;

		__insert_origin(o);
	}

	list_add_tail(&snap->list, &o->snapshots);

	up_write(&_origins_lock);
	return 0;
}

static void unregister_snapshot(struct dm_snapshot *s)
{
	struct origin *o;

	down_write(&_origins_lock);
	o = __lookup_origin(s->origin->bdev);

	list_del(&s->list);
	if (list_empty(&o->snapshots)) {
		list_del(&o->hash_list);
		kfree(o);
	}

	up_write(&_origins_lock);
}

/*
 * Implementation of the exception hash tables.
 */
static int init_exception_table(struct exception_table *et, uint32_t size)
{
	unsigned int i;

	et->hash_mask = size - 1;
	et->table = dm_vcalloc(size, sizeof(struct list_head));
	if (!et->table)
		return -ENOMEM;

	for (i = 0; i < size; i++)
		INIT_LIST_HEAD(et->table + i);

	return 0;
}

static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
{
	struct list_head *slot;
	struct exception *ex, *next;
	int i, size;

	size = et->hash_mask + 1;
	for (i = 0; i < size; i++) {
		slot = et->table + i;

		list_for_each_entry_safe (ex, next, slot, hash_list)
			kmem_cache_free(mem, ex);
	}

	vfree(et->table);
}

static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
{
	return chunk & et->hash_mask;
}

static void insert_exception(struct exception_table *eh, struct exception *e)
{
	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
	list_add(&e->hash_list, l);
}

static inline void remove_exception(struct exception *e)
{
	list_del(&e->hash_list);
}

/*
 * Return the exception data for a sector, or NULL if not
 * remapped.
 */
static struct exception *lookup_exception(struct exception_table *et,
					  chunk_t chunk)
{
	struct list_head *slot;
	struct exception *e;

	slot = &et->table[exception_hash(et, chunk)];
	list_for_each_entry (e, slot, hash_list)
		if (e->old_chunk == chunk)
			return e;

	return NULL;
}

static inline struct exception *alloc_exception(void)
{
	struct exception *e;

	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
	if (!e)
		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

	return e;
}

static inline void free_exception(struct exception *e)
{
	kmem_cache_free(exception_cache, e);
}

static inline struct pending_exception *alloc_pending_exception(void)
{
	return mempool_alloc(pending_pool, GFP_NOIO);
}

static inline void free_pending_exception(struct pending_exception *pe)
{
	mempool_free(pe, pending_pool);
}

int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
{
	struct exception *e;

	e = alloc_exception();
	if (!e)
		return -ENOMEM;

	e->old_chunk = old;
	e->new_chunk = new;
	insert_exception(&s->complete, e);
	return 0;
}

/*
 * Hard coded magic.
 */
static int calc_max_buckets(void)
{
	/* use a fixed size of 2MB */
	unsigned long mem = 2 * 1024 * 1024;
	mem /= sizeof(struct list_head);

	return mem;
}

/*
 * Rounds a number down to a power of 2.
 */
static inline uint32_t round_down(uint32_t n)
{
	while (n & (n - 1))
		n &= (n - 1);
	return n;
}
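
/*
 * Example: round_down(1000) clears the lowest set bit on each pass,
 * 1000 -> 992 -> 960 -> 896 -> 768 -> 512, leaving the largest power
 * of 2 that does not exceed the original value.
 */
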
/*
 * Allocate room for a suitable hash table.
 */
static int init_hash_tables(struct dm_snapshot *s)
{
	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;

	/*
	 * Calculate based on the size of the original volume or
	 * the COW volume...
	 */
	cow_dev_size = get_dev_size(s->cow->bdev);
	origin_dev_size = get_dev_size(s->origin->bdev);
	max_buckets = calc_max_buckets();

	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
	hash_size = min(hash_size, max_buckets);

	/* Round it down to a power of 2 */
	hash_size = round_down(hash_size);
	if (init_exception_table(&s->complete, hash_size))
		return -ENOMEM;

	/*
	 * Allocate hash table for in-flight exceptions
	 * Make this smaller than the real hash table
	 */
	hash_size >>= 3;
	if (hash_size < 64)
		hash_size = 64;

	if (init_exception_table(&s->pending, hash_size)) {
		exit_exception_table(&s->complete, exception_cache);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Round a number up to the nearest 'size' boundary.  size must
 * be a power of 2.
 */
static inline ulong round_up(ulong n, ulong size)
{
	size--;
	return (n + size) & ~size;
}

static void read_snapshot_metadata(struct dm_snapshot *s)
{
	if (s->store.read_metadata(&s->store)) {
		down_write(&s->lock);
		s->valid = 0;
		up_write(&s->lock);

		dm_table_event(s->table);
	}
}

/*
 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
 */
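
/*
 * For example (device names purely illustrative), a persistent snapshot
 * of /dev/vg/origin backed by /dev/vg/cow with 16-sector (8KiB) chunks
 * would be constructed from the arguments:
 *
 *     /dev/vg/origin /dev/vg/cow P 16
 */
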
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_snapshot *s;
	unsigned long chunk_size;
	int r = -EINVAL;
	char persistent;
	char *origin_path;
	char *cow_path;
	char *value;
	int blocksize;

	if (argc < 4) {
		ti->error = "requires exactly 4 arguments";
		r = -EINVAL;
		goto bad1;
	}

	origin_path = argv[0];
	cow_path = argv[1];
	persistent = toupper(*argv[2]);

	if (persistent != 'P' && persistent != 'N') {
		ti->error = "Persistent flag is not P or N";
		r = -EINVAL;
		goto bad1;
	}

	chunk_size = simple_strtoul(argv[3], &value, 10);
	if (chunk_size == 0 || value == NULL) {
		ti->error = "Invalid chunk size";
		r = -EINVAL;
		goto bad1;
	}

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s == NULL) {
		ti->error = "Cannot allocate snapshot context private "
			    "structure";
		r = -ENOMEM;
		goto bad1;
	}

	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
	if (r) {
		ti->error = "Cannot get origin device";
		goto bad2;
	}

	r = dm_get_device(ti, cow_path, 0, 0,
			  FMODE_READ | FMODE_WRITE, &s->cow);
	if (r) {
		dm_put_device(ti, s->origin);
		ti->error = "Cannot get COW device";
		goto bad2;
	}

	/*
	 * Chunk size must be multiple of page size.  Silently
	 * round up if it's not.
	 */
	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);

	/* Validate the chunk size against the device block size */
	blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
	if (chunk_size % (blocksize >> 9)) {
		ti->error = "Chunk size is not a multiple of device blocksize";
		r = -EINVAL;
		goto bad3;
	}

	/* Check chunk_size is a power of 2 */
	if (chunk_size & (chunk_size - 1)) {
		ti->error = "Chunk size is not a power of 2";
		r = -EINVAL;
		goto bad3;
	}

	s->chunk_size = chunk_size;
	s->chunk_mask = chunk_size - 1;
	s->type = persistent;
	s->chunk_shift = ffs(chunk_size) - 1;

	s->valid = 1;
	s->active = 0;
	s->last_percent = 0;
	init_rwsem(&s->lock);
	s->table = ti->table;

	/* Allocate hash table for COW data */
	if (init_hash_tables(s)) {
		ti->error = "Unable to allocate hash table space";
		r = -ENOMEM;
		goto bad3;
	}

	/*
	 * Check the persistent flag - done here because we need the iobuf
	 * to check the LV header
	 */
	s->store.snap = s;

	if (persistent == 'P')
		r = dm_create_persistent(&s->store, chunk_size);
	else
		r = dm_create_transient(&s->store, s, blocksize);

	if (r) {
		ti->error = "Couldn't create exception store";
		r = -EINVAL;
		goto bad4;
	}

	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
	if (r) {
		ti->error = "Could not create kcopyd client";
		goto bad5;
	}

	/* Metadata must only be loaded into one table at once */
	read_snapshot_metadata(s);

	/* Add snapshot to the list of snapshots for this origin */
	/* Exceptions aren't triggered till snapshot_resume() is called */
	if (register_snapshot(s)) {
		r = -EINVAL;
		ti->error = "Cannot register snapshot origin";
		goto bad6;
	}

	ti->private = s;
	ti->split_io = s->chunk_size;

	return 0;

 bad6:
	kcopyd_client_destroy(s->kcopyd_client);

 bad5:
	s->store.destroy(&s->store);

 bad4:
	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

 bad3:
	dm_put_device(ti, s->cow);
	dm_put_device(ti, s->origin);

 bad2:
	kfree(s);

 bad1:
	return r;
}

static void snapshot_dtr(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	/* Prevent further origin writes from using this snapshot. */
	/* After this returns there can be no new kcopyd jobs. */
	unregister_snapshot(s);

	kcopyd_client_destroy(s->kcopyd_client);

	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

	/* Deallocate memory used */
	s->store.destroy(&s->store);

	dm_put_device(ti, s->origin);
	dm_put_device(ti, s->cow);

	kfree(s);
}

/*
 * Flush a list of buffers.
 */
static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		generic_make_request(bio);
		bio = n;
	}
}

/*
 * Error a list of buffers.
 */
static void error_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		bio_io_error(bio, bio->bi_size);
		bio = n;
	}
}

static inline void error_snapshot_bios(struct pending_exception *pe)
{
	error_bios(bio_list_get(&pe->snapshot_bios));
}

static struct bio *__flush_bios(struct pending_exception *pe)
{
	/*
	 * If this pe is involved in a write to the origin and
	 * it is the last sibling to complete then release
	 * the bios for the original write to the origin.
	 */

	if (pe->primary_pe &&
	    atomic_dec_and_test(&pe->primary_pe->sibling_count))
		return bio_list_get(&pe->primary_pe->origin_bios);

	return NULL;
}

static void __invalidate_snapshot(struct dm_snapshot *s,
				  struct pending_exception *pe, int err)
{
	if (!s->valid)
		return;

	if (err == -EIO)
		DMERR("Invalidating snapshot: Error reading/writing.");
	else if (err == -ENOMEM)
		DMERR("Invalidating snapshot: Unable to allocate exception.");

	if (pe)
		remove_exception(&pe->e);

	if (s->store.drop_snapshot)
		s->store.drop_snapshot(&s->store);

	s->valid = 0;

	dm_table_event(s->table);
}

static void pending_complete(struct pending_exception *pe, int success)
{
	struct exception *e;
	struct pending_exception *primary_pe;
	struct dm_snapshot *s = pe->snap;
	struct bio *flush = NULL;

	if (!success) {
		/* Read/write error - snapshot is unusable */
		down_write(&s->lock);
		__invalidate_snapshot(s, pe, -EIO);
		flush = __flush_bios(pe);
		up_write(&s->lock);

		error_snapshot_bios(pe);
		goto out;
	}

	e = alloc_exception();
	if (!e) {
		down_write(&s->lock);
		__invalidate_snapshot(s, pe, -ENOMEM);
		flush = __flush_bios(pe);
		up_write(&s->lock);

		error_snapshot_bios(pe);
		goto out;
	}
	*e = pe->e;

	/*
	 * Add a proper exception, and remove the
	 * in-flight exception from the list.
	 */
	down_write(&s->lock);
	if (!s->valid) {
		flush = __flush_bios(pe);
		up_write(&s->lock);

		free_exception(e);

		remove_exception(&pe->e);
		error_snapshot_bios(pe);
		goto out;
	}

	insert_exception(&s->complete, e);
	remove_exception(&pe->e);
	flush = __flush_bios(pe);

	up_write(&s->lock);

	/* Submit any pending write bios */
	flush_bios(bio_list_get(&pe->snapshot_bios));

 out:
	primary_pe = pe->primary_pe;

	/*
	 * Free the pe if it's not linked to an origin write or if
	 * it's not itself a primary pe.
	 */
	if (!primary_pe || primary_pe != pe)
		free_pending_exception(pe);

	/*
	 * Free the primary pe if nothing references it.
	 */
	if (primary_pe && !atomic_read(&primary_pe->sibling_count))
		free_pending_exception(primary_pe);

	if (flush)
		flush_bios(flush);
}

static void commit_callback(void *context, int success)
{
	struct pending_exception *pe = (struct pending_exception *) context;
	pending_complete(pe, success);
}

/*
 * Called when the copy I/O has finished.  kcopyd actually runs
 * this code so don't block.
 */
static void copy_callback(int read_err, unsigned int write_err, void *context)
{
	struct pending_exception *pe = (struct pending_exception *) context;
	struct dm_snapshot *s = pe->snap;

	if (read_err || write_err)
		pending_complete(pe, 0);

	else
		/* Update the metadata if we are persistent */
		s->store.commit_exception(&s->store, &pe->e, commit_callback,
					  pe);
}

/*
 * Dispatches the copy operation to kcopyd.
 */
static void start_copy(struct pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;
	struct io_region src, dest;
	struct block_device *bdev = s->origin->bdev;
	sector_t dev_size;

	dev_size = get_dev_size(bdev);

	src.bdev = bdev;
	src.sector = chunk_to_sector(s, pe->e.old_chunk);
	src.count = min(s->chunk_size, dev_size - src.sector);

	dest.bdev = s->cow->bdev;
	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
	dest.count = src.count;

	/* Hand over to kcopyd */
	kcopyd_copy(s->kcopyd_client,
		    &src, 1, &dest, 0, copy_callback, pe);
}

/*
 * Looks to see if this snapshot already has a pending exception
 * for this chunk, otherwise it allocates a new one and inserts
 * it into the pending table.
 *
 * NOTE: a write lock must be held on snap->lock before calling
 * this.
 */
static struct pending_exception *
__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
{
	struct exception *e;
	struct pending_exception *pe;
	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);

	/*
	 * Is there a pending exception for this already ?
	 */
	e = lookup_exception(&s->pending, chunk);
	if (e) {
		/* cast the exception to a pending exception */
		pe = container_of(e, struct pending_exception, e);
		goto out;
	}

	/*
	 * Create a new pending exception, we don't want
	 * to hold the lock while we do this.
	 */
	up_write(&s->lock);
	pe = alloc_pending_exception();
	down_write(&s->lock);

	if (!s->valid) {
		free_pending_exception(pe);
		return NULL;
	}

	e = lookup_exception(&s->pending, chunk);
	if (e) {
		free_pending_exception(pe);
		pe = container_of(e, struct pending_exception, e);
		goto out;
	}

	pe->e.old_chunk = chunk;
	bio_list_init(&pe->origin_bios);
	bio_list_init(&pe->snapshot_bios);
	pe->primary_pe = NULL;
	atomic_set(&pe->sibling_count, 1);
	pe->snap = s;
	pe->started = 0;

	if (s->store.prepare_exception(&s->store, &pe->e)) {
		free_pending_exception(pe);
		return NULL;
	}

	insert_exception(&s->pending, &pe->e);

 out:
	return pe;
}

static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
				   struct bio *bio)
{
	bio->bi_bdev = s->cow->bdev;
	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
		(bio->bi_sector & s->chunk_mask);
}
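
/*
 * e.g. with a 16-sector chunk size (chunk_mask == 15), a bio for origin
 * sector 35 falls in chunk 2 at offset 3, so remap_exception() redirects
 * it to the COW device at chunk_to_sector(s, e->new_chunk) + 3.
 */
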
static int snapshot_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct exception *e;
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
	int copy_needed = 0;
	int r = 1;
	chunk_t chunk;
	struct pending_exception *pe = NULL;

	chunk = sector_to_chunk(s, bio->bi_sector);

	/* Full snapshots are not usable */
	/* To get here the table must be live so s->active is always set. */
	if (!s->valid)
		return -EIO;

	if (unlikely(bio_barrier(bio)))
		return -EOPNOTSUPP;

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writeable.
	 */
	if (bio_rw(bio) == WRITE) {

		/* FIXME: should only take write lock if we need
		 * to copy an exception */
		down_write(&s->lock);

		if (!s->valid) {
			r = -EIO;
			goto out_unlock;
		}

		/* If the block is already remapped - use that, else remap it */
		e = lookup_exception(&s->complete, chunk);
		if (e) {
			remap_exception(s, e, bio);
			goto out_unlock;
		}

		pe = __find_pending_exception(s, bio);
		if (!pe) {
			__invalidate_snapshot(s, pe, -ENOMEM);
			r = -EIO;
			goto out_unlock;
		}

		remap_exception(s, &pe->e, bio);
		bio_list_add(&pe->snapshot_bios, bio);

		if (!pe->started) {
			/* this is protected by snap->lock */
			pe->started = 1;
			copy_needed = 1;
		}

		r = 0;

 out_unlock:
		up_write(&s->lock);

		if (copy_needed)
			start_copy(pe);
	} else {
		/*
		 * FIXME: this read path scares me because we
		 * always use the origin when we have a pending
		 * exception. However I can't think of a
		 * situation where this is wrong - ejt.
		 */

		/* Do reads */
		down_read(&s->lock);

		if (!s->valid) {
			up_read(&s->lock);
			return -EIO;
		}

		/* See if it has been remapped */
		e = lookup_exception(&s->complete, chunk);
		if (e)
			remap_exception(s, e, bio);
		else
			bio->bi_bdev = s->origin->bdev;

		up_read(&s->lock);
	}

	return r;
}

static void snapshot_resume(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	down_write(&s->lock);
	s->active = 1;
	up_write(&s->lock);
}

static int snapshot_status(struct dm_target *ti, status_type_t type,
			   char *result, unsigned int maxlen)
{
	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (!snap->valid)
			snprintf(result, maxlen, "Invalid");
		else {
			if (snap->store.fraction_full) {
				sector_t numerator, denominator;
				snap->store.fraction_full(&snap->store,
							  &numerator,
							  &denominator);
				snprintf(result, maxlen, "%llu/%llu",
					 (unsigned long long)numerator,
					 (unsigned long long)denominator);
			}
			else
				snprintf(result, maxlen, "Unknown");
		}
		break;

	case STATUSTYPE_TABLE:
		/*
		 * kdevname returns a static pointer so we need
		 * to make private copies if the output is to
		 * make sense.
		 */
		snprintf(result, maxlen, "%s %s %c %llu",
			 snap->origin->name, snap->cow->name,
			 snap->type,
			 (unsigned long long)snap->chunk_size);
		break;
	}

	return 0;
}
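
/*
 * STATUSTYPE_INFO thus reports "Invalid", "Unknown", or a used/total pair
 * supplied by the exception store (e.g. "21504/204800"); STATUSTYPE_TABLE
 * echoes the origin device, COW device, persistence flag and chunk size.
 */
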
/*-----------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------*/
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
	int r = 1, first = 0;
	struct dm_snapshot *snap;
	struct exception *e;
	struct pending_exception *pe, *next_pe, *primary_pe = NULL;
	chunk_t chunk;
	LIST_HEAD(pe_queue);

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {

		down_write(&snap->lock);

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			goto next_snapshot;

		/* Nothing to do if writing beyond end of snapshot */
		if (bio->bi_sector >= dm_table_get_size(snap->table))
			goto next_snapshot;

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap, bio->bi_sector);

		/*
		 * Check exception table to see if block
		 * is already remapped in this snapshot
		 * and trigger an exception if not.
		 *
		 * sibling_count is initialised to 1 so pending_complete()
		 * won't destroy the primary_pe while we're inside this loop.
		 */
		e = lookup_exception(&snap->complete, chunk);
		if (e)
			goto next_snapshot;

		pe = __find_pending_exception(snap, bio);
		if (!pe) {
			__invalidate_snapshot(snap, pe, -ENOMEM);
			goto next_snapshot;
		}

		if (!primary_pe) {
			/*
			 * Either every pe here has same
			 * primary_pe or none has one yet.
			 */
			if (pe->primary_pe)
				primary_pe = pe->primary_pe;
			else {
				primary_pe = pe;
				first = 1;
			}

			bio_list_add(&primary_pe->origin_bios, bio);

			r = 0;
		}

		if (!pe->primary_pe) {
			atomic_inc(&primary_pe->sibling_count);
			pe->primary_pe = primary_pe;
		}

		if (!pe->started) {
			pe->started = 1;
			list_add_tail(&pe->list, &pe_queue);
		}

 next_snapshot:
		up_write(&snap->lock);
	}

	if (!primary_pe)
		goto out;

	/*
	 * If this is the first time we're processing this chunk and
	 * sibling_count is now 1 it means all the pending exceptions
	 * got completed while we were in the loop above, so it falls to
	 * us here to remove the primary_pe and submit any origin_bios.
	 */

	if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
		flush_bios(bio_list_get(&primary_pe->origin_bios));
		free_pending_exception(primary_pe);
		/* If we got here, pe_queue is necessarily empty. */
		goto out;
	}

	/*
	 * Now that we have a complete pe list we can start the copying.
	 */
	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
		start_copy(pe);

 out:
	return r;
}

/*
 * Called on a write from the origin driver.
 */
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
	struct origin *o;
	int r = 1;

	down_read(&_origins_lock);
	o = __lookup_origin(origin->bdev);
	if (o)
		r = __origin_write(&o->snapshots, bio);
	up_read(&_origins_lock);

	return r;
}

/*
 * Origin: maps a linear range of a device, with hooks for snapshotting.
 */

/*
 * Construct an origin mapping: <dev_path>
 * The context for an origin is merely a 'struct dm_dev *'
 * pointing to the real device.
 */
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct dm_dev *dev;

	if (argc != 1) {
		ti->error = "origin: incorrect number of arguments";
		return -EINVAL;
	}

	r = dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &dev);
	if (r) {
		ti->error = "Cannot get target device";
		return r;
	}

	ti->private = dev;
	return 0;
}

static void origin_dtr(struct dm_target *ti)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	dm_put_device(ti, dev);
}

static int origin_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	bio->bi_bdev = dev->bdev;

	if (unlikely(bio_barrier(bio)))
		return -EOPNOTSUPP;

	/* Only tell snapshots if this is a write */
	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
}

#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))

/*
 * Set the target "split_io" field to the minimum of all the snapshots'
 * chunk sizes.
 */
static void origin_resume(struct dm_target *ti)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	struct dm_snapshot *snap;
	struct origin *o;
	chunk_t chunk_size = 0;

	down_read(&_origins_lock);
	o = __lookup_origin(dev->bdev);
	if (o)
		list_for_each_entry (snap, &o->snapshots, list)
			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
	up_read(&_origins_lock);

	ti->split_io = chunk_size;
}

static int origin_status(struct dm_target *ti, status_type_t type, char *result,
			 unsigned int maxlen)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s", dev->name);
		break;
	}

	return 0;
}

static struct target_type origin_target = {
	.name    = "snapshot-origin",
	.version = {1, 4, 0},
	.module  = THIS_MODULE,
	.ctr     = origin_ctr,
	.dtr     = origin_dtr,
	.map     = origin_map,
	.resume  = origin_resume,
	.status  = origin_status,
};

static struct target_type snapshot_target = {
	.name    = "snapshot",
	.version = {1, 4, 0},
	.module  = THIS_MODULE,
	.ctr     = snapshot_ctr,
	.dtr     = snapshot_dtr,
	.map     = snapshot_map,
	.resume  = snapshot_resume,
	.status  = snapshot_status,
};

static int __init dm_snapshot_init(void)
{
	int r;

	r = dm_register_target(&snapshot_target);
	if (r) {
		DMERR("snapshot target register failed %d", r);
		return r;
	}

	r = dm_register_target(&origin_target);
	if (r < 0) {
		DMERR("Origin target register failed %d", r);
		goto bad1;
	}

	r = init_origin_hash();
	if (r) {
		DMERR("init_origin_hash failed.");
		goto bad2;
	}

	exception_cache = kmem_cache_create("dm-snapshot-ex",
					    sizeof(struct exception),
					    __alignof__(struct exception),
					    0, NULL, NULL);
	if (!exception_cache) {
		DMERR("Couldn't create exception cache.");
		r = -ENOMEM;
		goto bad3;
	}

	pending_cache =
	    kmem_cache_create("dm-snapshot-in",
			      sizeof(struct pending_exception),
			      __alignof__(struct pending_exception),
			      0, NULL, NULL);
	if (!pending_cache) {
		DMERR("Couldn't create pending cache.");
		r = -ENOMEM;
		goto bad4;
	}

	pending_pool = mempool_create_slab_pool(128, pending_cache);
	if (!pending_pool) {
		DMERR("Couldn't create pending pool.");
		r = -ENOMEM;
		goto bad5;
	}

	return 0;

 bad5:
	kmem_cache_destroy(pending_cache);
 bad4:
	kmem_cache_destroy(exception_cache);
 bad3:
	exit_origin_hash();
 bad2:
	dm_unregister_target(&origin_target);
 bad1:
	dm_unregister_target(&snapshot_target);
	return r;
}

static void __exit dm_snapshot_exit(void)
{
	int r;

	r = dm_unregister_target(&snapshot_target);
	if (r)
		DMERR("snapshot unregister failed %d", r);

	r = dm_unregister_target(&origin_target);
	if (r)
		DMERR("origin unregister failed %d", r);

	exit_origin_hash();
	mempool_destroy(pending_pool);
	kmem_cache_destroy(pending_cache);
	kmem_cache_destroy(exception_cache);
}

/* Module hooks */
module_init(dm_snapshot_init);
module_exit(dm_snapshot_exit);

MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");