ia64/linux-2.6.18-xen.hg

view drivers/md/dm-raid1.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the
limit it reached as the "hard limit". The driver will not try again
until the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, so there is temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to
ask a domain to balloon to more than its allocation, nor would you
expect it to deliberately over-commit memory by setting balloon
targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.
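
For illustration only, a toy user-space model of the behaviour
described above (the names below are invented for the example and are
not the actual balloon driver code): the change amounts to keeping
whatever pages were granted and retrying later, instead of latching a
hard limit.

/* Toy model of "keep partial grants and retry on a timer" versus the
 * old "record a hard limit and give up".  Not the balloon driver. */
#include <stdio.h>

static long host_free = 300;	/* pages the host can spare right now */

/* Stand-in for the reservation-increase hypercall: it may grant
 * fewer pages than requested when the host is under memory pressure. */
static long fake_increase_reservation(long want)
{
	long granted = want < host_free ? want : host_free;

	host_free -= granted;
	return granted;
}

int main(void)
{
	long current_pages = 1000, target = 1500;
	int tick;

	for (tick = 0; current_pages < target && tick < 10; tick++) {
		long got = fake_increase_reservation(target - current_pages);

		current_pages += got;	/* keep the partial grant */
		printf("tick %d: granted %ld, now %ld/%ld\n",
		       tick, got, current_pages, target);

		/* Other guests balloon down over time, freeing host
		 * memory; the real driver would simply retry from its
		 * timer at this point. */
		host_free += 200;
	}
	return 0;
}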

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
line source
1 /*
2 * Copyright (C) 2003 Sistina Software Limited.
3 *
4 * This file is released under the GPL.
5 */
7 #include "dm.h"
8 #include "dm-bio-list.h"
9 #include "dm-io.h"
10 #include "dm-log.h"
11 #include "kcopyd.h"
13 #include <linux/ctype.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/pagemap.h>
18 #include <linux/slab.h>
19 #include <linux/time.h>
20 #include <linux/vmalloc.h>
21 #include <linux/workqueue.h>
23 #define DM_MSG_PREFIX "raid1"
25 static struct workqueue_struct *_kmirrord_wq;
26 static struct work_struct _kmirrord_work;
28 static inline void wake(void)
29 {
30 queue_work(_kmirrord_wq, &_kmirrord_work);
31 }
33 /*-----------------------------------------------------------------
34 * Region hash
35 *
36 * The mirror splits itself up into discrete regions. Each
37 * region can be in one of three states: clean, dirty,
38 * nosync. There is no need to put clean regions in the hash.
39 *
40 * In addition to being present in the hash table a region _may_
41 * be present on one of three lists.
42 *
43 * clean_regions: Regions on this list have no io pending to
44 * them, they are in sync, we are no longer interested in them,
45 * they are dull. rh_update_states() will remove them from the
46 * hash table.
47 *
48 * quiesced_regions: These regions have been spun down, ready
49 * for recovery. rh_recovery_start() will remove regions from
50 * this list and hand them to kmirrord, which will schedule the
51 * recovery io with kcopyd.
52 *
53 * recovered_regions: Regions that kcopyd has successfully
54 * recovered. rh_update_states() will now schedule any delayed
55 * io, up the recovery_count, and remove the region from the
56 * hash.
57 *
58 * There are 2 locks:
59 * A rw spin lock 'hash_lock' protects just the hash table,
60 * this is never held in write mode from interrupt context,
61 * which I believe means that we only have to disable irqs when
62 * doing a write lock.
63 *
64 * An ordinary spin lock 'region_lock' that protects the three
65 * lists in the region_hash, with the 'state', 'list' and
66 * 'bhs_delayed' fields of the regions. This is used from irq
67 * context, so all other uses will have to suspend local irqs.
68 *---------------------------------------------------------------*/
69 struct mirror_set;
70 struct region_hash {
71 struct mirror_set *ms;
72 uint32_t region_size;
73 unsigned region_shift;
75 /* holds persistent region state */
76 struct dirty_log *log;
78 /* hash table */
79 rwlock_t hash_lock;
80 mempool_t *region_pool;
81 unsigned int mask;
82 unsigned int nr_buckets;
83 struct list_head *buckets;
85 spinlock_t region_lock;
86 struct semaphore recovery_count;
87 struct list_head clean_regions;
88 struct list_head quiesced_regions;
89 struct list_head recovered_regions;
90 };
92 enum {
93 RH_CLEAN,
94 RH_DIRTY,
95 RH_NOSYNC,
96 RH_RECOVERING
97 };
99 struct region {
100 struct region_hash *rh; /* FIXME: can we get rid of this ? */
101 region_t key;
102 int state;
104 struct list_head hash_list;
105 struct list_head list;
107 atomic_t pending;
108 struct bio_list delayed_bios;
109 };
112 /*-----------------------------------------------------------------
113 * Mirror set structures.
114 *---------------------------------------------------------------*/
115 struct mirror {
116 atomic_t error_count;
117 struct dm_dev *dev;
118 sector_t offset;
119 };
121 struct mirror_set {
122 struct dm_target *ti;
123 struct list_head list;
124 struct region_hash rh;
125 struct kcopyd_client *kcopyd_client;
127 spinlock_t lock; /* protects the next two lists */
128 struct bio_list reads;
129 struct bio_list writes;
131 /* recovery */
132 region_t nr_regions;
133 int in_sync;
135 struct mirror *default_mirror; /* Default mirror */
137 unsigned int nr_mirrors;
138 struct mirror mirror[0];
139 };
141 /*
142 * Conversion fns
143 */
144 static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
145 {
146 return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
147 }
149 static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
150 {
151 return region << rh->region_shift;
152 }
154 /* FIXME move this */
155 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
157 #define MIN_REGIONS 64
158 #define MAX_RECOVERY 1
159 static int rh_init(struct region_hash *rh, struct mirror_set *ms,
160 struct dirty_log *log, uint32_t region_size,
161 region_t nr_regions)
162 {
163 unsigned int nr_buckets, max_buckets;
164 size_t i;
166 /*
167 * Calculate a suitable number of buckets for our hash
168 * table.
169 */
170 max_buckets = nr_regions >> 6;
171 for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
172 ;
173 nr_buckets >>= 1;
175 rh->ms = ms;
176 rh->log = log;
177 rh->region_size = region_size;
178 rh->region_shift = ffs(region_size) - 1;
179 rwlock_init(&rh->hash_lock);
180 rh->mask = nr_buckets - 1;
181 rh->nr_buckets = nr_buckets;
183 rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
184 if (!rh->buckets) {
185 DMERR("unable to allocate region hash memory");
186 return -ENOMEM;
187 }
189 for (i = 0; i < nr_buckets; i++)
190 INIT_LIST_HEAD(rh->buckets + i);
192 spin_lock_init(&rh->region_lock);
193 sema_init(&rh->recovery_count, 0);
194 INIT_LIST_HEAD(&rh->clean_regions);
195 INIT_LIST_HEAD(&rh->quiesced_regions);
196 INIT_LIST_HEAD(&rh->recovered_regions);
198 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
199 sizeof(struct region));
200 if (!rh->region_pool) {
201 vfree(rh->buckets);
202 rh->buckets = NULL;
203 return -ENOMEM;
204 }
206 return 0;
207 }
209 static void rh_exit(struct region_hash *rh)
210 {
211 unsigned int h;
212 struct region *reg, *nreg;
214 BUG_ON(!list_empty(&rh->quiesced_regions));
215 for (h = 0; h < rh->nr_buckets; h++) {
216 list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
217 BUG_ON(atomic_read(&reg->pending));
218 mempool_free(reg, rh->region_pool);
219 }
220 }
222 if (rh->log)
223 dm_destroy_dirty_log(rh->log);
224 if (rh->region_pool)
225 mempool_destroy(rh->region_pool);
226 vfree(rh->buckets);
227 }
229 #define RH_HASH_MULT 2654435387U
231 static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
232 {
233 return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
234 }
236 static struct region *__rh_lookup(struct region_hash *rh, region_t region)
237 {
238 struct region *reg;
240 list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
241 if (reg->key == region)
242 return reg;
244 return NULL;
245 }
247 static void __rh_insert(struct region_hash *rh, struct region *reg)
248 {
249 unsigned int h = rh_hash(rh, reg->key);
250 list_add(&reg->hash_list, rh->buckets + h);
251 }
253 static struct region *__rh_alloc(struct region_hash *rh, region_t region)
254 {
255 struct region *reg, *nreg;
257 read_unlock(&rh->hash_lock);
258 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
259 if (unlikely(!nreg))
260 nreg = kmalloc(sizeof(struct region), GFP_NOIO);
261 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
262 RH_CLEAN : RH_NOSYNC;
263 nreg->rh = rh;
264 nreg->key = region;
266 INIT_LIST_HEAD(&nreg->list);
268 atomic_set(&nreg->pending, 0);
269 bio_list_init(&nreg->delayed_bios);
270 write_lock_irq(&rh->hash_lock);
272 reg = __rh_lookup(rh, region);
273 if (reg)
274 /* we lost the race */
275 mempool_free(nreg, rh->region_pool);
277 else {
278 __rh_insert(rh, nreg);
279 if (nreg->state == RH_CLEAN) {
280 spin_lock(&rh->region_lock);
281 list_add(&nreg->list, &rh->clean_regions);
282 spin_unlock(&rh->region_lock);
283 }
284 reg = nreg;
285 }
286 write_unlock_irq(&rh->hash_lock);
287 read_lock(&rh->hash_lock);
289 return reg;
290 }
292 static inline struct region *__rh_find(struct region_hash *rh, region_t region)
293 {
294 struct region *reg;
296 reg = __rh_lookup(rh, region);
297 if (!reg)
298 reg = __rh_alloc(rh, region);
300 return reg;
301 }
303 static int rh_state(struct region_hash *rh, region_t region, int may_block)
304 {
305 int r;
306 struct region *reg;
308 read_lock(&rh->hash_lock);
309 reg = __rh_lookup(rh, region);
310 read_unlock(&rh->hash_lock);
312 if (reg)
313 return reg->state;
315 /*
316 * The region wasn't in the hash, so we fall back to the
317 * dirty log.
318 */
319 r = rh->log->type->in_sync(rh->log, region, may_block);
321 /*
322 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
323 * taken as a RH_NOSYNC
324 */
325 return r == 1 ? RH_CLEAN : RH_NOSYNC;
326 }
328 static inline int rh_in_sync(struct region_hash *rh,
329 region_t region, int may_block)
330 {
331 int state = rh_state(rh, region, may_block);
332 return state == RH_CLEAN || state == RH_DIRTY;
333 }
335 static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
336 {
337 struct bio *bio;
339 while ((bio = bio_list_pop(bio_list))) {
340 queue_bio(ms, bio, WRITE);
341 }
342 }
344 static void rh_update_states(struct region_hash *rh)
345 {
346 struct region *reg, *next;
348 LIST_HEAD(clean);
349 LIST_HEAD(recovered);
351 /*
352 * Quickly grab the lists.
353 */
354 write_lock_irq(&rh->hash_lock);
355 spin_lock(&rh->region_lock);
356 if (!list_empty(&rh->clean_regions)) {
357 list_splice(&rh->clean_regions, &clean);
358 INIT_LIST_HEAD(&rh->clean_regions);
360 list_for_each_entry (reg, &clean, list) {
361 rh->log->type->clear_region(rh->log, reg->key);
362 list_del(&reg->hash_list);
363 }
364 }
366 if (!list_empty(&rh->recovered_regions)) {
367 list_splice(&rh->recovered_regions, &recovered);
368 INIT_LIST_HEAD(&rh->recovered_regions);
370 list_for_each_entry (reg, &recovered, list)
371 list_del(&reg->hash_list);
372 }
373 spin_unlock(&rh->region_lock);
374 write_unlock_irq(&rh->hash_lock);
376 /*
377 * All the regions on the recovered and clean lists have
378 * now been pulled out of the system, so no need to do
379 * any more locking.
380 */
381 list_for_each_entry_safe (reg, next, &recovered, list) {
382 rh->log->type->clear_region(rh->log, reg->key);
383 rh->log->type->complete_resync_work(rh->log, reg->key, 1);
384 dispatch_bios(rh->ms, &reg->delayed_bios);
385 up(&rh->recovery_count);
386 mempool_free(reg, rh->region_pool);
387 }
389 if (!list_empty(&recovered))
390 rh->log->type->flush(rh->log);
392 list_for_each_entry_safe (reg, next, &clean, list)
393 mempool_free(reg, rh->region_pool);
394 }
396 static void rh_inc(struct region_hash *rh, region_t region)
397 {
398 struct region *reg;
400 read_lock(&rh->hash_lock);
401 reg = __rh_find(rh, region);
403 spin_lock_irq(&rh->region_lock);
404 atomic_inc(&reg->pending);
406 if (reg->state == RH_CLEAN) {
407 reg->state = RH_DIRTY;
408 list_del_init(&reg->list); /* take off the clean list */
409 spin_unlock_irq(&rh->region_lock);
411 rh->log->type->mark_region(rh->log, reg->key);
412 } else
413 spin_unlock_irq(&rh->region_lock);
416 read_unlock(&rh->hash_lock);
417 }
419 static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
420 {
421 struct bio *bio;
423 for (bio = bios->head; bio; bio = bio->bi_next)
424 rh_inc(rh, bio_to_region(rh, bio));
425 }
427 static void rh_dec(struct region_hash *rh, region_t region)
428 {
429 unsigned long flags;
430 struct region *reg;
431 int should_wake = 0;
433 read_lock(&rh->hash_lock);
434 reg = __rh_lookup(rh, region);
435 read_unlock(&rh->hash_lock);
437 spin_lock_irqsave(&rh->region_lock, flags);
438 if (atomic_dec_and_test(&reg->pending)) {
439 /*
440 * There is no pending I/O for this region.
441 * We can move the region to corresponding list for next action.
442 * At this point, the region is not yet connected to any list.
443 *
444 * If the state is RH_NOSYNC, the region should be kept off
445 * from clean list.
446 * The hash entry for RH_NOSYNC will remain in memory
447 * until the region is recovered or the map is reloaded.
448 */
450 /* do nothing for RH_NOSYNC */
451 if (reg->state == RH_RECOVERING) {
452 list_add_tail(&reg->list, &rh->quiesced_regions);
453 } else if (reg->state == RH_DIRTY) {
454 reg->state = RH_CLEAN;
455 list_add(&reg->list, &rh->clean_regions);
456 }
457 should_wake = 1;
458 }
459 spin_unlock_irqrestore(&rh->region_lock, flags);
461 if (should_wake)
462 wake();
463 }
465 /*
466 * Starts quiescing a region in preparation for recovery.
467 */
468 static int __rh_recovery_prepare(struct region_hash *rh)
469 {
470 int r;
471 struct region *reg;
472 region_t region;
474 /*
475 * Ask the dirty log what's next.
476 */
477 r = rh->log->type->get_resync_work(rh->log, &region);
478 if (r <= 0)
479 return r;
481 /*
482 * Get this region, and start it quiescing by setting the
483 * recovering flag.
484 */
485 read_lock(&rh->hash_lock);
486 reg = __rh_find(rh, region);
487 read_unlock(&rh->hash_lock);
489 spin_lock_irq(&rh->region_lock);
490 reg->state = RH_RECOVERING;
492 /* Already quiesced ? */
493 if (atomic_read(&reg->pending))
494 list_del_init(&reg->list);
495 else
496 list_move(&reg->list, &rh->quiesced_regions);
498 spin_unlock_irq(&rh->region_lock);
500 return 1;
501 }
503 static void rh_recovery_prepare(struct region_hash *rh)
504 {
505 while (!down_trylock(&rh->recovery_count))
506 if (__rh_recovery_prepare(rh) <= 0) {
507 up(&rh->recovery_count);
508 break;
509 }
510 }
512 /*
513 * Returns any quiesced regions.
514 */
515 static struct region *rh_recovery_start(struct region_hash *rh)
516 {
517 struct region *reg = NULL;
519 spin_lock_irq(&rh->region_lock);
520 if (!list_empty(&rh->quiesced_regions)) {
521 reg = list_entry(rh->quiesced_regions.next,
522 struct region, list);
523 list_del_init(&reg->list); /* remove from the quiesced list */
524 }
525 spin_unlock_irq(&rh->region_lock);
527 return reg;
528 }
530 /* FIXME: success ignored for now */
531 static void rh_recovery_end(struct region *reg, int success)
532 {
533 struct region_hash *rh = reg->rh;
535 spin_lock_irq(&rh->region_lock);
536 list_add(&reg->list, &reg->rh->recovered_regions);
537 spin_unlock_irq(&rh->region_lock);
539 wake();
540 }
542 static void rh_flush(struct region_hash *rh)
543 {
544 rh->log->type->flush(rh->log);
545 }
547 static void rh_delay(struct region_hash *rh, struct bio *bio)
548 {
549 struct region *reg;
551 read_lock(&rh->hash_lock);
552 reg = __rh_find(rh, bio_to_region(rh, bio));
553 bio_list_add(&reg->delayed_bios, bio);
554 read_unlock(&rh->hash_lock);
555 }
557 static void rh_stop_recovery(struct region_hash *rh)
558 {
559 int i;
561 /* wait for any recovering regions */
562 for (i = 0; i < MAX_RECOVERY; i++)
563 down(&rh->recovery_count);
564 }
566 static void rh_start_recovery(struct region_hash *rh)
567 {
568 int i;
570 for (i = 0; i < MAX_RECOVERY; i++)
571 up(&rh->recovery_count);
573 wake();
574 }
576 /*
577 * Every mirror should look like this one.
578 */
579 #define DEFAULT_MIRROR 0
581 /*
582 * This is yucky. We squirrel the mirror_set struct away inside
583 * bi_next for write buffers. This is safe since the bh
584 * doesn't get submitted to the lower levels of block layer.
585 */
586 static struct mirror_set *bio_get_ms(struct bio *bio)
587 {
588 return (struct mirror_set *) bio->bi_next;
589 }
591 static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
592 {
593 bio->bi_next = (struct bio *) ms;
594 }
596 /*-----------------------------------------------------------------
597 * Recovery.
598 *
599 * When a mirror is first activated we may find that some regions
600 * are in the no-sync state. We have to recover these by
601 * recopying from the default mirror to all the others.
602 *---------------------------------------------------------------*/
603 static void recovery_complete(int read_err, unsigned int write_err,
604 void *context)
605 {
606 struct region *reg = (struct region *) context;
608 /* FIXME: better error handling */
609 rh_recovery_end(reg, !(read_err || write_err));
610 }
612 static int recover(struct mirror_set *ms, struct region *reg)
613 {
614 int r;
615 unsigned int i;
616 struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
617 struct mirror *m;
618 unsigned long flags = 0;
620 /* fill in the source */
621 m = ms->default_mirror;
622 from.bdev = m->dev->bdev;
623 from.sector = m->offset + region_to_sector(reg->rh, reg->key);
624 if (reg->key == (ms->nr_regions - 1)) {
625 /*
626 * The final region may be smaller than
627 * region_size.
628 */
629 from.count = ms->ti->len & (reg->rh->region_size - 1);
630 if (!from.count)
631 from.count = reg->rh->region_size;
632 } else
633 from.count = reg->rh->region_size;
635 /* fill in the destinations */
636 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
637 if (&ms->mirror[i] == ms->default_mirror)
638 continue;
640 m = ms->mirror + i;
641 dest->bdev = m->dev->bdev;
642 dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
643 dest->count = from.count;
644 dest++;
645 }
647 /* hand to kcopyd */
648 set_bit(KCOPYD_IGNORE_ERROR, &flags);
649 r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
650 recovery_complete, reg);
652 return r;
653 }
655 static void do_recovery(struct mirror_set *ms)
656 {
657 int r;
658 struct region *reg;
659 struct dirty_log *log = ms->rh.log;
661 /*
662 * Start quiescing some regions.
663 */
664 rh_recovery_prepare(&ms->rh);
666 /*
667 * Copy any already quiesced regions.
668 */
669 while ((reg = rh_recovery_start(&ms->rh))) {
670 r = recover(ms, reg);
671 if (r)
672 rh_recovery_end(reg, 0);
673 }
675 /*
676 * Update the in sync flag.
677 */
678 if (!ms->in_sync &&
679 (log->type->get_sync_count(log) == ms->nr_regions)) {
680 /* the sync is complete */
681 dm_table_event(ms->ti->table);
682 ms->in_sync = 1;
683 }
684 }
686 /*-----------------------------------------------------------------
687 * Reads
688 *---------------------------------------------------------------*/
689 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
690 {
691 /* FIXME: add read balancing */
692 return ms->default_mirror;
693 }
695 /*
696 * remap a buffer to a particular mirror.
697 */
698 static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
699 {
700 bio->bi_bdev = m->dev->bdev;
701 bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
702 }
704 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
705 {
706 region_t region;
707 struct bio *bio;
708 struct mirror *m;
710 while ((bio = bio_list_pop(reads))) {
711 region = bio_to_region(&ms->rh, bio);
713 /*
714 * We can only read balance if the region is in sync.
715 */
716 if (rh_in_sync(&ms->rh, region, 0))
717 m = choose_mirror(ms, bio->bi_sector);
718 else
719 m = ms->default_mirror;
721 map_bio(ms, m, bio);
722 generic_make_request(bio);
723 }
724 }
726 /*-----------------------------------------------------------------
727 * Writes.
728 *
729 * We do different things with the write io depending on the
730 * state of the region that it's in:
731 *
732 * SYNC: increment pending, use kcopyd to write to *all* mirrors
733 * RECOVERING: delay the io until recovery completes
734 * NOSYNC: increment pending, just write to the default mirror
735 *---------------------------------------------------------------*/
736 static void write_callback(unsigned long error, void *context)
737 {
738 unsigned int i;
739 int uptodate = 1;
740 struct bio *bio = (struct bio *) context;
741 struct mirror_set *ms;
743 ms = bio_get_ms(bio);
744 bio_set_ms(bio, NULL);
746 /*
747 * NOTE: We don't decrement the pending count here,
748 * instead it is done by the targets endio function.
749 * This way we handle both writes to SYNC and NOSYNC
750 * regions with the same code.
751 */
753 if (error) {
754 /*
755 * only error the io if all mirrors failed.
756 * FIXME: bogus
757 */
758 uptodate = 0;
759 for (i = 0; i < ms->nr_mirrors; i++)
760 if (!test_bit(i, &error)) {
761 uptodate = 1;
762 break;
763 }
764 }
765 bio_endio(bio, bio->bi_size, 0);
766 }
768 static void do_write(struct mirror_set *ms, struct bio *bio)
769 {
770 unsigned int i;
771 struct io_region io[KCOPYD_MAX_REGIONS+1];
772 struct mirror *m;
774 for (i = 0; i < ms->nr_mirrors; i++) {
775 m = ms->mirror + i;
777 io[i].bdev = m->dev->bdev;
778 io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
779 io[i].count = bio->bi_size >> 9;
780 }
782 bio_set_ms(bio, ms);
783 dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
784 bio->bi_io_vec + bio->bi_idx,
785 write_callback, bio);
786 }
788 static void do_writes(struct mirror_set *ms, struct bio_list *writes)
789 {
790 int state;
791 struct bio *bio;
792 struct bio_list sync, nosync, recover, *this_list = NULL;
794 if (!writes->head)
795 return;
797 /*
798 * Classify each write.
799 */
800 bio_list_init(&sync);
801 bio_list_init(&nosync);
802 bio_list_init(&recover);
804 while ((bio = bio_list_pop(writes))) {
805 state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
806 switch (state) {
807 case RH_CLEAN:
808 case RH_DIRTY:
809 this_list = &sync;
810 break;
812 case RH_NOSYNC:
813 this_list = &nosync;
814 break;
816 case RH_RECOVERING:
817 this_list = &recover;
818 break;
819 }
821 bio_list_add(this_list, bio);
822 }
824 /*
825 * Increment the pending counts for any regions that will
826 * be written to (writes to recover regions are going to
827 * be delayed).
828 */
829 rh_inc_pending(&ms->rh, &sync);
830 rh_inc_pending(&ms->rh, &nosync);
831 rh_flush(&ms->rh);
833 /*
834 * Dispatch io.
835 */
836 while ((bio = bio_list_pop(&sync)))
837 do_write(ms, bio);
839 while ((bio = bio_list_pop(&recover)))
840 rh_delay(&ms->rh, bio);
842 while ((bio = bio_list_pop(&nosync))) {
843 map_bio(ms, ms->default_mirror, bio);
844 generic_make_request(bio);
845 }
846 }
848 /*-----------------------------------------------------------------
849 * kmirrord
850 *---------------------------------------------------------------*/
851 static LIST_HEAD(_mirror_sets);
852 static DECLARE_RWSEM(_mirror_sets_lock);
854 static void do_mirror(struct mirror_set *ms)
855 {
856 struct bio_list reads, writes;
858 spin_lock(&ms->lock);
859 reads = ms->reads;
860 writes = ms->writes;
861 bio_list_init(&ms->reads);
862 bio_list_init(&ms->writes);
863 spin_unlock(&ms->lock);
865 rh_update_states(&ms->rh);
866 do_recovery(ms);
867 do_reads(ms, &reads);
868 do_writes(ms, &writes);
869 }
871 static void do_work(void *ignored)
872 {
873 struct mirror_set *ms;
875 down_read(&_mirror_sets_lock);
876 list_for_each_entry (ms, &_mirror_sets, list)
877 do_mirror(ms);
878 up_read(&_mirror_sets_lock);
879 }
881 /*-----------------------------------------------------------------
882 * Target functions
883 *---------------------------------------------------------------*/
884 static struct mirror_set *alloc_context(unsigned int nr_mirrors,
885 uint32_t region_size,
886 struct dm_target *ti,
887 struct dirty_log *dl)
888 {
889 size_t len;
890 struct mirror_set *ms = NULL;
892 if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
893 return NULL;
895 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
897 ms = kmalloc(len, GFP_KERNEL);
898 if (!ms) {
899 ti->error = "Cannot allocate mirror context";
900 return NULL;
901 }
903 memset(ms, 0, len);
904 spin_lock_init(&ms->lock);
906 ms->ti = ti;
907 ms->nr_mirrors = nr_mirrors;
908 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
909 ms->in_sync = 0;
910 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
912 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
913 ti->error = "Error creating dirty region hash";
914 kfree(ms);
915 return NULL;
916 }
918 return ms;
919 }
921 static void free_context(struct mirror_set *ms, struct dm_target *ti,
922 unsigned int m)
923 {
924 while (m--)
925 dm_put_device(ti, ms->mirror[m].dev);
927 rh_exit(&ms->rh);
928 kfree(ms);
929 }
931 static inline int _check_region_size(struct dm_target *ti, uint32_t size)
932 {
933 return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
934 size > ti->len);
935 }
937 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
938 unsigned int mirror, char **argv)
939 {
940 unsigned long long offset;
942 if (sscanf(argv[1], "%llu", &offset) != 1) {
943 ti->error = "Invalid offset";
944 return -EINVAL;
945 }
947 if (dm_get_device(ti, argv[0], offset, ti->len,
948 dm_table_get_mode(ti->table),
949 &ms->mirror[mirror].dev)) {
950 ti->error = "Device lookup failure";
951 return -ENXIO;
952 }
954 ms->mirror[mirror].offset = offset;
956 return 0;
957 }
959 static int add_mirror_set(struct mirror_set *ms)
960 {
961 down_write(&_mirror_sets_lock);
962 list_add_tail(&ms->list, &_mirror_sets);
963 up_write(&_mirror_sets_lock);
964 wake();
966 return 0;
967 }
969 static void del_mirror_set(struct mirror_set *ms)
970 {
971 down_write(&_mirror_sets_lock);
972 list_del(&ms->list);
973 up_write(&_mirror_sets_lock);
974 }
976 /*
977 * Create dirty log: log_type #log_params <log_params>
978 */
979 static struct dirty_log *create_dirty_log(struct dm_target *ti,
980 unsigned int argc, char **argv,
981 unsigned int *args_used)
982 {
983 unsigned int param_count;
984 struct dirty_log *dl;
986 if (argc < 2) {
987 ti->error = "Insufficient mirror log arguments";
988 return NULL;
989 }
991 if (sscanf(argv[1], "%u", &param_count) != 1) {
992 ti->error = "Invalid mirror log argument count";
993 return NULL;
994 }
996 *args_used = 2 + param_count;
998 if (argc < *args_used) {
999 ti->error = "Insufficient mirror log arguments";
1000 return NULL;
1001 }
1003 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
1004 if (!dl) {
1005 ti->error = "Error creating mirror dirty log";
1006 return NULL;
1007 }
1009 if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
1010 ti->error = "Invalid region size";
1011 dm_destroy_dirty_log(dl);
1012 return NULL;
1013 }
1015 return dl;
1016 }
1018 /*
1019 * Construct a mirror mapping:
1020 *
1021 * log_type #log_params <log_params>
1022 * #mirrors [mirror_path offset]{2,}
1023 *
1024 * log_type is "core" or "disk"
1025 * #log_params is between 1 and 3
1026 */
1027 #define DM_IO_PAGES 64
1028 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1029 {
1030 int r;
1031 unsigned int nr_mirrors, m, args_used;
1032 struct mirror_set *ms;
1033 struct dirty_log *dl;
1035 dl = create_dirty_log(ti, argc, argv, &args_used);
1036 if (!dl)
1037 return -EINVAL;
1039 argv += args_used;
1040 argc -= args_used;
1042 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1043 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
1044 ti->error = "Invalid number of mirrors";
1045 dm_destroy_dirty_log(dl);
1046 return -EINVAL;
1047 }
1049 argv++, argc--;
1051 if (argc != nr_mirrors * 2) {
1052 ti->error = "Wrong number of mirror arguments";
1053 dm_destroy_dirty_log(dl);
1054 return -EINVAL;
1055 }
1057 ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
1058 if (!ms) {
1059 dm_destroy_dirty_log(dl);
1060 return -ENOMEM;
1061 }
1063 /* Get the mirror parameter sets */
1064 for (m = 0; m < nr_mirrors; m++) {
1065 r = get_mirror(ms, ti, m, argv);
1066 if (r) {
1067 free_context(ms, ti, m);
1068 return r;
1069 }
1070 argv += 2;
1071 argc -= 2;
1072 }
1074 ti->private = ms;
1075 ti->split_io = ms->rh.region_size;
1077 r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
1078 if (r) {
1079 free_context(ms, ti, ms->nr_mirrors);
1080 return r;
1081 }
1083 add_mirror_set(ms);
1084 return 0;
1085 }
1087 static void mirror_dtr(struct dm_target *ti)
1088 {
1089 struct mirror_set *ms = (struct mirror_set *) ti->private;
1091 del_mirror_set(ms);
1092 kcopyd_client_destroy(ms->kcopyd_client);
1093 free_context(ms, ti, ms->nr_mirrors);
1094 }
1096 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
1097 {
1098 int should_wake = 0;
1099 struct bio_list *bl;
1101 bl = (rw == WRITE) ? &ms->writes : &ms->reads;
1102 spin_lock(&ms->lock);
1103 should_wake = !(bl->head);
1104 bio_list_add(bl, bio);
1105 spin_unlock(&ms->lock);
1107 if (should_wake)
1108 wake();
1109 }
1111 /*
1112 * Mirror mapping function
1113 */
1114 static int mirror_map(struct dm_target *ti, struct bio *bio,
1115 union map_info *map_context)
1116 {
1117 int r, rw = bio_rw(bio);
1118 struct mirror *m;
1119 struct mirror_set *ms = ti->private;
1121 map_context->ll = bio_to_region(&ms->rh, bio);
1123 if (rw == WRITE) {
1124 queue_bio(ms, bio, rw);
1125 return 0;
1126 }
1128 r = ms->rh.log->type->in_sync(ms->rh.log,
1129 bio_to_region(&ms->rh, bio), 0);
1130 if (r < 0 && r != -EWOULDBLOCK)
1131 return r;
1133 if (r == -EWOULDBLOCK) /* FIXME: ugly */
1134 r = 0;
1136 /*
1137 * We don't want to fast track a recovery just for a read
1138 * ahead. So we just let it silently fail.
1139 * FIXME: get rid of this.
1140 */
1141 if (!r && rw == READA)
1142 return -EIO;
1144 if (!r) {
1145 /* Pass this io over to the daemon */
1146 queue_bio(ms, bio, rw);
1147 return 0;
1148 }
1150 m = choose_mirror(ms, bio->bi_sector);
1151 if (!m)
1152 return -EIO;
1154 map_bio(ms, m, bio);
1155 return 1;
1156 }
1158 static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1159 int error, union map_info *map_context)
1160 {
1161 int rw = bio_rw(bio);
1162 struct mirror_set *ms = (struct mirror_set *) ti->private;
1163 region_t region = map_context->ll;
1165 /*
1166 * We need to dec pending if this was a write.
1167 */
1168 if (rw == WRITE)
1169 rh_dec(&ms->rh, region);
1171 return 0;
1172 }
1174 static void mirror_postsuspend(struct dm_target *ti)
1175 {
1176 struct mirror_set *ms = (struct mirror_set *) ti->private;
1177 struct dirty_log *log = ms->rh.log;
1179 rh_stop_recovery(&ms->rh);
1180 if (log->type->suspend && log->type->suspend(log))
1181 /* FIXME: need better error handling */
1182 DMWARN("log suspend failed");
1183 }
1185 static void mirror_resume(struct dm_target *ti)
1186 {
1187 struct mirror_set *ms = (struct mirror_set *) ti->private;
1188 struct dirty_log *log = ms->rh.log;
1189 if (log->type->resume && log->type->resume(log))
1190 /* FIXME: need better error handling */
1191 DMWARN("log resume failed");
1192 rh_start_recovery(&ms->rh);
1193 }
1195 static int mirror_status(struct dm_target *ti, status_type_t type,
1196 char *result, unsigned int maxlen)
1197 {
1198 unsigned int m, sz;
1199 struct mirror_set *ms = (struct mirror_set *) ti->private;
1201 sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
1203 switch (type) {
1204 case STATUSTYPE_INFO:
1205 DMEMIT("%d ", ms->nr_mirrors);
1206 for (m = 0; m < ms->nr_mirrors; m++)
1207 DMEMIT("%s ", ms->mirror[m].dev->name);
1209 DMEMIT("%llu/%llu",
1210 (unsigned long long)ms->rh.log->type->
1211 get_sync_count(ms->rh.log),
1212 (unsigned long long)ms->nr_regions);
1213 break;
1215 case STATUSTYPE_TABLE:
1216 DMEMIT("%d ", ms->nr_mirrors);
1217 for (m = 0; m < ms->nr_mirrors; m++)
1218 DMEMIT("%s %llu ", ms->mirror[m].dev->name,
1219 (unsigned long long)ms->mirror[m].offset);
1220 }
1222 return 0;
1223 }
1225 static struct target_type mirror_target = {
1226 .name = "mirror",
1227 .version = {1, 0, 2},
1228 .module = THIS_MODULE,
1229 .ctr = mirror_ctr,
1230 .dtr = mirror_dtr,
1231 .map = mirror_map,
1232 .end_io = mirror_end_io,
1233 .postsuspend = mirror_postsuspend,
1234 .resume = mirror_resume,
1235 .status = mirror_status,
1236 };
1238 static int __init dm_mirror_init(void)
1239 {
1240 int r;
1242 r = dm_dirty_log_init();
1243 if (r)
1244 return r;
1246 _kmirrord_wq = create_singlethread_workqueue("kmirrord");
1247 if (!_kmirrord_wq) {
1248 DMERR("couldn't start kmirrord");
1249 dm_dirty_log_exit();
1250 return r;
1251 }
1252 INIT_WORK(&_kmirrord_work, do_work, NULL);
1254 r = dm_register_target(&mirror_target);
1255 if (r < 0) {
1256 DMERR("%s: Failed to register mirror target",
1257 mirror_target.name);
1258 dm_dirty_log_exit();
1259 destroy_workqueue(_kmirrord_wq);
1260 }
1262 return r;
1263 }
1265 static void __exit dm_mirror_exit(void)
1266 {
1267 int r;
1269 r = dm_unregister_target(&mirror_target);
1270 if (r < 0)
1271 DMERR("%s: unregister failed %d", mirror_target.name, r);
1273 destroy_workqueue(_kmirrord_wq);
1274 dm_dirty_log_exit();
1275 }
1277 /* Module hooks */
1278 module_init(dm_mirror_init);
1279 module_exit(dm_mirror_exit);
1281 MODULE_DESCRIPTION(DM_NAME " mirror target");
1282 MODULE_AUTHOR("Joe Thornber");
1283 MODULE_LICENSE("GPL");