ia64/linux-2.6.18-xen.hg

view drivers/md/raid1.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. we receive fewer pages than we asked for), we may as well keep
those pages rather than returning them to Xen.
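
As an illustrative sketch only (not the driver's actual code; the helper
names and their return conventions here are assumptions), the retry-on-timer
behaviour described above amounts to something like:

    /* Hypothetical sketch: retry on a timer instead of latching a "hard limit". */
    static void balloon_process(void)
    {
            long credit = current_target() - current_pages;

            if (credit > 0)
                    increase_reservation(credit);   /* partial success keeps the pages obtained */
            else if (credit < 0)
                    decrease_reservation(-credit);

            /* Not at target yet (e.g. transient host memory pressure)?
             * Re-arm the timer and try again later rather than giving up
             * until a new target is written. */
            if (current_target() != current_pages)
                    mod_timer(&balloon_timer, jiffies + HZ);
    }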

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
1 /*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync:
17 *
18 * - bitmap marked during normal i/o
19 * - bitmap used to skip nondirty blocks during sync
20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code
23 *
24 * This program is free software; you can redistribute it and/or modify
25 * it under the terms of the GNU General Public License as published by
26 * the Free Software Foundation; either version 2, or (at your option)
27 * any later version.
28 *
29 * You should have received a copy of the GNU General Public License
30 * (for example /usr/src/linux/COPYING); if not, write to the Free
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */
34 #include "dm-bio-list.h"
35 #include <linux/raid/raid1.h>
36 #include <linux/raid/bitmap.h>
38 #define DEBUG 0
39 #if DEBUG
40 #define PRINTK(x...) printk(x)
41 #else
42 #define PRINTK(x...)
43 #endif
45 /*
46 * Number of guaranteed r1bios in case of extreme VM load:
47 */
48 #define NR_RAID1_BIOS 256
51 static void unplug_slaves(mddev_t *mddev);
53 static void allow_barrier(conf_t *conf);
54 static void lower_barrier(conf_t *conf);
56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
57 {
58 struct pool_info *pi = data;
59 r1bio_t *r1_bio;
60 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
62 /* allocate a r1bio with room for raid_disks entries in the bios array */
63 r1_bio = kzalloc(size, gfp_flags);
64 if (!r1_bio)
65 unplug_slaves(pi->mddev);
67 return r1_bio;
68 }
70 static void r1bio_pool_free(void *r1_bio, void *data)
71 {
72 kfree(r1_bio);
73 }
75 #define RESYNC_BLOCK_SIZE (64*1024)
76 //#define RESYNC_BLOCK_SIZE PAGE_SIZE
77 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
78 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
79 #define RESYNC_WINDOW (2048*1024)
81 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
82 {
83 struct pool_info *pi = data;
84 struct page *page;
85 r1bio_t *r1_bio;
86 struct bio *bio;
87 int i, j;
89 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
90 if (!r1_bio) {
91 unplug_slaves(pi->mddev);
92 return NULL;
93 }
95 /*
96 * Allocate bios : 1 for reading, n-1 for writing
97 */
98 for (j = pi->raid_disks ; j-- ; ) {
99 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
100 if (!bio)
101 goto out_free_bio;
102 r1_bio->bios[j] = bio;
103 }
104 /*
105 * Allocate RESYNC_PAGES data pages and attach them to
106 * the first bio.
107 * If this is a user-requested check/repair, allocate
108 * RESYNC_PAGES for each bio.
109 */
110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
111 j = pi->raid_disks;
112 else
113 j = 1;
114 while(j--) {
115 bio = r1_bio->bios[j];
116 for (i = 0; i < RESYNC_PAGES; i++) {
117 page = alloc_page(gfp_flags);
118 if (unlikely(!page))
119 goto out_free_pages;
121 bio->bi_io_vec[i].bv_page = page;
122 }
123 }
124 /* If not a user-requested check/repair, copy the page pointers to all bios */
125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126 for (i=0; i<RESYNC_PAGES ; i++)
127 for (j=1; j<pi->raid_disks; j++)
128 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129 r1_bio->bios[0]->bi_io_vec[i].bv_page;
130 }
132 r1_bio->master_bio = NULL;
134 return r1_bio;
136 out_free_pages:
137 for (i=0; i < RESYNC_PAGES ; i++)
138 for (j=0 ; j < pi->raid_disks; j++)
139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140 j = -1;
141 out_free_bio:
142 while ( ++j < pi->raid_disks )
143 bio_put(r1_bio->bios[j]);
144 r1bio_pool_free(r1_bio, data);
145 return NULL;
146 }
148 static void r1buf_pool_free(void *__r1_bio, void *data)
149 {
150 struct pool_info *pi = data;
151 int i,j;
152 r1bio_t *r1bio = __r1_bio;
154 for (i = 0; i < RESYNC_PAGES; i++)
155 for (j = pi->raid_disks; j-- ;) {
156 if (j == 0 ||
157 r1bio->bios[j]->bi_io_vec[i].bv_page !=
158 r1bio->bios[0]->bi_io_vec[i].bv_page)
159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160 }
161 for (i=0 ; i < pi->raid_disks; i++)
162 bio_put(r1bio->bios[i]);
164 r1bio_pool_free(r1bio, data);
165 }
167 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
168 {
169 int i;
171 for (i = 0; i < conf->raid_disks; i++) {
172 struct bio **bio = r1_bio->bios + i;
173 if (*bio && *bio != IO_BLOCKED)
174 bio_put(*bio);
175 *bio = NULL;
176 }
177 }
179 static void free_r1bio(r1bio_t *r1_bio)
180 {
181 conf_t *conf = mddev_to_conf(r1_bio->mddev);
183 /*
184 * Wake up any possible resync thread that waits for the device
185 * to go idle.
186 */
187 allow_barrier(conf);
189 put_all_bios(conf, r1_bio);
190 mempool_free(r1_bio, conf->r1bio_pool);
191 }
193 static void put_buf(r1bio_t *r1_bio)
194 {
195 conf_t *conf = mddev_to_conf(r1_bio->mddev);
196 int i;
198 for (i=0; i<conf->raid_disks; i++) {
199 struct bio *bio = r1_bio->bios[i];
200 if (bio->bi_end_io)
201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202 }
204 mempool_free(r1_bio, conf->r1buf_pool);
206 lower_barrier(conf);
207 }
209 static void reschedule_retry(r1bio_t *r1_bio)
210 {
211 unsigned long flags;
212 mddev_t *mddev = r1_bio->mddev;
213 conf_t *conf = mddev_to_conf(mddev);
215 spin_lock_irqsave(&conf->device_lock, flags);
216 list_add(&r1_bio->retry_list, &conf->retry_list);
217 conf->nr_queued ++;
218 spin_unlock_irqrestore(&conf->device_lock, flags);
220 wake_up(&conf->wait_barrier);
221 md_wakeup_thread(mddev->thread);
222 }
224 /*
225 * raid_end_bio_io() is called when we have finished servicing a mirrored
226 * operation and are ready to return a success/failure code to the buffer
227 * cache layer.
228 */
229 static void raid_end_bio_io(r1bio_t *r1_bio)
230 {
231 struct bio *bio = r1_bio->master_bio;
233 /* if nobody has done the final endio yet, do it now */
234 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
235 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
236 (bio_data_dir(bio) == WRITE) ? "write" : "read",
237 (unsigned long long) bio->bi_sector,
238 (unsigned long long) bio->bi_sector +
239 (bio->bi_size >> 9) - 1);
241 bio_endio(bio, bio->bi_size,
242 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
243 }
244 free_r1bio(r1_bio);
245 }
247 /*
248 * Update disk head position estimator based on IRQ completion info.
249 */
250 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
251 {
252 conf_t *conf = mddev_to_conf(r1_bio->mddev);
254 conf->mirrors[disk].head_position =
255 r1_bio->sector + (r1_bio->sectors);
256 }
258 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
259 {
260 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
261 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
262 int mirror;
263 conf_t *conf = mddev_to_conf(r1_bio->mddev);
265 if (bio->bi_size)
266 return 1;
268 mirror = r1_bio->read_disk;
269 /*
270 * this branch is our 'one mirror IO has finished' event handler:
271 */
272 update_head_pos(mirror, r1_bio);
274 if (uptodate || conf->working_disks <= 1) {
275 /*
276 * Set R1BIO_Uptodate in our master bio, so that
277 * we will return a good error code to the higher
278 * levels even if IO on some other mirrored buffer fails.
279 *
280 * The 'master' represents the composite IO operation to
281 * user-side. So if something waits for IO, then it will
282 * wait for the 'master' bio.
283 */
284 if (uptodate)
285 set_bit(R1BIO_Uptodate, &r1_bio->state);
287 raid_end_bio_io(r1_bio);
288 } else {
289 /*
290 * oops, read error:
291 */
292 char b[BDEVNAME_SIZE];
293 if (printk_ratelimit())
294 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
295 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
296 reschedule_retry(r1_bio);
297 }
299 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
300 return 0;
301 }
303 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
304 {
305 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
306 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
307 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
308 conf_t *conf = mddev_to_conf(r1_bio->mddev);
309 struct bio *to_put = NULL;
311 if (bio->bi_size)
312 return 1;
314 for (mirror = 0; mirror < conf->raid_disks; mirror++)
315 if (r1_bio->bios[mirror] == bio)
316 break;
318 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
319 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
320 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
321 r1_bio->mddev->barriers_work = 0;
322 /* Don't rdev_dec_pending in this branch - keep it for the retry */
323 } else {
324 /*
325 * this branch is our 'one mirror IO has finished' event handler:
326 */
327 r1_bio->bios[mirror] = NULL;
328 to_put = bio;
329 if (!uptodate) {
330 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
331 /* an I/O failed, we can't clear the bitmap */
332 set_bit(R1BIO_Degraded, &r1_bio->state);
333 } else
334 /*
335 * Set R1BIO_Uptodate in our master bio, so that
336 * we will return a good error code to the higher
337 * levels even if IO on some other mirrored buffer fails.
338 *
339 * The 'master' represents the composite IO operation to
340 * user-side. So if something waits for IO, then it will
341 * wait for the 'master' bio.
342 */
343 set_bit(R1BIO_Uptodate, &r1_bio->state);
345 update_head_pos(mirror, r1_bio);
347 if (behind) {
348 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
349 atomic_dec(&r1_bio->behind_remaining);
351 /* In behind mode, we ACK the master bio once the I/O has safely
352 * reached all non-writemostly disks. Setting the Returned bit
353 * ensures that this gets done only once -- we don't ever want to
354 * return -EIO here, instead we'll wait */
356 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
357 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
358 /* Maybe we can return now */
359 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
360 struct bio *mbio = r1_bio->master_bio;
361 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
362 (unsigned long long) mbio->bi_sector,
363 (unsigned long long) mbio->bi_sector +
364 (mbio->bi_size >> 9) - 1);
365 bio_endio(mbio, mbio->bi_size, 0);
366 }
367 }
368 }
369 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
370 }
371 /*
372 *
373 * Let's see if all mirrored write operations have finished
374 * already.
375 */
376 if (atomic_dec_and_test(&r1_bio->remaining)) {
377 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
378 reschedule_retry(r1_bio);
379 else {
380 /* it really is the end of this request */
381 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 /* free extra copy of the data pages */
383 int i = bio->bi_vcnt;
384 while (i--)
385 safe_put_page(bio->bi_io_vec[i].bv_page);
386 }
387 /* clear the bitmap if all writes complete successfully */
388 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
389 r1_bio->sectors,
390 !test_bit(R1BIO_Degraded, &r1_bio->state),
391 behind);
392 md_write_end(r1_bio->mddev);
393 raid_end_bio_io(r1_bio);
394 }
395 }
397 if (to_put)
398 bio_put(to_put);
400 return 0;
401 }
404 /*
405 * This routine returns the disk from which the requested read should
406 * be done. There is a per-array 'next expected sequential IO' sector
407 * number - if this matches on the next IO then we use the last disk.
408 * There is also a per-disk 'last known head position' sector that is
409 * maintained from IRQ contexts, both the normal and the resync IO
410 * completion handlers update this position correctly. If there is no
411 * perfect sequential match then we pick the disk whose head is closest.
412 *
413 * If there are 2 mirrors in the same 2 devices, performance degrades
414 * because position is mirror, not device based.
415 *
416 * The rdev for the device selected will have nr_pending incremented.
417 */
418 static int read_balance(conf_t *conf, r1bio_t *r1_bio)
419 {
420 const unsigned long this_sector = r1_bio->sector;
421 int new_disk = conf->last_used, disk = new_disk;
422 int wonly_disk = -1;
423 const int sectors = r1_bio->sectors;
424 sector_t new_distance, current_distance;
425 mdk_rdev_t *rdev;
427 rcu_read_lock();
428 /*
429 * Check if we can balance. We can balance on the whole
430 * device if no resync is going on, or below the resync window.
431 * We take the first readable disk when above the resync window.
432 */
433 retry:
434 if (conf->mddev->recovery_cp < MaxSector &&
435 (this_sector + sectors >= conf->next_resync)) {
436 /* Choose the first operational device, for consistency */
437 new_disk = 0;
439 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
440 r1_bio->bios[new_disk] == IO_BLOCKED ||
441 !rdev || !test_bit(In_sync, &rdev->flags)
442 || test_bit(WriteMostly, &rdev->flags);
443 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
445 if (rdev && test_bit(In_sync, &rdev->flags) &&
446 r1_bio->bios[new_disk] != IO_BLOCKED)
447 wonly_disk = new_disk;
449 if (new_disk == conf->raid_disks - 1) {
450 new_disk = wonly_disk;
451 break;
452 }
453 }
454 goto rb_out;
455 }
458 /* make sure the disk is operational */
459 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
460 r1_bio->bios[new_disk] == IO_BLOCKED ||
461 !rdev || !test_bit(In_sync, &rdev->flags) ||
462 test_bit(WriteMostly, &rdev->flags);
463 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
465 if (rdev && test_bit(In_sync, &rdev->flags) &&
466 r1_bio->bios[new_disk] != IO_BLOCKED)
467 wonly_disk = new_disk;
469 if (new_disk <= 0)
470 new_disk = conf->raid_disks;
471 new_disk--;
472 if (new_disk == disk) {
473 new_disk = wonly_disk;
474 break;
475 }
476 }
478 if (new_disk < 0)
479 goto rb_out;
481 disk = new_disk;
482 /* now disk == new_disk == starting point for search */
484 /*
485 * Don't change to another disk for sequential reads:
486 */
487 if (conf->next_seq_sect == this_sector)
488 goto rb_out;
489 if (this_sector == conf->mirrors[new_disk].head_position)
490 goto rb_out;
492 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
494 /* Find the disk whose head is closest */
496 do {
497 if (disk <= 0)
498 disk = conf->raid_disks;
499 disk--;
501 rdev = rcu_dereference(conf->mirrors[disk].rdev);
503 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
504 !test_bit(In_sync, &rdev->flags) ||
505 test_bit(WriteMostly, &rdev->flags))
506 continue;
508 if (!atomic_read(&rdev->nr_pending)) {
509 new_disk = disk;
510 break;
511 }
512 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
513 if (new_distance < current_distance) {
514 current_distance = new_distance;
515 new_disk = disk;
516 }
517 } while (disk != conf->last_used);
519 rb_out:
522 if (new_disk >= 0) {
523 rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
524 if (!rdev)
525 goto retry;
526 atomic_inc(&rdev->nr_pending);
527 if (!test_bit(In_sync, &rdev->flags)) {
528 /* cannot risk returning a device that failed
529 * before we inc'ed nr_pending
530 */
531 rdev_dec_pending(rdev, conf->mddev);
532 goto retry;
533 }
534 conf->next_seq_sect = this_sector + sectors;
535 conf->last_used = new_disk;
536 }
537 rcu_read_unlock();
539 return new_disk;
540 }
542 static void unplug_slaves(mddev_t *mddev)
543 {
544 conf_t *conf = mddev_to_conf(mddev);
545 int i;
547 rcu_read_lock();
548 for (i=0; i<mddev->raid_disks; i++) {
549 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
550 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
551 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
553 atomic_inc(&rdev->nr_pending);
554 rcu_read_unlock();
556 if (r_queue->unplug_fn)
557 r_queue->unplug_fn(r_queue);
559 rdev_dec_pending(rdev, mddev);
560 rcu_read_lock();
561 }
562 }
563 rcu_read_unlock();
564 }
566 static void raid1_unplug(request_queue_t *q)
567 {
568 mddev_t *mddev = q->queuedata;
570 unplug_slaves(mddev);
571 md_wakeup_thread(mddev->thread);
572 }
574 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
575 sector_t *error_sector)
576 {
577 mddev_t *mddev = q->queuedata;
578 conf_t *conf = mddev_to_conf(mddev);
579 int i, ret = 0;
581 rcu_read_lock();
582 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
583 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
584 if (rdev && !test_bit(Faulty, &rdev->flags)) {
585 struct block_device *bdev = rdev->bdev;
586 request_queue_t *r_queue = bdev_get_queue(bdev);
588 if (!r_queue->issue_flush_fn)
589 ret = -EOPNOTSUPP;
590 else {
591 atomic_inc(&rdev->nr_pending);
592 rcu_read_unlock();
593 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
594 error_sector);
595 rdev_dec_pending(rdev, mddev);
596 rcu_read_lock();
597 }
598 }
599 }
600 rcu_read_unlock();
601 return ret;
602 }
604 /* Barriers....
605 * Sometimes we need to suspend IO while we do something else,
606 * either some resync/recovery, or reconfigure the array.
607 * To do this we raise a 'barrier'.
608 * The 'barrier' is a counter that can be raised multiple times
609 * to count how many activities are happening which preclude
610 * normal IO.
611 * We can only raise the barrier if there is no pending IO.
612 * i.e. if nr_pending == 0.
613 * We choose only to raise the barrier if no-one is waiting for the
614 * barrier to go down. This means that as soon as an IO request
615 * is ready, no other operations which require a barrier will start
616 * until the IO request has had a chance.
617 *
618 * So: regular IO calls 'wait_barrier'. When that returns there
619 * is no background IO happening. It must arrange to call
620 * allow_barrier when it has finished its IO.
621 * Background IO calls must call raise_barrier. Once that returns
622 * there is no normal IO happening. It must arrange to call
623 * lower_barrier when the particular background IO completes.
624 */
625 #define RESYNC_DEPTH 32
627 static void raise_barrier(conf_t *conf)
628 {
629 spin_lock_irq(&conf->resync_lock);
631 /* Wait until no block IO is waiting */
632 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
633 conf->resync_lock,
634 raid1_unplug(conf->mddev->queue));
636 /* block any new IO from starting */
637 conf->barrier++;
639 /* Now wait for all pending IO to complete */
640 wait_event_lock_irq(conf->wait_barrier,
641 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
642 conf->resync_lock,
643 raid1_unplug(conf->mddev->queue));
645 spin_unlock_irq(&conf->resync_lock);
646 }
648 static void lower_barrier(conf_t *conf)
649 {
650 unsigned long flags;
651 spin_lock_irqsave(&conf->resync_lock, flags);
652 conf->barrier--;
653 spin_unlock_irqrestore(&conf->resync_lock, flags);
654 wake_up(&conf->wait_barrier);
655 }
657 static void wait_barrier(conf_t *conf)
658 {
659 spin_lock_irq(&conf->resync_lock);
660 if (conf->barrier) {
661 conf->nr_waiting++;
662 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
663 conf->resync_lock,
664 raid1_unplug(conf->mddev->queue));
665 conf->nr_waiting--;
666 }
667 conf->nr_pending++;
668 spin_unlock_irq(&conf->resync_lock);
669 }
671 static void allow_barrier(conf_t *conf)
672 {
673 unsigned long flags;
674 spin_lock_irqsave(&conf->resync_lock, flags);
675 conf->nr_pending--;
676 spin_unlock_irqrestore(&conf->resync_lock, flags);
677 wake_up(&conf->wait_barrier);
678 }
680 static void freeze_array(conf_t *conf)
681 {
682 /* stop syncio and normal IO and wait for everything to
683 * go quiet.
684 * We increment barrier and nr_waiting, and then
685 * wait until barrier+nr_pending match nr_queued+2
686 */
687 spin_lock_irq(&conf->resync_lock);
688 conf->barrier++;
689 conf->nr_waiting++;
690 wait_event_lock_irq(conf->wait_barrier,
691 conf->barrier+conf->nr_pending == conf->nr_queued+2,
692 conf->resync_lock,
693 raid1_unplug(conf->mddev->queue));
694 spin_unlock_irq(&conf->resync_lock);
695 }
696 static void unfreeze_array(conf_t *conf)
697 {
698 /* reverse the effect of the freeze */
699 spin_lock_irq(&conf->resync_lock);
700 conf->barrier--;
701 conf->nr_waiting--;
702 wake_up(&conf->wait_barrier);
703 spin_unlock_irq(&conf->resync_lock);
704 }
707 /* duplicate the data pages for behind I/O */
708 static struct page **alloc_behind_pages(struct bio *bio)
709 {
710 int i;
711 struct bio_vec *bvec;
712 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
713 GFP_NOIO);
714 if (unlikely(!pages))
715 goto do_sync_io;
717 bio_for_each_segment(bvec, bio, i) {
718 pages[i] = alloc_page(GFP_NOIO);
719 if (unlikely(!pages[i]))
720 goto do_sync_io;
721 memcpy(kmap(pages[i]) + bvec->bv_offset,
722 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
723 kunmap(pages[i]);
724 kunmap(bvec->bv_page);
725 }
727 return pages;
729 do_sync_io:
730 if (pages)
731 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
732 put_page(pages[i]);
733 kfree(pages);
734 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
735 return NULL;
736 }
738 static int make_request(request_queue_t *q, struct bio * bio)
739 {
740 mddev_t *mddev = q->queuedata;
741 conf_t *conf = mddev_to_conf(mddev);
742 mirror_info_t *mirror;
743 r1bio_t *r1_bio;
744 struct bio *read_bio;
745 int i, targets = 0, disks;
746 mdk_rdev_t *rdev;
747 struct bitmap *bitmap = mddev->bitmap;
748 unsigned long flags;
749 struct bio_list bl;
750 struct page **behind_pages = NULL;
751 const int rw = bio_data_dir(bio);
752 int do_barriers;
754 /*
755 * Register the new request and wait if the reconstruction
756 * thread has put up a bar for new requests.
757 * Continue immediately if no resync is active currently.
758 * We test barriers_work *after* md_write_start as md_write_start
759 * may cause the first superblock write, and that will check out
760 * if barriers work.
761 */
763 md_write_start(mddev, bio); /* wait on superblock update early */
765 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
766 if (rw == WRITE)
767 md_write_end(mddev);
768 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
769 return 0;
770 }
772 wait_barrier(conf);
774 disk_stat_inc(mddev->gendisk, ios[rw]);
775 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
777 /*
778 * make_request() can abort the operation when READA is being
779 * used and no empty request is available.
780 *
781 */
782 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
784 r1_bio->master_bio = bio;
785 r1_bio->sectors = bio->bi_size >> 9;
786 r1_bio->state = 0;
787 r1_bio->mddev = mddev;
788 r1_bio->sector = bio->bi_sector;
790 if (rw == READ) {
791 /*
792 * read balancing logic:
793 */
794 int rdisk = read_balance(conf, r1_bio);
796 if (rdisk < 0) {
797 /* couldn't find anywhere to read from */
798 raid_end_bio_io(r1_bio);
799 return 0;
800 }
801 mirror = conf->mirrors + rdisk;
803 r1_bio->read_disk = rdisk;
805 read_bio = bio_clone(bio, GFP_NOIO);
807 r1_bio->bios[rdisk] = read_bio;
809 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
810 read_bio->bi_bdev = mirror->rdev->bdev;
811 read_bio->bi_end_io = raid1_end_read_request;
812 read_bio->bi_rw = READ;
813 read_bio->bi_private = r1_bio;
815 generic_make_request(read_bio);
816 return 0;
817 }
819 /*
820 * WRITE:
821 */
822 /* first select target devices under spinlock and
823 * inc refcount on their rdev. Record them by setting
824 * bios[x] to bio
825 */
826 disks = conf->raid_disks;
827 #if 0
828 { static int first=1;
829 if (first) printk("First Write sector %llu disks %d\n",
830 (unsigned long long)r1_bio->sector, disks);
831 first = 0;
832 }
833 #endif
834 rcu_read_lock();
835 for (i = 0; i < disks; i++) {
836 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
837 !test_bit(Faulty, &rdev->flags)) {
838 atomic_inc(&rdev->nr_pending);
839 if (test_bit(Faulty, &rdev->flags)) {
840 rdev_dec_pending(rdev, mddev);
841 r1_bio->bios[i] = NULL;
842 } else
843 r1_bio->bios[i] = bio;
844 targets++;
845 } else
846 r1_bio->bios[i] = NULL;
847 }
848 rcu_read_unlock();
850 BUG_ON(targets == 0); /* we never fail the last device */
852 if (targets < conf->raid_disks) {
853 /* array is degraded, we will not clear the bitmap
854 * on I/O completion (see raid1_end_write_request) */
855 set_bit(R1BIO_Degraded, &r1_bio->state);
856 }
858 /* do behind I/O ? */
859 if (bitmap &&
860 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
861 (behind_pages = alloc_behind_pages(bio)) != NULL)
862 set_bit(R1BIO_BehindIO, &r1_bio->state);
864 atomic_set(&r1_bio->remaining, 0);
865 atomic_set(&r1_bio->behind_remaining, 0);
867 do_barriers = bio_barrier(bio);
868 if (do_barriers)
869 set_bit(R1BIO_Barrier, &r1_bio->state);
871 bio_list_init(&bl);
872 for (i = 0; i < disks; i++) {
873 struct bio *mbio;
874 if (!r1_bio->bios[i])
875 continue;
877 mbio = bio_clone(bio, GFP_NOIO);
878 r1_bio->bios[i] = mbio;
880 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
881 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
882 mbio->bi_end_io = raid1_end_write_request;
883 mbio->bi_rw = WRITE | do_barriers;
884 mbio->bi_private = r1_bio;
886 if (behind_pages) {
887 struct bio_vec *bvec;
888 int j;
890 /* Yes, I really want the '__' version so that
891 * we clear any unused pointer in the io_vec, rather
892 * than leave them unchanged. This is important
893 * because when we come to free the pages, we won't
894 * know the original bi_idx, so we just free
895 * them all
896 */
897 __bio_for_each_segment(bvec, mbio, j, 0)
898 bvec->bv_page = behind_pages[j];
899 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
900 atomic_inc(&r1_bio->behind_remaining);
901 }
903 atomic_inc(&r1_bio->remaining);
905 bio_list_add(&bl, mbio);
906 }
907 kfree(behind_pages); /* the behind pages are attached to the bios now */
909 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
910 test_bit(R1BIO_BehindIO, &r1_bio->state));
911 spin_lock_irqsave(&conf->device_lock, flags);
912 bio_list_merge(&conf->pending_bio_list, &bl);
913 bio_list_init(&bl);
915 blk_plug_device(mddev->queue);
916 spin_unlock_irqrestore(&conf->device_lock, flags);
918 #if 0
919 while ((bio = bio_list_pop(&bl)) != NULL)
920 generic_make_request(bio);
921 #endif
923 return 0;
924 }
926 static void status(struct seq_file *seq, mddev_t *mddev)
927 {
928 conf_t *conf = mddev_to_conf(mddev);
929 int i;
931 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
932 conf->working_disks);
933 rcu_read_lock();
934 for (i = 0; i < conf->raid_disks; i++) {
935 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
936 seq_printf(seq, "%s",
937 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
938 }
939 rcu_read_unlock();
940 seq_printf(seq, "]");
941 }
944 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
945 {
946 char b[BDEVNAME_SIZE];
947 conf_t *conf = mddev_to_conf(mddev);
949 /*
950 * If it is not operational, then we have already marked it as dead
951 * else if it is the last working disk, ignore the error, let the
952 * next level up know.
953 * else mark the drive as failed
954 */
955 if (test_bit(In_sync, &rdev->flags)
956 && conf->working_disks == 1)
957 /*
958 * Don't fail the drive, act as though we were just a
959 * normal single drive
960 */
961 return;
962 if (test_bit(In_sync, &rdev->flags)) {
963 mddev->degraded++;
964 conf->working_disks--;
965 /*
966 * if recovery is running, make sure it aborts.
967 */
968 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
969 }
970 clear_bit(In_sync, &rdev->flags);
971 set_bit(Faulty, &rdev->flags);
972 mddev->sb_dirty = 1;
973 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
974 " Operation continuing on %d devices\n",
975 bdevname(rdev->bdev,b), conf->working_disks);
976 }
978 static void print_conf(conf_t *conf)
979 {
980 int i;
982 printk("RAID1 conf printout:\n");
983 if (!conf) {
984 printk("(!conf)\n");
985 return;
986 }
987 printk(" --- wd:%d rd:%d\n", conf->working_disks,
988 conf->raid_disks);
990 rcu_read_lock();
991 for (i = 0; i < conf->raid_disks; i++) {
992 char b[BDEVNAME_SIZE];
993 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
994 if (rdev)
995 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
996 i, !test_bit(In_sync, &rdev->flags),
997 !test_bit(Faulty, &rdev->flags),
998 bdevname(rdev->bdev,b));
999 }
1000 rcu_read_unlock();
1001 }
1003 static void close_sync(conf_t *conf)
1004 {
1005 wait_barrier(conf);
1006 allow_barrier(conf);
1008 mempool_destroy(conf->r1buf_pool);
1009 conf->r1buf_pool = NULL;
1010 }
1012 static int raid1_spare_active(mddev_t *mddev)
1013 {
1014 int i;
1015 conf_t *conf = mddev->private;
1017 /*
1018 * Find all failed disks within the RAID1 configuration
1019 * and mark them readable.
1020 * Called under mddev lock, so rcu protection not needed.
1021 */
1022 for (i = 0; i < conf->raid_disks; i++) {
1023 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1024 if (rdev
1025 && !test_bit(Faulty, &rdev->flags)
1026 && !test_bit(In_sync, &rdev->flags)) {
1027 conf->working_disks++;
1028 mddev->degraded--;
1029 set_bit(In_sync, &rdev->flags);
1030 }
1031 }
1033 print_conf(conf);
1034 return 0;
1035 }
1038 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1039 {
1040 conf_t *conf = mddev->private;
1041 int found = 0;
1042 int mirror = 0;
1043 mirror_info_t *p;
1045 for (mirror=0; mirror < mddev->raid_disks; mirror++)
1046 if ( !(p=conf->mirrors+mirror)->rdev) {
1048 blk_queue_stack_limits(mddev->queue,
1049 rdev->bdev->bd_disk->queue);
1050 /* as we don't honour merge_bvec_fn, we must never risk
1051 * violating it, so limit ->max_sector to one PAGE, as
1052 * a one page request is never in violation.
1053 */
1054 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1055 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1056 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1058 p->head_position = 0;
1059 rdev->raid_disk = mirror;
1060 found = 1;
1061 /* As all devices are equivalent, we don't need a full recovery
1062 * if this device was recently part of the array
1063 */
1064 if (rdev->saved_raid_disk < 0)
1065 conf->fullsync = 1;
1066 rcu_assign_pointer(p->rdev, rdev);
1067 break;
1068 }
1070 print_conf(conf);
1071 return found;
1072 }
1074 static int raid1_remove_disk(mddev_t *mddev, int number)
1075 {
1076 conf_t *conf = mddev->private;
1077 int err = 0;
1078 mdk_rdev_t *rdev;
1079 mirror_info_t *p = conf->mirrors+ number;
1081 print_conf(conf);
1082 rdev = p->rdev;
1083 if (rdev) {
1084 if (test_bit(In_sync, &rdev->flags) ||
1085 atomic_read(&rdev->nr_pending)) {
1086 err = -EBUSY;
1087 goto abort;
1088 }
1089 p->rdev = NULL;
1090 synchronize_rcu();
1091 if (atomic_read(&rdev->nr_pending)) {
1092 /* lost the race, try later */
1093 err = -EBUSY;
1094 p->rdev = rdev;
1095 }
1096 }
1097 abort:
1099 print_conf(conf);
1100 return err;
1101 }
1104 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1105 {
1106 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1107 int i;
1109 if (bio->bi_size)
1110 return 1;
1112 for (i=r1_bio->mddev->raid_disks; i--; )
1113 if (r1_bio->bios[i] == bio)
1114 break;
1115 BUG_ON(i < 0);
1116 update_head_pos(i, r1_bio);
1117 /*
1118 * we have read a block, now it needs to be re-written,
1119 * or re-read if the read failed.
1120 * We don't do much here, just schedule handling by raid1d
1121 */
1122 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1123 set_bit(R1BIO_Uptodate, &r1_bio->state);
1125 if (atomic_dec_and_test(&r1_bio->remaining))
1126 reschedule_retry(r1_bio);
1127 return 0;
1128 }
1130 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1131 {
1132 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1133 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1134 mddev_t *mddev = r1_bio->mddev;
1135 conf_t *conf = mddev_to_conf(mddev);
1136 int i;
1137 int mirror=0;
1139 if (bio->bi_size)
1140 return 1;
1142 for (i = 0; i < conf->raid_disks; i++)
1143 if (r1_bio->bios[i] == bio) {
1144 mirror = i;
1145 break;
1146 }
1147 if (!uptodate) {
1148 int sync_blocks = 0;
1149 sector_t s = r1_bio->sector;
1150 long sectors_to_go = r1_bio->sectors;
1151 /* make sure these bits don't get cleared. */
1152 do {
1153 bitmap_end_sync(mddev->bitmap, s,
1154 &sync_blocks, 1);
1155 s += sync_blocks;
1156 sectors_to_go -= sync_blocks;
1157 } while (sectors_to_go > 0);
1158 md_error(mddev, conf->mirrors[mirror].rdev);
1159 }
1161 update_head_pos(mirror, r1_bio);
1163 if (atomic_dec_and_test(&r1_bio->remaining)) {
1164 md_done_sync(mddev, r1_bio->sectors, uptodate);
1165 put_buf(r1_bio);
1166 }
1167 return 0;
1168 }
1170 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1172 conf_t *conf = mddev_to_conf(mddev);
1173 int i;
1174 int disks = conf->raid_disks;
1175 struct bio *bio, *wbio;
1177 bio = r1_bio->bios[r1_bio->read_disk];
1180 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1181 /* We have read all readable devices. If we haven't
1182 * got the block, then there is no hope left.
1183 * If we have, then we want to do a comparison
1184 * and skip the write if everything is the same.
1185 * If any blocks failed to read, then we need to
1186 * attempt an over-write
1187 */
1188 int primary;
1189 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1190 for (i=0; i<mddev->raid_disks; i++)
1191 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1192 md_error(mddev, conf->mirrors[i].rdev);
1194 md_done_sync(mddev, r1_bio->sectors, 1);
1195 put_buf(r1_bio);
1196 return;
1198 for (primary=0; primary<mddev->raid_disks; primary++)
1199 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1200 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1201 r1_bio->bios[primary]->bi_end_io = NULL;
1202 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1203 break;
1205 r1_bio->read_disk = primary;
1206 for (i=0; i<mddev->raid_disks; i++)
1207 if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
1208 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
1209 int j;
1210 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1211 struct bio *pbio = r1_bio->bios[primary];
1212 struct bio *sbio = r1_bio->bios[i];
1213 for (j = vcnt; j-- ; )
1214 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
1215 page_address(sbio->bi_io_vec[j].bv_page),
1216 PAGE_SIZE))
1217 break;
1218 if (j >= 0)
1219 mddev->resync_mismatches += r1_bio->sectors;
1220 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
1221 sbio->bi_end_io = NULL;
1222 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1223 } else {
1224 /* fixup the bio for reuse */
1225 sbio->bi_vcnt = vcnt;
1226 sbio->bi_size = r1_bio->sectors << 9;
1227 sbio->bi_idx = 0;
1228 sbio->bi_phys_segments = 0;
1229 sbio->bi_hw_segments = 0;
1230 sbio->bi_hw_front_size = 0;
1231 sbio->bi_hw_back_size = 0;
1232 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1233 sbio->bi_flags |= 1 << BIO_UPTODATE;
1234 sbio->bi_next = NULL;
1235 sbio->bi_sector = r1_bio->sector +
1236 conf->mirrors[i].rdev->data_offset;
1237 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1241 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1242 /* ouch - failed to read all of that.
1243 * Try some synchronous reads of other devices to get
1244 * good data, much like with normal read errors. Only
1245 * read into the pages we already have so we don't
1246 * need to re-issue the read request.
1247 * We don't need to freeze the array, because being in an
1248 * active sync request, there is no normal IO, and
1249 * no overlapping syncs.
1250 */
1251 sector_t sect = r1_bio->sector;
1252 int sectors = r1_bio->sectors;
1253 int idx = 0;
1255 while(sectors) {
1256 int s = sectors;
1257 int d = r1_bio->read_disk;
1258 int success = 0;
1259 mdk_rdev_t *rdev;
1261 if (s > (PAGE_SIZE>>9))
1262 s = PAGE_SIZE >> 9;
1263 do {
1264 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1265 /* No rcu protection needed here; devices
1266 * can only be removed when no resync is
1267 * active, and resync is currently active
1268 */
1269 rdev = conf->mirrors[d].rdev;
1270 if (sync_page_io(rdev->bdev,
1271 sect + rdev->data_offset,
1272 s<<9,
1273 bio->bi_io_vec[idx].bv_page,
1274 READ)) {
1275 success = 1;
1276 break;
1279 d++;
1280 if (d == conf->raid_disks)
1281 d = 0;
1282 } while (!success && d != r1_bio->read_disk);
1284 if (success) {
1285 int start = d;
1286 /* write it back and re-read */
1287 set_bit(R1BIO_Uptodate, &r1_bio->state);
1288 while (d != r1_bio->read_disk) {
1289 if (d == 0)
1290 d = conf->raid_disks;
1291 d--;
1292 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1293 continue;
1294 rdev = conf->mirrors[d].rdev;
1295 atomic_add(s, &rdev->corrected_errors);
1296 if (sync_page_io(rdev->bdev,
1297 sect + rdev->data_offset,
1298 s<<9,
1299 bio->bi_io_vec[idx].bv_page,
1300 WRITE) == 0)
1301 md_error(mddev, rdev);
1303 d = start;
1304 while (d != r1_bio->read_disk) {
1305 if (d == 0)
1306 d = conf->raid_disks;
1307 d--;
1308 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1309 continue;
1310 rdev = conf->mirrors[d].rdev;
1311 if (sync_page_io(rdev->bdev,
1312 sect + rdev->data_offset,
1313 s<<9,
1314 bio->bi_io_vec[idx].bv_page,
1315 READ) == 0)
1316 md_error(mddev, rdev);
1318 } else {
1319 char b[BDEVNAME_SIZE];
1320 /* Cannot read from anywhere, array is toast */
1321 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1322 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1323 " for block %llu\n",
1324 bdevname(bio->bi_bdev,b),
1325 (unsigned long long)r1_bio->sector);
1326 md_done_sync(mddev, r1_bio->sectors, 0);
1327 put_buf(r1_bio);
1328 return;
1330 sectors -= s;
1331 sect += s;
1332 idx ++;
1336 /*
1337 * schedule writes
1338 */
1339 atomic_set(&r1_bio->remaining, 1);
1340 for (i = 0; i < disks ; i++) {
1341 wbio = r1_bio->bios[i];
1342 if (wbio->bi_end_io == NULL ||
1343 (wbio->bi_end_io == end_sync_read &&
1344 (i == r1_bio->read_disk ||
1345 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1346 continue;
1348 wbio->bi_rw = WRITE;
1349 wbio->bi_end_io = end_sync_write;
1350 atomic_inc(&r1_bio->remaining);
1351 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1353 generic_make_request(wbio);
1356 if (atomic_dec_and_test(&r1_bio->remaining)) {
1357 /* if we're here, all write(s) have completed, so clean up */
1358 md_done_sync(mddev, r1_bio->sectors, 1);
1359 put_buf(r1_bio);
1363 /*
1364 * This is a kernel thread which:
1366 * 1. Retries failed read operations on working mirrors.
1367 * 2. Updates the raid superblock when problems are encountered.
1368 * 3. Performs writes following reads for array synchronising.
1369 */
1371 static void raid1d(mddev_t *mddev)
1373 r1bio_t *r1_bio;
1374 struct bio *bio;
1375 unsigned long flags;
1376 conf_t *conf = mddev_to_conf(mddev);
1377 struct list_head *head = &conf->retry_list;
1378 int unplug=0;
1379 mdk_rdev_t *rdev;
1381 md_check_recovery(mddev);
1383 for (;;) {
1384 char b[BDEVNAME_SIZE];
1385 spin_lock_irqsave(&conf->device_lock, flags);
1387 if (conf->pending_bio_list.head) {
1388 bio = bio_list_get(&conf->pending_bio_list);
1389 blk_remove_plug(mddev->queue);
1390 spin_unlock_irqrestore(&conf->device_lock, flags);
1391 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1392 if (bitmap_unplug(mddev->bitmap) != 0)
1393 printk("%s: bitmap file write failed!\n", mdname(mddev));
1395 while (bio) { /* submit pending writes */
1396 struct bio *next = bio->bi_next;
1397 bio->bi_next = NULL;
1398 generic_make_request(bio);
1399 bio = next;
1401 unplug = 1;
1403 continue;
1406 if (list_empty(head))
1407 break;
1408 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1409 list_del(head->prev);
1410 conf->nr_queued--;
1411 spin_unlock_irqrestore(&conf->device_lock, flags);
1413 mddev = r1_bio->mddev;
1414 conf = mddev_to_conf(mddev);
1415 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1416 sync_request_write(mddev, r1_bio);
1417 unplug = 1;
1418 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1419 /* some requests in the r1bio were BIO_RW_BARRIER
1420 * requests which failed with -EOPNOTSUPP. Hohumm..
1421 * Better resubmit without the barrier.
1422 * We know which devices to resubmit for, because
1423 * all others have had their bios[] entry cleared.
1424 * We already have a nr_pending reference on these rdevs.
1425 */
1426 int i;
1427 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1428 clear_bit(R1BIO_Barrier, &r1_bio->state);
1429 for (i=0; i < conf->raid_disks; i++)
1430 if (r1_bio->bios[i])
1431 atomic_inc(&r1_bio->remaining);
1432 for (i=0; i < conf->raid_disks; i++)
1433 if (r1_bio->bios[i]) {
1434 struct bio_vec *bvec;
1435 int j;
1437 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1438 /* copy pages from the failed bio, as
1439 * this might be a write-behind device */
1440 __bio_for_each_segment(bvec, bio, j, 0)
1441 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1442 bio_put(r1_bio->bios[i]);
1443 bio->bi_sector = r1_bio->sector +
1444 conf->mirrors[i].rdev->data_offset;
1445 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1446 bio->bi_end_io = raid1_end_write_request;
1447 bio->bi_rw = WRITE;
1448 bio->bi_private = r1_bio;
1449 r1_bio->bios[i] = bio;
1450 generic_make_request(bio);
1452 } else {
1453 int disk;
1455 /* we got a read error. Maybe the drive is bad. Maybe just
1456 * the block and we can fix it.
1457 * We freeze all other IO, and try reading the block from
1458 * other devices. When we find one, we re-write
1459 * it and re-read to check that this fixes the read error.
1460 * This is all done synchronously while the array is
1461 * frozen
1462 */
1463 sector_t sect = r1_bio->sector;
1464 int sectors = r1_bio->sectors;
1465 freeze_array(conf);
1466 if (mddev->ro == 0) while(sectors) {
1467 int s = sectors;
1468 int d = r1_bio->read_disk;
1469 int success = 0;
1471 if (s > (PAGE_SIZE>>9))
1472 s = PAGE_SIZE >> 9;
1474 do {
1475 /* Note: no rcu protection needed here
1476 * as this is synchronous in the raid1d thread
1477 * which is the thread that might remove
1478 * a device. If raid1d ever becomes multi-threaded....
1479 */
1480 rdev = conf->mirrors[d].rdev;
1481 if (rdev &&
1482 test_bit(In_sync, &rdev->flags) &&
1483 sync_page_io(rdev->bdev,
1484 sect + rdev->data_offset,
1485 s<<9,
1486 conf->tmppage, READ))
1487 success = 1;
1488 else {
1489 d++;
1490 if (d == conf->raid_disks)
1491 d = 0;
1493 } while (!success && d != r1_bio->read_disk);
1495 if (success) {
1496 /* write it back and re-read */
1497 int start = d;
1498 while (d != r1_bio->read_disk) {
1499 if (d==0)
1500 d = conf->raid_disks;
1501 d--;
1502 rdev = conf->mirrors[d].rdev;
1503 if (rdev &&
1504 test_bit(In_sync, &rdev->flags)) {
1505 if (sync_page_io(rdev->bdev,
1506 sect + rdev->data_offset,
1507 s<<9, conf->tmppage, WRITE) == 0)
1508 /* Well, this device is dead */
1509 md_error(mddev, rdev);
1512 d = start;
1513 while (d != r1_bio->read_disk) {
1514 if (d==0)
1515 d = conf->raid_disks;
1516 d--;
1517 rdev = conf->mirrors[d].rdev;
1518 if (rdev &&
1519 test_bit(In_sync, &rdev->flags)) {
1520 if (sync_page_io(rdev->bdev,
1521 sect + rdev->data_offset,
1522 s<<9, conf->tmppage, READ) == 0)
1523 /* Well, this device is dead */
1524 md_error(mddev, rdev);
1525 else {
1526 atomic_add(s, &rdev->corrected_errors);
1527 printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
1528 mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
1532 } else {
1533 /* Cannot read from anywhere -- bye bye array */
1534 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1535 break;
1537 sectors -= s;
1538 sect += s;
1541 unfreeze_array(conf);
1543 bio = r1_bio->bios[r1_bio->read_disk];
1544 if ((disk=read_balance(conf, r1_bio)) == -1) {
1545 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
1546 " read error for block %llu\n",
1547 bdevname(bio->bi_bdev,b),
1548 (unsigned long long)r1_bio->sector);
1549 raid_end_bio_io(r1_bio);
1550 } else {
1551 r1_bio->bios[r1_bio->read_disk] =
1552 mddev->ro ? IO_BLOCKED : NULL;
1553 r1_bio->read_disk = disk;
1554 bio_put(bio);
1555 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1556 r1_bio->bios[r1_bio->read_disk] = bio;
1557 rdev = conf->mirrors[disk].rdev;
1558 if (printk_ratelimit())
1559 printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
1560 " another mirror\n",
1561 bdevname(rdev->bdev,b),
1562 (unsigned long long)r1_bio->sector);
1563 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1564 bio->bi_bdev = rdev->bdev;
1565 bio->bi_end_io = raid1_end_read_request;
1566 bio->bi_rw = READ;
1567 bio->bi_private = r1_bio;
1568 unplug = 1;
1569 generic_make_request(bio);
1573 spin_unlock_irqrestore(&conf->device_lock, flags);
1574 if (unplug)
1575 unplug_slaves(mddev);
1579 static int init_resync(conf_t *conf)
1580 {
1581 int buffs;
1583 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1584 BUG_ON(conf->r1buf_pool);
1585 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
1586 conf->poolinfo);
1587 if (!conf->r1buf_pool)
1588 return -ENOMEM;
1589 conf->next_resync = 0;
1590 return 0;
1591 }
1593 /*
1594 * perform a "sync" on one "block"
1596 * We need to make sure that no normal I/O request - particularly write
1597 * requests - conflict with active sync requests.
1599 * This is achieved by tracking pending requests and a 'barrier' concept
1600 * that can be installed to exclude normal IO requests.
1601 */
1603 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1605 conf_t *conf = mddev_to_conf(mddev);
1606 r1bio_t *r1_bio;
1607 struct bio *bio;
1608 sector_t max_sector, nr_sectors;
1609 int disk = -1;
1610 int i;
1611 int wonly = -1;
1612 int write_targets = 0, read_targets = 0;
1613 int sync_blocks;
1614 int still_degraded = 0;
1616 if (!conf->r1buf_pool)
1618 /*
1619 printk("sync start - bitmap %p\n", mddev->bitmap);
1620 */
1621 if (init_resync(conf))
1622 return 0;
1625 max_sector = mddev->size << 1;
1626 if (sector_nr >= max_sector) {
1627 /* If we aborted, we need to abort the
1628 * sync on the 'current' bitmap chunk (there will
1629 * only be one in raid1 resync).
1630 * We can find the current address in mddev->curr_resync
1631 */
1632 if (mddev->curr_resync < max_sector) /* aborted */
1633 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1634 &sync_blocks, 1);
1635 else /* completed sync */
1636 conf->fullsync = 0;
1638 bitmap_close_sync(mddev->bitmap);
1639 close_sync(conf);
1640 return 0;
1643 if (mddev->bitmap == NULL &&
1644 mddev->recovery_cp == MaxSector &&
1645 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1646 conf->fullsync == 0) {
1647 *skipped = 1;
1648 return max_sector - sector_nr;
1650 /* before building a request, check if we can skip these blocks..
1651 * This call to bitmap_start_sync doesn't actually record anything
1652 */
1653 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1654 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1655 /* We can skip this block, and probably several more */
1656 *skipped = 1;
1657 return sync_blocks;
1659 /*
1660 * If there is non-resync activity waiting for a turn,
1661 * and resync is going fast enough,
1662 * then let it through before starting on this new sync request.
1663 */
1664 if (!go_faster && conf->nr_waiting)
1665 msleep_interruptible(1000);
1667 raise_barrier(conf);
1669 conf->next_resync = sector_nr;
1671 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1672 rcu_read_lock();
1673 /*
1674 * If we get a correctable read error during resync or recovery,
1675 * we might want to read from a different device. So we
1676 * flag all drives that could conceivably be read from for READ,
1677 * and any others (which will be non-In_sync devices) for WRITE.
1678 * If a read fails, we try reading from something else for which READ
1679 * is OK.
1680 */
1682 r1_bio->mddev = mddev;
1683 r1_bio->sector = sector_nr;
1684 r1_bio->state = 0;
1685 set_bit(R1BIO_IsSync, &r1_bio->state);
1687 for (i=0; i < conf->raid_disks; i++) {
1688 mdk_rdev_t *rdev;
1689 bio = r1_bio->bios[i];
1691 /* take from bio_init */
1692 bio->bi_next = NULL;
1693 bio->bi_flags |= 1 << BIO_UPTODATE;
1694 bio->bi_rw = 0;
1695 bio->bi_vcnt = 0;
1696 bio->bi_idx = 0;
1697 bio->bi_phys_segments = 0;
1698 bio->bi_hw_segments = 0;
1699 bio->bi_size = 0;
1700 bio->bi_end_io = NULL;
1701 bio->bi_private = NULL;
1703 rdev = rcu_dereference(conf->mirrors[i].rdev);
1704 if (rdev == NULL ||
1705 test_bit(Faulty, &rdev->flags)) {
1706 still_degraded = 1;
1707 continue;
1708 } else if (!test_bit(In_sync, &rdev->flags)) {
1709 bio->bi_rw = WRITE;
1710 bio->bi_end_io = end_sync_write;
1711 write_targets ++;
1712 } else {
1713 /* may need to read from here */
1714 bio->bi_rw = READ;
1715 bio->bi_end_io = end_sync_read;
1716 if (test_bit(WriteMostly, &rdev->flags)) {
1717 if (wonly < 0)
1718 wonly = i;
1719 } else {
1720 if (disk < 0)
1721 disk = i;
1723 read_targets++;
1725 atomic_inc(&rdev->nr_pending);
1726 bio->bi_sector = sector_nr + rdev->data_offset;
1727 bio->bi_bdev = rdev->bdev;
1728 bio->bi_private = r1_bio;
1730 rcu_read_unlock();
1731 if (disk < 0)
1732 disk = wonly;
1733 r1_bio->read_disk = disk;
1735 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1736 /* extra read targets are also write targets */
1737 write_targets += read_targets-1;
1739 if (write_targets == 0 || read_targets == 0) {
1740 /* There is nowhere to write, so all non-sync
1741 * drives must be failed - so we are finished
1742 */
1743 sector_t rv = max_sector - sector_nr;
1744 *skipped = 1;
1745 put_buf(r1_bio);
1746 return rv;
1749 nr_sectors = 0;
1750 sync_blocks = 0;
1751 do {
1752 struct page *page;
1753 int len = PAGE_SIZE;
1754 if (sector_nr + (len>>9) > max_sector)
1755 len = (max_sector - sector_nr) << 9;
1756 if (len == 0)
1757 break;
1758 if (sync_blocks == 0) {
1759 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1760 &sync_blocks, still_degraded) &&
1761 !conf->fullsync &&
1762 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1763 break;
1764 BUG_ON(sync_blocks < (PAGE_SIZE>>9));
1765 if (len > (sync_blocks<<9))
1766 len = sync_blocks<<9;
1769 for (i=0 ; i < conf->raid_disks; i++) {
1770 bio = r1_bio->bios[i];
1771 if (bio->bi_end_io) {
1772 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1773 if (bio_add_page(bio, page, len, 0) == 0) {
1774 /* stop here */
1775 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1776 while (i > 0) {
1777 i--;
1778 bio = r1_bio->bios[i];
1779 if (bio->bi_end_io==NULL)
1780 continue;
1781 /* remove last page from this bio */
1782 bio->bi_vcnt--;
1783 bio->bi_size -= len;
1784 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
1786 goto bio_full;
1790 nr_sectors += len>>9;
1791 sector_nr += len>>9;
1792 sync_blocks -= (len>>9);
1793 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1794 bio_full:
1795 r1_bio->sectors = nr_sectors;
1797 /* For a user-requested sync, we read all readable devices and do a
1798 * compare
1799 */
1800 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1801 atomic_set(&r1_bio->remaining, read_targets);
1802 for (i=0; i<conf->raid_disks; i++) {
1803 bio = r1_bio->bios[i];
1804 if (bio->bi_end_io == end_sync_read) {
1805 md_sync_acct(bio->bi_bdev, nr_sectors);
1806 generic_make_request(bio);
1809 } else {
1810 atomic_set(&r1_bio->remaining, 1);
1811 bio = r1_bio->bios[r1_bio->read_disk];
1812 md_sync_acct(bio->bi_bdev, nr_sectors);
1813 generic_make_request(bio);
1816 return nr_sectors;
1819 static int run(mddev_t *mddev)
1821 conf_t *conf;
1822 int i, j, disk_idx;
1823 mirror_info_t *disk;
1824 mdk_rdev_t *rdev;
1825 struct list_head *tmp;
1827 if (mddev->level != 1) {
1828 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1829 mdname(mddev), mddev->level);
1830 goto out;
1832 if (mddev->reshape_position != MaxSector) {
1833 printk("raid1: %s: reshape_position set but not supported\n",
1834 mdname(mddev));
1835 goto out;
1837 /*
1838 * copy the already verified devices into our private RAID1
1839 * bookkeeping area. [whatever we allocate in run(),
1840 * should be freed in stop()]
1841 */
1842 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1843 mddev->private = conf;
1844 if (!conf)
1845 goto out_no_mem;
1847 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1848 GFP_KERNEL);
1849 if (!conf->mirrors)
1850 goto out_no_mem;
1852 conf->tmppage = alloc_page(GFP_KERNEL);
1853 if (!conf->tmppage)
1854 goto out_no_mem;
1856 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1857 if (!conf->poolinfo)
1858 goto out_no_mem;
1859 conf->poolinfo->mddev = mddev;
1860 conf->poolinfo->raid_disks = mddev->raid_disks;
1861 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1862 r1bio_pool_free,
1863 conf->poolinfo);
1864 if (!conf->r1bio_pool)
1865 goto out_no_mem;
1867 ITERATE_RDEV(mddev, rdev, tmp) {
1868 disk_idx = rdev->raid_disk;
1869 if (disk_idx >= mddev->raid_disks
1870 || disk_idx < 0)
1871 continue;
1872 disk = conf->mirrors + disk_idx;
1874 disk->rdev = rdev;
1876 blk_queue_stack_limits(mddev->queue,
1877 rdev->bdev->bd_disk->queue);
1878 /* as we don't honour merge_bvec_fn, we must never risk
1879 * violating it, so limit ->max_sector to one PAGE, as
1880 * a one page request is never in violation.
1881 */
1882 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1883 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1884 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1886 disk->head_position = 0;
1887 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
1888 conf->working_disks++;
1890 conf->raid_disks = mddev->raid_disks;
1891 conf->mddev = mddev;
1892 spin_lock_init(&conf->device_lock);
1893 INIT_LIST_HEAD(&conf->retry_list);
1894 if (conf->working_disks == 1)
1895 mddev->recovery_cp = MaxSector;
1897 spin_lock_init(&conf->resync_lock);
1898 init_waitqueue_head(&conf->wait_barrier);
1900 bio_list_init(&conf->pending_bio_list);
1901 bio_list_init(&conf->flushing_bio_list);
1903 if (!conf->working_disks) {
1904 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
1905 mdname(mddev));
1906 goto out_free_conf;
1909 mddev->degraded = 0;
1910 for (i = 0; i < conf->raid_disks; i++) {
1912 disk = conf->mirrors + i;
1914 if (!disk->rdev ||
1915 !test_bit(In_sync, &disk->rdev->flags)) {
1916 disk->head_position = 0;
1917 mddev->degraded++;
1921 /*
1922 * find the first working one and use it as a starting point
1923 * for read balancing.
1924 */
1925 for (j = 0; j < conf->raid_disks &&
1926 (!conf->mirrors[j].rdev ||
1927 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
1928 /* nothing */;
1929 conf->last_used = j;
1932 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
1933 if (!mddev->thread) {
1934 printk(KERN_ERR
1935 "raid1: couldn't allocate thread for %s\n",
1936 mdname(mddev));
1937 goto out_free_conf;
1940 printk(KERN_INFO
1941 "raid1: raid set %s active with %d out of %d mirrors\n",
1942 mdname(mddev), mddev->raid_disks - mddev->degraded,
1943 mddev->raid_disks);
1944 /*
1945 * Ok, everything is just fine now
1946 */
1947 mddev->array_size = mddev->size;
1949 mddev->queue->unplug_fn = raid1_unplug;
1950 mddev->queue->issue_flush_fn = raid1_issue_flush;
1952 return 0;
1954 out_no_mem:
1955 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
1956 mdname(mddev));
1958 out_free_conf:
1959 if (conf) {
1960 if (conf->r1bio_pool)
1961 mempool_destroy(conf->r1bio_pool);
1962 kfree(conf->mirrors);
1963 safe_put_page(conf->tmppage);
1964 kfree(conf->poolinfo);
1965 kfree(conf);
1966 mddev->private = NULL;
1968 out:
1969 return -EIO;
1972 static int stop(mddev_t *mddev)
1973 {
1974 conf_t *conf = mddev_to_conf(mddev);
1975 struct bitmap *bitmap = mddev->bitmap;
1976 int behind_wait = 0;
1978 /* wait for behind writes to complete */
1979 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
1980 behind_wait++;
1981 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
1982 set_current_state(TASK_UNINTERRUPTIBLE);
1983 schedule_timeout(HZ); /* wait a second */
1984 /* need to kick something here to make sure I/O goes? */
1985 }
1987 md_unregister_thread(mddev->thread);
1988 mddev->thread = NULL;
1989 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1990 if (conf->r1bio_pool)
1991 mempool_destroy(conf->r1bio_pool);
1992 kfree(conf->mirrors);
1993 kfree(conf->poolinfo);
1994 kfree(conf);
1995 mddev->private = NULL;
1996 return 0;
1997 }
1999 static int raid1_resize(mddev_t *mddev, sector_t sectors)
2000 {
2001 /* no resync is happening, and there is enough space
2002 * on all devices, so we can resize.
2003 * We need to make sure resync covers any new space.
2004 * If the array is shrinking we should possibly wait until
2005 * any io in the removed space completes, but it hardly seems
2006 * worth it.
2007 */
2008 mddev->array_size = sectors>>1;
2009 set_capacity(mddev->gendisk, mddev->array_size << 1);
2010 mddev->changed = 1;
2011 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
2012 mddev->recovery_cp = mddev->size << 1;
2013 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2014 }
2015 mddev->size = mddev->array_size;
2016 mddev->resync_max_sectors = sectors;
2017 return 0;
2018 }
2020 static int raid1_reshape(mddev_t *mddev)
2021 {
2022 /* We need to:
2023 * 1/ resize the r1bio_pool
2024 * 2/ resize conf->mirrors
2026 * We allocate a new r1bio_pool if we can.
2027 * Then raise a device barrier and wait until all IO stops.
2028 * Then resize conf->mirrors and swap in the new r1bio pool.
2030 * At the same time, we "pack" the devices so that all the missing
2031 * devices have the higher raid_disk numbers.
2032 */
2033 mempool_t *newpool, *oldpool;
2034 struct pool_info *newpoolinfo;
2035 mirror_info_t *newmirrors;
2036 conf_t *conf = mddev_to_conf(mddev);
2037 int cnt, raid_disks;
2039 int d, d2;
2041 /* Cannot change chunk_size, layout, or level */
2042 if (mddev->chunk_size != mddev->new_chunk ||
2043 mddev->layout != mddev->new_layout ||
2044 mddev->level != mddev->new_level) {
2045 mddev->new_chunk = mddev->chunk_size;
2046 mddev->new_layout = mddev->layout;
2047 mddev->new_level = mddev->level;
2048 return -EINVAL;
2049 }
2051 raid_disks = mddev->raid_disks + mddev->delta_disks;
2053 if (raid_disks < conf->raid_disks) {
2054 cnt=0;
2055 for (d= 0; d < conf->raid_disks; d++)
2056 if (conf->mirrors[d].rdev)
2057 cnt++;
2058 if (cnt > raid_disks)
2059 return -EBUSY;
2060 }
2062 newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
2063 if (!newpoolinfo)
2064 return -ENOMEM;
2065 newpoolinfo->mddev = mddev;
2066 newpoolinfo->raid_disks = raid_disks;
2068 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2069 r1bio_pool_free, newpoolinfo);
2070 if (!newpool) {
2071 kfree(newpoolinfo);
2072 return -ENOMEM;
2073 }
2074 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
2075 if (!newmirrors) {
2076 kfree(newpoolinfo);
2077 mempool_destroy(newpool);
2078 return -ENOMEM;
2079 }
2081 raise_barrier(conf);
2083 /* ok, everything is stopped */
2084 oldpool = conf->r1bio_pool;
2085 conf->r1bio_pool = newpool;
2087 for (d=d2=0; d < conf->raid_disks; d++)
2088 if (conf->mirrors[d].rdev) {
2089 conf->mirrors[d].rdev->raid_disk = d2;
2090 newmirrors[d2++].rdev = conf->mirrors[d].rdev;
2091 }
2092 kfree(conf->mirrors);
2093 conf->mirrors = newmirrors;
2094 kfree(conf->poolinfo);
2095 conf->poolinfo = newpoolinfo;
2097 mddev->degraded += (raid_disks - conf->raid_disks);
2098 conf->raid_disks = mddev->raid_disks = raid_disks;
2099 mddev->delta_disks = 0;
2101 conf->last_used = 0; /* just make sure it is in-range */
2102 lower_barrier(conf);
2104 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2105 md_wakeup_thread(mddev->thread);
2107 mempool_destroy(oldpool);
2108 return 0;
2109 }
2111 static void raid1_quiesce(mddev_t *mddev, int state)
2112 {
2113 conf_t *conf = mddev_to_conf(mddev);
2115 switch(state) {
2116 case 1:
2117 raise_barrier(conf);
2118 break;
2119 case 0:
2120 lower_barrier(conf);
2121 break;
2122 }
2123 }
2126 static struct mdk_personality raid1_personality =
2127 {
2128 .name = "raid1",
2129 .level = 1,
2130 .owner = THIS_MODULE,
2131 .make_request = make_request,
2132 .run = run,
2133 .stop = stop,
2134 .status = status,
2135 .error_handler = error,
2136 .hot_add_disk = raid1_add_disk,
2137 .hot_remove_disk= raid1_remove_disk,
2138 .spare_active = raid1_spare_active,
2139 .sync_request = sync_request,
2140 .resize = raid1_resize,
2141 .check_reshape = raid1_reshape,
2142 .quiesce = raid1_quiesce,
2143 };
2145 static int __init raid_init(void)
2146 {
2147 return register_md_personality(&raid1_personality);
2148 }
2150 static void raid_exit(void)
2151 {
2152 unregister_md_personality(&raid1_personality);
2153 }
2155 module_init(raid_init);
2156 module_exit(raid_exit);
2157 MODULE_LICENSE("GPL");
2158 MODULE_ALIAS("md-personality-3"); /* RAID1 */
2159 MODULE_ALIAS("md-raid1");
2160 MODULE_ALIAS("md-level-1");