ia64/linux-2.6.18-xen.hg

view drivers/md/raid10.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 3e8752eb6d9c
children
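
As a rough illustration of the behaviour described above (a sketch only, not
code from this tree: populate_physmap() here is a stand-in for the real
populate-physmap hypercall and all numbers are invented), a partial or failed
increase keeps whatever pages were granted and simply retries later instead
of recording a hard limit:

    /* Userspace model of "keep partial successes and retry on a timer". */
    #include <stdio.h>
    #include <unistd.h>

    static long host_free_pages = 300;   /* hypothetical free pages in the host */
    static long current_pages   = 1000;  /* pages currently held by the guest   */
    static long target_pages    = 1500;  /* target set by the toolstack         */

    /* Stand-in for the populate-physmap hypercall: may grant fewer pages
     * than requested when the host is under memory pressure. */
    static long populate_physmap(long nr_requested)
    {
            long granted = nr_requested < host_free_pages ? nr_requested
                                                          : host_free_pages;
            host_free_pages -= granted;
            return granted;
    }

    int main(void)
    {
            while (current_pages < target_pages) {
                    /* keep whatever we were given, even on partial success */
                    current_pages += populate_physmap(target_pages - current_pages);
                    if (current_pages < target_pages) {
                            printf("at %ld/%ld pages, retrying later\n",
                                   current_pages, target_pages);
                            sleep(1);                /* retry on a timer      */
                            host_free_pages += 200;  /* e.g. another guest
                                                        ballooned down        */
                    }
            }
            printf("reached target of %ld pages\n", target_pages);
            return 0;
    }
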
line source
1 /*
2 * raid10.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 2000-2004 Neil Brown
5 *
6 * RAID-10 support for md.
7 *
8 * Based on code in raid1.c. See raid1.c for further copyright information.
9 *
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
21 #include "dm-bio-list.h"
22 #include <linux/raid/raid10.h>
23 #include <linux/raid/bitmap.h>
25 /*
26 * RAID10 provides a combination of RAID0 and RAID1 functionality.
27 * The layout of data is defined by
28 * chunk_size
29 * raid_disks
30 * near_copies (stored in low byte of layout)
31 * far_copies (stored in second byte of layout)
32 * far_offset (stored in bit 16 of layout)
33 *
34 * The data to be stored is divided into chunks using chunksize.
35 * Each device is divided into far_copies sections.
36 * In each section, chunks are laid out in a style similar to raid0, but
37 * near_copies copies of each chunk are stored (each on a different drive).
38 * The starting device for each section is offset near_copies from the starting
39 * device of the previous section.
40 * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different
41 * drive.
42 * near_copies and far_copies must be at least one, and their product is at most
43 * raid_disks.
44 *
45 * If far_offset is true, then the far_copies are handled a bit differently.
46 * The copies are still in different stripes, but instead of being very far apart
47 * on disk, they are adjacent stripes.
48 */
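/*
 * Worked example (illustration added for this write-up, not part of the
 * original file).  With raid_disks = 4, near_copies = 2, far_copies = 1,
 * chunks A, B, C, ... are laid out raid0-style with each chunk duplicated
 * on the adjacent drive:
 *
 *      dev0  dev1  dev2  dev3
 *       A     A     B     B
 *       C     C     D     D
 *
 * With raid_disks = 4, near_copies = 1, far_copies = 2 there is a single
 * copy per stripe, but the second half of every drive holds another copy
 * of the whole array shifted by near_copies (= 1) devices:
 *
 *      dev0  dev1  dev2  dev3
 *       A     B     C     D      <- first far section
 *       E     F     G     H
 *      ...
 *       D     A     B     C      <- second far section
 *       H     E     F     G
 */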
50 /*
51 * Number of guaranteed r10bios in case of extreme VM load:
52 */
53 #define NR_RAID10_BIOS 256
55 static void unplug_slaves(mddev_t *mddev);
57 static void allow_barrier(conf_t *conf);
58 static void lower_barrier(conf_t *conf);
60 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
61 {
62 conf_t *conf = data;
63 r10bio_t *r10_bio;
64 int size = offsetof(struct r10bio_s, devs[conf->copies]);
66 /* allocate a r10bio with room for raid_disks entries in the bios array */
67 r10_bio = kzalloc(size, gfp_flags);
68 if (!r10_bio)
69 unplug_slaves(conf->mddev);
71 return r10_bio;
72 }
74 static void r10bio_pool_free(void *r10_bio, void *data)
75 {
76 kfree(r10_bio);
77 }
79 #define RESYNC_BLOCK_SIZE (64*1024)
80 //#define RESYNC_BLOCK_SIZE PAGE_SIZE
81 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
82 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
83 #define RESYNC_WINDOW (2048*1024)
85 /*
86 * When performing a resync, we need to read and compare, so
86 * we need as many pages as there are copies.
88 * When performing a recovery, we need 2 bios, one for read,
89 * one for write (we recover only one drive per r10buf)
90 *
91 */
92 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
93 {
94 conf_t *conf = data;
95 struct page *page;
96 r10bio_t *r10_bio;
97 struct bio *bio;
98 int i, j;
99 int nalloc;
101 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
102 if (!r10_bio) {
103 unplug_slaves(conf->mddev);
104 return NULL;
105 }
107 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
108 nalloc = conf->copies; /* resync */
109 else
110 nalloc = 2; /* recovery */
112 /*
113 * Allocate bios.
114 */
115 for (j = nalloc ; j-- ; ) {
116 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
117 if (!bio)
118 goto out_free_bio;
119 r10_bio->devs[j].bio = bio;
120 }
121 /*
122 * Allocate RESYNC_PAGES data pages and attach them
123 * where needed.
124 */
125 for (j = 0 ; j < nalloc; j++) {
126 bio = r10_bio->devs[j].bio;
127 for (i = 0; i < RESYNC_PAGES; i++) {
128 page = alloc_page(gfp_flags);
129 if (unlikely(!page))
130 goto out_free_pages;
132 bio->bi_io_vec[i].bv_page = page;
133 }
134 }
136 return r10_bio;
138 out_free_pages:
139 for ( ; i > 0 ; i--)
140 safe_put_page(bio->bi_io_vec[i-1].bv_page);
141 while (j--)
142 for (i = 0; i < RESYNC_PAGES ; i++)
143 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
144 j = -1;
145 out_free_bio:
146 while ( ++j < nalloc )
147 bio_put(r10_bio->devs[j].bio);
148 r10bio_pool_free(r10_bio, conf);
149 return NULL;
150 }
152 static void r10buf_pool_free(void *__r10_bio, void *data)
153 {
154 int i;
155 conf_t *conf = data;
156 r10bio_t *r10bio = __r10_bio;
157 int j;
159 for (j=0; j < conf->copies; j++) {
160 struct bio *bio = r10bio->devs[j].bio;
161 if (bio) {
162 for (i = 0; i < RESYNC_PAGES; i++) {
163 safe_put_page(bio->bi_io_vec[i].bv_page);
164 bio->bi_io_vec[i].bv_page = NULL;
165 }
166 bio_put(bio);
167 }
168 }
169 r10bio_pool_free(r10bio, conf);
170 }
172 static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
173 {
174 int i;
176 for (i = 0; i < conf->copies; i++) {
177 struct bio **bio = & r10_bio->devs[i].bio;
178 if (*bio && *bio != IO_BLOCKED)
179 bio_put(*bio);
180 *bio = NULL;
181 }
182 }
184 static void free_r10bio(r10bio_t *r10_bio)
185 {
186 conf_t *conf = mddev_to_conf(r10_bio->mddev);
188 /*
189 * Wake up any possible resync thread that waits for the device
190 * to go idle.
191 */
192 allow_barrier(conf);
194 put_all_bios(conf, r10_bio);
195 mempool_free(r10_bio, conf->r10bio_pool);
196 }
198 static void put_buf(r10bio_t *r10_bio)
199 {
200 conf_t *conf = mddev_to_conf(r10_bio->mddev);
202 mempool_free(r10_bio, conf->r10buf_pool);
204 lower_barrier(conf);
205 }
207 static void reschedule_retry(r10bio_t *r10_bio)
208 {
209 unsigned long flags;
210 mddev_t *mddev = r10_bio->mddev;
211 conf_t *conf = mddev_to_conf(mddev);
213 spin_lock_irqsave(&conf->device_lock, flags);
214 list_add(&r10_bio->retry_list, &conf->retry_list);
215 conf->nr_queued ++;
216 spin_unlock_irqrestore(&conf->device_lock, flags);
218 md_wakeup_thread(mddev->thread);
219 }
221 /*
222 * raid_end_bio_io() is called when we have finished servicing a mirrored
223 * operation and are ready to return a success/failure code to the buffer
224 * cache layer.
225 */
226 static void raid_end_bio_io(r10bio_t *r10_bio)
227 {
228 struct bio *bio = r10_bio->master_bio;
230 bio_endio(bio, bio->bi_size,
231 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
232 free_r10bio(r10_bio);
233 }
235 /*
236 * Update disk head position estimator based on IRQ completion info.
237 */
238 static inline void update_head_pos(int slot, r10bio_t *r10_bio)
239 {
240 conf_t *conf = mddev_to_conf(r10_bio->mddev);
242 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
243 r10_bio->devs[slot].addr + (r10_bio->sectors);
244 }
246 static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
247 {
248 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
249 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
250 int slot, dev;
251 conf_t *conf = mddev_to_conf(r10_bio->mddev);
253 if (bio->bi_size)
254 return 1;
256 slot = r10_bio->read_slot;
257 dev = r10_bio->devs[slot].devnum;
258 /*
259 * this branch is our 'one mirror IO has finished' event handler:
260 */
261 update_head_pos(slot, r10_bio);
263 if (uptodate) {
264 /*
265 * Set R10BIO_Uptodate in our master bio, so that
266 * we will return a good error code to the higher
267 * levels even if IO on some other mirrored buffer fails.
268 *
269 * The 'master' represents the composite IO operation to
270 * user-side. So if something waits for IO, then it will
271 * wait for the 'master' bio.
272 */
273 set_bit(R10BIO_Uptodate, &r10_bio->state);
274 raid_end_bio_io(r10_bio);
275 } else {
276 /*
277 * oops, read error:
278 */
279 char b[BDEVNAME_SIZE];
280 if (printk_ratelimit())
281 printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
282 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
283 reschedule_retry(r10_bio);
284 }
286 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
287 return 0;
288 }
290 static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
291 {
292 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
293 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
294 int slot, dev;
295 conf_t *conf = mddev_to_conf(r10_bio->mddev);
297 if (bio->bi_size)
298 return 1;
300 for (slot = 0; slot < conf->copies; slot++)
301 if (r10_bio->devs[slot].bio == bio)
302 break;
303 dev = r10_bio->devs[slot].devnum;
305 /*
306 * this branch is our 'one mirror IO has finished' event handler:
307 */
308 if (!uptodate) {
309 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
310 /* an I/O failed, we can't clear the bitmap */
311 set_bit(R10BIO_Degraded, &r10_bio->state);
312 } else
313 /*
314 * Set R10BIO_Uptodate in our master bio, so that
315 * we will return a good error code to the higher
316 * levels even if IO on some other mirrored buffer fails.
317 *
318 * The 'master' represents the composite IO operation to
319 * user-side. So if something waits for IO, then it will
320 * wait for the 'master' bio.
321 */
322 set_bit(R10BIO_Uptodate, &r10_bio->state);
324 update_head_pos(slot, r10_bio);
326 /*
327 *
328 * Let's see if all mirrored write operations have finished
329 * already.
330 */
331 if (atomic_dec_and_test(&r10_bio->remaining)) {
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334 r10_bio->sectors,
335 !test_bit(R10BIO_Degraded, &r10_bio->state),
336 0);
337 md_write_end(r10_bio->mddev);
338 raid_end_bio_io(r10_bio);
339 }
341 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
342 return 0;
343 }
346 /*
347 * RAID10 layout manager
348 * As well as the chunksize and raid_disks count, there are two
349 * parameters: near_copies and far_copies.
350 * near_copies * far_copies must be <= raid_disks.
351 * Normally one of these will be 1.
352 * If both are 1, we get raid0.
353 * If near_copies == raid_disks, we get raid1.
354 *
355 * Chunks are laid out in raid0 style with near_copies copies of the
356 * first chunk, followed by near_copies copies of the next chunk and
357 * so on.
358 * If far_copies > 1, then after 1/far_copies of the array has been assigned
359 * as described above, we start again with a device offset of near_copies.
360 * So we effectively have another copy of the whole array further down all
361 * the drives, but with blocks on different drives.
362 * With this layout, a block is never stored twice on the one device.
363 *
364 * raid10_find_phys finds the sector offset of a given virtual sector
365 * on each device that it is on.
366 *
367 * raid10_find_virt does the reverse mapping, from a device and a
368 * sector offset to a virtual address
369 */
371 static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
372 {
373 int n,f;
374 sector_t sector;
375 sector_t chunk;
376 sector_t stripe;
377 int dev;
379 int slot = 0;
381 /* now calculate first sector/dev */
382 chunk = r10bio->sector >> conf->chunk_shift;
383 sector = r10bio->sector & conf->chunk_mask;
385 chunk *= conf->near_copies;
386 stripe = chunk;
387 dev = sector_div(stripe, conf->raid_disks);
388 if (conf->far_offset)
389 stripe *= conf->far_copies;
391 sector += stripe << conf->chunk_shift;
393 /* and calculate all the others */
394 for (n=0; n < conf->near_copies; n++) {
395 int d = dev;
396 sector_t s = sector;
397 r10bio->devs[slot].addr = sector;
398 r10bio->devs[slot].devnum = d;
399 slot++;
401 for (f = 1; f < conf->far_copies; f++) {
402 d += conf->near_copies;
403 if (d >= conf->raid_disks)
404 d -= conf->raid_disks;
405 s += conf->stride;
406 r10bio->devs[slot].devnum = d;
407 r10bio->devs[slot].addr = s;
408 slot++;
409 }
410 dev++;
411 if (dev >= conf->raid_disks) {
412 dev = 0;
413 sector += (conf->chunk_mask + 1);
414 }
415 }
416 BUG_ON(slot != conf->copies);
417 }
419 static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
420 {
421 sector_t offset, chunk, vchunk;
423 offset = sector & conf->chunk_mask;
424 if (conf->far_offset) {
425 int fc;
426 chunk = sector >> conf->chunk_shift;
427 fc = sector_div(chunk, conf->far_copies);
428 dev -= fc * conf->near_copies;
429 if (dev < 0)
430 dev += conf->raid_disks;
431 } else {
432 while (sector > conf->stride) {
433 sector -= conf->stride;
434 if (dev < conf->near_copies)
435 dev += conf->raid_disks - conf->near_copies;
436 else
437 dev -= conf->near_copies;
438 }
439 chunk = sector >> conf->chunk_shift;
440 }
441 vchunk = chunk * conf->raid_disks + dev;
442 sector_div(vchunk, conf->near_copies);
443 return (vchunk << conf->chunk_shift) + offset;
444 }
446 /**
447 * raid10_mergeable_bvec -- tell the bio layer if two requests can be merged
448 * @q: request queue
449 * @bio: the buffer head that's been built up so far
450 * @biovec: the request that could be merged to it.
451 *
452 * Return amount of bytes we can accept at this offset
453 * If near_copies == raid_disk, there are no striping issues,
454 * but in that case, the function isn't called at all.
455 */
456 static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
457 struct bio_vec *bio_vec)
458 {
459 mddev_t *mddev = q->queuedata;
460 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
461 int max;
462 unsigned int chunk_sectors = mddev->chunk_size >> 9;
463 unsigned int bio_sectors = bio->bi_size >> 9;
465 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
466 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
467 if (max <= bio_vec->bv_len && bio_sectors == 0)
468 return bio_vec->bv_len;
469 else
470 return max;
471 }
473 /*
474 * This routine returns the disk from which the requested read should
475 * be done. There is a per-array 'next expected sequential IO' sector
476 * number - if this matches on the next IO then we use the last disk.
477 * There is also a per-disk 'last known head position' sector that is
478 * maintained from IRQ contexts, both the normal and the resync IO
479 * completion handlers update this position correctly. If there is no
480 * perfect sequential match then we pick the disk whose head is closest.
481 *
482 * If there are 2 mirrors in the same 2 devices, performance degrades
483 * because position is mirror, not device based.
484 *
485 * The rdev for the device selected will have nr_pending incremented.
486 */
488 /*
489 * FIXME: possibly should rethink readbalancing and do it differently
490 * depending on near_copies / far_copies geometry.
491 */
492 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
493 {
494 const unsigned long this_sector = r10_bio->sector;
495 int disk, slot, nslot;
496 const int sectors = r10_bio->sectors;
497 sector_t new_distance, current_distance;
498 mdk_rdev_t *rdev;
500 raid10_find_phys(conf, r10_bio);
501 rcu_read_lock();
502 /*
503 * Check if we can balance. We can balance on the whole
504 * device if no resync is going on (recovery is ok), or below
505 * the resync window. We take the first readable disk when
506 * above the resync window.
507 */
508 if (conf->mddev->recovery_cp < MaxSector
509 && (this_sector + sectors >= conf->next_resync)) {
510 /* make sure that disk is operational */
511 slot = 0;
512 disk = r10_bio->devs[slot].devnum;
514 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
515 r10_bio->devs[slot].bio == IO_BLOCKED ||
516 !test_bit(In_sync, &rdev->flags)) {
517 slot++;
518 if (slot == conf->copies) {
519 slot = 0;
520 disk = -1;
521 break;
522 }
523 disk = r10_bio->devs[slot].devnum;
524 }
525 goto rb_out;
526 }
529 /* make sure the disk is operational */
530 slot = 0;
531 disk = r10_bio->devs[slot].devnum;
532 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
533 r10_bio->devs[slot].bio == IO_BLOCKED ||
534 !test_bit(In_sync, &rdev->flags)) {
535 slot ++;
536 if (slot == conf->copies) {
537 disk = -1;
538 goto rb_out;
539 }
540 disk = r10_bio->devs[slot].devnum;
541 }
544 current_distance = abs(r10_bio->devs[slot].addr -
545 conf->mirrors[disk].head_position);
547 /* Find the disk whose head is closest */
549 for (nslot = slot; nslot < conf->copies; nslot++) {
550 int ndisk = r10_bio->devs[nslot].devnum;
553 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
554 r10_bio->devs[nslot].bio == IO_BLOCKED ||
555 !test_bit(In_sync, &rdev->flags))
556 continue;
558 /* This optimisation is debatable, and completely destroys
559 * sequential read speed for 'far copies' arrays. So only
560 * keep it for 'near' arrays, and review those later.
561 */
562 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
563 disk = ndisk;
564 slot = nslot;
565 break;
566 }
567 new_distance = abs(r10_bio->devs[nslot].addr -
568 conf->mirrors[ndisk].head_position);
569 if (new_distance < current_distance) {
570 current_distance = new_distance;
571 disk = ndisk;
572 slot = nslot;
573 }
574 }
576 rb_out:
577 r10_bio->read_slot = slot;
578 /* conf->next_seq_sect = this_sector + sectors;*/
580 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
581 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
582 else
583 disk = -1;
584 rcu_read_unlock();
586 return disk;
587 }
589 static void unplug_slaves(mddev_t *mddev)
590 {
591 conf_t *conf = mddev_to_conf(mddev);
592 int i;
594 rcu_read_lock();
595 for (i=0; i<mddev->raid_disks; i++) {
596 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
597 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
598 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
600 atomic_inc(&rdev->nr_pending);
601 rcu_read_unlock();
603 if (r_queue->unplug_fn)
604 r_queue->unplug_fn(r_queue);
606 rdev_dec_pending(rdev, mddev);
607 rcu_read_lock();
608 }
609 }
610 rcu_read_unlock();
611 }
613 static void raid10_unplug(request_queue_t *q)
614 {
615 mddev_t *mddev = q->queuedata;
617 unplug_slaves(q->queuedata);
618 md_wakeup_thread(mddev->thread);
619 }
621 static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
622 sector_t *error_sector)
623 {
624 mddev_t *mddev = q->queuedata;
625 conf_t *conf = mddev_to_conf(mddev);
626 int i, ret = 0;
628 rcu_read_lock();
629 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
630 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
631 if (rdev && !test_bit(Faulty, &rdev->flags)) {
632 struct block_device *bdev = rdev->bdev;
633 request_queue_t *r_queue = bdev_get_queue(bdev);
635 if (!r_queue->issue_flush_fn)
636 ret = -EOPNOTSUPP;
637 else {
638 atomic_inc(&rdev->nr_pending);
639 rcu_read_unlock();
640 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
641 error_sector);
642 rdev_dec_pending(rdev, mddev);
643 rcu_read_lock();
644 }
645 }
646 }
647 rcu_read_unlock();
648 return ret;
649 }
651 /* Barriers....
652 * Sometimes we need to suspend IO while we do something else,
653 * either some resync/recovery, or reconfigure the array.
654 * To do this we raise a 'barrier'.
655 * The 'barrier' is a counter that can be raised multiple times
656 * to count how many activities are happening which preclude
657 * normal IO.
658 * We can only raise the barrier if there is no pending IO.
659 * i.e. if nr_pending == 0.
660 * We choose only to raise the barrier if no-one is waiting for the
661 * barrier to go down. This means that as soon as an IO request
662 * is ready, no other operations which require a barrier will start
663 * until the IO request has had a chance.
664 *
665 * So: regular IO calls 'wait_barrier'. When that returns there
666 * is no background IO happening. It must arrange to call
667 * allow_barrier when it has finished its IO.
668 * background IO calls must call raise_barrier. Once that returns
669 * there is no normal IO happening. It must arrange to call
670 * lower_barrier when the particular background IO completes.
671 */
672 #define RESYNC_DEPTH 32
674 static void raise_barrier(conf_t *conf, int force)
675 {
676 BUG_ON(force && !conf->barrier);
677 spin_lock_irq(&conf->resync_lock);
679 /* Wait until no block IO is waiting (unless 'force') */
680 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
681 conf->resync_lock,
682 raid10_unplug(conf->mddev->queue));
684 /* block any new IO from starting */
685 conf->barrier++;
687 /* Now wait for all pending IO to complete */
688 wait_event_lock_irq(conf->wait_barrier,
689 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
690 conf->resync_lock,
691 raid10_unplug(conf->mddev->queue));
693 spin_unlock_irq(&conf->resync_lock);
694 }
696 static void lower_barrier(conf_t *conf)
697 {
698 unsigned long flags;
699 spin_lock_irqsave(&conf->resync_lock, flags);
700 conf->barrier--;
701 spin_unlock_irqrestore(&conf->resync_lock, flags);
702 wake_up(&conf->wait_barrier);
703 }
705 static void wait_barrier(conf_t *conf)
706 {
707 spin_lock_irq(&conf->resync_lock);
708 if (conf->barrier) {
709 conf->nr_waiting++;
710 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
711 conf->resync_lock,
712 raid10_unplug(conf->mddev->queue));
713 conf->nr_waiting--;
714 }
715 conf->nr_pending++;
716 spin_unlock_irq(&conf->resync_lock);
717 }
719 static void allow_barrier(conf_t *conf)
720 {
721 unsigned long flags;
722 spin_lock_irqsave(&conf->resync_lock, flags);
723 conf->nr_pending--;
724 spin_unlock_irqrestore(&conf->resync_lock, flags);
725 wake_up(&conf->wait_barrier);
726 }
728 static void freeze_array(conf_t *conf)
729 {
730 /* stop syncio and normal IO and wait for everything to
731 * go quiet.
732 * We increment barrier and nr_waiting, and then
733 * wait until barrier+nr_pending match nr_queued+2
734 */
735 spin_lock_irq(&conf->resync_lock);
736 conf->barrier++;
737 conf->nr_waiting++;
738 wait_event_lock_irq(conf->wait_barrier,
739 conf->barrier+conf->nr_pending == conf->nr_queued+2,
740 conf->resync_lock,
741 raid10_unplug(conf->mddev->queue));
742 spin_unlock_irq(&conf->resync_lock);
743 }
745 static void unfreeze_array(conf_t *conf)
746 {
747 /* reverse the effect of the freeze */
748 spin_lock_irq(&conf->resync_lock);
749 conf->barrier--;
750 conf->nr_waiting--;
751 wake_up(&conf->wait_barrier);
752 spin_unlock_irq(&conf->resync_lock);
753 }
755 static int make_request(request_queue_t *q, struct bio * bio)
756 {
757 mddev_t *mddev = q->queuedata;
758 conf_t *conf = mddev_to_conf(mddev);
759 mirror_info_t *mirror;
760 r10bio_t *r10_bio;
761 struct bio *read_bio;
762 int i;
763 int chunk_sects = conf->chunk_mask + 1;
764 const int rw = bio_data_dir(bio);
765 struct bio_list bl;
766 unsigned long flags;
768 if (unlikely(bio_barrier(bio))) {
769 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
770 return 0;
771 }
773 /* If this request crosses a chunk boundary, we need to
774 * split it. This will only happen for 1 PAGE (or less) requests.
775 */
776 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
777 > chunk_sects &&
778 conf->near_copies < conf->raid_disks)) {
779 struct bio_pair *bp;
780 /* Sanity check -- queue functions should prevent this happening */
781 if (bio->bi_vcnt != 1 ||
782 bio->bi_idx != 0)
783 goto bad_map;
784 /* This is a one page bio that upper layers
785 * refuse to split for us, so we need to split it.
786 */
787 bp = bio_split(bio, bio_split_pool,
788 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
789 if (make_request(q, &bp->bio1))
790 generic_make_request(&bp->bio1);
791 if (make_request(q, &bp->bio2))
792 generic_make_request(&bp->bio2);
794 bio_pair_release(bp);
795 return 0;
796 bad_map:
797 printk("raid10_make_request bug: can't convert block across chunks"
798 " or bigger than %dk %llu %d\n", chunk_sects/2,
799 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
801 bio_io_error(bio, bio->bi_size);
802 return 0;
803 }
805 md_write_start(mddev, bio);
807 /*
808 * Register the new request and wait if the reconstruction
809 * thread has put up a bar for new requests.
810 * Continue immediately if no resync is active currently.
811 */
812 wait_barrier(conf);
814 disk_stat_inc(mddev->gendisk, ios[rw]);
815 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
817 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
819 r10_bio->master_bio = bio;
820 r10_bio->sectors = bio->bi_size >> 9;
822 r10_bio->mddev = mddev;
823 r10_bio->sector = bio->bi_sector;
824 r10_bio->state = 0;
826 if (rw == READ) {
827 /*
828 * read balancing logic:
829 */
830 int disk = read_balance(conf, r10_bio);
831 int slot = r10_bio->read_slot;
832 if (disk < 0) {
833 raid_end_bio_io(r10_bio);
834 return 0;
835 }
836 mirror = conf->mirrors + disk;
838 read_bio = bio_clone(bio, GFP_NOIO);
840 r10_bio->devs[slot].bio = read_bio;
842 read_bio->bi_sector = r10_bio->devs[slot].addr +
843 mirror->rdev->data_offset;
844 read_bio->bi_bdev = mirror->rdev->bdev;
845 read_bio->bi_end_io = raid10_end_read_request;
846 read_bio->bi_rw = READ;
847 read_bio->bi_private = r10_bio;
849 generic_make_request(read_bio);
850 return 0;
851 }
853 /*
854 * WRITE:
855 */
856 /* first select target devices under spinlock and
857 * inc refcount on their rdev. Record them by setting
858 * bios[x] to bio
859 */
860 raid10_find_phys(conf, r10_bio);
861 rcu_read_lock();
862 for (i = 0; i < conf->copies; i++) {
863 int d = r10_bio->devs[i].devnum;
864 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
865 if (rdev &&
866 !test_bit(Faulty, &rdev->flags)) {
867 atomic_inc(&rdev->nr_pending);
868 r10_bio->devs[i].bio = bio;
869 } else {
870 r10_bio->devs[i].bio = NULL;
871 set_bit(R10BIO_Degraded, &r10_bio->state);
872 }
873 }
874 rcu_read_unlock();
876 atomic_set(&r10_bio->remaining, 0);
878 bio_list_init(&bl);
879 for (i = 0; i < conf->copies; i++) {
880 struct bio *mbio;
881 int d = r10_bio->devs[i].devnum;
882 if (!r10_bio->devs[i].bio)
883 continue;
885 mbio = bio_clone(bio, GFP_NOIO);
886 r10_bio->devs[i].bio = mbio;
888 mbio->bi_sector = r10_bio->devs[i].addr+
889 conf->mirrors[d].rdev->data_offset;
890 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
891 mbio->bi_end_io = raid10_end_write_request;
892 mbio->bi_rw = WRITE;
893 mbio->bi_private = r10_bio;
895 atomic_inc(&r10_bio->remaining);
896 bio_list_add(&bl, mbio);
897 }
899 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
900 spin_lock_irqsave(&conf->device_lock, flags);
901 bio_list_merge(&conf->pending_bio_list, &bl);
902 blk_plug_device(mddev->queue);
903 spin_unlock_irqrestore(&conf->device_lock, flags);
905 return 0;
906 }
908 static void status(struct seq_file *seq, mddev_t *mddev)
909 {
910 conf_t *conf = mddev_to_conf(mddev);
911 int i;
913 if (conf->near_copies < conf->raid_disks)
914 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
915 if (conf->near_copies > 1)
916 seq_printf(seq, " %d near-copies", conf->near_copies);
917 if (conf->far_copies > 1) {
918 if (conf->far_offset)
919 seq_printf(seq, " %d offset-copies", conf->far_copies);
920 else
921 seq_printf(seq, " %d far-copies", conf->far_copies);
922 }
923 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
924 conf->working_disks);
925 for (i = 0; i < conf->raid_disks; i++)
926 seq_printf(seq, "%s",
927 conf->mirrors[i].rdev &&
928 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
929 seq_printf(seq, "]");
930 }
932 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
933 {
934 char b[BDEVNAME_SIZE];
935 conf_t *conf = mddev_to_conf(mddev);
937 /*
938 * If it is not operational, then we have already marked it as dead
939 * else if it is the last working disk, ignore the error and let the
940 * next level up know.
941 * else mark the drive as failed
942 */
943 if (test_bit(In_sync, &rdev->flags)
944 && conf->working_disks == 1)
945 /*
946 * Don't fail the drive, just return an IO error.
947 * The test should really be more sophisticated than
948 * "working_disks == 1", but it isn't critical, and
949 * can wait until we do more sophisticated "is the drive
950 * really dead" tests...
951 */
952 return;
953 if (test_bit(In_sync, &rdev->flags)) {
954 mddev->degraded++;
955 conf->working_disks--;
956 /*
957 * if recovery is running, make sure it aborts.
958 */
959 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
960 }
961 clear_bit(In_sync, &rdev->flags);
962 set_bit(Faulty, &rdev->flags);
963 mddev->sb_dirty = 1;
964 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
965 " Operation continuing on %d devices\n",
966 bdevname(rdev->bdev,b), conf->working_disks);
967 }
969 static void print_conf(conf_t *conf)
970 {
971 int i;
972 mirror_info_t *tmp;
974 printk("RAID10 conf printout:\n");
975 if (!conf) {
976 printk("(!conf)\n");
977 return;
978 }
979 printk(" --- wd:%d rd:%d\n", conf->working_disks,
980 conf->raid_disks);
982 for (i = 0; i < conf->raid_disks; i++) {
983 char b[BDEVNAME_SIZE];
984 tmp = conf->mirrors + i;
985 if (tmp->rdev)
986 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
987 i, !test_bit(In_sync, &tmp->rdev->flags),
988 !test_bit(Faulty, &tmp->rdev->flags),
989 bdevname(tmp->rdev->bdev,b));
990 }
991 }
993 static void close_sync(conf_t *conf)
994 {
995 wait_barrier(conf);
996 allow_barrier(conf);
998 mempool_destroy(conf->r10buf_pool);
999 conf->r10buf_pool = NULL;
1002 /* check if there are enough drives for
1003 * every block to appear on at least one
1004 */
1005 static int enough(conf_t *conf)
1007 int first = 0;
1009 do {
1010 int n = conf->copies;
1011 int cnt = 0;
1012 while (n--) {
1013 if (conf->mirrors[first].rdev)
1014 cnt++;
1015 first = (first+1) % conf->raid_disks;
1017 if (cnt == 0)
1018 return 0;
1019 } while (first != 0);
1020 return 1;
1023 static int raid10_spare_active(mddev_t *mddev)
1025 int i;
1026 conf_t *conf = mddev->private;
1027 mirror_info_t *tmp;
1029 /*
1030 * Find all non-in_sync disks within the RAID10 configuration
1031 * and mark them in_sync
1032 */
1033 for (i = 0; i < conf->raid_disks; i++) {
1034 tmp = conf->mirrors + i;
1035 if (tmp->rdev
1036 && !test_bit(Faulty, &tmp->rdev->flags)
1037 && !test_bit(In_sync, &tmp->rdev->flags)) {
1038 conf->working_disks++;
1039 mddev->degraded--;
1040 set_bit(In_sync, &tmp->rdev->flags);
1044 print_conf(conf);
1045 return 0;
1049 static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1051 conf_t *conf = mddev->private;
1052 int found = 0;
1053 int mirror;
1054 mirror_info_t *p;
1056 if (mddev->recovery_cp < MaxSector)
1057 /* only hot-add to in-sync arrays, as recovery is
1058 * very different from resync
1059 */
1060 return 0;
1061 if (!enough(conf))
1062 return 0;
1064 if (rdev->saved_raid_disk >= 0 &&
1065 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1066 mirror = rdev->saved_raid_disk;
1067 else
1068 mirror = 0;
1069 for ( ; mirror < mddev->raid_disks; mirror++)
1070 if ( !(p=conf->mirrors+mirror)->rdev) {
1072 blk_queue_stack_limits(mddev->queue,
1073 rdev->bdev->bd_disk->queue);
1074 /* as we don't honour merge_bvec_fn, we must never risk
1075 * violating it, so limit ->max_sector to one PAGE, as
1076 * a one page request is never in violation.
1077 */
1078 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1079 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1080 mddev->queue->max_sectors = (PAGE_SIZE>>9);
1082 p->head_position = 0;
1083 rdev->raid_disk = mirror;
1084 found = 1;
1085 if (rdev->saved_raid_disk != mirror)
1086 conf->fullsync = 1;
1087 rcu_assign_pointer(p->rdev, rdev);
1088 break;
1091 print_conf(conf);
1092 return found;
1095 static int raid10_remove_disk(mddev_t *mddev, int number)
1097 conf_t *conf = mddev->private;
1098 int err = 0;
1099 mdk_rdev_t *rdev;
1100 mirror_info_t *p = conf->mirrors+ number;
1102 print_conf(conf);
1103 rdev = p->rdev;
1104 if (rdev) {
1105 if (test_bit(In_sync, &rdev->flags) ||
1106 atomic_read(&rdev->nr_pending)) {
1107 err = -EBUSY;
1108 goto abort;
1110 p->rdev = NULL;
1111 synchronize_rcu();
1112 if (atomic_read(&rdev->nr_pending)) {
1113 /* lost the race, try later */
1114 err = -EBUSY;
1115 p->rdev = rdev;
1118 abort:
1120 print_conf(conf);
1121 return err;
1125 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1127 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1128 conf_t *conf = mddev_to_conf(r10_bio->mddev);
1129 int i,d;
1131 if (bio->bi_size)
1132 return 1;
1134 for (i=0; i<conf->copies; i++)
1135 if (r10_bio->devs[i].bio == bio)
1136 break;
1137 BUG_ON(i == conf->copies);
1138 update_head_pos(i, r10_bio);
1139 d = r10_bio->devs[i].devnum;
1141 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1142 set_bit(R10BIO_Uptodate, &r10_bio->state);
1143 else {
1144 atomic_add(r10_bio->sectors,
1145 &conf->mirrors[d].rdev->corrected_errors);
1146 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1147 md_error(r10_bio->mddev,
1148 conf->mirrors[d].rdev);
1151 /* for reconstruct, we always reschedule after a read.
1152 * for resync, only after all reads
1153 */
1154 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1155 atomic_dec_and_test(&r10_bio->remaining)) {
1156 /* we have read all the blocks,
1157 * do the comparison in process context in raid10d
1158 */
1159 reschedule_retry(r10_bio);
1161 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1162 return 0;
1165 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1167 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1168 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1169 mddev_t *mddev = r10_bio->mddev;
1170 conf_t *conf = mddev_to_conf(mddev);
1171 int i,d;
1173 if (bio->bi_size)
1174 return 1;
1176 for (i = 0; i < conf->copies; i++)
1177 if (r10_bio->devs[i].bio == bio)
1178 break;
1179 d = r10_bio->devs[i].devnum;
1181 if (!uptodate)
1182 md_error(mddev, conf->mirrors[d].rdev);
1183 update_head_pos(i, r10_bio);
1185 while (atomic_dec_and_test(&r10_bio->remaining)) {
1186 if (r10_bio->master_bio == NULL) {
1187 /* the primary of several recovery bios */
1188 md_done_sync(mddev, r10_bio->sectors, 1);
1189 put_buf(r10_bio);
1190 break;
1191 } else {
1192 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1193 put_buf(r10_bio);
1194 r10_bio = r10_bio2;
1197 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1198 return 0;
1201 /*
1202 * Note: sync and recovery are handled very differently for raid10.
1203 * This code is for resync.
1204 * For resync, we read through virtual addresses and read all blocks.
1205 * If there is any error, we schedule a write. The lowest numbered
1206 * drive is authoritative.
1207 * However requests come in for physical addresses, so we need to map.
1208 * For every physical address there are raid_disks/copies virtual addresses,
1209 * which is always at least one, but is not necessarily an integer.
1210 * This means that a physical address can span multiple chunks, so we may
1211 * have to submit multiple io requests for a single sync request.
1212 */
1213 /*
1214 * We check if all blocks are in-sync and only write to blocks that
1215 * aren't in sync
1216 */
1217 static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1219 conf_t *conf = mddev_to_conf(mddev);
1220 int i, first;
1221 struct bio *tbio, *fbio;
1223 atomic_set(&r10_bio->remaining, 1);
1225 /* find the first device with a block */
1226 for (i=0; i<conf->copies; i++)
1227 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1228 break;
1230 if (i == conf->copies)
1231 goto done;
1233 first = i;
1234 fbio = r10_bio->devs[i].bio;
1236 /* now find blocks with errors */
1237 for (i=0 ; i < conf->copies ; i++) {
1238 int j, d;
1239 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1241 tbio = r10_bio->devs[i].bio;
1243 if (tbio->bi_end_io != end_sync_read)
1244 continue;
1245 if (i == first)
1246 continue;
1247 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1248 /* We know that the bi_io_vec layout is the same for
1249 * both 'first' and 'i', so we just compare them.
1250 * All vec entries are PAGE_SIZE;
1251 */
1252 for (j = 0; j < vcnt; j++)
1253 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1254 page_address(tbio->bi_io_vec[j].bv_page),
1255 PAGE_SIZE))
1256 break;
1257 if (j == vcnt)
1258 continue;
1259 mddev->resync_mismatches += r10_bio->sectors;
1261 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1262 /* Don't fix anything. */
1263 continue;
1264 /* Ok, we need to write this bio
1265 * First we need to fixup bv_offset, bv_len and
1266 * bi_vecs, as the read request might have corrupted these
1267 */
1268 tbio->bi_vcnt = vcnt;
1269 tbio->bi_size = r10_bio->sectors << 9;
1270 tbio->bi_idx = 0;
1271 tbio->bi_phys_segments = 0;
1272 tbio->bi_hw_segments = 0;
1273 tbio->bi_hw_front_size = 0;
1274 tbio->bi_hw_back_size = 0;
1275 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1276 tbio->bi_flags |= 1 << BIO_UPTODATE;
1277 tbio->bi_next = NULL;
1278 tbio->bi_rw = WRITE;
1279 tbio->bi_private = r10_bio;
1280 tbio->bi_sector = r10_bio->devs[i].addr;
1282 for (j=0; j < vcnt ; j++) {
1283 tbio->bi_io_vec[j].bv_offset = 0;
1284 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1286 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1287 page_address(fbio->bi_io_vec[j].bv_page),
1288 PAGE_SIZE);
1290 tbio->bi_end_io = end_sync_write;
1292 d = r10_bio->devs[i].devnum;
1293 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1294 atomic_inc(&r10_bio->remaining);
1295 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1297 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1298 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1299 generic_make_request(tbio);
1302 done:
1303 if (atomic_dec_and_test(&r10_bio->remaining)) {
1304 md_done_sync(mddev, r10_bio->sectors, 1);
1305 put_buf(r10_bio);
1309 /*
1310 * Now for the recovery code.
1311 * Recovery happens across physical sectors.
1312 * We recover all non-in_sync drives by finding the virtual address of
1313 * each, and then choose a working drive that also has that virt address.
1314 * There is a separate r10_bio for each non-in_sync drive.
1315 * Only the first two slots are in use. The first for reading,
1316 * The second for writing.
1318 */
1320 static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1322 conf_t *conf = mddev_to_conf(mddev);
1323 int i, d;
1324 struct bio *bio, *wbio;
1327 /* move the pages across to the second bio
1328 * and submit the write request
1329 */
1330 bio = r10_bio->devs[0].bio;
1331 wbio = r10_bio->devs[1].bio;
1332 for (i=0; i < wbio->bi_vcnt; i++) {
1333 struct page *p = bio->bi_io_vec[i].bv_page;
1334 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1335 wbio->bi_io_vec[i].bv_page = p;
1337 d = r10_bio->devs[1].devnum;
1339 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1340 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1341 if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1342 generic_make_request(wbio);
1343 else
1344 bio_endio(wbio, wbio->bi_size, -EIO);
1348 /*
1349 * This is a kernel thread which:
1351 * 1. Retries failed read operations on working mirrors.
1352 * 2. Updates the raid superblock when problems are encountered.
1353 * 3. Performs writes following reads for array synchronising.
1354 */
1356 static void raid10d(mddev_t *mddev)
1358 r10bio_t *r10_bio;
1359 struct bio *bio;
1360 unsigned long flags;
1361 conf_t *conf = mddev_to_conf(mddev);
1362 struct list_head *head = &conf->retry_list;
1363 int unplug=0;
1364 mdk_rdev_t *rdev;
1366 md_check_recovery(mddev);
1368 for (;;) {
1369 char b[BDEVNAME_SIZE];
1370 spin_lock_irqsave(&conf->device_lock, flags);
1372 if (conf->pending_bio_list.head) {
1373 bio = bio_list_get(&conf->pending_bio_list);
1374 blk_remove_plug(mddev->queue);
1375 spin_unlock_irqrestore(&conf->device_lock, flags);
1376 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1377 if (bitmap_unplug(mddev->bitmap) != 0)
1378 printk("%s: bitmap file write failed!\n", mdname(mddev));
1380 while (bio) { /* submit pending writes */
1381 struct bio *next = bio->bi_next;
1382 bio->bi_next = NULL;
1383 generic_make_request(bio);
1384 bio = next;
1386 unplug = 1;
1388 continue;
1391 if (list_empty(head))
1392 break;
1393 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1394 list_del(head->prev);
1395 conf->nr_queued--;
1396 spin_unlock_irqrestore(&conf->device_lock, flags);
1398 mddev = r10_bio->mddev;
1399 conf = mddev_to_conf(mddev);
1400 if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1401 sync_request_write(mddev, r10_bio);
1402 unplug = 1;
1403 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1404 recovery_request_write(mddev, r10_bio);
1405 unplug = 1;
1406 } else {
1407 int mirror;
1408 /* we got a read error. Maybe the drive is bad. Maybe just
1409 * the block and we can fix it.
1410 * We freeze all other IO, and try reading the block from
1411 * other devices. When we find one, we re-write
1412 * and check if that fixes the read error.
1413 * This is all done synchronously while the array is
1414 * frozen.
1415 */
1416 int sect = 0; /* Offset from r10_bio->sector */
1417 int sectors = r10_bio->sectors;
1418 freeze_array(conf);
1419 if (mddev->ro == 0) while(sectors) {
1420 int s = sectors;
1421 int sl = r10_bio->read_slot;
1422 int success = 0;
1424 if (s > (PAGE_SIZE>>9))
1425 s = PAGE_SIZE >> 9;
1427 rcu_read_lock();
1428 do {
1429 int d = r10_bio->devs[sl].devnum;
1430 rdev = rcu_dereference(conf->mirrors[d].rdev);
1431 if (rdev &&
1432 test_bit(In_sync, &rdev->flags)) {
1433 atomic_inc(&rdev->nr_pending);
1434 rcu_read_unlock();
1435 success = sync_page_io(rdev->bdev,
1436 r10_bio->devs[sl].addr +
1437 sect + rdev->data_offset,
1438 s<<9,
1439 conf->tmppage, READ);
1440 rdev_dec_pending(rdev, mddev);
1441 rcu_read_lock();
1442 if (success)
1443 break;
1445 sl++;
1446 if (sl == conf->copies)
1447 sl = 0;
1448 } while (!success && sl != r10_bio->read_slot);
1449 rcu_read_unlock();
1451 if (success) {
1452 int start = sl;
1453 /* write it back and re-read */
1454 rcu_read_lock();
1455 while (sl != r10_bio->read_slot) {
1456 int d;
1457 if (sl==0)
1458 sl = conf->copies;
1459 sl--;
1460 d = r10_bio->devs[sl].devnum;
1461 rdev = rcu_dereference(conf->mirrors[d].rdev);
1462 if (rdev &&
1463 test_bit(In_sync, &rdev->flags)) {
1464 atomic_inc(&rdev->nr_pending);
1465 rcu_read_unlock();
1466 atomic_add(s, &rdev->corrected_errors);
1467 if (sync_page_io(rdev->bdev,
1468 r10_bio->devs[sl].addr +
1469 sect + rdev->data_offset,
1470 s<<9, conf->tmppage, WRITE) == 0)
1471 /* Well, this device is dead */
1472 md_error(mddev, rdev);
1473 rdev_dec_pending(rdev, mddev);
1474 rcu_read_lock();
1477 sl = start;
1478 while (sl != r10_bio->read_slot) {
1479 int d;
1480 if (sl==0)
1481 sl = conf->copies;
1482 sl--;
1483 d = r10_bio->devs[sl].devnum;
1484 rdev = rcu_dereference(conf->mirrors[d].rdev);
1485 if (rdev &&
1486 test_bit(In_sync, &rdev->flags)) {
1487 atomic_inc(&rdev->nr_pending);
1488 rcu_read_unlock();
1489 if (sync_page_io(rdev->bdev,
1490 r10_bio->devs[sl].addr +
1491 sect + rdev->data_offset,
1492 s<<9, conf->tmppage, READ) == 0)
1493 /* Well, this device is dead */
1494 md_error(mddev, rdev);
1495 else
1496 printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
1497 mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
1499 rdev_dec_pending(rdev, mddev);
1500 rcu_read_lock();
1503 rcu_read_unlock();
1504 } else {
1505 /* Cannot read from anywhere -- bye bye array */
1506 md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
1507 break;
1509 sectors -= s;
1510 sect += s;
1513 unfreeze_array(conf);
1515 bio = r10_bio->devs[r10_bio->read_slot].bio;
1516 r10_bio->devs[r10_bio->read_slot].bio =
1517 mddev->ro ? IO_BLOCKED : NULL;
1518 bio_put(bio);
1519 mirror = read_balance(conf, r10_bio);
1520 if (mirror == -1) {
1521 printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
1522 " read error for block %llu\n",
1523 bdevname(bio->bi_bdev,b),
1524 (unsigned long long)r10_bio->sector);
1525 raid_end_bio_io(r10_bio);
1526 } else {
1527 rdev = conf->mirrors[mirror].rdev;
1528 if (printk_ratelimit())
1529 printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
1530 " another mirror\n",
1531 bdevname(rdev->bdev,b),
1532 (unsigned long long)r10_bio->sector);
1533 bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1534 r10_bio->devs[r10_bio->read_slot].bio = bio;
1535 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1536 + rdev->data_offset;
1537 bio->bi_bdev = rdev->bdev;
1538 bio->bi_rw = READ;
1539 bio->bi_private = r10_bio;
1540 bio->bi_end_io = raid10_end_read_request;
1541 unplug = 1;
1542 generic_make_request(bio);
1546 spin_unlock_irqrestore(&conf->device_lock, flags);
1547 if (unplug)
1548 unplug_slaves(mddev);
1552 static int init_resync(conf_t *conf)
1554 int buffs;
1556 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1557 BUG_ON(conf->r10buf_pool);
1558 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1559 if (!conf->r10buf_pool)
1560 return -ENOMEM;
1561 conf->next_resync = 0;
1562 return 0;
1565 /*
1566 * perform a "sync" on one "block"
1568 * We need to make sure that no normal I/O request - particularly write
1569 * requests - conflict with active sync requests.
1571 * This is achieved by tracking pending requests and a 'barrier' concept
1572 * that can be installed to exclude normal IO requests.
1574 * Resync and recovery are handled very differently.
1575 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
1577 * For resync, we iterate over virtual addresses, read all copies,
1578 * and update if there are differences. If only one copy is live,
1579 * skip it.
1580 * For recovery, we iterate over physical addresses, read a good
1581 * value for each non-in_sync drive, and over-write.
1583 * So, for recovery we may have several outstanding complex requests for a
1584 * given address, one for each out-of-sync device. We model this by allocating
1585 * a number of r10_bio structures, one for each out-of-sync device.
1586 * As we setup these structures, we collect all bio's together into a list
1587 * which we then process collectively to add pages, and then process again
1588 * to pass to generic_make_request.
1590 * The r10_bio structures are linked using a borrowed master_bio pointer.
1591 * This link is counted in ->remaining. When the r10_bio that points to NULL
1592 * has its remaining count decremented to 0, the whole complex operation
1593 * is complete.
1595 */
1597 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1599 conf_t *conf = mddev_to_conf(mddev);
1600 r10bio_t *r10_bio;
1601 struct bio *biolist = NULL, *bio;
1602 sector_t max_sector, nr_sectors;
1603 int disk;
1604 int i;
1605 int max_sync;
1606 int sync_blocks;
1608 sector_t sectors_skipped = 0;
1609 int chunks_skipped = 0;
1611 if (!conf->r10buf_pool)
1612 if (init_resync(conf))
1613 return 0;
1615 skipped:
1616 max_sector = mddev->size << 1;
1617 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1618 max_sector = mddev->resync_max_sectors;
1619 if (sector_nr >= max_sector) {
1620 /* If we aborted, we need to abort the
1621 * sync on the 'current' bitmap chunks (there can
1622 * be several when recovering multiple devices).
1623 * as we may have started syncing it but not finished.
1624 * We can find the current address in
1625 * mddev->curr_resync, but for recovery,
1626 * we need to convert that to several
1627 * virtual addresses.
1628 */
1629 if (mddev->curr_resync < max_sector) { /* aborted */
1630 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1631 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1632 &sync_blocks, 1);
1633 else for (i=0; i<conf->raid_disks; i++) {
1634 sector_t sect =
1635 raid10_find_virt(conf, mddev->curr_resync, i);
1636 bitmap_end_sync(mddev->bitmap, sect,
1637 &sync_blocks, 1);
1639 } else /* completed sync */
1640 conf->fullsync = 0;
1642 bitmap_close_sync(mddev->bitmap);
1643 close_sync(conf);
1644 *skipped = 1;
1645 return sectors_skipped;
1647 if (chunks_skipped >= conf->raid_disks) {
1648 /* if there has been nothing to do on any drive,
1649 * then there is nothing to do at all..
1650 */
1651 *skipped = 1;
1652 return (max_sector - sector_nr) + sectors_skipped;
1655 /* make sure whole request will fit in a chunk - if chunks
1656 * are meaningful
1657 */
1658 if (conf->near_copies < conf->raid_disks &&
1659 max_sector > (sector_nr | conf->chunk_mask))
1660 max_sector = (sector_nr | conf->chunk_mask) + 1;
1661 /*
1662 * If there is non-resync activity waiting for us then
1663 * put in a delay to throttle resync.
1664 */
1665 if (!go_faster && conf->nr_waiting)
1666 msleep_interruptible(1000);
1668 /* Again, very different code for resync and recovery.
1669 * Both must result in an r10bio with a list of bios that
1670 * have bi_end_io, bi_sector, bi_bdev set,
1671 * and bi_private set to the r10bio.
1672 * For recovery, we may actually create several r10bios
1673 * with 2 bios in each, that correspond to the bios in the main one.
1674 * In this case, the subordinate r10bios link back through a
1675 * borrowed master_bio pointer, and the counter in the master
1676 * includes a ref from each subordinate.
1677 */
1678 /* First, we decide what to do and set ->bi_end_io
1679 * To end_sync_read if we want to read, and
1680 * end_sync_write if we will want to write.
1681 */
1683 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1684 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1685 /* recovery... the complicated one */
1686 int i, j, k;
1687 r10_bio = NULL;
1689 for (i=0 ; i<conf->raid_disks; i++)
1690 if (conf->mirrors[i].rdev &&
1691 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1692 int still_degraded = 0;
1693 /* want to reconstruct this device */
1694 r10bio_t *rb2 = r10_bio;
1695 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1696 int must_sync;
1697 /* Unless we are doing a full sync, we only need
1698 * to recover the block if it is set in the bitmap
1699 */
1700 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1701 &sync_blocks, 1);
1702 if (sync_blocks < max_sync)
1703 max_sync = sync_blocks;
1704 if (!must_sync &&
1705 !conf->fullsync) {
1706 /* yep, skip the sync_blocks here, but don't assume
1707 * that there will never be anything to do here
1708 */
1709 chunks_skipped = -1;
1710 continue;
1713 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1714 raise_barrier(conf, rb2 != NULL);
1715 atomic_set(&r10_bio->remaining, 0);
1717 r10_bio->master_bio = (struct bio*)rb2;
1718 if (rb2)
1719 atomic_inc(&rb2->remaining);
1720 r10_bio->mddev = mddev;
1721 set_bit(R10BIO_IsRecover, &r10_bio->state);
1722 r10_bio->sector = sect;
1724 raid10_find_phys(conf, r10_bio);
1725 /* Need to check if this section will still be
1726 * degraded
1727 */
1728 for (j=0; j<conf->copies;j++) {
1729 int d = r10_bio->devs[j].devnum;
1730 if (conf->mirrors[d].rdev == NULL ||
1731 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
1732 still_degraded = 1;
1733 break;
1736 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1737 &sync_blocks, still_degraded);
1739 for (j=0; j<conf->copies;j++) {
1740 int d = r10_bio->devs[j].devnum;
1741 if (conf->mirrors[d].rdev &&
1742 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1743 /* This is where we read from */
1744 bio = r10_bio->devs[0].bio;
1745 bio->bi_next = biolist;
1746 biolist = bio;
1747 bio->bi_private = r10_bio;
1748 bio->bi_end_io = end_sync_read;
1749 bio->bi_rw = 0;
1750 bio->bi_sector = r10_bio->devs[j].addr +
1751 conf->mirrors[d].rdev->data_offset;
1752 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1753 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1754 atomic_inc(&r10_bio->remaining);
1755 /* and we write to 'i' */
1757 for (k=0; k<conf->copies; k++)
1758 if (r10_bio->devs[k].devnum == i)
1759 break;
1760 bio = r10_bio->devs[1].bio;
1761 bio->bi_next = biolist;
1762 biolist = bio;
1763 bio->bi_private = r10_bio;
1764 bio->bi_end_io = end_sync_write;
1765 bio->bi_rw = 1;
1766 bio->bi_sector = r10_bio->devs[k].addr +
1767 conf->mirrors[i].rdev->data_offset;
1768 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1770 r10_bio->devs[0].devnum = d;
1771 r10_bio->devs[1].devnum = i;
1773 break;
1776 if (j == conf->copies) {
1777 /* Cannot recover, so abort the recovery */
1778 put_buf(r10_bio);
1779 r10_bio = rb2;
1780 if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
1781 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1782 mdname(mddev));
1783 break;
1786 if (biolist == NULL) {
1787 while (r10_bio) {
1788 r10bio_t *rb2 = r10_bio;
1789 r10_bio = (r10bio_t*) rb2->master_bio;
1790 rb2->master_bio = NULL;
1791 put_buf(rb2);
1793 goto giveup;
1795 } else {
1796 /* resync. Schedule a read for every block at this virt offset */
1797 int count = 0;
1799 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1800 &sync_blocks, mddev->degraded) &&
1801 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1802 /* We can skip this block */
1803 *skipped = 1;
1804 return sync_blocks + sectors_skipped;
1806 if (sync_blocks < max_sync)
1807 max_sync = sync_blocks;
1808 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1810 r10_bio->mddev = mddev;
1811 atomic_set(&r10_bio->remaining, 0);
1812 raise_barrier(conf, 0);
1813 conf->next_resync = sector_nr;
1815 r10_bio->master_bio = NULL;
1816 r10_bio->sector = sector_nr;
1817 set_bit(R10BIO_IsSync, &r10_bio->state);
1818 raid10_find_phys(conf, r10_bio);
1819 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
1821 for (i=0; i<conf->copies; i++) {
1822 int d = r10_bio->devs[i].devnum;
1823 bio = r10_bio->devs[i].bio;
1824 bio->bi_end_io = NULL;
1825 if (conf->mirrors[d].rdev == NULL ||
1826 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1827 continue;
1828 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1829 atomic_inc(&r10_bio->remaining);
1830 bio->bi_next = biolist;
1831 biolist = bio;
1832 bio->bi_private = r10_bio;
1833 bio->bi_end_io = end_sync_read;
1834 bio->bi_rw = 0;
1835 bio->bi_sector = r10_bio->devs[i].addr +
1836 conf->mirrors[d].rdev->data_offset;
1837 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1838 count++;
1841 if (count < 2) {
1842 for (i=0; i<conf->copies; i++) {
1843 int d = r10_bio->devs[i].devnum;
1844 if (r10_bio->devs[i].bio->bi_end_io)
1845 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1847 put_buf(r10_bio);
1848 biolist = NULL;
1849 goto giveup;
1853 for (bio = biolist; bio ; bio=bio->bi_next) {
1855 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1856 if (bio->bi_end_io)
1857 bio->bi_flags |= 1 << BIO_UPTODATE;
1858 bio->bi_vcnt = 0;
1859 bio->bi_idx = 0;
1860 bio->bi_phys_segments = 0;
1861 bio->bi_hw_segments = 0;
1862 bio->bi_size = 0;
1865 nr_sectors = 0;
1866 if (sector_nr + max_sync < max_sector)
1867 max_sector = sector_nr + max_sync;
1868 do {
1869 struct page *page;
1870 int len = PAGE_SIZE;
1871 disk = 0;
1872 if (sector_nr + (len>>9) > max_sector)
1873 len = (max_sector - sector_nr) << 9;
1874 if (len == 0)
1875 break;
1876 for (bio= biolist ; bio ; bio=bio->bi_next) {
1877 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1878 if (bio_add_page(bio, page, len, 0) == 0) {
1879 /* stop here */
1880 struct bio *bio2;
1881 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1882 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
1883 /* remove last page from this bio */
1884 bio2->bi_vcnt--;
1885 bio2->bi_size -= len;
1886 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
1888 goto bio_full;
1890 disk = i;
1892 nr_sectors += len>>9;
1893 sector_nr += len>>9;
1894 } while (biolist->bi_vcnt < RESYNC_PAGES);
1895 bio_full:
1896 r10_bio->sectors = nr_sectors;
1898 while (biolist) {
1899 bio = biolist;
1900 biolist = biolist->bi_next;
1902 bio->bi_next = NULL;
1903 r10_bio = bio->bi_private;
1904 r10_bio->sectors = nr_sectors;
1906 if (bio->bi_end_io == end_sync_read) {
1907 md_sync_acct(bio->bi_bdev, nr_sectors);
1908 generic_make_request(bio);
1909 }
1910 }
1912 if (sectors_skipped)
1913 /* pretend they weren't skipped, it makes
1914 * no important difference in this case
1915 */
1916 md_done_sync(mddev, sectors_skipped, 1);
1918 return sectors_skipped + nr_sectors;
1919 giveup:
1920 /* There is nowhere to write, so all non-sync
1921 * drives must be failed, so try the next chunk...
1922 */
1923 {
1924 sector_t sec = max_sector - sector_nr;
1925 sectors_skipped += sec;
1926 chunks_skipped ++;
1927 sector_nr = max_sector;
1928 goto skipped;
1929 }
1930 }
1932 static int run(mddev_t *mddev)
1933 {
1934 conf_t *conf;
1935 int i, disk_idx;
1936 mirror_info_t *disk;
1937 mdk_rdev_t *rdev;
1938 struct list_head *tmp;
1939 int nc, fc, fo;
1940 sector_t stride, size;
1942 if (mddev->chunk_size == 0) {
1943 printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
1944 return -EINVAL;
1945 }
1947 nc = mddev->layout & 255;
1948 fc = (mddev->layout >> 8) & 255;
1949 fo = mddev->layout & (1<<16);
1950 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
1951 (mddev->layout >> 17)) {
1952 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
1953 mdname(mddev), mddev->layout);
1954 goto out;
1955 }
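/* Editorial note, not part of the original source: as an illustration,
 * mdadm's default "n2" layout is encoded as mddev->layout == 0x102, so
 *   nc = 0x102 & 255        = 2  (near copies)
 *   fc = (0x102 >> 8) & 255 = 1  (far copies)
 *   fo = 0x102 & (1<<16)    = 0  (no far offset)
 * and nc*fc = 2, which passes the check above on any array of two or more
 * devices.
 */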
1956 /*
1957 * copy the already verified devices into our private RAID10
1958 * bookkeeping area. [whatever we allocate in run(),
1959 * should be freed in stop()]
1960 */
1961 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1962 mddev->private = conf;
1963 if (!conf) {
1964 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1965 mdname(mddev));
1966 goto out;
1967 }
1968 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1969 GFP_KERNEL);
1970 if (!conf->mirrors) {
1971 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1972 mdname(mddev));
1973 goto out_free_conf;
1974 }
1976 conf->tmppage = alloc_page(GFP_KERNEL);
1977 if (!conf->tmppage)
1978 goto out_free_conf;
1980 conf->near_copies = nc;
1981 conf->far_copies = fc;
1982 conf->copies = nc*fc;
1983 conf->far_offset = fo;
1984 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
1985 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
1986 if (fo)
1987 conf->stride = 1 << conf->chunk_shift;
1988 else {
1989 stride = mddev->size >> (conf->chunk_shift-1);
1990 sector_div(stride, fc);
1991 conf->stride = stride << conf->chunk_shift;
1992 }
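/* Editorial note, not part of the original source: a worked example,
 * assuming chunk_size = 64KiB and 1GiB devices (mddev->size = 1048576 KiB)
 * with fc = 2 and no far_offset:
 *   chunk_mask  = (65536 >> 9) - 1 = 127 sectors
 *   chunk_shift = ffz(~65536) - 9  = 16 - 9 = 7   (128 sectors per chunk)
 *   stride      = ((1048576 >> 6) / 2) << 7 = 1048576 sectors (512 MiB),
 * i.e. each device is divided into fc = 2 "far" sections of 512 MiB each.
 */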
1993 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
1994 r10bio_pool_free, conf);
1995 if (!conf->r10bio_pool) {
1996 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1997 mdname(mddev));
1998 goto out_free_conf;
1999 }
2001 ITERATE_RDEV(mddev, rdev, tmp) {
2002 disk_idx = rdev->raid_disk;
2003 if (disk_idx >= mddev->raid_disks
2004 || disk_idx < 0)
2005 continue;
2006 disk = conf->mirrors + disk_idx;
2008 disk->rdev = rdev;
2010 blk_queue_stack_limits(mddev->queue,
2011 rdev->bdev->bd_disk->queue);
2012 /* as we don't honour merge_bvec_fn, we must never risk
2013 * violating it, so limit ->max_sectors to one PAGE, as
2014 * a one page request is never in violation.
2015 */
2016 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2017 mddev->queue->max_sectors > (PAGE_SIZE>>9))
2018 mddev->queue->max_sectors = (PAGE_SIZE>>9);
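/* Editorial note, not part of the original source: with PAGE_SIZE = 4096
 * this clamps max_sectors to PAGE_SIZE >> 9 = 8 sectors (one 4KiB page),
 * so no request can span more than a page and therefore cannot violate a
 * lower-level merge_bvec_fn that raid10 never consults.
 */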
2020 disk->head_position = 0;
2021 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
2022 conf->working_disks++;
2023 }
2024 conf->raid_disks = mddev->raid_disks;
2025 conf->mddev = mddev;
2026 spin_lock_init(&conf->device_lock);
2027 INIT_LIST_HEAD(&conf->retry_list);
2029 spin_lock_init(&conf->resync_lock);
2030 init_waitqueue_head(&conf->wait_barrier);
2032 /* need to check that every block has at least one working mirror */
2033 if (!enough(conf)) {
2034 printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
2035 mdname(mddev));
2036 goto out_free_conf;
2037 }
2039 mddev->degraded = 0;
2040 for (i = 0; i < conf->raid_disks; i++) {
2042 disk = conf->mirrors + i;
2044 if (!disk->rdev ||
2045 !test_bit(In_sync, &disk->rdev->flags)) {
2046 disk->head_position = 0;
2047 mddev->degraded++;
2048 }
2049 }
2052 mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
2053 if (!mddev->thread) {
2054 printk(KERN_ERR
2055 "raid10: couldn't allocate thread for %s\n",
2056 mdname(mddev));
2057 goto out_free_conf;
2058 }
2060 printk(KERN_INFO
2061 "raid10: raid set %s active with %d out of %d devices\n",
2062 mdname(mddev), mddev->raid_disks - mddev->degraded,
2063 mddev->raid_disks);
2064 /*
2065 * Ok, everything is just fine now
2066 */
2067 if (conf->far_offset) {
2068 size = mddev->size >> (conf->chunk_shift-1);
2069 size *= conf->raid_disks;
2070 size <<= conf->chunk_shift;
2071 sector_div(size, conf->far_copies);
2072 } else
2073 size = conf->stride * conf->raid_disks;
2074 sector_div(size, conf->near_copies);
2075 mddev->array_size = size/2;
2076 mddev->resync_max_sectors = size;
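/* Editorial note, not part of the original source: for example, four 1GiB
 * devices with the default n2 layout (near_copies = 2, far_copies = 1, so
 * conf->stride = 2097152 sectors, one whole device) give
 *   size = 2097152 * 4 / 2 = 4194304 sectors,
 * hence mddev->array_size = size/2 = 2097152 KiB, a 2GiB array: the
 * capacity of two devices, as expected when every chunk is stored twice.
 */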
2078 mddev->queue->unplug_fn = raid10_unplug;
2079 mddev->queue->issue_flush_fn = raid10_issue_flush;
2081 /* Calculate max read-ahead size.
2082 * We need to readahead at least twice a whole stripe....
2083 * maybe...
2084 */
2085 {
2086 int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2087 stripe /= conf->near_copies;
2088 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2089 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2090 }
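/* Editorial note, not part of the original source: for example, four
 * devices, 64KiB chunks, 4KiB pages and near_copies = 2 give
 *   stripe = 4 * (65536 / 4096) / 2 = 32 pages,
 * so ra_pages is raised to at least 2 * 32 = 64 pages (256 KiB), roughly
 * two full stripes of read-ahead.
 */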
2092 if (conf->near_copies < mddev->raid_disks)
2093 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2094 return 0;
2096 out_free_conf:
2097 if (conf->r10bio_pool)
2098 mempool_destroy(conf->r10bio_pool);
2099 safe_put_page(conf->tmppage);
2100 kfree(conf->mirrors);
2101 kfree(conf);
2102 mddev->private = NULL;
2103 out:
2104 return -EIO;
2105 }
2107 static int stop(mddev_t *mddev)
2108 {
2109 conf_t *conf = mddev_to_conf(mddev);
2111 md_unregister_thread(mddev->thread);
2112 mddev->thread = NULL;
2113 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2114 if (conf->r10bio_pool)
2115 mempool_destroy(conf->r10bio_pool);
2116 kfree(conf->mirrors);
2117 kfree(conf);
2118 mddev->private = NULL;
2119 return 0;
2120 }
2122 static void raid10_quiesce(mddev_t *mddev, int state)
2123 {
2124 conf_t *conf = mddev_to_conf(mddev);
2126 switch(state) {
2127 case 1:
2128 raise_barrier(conf, 0);
2129 break;
2130 case 0:
2131 lower_barrier(conf);
2132 break;
2133 }
2134 if (mddev->thread) {
2135 if (mddev->bitmap)
2136 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2137 else
2138 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2139 md_wakeup_thread(mddev->thread);
2140 }
2141 }
2143 static struct mdk_personality raid10_personality =
2144 {
2145 .name = "raid10",
2146 .level = 10,
2147 .owner = THIS_MODULE,
2148 .make_request = make_request,
2149 .run = run,
2150 .stop = stop,
2151 .status = status,
2152 .error_handler = error,
2153 .hot_add_disk = raid10_add_disk,
2154 .hot_remove_disk= raid10_remove_disk,
2155 .spare_active = raid10_spare_active,
2156 .sync_request = sync_request,
2157 .quiesce = raid10_quiesce,
2158 };
2160 static int __init raid_init(void)
2161 {
2162 return register_md_personality(&raid10_personality);
2163 }
2165 static void raid_exit(void)
2166 {
2167 unregister_md_personality(&raid10_personality);
2168 }
2170 module_init(raid_init);
2171 module_exit(raid_exit);
2172 MODULE_LICENSE("GPL");
2173 MODULE_ALIAS("md-personality-9"); /* RAID10 */
2174 MODULE_ALIAS("md-raid10");
2175 MODULE_ALIAS("md-level-10");