ia64/linux-2.6.18-xen.hg

view drivers/md/raid5.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.
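As a rough sketch of the retry behaviour described above (illustrative only,
not the patch itself; the helper and counter names are stand-ins for the real
balloon driver internals), a partial success is kept and the operation is
simply re-armed on a timer:

    /* Illustrative sketch only -- not the actual patch. */
    static void balloon_retry(unsigned long data)
    {
        unsigned long want = current_target() - current_pages; /* stand-in accounting  */
        unsigned long got  = try_increase_reservation(want);   /* hypothetical helper; */
                                                                /* may return < want    */
        current_pages += got;               /* keep any pages we did get */
        if (got < want)
            /* host memory pressure: retry later instead of
             * recording a permanent "hard limit" */
            mod_timer(&balloon_timer, jiffies + HZ);
    }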

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
1 /*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
21 /*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->bm_write is the number of the last batch successfully written.
31 * conf->bm_flush is the number of the last batch that was closed to
32 * new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is bm_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment bm_flush, thus closing the current
39 * batch.
40 * When we notice that bm_flush > bm_write, we write out all pending updates
41 * to the bitmap, and advance bm_write to where bm_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
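/* Illustrative model of the batching scheme described above (a sketch, not
 * part of raid5.c; the names mirror the fields the comment refers to):
 */
struct bm_model {
	int bm_write;	/* last batch fully written to the bitmap */
	int bm_flush;	/* last batch closed to new additions     */
};

/* add_stripe_bio(): a stripe about to be written records its batch */
static int record_batch(struct bm_model *m)
{
	return m->bm_flush + 1;		/* becomes sh->bm_seq */
}

/* unplug: close the current batch */
static void close_batch(struct bm_model *m)
{
	m->bm_flush++;
}

/* once bm_flush is ahead of bm_write, the pending bitmap updates are
 * written out and bm_write catches up; a stripe whose bm_seq is still
 * greater than bm_write must wait (it is queued and the array plugged,
 * as __release_stripe() does further down). */
static int may_write_stripe(const struct bm_model *m, int bm_seq)
{
	return bm_seq <= m->bm_write;
}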
46 #include <linux/module.h>
47 #include <linux/slab.h>
48 #include <linux/highmem.h>
49 #include <linux/bitops.h>
50 #include <linux/kthread.h>
51 #include <asm/atomic.h>
52 #include "raid6.h"
54 #include <linux/raid/bitmap.h>
56 /*
57 * Stripe cache
58 */
60 #define NR_STRIPES 256
61 #define STRIPE_SIZE PAGE_SIZE
62 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
63 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
64 #define IO_THRESHOLD 1
65 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
66 #define HASH_MASK (NR_HASH - 1)
68 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
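/* Worked example (assuming 4 KiB pages and 8-byte pointers, which the source
 * does not state): STRIPE_SIZE is then 4096 bytes, STRIPE_SECTORS is 8
 * (512-byte sectors per stripe unit), STRIPE_SHIFT is 3 and NR_HASH is 512,
 * so stripe_hash() indexes the table with bits 3..11 of the sector number.
 */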
70 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
71 * order without overlap. There may be several bio's per stripe+device, and
72 * a bio could span several devices.
73 * When walking this list for a particular stripe+device, we must never proceed
74 * beyond a bio that extends past this device, as the next bio might no longer
75 * be valid.
76 * This macro is used to determine the 'next' bio in the list, given the sector
77 * of the current stripe+device
78 */
79 #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
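/* Illustrative sketch (not part of raid5.c): walking a per-device bio chain
 * with r5_next_bio(), in the same way the handle_stripe paths below do.
 */
static void walk_chain_sketch(struct r5dev *dev)
{
	struct bio *bi = dev->toread;	/* or dev->written / dev->towrite */

	/* stop at the first bio that extends past this stripe+device */
	while (bi && bi->bi_sector < dev->sector + STRIPE_SECTORS) {
		/* ... process the part of 'bi' covering this stripe unit ... */
		bi = r5_next_bio(bi, dev->sector);
	}
}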
80 /*
81 * The following can be used to debug the driver
82 */
83 #define RAID5_DEBUG 0
84 #define RAID5_PARANOIA 1
85 #if RAID5_PARANOIA && defined(CONFIG_SMP)
86 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
87 #else
88 # define CHECK_DEVLOCK()
89 #endif
91 #define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x)))
92 #if RAID5_DEBUG
93 #define inline
94 #define __inline__
95 #endif
97 #if !RAID6_USE_EMPTY_ZERO_PAGE
98 /* In .bss so it's zeroed */
99 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
100 #endif
102 static inline int raid6_next_disk(int disk, int raid_disks)
103 {
104 disk++;
105 return (disk < raid_disks) ? disk : 0;
106 }
107 static void print_raid5_conf (raid5_conf_t *conf);
109 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
110 {
111 if (atomic_dec_and_test(&sh->count)) {
112 BUG_ON(!list_empty(&sh->lru));
113 BUG_ON(atomic_read(&conf->active_stripes)==0);
114 if (test_bit(STRIPE_HANDLE, &sh->state)) {
115 if (test_bit(STRIPE_DELAYED, &sh->state)) {
116 list_add_tail(&sh->lru, &conf->delayed_list);
117 blk_plug_device(conf->mddev->queue);
118 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
119 sh->bm_seq - conf->seq_write > 0) {
120 list_add_tail(&sh->lru, &conf->bitmap_list);
121 blk_plug_device(conf->mddev->queue);
122 } else {
123 clear_bit(STRIPE_BIT_DELAY, &sh->state);
124 list_add_tail(&sh->lru, &conf->handle_list);
125 }
126 md_wakeup_thread(conf->mddev->thread);
127 } else {
128 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
129 atomic_dec(&conf->preread_active_stripes);
130 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
131 md_wakeup_thread(conf->mddev->thread);
132 }
133 atomic_dec(&conf->active_stripes);
134 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
135 list_add_tail(&sh->lru, &conf->inactive_list);
136 wake_up(&conf->wait_for_stripe);
137 }
138 }
139 }
140 }
141 static void release_stripe(struct stripe_head *sh)
142 {
143 raid5_conf_t *conf = sh->raid_conf;
144 unsigned long flags;
146 spin_lock_irqsave(&conf->device_lock, flags);
147 __release_stripe(conf, sh);
148 spin_unlock_irqrestore(&conf->device_lock, flags);
149 }
151 static inline void remove_hash(struct stripe_head *sh)
152 {
153 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
155 hlist_del_init(&sh->hash);
156 }
158 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
159 {
160 struct hlist_head *hp = stripe_hash(conf, sh->sector);
162 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
164 CHECK_DEVLOCK();
165 hlist_add_head(&sh->hash, hp);
166 }
169 /* find an idle stripe, make sure it is unhashed, and return it. */
170 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
171 {
172 struct stripe_head *sh = NULL;
173 struct list_head *first;
175 CHECK_DEVLOCK();
176 if (list_empty(&conf->inactive_list))
177 goto out;
178 first = conf->inactive_list.next;
179 sh = list_entry(first, struct stripe_head, lru);
180 list_del_init(first);
181 remove_hash(sh);
182 atomic_inc(&conf->active_stripes);
183 out:
184 return sh;
185 }
187 static void shrink_buffers(struct stripe_head *sh, int num)
188 {
189 struct page *p;
190 int i;
192 for (i=0; i<num ; i++) {
193 p = sh->dev[i].page;
194 if (!p)
195 continue;
196 sh->dev[i].page = NULL;
197 put_page(p);
198 }
199 }
201 static int grow_buffers(struct stripe_head *sh, int num)
202 {
203 int i;
205 for (i=0; i<num; i++) {
206 struct page *page;
208 if (!(page = alloc_page(GFP_KERNEL))) {
209 return 1;
210 }
211 sh->dev[i].page = page;
212 }
213 return 0;
214 }
216 static void raid5_build_block (struct stripe_head *sh, int i);
218 static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
219 {
220 raid5_conf_t *conf = sh->raid_conf;
221 int i;
223 BUG_ON(atomic_read(&sh->count) != 0);
224 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
226 CHECK_DEVLOCK();
227 PRINTK("init_stripe called, stripe %llu\n",
228 (unsigned long long)sh->sector);
230 remove_hash(sh);
232 sh->sector = sector;
233 sh->pd_idx = pd_idx;
234 sh->state = 0;
236 sh->disks = disks;
238 for (i = sh->disks; i--; ) {
239 struct r5dev *dev = &sh->dev[i];
241 if (dev->toread || dev->towrite || dev->written ||
242 test_bit(R5_LOCKED, &dev->flags)) {
243 printk("sector=%llx i=%d %p %p %p %d\n",
244 (unsigned long long)sh->sector, i, dev->toread,
245 dev->towrite, dev->written,
246 test_bit(R5_LOCKED, &dev->flags));
247 BUG();
248 }
249 dev->flags = 0;
250 raid5_build_block(sh, i);
251 }
252 insert_hash(conf, sh);
253 }
255 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
256 {
257 struct stripe_head *sh;
258 struct hlist_node *hn;
260 CHECK_DEVLOCK();
261 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
262 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
263 if (sh->sector == sector && sh->disks == disks)
264 return sh;
265 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
266 return NULL;
267 }
269 static void unplug_slaves(mddev_t *mddev);
270 static void raid5_unplug_device(request_queue_t *q);
272 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
273 int pd_idx, int noblock)
274 {
275 struct stripe_head *sh;
277 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
279 spin_lock_irq(&conf->device_lock);
281 do {
282 wait_event_lock_irq(conf->wait_for_stripe,
283 conf->quiesce == 0,
284 conf->device_lock, /* nothing */);
285 sh = __find_stripe(conf, sector, disks);
286 if (!sh) {
287 if (!conf->inactive_blocked)
288 sh = get_free_stripe(conf);
289 if (noblock && sh == NULL)
290 break;
291 if (!sh) {
292 conf->inactive_blocked = 1;
293 wait_event_lock_irq(conf->wait_for_stripe,
294 !list_empty(&conf->inactive_list) &&
295 (atomic_read(&conf->active_stripes)
296 < (conf->max_nr_stripes *3/4)
297 || !conf->inactive_blocked),
298 conf->device_lock,
299 raid5_unplug_device(conf->mddev->queue)
300 );
301 conf->inactive_blocked = 0;
302 } else
303 init_stripe(sh, sector, pd_idx, disks);
304 } else {
305 if (atomic_read(&sh->count)) {
306 BUG_ON(!list_empty(&sh->lru));
307 } else {
308 if (!test_bit(STRIPE_HANDLE, &sh->state))
309 atomic_inc(&conf->active_stripes);
310 if (list_empty(&sh->lru) &&
311 !test_bit(STRIPE_EXPANDING, &sh->state))
312 BUG();
313 list_del_init(&sh->lru);
314 }
315 }
316 } while (sh == NULL);
318 if (sh)
319 atomic_inc(&sh->count);
321 spin_unlock_irq(&conf->device_lock);
322 return sh;
323 }
325 static int grow_one_stripe(raid5_conf_t *conf)
326 {
327 struct stripe_head *sh;
328 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
329 if (!sh)
330 return 0;
331 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
332 sh->raid_conf = conf;
333 spin_lock_init(&sh->lock);
335 if (grow_buffers(sh, conf->raid_disks)) {
336 shrink_buffers(sh, conf->raid_disks);
337 kmem_cache_free(conf->slab_cache, sh);
338 return 0;
339 }
340 sh->disks = conf->raid_disks;
341 /* we just created an active stripe so... */
342 atomic_set(&sh->count, 1);
343 atomic_inc(&conf->active_stripes);
344 INIT_LIST_HEAD(&sh->lru);
345 release_stripe(sh);
346 return 1;
347 }
349 static int grow_stripes(raid5_conf_t *conf, int num)
350 {
351 kmem_cache_t *sc;
352 int devs = conf->raid_disks;
354 sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
355 sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
356 conf->active_name = 0;
357 sc = kmem_cache_create(conf->cache_name[conf->active_name],
358 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
359 0, 0, NULL, NULL);
360 if (!sc)
361 return 1;
362 conf->slab_cache = sc;
363 conf->pool_size = devs;
364 while (num--)
365 if (!grow_one_stripe(conf))
366 return 1;
367 return 0;
368 }
370 #ifdef CONFIG_MD_RAID5_RESHAPE
371 static int resize_stripes(raid5_conf_t *conf, int newsize)
372 {
373 /* Make all the stripes able to hold 'newsize' devices.
374 * New slots in each stripe get 'page' set to a new page.
375 *
376 * This happens in stages:
377 * 1/ create a new kmem_cache and allocate the required number of
378 * stripe_heads.
379 * 2/ gather all the old stripe_heads and transfer the pages across
380 * to the new stripe_heads. This will have the side effect of
381 * freezing the array as once all stripe_heads have been collected,
382 * no IO will be possible. Old stripe heads are freed once their
383 * pages have been transferred over, and the old kmem_cache is
384 * freed when all stripes are done.
385 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
386 * we simply return a failure status - no need to clean anything up.
387 * 4/ allocate new pages for the new slots in the new stripe_heads.
388 * If this fails, we don't bother trying to shrink the
389 * stripe_heads down again, we just leave them as they are.
390 * As each stripe_head is processed the new one is released into
391 * active service.
392 *
393 * Once step2 is started, we cannot afford to wait for a write,
394 * so we use GFP_NOIO allocations.
395 */
396 struct stripe_head *osh, *nsh;
397 LIST_HEAD(newstripes);
398 struct disk_info *ndisks;
399 int err = 0;
400 kmem_cache_t *sc;
401 int i;
403 if (newsize <= conf->pool_size)
404 return 0; /* never bother to shrink */
406 /* Step 1 */
407 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
408 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
409 0, 0, NULL, NULL);
410 if (!sc)
411 return -ENOMEM;
413 for (i = conf->max_nr_stripes; i; i--) {
414 nsh = kmem_cache_alloc(sc, GFP_KERNEL);
415 if (!nsh)
416 break;
418 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
420 nsh->raid_conf = conf;
421 spin_lock_init(&nsh->lock);
423 list_add(&nsh->lru, &newstripes);
424 }
425 if (i) {
426 /* didn't get enough, give up */
427 while (!list_empty(&newstripes)) {
428 nsh = list_entry(newstripes.next, struct stripe_head, lru);
429 list_del(&nsh->lru);
430 kmem_cache_free(sc, nsh);
431 }
432 kmem_cache_destroy(sc);
433 return -ENOMEM;
434 }
435 /* Step 2 - Must use GFP_NOIO now.
436 * OK, we have enough stripes, start collecting inactive
437 * stripes and copying them over
438 */
439 list_for_each_entry(nsh, &newstripes, lru) {
440 spin_lock_irq(&conf->device_lock);
441 wait_event_lock_irq(conf->wait_for_stripe,
442 !list_empty(&conf->inactive_list),
443 conf->device_lock,
444 unplug_slaves(conf->mddev)
445 );
446 osh = get_free_stripe(conf);
447 spin_unlock_irq(&conf->device_lock);
448 atomic_set(&nsh->count, 1);
449 for(i=0; i<conf->pool_size; i++)
450 nsh->dev[i].page = osh->dev[i].page;
451 for( ; i<newsize; i++)
452 nsh->dev[i].page = NULL;
453 kmem_cache_free(conf->slab_cache, osh);
454 }
455 kmem_cache_destroy(conf->slab_cache);
457 /* Step 3.
458 * At this point, we are holding all the stripes so the array
459 * is completely stalled, so now is a good time to resize
460 * conf->disks.
461 */
462 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
463 if (ndisks) {
464 for (i=0; i<conf->raid_disks; i++)
465 ndisks[i] = conf->disks[i];
466 kfree(conf->disks);
467 conf->disks = ndisks;
468 } else
469 err = -ENOMEM;
471 /* Step 4, return new stripes to service */
472 while(!list_empty(&newstripes)) {
473 nsh = list_entry(newstripes.next, struct stripe_head, lru);
474 list_del_init(&nsh->lru);
475 for (i=conf->raid_disks; i < newsize; i++)
476 if (nsh->dev[i].page == NULL) {
477 struct page *p = alloc_page(GFP_NOIO);
478 nsh->dev[i].page = p;
479 if (!p)
480 err = -ENOMEM;
481 }
482 release_stripe(nsh);
483 }
484 /* critical section passed, GFP_NOIO no longer needed */
486 conf->slab_cache = sc;
487 conf->active_name = 1-conf->active_name;
488 conf->pool_size = newsize;
489 return err;
490 }
491 #endif
493 static int drop_one_stripe(raid5_conf_t *conf)
494 {
495 struct stripe_head *sh;
497 spin_lock_irq(&conf->device_lock);
498 sh = get_free_stripe(conf);
499 spin_unlock_irq(&conf->device_lock);
500 if (!sh)
501 return 0;
502 BUG_ON(atomic_read(&sh->count));
503 shrink_buffers(sh, conf->pool_size);
504 kmem_cache_free(conf->slab_cache, sh);
505 atomic_dec(&conf->active_stripes);
506 return 1;
507 }
509 static void shrink_stripes(raid5_conf_t *conf)
510 {
511 while (drop_one_stripe(conf))
512 ;
514 if (conf->slab_cache)
515 kmem_cache_destroy(conf->slab_cache);
516 conf->slab_cache = NULL;
517 }
519 static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
520 int error)
521 {
522 struct stripe_head *sh = bi->bi_private;
523 raid5_conf_t *conf = sh->raid_conf;
524 int disks = sh->disks, i;
525 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
526 char b[BDEVNAME_SIZE];
527 mdk_rdev_t *rdev;
529 if (bi->bi_size)
530 return 1;
532 for (i=0 ; i<disks; i++)
533 if (bi == &sh->dev[i].req)
534 break;
536 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
537 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
538 uptodate);
539 if (i == disks) {
540 BUG();
541 return 0;
542 }
544 if (uptodate) {
545 #if 0
546 struct bio *bio;
547 unsigned long flags;
548 spin_lock_irqsave(&conf->device_lock, flags);
549 /* we can return a buffer if we bypassed the cache or
550 * if the top buffer is not in highmem. If there are
551 * multiple buffers, leave the extra work to
552 * handle_stripe
553 */
554 buffer = sh->bh_read[i];
555 if (buffer &&
556 (!PageHighMem(buffer->b_page)
557 || buffer->b_page == bh->b_page )
558 ) {
559 sh->bh_read[i] = buffer->b_reqnext;
560 buffer->b_reqnext = NULL;
561 } else
562 buffer = NULL;
563 spin_unlock_irqrestore(&conf->device_lock, flags);
564 if (sh->bh_page[i]==bh->b_page)
565 set_buffer_uptodate(bh);
566 if (buffer) {
567 if (buffer->b_page != bh->b_page)
568 memcpy(buffer->b_data, bh->b_data, bh->b_size);
569 buffer->b_end_io(buffer, 1);
570 }
571 #else
572 set_bit(R5_UPTODATE, &sh->dev[i].flags);
573 #endif
574 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
575 rdev = conf->disks[i].rdev;
576 printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
577 mdname(conf->mddev), STRIPE_SECTORS,
578 (unsigned long long)sh->sector + rdev->data_offset,
579 bdevname(rdev->bdev, b));
580 clear_bit(R5_ReadError, &sh->dev[i].flags);
581 clear_bit(R5_ReWrite, &sh->dev[i].flags);
582 }
583 if (atomic_read(&conf->disks[i].rdev->read_errors))
584 atomic_set(&conf->disks[i].rdev->read_errors, 0);
585 } else {
586 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
587 int retry = 0;
588 rdev = conf->disks[i].rdev;
590 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
591 atomic_inc(&rdev->read_errors);
592 if (conf->mddev->degraded)
593 printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
594 mdname(conf->mddev),
595 (unsigned long long)sh->sector + rdev->data_offset,
596 bdn);
597 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
598 /* Oh, no!!! */
599 printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
600 mdname(conf->mddev),
601 (unsigned long long)sh->sector + rdev->data_offset,
602 bdn);
603 else if (atomic_read(&rdev->read_errors)
604 > conf->max_nr_stripes)
605 printk(KERN_WARNING
606 "raid5:%s: Too many read errors, failing device %s.\n",
607 mdname(conf->mddev), bdn);
608 else
609 retry = 1;
610 if (retry)
611 set_bit(R5_ReadError, &sh->dev[i].flags);
612 else {
613 clear_bit(R5_ReadError, &sh->dev[i].flags);
614 clear_bit(R5_ReWrite, &sh->dev[i].flags);
615 md_error(conf->mddev, rdev);
616 }
617 }
618 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
619 #if 0
620 /* must restore b_page before unlocking buffer... */
621 if (sh->bh_page[i] != bh->b_page) {
622 bh->b_page = sh->bh_page[i];
623 bh->b_data = page_address(bh->b_page);
624 clear_buffer_uptodate(bh);
625 }
626 #endif
627 clear_bit(R5_LOCKED, &sh->dev[i].flags);
628 set_bit(STRIPE_HANDLE, &sh->state);
629 release_stripe(sh);
630 return 0;
631 }
633 static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
634 int error)
635 {
636 struct stripe_head *sh = bi->bi_private;
637 raid5_conf_t *conf = sh->raid_conf;
638 int disks = sh->disks, i;
639 unsigned long flags;
640 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
642 if (bi->bi_size)
643 return 1;
645 for (i=0 ; i<disks; i++)
646 if (bi == &sh->dev[i].req)
647 break;
649 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
650 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
651 uptodate);
652 if (i == disks) {
653 BUG();
654 return 0;
655 }
657 spin_lock_irqsave(&conf->device_lock, flags);
658 if (!uptodate)
659 md_error(conf->mddev, conf->disks[i].rdev);
661 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
663 clear_bit(R5_LOCKED, &sh->dev[i].flags);
664 set_bit(STRIPE_HANDLE, &sh->state);
665 __release_stripe(conf, sh);
666 spin_unlock_irqrestore(&conf->device_lock, flags);
667 return 0;
668 }
671 static sector_t compute_blocknr(struct stripe_head *sh, int i);
673 static void raid5_build_block (struct stripe_head *sh, int i)
674 {
675 struct r5dev *dev = &sh->dev[i];
677 bio_init(&dev->req);
678 dev->req.bi_io_vec = &dev->vec;
679 dev->req.bi_vcnt++;
680 dev->req.bi_max_vecs++;
681 dev->vec.bv_page = dev->page;
682 dev->vec.bv_len = STRIPE_SIZE;
683 dev->vec.bv_offset = 0;
685 dev->req.bi_sector = sh->sector;
686 dev->req.bi_private = sh;
688 dev->flags = 0;
689 dev->sector = compute_blocknr(sh, i);
690 }
692 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
693 {
694 char b[BDEVNAME_SIZE];
695 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
696 PRINTK("raid5: error called\n");
698 if (!test_bit(Faulty, &rdev->flags)) {
699 mddev->sb_dirty = 1;
700 if (test_bit(In_sync, &rdev->flags)) {
701 conf->working_disks--;
702 mddev->degraded++;
703 conf->failed_disks++;
704 clear_bit(In_sync, &rdev->flags);
705 /*
706 * if recovery was running, make sure it aborts.
707 */
708 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
709 }
710 set_bit(Faulty, &rdev->flags);
711 printk (KERN_ALERT
712 "raid5: Disk failure on %s, disabling device."
713 " Operation continuing on %d devices\n",
714 bdevname(rdev->bdev,b), conf->working_disks);
715 }
716 }
718 /*
719 * Input: a 'big' sector number,
720 * Output: index of the data and parity disk, and the sector # in them.
721 */
722 static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
723 unsigned int data_disks, unsigned int * dd_idx,
724 unsigned int * pd_idx, raid5_conf_t *conf)
725 {
726 long stripe;
727 unsigned long chunk_number;
728 unsigned int chunk_offset;
729 sector_t new_sector;
730 int sectors_per_chunk = conf->chunk_size >> 9;
732 /* First compute the information on this sector */
734 /*
735 * Compute the chunk number and the sector offset inside the chunk
736 */
737 chunk_offset = sector_div(r_sector, sectors_per_chunk);
738 chunk_number = r_sector;
739 BUG_ON(r_sector != chunk_number);
741 /*
742 * Compute the stripe number
743 */
744 stripe = chunk_number / data_disks;
746 /*
747 * Compute the data disk and parity disk indexes inside the stripe
748 */
749 *dd_idx = chunk_number % data_disks;
751 /*
752 * Select the parity disk based on the user selected algorithm.
753 */
754 switch(conf->level) {
755 case 4:
756 *pd_idx = data_disks;
757 break;
758 case 5:
759 switch (conf->algorithm) {
760 case ALGORITHM_LEFT_ASYMMETRIC:
761 *pd_idx = data_disks - stripe % raid_disks;
762 if (*dd_idx >= *pd_idx)
763 (*dd_idx)++;
764 break;
765 case ALGORITHM_RIGHT_ASYMMETRIC:
766 *pd_idx = stripe % raid_disks;
767 if (*dd_idx >= *pd_idx)
768 (*dd_idx)++;
769 break;
770 case ALGORITHM_LEFT_SYMMETRIC:
771 *pd_idx = data_disks - stripe % raid_disks;
772 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
773 break;
774 case ALGORITHM_RIGHT_SYMMETRIC:
775 *pd_idx = stripe % raid_disks;
776 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
777 break;
778 default:
779 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
780 conf->algorithm);
781 }
782 break;
783 case 6:
785 /**** FIX THIS ****/
786 switch (conf->algorithm) {
787 case ALGORITHM_LEFT_ASYMMETRIC:
788 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
789 if (*pd_idx == raid_disks-1)
790 (*dd_idx)++; /* Q D D D P */
791 else if (*dd_idx >= *pd_idx)
792 (*dd_idx) += 2; /* D D P Q D */
793 break;
794 case ALGORITHM_RIGHT_ASYMMETRIC:
795 *pd_idx = stripe % raid_disks;
796 if (*pd_idx == raid_disks-1)
797 (*dd_idx)++; /* Q D D D P */
798 else if (*dd_idx >= *pd_idx)
799 (*dd_idx) += 2; /* D D P Q D */
800 break;
801 case ALGORITHM_LEFT_SYMMETRIC:
802 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
803 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
804 break;
805 case ALGORITHM_RIGHT_SYMMETRIC:
806 *pd_idx = stripe % raid_disks;
807 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
808 break;
809 default:
810 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
811 conf->algorithm);
812 }
813 break;
814 }
816 /*
817 * Finally, compute the new sector number
818 */
819 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
820 return new_sector;
821 }
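/* Worked example (illustrative; assumes a 4-disk RAID5 with 64 KiB chunks,
 * i.e. sectors_per_chunk = 128, and ALGORITHM_LEFT_SYMMETRIC):
 *
 *	r_sector     = 1000
 *	chunk_offset = 1000 % 128 = 104,  chunk_number = 1000 / 128 = 7
 *	stripe       = 7 / 3 = 2,         dd_idx (pre-shuffle) = 7 % 3 = 1
 *	pd_idx       = data_disks - stripe % raid_disks = 3 - 2 = 1
 *	dd_idx       = (pd_idx + 1 + dd_idx) % raid_disks = (1 + 1 + 1) % 4 = 3
 *	new_sector   = 2 * 128 + 104 = 360
 *
 * so array sector 1000 lives at sector 360 of disk 3, with the parity for
 * that stripe on disk 1.  compute_blocknr() below is the inverse mapping and
 * cross-checks itself against this function.
 */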
824 static sector_t compute_blocknr(struct stripe_head *sh, int i)
825 {
826 raid5_conf_t *conf = sh->raid_conf;
827 int raid_disks = sh->disks, data_disks = raid_disks - 1;
828 sector_t new_sector = sh->sector, check;
829 int sectors_per_chunk = conf->chunk_size >> 9;
830 sector_t stripe;
831 int chunk_offset;
832 int chunk_number, dummy1, dummy2, dd_idx = i;
833 sector_t r_sector;
836 chunk_offset = sector_div(new_sector, sectors_per_chunk);
837 stripe = new_sector;
838 BUG_ON(new_sector != stripe);
840 if (i == sh->pd_idx)
841 return 0;
842 switch(conf->level) {
843 case 4: break;
844 case 5:
845 switch (conf->algorithm) {
846 case ALGORITHM_LEFT_ASYMMETRIC:
847 case ALGORITHM_RIGHT_ASYMMETRIC:
848 if (i > sh->pd_idx)
849 i--;
850 break;
851 case ALGORITHM_LEFT_SYMMETRIC:
852 case ALGORITHM_RIGHT_SYMMETRIC:
853 if (i < sh->pd_idx)
854 i += raid_disks;
855 i -= (sh->pd_idx + 1);
856 break;
857 default:
858 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
859 conf->algorithm);
860 }
861 break;
862 case 6:
863 data_disks = raid_disks - 2;
864 if (i == raid6_next_disk(sh->pd_idx, raid_disks))
865 return 0; /* It is the Q disk */
866 switch (conf->algorithm) {
867 case ALGORITHM_LEFT_ASYMMETRIC:
868 case ALGORITHM_RIGHT_ASYMMETRIC:
869 if (sh->pd_idx == raid_disks-1)
870 i--; /* Q D D D P */
871 else if (i > sh->pd_idx)
872 i -= 2; /* D D P Q D */
873 break;
874 case ALGORITHM_LEFT_SYMMETRIC:
875 case ALGORITHM_RIGHT_SYMMETRIC:
876 if (sh->pd_idx == raid_disks-1)
877 i--; /* Q D D D P */
878 else {
879 /* D D P Q D */
880 if (i < sh->pd_idx)
881 i += raid_disks;
882 i -= (sh->pd_idx + 2);
883 }
884 break;
885 default:
886 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
887 conf->algorithm);
888 }
889 break;
890 }
892 chunk_number = stripe * data_disks + i;
893 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
895 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
896 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
897 printk(KERN_ERR "compute_blocknr: map not correct\n");
898 return 0;
899 }
900 return r_sector;
901 }
905 /*
906 * Copy data between a page in the stripe cache, and one or more bion
907 * The page could align with the middle of the bio, or there could be
908 * several bion, each with several bio_vecs, which cover part of the page
909 * Multiple bion are linked together on bi_next. There may be extras
910 * at the end of this list. We ignore them.
911 */
912 static void copy_data(int frombio, struct bio *bio,
913 struct page *page,
914 sector_t sector)
915 {
916 char *pa = page_address(page);
917 struct bio_vec *bvl;
918 int i;
919 int page_offset;
921 if (bio->bi_sector >= sector)
922 page_offset = (signed)(bio->bi_sector - sector) * 512;
923 else
924 page_offset = (signed)(sector - bio->bi_sector) * -512;
925 bio_for_each_segment(bvl, bio, i) {
926 int len = bio_iovec_idx(bio,i)->bv_len;
927 int clen;
928 int b_offset = 0;
930 if (page_offset < 0) {
931 b_offset = -page_offset;
932 page_offset += b_offset;
933 len -= b_offset;
934 }
936 if (len > 0 && page_offset + len > STRIPE_SIZE)
937 clen = STRIPE_SIZE - page_offset;
938 else clen = len;
940 if (clen > 0) {
941 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
942 if (frombio)
943 memcpy(pa+page_offset, ba+b_offset, clen);
944 else
945 memcpy(ba+b_offset, pa+page_offset, clen);
946 __bio_kunmap_atomic(ba, KM_USER0);
947 }
948 if (clen < len) /* hit end of page */
949 break;
950 page_offset += len;
951 }
952 }
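/* Worked example for the offset handling above (illustrative only): say the
 * stripe unit starts at sector 360 and a bio starts at sector 356.  Then
 * page_offset = (360 - 356) * -512 = -2048, so for the first bio_vec
 * b_offset becomes 2048 and page_offset is clipped up to 0: the first 2 KiB
 * of the bio belong to the previous stripe unit and are skipped, and copying
 * starts at the beginning of this stripe's cache page.  Any bio_vec data that
 * would run past STRIPE_SIZE is likewise clipped via clen.
 */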
954 #define check_xor() do { \
955 if (count == MAX_XOR_BLOCKS) { \
956 xor_block(count, STRIPE_SIZE, ptr); \
957 count = 1; \
958 } \
959 } while(0)
962 static void compute_block(struct stripe_head *sh, int dd_idx)
963 {
964 int i, count, disks = sh->disks;
965 void *ptr[MAX_XOR_BLOCKS], *p;
967 PRINTK("compute_block, stripe %llu, idx %d\n",
968 (unsigned long long)sh->sector, dd_idx);
970 ptr[0] = page_address(sh->dev[dd_idx].page);
971 memset(ptr[0], 0, STRIPE_SIZE);
972 count = 1;
973 for (i = disks ; i--; ) {
974 if (i == dd_idx)
975 continue;
976 p = page_address(sh->dev[i].page);
977 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
978 ptr[count++] = p;
979 else
980 printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
981 " not present\n", dd_idx,
982 (unsigned long long)sh->sector, i);
984 check_xor();
985 }
986 if (count != 1)
987 xor_block(count, STRIPE_SIZE, ptr);
988 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
989 }
991 static void compute_parity5(struct stripe_head *sh, int method)
992 {
993 raid5_conf_t *conf = sh->raid_conf;
994 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
995 void *ptr[MAX_XOR_BLOCKS];
996 struct bio *chosen;
998 PRINTK("compute_parity5, stripe %llu, method %d\n",
999 (unsigned long long)sh->sector, method);
1001 count = 1;
1002 ptr[0] = page_address(sh->dev[pd_idx].page);
1003 switch(method) {
1004 case READ_MODIFY_WRITE:
1005 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
1006 for (i=disks ; i-- ;) {
1007 if (i==pd_idx)
1008 continue;
1009 if (sh->dev[i].towrite &&
1010 test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1011 ptr[count++] = page_address(sh->dev[i].page);
1012 chosen = sh->dev[i].towrite;
1013 sh->dev[i].towrite = NULL;
1015 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1016 wake_up(&conf->wait_for_overlap);
1018 BUG_ON(sh->dev[i].written);
1019 sh->dev[i].written = chosen;
1020 check_xor();
1023 break;
1024 case RECONSTRUCT_WRITE:
1025 memset(ptr[0], 0, STRIPE_SIZE);
1026 for (i= disks; i-- ;)
1027 if (i!=pd_idx && sh->dev[i].towrite) {
1028 chosen = sh->dev[i].towrite;
1029 sh->dev[i].towrite = NULL;
1031 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1032 wake_up(&conf->wait_for_overlap);
1034 BUG_ON(sh->dev[i].written);
1035 sh->dev[i].written = chosen;
1037 break;
1038 case CHECK_PARITY:
1039 break;
1041 if (count>1) {
1042 xor_block(count, STRIPE_SIZE, ptr);
1043 count = 1;
1046 for (i = disks; i--;)
1047 if (sh->dev[i].written) {
1048 sector_t sector = sh->dev[i].sector;
1049 struct bio *wbi = sh->dev[i].written;
1050 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1051 copy_data(1, wbi, sh->dev[i].page, sector);
1052 wbi = r5_next_bio(wbi, sector);
1055 set_bit(R5_LOCKED, &sh->dev[i].flags);
1056 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1059 switch(method) {
1060 case RECONSTRUCT_WRITE:
1061 case CHECK_PARITY:
1062 for (i=disks; i--;)
1063 if (i != pd_idx) {
1064 ptr[count++] = page_address(sh->dev[i].page);
1065 check_xor();
1067 break;
1068 case READ_MODIFY_WRITE:
1069 for (i = disks; i--;)
1070 if (sh->dev[i].written) {
1071 ptr[count++] = page_address(sh->dev[i].page);
1072 check_xor();
1075 if (count != 1)
1076 xor_block(count, STRIPE_SIZE, ptr);
1078 if (method != CHECK_PARITY) {
1079 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1080 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1081 } else
1082 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1085 static void compute_parity6(struct stripe_head *sh, int method)
1087 raid6_conf_t *conf = sh->raid_conf;
1088 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
1089 struct bio *chosen;
1090 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1091 void *ptrs[disks];
1093 qd_idx = raid6_next_disk(pd_idx, disks);
1094 d0_idx = raid6_next_disk(qd_idx, disks);
1096 PRINTK("compute_parity, stripe %llu, method %d\n",
1097 (unsigned long long)sh->sector, method);
1099 switch(method) {
1100 case READ_MODIFY_WRITE:
1101 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1102 case RECONSTRUCT_WRITE:
1103 for (i= disks; i-- ;)
1104 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1105 chosen = sh->dev[i].towrite;
1106 sh->dev[i].towrite = NULL;
1108 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1109 wake_up(&conf->wait_for_overlap);
1111 if (sh->dev[i].written) BUG();
1112 sh->dev[i].written = chosen;
1114 break;
1115 case CHECK_PARITY:
1116 BUG(); /* Not implemented yet */
1119 for (i = disks; i--;)
1120 if (sh->dev[i].written) {
1121 sector_t sector = sh->dev[i].sector;
1122 struct bio *wbi = sh->dev[i].written;
1123 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1124 copy_data(1, wbi, sh->dev[i].page, sector);
1125 wbi = r5_next_bio(wbi, sector);
1128 set_bit(R5_LOCKED, &sh->dev[i].flags);
1129 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1132 // switch(method) {
1133 // case RECONSTRUCT_WRITE:
1134 // case CHECK_PARITY:
1135 // case UPDATE_PARITY:
1136 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
1137 /* FIX: Is this ordering of drives even remotely optimal? */
1138 count = 0;
1139 i = d0_idx;
1140 do {
1141 ptrs[count++] = page_address(sh->dev[i].page);
1142 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1143 printk("block %d/%d not uptodate on parity calc\n", i,count);
1144 i = raid6_next_disk(i, disks);
1145 } while ( i != d0_idx );
1146 // break;
1147 // }
1149 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
1151 switch(method) {
1152 case RECONSTRUCT_WRITE:
1153 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1154 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1155 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1156 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1157 break;
1158 case UPDATE_PARITY:
1159 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1160 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1161 break;
1166 /* Compute one missing block */
1167 static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1169 raid6_conf_t *conf = sh->raid_conf;
1170 int i, count, disks = conf->raid_disks;
1171 void *ptr[MAX_XOR_BLOCKS], *p;
1172 int pd_idx = sh->pd_idx;
1173 int qd_idx = raid6_next_disk(pd_idx, disks);
1175 PRINTK("compute_block_1, stripe %llu, idx %d\n",
1176 (unsigned long long)sh->sector, dd_idx);
1178 if ( dd_idx == qd_idx ) {
1179 /* We're actually computing the Q drive */
1180 compute_parity6(sh, UPDATE_PARITY);
1181 } else {
1182 ptr[0] = page_address(sh->dev[dd_idx].page);
1183 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
1184 count = 1;
1185 for (i = disks ; i--; ) {
1186 if (i == dd_idx || i == qd_idx)
1187 continue;
1188 p = page_address(sh->dev[i].page);
1189 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1190 ptr[count++] = p;
1191 else
1192 printk("compute_block() %d, stripe %llu, %d"
1193 " not present\n", dd_idx,
1194 (unsigned long long)sh->sector, i);
1196 check_xor();
1198 if (count != 1)
1199 xor_block(count, STRIPE_SIZE, ptr);
1200 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1201 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1205 /* Compute two missing blocks */
1206 static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1208 raid6_conf_t *conf = sh->raid_conf;
1209 int i, count, disks = conf->raid_disks;
1210 int pd_idx = sh->pd_idx;
1211 int qd_idx = raid6_next_disk(pd_idx, disks);
1212 int d0_idx = raid6_next_disk(qd_idx, disks);
1213 int faila, failb;
1215 /* faila and failb are disk numbers relative to d0_idx */
1216 /* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
1217 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
1218 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
1220 BUG_ON(faila == failb);
1221 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1223 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1224 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
1226 if ( failb == disks-1 ) {
1227 /* Q disk is one of the missing disks */
1228 if ( faila == disks-2 ) {
1229 /* Missing P+Q, just recompute */
1230 compute_parity6(sh, UPDATE_PARITY);
1231 return;
1232 } else {
1233 /* We're missing D+Q; recompute D from P */
1234 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
1235 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1236 return;
1240 /* We're missing D+P or D+D; build pointer table */
1242 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1243 void *ptrs[disks];
1245 count = 0;
1246 i = d0_idx;
1247 do {
1248 ptrs[count++] = page_address(sh->dev[i].page);
1249 i = raid6_next_disk(i, disks);
1250 if (i != dd_idx1 && i != dd_idx2 &&
1251 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1252 printk("compute_2 with missing block %d/%d\n", count, i);
1253 } while ( i != d0_idx );
1255 if ( failb == disks-2 ) {
1256 /* We're missing D+P. */
1257 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1258 } else {
1259 /* We're missing D+D. */
1260 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1263 /* Both the above update both missing blocks */
1264 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1265 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1271 /*
1272 * Each stripe/dev can have one or more bion attached.
1273 * toread/towrite point to the first in a chain.
1274 * The bi_next chain must be in order.
1275 */
1276 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
1278 struct bio **bip;
1279 raid5_conf_t *conf = sh->raid_conf;
1280 int firstwrite=0;
1282 PRINTK("adding bh b#%llu to stripe s#%llu\n",
1283 (unsigned long long)bi->bi_sector,
1284 (unsigned long long)sh->sector);
1287 spin_lock(&sh->lock);
1288 spin_lock_irq(&conf->device_lock);
1289 if (forwrite) {
1290 bip = &sh->dev[dd_idx].towrite;
1291 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
1292 firstwrite = 1;
1293 } else
1294 bip = &sh->dev[dd_idx].toread;
1295 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
1296 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
1297 goto overlap;
1298 bip = & (*bip)->bi_next;
1300 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
1301 goto overlap;
1303 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
1304 if (*bip)
1305 bi->bi_next = *bip;
1306 *bip = bi;
1307 bi->bi_phys_segments ++;
1308 spin_unlock_irq(&conf->device_lock);
1309 spin_unlock(&sh->lock);
1311 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
1312 (unsigned long long)bi->bi_sector,
1313 (unsigned long long)sh->sector, dd_idx);
1315 if (conf->mddev->bitmap && firstwrite) {
1316 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
1317 STRIPE_SECTORS, 0);
1318 sh->bm_seq = conf->seq_flush+1;
1319 set_bit(STRIPE_BIT_DELAY, &sh->state);
1322 if (forwrite) {
1323 /* check if page is covered */
1324 sector_t sector = sh->dev[dd_idx].sector;
1325 for (bi=sh->dev[dd_idx].towrite;
1326 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
1327 bi && bi->bi_sector <= sector;
1328 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
1329 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
1330 sector = bi->bi_sector + (bi->bi_size>>9);
1332 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
1333 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
1335 return 1;
1337 overlap:
1338 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1339 spin_unlock_irq(&conf->device_lock);
1340 spin_unlock(&sh->lock);
1341 return 0;
1344 static void end_reshape(raid5_conf_t *conf);
1346 static int page_is_zero(struct page *p)
1347 {
1348 char *a = page_address(p);
1349 return ((*(u32*)a) == 0 &&
1350 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1351 }
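/* Why that test is sufficient: if the first 32-bit word is zero and
 * memcmp(a, a+4, STRIPE_SIZE-4) == 0, then every byte equals the byte four
 * positions before it, so the leading zeroes propagate through the whole
 * page; the two conditions together hold iff the page is entirely zero.
 */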
1353 static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1354 {
1355 int sectors_per_chunk = conf->chunk_size >> 9;
1356 sector_t x = stripe;
1357 int pd_idx, dd_idx;
1358 int chunk_offset = sector_div(x, sectors_per_chunk);
1359 stripe = x;
1360 raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
1361 + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
1362 return pd_idx;
1363 }
1366 /*
1367 * handle_stripe - do things to a stripe.
1369 * We lock the stripe and then examine the state of various bits
1370 * to see what needs to be done.
1371 * Possible results:
1372 * return some read requests which now have data
1373 * return some write requests which are safely on disc
1374 * schedule a read on some buffers
1375 * schedule a write of some buffers
1376 * return confirmation of parity correctness
1378 * Parity calculations are done inside the stripe lock
1379 * buffers are taken off read_list or write_list, and bh_cache buffers
1380 * get BH_Lock set before the stripe lock is released.
1382 */
1384 static void handle_stripe5(struct stripe_head *sh)
1386 raid5_conf_t *conf = sh->raid_conf;
1387 int disks = sh->disks;
1388 struct bio *return_bi= NULL;
1389 struct bio *bi;
1390 int i;
1391 int syncing, expanding, expanded;
1392 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1393 int non_overwrite = 0;
1394 int failed_num=0;
1395 struct r5dev *dev;
1397 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
1398 (unsigned long long)sh->sector, atomic_read(&sh->count),
1399 sh->pd_idx);
1401 spin_lock(&sh->lock);
1402 clear_bit(STRIPE_HANDLE, &sh->state);
1403 clear_bit(STRIPE_DELAYED, &sh->state);
1405 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1406 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1407 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1408 /* Now to look around and see what can be done */
1410 rcu_read_lock();
1411 for (i=disks; i--; ) {
1412 mdk_rdev_t *rdev;
1413 dev = &sh->dev[i];
1414 clear_bit(R5_Insync, &dev->flags);
1416 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1417 i, dev->flags, dev->toread, dev->towrite, dev->written);
1418 /* maybe we can reply to a read */
1419 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1420 struct bio *rbi, *rbi2;
1421 PRINTK("Return read for disc %d\n", i);
1422 spin_lock_irq(&conf->device_lock);
1423 rbi = dev->toread;
1424 dev->toread = NULL;
1425 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1426 wake_up(&conf->wait_for_overlap);
1427 spin_unlock_irq(&conf->device_lock);
1428 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1429 copy_data(0, rbi, dev->page, dev->sector);
1430 rbi2 = r5_next_bio(rbi, dev->sector);
1431 spin_lock_irq(&conf->device_lock);
1432 if (--rbi->bi_phys_segments == 0) {
1433 rbi->bi_next = return_bi;
1434 return_bi = rbi;
1436 spin_unlock_irq(&conf->device_lock);
1437 rbi = rbi2;
1441 /* now count some things */
1442 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1443 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1446 if (dev->toread) to_read++;
1447 if (dev->towrite) {
1448 to_write++;
1449 if (!test_bit(R5_OVERWRITE, &dev->flags))
1450 non_overwrite++;
1452 if (dev->written) written++;
1453 rdev = rcu_dereference(conf->disks[i].rdev);
1454 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1455 /* The ReadError flag will just be confusing now */
1456 clear_bit(R5_ReadError, &dev->flags);
1457 clear_bit(R5_ReWrite, &dev->flags);
1459 if (!rdev || !test_bit(In_sync, &rdev->flags)
1460 || test_bit(R5_ReadError, &dev->flags)) {
1461 failed++;
1462 failed_num = i;
1463 } else
1464 set_bit(R5_Insync, &dev->flags);
1466 rcu_read_unlock();
1467 PRINTK("locked=%d uptodate=%d to_read=%d"
1468 " to_write=%d failed=%d failed_num=%d\n",
1469 locked, uptodate, to_read, to_write, failed, failed_num);
1470 /* check if the array has lost two devices and, if so, some requests might
1471 * need to be failed
1472 */
1473 if (failed > 1 && to_read+to_write+written) {
1474 for (i=disks; i--; ) {
1475 int bitmap_end = 0;
1477 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1478 mdk_rdev_t *rdev;
1479 rcu_read_lock();
1480 rdev = rcu_dereference(conf->disks[i].rdev);
1481 if (rdev && test_bit(In_sync, &rdev->flags))
1482 /* multiple read failures in one stripe */
1483 md_error(conf->mddev, rdev);
1484 rcu_read_unlock();
1487 spin_lock_irq(&conf->device_lock);
1488 /* fail all writes first */
1489 bi = sh->dev[i].towrite;
1490 sh->dev[i].towrite = NULL;
1491 if (bi) { to_write--; bitmap_end = 1; }
1493 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1494 wake_up(&conf->wait_for_overlap);
1496 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1497 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1498 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1499 if (--bi->bi_phys_segments == 0) {
1500 md_write_end(conf->mddev);
1501 bi->bi_next = return_bi;
1502 return_bi = bi;
1504 bi = nextbi;
1506 /* and fail all 'written' */
1507 bi = sh->dev[i].written;
1508 sh->dev[i].written = NULL;
1509 if (bi) bitmap_end = 1;
1510 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1511 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1512 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1513 if (--bi->bi_phys_segments == 0) {
1514 md_write_end(conf->mddev);
1515 bi->bi_next = return_bi;
1516 return_bi = bi;
1518 bi = bi2;
1521 /* fail any reads if this device is non-operational */
1522 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1523 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1524 bi = sh->dev[i].toread;
1525 sh->dev[i].toread = NULL;
1526 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1527 wake_up(&conf->wait_for_overlap);
1528 if (bi) to_read--;
1529 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1530 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1531 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1532 if (--bi->bi_phys_segments == 0) {
1533 bi->bi_next = return_bi;
1534 return_bi = bi;
1536 bi = nextbi;
1539 spin_unlock_irq(&conf->device_lock);
1540 if (bitmap_end)
1541 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1542 STRIPE_SECTORS, 0, 0);
1545 if (failed > 1 && syncing) {
1546 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1547 clear_bit(STRIPE_SYNCING, &sh->state);
1548 syncing = 0;
1551 /* might be able to return some write requests if the parity block
1552 * is safe, or on a failed drive
1553 */
1554 dev = &sh->dev[sh->pd_idx];
1555 if ( written &&
1556 ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
1557 test_bit(R5_UPTODATE, &dev->flags))
1558 || (failed == 1 && failed_num == sh->pd_idx))
1559 ) {
1560 /* any written block on an uptodate or failed drive can be returned.
1561 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1562 * never LOCKED, so we don't need to test 'failed' directly.
1563 */
1564 for (i=disks; i--; )
1565 if (sh->dev[i].written) {
1566 dev = &sh->dev[i];
1567 if (!test_bit(R5_LOCKED, &dev->flags) &&
1568 test_bit(R5_UPTODATE, &dev->flags) ) {
1569 /* We can return any write requests */
1570 struct bio *wbi, *wbi2;
1571 int bitmap_end = 0;
1572 PRINTK("Return write for disc %d\n", i);
1573 spin_lock_irq(&conf->device_lock);
1574 wbi = dev->written;
1575 dev->written = NULL;
1576 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1577 wbi2 = r5_next_bio(wbi, dev->sector);
1578 if (--wbi->bi_phys_segments == 0) {
1579 md_write_end(conf->mddev);
1580 wbi->bi_next = return_bi;
1581 return_bi = wbi;
1583 wbi = wbi2;
1585 if (dev->towrite == NULL)
1586 bitmap_end = 1;
1587 spin_unlock_irq(&conf->device_lock);
1588 if (bitmap_end)
1589 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1590 STRIPE_SECTORS,
1591 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1596 /* Now we might consider reading some blocks, either to check/generate
1597 * parity, or to satisfy requests
1598 * or to load a block that is being partially written.
1599 */
1600 if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
1601 for (i=disks; i--;) {
1602 dev = &sh->dev[i];
1603 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1604 (dev->toread ||
1605 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1606 syncing ||
1607 expanding ||
1608 (failed && (sh->dev[failed_num].toread ||
1609 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
1611 ) {
1612 /* we would like to get this block, possibly
1613 * by computing it, but we might not be able to
1614 */
1615 if (uptodate == disks-1) {
1616 PRINTK("Computing block %d\n", i);
1617 compute_block(sh, i);
1618 uptodate++;
1619 } else if (test_bit(R5_Insync, &dev->flags)) {
1620 set_bit(R5_LOCKED, &dev->flags);
1621 set_bit(R5_Wantread, &dev->flags);
1622 #if 0
1623 /* if I am just reading this block and we don't have
1624 a failed drive, or any pending writes then sidestep the cache */
1625 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1626 ! syncing && !failed && !to_write) {
1627 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1628 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1630 #endif
1631 locked++;
1632 PRINTK("Reading block %d (sync=%d)\n",
1633 i, syncing);
1637 set_bit(STRIPE_HANDLE, &sh->state);
1640 /* now to consider writing and what else, if anything should be read */
1641 if (to_write) {
1642 int rmw=0, rcw=0;
1643 for (i=disks ; i--;) {
1644 /* would I have to read this buffer for read_modify_write */
1645 dev = &sh->dev[i];
1646 if ((dev->towrite || i == sh->pd_idx) &&
1647 (!test_bit(R5_LOCKED, &dev->flags)
1648 #if 0
1649 || sh->bh_page[i]!=bh->b_page
1650 #endif
1651 ) &&
1652 !test_bit(R5_UPTODATE, &dev->flags)) {
1653 if (test_bit(R5_Insync, &dev->flags)
1654 /* && !(!mddev->insync && i == sh->pd_idx) */
1656 rmw++;
1657 else rmw += 2*disks; /* cannot read it */
1659 /* Would I have to read this buffer for reconstruct_write */
1660 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1661 (!test_bit(R5_LOCKED, &dev->flags)
1662 #if 0
1663 || sh->bh_page[i] != bh->b_page
1664 #endif
1665 ) &&
1666 !test_bit(R5_UPTODATE, &dev->flags)) {
1667 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1668 else rcw += 2*disks;
1671 PRINTK("for sector %llu, rmw=%d rcw=%d\n",
1672 (unsigned long long)sh->sector, rmw, rcw);
1673 set_bit(STRIPE_HANDLE, &sh->state);
1674 if (rmw < rcw && rmw > 0)
1675 /* prefer read-modify-write, but need to get some data */
1676 for (i=disks; i--;) {
1677 dev = &sh->dev[i];
1678 if ((dev->towrite || i == sh->pd_idx) &&
1679 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1680 test_bit(R5_Insync, &dev->flags)) {
1681 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1683 PRINTK("Read_old block %d for r-m-w\n", i);
1684 set_bit(R5_LOCKED, &dev->flags);
1685 set_bit(R5_Wantread, &dev->flags);
1686 locked++;
1687 } else {
1688 set_bit(STRIPE_DELAYED, &sh->state);
1689 set_bit(STRIPE_HANDLE, &sh->state);
1693 if (rcw <= rmw && rcw > 0)
1694 /* want reconstruct write, but need to get some data */
1695 for (i=disks; i--;) {
1696 dev = &sh->dev[i];
1697 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1698 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1699 test_bit(R5_Insync, &dev->flags)) {
1700 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1702 PRINTK("Read_old block %d for Reconstruct\n", i);
1703 set_bit(R5_LOCKED, &dev->flags);
1704 set_bit(R5_Wantread, &dev->flags);
1705 locked++;
1706 } else {
1707 set_bit(STRIPE_DELAYED, &sh->state);
1708 set_bit(STRIPE_HANDLE, &sh->state);
1712 /* now if nothing is locked, and if we have enough data, we can start a write request */
1713 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1714 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1715 PRINTK("Computing parity...\n");
1716 compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1717 /* now every locked buffer is ready to be written */
1718 for (i=disks; i--;)
1719 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1720 PRINTK("Writing block %d\n", i);
1721 locked++;
1722 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1723 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1724 || (i==sh->pd_idx && failed == 0))
1725 set_bit(STRIPE_INSYNC, &sh->state);
1727 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1728 atomic_dec(&conf->preread_active_stripes);
1729 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1730 md_wakeup_thread(conf->mddev->thread);
1735 /* maybe we need to check and possibly fix the parity for this stripe
1736 * Any reads will already have been scheduled, so we just see if enough data
1737 * is available
1738 */
1739 if (syncing && locked == 0 &&
1740 !test_bit(STRIPE_INSYNC, &sh->state)) {
1741 set_bit(STRIPE_HANDLE, &sh->state);
1742 if (failed == 0) {
1743 BUG_ON(uptodate != disks);
1744 compute_parity5(sh, CHECK_PARITY);
1745 uptodate--;
1746 if (page_is_zero(sh->dev[sh->pd_idx].page)) {
1747 /* parity is correct (on disc, not in buffer any more) */
1748 set_bit(STRIPE_INSYNC, &sh->state);
1749 } else {
1750 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1751 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1752 /* don't try to repair!! */
1753 set_bit(STRIPE_INSYNC, &sh->state);
1754 else {
1755 compute_block(sh, sh->pd_idx);
1756 uptodate++;
1760 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1761 /* either failed parity check, or recovery is happening */
1762 if (failed==0)
1763 failed_num = sh->pd_idx;
1764 dev = &sh->dev[failed_num];
1765 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1766 BUG_ON(uptodate != disks);
1768 set_bit(R5_LOCKED, &dev->flags);
1769 set_bit(R5_Wantwrite, &dev->flags);
1770 clear_bit(STRIPE_DEGRADED, &sh->state);
1771 locked++;
1772 set_bit(STRIPE_INSYNC, &sh->state);
1775 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1776 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1777 clear_bit(STRIPE_SYNCING, &sh->state);
1780 /* If the failed drive is just a ReadError, then we might need to progress
1781 * the repair/check process
1782 */
1783 if (failed == 1 && ! conf->mddev->ro &&
1784 test_bit(R5_ReadError, &sh->dev[failed_num].flags)
1785 && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
1786 && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
1787 ) {
1788 dev = &sh->dev[failed_num];
1789 if (!test_bit(R5_ReWrite, &dev->flags)) {
1790 set_bit(R5_Wantwrite, &dev->flags);
1791 set_bit(R5_ReWrite, &dev->flags);
1792 set_bit(R5_LOCKED, &dev->flags);
1793 locked++;
1794 } else {
1795 /* let's read it back */
1796 set_bit(R5_Wantread, &dev->flags);
1797 set_bit(R5_LOCKED, &dev->flags);
1798 locked++;
1802 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
1803 /* Need to write out all blocks after computing parity */
1804 sh->disks = conf->raid_disks;
1805 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1806 compute_parity5(sh, RECONSTRUCT_WRITE);
1807 for (i= conf->raid_disks; i--;) {
1808 set_bit(R5_LOCKED, &sh->dev[i].flags);
1809 locked++;
1810 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1812 clear_bit(STRIPE_EXPANDING, &sh->state);
1813 } else if (expanded) {
1814 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1815 atomic_dec(&conf->reshape_stripes);
1816 wake_up(&conf->wait_for_overlap);
1817 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1820 if (expanding && locked == 0) {
1821 /* We have read all the blocks in this stripe and now we need to
1822 * copy some of them into a target stripe for expand.
1823 */
1824 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1825 for (i=0; i< sh->disks; i++)
1826 if (i != sh->pd_idx) {
1827 int dd_idx, pd_idx, j;
1828 struct stripe_head *sh2;
1830 sector_t bn = compute_blocknr(sh, i);
1831 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1832 conf->raid_disks-1,
1833 &dd_idx, &pd_idx, conf);
1834 sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
1835 if (sh2 == NULL)
1836 /* so far only the early blocks of this stripe
1837 * have been requested. When later blocks
1838 * get requested, we will try again
1839 */
1840 continue;
1841 if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1842 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1843 /* must have already done this block */
1844 release_stripe(sh2);
1845 continue;
1847 memcpy(page_address(sh2->dev[dd_idx].page),
1848 page_address(sh->dev[i].page),
1849 STRIPE_SIZE);
1850 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1851 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1852 for (j=0; j<conf->raid_disks; j++)
1853 if (j != sh2->pd_idx &&
1854 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1855 break;
1856 if (j == conf->raid_disks) {
1857 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1858 set_bit(STRIPE_HANDLE, &sh2->state);
1860 release_stripe(sh2);
1864 spin_unlock(&sh->lock);
1866 while ((bi=return_bi)) {
1867 int bytes = bi->bi_size;
1869 return_bi = bi->bi_next;
1870 bi->bi_next = NULL;
1871 bi->bi_size = 0;
1872 bi->bi_end_io(bi, bytes, 0);
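/* Finally turn the R5_Wantread/R5_Wantwrite decisions made above into real
* I/O: each selected device re-uses its embedded single-page bio
* (sh->dev[i].req) pointing at the cache page.  If the device is missing or
* Faulty the op is skipped, writes mark the stripe degraded, and the stripe
* is queued to be handled again.
*/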
1874 for (i=disks; i-- ;) {
1875 int rw;
1876 struct bio *bi;
1877 mdk_rdev_t *rdev;
1878 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1879 rw = 1;
1880 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1881 rw = 0;
1882 else
1883 continue;
1885 bi = &sh->dev[i].req;
1887 bi->bi_rw = rw;
1888 if (rw)
1889 bi->bi_end_io = raid5_end_write_request;
1890 else
1891 bi->bi_end_io = raid5_end_read_request;
1893 rcu_read_lock();
1894 rdev = rcu_dereference(conf->disks[i].rdev);
1895 if (rdev && test_bit(Faulty, &rdev->flags))
1896 rdev = NULL;
1897 if (rdev)
1898 atomic_inc(&rdev->nr_pending);
1899 rcu_read_unlock();
1901 if (rdev) {
1902 if (syncing || expanding || expanded)
1903 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1905 bi->bi_bdev = rdev->bdev;
1906 PRINTK("for %llu schedule op %ld on disc %d\n",
1907 (unsigned long long)sh->sector, bi->bi_rw, i);
1908 atomic_inc(&sh->count);
1909 bi->bi_sector = sh->sector + rdev->data_offset;
1910 bi->bi_flags = 1 << BIO_UPTODATE;
1911 bi->bi_vcnt = 1;
1912 bi->bi_max_vecs = 1;
1913 bi->bi_idx = 0;
1914 bi->bi_io_vec = &sh->dev[i].vec;
1915 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1916 bi->bi_io_vec[0].bv_offset = 0;
1917 bi->bi_size = STRIPE_SIZE;
1918 bi->bi_next = NULL;
1919 if (rw == WRITE &&
1920 test_bit(R5_ReWrite, &sh->dev[i].flags))
1921 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1922 generic_make_request(bi);
1923 } else {
1924 if (rw == 1)
1925 set_bit(STRIPE_DEGRADED, &sh->state);
1926 PRINTK("skip op %ld on disc %d for sector %llu\n",
1927 bi->bi_rw, i, (unsigned long long)sh->sector);
1928 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1929 set_bit(STRIPE_HANDLE, &sh->state);
1934 static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1936 raid6_conf_t *conf = sh->raid_conf;
1937 int disks = conf->raid_disks;
1938 struct bio *return_bi= NULL;
1939 struct bio *bi;
1940 int i;
1941 int syncing;
1942 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1943 int non_overwrite = 0;
1944 int failed_num[2] = {0, 0};
1945 struct r5dev *dev, *pdev, *qdev;
1946 int pd_idx = sh->pd_idx;
1947 int qd_idx = raid6_next_disk(pd_idx, disks);
1948 int p_failed, q_failed;
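/* In this raid6 layout Q always sits on the device immediately after P
* (wrapping around), which is what raid6_next_disk() computes; pd_idx itself
* rotates from stripe to stripe according to the chosen algorithm.
*/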
1950 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1951 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1952 pd_idx, qd_idx);
1954 spin_lock(&sh->lock);
1955 clear_bit(STRIPE_HANDLE, &sh->state);
1956 clear_bit(STRIPE_DELAYED, &sh->state);
1958 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1959 /* Now to look around and see what can be done */
1961 rcu_read_lock();
1962 for (i=disks; i--; ) {
1963 mdk_rdev_t *rdev;
1964 dev = &sh->dev[i];
1965 clear_bit(R5_Insync, &dev->flags);
1967 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1968 i, dev->flags, dev->toread, dev->towrite, dev->written);
1969 /* maybe we can reply to a read */
1970 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1971 struct bio *rbi, *rbi2;
1972 PRINTK("Return read for disc %d\n", i);
1973 spin_lock_irq(&conf->device_lock);
1974 rbi = dev->toread;
1975 dev->toread = NULL;
1976 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1977 wake_up(&conf->wait_for_overlap);
1978 spin_unlock_irq(&conf->device_lock);
1979 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1980 copy_data(0, rbi, dev->page, dev->sector);
1981 rbi2 = r5_next_bio(rbi, dev->sector);
1982 spin_lock_irq(&conf->device_lock);
1983 if (--rbi->bi_phys_segments == 0) {
1984 rbi->bi_next = return_bi;
1985 return_bi = rbi;
1987 spin_unlock_irq(&conf->device_lock);
1988 rbi = rbi2;
1992 /* now count some things */
1993 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1994 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1997 if (dev->toread) to_read++;
1998 if (dev->towrite) {
1999 to_write++;
2000 if (!test_bit(R5_OVERWRITE, &dev->flags))
2001 non_overwrite++;
2003 if (dev->written) written++;
2004 rdev = rcu_dereference(conf->disks[i].rdev);
2005 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2006 /* The ReadError flag will just be confusing now */
2007 clear_bit(R5_ReadError, &dev->flags);
2008 clear_bit(R5_ReWrite, &dev->flags);
2010 if (!rdev || !test_bit(In_sync, &rdev->flags)
2011 || test_bit(R5_ReadError, &dev->flags)) {
2012 if ( failed < 2 )
2013 failed_num[failed] = i;
2014 failed++;
2015 } else
2016 set_bit(R5_Insync, &dev->flags);
2018 rcu_read_unlock();
2019 PRINTK("locked=%d uptodate=%d to_read=%d"
2020 " to_write=%d failed=%d failed_num=%d,%d\n",
2021 locked, uptodate, to_read, to_write, failed,
2022 failed_num[0], failed_num[1]);
2023 /* check if the array has lost >2 devices and, if so, some requests might
2024 * need to be failed
2025 */
2026 if (failed > 2 && to_read+to_write+written) {
2027 for (i=disks; i--; ) {
2028 int bitmap_end = 0;
2030 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2031 mdk_rdev_t *rdev;
2032 rcu_read_lock();
2033 rdev = rcu_dereference(conf->disks[i].rdev);
2034 if (rdev && test_bit(In_sync, &rdev->flags))
2035 /* multiple read failures in one stripe */
2036 md_error(conf->mddev, rdev);
2037 rcu_read_unlock();
2040 spin_lock_irq(&conf->device_lock);
2041 /* fail all writes first */
2042 bi = sh->dev[i].towrite;
2043 sh->dev[i].towrite = NULL;
2044 if (bi) { to_write--; bitmap_end = 1; }
2046 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2047 wake_up(&conf->wait_for_overlap);
2049 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2050 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2051 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2052 if (--bi->bi_phys_segments == 0) {
2053 md_write_end(conf->mddev);
2054 bi->bi_next = return_bi;
2055 return_bi = bi;
2057 bi = nextbi;
2059 /* and fail all 'written' */
2060 bi = sh->dev[i].written;
2061 sh->dev[i].written = NULL;
2062 if (bi) bitmap_end = 1;
2063 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
2064 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2065 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2066 if (--bi->bi_phys_segments == 0) {
2067 md_write_end(conf->mddev);
2068 bi->bi_next = return_bi;
2069 return_bi = bi;
2071 bi = bi2;
2074 /* fail any reads if this device is non-operational */
2075 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2076 test_bit(R5_ReadError, &sh->dev[i].flags)) {
2077 bi = sh->dev[i].toread;
2078 sh->dev[i].toread = NULL;
2079 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2080 wake_up(&conf->wait_for_overlap);
2081 if (bi) to_read--;
2082 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
2083 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2084 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2085 if (--bi->bi_phys_segments == 0) {
2086 bi->bi_next = return_bi;
2087 return_bi = bi;
2089 bi = nextbi;
2092 spin_unlock_irq(&conf->device_lock);
2093 if (bitmap_end)
2094 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2095 STRIPE_SECTORS, 0, 0);
2098 if (failed > 2 && syncing) {
2099 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2100 clear_bit(STRIPE_SYNCING, &sh->state);
2101 syncing = 0;
2104 /*
2105 * might be able to return some write requests if the parity blocks
2106 * are safe, or on a failed drive
2107 */
2108 pdev = &sh->dev[pd_idx];
2109 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
2110 || (failed >= 2 && failed_num[1] == pd_idx);
2111 qdev = &sh->dev[qd_idx];
2112 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
2113 || (failed >= 2 && failed_num[1] == qd_idx);
2115 if ( written &&
2116 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
2117 && !test_bit(R5_LOCKED, &pdev->flags)
2118 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
2119 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
2120 && !test_bit(R5_LOCKED, &qdev->flags)
2121 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
2122 /* any written block on an uptodate or failed drive can be
2123 * returned. Note that if we 'wrote' to a failed drive,
2124 * it will be UPTODATE, but never LOCKED, so we don't need
2125 * to test 'failed' directly.
2126 */
2127 for (i=disks; i--; )
2128 if (sh->dev[i].written) {
2129 dev = &sh->dev[i];
2130 if (!test_bit(R5_LOCKED, &dev->flags) &&
2131 test_bit(R5_UPTODATE, &dev->flags) ) {
2132 /* We can return any write requests */
2133 int bitmap_end = 0;
2134 struct bio *wbi, *wbi2;
2135 PRINTK("Return write for stripe %llu disc %d\n",
2136 (unsigned long long)sh->sector, i);
2137 spin_lock_irq(&conf->device_lock);
2138 wbi = dev->written;
2139 dev->written = NULL;
2140 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
2141 wbi2 = r5_next_bio(wbi, dev->sector);
2142 if (--wbi->bi_phys_segments == 0) {
2143 md_write_end(conf->mddev);
2144 wbi->bi_next = return_bi;
2145 return_bi = wbi;
2147 wbi = wbi2;
2149 if (dev->towrite == NULL)
2150 bitmap_end = 1;
2151 spin_unlock_irq(&conf->device_lock);
2152 if (bitmap_end)
2153 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2154 STRIPE_SECTORS,
2155 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
2160 /* Now we might consider reading some blocks, either to check/generate
2161 * parity, or to satisfy requests
2162 * or to load a block that is being partially written.
2163 */
2164 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
2165 for (i=disks; i--;) {
2166 dev = &sh->dev[i];
2167 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2168 (dev->toread ||
2169 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2170 syncing ||
2171 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2172 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2174 ) {
2175 /* we would like to get this block, possibly
2176 * by computing it, but we might not be able to
2177 */
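/* Three ways to get the block: if only this one block is missing
* (uptodate == disks-1) it can be recomputed from the others; if two blocks
* are missing and two devices have failed, the much more expensive
* double-failure reconstruction is used; otherwise, if the device is
* in-sync, simply schedule a read.
*/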
2178 if (uptodate == disks-1) {
2179 PRINTK("Computing stripe %llu block %d\n",
2180 (unsigned long long)sh->sector, i);
2181 compute_block_1(sh, i, 0);
2182 uptodate++;
2183 } else if ( uptodate == disks-2 && failed >= 2 ) {
2184 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
2185 int other;
2186 for (other=disks; other--;) {
2187 if ( other == i )
2188 continue;
2189 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
2190 break;
2192 BUG_ON(other < 0);
2193 PRINTK("Computing stripe %llu blocks %d,%d\n",
2194 (unsigned long long)sh->sector, i, other);
2195 compute_block_2(sh, i, other);
2196 uptodate += 2;
2197 } else if (test_bit(R5_Insync, &dev->flags)) {
2198 set_bit(R5_LOCKED, &dev->flags);
2199 set_bit(R5_Wantread, &dev->flags);
2200 #if 0
2201 /* if I am just reading this block and we don't have
2202 a failed drive, or any pending writes then sidestep the cache */
2203 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
2204 ! syncing && !failed && !to_write) {
2205 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
2206 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
2208 #endif
2209 locked++;
2210 PRINTK("Reading block %d (sync=%d)\n",
2211 i, syncing);
2215 set_bit(STRIPE_HANDLE, &sh->state);
2218 /* now to consider writing and what else, if anything should be read */
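/* Unlike the raid5 path, writes here only ever use reconstruct-write:
* count the data blocks that would have to be read (rcw) or computed
* (must_compute) to assemble the whole stripe, read what can be read,
* compute the rest, then rebuild P and Q and write out every locked block.
*/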
2219 if (to_write) {
2220 int rcw=0, must_compute=0;
2221 for (i=disks ; i--;) {
2222 dev = &sh->dev[i];
2223 /* Would I have to read this buffer for reconstruct_write */
2224 if (!test_bit(R5_OVERWRITE, &dev->flags)
2225 && i != pd_idx && i != qd_idx
2226 && (!test_bit(R5_LOCKED, &dev->flags)
2227 #if 0
2228 || sh->bh_page[i] != bh->b_page
2229 #endif
2230 ) &&
2231 !test_bit(R5_UPTODATE, &dev->flags)) {
2232 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2233 else {
2234 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
2235 must_compute++;
2239 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
2240 (unsigned long long)sh->sector, rcw, must_compute);
2241 set_bit(STRIPE_HANDLE, &sh->state);
2243 if (rcw > 0)
2244 /* want reconstruct write, but need to get some data */
2245 for (i=disks; i--;) {
2246 dev = &sh->dev[i];
2247 if (!test_bit(R5_OVERWRITE, &dev->flags)
2248 && !(failed == 0 && (i == pd_idx || i == qd_idx))
2249 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2250 test_bit(R5_Insync, &dev->flags)) {
2251 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2253 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
2254 (unsigned long long)sh->sector, i);
2255 set_bit(R5_LOCKED, &dev->flags);
2256 set_bit(R5_Wantread, &dev->flags);
2257 locked++;
2258 } else {
2259 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
2260 (unsigned long long)sh->sector, i);
2261 set_bit(STRIPE_DELAYED, &sh->state);
2262 set_bit(STRIPE_HANDLE, &sh->state);
2266 /* now if nothing is locked, and if we have enough data, we can start a write request */
2267 if (locked == 0 && rcw == 0 &&
2268 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2269 if ( must_compute > 0 ) {
2270 /* We have failed blocks and need to compute them */
2271 switch ( failed ) {
2272 case 0: BUG();
2273 case 1: compute_block_1(sh, failed_num[0], 0); break;
2274 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
2275 default: BUG(); /* This request should have been failed? */
2279 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
2280 compute_parity6(sh, RECONSTRUCT_WRITE);
2281 /* now every locked buffer is ready to be written */
2282 for (i=disks; i--;)
2283 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2284 PRINTK("Writing stripe %llu block %d\n",
2285 (unsigned long long)sh->sector, i);
2286 locked++;
2287 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2289 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2290 set_bit(STRIPE_INSYNC, &sh->state);
2292 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2293 atomic_dec(&conf->preread_active_stripes);
2294 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
2295 md_wakeup_thread(conf->mddev->thread);
2300 /* maybe we need to check and possibly fix the parity for this stripe
2301 * Any reads will already have been scheduled, so we just see if enough data
2302 * is available
2303 */
2304 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
2305 int update_p = 0, update_q = 0;
2306 struct r5dev *dev;
2308 set_bit(STRIPE_HANDLE, &sh->state);
2310 BUG_ON(failed>2);
2311 BUG_ON(uptodate < disks);
2312 /* Want to check and possibly repair P and Q.
2313 * However there could be one 'failed' device, in which
2314 * case we can only check one of them, possibly using the
2315 * other to generate missing data
2316 */
2318 /* If !tmp_page, we cannot do the calculations,
2319 * but as we have set STRIPE_HANDLE, we will soon be called
2320 * by handle_stripe with a tmp_page - just wait until then.
2321 */
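/* The P check below xors the surviving data blocks on top of the parity
* page read from disc (the trailing '1' asks compute_block_1() not to zero
* the buffer first), so page_is_zero() means P was correct; otherwise P is
* recomputed properly and flagged for rewrite.  Q is checked by stashing the
* on-disc Q in tmp_page, regenerating Q with compute_parity6(), and
* comparing the two with memcmp().
*/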
2322 if (tmp_page) {
2323 if (failed == q_failed) {
2324 /* The only possible failed device holds 'Q', so it makes
2325 * sense to check P (If anything else were failed, we would
2326 * have used P to recreate it).
2327 */
2328 compute_block_1(sh, pd_idx, 1);
2329 if (!page_is_zero(sh->dev[pd_idx].page)) {
2330 compute_block_1(sh,pd_idx,0);
2331 update_p = 1;
2334 if (!q_failed && failed < 2) {
2335 /* q is not failed, and we didn't use it to generate
2336 * anything, so it makes sense to check it
2337 */
2338 memcpy(page_address(tmp_page),
2339 page_address(sh->dev[qd_idx].page),
2340 STRIPE_SIZE);
2341 compute_parity6(sh, UPDATE_PARITY);
2342 if (memcmp(page_address(tmp_page),
2343 page_address(sh->dev[qd_idx].page),
2344 STRIPE_SIZE)!= 0) {
2345 clear_bit(STRIPE_INSYNC, &sh->state);
2346 update_q = 1;
2349 if (update_p || update_q) {
2350 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2351 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2352 /* don't try to repair!! */
2353 update_p = update_q = 0;
2356 /* now write out any block on a failed drive,
2357 * or P or Q if they need it
2358 */
2360 if (failed == 2) {
2361 dev = &sh->dev[failed_num[1]];
2362 locked++;
2363 set_bit(R5_LOCKED, &dev->flags);
2364 set_bit(R5_Wantwrite, &dev->flags);
2366 if (failed >= 1) {
2367 dev = &sh->dev[failed_num[0]];
2368 locked++;
2369 set_bit(R5_LOCKED, &dev->flags);
2370 set_bit(R5_Wantwrite, &dev->flags);
2373 if (update_p) {
2374 dev = &sh->dev[pd_idx];
2375 locked ++;
2376 set_bit(R5_LOCKED, &dev->flags);
2377 set_bit(R5_Wantwrite, &dev->flags);
2379 if (update_q) {
2380 dev = &sh->dev[qd_idx];
2381 locked++;
2382 set_bit(R5_LOCKED, &dev->flags);
2383 set_bit(R5_Wantwrite, &dev->flags);
2385 clear_bit(STRIPE_DEGRADED, &sh->state);
2387 set_bit(STRIPE_INSYNC, &sh->state);
2391 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
2392 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
2393 clear_bit(STRIPE_SYNCING, &sh->state);
2396 /* If the failed drives have just hit a ReadError, then we might need
2397 * to progress the repair/check process
2398 */
2399 if (failed <= 2 && ! conf->mddev->ro)
2400 for (i=0; i<failed;i++) {
2401 dev = &sh->dev[failed_num[i]];
2402 if (test_bit(R5_ReadError, &dev->flags)
2403 && !test_bit(R5_LOCKED, &dev->flags)
2404 && test_bit(R5_UPTODATE, &dev->flags)
2405 ) {
2406 if (!test_bit(R5_ReWrite, &dev->flags)) {
2407 set_bit(R5_Wantwrite, &dev->flags);
2408 set_bit(R5_ReWrite, &dev->flags);
2409 set_bit(R5_LOCKED, &dev->flags);
2410 } else {
2411 /* let's read it back */
2412 set_bit(R5_Wantread, &dev->flags);
2413 set_bit(R5_LOCKED, &dev->flags);
2417 spin_unlock(&sh->lock);
2419 while ((bi=return_bi)) {
2420 int bytes = bi->bi_size;
2422 return_bi = bi->bi_next;
2423 bi->bi_next = NULL;
2424 bi->bi_size = 0;
2425 bi->bi_end_io(bi, bytes, 0);
2427 for (i=disks; i-- ;) {
2428 int rw;
2429 struct bio *bi;
2430 mdk_rdev_t *rdev;
2431 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
2432 rw = 1;
2433 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
2434 rw = 0;
2435 else
2436 continue;
2438 bi = &sh->dev[i].req;
2440 bi->bi_rw = rw;
2441 if (rw)
2442 bi->bi_end_io = raid5_end_write_request;
2443 else
2444 bi->bi_end_io = raid5_end_read_request;
2446 rcu_read_lock();
2447 rdev = rcu_dereference(conf->disks[i].rdev);
2448 if (rdev && test_bit(Faulty, &rdev->flags))
2449 rdev = NULL;
2450 if (rdev)
2451 atomic_inc(&rdev->nr_pending);
2452 rcu_read_unlock();
2454 if (rdev) {
2455 if (syncing)
2456 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2458 bi->bi_bdev = rdev->bdev;
2459 PRINTK("for %llu schedule op %ld on disc %d\n",
2460 (unsigned long long)sh->sector, bi->bi_rw, i);
2461 atomic_inc(&sh->count);
2462 bi->bi_sector = sh->sector + rdev->data_offset;
2463 bi->bi_flags = 1 << BIO_UPTODATE;
2464 bi->bi_vcnt = 1;
2465 bi->bi_max_vecs = 1;
2466 bi->bi_idx = 0;
2467 bi->bi_io_vec = &sh->dev[i].vec;
2468 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
2469 bi->bi_io_vec[0].bv_offset = 0;
2470 bi->bi_size = STRIPE_SIZE;
2471 bi->bi_next = NULL;
2472 if (rw == WRITE &&
2473 test_bit(R5_ReWrite, &sh->dev[i].flags))
2474 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2475 generic_make_request(bi);
2476 } else {
2477 if (rw == 1)
2478 set_bit(STRIPE_DEGRADED, &sh->state);
2479 PRINTK("skip op %ld on disc %d for sector %llu\n",
2480 bi->bi_rw, i, (unsigned long long)sh->sector);
2481 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2482 set_bit(STRIPE_HANDLE, &sh->state);
2487 static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
2489 if (sh->raid_conf->level == 6)
2490 handle_stripe6(sh, tmp_page);
2491 else
2492 handle_stripe5(sh);
2497 static void raid5_activate_delayed(raid5_conf_t *conf)
2499 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
2500 while (!list_empty(&conf->delayed_list)) {
2501 struct list_head *l = conf->delayed_list.next;
2502 struct stripe_head *sh;
2503 sh = list_entry(l, struct stripe_head, lru);
2504 list_del_init(l);
2505 clear_bit(STRIPE_DELAYED, &sh->state);
2506 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
2507 atomic_inc(&conf->preread_active_stripes);
2508 list_add_tail(&sh->lru, &conf->handle_list);
2513 static void activate_bit_delay(raid5_conf_t *conf)
2515 /* device_lock is held */
2516 struct list_head head;
2517 list_add(&head, &conf->bitmap_list);
2518 list_del_init(&conf->bitmap_list);
2519 while (!list_empty(&head)) {
2520 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
2521 list_del_init(&sh->lru);
2522 atomic_inc(&sh->count);
2523 __release_stripe(conf, sh);
2527 static void unplug_slaves(mddev_t *mddev)
2529 raid5_conf_t *conf = mddev_to_conf(mddev);
2530 int i;
2532 rcu_read_lock();
2533 for (i=0; i<mddev->raid_disks; i++) {
2534 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
2535 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
2536 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
2538 atomic_inc(&rdev->nr_pending);
2539 rcu_read_unlock();
2541 if (r_queue->unplug_fn)
2542 r_queue->unplug_fn(r_queue);
2544 rdev_dec_pending(rdev, mddev);
2545 rcu_read_lock();
2548 rcu_read_unlock();
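/* Unplugging the array is what closes a batch of bitmap updates:
* blk_remove_plug() succeeding below bumps seq_flush, raid5d then notices
* seq_flush != seq_write, writes the bitmap out and only after that releases
* the stripes parked on bitmap_list (activate_bit_delay).
*/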
2551 static void raid5_unplug_device(request_queue_t *q)
2553 mddev_t *mddev = q->queuedata;
2554 raid5_conf_t *conf = mddev_to_conf(mddev);
2555 unsigned long flags;
2557 spin_lock_irqsave(&conf->device_lock, flags);
2559 if (blk_remove_plug(q)) {
2560 conf->seq_flush++;
2561 raid5_activate_delayed(conf);
2563 md_wakeup_thread(mddev->thread);
2565 spin_unlock_irqrestore(&conf->device_lock, flags);
2567 unplug_slaves(mddev);
2570 static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
2571 sector_t *error_sector)
2573 mddev_t *mddev = q->queuedata;
2574 raid5_conf_t *conf = mddev_to_conf(mddev);
2575 int i, ret = 0;
2577 rcu_read_lock();
2578 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
2579 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
2580 if (rdev && !test_bit(Faulty, &rdev->flags)) {
2581 struct block_device *bdev = rdev->bdev;
2582 request_queue_t *r_queue = bdev_get_queue(bdev);
2584 if (!r_queue->issue_flush_fn)
2585 ret = -EOPNOTSUPP;
2586 else {
2587 atomic_inc(&rdev->nr_pending);
2588 rcu_read_unlock();
2589 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
2590 error_sector);
2591 rdev_dec_pending(rdev, mddev);
2592 rcu_read_lock();
2596 rcu_read_unlock();
2597 return ret;
2600 static int make_request(request_queue_t *q, struct bio * bi)
2602 mddev_t *mddev = q->queuedata;
2603 raid5_conf_t *conf = mddev_to_conf(mddev);
2604 unsigned int dd_idx, pd_idx;
2605 sector_t new_sector;
2606 sector_t logical_sector, last_sector;
2607 struct stripe_head *sh;
2608 const int rw = bio_data_dir(bi);
2609 int remaining;
2611 if (unlikely(bio_barrier(bi))) {
2612 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
2613 return 0;
2616 md_write_start(mddev, bi);
2618 disk_stat_inc(mddev->gendisk, ios[rw]);
2619 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
2621 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
2622 last_sector = bi->bi_sector + (bi->bi_size>>9);
2623 bi->bi_next = NULL;
2624 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
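/* bi_phys_segments acts as a reference count of the stripes still holding
* this bio: it starts at 1 for this function, add_stripe_bio() raises it for
* every stripe the bio gets attached to, the completion paths drop it, and
* the bio is only ended once it falls back to zero.
*/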
2626 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
2627 DEFINE_WAIT(w);
2628 int disks, data_disks;
2630 retry:
2631 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
2632 if (likely(conf->expand_progress == MaxSector))
2633 disks = conf->raid_disks;
2634 else {
2635 /* spinlock is needed as expand_progress may be
2636 * 64bit on a 32bit platform, and so it might be
2637 * possible to see a half-updated value
2638 * Of course expand_progress could change after
2639 * the lock is dropped, so once we get a reference
2640 * to the stripe that we think it is, we will have
2641 * to check again.
2642 */
2643 spin_lock_irq(&conf->device_lock);
2644 disks = conf->raid_disks;
2645 if (logical_sector >= conf->expand_progress)
2646 disks = conf->previous_raid_disks;
2647 else {
2648 if (logical_sector >= conf->expand_lo) {
2649 spin_unlock_irq(&conf->device_lock);
2650 schedule();
2651 goto retry;
2654 spin_unlock_irq(&conf->device_lock);
2656 data_disks = disks - conf->max_degraded;
2658 new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
2659 &dd_idx, &pd_idx, conf);
2660 PRINTK("raid5: make_request, sector %llu logical %llu\n",
2661 (unsigned long long)new_sector,
2662 (unsigned long long)logical_sector);
2664 sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
2665 if (sh) {
2666 if (unlikely(conf->expand_progress != MaxSector)) {
2667 /* expansion might have moved on while waiting for a
2668 * stripe, so we must do the range check again.
2669 * Expansion could still move past after this
2670 * test, but as we are holding a reference to
2671 * 'sh', we know that if that happens,
2672 * STRIPE_EXPANDING will get set and the expansion
2673 * won't proceed until we finish with the stripe.
2674 */
2675 int must_retry = 0;
2676 spin_lock_irq(&conf->device_lock);
2677 if (logical_sector < conf->expand_progress &&
2678 disks == conf->previous_raid_disks)
2679 /* mismatch, need to try again */
2680 must_retry = 1;
2681 spin_unlock_irq(&conf->device_lock);
2682 if (must_retry) {
2683 release_stripe(sh);
2684 goto retry;
2687 /* FIXME what if we get a false positive because these
2688 * are being updated.
2689 */
2690 if (logical_sector >= mddev->suspend_lo &&
2691 logical_sector < mddev->suspend_hi) {
2692 release_stripe(sh);
2693 schedule();
2694 goto retry;
2697 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
2698 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
2699 /* Stripe is busy expanding or
2700 * add failed due to overlap. Flush everything
2701 * and wait a while
2702 */
2703 raid5_unplug_device(mddev->queue);
2704 release_stripe(sh);
2705 schedule();
2706 goto retry;
2708 finish_wait(&conf->wait_for_overlap, &w);
2709 handle_stripe(sh, NULL);
2710 release_stripe(sh);
2711 } else {
2712 /* cannot get stripe for read-ahead, just give up */
2713 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2714 finish_wait(&conf->wait_for_overlap, &w);
2715 break;
2719 spin_lock_irq(&conf->device_lock);
2720 remaining = --bi->bi_phys_segments;
2721 spin_unlock_irq(&conf->device_lock);
2722 if (remaining == 0) {
2723 int bytes = bi->bi_size;
2725 if ( rw == WRITE )
2726 md_write_end(mddev);
2727 bi->bi_size = 0;
2728 bi->bi_end_io(bi, bytes, 0);
2730 return 0;
2733 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
2735 /* reshaping is quite different to recovery/resync so it is
2736 * handled quite separately ... here.
2738 * On each call to sync_request, we gather one chunk worth of
2739 * destination stripes and flag them as expanding.
2740 * Then we find all the source stripes and request reads.
2741 * As the reads complete, handle_stripe will copy the data
2742 * into the destination stripe and release that stripe.
2743 */
2744 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
2745 struct stripe_head *sh;
2746 int pd_idx;
2747 sector_t first_sector, last_sector;
2748 int raid_disks;
2749 int data_disks;
2750 int i;
2751 int dd_idx;
2752 sector_t writepos, safepos, gap;
2754 if (sector_nr == 0 &&
2755 conf->expand_progress != 0) {
2756 /* restarting in the middle, skip the initial sectors */
2757 sector_nr = conf->expand_progress;
2758 sector_div(sector_nr, conf->raid_disks-1);
2759 *skipped = 1;
2760 return sector_nr;
2763 /* we update the metadata when there is more than 3Meg
2764 * in the block range (that is rather arbitrary, should
2765 * probably be time based) or when the data about to be
2766 * copied would over-write the source of the data at
2767 * the front of the range.
2768 * i.e. the point one new stripe ahead of expand_progress, mapped via the new
2769 * geometry, lands at or beyond where expand_lo lands under the old geometry.
2770 */
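/* writepos is the per-device offset (new geometry) that this call's chunk
* of destination stripes will reach; safepos is the per-device offset (old
* geometry) below which data may still be needed as a copy source.  As a
* worked example with a hypothetical 4-disk array and 64KiB chunks: each
* call advances expand_progress by (65536/512)*3 = 384 array sectors, i.e.
* one 128-sector chunk per device, and the superblock is synced whenever
* the write front reaches safepos or the gap exceeds roughly 3MB per data
* disk.
*/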
2771 writepos = conf->expand_progress +
2772 conf->chunk_size/512*(conf->raid_disks-1);
2773 sector_div(writepos, conf->raid_disks-1);
2774 safepos = conf->expand_lo;
2775 sector_div(safepos, conf->previous_raid_disks-1);
2776 gap = conf->expand_progress - conf->expand_lo;
2778 if (writepos >= safepos ||
2779 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
2780 /* Cannot proceed until we've updated the superblock... */
2781 wait_event(conf->wait_for_overlap,
2782 atomic_read(&conf->reshape_stripes)==0);
2783 mddev->reshape_position = conf->expand_progress;
2784 mddev->sb_dirty = 1;
2785 md_wakeup_thread(mddev->thread);
2786 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
2787 kthread_should_stop());
2788 spin_lock_irq(&conf->device_lock);
2789 conf->expand_lo = mddev->reshape_position;
2790 spin_unlock_irq(&conf->device_lock);
2791 wake_up(&conf->wait_for_overlap);
2794 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
2795 int j;
2796 int skipped = 0;
2797 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
2798 sh = get_active_stripe(conf, sector_nr+i,
2799 conf->raid_disks, pd_idx, 0);
2800 set_bit(STRIPE_EXPANDING, &sh->state);
2801 atomic_inc(&conf->reshape_stripes);
2802 /* If any of this stripe is beyond the end of the old
2803 * array, then we need to zero those blocks
2804 */
2805 for (j=sh->disks; j--;) {
2806 sector_t s;
2807 if (j == sh->pd_idx)
2808 continue;
2809 s = compute_blocknr(sh, j);
2810 if (s < (mddev->array_size<<1)) {
2811 skipped = 1;
2812 continue;
2814 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
2815 set_bit(R5_Expanded, &sh->dev[j].flags);
2816 set_bit(R5_UPTODATE, &sh->dev[j].flags);
2818 if (!skipped) {
2819 set_bit(STRIPE_EXPAND_READY, &sh->state);
2820 set_bit(STRIPE_HANDLE, &sh->state);
2822 release_stripe(sh);
2824 spin_lock_irq(&conf->device_lock);
2825 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
2826 spin_unlock_irq(&conf->device_lock);
2827 /* Ok, those stripes are ready. We can start scheduling
2828 * reads on the source stripes.
2829 * The source stripes are determined by mapping the first and last
2830 * block on the destination stripes.
2831 */
2832 raid_disks = conf->previous_raid_disks;
2833 data_disks = raid_disks - 1;
2834 first_sector =
2835 raid5_compute_sector(sector_nr*(conf->raid_disks-1),
2836 raid_disks, data_disks,
2837 &dd_idx, &pd_idx, conf);
2838 last_sector =
2839 raid5_compute_sector((sector_nr+conf->chunk_size/512)
2840 *(conf->raid_disks-1) -1,
2841 raid_disks, data_disks,
2842 &dd_idx, &pd_idx, conf);
2843 if (last_sector >= (mddev->size<<1))
2844 last_sector = (mddev->size<<1)-1;
2845 while (first_sector <= last_sector) {
2846 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
2847 sh = get_active_stripe(conf, first_sector,
2848 conf->previous_raid_disks, pd_idx, 0);
2849 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2850 set_bit(STRIPE_HANDLE, &sh->state);
2851 release_stripe(sh);
2852 first_sector += STRIPE_SECTORS;
2854 return conf->chunk_size>>9;
2857 /* FIXME go_faster isn't used */
2858 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2860 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
2861 struct stripe_head *sh;
2862 int pd_idx;
2863 int raid_disks = conf->raid_disks;
2864 sector_t max_sector = mddev->size << 1;
2865 int sync_blocks;
2866 int still_degraded = 0;
2867 int i;
2869 if (sector_nr >= max_sector) {
2870 /* just being told to finish up .. nothing much to do */
2871 unplug_slaves(mddev);
2872 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2873 end_reshape(conf);
2874 return 0;
2877 if (mddev->curr_resync < max_sector) /* aborted */
2878 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2879 &sync_blocks, 1);
2880 else /* completed sync */
2881 conf->fullsync = 0;
2882 bitmap_close_sync(mddev->bitmap);
2884 return 0;
2887 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2888 return reshape_request(mddev, sector_nr, skipped);
2890 /* if there are too many failed drives and we are trying
2891 * to resync, then assert that we are finished, because there is
2892 * nothing we can do.
2893 */
2894 if (mddev->degraded >= conf->max_degraded &&
2895 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2896 sector_t rv = (mddev->size << 1) - sector_nr;
2897 *skipped = 1;
2898 return rv;
2900 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
2901 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2902 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
2903 /* we can skip this block, and probably more */
2904 sync_blocks /= STRIPE_SECTORS;
2905 *skipped = 1;
2906 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
2909 pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
2910 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
2911 if (sh == NULL) {
2912 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
2913 /* make sure we don't swamp the stripe cache if someone else
2914 * is trying to get access
2915 */
2916 schedule_timeout_uninterruptible(1);
2918 /* Need to check if the array will still be degraded after recovery/resync
2919 * We don't need to check the 'failed' flag as when that gets set,
2920 * recovery aborts.
2921 */
2922 for (i=0; i<mddev->raid_disks; i++)
2923 if (conf->disks[i].rdev == NULL)
2924 still_degraded = 1;
2926 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
2928 spin_lock(&sh->lock);
2929 set_bit(STRIPE_SYNCING, &sh->state);
2930 clear_bit(STRIPE_INSYNC, &sh->state);
2931 spin_unlock(&sh->lock);
2933 handle_stripe(sh, NULL);
2934 release_stripe(sh);
2936 return STRIPE_SECTORS;
2939 /*
2940 * This is our raid5 kernel thread.
2942 * We scan the hash table for stripes which can be handled now.
2943 * During the scan, completed stripes are saved for us by the interrupt
2944 * handler, so that they will not have to wait for our next wakeup.
2945 */
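/* Under device_lock the main loop does three things: flush the bitmap
* whenever a batch has been closed (seq_flush != seq_write) and release the
* stripes that were waiting on it, re-activate delayed stripes once the
* queue is unplugged and preread activity is low, and hand each stripe on
* handle_list to handle_stripe() together with the spare page that raid6
* parity checks need.
*/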
2946 static void raid5d (mddev_t *mddev)
2948 struct stripe_head *sh;
2949 raid5_conf_t *conf = mddev_to_conf(mddev);
2950 int handled;
2952 PRINTK("+++ raid5d active\n");
2954 md_check_recovery(mddev);
2956 handled = 0;
2957 spin_lock_irq(&conf->device_lock);
2958 while (1) {
2959 struct list_head *first;
2961 if (conf->seq_flush != conf->seq_write) {
2962 int seq = conf->seq_flush;
2963 spin_unlock_irq(&conf->device_lock);
2964 bitmap_unplug(mddev->bitmap);
2965 spin_lock_irq(&conf->device_lock);
2966 conf->seq_write = seq;
2967 activate_bit_delay(conf);
2970 if (list_empty(&conf->handle_list) &&
2971 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
2972 !blk_queue_plugged(mddev->queue) &&
2973 !list_empty(&conf->delayed_list))
2974 raid5_activate_delayed(conf);
2976 if (list_empty(&conf->handle_list))
2977 break;
2979 first = conf->handle_list.next;
2980 sh = list_entry(first, struct stripe_head, lru);
2982 list_del_init(first);
2983 atomic_inc(&sh->count);
2984 BUG_ON(atomic_read(&sh->count)!= 1);
2985 spin_unlock_irq(&conf->device_lock);
2987 handled++;
2988 handle_stripe(sh, conf->spare_page);
2989 release_stripe(sh);
2991 spin_lock_irq(&conf->device_lock);
2993 PRINTK("%d stripes handled\n", handled);
2995 spin_unlock_irq(&conf->device_lock);
2997 unplug_slaves(mddev);
2999 PRINTK("--- raid5d inactive\n");
3002 static ssize_t
3003 raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
3005 raid5_conf_t *conf = mddev_to_conf(mddev);
3006 if (conf)
3007 return sprintf(page, "%d\n", conf->max_nr_stripes);
3008 else
3009 return 0;
3012 static ssize_t
3013 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
3015 raid5_conf_t *conf = mddev_to_conf(mddev);
3016 char *end;
3017 int new;
3018 if (len >= PAGE_SIZE)
3019 return -EINVAL;
3020 if (!conf)
3021 return -ENODEV;
3023 new = simple_strtoul(page, &end, 10);
3024 if (!*page || (*end && *end != '\n') )
3025 return -EINVAL;
3026 if (new <= 16 || new > 32768)
3027 return -EINVAL;
3028 while (new < conf->max_nr_stripes) {
3029 if (drop_one_stripe(conf))
3030 conf->max_nr_stripes--;
3031 else
3032 break;
3034 while (new > conf->max_nr_stripes) {
3035 if (grow_one_stripe(conf))
3036 conf->max_nr_stripes++;
3037 else break;
3039 return len;
3042 static struct md_sysfs_entry
3043 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
3044 raid5_show_stripe_cache_size,
3045 raid5_store_stripe_cache_size);
3047 static ssize_t
3048 stripe_cache_active_show(mddev_t *mddev, char *page)
3050 raid5_conf_t *conf = mddev_to_conf(mddev);
3051 if (conf)
3052 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
3053 else
3054 return 0;
3057 static struct md_sysfs_entry
3058 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
3060 static struct attribute *raid5_attrs[] = {
3061 &raid5_stripecache_size.attr,
3062 &raid5_stripecache_active.attr,
3063 NULL,
3064 };
3065 static struct attribute_group raid5_attrs_group = {
3066 .name = NULL,
3067 .attrs = raid5_attrs,
3068 };
3070 static int run(mddev_t *mddev)
3072 raid5_conf_t *conf;
3073 int raid_disk, memory;
3074 mdk_rdev_t *rdev;
3075 struct disk_info *disk;
3076 struct list_head *tmp;
3078 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
3079 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
3080 mdname(mddev), mddev->level);
3081 return -EIO;
3084 if (mddev->reshape_position != MaxSector) {
3085 /* Check that we can continue the reshape.
3086 * Currently only disks can change, it must
3087 * increase, and we must be past the point where
3088 * a stripe over-writes itself
3089 */
3090 sector_t here_new, here_old;
3091 int old_disks;
3093 if (mddev->new_level != mddev->level ||
3094 mddev->new_layout != mddev->layout ||
3095 mddev->new_chunk != mddev->chunk_size) {
3096 printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
3097 mdname(mddev));
3098 return -EINVAL;
3100 if (mddev->delta_disks <= 0) {
3101 printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
3102 mdname(mddev));
3103 return -EINVAL;
3105 old_disks = mddev->raid_disks - mddev->delta_disks;
3106 /* reshape_position must be on a new-stripe boundary, and one
3107 * further up in new geometry must map after here in old geometry.
3108 */
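/* Both positions are reduced to stripe numbers: here_new divides
* reshape_position by the data laid down per stripe in the new geometry,
* here_old does the same for the old geometry.  The reshape can only be
* resumed automatically if the next stripe to be written (new layout) still
* lies strictly before the first stripe that may need to be read (old
* layout).
*/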
3109 here_new = mddev->reshape_position;
3110 if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
3111 printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
3112 return -EINVAL;
3114 /* here_new is the stripe we will write to */
3115 here_old = mddev->reshape_position;
3116 sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
3117 /* here_old is the first stripe that we might need to read from */
3118 if (here_new >= here_old) {
3119 /* Reading from the same stripe as writing to - bad */
3120 printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
3121 return -EINVAL;
3123 printk(KERN_INFO "raid5: reshape will continue\n");
3124 /* OK, we should be able to continue; */
3128 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
3129 if ((conf = mddev->private) == NULL)
3130 goto abort;
3131 if (mddev->reshape_position == MaxSector) {
3132 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
3133 } else {
3134 conf->raid_disks = mddev->raid_disks;
3135 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
3138 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
3139 GFP_KERNEL);
3140 if (!conf->disks)
3141 goto abort;
3143 conf->mddev = mddev;
3145 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
3146 goto abort;
3148 if (mddev->level == 6) {
3149 conf->spare_page = alloc_page(GFP_KERNEL);
3150 if (!conf->spare_page)
3151 goto abort;
3153 spin_lock_init(&conf->device_lock);
3154 init_waitqueue_head(&conf->wait_for_stripe);
3155 init_waitqueue_head(&conf->wait_for_overlap);
3156 INIT_LIST_HEAD(&conf->handle_list);
3157 INIT_LIST_HEAD(&conf->delayed_list);
3158 INIT_LIST_HEAD(&conf->bitmap_list);
3159 INIT_LIST_HEAD(&conf->inactive_list);
3160 atomic_set(&conf->active_stripes, 0);
3161 atomic_set(&conf->preread_active_stripes, 0);
3163 PRINTK("raid5: run(%s) called.\n", mdname(mddev));
3165 ITERATE_RDEV(mddev,rdev,tmp) {
3166 raid_disk = rdev->raid_disk;
3167 if (raid_disk >= conf->raid_disks
3168 || raid_disk < 0)
3169 continue;
3170 disk = conf->disks + raid_disk;
3172 disk->rdev = rdev;
3174 if (test_bit(In_sync, &rdev->flags)) {
3175 char b[BDEVNAME_SIZE];
3176 printk(KERN_INFO "raid5: device %s operational as raid"
3177 " disk %d\n", bdevname(rdev->bdev,b),
3178 raid_disk);
3179 conf->working_disks++;
3183 /*
3184 * 0 for a fully functional array, 1 or 2 for a degraded array.
3185 */
3186 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
3187 conf->mddev = mddev;
3188 conf->chunk_size = mddev->chunk_size;
3189 conf->level = mddev->level;
3190 if (conf->level == 6)
3191 conf->max_degraded = 2;
3192 else
3193 conf->max_degraded = 1;
3194 conf->algorithm = mddev->layout;
3195 conf->max_nr_stripes = NR_STRIPES;
3196 conf->expand_progress = mddev->reshape_position;
3198 /* device size must be a multiple of chunk size */
3199 mddev->size &= ~(mddev->chunk_size/1024 -1);
3200 mddev->resync_max_sectors = mddev->size << 1;
3202 if (conf->level == 6 && conf->raid_disks < 4) {
3203 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
3204 mdname(mddev), conf->raid_disks);
3205 goto abort;
3207 if (!conf->chunk_size || conf->chunk_size % 4) {
3208 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
3209 conf->chunk_size, mdname(mddev));
3210 goto abort;
3212 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
3213 printk(KERN_ERR
3214 "raid5: unsupported parity algorithm %d for %s\n",
3215 conf->algorithm, mdname(mddev));
3216 goto abort;
3218 if (mddev->degraded > conf->max_degraded) {
3219 printk(KERN_ERR "raid5: not enough operational devices for %s"
3220 " (%d/%d failed)\n",
3221 mdname(mddev), conf->failed_disks, conf->raid_disks);
3222 goto abort;
3225 if (mddev->degraded > 0 &&
3226 mddev->recovery_cp != MaxSector) {
3227 if (mddev->ok_start_degraded)
3228 printk(KERN_WARNING
3229 "raid5: starting dirty degraded array: %s"
3230 "- data corruption possible.\n",
3231 mdname(mddev));
3232 else {
3233 printk(KERN_ERR
3234 "raid5: cannot start dirty degraded array for %s\n",
3235 mdname(mddev));
3236 goto abort;
3241 mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
3242 if (!mddev->thread) {
3243 printk(KERN_ERR
3244 "raid5: couldn't allocate thread for %s\n",
3245 mdname(mddev));
3246 goto abort;
3249 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
3250 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
3251 if (grow_stripes(conf, conf->max_nr_stripes)) {
3252 printk(KERN_ERR
3253 "raid5: couldn't allocate %dkB for buffers\n", memory);
3254 shrink_stripes(conf);
3255 md_unregister_thread(mddev->thread);
3256 goto abort;
3257 } else
3258 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
3259 memory, mdname(mddev));
3261 if (mddev->degraded == 0)
3262 printk("raid5: raid level %d set %s active with %d out of %d"
3263 " devices, algorithm %d\n", conf->level, mdname(mddev),
3264 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
3265 conf->algorithm);
3266 else
3267 printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
3268 " out of %d devices, algorithm %d\n", conf->level,
3269 mdname(mddev), mddev->raid_disks - mddev->degraded,
3270 mddev->raid_disks, conf->algorithm);
3272 print_raid5_conf(conf);
3274 if (conf->expand_progress != MaxSector) {
3275 printk("...ok start reshape thread\n");
3276 conf->expand_lo = conf->expand_progress;
3277 atomic_set(&conf->reshape_stripes, 0);
3278 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3279 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3280 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3281 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3282 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3283 "%s_reshape");
3286 /* read-ahead size must cover two whole stripes, which is
3287 * 2 * (datadisks) * chunksize, where datadisks is the number of data devices
3288 */
3290 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3291 int stripe = data_disks *
3292 (mddev->chunk_size / PAGE_SIZE);
3293 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3294 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3297 /* Ok, everything is just fine now */
3298 sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
3300 mddev->queue->unplug_fn = raid5_unplug_device;
3301 mddev->queue->issue_flush_fn = raid5_issue_flush;
3302 mddev->array_size = mddev->size * (conf->previous_raid_disks -
3303 conf->max_degraded);
3305 return 0;
3306 abort:
3307 if (conf) {
3308 print_raid5_conf(conf);
3309 safe_put_page(conf->spare_page);
3310 kfree(conf->disks);
3311 kfree(conf->stripe_hashtbl);
3312 kfree(conf);
3314 mddev->private = NULL;
3315 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
3316 return -EIO;
3321 static int stop(mddev_t *mddev)
3323 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3325 md_unregister_thread(mddev->thread);
3326 mddev->thread = NULL;
3327 shrink_stripes(conf);
3328 kfree(conf->stripe_hashtbl);
3329 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
3330 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
3331 kfree(conf->disks);
3332 kfree(conf);
3333 mddev->private = NULL;
3334 return 0;
3337 #if RAID5_DEBUG
3338 static void print_sh (struct seq_file *seq, struct stripe_head *sh)
3340 int i;
3342 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
3343 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
3344 seq_printf(seq, "sh %llu, count %d.\n",
3345 (unsigned long long)sh->sector, atomic_read(&sh->count));
3346 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
3347 for (i = 0; i < sh->disks; i++) {
3348 seq_printf(seq, "(cache%d: %p %ld) ",
3349 i, sh->dev[i].page, sh->dev[i].flags);
3351 seq_printf(seq, "\n");
3354 static void printall (struct seq_file *seq, raid5_conf_t *conf)
3356 struct stripe_head *sh;
3357 struct hlist_node *hn;
3358 int i;
3360 spin_lock_irq(&conf->device_lock);
3361 for (i = 0; i < NR_HASH; i++) {
3362 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
3363 if (sh->raid_conf != conf)
3364 continue;
3365 print_sh(seq, sh);
3368 spin_unlock_irq(&conf->device_lock);
3370 #endif
3372 static void status (struct seq_file *seq, mddev_t *mddev)
3374 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3375 int i;
3377 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
3378 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
3379 for (i = 0; i < conf->raid_disks; i++)
3380 seq_printf (seq, "%s",
3381 conf->disks[i].rdev &&
3382 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
3383 seq_printf (seq, "]");
3384 #if RAID5_DEBUG
3385 seq_printf (seq, "\n");
3386 printall(seq, conf);
3387 #endif
3390 static void print_raid5_conf (raid5_conf_t *conf)
3392 int i;
3393 struct disk_info *tmp;
3395 printk("RAID5 conf printout:\n");
3396 if (!conf) {
3397 printk("(conf==NULL)\n");
3398 return;
3400 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
3401 conf->working_disks, conf->failed_disks);
3403 for (i = 0; i < conf->raid_disks; i++) {
3404 char b[BDEVNAME_SIZE];
3405 tmp = conf->disks + i;
3406 if (tmp->rdev)
3407 printk(" disk %d, o:%d, dev:%s\n",
3408 i, !test_bit(Faulty, &tmp->rdev->flags),
3409 bdevname(tmp->rdev->bdev,b));
3413 static int raid5_spare_active(mddev_t *mddev)
3415 int i;
3416 raid5_conf_t *conf = mddev->private;
3417 struct disk_info *tmp;
3419 for (i = 0; i < conf->raid_disks; i++) {
3420 tmp = conf->disks + i;
3421 if (tmp->rdev
3422 && !test_bit(Faulty, &tmp->rdev->flags)
3423 && !test_bit(In_sync, &tmp->rdev->flags)) {
3424 mddev->degraded--;
3425 conf->failed_disks--;
3426 conf->working_disks++;
3427 set_bit(In_sync, &tmp->rdev->flags);
3430 print_raid5_conf(conf);
3431 return 0;
3434 static int raid5_remove_disk(mddev_t *mddev, int number)
3436 raid5_conf_t *conf = mddev->private;
3437 int err = 0;
3438 mdk_rdev_t *rdev;
3439 struct disk_info *p = conf->disks + number;
3441 print_raid5_conf(conf);
3442 rdev = p->rdev;
3443 if (rdev) {
3444 if (test_bit(In_sync, &rdev->flags) ||
3445 atomic_read(&rdev->nr_pending)) {
3446 err = -EBUSY;
3447 goto abort;
3449 p->rdev = NULL;
3450 synchronize_rcu();
3451 if (atomic_read(&rdev->nr_pending)) {
3452 /* lost the race, try later */
3453 err = -EBUSY;
3454 p->rdev = rdev;
3457 abort:
3459 print_raid5_conf(conf);
3460 return err;
3463 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
3465 raid5_conf_t *conf = mddev->private;
3466 int found = 0;
3467 int disk;
3468 struct disk_info *p;
3470 if (mddev->degraded > conf->max_degraded)
3471 /* no point adding a device */
3472 return 0;
3474 /*
3475 * find the disk ... but prefer rdev->saved_raid_disk
3476 * if possible.
3477 */
3478 if (rdev->saved_raid_disk >= 0 &&
3479 conf->disks[rdev->saved_raid_disk].rdev == NULL)
3480 disk = rdev->saved_raid_disk;
3481 else
3482 disk = 0;
3483 for ( ; disk < conf->raid_disks; disk++)
3484 if ((p=conf->disks + disk)->rdev == NULL) {
3485 clear_bit(In_sync, &rdev->flags);
3486 rdev->raid_disk = disk;
3487 found = 1;
3488 if (rdev->saved_raid_disk != disk)
3489 conf->fullsync = 1;
3490 rcu_assign_pointer(p->rdev, rdev);
3491 break;
3493 print_raid5_conf(conf);
3494 return found;
3497 static int raid5_resize(mddev_t *mddev, sector_t sectors)
3499 /* no resync is happening, and there is enough space
3500 * on all devices, so we can resize.
3501 * We need to make sure resync covers any new space.
3502 * If the array is shrinking we should possibly wait until
3503 * any io in the removed space completes, but it hardly seems
3504 * worth it.
3505 */
3506 raid5_conf_t *conf = mddev_to_conf(mddev);
3508 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
3509 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
3510 set_capacity(mddev->gendisk, mddev->array_size << 1);
3511 mddev->changed = 1;
3512 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
3513 mddev->recovery_cp = mddev->size << 1;
3514 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3516 mddev->size = sectors /2;
3517 mddev->resync_max_sectors = sectors;
3518 return 0;
3521 #ifdef CONFIG_MD_RAID5_RESHAPE
3522 static int raid5_check_reshape(mddev_t *mddev)
3524 raid5_conf_t *conf = mddev_to_conf(mddev);
3525 int err;
3527 if (mddev->delta_disks < 0 ||
3528 mddev->new_level != mddev->level)
3529 return -EINVAL; /* Cannot shrink array or change level yet */
3530 if (mddev->delta_disks == 0)
3531 return 0; /* nothing to do */
3533 /* Can only proceed if there are plenty of stripe_heads.
3534 * We need a minimum of one full stripe, and for sensible progress
3535 * it is best to have about 4 times that.
3536 * If we require 4 times, then the default 256 4K stripe_heads will
3537 * allow for chunk sizes up to 256K, which is probably OK.
3538 * If the chunk size is greater, user-space should request more
3539 * stripe_heads first.
3540 */
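/* With 4K STRIPE_SIZE the default 256 stripe_heads satisfy
* (chunk_size / STRIPE_SIZE) * 4 <= 256 only for chunks up to 256K; a
* hypothetical 1M chunk, for instance, needs stripe_cache_size raised to at
* least 1024 before the reshape is allowed to start.
*/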
3541 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
3542 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
3543 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
3544 (mddev->chunk_size / STRIPE_SIZE)*4);
3545 return -ENOSPC;
3548 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
3549 if (err)
3550 return err;
3552 /* looks like we might be able to manage this */
3553 return 0;
3556 static int raid5_start_reshape(mddev_t *mddev)
3558 raid5_conf_t *conf = mddev_to_conf(mddev);
3559 mdk_rdev_t *rdev;
3560 struct list_head *rtmp;
3561 int spares = 0;
3562 int added_devices = 0;
3564 if (mddev->degraded ||
3565 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3566 return -EBUSY;
3568 ITERATE_RDEV(mddev, rdev, rtmp)
3569 if (rdev->raid_disk < 0 &&
3570 !test_bit(Faulty, &rdev->flags))
3571 spares++;
3573 if (spares < mddev->delta_disks-1)
3574 /* Not enough devices even to make a degraded array
3575 * of that size
3576 */
3577 return -EINVAL;
3579 atomic_set(&conf->reshape_stripes, 0);
3580 spin_lock_irq(&conf->device_lock);
3581 conf->previous_raid_disks = conf->raid_disks;
3582 conf->raid_disks += mddev->delta_disks;
3583 conf->expand_progress = 0;
3584 conf->expand_lo = 0;
3585 spin_unlock_irq(&conf->device_lock);
3587 /* Add some new drives, as many as will fit.
3588 * We know there are enough to make the newly sized array work.
3589 */
3590 ITERATE_RDEV(mddev, rdev, rtmp)
3591 if (rdev->raid_disk < 0 &&
3592 !test_bit(Faulty, &rdev->flags)) {
3593 if (raid5_add_disk(mddev, rdev)) {
3594 char nm[20];
3595 set_bit(In_sync, &rdev->flags);
3596 conf->working_disks++;
3597 added_devices++;
3598 rdev->recovery_offset = 0;
3599 sprintf(nm, "rd%d", rdev->raid_disk);
3600 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
3601 } else
3602 break;
3605 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
3606 mddev->raid_disks = conf->raid_disks;
3607 mddev->reshape_position = 0;
3608 mddev->sb_dirty = 1;
3610 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3611 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3612 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3613 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3614 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3615 "%s_reshape");
3616 if (!mddev->sync_thread) {
3617 mddev->recovery = 0;
3618 spin_lock_irq(&conf->device_lock);
3619 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
3620 conf->expand_progress = MaxSector;
3621 spin_unlock_irq(&conf->device_lock);
3622 return -EAGAIN;
3624 md_wakeup_thread(mddev->sync_thread);
3625 md_new_event(mddev);
3626 return 0;
3628 #endif
3630 static void end_reshape(raid5_conf_t *conf)
3632 struct block_device *bdev;
3634 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
3635 conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
3636 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
3637 conf->mddev->changed = 1;
3639 bdev = bdget_disk(conf->mddev->gendisk, 0);
3640 if (bdev) {
3641 mutex_lock(&bdev->bd_inode->i_mutex);
3642 i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
3643 mutex_unlock(&bdev->bd_inode->i_mutex);
3644 bdput(bdev);
3646 spin_lock_irq(&conf->device_lock);
3647 conf->expand_progress = MaxSector;
3648 spin_unlock_irq(&conf->device_lock);
3649 conf->mddev->reshape_position = MaxSector;
3651 /* read-ahead size must cover two whole stripes, which is
3652 * 2 * (datadisks) * chunksize, where datadisks is the number of data devices
3653 */
3655 int data_disks = conf->previous_raid_disks - conf->max_degraded;
3656 int stripe = data_disks *
3657 (conf->mddev->chunk_size / PAGE_SIZE);
3658 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3659 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3664 static void raid5_quiesce(mddev_t *mddev, int state)
3666 raid5_conf_t *conf = mddev_to_conf(mddev);
3668 switch(state) {
3669 case 2: /* resume for a suspend */
3670 wake_up(&conf->wait_for_overlap);
3671 break;
3673 case 1: /* stop all writes */
3674 spin_lock_irq(&conf->device_lock);
3675 conf->quiesce = 1;
3676 wait_event_lock_irq(conf->wait_for_stripe,
3677 atomic_read(&conf->active_stripes) == 0,
3678 conf->device_lock, /* nothing */);
3679 spin_unlock_irq(&conf->device_lock);
3680 break;
3682 case 0: /* re-enable writes */
3683 spin_lock_irq(&conf->device_lock);
3684 conf->quiesce = 0;
3685 wake_up(&conf->wait_for_stripe);
3686 wake_up(&conf->wait_for_overlap);
3687 spin_unlock_irq(&conf->device_lock);
3688 break;
3692 static struct mdk_personality raid6_personality =
3694 .name = "raid6",
3695 .level = 6,
3696 .owner = THIS_MODULE,
3697 .make_request = make_request,
3698 .run = run,
3699 .stop = stop,
3700 .status = status,
3701 .error_handler = error,
3702 .hot_add_disk = raid5_add_disk,
3703 .hot_remove_disk= raid5_remove_disk,
3704 .spare_active = raid5_spare_active,
3705 .sync_request = sync_request,
3706 .resize = raid5_resize,
3707 .quiesce = raid5_quiesce,
3708 };
3709 static struct mdk_personality raid5_personality =
3711 .name = "raid5",
3712 .level = 5,
3713 .owner = THIS_MODULE,
3714 .make_request = make_request,
3715 .run = run,
3716 .stop = stop,
3717 .status = status,
3718 .error_handler = error,
3719 .hot_add_disk = raid5_add_disk,
3720 .hot_remove_disk= raid5_remove_disk,
3721 .spare_active = raid5_spare_active,
3722 .sync_request = sync_request,
3723 .resize = raid5_resize,
3724 #ifdef CONFIG_MD_RAID5_RESHAPE
3725 .check_reshape = raid5_check_reshape,
3726 .start_reshape = raid5_start_reshape,
3727 #endif
3728 .quiesce = raid5_quiesce,
3729 };
3731 static struct mdk_personality raid4_personality =
3733 .name = "raid4",
3734 .level = 4,
3735 .owner = THIS_MODULE,
3736 .make_request = make_request,
3737 .run = run,
3738 .stop = stop,
3739 .status = status,
3740 .error_handler = error,
3741 .hot_add_disk = raid5_add_disk,
3742 .hot_remove_disk= raid5_remove_disk,
3743 .spare_active = raid5_spare_active,
3744 .sync_request = sync_request,
3745 .resize = raid5_resize,
3746 .quiesce = raid5_quiesce,
3747 };
3749 static int __init raid5_init(void)
3751 int e;
3753 e = raid6_select_algo();
3754 if ( e )
3755 return e;
3756 register_md_personality(&raid6_personality);
3757 register_md_personality(&raid5_personality);
3758 register_md_personality(&raid4_personality);
3759 return 0;
3762 static void raid5_exit(void)
3764 unregister_md_personality(&raid6_personality);
3765 unregister_md_personality(&raid5_personality);
3766 unregister_md_personality(&raid4_personality);
3769 module_init(raid5_init);
3770 module_exit(raid5_exit);
3771 MODULE_LICENSE("GPL");
3772 MODULE_ALIAS("md-personality-4"); /* RAID5 */
3773 MODULE_ALIAS("md-raid5");
3774 MODULE_ALIAS("md-raid4");
3775 MODULE_ALIAS("md-level-5");
3776 MODULE_ALIAS("md-level-4");
3777 MODULE_ALIAS("md-personality-8"); /* RAID6 */
3778 MODULE_ALIAS("md-raid6");
3779 MODULE_ALIAS("md-level-6");
3781 /* This used to be two separate modules; they were: */
3782 MODULE_ALIAS("raid5");
3783 MODULE_ALIAS("raid6");