ia64/linux-2.6.18-xen.hg

view drivers/md/md.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
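
As an illustration only (not the driver's actual code), the retry behaviour described
above amounts to something like the sketch below; current_target(), increase_reservation(),
current_pages and balloon_timer stand in for the driver's real state and hypercall path:

    #include <linux/timer.h>

    static struct timer_list balloon_timer;   /* re-arms the ballooning attempt */
    static unsigned long current_pages;       /* pages currently owned by the guest */

    static void balloon_process(unsigned long unused)
    {
            long wanted = current_target() - current_pages; /* pages still to gain */
            long got;

            if (wanted <= 0)
                    return;                             /* target reached */

            got = increase_reservation(wanted);         /* host may grant fewer pages */
            current_pages += got;                       /* keep whatever was granted */

            if (got < wanted)
                    /* The host may only be under temporary pressure: retry on a
                     * timer rather than recording a "hard limit" and giving up. */
                    mod_timer(&balloon_timer, jiffies + HZ);
    }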
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 3e8752eb6d9c
children
line source
1 /*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 completely rewritten, based on the MD driver code from Marc Zyngier
7 Changes:
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
20 Neil Brown <neilb@cse.unsw.edu.au>.
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option)
28 any later version.
30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
35 #include <linux/module.h>
36 #include <linux/kthread.h>
37 #include <linux/linkage.h>
38 #include <linux/raid/md.h>
39 #include <linux/raid/bitmap.h>
40 #include <linux/sysctl.h>
41 #include <linux/buffer_head.h> /* for invalidate_bdev */
42 #include <linux/suspend.h>
43 #include <linux/poll.h>
44 #include <linux/mutex.h>
45 #include <linux/ctype.h>
47 #include <linux/init.h>
49 #include <linux/file.h>
51 #ifdef CONFIG_KMOD
52 #include <linux/kmod.h>
53 #endif
55 #include <asm/unaligned.h>
57 #define MAJOR_NR MD_MAJOR
58 #define MD_DRIVER
60 /* 63 partitions with the alternate major number (mdp) */
61 #define MdpMinorShift 6
63 #define DEBUG 0
64 #define dprintk(x...) ((void)(DEBUG && printk(x)))
67 #ifndef MODULE
68 static void autostart_arrays (int part);
69 #endif
71 static LIST_HEAD(pers_list);
72 static DEFINE_SPINLOCK(pers_lock);
74 static void md_print_devices(void);
76 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
78 /*
79 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
80 * is 1000 KB/sec, so the extra system load does not show up that much.
81 * Increase it if you want to have more _guaranteed_ speed. Note that
82 * the RAID driver will use the maximum available bandwidth if the IO
83 * subsystem is idle. There is also an 'absolute maximum' reconstruction
84 * speed limit - in case reconstruction slows down your system despite
85 * idle IO detection.
86 *
87 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
88 * or /sys/block/mdX/md/sync_speed_{min,max}
89 */
91 static int sysctl_speed_limit_min = 1000;
92 static int sysctl_speed_limit_max = 200000;
93 static inline int speed_min(mddev_t *mddev)
94 {
95 return mddev->sync_speed_min ?
96 mddev->sync_speed_min : sysctl_speed_limit_min;
97 }
99 static inline int speed_max(mddev_t *mddev)
100 {
101 return mddev->sync_speed_max ?
102 mddev->sync_speed_max : sysctl_speed_limit_max;
103 }
105 static struct ctl_table_header *raid_table_header;
107 static ctl_table raid_table[] = {
108 {
109 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
110 .procname = "speed_limit_min",
111 .data = &sysctl_speed_limit_min,
112 .maxlen = sizeof(int),
113 .mode = S_IRUGO|S_IWUSR,
114 .proc_handler = &proc_dointvec,
115 },
116 {
117 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
118 .procname = "speed_limit_max",
119 .data = &sysctl_speed_limit_max,
120 .maxlen = sizeof(int),
121 .mode = S_IRUGO|S_IWUSR,
122 .proc_handler = &proc_dointvec,
123 },
124 { .ctl_name = 0 }
125 };
127 static ctl_table raid_dir_table[] = {
128 {
129 .ctl_name = DEV_RAID,
130 .procname = "raid",
131 .maxlen = 0,
132 .mode = S_IRUGO|S_IXUGO,
133 .child = raid_table,
134 },
135 { .ctl_name = 0 }
136 };
138 static ctl_table raid_root_table[] = {
139 {
140 .ctl_name = CTL_DEV,
141 .procname = "dev",
142 .maxlen = 0,
143 .mode = 0555,
144 .child = raid_dir_table,
145 },
146 { .ctl_name = 0 }
147 };
149 static struct block_device_operations md_fops;
151 static int start_readonly;
153 /*
154 * We have a system wide 'event count' that is incremented
155 * on any 'interesting' event, and readers of /proc/mdstat
156 * can use 'poll' or 'select' to find out when the event
157 * count increases.
158 *
159 * Events are:
160 * start array, stop array, error, add device, remove device,
161 * start build, activate spare
162 */
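/*
 * A minimal user-space sketch of the pattern described above (the POLLPRI
 * semantics are assumed from the comment; error handling and the required
 * <fcntl.h>, <poll.h>, <unistd.h> includes are omitted):
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	char buf[4096];
 *
 *	read(fd, buf, sizeof(buf));		// consume the current state first
 *	for (;;) {
 *		struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *		poll(&pfd, 1, -1);		// wakes when the event count changes
 *		lseek(fd, 0, SEEK_SET);
 *		read(fd, buf, sizeof(buf));	// re-read to see what changed
 *	}
 */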
163 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
164 static atomic_t md_event_count;
165 void md_new_event(mddev_t *mddev)
166 {
167 atomic_inc(&md_event_count);
168 wake_up(&md_event_waiters);
169 sysfs_notify(&mddev->kobj, NULL, "sync_action");
170 }
171 EXPORT_SYMBOL_GPL(md_new_event);
173 /* Alternate version that can be called from interrupts
174 * when calling sysfs_notify isn't needed.
175 */
176 static void md_new_event_inintr(mddev_t *mddev)
177 {
178 atomic_inc(&md_event_count);
179 wake_up(&md_event_waiters);
180 }
182 /*
183 * Allows iteration over all existing md arrays.
184 * all_mddevs_lock protects this list.
185 */
186 static LIST_HEAD(all_mddevs);
187 static DEFINE_SPINLOCK(all_mddevs_lock);
190 /*
191 * iterates through all used mddevs in the system.
192 * We take care to grab the all_mddevs_lock whenever navigating
193 * the list, and to always hold a refcount when unlocked.
194 * Any code which breaks out of this loop while owning
195 * a reference to the current mddev must mddev_put it.
196 */
197 #define ITERATE_MDDEV(mddev,tmp) \
198 \
199 for (({ spin_lock(&all_mddevs_lock); \
200 tmp = all_mddevs.next; \
201 mddev = NULL;}); \
202 ({ if (tmp != &all_mddevs) \
203 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
204 spin_unlock(&all_mddevs_lock); \
205 if (mddev) mddev_put(mddev); \
206 mddev = list_entry(tmp, mddev_t, all_mddevs); \
207 tmp != &all_mddevs;}); \
208 ({ spin_lock(&all_mddevs_lock); \
209 tmp = tmp->next;}) \
210 )
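/*
 * Typical use of the iterator above -- a sketch of the pattern used by
 * md_print_devices() further down in this file:
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev, tmp) {
 *		// mddev is refcounted while the body runs; the macro drops
 *		// the reference when it advances to the next array
 *	}
 */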
213 static int md_fail_request (request_queue_t *q, struct bio *bio)
214 {
215 bio_io_error(bio, bio->bi_size);
216 return 0;
217 }
219 static inline mddev_t *mddev_get(mddev_t *mddev)
220 {
221 atomic_inc(&mddev->active);
222 return mddev;
223 }
225 static void mddev_put(mddev_t *mddev)
226 {
227 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
228 return;
229 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
230 list_del(&mddev->all_mddevs);
231 spin_unlock(&all_mddevs_lock);
232 blk_cleanup_queue(mddev->queue);
233 kobject_unregister(&mddev->kobj);
234 } else
235 spin_unlock(&all_mddevs_lock);
236 }
238 static mddev_t * mddev_find(dev_t unit)
239 {
240 mddev_t *mddev, *new = NULL;
242 retry:
243 spin_lock(&all_mddevs_lock);
244 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
245 if (mddev->unit == unit) {
246 mddev_get(mddev);
247 spin_unlock(&all_mddevs_lock);
248 kfree(new);
249 return mddev;
250 }
252 if (new) {
253 list_add(&new->all_mddevs, &all_mddevs);
254 spin_unlock(&all_mddevs_lock);
255 return new;
256 }
257 spin_unlock(&all_mddevs_lock);
259 new = kzalloc(sizeof(*new), GFP_KERNEL);
260 if (!new)
261 return NULL;
263 new->unit = unit;
264 if (MAJOR(unit) == MD_MAJOR)
265 new->md_minor = MINOR(unit);
266 else
267 new->md_minor = MINOR(unit) >> MdpMinorShift;
269 mutex_init(&new->reconfig_mutex);
270 INIT_LIST_HEAD(&new->disks);
271 INIT_LIST_HEAD(&new->all_mddevs);
272 init_timer(&new->safemode_timer);
273 atomic_set(&new->active, 1);
274 spin_lock_init(&new->write_lock);
275 init_waitqueue_head(&new->sb_wait);
277 new->queue = blk_alloc_queue(GFP_KERNEL);
278 if (!new->queue) {
279 kfree(new);
280 return NULL;
281 }
282 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
284 blk_queue_make_request(new->queue, md_fail_request);
286 goto retry;
287 }
289 static inline int mddev_lock(mddev_t * mddev)
290 {
291 return mutex_lock_interruptible(&mddev->reconfig_mutex);
292 }
294 static inline int mddev_trylock(mddev_t * mddev)
295 {
296 return mutex_trylock(&mddev->reconfig_mutex);
297 }
299 static inline void mddev_unlock(mddev_t * mddev)
300 {
301 mutex_unlock(&mddev->reconfig_mutex);
303 md_wakeup_thread(mddev->thread);
304 }
306 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
307 {
308 mdk_rdev_t * rdev;
309 struct list_head *tmp;
311 ITERATE_RDEV(mddev,rdev,tmp) {
312 if (rdev->desc_nr == nr)
313 return rdev;
314 }
315 return NULL;
316 }
318 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
319 {
320 struct list_head *tmp;
321 mdk_rdev_t *rdev;
323 ITERATE_RDEV(mddev,rdev,tmp) {
324 if (rdev->bdev->bd_dev == dev)
325 return rdev;
326 }
327 return NULL;
328 }
330 static struct mdk_personality *find_pers(int level, char *clevel)
331 {
332 struct mdk_personality *pers;
333 list_for_each_entry(pers, &pers_list, list) {
334 if (level != LEVEL_NONE && pers->level == level)
335 return pers;
336 if (strcmp(pers->name, clevel)==0)
337 return pers;
338 }
339 return NULL;
340 }
342 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
343 {
344 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
345 return MD_NEW_SIZE_BLOCKS(size);
346 }
348 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
349 {
350 sector_t size;
352 size = rdev->sb_offset;
354 if (chunk_size)
355 size &= ~((sector_t)chunk_size/1024 - 1);
356 return size;
357 }
359 static int alloc_disk_sb(mdk_rdev_t * rdev)
360 {
361 if (rdev->sb_page)
362 MD_BUG();
364 rdev->sb_page = alloc_page(GFP_KERNEL);
365 if (!rdev->sb_page) {
366 printk(KERN_ALERT "md: out of memory.\n");
367 return -EINVAL;
368 }
370 return 0;
371 }
373 static void free_disk_sb(mdk_rdev_t * rdev)
374 {
375 if (rdev->sb_page) {
376 put_page(rdev->sb_page);
377 rdev->sb_loaded = 0;
378 rdev->sb_page = NULL;
379 rdev->sb_offset = 0;
380 rdev->size = 0;
381 }
382 }
385 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
386 {
387 mdk_rdev_t *rdev = bio->bi_private;
388 mddev_t *mddev = rdev->mddev;
389 if (bio->bi_size)
390 return 1;
392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
393 md_error(mddev, rdev);
395 if (atomic_dec_and_test(&mddev->pending_writes))
396 wake_up(&mddev->sb_wait);
397 bio_put(bio);
398 return 0;
399 }
401 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
402 {
403 struct bio *bio2 = bio->bi_private;
404 mdk_rdev_t *rdev = bio2->bi_private;
405 mddev_t *mddev = rdev->mddev;
406 if (bio->bi_size)
407 return 1;
409 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
410 error == -EOPNOTSUPP) {
411 unsigned long flags;
412 /* barriers don't appear to be supported :-( */
413 set_bit(BarriersNotsupp, &rdev->flags);
414 mddev->barriers_work = 0;
415 spin_lock_irqsave(&mddev->write_lock, flags);
416 bio2->bi_next = mddev->biolist;
417 mddev->biolist = bio2;
418 spin_unlock_irqrestore(&mddev->write_lock, flags);
419 wake_up(&mddev->sb_wait);
420 bio_put(bio);
421 return 0;
422 }
423 bio_put(bio2);
424 bio->bi_private = rdev;
425 return super_written(bio, bytes_done, error);
426 }
428 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
429 sector_t sector, int size, struct page *page)
430 {
431 /* write first size bytes of page to sector of rdev
432 * Increment mddev->pending_writes before returning
433 * and decrement it on completion, waking up sb_wait
434 * if zero is reached.
435 * If an error occurred, call md_error
436 *
437 * As we might need to resubmit the request if BIO_RW_BARRIER
438 * causes ENOTSUPP, we allocate a spare bio...
439 */
440 struct bio *bio = bio_alloc(GFP_NOIO, 1);
441 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
443 bio->bi_bdev = rdev->bdev;
444 bio->bi_sector = sector;
445 bio_add_page(bio, page, size, 0);
446 bio->bi_private = rdev;
447 bio->bi_end_io = super_written;
448 bio->bi_rw = rw;
450 atomic_inc(&mddev->pending_writes);
451 if (!test_bit(BarriersNotsupp, &rdev->flags)) {
452 struct bio *rbio;
453 rw |= (1<<BIO_RW_BARRIER);
454 rbio = bio_clone(bio, GFP_NOIO);
455 rbio->bi_private = bio;
456 rbio->bi_end_io = super_written_barrier;
457 submit_bio(rw, rbio);
458 } else
459 submit_bio(rw, bio);
460 }
462 void md_super_wait(mddev_t *mddev)
463 {
464 /* wait for all superblock writes that were scheduled to complete.
465 * if any had to be retried (due to BARRIER problems), retry them
466 */
467 DEFINE_WAIT(wq);
468 for(;;) {
469 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
470 if (atomic_read(&mddev->pending_writes)==0)
471 break;
472 while (mddev->biolist) {
473 struct bio *bio;
474 spin_lock_irq(&mddev->write_lock);
475 bio = mddev->biolist;
476 mddev->biolist = bio->bi_next ;
477 bio->bi_next = NULL;
478 spin_unlock_irq(&mddev->write_lock);
479 submit_bio(bio->bi_rw, bio);
480 }
481 schedule();
482 }
483 finish_wait(&mddev->sb_wait, &wq);
484 }
486 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
487 {
488 if (bio->bi_size)
489 return 1;
491 complete((struct completion*)bio->bi_private);
492 return 0;
493 }
495 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
496 struct page *page, int rw)
497 {
498 struct bio *bio = bio_alloc(GFP_NOIO, 1);
499 struct completion event;
500 int ret;
502 rw |= (1 << BIO_RW_SYNC);
504 bio->bi_bdev = bdev;
505 bio->bi_sector = sector;
506 bio_add_page(bio, page, size, 0);
507 init_completion(&event);
508 bio->bi_private = &event;
509 bio->bi_end_io = bi_complete;
510 submit_bio(rw, bio);
511 wait_for_completion(&event);
513 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
514 bio_put(bio);
515 return ret;
516 }
517 EXPORT_SYMBOL_GPL(sync_page_io);
519 static int read_disk_sb(mdk_rdev_t * rdev, int size)
520 {
521 char b[BDEVNAME_SIZE];
522 if (!rdev->sb_page) {
523 MD_BUG();
524 return -EINVAL;
525 }
526 if (rdev->sb_loaded)
527 return 0;
530 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
531 goto fail;
532 rdev->sb_loaded = 1;
533 return 0;
535 fail:
536 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
537 bdevname(rdev->bdev,b));
538 return -EINVAL;
539 }
541 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
542 {
543 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
544 (sb1->set_uuid1 == sb2->set_uuid1) &&
545 (sb1->set_uuid2 == sb2->set_uuid2) &&
546 (sb1->set_uuid3 == sb2->set_uuid3))
548 return 1;
550 return 0;
551 }
554 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
555 {
556 int ret;
557 mdp_super_t *tmp1, *tmp2;
559 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
560 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
562 if (!tmp1 || !tmp2) {
563 ret = 0;
564 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
565 goto abort;
566 }
568 *tmp1 = *sb1;
569 *tmp2 = *sb2;
571 /*
572 * nr_disks is not constant
573 */
574 tmp1->nr_disks = 0;
575 tmp2->nr_disks = 0;
577 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
578 ret = 0;
579 else
580 ret = 1;
582 abort:
583 kfree(tmp1);
584 kfree(tmp2);
585 return ret;
586 }
588 static unsigned int calc_sb_csum(mdp_super_t * sb)
589 {
590 unsigned int disk_csum, csum;
592 disk_csum = sb->sb_csum;
593 sb->sb_csum = 0;
594 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
595 sb->sb_csum = disk_csum;
596 return csum;
597 }
600 /*
601 * Handle superblock details.
602 * We want to be able to handle multiple superblock formats
603 * so we have a common interface to them all, and an array of
604 * different handlers.
605 * We rely on user-space to write the initial superblock, and support
606 * reading and updating of superblocks.
607 * Interface methods are:
608 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
609 * loads and validates a superblock on dev.
610 * if refdev != NULL, compare superblocks on both devices
611 * Return:
612 * 0 - dev has a superblock that is compatible with refdev
613 * 1 - dev has a superblock that is compatible and newer than refdev
614 * so dev should be used as the refdev in future
615 * -EINVAL superblock incompatible or invalid
616 * -othererror e.g. -EIO
617 *
618 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
619 * Verify that dev is acceptable into mddev.
620 * The first time, mddev->raid_disks will be 0, and data from
621 * dev should be merged in. Subsequent calls check that dev
622 * is new enough. Return 0 or -EINVAL
623 *
624 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
625 * Update the superblock for rdev with data in mddev
626 * This does not write to disc.
627 *
628 */
630 struct super_type {
631 char *name;
632 struct module *owner;
633 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
634 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
635 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
636 };
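/*
 * Handlers are invoked through the super_types[] table defined later in
 * this file, indexed by mddev->major_version -- a sketch of the pattern
 * used by analyze_sbs() further down:
 *
 *	err = super_types[mddev->major_version].
 *		load_super(rdev, refdev, mddev->minor_version);
 *	...
 *	super_types[mddev->major_version].
 *		validate_super(mddev, rdev);
 */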
638 /*
639 * load_super for 0.90.0
640 */
641 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
642 {
643 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
644 mdp_super_t *sb;
645 int ret;
646 sector_t sb_offset;
648 /*
649 * Calculate the position of the superblock,
650 * it's at the end of the disk.
651 *
652 * It also happens to be a multiple of 4Kb.
653 */
654 sb_offset = calc_dev_sboffset(rdev->bdev);
655 rdev->sb_offset = sb_offset;
657 ret = read_disk_sb(rdev, MD_SB_BYTES);
658 if (ret) return ret;
660 ret = -EINVAL;
662 bdevname(rdev->bdev, b);
663 sb = (mdp_super_t*)page_address(rdev->sb_page);
665 if (sb->md_magic != MD_SB_MAGIC) {
666 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
667 b);
668 goto abort;
669 }
671 if (sb->major_version != 0 ||
672 sb->minor_version < 90 ||
673 sb->minor_version > 91) {
674 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
675 sb->major_version, sb->minor_version,
676 b);
677 goto abort;
678 }
680 if (sb->raid_disks <= 0)
681 goto abort;
683 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
684 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
685 b);
686 goto abort;
687 }
689 rdev->preferred_minor = sb->md_minor;
690 rdev->data_offset = 0;
691 rdev->sb_size = MD_SB_BYTES;
693 if (sb->level == LEVEL_MULTIPATH)
694 rdev->desc_nr = -1;
695 else
696 rdev->desc_nr = sb->this_disk.number;
698 if (refdev == 0)
699 ret = 1;
700 else {
701 __u64 ev1, ev2;
702 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
703 if (!uuid_equal(refsb, sb)) {
704 printk(KERN_WARNING "md: %s has different UUID to %s\n",
705 b, bdevname(refdev->bdev,b2));
706 goto abort;
707 }
708 if (!sb_equal(refsb, sb)) {
709 printk(KERN_WARNING "md: %s has same UUID"
710 " but different superblock to %s\n",
711 b, bdevname(refdev->bdev, b2));
712 goto abort;
713 }
714 ev1 = md_event(sb);
715 ev2 = md_event(refsb);
716 if (ev1 > ev2)
717 ret = 1;
718 else
719 ret = 0;
720 }
721 rdev->size = calc_dev_size(rdev, sb->chunk_size);
723 if (rdev->size < sb->size && sb->level > 1)
724 /* "this cannot possibly happen" ... */
725 ret = -EINVAL;
727 abort:
728 return ret;
729 }
731 /*
732 * validate_super for 0.90.0
733 */
734 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
735 {
736 mdp_disk_t *desc;
737 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
738 __u64 ev1 = md_event(sb);
740 rdev->raid_disk = -1;
741 rdev->flags = 0;
742 if (mddev->raid_disks == 0) {
743 mddev->major_version = 0;
744 mddev->minor_version = sb->minor_version;
745 mddev->patch_version = sb->patch_version;
746 mddev->persistent = ! sb->not_persistent;
747 mddev->chunk_size = sb->chunk_size;
748 mddev->ctime = sb->ctime;
749 mddev->utime = sb->utime;
750 mddev->level = sb->level;
751 mddev->clevel[0] = 0;
752 mddev->layout = sb->layout;
753 mddev->raid_disks = sb->raid_disks;
754 mddev->size = sb->size;
755 mddev->events = ev1;
756 mddev->bitmap_offset = 0;
757 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
759 if (mddev->minor_version >= 91) {
760 mddev->reshape_position = sb->reshape_position;
761 mddev->delta_disks = sb->delta_disks;
762 mddev->new_level = sb->new_level;
763 mddev->new_layout = sb->new_layout;
764 mddev->new_chunk = sb->new_chunk;
765 } else {
766 mddev->reshape_position = MaxSector;
767 mddev->delta_disks = 0;
768 mddev->new_level = mddev->level;
769 mddev->new_layout = mddev->layout;
770 mddev->new_chunk = mddev->chunk_size;
771 }
773 if (sb->state & (1<<MD_SB_CLEAN))
774 mddev->recovery_cp = MaxSector;
775 else {
776 if (sb->events_hi == sb->cp_events_hi &&
777 sb->events_lo == sb->cp_events_lo) {
778 mddev->recovery_cp = sb->recovery_cp;
779 } else
780 mddev->recovery_cp = 0;
781 }
783 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
784 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
785 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
786 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
788 mddev->max_disks = MD_SB_DISKS;
790 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
791 mddev->bitmap_file == NULL) {
792 if (mddev->level != 1 && mddev->level != 4
793 && mddev->level != 5 && mddev->level != 6
794 && mddev->level != 10) {
795 /* FIXME use a better test */
796 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
797 return -EINVAL;
798 }
799 mddev->bitmap_offset = mddev->default_bitmap_offset;
800 }
802 } else if (mddev->pers == NULL) {
803 /* Insist on good event counter while assembling */
804 ++ev1;
805 if (ev1 < mddev->events)
806 return -EINVAL;
807 } else if (mddev->bitmap) {
808 /* if adding to array with a bitmap, then we can accept an
809 * older device ... but not too old.
810 */
811 if (ev1 < mddev->bitmap->events_cleared)
812 return 0;
813 } else {
814 if (ev1 < mddev->events)
815 /* just a hot-add of a new device, leave raid_disk at -1 */
816 return 0;
817 }
819 if (mddev->level != LEVEL_MULTIPATH) {
820 desc = sb->disks + rdev->desc_nr;
822 if (desc->state & (1<<MD_DISK_FAULTY))
823 set_bit(Faulty, &rdev->flags);
824 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
825 desc->raid_disk < mddev->raid_disks */) {
826 set_bit(In_sync, &rdev->flags);
827 rdev->raid_disk = desc->raid_disk;
828 }
829 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
830 set_bit(WriteMostly, &rdev->flags);
831 } else /* MULTIPATH are always insync */
832 set_bit(In_sync, &rdev->flags);
833 return 0;
834 }
836 /*
837 * sync_super for 0.90.0
838 */
839 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
840 {
841 mdp_super_t *sb;
842 struct list_head *tmp;
843 mdk_rdev_t *rdev2;
844 int next_spare = mddev->raid_disks;
847 /* make rdev->sb match mddev data..
848 *
849 * 1/ zero out disks
850 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
851 * 3/ any empty disks < next_spare become removed
852 *
853 * disks[0] gets initialised to REMOVED because
854 * we cannot be sure from other fields if it has
855 * been initialised or not.
856 */
857 int i;
858 int active=0, working=0,failed=0,spare=0,nr_disks=0;
860 rdev->sb_size = MD_SB_BYTES;
862 sb = (mdp_super_t*)page_address(rdev->sb_page);
864 memset(sb, 0, sizeof(*sb));
866 sb->md_magic = MD_SB_MAGIC;
867 sb->major_version = mddev->major_version;
868 sb->patch_version = mddev->patch_version;
869 sb->gvalid_words = 0; /* ignored */
870 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
871 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
872 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
873 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
875 sb->ctime = mddev->ctime;
876 sb->level = mddev->level;
877 sb->size = mddev->size;
878 sb->raid_disks = mddev->raid_disks;
879 sb->md_minor = mddev->md_minor;
880 sb->not_persistent = !mddev->persistent;
881 sb->utime = mddev->utime;
882 sb->state = 0;
883 sb->events_hi = (mddev->events>>32);
884 sb->events_lo = (u32)mddev->events;
886 if (mddev->reshape_position == MaxSector)
887 sb->minor_version = 90;
888 else {
889 sb->minor_version = 91;
890 sb->reshape_position = mddev->reshape_position;
891 sb->new_level = mddev->new_level;
892 sb->delta_disks = mddev->delta_disks;
893 sb->new_layout = mddev->new_layout;
894 sb->new_chunk = mddev->new_chunk;
895 }
896 mddev->minor_version = sb->minor_version;
897 if (mddev->in_sync)
898 {
899 sb->recovery_cp = mddev->recovery_cp;
900 sb->cp_events_hi = (mddev->events>>32);
901 sb->cp_events_lo = (u32)mddev->events;
902 if (mddev->recovery_cp == MaxSector)
903 sb->state = (1<< MD_SB_CLEAN);
904 } else
905 sb->recovery_cp = 0;
907 sb->layout = mddev->layout;
908 sb->chunk_size = mddev->chunk_size;
910 if (mddev->bitmap && mddev->bitmap_file == NULL)
911 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
913 sb->disks[0].state = (1<<MD_DISK_REMOVED);
914 ITERATE_RDEV(mddev,rdev2,tmp) {
915 mdp_disk_t *d;
916 int desc_nr;
917 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
918 && !test_bit(Faulty, &rdev2->flags))
919 desc_nr = rdev2->raid_disk;
920 else
921 desc_nr = next_spare++;
922 rdev2->desc_nr = desc_nr;
923 d = &sb->disks[rdev2->desc_nr];
924 nr_disks++;
925 d->number = rdev2->desc_nr;
926 d->major = MAJOR(rdev2->bdev->bd_dev);
927 d->minor = MINOR(rdev2->bdev->bd_dev);
928 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
929 && !test_bit(Faulty, &rdev2->flags))
930 d->raid_disk = rdev2->raid_disk;
931 else
932 d->raid_disk = rdev2->desc_nr; /* compatibility */
933 if (test_bit(Faulty, &rdev2->flags))
934 d->state = (1<<MD_DISK_FAULTY);
935 else if (test_bit(In_sync, &rdev2->flags)) {
936 d->state = (1<<MD_DISK_ACTIVE);
937 d->state |= (1<<MD_DISK_SYNC);
938 active++;
939 working++;
940 } else {
941 d->state = 0;
942 spare++;
943 working++;
944 }
945 if (test_bit(WriteMostly, &rdev2->flags))
946 d->state |= (1<<MD_DISK_WRITEMOSTLY);
947 }
948 /* now set the "removed" and "faulty" bits on any missing devices */
949 for (i=0 ; i < mddev->raid_disks ; i++) {
950 mdp_disk_t *d = &sb->disks[i];
951 if (d->state == 0 && d->number == 0) {
952 d->number = i;
953 d->raid_disk = i;
954 d->state = (1<<MD_DISK_REMOVED);
955 d->state |= (1<<MD_DISK_FAULTY);
956 failed++;
957 }
958 }
959 sb->nr_disks = nr_disks;
960 sb->active_disks = active;
961 sb->working_disks = working;
962 sb->failed_disks = failed;
963 sb->spare_disks = spare;
965 sb->this_disk = sb->disks[rdev->desc_nr];
966 sb->sb_csum = calc_sb_csum(sb);
967 }
969 /*
970 * version 1 superblock
971 */
973 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
974 {
975 unsigned int disk_csum, csum;
976 unsigned long long newcsum;
977 int size = 256 + le32_to_cpu(sb->max_dev)*2;
978 unsigned int *isuper = (unsigned int*)sb;
979 int i;
981 disk_csum = sb->sb_csum;
982 sb->sb_csum = 0;
983 newcsum = 0;
984 for (i=0; size>=4; size -= 4 )
985 newcsum += le32_to_cpu(*isuper++);
987 if (size == 2)
988 newcsum += le16_to_cpu(*(unsigned short*) isuper);
990 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
991 sb->sb_csum = disk_csum;
992 return cpu_to_le32(csum);
993 }
995 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
996 {
997 struct mdp_superblock_1 *sb;
998 int ret;
999 sector_t sb_offset;
1000 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1001 int bmask;
1003 /*
1004 * Calculate the position of the superblock.
1005 * It is always aligned to a 4K boundary and
1006 * depending on minor_version, it can be:
1007 * 0: At least 8K, but less than 12K, from end of device
1008 * 1: At start of device
1009 * 2: 4K from start of device.
1010 */
1011 switch(minor_version) {
1012 case 0:
1013 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1014 sb_offset -= 8*2;
1015 sb_offset &= ~(sector_t)(4*2-1);
1016 /* convert from sectors to K */
1017 sb_offset /= 2;
1018 break;
1019 case 1:
1020 sb_offset = 0;
1021 break;
1022 case 2:
1023 sb_offset = 4;
1024 break;
1025 default:
1026 return -EINVAL;
1028 rdev->sb_offset = sb_offset;
1030 /* superblock is rarely larger than 1K, but it can be larger,
1031 * and it is safe to read 4k, so we do that
1032 */
1033 ret = read_disk_sb(rdev, 4096);
1034 if (ret) return ret;
1037 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1039 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1040 sb->major_version != cpu_to_le32(1) ||
1041 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1042 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1043 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1044 return -EINVAL;
1046 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1047 printk("md: invalid superblock checksum on %s\n",
1048 bdevname(rdev->bdev,b));
1049 return -EINVAL;
1051 if (le64_to_cpu(sb->data_size) < 10) {
1052 printk("md: data_size too small on %s\n",
1053 bdevname(rdev->bdev,b));
1054 return -EINVAL;
1056 rdev->preferred_minor = 0xffff;
1057 rdev->data_offset = le64_to_cpu(sb->data_offset);
1058 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1060 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1061 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1062 if (rdev->sb_size & bmask)
1063 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1065 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1066 rdev->desc_nr = -1;
1067 else
1068 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1070 if (refdev == 0)
1071 ret = 1;
1072 else {
1073 __u64 ev1, ev2;
1074 struct mdp_superblock_1 *refsb =
1075 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1077 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1078 sb->level != refsb->level ||
1079 sb->layout != refsb->layout ||
1080 sb->chunksize != refsb->chunksize) {
1081 printk(KERN_WARNING "md: %s has strangely different"
1082 " superblock to %s\n",
1083 bdevname(rdev->bdev,b),
1084 bdevname(refdev->bdev,b2));
1085 return -EINVAL;
1087 ev1 = le64_to_cpu(sb->events);
1088 ev2 = le64_to_cpu(refsb->events);
1090 if (ev1 > ev2)
1091 ret = 1;
1092 else
1093 ret = 0;
1095 if (minor_version)
1096 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1097 else
1098 rdev->size = rdev->sb_offset;
1099 if (rdev->size < le64_to_cpu(sb->data_size)/2)
1100 return -EINVAL;
1101 rdev->size = le64_to_cpu(sb->data_size)/2;
1102 if (le32_to_cpu(sb->chunksize))
1103 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1105 if (le32_to_cpu(sb->size) > rdev->size*2)
1106 return -EINVAL;
1107 return ret;
1110 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1112 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1113 __u64 ev1 = le64_to_cpu(sb->events);
1115 rdev->raid_disk = -1;
1116 rdev->flags = 0;
1117 if (mddev->raid_disks == 0) {
1118 mddev->major_version = 1;
1119 mddev->patch_version = 0;
1120 mddev->persistent = 1;
1121 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1122 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1123 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1124 mddev->level = le32_to_cpu(sb->level);
1125 mddev->clevel[0] = 0;
1126 mddev->layout = le32_to_cpu(sb->layout);
1127 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1128 mddev->size = le64_to_cpu(sb->size)/2;
1129 mddev->events = ev1;
1130 mddev->bitmap_offset = 0;
1131 mddev->default_bitmap_offset = 1024 >> 9;
1133 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1134 memcpy(mddev->uuid, sb->set_uuid, 16);
1136 mddev->max_disks = (4096-256)/2;
1138 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1139 mddev->bitmap_file == NULL ) {
1140 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1141 && mddev->level != 10) {
1142 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1143 return -EINVAL;
1145 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1147 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1148 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1149 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1150 mddev->new_level = le32_to_cpu(sb->new_level);
1151 mddev->new_layout = le32_to_cpu(sb->new_layout);
1152 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1153 } else {
1154 mddev->reshape_position = MaxSector;
1155 mddev->delta_disks = 0;
1156 mddev->new_level = mddev->level;
1157 mddev->new_layout = mddev->layout;
1158 mddev->new_chunk = mddev->chunk_size;
1161 } else if (mddev->pers == NULL) {
1162 /* Insist on good event counter while assembling */
1163 ++ev1;
1164 if (ev1 < mddev->events)
1165 return -EINVAL;
1166 } else if (mddev->bitmap) {
1167 /* If adding to array with a bitmap, then we can accept an
1168 * older device, but not too old.
1169 */
1170 if (ev1 < mddev->bitmap->events_cleared)
1171 return 0;
1172 } else {
1173 if (ev1 < mddev->events)
1174 /* just a hot-add of a new device, leave raid_disk at -1 */
1175 return 0;
1177 if (mddev->level != LEVEL_MULTIPATH) {
1178 int role;
1179 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1180 switch(role) {
1181 case 0xffff: /* spare */
1182 break;
1183 case 0xfffe: /* faulty */
1184 set_bit(Faulty, &rdev->flags);
1185 break;
1186 default:
1187 if ((le32_to_cpu(sb->feature_map) &
1188 MD_FEATURE_RECOVERY_OFFSET))
1189 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1190 else
1191 set_bit(In_sync, &rdev->flags);
1192 rdev->raid_disk = role;
1193 break;
1195 if (sb->devflags & WriteMostly1)
1196 set_bit(WriteMostly, &rdev->flags);
1197 } else /* MULTIPATH are always insync */
1198 set_bit(In_sync, &rdev->flags);
1200 return 0;
1203 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1205 struct mdp_superblock_1 *sb;
1206 struct list_head *tmp;
1207 mdk_rdev_t *rdev2;
1208 int max_dev, i;
1209 /* make rdev->sb match mddev and rdev data. */
1211 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1213 sb->feature_map = 0;
1214 sb->pad0 = 0;
1215 sb->recovery_offset = cpu_to_le64(0);
1216 memset(sb->pad1, 0, sizeof(sb->pad1));
1217 memset(sb->pad2, 0, sizeof(sb->pad2));
1218 memset(sb->pad3, 0, sizeof(sb->pad3));
1220 sb->utime = cpu_to_le64((__u64)mddev->utime);
1221 sb->events = cpu_to_le64(mddev->events);
1222 if (mddev->in_sync)
1223 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1224 else
1225 sb->resync_offset = cpu_to_le64(0);
1227 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
1229 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1230 sb->size = cpu_to_le64(mddev->size<<1);
1232 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1233 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1234 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1237 if (rdev->raid_disk >= 0 &&
1238 !test_bit(In_sync, &rdev->flags) &&
1239 rdev->recovery_offset > 0) {
1240 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1241 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1244 if (mddev->reshape_position != MaxSector) {
1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1246 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1247 sb->new_layout = cpu_to_le32(mddev->new_layout);
1248 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1249 sb->new_level = cpu_to_le32(mddev->new_level);
1250 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1253 max_dev = 0;
1254 ITERATE_RDEV(mddev,rdev2,tmp)
1255 if (rdev2->desc_nr+1 > max_dev)
1256 max_dev = rdev2->desc_nr+1;
1258 sb->max_dev = cpu_to_le32(max_dev);
1259 for (i=0; i<max_dev;i++)
1260 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1262 ITERATE_RDEV(mddev,rdev2,tmp) {
1263 i = rdev2->desc_nr;
1264 if (test_bit(Faulty, &rdev2->flags))
1265 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1266 else if (test_bit(In_sync, &rdev2->flags))
1267 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1268 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1269 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1270 else
1271 sb->dev_roles[i] = cpu_to_le16(0xffff);
1274 sb->sb_csum = calc_sb_1_csum(sb);
1278 static struct super_type super_types[] = {
1279 [0] = {
1280 .name = "0.90.0",
1281 .owner = THIS_MODULE,
1282 .load_super = super_90_load,
1283 .validate_super = super_90_validate,
1284 .sync_super = super_90_sync,
1285 },
1286 [1] = {
1287 .name = "md-1",
1288 .owner = THIS_MODULE,
1289 .load_super = super_1_load,
1290 .validate_super = super_1_validate,
1291 .sync_super = super_1_sync,
1292 },
1293 };
1295 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
1297 struct list_head *tmp;
1298 mdk_rdev_t *rdev;
1300 ITERATE_RDEV(mddev,rdev,tmp)
1301 if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
1302 return rdev;
1304 return NULL;
1307 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1309 struct list_head *tmp;
1310 mdk_rdev_t *rdev;
1312 ITERATE_RDEV(mddev1,rdev,tmp)
1313 if (match_dev_unit(mddev2, rdev))
1314 return 1;
1316 return 0;
1319 static LIST_HEAD(pending_raid_disks);
1321 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1323 mdk_rdev_t *same_pdev;
1324 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1325 struct kobject *ko;
1326 char *s;
1328 if (rdev->mddev) {
1329 MD_BUG();
1330 return -EINVAL;
1332 /* make sure rdev->size exceeds mddev->size */
1333 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1334 if (mddev->pers)
1335 /* Cannot change size, so fail */
1336 return -ENOSPC;
1337 else
1338 mddev->size = rdev->size;
1340 same_pdev = match_dev_unit(mddev, rdev);
1341 if (same_pdev)
1342 printk(KERN_WARNING
1343 "%s: WARNING: %s appears to be on the same physical"
1344 " disk as %s. True\n protection against single-disk"
1345 " failure might be compromised.\n",
1346 mdname(mddev), bdevname(rdev->bdev,b),
1347 bdevname(same_pdev->bdev,b2));
1349 /* Verify rdev->desc_nr is unique.
1350 * If it is -1, assign a free number, else
1351 * check number is not in use
1352 */
1353 if (rdev->desc_nr < 0) {
1354 int choice = 0;
1355 if (mddev->pers) choice = mddev->raid_disks;
1356 while (find_rdev_nr(mddev, choice))
1357 choice++;
1358 rdev->desc_nr = choice;
1359 } else {
1360 if (find_rdev_nr(mddev, rdev->desc_nr))
1361 return -EBUSY;
1363 bdevname(rdev->bdev,b);
1364 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
1365 return -ENOMEM;
1366 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
1367 *s = '!';
1369 list_add(&rdev->same_set, &mddev->disks);
1370 rdev->mddev = mddev;
1371 printk(KERN_INFO "md: bind<%s>\n", b);
1373 rdev->kobj.parent = &mddev->kobj;
1374 kobject_add(&rdev->kobj);
1376 if (rdev->bdev->bd_part)
1377 ko = &rdev->bdev->bd_part->kobj;
1378 else
1379 ko = &rdev->bdev->bd_disk->kobj;
1380 sysfs_create_link(&rdev->kobj, ko, "block");
1381 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
1382 return 0;
1385 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1387 char b[BDEVNAME_SIZE];
1388 if (!rdev->mddev) {
1389 MD_BUG();
1390 return;
1392 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1393 list_del_init(&rdev->same_set);
1394 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1395 rdev->mddev = NULL;
1396 sysfs_remove_link(&rdev->kobj, "block");
1397 kobject_del(&rdev->kobj);
1400 /*
1401 * prevent the device from being mounted, repartitioned or
1402 * otherwise reused by a RAID array (or any other kernel
1403 * subsystem), by bd_claiming the device.
1404 */
1405 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1407 int err = 0;
1408 struct block_device *bdev;
1409 char b[BDEVNAME_SIZE];
1411 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1412 if (IS_ERR(bdev)) {
1413 printk(KERN_ERR "md: could not open %s.\n",
1414 __bdevname(dev, b));
1415 return PTR_ERR(bdev);
1417 err = bd_claim(bdev, rdev);
1418 if (err) {
1419 printk(KERN_ERR "md: could not bd_claim %s.\n",
1420 bdevname(bdev, b));
1421 blkdev_put_partition(bdev);
1422 return err;
1424 rdev->bdev = bdev;
1425 return err;
1428 static void unlock_rdev(mdk_rdev_t *rdev)
1430 struct block_device *bdev = rdev->bdev;
1431 rdev->bdev = NULL;
1432 if (!bdev)
1433 MD_BUG();
1434 bd_release(bdev);
1435 blkdev_put_partition(bdev);
1438 void md_autodetect_dev(dev_t dev);
1440 static void export_rdev(mdk_rdev_t * rdev)
1442 char b[BDEVNAME_SIZE];
1443 printk(KERN_INFO "md: export_rdev(%s)\n",
1444 bdevname(rdev->bdev,b));
1445 if (rdev->mddev)
1446 MD_BUG();
1447 free_disk_sb(rdev);
1448 list_del_init(&rdev->same_set);
1449 #ifndef MODULE
1450 md_autodetect_dev(rdev->bdev->bd_dev);
1451 #endif
1452 unlock_rdev(rdev);
1453 kobject_put(&rdev->kobj);
1456 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1458 unbind_rdev_from_array(rdev);
1459 export_rdev(rdev);
1462 static void export_array(mddev_t *mddev)
1464 struct list_head *tmp;
1465 mdk_rdev_t *rdev;
1467 ITERATE_RDEV(mddev,rdev,tmp) {
1468 if (!rdev->mddev) {
1469 MD_BUG();
1470 continue;
1472 kick_rdev_from_array(rdev);
1474 if (!list_empty(&mddev->disks))
1475 MD_BUG();
1476 mddev->raid_disks = 0;
1477 mddev->major_version = 0;
1480 static void print_desc(mdp_disk_t *desc)
1482 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1483 desc->major,desc->minor,desc->raid_disk,desc->state);
1486 static void print_sb(mdp_super_t *sb)
1488 int i;
1490 printk(KERN_INFO
1491 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1492 sb->major_version, sb->minor_version, sb->patch_version,
1493 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1494 sb->ctime);
1495 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1496 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1497 sb->md_minor, sb->layout, sb->chunk_size);
1498 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1499 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1500 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1501 sb->failed_disks, sb->spare_disks,
1502 sb->sb_csum, (unsigned long)sb->events_lo);
1504 printk(KERN_INFO);
1505 for (i = 0; i < MD_SB_DISKS; i++) {
1506 mdp_disk_t *desc;
1508 desc = sb->disks + i;
1509 if (desc->number || desc->major || desc->minor ||
1510 desc->raid_disk || (desc->state && (desc->state != 4))) {
1511 printk(" D %2d: ", i);
1512 print_desc(desc);
1515 printk(KERN_INFO "md: THIS: ");
1516 print_desc(&sb->this_disk);
1520 static void print_rdev(mdk_rdev_t *rdev)
1522 char b[BDEVNAME_SIZE];
1523 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1524 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1525 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1526 rdev->desc_nr);
1527 if (rdev->sb_loaded) {
1528 printk(KERN_INFO "md: rdev superblock:\n");
1529 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1530 } else
1531 printk(KERN_INFO "md: no rdev superblock!\n");
1534 static void md_print_devices(void)
1536 struct list_head *tmp, *tmp2;
1537 mdk_rdev_t *rdev;
1538 mddev_t *mddev;
1539 char b[BDEVNAME_SIZE];
1541 printk("\n");
1542 printk("md: **********************************\n");
1543 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1544 printk("md: **********************************\n");
1545 ITERATE_MDDEV(mddev,tmp) {
1547 if (mddev->bitmap)
1548 bitmap_print_sb(mddev->bitmap);
1549 else
1550 printk("%s: ", mdname(mddev));
1551 ITERATE_RDEV(mddev,rdev,tmp2)
1552 printk("<%s>", bdevname(rdev->bdev,b));
1553 printk("\n");
1555 ITERATE_RDEV(mddev,rdev,tmp2)
1556 print_rdev(rdev);
1558 printk("md: **********************************\n");
1559 printk("\n");
1563 static void sync_sbs(mddev_t * mddev, int nospares)
1565 /* Update each superblock (in-memory image), but
1566 * if we are allowed to, skip spares which already
1567 * have the right event counter, or have one earlier
1568 * (which would mean they aren't being marked as dirty
1569 * with the rest of the array)
1570 */
1571 mdk_rdev_t *rdev;
1572 struct list_head *tmp;
1574 ITERATE_RDEV(mddev,rdev,tmp) {
1575 if (rdev->sb_events == mddev->events ||
1576 (nospares &&
1577 rdev->raid_disk < 0 &&
1578 (rdev->sb_events&1)==0 &&
1579 rdev->sb_events+1 == mddev->events)) {
1580 /* Don't update this superblock */
1581 rdev->sb_loaded = 2;
1582 } else {
1583 super_types[mddev->major_version].
1584 sync_super(mddev, rdev);
1585 rdev->sb_loaded = 1;
1590 void md_update_sb(mddev_t * mddev)
1592 int err;
1593 struct list_head *tmp;
1594 mdk_rdev_t *rdev;
1595 int sync_req;
1596 int nospares = 0;
1598 repeat:
1599 spin_lock_irq(&mddev->write_lock);
1601 if (mddev->degraded && mddev->sb_dirty == 3)
1602 /* If the array is degraded, then skipping spares is both
1603 * dangerous and fairly pointless.
1604 * Dangerous because a device that was removed from the array
1605 * might have an event_count that still looks up-to-date,
1606 * so it can be re-added without a resync.
1607 * Pointless because if there are any spares to skip,
1608 * then a recovery will happen and soon that array won't
1609 * be degraded any more and the spare can go back to sleep then.
1610 */
1611 mddev->sb_dirty = 1;
1613 sync_req = mddev->in_sync;
1614 mddev->utime = get_seconds();
1615 if (mddev->sb_dirty == 3)
1616 /* just a clean<-> dirty transition, possibly leave spares alone,
1617 * though if events isn't the right even/odd, we will have to do
1618 * spares after all
1619 */
1620 nospares = 1;
1622 /* If this is just a dirty<->clean transition, and the array is clean
1623 * and 'events' is odd, we can roll back to the previous clean state */
1624 if (mddev->sb_dirty == 3
1625 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1626 && (mddev->events & 1))
1627 mddev->events--;
1628 else {
1629 /* otherwise we have to go forward and ... */
1630 mddev->events ++;
1631 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1632 /* .. if the array isn't clean, insist on an odd 'events' */
1633 if ((mddev->events&1)==0) {
1634 mddev->events++;
1635 nospares = 0;
1637 } else {
1638 /* otherwise insist on an even 'events' (for clean states) */
1639 if ((mddev->events&1)) {
1640 mddev->events++;
1641 nospares = 0;
1646 if (!mddev->events) {
1647 /*
1648 * oops, this 64-bit counter should never wrap.
1649 * Either we are in around ~1 trillion A.C., assuming
1650 * 1 reboot per second, or we have a bug:
1651 */
1652 MD_BUG();
1653 mddev->events --;
1655 mddev->sb_dirty = 2;
1656 sync_sbs(mddev, nospares);
1658 /*
1659 * do not write anything to disk if using
1660 * nonpersistent superblocks
1661 */
1662 if (!mddev->persistent) {
1663 mddev->sb_dirty = 0;
1664 spin_unlock_irq(&mddev->write_lock);
1665 wake_up(&mddev->sb_wait);
1666 return;
1668 spin_unlock_irq(&mddev->write_lock);
1670 dprintk(KERN_INFO
1671 "md: updating %s RAID superblock on device (in sync %d)\n",
1672 mdname(mddev),mddev->in_sync);
1674 err = bitmap_update_sb(mddev->bitmap);
1675 ITERATE_RDEV(mddev,rdev,tmp) {
1676 char b[BDEVNAME_SIZE];
1677 dprintk(KERN_INFO "md: ");
1678 if (rdev->sb_loaded != 1)
1679 continue; /* no noise on spare devices */
1680 if (test_bit(Faulty, &rdev->flags))
1681 dprintk("(skipping faulty ");
1683 dprintk("%s ", bdevname(rdev->bdev,b));
1684 if (!test_bit(Faulty, &rdev->flags)) {
1685 md_super_write(mddev,rdev,
1686 rdev->sb_offset<<1, rdev->sb_size,
1687 rdev->sb_page);
1688 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1689 bdevname(rdev->bdev,b),
1690 (unsigned long long)rdev->sb_offset);
1691 rdev->sb_events = mddev->events;
1693 } else
1694 dprintk(")\n");
1695 if (mddev->level == LEVEL_MULTIPATH)
1696 /* only need to write one superblock... */
1697 break;
1699 md_super_wait(mddev);
1700 /* if there was a failure, sb_dirty was set to 1, and we re-write super */
1702 spin_lock_irq(&mddev->write_lock);
1703 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
1704 /* have to write it out again */
1705 spin_unlock_irq(&mddev->write_lock);
1706 goto repeat;
1708 mddev->sb_dirty = 0;
1709 spin_unlock_irq(&mddev->write_lock);
1710 wake_up(&mddev->sb_wait);
1713 EXPORT_SYMBOL_GPL(md_update_sb);
1715 /* words written to sysfs files may, or may not, be \n terminated.
1716 * We want to accept either case. For this we use cmd_match.
1717 */
1718 static int cmd_match(const char *cmd, const char *str)
1720 /* See if cmd, written into a sysfs file, matches
1721 * str. They must either be the same, or cmd can
1722 * have a trailing newline
1723 */
1724 while (*cmd && *str && *cmd == *str) {
1725 cmd++;
1726 str++;
1728 if (*cmd == '\n')
1729 cmd++;
1730 if (*str || *cmd)
1731 return 0;
1732 return 1;
1735 struct rdev_sysfs_entry {
1736 struct attribute attr;
1737 ssize_t (*show)(mdk_rdev_t *, char *);
1738 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1739 };
1741 static ssize_t
1742 state_show(mdk_rdev_t *rdev, char *page)
1744 char *sep = "";
1745 int len=0;
1747 if (test_bit(Faulty, &rdev->flags)) {
1748 len+= sprintf(page+len, "%sfaulty",sep);
1749 sep = ",";
1751 if (test_bit(In_sync, &rdev->flags)) {
1752 len += sprintf(page+len, "%sin_sync",sep);
1753 sep = ",";
1755 if (test_bit(WriteMostly, &rdev->flags)) {
1756 len += sprintf(page+len, "%swrite_mostly",sep);
1757 sep = ",";
1759 if (!test_bit(Faulty, &rdev->flags) &&
1760 !test_bit(In_sync, &rdev->flags)) {
1761 len += sprintf(page+len, "%sspare", sep);
1762 sep = ",";
1764 return len+sprintf(page+len, "\n");
1767 static ssize_t
1768 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1770 /* can write
1771 * faulty - simulates an error
1772 * remove - disconnects the device
1773 * writemostly - sets write_mostly
1774 * -writemostly - clears write_mostly
1775 */
1776 int err = -EINVAL;
1777 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1778 md_error(rdev->mddev, rdev);
1779 err = 0;
1780 } else if (cmd_match(buf, "remove")) {
1781 if (rdev->raid_disk >= 0)
1782 err = -EBUSY;
1783 else {
1784 mddev_t *mddev = rdev->mddev;
1785 kick_rdev_from_array(rdev);
1786 md_update_sb(mddev);
1787 md_new_event(mddev);
1788 err = 0;
1790 } else if (cmd_match(buf, "writemostly")) {
1791 set_bit(WriteMostly, &rdev->flags);
1792 err = 0;
1793 } else if (cmd_match(buf, "-writemostly")) {
1794 clear_bit(WriteMostly, &rdev->flags);
1795 err = 0;
1797 return err ? err : len;
1799 static struct rdev_sysfs_entry rdev_state =
1800 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
1802 static ssize_t
1803 super_show(mdk_rdev_t *rdev, char *page)
1805 if (rdev->sb_loaded && rdev->sb_size) {
1806 memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1807 return rdev->sb_size;
1808 } else
1809 return 0;
1811 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1813 static ssize_t
1814 errors_show(mdk_rdev_t *rdev, char *page)
1816 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1819 static ssize_t
1820 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1822 char *e;
1823 unsigned long n = simple_strtoul(buf, &e, 10);
1824 if (*buf && (*e == 0 || *e == '\n')) {
1825 atomic_set(&rdev->corrected_errors, n);
1826 return len;
1828 return -EINVAL;
1830 static struct rdev_sysfs_entry rdev_errors =
1831 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1833 static ssize_t
1834 slot_show(mdk_rdev_t *rdev, char *page)
1836 if (rdev->raid_disk < 0)
1837 return sprintf(page, "none\n");
1838 else
1839 return sprintf(page, "%d\n", rdev->raid_disk);
1842 static ssize_t
1843 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1845 char *e;
1846 int slot = simple_strtoul(buf, &e, 10);
1847 if (strncmp(buf, "none", 4)==0)
1848 slot = -1;
1849 else if (e==buf || (*e && *e!= '\n'))
1850 return -EINVAL;
1851 if (rdev->mddev->pers)
1852 /* Cannot set slot in active array (yet) */
1853 return -EBUSY;
1854 if (slot >= rdev->mddev->raid_disks)
1855 return -ENOSPC;
1856 rdev->raid_disk = slot;
1857 /* assume it is working */
1858 rdev->flags = 0;
1859 set_bit(In_sync, &rdev->flags);
1860 return len;
1864 static struct rdev_sysfs_entry rdev_slot =
1865 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
1867 static ssize_t
1868 offset_show(mdk_rdev_t *rdev, char *page)
1870 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1873 static ssize_t
1874 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1876 char *e;
1877 unsigned long long offset = simple_strtoull(buf, &e, 10);
1878 if (e==buf || (*e && *e != '\n'))
1879 return -EINVAL;
1880 if (rdev->mddev->pers)
1881 return -EBUSY;
1882 rdev->data_offset = offset;
1883 return len;
1886 static struct rdev_sysfs_entry rdev_offset =
1887 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
1889 static ssize_t
1890 rdev_size_show(mdk_rdev_t *rdev, char *page)
1892 return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1895 static ssize_t
1896 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1898 char *e;
1899 unsigned long long size = simple_strtoull(buf, &e, 10);
1900 if (e==buf || (*e && *e != '\n'))
1901 return -EINVAL;
1902 if (rdev->mddev->pers)
1903 return -EBUSY;
1904 rdev->size = size;
1905 if (size < rdev->mddev->size || rdev->mddev->size == 0)
1906 rdev->mddev->size = size;
1907 return len;
1910 static struct rdev_sysfs_entry rdev_size =
1911 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
1913 static struct attribute *rdev_default_attrs[] = {
1914 &rdev_state.attr,
1915 &rdev_super.attr,
1916 &rdev_errors.attr,
1917 &rdev_slot.attr,
1918 &rdev_offset.attr,
1919 &rdev_size.attr,
1920 NULL,
1921 };
1922 static ssize_t
1923 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1925 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1926 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1928 if (!entry->show)
1929 return -EIO;
1930 return entry->show(rdev, page);
1933 static ssize_t
1934 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1935 const char *page, size_t length)
1937 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1938 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1940 if (!entry->store)
1941 return -EIO;
1942 if (!capable(CAP_SYS_ADMIN))
1943 return -EACCES;
1944 return entry->store(rdev, page, length);
1947 static void rdev_free(struct kobject *ko)
1949 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1950 kfree(rdev);
1952 static struct sysfs_ops rdev_sysfs_ops = {
1953 .show = rdev_attr_show,
1954 .store = rdev_attr_store,
1955 };
1956 static struct kobj_type rdev_ktype = {
1957 .release = rdev_free,
1958 .sysfs_ops = &rdev_sysfs_ops,
1959 .default_attrs = rdev_default_attrs,
1960 };
1962 /*
1963 * Import a device. If 'super_format' >= 0, then sanity check the superblock
1965 * mark the device faulty if:
1967 * - the device is nonexistent (zero size)
1968 * - the device has no valid superblock
1970 * a faulty rdev _never_ has rdev->sb set.
1971 */
1972 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1974 char b[BDEVNAME_SIZE];
1975 int err;
1976 mdk_rdev_t *rdev;
1977 sector_t size;
1979 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
1980 if (!rdev) {
1981 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1982 return ERR_PTR(-ENOMEM);
1985 if ((err = alloc_disk_sb(rdev)))
1986 goto abort_free;
1988 err = lock_rdev(rdev, newdev);
1989 if (err)
1990 goto abort_free;
1992 rdev->kobj.parent = NULL;
1993 rdev->kobj.ktype = &rdev_ktype;
1994 kobject_init(&rdev->kobj);
1996 rdev->desc_nr = -1;
1997 rdev->saved_raid_disk = -1;
1998 rdev->flags = 0;
1999 rdev->data_offset = 0;
2000 rdev->sb_events = 0;
2001 atomic_set(&rdev->nr_pending, 0);
2002 atomic_set(&rdev->read_errors, 0);
2003 atomic_set(&rdev->corrected_errors, 0);
2005 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2006 if (!size) {
2007 printk(KERN_WARNING
2008 "md: %s has zero or unknown size, marking faulty!\n",
2009 bdevname(rdev->bdev,b));
2010 err = -EINVAL;
2011 goto abort_free;
2014 if (super_format >= 0) {
2015 err = super_types[super_format].
2016 load_super(rdev, NULL, super_minor);
2017 if (err == -EINVAL) {
2018 printk(KERN_WARNING
2019 "md: %s has invalid sb, not importing!\n",
2020 bdevname(rdev->bdev,b));
2021 goto abort_free;
2023 if (err < 0) {
2024 printk(KERN_WARNING
2025 "md: could not read %s's sb, not importing!\n",
2026 bdevname(rdev->bdev,b));
2027 goto abort_free;
2030 INIT_LIST_HEAD(&rdev->same_set);
2032 return rdev;
2034 abort_free:
2035 if (rdev->sb_page) {
2036 if (rdev->bdev)
2037 unlock_rdev(rdev);
2038 free_disk_sb(rdev);
2040 kfree(rdev);
2041 return ERR_PTR(err);
2044 /*
2045 * Check a full RAID array for plausibility
2046 */
2049 static void analyze_sbs(mddev_t * mddev)
2051 int i;
2052 struct list_head *tmp;
2053 mdk_rdev_t *rdev, *freshest;
2054 char b[BDEVNAME_SIZE];
2056 freshest = NULL;
2057 ITERATE_RDEV(mddev,rdev,tmp)
2058 switch (super_types[mddev->major_version].
2059 load_super(rdev, freshest, mddev->minor_version)) {
2060 case 1:
2061 freshest = rdev;
2062 break;
2063 case 0:
2064 break;
2065 default:
2066 printk(KERN_ERR
2067 "md: fatal superblock inconsistency in %s"
2068 " -- removing from array\n",
2069 bdevname(rdev->bdev,b));
2070 kick_rdev_from_array(rdev);
2074 super_types[mddev->major_version].
2075 validate_super(mddev, freshest);
2077 i = 0;
2078 ITERATE_RDEV(mddev,rdev,tmp) {
2079 if (rdev != freshest)
2080 if (super_types[mddev->major_version].
2081 validate_super(mddev, rdev)) {
2082 printk(KERN_WARNING "md: kicking non-fresh %s"
2083 " from array!\n",
2084 bdevname(rdev->bdev,b));
2085 kick_rdev_from_array(rdev);
2086 continue;
2088 if (mddev->level == LEVEL_MULTIPATH) {
2089 rdev->desc_nr = i++;
2090 rdev->raid_disk = rdev->desc_nr;
2091 set_bit(In_sync, &rdev->flags);
2097 if (mddev->recovery_cp != MaxSector &&
2098 mddev->level >= 1)
2099 printk(KERN_ERR "md: %s: raid array is not clean"
2100 " -- starting background reconstruction\n",
2101 mdname(mddev));
2105 static ssize_t
2106 safe_delay_show(mddev_t *mddev, char *page)
2108 int msec = (mddev->safemode_delay*1000)/HZ;
2109 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2111 static ssize_t
2112 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2114 int scale=1;
2115 int dot=0;
2116 int i;
2117 unsigned long msec;
2118 char buf[30];
2119 char *e;
2120 /* remove a period, and count digits after it */
2121 if (len >= sizeof(buf))
2122 return -EINVAL;
2123 strlcpy(buf, cbuf, len);
2124 buf[len] = 0;
2125 for (i=0; i<len; i++) {
2126 if (dot) {
2127 if (isdigit(buf[i])) {
2128 buf[i-1] = buf[i];
2129 scale *= 10;
2131 buf[i] = 0;
2132 } else if (buf[i] == '.') {
2133 dot=1;
2134 buf[i] = 0;
2137 msec = simple_strtoul(buf, &e, 10);
2138 if (e == buf || (*e && *e != '\n'))
2139 return -EINVAL;
2140 msec = (msec * 1000) / scale;
2141 if (msec == 0)
2142 mddev->safemode_delay = 0;
2143 else {
2144 mddev->safemode_delay = (msec*HZ)/1000;
2145 if (mddev->safemode_delay == 0)
2146 mddev->safemode_delay = 1;
2148 return len;
2150 static struct md_sysfs_entry md_safe_delay =
2151 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
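/*
 * Example (illustrative): safe_mode_delay is written as seconds with an
 * optional fractional part, which the parser above converts to jiffies, e.g.
 *
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay    (about 200 msec)
 *   echo 0     > /sys/block/md0/md/safe_mode_delay    (disable the delay)
 *
 * ("md0" is a hypothetical array name.)
 */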
2153 static ssize_t
2154 level_show(mddev_t *mddev, char *page)
2156 struct mdk_personality *p = mddev->pers;
2157 if (p)
2158 return sprintf(page, "%s\n", p->name);
2159 else if (mddev->clevel[0])
2160 return sprintf(page, "%s\n", mddev->clevel);
2161 else if (mddev->level != LEVEL_NONE)
2162 return sprintf(page, "%d\n", mddev->level);
2163 else
2164 return 0;
2167 static ssize_t
2168 level_store(mddev_t *mddev, const char *buf, size_t len)
2170 int rv = len;
2171 if (mddev->pers)
2172 return -EBUSY;
2173 if (len == 0)
2174 return 0;
2175 if (len >= sizeof(mddev->clevel))
2176 return -ENOSPC;
2177 strncpy(mddev->clevel, buf, len);
2178 if (mddev->clevel[len-1] == '\n')
2179 len--;
2180 mddev->clevel[len] = 0;
2181 mddev->level = LEVEL_NONE;
2182 return rv;
2185 static struct md_sysfs_entry md_level =
2186 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2189 static ssize_t
2190 layout_show(mddev_t *mddev, char *page)
2192 /* just a number, not meaningful for all levels */
2193 return sprintf(page, "%d\n", mddev->layout);
2196 static ssize_t
2197 layout_store(mddev_t *mddev, const char *buf, size_t len)
2199 char *e;
2200 unsigned long n = simple_strtoul(buf, &e, 10);
2201 if (mddev->pers)
2202 return -EBUSY;
2204 if (!*buf || (*e && *e != '\n'))
2205 return -EINVAL;
2207 mddev->layout = n;
2208 return len;
2210 static struct md_sysfs_entry md_layout =
2211 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2214 static ssize_t
2215 raid_disks_show(mddev_t *mddev, char *page)
2217 if (mddev->raid_disks == 0)
2218 return 0;
2219 return sprintf(page, "%d\n", mddev->raid_disks);
2222 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2224 static ssize_t
2225 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2227 /* if the array is active, changing raid_disks triggers a reshape via update_raid_disks(); otherwise just record the new value */
2228 char *e;
2229 int rv = 0;
2230 unsigned long n = simple_strtoul(buf, &e, 10);
2232 if (!*buf || (*e && *e != '\n'))
2233 return -EINVAL;
2235 if (mddev->pers)
2236 rv = update_raid_disks(mddev, n);
2237 else
2238 mddev->raid_disks = n;
2239 return rv ? rv : len;
2241 static struct md_sysfs_entry md_raid_disks =
2242 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2244 static ssize_t
2245 chunk_size_show(mddev_t *mddev, char *page)
2247 return sprintf(page, "%d\n", mddev->chunk_size);
2250 static ssize_t
2251 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2253 /* can only set chunk_size if array is not yet active */
2254 char *e;
2255 unsigned long n = simple_strtoul(buf, &e, 10);
2257 if (mddev->pers)
2258 return -EBUSY;
2259 if (!*buf || (*e && *e != '\n'))
2260 return -EINVAL;
2262 mddev->chunk_size = n;
2263 return len;
2265 static struct md_sysfs_entry md_chunk_size =
2266 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
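/*
 * Example (illustrative): level, layout, raid_disks and chunk_size describe
 * the shape of an array and are normally written before it is started (most
 * of the stores above return -EBUSY once ->pers is set), e.g.
 *
 *   echo raid5 > /sys/block/md0/md/level
 *   echo 65536 > /sys/block/md0/md/chunk_size      (bytes)
 *   echo 3     > /sys/block/md0/md/raid_disks
 */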
2268 static ssize_t
2269 resync_start_show(mddev_t *mddev, char *page)
2271 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2274 static ssize_t
2275 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2277 /* can only set resync_start if array is not yet active */
2278 char *e;
2279 unsigned long long n = simple_strtoull(buf, &e, 10);
2281 if (mddev->pers)
2282 return -EBUSY;
2283 if (!*buf || (*e && *e != '\n'))
2284 return -EINVAL;
2286 mddev->recovery_cp = n;
2287 return len;
2289 static struct md_sysfs_entry md_resync_start =
2290 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2292 /*
2293 * The array state can be:
2295 * clear
2296 * No devices, no size, no level
2297 * Equivalent to STOP_ARRAY ioctl
2298 * inactive
2299 * May have some settings, but array is not active
2300 * all IO results in error
2301 * When written, doesn't tear down array, but just stops it
2302 * suspended (not supported yet)
2303 * All IO requests will block. The array can be reconfigured.
2304 * Writing this, if accepted, will block until array is quiescent
2305 * readonly
2306 * no resync can happen. no superblocks get written.
2307 * write requests fail
2308 * read-auto
2309 * like readonly, but behaves like 'clean' on a write request.
2311 * clean - no pending writes, but otherwise active.
2312 * When written to inactive array, starts without resync
2313 * If a write request arrives then
2314 * if metadata is known, mark 'dirty' and switch to 'active'.
2315 * if not known, block and switch to write-pending
2316 * If written to an active array that has pending writes, then fails.
2317 * active
2318 * fully active: IO and resync can be happening.
2319 * When written to inactive array, starts with resync
2321 * write-pending
2322 * clean, but writes are blocked waiting for 'active' to be written.
2324 * active-idle
2325 * like active, but no writes have been seen for a while (100msec).
2327 */
2328 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2329 write_pending, active_idle, bad_word};
2330 static char *array_states[] = {
2331 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2332 "write-pending", "active-idle", NULL };
2334 static int match_word(const char *word, char **list)
2336 int n;
2337 for (n=0; list[n]; n++)
2338 if (cmd_match(word, list[n]))
2339 break;
2340 return n;
2343 static ssize_t
2344 array_state_show(mddev_t *mddev, char *page)
2346 enum array_state st = inactive;
2348 if (mddev->pers)
2349 switch(mddev->ro) {
2350 case 1:
2351 st = readonly;
2352 break;
2353 case 2:
2354 st = read_auto;
2355 break;
2356 case 0:
2357 if (mddev->in_sync)
2358 st = clean;
2359 else if (mddev->safemode)
2360 st = active_idle;
2361 else
2362 st = active;
2364 else {
2365 if (list_empty(&mddev->disks) &&
2366 mddev->raid_disks == 0 &&
2367 mddev->size == 0)
2368 st = clear;
2369 else
2370 st = inactive;
2372 return sprintf(page, "%s\n", array_states[st]);
2375 static int do_md_stop(mddev_t * mddev, int ro);
2376 static int do_md_run(mddev_t * mddev);
2377 static int restart_array(mddev_t *mddev);
2379 static ssize_t
2380 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2382 int err = -EINVAL;
2383 enum array_state st = match_word(buf, array_states);
2384 switch(st) {
2385 case bad_word:
2386 break;
2387 case clear:
2388 /* stopping an active array */
2389 if (mddev->pers) {
2390 if (atomic_read(&mddev->active) > 1)
2391 return -EBUSY;
2392 err = do_md_stop(mddev, 0);
2394 break;
2395 case inactive:
2396 /* stopping an active array */
2397 if (mddev->pers) {
2398 if (atomic_read(&mddev->active) > 1)
2399 return -EBUSY;
2400 err = do_md_stop(mddev, 2);
2402 break;
2403 case suspended:
2404 break; /* not supported yet */
2405 case readonly:
2406 if (mddev->pers)
2407 err = do_md_stop(mddev, 1);
2408 else {
2409 mddev->ro = 1;
2410 err = do_md_run(mddev);
2412 break;
2413 case read_auto:
2414 /* stopping an active array */
2415 if (mddev->pers) {
2416 err = do_md_stop(mddev, 1);
2417 if (err == 0)
2418 mddev->ro = 2; /* FIXME mark devices writable */
2419 } else {
2420 mddev->ro = 2;
2421 err = do_md_run(mddev);
2423 break;
2424 case clean:
2425 if (mddev->pers) {
2426 restart_array(mddev);
2427 spin_lock_irq(&mddev->write_lock);
2428 if (atomic_read(&mddev->writes_pending) == 0) {
2429 mddev->in_sync = 1;
2430 mddev->sb_dirty = 1;
2432 spin_unlock_irq(&mddev->write_lock);
2433 } else {
2434 mddev->ro = 0;
2435 mddev->recovery_cp = MaxSector;
2436 err = do_md_run(mddev);
2438 break;
2439 case active:
2440 if (mddev->pers) {
2441 restart_array(mddev);
2442 mddev->sb_dirty = 0;
2443 wake_up(&mddev->sb_wait);
2444 err = 0;
2445 } else {
2446 mddev->ro = 0;
2447 err = do_md_run(mddev);
2449 break;
2450 case write_pending:
2451 case active_idle:
2452 /* these cannot be set */
2453 break;
2455 if (err)
2456 return err;
2457 else
2458 return len;
2460 static struct md_sysfs_entry md_array_state =
2461 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
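/*
 * Example (illustrative): array_state can be read to inspect the current
 * state and written to request the transitions described above, e.g.
 *
 *   cat /sys/block/md0/md/array_state
 *   echo readonly > /sys/block/md0/md/array_state   (stop writes)
 *   echo clean    > /sys/block/md0/md/array_state   (start an assembled,
 *                                                    inactive array without
 *                                                    a resync)
 */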
2463 static ssize_t
2464 null_show(mddev_t *mddev, char *page)
2466 return -EINVAL;
2469 static ssize_t
2470 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2472 /* buf must be "%d:%d" (with an optional trailing newline) giving major and minor numbers */
2473 /* The new device is added to the array.
2474 * If the array has a persistent superblock, we read the
2475 * superblock to initialise info and check validity.
2476 * Otherwise, only checking done is that in bind_rdev_to_array,
2477 * which mainly checks size.
2478 */
2479 char *e;
2480 int major = simple_strtoul(buf, &e, 10);
2481 int minor;
2482 dev_t dev;
2483 mdk_rdev_t *rdev;
2484 int err;
2486 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2487 return -EINVAL;
2488 minor = simple_strtoul(e+1, &e, 10);
2489 if (*e && *e != '\n')
2490 return -EINVAL;
2491 dev = MKDEV(major, minor);
2492 if (major != MAJOR(dev) ||
2493 minor != MINOR(dev))
2494 return -EOVERFLOW;
2497 if (mddev->persistent) {
2498 rdev = md_import_device(dev, mddev->major_version,
2499 mddev->minor_version);
2500 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2501 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2502 mdk_rdev_t, same_set);
2503 err = super_types[mddev->major_version]
2504 .load_super(rdev, rdev0, mddev->minor_version);
2505 if (err < 0)
2506 goto out;
2508 } else
2509 rdev = md_import_device(dev, -1, -1);
2511 if (IS_ERR(rdev))
2512 return PTR_ERR(rdev);
2513 err = bind_rdev_to_array(rdev, mddev);
2514 out:
2515 if (err)
2516 export_rdev(rdev);
2517 return err ? err : len;
2520 static struct md_sysfs_entry md_new_device =
2521 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
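/*
 * Example (illustrative): new_dev takes a "major:minor" pair naming the
 * block device to add, e.g. for /dev/sdb1 (8:17):
 *
 *   echo 8:17 > /sys/block/md0/md/new_dev
 */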
2523 static ssize_t
2524 size_show(mddev_t *mddev, char *page)
2526 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2529 static int update_size(mddev_t *mddev, unsigned long size);
2531 static ssize_t
2532 size_store(mddev_t *mddev, const char *buf, size_t len)
2534 /* If array is inactive, we can reduce the component size, but
2535 * not increase it (except from 0).
2536 * If array is active, we can try an on-line resize
2537 */
2538 char *e;
2539 int err = 0;
2540 unsigned long long size = simple_strtoull(buf, &e, 10);
2541 if (!*buf || *buf == '\n' ||
2542 (*e && *e != '\n'))
2543 return -EINVAL;
2545 if (mddev->pers) {
2546 err = update_size(mddev, size);
2547 md_update_sb(mddev);
2548 } else {
2549 if (mddev->size == 0 ||
2550 mddev->size > size)
2551 mddev->size = size;
2552 else
2553 err = -ENOSPC;
2555 return err ? err : len;
2558 static struct md_sysfs_entry md_size =
2559 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
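/*
 * Example (illustrative): component_size is the per-device size in KiB.
 * Writing it to an inactive array records the size to use; writing it to a
 * running array attempts an on-line resize, where 0 asks for the largest
 * size that fits (see update_size() later in this file), e.g.
 *
 *   echo 0 > /sys/block/md0/md/component_size
 */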
2562 /* Metadata version.
2563 * This is either 'none' for arrays with externally managed metadata,
2564 * or N.M for internally known formats
2565 */
2566 static ssize_t
2567 metadata_show(mddev_t *mddev, char *page)
2569 if (mddev->persistent)
2570 return sprintf(page, "%d.%d\n",
2571 mddev->major_version, mddev->minor_version);
2572 else
2573 return sprintf(page, "none\n");
2576 static ssize_t
2577 metadata_store(mddev_t *mddev, const char *buf, size_t len)
2579 int major, minor;
2580 char *e;
2581 if (!list_empty(&mddev->disks))
2582 return -EBUSY;
2584 if (cmd_match(buf, "none")) {
2585 mddev->persistent = 0;
2586 mddev->major_version = 0;
2587 mddev->minor_version = 90;
2588 return len;
2590 major = simple_strtoul(buf, &e, 10);
2591 if (e==buf || *e != '.')
2592 return -EINVAL;
2593 buf = e+1;
2594 minor = simple_strtoul(buf, &e, 10);
2595 if (e==buf || *e != '\n')
2596 return -EINVAL;
2597 if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2598 super_types[major].name == NULL)
2599 return -ENOENT;
2600 mddev->major_version = major;
2601 mddev->minor_version = minor;
2602 mddev->persistent = 1;
2603 return len;
2606 static struct md_sysfs_entry md_metadata =
2607 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
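/*
 * Example (illustrative): metadata_version is set while the array is still
 * empty (the store above refuses once devices are attached), e.g.
 *
 *   echo 0.90 > /sys/block/md0/md/metadata_version
 *   echo none > /sys/block/md0/md/metadata_version   (non-persistent /
 *                                                      externally managed)
 */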
2609 static ssize_t
2610 action_show(mddev_t *mddev, char *page)
2612 char *type = "idle";
2613 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2614 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
2615 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2616 type = "reshape";
2617 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2618 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2619 type = "resync";
2620 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2621 type = "check";
2622 else
2623 type = "repair";
2624 } else
2625 type = "recover";
2627 return sprintf(page, "%s\n", type);
2630 static ssize_t
2631 action_store(mddev_t *mddev, const char *page, size_t len)
2633 if (!mddev->pers || !mddev->pers->sync_request)
2634 return -EINVAL;
2636 if (cmd_match(page, "idle")) {
2637 if (mddev->sync_thread) {
2638 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2639 md_unregister_thread(mddev->sync_thread);
2640 mddev->sync_thread = NULL;
2641 mddev->recovery = 0;
2643 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2644 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2645 return -EBUSY;
2646 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2647 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2648 else if (cmd_match(page, "reshape")) {
2649 int err;
2650 if (mddev->pers->start_reshape == NULL)
2651 return -EINVAL;
2652 err = mddev->pers->start_reshape(mddev);
2653 if (err)
2654 return err;
2655 } else {
2656 if (cmd_match(page, "check"))
2657 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2658 else if (!cmd_match(page, "repair"))
2659 return -EINVAL;
2660 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2661 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2663 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2664 md_wakeup_thread(mddev->thread);
2665 return len;
2668 static ssize_t
2669 mismatch_cnt_show(mddev_t *mddev, char *page)
2671 return sprintf(page, "%llu\n",
2672 (unsigned long long) mddev->resync_mismatches);
2675 static struct md_sysfs_entry md_scan_mode =
2676 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2679 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
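/*
 * Example (illustrative): a consistency check of a redundant array is
 * typically driven through sync_action, with mismatch_cnt reporting the
 * result of the last check/repair pass, e.g.
 *
 *   echo check > /sys/block/md0/md/sync_action
 *   cat /sys/block/md0/md/mismatch_cnt
 */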
2681 static ssize_t
2682 sync_min_show(mddev_t *mddev, char *page)
2684 return sprintf(page, "%d (%s)\n", speed_min(mddev),
2685 mddev->sync_speed_min ? "local": "system");
2688 static ssize_t
2689 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2691 int min;
2692 char *e;
2693 if (strncmp(buf, "system", 6)==0) {
2694 mddev->sync_speed_min = 0;
2695 return len;
2697 min = simple_strtoul(buf, &e, 10);
2698 if (buf == e || (*e && *e != '\n') || min <= 0)
2699 return -EINVAL;
2700 mddev->sync_speed_min = min;
2701 return len;
2704 static struct md_sysfs_entry md_sync_min =
2705 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2707 static ssize_t
2708 sync_max_show(mddev_t *mddev, char *page)
2710 return sprintf(page, "%d (%s)\n", speed_max(mddev),
2711 mddev->sync_speed_max ? "local": "system");
2714 static ssize_t
2715 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2717 int max;
2718 char *e;
2719 if (strncmp(buf, "system", 6)==0) {
2720 mddev->sync_speed_max = 0;
2721 return len;
2723 max = simple_strtoul(buf, &e, 10);
2724 if (buf == e || (*e && *e != '\n') || max <= 0)
2725 return -EINVAL;
2726 mddev->sync_speed_max = max;
2727 return len;
2730 static struct md_sysfs_entry md_sync_max =
2731 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
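/*
 * Example (illustrative): the per-array resync speed limits are in KB/sec;
 * writing "system" reverts to the global defaults, e.g.
 *
 *   echo 50000  > /sys/block/md0/md/sync_speed_max
 *   echo system > /sys/block/md0/md/sync_speed_min
 */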
2734 static ssize_t
2735 sync_speed_show(mddev_t *mddev, char *page)
2737 unsigned long resync, dt, db;
2738 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
2739 dt = ((jiffies - mddev->resync_mark) / HZ);
2740 if (!dt) dt++;
2741 db = resync - (mddev->resync_mark_cnt);
2742 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2745 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
2747 static ssize_t
2748 sync_completed_show(mddev_t *mddev, char *page)
2750 unsigned long max_blocks, resync;
2752 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2753 max_blocks = mddev->resync_max_sectors;
2754 else
2755 max_blocks = mddev->size << 1;
2757 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2758 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2761 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
2763 static ssize_t
2764 suspend_lo_show(mddev_t *mddev, char *page)
2766 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2769 static ssize_t
2770 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2772 char *e;
2773 unsigned long long new = simple_strtoull(buf, &e, 10);
2775 if (mddev->pers->quiesce == NULL)
2776 return -EINVAL;
2777 if (buf == e || (*e && *e != '\n'))
2778 return -EINVAL;
2779 if (new >= mddev->suspend_hi ||
2780 (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2781 mddev->suspend_lo = new;
2782 mddev->pers->quiesce(mddev, 2);
2783 return len;
2784 } else
2785 return -EINVAL;
2787 static struct md_sysfs_entry md_suspend_lo =
2788 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2791 static ssize_t
2792 suspend_hi_show(mddev_t *mddev, char *page)
2794 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2797 static ssize_t
2798 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2800 char *e;
2801 unsigned long long new = simple_strtoull(buf, &e, 10);
2803 if (mddev->pers->quiesce == NULL)
2804 return -EINVAL;
2805 if (buf == e || (*e && *e != '\n'))
2806 return -EINVAL;
2807 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2808 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2809 mddev->suspend_hi = new;
2810 mddev->pers->quiesce(mddev, 1);
2811 mddev->pers->quiesce(mddev, 0);
2812 return len;
2813 } else
2814 return -EINVAL;
2816 static struct md_sysfs_entry md_suspend_hi =
2817 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
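/*
 * Example (illustrative): suspend_lo/suspend_hi give a sector range of the
 * array in which the personality temporarily holds off IO; user-space tools
 * can use such a window, e.g. while backing up the critical section of a
 * reshape:
 *
 *   echo 0    > /sys/block/md0/md/suspend_lo
 *   echo 2048 > /sys/block/md0/md/suspend_hi
 */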
2820 static struct attribute *md_default_attrs[] = {
2821 &md_level.attr,
2822 &md_layout.attr,
2823 &md_raid_disks.attr,
2824 &md_chunk_size.attr,
2825 &md_size.attr,
2826 &md_resync_start.attr,
2827 &md_metadata.attr,
2828 &md_new_device.attr,
2829 &md_safe_delay.attr,
2830 &md_array_state.attr,
2831 NULL,
2832 };
2834 static struct attribute *md_redundancy_attrs[] = {
2835 &md_scan_mode.attr,
2836 &md_mismatches.attr,
2837 &md_sync_min.attr,
2838 &md_sync_max.attr,
2839 &md_sync_speed.attr,
2840 &md_sync_completed.attr,
2841 &md_suspend_lo.attr,
2842 &md_suspend_hi.attr,
2843 NULL,
2844 };
2845 static struct attribute_group md_redundancy_group = {
2846 .name = NULL,
2847 .attrs = md_redundancy_attrs,
2848 };
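/*
 * Note: this group is only added to an array's sysfs directory when the
 * personality provides a sync_request method (see do_md_run() below), so
 * levels without redundancy such as raid0 or linear do not expose
 * sync_action and friends.
 */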
2851 static ssize_t
2852 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2854 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2855 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2856 ssize_t rv;
2858 if (!entry->show)
2859 return -EIO;
2860 rv = mddev_lock(mddev);
2861 if (!rv) {
2862 rv = entry->show(mddev, page);
2863 mddev_unlock(mddev);
2865 return rv;
2868 static ssize_t
2869 md_attr_store(struct kobject *kobj, struct attribute *attr,
2870 const char *page, size_t length)
2872 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2873 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2874 ssize_t rv;
2876 if (!entry->store)
2877 return -EIO;
2878 if (!capable(CAP_SYS_ADMIN))
2879 return -EACCES;
2880 rv = mddev_lock(mddev);
2881 if (!rv) {
2882 rv = entry->store(mddev, page, length);
2883 mddev_unlock(mddev);
2885 return rv;
2888 static void md_free(struct kobject *ko)
2890 mddev_t *mddev = container_of(ko, mddev_t, kobj);
2891 kfree(mddev);
2894 static struct sysfs_ops md_sysfs_ops = {
2895 .show = md_attr_show,
2896 .store = md_attr_store,
2897 };
2898 static struct kobj_type md_ktype = {
2899 .release = md_free,
2900 .sysfs_ops = &md_sysfs_ops,
2901 .default_attrs = md_default_attrs,
2902 };
2904 int mdp_major = 0;
2906 static struct kobject *md_probe(dev_t dev, int *part, void *data)
2908 static DEFINE_MUTEX(disks_mutex);
2909 mddev_t *mddev = mddev_find(dev);
2910 struct gendisk *disk;
2911 int partitioned = (MAJOR(dev) != MD_MAJOR);
2912 int shift = partitioned ? MdpMinorShift : 0;
2913 int unit = MINOR(dev) >> shift;
2915 if (!mddev)
2916 return NULL;
2918 mutex_lock(&disks_mutex);
2919 if (mddev->gendisk) {
2920 mutex_unlock(&disks_mutex);
2921 mddev_put(mddev);
2922 return NULL;
2924 disk = alloc_disk(1 << shift);
2925 if (!disk) {
2926 mutex_unlock(&disks_mutex);
2927 mddev_put(mddev);
2928 return NULL;
2930 disk->major = MAJOR(dev);
2931 disk->first_minor = unit << shift;
2932 if (partitioned)
2933 sprintf(disk->disk_name, "md_d%d", unit);
2934 else
2935 sprintf(disk->disk_name, "md%d", unit);
2936 disk->fops = &md_fops;
2937 disk->private_data = mddev;
2938 disk->queue = mddev->queue;
2939 add_disk(disk);
2940 mddev->gendisk = disk;
2941 mutex_unlock(&disks_mutex);
2942 mddev->kobj.parent = &disk->kobj;
2943 mddev->kobj.k_name = NULL;
2944 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
2945 mddev->kobj.ktype = &md_ktype;
2946 kobject_register(&mddev->kobj);
2947 return NULL;
2950 static void md_safemode_timeout(unsigned long data)
2952 mddev_t *mddev = (mddev_t *) data;
2954 mddev->safemode = 1;
2955 md_wakeup_thread(mddev->thread);
2958 static int start_dirty_degraded;
2960 static int do_md_run(mddev_t * mddev)
2962 int err;
2963 int chunk_size;
2964 struct list_head *tmp;
2965 mdk_rdev_t *rdev;
2966 struct gendisk *disk;
2967 struct mdk_personality *pers;
2968 char b[BDEVNAME_SIZE];
2970 if (list_empty(&mddev->disks))
2971 /* cannot run an array with no devices.. */
2972 return -EINVAL;
2974 if (mddev->pers)
2975 return -EBUSY;
2977 /*
2978 * Analyze all RAID superblock(s)
2979 */
2980 if (!mddev->raid_disks)
2981 analyze_sbs(mddev);
2983 chunk_size = mddev->chunk_size;
2985 if (chunk_size) {
2986 if (chunk_size > MAX_CHUNK_SIZE) {
2987 printk(KERN_ERR "too big chunk_size: %d > %d\n",
2988 chunk_size, MAX_CHUNK_SIZE);
2989 return -EINVAL;
2991 /*
2992 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
2993 */
2994 if ( (1 << ffz(~chunk_size)) != chunk_size) {
2995 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
2996 return -EINVAL;
2998 if (chunk_size < PAGE_SIZE) {
2999 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
3000 chunk_size, PAGE_SIZE);
3001 return -EINVAL;
3004 /* devices must have minimum size of one chunk */
3005 ITERATE_RDEV(mddev,rdev,tmp) {
3006 if (test_bit(Faulty, &rdev->flags))
3007 continue;
3008 if (rdev->size < chunk_size / 1024) {
3009 printk(KERN_WARNING
3010 "md: Dev %s smaller than chunk_size:"
3011 " %lluk < %dk\n",
3012 bdevname(rdev->bdev,b),
3013 (unsigned long long)rdev->size,
3014 chunk_size / 1024);
3015 return -EINVAL;
3020 #ifdef CONFIG_KMOD
3021 if (mddev->level != LEVEL_NONE)
3022 request_module("md-level-%d", mddev->level);
3023 else if (mddev->clevel[0])
3024 request_module("md-%s", mddev->clevel);
3025 #endif
3027 /*
3028 * Drop all container device buffers, from now on
3029 * the only valid external interface is through the md
3030 * device.
3031 * Also find largest hardsector size
3032 */
3033 ITERATE_RDEV(mddev,rdev,tmp) {
3034 if (test_bit(Faulty, &rdev->flags))
3035 continue;
3036 sync_blockdev(rdev->bdev);
3037 invalidate_bdev(rdev->bdev, 0);
3040 md_probe(mddev->unit, NULL, NULL);
3041 disk = mddev->gendisk;
3042 if (!disk)
3043 return -ENOMEM;
3045 spin_lock(&pers_lock);
3046 pers = find_pers(mddev->level, mddev->clevel);
3047 if (!pers || !try_module_get(pers->owner)) {
3048 spin_unlock(&pers_lock);
3049 if (mddev->level != LEVEL_NONE)
3050 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3051 mddev->level);
3052 else
3053 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3054 mddev->clevel);
3055 return -EINVAL;
3057 mddev->pers = pers;
3058 spin_unlock(&pers_lock);
3059 mddev->level = pers->level;
3060 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3062 if (mddev->reshape_position != MaxSector &&
3063 pers->start_reshape == NULL) {
3064 /* This personality cannot handle reshaping... */
3065 mddev->pers = NULL;
3066 module_put(pers->owner);
3067 return -EINVAL;
3070 mddev->recovery = 0;
3071 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
3072 mddev->barriers_work = 1;
3073 mddev->ok_start_degraded = start_dirty_degraded;
3075 if (start_readonly)
3076 mddev->ro = 2; /* read-only, but switch on first write */
3078 err = mddev->pers->run(mddev);
3079 if (!err && mddev->pers->sync_request) {
3080 err = bitmap_create(mddev);
3081 if (err) {
3082 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3083 mdname(mddev), err);
3084 mddev->pers->stop(mddev);
3087 if (err) {
3088 printk(KERN_ERR "md: pers->run() failed ...\n");
3089 module_put(mddev->pers->owner);
3090 mddev->pers = NULL;
3091 bitmap_destroy(mddev);
3092 return err;
3094 if (mddev->pers->sync_request)
3095 sysfs_create_group(&mddev->kobj, &md_redundancy_group);
3096 else if (mddev->ro == 2) /* auto-readonly not meaningful */
3097 mddev->ro = 0;
3099 atomic_set(&mddev->writes_pending,0);
3100 mddev->safemode = 0;
3101 mddev->safemode_timer.function = md_safemode_timeout;
3102 mddev->safemode_timer.data = (unsigned long) mddev;
3103 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3104 mddev->in_sync = 1;
3106 ITERATE_RDEV(mddev,rdev,tmp)
3107 if (rdev->raid_disk >= 0) {
3108 char nm[20];
3109 sprintf(nm, "rd%d", rdev->raid_disk);
3110 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
3113 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3115 if (mddev->sb_dirty)
3116 md_update_sb(mddev);
3118 set_capacity(disk, mddev->array_size<<1);
3120 /* If we call blk_queue_make_request here, it will
3121 * re-initialise max_sectors etc which may have been
3122 * refined inside ->run(). So just set the bits we need to set.
3123 * Most initialisation happened when we called
3124 * blk_queue_make_request(..., md_fail_request)
3125 * earlier.
3126 */
3127 mddev->queue->queuedata = mddev;
3128 mddev->queue->make_request_fn = mddev->pers->make_request;
3130 /* If there is a partially-recovered drive we need to
3131 * start recovery here. If we leave it to md_check_recovery,
3132 * it will remove the drives and not do the right thing
3133 */
3134 if (mddev->degraded && !mddev->sync_thread) {
3135 struct list_head *rtmp;
3136 int spares = 0;
3137 ITERATE_RDEV(mddev,rdev,rtmp)
3138 if (rdev->raid_disk >= 0 &&
3139 !test_bit(In_sync, &rdev->flags) &&
3140 !test_bit(Faulty, &rdev->flags))
3141 /* complete an interrupted recovery */
3142 spares++;
3143 if (spares && mddev->pers->sync_request) {
3144 mddev->recovery = 0;
3145 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3146 mddev->sync_thread = md_register_thread(md_do_sync,
3147 mddev,
3148 "%s_resync");
3149 if (!mddev->sync_thread) {
3150 printk(KERN_ERR "%s: could not start resync"
3151 " thread...\n",
3152 mdname(mddev));
3153 /* leave the spares where they are, it shouldn't hurt */
3154 mddev->recovery = 0;
3158 md_wakeup_thread(mddev->thread);
3159 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3161 mddev->changed = 1;
3162 md_new_event(mddev);
3163 return 0;
3166 static int restart_array(mddev_t *mddev)
3168 struct gendisk *disk = mddev->gendisk;
3169 int err;
3171 /*
3172 * Complain if it has no devices
3173 */
3174 err = -ENXIO;
3175 if (list_empty(&mddev->disks))
3176 goto out;
3178 if (mddev->pers) {
3179 err = -EBUSY;
3180 if (!mddev->ro)
3181 goto out;
3183 mddev->safemode = 0;
3184 mddev->ro = 0;
3185 set_disk_ro(disk, 0);
3187 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3188 mdname(mddev));
3189 /*
3190 * Kick recovery or resync if necessary
3191 */
3192 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3193 md_wakeup_thread(mddev->thread);
3194 md_wakeup_thread(mddev->sync_thread);
3195 err = 0;
3196 } else
3197 err = -EINVAL;
3199 out:
3200 return err;
3203 /* similar to deny_write_access, but accounts for our holding a reference
3204 * to the file ourselves */
3205 static int deny_bitmap_write_access(struct file * file)
3207 struct inode *inode = file->f_mapping->host;
3209 spin_lock(&inode->i_lock);
3210 if (atomic_read(&inode->i_writecount) > 1) {
3211 spin_unlock(&inode->i_lock);
3212 return -ETXTBSY;
3214 atomic_set(&inode->i_writecount, -1);
3215 spin_unlock(&inode->i_lock);
3217 return 0;
3220 static void restore_bitmap_write_access(struct file *file)
3222 struct inode *inode = file->f_mapping->host;
3224 spin_lock(&inode->i_lock);
3225 atomic_set(&inode->i_writecount, 1);
3226 spin_unlock(&inode->i_lock);
3229 /* mode:
3230 * 0 - completely stop and disassemble array
3231 * 1 - switch to readonly
3232 * 2 - stop but do not disassemble array
3233 */
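/*
 * Note: these modes match what array_state_store() above requests: writing
 * "clear" uses mode 0, "readonly" uses mode 1 and "inactive" uses mode 2;
 * the STOP_ARRAY ioctls are handled with the same helper later in this file.
 */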
3234 static int do_md_stop(mddev_t * mddev, int mode)
3236 int err = 0;
3237 struct gendisk *disk = mddev->gendisk;
3239 if (mddev->pers) {
3240 if (atomic_read(&mddev->active)>2) {
3241 printk("md: %s still in use.\n",mdname(mddev));
3242 return -EBUSY;
3245 if (mddev->sync_thread) {
3246 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3247 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3248 md_unregister_thread(mddev->sync_thread);
3249 mddev->sync_thread = NULL;
3252 del_timer_sync(&mddev->safemode_timer);
3254 invalidate_partition(disk, 0);
3256 switch(mode) {
3257 case 1: /* readonly */
3258 err = -ENXIO;
3259 if (mddev->ro==1)
3260 goto out;
3261 mddev->ro = 1;
3262 break;
3263 case 0: /* disassemble */
3264 case 2: /* stop */
3265 bitmap_flush(mddev);
3266 md_super_wait(mddev);
3267 if (mddev->ro)
3268 set_disk_ro(disk, 0);
3269 blk_queue_make_request(mddev->queue, md_fail_request);
3270 mddev->pers->stop(mddev);
3271 if (mddev->pers->sync_request)
3272 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3274 module_put(mddev->pers->owner);
3275 mddev->pers = NULL;
3276 if (mddev->ro)
3277 mddev->ro = 0;
3279 if (!mddev->in_sync || mddev->sb_dirty) {
3280 /* mark array as shutdown cleanly */
3281 mddev->in_sync = 1;
3282 md_update_sb(mddev);
3284 if (mode == 1)
3285 set_disk_ro(disk, 1);
3286 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3289 /*
3290 * Free resources if final stop
3291 */
3292 if (mode == 0) {
3293 mdk_rdev_t *rdev;
3294 struct list_head *tmp;
3295 struct gendisk *disk;
3296 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3298 bitmap_destroy(mddev);
3299 if (mddev->bitmap_file) {
3300 restore_bitmap_write_access(mddev->bitmap_file);
3301 fput(mddev->bitmap_file);
3302 mddev->bitmap_file = NULL;
3304 mddev->bitmap_offset = 0;
3306 ITERATE_RDEV(mddev,rdev,tmp)
3307 if (rdev->raid_disk >= 0) {
3308 char nm[20];
3309 sprintf(nm, "rd%d", rdev->raid_disk);
3310 sysfs_remove_link(&mddev->kobj, nm);
3313 export_array(mddev);
3315 mddev->array_size = 0;
3316 mddev->size = 0;
3317 mddev->raid_disks = 0;
3318 mddev->recovery_cp = 0;
3320 disk = mddev->gendisk;
3321 if (disk)
3322 set_capacity(disk, 0);
3323 mddev->changed = 1;
3324 } else if (mddev->pers)
3325 printk(KERN_INFO "md: %s switched to read-only mode.\n",
3326 mdname(mddev));
3327 err = 0;
3328 md_new_event(mddev);
3329 out:
3330 return err;
3333 static void autorun_array(mddev_t *mddev)
3335 mdk_rdev_t *rdev;
3336 struct list_head *tmp;
3337 int err;
3339 if (list_empty(&mddev->disks))
3340 return;
3342 printk(KERN_INFO "md: running: ");
3344 ITERATE_RDEV(mddev,rdev,tmp) {
3345 char b[BDEVNAME_SIZE];
3346 printk("<%s>", bdevname(rdev->bdev,b));
3348 printk("\n");
3350 err = do_md_run (mddev);
3351 if (err) {
3352 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3353 do_md_stop (mddev, 0);
3357 /*
3358 * let's try to run arrays based on all disks that have arrived
3359 * until now. (those are in pending_raid_disks)
3361 * the method: pick the first pending disk, collect all disks with
3362 * the same UUID, remove all from the pending list and put them into
3363 * the 'same_array' list. Then order this list based on superblock
3364 * update time (freshest comes first), kick out 'old' disks and
3365 * compare superblocks. If everything's fine then run it.
3367 * If "unit" is allocated, then bump its reference count
3368 */
3369 static void autorun_devices(int part)
3371 struct list_head *tmp;
3372 mdk_rdev_t *rdev0, *rdev;
3373 mddev_t *mddev;
3374 char b[BDEVNAME_SIZE];
3376 printk(KERN_INFO "md: autorun ...\n");
3377 while (!list_empty(&pending_raid_disks)) {
3378 dev_t dev;
3379 LIST_HEAD(candidates);
3380 rdev0 = list_entry(pending_raid_disks.next,
3381 mdk_rdev_t, same_set);
3383 printk(KERN_INFO "md: considering %s ...\n",
3384 bdevname(rdev0->bdev,b));
3385 INIT_LIST_HEAD(&candidates);
3386 ITERATE_RDEV_PENDING(rdev,tmp)
3387 if (super_90_load(rdev, rdev0, 0) >= 0) {
3388 printk(KERN_INFO "md: adding %s ...\n",
3389 bdevname(rdev->bdev,b));
3390 list_move(&rdev->same_set, &candidates);
3392 /*
3393 * now we have a set of devices, with all of them having
3394 * mostly sane superblocks. It's time to allocate the
3395 * mddev.
3396 */
3397 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
3398 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3399 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3400 break;
3402 if (part)
3403 dev = MKDEV(mdp_major,
3404 rdev0->preferred_minor << MdpMinorShift);
3405 else
3406 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3408 md_probe(dev, NULL, NULL);
3409 mddev = mddev_find(dev);
3410 if (!mddev) {
3411 printk(KERN_ERR
3412 "md: cannot allocate memory for md drive.\n");
3413 break;
3415 if (mddev_lock(mddev))
3416 printk(KERN_WARNING "md: %s locked, cannot run\n",
3417 mdname(mddev));
3418 else if (mddev->raid_disks || mddev->major_version
3419 || !list_empty(&mddev->disks)) {
3420 printk(KERN_WARNING
3421 "md: %s already running, cannot run %s\n",
3422 mdname(mddev), bdevname(rdev0->bdev,b));
3423 mddev_unlock(mddev);
3424 } else {
3425 printk(KERN_INFO "md: created %s\n", mdname(mddev));
3426 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
3427 list_del_init(&rdev->same_set);
3428 if (bind_rdev_to_array(rdev, mddev))
3429 export_rdev(rdev);
3431 autorun_array(mddev);
3432 mddev_unlock(mddev);
3434 /* on success, candidates will be empty, on error
3435 * it won't...
3436 */
3437 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
3438 export_rdev(rdev);
3439 mddev_put(mddev);
3441 printk(KERN_INFO "md: ... autorun DONE.\n");
3444 /*
3445 * import RAID devices based on one partition
3446 * if possible, the array gets run as well.
3447 */
3449 static int autostart_array(dev_t startdev)
3451 char b[BDEVNAME_SIZE];
3452 int err = -EINVAL, i;
3453 mdp_super_t *sb = NULL;
3454 mdk_rdev_t *start_rdev = NULL, *rdev;
3456 start_rdev = md_import_device(startdev, 0, 0);
3457 if (IS_ERR(start_rdev))
3458 return err;
3461 /* NOTE: this can only work for 0.90.0 superblocks */
3462 sb = (mdp_super_t*)page_address(start_rdev->sb_page);
3463 if (sb->major_version != 0 ||
3464 sb->minor_version != 90 ) {
3465 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
3466 export_rdev(start_rdev);
3467 return err;
3470 if (test_bit(Faulty, &start_rdev->flags)) {
3471 printk(KERN_WARNING
3472 "md: can not autostart based on faulty %s!\n",
3473 bdevname(start_rdev->bdev,b));
3474 export_rdev(start_rdev);
3475 return err;
3477 list_add(&start_rdev->same_set, &pending_raid_disks);
3479 for (i = 0; i < MD_SB_DISKS; i++) {
3480 mdp_disk_t *desc = sb->disks + i;
3481 dev_t dev = MKDEV(desc->major, desc->minor);
3483 if (!dev)
3484 continue;
3485 if (dev == startdev)
3486 continue;
3487 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
3488 continue;
3489 rdev = md_import_device(dev, 0, 0);
3490 if (IS_ERR(rdev))
3491 continue;
3493 list_add(&rdev->same_set, &pending_raid_disks);
3496 /*
3497 * possibly return codes
3498 */
3499 autorun_devices(0);
3500 return 0;
3505 static int get_version(void __user * arg)
3507 mdu_version_t ver;
3509 ver.major = MD_MAJOR_VERSION;
3510 ver.minor = MD_MINOR_VERSION;
3511 ver.patchlevel = MD_PATCHLEVEL_VERSION;
3513 if (copy_to_user(arg, &ver, sizeof(ver)))
3514 return -EFAULT;
3516 return 0;
3519 static int get_array_info(mddev_t * mddev, void __user * arg)
3521 mdu_array_info_t info;
3522 int nr,working,active,failed,spare;
3523 mdk_rdev_t *rdev;
3524 struct list_head *tmp;
3526 nr=working=active=failed=spare=0;
3527 ITERATE_RDEV(mddev,rdev,tmp) {
3528 nr++;
3529 if (test_bit(Faulty, &rdev->flags))
3530 failed++;
3531 else {
3532 working++;
3533 if (test_bit(In_sync, &rdev->flags))
3534 active++;
3535 else
3536 spare++;
3540 info.major_version = mddev->major_version;
3541 info.minor_version = mddev->minor_version;
3542 info.patch_version = MD_PATCHLEVEL_VERSION;
3543 info.ctime = mddev->ctime;
3544 info.level = mddev->level;
3545 info.size = mddev->size;
3546 if (info.size != mddev->size) /* overflow */
3547 info.size = -1;
3548 info.nr_disks = nr;
3549 info.raid_disks = mddev->raid_disks;
3550 info.md_minor = mddev->md_minor;
3551 info.not_persistent= !mddev->persistent;
3553 info.utime = mddev->utime;
3554 info.state = 0;
3555 if (mddev->in_sync)
3556 info.state = (1<<MD_SB_CLEAN);
3557 if (mddev->bitmap && mddev->bitmap_offset)
3558 info.state = (1<<MD_SB_BITMAP_PRESENT);
3559 info.active_disks = active;
3560 info.working_disks = working;
3561 info.failed_disks = failed;
3562 info.spare_disks = spare;
3564 info.layout = mddev->layout;
3565 info.chunk_size = mddev->chunk_size;
3567 if (copy_to_user(arg, &info, sizeof(info)))
3568 return -EFAULT;
3570 return 0;
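/*
 * Illustrative user-space sketch (assumes the standard md ioctl headers and
 * a hypothetical /dev/md0 node; error handling omitted):
 *
 *   #include <stdio.h>
 *   #include <fcntl.h>
 *   #include <sys/ioctl.h>
 *   #include <linux/major.h>
 *   #include <linux/raid/md_u.h>
 *
 *   int main(void)
 *   {
 *           mdu_array_info_t info;
 *           int fd = open("/dev/md0", O_RDONLY);
 *
 *           if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *                   printf("level %d, %d raid disks, %d active\n",
 *                          info.level, info.raid_disks, info.active_disks);
 *           return 0;
 *   }
 */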
3573 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3575 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
3576 char *ptr, *buf = NULL;
3577 int err = -ENOMEM;
3579 file = kmalloc(sizeof(*file), GFP_KERNEL);
3580 if (!file)
3581 goto out;
3583 /* bitmap disabled, zero the first byte and copy out */
3584 if (!mddev->bitmap || !mddev->bitmap->file) {
3585 file->pathname[0] = '\0';
3586 goto copy_out;
3589 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3590 if (!buf)
3591 goto out;
3593 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3594 if (!ptr)
3595 goto out;
3597 strcpy(file->pathname, ptr);
3599 copy_out:
3600 err = 0;
3601 if (copy_to_user(arg, file, sizeof(*file)))
3602 err = -EFAULT;
3603 out:
3604 kfree(buf);
3605 kfree(file);
3606 return err;
3609 static int get_disk_info(mddev_t * mddev, void __user * arg)
3611 mdu_disk_info_t info;
3612 unsigned int nr;
3613 mdk_rdev_t *rdev;
3615 if (copy_from_user(&info, arg, sizeof(info)))
3616 return -EFAULT;
3618 nr = info.number;
3620 rdev = find_rdev_nr(mddev, nr);
3621 if (rdev) {
3622 info.major = MAJOR(rdev->bdev->bd_dev);
3623 info.minor = MINOR(rdev->bdev->bd_dev);
3624 info.raid_disk = rdev->raid_disk;
3625 info.state = 0;
3626 if (test_bit(Faulty, &rdev->flags))
3627 info.state |= (1<<MD_DISK_FAULTY);
3628 else if (test_bit(In_sync, &rdev->flags)) {
3629 info.state |= (1<<MD_DISK_ACTIVE);
3630 info.state |= (1<<MD_DISK_SYNC);
3632 if (test_bit(WriteMostly, &rdev->flags))
3633 info.state |= (1<<MD_DISK_WRITEMOSTLY);
3634 } else {
3635 info.major = info.minor = 0;
3636 info.raid_disk = -1;
3637 info.state = (1<<MD_DISK_REMOVED);
3640 if (copy_to_user(arg, &info, sizeof(info)))
3641 return -EFAULT;
3643 return 0;
3646 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3648 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3649 mdk_rdev_t *rdev;
3650 dev_t dev = MKDEV(info->major,info->minor);
3652 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3653 return -EOVERFLOW;
3655 if (!mddev->raid_disks) {
3656 int err;
3657 /* expecting a device which has a superblock */
3658 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3659 if (IS_ERR(rdev)) {
3660 printk(KERN_WARNING
3661 "md: md_import_device returned %ld\n",
3662 PTR_ERR(rdev));
3663 return PTR_ERR(rdev);
3665 if (!list_empty(&mddev->disks)) {
3666 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3667 mdk_rdev_t, same_set);
3668 int err = super_types[mddev->major_version]
3669 .load_super(rdev, rdev0, mddev->minor_version);
3670 if (err < 0) {
3671 printk(KERN_WARNING
3672 "md: %s has different UUID to %s\n",
3673 bdevname(rdev->bdev,b),
3674 bdevname(rdev0->bdev,b2));
3675 export_rdev(rdev);
3676 return -EINVAL;
3679 err = bind_rdev_to_array(rdev, mddev);
3680 if (err)
3681 export_rdev(rdev);
3682 return err;
3685 /*
3686 * add_new_disk can be used once the array is assembled
3687 * to add "hot spares". They must already have a superblock
3688 * written
3689 */
3690 if (mddev->pers) {
3691 int err;
3692 if (!mddev->pers->hot_add_disk) {
3693 printk(KERN_WARNING
3694 "%s: personality does not support diskops!\n",
3695 mdname(mddev));
3696 return -EINVAL;
3698 if (mddev->persistent)
3699 rdev = md_import_device(dev, mddev->major_version,
3700 mddev->minor_version);
3701 else
3702 rdev = md_import_device(dev, -1, -1);
3703 if (IS_ERR(rdev)) {
3704 printk(KERN_WARNING
3705 "md: md_import_device returned %ld\n",
3706 PTR_ERR(rdev));
3707 return PTR_ERR(rdev);
3709 /* set saved_raid_disk if appropriate */
3710 if (!mddev->persistent) {
3711 if (info->state & (1<<MD_DISK_SYNC) &&
3712 info->raid_disk < mddev->raid_disks)
3713 rdev->raid_disk = info->raid_disk;
3714 else
3715 rdev->raid_disk = -1;
3716 } else
3717 super_types[mddev->major_version].
3718 validate_super(mddev, rdev);
3719 rdev->saved_raid_disk = rdev->raid_disk;
3721 clear_bit(In_sync, &rdev->flags); /* just to be sure */
3722 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3723 set_bit(WriteMostly, &rdev->flags);
3725 rdev->raid_disk = -1;
3726 err = bind_rdev_to_array(rdev, mddev);
3727 if (!err && !mddev->pers->hot_remove_disk) {
3728 /* If there is hot_add_disk but no hot_remove_disk
3729 * then added disks are for geometry changes,
3730 * and should be added immediately.
3731 */
3732 super_types[mddev->major_version].
3733 validate_super(mddev, rdev);
3734 err = mddev->pers->hot_add_disk(mddev, rdev);
3735 if (err)
3736 unbind_rdev_from_array(rdev);
3738 if (err)
3739 export_rdev(rdev);
3741 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3742 md_wakeup_thread(mddev->thread);
3743 return err;
3746 /* otherwise, add_new_disk is only allowed
3747 * for major_version==0 superblocks
3748 */
3749 if (mddev->major_version != 0) {
3750 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3751 mdname(mddev));
3752 return -EINVAL;
3755 if (!(info->state & (1<<MD_DISK_FAULTY))) {
3756 int err;
3757 rdev = md_import_device (dev, -1, 0);
3758 if (IS_ERR(rdev)) {
3759 printk(KERN_WARNING
3760 "md: error, md_import_device() returned %ld\n",
3761 PTR_ERR(rdev));
3762 return PTR_ERR(rdev);
3764 rdev->desc_nr = info->number;
3765 if (info->raid_disk < mddev->raid_disks)
3766 rdev->raid_disk = info->raid_disk;
3767 else
3768 rdev->raid_disk = -1;
3770 rdev->flags = 0;
3772 if (rdev->raid_disk < mddev->raid_disks)
3773 if (info->state & (1<<MD_DISK_SYNC))
3774 set_bit(In_sync, &rdev->flags);
3776 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3777 set_bit(WriteMostly, &rdev->flags);
3779 if (!mddev->persistent) {
3780 printk(KERN_INFO "md: nonpersistent superblock ...\n");
3781 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3782 } else
3783 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3784 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3786 err = bind_rdev_to_array(rdev, mddev);
3787 if (err) {
3788 export_rdev(rdev);
3789 return err;
3793 return 0;
3796 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3798 char b[BDEVNAME_SIZE];
3799 mdk_rdev_t *rdev;
3801 if (!mddev->pers)
3802 return -ENODEV;
3804 rdev = find_rdev(mddev, dev);
3805 if (!rdev)
3806 return -ENXIO;
3808 if (rdev->raid_disk >= 0)
3809 goto busy;
3811 kick_rdev_from_array(rdev);
3812 md_update_sb(mddev);
3813 md_new_event(mddev);
3815 return 0;
3816 busy:
3817 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3818 bdevname(rdev->bdev,b), mdname(mddev));
3819 return -EBUSY;
3822 static int hot_add_disk(mddev_t * mddev, dev_t dev)
3824 char b[BDEVNAME_SIZE];
3825 int err;
3826 unsigned int size;
3827 mdk_rdev_t *rdev;
3829 if (!mddev->pers)
3830 return -ENODEV;
3832 if (mddev->major_version != 0) {
3833 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3834 " version-0 superblocks.\n",
3835 mdname(mddev));
3836 return -EINVAL;
3838 if (!mddev->pers->hot_add_disk) {
3839 printk(KERN_WARNING
3840 "%s: personality does not support diskops!\n",
3841 mdname(mddev));
3842 return -EINVAL;
3845 rdev = md_import_device (dev, -1, 0);
3846 if (IS_ERR(rdev)) {
3847 printk(KERN_WARNING
3848 "md: error, md_import_device() returned %ld\n",
3849 PTR_ERR(rdev));
3850 return -EINVAL;
3853 if (mddev->persistent)
3854 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3855 else
3856 rdev->sb_offset =
3857 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3859 size = calc_dev_size(rdev, mddev->chunk_size);
3860 rdev->size = size;
3862 if (test_bit(Faulty, &rdev->flags)) {
3863 printk(KERN_WARNING
3864 "md: can not hot-add faulty %s disk to %s!\n",
3865 bdevname(rdev->bdev,b), mdname(mddev));
3866 err = -EINVAL;
3867 goto abort_export;
3869 clear_bit(In_sync, &rdev->flags);
3870 rdev->desc_nr = -1;
3871 rdev->saved_raid_disk = -1;
3872 err = bind_rdev_to_array(rdev, mddev);
3873 if (err)
3874 goto abort_export;
3876 /*
3877 * The rest had better be atomic: disk failures can be
3878 * noticed from interrupt context ...
3879 */
3881 if (rdev->desc_nr == mddev->max_disks) {
3882 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
3883 mdname(mddev));
3884 err = -EBUSY;
3885 goto abort_unbind_export;
3888 rdev->raid_disk = -1;
3890 md_update_sb(mddev);
3892 /*
3893 * Kick recovery, maybe this spare has to be added to the
3894 * array immediately.
3895 */
3896 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3897 md_wakeup_thread(mddev->thread);
3898 md_new_event(mddev);
3899 return 0;
3901 abort_unbind_export:
3902 unbind_rdev_from_array(rdev);
3904 abort_export:
3905 export_rdev(rdev);
3906 return err;
3909 static int set_bitmap_file(mddev_t *mddev, int fd)
3911 int err;
3913 if (mddev->pers) {
3914 if (!mddev->pers->quiesce)
3915 return -EBUSY;
3916 if (mddev->recovery || mddev->sync_thread)
3917 return -EBUSY;
3918 /* we should be able to change the bitmap.. */
3922 if (fd >= 0) {
3923 if (mddev->bitmap)
3924 return -EEXIST; /* cannot add when bitmap is present */
3925 mddev->bitmap_file = fget(fd);
3927 if (mddev->bitmap_file == NULL) {
3928 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
3929 mdname(mddev));
3930 return -EBADF;
3933 err = deny_bitmap_write_access(mddev->bitmap_file);
3934 if (err) {
3935 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
3936 mdname(mddev));
3937 fput(mddev->bitmap_file);
3938 mddev->bitmap_file = NULL;
3939 return err;
3941 mddev->bitmap_offset = 0; /* file overrides offset */
3942 } else if (mddev->bitmap == NULL)
3943 return -ENOENT; /* cannot remove what isn't there */
3944 err = 0;
3945 if (mddev->pers) {
3946 mddev->pers->quiesce(mddev, 1);
3947 if (fd >= 0)
3948 err = bitmap_create(mddev);
3949 if (fd < 0 || err) {
3950 bitmap_destroy(mddev);
3951 fd = -1; /* make sure to put the file */
3953 mddev->pers->quiesce(mddev, 0);
3955 if (fd < 0) {
3956 if (mddev->bitmap_file) {
3957 restore_bitmap_write_access(mddev->bitmap_file);
3958 fput(mddev->bitmap_file);
3960 mddev->bitmap_file = NULL;
3963 return err;
3966 /*
3967 * set_array_info is used in two different ways.
3968 * The original usage is when creating a new array.
3969 * In this usage, raid_disks is > 0 and, together with
3970 * level, size, not_persistent, layout and chunksize, determines the
3971 * shape of the array.
3972 * This will always create an array with a type-0.90.0 superblock.
3973 * The newer usage is when assembling an array.
3974 * In this case raid_disks will be 0, and the major_version field is
3975 * used to determine which style super-blocks are to be found on the devices.
3976 * The minor and patch _version numbers are also kept in case the
3977 * super_block handler wishes to interpret them.
3978 */
3979 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
3982 if (info->raid_disks == 0) {
3983 /* just setting version number for superblock loading */
3984 if (info->major_version < 0 ||
3985 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
3986 super_types[info->major_version].name == NULL) {
3987 /* maybe try to auto-load a module? */
3988 printk(KERN_INFO
3989 "md: superblock version %d not known\n",
3990 info->major_version);
3991 return -EINVAL;
3993 mddev->major_version = info->major_version;
3994 mddev->minor_version = info->minor_version;
3995 mddev->patch_version = info->patch_version;
3996 return 0;
3998 mddev->major_version = MD_MAJOR_VERSION;
3999 mddev->minor_version = MD_MINOR_VERSION;
4000 mddev->patch_version = MD_PATCHLEVEL_VERSION;
4001 mddev->ctime = get_seconds();
4003 mddev->level = info->level;
4004 mddev->clevel[0] = 0;
4005 mddev->size = info->size;
4006 mddev->raid_disks = info->raid_disks;
4007 /* don't set md_minor, it is determined by which /dev/md* was
4008 * opened
4009 */
4010 if (info->state & (1<<MD_SB_CLEAN))
4011 mddev->recovery_cp = MaxSector;
4012 else
4013 mddev->recovery_cp = 0;
4014 mddev->persistent = ! info->not_persistent;
4016 mddev->layout = info->layout;
4017 mddev->chunk_size = info->chunk_size;
4019 mddev->max_disks = MD_SB_DISKS;
4021 mddev->sb_dirty = 1;
4023 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4024 mddev->bitmap_offset = 0;
4026 mddev->reshape_position = MaxSector;
4028 /*
4029 * Generate a 128 bit UUID
4030 */
4031 get_random_bytes(mddev->uuid, 16);
4033 mddev->new_level = mddev->level;
4034 mddev->new_chunk = mddev->chunk_size;
4035 mddev->new_layout = mddev->layout;
4036 mddev->delta_disks = 0;
4038 return 0;
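/*
 * Example (illustrative): this is the backend of the SET_ARRAY_INFO ioctl.
 * A tool such as mdadm typically passes raid_disks == 0 with just the
 * version fields filled in when assembling from existing superblocks, and a
 * fully populated mdu_array_info_t when creating a new array.
 */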
4041 static int update_size(mddev_t *mddev, unsigned long size)
4043 mdk_rdev_t * rdev;
4044 int rv;
4045 struct list_head *tmp;
4046 int fit = (size == 0);
4048 if (mddev->pers->resize == NULL)
4049 return -EINVAL;
4050 /* The "size" is the amount of each device that is used.
4051 * This can only make sense for arrays with redundancy.
4052 * linear and raid0 always use whatever space is available.
4053 * We can only consider changing the size if no resync
4054 * or reconstruction is happening, and if the new size
4055 * is acceptable. It must fit before the sb_offset or,
4056 * if that is <data_offset, it must fit before the
4057 * size of each device.
4058 * If size is zero, we find the largest size that fits.
4059 */
4060 if (mddev->sync_thread)
4061 return -EBUSY;
4062 ITERATE_RDEV(mddev,rdev,tmp) {
4063 sector_t avail;
4064 if (rdev->sb_offset > rdev->data_offset)
4065 avail = (rdev->sb_offset*2) - rdev->data_offset;
4066 else
4067 avail = get_capacity(rdev->bdev->bd_disk)
4068 - rdev->data_offset;
4069 if (fit && (size == 0 || size > avail/2))
4070 size = avail/2;
4071 if (avail < ((sector_t)size << 1))
4072 return -ENOSPC;
4074 rv = mddev->pers->resize(mddev, (sector_t)size *2);
4075 if (!rv) {
4076 struct block_device *bdev;
4078 bdev = bdget_disk(mddev->gendisk, 0);
4079 if (bdev) {
4080 mutex_lock(&bdev->bd_inode->i_mutex);
4081 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4082 mutex_unlock(&bdev->bd_inode->i_mutex);
4083 bdput(bdev);
4086 return rv;
4089 static int update_raid_disks(mddev_t *mddev, int raid_disks)
4091 int rv;
4092 /* change the number of raid disks */
4093 if (mddev->pers->check_reshape == NULL)
4094 return -EINVAL;
4095 if (raid_disks <= 0 ||
4096 raid_disks >= mddev->max_disks)
4097 return -EINVAL;
4098 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4099 return -EBUSY;
4100 mddev->delta_disks = raid_disks - mddev->raid_disks;
4102 rv = mddev->pers->check_reshape(mddev);
4103 return rv;
4107 /*
4108 * update_array_info is used to change the configuration of an
4109 * on-line array.
4110 * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
4111 * fields in the info are checked against the array.
4112 * Any differences that cannot be handled will cause an error.
4113 * Normally, only one change can be managed at a time.
4114 */
4115 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4117 int rv = 0;
4118 int cnt = 0;
4119 int state = 0;
4121 /* calculate expected state, ignoring low bits */
4122 if (mddev->bitmap && mddev->bitmap_offset)
4123 state |= (1 << MD_SB_BITMAP_PRESENT);
4125 if (mddev->major_version != info->major_version ||
4126 mddev->minor_version != info->minor_version ||
4127 /* mddev->patch_version != info->patch_version || */
4128 mddev->ctime != info->ctime ||
4129 mddev->level != info->level ||
4130 /* mddev->layout != info->layout || */
4131 !mddev->persistent != info->not_persistent||
4132 mddev->chunk_size != info->chunk_size ||
4133 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4134 ((state^info->state) & 0xfffffe00)
4136 return -EINVAL;
4137 /* Check there is only one change */
4138 if (info->size >= 0 && mddev->size != info->size) cnt++;
4139 if (mddev->raid_disks != info->raid_disks) cnt++;
4140 if (mddev->layout != info->layout) cnt++;
4141 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4142 if (cnt == 0) return 0;
4143 if (cnt > 1) return -EINVAL;
4145 if (mddev->layout != info->layout) {
4146 /* Change layout
4147 * we don't need to do anything at the md level, the
4148 * personality will take care of it all.
4149 */
4150 if (mddev->pers->reconfig == NULL)
4151 return -EINVAL;
4152 else
4153 return mddev->pers->reconfig(mddev, info->layout, -1);
4155 if (info->size >= 0 && mddev->size != info->size)
4156 rv = update_size(mddev, info->size);
4158 if (mddev->raid_disks != info->raid_disks)
4159 rv = update_raid_disks(mddev, info->raid_disks);
4161 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4162 if (mddev->pers->quiesce == NULL)
4163 return -EINVAL;
4164 if (mddev->recovery || mddev->sync_thread)
4165 return -EBUSY;
4166 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4167 /* add the bitmap */
4168 if (mddev->bitmap)
4169 return -EEXIST;
4170 if (mddev->default_bitmap_offset == 0)
4171 return -EINVAL;
4172 mddev->bitmap_offset = mddev->default_bitmap_offset;
4173 mddev->pers->quiesce(mddev, 1);
4174 rv = bitmap_create(mddev);
4175 if (rv)
4176 bitmap_destroy(mddev);
4177 mddev->pers->quiesce(mddev, 0);
4178 } else {
4179 /* remove the bitmap */
4180 if (!mddev->bitmap)
4181 return -ENOENT;
4182 if (mddev->bitmap->file)
4183 return -EINVAL;
4184 mddev->pers->quiesce(mddev, 1);
4185 bitmap_destroy(mddev);
4186 mddev->pers->quiesce(mddev, 0);
4187 mddev->bitmap_offset = 0;
4190 md_update_sb(mddev);
4191 return rv;
4194 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4196 mdk_rdev_t *rdev;
4198 if (mddev->pers == NULL)
4199 return -ENODEV;
4201 rdev = find_rdev(mddev, dev);
4202 if (!rdev)
4203 return -ENODEV;
4205 md_error(mddev, rdev);
4206 return 0;
4209 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4211 mddev_t *mddev = bdev->bd_disk->private_data;
4213 geo->heads = 2;
4214 geo->sectors = 4;
4215 geo->cylinders = get_capacity(mddev->gendisk) / 8;
4216 return 0;
4219 static int md_ioctl(struct inode *inode, struct file *file,
4220 unsigned int cmd, unsigned long arg)
4222 int err = 0;
4223 void __user *argp = (void __user *)arg;
4224 mddev_t *mddev = NULL;
4226 if (!capable(CAP_SYS_ADMIN))
4227 return -EACCES;
4229 /*
4230 * Commands dealing with the RAID driver but not any
4231 * particular array:
4232 */
4233 switch (cmd)
4235 case RAID_VERSION:
4236 err = get_version(argp);
4237 goto done;
4239 case PRINT_RAID_DEBUG:
4240 err = 0;
4241 md_print_devices();
4242 goto done;
4244 #ifndef MODULE
4245 case RAID_AUTORUN:
4246 err = 0;
4247 autostart_arrays(arg);
4248 goto done;
4249 #endif
4250 default:;
4253 /*
4254 * Commands creating/starting a new array:
4255 */
4257 mddev = inode->i_bdev->bd_disk->private_data;
4259 if (!mddev) {
4260 BUG();
4261 goto abort;
4265 if (cmd == START_ARRAY) {
4266 /* START_ARRAY doesn't need to lock the array as autostart_array
4267 * does the locking, and it could even be a different array
4268 */
4269 static int cnt = 3;
4270 if (cnt > 0 ) {
4271 printk(KERN_WARNING
4272 "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
4273 "This will not be supported beyond July 2006\n",
4274 current->comm, current->pid);
4275 cnt--;
4277 err = autostart_array(new_decode_dev(arg));
4278 if (err) {
4279 printk(KERN_WARNING "md: autostart failed!\n");
4280 goto abort;
4282 goto done;
4285 err = mddev_lock(mddev);
4286 if (err) {
4287 printk(KERN_INFO
4288 "md: ioctl lock interrupted, reason %d, cmd %d\n",
4289 err, cmd);
4290 goto abort;
4293 switch (cmd)
4295 case SET_ARRAY_INFO:
4297 mdu_array_info_t info;
4298 if (!arg)
4299 memset(&info, 0, sizeof(info));
4300 else if (copy_from_user(&info, argp, sizeof(info))) {
4301 err = -EFAULT;
4302 goto abort_unlock;
4304 if (mddev->pers) {
4305 err = update_array_info(mddev, &info);
4306 if (err) {
4307 printk(KERN_WARNING "md: couldn't update"
4308 " array info. %d\n", err);
4309 goto abort_unlock;
4311 goto done_unlock;
4313 if (!list_empty(&mddev->disks)) {
4314 printk(KERN_WARNING
4315 "md: array %s already has disks!\n",
4316 mdname(mddev));
4317 err = -EBUSY;
4318 goto abort_unlock;
4320 if (mddev->raid_disks) {
4321 printk(KERN_WARNING
4322 "md: array %s already initialised!\n",
4323 mdname(mddev));
4324 err = -EBUSY;
4325 goto abort_unlock;
4327 err = set_array_info(mddev, &info);
4328 if (err) {
4329 printk(KERN_WARNING "md: couldn't set"
4330 " array info. %d\n", err);
4331 goto abort_unlock;
4334 goto done_unlock;
4336 default:;
4339 /*
4340 * Commands querying/configuring an existing array:
4341 */
4342 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4343 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
4344 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4345 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
4346 err = -ENODEV;
4347 goto abort_unlock;
4350 /*
4351 * Commands even a read-only array can execute:
4352 */
4353 switch (cmd)
4355 case GET_ARRAY_INFO:
4356 err = get_array_info(mddev, argp);
4357 goto done_unlock;
4359 case GET_BITMAP_FILE:
4360 err = get_bitmap_file(mddev, argp);
4361 goto done_unlock;
4363 case GET_DISK_INFO:
4364 err = get_disk_info(mddev, argp);
4365 goto done_unlock;
4367 case RESTART_ARRAY_RW:
4368 err = restart_array(mddev);
4369 goto done_unlock;
4371 case STOP_ARRAY:
4372 err = do_md_stop (mddev, 0);
4373 goto done_unlock;
4375 case STOP_ARRAY_RO:
4376 err = do_md_stop (mddev, 1);
4377 goto done_unlock;
4379 /*
4380 * We have a problem here: there is no easy way to give a CHS
4381 * virtual geometry. We currently pretend that we have 2 heads and
4382 * 4 sectors per track (with a BIG number of cylinders...). This drives
4383 * dosfs just mad... ;-)
4384 */
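/* Worked example of the fake geometry used by md_getgeo() above: with
 * 2 heads and 4 sectors per track each cylinder holds 8 sectors (4KB),
 * so a 100MB array (204800 sectors) is presented as 25600 cylinders,
 * and larger arrays quickly reach the "BIG number of cylinders" the
 * comment warns about.
 */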
4387 /*
4388 * The remaining ioctls are changing the state of the
4389 * superblock, so we do not allow them on read-only arrays.
4390 * However non-MD ioctls (e.g. get-size) will still come through
4391 * here and hit the 'default' below, so only disallow
4392 * 'md' ioctls, and switch to rw mode if started auto-readonly.
4393 */
4394 if (_IOC_TYPE(cmd) == MD_MAJOR &&
4395 mddev->ro && mddev->pers) {
4396 if (mddev->ro == 2) {
4397 mddev->ro = 0;
4398 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4399 md_wakeup_thread(mddev->thread);
4401 } else {
4402 err = -EROFS;
4403 goto abort_unlock;
4407 switch (cmd)
4409 case ADD_NEW_DISK:
4411 mdu_disk_info_t info;
4412 if (copy_from_user(&info, argp, sizeof(info)))
4413 err = -EFAULT;
4414 else
4415 err = add_new_disk(mddev, &info);
4416 goto done_unlock;
4419 case HOT_REMOVE_DISK:
4420 err = hot_remove_disk(mddev, new_decode_dev(arg));
4421 goto done_unlock;
4423 case HOT_ADD_DISK:
4424 err = hot_add_disk(mddev, new_decode_dev(arg));
4425 goto done_unlock;
4427 case SET_DISK_FAULTY:
4428 err = set_disk_faulty(mddev, new_decode_dev(arg));
4429 goto done_unlock;
4431 case RUN_ARRAY:
4432 err = do_md_run (mddev);
4433 goto done_unlock;
4435 case SET_BITMAP_FILE:
4436 err = set_bitmap_file(mddev, (int)arg);
4437 goto done_unlock;
4439 default:
4440 err = -EINVAL;
4441 goto abort_unlock;
4444 done_unlock:
4445 abort_unlock:
4446 mddev_unlock(mddev);
4448 return err;
4449 done:
4450 if (err)
4451 MD_BUG();
4452 abort:
4453 return err;
4456 static int md_open(struct inode *inode, struct file *file)
4458 /*
4459 * Succeed if we can lock the mddev, which confirms that
4460 * it isn't being stopped right now.
4461 */
4462 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4463 int err;
4465 if ((err = mddev_lock(mddev)))
4466 goto out;
4468 err = 0;
4469 mddev_get(mddev);
4470 mddev_unlock(mddev);
4472 check_disk_change(inode->i_bdev);
4473 out:
4474 return err;
4477 static int md_release(struct inode *inode, struct file * file)
4479 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4481 if (!mddev)
4482 BUG();
4483 mddev_put(mddev);
4485 return 0;
4488 static int md_media_changed(struct gendisk *disk)
4490 mddev_t *mddev = disk->private_data;
4492 return mddev->changed;
4495 static int md_revalidate(struct gendisk *disk)
4497 mddev_t *mddev = disk->private_data;
4499 mddev->changed = 0;
4500 return 0;
4502 static struct block_device_operations md_fops =
4504 .owner = THIS_MODULE,
4505 .open = md_open,
4506 .release = md_release,
4507 .ioctl = md_ioctl,
4508 .getgeo = md_getgeo,
4509 .media_changed = md_media_changed,
4510 .revalidate_disk= md_revalidate,
4511 };
4513 static int md_thread(void * arg)
4515 mdk_thread_t *thread = arg;
4517 /*
4518 * md_thread is a 'system-thread'; its priority should be very
4519 * high. We avoid resource deadlocks individually in each
4520 * raid personality. (RAID5 does preallocation) We also use RR and
4521 * the very same RT priority as kswapd, thus we will never get
4522 * into a priority inversion deadlock.
4524 * we definitely have to have equal or higher priority than
4525 * bdflush, otherwise bdflush will deadlock if there are too
4526 * many dirty RAID5 blocks.
4527 */
4529 allow_signal(SIGKILL);
4530 while (!kthread_should_stop()) {
4532 /* We need to wait INTERRUPTIBLE so that
4533 * we don't add to the load-average.
4534 * That means we need to be sure no signals are
4535 * pending
4536 */
4537 if (signal_pending(current))
4538 flush_signals(current);
4540 wait_event_interruptible_timeout
4541 (thread->wqueue,
4542 test_bit(THREAD_WAKEUP, &thread->flags)
4543 || kthread_should_stop(),
4544 thread->timeout);
4545 try_to_freeze();
4547 clear_bit(THREAD_WAKEUP, &thread->flags);
4549 thread->run(thread->mddev);
4552 return 0;
4555 void md_wakeup_thread(mdk_thread_t *thread)
4557 if (thread) {
4558 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4559 set_bit(THREAD_WAKEUP, &thread->flags);
4560 wake_up(&thread->wqueue);
4564 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4565 const char *name)
4567 mdk_thread_t *thread;
4569 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4570 if (!thread)
4571 return NULL;
4573 init_waitqueue_head(&thread->wqueue);
4575 thread->run = run;
4576 thread->mddev = mddev;
4577 thread->timeout = MAX_SCHEDULE_TIMEOUT;
4578 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4579 if (IS_ERR(thread->tsk)) {
4580 kfree(thread);
4581 return NULL;
4583 return thread;
4586 void md_unregister_thread(mdk_thread_t *thread)
4588 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
4590 kthread_stop(thread->tsk);
4591 kfree(thread);
4594 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4596 if (!mddev) {
4597 MD_BUG();
4598 return;
4601 if (!rdev || test_bit(Faulty, &rdev->flags))
4602 return;
4603 /*
4604 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4605 mdname(mddev),
4606 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4607 __builtin_return_address(0),__builtin_return_address(1),
4608 __builtin_return_address(2),__builtin_return_address(3));
4609 */
4610 if (!mddev->pers)
4611 return;
4612 if (!mddev->pers->error_handler)
4613 return;
4614 mddev->pers->error_handler(mddev,rdev);
4615 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4616 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4617 md_wakeup_thread(mddev->thread);
4618 md_new_event_inintr(mddev);
4621 /* seq_file implementation for /proc/mdstat */
4623 static void status_unused(struct seq_file *seq)
4625 int i = 0;
4626 mdk_rdev_t *rdev;
4627 struct list_head *tmp;
4629 seq_printf(seq, "unused devices: ");
4631 ITERATE_RDEV_PENDING(rdev,tmp) {
4632 char b[BDEVNAME_SIZE];
4633 i++;
4634 seq_printf(seq, "%s ",
4635 bdevname(rdev->bdev,b));
4637 if (!i)
4638 seq_printf(seq, "<none>");
4640 seq_printf(seq, "\n");
4644 static void status_resync(struct seq_file *seq, mddev_t * mddev)
4646 sector_t max_blocks, resync, res;
4647 unsigned long dt, db, rt;
4648 int scale;
4649 unsigned int per_milli;
4651 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4653 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4654 max_blocks = mddev->resync_max_sectors >> 1;
4655 else
4656 max_blocks = mddev->size;
4658 /*
4659 * Should not happen.
4660 */
4661 if (!max_blocks) {
4662 MD_BUG();
4663 return;
4665 /* Pick 'scale' such that (resync>>scale)*1000 will fit
4666 * in a sector_t, and (max_blocks>>scale) will fit in a
4667 * u32, as those are the requirements for sector_div.
4668 * Thus 'scale' must be at least 10
4669 */
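/* Worked example (assuming a 64-bit sector_t): an 8TiB array has
 * max_blocks = 2^33 one-kilobyte blocks.  With scale = 10,
 * max_blocks>>scale = 2^23 fits comfortably in the u32 that
 * sector_div() requires, and (resync>>scale)*1000 stays around 2^33,
 * well within sector_t, so per_milli can be computed without overflow.
 */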
4670 scale = 10;
4671 if (sizeof(sector_t) > sizeof(unsigned long)) {
4672 while ( max_blocks/2 > (1ULL<<(scale+32)))
4673 scale++;
4675 res = (resync>>scale)*1000;
4676 sector_div(res, (u32)((max_blocks>>scale)+1));
4678 per_milli = res;
4680 int i, x = per_milli/50, y = 20-x;
4681 seq_printf(seq, "[");
4682 for (i = 0; i < x; i++)
4683 seq_printf(seq, "=");
4684 seq_printf(seq, ">");
4685 for (i = 0; i < y; i++)
4686 seq_printf(seq, ".");
4687 seq_printf(seq, "] ");
4689 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4690 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4691 "reshape" :
4692 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4693 "resync" : "recovery")),
4694 per_milli/10, per_milli % 10,
4695 (unsigned long long) resync,
4696 (unsigned long long) max_blocks);
4698 /*
4699 * We do not want to overflow, so the order of operands and
4700 * the * 100 / 100 trick are important. We do a +1 to be
4701 * safe against division by zero. We only estimate anyway.
4703 * dt: time from mark until now
4704 * db: blocks written from mark until now
4705 * rt: remaining time
4706 */
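/* Worked example of the estimate below: with dt = 30 seconds since the
 * last mark, db = 60000 sectors (30000 1K blocks) written in that
 * interval and 3000000 blocks still to go, rt works out at roughly
 * 30 * 3000000/30000 = 3000 seconds and is printed as "finish=49.8min".
 * Dividing by (db/2/100 + 1) before multiplying by dt is what keeps the
 * intermediate product inside an unsigned long.
 */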
4707 dt = ((jiffies - mddev->resync_mark) / HZ);
4708 if (!dt) dt++;
4709 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
4710 - mddev->resync_mark_cnt;
4711 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
4713 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4715 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
4718 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4720 struct list_head *tmp;
4721 loff_t l = *pos;
4722 mddev_t *mddev;
4724 if (l >= 0x10000)
4725 return NULL;
4726 if (!l--)
4727 /* header */
4728 return (void*)1;
4730 spin_lock(&all_mddevs_lock);
4731 list_for_each(tmp,&all_mddevs)
4732 if (!l--) {
4733 mddev = list_entry(tmp, mddev_t, all_mddevs);
4734 mddev_get(mddev);
4735 spin_unlock(&all_mddevs_lock);
4736 return mddev;
4738 spin_unlock(&all_mddevs_lock);
4739 if (!l--)
4740 return (void*)2;/* tail */
4741 return NULL;
4744 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4746 struct list_head *tmp;
4747 mddev_t *next_mddev, *mddev = v;
4749 ++*pos;
4750 if (v == (void*)2)
4751 return NULL;
4753 spin_lock(&all_mddevs_lock);
4754 if (v == (void*)1)
4755 tmp = all_mddevs.next;
4756 else
4757 tmp = mddev->all_mddevs.next;
4758 if (tmp != &all_mddevs)
4759 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4760 else {
4761 next_mddev = (void*)2;
4762 *pos = 0x10000;
4764 spin_unlock(&all_mddevs_lock);
4766 if (v != (void*)1)
4767 mddev_put(mddev);
4768 return next_mddev;
4772 static void md_seq_stop(struct seq_file *seq, void *v)
4774 mddev_t *mddev = v;
4776 if (mddev && v != (void*)1 && v != (void*)2)
4777 mddev_put(mddev);
4780 struct mdstat_info {
4781 int event;
4782 };
4784 static int md_seq_show(struct seq_file *seq, void *v)
4786 mddev_t *mddev = v;
4787 sector_t size;
4788 struct list_head *tmp2;
4789 mdk_rdev_t *rdev;
4790 struct mdstat_info *mi = seq->private;
4791 struct bitmap *bitmap;
4793 if (v == (void*)1) {
4794 struct mdk_personality *pers;
4795 seq_printf(seq, "Personalities : ");
4796 spin_lock(&pers_lock);
4797 list_for_each_entry(pers, &pers_list, list)
4798 seq_printf(seq, "[%s] ", pers->name);
4800 spin_unlock(&pers_lock);
4801 seq_printf(seq, "\n");
4802 mi->event = atomic_read(&md_event_count);
4803 return 0;
4805 if (v == (void*)2) {
4806 status_unused(seq);
4807 return 0;
4810 if (mddev_lock(mddev) < 0)
4811 return -EINTR;
4813 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4814 seq_printf(seq, "%s : %sactive", mdname(mddev),
4815 mddev->pers ? "" : "in");
4816 if (mddev->pers) {
4817 if (mddev->ro==1)
4818 seq_printf(seq, " (read-only)");
4819 if (mddev->ro==2)
4820 seq_printf(seq, "(auto-read-only)");
4821 seq_printf(seq, " %s", mddev->pers->name);
4824 size = 0;
4825 ITERATE_RDEV(mddev,rdev,tmp2) {
4826 char b[BDEVNAME_SIZE];
4827 seq_printf(seq, " %s[%d]",
4828 bdevname(rdev->bdev,b), rdev->desc_nr);
4829 if (test_bit(WriteMostly, &rdev->flags))
4830 seq_printf(seq, "(W)");
4831 if (test_bit(Faulty, &rdev->flags)) {
4832 seq_printf(seq, "(F)");
4833 continue;
4834 } else if (rdev->raid_disk < 0)
4835 seq_printf(seq, "(S)"); /* spare */
4836 size += rdev->size;
4839 if (!list_empty(&mddev->disks)) {
4840 if (mddev->pers)
4841 seq_printf(seq, "\n %llu blocks",
4842 (unsigned long long)mddev->array_size);
4843 else
4844 seq_printf(seq, "\n %llu blocks",
4845 (unsigned long long)size);
4847 if (mddev->persistent) {
4848 if (mddev->major_version != 0 ||
4849 mddev->minor_version != 90) {
4850 seq_printf(seq," super %d.%d",
4851 mddev->major_version,
4852 mddev->minor_version);
4854 } else
4855 seq_printf(seq, " super non-persistent");
4857 if (mddev->pers) {
4858 mddev->pers->status (seq, mddev);
4859 seq_printf(seq, "\n ");
4860 if (mddev->pers->sync_request) {
4861 if (mddev->curr_resync > 2) {
4862 status_resync (seq, mddev);
4863 seq_printf(seq, "\n ");
4864 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4865 seq_printf(seq, "\tresync=DELAYED\n ");
4866 else if (mddev->recovery_cp < MaxSector)
4867 seq_printf(seq, "\tresync=PENDING\n ");
4869 } else
4870 seq_printf(seq, "\n ");
4872 if ((bitmap = mddev->bitmap)) {
4873 unsigned long chunk_kb;
4874 unsigned long flags;
4875 spin_lock_irqsave(&bitmap->lock, flags);
4876 chunk_kb = bitmap->chunksize >> 10;
4877 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
4878 "%lu%s chunk",
4879 bitmap->pages - bitmap->missing_pages,
4880 bitmap->pages,
4881 (bitmap->pages - bitmap->missing_pages)
4882 << (PAGE_SHIFT - 10),
4883 chunk_kb ? chunk_kb : bitmap->chunksize,
4884 chunk_kb ? "KB" : "B");
4885 if (bitmap->file) {
4886 seq_printf(seq, ", file: ");
4887 seq_path(seq, bitmap->file->f_vfsmnt,
4888 bitmap->file->f_dentry," \t\n");
4891 seq_printf(seq, "\n");
4892 spin_unlock_irqrestore(&bitmap->lock, flags);
4895 seq_printf(seq, "\n");
4897 mddev_unlock(mddev);
4899 return 0;
4902 static struct seq_operations md_seq_ops = {
4903 .start = md_seq_start,
4904 .next = md_seq_next,
4905 .stop = md_seq_stop,
4906 .show = md_seq_show,
4907 };
4909 static int md_seq_open(struct inode *inode, struct file *file)
4911 int error;
4912 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4913 if (mi == NULL)
4914 return -ENOMEM;
4916 error = seq_open(file, &md_seq_ops);
4917 if (error)
4918 kfree(mi);
4919 else {
4920 struct seq_file *p = file->private_data;
4921 p->private = mi;
4922 mi->event = atomic_read(&md_event_count);
4924 return error;
4927 static int md_seq_release(struct inode *inode, struct file *file)
4929 struct seq_file *m = file->private_data;
4930 struct mdstat_info *mi = m->private;
4931 m->private = NULL;
4932 kfree(mi);
4933 return seq_release(inode, file);
4936 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4938 struct seq_file *m = filp->private_data;
4939 struct mdstat_info *mi = m->private;
4940 int mask;
4942 poll_wait(filp, &md_event_waiters, wait);
4944 /* always allow read */
4945 mask = POLLIN | POLLRDNORM;
4947 if (mi->event != atomic_read(&md_event_count))
4948 mask |= POLLERR | POLLPRI;
4949 return mask;
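/* Userspace can rely on the poll() semantics above to watch for array
 * state changes; a rough sketch (illustrative only, no error handling):
 *
 *	char buf[4096];
 *	struct pollfd pfd = { .fd = open("/proc/mdstat", O_RDONLY),
 *			      .events = POLLPRI };
 *	read(pfd.fd, buf, sizeof(buf));		// consume current contents
 *	poll(&pfd, 1, -1);			// returns once md_event_count moves
 *	lseek(pfd.fd, 0, SEEK_SET);		// re-read to see what changed
 *
 * POLLERR|POLLPRI are raised whenever md_event_count has advanced since
 * the file was opened or last read in full.
 */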
4952 static struct file_operations md_seq_fops = {
4953 .open = md_seq_open,
4954 .read = seq_read,
4955 .llseek = seq_lseek,
4956 .release = md_seq_release,
4957 .poll = mdstat_poll,
4958 };
4960 int register_md_personality(struct mdk_personality *p)
4962 spin_lock(&pers_lock);
4963 list_add_tail(&p->list, &pers_list);
4964 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
4965 spin_unlock(&pers_lock);
4966 return 0;
4969 int unregister_md_personality(struct mdk_personality *p)
4971 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
4972 spin_lock(&pers_lock);
4973 list_del_init(&p->list);
4974 spin_unlock(&pers_lock);
4975 return 0;
4978 static int is_mddev_idle(mddev_t *mddev)
4980 mdk_rdev_t * rdev;
4981 struct list_head *tmp;
4982 int idle;
4983 unsigned long curr_events;
4985 idle = 1;
4986 ITERATE_RDEV(mddev,rdev,tmp) {
4987 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
4988 curr_events = disk_stat_read(disk, sectors[0]) +
4989 disk_stat_read(disk, sectors[1]) -
4990 atomic_read(&disk->sync_io);
4991 /* The difference between curr_events and last_events
4992 * will be affected by any new non-sync IO (making
4993 * curr_events bigger) and any difference in the amount of
4994 * in-flight syncio (making curr_events bigger or smaller)
4995 * The amount in-flight is currently limited to
4996 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
4997 * which is at most 4096 sectors.
4998 * These numbers are fairly fragile and should be made
4999 * more robust, probably by enforcing the
5000 * 'window size' that md_do_sync sort-of uses.
5002 * Note: the following is an unsigned comparison.
5003 */
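/* Worked example of the window below: the unsigned test treats a device
 * as busy once curr_events drifts more than 4096 sectors in either
 * direction from last_events.  E.g. last_events = 10000 and
 * curr_events = 15000 gives 15000 - 10000 + 4096 = 9096 > 8192 (new
 * non-sync IO, so not idle), while curr_events = 13000 gives
 * 7096 <= 8192 and the device still counts as idle.
 */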
5004 if ((curr_events - rdev->last_events + 4096) > 8192) {
5005 rdev->last_events = curr_events;
5006 idle = 0;
5009 return idle;
5012 void md_done_sync(mddev_t *mddev, int blocks, int ok)
5014 /* another "blocks" (512byte) blocks have been synced */
5015 atomic_sub(blocks, &mddev->recovery_active);
5016 wake_up(&mddev->recovery_wait);
5017 if (!ok) {
5018 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5019 md_wakeup_thread(mddev->thread);
5020 // stop recovery, signal do_sync ....
5025 /* md_write_start(mddev, bi)
5026 * If we need to update some array metadata (e.g. 'active' flag
5027 * in superblock) before writing, schedule a superblock update
5028 * and wait for it to complete.
5029 */
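/* Typical usage by a personality (a sketch; the function name is
 * illustrative, not taken from this file):
 *
 *	static int example_make_request(request_queue_t *q, struct bio *bio)
 *	{
 *		md_write_start(mddev, bio);	// mark the array dirty first
 *		// ...queue the write to the member disks...
 *	}
 *	// and from the personality's write-completion path:
 *	md_write_end(mddev);
 *
 * md_write_start() blocks until the superblock has been rewritten with
 * the array marked active, so no write reaches a member disk while the
 * on-disk metadata still claims the array is clean.
 */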
5030 void md_write_start(mddev_t *mddev, struct bio *bi)
5032 if (bio_data_dir(bi) != WRITE)
5033 return;
5035 BUG_ON(mddev->ro == 1);
5036 if (mddev->ro == 2) {
5037 /* need to switch to read/write */
5038 mddev->ro = 0;
5039 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5040 md_wakeup_thread(mddev->thread);
5042 atomic_inc(&mddev->writes_pending);
5043 if (mddev->in_sync) {
5044 spin_lock_irq(&mddev->write_lock);
5045 if (mddev->in_sync) {
5046 mddev->in_sync = 0;
5047 mddev->sb_dirty = 3;
5048 md_wakeup_thread(mddev->thread);
5050 spin_unlock_irq(&mddev->write_lock);
5052 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
5055 void md_write_end(mddev_t *mddev)
5057 if (atomic_dec_and_test(&mddev->writes_pending)) {
5058 if (mddev->safemode == 2)
5059 md_wakeup_thread(mddev->thread);
5060 else if (mddev->safemode_delay)
5061 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5065 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5067 #define SYNC_MARKS 10
5068 #define SYNC_MARK_STEP (3*HZ)
5069 void md_do_sync(mddev_t *mddev)
5071 mddev_t *mddev2;
5072 unsigned int currspeed = 0,
5073 window;
5074 sector_t max_sectors,j, io_sectors;
5075 unsigned long mark[SYNC_MARKS];
5076 sector_t mark_cnt[SYNC_MARKS];
5077 int last_mark,m;
5078 struct list_head *tmp;
5079 sector_t last_check;
5080 int skipped = 0;
5081 struct list_head *rtmp;
5082 mdk_rdev_t *rdev;
5084 /* just in case the thread restarts... */
5085 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5086 return;
5087 if (mddev->ro) /* never try to sync a read-only array */
5088 return;
5090 /* we overload curr_resync somewhat here.
5091 * 0 == not engaged in resync at all
5092 * 2 == checking that there is no conflict with another sync
5093 * 1 == like 2, but have yielded to allow conflicting resync to
5094 * commence
5095 * other == active in resync - this many blocks
5097 * Before starting a resync we must have set curr_resync to
5098 * 2, and then checked that every "conflicting" array has curr_resync
5099 * less than ours. When we find one that is the same or higher
5100 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
5101 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5102 * This will mean we have to start checking from the beginning again.
5104 */
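/* Illustrative scenario for the yield protocol above: md0 and md1 share
 * a physical disk and both start a resync.  Each sets curr_resync = 2;
 * if md0 has the lower mddev address it drops to curr_resync = 1
 * ("yielded") and wakes resync_wait, letting md1 proceed (md1's
 * curr_resync then becomes its block count).  md0 sleeps on resync_wait
 * until md1 finishes, then restarts the loop from curr_resync = 2.
 */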
5106 do {
5107 mddev->curr_resync = 2;
5109 try_again:
5110 if (kthread_should_stop()) {
5111 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5112 goto skip;
5114 ITERATE_MDDEV(mddev2,tmp) {
5115 if (mddev2 == mddev)
5116 continue;
5117 if (mddev2->curr_resync &&
5118 match_mddev_units(mddev,mddev2)) {
5119 DEFINE_WAIT(wq);
5120 if (mddev < mddev2 && mddev->curr_resync == 2) {
5121 /* arbitrarily yield */
5122 mddev->curr_resync = 1;
5123 wake_up(&resync_wait);
5125 if (mddev > mddev2 && mddev->curr_resync == 1)
5126 /* no need to wait here, we can wait the next
5127 * time 'round when curr_resync == 2
5128 */
5129 continue;
5130 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5131 if (!kthread_should_stop() &&
5132 mddev2->curr_resync >= mddev->curr_resync) {
5133 printk(KERN_INFO "md: delaying resync of %s"
5134 " until %s has finished resync (they"
5135 " share one or more physical units)\n",
5136 mdname(mddev), mdname(mddev2));
5137 mddev_put(mddev2);
5138 schedule();
5139 finish_wait(&resync_wait, &wq);
5140 goto try_again;
5142 finish_wait(&resync_wait, &wq);
5145 } while (mddev->curr_resync < 2);
5147 j = 0;
5148 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5149 /* resync follows the size requested by the personality,
5150 * which defaults to physical size, but can be virtual size
5151 */
5152 max_sectors = mddev->resync_max_sectors;
5153 mddev->resync_mismatches = 0;
5154 /* we don't use the checkpoint if there's a bitmap */
5155 if (!mddev->bitmap &&
5156 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5157 j = mddev->recovery_cp;
5158 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5159 max_sectors = mddev->size << 1;
5160 else {
5161 /* recovery follows the physical size of devices */
5162 max_sectors = mddev->size << 1;
5163 j = MaxSector;
5164 ITERATE_RDEV(mddev,rdev,rtmp)
5165 if (rdev->raid_disk >= 0 &&
5166 !test_bit(Faulty, &rdev->flags) &&
5167 !test_bit(In_sync, &rdev->flags) &&
5168 rdev->recovery_offset < j)
5169 j = rdev->recovery_offset;
5172 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
5173 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
5174 " %d KB/sec/disc.\n", speed_min(mddev));
5175 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5176 "(but not more than %d KB/sec) for reconstruction.\n",
5177 speed_max(mddev));
5179 is_mddev_idle(mddev); /* this also initializes IO event counters */
5181 io_sectors = 0;
5182 for (m = 0; m < SYNC_MARKS; m++) {
5183 mark[m] = jiffies;
5184 mark_cnt[m] = io_sectors;
5186 last_mark = 0;
5187 mddev->resync_mark = mark[last_mark];
5188 mddev->resync_mark_cnt = mark_cnt[last_mark];
5190 /*
5191 * Tune reconstruction:
5192 */
5193 window = 32*(PAGE_SIZE/512);
5194 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5195 window/2,(unsigned long long) max_sectors/2);
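/* Worked example: with 4KB pages the window is 32 * (4096/512) = 256
 * sectors, so the message above reports a 128k window and the mark and
 * speed bookkeeping further down only runs once at least 256 sectors of
 * resync IO have completed since the last check.
 */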
5197 atomic_set(&mddev->recovery_active, 0);
5198 init_waitqueue_head(&mddev->recovery_wait);
5199 last_check = 0;
5201 if (j>2) {
5202 printk(KERN_INFO
5203 "md: resuming recovery of %s from checkpoint.\n",
5204 mdname(mddev));
5205 mddev->curr_resync = j;
5208 while (j < max_sectors) {
5209 sector_t sectors;
5211 skipped = 0;
5212 sectors = mddev->pers->sync_request(mddev, j, &skipped,
5213 currspeed < speed_min(mddev));
5214 if (sectors == 0) {
5215 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5216 goto out;
5219 if (!skipped) { /* actual IO requested */
5220 io_sectors += sectors;
5221 atomic_add(sectors, &mddev->recovery_active);
5224 j += sectors;
5225 if (j>1) mddev->curr_resync = j;
5226 mddev->curr_mark_cnt = io_sectors;
5227 if (last_check == 0)
5228 /* this is the earliest that the rebuild will be
5229 * visible in /proc/mdstat
5230 */
5231 md_new_event(mddev);
5233 if (last_check + window > io_sectors || j == max_sectors)
5234 continue;
5236 last_check = io_sectors;
5238 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5239 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5240 break;
5242 repeat:
5243 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5244 /* step marks */
5245 int next = (last_mark+1) % SYNC_MARKS;
5247 mddev->resync_mark = mark[next];
5248 mddev->resync_mark_cnt = mark_cnt[next];
5249 mark[next] = jiffies;
5250 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5251 last_mark = next;
5255 if (kthread_should_stop()) {
5256 /*
5257 * got a signal, exit.
5258 */
5259 printk(KERN_INFO
5260 "md: md_do_sync() got signal ... exiting\n");
5261 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5262 goto out;
5265 /*
5266 * this loop exits only if we are slower than
5267 * the 'hard' speed limit, or the system was IO-idle for
5268 * a jiffy.
5269 * the system might be non-idle CPU-wise, but we only care
5270 * about not overloading the IO subsystem. (things like an
5271 * e2fsck being done on the RAID array should execute fast)
5272 */
5273 mddev->queue->unplug_fn(mddev->queue);
5274 cond_resched();
5276 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5277 /((jiffies-mddev->resync_mark)/HZ +1) +1;
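/* Worked example: if 20480 sectors have completed since resync_mark and
 * 4 seconds have elapsed, currspeed = 20480/2 / (4+1) + 1 = 2049 KB/sec.
 * Anything above speed_min() is subject to throttling: if it also
 * exceeds speed_max(), or the array is not idle, the thread sleeps
 * 500ms and re-checks.
 */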
5279 if (currspeed > speed_min(mddev)) {
5280 if ((currspeed > speed_max(mddev)) ||
5281 !is_mddev_idle(mddev)) {
5282 msleep(500);
5283 goto repeat;
5287 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
5288 /*
5289 * this also signals 'finished resyncing' to md_stop
5290 */
5291 out:
5292 mddev->queue->unplug_fn(mddev->queue);
5294 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5296 /* tell personality that we are finished */
5297 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5299 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5300 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
5301 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5302 mddev->curr_resync > 2) {
5303 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5304 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5305 if (mddev->curr_resync >= mddev->recovery_cp) {
5306 printk(KERN_INFO
5307 "md: checkpointing recovery of %s.\n",
5308 mdname(mddev));
5309 mddev->recovery_cp = mddev->curr_resync;
5311 } else
5312 mddev->recovery_cp = MaxSector;
5313 } else {
5314 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5315 mddev->curr_resync = MaxSector;
5316 ITERATE_RDEV(mddev,rdev,rtmp)
5317 if (rdev->raid_disk >= 0 &&
5318 !test_bit(Faulty, &rdev->flags) &&
5319 !test_bit(In_sync, &rdev->flags) &&
5320 rdev->recovery_offset < mddev->curr_resync)
5321 rdev->recovery_offset = mddev->curr_resync;
5322 mddev->sb_dirty = 1;
5326 skip:
5327 mddev->curr_resync = 0;
5328 wake_up(&resync_wait);
5329 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5330 md_wakeup_thread(mddev->thread);
5332 EXPORT_SYMBOL_GPL(md_do_sync);
5335 /*
5336 * This routine is regularly called by all per-raid-array threads to
5337 * deal with generic issues like resync and super-block update.
5338 * Raid personalities that don't have a thread (linear/raid0) do not
5339 * need this as they never do any recovery or update the superblock.
5341 * It does not do any resync itself, but rather "forks" off other threads
5342 * to do that as needed.
5343 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5344 * "->recovery" and create a thread at ->sync_thread.
5345 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5346 * and wakes up this thread, which will reap the thread and finish up.
5347 * This thread also removes any faulty devices (with nr_pending == 0).
5349 * The overall approach is:
5350 * 1/ if the superblock needs updating, update it.
5351 * 2/ If a recovery thread is running, don't do anything else.
5352 * 3/ If recovery has finished, clean up, possibly marking spares active.
5353 * 4/ If there are any faulty devices, remove them.
5354 * 5/ If the array is degraded, try to add spare devices
5355 * 6/ If array has spares or is not in-sync, start a resync thread.
5356 */
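/* Typical caller (a sketch; the function name is illustrative): every
 * personality that owns a thread calls this from its run routine, e.g.
 *
 *	static void exampled(mddev_t *mddev)
 *	{
 *		md_check_recovery(mddev);
 *		// ...handle personality-specific work...
 *	}
 *
 * so superblock updates and resync start/stop decisions happen in
 * thread context rather than in the IO submission path.
 */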
5357 void md_check_recovery(mddev_t *mddev)
5359 mdk_rdev_t *rdev;
5360 struct list_head *rtmp;
5363 if (mddev->bitmap)
5364 bitmap_daemon_work(mddev->bitmap);
5366 if (mddev->ro)
5367 return;
5369 if (signal_pending(current)) {
5370 if (mddev->pers->sync_request) {
5371 printk(KERN_INFO "md: %s in immediate safe mode\n",
5372 mdname(mddev));
5373 mddev->safemode = 2;
5375 flush_signals(current);
5378 if ( ! (
5379 mddev->sb_dirty ||
5380 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5381 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5382 (mddev->safemode == 1) ||
5383 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5384 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5385 ))
5386 return;
5388 if (mddev_trylock(mddev)) {
5389 int spares =0;
5391 spin_lock_irq(&mddev->write_lock);
5392 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5393 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5394 mddev->in_sync = 1;
5395 mddev->sb_dirty = 3;
5397 if (mddev->safemode == 1)
5398 mddev->safemode = 0;
5399 spin_unlock_irq(&mddev->write_lock);
5401 if (mddev->sb_dirty)
5402 md_update_sb(mddev);
5405 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5406 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5407 /* resync/recovery still happening */
5408 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5409 goto unlock;
5411 if (mddev->sync_thread) {
5412 /* resync has finished, collect result */
5413 md_unregister_thread(mddev->sync_thread);
5414 mddev->sync_thread = NULL;
5415 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5416 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5417 /* success...*/
5418 /* activate any spares */
5419 mddev->pers->spare_active(mddev);
5421 md_update_sb(mddev);
5423 /* if array is no-longer degraded, then any saved_raid_disk
5424 * information must be scrapped
5425 */
5426 if (!mddev->degraded)
5427 ITERATE_RDEV(mddev,rdev,rtmp)
5428 rdev->saved_raid_disk = -1;
5430 mddev->recovery = 0;
5431 /* flag recovery needed just to double check */
5432 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5433 md_new_event(mddev);
5434 goto unlock;
5436 /* Clear some bits that don't mean anything, but
5437 * might be left set
5438 */
5439 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5440 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5441 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5442 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5444 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5445 goto unlock;
5446 /* no recovery is running.
5447 * remove any failed drives, then
5448 * add spares if possible.
5449 * Spares are also removed and re-added, to allow
5450 * the personality to fail the re-add.
5451 */
5452 ITERATE_RDEV(mddev,rdev,rtmp)
5453 if (rdev->raid_disk >= 0 &&
5454 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
5455 atomic_read(&rdev->nr_pending)==0) {
5456 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
5457 char nm[20];
5458 sprintf(nm,"rd%d", rdev->raid_disk);
5459 sysfs_remove_link(&mddev->kobj, nm);
5460 rdev->raid_disk = -1;
5464 if (mddev->degraded) {
5465 ITERATE_RDEV(mddev,rdev,rtmp)
5466 if (rdev->raid_disk < 0
5467 && !test_bit(Faulty, &rdev->flags)) {
5468 rdev->recovery_offset = 0;
5469 if (mddev->pers->hot_add_disk(mddev,rdev)) {
5470 char nm[20];
5471 sprintf(nm, "rd%d", rdev->raid_disk);
5472 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
5473 spares++;
5474 md_new_event(mddev);
5475 } else
5476 break;
5480 if (spares) {
5481 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5482 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5483 } else if (mddev->recovery_cp < MaxSector) {
5484 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5485 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5486 /* nothing to be done ... */
5487 goto unlock;
5489 if (mddev->pers->sync_request) {
5490 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5491 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5492 /* We are adding a device or devices to an array
5493 * which has the bitmap stored on all devices.
5494 * So make sure all bitmap pages get written
5495 */
5496 bitmap_write_all(mddev->bitmap);
5498 mddev->sync_thread = md_register_thread(md_do_sync,
5499 mddev,
5500 "%s_resync");
5501 if (!mddev->sync_thread) {
5502 printk(KERN_ERR "%s: could not start resync"
5503 " thread...\n",
5504 mdname(mddev));
5505 /* leave the spares where they are, it shouldn't hurt */
5506 mddev->recovery = 0;
5507 } else
5508 md_wakeup_thread(mddev->sync_thread);
5509 md_new_event(mddev);
5511 unlock:
5512 mddev_unlock(mddev);
5516 static int md_notify_reboot(struct notifier_block *this,
5517 unsigned long code, void *x)
5519 struct list_head *tmp;
5520 mddev_t *mddev;
5522 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5524 printk(KERN_INFO "md: stopping all md devices.\n");
5526 ITERATE_MDDEV(mddev,tmp)
5527 if (mddev_trylock(mddev)) {
5528 do_md_stop (mddev, 1);
5529 mddev_unlock(mddev);
5531 /*
5532 * certain more exotic SCSI devices are known to be
5533 * volatile wrt too early system reboots. While the
5534 * right place to handle this issue is the given
5535 * driver, we do want to have a safe RAID driver ...
5536 */
5537 mdelay(1000*1);
5539 return NOTIFY_DONE;
5542 static struct notifier_block md_notifier = {
5543 .notifier_call = md_notify_reboot,
5544 .next = NULL,
5545 .priority = INT_MAX, /* before any real devices */
5546 };
5548 static void md_geninit(void)
5550 struct proc_dir_entry *p;
5552 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5554 p = create_proc_entry("mdstat", S_IRUGO, NULL);
5555 if (p)
5556 p->proc_fops = &md_seq_fops;
5559 static int __init md_init(void)
5561 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
5562 " MD_SB_DISKS=%d\n",
5563 MD_MAJOR_VERSION, MD_MINOR_VERSION,
5564 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
5565 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
5566 BITMAP_MINOR);
5568 if (register_blkdev(MAJOR_NR, "md"))
5569 return -1;
5570 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5571 unregister_blkdev(MAJOR_NR, "md");
5572 return -1;
5574 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
5575 md_probe, NULL, NULL);
5576 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
5577 md_probe, NULL, NULL);
5579 register_reboot_notifier(&md_notifier);
5580 raid_table_header = register_sysctl_table(raid_root_table, 1);
5582 md_geninit();
5583 return (0);
5587 #ifndef MODULE
5589 /*
5590 * Searches all registered partitions for autorun RAID arrays
5591 * at boot time.
5592 */
5593 static dev_t detected_devices[128];
5594 static int dev_cnt;
5596 void md_autodetect_dev(dev_t dev)
5598 if (dev_cnt >= 0 && dev_cnt < 127)
5599 detected_devices[dev_cnt++] = dev;
5603 static void autostart_arrays(int part)
5605 mdk_rdev_t *rdev;
5606 int i;
5608 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
5610 for (i = 0; i < dev_cnt; i++) {
5611 dev_t dev = detected_devices[i];
5613 rdev = md_import_device(dev,0, 0);
5614 if (IS_ERR(rdev))
5615 continue;
5617 if (test_bit(Faulty, &rdev->flags)) {
5618 MD_BUG();
5619 continue;
5621 list_add(&rdev->same_set, &pending_raid_disks);
5623 dev_cnt = 0;
5625 autorun_devices(part);
5628 #endif
5630 static __exit void md_exit(void)
5632 mddev_t *mddev;
5633 struct list_head *tmp;
5635 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5636 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5638 unregister_blkdev(MAJOR_NR,"md");
5639 unregister_blkdev(mdp_major, "mdp");
5640 unregister_reboot_notifier(&md_notifier);
5641 unregister_sysctl_table(raid_table_header);
5642 remove_proc_entry("mdstat", NULL);
5643 ITERATE_MDDEV(mddev,tmp) {
5644 struct gendisk *disk = mddev->gendisk;
5645 if (!disk)
5646 continue;
5647 export_array(mddev);
5648 del_gendisk(disk);
5649 put_disk(disk);
5650 mddev->gendisk = NULL;
5651 mddev_put(mddev);
5655 module_init(md_init)
5656 module_exit(md_exit)
5658 static int get_ro(char *buffer, struct kernel_param *kp)
5660 return sprintf(buffer, "%d", start_readonly);
5662 static int set_ro(const char *val, struct kernel_param *kp)
5664 char *e;
5665 int num = simple_strtoul(val, &e, 10);
5666 if (*val && (*e == '\0' || *e == '\n')) {
5667 start_readonly = num;
5668 return 0;
5670 return -EINVAL;
5673 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
5674 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
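/* The two parameters above are also writable at runtime, e.g. by
 * writing 1 to /sys/module/md_mod/parameters/start_ro (path assumes md
 * is built as the md_mod module), which makes subsequently assembled
 * arrays start in auto-read-only mode (mddev->ro == 2) until the first
 * write arrives.
 */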
5677 EXPORT_SYMBOL(register_md_personality);
5678 EXPORT_SYMBOL(unregister_md_personality);
5679 EXPORT_SYMBOL(md_error);
5680 EXPORT_SYMBOL(md_done_sync);
5681 EXPORT_SYMBOL(md_write_start);
5682 EXPORT_SYMBOL(md_write_end);
5683 EXPORT_SYMBOL(md_register_thread);
5684 EXPORT_SYMBOL(md_unregister_thread);
5685 EXPORT_SYMBOL(md_wakeup_thread);
5686 EXPORT_SYMBOL(md_check_recovery);
5687 MODULE_LICENSE("GPL");
5688 MODULE_ALIAS("md");
5689 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);