ia64/linux-2.6.18-xen.hg

view drivers/md/multipath.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 3e8752eb6d9c
children
line source
1 /*
2 * multipath.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * MULTIPATH management functions.
9 *
10 * derived from raid1.c.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2, or (at your option)
15 * any later version.
16 *
17 * You should have received a copy of the GNU General Public License
18 * (for example /usr/src/linux/COPYING); if not, write to the Free
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
22 #include <linux/module.h>
23 #include <linux/slab.h>
24 #include <linux/spinlock.h>
25 #include <linux/raid/multipath.h>
26 #include <linux/buffer_head.h>
27 #include <asm/atomic.h>
/*
 * Build-time constants for the multipath personality.
 *
 * MAJOR_NR, MD_DRIVER, MD_PERSONALITY and MAX_WORK_PER_DISK are not
 * referenced anywhere else in the visible part of this file.
 * NOTE(review): confirm whether anything still depends on them.
 */
29 #define MAJOR_NR MD_MAJOR
30 #define MD_DRIVER
31 #define MD_PERSONALITY
33 #define MAX_WORK_PER_DISK 128
/*
 * Element count handed to mempool_create_kzalloc_pool() when
 * multipath_run() creates the pool of struct multipath_bh.
 */
35 #define NR_RESERVED_BUFS 32
38 static int multipath_map (multipath_conf_t *conf)
39 {
40 int i, disks = conf->raid_disks;
42 /*
43 * Later we do read balancing on the read side
44 * now we use the first available disk.
45 */
47 rcu_read_lock();
48 for (i = 0; i < disks; i++) {
49 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
50 if (rdev && test_bit(In_sync, &rdev->flags)) {
51 atomic_inc(&rdev->nr_pending);
52 rcu_read_unlock();
53 return i;
54 }
55 }
56 rcu_read_unlock();
58 printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
59 return (-1);
60 }
62 static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
63 {
64 unsigned long flags;
65 mddev_t *mddev = mp_bh->mddev;
66 multipath_conf_t *conf = mddev_to_conf(mddev);
68 spin_lock_irqsave(&conf->device_lock, flags);
69 list_add(&mp_bh->retry_list, &conf->retry_list);
70 spin_unlock_irqrestore(&conf->device_lock, flags);
71 md_wakeup_thread(mddev->thread);
72 }
75 /*
76 * multipath_end_bh_io() is called when we have finished servicing a multipathed
77 * operation and are ready to return a success/failure code to the buffer
78 * cache layer.
79 */
80 static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
81 {
82 struct bio *bio = mp_bh->master_bio;
83 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
85 bio_endio(bio, bio->bi_size, err);
86 mempool_free(mp_bh, conf->pool);
87 }
89 static int multipath_end_request(struct bio *bio, unsigned int bytes_done,
90 int error)
91 {
92 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
93 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
94 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
95 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
97 if (bio->bi_size)
98 return 1;
100 if (uptodate)
101 multipath_end_bh_io(mp_bh, 0);
102 else if (!bio_rw_ahead(bio)) {
103 /*
104 * oops, IO error:
105 */
106 char b[BDEVNAME_SIZE];
107 md_error (mp_bh->mddev, rdev);
108 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
109 bdevname(rdev->bdev,b),
110 (unsigned long long)bio->bi_sector);
111 multipath_reschedule_retry(mp_bh);
112 } else
113 multipath_end_bh_io(mp_bh, error);
114 rdev_dec_pending(rdev, conf->mddev);
115 return 0;
116 }
118 static void unplug_slaves(mddev_t *mddev)
119 {
120 multipath_conf_t *conf = mddev_to_conf(mddev);
121 int i;
123 rcu_read_lock();
124 for (i=0; i<mddev->raid_disks; i++) {
125 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
126 if (rdev && !test_bit(Faulty, &rdev->flags)
127 && atomic_read(&rdev->nr_pending)) {
128 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
130 atomic_inc(&rdev->nr_pending);
131 rcu_read_unlock();
133 if (r_queue->unplug_fn)
134 r_queue->unplug_fn(r_queue);
136 rdev_dec_pending(rdev, mddev);
137 rcu_read_lock();
138 }
139 }
140 rcu_read_unlock();
141 }
143 static void multipath_unplug(request_queue_t *q)
144 {
145 unplug_slaves(q->queuedata);
146 }
149 static int multipath_make_request (request_queue_t *q, struct bio * bio)
150 {
151 mddev_t *mddev = q->queuedata;
152 multipath_conf_t *conf = mddev_to_conf(mddev);
153 struct multipath_bh * mp_bh;
154 struct multipath_info *multipath;
155 const int rw = bio_data_dir(bio);
157 if (unlikely(bio_barrier(bio))) {
158 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
159 return 0;
160 }
162 mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
164 mp_bh->master_bio = bio;
165 mp_bh->mddev = mddev;
167 disk_stat_inc(mddev->gendisk, ios[rw]);
168 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
170 mp_bh->path = multipath_map(conf);
171 if (mp_bh->path < 0) {
172 bio_endio(bio, bio->bi_size, -EIO);
173 mempool_free(mp_bh, conf->pool);
174 return 0;
175 }
176 multipath = conf->multipaths + mp_bh->path;
178 mp_bh->bio = *bio;
179 mp_bh->bio.bi_sector += multipath->rdev->data_offset;
180 mp_bh->bio.bi_bdev = multipath->rdev->bdev;
181 mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST);
182 mp_bh->bio.bi_end_io = multipath_end_request;
183 mp_bh->bio.bi_private = mp_bh;
184 generic_make_request(&mp_bh->bio);
185 return 0;
186 }
188 static void multipath_status (struct seq_file *seq, mddev_t *mddev)
189 {
190 multipath_conf_t *conf = mddev_to_conf(mddev);
191 int i;
193 seq_printf (seq, " [%d/%d] [", conf->raid_disks,
194 conf->working_disks);
195 for (i = 0; i < conf->raid_disks; i++)
196 seq_printf (seq, "%s",
197 conf->multipaths[i].rdev &&
198 test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
199 seq_printf (seq, "]");
200 }
202 static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
203 sector_t *error_sector)
204 {
205 mddev_t *mddev = q->queuedata;
206 multipath_conf_t *conf = mddev_to_conf(mddev);
207 int i, ret = 0;
209 rcu_read_lock();
210 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
211 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
212 if (rdev && !test_bit(Faulty, &rdev->flags)) {
213 struct block_device *bdev = rdev->bdev;
214 request_queue_t *r_queue = bdev_get_queue(bdev);
216 if (!r_queue->issue_flush_fn)
217 ret = -EOPNOTSUPP;
218 else {
219 atomic_inc(&rdev->nr_pending);
220 rcu_read_unlock();
221 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
222 error_sector);
223 rdev_dec_pending(rdev, mddev);
224 rcu_read_lock();
225 }
226 }
227 }
228 rcu_read_unlock();
229 return ret;
230 }
232 /*
233 * Careful, this can execute in IRQ contexts as well!
234 */
235 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
236 {
237 multipath_conf_t *conf = mddev_to_conf(mddev);
239 if (conf->working_disks <= 1) {
240 /*
241 * Uh oh, we can do nothing if this is our last path, but
242 * first check if this is a queued request for a device
243 * which has just failed.
244 */
245 printk(KERN_ALERT
246 "multipath: only one IO path left and IO error.\n");
247 /* leave it active... it's all we have */
248 } else {
249 /*
250 * Mark disk as unusable
251 */
252 if (!test_bit(Faulty, &rdev->flags)) {
253 char b[BDEVNAME_SIZE];
254 clear_bit(In_sync, &rdev->flags);
255 set_bit(Faulty, &rdev->flags);
256 mddev->sb_dirty = 1;
257 conf->working_disks--;
258 printk(KERN_ALERT "multipath: IO failure on %s,"
259 " disabling IO path. \n Operation continuing"
260 " on %d IO paths.\n",
261 bdevname (rdev->bdev,b),
262 conf->working_disks);
263 }
264 }
265 }
267 static void print_multipath_conf (multipath_conf_t *conf)
268 {
269 int i;
270 struct multipath_info *tmp;
272 printk("MULTIPATH conf printout:\n");
273 if (!conf) {
274 printk("(conf==NULL)\n");
275 return;
276 }
277 printk(" --- wd:%d rd:%d\n", conf->working_disks,
278 conf->raid_disks);
280 for (i = 0; i < conf->raid_disks; i++) {
281 char b[BDEVNAME_SIZE];
282 tmp = conf->multipaths + i;
283 if (tmp->rdev)
284 printk(" disk%d, o:%d, dev:%s\n",
285 i,!test_bit(Faulty, &tmp->rdev->flags),
286 bdevname(tmp->rdev->bdev,b));
287 }
288 }
291 static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
292 {
293 multipath_conf_t *conf = mddev->private;
294 struct request_queue *q;
295 int found = 0;
296 int path;
297 struct multipath_info *p;
299 print_multipath_conf(conf);
301 for (path=0; path<mddev->raid_disks; path++)
302 if ((p=conf->multipaths+path)->rdev == NULL) {
303 q = rdev->bdev->bd_disk->queue;
304 blk_queue_stack_limits(mddev->queue, q);
306 /* as we don't honour merge_bvec_fn, we must never risk
307 * violating it, so limit ->max_sector to one PAGE, as
308 * a one page request is never in violation.
309 * (Note: it is very unlikely that a device with
310 * merge_bvec_fn will be involved in multipath.)
311 */
312 if (q->merge_bvec_fn &&
313 mddev->queue->max_sectors > (PAGE_SIZE>>9))
314 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
316 conf->working_disks++;
317 rdev->raid_disk = path;
318 set_bit(In_sync, &rdev->flags);
319 rcu_assign_pointer(p->rdev, rdev);
320 found = 1;
321 }
323 print_multipath_conf(conf);
324 return found;
325 }
327 static int multipath_remove_disk(mddev_t *mddev, int number)
328 {
329 multipath_conf_t *conf = mddev->private;
330 int err = 0;
331 mdk_rdev_t *rdev;
332 struct multipath_info *p = conf->multipaths + number;
334 print_multipath_conf(conf);
336 rdev = p->rdev;
337 if (rdev) {
338 if (test_bit(In_sync, &rdev->flags) ||
339 atomic_read(&rdev->nr_pending)) {
340 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number);
341 err = -EBUSY;
342 goto abort;
343 }
344 p->rdev = NULL;
345 synchronize_rcu();
346 if (atomic_read(&rdev->nr_pending)) {
347 /* lost the race, try later */
348 err = -EBUSY;
349 p->rdev = rdev;
350 }
351 }
352 abort:
354 print_multipath_conf(conf);
355 return err;
356 }
360 /*
361 * This is a kernel thread which:
362 *
363 * 1. Retries failed read operations on working multipaths.
364 * 2. Updates the raid superblock when problems encounter.
365 * 3. Performs writes following reads for array syncronising.
366 */
368 static void multipathd (mddev_t *mddev)
369 {
370 struct multipath_bh *mp_bh;
371 struct bio *bio;
372 unsigned long flags;
373 multipath_conf_t *conf = mddev_to_conf(mddev);
374 struct list_head *head = &conf->retry_list;
376 md_check_recovery(mddev);
377 for (;;) {
378 char b[BDEVNAME_SIZE];
379 spin_lock_irqsave(&conf->device_lock, flags);
380 if (list_empty(head))
381 break;
382 mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
383 list_del(head->prev);
384 spin_unlock_irqrestore(&conf->device_lock, flags);
386 bio = &mp_bh->bio;
387 bio->bi_sector = mp_bh->master_bio->bi_sector;
389 if ((mp_bh->path = multipath_map (conf))<0) {
390 printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
391 " error for block %llu\n",
392 bdevname(bio->bi_bdev,b),
393 (unsigned long long)bio->bi_sector);
394 multipath_end_bh_io(mp_bh, -EIO);
395 } else {
396 printk(KERN_ERR "multipath: %s: redirecting sector %llu"
397 " to another IO path\n",
398 bdevname(bio->bi_bdev,b),
399 (unsigned long long)bio->bi_sector);
400 *bio = *(mp_bh->master_bio);
401 bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
402 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
403 bio->bi_rw |= (1 << BIO_RW_FAILFAST);
404 bio->bi_end_io = multipath_end_request;
405 bio->bi_private = mp_bh;
406 generic_make_request(bio);
407 }
408 }
409 spin_unlock_irqrestore(&conf->device_lock, flags);
410 }
412 static int multipath_run (mddev_t *mddev)
413 {
414 multipath_conf_t *conf;
415 int disk_idx;
416 struct multipath_info *disk;
417 mdk_rdev_t *rdev;
418 struct list_head *tmp;
420 if (mddev->level != LEVEL_MULTIPATH) {
421 printk("multipath: %s: raid level not set to multipath IO (%d)\n",
422 mdname(mddev), mddev->level);
423 goto out;
424 }
425 /*
426 * copy the already verified devices into our private MULTIPATH
427 * bookkeeping area. [whatever we allocate in multipath_run(),
428 * should be freed in multipath_stop()]
429 */
431 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
432 mddev->private = conf;
433 if (!conf) {
434 printk(KERN_ERR
435 "multipath: couldn't allocate memory for %s\n",
436 mdname(mddev));
437 goto out;
438 }
440 conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
441 GFP_KERNEL);
442 if (!conf->multipaths) {
443 printk(KERN_ERR
444 "multipath: couldn't allocate memory for %s\n",
445 mdname(mddev));
446 goto out_free_conf;
447 }
449 conf->working_disks = 0;
450 ITERATE_RDEV(mddev,rdev,tmp) {
451 disk_idx = rdev->raid_disk;
452 if (disk_idx < 0 ||
453 disk_idx >= mddev->raid_disks)
454 continue;
456 disk = conf->multipaths + disk_idx;
457 disk->rdev = rdev;
459 blk_queue_stack_limits(mddev->queue,
460 rdev->bdev->bd_disk->queue);
461 /* as we don't honour merge_bvec_fn, we must never risk
462 * violating it, not that we ever expect a device with
463 * a merge_bvec_fn to be involved in multipath */
464 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
465 mddev->queue->max_sectors > (PAGE_SIZE>>9))
466 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
468 if (!test_bit(Faulty, &rdev->flags))
469 conf->working_disks++;
470 }
472 conf->raid_disks = mddev->raid_disks;
473 mddev->sb_dirty = 1;
474 conf->mddev = mddev;
475 spin_lock_init(&conf->device_lock);
476 INIT_LIST_HEAD(&conf->retry_list);
478 if (!conf->working_disks) {
479 printk(KERN_ERR "multipath: no operational IO paths for %s\n",
480 mdname(mddev));
481 goto out_free_conf;
482 }
483 mddev->degraded = conf->raid_disks - conf->working_disks;
485 conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
486 sizeof(struct multipath_bh));
487 if (conf->pool == NULL) {
488 printk(KERN_ERR
489 "multipath: couldn't allocate memory for %s\n",
490 mdname(mddev));
491 goto out_free_conf;
492 }
494 {
495 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
496 if (!mddev->thread) {
497 printk(KERN_ERR "multipath: couldn't allocate thread"
498 " for %s\n", mdname(mddev));
499 goto out_free_conf;
500 }
501 }
503 printk(KERN_INFO
504 "multipath: array %s active with %d out of %d IO paths\n",
505 mdname(mddev), conf->working_disks, mddev->raid_disks);
506 /*
507 * Ok, everything is just fine now
508 */
509 mddev->array_size = mddev->size;
511 mddev->queue->unplug_fn = multipath_unplug;
512 mddev->queue->issue_flush_fn = multipath_issue_flush;
514 return 0;
516 out_free_conf:
517 if (conf->pool)
518 mempool_destroy(conf->pool);
519 kfree(conf->multipaths);
520 kfree(conf);
521 mddev->private = NULL;
522 out:
523 return -EIO;
524 }
527 static int multipath_stop (mddev_t *mddev)
528 {
529 multipath_conf_t *conf = mddev_to_conf(mddev);
531 md_unregister_thread(mddev->thread);
532 mddev->thread = NULL;
533 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
534 mempool_destroy(conf->pool);
535 kfree(conf->multipaths);
536 kfree(conf);
537 mddev->private = NULL;
538 return 0;
539 }
541 static struct mdk_personality multipath_personality =
542 {
543 .name = "multipath",
544 .level = LEVEL_MULTIPATH,
545 .owner = THIS_MODULE,
546 .make_request = multipath_make_request,
547 .run = multipath_run,
548 .stop = multipath_stop,
549 .status = multipath_status,
550 .error_handler = multipath_error,
551 .hot_add_disk = multipath_add_disk,
552 .hot_remove_disk= multipath_remove_disk,
553 };
555 static int __init multipath_init (void)
556 {
557 return register_md_personality (&multipath_personality);
558 }
560 static void __exit multipath_exit (void)
561 {
562 unregister_md_personality (&multipath_personality);
563 }
/* Hook multipath_init()/multipath_exit() up as the module's load and unload handlers. */
565 module_init(multipath_init);
566 module_exit(multipath_exit);
567 MODULE_LICENSE("GPL");
/*
 * Alternative names under which this module can be requested.
 * NOTE(review): "md-level--4" presumably encodes the numeric value of
 * LEVEL_MULTIPATH (-4) - confirm against the md headers.
 */
568 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
569 MODULE_ALIAS("md-multipath");
570 MODULE_ALIAS("md-level--4");