ia64/linux-2.6.18-xen.hg

view drivers/md/raid0.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation, and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even if it is set to the same value).

However, ballooning may in fact have failed because of memory pressure
in the host, in which case it is desirable to keep attempting to reach
the target in case memory becomes available. The most likely scenario
is that some guests are ballooning down while others are ballooning up,
causing temporary memory pressure while things stabilise. You would not
expect a well-behaved toolstack to ask a domain to balloon to more than
its allocation, nor would you expect it to deliberately over-commit
memory by setting balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. we receive fewer pages than we asked for), we may as well keep
those pages rather than return them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
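
The patch body is not shown on this page. As a rough, hedged sketch of the
retry-on-timer behaviour described above (the names balloon_timer,
current_pages, target_pages and increase_reservation() are illustrative
assumptions, not necessarily the driver's real identifiers):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    /* Illustrative state -- the real driver keeps equivalents of these. */
    static struct timer_list balloon_timer;
    static unsigned long current_pages, target_pages;

    /* Assumed helper: asks Xen for up to nr_pages extra pages and returns
     * how many were actually granted (possibly fewer under host pressure). */
    static unsigned long increase_reservation(unsigned long nr_pages);

    static void balloon_process(void)
    {
            if (current_pages < target_pages) {
                    /* Keep whatever we were granted, even a partial amount. */
                    current_pages += increase_reservation(target_pages - current_pages);

                    /* No "hard limit": if still short of the target, retry
                     * later in case host memory pressure has eased. */
                    if (current_pages < target_pages)
                            mod_timer(&balloon_timer, jiffies + HZ);
            }
    }
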
author   Keir Fraser <keir.fraser@citrix.com>
date     Fri Jun 05 14:01:20 2009 +0100
parents  831230e53067
children
/*
   raid0.c : Multiple Devices driver for Linux
             Copyright (C) 1994-96 Marc ZYNGIER
             <zyngier@ufr-info-p7.ibp.fr> or
             <maz@gloups.fdn.fr>
             Copyright (C) 1999, 2000 Ingo Molnar, Red Hat

   RAID-0 management functions.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/raid/raid0.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

static void raid0_unplug(request_queue_t *q)
{
        mddev_t *mddev = q->queuedata;
        raid0_conf_t *conf = mddev_to_conf(mddev);
        mdk_rdev_t **devlist = conf->strip_zone[0].dev;
        int i;

        for (i=0; i<mddev->raid_disks; i++) {
                request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev);

                if (r_queue->unplug_fn)
                        r_queue->unplug_fn(r_queue);
        }
}

static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
                             sector_t *error_sector)
{
        mddev_t *mddev = q->queuedata;
        raid0_conf_t *conf = mddev_to_conf(mddev);
        mdk_rdev_t **devlist = conf->strip_zone[0].dev;
        int i, ret = 0;

        for (i=0; i<mddev->raid_disks && ret == 0; i++) {
                struct block_device *bdev = devlist[i]->bdev;
                request_queue_t *r_queue = bdev_get_queue(bdev);

                if (!r_queue->issue_flush_fn)
                        ret = -EOPNOTSUPP;
                else
                        ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
        }
        return ret;
}
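
/*
 * create_strip_zones() groups the member devices into "strip zones":
 * zone 0 stripes across every device up to the size of the smallest
 * one, and each subsequent zone stripes across only the devices that
 * still have capacity left, until the largest devices are exhausted.
 */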
static int create_strip_zones (mddev_t *mddev)
{
        int i, c, j;
        sector_t current_offset, curr_zone_offset;
        sector_t min_spacing;
        raid0_conf_t *conf = mddev_to_conf(mddev);
        mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
        struct list_head *tmp1, *tmp2;
        struct strip_zone *zone;
        int cnt;
        char b[BDEVNAME_SIZE];

        /*
         * The number of 'same size groups'
         */
        conf->nr_strip_zones = 0;

        ITERATE_RDEV(mddev,rdev1,tmp1) {
                printk("raid0: looking at %s\n",
                        bdevname(rdev1->bdev,b));
                c = 0;
                ITERATE_RDEV(mddev,rdev2,tmp2) {
                        printk("raid0: comparing %s(%llu)",
                               bdevname(rdev1->bdev,b),
                               (unsigned long long)rdev1->size);
                        printk(" with %s(%llu)\n",
                               bdevname(rdev2->bdev,b),
                               (unsigned long long)rdev2->size);
                        if (rdev2 == rdev1) {
                                printk("raid0: END\n");
                                break;
                        }
                        if (rdev2->size == rdev1->size)
                        {
                                /*
                                 * Not unique, don't count it as a new
                                 * group
                                 */
                                printk("raid0: EQUAL\n");
                                c = 1;
                                break;
                        }
                        printk("raid0: NOT EQUAL\n");
                }
                if (!c) {
                        printk("raid0: ==> UNIQUE\n");
                        conf->nr_strip_zones++;
                        printk("raid0: %d zones\n", conf->nr_strip_zones);
                }
        }
        printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);

        conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
                                conf->nr_strip_zones, GFP_KERNEL);
        if (!conf->strip_zone)
                return 1;
        conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
                                conf->nr_strip_zones*mddev->raid_disks,
                                GFP_KERNEL);
        if (!conf->devlist)
                return 1;

        /* The first zone must contain all devices, so here we check that
         * there is a proper alignment of slots to devices and find them all
         */
        zone = &conf->strip_zone[0];
        cnt = 0;
        smallest = NULL;
        zone->dev = conf->devlist;
        ITERATE_RDEV(mddev, rdev1, tmp1) {
                int j = rdev1->raid_disk;

                if (j < 0 || j >= mddev->raid_disks) {
                        printk("raid0: bad disk number %d - aborting!\n", j);
                        goto abort;
                }
                if (zone->dev[j]) {
                        printk("raid0: multiple devices for %d - aborting!\n",
                                j);
                        goto abort;
                }
                zone->dev[j] = rdev1;

                blk_queue_stack_limits(mddev->queue,
                                       rdev1->bdev->bd_disk->queue);
                /* as we don't honour merge_bvec_fn, we must never risk
                 * violating it, so limit ->max_sector to one PAGE, as
                 * a one page request is never in violation.
                 */

                if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
                    mddev->queue->max_sectors > (PAGE_SIZE>>9))
                        blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

                if (!smallest || (rdev1->size <smallest->size))
                        smallest = rdev1;
                cnt++;
        }
        if (cnt != mddev->raid_disks) {
                printk("raid0: too few disks (%d of %d) - aborting!\n",
                        cnt, mddev->raid_disks);
                goto abort;
        }
        zone->nb_dev = cnt;
        zone->size = smallest->size * cnt;
        zone->zone_offset = 0;

        current_offset = smallest->size;
        curr_zone_offset = zone->size;

        /* now do the other zones */
        for (i = 1; i < conf->nr_strip_zones; i++)
        {
                zone = conf->strip_zone + i;
                zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;

                printk("raid0: zone %d\n", i);
                zone->dev_offset = current_offset;
                smallest = NULL;
                c = 0;

                for (j=0; j<cnt; j++) {
                        char b[BDEVNAME_SIZE];
                        rdev = conf->strip_zone[0].dev[j];
                        printk("raid0: checking %s ...", bdevname(rdev->bdev,b));
                        if (rdev->size > current_offset)
                        {
                                printk(" contained as device %d\n", c);
                                zone->dev[c] = rdev;
                                c++;
                                if (!smallest || (rdev->size <smallest->size)) {
                                        smallest = rdev;
                                        printk(" (%llu) is smallest!.\n",
                                                (unsigned long long)rdev->size);
                                }
                        } else
                                printk(" nope.\n");
                }

                zone->nb_dev = c;
                zone->size = (smallest->size - current_offset) * c;
                printk("raid0: zone->nb_dev: %d, size: %llu\n",
                        zone->nb_dev, (unsigned long long)zone->size);

                zone->zone_offset = curr_zone_offset;
                curr_zone_offset += zone->size;

                current_offset = smallest->size;
                printk("raid0: current zone offset: %llu\n",
                        (unsigned long long)current_offset);
        }

        /* Now find appropriate hash spacing.
         * We want a number which causes most hash entries to cover
         * at most two strips, but the hash table must be at most
         * 1 PAGE. We choose the smallest strip, or contiguous collection
         * of strips, that has big enough size. We never consider the last
         * strip though as its size has no bearing on the efficacy of the hash
         * table.
         */
        conf->hash_spacing = curr_zone_offset;
        min_spacing = curr_zone_offset;
        sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
        for (i=0; i < conf->nr_strip_zones-1; i++) {
                sector_t sz = 0;
                for (j=i; j<conf->nr_strip_zones-1 &&
                             sz < min_spacing ; j++)
                        sz += conf->strip_zone[j].size;
                if (sz >= min_spacing && sz < conf->hash_spacing)
                        conf->hash_spacing = sz;
        }

        mddev->queue->unplug_fn = raid0_unplug;

        mddev->queue->issue_flush_fn = raid0_issue_flush;

        printk("raid0: done.\n");
        return 0;
 abort:
        return 1;
}

/**
 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
 * @q: request queue
 * @bio: the buffer head that's been built up so far
 * @biovec: the request that could be merged to it.
 *
 * Return amount of bytes we can accept at this offset
 */
static int raid0_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
{
        mddev_t *mddev = q->queuedata;
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
        int max;
        unsigned int chunk_sectors = mddev->chunk_size >> 9;
        unsigned int bio_sectors = bio->bi_size >> 9;
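
        /* Bytes left in the chunk containing 'sector' once the bio built
         * up so far is accounted for; negative means the bio already
         * crosses a chunk boundary. */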
        max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
        if (max < 0) max = 0; /* bio_add cannot handle a negative return */
        if (max <= biovec->bv_len && bio_sectors == 0)
                return biovec->bv_len;
        else
                return max;
}

static int raid0_run (mddev_t *mddev)
{
        unsigned cur=0, i=0, nb_zone;
        s64 size;
        raid0_conf_t *conf;
        mdk_rdev_t *rdev;
        struct list_head *tmp;

        if (mddev->chunk_size == 0) {
                printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
                return -EINVAL;
        }
        printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
               mdname(mddev),
               mddev->chunk_size >> 9,
               (mddev->chunk_size>>1)-1);
        blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
        blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);

        conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
        if (!conf)
                goto out;
        mddev->private = (void *)conf;

        conf->strip_zone = NULL;
        conf->devlist = NULL;
        if (create_strip_zones (mddev))
                goto out_free_conf;

        /* calculate array device size */
        mddev->array_size = 0;
        ITERATE_RDEV(mddev,rdev,tmp)
                mddev->array_size += rdev->size;

        printk("raid0 : md_size is %llu blocks.\n",
                (unsigned long long)mddev->array_size);
        printk("raid0 : conf->hash_spacing is %llu blocks.\n",
                (unsigned long long)conf->hash_spacing);
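        /*
         * Work out how many hash-table slots are needed: essentially
         * nb_zone = ceil(array_size / hash_spacing), with both values
         * shifted down first so the 32-bit sector_div() below is safe.
         */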
        {
                sector_t s = mddev->array_size;
                sector_t space = conf->hash_spacing;
                int round;
                conf->preshift = 0;
                if (sizeof(sector_t) > sizeof(u32)) {
                        /*shift down space and s so that sector_div will work */
                        while (space > (sector_t) (~(u32)0)) {
                                s >>= 1;
                                space >>= 1;
                                s += 1; /* force round-up */
                                conf->preshift++;
                        }
                }
                round = sector_div(s, (u32)space) ? 1 : 0;
                nb_zone = s + round;
        }
        printk("raid0 : nb_zone is %d.\n", nb_zone);

        printk("raid0 : Allocating %Zd bytes for hash.\n",
                                nb_zone*sizeof(struct strip_zone*));
        conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
        if (!conf->hash_table)
                goto out_free_conf;
        size = conf->strip_zone[cur].size;

        conf->hash_table[0] = conf->strip_zone + cur;
        for (i=1; i< nb_zone; i++) {
                while (size <= conf->hash_spacing) {
                        cur++;
                        size += conf->strip_zone[cur].size;
                }
                size -= conf->hash_spacing;
                conf->hash_table[i] = conf->strip_zone + cur;
        }
        if (conf->preshift) {
                conf->hash_spacing >>= conf->preshift;
                /* round hash_spacing up so when we divide by it, we
                 * err on the side of too-low, which is safest
                 */
                conf->hash_spacing++;
        }

        /* calculate the max read-ahead size.
         * For read-ahead of large files to be effective, we need to
         * readahead at least twice a whole stripe. i.e. number of devices
         * multiplied by chunk size times 2.
         * If an individual device has an ra_pages greater than the
         * chunk size, then we will not drive that device as hard as it
         * wants. We consider this a configuration error: a larger
         * chunksize should be used in that case.
         */
        {
                int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
                if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
                        mddev->queue->backing_dev_info.ra_pages = 2* stripe;
        }

        blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
        return 0;

out_free_conf:
        kfree(conf->strip_zone);
        kfree(conf->devlist);
        kfree(conf);
        mddev->private = NULL;
out:
        return -ENOMEM;
}

static int raid0_stop (mddev_t *mddev)
{
        raid0_conf_t *conf = mddev_to_conf(mddev);

        blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
        kfree(conf->hash_table);
        conf->hash_table = NULL;
        kfree(conf->strip_zone);
        conf->strip_zone = NULL;
        kfree(conf);
        mddev->private = NULL;

        return 0;
}

static int raid0_make_request (request_queue_t *q, struct bio *bio)
{
        mddev_t *mddev = q->queuedata;
        unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects;
        raid0_conf_t *conf = mddev_to_conf(mddev);
        struct strip_zone *zone;
        mdk_rdev_t *tmp_dev;
        unsigned long chunk;
        sector_t block, rsect;
        const int rw = bio_data_dir(bio);

        if (unlikely(bio_barrier(bio))) {
                bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
                return 0;
        }

        disk_stat_inc(mddev->gendisk, ios[rw]);
        disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

        chunk_size = mddev->chunk_size >> 10;
        chunk_sects = mddev->chunk_size >> 9;
        chunksize_bits = ffz(~chunk_size);
        block = bio->bi_sector >> 1;

        if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
                if (bio->bi_vcnt != 1 ||
                    bio->bi_idx != 0)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
                 */
                bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
                if (raid0_make_request(q, &bp->bio1))
                        generic_make_request(&bp->bio1);
                if (raid0_make_request(q, &bp->bio2))
                        generic_make_request(&bp->bio2);

                bio_pair_release(bp);
                return 0;
        }
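
        /*
         * Map the array sector onto a member device: hash 'block' to a
         * starting zone and walk forward to the zone that contains it,
         * work out which chunk of that zone the sector lies in and which
         * device holds that chunk, then compute rsect, the corresponding
         * sector on that device.
         */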
        {
                sector_t x = block >> conf->preshift;
                sector_div(x, (u32)conf->hash_spacing);
                zone = conf->hash_table[x];
        }

        while (block >= (zone->zone_offset + zone->size))
                zone++;

        sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1);

        {
                sector_t x = (block - zone->zone_offset) >> chunksize_bits;

                sector_div(x, zone->nb_dev);
                chunk = x;
                BUG_ON(x != (sector_t)chunk);

                x = block >> chunksize_bits;
                tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
        }
        rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
                + sect_in_chunk;

        bio->bi_bdev = tmp_dev->bdev;
        bio->bi_sector = rsect + tmp_dev->data_offset;

        /*
         * Let the main block layer submit the IO and resolve recursion:
         */
        return 1;

bad_map:
        printk("raid0_make_request bug: can't convert block across chunks"
                " or bigger than %dk %llu %d\n", chunk_size,
                (unsigned long long)bio->bi_sector, bio->bi_size >> 10);

        bio_io_error(bio, bio->bi_size);
        return 0;
}

static void raid0_status (struct seq_file *seq, mddev_t *mddev)
{
#undef MD_DEBUG
#ifdef MD_DEBUG
        int j, k, h;
        char b[BDEVNAME_SIZE];
        raid0_conf_t *conf = mddev_to_conf(mddev);

        h = 0;
        for (j = 0; j < conf->nr_strip_zones; j++) {
                seq_printf(seq, " z%d", j);
                if (conf->hash_table[h] == conf->strip_zone+j)
                        seq_printf("(h%d)", h++);
                seq_printf(seq, "=[");
                for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
                        seq_printf (seq, "%s/", bdevname(
                                conf->strip_zone[j].dev[k]->bdev,b));

                seq_printf (seq, "] zo=%d do=%d s=%d\n",
                                conf->strip_zone[j].zone_offset,
                                conf->strip_zone[j].dev_offset,
                                conf->strip_zone[j].size);
        }
#endif
        seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
        return;
}

static struct mdk_personality raid0_personality=
{
        .name           = "raid0",
        .level          = 0,
        .owner          = THIS_MODULE,
        .make_request   = raid0_make_request,
        .run            = raid0_run,
        .stop           = raid0_stop,
        .status         = raid0_status,
};

static int __init raid0_init (void)
{
        return register_md_personality (&raid0_personality);
}

static void raid0_exit (void)
{
        unregister_md_personality (&raid0_personality);
}

module_init(raid0_init);
module_exit(raid0_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-2"); /* RAID0 */
MODULE_ALIAS("md-raid0");
MODULE_ALIAS("md-level-0");