ia64/linux-2.6.18-xen.hg

view drivers/md/linear.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation, and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, so there is temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), then we may as well keep
those pages rather than return them to Xen.
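
As a concrete illustration, the retry behaviour described above can be
sketched as follows. This is a minimal sketch only, not the patch
itself: increase_reservation() and schedule_balloon_retry() are
hypothetical stand-ins for the driver's real helpers, and
current_pages/target_pages stand for its page accounting.

/* Sketch only. increase_reservation() is assumed to ask Xen to
 * populate 'nr' extra pages and return how many were actually
 * granted; schedule_balloon_retry() is assumed to re-arm a timer so
 * this function runs again later. */
extern unsigned long increase_reservation(unsigned long nr);
extern void schedule_balloon_retry(void);

static unsigned long current_pages, target_pages;

static void balloon_process(void)
{
        if (current_pages < target_pages)
                /* Keep whatever we were given, even on partial
                 * success, rather than returning it to Xen. */
                current_pages +=
                        increase_reservation(target_pages - current_pages);

        /* No "hard limit": if we are still short of the target
         * (e.g. transient memory pressure in the host), retry on a
         * timer, just as when decreasing the reservation. */
        if (current_pages != target_pages)
                schedule_balloon_retry();
}

The point of the timer is that a failed allocation is treated as
transient rather than permanent, so the driver converges on the target
once other guests have finished ballooning down.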

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
/*
   linear.c : Multiple Devices driver for Linux
              Copyright (C) 1994-96 Marc ZYNGIER
              <zyngier@ufr-info-p7.ibp.fr> or
              <maz@gloups.fdn.fr>

   Linear mode management functions.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>

#include <linux/raid/md.h>
#include <linux/slab.h>
#include <linux/raid/linear.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
#define MD_PERSONALITY

/*
 * find which device holds a particular offset
 */
static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
{
        dev_info_t *hash;
        linear_conf_t *conf = mddev_to_conf(mddev);
        sector_t block = sector >> 1;

        /*
         * sector_div(a,b) returns the remainder and sets a to a/b
         */
        block >>= conf->preshift;
        (void)sector_div(block, conf->hash_spacing);
        hash = conf->hash_table[block];

        while ((sector>>1) >= (hash->size + hash->offset))
                hash++;
        return hash;
}

/**
 * linear_mergeable_bvec -- tell bio layer if two requests can be merged
 * @q: request queue
 * @bio: the buffer head that's been built up so far
 * @biovec: the request that could be merged to it.
 *
 * Return amount of bytes we can take at this offset
 */
static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
{
        mddev_t *mddev = q->queuedata;
        dev_info_t *dev0;
        unsigned long maxsectors, bio_sectors = bio->bi_size >> 9;
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);

        dev0 = which_dev(mddev, sector);
        maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1));

        if (maxsectors < bio_sectors)
                maxsectors = 0;
        else
                maxsectors -= bio_sectors;

        if (maxsectors <= (PAGE_SIZE >> 9) && bio_sectors == 0)
                return biovec->bv_len;
        /* The bytes available at this offset could be really big,
         * so we cap at 2^31 to avoid overflow */
        if (maxsectors > (1 << (31-9)))
                return 1<<31;
        return maxsectors << 9;
}

static void linear_unplug(request_queue_t *q)
{
        mddev_t *mddev = q->queuedata;
        linear_conf_t *conf = mddev_to_conf(mddev);
        int i;

        for (i=0; i < mddev->raid_disks; i++) {
                request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
                if (r_queue->unplug_fn)
                        r_queue->unplug_fn(r_queue);
        }
}

static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
                              sector_t *error_sector)
{
        mddev_t *mddev = q->queuedata;
        linear_conf_t *conf = mddev_to_conf(mddev);
        int i, ret = 0;

        for (i=0; i < mddev->raid_disks && ret == 0; i++) {
                struct block_device *bdev = conf->disks[i].rdev->bdev;
                request_queue_t *r_queue = bdev_get_queue(bdev);

                if (!r_queue->issue_flush_fn)
                        ret = -EOPNOTSUPP;
                else
                        ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
        }
        return ret;
}

static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
        linear_conf_t *conf;
        dev_info_t **table;
        mdk_rdev_t *rdev;
        int i, nb_zone, cnt;
        sector_t min_spacing;
        sector_t curr_offset;
        struct list_head *tmp;

        conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
                        GFP_KERNEL);
        if (!conf)
                return NULL;

        mddev->private = conf;

        cnt = 0;
        conf->array_size = 0;

        ITERATE_RDEV(mddev,rdev,tmp) {
                int j = rdev->raid_disk;
                dev_info_t *disk = conf->disks + j;

                if (j < 0 || j >= raid_disks || disk->rdev) {
                        printk("linear: disk numbering problem. Aborting!\n");
                        goto out;
                }

                disk->rdev = rdev;

                blk_queue_stack_limits(mddev->queue,
                                       rdev->bdev->bd_disk->queue);
                /* as we don't honour merge_bvec_fn, we must never risk
                 * violating it, so limit ->max_sectors to one PAGE, as
                 * a one page request is never in violation.
                 */
                if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
                    mddev->queue->max_sectors > (PAGE_SIZE>>9))
                        blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

                disk->size = rdev->size;
                conf->array_size += rdev->size;

                cnt++;
        }
        if (cnt != raid_disks) {
                printk("linear: not enough drives present. Aborting!\n");
                goto out;
        }

        min_spacing = conf->array_size;
        sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));

        /* min_spacing is the minimum spacing that will fit the hash
         * table in one PAGE. This may be much smaller than needed.
         * We find the smallest non-terminal set of consecutive devices
         * that is larger than min_spacing and use the size of that as
         * the actual spacing.
         */
        conf->hash_spacing = conf->array_size;
        for (i=0; i < cnt-1 ; i++) {
                sector_t sz = 0;
                int j;
                for (j=i; j<cnt-1 && sz < min_spacing ; j++)
                        sz += conf->disks[j].size;
                if (sz >= min_spacing && sz < conf->hash_spacing)
                        conf->hash_spacing = sz;
        }

        /* hash_spacing may be too large for sector_div to work with,
         * so we might need to pre-shift
         */
        conf->preshift = 0;
        if (sizeof(sector_t) > sizeof(u32)) {
                sector_t space = conf->hash_spacing;
                while (space > (sector_t)(~(u32)0)) {
                        space >>= 1;
                        conf->preshift++;
                }
        }
        /*
         * This code was restructured to work around a gcc-2.95.3 internal
         * compiler error. Alter it with care.
         */
        {
                sector_t sz;
                unsigned round;
                unsigned long base;

                sz = conf->array_size >> conf->preshift;
                sz += 1; /* force round-up */
                base = conf->hash_spacing >> conf->preshift;
                round = sector_div(sz, base);
                nb_zone = sz + (round ? 1 : 0);
        }
        BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));

        conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
                                    GFP_KERNEL);
        if (!conf->hash_table)
                goto out;

        /*
         * Here we generate the linear hash table
         * First calculate the device offsets.
         */
        conf->disks[0].offset = 0;
        for (i=1; i<mddev->raid_disks; i++)
                conf->disks[i].offset =
                        conf->disks[i-1].offset +
                        conf->disks[i-1].size;

        table = conf->hash_table;
        curr_offset = 0;
        i = 0;
        for (curr_offset = 0;
             curr_offset < conf->array_size;
             curr_offset += conf->hash_spacing) {

                while (i < mddev->raid_disks-1 &&
                       curr_offset >= conf->disks[i+1].offset)
                        i++;

                *table++ = conf->disks + i;
        }

        if (conf->preshift) {
                conf->hash_spacing >>= conf->preshift;
                /* round hash_spacing up so that when we divide by it,
                 * we err on the side of "too-low", which is safest.
                 */
                conf->hash_spacing++;
        }

        BUG_ON(table - conf->hash_table > nb_zone);

        return conf;

out:
        kfree(conf);
        return NULL;
}

static int linear_run (mddev_t *mddev)
{
        linear_conf_t *conf;

        conf = linear_conf(mddev, mddev->raid_disks);

        if (!conf)
                return 1;
        mddev->private = conf;
        mddev->array_size = conf->array_size;

        blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
        mddev->queue->unplug_fn = linear_unplug;
        mddev->queue->issue_flush_fn = linear_issue_flush;
        return 0;
}

static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
{
        /* Adding a drive to a linear array allows the array to grow.
         * It is permitted if the new drive has a matching superblock
         * already on it, with raid_disk equal to raid_disks.
         * It is achieved by creating a new linear_private_data structure
         * and swapping it in, in place of the current one.
         * The current one is never freed until the array is stopped.
         * This avoids races.
         */
        linear_conf_t *newconf;

        if (rdev->raid_disk != mddev->raid_disks)
                return -EINVAL;

        newconf = linear_conf(mddev,mddev->raid_disks+1);

        if (!newconf)
                return -ENOMEM;

        newconf->prev = mddev_to_conf(mddev);
        mddev->private = newconf;
        mddev->raid_disks++;
        mddev->array_size = newconf->array_size;
        set_capacity(mddev->gendisk, mddev->array_size << 1);
        return 0;
}

static int linear_stop (mddev_t *mddev)
{
        linear_conf_t *conf = mddev_to_conf(mddev);

        blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
        do {
                linear_conf_t *t = conf->prev;
                kfree(conf->hash_table);
                kfree(conf);
                conf = t;
        } while (conf);

        return 0;
}

static int linear_make_request (request_queue_t *q, struct bio *bio)
{
        const int rw = bio_data_dir(bio);
        mddev_t *mddev = q->queuedata;
        dev_info_t *tmp_dev;
        sector_t block;

        if (unlikely(bio_barrier(bio))) {
                bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
                return 0;
        }

        disk_stat_inc(mddev->gendisk, ios[rw]);
        disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

        tmp_dev = which_dev(mddev, bio->bi_sector);
        block = bio->bi_sector >> 1;

        if (unlikely(block >= (tmp_dev->size + tmp_dev->offset)
                     || block < tmp_dev->offset)) {
                char b[BDEVNAME_SIZE];

                printk("linear_make_request: Block %llu out of bounds on "
                        "dev %s size %llu offset %llu\n",
                        (unsigned long long)block,
                        bdevname(tmp_dev->rdev->bdev, b),
                        (unsigned long long)tmp_dev->size,
                        (unsigned long long)tmp_dev->offset);
                bio_io_error(bio, bio->bi_size);
                return 0;
        }
        if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
                     (tmp_dev->offset + tmp_dev->size)<<1)) {
                /* This bio crosses a device boundary, so we have to
                 * split it.
                 */
                struct bio_pair *bp;
                bp = bio_split(bio, bio_split_pool,
                               ((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector);
                if (linear_make_request(q, &bp->bio1))
                        generic_make_request(&bp->bio1);
                if (linear_make_request(q, &bp->bio2))
                        generic_make_request(&bp->bio2);
                bio_pair_release(bp);
                return 0;
        }

        bio->bi_bdev = tmp_dev->rdev->bdev;
        bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset;

        return 1;
}

static void linear_status (struct seq_file *seq, mddev_t *mddev)
{

#undef MD_DEBUG
#ifdef MD_DEBUG
        int j;
        linear_conf_t *conf = mddev_to_conf(mddev);
        sector_t s = 0;

        seq_printf(seq, " ");
        for (j = 0; j < mddev->raid_disks; j++)
        {
                char b[BDEVNAME_SIZE];
                s += conf->smallest_size;
                seq_printf(seq, "[%s",
                           bdevname(conf->hash_table[j][0].rdev->bdev,b));

                if (s > conf->hash_table[j][0].offset +
                        conf->hash_table[j][0].size)
                        seq_printf(seq, "/%s] ",
                                   bdevname(conf->hash_table[j][1].rdev->bdev,b));
                else
                        seq_printf(seq, "] ");
        }
        seq_printf(seq, "\n");
#endif
        seq_printf(seq, " %dk rounding", mddev->chunk_size/1024);
}

static struct mdk_personality linear_personality =
{
        .name           = "linear",
        .level          = LEVEL_LINEAR,
        .owner          = THIS_MODULE,
        .make_request   = linear_make_request,
        .run            = linear_run,
        .stop           = linear_stop,
        .status         = linear_status,
        .hot_add_disk   = linear_add,
};

static int __init linear_init (void)
{
        return register_md_personality (&linear_personality);
}

static void linear_exit (void)
{
        unregister_md_personality (&linear_personality);
}

module_init(linear_init);
module_exit(linear_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated */
MODULE_ALIAS("md-linear");
MODULE_ALIAS("md-level--1");