ia64/linux-2.6.18-xen.hg

view drivers/md/faulty.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, causing temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for), we may as well keep
those pages rather than return them to Xen.
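
As a rough illustration of the retry behaviour described above (the
names below, such as increase_reservation(), current_target(),
current_pages and balloon_timer, are assumptions used for this sketch
rather than quotations from the patch):

static struct timer_list balloon_timer;	/* assumed to exist in the driver */

/* Sketch only: keep whatever Xen granted and re-arm a timer while short. */
static void balloon_up_sketch(void)
{
	long need = current_target() - current_pages;

	if (need > 0) {
		/* may grant fewer pages than requested under host memory pressure */
		long got = increase_reservation(need);

		if (got > 0)
			current_pages += got;	/* keep a partial grant */

		/* still below target: retry later instead of recording a hard limit */
		if (current_pages < current_target())
			mod_timer(&balloon_timer, jiffies + HZ);
	}
}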

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
line source
/*
 * faulty.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2004 Neil Brown
 *
 * faulty-device-simulator personality for md
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * The "faulty" personality causes some requests to fail.
 *
 * Possible failure modes are:
 *   reads fail "randomly" but succeed on retry
 *   writes fail "randomly" but succeed on retry
 *   reads for some address fail and then persist until a write
 *   reads for some address fail and then persist irrespective of write
 *   writes for some address fail and persist
 *   all writes fail
 *
 * Multiple modes can be active at a time, but only
 * one can be set at array creation.  Others can be added later.
 * A mode can be one-shot or recurrent, with the recurrence being
 * once in every N requests.
 * The bottom 5 bits of the "layout" indicate the mode.  The
 * remainder indicate a period, or 0 for one-shot
 * (see the illustrative sketch after the definitions below).
 *
 * There is an implementation limit on the number of concurrently
 * persisting-faulty blocks.  When a new fault is requested that would
 * exceed the limit, it is ignored.
 * All current faults can be cleared using a layout of "0".
 *
 * Requests are always sent to the device.  If they are to fail,
 * we clone the bio and insert a new b_end_io into the chain.
 */

#define WriteTransient	0
#define ReadTransient	1
#define WritePersistent	2
#define ReadPersistent	3
#define WriteAll	4 /* doesn't go to device */
#define ReadFixable	5
#define Modes		6

#define ClearErrors	31
#define ClearFaults	30

#define AllPersist	100 /* internal use only */
#define NoPersist	101

#define ModeMask	0x1f
#define ModeShift	5

#define MaxFault	50
#include <linux/raid/md.h>
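
/*
 * Illustrative sketch only (not part of the original driver): shows how
 * the "layout" value described in the comment above packs a mode into
 * the low 5 bits and a period into the remaining bits.
 */
#if 0	/* example, never compiled */
static unsigned int example_faulty_layout(unsigned int mode, unsigned int period)
{
	/* period == 0 means one-shot; otherwise fail once every 'period' requests */
	return (period << ModeShift) | (mode & ModeMask);
}
/* e.g. example_faulty_layout(ReadTransient, 10) == 321: one read in ten fails */
#endif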

/*
 * end_io for the cloned bio: complete the original bio with -EIO so the
 * request appears to have failed at the device.
 */
static int faulty_fail(struct bio *bio, unsigned int bytes_done, int error)
{
	struct bio *b = bio->bi_private;

	b->bi_size = bio->bi_size;
	b->bi_sector = bio->bi_sector;

	if (bio->bi_size == 0)
		bio_put(bio);

	clear_bit(BIO_UPTODATE, &b->bi_flags);
	return (b->bi_end_io)(b, bytes_done, -EIO);
}

typedef struct faulty_conf {
	int period[Modes];
	atomic_t counters[Modes];
	sector_t faults[MaxFault];
	int modes[MaxFault];
	int nfaults;
	mdk_rdev_t *rdev;
} conf_t;

/*
 * Decide whether a request should fail for this mode: the counter is
 * decremented per request and, for recurrent modes, re-armed from the
 * configured period when it reaches zero.
 */
static int check_mode(conf_t *conf, int mode)
{
	if (conf->period[mode] == 0 &&
	    atomic_read(&conf->counters[mode]) <= 0)
		return 0; /* no failure, no decrement */

	if (atomic_dec_and_test(&conf->counters[mode])) {
		if (conf->period[mode])
			atomic_set(&conf->counters[mode], conf->period[mode]);
		return 1;
	}
	return 0;
}

static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
{
	/* If we find a ReadFixable sector, we fix it ... */
	int i;
	for (i=0; i<conf->nfaults; i++)
		if (conf->faults[i] >= start &&
		    conf->faults[i] < end) {
			/* found it ... */
			switch (conf->modes[i] * 2 + dir) {
			case WritePersistent*2+WRITE: return 1;
			case ReadPersistent*2+READ: return 1;
			case ReadFixable*2+READ: return 1;
			case ReadFixable*2+WRITE:
				conf->modes[i] = NoPersist;
				return 0;
			case AllPersist*2+READ:
			case AllPersist*2+WRITE: return 1;
			default:
				return 0;
			}
		}
	return 0;
}

/*
 * Record a persistent fault at 'start': merge with any existing entry
 * for that sector, otherwise reuse a free (NoPersist) slot or append,
 * up to the MaxFault limit.
 */
static void add_sector(conf_t *conf, sector_t start, int mode)
{
	int i;
	int n = conf->nfaults;
	for (i=0; i<conf->nfaults; i++)
		if (conf->faults[i] == start) {
			switch(mode) {
			case NoPersist: conf->modes[i] = mode; return;
			case WritePersistent:
				if (conf->modes[i] == ReadPersistent ||
				    conf->modes[i] == ReadFixable)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = WritePersistent;
				return;
			case ReadPersistent:
				if (conf->modes[i] == WritePersistent)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = ReadPersistent;
				return;
			case ReadFixable:
				if (conf->modes[i] == WritePersistent ||
				    conf->modes[i] == ReadPersistent)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = ReadFixable;
				return;
			}
		} else if (conf->modes[i] == NoPersist)
			n = i;

	if (n >= MaxFault)
		return;
	conf->faults[n] = start;
	conf->modes[n] = mode;
	if (conf->nfaults == n)
		conf->nfaults = n+1;
}

static int make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = (conf_t*)mddev->private;
	int failit = 0;

	if (bio->bi_rw & 1) {
		/* write request */
		if (atomic_read(&conf->counters[WriteAll])) {
			/* special case - don't decrement, don't generic_make_request,
			 * just fail immediately
			 */
			bio_endio(bio, bio->bi_size, -EIO);
			return 0;
		}

		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
				 WRITE))
			failit = 1;
		if (check_mode(conf, WritePersistent)) {
			add_sector(conf, bio->bi_sector, WritePersistent);
			failit = 1;
		}
		if (check_mode(conf, WriteTransient))
			failit = 1;
	} else {
		/* read request */
		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
				 READ))
			failit = 1;
		if (check_mode(conf, ReadTransient))
			failit = 1;
		if (check_mode(conf, ReadPersistent)) {
			add_sector(conf, bio->bi_sector, ReadPersistent);
			failit = 1;
		}
		if (check_mode(conf, ReadFixable)) {
			add_sector(conf, bio->bi_sector, ReadFixable);
			failit = 1;
		}
	}
	if (failit) {
		struct bio *b = bio_clone(bio, GFP_NOIO);
		b->bi_bdev = conf->rdev->bdev;
		b->bi_private = bio;
		b->bi_end_io = faulty_fail;
		generic_make_request(b);
		return 0;
	} else {
		bio->bi_bdev = conf->rdev->bdev;
		return 1;
	}
}

static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = (conf_t*)mddev->private;
	int n;

	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
		seq_printf(seq, " WriteTransient=%d(%d)",
			   n, conf->period[WriteTransient]);

	if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
		seq_printf(seq, " ReadTransient=%d(%d)",
			   n, conf->period[ReadTransient]);

	if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
		seq_printf(seq, " WritePersistent=%d(%d)",
			   n, conf->period[WritePersistent]);

	if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
		seq_printf(seq, " ReadPersistent=%d(%d)",
			   n, conf->period[ReadPersistent]);

	if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
		seq_printf(seq, " ReadFixable=%d(%d)",
			   n, conf->period[ReadFixable]);

	if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
		seq_printf(seq, " WriteAll");

	seq_printf(seq, " nfaults=%d", conf->nfaults);
}

static int reconfig(mddev_t *mddev, int layout, int chunk_size)
{
	int mode = layout & ModeMask;
	int count = layout >> ModeShift;
	conf_t *conf = mddev->private;

	if (chunk_size != -1)
		return -EINVAL;

	/* new layout */
	if (mode == ClearFaults)
		conf->nfaults = 0;
	else if (mode == ClearErrors) {
		int i;
		for (i=0 ; i < Modes ; i++) {
			conf->period[i] = 0;
			atomic_set(&conf->counters[i], 0);
		}
	} else if (mode < Modes) {
		conf->period[mode] = count;
		if (!count) count++;
		atomic_set(&conf->counters[mode], count);
	} else
		return -EINVAL;
	mddev->layout = -1; /* makes sure further changes come through */
	return 0;
}

static int run(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int i;

	conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);

	if (!conf)
		return -ENOMEM;

	for (i=0; i<Modes; i++) {
		atomic_set(&conf->counters[i], 0);
		conf->period[i] = 0;
	}
	conf->nfaults = 0;

	ITERATE_RDEV(mddev, rdev, tmp)
		conf->rdev = rdev;

	mddev->array_size = mddev->size;
	mddev->private = conf;

	reconfig(mddev, mddev->layout, -1);

	return 0;
}

static int stop(mddev_t *mddev)
{
	conf_t *conf = (conf_t *)mddev->private;

	kfree(conf);
	mddev->private = NULL;
	return 0;
}

static struct mdk_personality faulty_personality =
{
	.name		= "faulty",
	.level		= LEVEL_FAULTY,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.reconfig	= reconfig,
};

static int __init raid_init(void)
{
	return register_md_personality(&faulty_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&faulty_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-10"); /* faulty */
MODULE_ALIAS("md-faulty");
MODULE_ALIAS("md-level--5");