ia64/linux-2.6.18-xen.hg

drivers/md/dm-io.c @ changeset 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently, if the balloon driver is unable to increase the guest's
reservation, it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However, it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, causing temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to ask
a domain to balloon to more than its allocation, nor would you expect
it to deliberately over-commit memory by setting balloon targets which
exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. we receive fewer pages than we asked for), we may as well keep
those pages rather than return them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
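
As a hedged illustration of the retry behaviour described in the
changeset message above, the sketch below (written against the
2.6.18-era timer API) keeps whatever pages the hypervisor grants and
re-arms a timer while still short of the target, instead of recording
a hard limit. All names here (balloon_target, current_pages,
increase_reservation, balloon_timer, balloon_up) are hypothetical
stand-ins, not the balloon driver's actual symbols.

/*
 * Illustrative sketch only -- not the actual balloon driver code.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

static unsigned long balloon_target;	/* pages the toolstack asked for */
static unsigned long current_pages;	/* pages we currently hold */

static void balloon_retry(unsigned long unused);
static DEFINE_TIMER(balloon_timer, balloon_retry, 0, 0);

/*
 * Stand-in for the hypervisor call: ask for up to nr_pages and return
 * how many were actually granted (possibly fewer under host memory
 * pressure).
 */
static unsigned long increase_reservation(unsigned long nr_pages)
{
	return nr_pages;	/* placeholder */
}

static void balloon_up(void)
{
	unsigned long got = increase_reservation(balloon_target - current_pages);

	/* Keep whatever we were given, even on a partial success. */
	current_pages += got;

	/*
	 * No "hard limit": if we are still short of the target, retry on
	 * a timer rather than giving up until the target is rewritten.
	 */
	if (current_pages < balloon_target)
		mod_timer(&balloon_timer, jiffies + HZ);
}

static void balloon_retry(unsigned long unused)
{
	balloon_up();
}
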
line source
/*
 * Copyright (C) 2003 Sistina Software
 *
 * This file is released under the GPL.
 */

#include "dm-io.h"

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>

static struct bio_set *_bios;

/* FIXME: can we shrink this ? */
struct io {
	unsigned long error;
	atomic_t count;
	struct task_struct *sleeper;
	io_notify_fn callback;
	void *context;
};

/*
 * io contexts are only dynamically allocated for asynchronous
 * io. Since async io is likely to be the majority of io we'll
 * have the same number of io contexts as buffer heads ! (FIXME:
 * must reduce this).
 */
static unsigned _num_ios;
static mempool_t *_io_pool;

static unsigned int pages_to_ios(unsigned int pages)
{
	return 4 * pages;	/* too many ? */
}

static int resize_pool(unsigned int new_ios)
{
	int r = 0;

	if (_io_pool) {
		if (new_ios == 0) {
			/* free off the pool */
			mempool_destroy(_io_pool);
			_io_pool = NULL;
			bioset_free(_bios);

		} else {
			/* resize the pool */
			r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
		}

	} else {
		/* create new pool */
		_io_pool = mempool_create_kmalloc_pool(new_ios,
						       sizeof(struct io));
		if (!_io_pool)
			return -ENOMEM;

		_bios = bioset_create(16, 16, 4);
		if (!_bios) {
			mempool_destroy(_io_pool);
			_io_pool = NULL;
			return -ENOMEM;
		}
	}

	if (!r)
		_num_ios = new_ios;

	return r;
}

int dm_io_get(unsigned int num_pages)
{
	return resize_pool(_num_ios + pages_to_ios(num_pages));
}

void dm_io_put(unsigned int num_pages)
{
	resize_pool(_num_ios - pages_to_ios(num_pages));
}

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this in the last
 * bvec which we know is unused (blech).
 * XXX This is ugly and can OOPS with some configs... find another way.
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
	bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region;
}

static inline unsigned bio_get_region(struct bio *bio)
{
	return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len;
}

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
	if (error)
		set_bit(region, &io->error);

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			int r = io->error;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, _io_pool);
			fn(r, context);
		}
	}
}

static int endio(struct bio *bio, unsigned int done, int error)
{
	struct io *io = (struct io *) bio->bi_private;

	/* keep going until we've finished */
	if (bio->bi_size)
		return 1;

	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	dec_count(io, bio_get_region(bio), error);
	bio_put(bio);

	return 0;
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	void (*next_page)(struct dpages *dp);

	unsigned context_u;
	void *context_ptr;
};

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned *offset)
{
	unsigned o = dp->context_u;
	struct page_list *pl = (struct page_list *) dp->context_ptr;

	*p = pl->page;
	*len = PAGE_SIZE - o;
	*offset = o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = (struct page_list *) dp->context_ptr;
	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	*p = bvec->bv_page;
	*len = bvec->bv_len;
	*offset = bvec->bv_offset;
}

static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	dp->context_ptr = bvec + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->get_page = bvec_get_page;
	dp->next_page = bvec_next_page;
	dp->context_ptr = bvec;
}

static void vm_get_page(struct dpages *dp,
			struct page **p, unsigned long *len, unsigned *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

static void dm_bio_destructor(struct bio *bio)
{
	bio_free(bio, _bios);
}

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned int region, struct io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	while (remaining) {
		/*
		 * Allocate a suitably sized bio, we add an extra
		 * bvec for bio_get/set_region().
		 */
		num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2;
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_private = io;
		bio->bi_destructor = dm_bio_destructor;
		bio_set_region(bio, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	}
}

static void dispatch_io(int rw, unsigned int num_regions,
			struct io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	if (sync)
		rw |= (1 << BIO_RW_SYNC);

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count)
			do_region(rw, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

static int sync_io(unsigned int num_regions, struct io_region *where,
		   int rw, struct dpages *dp, unsigned long *error_bits)
{
	struct io io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		return -EIO;
	}

	io.error = 0;
	atomic_set(&io.count, 1); /* see dispatch_io() */
	io.sleeper = current;

	dispatch_io(rw, num_regions, where, dp, &io, 1);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io.count) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (atomic_read(&io.count))
		return -EINTR;

	*error_bits = io.error;
	return io.error ? -EIO : 0;
}

static int async_io(unsigned int num_regions, struct io_region *where, int rw,
		    struct dpages *dp, io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(_io_pool, GFP_NOIO);
	io->error = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}

int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
	       struct page_list *pl, unsigned int offset,
	       unsigned long *error_bits)
{
	struct dpages dp;
	list_dp_init(&dp, pl, offset);
	return sync_io(num_regions, where, rw, &dp, error_bits);
}

int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
		    struct bio_vec *bvec, unsigned long *error_bits)
{
	struct dpages dp;
	bvec_dp_init(&dp, bvec);
	return sync_io(num_regions, where, rw, &dp, error_bits);
}

int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
		  void *data, unsigned long *error_bits)
{
	struct dpages dp;
	vm_dp_init(&dp, data);
	return sync_io(num_regions, where, rw, &dp, error_bits);
}

int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
		struct page_list *pl, unsigned int offset,
		io_notify_fn fn, void *context)
{
	struct dpages dp;
	list_dp_init(&dp, pl, offset);
	return async_io(num_regions, where, rw, &dp, fn, context);
}

int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
		     struct bio_vec *bvec, io_notify_fn fn, void *context)
{
	struct dpages dp;
	bvec_dp_init(&dp, bvec);
	return async_io(num_regions, where, rw, &dp, fn, context);
}

int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
		   void *data, io_notify_fn fn, void *context)
{
	struct dpages dp;
	vm_dp_init(&dp, data);
	return async_io(num_regions, where, rw, &dp, fn, context);
}

EXPORT_SYMBOL(dm_io_get);
EXPORT_SYMBOL(dm_io_put);
EXPORT_SYMBOL(dm_io_sync);
EXPORT_SYMBOL(dm_io_async);
EXPORT_SYMBOL(dm_io_sync_bvec);
EXPORT_SYMBOL(dm_io_async_bvec);
EXPORT_SYMBOL(dm_io_sync_vm);
EXPORT_SYMBOL(dm_io_async_vm);
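
As a usage sketch of the interface exported above: a hypothetical
caller reserves mempool capacity with dm_io_get(), issues a one-region
synchronous read into a vmalloc'd buffer via dm_io_sync_vm(), and then
releases the reservation. The caller, buffer and block device are
illustrative only, and the io_region layout (bdev, sector, count) is
assumed from the dm-io.h of this tree.

/* Hypothetical caller -- not part of dm-io.c itself. */
#include "dm-io.h"
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>

static int example_read_first_page(struct block_device *bdev)
{
	struct io_region where = {
		.bdev   = bdev,
		.sector = 0,
		.count  = PAGE_SIZE >> 9,	/* one page of 512-byte sectors */
	};
	unsigned long error_bits;
	void *buf;
	int r;

	r = dm_io_get(1);		/* grow the io mempool for 1 page of io */
	if (r)
		return r;

	buf = vmalloc(PAGE_SIZE);
	if (!buf) {
		dm_io_put(1);
		return -ENOMEM;
	}

	r = dm_io_sync_vm(1, &where, READ, buf, &error_bits);
	/* On success buf holds the data; error_bits has a bit set per failed region. */

	vfree(buf);
	dm_io_put(1);
	return r;
}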