ia64/linux-2.6.18-xen.hg

view drivers/block/rd.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * ramdisk.c - Multiple RAM disk driver - gzip-loading version - v. 0.8 beta.
3 *
4 * (C) Chad Page, Theodore Ts'o, et. al, 1995.
5 *
6 * This RAM disk is designed to have filesystems created on it and mounted
7 * just like a regular floppy disk.
8 *
9 * It also does something suggested by Linus: use the buffer cache as the
10 * RAM disk data. This makes it possible to dynamically allocate the RAM disk
11 * buffer - with some consequences I have to deal with as I write this.
12 *
13 * This code is based on the original ramdisk.c, written mostly by
14 * Theodore Ts'o (TYT) in 1991. The code was largely rewritten by
15 * Chad Page to use the buffer cache to store the RAM disk data in
16 * 1995; Theodore then took over the driver again, and cleaned it up
17 * for inclusion in the mainline kernel.
18 *
19 * The original CRAMDISK code was written by Richard Lyons, and
20 * adapted by Chad Page to use the new RAM disk interface. Theodore
21 * Ts'o rewrote it so that both the compressed RAM disk loader and the
22 * kernel decompressor uses the same inflate.c codebase. The RAM disk
23 * loader now also loads into a dynamic (buffer cache based) RAM disk,
24 * not the old static RAM disk. Support for the old static RAM disk has
25 * been completely removed.
26 *
27 * Loadable module support added by Tom Dyas.
28 *
29 * Further cleanups by Chad Page (page0588@sundance.sjsu.edu):
30 * Cosmetic changes in #ifdef MODULE, code movement, etc.
31 * When the RAM disk module is removed, free the protected buffers
32 * Default RAM disk size changed to 2.88 MB
33 *
34 * Added initrd: Werner Almesberger & Hans Lermen, Feb '96
35 *
36 * 4/25/96 : Made RAM disk size a parameter (default is now 4 MB)
37 * - Chad Page
38 *
39 * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98
40 *
41 * Make block size and block size shift for RAM disks a global macro
42 * and set blk_size for -ENOSPC, Werner Fink <werner@suse.de>, Apr '99
43 */
45 #include <linux/string.h>
46 #include <linux/slab.h>
47 #include <asm/atomic.h>
48 #include <linux/bio.h>
49 #include <linux/module.h>
50 #include <linux/moduleparam.h>
51 #include <linux/init.h>
52 #include <linux/pagemap.h>
53 #include <linux/blkdev.h>
54 #include <linux/genhd.h>
55 #include <linux/buffer_head.h> /* for invalidate_bdev() */
56 #include <linux/backing-dev.h>
57 #include <linux/blkpg.h>
58 #include <linux/writeback.h>
60 #include <asm/uaccess.h>
62 /* Various static variables go here. Most are used only in the RAM disk code.
63 */
65 static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT];
66 static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */
67 static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT];
69 /*
70 * Parameters for the boot-loading of the RAM disk. These are set by
71 * init/main.c (from arguments to the kernel command line) or from the
72 * architecture-specific setup routine (from the stored boot sector
73 * information).
74 */
75 int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* Size of the RAM disks */
76 /*
77 * It would be very desirable to have a soft-blocksize (that in the case
78 * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
79 * doing that we'll achieve a far better MM footprint. Using a rd_blocksize of
80 * BLOCK_SIZE in the worst case we'll make PAGE_SIZE/BLOCK_SIZE buffer-pages
81 * unfreeable. With a rd_blocksize of PAGE_SIZE instead we are sure that only
82 * 1 page will be protected. Depending on the size of the ramdisk you
83 * may want to change the ramdisk blocksize to achieve a better or worse MM
84 * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that
85 * supposes the filesystem in the image uses a BLOCK_SIZE blocksize).
86 */
87 static int rd_blocksize = CONFIG_BLK_DEV_RAM_BLOCKSIZE;
89 /*
90 * Copyright (C) 2000 Linus Torvalds.
91 * 2000 Transmeta Corp.
92 * aops copied from ramfs.
93 */
95 /*
96 * If a ramdisk page has buffers, some may be uptodate and some may be not.
97 * To bring the page uptodate we zero out the non-uptodate buffers. The
98 * page must be locked.
99 */
100 static void make_page_uptodate(struct page *page)
101 {
102 if (page_has_buffers(page)) {
103 struct buffer_head *bh = page_buffers(page);
104 struct buffer_head *head = bh;
106 do {
107 if (!buffer_uptodate(bh)) {
108 memset(bh->b_data, 0, bh->b_size);
109 /*
110 * akpm: I'm totally undecided about this. The
111 * buffer has just been magically brought "up to
112 * date", but nobody should want to be reading
113 * it anyway, because it hasn't been used for
114 * anything yet. It is still in a "not read
115 * from disk yet" state.
116 *
117 * But non-uptodate buffers against an uptodate
118 * page are against the rules. So do it anyway.
119 */
120 set_buffer_uptodate(bh);
121 }
122 } while ((bh = bh->b_this_page) != head);
123 } else {
124 memset(page_address(page), 0, PAGE_CACHE_SIZE);
125 }
126 flush_dcache_page(page);
127 SetPageUptodate(page);
128 }
/*
 * ->readpage: there is nothing to read from, so a page that is not yet
 * uptodate is simply zero-filled. The page is unlocked before returning.
 * Always returns 0.
 */
static int ramdisk_readpage(struct file *file, struct page *page)
{
	if (PageUptodate(page)) {
		unlock_page(page);
		return 0;
	}

	make_page_uptodate(page);
	unlock_page(page);
	return 0;
}
/*
 * ->prepare_write: make sure the whole page is uptodate before the caller
 * writes into it. @offset and @to are not used. Always returns 0.
 */
static int ramdisk_prepare_write(struct file *file, struct page *page,
				unsigned offset, unsigned to)
{
	int stale = !PageUptodate(page);

	if (stale)
		make_page_uptodate(page);

	return 0;
}
/*
 * ->commit_write: the data is already in its final place, so all that is
 * left is to mark the page dirty. @offset and @to are not used.
 * Always returns 0.
 */
static int ramdisk_commit_write(struct file *file, struct page *page,
				unsigned offset, unsigned to)
{
	(void)set_page_dirty(page);

	return 0;
}
153 /*
154 * ->writepage to the the blockdev's mapping has to redirty the page so that the
155 * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
156 * won't try to (pointlessly) write the page again for a while.
157 *
158 * Really, these pages should not be on the LRU at all.
159 */
160 static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
161 {
162 if (!PageUptodate(page))
163 make_page_uptodate(page);
164 SetPageDirty(page);
165 if (wbc->for_reclaim)
166 return AOP_WRITEPAGE_ACTIVATE;
167 unlock_page(page);
168 return 0;
169 }
/*
 * ->writepages: a little speedup. The ramdisk blockdev inode has no backing
 * store to write back to, so attempts to do so are short-circuited here:
 * nothing is done and 0 is returned.
 */
static int ramdisk_writepages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	/* Intentionally empty: no backing store. */
	return 0;
}
/*
 * ramdisk blockdev pages get a private ->set_page_dirty() because we don't
 * want them to contribute to dirty memory accounting: only the page flag is
 * set.
 *
 * Returns 1 if the page was clean and has now been marked dirty, 0 if it
 * was already dirty.
 */
static int ramdisk_set_page_dirty(struct page *page)
{
	return TestSetPageDirty(page) ? 0 : 1;
}
192 static const struct address_space_operations ramdisk_aops = {
193 .readpage = ramdisk_readpage,
194 .prepare_write = ramdisk_prepare_write,
195 .commit_write = ramdisk_commit_write,
196 .writepage = ramdisk_writepage,
197 .set_page_dirty = ramdisk_set_page_dirty,
198 .writepages = ramdisk_writepages,
199 };
201 static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
202 struct address_space *mapping)
203 {
204 pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9);
205 unsigned int vec_offset = vec->bv_offset;
206 int offset = (sector << 9) & ~PAGE_CACHE_MASK;
207 int size = vec->bv_len;
208 int err = 0;
210 do {
211 int count;
212 struct page *page;
213 char *src;
214 char *dst;
216 count = PAGE_CACHE_SIZE - offset;
217 if (count > size)
218 count = size;
219 size -= count;
221 page = grab_cache_page(mapping, index);
222 if (!page) {
223 err = -ENOMEM;
224 goto out;
225 }
227 if (!PageUptodate(page))
228 make_page_uptodate(page);
230 index++;
232 if (rw == READ) {
233 src = kmap_atomic(page, KM_USER0) + offset;
234 dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset;
235 } else {
236 src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset;
237 dst = kmap_atomic(page, KM_USER1) + offset;
238 }
239 offset = 0;
240 vec_offset += count;
242 memcpy(dst, src, count);
244 kunmap_atomic(src, KM_USER0);
245 kunmap_atomic(dst, KM_USER1);
247 if (rw == READ)
248 flush_dcache_page(vec->bv_page);
249 else
250 set_page_dirty(page);
251 unlock_page(page);
252 put_page(page);
253 } while (size);
255 out:
256 return err;
257 }
259 /*
260 * Basically, my strategy here is to set up a buffer-head which can't be
261 * deleted, and make that my Ramdisk. If the request is outside of the
262 * allocated size, we must get rid of it...
263 *
264 * 19-JAN-1998 Richard Gooch <rgooch@atnf.csiro.au> Added devfs support
265 *
266 */
267 static int rd_make_request(request_queue_t *q, struct bio *bio)
268 {
269 struct block_device *bdev = bio->bi_bdev;
270 struct address_space * mapping = bdev->bd_inode->i_mapping;
271 sector_t sector = bio->bi_sector;
272 unsigned long len = bio->bi_size >> 9;
273 int rw = bio_data_dir(bio);
274 struct bio_vec *bvec;
275 int ret = 0, i;
277 if (sector + len > get_capacity(bdev->bd_disk))
278 goto fail;
280 if (rw==READA)
281 rw=READ;
283 bio_for_each_segment(bvec, bio, i) {
284 ret |= rd_blkdev_pagecache_IO(rw, bvec, sector, mapping);
285 sector += bvec->bv_len >> 9;
286 }
287 if (ret)
288 goto fail;
290 bio_endio(bio, bio->bi_size, 0);
291 return 0;
292 fail:
293 bio_io_error(bio, bio->bi_size);
294 return 0;
295 }
297 static int rd_ioctl(struct inode *inode, struct file *file,
298 unsigned int cmd, unsigned long arg)
299 {
300 int error;
301 struct block_device *bdev = inode->i_bdev;
303 if (cmd != BLKFLSBUF)
304 return -ENOTTY;
306 /*
307 * special: we want to release the ramdisk memory, it's not like with
308 * the other blockdevices where this ioctl only flushes away the buffer
309 * cache
310 */
311 error = -EBUSY;
312 mutex_lock(&bdev->bd_mutex);
313 if (bdev->bd_openers <= 2) {
314 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
315 error = 0;
316 }
317 mutex_unlock(&bdev->bd_mutex);
318 return error;
319 }
321 /*
322 * This is the backing_dev_info for the blockdev inode itself. It doesn't need
323 * writeback and it does not contribute to dirty memory accounting.
324 */
325 static struct backing_dev_info rd_backing_dev_info = {
326 .ra_pages = 0, /* No readahead */
327 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
328 .unplug_io_fn = default_unplug_io_fn,
329 };
331 /*
332 * This is the backing_dev_info for the files which live atop the ramdisk
333 * "device". These files do need writeback and they do contribute to dirty
334 * memory accounting.
335 */
336 static struct backing_dev_info rd_file_backing_dev_info = {
337 .ra_pages = 0, /* No readahead */
338 .capabilities = BDI_CAP_MAP_COPY, /* Does contribute to dirty memory */
339 .unplug_io_fn = default_unplug_io_fn,
340 };
342 static int rd_open(struct inode *inode, struct file *filp)
343 {
344 unsigned unit = iminor(inode);
346 if (rd_bdev[unit] == NULL) {
347 struct block_device *bdev = inode->i_bdev;
348 struct address_space *mapping;
349 unsigned bsize;
350 gfp_t gfp_mask;
352 inode = igrab(bdev->bd_inode);
353 rd_bdev[unit] = bdev;
354 bdev->bd_openers++;
355 bsize = bdev_hardsect_size(bdev);
356 bdev->bd_block_size = bsize;
357 inode->i_blkbits = blksize_bits(bsize);
358 inode->i_size = get_capacity(bdev->bd_disk)<<9;
360 mapping = inode->i_mapping;
361 mapping->a_ops = &ramdisk_aops;
362 mapping->backing_dev_info = &rd_backing_dev_info;
363 bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info;
365 /*
366 * Deep badness. rd_blkdev_pagecache_IO() needs to allocate
367 * pagecache pages within a request_fn. We cannot recur back
368 * into the filesytem which is mounted atop the ramdisk, because
369 * that would deadlock on fs locks. And we really don't want
370 * to reenter rd_blkdev_pagecache_IO when we're already within
371 * that function.
372 *
373 * So we turn off __GFP_FS and __GFP_IO.
374 *
375 * And to give this thing a hope of working, turn on __GFP_HIGH.
376 * Hopefully, there's enough regular memory allocation going on
377 * for the page allocator emergency pools to keep the ramdisk
378 * driver happy.
379 */
380 gfp_mask = mapping_gfp_mask(mapping);
381 gfp_mask &= ~(__GFP_FS|__GFP_IO);
382 gfp_mask |= __GFP_HIGH;
383 mapping_set_gfp_mask(mapping, gfp_mask);
384 }
386 return 0;
387 }
389 static struct block_device_operations rd_bd_op = {
390 .owner = THIS_MODULE,
391 .open = rd_open,
392 .ioctl = rd_ioctl,
393 };
395 /*
396 * Before freeing the module, invalidate all of the protected buffers!
397 */
398 static void __exit rd_cleanup(void)
399 {
400 int i;
402 for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
403 struct block_device *bdev = rd_bdev[i];
404 rd_bdev[i] = NULL;
405 if (bdev) {
406 invalidate_bdev(bdev, 1);
407 blkdev_put(bdev);
408 }
409 del_gendisk(rd_disks[i]);
410 put_disk(rd_disks[i]);
411 blk_cleanup_queue(rd_queue[i]);
412 }
413 unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
414 }
416 /*
417 * This is the registration and initialization section of the RAM disk driver
418 */
419 static int __init rd_init(void)
420 {
421 int i;
422 int err = -ENOMEM;
424 if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
425 (rd_blocksize & (rd_blocksize-1))) {
426 printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
427 rd_blocksize);
428 rd_blocksize = BLOCK_SIZE;
429 }
431 for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
432 rd_disks[i] = alloc_disk(1);
433 if (!rd_disks[i])
434 goto out;
435 }
437 if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) {
438 err = -EIO;
439 goto out;
440 }
442 for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
443 struct gendisk *disk = rd_disks[i];
445 rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
446 if (!rd_queue[i])
447 goto out_queue;
449 blk_queue_make_request(rd_queue[i], &rd_make_request);
450 blk_queue_hardsect_size(rd_queue[i], rd_blocksize);
452 /* rd_size is given in kB */
453 disk->major = RAMDISK_MAJOR;
454 disk->first_minor = i;
455 disk->fops = &rd_bd_op;
456 disk->queue = rd_queue[i];
457 disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
458 sprintf(disk->disk_name, "ram%d", i);
459 set_capacity(disk, rd_size * 2);
460 add_disk(rd_disks[i]);
461 }
463 /* rd_size is given in kB */
464 printk("RAMDISK driver initialized: "
465 "%d RAM disks of %dK size %d blocksize\n",
466 CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize);
468 return 0;
469 out_queue:
470 unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
471 out:
472 while (i--) {
473 put_disk(rd_disks[i]);
474 blk_cleanup_queue(rd_queue[i]);
475 }
476 return err;
477 }
/* Module entry and exit points. */
module_init(rd_init);
module_exit(rd_cleanup);
482 /* options - nonmodular */
483 #ifndef MODULE
484 static int __init ramdisk_size(char *str)
485 {
486 rd_size = simple_strtol(str,NULL,0);
487 return 1;
488 }
489 static int __init ramdisk_size2(char *str) /* kludge */
490 {
491 return ramdisk_size(str);
492 }
493 static int __init ramdisk_blocksize(char *str)
494 {
495 rd_blocksize = simple_strtol(str,NULL,0);
496 return 1;
497 }
498 __setup("ramdisk=", ramdisk_size);
499 __setup("ramdisk_size=", ramdisk_size2);
500 __setup("ramdisk_blocksize=", ramdisk_blocksize);
501 #endif
503 /* options - modular */
504 module_param(rd_size, int, 0);
505 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
506 module_param(rd_blocksize, int, 0);
507 MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
508 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
510 MODULE_LICENSE("GPL");