ia64/linux-2.6.18-xen.hg

annotate drivers/mtd/mtdblock.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive less pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
rev   line source
ian@0 1 /*
ian@0 2 * Direct MTD block device access
ian@0 3 *
ian@0 4 * $Id: mtdblock.c,v 1.68 2005/11/07 11:14:20 gleixner Exp $
ian@0 5 *
ian@0 6 * (C) 2000-2003 Nicolas Pitre <nico@cam.org>
ian@0 7 * (C) 1999-2003 David Woodhouse <dwmw2@infradead.org>
ian@0 8 */
ian@0 9
ian@0 10 #include <linux/fs.h>
ian@0 11 #include <linux/init.h>
ian@0 12 #include <linux/kernel.h>
ian@0 13 #include <linux/module.h>
ian@0 14 #include <linux/sched.h>
ian@0 15 #include <linux/slab.h>
ian@0 16 #include <linux/types.h>
ian@0 17 #include <linux/vmalloc.h>
ian@0 18
ian@0 19 #include <linux/mtd/mtd.h>
ian@0 20 #include <linux/mtd/blktrans.h>
ian@0 21 #include <linux/mutex.h>
ian@0 22
ian@0 23
/*
 * Per-device cache state, one entry per MTD device, indexed by mtd->index.
 * The cache holds (at most) one erase block worth of data.
 */
static struct mtdblk_dev {
	struct mtd_info *mtd;		/* underlying MTD device */
	int count;			/* open reference count */
	struct mutex cache_mutex;	/* serialises access to the cache_* fields */
	unsigned char *cache_data;	/* erase-block buffer; vmalloc'd lazily on first write */
	unsigned long cache_offset;	/* flash offset the cached sector came from */
	unsigned int cache_size;	/* mtd->erasesize, or 0 when caching is disabled */
	enum { STATE_EMPTY, STATE_CLEAN, STATE_DIRTY } cache_state;
} *mtdblks[MAX_MTD_DEVICES];
ian@0 33
/*
 * Cache stuff...
 *
 * Since typical flash erasable sectors are much larger than what Linux's
 * buffer cache can handle, we must implement read-modify-write on flash
 * sectors for each block write request. To avoid over-erasing flash sectors
 * and to speed things up, we locally cache a whole flash sector while it is
 * being written to until a different sector is required.
 */
ian@0 43
ian@0 44 static void erase_callback(struct erase_info *done)
ian@0 45 {
ian@0 46 wait_queue_head_t *wait_q = (wait_queue_head_t *)done->priv;
ian@0 47 wake_up(wait_q);
ian@0 48 }
ian@0 49
/*
 * Erase the flash region [pos, pos + len) and then write 'len' bytes from
 * 'buf' into it.  Sleeps until the asynchronous erase completes.
 *
 * Returns 0 on success, a negative errno from the MTD erase/write op, or
 * -EIO if the write was short.
 */
static int erase_write (struct mtd_info *mtd, unsigned long pos,
			int len, const char *buf)
{
	struct erase_info erase;
	DECLARE_WAITQUEUE(wait, current);
	wait_queue_head_t wait_q;
	size_t retlen;
	int ret;

	/*
	 * First, let's erase the flash block.
	 */

	init_waitqueue_head(&wait_q);
	erase.mtd = mtd;
	erase.callback = erase_callback;
	erase.addr = pos;
	erase.len = len;
	/* erase_callback() fetches the wait queue from here to wake us */
	erase.priv = (u_long)&wait_q;

	/*
	 * Queue ourselves on the wait queue before issuing the erase so the
	 * completion wake-up cannot be missed; wake_up() sets us back to
	 * TASK_RUNNING even if it fires before schedule() below.
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(&wait_q, &wait);

	ret = mtd->erase(mtd, &erase);
	if (ret) {
		/* Erase was never queued: undo the wait-queue setup. */
		set_current_state(TASK_RUNNING);
		remove_wait_queue(&wait_q, &wait);
		printk (KERN_WARNING "mtdblock: erase of region [0x%lx, 0x%x] "
				     "on \"%s\" failed\n",
			pos, len, mtd->name);
		return ret;
	}

	schedule();  /* Wait for erase to finish. */
	remove_wait_queue(&wait_q, &wait);

	/*
	 * Next, write the data to flash.
	 */

	ret = mtd->write(mtd, pos, len, &retlen, buf);
	if (ret)
		return ret;
	if (retlen != len)
		return -EIO;	/* short write */
	return 0;
}
ian@0 97
ian@0 98
ian@0 99 static int write_cached_data (struct mtdblk_dev *mtdblk)
ian@0 100 {
ian@0 101 struct mtd_info *mtd = mtdblk->mtd;
ian@0 102 int ret;
ian@0 103
ian@0 104 if (mtdblk->cache_state != STATE_DIRTY)
ian@0 105 return 0;
ian@0 106
ian@0 107 DEBUG(MTD_DEBUG_LEVEL2, "mtdblock: writing cached data for \"%s\" "
ian@0 108 "at 0x%lx, size 0x%x\n", mtd->name,
ian@0 109 mtdblk->cache_offset, mtdblk->cache_size);
ian@0 110
ian@0 111 ret = erase_write (mtd, mtdblk->cache_offset,
ian@0 112 mtdblk->cache_size, mtdblk->cache_data);
ian@0 113 if (ret)
ian@0 114 return ret;
ian@0 115
ian@0 116 /*
ian@0 117 * Here we could argubly set the cache state to STATE_CLEAN.
ian@0 118 * However this could lead to inconsistency since we will not
ian@0 119 * be notified if this content is altered on the flash by other
ian@0 120 * means. Let's declare it empty and leave buffering tasks to
ian@0 121 * the buffer cache instead.
ian@0 122 */
ian@0 123 mtdblk->cache_state = STATE_EMPTY;
ian@0 124 return 0;
ian@0 125 }
ian@0 126
ian@0 127
/*
 * Write 'len' bytes of 'buf' at device offset 'pos', read-modify-writing
 * through the one-erase-block cache.
 *
 * Whole-sector writes bypass the cache and go straight to erase_write();
 * partial-sector writes are merged into the cached copy of the sector
 * (loading it from flash first when needed) and marked dirty for a later
 * write_cached_data().
 *
 * Returns 0 on success, a negative errno from the MTD layer, or -EIO on
 * a short read while filling the cache.
 */
static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos,
			    int len, const char *buf)
{
	struct mtd_info *mtd = mtdblk->mtd;
	unsigned int sect_size = mtdblk->cache_size;
	size_t retlen;
	int ret;

	DEBUG(MTD_DEBUG_LEVEL2, "mtdblock: write on \"%s\" at 0x%lx, size 0x%x\n",
		mtd->name, pos, len);

	/* cache_size == 0 means caching is disabled: write through. */
	if (!sect_size)
		return mtd->write(mtd, pos, len, &retlen, buf);

	while (len > 0) {
		/* Sector containing 'pos', and how much of the request
		 * falls within that sector. */
		unsigned long sect_start = (pos/sect_size)*sect_size;
		unsigned int offset = pos - sect_start;
		unsigned int size = sect_size - offset;
		if( size > len )
			size = len;

		if (size == sect_size) {
			/*
			 * We are covering a whole sector. Thus there is no
			 * need to bother with the cache while it may still be
			 * useful for other partial writes.
			 */
			ret = erase_write (mtd, pos, size, buf);
			if (ret)
				return ret;
		} else {
			/* Partial sector: need to use the cache */

			/* A dirty cache holding a different sector must be
			 * flushed before we can reuse the buffer. */
			if (mtdblk->cache_state == STATE_DIRTY &&
			    mtdblk->cache_offset != sect_start) {
				ret = write_cached_data(mtdblk);
				if (ret)
					return ret;
			}

			if (mtdblk->cache_state == STATE_EMPTY ||
			    mtdblk->cache_offset != sect_start) {
				/* fill the cache with the current sector */
				/* Mark EMPTY first so a failed read does not
				 * leave a stale sector looking valid. */
				mtdblk->cache_state = STATE_EMPTY;
				ret = mtd->read(mtd, sect_start, sect_size,
						&retlen, mtdblk->cache_data);
				if (ret)
					return ret;
				if (retlen != sect_size)
					return -EIO;

				mtdblk->cache_offset = sect_start;
				mtdblk->cache_size = sect_size;
				mtdblk->cache_state = STATE_CLEAN;
			}

			/* write data to our local cache */
			memcpy (mtdblk->cache_data + offset, buf, size);
			mtdblk->cache_state = STATE_DIRTY;
		}

		buf += size;
		pos += size;
		len -= size;
	}

	return 0;
}
ian@0 196
ian@0 197
ian@0 198 static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos,
ian@0 199 int len, char *buf)
ian@0 200 {
ian@0 201 struct mtd_info *mtd = mtdblk->mtd;
ian@0 202 unsigned int sect_size = mtdblk->cache_size;
ian@0 203 size_t retlen;
ian@0 204 int ret;
ian@0 205
ian@0 206 DEBUG(MTD_DEBUG_LEVEL2, "mtdblock: read on \"%s\" at 0x%lx, size 0x%x\n",
ian@0 207 mtd->name, pos, len);
ian@0 208
ian@0 209 if (!sect_size)
ian@0 210 return mtd->read(mtd, pos, len, &retlen, buf);
ian@0 211
ian@0 212 while (len > 0) {
ian@0 213 unsigned long sect_start = (pos/sect_size)*sect_size;
ian@0 214 unsigned int offset = pos - sect_start;
ian@0 215 unsigned int size = sect_size - offset;
ian@0 216 if (size > len)
ian@0 217 size = len;
ian@0 218
ian@0 219 /*
ian@0 220 * Check if the requested data is already cached
ian@0 221 * Read the requested amount of data from our internal cache if it
ian@0 222 * contains what we want, otherwise we read the data directly
ian@0 223 * from flash.
ian@0 224 */
ian@0 225 if (mtdblk->cache_state != STATE_EMPTY &&
ian@0 226 mtdblk->cache_offset == sect_start) {
ian@0 227 memcpy (buf, mtdblk->cache_data + offset, size);
ian@0 228 } else {
ian@0 229 ret = mtd->read(mtd, pos, size, &retlen, buf);
ian@0 230 if (ret)
ian@0 231 return ret;
ian@0 232 if (retlen != size)
ian@0 233 return -EIO;
ian@0 234 }
ian@0 235
ian@0 236 buf += size;
ian@0 237 pos += size;
ian@0 238 len -= size;
ian@0 239 }
ian@0 240
ian@0 241 return 0;
ian@0 242 }
ian@0 243
ian@0 244 static int mtdblock_readsect(struct mtd_blktrans_dev *dev,
ian@0 245 unsigned long block, char *buf)
ian@0 246 {
ian@0 247 struct mtdblk_dev *mtdblk = mtdblks[dev->devnum];
ian@0 248 return do_cached_read(mtdblk, block<<9, 512, buf);
ian@0 249 }
ian@0 250
ian@0 251 static int mtdblock_writesect(struct mtd_blktrans_dev *dev,
ian@0 252 unsigned long block, char *buf)
ian@0 253 {
ian@0 254 struct mtdblk_dev *mtdblk = mtdblks[dev->devnum];
ian@0 255 if (unlikely(!mtdblk->cache_data && mtdblk->cache_size)) {
ian@0 256 mtdblk->cache_data = vmalloc(mtdblk->mtd->erasesize);
ian@0 257 if (!mtdblk->cache_data)
ian@0 258 return -EINTR;
ian@0 259 /* -EINTR is not really correct, but it is the best match
ian@0 260 * documented in man 2 write for all cases. We could also
ian@0 261 * return -EAGAIN sometimes, but why bother?
ian@0 262 */
ian@0 263 }
ian@0 264 return do_cached_write(mtdblk, block<<9, 512, buf);
ian@0 265 }
ian@0 266
ian@0 267 static int mtdblock_open(struct mtd_blktrans_dev *mbd)
ian@0 268 {
ian@0 269 struct mtdblk_dev *mtdblk;
ian@0 270 struct mtd_info *mtd = mbd->mtd;
ian@0 271 int dev = mbd->devnum;
ian@0 272
ian@0 273 DEBUG(MTD_DEBUG_LEVEL1,"mtdblock_open\n");
ian@0 274
ian@0 275 if (mtdblks[dev]) {
ian@0 276 mtdblks[dev]->count++;
ian@0 277 return 0;
ian@0 278 }
ian@0 279
ian@0 280 /* OK, it's not open. Create cache info for it */
ian@0 281 mtdblk = kmalloc(sizeof(struct mtdblk_dev), GFP_KERNEL);
ian@0 282 if (!mtdblk)
ian@0 283 return -ENOMEM;
ian@0 284
ian@0 285 memset(mtdblk, 0, sizeof(*mtdblk));
ian@0 286 mtdblk->count = 1;
ian@0 287 mtdblk->mtd = mtd;
ian@0 288
ian@0 289 mutex_init(&mtdblk->cache_mutex);
ian@0 290 mtdblk->cache_state = STATE_EMPTY;
ian@0 291 if ( !(mtdblk->mtd->flags & MTD_NO_ERASE) && mtdblk->mtd->erasesize) {
ian@0 292 mtdblk->cache_size = mtdblk->mtd->erasesize;
ian@0 293 mtdblk->cache_data = NULL;
ian@0 294 }
ian@0 295
ian@0 296 mtdblks[dev] = mtdblk;
ian@0 297
ian@0 298 DEBUG(MTD_DEBUG_LEVEL1, "ok\n");
ian@0 299
ian@0 300 return 0;
ian@0 301 }
ian@0 302
ian@0 303 static int mtdblock_release(struct mtd_blktrans_dev *mbd)
ian@0 304 {
ian@0 305 int dev = mbd->devnum;
ian@0 306 struct mtdblk_dev *mtdblk = mtdblks[dev];
ian@0 307
ian@0 308 DEBUG(MTD_DEBUG_LEVEL1, "mtdblock_release\n");
ian@0 309
ian@0 310 mutex_lock(&mtdblk->cache_mutex);
ian@0 311 write_cached_data(mtdblk);
ian@0 312 mutex_unlock(&mtdblk->cache_mutex);
ian@0 313
ian@0 314 if (!--mtdblk->count) {
ian@0 315 /* It was the last usage. Free the device */
ian@0 316 mtdblks[dev] = NULL;
ian@0 317 if (mtdblk->mtd->sync)
ian@0 318 mtdblk->mtd->sync(mtdblk->mtd);
ian@0 319 vfree(mtdblk->cache_data);
ian@0 320 kfree(mtdblk);
ian@0 321 }
ian@0 322 DEBUG(MTD_DEBUG_LEVEL1, "ok\n");
ian@0 323
ian@0 324 return 0;
ian@0 325 }
ian@0 326
ian@0 327 static int mtdblock_flush(struct mtd_blktrans_dev *dev)
ian@0 328 {
ian@0 329 struct mtdblk_dev *mtdblk = mtdblks[dev->devnum];
ian@0 330
ian@0 331 mutex_lock(&mtdblk->cache_mutex);
ian@0 332 write_cached_data(mtdblk);
ian@0 333 mutex_unlock(&mtdblk->cache_mutex);
ian@0 334
ian@0 335 if (mtdblk->mtd->sync)
ian@0 336 mtdblk->mtd->sync(mtdblk->mtd);
ian@0 337 return 0;
ian@0 338 }
ian@0 339
ian@0 340 static void mtdblock_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
ian@0 341 {
ian@0 342 struct mtd_blktrans_dev *dev = kmalloc(sizeof(*dev), GFP_KERNEL);
ian@0 343
ian@0 344 if (!dev)
ian@0 345 return;
ian@0 346
ian@0 347 memset(dev, 0, sizeof(*dev));
ian@0 348
ian@0 349 dev->mtd = mtd;
ian@0 350 dev->devnum = mtd->index;
ian@0 351 dev->blksize = 512;
ian@0 352 dev->size = mtd->size >> 9;
ian@0 353 dev->tr = tr;
ian@0 354
ian@0 355 if (!(mtd->flags & MTD_WRITEABLE))
ian@0 356 dev->readonly = 1;
ian@0 357
ian@0 358 add_mtd_blktrans_dev(dev);
ian@0 359 }
ian@0 360
/* Undo mtdblock_add_mtd(): unregister the translation device and free it. */
static void mtdblock_remove_dev(struct mtd_blktrans_dev *dev)
{
	del_mtd_blktrans_dev(dev);
	kfree(dev);
}
ian@0 366
/* Block translation ops: 512-byte sectors on major 31, no partitions. */
static struct mtd_blktrans_ops mtdblock_tr = {
	.name = "mtdblock",
	.major = 31,
	.part_bits = 0,
	.open = mtdblock_open,
	.flush = mtdblock_flush,
	.release = mtdblock_release,
	.readsect = mtdblock_readsect,
	.writesect = mtdblock_writesect,
	.add_mtd = mtdblock_add_mtd,
	.remove_dev = mtdblock_remove_dev,
	.owner = THIS_MODULE,
};
ian@0 380
/* Module init: register the mtdblock translation with the blktrans core. */
static int __init init_mtdblock(void)
{
	return register_mtd_blktrans(&mtdblock_tr);
}
ian@0 385
/* Module exit: unregister the mtdblock translation. */
static void __exit cleanup_mtdblock(void)
{
	deregister_mtd_blktrans(&mtdblock_tr);
}
ian@0 390
module_init(init_mtdblock);
module_exit(cleanup_mtdblock);

/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Nicolas Pitre <nico@cam.org> et al.");
MODULE_DESCRIPTION("Caching read/erase/writeback block device emulation access to MTD devices");