ia64/linux-2.6.18-xen.hg

view drivers/mtd/mtdblock.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host, and it is therefore desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up, so there is temporary memory pressure while
things stabilise. You would not expect a well-behaved toolstack to
ask a domain to balloon to more than its allocation, nor would you
expect it to deliberately over-commit memory by setting balloon
targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we only partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
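
As a rough illustration of the retry behaviour described above, the
loop might look like the sketch below. This is not the driver's actual
code: every identifier in it (balloon_process, increase_reservation,
decrease_reservation, balloon_timer, target_pages, current_pages) is
invented for the example.

#include <linux/timer.h>
#include <linux/jiffies.h>

/* All names below are invented for this sketch. */
extern unsigned long target_pages, current_pages;
extern unsigned long increase_reservation(unsigned long nr_pages);
extern unsigned long decrease_reservation(unsigned long nr_pages);
static struct timer_list balloon_timer;

static void balloon_process(void)
{
	long credit = target_pages - current_pages;

	if (credit > 0)
		/* Keep however many pages the host actually grants. */
		current_pages += increase_reservation(credit);
	else if (credit < 0)
		current_pages -= decrease_reservation(-credit);

	/*
	 * No "hard limit": if we are still short of the target, e.g.
	 * because of transient memory pressure in the host, re-arm the
	 * timer and try again later, when growing as well as shrinking.
	 */
	if (current_pages != target_pages)
		mod_timer(&balloon_timer, jiffies + HZ);
}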
line source
/*
 * Direct MTD block device access
 *
 * $Id: mtdblock.c,v 1.68 2005/11/07 11:14:20 gleixner Exp $
 *
 * (C) 2000-2003 Nicolas Pitre <nico@cam.org>
 * (C) 1999-2003 David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/vmalloc.h>

#include <linux/mtd/mtd.h>
#include <linux/mtd/blktrans.h>
#include <linux/mutex.h>
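
/* Per-device cache state, indexed by the MTD device number. */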
static struct mtdblk_dev {
	struct mtd_info *mtd;
	int count;
	struct mutex cache_mutex;
	unsigned char *cache_data;
	unsigned long cache_offset;
	unsigned int cache_size;
	enum { STATE_EMPTY, STATE_CLEAN, STATE_DIRTY } cache_state;
} *mtdblks[MAX_MTD_DEVICES];

/*
 * Cache stuff...
 *
 * Since typical flash erasable sectors are much larger than what Linux's
 * buffer cache can handle, we must implement read-modify-write on flash
 * sectors for each block write request. To avoid over-erasing flash sectors
 * and to speed things up, we locally cache a whole flash sector while it is
 * being written to until a different sector is required.
 */

static void erase_callback(struct erase_info *done)
{
	wait_queue_head_t *wait_q = (wait_queue_head_t *)done->priv;
	wake_up(wait_q);
}

static int erase_write (struct mtd_info *mtd, unsigned long pos,
			int len, const char *buf)
{
	struct erase_info erase;
	DECLARE_WAITQUEUE(wait, current);
	wait_queue_head_t wait_q;
	size_t retlen;
	int ret;

	/*
	 * First, let's erase the flash block.
	 */

	init_waitqueue_head(&wait_q);
	erase.mtd = mtd;
	erase.callback = erase_callback;
	erase.addr = pos;
	erase.len = len;
	erase.priv = (u_long)&wait_q;
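
	/*
	 * Note: the task state is set to TASK_INTERRUPTIBLE *before* the
	 * erase is submitted, so a wake_up() from erase_callback() that
	 * fires before we reach schedule() is not lost: it puts the task
	 * back to TASK_RUNNING and schedule() returns promptly.
	 */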
	set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(&wait_q, &wait);

	ret = mtd->erase(mtd, &erase);
	if (ret) {
		set_current_state(TASK_RUNNING);
		remove_wait_queue(&wait_q, &wait);
		printk (KERN_WARNING "mtdblock: erase of region [0x%lx, 0x%x] "
			"on \"%s\" failed\n",
			pos, len, mtd->name);
		return ret;
	}

	schedule();  /* Wait for erase to finish. */
	remove_wait_queue(&wait_q, &wait);

	/*
	 * Next, write the data to flash.
	 */

	ret = mtd->write(mtd, pos, len, &retlen, buf);
	if (ret)
		return ret;
	if (retlen != len)
		return -EIO;
	return 0;
}
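
/*
 * Note: erase_write() assumes pos and len describe a whole, erase-aligned
 * region; both callers below only ever pass complete sectors.
 */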

static int write_cached_data (struct mtdblk_dev *mtdblk)
{
	struct mtd_info *mtd = mtdblk->mtd;
	int ret;

	if (mtdblk->cache_state != STATE_DIRTY)
		return 0;

	DEBUG(MTD_DEBUG_LEVEL2, "mtdblock: writing cached data for \"%s\" "
			"at 0x%lx, size 0x%x\n", mtd->name,
			mtdblk->cache_offset, mtdblk->cache_size);

	ret = erase_write (mtd, mtdblk->cache_offset,
			   mtdblk->cache_size, mtdblk->cache_data);
	if (ret)
		return ret;

	/*
	 * Here we could arguably set the cache state to STATE_CLEAN.
	 * However this could lead to inconsistency since we will not
	 * be notified if this content is altered on the flash by other
	 * means. Let's declare it empty and leave buffering tasks to
	 * the buffer cache instead.
	 */
	mtdblk->cache_state = STATE_EMPTY;
	return 0;
}

static int do_cached_write (struct mtdblk_dev *mtdblk, unsigned long pos,
			    int len, const char *buf)
{
	struct mtd_info *mtd = mtdblk->mtd;
	unsigned int sect_size = mtdblk->cache_size;
	size_t retlen;
	int ret;

	DEBUG(MTD_DEBUG_LEVEL2, "mtdblock: write on \"%s\" at 0x%lx, size 0x%x\n",
	      mtd->name, pos, len);

	if (!sect_size)
		return mtd->write(mtd, pos, len, &retlen, buf);

	while (len > 0) {
		unsigned long sect_start = (pos/sect_size)*sect_size;
		unsigned int offset = pos - sect_start;
		unsigned int size = sect_size - offset;
		if (size > len)
			size = len;
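
		/*
		 * Example: with a 64KiB erase sector (sect_size 0x10000), a
		 * write at pos 0x12345 falls in the sector starting at
		 * 0x10000, at offset 0x2345 into it; size is then clipped
		 * to whichever is smaller, the rest of that sector or the
		 * rest of the request. (Illustrative values only.)
		 */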
		if (size == sect_size) {
			/*
			 * We are covering a whole sector. Thus there is no
			 * need to bother with the cache while it may still be
			 * useful for other partial writes.
			 */
			ret = erase_write (mtd, pos, size, buf);
			if (ret)
				return ret;
		} else {
			/* Partial sector: need to use the cache */

			if (mtdblk->cache_state == STATE_DIRTY &&
			    mtdblk->cache_offset != sect_start) {
				ret = write_cached_data(mtdblk);
				if (ret)
					return ret;
			}

			if (mtdblk->cache_state == STATE_EMPTY ||
			    mtdblk->cache_offset != sect_start) {
				/* fill the cache with the current sector */
				mtdblk->cache_state = STATE_EMPTY;
				ret = mtd->read(mtd, sect_start, sect_size,
						&retlen, mtdblk->cache_data);
				if (ret)
					return ret;
				if (retlen != sect_size)
					return -EIO;

				mtdblk->cache_offset = sect_start;
				mtdblk->cache_size = sect_size;
				mtdblk->cache_state = STATE_CLEAN;
			}

			/* write data to our local cache */
			memcpy (mtdblk->cache_data + offset, buf, size);
			mtdblk->cache_state = STATE_DIRTY;
		}

		buf += size;
		pos += size;
		len -= size;
	}

	return 0;
}

static int do_cached_read (struct mtdblk_dev *mtdblk, unsigned long pos,
			   int len, char *buf)
{
	struct mtd_info *mtd = mtdblk->mtd;
	unsigned int sect_size = mtdblk->cache_size;
	size_t retlen;
	int ret;

	DEBUG(MTD_DEBUG_LEVEL2, "mtdblock: read on \"%s\" at 0x%lx, size 0x%x\n",
	      mtd->name, pos, len);

	if (!sect_size)
		return mtd->read(mtd, pos, len, &retlen, buf);

	while (len > 0) {
		unsigned long sect_start = (pos/sect_size)*sect_size;
		unsigned int offset = pos - sect_start;
		unsigned int size = sect_size - offset;
		if (size > len)
			size = len;

		/*
		 * Check if the requested data is already cached: read the
		 * requested amount of data from our internal cache if it
		 * contains what we want, otherwise read the data directly
		 * from flash.
		 */
		if (mtdblk->cache_state != STATE_EMPTY &&
		    mtdblk->cache_offset == sect_start) {
			memcpy (buf, mtdblk->cache_data + offset, size);
		} else {
			ret = mtd->read(mtd, pos, size, &retlen, buf);
			if (ret)
				return ret;
			if (retlen != size)
				return -EIO;
		}

		buf += size;
		pos += size;
		len -= size;
	}

	return 0;
}
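
/*
 * The blktrans layer presents the device in 512-byte sectors (blksize is
 * set to 512 in mtdblock_add_mtd() below), so "block << 9" converts a
 * sector number into a byte offset into the MTD device.
 */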
static int mtdblock_readsect(struct mtd_blktrans_dev *dev,
			     unsigned long block, char *buf)
{
	struct mtdblk_dev *mtdblk = mtdblks[dev->devnum];
	return do_cached_read(mtdblk, block<<9, 512, buf);
}

static int mtdblock_writesect(struct mtd_blktrans_dev *dev,
			      unsigned long block, char *buf)
{
	struct mtdblk_dev *mtdblk = mtdblks[dev->devnum];
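	/*
	 * The sector cache is allocated lazily, on the first write:
	 * purely read-only users of the device never pay for the
	 * erase-block-sized buffer.
	 */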
	if (unlikely(!mtdblk->cache_data && mtdblk->cache_size)) {
		mtdblk->cache_data = vmalloc(mtdblk->mtd->erasesize);
		if (!mtdblk->cache_data)
			return -EINTR;
		/* -EINTR is not really correct, but it is the best match
		 * documented in man 2 write for all cases. We could also
		 * return -EAGAIN sometimes, but why bother?
		 */
	}
	return do_cached_write(mtdblk, block<<9, 512, buf);
}

static int mtdblock_open(struct mtd_blktrans_dev *mbd)
{
	struct mtdblk_dev *mtdblk;
	struct mtd_info *mtd = mbd->mtd;
	int dev = mbd->devnum;

	DEBUG(MTD_DEBUG_LEVEL1,"mtdblock_open\n");

	if (mtdblks[dev]) {
		mtdblks[dev]->count++;
		return 0;
	}

	/* OK, it's not open. Create cache info for it */
	mtdblk = kmalloc(sizeof(struct mtdblk_dev), GFP_KERNEL);
	if (!mtdblk)
		return -ENOMEM;

	memset(mtdblk, 0, sizeof(*mtdblk));
	mtdblk->count = 1;
	mtdblk->mtd = mtd;

	mutex_init(&mtdblk->cache_mutex);
	mtdblk->cache_state = STATE_EMPTY;
	if ( !(mtdblk->mtd->flags & MTD_NO_ERASE) && mtdblk->mtd->erasesize) {
		mtdblk->cache_size = mtdblk->mtd->erasesize;
		mtdblk->cache_data = NULL;
	}

	mtdblks[dev] = mtdblk;

	DEBUG(MTD_DEBUG_LEVEL1, "ok\n");

	return 0;
}

static int mtdblock_release(struct mtd_blktrans_dev *mbd)
{
	int dev = mbd->devnum;
	struct mtdblk_dev *mtdblk = mtdblks[dev];

	DEBUG(MTD_DEBUG_LEVEL1, "mtdblock_release\n");

	mutex_lock(&mtdblk->cache_mutex);
	write_cached_data(mtdblk);
	mutex_unlock(&mtdblk->cache_mutex);

	if (!--mtdblk->count) {
		/* It was the last usage. Free the device */
		mtdblks[dev] = NULL;
		if (mtdblk->mtd->sync)
			mtdblk->mtd->sync(mtdblk->mtd);
		vfree(mtdblk->cache_data);
		kfree(mtdblk);
	}
	DEBUG(MTD_DEBUG_LEVEL1, "ok\n");

	return 0;
}

static int mtdblock_flush(struct mtd_blktrans_dev *dev)
{
	struct mtdblk_dev *mtdblk = mtdblks[dev->devnum];

	mutex_lock(&mtdblk->cache_mutex);
	write_cached_data(mtdblk);
	mutex_unlock(&mtdblk->cache_mutex);

	if (mtdblk->mtd->sync)
		mtdblk->mtd->sync(mtdblk->mtd);
	return 0;
}

static void mtdblock_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
{
	struct mtd_blktrans_dev *dev = kmalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return;

	memset(dev, 0, sizeof(*dev));

	dev->mtd = mtd;
	dev->devnum = mtd->index;
	dev->blksize = 512;
	dev->size = mtd->size >> 9;
	dev->tr = tr;

	if (!(mtd->flags & MTD_WRITEABLE))
		dev->readonly = 1;

	add_mtd_blktrans_dev(dev);
}

static void mtdblock_remove_dev(struct mtd_blktrans_dev *dev)
{
	del_mtd_blktrans_dev(dev);
	kfree(dev);
}

static struct mtd_blktrans_ops mtdblock_tr = {
	.name		= "mtdblock",
	.major		= 31,
	.part_bits	= 0,
	.open		= mtdblock_open,
	.flush		= mtdblock_flush,
	.release	= mtdblock_release,
	.readsect	= mtdblock_readsect,
	.writesect	= mtdblock_writesect,
	.add_mtd	= mtdblock_add_mtd,
	.remove_dev	= mtdblock_remove_dev,
	.owner		= THIS_MODULE,
};

static int __init init_mtdblock(void)
{
	return register_mtd_blktrans(&mtdblock_tr);
}

static void __exit cleanup_mtdblock(void)
{
	deregister_mtd_blktrans(&mtdblock_tr);
}

module_init(init_mtdblock);
module_exit(cleanup_mtdblock);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Nicolas Pitre <nico@cam.org> et al.");
MODULE_DESCRIPTION("Caching read/erase/writeback block device emulation access to MTD devices");