ia64/linux-2.6.18-xen.hg

view drivers/mtd/mtd_blkdevs.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
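
In code terms, the change described above amounts to treating a failed or
partial increase just like a deferred decrease: keep whatever pages Xen did
grant and rearm a retry timer, rather than recording a hard limit and giving
up. A rough, hypothetical sketch of that loop follows; the helper and
variable names (current_target, increase_reservation, decrease_reservation,
balloon_mutex, balloon_timer) stand in for the balloon driver's internals and
are not taken from this changeset.

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/mutex.h>

extern unsigned long current_pages;                  /* pages currently held */
extern unsigned long current_target(void);           /* clamped balloon target */
extern int increase_reservation(unsigned long nr);   /* 0 on full success */
extern int decrease_reservation(unsigned long nr);   /* 0 on full success */
extern struct timer_list balloon_timer;
extern struct mutex balloon_mutex;

static void balloon_process(void *unused)
{
        int need_retry = 0;

        mutex_lock(&balloon_mutex);

        if (current_target() > current_pages)
                /* A partial success keeps the pages we were given. */
                need_retry = (increase_reservation(current_target() -
                                                   current_pages) != 0);
        else if (current_target() < current_pages)
                need_retry = (decrease_reservation(current_pages -
                                                   current_target()) != 0);

        /*
         * No "hard limit": a failure to grow may just be transient host
         * memory pressure, so retry on a timer instead of waiting for a
         * new target to be written.
         */
        if (need_retry)
                mod_timer(&balloon_timer, jiffies + HZ);

        mutex_unlock(&balloon_mutex);
}
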
line source
/*
 * $Id: mtd_blkdevs.c,v 1.27 2005/11/07 11:14:20 gleixner Exp $
 *
 * (C) 2003 David Woodhouse <dwmw2@infradead.org>
 *
 * Interface to Linux 2.5 block layer for MTD 'translation layers'.
 *
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/fs.h>
#include <linux/mtd/blktrans.h>
#include <linux/mtd/mtd.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/spinlock.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <asm/uaccess.h>

static LIST_HEAD(blktrans_majors);

extern struct mutex mtd_table_mutex;
extern struct mtd_info *mtd_table[];

struct mtd_blkcore_priv {
        struct completion thread_dead;
        int exiting;
        wait_queue_head_t thread_wq;
        struct request_queue *rq;
        spinlock_t queue_lock;
};

static int do_blktrans_request(struct mtd_blktrans_ops *tr,
                               struct mtd_blktrans_dev *dev,
                               struct request *req)
{
        unsigned long block, nsect;
        char *buf;

        block = req->sector;
        nsect = req->current_nr_sectors;
        buf = req->buffer;

        if (!(req->flags & REQ_CMD))
                return 0;

        if (block + nsect > get_capacity(req->rq_disk))
                return 0;

        switch(rq_data_dir(req)) {
        case READ:
                for (; nsect > 0; nsect--, block++, buf += 512)
                        if (tr->readsect(dev, block, buf))
                                return 0;
                return 1;

        case WRITE:
                if (!tr->writesect)
                        return 0;

                for (; nsect > 0; nsect--, block++, buf += 512)
                        if (tr->writesect(dev, block, buf))
                                return 0;
                return 1;

        default:
                printk(KERN_NOTICE "Unknown request %ld\n", rq_data_dir(req));
                return 0;
        }
}

static int mtd_blktrans_thread(void *arg)
{
        struct mtd_blktrans_ops *tr = arg;
        struct request_queue *rq = tr->blkcore_priv->rq;

        /* we might get involved when memory gets low, so use PF_MEMALLOC */
        current->flags |= PF_MEMALLOC | PF_NOFREEZE;

        daemonize("%sd", tr->name);

        /* daemonize() doesn't do this for us since some kernel threads
           actually want to deal with signals. We can't just call
           exit_sighand() since that'll cause an oops when we finally
           do exit. */
        spin_lock_irq(&current->sighand->siglock);
        sigfillset(&current->blocked);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);

        spin_lock_irq(rq->queue_lock);

        while (!tr->blkcore_priv->exiting) {
                struct request *req;
                struct mtd_blktrans_dev *dev;
                int res = 0;
                DECLARE_WAITQUEUE(wait, current);

                req = elv_next_request(rq);

                if (!req) {
                        add_wait_queue(&tr->blkcore_priv->thread_wq, &wait);
                        set_current_state(TASK_INTERRUPTIBLE);

                        spin_unlock_irq(rq->queue_lock);

                        schedule();
                        remove_wait_queue(&tr->blkcore_priv->thread_wq, &wait);

                        spin_lock_irq(rq->queue_lock);

                        continue;
                }

                dev = req->rq_disk->private_data;
                tr = dev->tr;

                spin_unlock_irq(rq->queue_lock);

                mutex_lock(&dev->lock);
                res = do_blktrans_request(tr, dev, req);
                mutex_unlock(&dev->lock);

                spin_lock_irq(rq->queue_lock);

                end_request(req, res);
        }
        spin_unlock_irq(rq->queue_lock);

        complete_and_exit(&tr->blkcore_priv->thread_dead, 0);
}

static void mtd_blktrans_request(struct request_queue *rq)
{
        struct mtd_blktrans_ops *tr = rq->queuedata;
        wake_up(&tr->blkcore_priv->thread_wq);
}

static int blktrans_open(struct inode *i, struct file *f)
{
        struct mtd_blktrans_dev *dev;
        struct mtd_blktrans_ops *tr;
        int ret = -ENODEV;

        dev = i->i_bdev->bd_disk->private_data;
        tr = dev->tr;

        if (!try_module_get(dev->mtd->owner))
                goto out;

        if (!try_module_get(tr->owner))
                goto out_tr;

        /* FIXME: Locking. A hot pluggable device can go away
           (del_mtd_device can be called for it) without its module
           being unloaded. */
        dev->mtd->usecount++;

        ret = 0;
        if (tr->open && (ret = tr->open(dev))) {
                dev->mtd->usecount--;
                module_put(dev->mtd->owner);
        out_tr:
                module_put(tr->owner);
        }
 out:
        return ret;
}

static int blktrans_release(struct inode *i, struct file *f)
{
        struct mtd_blktrans_dev *dev;
        struct mtd_blktrans_ops *tr;
        int ret = 0;

        dev = i->i_bdev->bd_disk->private_data;
        tr = dev->tr;

        if (tr->release)
                ret = tr->release(dev);

        if (!ret) {
                dev->mtd->usecount--;
                module_put(dev->mtd->owner);
                module_put(tr->owner);
        }

        return ret;
}

static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;

        if (dev->tr->getgeo)
                return dev->tr->getgeo(dev, geo);
        return -ENOTTY;
}

static int blktrans_ioctl(struct inode *inode, struct file *file,
                          unsigned int cmd, unsigned long arg)
{
        struct mtd_blktrans_dev *dev = inode->i_bdev->bd_disk->private_data;
        struct mtd_blktrans_ops *tr = dev->tr;

        switch (cmd) {
        case BLKFLSBUF:
                if (tr->flush)
                        return tr->flush(dev);
                /* The core code did the work, we had nothing to do. */
                return 0;
        default:
                return -ENOTTY;
        }
}

struct block_device_operations mtd_blktrans_ops = {
        .owner          = THIS_MODULE,
        .open           = blktrans_open,
        .release        = blktrans_release,
        .ioctl          = blktrans_ioctl,
        .getgeo         = blktrans_getgeo,
};

int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
{
        struct mtd_blktrans_ops *tr = new->tr;
        struct list_head *this;
        int last_devnum = -1;
        struct gendisk *gd;

        if (!!mutex_trylock(&mtd_table_mutex)) {
                mutex_unlock(&mtd_table_mutex);
                BUG();
        }

        list_for_each(this, &tr->devs) {
                struct mtd_blktrans_dev *d = list_entry(this, struct mtd_blktrans_dev, list);
                if (new->devnum == -1) {
                        /* Use first free number */
                        if (d->devnum != last_devnum+1) {
                                /* Found a free devnum. Plug it in here */
                                new->devnum = last_devnum+1;
                                list_add_tail(&new->list, &d->list);
                                goto added;
                        }
                } else if (d->devnum == new->devnum) {
                        /* Required number taken */
                        return -EBUSY;
                } else if (d->devnum > new->devnum) {
                        /* Required number was free */
                        list_add_tail(&new->list, &d->list);
                        goto added;
                }
                last_devnum = d->devnum;
        }
        if (new->devnum == -1)
                new->devnum = last_devnum+1;

        if ((new->devnum << tr->part_bits) > 256) {
                return -EBUSY;
        }

        mutex_init(&new->lock);
        list_add_tail(&new->list, &tr->devs);
 added:
        if (!tr->writesect)
                new->readonly = 1;

        gd = alloc_disk(1 << tr->part_bits);
        if (!gd) {
                list_del(&new->list);
                return -ENOMEM;
        }
        gd->major = tr->major;
        gd->first_minor = (new->devnum) << tr->part_bits;
        gd->fops = &mtd_blktrans_ops;

        if (tr->part_bits)
                if (new->devnum < 26)
                        snprintf(gd->disk_name, sizeof(gd->disk_name),
                                 "%s%c", tr->name, 'a' + new->devnum);
                else
                        snprintf(gd->disk_name, sizeof(gd->disk_name),
                                 "%s%c%c", tr->name,
                                 'a' - 1 + new->devnum / 26,
                                 'a' + new->devnum % 26);
        else
                snprintf(gd->disk_name, sizeof(gd->disk_name),
                         "%s%d", tr->name, new->devnum);

        /* 2.5 has capacity in units of 512 bytes while still
           having BLOCK_SIZE_BITS set to 10. Just to keep us amused. */
        set_capacity(gd, (new->size * new->blksize) >> 9);

        gd->private_data = new;
        new->blkcore_priv = gd;
        gd->queue = tr->blkcore_priv->rq;

        if (new->readonly)
                set_disk_ro(gd, 1);

        add_disk(gd);

        return 0;
}

int del_mtd_blktrans_dev(struct mtd_blktrans_dev *old)
{
        if (!!mutex_trylock(&mtd_table_mutex)) {
                mutex_unlock(&mtd_table_mutex);
                BUG();
        }

        list_del(&old->list);

        del_gendisk(old->blkcore_priv);
        put_disk(old->blkcore_priv);

        return 0;
}

static void blktrans_notify_remove(struct mtd_info *mtd)
{
        struct list_head *this, *this2, *next;

        list_for_each(this, &blktrans_majors) {
                struct mtd_blktrans_ops *tr = list_entry(this, struct mtd_blktrans_ops, list);

                list_for_each_safe(this2, next, &tr->devs) {
                        struct mtd_blktrans_dev *dev = list_entry(this2, struct mtd_blktrans_dev, list);

                        if (dev->mtd == mtd)
                                tr->remove_dev(dev);
                }
        }
}

static void blktrans_notify_add(struct mtd_info *mtd)
{
        struct list_head *this;

        if (mtd->type == MTD_ABSENT)
                return;

        list_for_each(this, &blktrans_majors) {
                struct mtd_blktrans_ops *tr = list_entry(this, struct mtd_blktrans_ops, list);

                tr->add_mtd(tr, mtd);
        }

}

static struct mtd_notifier blktrans_notifier = {
        .add = blktrans_notify_add,
        .remove = blktrans_notify_remove,
};

int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
{
        int ret, i;

        /* Register the notifier if/when the first device type is
           registered, to prevent the link/init ordering from fucking
           us over. */
        if (!blktrans_notifier.list.next)
                register_mtd_user(&blktrans_notifier);

        tr->blkcore_priv = kmalloc(sizeof(*tr->blkcore_priv), GFP_KERNEL);
        if (!tr->blkcore_priv)
                return -ENOMEM;

        memset(tr->blkcore_priv, 0, sizeof(*tr->blkcore_priv));

        mutex_lock(&mtd_table_mutex);

        ret = register_blkdev(tr->major, tr->name);
        if (ret) {
                printk(KERN_WARNING "Unable to register %s block device on major %d: %d\n",
                       tr->name, tr->major, ret);
                kfree(tr->blkcore_priv);
                mutex_unlock(&mtd_table_mutex);
                return ret;
        }
        spin_lock_init(&tr->blkcore_priv->queue_lock);
        init_completion(&tr->blkcore_priv->thread_dead);
        init_waitqueue_head(&tr->blkcore_priv->thread_wq);

        tr->blkcore_priv->rq = blk_init_queue(mtd_blktrans_request, &tr->blkcore_priv->queue_lock);
        if (!tr->blkcore_priv->rq) {
                unregister_blkdev(tr->major, tr->name);
                kfree(tr->blkcore_priv);
                mutex_unlock(&mtd_table_mutex);
                return -ENOMEM;
        }

        tr->blkcore_priv->rq->queuedata = tr;

        ret = kernel_thread(mtd_blktrans_thread, tr, CLONE_KERNEL);
        if (ret < 0) {
                blk_cleanup_queue(tr->blkcore_priv->rq);
                unregister_blkdev(tr->major, tr->name);
                kfree(tr->blkcore_priv);
                mutex_unlock(&mtd_table_mutex);
                return ret;
        }

        INIT_LIST_HEAD(&tr->devs);
        list_add(&tr->list, &blktrans_majors);

        for (i=0; i<MAX_MTD_DEVICES; i++) {
                if (mtd_table[i] && mtd_table[i]->type != MTD_ABSENT)
                        tr->add_mtd(tr, mtd_table[i]);
        }

        mutex_unlock(&mtd_table_mutex);

        return 0;
}

int deregister_mtd_blktrans(struct mtd_blktrans_ops *tr)
{
        struct list_head *this, *next;

        mutex_lock(&mtd_table_mutex);

        /* Clean up the kernel thread */
        tr->blkcore_priv->exiting = 1;
        wake_up(&tr->blkcore_priv->thread_wq);
        wait_for_completion(&tr->blkcore_priv->thread_dead);

        /* Remove it from the list of active majors */
        list_del(&tr->list);

        list_for_each_safe(this, next, &tr->devs) {
                struct mtd_blktrans_dev *dev = list_entry(this, struct mtd_blktrans_dev, list);
                tr->remove_dev(dev);
        }

        blk_cleanup_queue(tr->blkcore_priv->rq);
        unregister_blkdev(tr->major, tr->name);

        mutex_unlock(&mtd_table_mutex);

        kfree(tr->blkcore_priv);

        BUG_ON(!list_empty(&tr->devs));
        return 0;
}

static void __exit mtd_blktrans_exit(void)
{
        /* No race here -- if someone's currently in register_mtd_blktrans
           we're screwed anyway. */
        if (blktrans_notifier.list.next)
                unregister_mtd_user(&blktrans_notifier);
}

module_exit(mtd_blktrans_exit);

EXPORT_SYMBOL_GPL(register_mtd_blktrans);
EXPORT_SYMBOL_GPL(deregister_mtd_blktrans);
EXPORT_SYMBOL_GPL(add_mtd_blktrans_dev);
EXPORT_SYMBOL_GPL(del_mtd_blktrans_dev);

MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Common interface to block layer for MTD 'translation layers'");
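
As a companion to the registration path above, here is a deliberately
minimal, hypothetical translation layer showing how a driver would plug into
this interface. The names (simple_*), the major number and the mtd->read()
backed readsect are illustrative only; this is a sketch against the
2.6.18-era API used in this file, not an in-tree driver such as mtdblock.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/blktrans.h>

static int simple_readsect(struct mtd_blktrans_dev *dev,
                           unsigned long block, char *buf)
{
        size_t retlen;

        /* do_blktrans_request() above hands us one 512-byte sector at a time. */
        return dev->mtd->read(dev->mtd, (loff_t)block << 9, 512,
                              &retlen, (u_char *)buf);
}

static void simple_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
{
        struct mtd_blktrans_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

        if (!dev)
                return;

        dev->mtd = mtd;
        dev->devnum = -1;               /* let add_mtd_blktrans_dev() pick a free slot */
        dev->blksize = 512;
        dev->size = mtd->size >> 9;     /* capacity in 512-byte sectors */
        dev->tr = tr;

        if (add_mtd_blktrans_dev(dev))
                kfree(dev);
}

static void simple_remove_dev(struct mtd_blktrans_dev *dev)
{
        del_mtd_blktrans_dev(dev);
        kfree(dev);
}

static struct mtd_blktrans_ops simple_tr = {
        .name           = "simple",
        .major          = 254,          /* arbitrary for this sketch; real drivers use assigned majors */
        .part_bits      = 0,
        .readsect       = simple_readsect,
        /* no .writesect, so add_mtd_blktrans_dev() marks the disk read-only */
        .add_mtd        = simple_add_mtd,
        .remove_dev     = simple_remove_dev,
        .owner          = THIS_MODULE,
};

static int __init simple_init(void)
{
        return register_mtd_blktrans(&simple_tr);
}

static void __exit simple_exit(void)
{
        deregister_mtd_blktrans(&simple_tr);
}

module_init(simple_init);
module_exit(simple_exit);
MODULE_LICENSE("GPL");

register_mtd_blktrans() registers the MTD notifier and block major, then the
core calls .add_mtd for each existing MTD device; .readsect and the other ops
are invoked from the mtd_blktrans_thread() request loop above.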