ia64/linux-2.6.18-xen.hg

view drivers/xen/balloon/balloon.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 163a3807cb1f
children
line source
1 /******************************************************************************
2 * balloon.c
3 *
4 * Xen balloon driver - enables returning/claiming memory to/from Xen.
5 *
6 * Copyright (c) 2003, B Dragovic
7 * Copyright (c) 2003-2004, M Williamson, K Fraser
8 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation; or, when distributed
13 * separately from the Linux kernel or incorporated into other
14 * software packages, subject to the following license:
15 *
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this source file (the "Software"), to deal in the Software without
18 * restriction, including without limitation the rights to use, copy, modify,
19 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
20 * and to permit persons to whom the Software is furnished to do so, subject to
21 * the following conditions:
22 *
23 * The above copyright notice and this permission notice shall be included in
24 * all copies or substantial portions of the Software.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
32 * IN THE SOFTWARE.
33 */
35 #include <linux/kernel.h>
36 #include <linux/module.h>
37 #include <linux/sched.h>
38 #include <linux/errno.h>
39 #include <linux/mm.h>
40 #include <linux/mman.h>
41 #include <linux/smp_lock.h>
42 #include <linux/pagemap.h>
43 #include <linux/bootmem.h>
44 #include <linux/highmem.h>
45 #include <linux/vmalloc.h>
46 #include <linux/mutex.h>
47 #include <xen/xen_proc.h>
48 #include <asm/hypervisor.h>
49 #include <xen/balloon.h>
50 #include <xen/interface/memory.h>
51 #include <asm/maddr.h>
52 #include <asm/page.h>
53 #include <asm/pgalloc.h>
54 #include <asm/pgtable.h>
55 #include <asm/uaccess.h>
56 #include <asm/tlb.h>
57 #include <linux/highmem.h>
58 #include <linux/list.h>
59 #include <xen/xenbus.h>
60 #include "common.h"
62 #ifdef HAVE_XEN_PLATFORM_COMPAT_H
63 #include <xen/platform-compat.h>
64 #endif
#ifdef CONFIG_PROC_FS
/* /proc/xen/balloon entry (read/write handlers installed in balloon_init). */
static struct proc_dir_entry *balloon_pde;
#endif

/* Serialises balloon_process() so only one worker runs at a time. */
static DEFINE_MUTEX(balloon_mutex);

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and driver_pages, and
 * balloon lists.
 */
DEFINE_SPINLOCK(balloon_lock);

/* Driver-wide counters (current/target pages, balloon list sizes). */
struct balloon_stats balloon_stats;

/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];

/* VM /proc information for memory */
extern unsigned long totalram_pages;

#ifndef MODULE
/* Built-in: keep the kernel's highmem page accounting in step. */
extern unsigned long totalhigh_pages;
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
/* totalhigh_pages is not exported to modules, so accounting is skipped. */
#define inc_totalhigh_pages() ((void)0)
#define dec_totalhigh_pages() ((void)0)
#endif

/* List of ballooned pages, threaded through the mem_map array. */
static LIST_HEAD(ballooned_pages);

/* Main work function, always executed in process context. */
static void balloon_process(void *unused);
static DECLARE_WORK(balloon_worker, balloon_process, NULL);
/* Timer used to retry balloon_process() after a transient failure. */
static struct timer_list balloon_timer;

/* When ballooning out (allocating memory to return to Xen) we don't really
   want the kernel to try too hard since that can trigger the oom killer. */
#define GFP_BALLOON \
	(GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|__GFP_COLD)

/* Ballooned pages are linked through page->lru. */
#define PAGE_TO_LIST(p) (&(p)->lru)
#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
#define UNLIST_PAGE(p)				\
	do {					\
		list_del(PAGE_TO_LIST(p));	\
		PAGE_TO_LIST(p)->next = NULL;	\
		PAGE_TO_LIST(p)->prev = NULL;	\
	} while(0)

#define IPRINTK(fmt, args...) \
	printk(KERN_INFO "xen_mem: " fmt, ##args)
#define WPRINTK(fmt, args...) \
	printk(KERN_WARNING "xen_mem: " fmt, ##args)
123 /* balloon_append: add the given page to the balloon. */
124 static void balloon_append(struct page *page)
125 {
126 /* Lowmem is re-populated first, so highmem pages go at list tail. */
127 if (PageHighMem(page)) {
128 list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
129 bs.balloon_high++;
130 dec_totalhigh_pages();
131 } else {
132 list_add(PAGE_TO_LIST(page), &ballooned_pages);
133 bs.balloon_low++;
134 }
135 }
137 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
138 static struct page *balloon_retrieve(void)
139 {
140 struct page *page;
142 if (list_empty(&ballooned_pages))
143 return NULL;
145 page = LIST_TO_PAGE(ballooned_pages.next);
146 UNLIST_PAGE(page);
148 if (PageHighMem(page)) {
149 bs.balloon_high--;
150 inc_totalhigh_pages();
151 }
152 else
153 bs.balloon_low--;
155 return page;
156 }
158 static struct page *balloon_first_page(void)
159 {
160 if (list_empty(&ballooned_pages))
161 return NULL;
162 return LIST_TO_PAGE(ballooned_pages.next);
163 }
165 static struct page *balloon_next_page(struct page *page)
166 {
167 struct list_head *next = PAGE_TO_LIST(page)->next;
168 if (next == &ballooned_pages)
169 return NULL;
170 return LIST_TO_PAGE(next);
171 }
/* Release a page back to the page allocator. */
static inline void balloon_free_page(struct page *page)
{
#ifndef MODULE
	if (put_page_testzero(page))
		free_cold_page(page);
#else
	/* free_cold_page() is not being exported. */
	__free_page(page);
#endif
}
/* Timer callback: re-run the balloon worker in process context. */
static void balloon_alarm(unsigned long unused)
{
	schedule_work(&balloon_worker);
}
189 static unsigned long current_target(void)
190 {
191 unsigned long target = bs.target_pages;
192 if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
193 target = bs.current_pages + bs.balloon_low + bs.balloon_high;
194 return target;
195 }
/*
 * Lower bound on the balloon target, so the kernel always keeps enough
 * memory to function.  Scales sub-linearly with the machine size.
 */
static unsigned long minimum_target(void)
{
#ifndef CONFIG_XEN
#define max_pfn num_physpages
#endif
	unsigned long min_pages, curr_pages = current_target();

#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewise linear function:
	 *  max MiB -> min MiB	gradient
	 *       0	   0
	 *      16	  16
	 *      32	  24
	 *     128	  72	(1/2)
	 *     512	 168	(1/4)
	 *    2048	 360	(1/8)
	 *    8192	 552	(1/32)
	 *   32768	1320
	 *  131072	4392
	 */
	if (max_pfn < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (max_pfn >> 1);
	else if (max_pfn < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (max_pfn >> 2);
	else if (max_pfn < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (max_pfn >> 3);
	else
		min_pages = MB2PAGES(296) + (max_pfn >> 5);
#undef MB2PAGES

	/* Don't enforce growth */
	return min(min_pages, curr_pages);
#ifndef CONFIG_XEN
#undef max_pfn
#endif
}
234 static int increase_reservation(unsigned long nr_pages)
235 {
236 unsigned long pfn, i, flags;
237 struct page *page;
238 long rc;
239 struct xen_memory_reservation reservation = {
240 .address_bits = 0,
241 .extent_order = 0,
242 .domid = DOMID_SELF
243 };
245 if (nr_pages > ARRAY_SIZE(frame_list))
246 nr_pages = ARRAY_SIZE(frame_list);
248 balloon_lock(flags);
250 page = balloon_first_page();
251 for (i = 0; i < nr_pages; i++) {
252 BUG_ON(page == NULL);
253 frame_list[i] = page_to_pfn(page);;
254 page = balloon_next_page(page);
255 }
257 set_xen_guest_handle(reservation.extent_start, frame_list);
258 reservation.nr_extents = nr_pages;
259 rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
260 if (rc < 0)
261 goto out;
263 for (i = 0; i < rc; i++) {
264 page = balloon_retrieve();
265 BUG_ON(page == NULL);
267 pfn = page_to_pfn(page);
268 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
269 phys_to_machine_mapping_valid(pfn));
271 set_phys_to_machine(pfn, frame_list[i]);
273 #ifdef CONFIG_XEN
274 /* Link back into the page tables if not highmem. */
275 if (pfn < max_low_pfn) {
276 int ret;
277 ret = HYPERVISOR_update_va_mapping(
278 (unsigned long)__va(pfn << PAGE_SHIFT),
279 pfn_pte_ma(frame_list[i], PAGE_KERNEL),
280 0);
281 BUG_ON(ret);
282 }
283 #endif
285 /* Relinquish the page back to the allocator. */
286 ClearPageReserved(page);
287 init_page_count(page);
288 balloon_free_page(page);
289 }
291 bs.current_pages += rc;
292 totalram_pages = bs.current_pages;
294 out:
295 balloon_unlock(flags);
297 return rc < 0 ? rc : rc != nr_pages;
298 }
/*
 * Balloon down: allocate up to @nr_pages pages from the kernel, scrub
 * them, tear down their mappings and hand the underlying frames back to
 * Xen.  Returns non-zero if the allocator ran dry and the caller should
 * sleep before retrying.
 */
static int decrease_reservation(unsigned long nr_pages)
{
	unsigned long  pfn, i, flags;
	struct page   *page;
	void          *v;
	int            need_sleep = 0;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};

	/* The frame list lives in a single page; clamp the batch size. */
	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	for (i = 0; i < nr_pages; i++) {
		/* GFP_BALLOON avoids triggering the OOM killer. */
		if ((page = alloc_page(GFP_BALLOON)) == NULL) {
			nr_pages = i;
			need_sleep = 1;
			break;
		}

		pfn = page_to_pfn(page);
		frame_list[i] = pfn_to_mfn(pfn);

		if (!PageHighMem(page)) {
			v = phys_to_virt(pfn << PAGE_SHIFT);
			scrub_pages(v, 1);
#ifdef CONFIG_XEN
			/* Unmap from the kernel linear mapping before the
			   frame is surrendered. */
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)v, __pte_ma(0), 0);
			BUG_ON(ret);
#endif
		}
#ifdef CONFIG_XEN_SCRUB_PAGES
		else {
			v = kmap(page);
			scrub_pages(v, 1);
			kunmap(page);
		}
#endif
	}

#ifdef CONFIG_XEN
	/* Ensure that ballooned highmem pages don't have kmaps. */
	kmap_flush_unused();
	flush_tlb_all();
#endif

	balloon_lock(flags);

	/* No more mappings: invalidate P2M and add to balloon. */
	for (i = 0; i < nr_pages; i++) {
		pfn = mfn_to_pfn(frame_list[i]);
		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		balloon_append(pfn_to_page(pfn));
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != nr_pages);

	bs.current_pages -= nr_pages;
	totalram_pages = bs.current_pages;

	balloon_unlock(flags);

	return need_sleep;
}
/*
 * We avoid multiple worker processes conflicting via the balloon mutex.
 * We may of course race updates of the target counts (which are protected
 * by the balloon lock), but we will recover from these in time.
 */
378 static void balloon_process(void *unused)
379 {
380 int need_sleep = 0;
381 long credit;
383 mutex_lock(&balloon_mutex);
385 do {
386 credit = current_target() - bs.current_pages;
387 if (credit > 0)
388 need_sleep = (increase_reservation(credit) != 0);
389 if (credit < 0)
390 need_sleep = (decrease_reservation(-credit) != 0);
392 #ifndef CONFIG_PREEMPT
393 if (need_resched())
394 schedule();
395 #endif
396 } while ((credit != 0) && !need_sleep);
398 /* Schedule more work if there is some still to be done. */
399 if (current_target() != bs.current_pages)
400 mod_timer(&balloon_timer, jiffies + HZ);
402 mutex_unlock(&balloon_mutex);
403 }
/* Sets a new target (floored at minimum_target()) and kicks off processing. */
void balloon_set_new_target(unsigned long target)
{
	/* No need for lock. Not read-modify-write updates. */
	bs.target_pages = max(target, minimum_target());
	schedule_work(&balloon_worker);
}
/* Watch fired whenever the toolstack updates memory/target in xenstore. */
static struct xenbus_watch target_watch =
{
	.node = "memory/target"
};
418 /* React to a change in the target key */
419 static void watch_target(struct xenbus_watch *watch,
420 const char **vec, unsigned int len)
421 {
422 unsigned long long new_target;
423 int err;
425 err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
426 if (err != 1) {
427 /* This is ok (for domain0 at least) - so just return */
428 return;
429 }
431 /* The given memory/target value is in KiB, so it needs converting to
432 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
433 */
434 balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
435 }
437 static int balloon_init_watcher(struct notifier_block *notifier,
438 unsigned long event,
439 void *data)
440 {
441 int err;
443 err = register_xenbus_watch(&target_watch);
444 if (err)
445 printk(KERN_ERR "Failed to set balloon watcher\n");
447 return NOTIFY_DONE;
448 }
450 #ifdef CONFIG_PROC_FS
451 static int balloon_write(struct file *file, const char __user *buffer,
452 unsigned long count, void *data)
453 {
454 char memstring[64], *endchar;
455 unsigned long long target_bytes;
457 if (!capable(CAP_SYS_ADMIN))
458 return -EPERM;
460 if (count <= 1)
461 return -EBADMSG; /* runt */
462 if (count > sizeof(memstring))
463 return -EFBIG; /* too long */
465 if (copy_from_user(memstring, buffer, count))
466 return -EFAULT;
467 memstring[sizeof(memstring)-1] = '\0';
469 target_bytes = memparse(memstring, &endchar);
470 balloon_set_new_target(target_bytes >> PAGE_SHIFT);
472 return count;
473 }
/* /proc/xen/balloon read handler: report current balloon statistics. */
static int balloon_read(char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	int len;

	len = sprintf(
		page,
		"Current allocation: %8lu kB\n"
		"Requested target: %8lu kB\n"
		"Low-mem balloon: %8lu kB\n"
		"High-mem balloon: %8lu kB\n"
		"Driver pages: %8lu kB\n",
		PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages),
		PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
		PAGES2KB(bs.driver_pages));

	*eof = 1;
	return len;
}
495 #endif
/* Notifier used to defer watch registration until xenstore is up. */
static struct notifier_block xenstore_notifier;

/* Driver init: set up stats, retry timer, /proc and sysfs entries. */
static int __init balloon_init(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_XEN)
	unsigned long pfn;
	struct page *page;
#endif

	if (!is_running_on_xen())
		return -ENODEV;

	IPRINTK("Initialising balloon driver.\n");

#ifdef CONFIG_XEN
	/* Start from the domain's initial allocation, clamped to max_pfn. */
	bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
	totalram_pages = bs.current_pages;
#else
	bs.current_pages = totalram_pages;
#endif
	bs.target_pages = bs.current_pages;
	bs.balloon_low = 0;
	bs.balloon_high = 0;
	bs.driver_pages = 0UL;

	/* Timer used by balloon_process() to retry after a failed batch. */
	init_timer(&balloon_timer);
	balloon_timer.data = 0;
	balloon_timer.function = balloon_alarm;

#ifdef CONFIG_PROC_FS
	if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
		WPRINTK("Unable to create /proc/xen/balloon.\n");
		return -1;
	}

	balloon_pde->read_proc = balloon_read;
	balloon_pde->write_proc = balloon_write;
#endif
	balloon_sysfs_init();

#if defined(CONFIG_X86) && defined(CONFIG_XEN)
	/* Initialise the balloon with excess memory space. */
	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
		page = pfn_to_page(pfn);
		if (!PageReserved(page))
			balloon_append(page);
	}
#endif

	target_watch.callback = watch_target;
	xenstore_notifier.notifier_call = balloon_init_watcher;

	register_xenstore_notifier(&xenstore_notifier);

	return 0;
}

subsys_initcall(balloon_init);
/* Driver teardown: remove sysfs entries. */
static void __exit balloon_exit(void)
{
	balloon_sysfs_exit();
	/* XXX - release balloon here */
}

module_exit(balloon_exit);
564 void balloon_update_driver_allowance(long delta)
565 {
566 unsigned long flags;
568 balloon_lock(flags);
569 bs.driver_pages += delta;
570 balloon_unlock(flags);
571 }
#ifdef CONFIG_XEN
/*
 * apply_to_page_range() callback: clear one kernel PTE, invalidate the
 * corresponding P2M entry and return the machine frame to Xen.
 */
static int dealloc_pte_fn(
	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
{
	unsigned long mfn = pte_mfn(*pte);
	int ret;
	struct xen_memory_reservation reservation = {
		.nr_extents   = 1,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	set_xen_guest_handle(reservation.extent_start, &mfn);
	set_pte_at(&init_mm, addr, pte, __pte_ma(0));
	set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != 1);
	return 0;
}
#endif
/*
 * Allocate @nr_pages empty page frames for a driver: each page keeps its
 * struct page but its machine frame is handed back to Xen.  Returns a
 * kmalloc()ed vector of pages (release with free_empty_pages_and_pagevec()),
 * or NULL on failure.
 */
struct page **alloc_empty_pages_and_pagevec(int nr_pages)
{
	unsigned long flags;
	void *v;
	struct page *page, **pagevec;
	int i, ret;

	pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
	if (pagevec == NULL)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_COLD);
		if (page == NULL)
			goto err;

		v = page_address(page);
		scrub_pages(v, 1);

		balloon_lock(flags);

		if (xen_feature(XENFEAT_auto_translated_physmap)) {
			/* Auto-translated: a plain decrease_reservation. */
			unsigned long gmfn = page_to_pfn(page);
			struct xen_memory_reservation reservation = {
				.nr_extents   = 1,
				.extent_order = 0,
				.domid        = DOMID_SELF
			};
			set_xen_guest_handle(reservation.extent_start, &gmfn);
			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
						   &reservation);
			if (ret == 1)
				ret = 0; /* success */
		} else {
#ifdef CONFIG_XEN
			/* Unmap the page and give its frame back to Xen. */
			ret = apply_to_page_range(&init_mm, (unsigned long)v,
						  PAGE_SIZE, dealloc_pte_fn,
						  NULL);
#else
			/* Cannot handle non-auto translate mode. */
			ret = 1;
#endif
		}

		if (ret != 0) {
			balloon_unlock(flags);
			balloon_free_page(page);
			goto err;
		}

		totalram_pages = --bs.current_pages;

		balloon_unlock(flags);
	}

 out:
	/* The balloon worker may be able to repopulate other frames. */
	schedule_work(&balloon_worker);
#ifdef CONFIG_XEN
	flush_tlb_all();
#endif
	return pagevec;

 err:
	/* Roll back: park the pages allocated so far in the balloon. */
	balloon_lock(flags);
	while (--i >= 0)
		balloon_append(pagevec[i]);
	balloon_unlock(flags);
	kfree(pagevec);
	pagevec = NULL;
	goto out;
}
665 void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
666 {
667 unsigned long flags;
668 int i;
670 if (pagevec == NULL)
671 return;
673 balloon_lock(flags);
674 for (i = 0; i < nr_pages; i++) {
675 BUG_ON(page_count(pagevec[i]) != 1);
676 balloon_append(pagevec[i]);
677 }
678 balloon_unlock(flags);
680 kfree(pagevec);
682 schedule_work(&balloon_worker);
683 }
685 void balloon_release_driver_page(struct page *page)
686 {
687 unsigned long flags;
689 balloon_lock(flags);
690 balloon_append(page);
691 bs.driver_pages--;
692 balloon_unlock(flags);
694 schedule_work(&balloon_worker);
695 }
/* Interface used by Xen backend drivers (netback, blkback, ...). */
EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
EXPORT_SYMBOL_GPL(balloon_release_driver_page);

MODULE_LICENSE("Dual BSD/GPL");